KANIKIG

KANIKIG

just for fun | 兴趣使然 Ph.D. in Engineering|❤️ #NFT $ETH| [Twitter](https://twitter.com/kanikig2)|[Github](https://github.com/KANIKIG)|[Telegram channel](https://t.me/kanikigtech)

pandas筆記

image

讀取表格數據文件#

cols = ['col1', 'col2']
df = pd.read_table(path, sep='|', header=None, names=cols)
df.head()	# 前五行

以括號結尾的命令#

df.describe()	# 顯示平均值、標準差、最大值、最小值...

重命名列#

df.rename(columns = {'old_col_name':'new_col_name'}, inplace=True)
cols = new_col_list
df.columns = cols
df.columns = df.columns.str.replace(' ', '_')

刪除列#

df.drop('col', axis=1, inplace=True)
df.drop([rows], axis=0, inplace=True)

排序#

df['col'].sort_values(ascending=False)	# 系列
df.sort_values('col')	# 返回數據框
df.sort_values(['col1', 'col2'])	

過濾 pandas DataFrame 的行#

pd.Series([])
df[df['col'] >= 200]
#	df[df['col'] >= 200]['col2']
df.loc[df['col'] >= 200, 'col2']	#	loc 根據標籤選擇列或行

多重過濾條件#

df[(df['col'] >= 200) | (df['col1'] == 'a')]
df['col'].isin(['a', 'b', 'c'])

讀取兩列#

pd.read_csv(path, usecols=['a', 'b'])
pd.read_csv(path, nrows=3)

迭代#

for c in df['col']:
	print(c)
for index, row in df.iterrows():
  print(index, row['a'], row['b'])

過濾數值列#

import numpy as np
df.select_dtypes(include=[np.number]).dtypes

描述方法#

df.describe(include=['object', 'float64'])

"axis" 參數#

df.mean()	# 每列的平均值(axis=0),返回系列
df.mean(axis=1)	# 行平均

字串方法#

df['col'].str.upper()	# 大寫 
df['col'].str.contains('str')	# 返回 t/f 系列
df[df['col'].str.contains('str')]
df['col'].str.replace('a', 'b').str.replace('c', 'd')	# 傳入 regex=True 時支持正則表達式

更改數據類型#

df.dtypes
series.dtype
df['col'] = df['col'].astype('float64')

df = pd.read_csv(path, dtype={'col1': float})
df['col'].str.replace('$', '', regex=False).astype(float).mean()

groupby#

df.groupby('col1').mean()
df.groupby('col1')['col2'].mean()
df.groupby('col1')['col2'].agg(['count', 'min', 'max', 'mean'])
%matplotlib inline
df.groupby('col1')['col2'].mean().plot(kind='bar')

pandas 系列#

df['col'].describe()
df['col'].value_counts()
df['col'].value_counts(normalize=True)
df['col'].unique()
df['col'].nunique()	# 唯一值的數量
pd.crosstab(df['col1'], df['col2'])
df['col1'].plot(kind='hist')

處理缺失值#

df.tail()
df.isnull().head()
df.isnull().sum()
df.dropna(how='any', inplace=True)	# 如果包含空值則刪除行
df.dropna(subset=['col1', 'col2'], how='any')	# 如果 col1 或 col2 為空則刪除行
df['col'].value_counts(dropna=False)
df['col'].fillna(value='A', inplace=True)

pandas 索引#

df.index
df.columns
df.shape

df.loc[23, 'a']	# 獲取值
df.describe().loc['25%', 'col']

df.set_index(Series, inplace=True)

df['col'].value_counts()['value1']
df['col'].value_counts().sort_values()
df['col'].value_counts().sort_index()

pd.Series(value_array, index=[indices])
pd.concat([df1, df2], axis=1)

選擇多行和多列#

df.loc[index, col_name]	# 按標籤
df.iloc[:, [0, 3]]	# 按索引

df= pd.read_csv(path, index_col='col1')
df.ix['row_label', 0]	# 混合標籤和索引(不建議;.ix 已在 pandas 1.0 移除,請改用 loc / iloc)
df.ix[1, 'col_label']
df.ix['col1':'col3', 0:2]

使數據框更小更快#

df.memory_usage(deep=True)
df['col'].astype('category')
df['col'] = df['col'].astype(pd.CategoricalDtype(categories=['good', 'very good', 'excellent'], ordered=True))

pandas 與 scikit-learn#

pd.DataFrame({'id':array1, 'col':array2}).set_index('id').to_csv('1.csv')
df.to_pickle('1.pkl')
pd.read_pickle('1.pkl')

loc 和 iloc#

df.loc[0:4, :]	# 按標籤切片,包含結尾標籤 4
df.iloc[0:4, :]	# 按位置切片,不包含結尾位置 4

大型 csv#

df.sample(n=3, random_state=42)
df.sample(frac=0.75)
train = df.sample(frac=0.75, random_state=42)
test = df.loc[~df.index.isin(train.index), :]

虛擬變量#

df['sex_male'] = df['sex'].map({'female':0, 'male':1})
pd.get_dummies(df['sex'], prefix='sex')

日期和時間#

df['time'] = pd.to_datetime(df['time'])
df['time'].dt.day_name()	# weekday_name 已在 pandas 1.0 移除
df['time'].dt.dayofyear
df.loc[df['time'] > pd.to_datetime('1/1/1999'), :]
(df['time'].max() - df['time'].min()).days

查找和刪除重複行#

df['col'].duplicated().sum()
df.loc[df.duplicated(keep='first'), :]	# 'last' False
df.duplicated(subset=['col1', 'col2'])

避免 SettingWithCopyWarning#

image-20210819221914496

image-20210819222207602

不確定這是副本還是視圖。

更改顯示選項#

pd.set_option('display.max_rows', None)
pd.reset_option('display.max_rows')
pd.set_option('display.max_colwidth', None)
pd.set_option('display.precision', 2)
df['new_col'] = df['col'] * 1000
pd.set_option('display.float_format', '{:,}'.format)

從其他對象創建 pandas DataFrame#

pd.DataFrame(dict, columns=['a', 'b'])
s = pd.Series([1, 2], index=['c', 'd'], name='shape')

將函數應用於 pandas 系列或數據框#

df['col1'] = df['col2'].apply(function)
df['col1'] = df['col2'].apply(np.ceil)
df['col'].apply(lambda x: x[0])
df['col1'] = df.loc[:, 'col2':'col3'].apply(np.argmax, axis=1)	# axis 參數僅適用於 DataFrame.apply

多重索引#

ser = df.groupby(['col1', 'col2']).col3.mean()	# 帶有多重索引的系列
ser.unstack()	# 數據框
df.pivot_table(values='col1', index='col2', columns='col3')

df.set_index(['col1', 'col2'], inplace=True)
df.sort_index(inplace=True)

合併數據框#

pd.concat([df1, df2])
pd.merge(df1, df2)

pd.merge(df1, df2, left_on='col1', right_on='col2')
pd.merge(df1, df2, how='inner')	# outer left right
載入中......
此文章數據所有權由區塊鏈加密技術和智能合約保障僅歸創作者所有。