pandas筆記 - KANIKIG

讀取表格數據文件#

cols = ['col1', 'col2']
pd.read_table(path, sep='|', headers=None, names=cols)
pd.head()	# 前五行

以括號結尾的命令#

df.describe()	# 顯示平均值、標準差、最大值、最小值...

重命名列#

df.rename(columns = {'old_col_name':'new_col_name'}, inplace=True)

cols = new_col_list
df.columns = cols
df.columns = df.columns.str.replace(' ', '_')

刪除列#

df.drop('col', axis=1, inplace=True)

df.drop([rows], axis=0, inplace=True)

排序#

df['col'].sort_values(ascending=False)	# 系列

df.sort_values('col')	# 返回數據框
df.sort_values(['col1', 'col2'])

過濾 pandas DataFrame 的行#

pd.Series([])

df[df['col'] >= 200]

#	df[df['col'] >= 200]['col2']
df.loc[df['col'] >= 200, 'col2']	#	loc 根據標籤選擇列或行

多重過濾條件#

df[(df['col'] >= 200) | (df['col1'] == 'a')]

df['col'].isin(['a', 'b', 'c'])

讀取兩列#

pd.read_csv(path, usecols=['a', 'b'])

pd.read_csv(path, nrows=3)

迭代#

for c in df['col']:
	print(c)

for index, row in df.iterrows():
  print(index, row['a'], row['b'])

過濾數值列#

import numpy as np
df.select_dtypes(include=[np.number]).dtypes

描述方法#

df.describe(include=['object', 'float64'])

"axis" 參數#

df.mean()	# 系列的平均值
df.mean(axis=1)	# 行平均

字串方法#

df['col'].str.upper()	# 大寫 
df['col'].str.contains('str')	# 返回 t/f 系列
df[df['col'].str.contains('str')]
df['col'].str.replace('a', 'b').str.replace('c', 'd')	# 支持正則表達式

更改數據類型#

df.dtypes
serires.dtype
df['col'] = df['col'].astype('float64')

df = pd.read_csv(path, dtype=('col1':float))
df['col'].str.replace('$', '').astype(float).mean()

groupby#

df.groupby('col1').mean()
df.groupby('col1')['col2'].mean()
df.groupby('col1')['col2'].agg(['count', 'min', 'max', 'mean'])

%matplotlib inline
df.groupby('col1')['col2'].mean().plot(kind='bar')

pandas 系列#

df['col'].describe()
df['col'].value_counts()
df['col'].value_counts(normalize=True)
df['col'].unique()
df['col'].nunique()	# 唯一值的數量
pd.crosstab(df['col1'], df['col2'])
df['col1'].plot(kind='hist')

處理缺失值#

df.tail()
df.isnull().head()
df.isnull().sum()
df.dropna(how='any', inplace=True)	# 如果包含空值則刪除行
df.dropna(subset=['col1', 'col2'], how='any')	# 如果 col1 或 col2 為空則刪除行
df['col'].value_counts(dropna=False)
df['col'].fillna(value='A', inplace=True)

pandas 索引#

df.index
df.columns
df.shape

df.loc[23, 'a']	# 獲取值
df.decribe().loc['25%', 'col']

df.set_index(Series, inplace-True)

df['col'].value_counts()['value1']
df['col'].value_counts().sort_values()
df['col'].value_counts().sort_index()

pd.Series(value array, index=[indices])
pd.concat([df1, df2], axis=1)

選擇多行和多列#

df.loc[index, col_name]	# 按標籤
df.iloc[:, [0, 3]]	# 按索引

df= pd.read_csv(path, index_col='col1')
df.ix['row_label', 0]	# 混合標籤和索引（不建議）
df.ix[1, 'col_label']
df.ix['col1':'col3', 0:2]

使數據框更小更快#

df.memory_usage(deep=True)
df['col'].astype('category')
df['col'] = df['col'].astype('category', categories=['good', 'veray good', 'excellent', ordered=True])

pandas 與 scikit-learn#

pd.DataFrame({'id':array1, 'col':array2}).set_index('id').to_csv('1.csv')
df.to_pickle('1.pkl')
df.read_pickle('1.pkl')

loc 和 iloc#

df.loc[0:4, :]	# 包含標籤
df.iloc[0:4, :]	# 排除索引

大型 csv#

df.sample(n=3, random_state=42)
df.sample(frac=0.75)
train = df.sample()
test = df.loc[~df.index.isin(train.index), :]

虛擬變量#

df['sex_male'] = df['sex'].map({'female':0, 'male':1})
pd.get_dummies(df['sex'], prefix='sex')

日期和時間#

df['time'] = pd.to_datetime(df['time'])
df['time'].dt.weekday_name
df['time'].dt.dayofyear
df.loc[df['time'] > pd.to_datetime('1/1/1999'), :]
(df['time'].max() - df['time'].min()).days

查找和刪除重複行#

df['col'].duplicated().sum()
df.loc[df.duplicated(keep='first'), :]	# 'last' False
df.duplicated(subset=['col1', 'col2'])

避免 SettingWithCopyWarning#

不確定這是副本還是視圖。

更改顯示選項#

pd.set_option('display.max_rows', None)
pd.reset_option('display.max_rows', None)
pd.set_option('display.max_colwidth, None)
pd.set_option('display.precision, 2)
df['new_col'] = df['col'] * 1000
pd.set_option('display.float_format', '{:,}'.format)

從其他對象創建 pandas DataFrame#

pd.DataFrame(dict, columns=['a', 'b'])
s = pd.Series([1, 2], index=['c', 'd'], name=shape)

將函數應用於 pandas 系列或數據框#

df['col1'] = df['col2'].apply(function)
df['col1'] = df['col2'].apply(np.ceil)
df['col'].apply(lambda x: x[0])
df['col1'] = df['col2'].apply(np.argmax, axis=1)

多重索引#

ser = df.groupby(['col1', 'col2']).col3.mean()	# 帶有多重索引的系列
ser.unstack()	# 數據框
df.pivot_table(values='col1', index='col2', columns='col3')

df.set_index(['col1', 'col2'], inplace=True)
df.sort_index(inplace=True)

合併數據框#

pd.concat([df1, df2])
pd.merge([df1, df2])

pd.merge(df1, df2, left_on='col1', right_on='col2')
pd.merge(df1, df2, how='inner')	# outer left right