讀取表格數據文件#
cols = ['col1', 'col2']
pd.read_table(path, sep='|', headers=None, names=cols)
pd.head() # 前五行
以括號結尾的命令#
df.describe() # 顯示平均值、標準差、最大值、最小值...
重命名列#
df.rename(columns = {'old_col_name':'new_col_name'}, inplace=True)
cols = new_col_list
df.columns = cols
df.columns = df.columns.str.replace(' ', '_')
刪除列#
df.drop('col', axis=1, inplace=True)
df.drop([rows], axis=0, inplace=True)
排序#
df['col'].sort_values(ascending=False) # 系列
df.sort_values('col') # 返回數據框
df.sort_values(['col1', 'col2'])
過濾 pandas DataFrame 的行#
pd.Series([])
df[df['col'] >= 200]
# df[df['col'] >= 200]['col2']
df.loc[df['col'] >= 200, 'col2'] # loc 根據標籤選擇列或行
多重過濾條件#
df[(df['col'] >= 200) | (df['col1'] == 'a')]
df['col'].isin(['a', 'b', 'c'])
讀取兩列#
pd.read_csv(path, usecols=['a', 'b'])
pd.read_csv(path, nrows=3)
迭代#
for c in df['col']:
print(c)
for index, row in df.iterrows():
print(index, row['a'], row['b'])
過濾數值列#
import numpy as np
df.select_dtypes(include=[np.number]).dtypes
描述方法#
df.describe(include=['object', 'float64'])
"axis" 參數#
df.mean() # 系列的平均值
df.mean(axis=1) # 行平均
字串方法#
df['col'].str.upper() # 大寫
df['col'].str.contains('str') # 返回 t/f 系列
df[df['col'].str.contains('str')]
df['col'].str.replace('a', 'b').str.replace('c', 'd') # 支持正則表達式
更改數據類型#
df.dtypes
serires.dtype
df['col'] = df['col'].astype('float64')
df = pd.read_csv(path, dtype=('col1':float))
df['col'].str.replace('$', '').astype(float).mean()
groupby#
df.groupby('col1').mean()
df.groupby('col1')['col2'].mean()
df.groupby('col1')['col2'].agg(['count', 'min', 'max', 'mean'])
%matplotlib inline
df.groupby('col1')['col2'].mean().plot(kind='bar')
pandas 系列#
df['col'].describe()
df['col'].value_counts()
df['col'].value_counts(normalize=True)
df['col'].unique()
df['col'].nunique() # 唯一值的數量
pd.crosstab(df['col1'], df['col2'])
df['col1'].plot(kind='hist')
處理缺失值#
df.tail()
df.isnull().head()
df.isnull().sum()
df.dropna(how='any', inplace=True) # 如果包含空值則刪除行
df.dropna(subset=['col1', 'col2'], how='any') # 如果 col1 或 col2 為空則刪除行
df['col'].value_counts(dropna=False)
df['col'].fillna(value='A', inplace=True)
pandas 索引#
df.index
df.columns
df.shape
df.loc[23, 'a'] # 獲取值
df.decribe().loc['25%', 'col']
df.set_index(Series, inplace-True)
df['col'].value_counts()['value1']
df['col'].value_counts().sort_values()
df['col'].value_counts().sort_index()
pd.Series(value array, index=[indices])
pd.concat([df1, df2], axis=1)
選擇多行和多列#
df.loc[index, col_name] # 按標籤
df.iloc[:, [0, 3]] # 按索引
df= pd.read_csv(path, index_col='col1')
df.ix['row_label', 0] # 混合標籤和索引(不建議)
df.ix[1, 'col_label']
df.ix['col1':'col3', 0:2]
使數據框更小更快#
df.memory_usage(deep=True)
df['col'].astype('category')
df['col'] = df['col'].astype('category', categories=['good', 'veray good', 'excellent', ordered=True])
pandas 與 scikit-learn#
pd.DataFrame({'id':array1, 'col':array2}).set_index('id').to_csv('1.csv')
df.to_pickle('1.pkl')
df.read_pickle('1.pkl')
loc 和 iloc#
df.loc[0:4, :] # 包含標籤
df.iloc[0:4, :] # 排除索引
大型 csv#
df.sample(n=3, random_state=42)
df.sample(frac=0.75)
train = df.sample()
test = df.loc[~df.index.isin(train.index), :]
虛擬變量#
df['sex_male'] = df['sex'].map({'female':0, 'male':1})
pd.get_dummies(df['sex'], prefix='sex')
日期和時間#
df['time'] = pd.to_datetime(df['time'])
df['time'].dt.weekday_name
df['time'].dt.dayofyear
df.loc[df['time'] > pd.to_datetime('1/1/1999'), :]
(df['time'].max() - df['time'].min()).days
查找和刪除重複行#
df['col'].duplicated().sum()
df.loc[df.duplicated(keep='first'), :] # 'last' False
df.duplicated(subset=['col1', 'col2'])
避免 SettingWithCopyWarning#
不確定這是副本還是視圖。
更改顯示選項#
pd.set_option('display.max_rows', None)
pd.reset_option('display.max_rows', None)
pd.set_option('display.max_colwidth, None)
pd.set_option('display.precision, 2)
df['new_col'] = df['col'] * 1000
pd.set_option('display.float_format', '{:,}'.format)
從其他對象創建 pandas DataFrame#
pd.DataFrame(dict, columns=['a', 'b'])
s = pd.Series([1, 2], index=['c', 'd'], name=shape)
將函數應用於 pandas 系列或數據框#
df['col1'] = df['col2'].apply(function)
df['col1'] = df['col2'].apply(np.ceil)
df['col'].apply(lambda x: x[0])
df['col1'] = df['col2'].apply(np.argmax, axis=1)
多重索引#
ser = df.groupby(['col1', 'col2']).col3.mean() # 帶有多重索引的系列
ser.unstack() # 數據框
df.pivot_table(values='col1', index='col2', columns='col3')
df.set_index(['col1', 'col2'], inplace=True)
df.sort_index(inplace=True)
合併數據框#
pd.concat([df1, df2])
pd.merge([df1, df2])
pd.merge(df1, df2, left_on='col1', right_on='col2')
pd.merge(df1, df2, how='inner') # outer left right