导包
import numpy as np
import pandas as pd
创建对象
创建一个Series
Series是一维带标签的数组,能够保存任何数据类型(整数、字符串、浮点数、Python对象等)。轴标签统称为索引。创建Series的基本方法是调用:
s = pd.Series(data, index=index)
在这里,data可以有很多不同的东西:
- Python字典
- ndarray
- 标量值(如5)
In [3]: s = pd.Series([1, 3, 5, np.nan, 6, 8])
In [4]: s
Out[4]:
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
创建一个DataFrame
通过传递带有日期时间索引和带标签的列的NumPy数组来创建:
In [5]: dates = pd.date_range('20130101', periods=6)
In [6]: dates
Out[6]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
In [7]: df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
In [8]: df
Out[8]:
A B C D
2013-01-01 0.469112 -0.282863 -1.509059 -1.135632
2013-01-02 1.212112 -0.173215 0.119209 -1.044236
2013-01-03 -0.861849 -2.104569 -0.494929 1.071804
2013-01-04 0.721555 -0.706771 -1.039575 0.271860
2013-01-05 -0.424972 0.567020 0.276232 -1.087401
2013-01-06 -0.673690 0.113648 -1.478427 0.524988
通过传递对象的字典来创建,这些对象可以转换为类似序列的对象。
In [9]: df2 = pd.DataFrame({'A': 1.,
...: 'B': pd.Timestamp('20130102'),
...: 'C': pd.Series(1, index=list(range(4)), dtype='float32'),
...: 'D': np.array([3] * 4, dtype='int32'),
...: 'E': pd.Categorical(["test", "train", "test", "train"]),
...: 'F': 'foo'})
...:
In [10]: df2
Out[10]:
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
查看数据
.head(i) #查看前i个数据
.tail(i) #查看后i个数据
.index #查看索引
.columns #查看表头
.to_numpy() #转换为numpy数组
.describe() #显示数据的快速统计摘要
.T #转置数据
.sort_index(axis=,ascending=)
#参数axis只有两个取值:0和1。df有两个索引:表最左一列的时间(行索引)和表最上一行的ABCD(列标签)。axis=0表示按左边一列的行索引排序,axis=1表示按最上一行的列标签排序;ascending=False代表降序,ascending=True代表升序
.sort_values(by=) #按by指定的列的值排序
选择
df['A'] #选出表头为A的那列
df[0:3] #选出前三行
df['20130102':'20130104'] #选出20130102到20130104之间的行(标签切片,两端都包含)
df.loc[dates[0]] #使用标签获取
df.loc['20130102':'20130104', ['A', 'B']] #通过行列选区
df.loc[dates[0], 'A']#同下
df.at[dates[0], 'A'] #快速访问标量
df.iloc[3] #通过传递整数的位置进行选择
df.iloc[3:5,0:2] #通过整数切片选取:第3~4行、第0~1列
df.iloc[[1,2,4],[0,2]] #通过整数位置
df.iloc[1:3,:] #切片行
df.iloc[:,1:3] #切片列
df.iloc[1,1] #根据坐标获取
df.iat[1,1] #同上
df[df.A>0] #选出A列大于0的值
df[df>0] #选出所有大于0的值
df[df['a'].isin(['1','2'])] #选出a列的值属于给定列表的行
设置值
df['F'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6)) #设置新列时,数据按索引自动对齐
df.at[dates[0],'A']=0 #通过标签设置
df.iat[0,1]=0 #通过坐标设置
df.loc[:,'D'] = np.array([1,2,3,4,5,6]) #通过numpy数组设置
df2[df2>0]=-df2 #按条件设置:把所有大于0的值替换为它的相反数(df2需为纯数值表,如df2 = df.copy())
处理缺失数据
df1 = df.reindex(index=dates[0:4],columns=list(df.columns)+['E']) #加一列
df1.loc[dates[0]:dates[1],'E'] = 1 #E列前两行设置为1
df1.dropna(how='any') #删掉所有有丢失数据的行
df1.fillna(value=5) #填充缺失数据
pd.isna(df1) #值空返回True 不空返回False
操作
df.mean() #按列求平均值,等价于df.mean(0)
df.mean(1) #按行求平均值
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2) #shift(2)把数据整体向后移动2个位置,索引不变,前两个位置补NaN
df.sub(s, axis='index') #对不同维度的 pandas 对象进行减法操作
df.apply(np.cumsum) #将函数应用于数据
.value_counts() #统计每个值出现过几次
.str.lower() #把字符串转换为小写
合并
# Split a frame into row slices and glue them back together with concat.
df3 = pd.DataFrame(np.random.randn(10, 4))
pieces = [df3[:3], df3[3:7], df3[7:]]
rejoined = pd.concat(pieces)  # rows 0-2, 3-6 and 7-9 stacked back in order

# SQL-style join on the shared 'key' column.
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
print(pd.merge(left, right, on='key'))

# Append one row to a table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# row is converted to a one-row frame and concatenated instead.
df3 = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
print(df3)
# Take the row from df3 itself so this snippet does not depend on a frame
# defined elsewhere in the notes.
s3 = df3.iloc[3]
# concat returns a new frame, so the result has to be assigned back.
df3 = pd.concat([df3, s3.to_frame().T], ignore_index=True)
print(df3)
分组
“分组依据”是指涉及以下一个或多个步骤的过程:
根据某些标准将数据拆分成组
将函数独立地应用于每个组
将结果合并为一个数据结构
import pandas as pd
import numpy as np
# Sample table: two label columns (A, B) and two numeric columns (C, D).
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
print(df)

# Group by A and sum each group.
# The numeric columns are selected explicitly: since pandas 2.0 sum() no
# longer drops non-numeric columns, so without the selection the string
# column B would be concatenated into the result.
sum_by_a = df.groupby('A')[['C', 'D']].sum()
print(sum_by_a)

# Grouping by several columns yields a hierarchical (MultiIndex) row index;
# the same sum is applied to each (A, B) group.
sum_by_a_b = df.groupby(['A', 'B'])[['C', 'D']].sum()
print(sum_by_a_b)
OUT:
A B C D
0 foo one -0.536516 -0.545869
1 bar one 0.149327 0.629276
2 foo two 1.206367 -2.037879
3 bar three -1.169908 -1.041128
4 foo two 1.040343 0.083255
5 bar two 0.498475 -1.011725
6 foo one -1.368482 -0.503150
7 foo three 1.088012 1.208393
C D
A
bar -0.522105 -1.423577
foo 1.429723 -1.795251
C D
A B
bar one 0.149327 0.629276
three -1.169908 -1.041128
two 0.498475 -1.011725
foo one -1.904999 -1.049019
three 1.088012 1.208393
two 2.246710 -1.954625
reshape
#stack
import numpy as np
import pandas as pd
# Build the (first, second) label pairs for a two-level row index:
# every outer label is paired with 'one' and then 'two'.
tuples = [(outer, inner)
          for outer in ('bar', 'baz', 'foo', 'qux')
          for inner in ('one', 'two')]
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

# Random 8x2 frame on that index; df2 keeps only the first four rows.
df = pd.DataFrame(np.random.randn(8, 2), columns=['A', 'B'], index=index)
df2 = df.iloc[:4]
print(df)
print(df2)

# stack() moves the column labels into a new innermost index level,
# turning the frame into a Series.
stacked = df2.stack()
print(stacked)

# unstack() is the inverse; by default it unstacks the innermost level.
print(stacked.unstack(level=-1))
# Unstack the 'second' level (position 1), then the 'first' level (position 0).
print(stacked.unstack(level=1))
print(stacked.unstack(level=0))
OUT:
A B
first second
bar one 0.704871 0.657547
two -0.029277 -0.890557
baz one -0.211965 1.745703
two -1.930054 -0.925681
foo one -2.387723 1.097615
two -1.963898 -0.425163
qux one 0.952847 0.873658
two -0.810983 1.119091
A B
first second
bar one 0.704871 0.657547
two -0.029277 -0.890557
baz one -0.211965 1.745703
two -1.930054 -0.925681
first second
bar one A 0.704871
B 0.657547
two A -0.029277
B -0.890557
baz one A -0.211965
B 1.745703
two A -1.930054
B -0.925681
dtype: float64
A B
first second
bar one 0.704871 0.657547
two -0.029277 -0.890557
baz one -0.211965 1.745703
two -1.930054 -0.925681
second one two
first
bar A 0.704871 -0.029277
B 0.657547 -0.890557
baz A -0.211965 -1.930054
B 1.745703 -0.925681
first bar baz
second
one A 0.704871 -0.211965
B 0.657547 1.745703
two A -0.029277 -1.930054
B -0.890557 -0.925681
#pivot_table换行列
import numpy as np
import pandas as pd
# Long-format sample data: A, B and C are label columns built by repeating
# short patterns to 12 rows; D and E hold random values.
df = pd.DataFrame(dict(
    A=['one', 'one', 'two', 'three'] * 3,
    B=['A', 'B', 'C'] * 4,
    C=['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    D=np.random.randn(12),
    E=np.random.randn(12),
))
print(df)

# Pivot: rows are indexed by (A, B), the distinct values of C become the
# columns, and the cells hold D. Combinations absent from the data are NaN.
print(df.pivot_table(values='D', index=['A', 'B'], columns=['C']))
OUT:
A B C D E
0 one A foo -0.705647 -0.983468
1 one B foo -0.319298 0.664083
2 two C foo 0.948117 -1.523873
3 three A bar 1.018841 0.616861
4 one B bar -2.029511 -0.407078
5 one C bar -0.054141 0.058718
6 two A foo 2.124821 -0.480747
7 three B foo -1.736714 -2.327147
8 one C foo -0.320425 1.191605
9 one A bar -1.113508 1.323916
10 two B bar 1.525207 -0.978258
11 three C bar -0.434330 0.780304
C bar foo
A B
one A -1.113508 -0.705647
B -2.029511 -0.319298
C -0.054141 -0.320425
three A 1.018841 NaN
B NaN -1.736714
C -0.434330 NaN
two A NaN 2.124821
B 1.525207 NaN
C NaN 0.948117