跬步 On Coding

python玩转数据笔记:数据处理

时间索引

>>> import pandas as pd
>>> index = pd.date_range('20160503', periods=5)
>>> index
DatetimeIndex(['2016-05-03', '2016-05-04', '2016-05-05', '2016-05-06',
               '2016-05-07'],
              dtype='datetime64[ns]', freq='D')

数据显示

>>> import numpy as np
>>> dates = pd.DataFrame(np.random.randn(5,3),index=index,columns=list('ABC'))
>>> dates
                   A         B         C
2016-05-03  0.320522 -0.854196 -0.477794
2016-05-04 -0.736698  0.220931  0.878340
2016-05-05  1.002439 -0.422472 -1.027093
2016-05-06  1.792256  0.322563  0.285416
2016-05-07 -2.291344  0.633183  1.120919
>>> dates.index # 获取索引
DatetimeIndex(['2016-05-03', '2016-05-04', '2016-05-05', '2016-05-06',
               '2016-05-07'],
              dtype='datetime64[ns]', freq='D')
>>> dates.columns # 列名
Index([u'A', u'B', u'C'], dtype='object')
>>> dates.values # 返回数组,从这里可以看出来DataFrame与Series都是以ndarray为基础的
array([[ 0.32052247, -0.85419619, -0.47779368],
       [-0.73669815,  0.22093113,  0.87834037],
       [ 1.00243859, -0.42247204, -1.02709299],
       [ 1.79225566,  0.32256317,  0.28541609],
       [-2.2913443 ,  0.63318336,  1.12091912]])
>>> dates.describe # 描述
<bound method DataFrame.describe of                    A         B         C
2016-05-03  0.320522 -0.854196 -0.477794
2016-05-04 -0.736698  0.220931  0.878340
2016-05-05  1.002439 -0.422472 -1.027093
2016-05-06  1.792256  0.322563  0.285416
2016-05-07 -2.291344  0.633183  1.120919>

数据筛选

>>> dates.head(1) # 切片,前一行
                   A         B         C
2016-05-03  0.320522 -0.854196 -0.477794
>>> dates.tail(1) # 最后一行
                   A         B         C
2016-05-07 -2.291344  0.633183  1.120919
>>> dates['2016-05-03':'2016-05-05'] # 通过索引切片
                   A         B         C
2016-05-03  0.320522 -0.854196 -0.477794
2016-05-04 -0.736698  0.220931  0.878340
2016-05-05  1.002439 -0.422472 -1.027093
>>> dates.loc['2016-05-03':'2016-05-05', ['A', 'B']] # 区域选择,定义索引,字段名
                   A         B
2016-05-03  0.320522 -0.854196
2016-05-04 -0.736698  0.220931
2016-05-05  1.002439 -0.422472
>>> dates = pd.DataFrame(np.random.randn(5,3))
>>> dates
          0         1         2
0 -0.814773  0.828948  1.279713
1  1.010236 -1.063719 -1.010884
2 -0.206223 -1.756636 -0.777366
3 -0.253049  0.955689 -1.735237
4 -1.269495 -0.321743  0.929069
>>> dates.at[0,0] # 定位到单元格
-0.81477290050615703
>>> dates.iloc[1:4,[1,]] # 通过数字切片loc
                   B
2016-05-04 -1.002059
2016-05-05 -0.298642
2016-05-06 -1.042664
>>> dates.iat[1,1] # 数字索引定位到单元格
-1.0020589880335706
>>> dates[dates.index > '2016-05-05'] # 条件筛选
                   A         B         C
2016-05-06 -0.334060 -1.042664  0.999093
2016-05-07 -2.051357  0.713025 -0.792198
>>> dates[(dates.index > '2016-05-05') & (dates.A < 0)] # 组合筛选
                   A         B         C
2016-05-06 -0.334060 -1.042664  0.999093
2016-05-07 -2.051357  0.713025 -0.792198
>>> dates[dates.A > 0].B # 返回Series
2016-05-06   -0.139405
2016-05-07   -1.437701
Freq: D, Name: B, dtype: float64
>>> dates[dates.A>0].B.mean() # 计算平均值
-0.78855298332911494
>>> len(dates) # 行数
5
>>> import numpy as np
>>> np.sign(np.diff(dates.A))
array([ 1., -1.,  1., -1.])
>>> status = np.sign(np.diff(dates.A))
>>> status[np.where(status == 1.)].size # 算变更类型未1的数量
2
>>> dates.sort(columns='A') # 排序
__main__:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
                   A         B         C
2016-05-03 -0.695635  0.077077  0.694779
2016-05-05 -0.641996  1.420200  0.086330
2016-05-04 -0.594039 -0.842575 -0.046909
2016-05-07  0.667998 -1.437701 -0.196523
2016-05-06  1.176916 -0.139405 -0.592849
>>> dates.groupby('A').count() # 分组,分组的字段作为索引
           B  C
A              
-0.695635  1  1
-0.641996  1  1
-0.594039  1  1
 0.667998  1  1
 1.176916  1  1
>>> dates.groupby('A').sum() # 分组计算和
                  B         C
A                            
-0.695635  0.077077  0.694779
-0.641996  1.420200  0.086330
-0.594039 -0.842575 -0.046909
 0.667998 -1.437701 -0.196523
 1.176916 -0.139405 -0.592849

sum

数据合并

>>> bdates = pd.DataFrame(np.random.randn(2,3),columns=list('ABC'))
>>> bdates
          A         B         C
0 -0.043208  0.332842 -0.390034
1 -0.318745  0.013284  0.147836
>>> dates
                   A         B         C
2016-05-03 -0.695635  0.077077  0.694779
2016-05-04 -0.594039 -0.842575 -0.046909
2016-05-05 -0.641996  1.420200  0.086330
2016-05-06  1.176916 -0.139405 -0.592849
2016-05-07  0.667998 -1.437701 -0.196523
>>> dates.append(bdates) # 追加
                            A         B         C
2016-05-03 00:00:00 -0.695635  0.077077  0.694779
2016-05-04 00:00:00 -0.594039 -0.842575 -0.046909
2016-05-05 00:00:00 -0.641996  1.420200  0.086330
2016-05-06 00:00:00  1.176916 -0.139405 -0.592849
2016-05-07 00:00:00  0.667998 -1.437701 -0.196523
0                   -0.043208  0.332842 -0.390034
1                   -0.318745  0.013284  0.147836
>>> pieces = [dates[:2], dates[len(dates)-1:]
>>> pd.concat(pieces) # 合并片段
                   A         B         C
2016-05-03 -0.695635  0.077077  0.694779
2016-05-04 -0.594039 -0.842575 -0.046909
2016-05-07  0.667998 -1.437701 -0.196523
>>> pd.concat(pieces, ignore_index=True) # 忽略不想同的字段,用NaN填充
          A         B         C
0 -0.695635  0.077077  0.694779
1 -0.594039 -0.842575 -0.046909
2  0.667998 -1.437701 -0.196523
>>> pd.merge(dates, bdates, on='A') # join合并,类比sql语句
Empty DataFrame
Columns: [A, B_x, C_x, B_y, C_y]
Index: []