In [3]: df = pd.DataFrame(np.random.randn(10, 4))
In [4]: df.pct_change(periods=3)
Out[4]:
0 1 2 3
0 NaN NaN NaN NaN
1 NaN NaN NaN NaN
2 NaN NaN NaN NaN
3 -0.218320 -1.054001 1.987147 -0.510183
4 -0.439121 -1.816454 0.649715 -4.822809
5 -0.127833 -3.042065 -5.866604 -1.776977
6 -2.596833 -1.959538 -2.111697 -3.798900
7 -0.117826 -2.169058 0.036094 -0.067696
8 2.492606 -1.357320 -1.205802 -1.558697
9 -1.012977 2.324558 -1.003744 -0.371806
In [5]: s1 = pd.Series(np.random.randn(1000))
In [6]: s2 = pd.Series(np.random.randn(1000))
In [7]: s1.cov(s2)
Out[7]: 0.0006801088174310875
同样地,DataFrame.cov() 会计算 DataFrame 中各列(Series)两两之间的协方差,并且同样会忽略 NaN 数据。
In [8]: frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"])
In [9]: frame.cov()
Out[9]:
a b c d e
a 1.000882 -0.003177 -0.002698 -0.006889 0.031912
b -0.003177 1.024721 0.000191 0.009212 0.000857
c -0.002698 0.000191 0.950735 -0.031743 -0.005087
d -0.006889 0.009212 -0.031743 1.002983 -0.047952
e 0.031912 0.000857 -0.005087 -0.047952 1.042487
In [10]: frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"])
In [11]: frame.loc[frame.index[:5], "a"] = np.nan
In [12]: frame.loc[frame.index[5:10], "b"] = np.nan
In [13]: frame.cov()
Out[13]:
a b c
a 1.123670 -0.412851 0.018169
b -0.412851 1.154141 0.305260
c 0.018169 0.305260 1.301149
In [14]: frame.cov(min_periods=12)
Out[14]:
a b c
a 1.123670 NaN 0.018169
b NaN 1.154141 0.305260
c 0.018169 0.305260 1.301149
In [15]: frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"])
In [16]: frame.iloc[::2] = np.nan
# Series with Series
In [17]: frame["a"].corr(frame["b"])
Out[17]: 0.013479040400098775
In [18]: frame["a"].corr(frame["b"], method="spearman")
Out[18]: -0.007289885159540637
# Pairwise correlation of DataFrame columns
In [19]: frame.corr()
Out[19]:
a b c d e
a 1.000000 0.013479 -0.049269 -0.042239 -0.028525
b 0.013479 1.000000 -0.020433 -0.011139 0.005654
c -0.049269 -0.020433 1.000000 0.018587 -0.054269
d -0.042239 -0.011139 0.018587 1.000000 -0.017060
e -0.028525 0.005654 -0.054269 -0.017060 1.000000
corr 同样也支持 min_periods 参数:
In [20]: frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"])
In [21]: frame.loc[frame.index[:5], "a"] = np.nan
In [22]: frame.loc[frame.index[5:10], "b"] = np.nan
In [23]: frame.corr()
Out[23]:
a b c
a 1.000000 -0.121111 0.069544
b -0.121111 1.000000 0.051742
c 0.069544 0.051742 1.000000
In [24]: frame.corr(min_periods=12)
Out[24]:
a b c
a 1.000000 NaN 0.069544
b NaN 1.000000 0.051742
c 0.069544 0.051742 1.000000
corrwith 可以计算两个不同 DataFrame 中标签相同的列(或行)之间的相关系数。
In [27]: index = ["a", "b", "c", "d", "e"]
In [28]: columns = ["one", "two", "three", "four"]
In [29]: df1 = pd.DataFrame(np.random.randn(5, 4), index=index, columns=columns)
In [30]: df2 = pd.DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns)
In [31]: df1.corrwith(df2)
Out[31]:
one -0.125501
two -0.493244
three 0.344056
four 0.004183
dtype: float64
In [32]: df2.corrwith(df1, axis=1)
Out[32]:
a -0.675817
b 0.458296
c 0.190809
d -0.186275
e NaN
dtype: float64
rank 等级
rank 方法可以计算 Series 中每个数据的等级(排名)。什么叫等级呢?我们举个例子:
In [50]: s = pd.Series(np.random.randn(5), index=list("abcde"))
In [51]: s
Out[51]:
a 0.336259
b 1.073116
c -0.402291
d 0.624186
e -0.422478
dtype: float64
In [52]: s["d"] = s["b"] # so there's a tie
In [53]: s
Out[53]:
a 0.336259
b 1.073116
c -0.402291
d 1.073116
e -0.422478
dtype: float64
In [54]: s.rank()
Out[54]:
a 3.0
b 4.5
c 2.0
d 4.5
e 1.0
dtype: float64