In [2]:
import numpy as np
import pandas as pd
import datetime as dt

pd.set_option('display.max_rows', 16)

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)
import seaborn as sns

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

import gc
In [3]:
plt.rcParams['figure.figsize'] = (16.0, 9.0)

数据处理

财务数据

财务数据处理的难点在于“报表数据所处的时间”、“报表报告的时间”、“报表修改时间”带来的复杂性。两种处理方式比较合理:

  1. 预留充足的时间以便在使用报表数据的时间点上,报表数据是可用的(但不一定是最新的)
  2. 无论在哪个时间点上使用报表数据,都只用最新的数据(point-in-time)

交易数据

停牌

  • 停牌在某些时候是可以不处理的,比如计算动量的时候,停牌之后的价格和停牌前的价格计算收益率,可以作为动量的一种衡量
  • 但在有的时候,停牌不处理可能会有问题。
    • 比如计算beta,市场收益率每个交易日都是有的,但个股停牌的时候没有,此时如果设为0,直接回归会有大的偏差
    • 另外比如计算波动率,如果设为0,也有问题
    • 从收益率的角度看,如果我们关注点是月收益率,也应当去掉,因为停牌的股票无法交易,也无法调仓
  • 我们把停牌超过一个月的观测值删去

Data

In [4]:
START = '2007-01-01'
END = '2023-03-31'
In [5]:
# Security Id
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
cond1 = (stk_info['exchangeCD'] == 'XSHE') | (stk_info['exchangeCD'] == 'XSHG')
cond2 = (stk_info['listStatusCD'] == 'L') | (stk_info['listStatusCD'] == 'DE')
stk_info = stk_info[cond1 & cond2].copy()
stk_id = stk_info['secID'].unique()
In [6]:
stk_info
Out[6]:
secID ticker secShortName cnSpell exchangeCD assetClass listStatusCD listDate transCurrCD ISIN partyID delistDate
0 000001.XSHE 000001 平安银行 PAYH XSHE E L 1991-04-03 CNY CNE000000040 2.0 NaN
1 000002.XSHE 000002 万科A WKA XSHE E L 1991-01-29 CNY CNE0000000T2 3.0 NaN
2 000003.XSHE 000003 PT金田A PTJTA XSHE E DE 1991-07-03 CNY CNE1000031Y5 4.0 2002-06-14
3 000004.XSHE 000004 ST国华 STGH XSHE E L 1991-01-14 CNY CNE0000000Y2 5.0 NaN
4 000005.XSHE 000005 ST星源 STXY XSHE E L 1990-12-10 CNY CNE0000001L7 6.0 NaN
5 000006.XSHE 000006 深振业A SZYA XSHE E L 1992-04-27 CNY CNE000000164 7.0 NaN
6 000007.XSHE 000007 全新好 QXH XSHE E L 1992-04-13 CNY CNE0000000P0 8.0 NaN
7 000008.XSHE 000008 神州高铁 SZGT XSHE E L 1992-05-07 CNY CNE0000001C6 9.0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
25333 900950.XSHG 900950 新城B股 XCBG XSHG E DE 1997-10-16 USD CNE000000TH1 1429.0 2015-11-23
25334 900951.XSHG 900951 退市大化 TSDH XSHG E DE 1997-10-21 USD CNE000000TJ7 1430.0 2020-08-27
25335 900952.XSHG 900952 锦港B股 JGBG XSHG E L 1998-05-19 USD CNE000000W88 763.0 NaN
25336 900953.XSHG 900953 凯马B KMB XSHG E L 1998-06-24 USD CNE000000WP8 1431.0 NaN
25337 900955.XSHG 900955 退市海B TSHB XSHG E DE 1999-01-18 USD CNE000000YC2 1063.0 2022-07-13
25338 900956.XSHG 900956 东贝B股 DBBG XSHG E DE 1999-07-15 USD CNE000000ZS5 1432.0 2020-11-23
25339 900957.XSHG 900957 凌云B股 LYBG XSHG E L 2000-07-28 USD CNE0000013W9 1433.0 NaN
29888 DY600018.XSHG DY600018 上港集箱 SGJX XSHG E DE 2000-07-19 CNY NaN 618.0 2006-10-20

5239 rows × 12 columns

In [7]:
len(stk_id)
Out[7]:
5239

ST

In [8]:
st_df = DataAPI.SecSTGet(beginDate=START,endDate=END,secID=stk_id,field=['secID','tradeDate','STflg'],pandas="1")
In [9]:
st_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552823 entries, 0 to 552822
Data columns (total 3 columns):
secID        552823 non-null object
tradeDate    552823 non-null object
STflg        552823 non-null object
dtypes: object(3)
memory usage: 12.7+ MB
In [10]:
st_df
Out[10]:
secID tradeDate STflg
0 000001.XSHE 2007-01-04 S
1 000001.XSHE 2007-01-05 S
2 000001.XSHE 2007-01-08 S
3 000001.XSHE 2007-01-09 S
4 000001.XSHE 2007-01-10 S
5 000001.XSHE 2007-01-11 S
6 000001.XSHE 2007-01-12 S
7 000001.XSHE 2007-01-15 S
... ... ... ...
552815 900955.XSHG 2022-06-06 *ST
552816 900955.XSHG 2022-06-07 *ST
552817 900955.XSHG 2022-06-08 *ST
552818 900955.XSHG 2022-06-09 *ST
552819 900955.XSHG 2022-06-10 *ST
552820 900955.XSHG 2022-06-13 *ST
552821 900955.XSHG 2022-06-14 *ST
552822 900955.XSHG 2022-06-15 *ST

552823 rows × 3 columns

In [11]:
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'],format="%Y-%m-%d")

Book value

Book/Market ratio, 简称BM,也即价值因子,反映了公司的账面价值和市值的比值。Fama French (1993) 发现估值低(BM高)的股票和高的相比,预期收益为正。

BM ratio Fama-French(1993) 原文的构造方法:

  • 每年的12月底的 book equity
  • 每年12月最后一个交易日的mktcap
  • 上述二者相除,得到 BM ratio
  • 这个 BM ratio 作为下一年6月至下下一年5月的 portfolio 的 sorting variable

处理思路:

  • 优矿的数据有发布日期,数据日期
  • 这里book value比较简单,只取年报数据,也就是“数据日期”都是12月
  • 取发布日期最晚,也就是最新的(也许年报和1季报中数据不同,或者年报发布后马上有更改),但不晚于次年6月
In [12]:
# fundmen_df = DataAPI.FdmtBSGet(secID=stk_id,reportType="A",beginDate=START,endDate=END,publishDateEnd=u"",publishDateBegin=u"",endDateRep="",beginDateRep="",beginYear="",endYear="",fiscalPeriod="",field=["secID","publishDate","endDate","endDateRep","actPubtime","fiscalPeriod","TShEquity","TEquityAttrP","minorityInt"],pandas="1")

# fundmen_df.to_pickle('./data/fundmen_df.pkl')
In [13]:
fundmen_df = pd.read_pickle('./data/fundmen_df.pkl')
In [14]:
fundmen_df
Out[14]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
0 000001.XSHE 2022-10-25 2021-12-31 2022-09-30 2022-10-24 20:52:23 12 3.954480e+11 3.954480e+11 NaN
1 000001.XSHE 2022-08-18 2021-12-31 2022-06-30 2022-08-17 18:30:42 12 3.954480e+11 3.954480e+11 NaN
2 000001.XSHE 2022-04-27 2021-12-31 2022-03-31 2022-04-26 17:45:15 12 3.954480e+11 3.954480e+11 NaN
3 000001.XSHE 2022-03-10 2021-12-31 2021-12-31 2022-03-09 17:58:07 12 3.954480e+11 3.954480e+11 NaN
4 000001.XSHE 2022-03-10 2020-12-31 2021-12-31 2022-03-09 17:58:07 12 3.641310e+11 3.641310e+11 NaN
5 000001.XSHE 2021-10-21 2020-12-31 2021-09-30 2021-10-20 17:39:15 12 3.641310e+11 3.641310e+11 NaN
6 000001.XSHE 2021-08-20 2020-12-31 2021-06-30 2021-08-19 17:20:35 12 3.641310e+11 3.641310e+11 NaN
7 000001.XSHE 2021-04-21 2020-12-31 2021-03-31 2021-04-20 17:54:36 12 3.641310e+11 3.641310e+11 NaN
... ... ... ... ... ... ... ... ... ...
275342 900957.XSHG 2009-08-01 2008-12-31 2009-06-30 2009-07-31 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
275343 900957.XSHG 2009-04-18 2008-12-31 2009-03-31 2009-04-17 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
275344 900957.XSHG 2009-03-26 2008-12-31 2008-12-31 2009-03-25 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
275345 900957.XSHG 2009-03-26 2007-12-31 2008-12-31 2009-03-25 18:00:00 12 4.363166e+08 3.769447e+08 59371874.07
275346 900957.XSHG 2008-10-24 2007-12-31 2008-09-30 2008-10-23 18:00:00 12 4.363166e+08 3.769447e+08 59371874.07
275347 900957.XSHG 2008-08-25 2007-12-31 2008-06-30 2008-08-24 18:00:00 12 4.363166e+08 3.769447e+08 59371874.07
275348 900957.XSHG 2008-04-24 2007-12-31 2008-03-31 2008-04-23 18:00:00 12 4.363166e+08 3.769447e+08 59371874.07
275349 900957.XSHG 2008-04-08 2007-12-31 2007-12-31 2008-04-07 18:00:00 12 4.363166e+08 3.769447e+08 59371874.07

275350 rows × 9 columns

In [15]:
fundmen_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275350 entries, 0 to 275349
Data columns (total 9 columns):
secID           275350 non-null object
publishDate     275350 non-null object
endDate         275350 non-null object
endDateRep      275350 non-null object
actPubtime      275350 non-null object
fiscalPeriod    275350 non-null object
TShEquity       273761 non-null float64
TEquityAttrP    273755 non-null float64
minorityInt     208628 non-null float64
dtypes: float64(3), object(6)
memory usage: 18.9+ MB
In [16]:
fundmen_df['fiscalPeriod'].unique()
Out[16]:
array(['12'], dtype=object)
In [17]:
fundmen_df[(fundmen_df['secID']=='000001.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
Out[17]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
9 000001.XSHE 2021-02-02 2019-12-31 2020-12-31 2021-02-01 18:58:35 12 3.129830e+11 3.129830e+11 NaN
10 000001.XSHE 2020-10-22 2019-12-31 2020-09-30 2020-10-21 19:21:43 12 3.129830e+11 3.129830e+11 NaN
11 000001.XSHE 2020-08-28 2019-12-31 2020-06-30 2020-08-27 17:50:41 12 3.129830e+11 3.129830e+11 NaN
12 000001.XSHE 2020-04-21 2019-12-31 2020-03-31 2020-04-20 18:42:38 12 3.129830e+11 3.129830e+11 NaN
13 000001.XSHE 2020-02-14 2019-12-31 2019-12-31 2020-02-13 19:02:36 12 3.129830e+11 3.129830e+11 NaN
In [18]:
fundmen_df[(fundmen_df['secID']=='300720.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
Out[18]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
147648 300720.XSHE 2021-04-27 2019-12-31 2020-12-31 2021-04-26 18:40:28 12 4.783596e+08 4.783596e+08 NaN
147649 300720.XSHE 2020-10-30 2019-12-31 2020-09-30 2020-10-29 19:58:45 12 4.783596e+08 4.783596e+08 NaN
147650 300720.XSHE 2020-08-28 2019-12-31 2020-06-30 2020-08-27 22:42:40 12 4.783596e+08 4.783596e+08 NaN
147651 300720.XSHE 2020-04-24 2019-12-31 2019-12-31 2020-04-23 21:04:35 12 4.783596e+08 4.783596e+08 NaN
147652 300720.XSHE 2020-04-24 2019-12-31 2020-03-31 2020-04-23 21:04:35 12 4.783596e+08 4.783596e+08 NaN
In [19]:
fundmen_df[(fundmen_df['secID'] == '300720.XSHE') & (fundmen_df['endDate']=='2018-12-31')]
Out[19]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
147653 300720.XSHE 2020-04-24 2018-12-31 2019-12-31 2020-04-23 21:04:35 12 4.555515e+08 4.555515e+08 NaN
147654 300720.XSHE 2019-10-30 2018-12-31 2019-09-30 2019-10-29 19:22:34 12 4.555515e+08 4.555515e+08 NaN
147655 300720.XSHE 2019-08-28 2018-12-31 2019-06-30 2019-08-27 19:42:06 12 4.555515e+08 4.555515e+08 NaN
147656 300720.XSHE 2019-04-26 2018-12-31 2019-03-31 2019-04-25 23:27:06 12 4.555515e+08 4.555515e+08 NaN
147657 300720.XSHE 2019-04-26 2018-12-31 2018-12-31 2019-04-25 23:27:06 12 4.555515e+08 4.555515e+08 NaN
  • publishDate: 实际公告日期
  • endDate:数值所在日期
  • endDateRep:数值所在报表日期。03-31是一季报,06-30是半年报,09-30是三季报,12-31是年报。后面的报表可能会对初始值做修改。

比如,300720.XSHE在2020-04-24公布了数据截止至2019-12-31的报告,里面包含了数据截止至2018-12-31的报表数据。

300720.XSHE在2019-08-28公布了数据截止至2019-06-30的报告,里面包含了数据截止至2018-12-31的报表数据。

在t年6月分组时,应当取最新更新过的t-1年12月31日的Book数值。

In [20]:
fundmen_df.drop(['actPubtime','fiscalPeriod'],axis=1, inplace=True)
In [21]:
fundmen_df[['publishDate','endDate']] = fundmen_df[['publishDate','endDate']].apply(pd.to_datetime)
In [22]:
fundmen_df['pub_month'] = fundmen_df['publishDate'].dt.month
fundmen_df['pub_year'] = fundmen_df['publishDate'].dt.year
fundmen_df['data_year'] = fundmen_df['endDate'].dt.year
In [23]:
fundmen_df
Out[23]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
0 000001.XSHE 2022-10-25 2021-12-31 2022-09-30 3.954480e+11 3.954480e+11 NaN 10 2022 2021
1 000001.XSHE 2022-08-18 2021-12-31 2022-06-30 3.954480e+11 3.954480e+11 NaN 8 2022 2021
2 000001.XSHE 2022-04-27 2021-12-31 2022-03-31 3.954480e+11 3.954480e+11 NaN 4 2022 2021
3 000001.XSHE 2022-03-10 2021-12-31 2021-12-31 3.954480e+11 3.954480e+11 NaN 3 2022 2021
4 000001.XSHE 2022-03-10 2020-12-31 2021-12-31 3.641310e+11 3.641310e+11 NaN 3 2022 2020
5 000001.XSHE 2021-10-21 2020-12-31 2021-09-30 3.641310e+11 3.641310e+11 NaN 10 2021 2020
6 000001.XSHE 2021-08-20 2020-12-31 2021-06-30 3.641310e+11 3.641310e+11 NaN 8 2021 2020
7 000001.XSHE 2021-04-21 2020-12-31 2021-03-31 3.641310e+11 3.641310e+11 NaN 4 2021 2020
... ... ... ... ... ... ... ... ... ... ...
275342 900957.XSHG 2009-08-01 2008-12-31 2009-06-30 4.902596e+08 4.369354e+08 53324231.94 8 2009 2008
275343 900957.XSHG 2009-04-18 2008-12-31 2009-03-31 4.902596e+08 4.369354e+08 53324231.94 4 2009 2008
275344 900957.XSHG 2009-03-26 2008-12-31 2008-12-31 4.902596e+08 4.369354e+08 53324231.94 3 2009 2008
275345 900957.XSHG 2009-03-26 2007-12-31 2008-12-31 4.363166e+08 3.769447e+08 59371874.07 3 2009 2007
275346 900957.XSHG 2008-10-24 2007-12-31 2008-09-30 4.363166e+08 3.769447e+08 59371874.07 10 2008 2007
275347 900957.XSHG 2008-08-25 2007-12-31 2008-06-30 4.363166e+08 3.769447e+08 59371874.07 8 2008 2007
275348 900957.XSHG 2008-04-24 2007-12-31 2008-03-31 4.363166e+08 3.769447e+08 59371874.07 4 2008 2007
275349 900957.XSHG 2008-04-08 2007-12-31 2007-12-31 4.363166e+08 3.769447e+08 59371874.07 4 2008 2007

275350 rows × 10 columns

In [24]:
fundmen_df[(fundmen_df['secID']=='300720.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
Out[24]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
147648 300720.XSHE 2021-04-27 2019-12-31 2020-12-31 4.783596e+08 4.783596e+08 NaN 4 2021 2019
147649 300720.XSHE 2020-10-30 2019-12-31 2020-09-30 4.783596e+08 4.783596e+08 NaN 10 2020 2019
147650 300720.XSHE 2020-08-28 2019-12-31 2020-06-30 4.783596e+08 4.783596e+08 NaN 8 2020 2019
147651 300720.XSHE 2020-04-24 2019-12-31 2019-12-31 4.783596e+08 4.783596e+08 NaN 4 2020 2019
147652 300720.XSHE 2020-04-24 2019-12-31 2020-03-31 4.783596e+08 4.783596e+08 NaN 4 2020 2019
In [25]:
fundmen_df['pub_year'] - fundmen_df['data_year'] 
Out[25]:
0         1
1         1
2         1
3         1
4         2
5         1
6         1
7         1
         ..
275342    1
275343    1
275344    1
275345    2
275346    1
275347    1
275348    1
275349    1
Length: 275350, dtype: int64
In [26]:
fundmen_df
Out[26]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
0 000001.XSHE 2022-10-25 2021-12-31 2022-09-30 3.954480e+11 3.954480e+11 NaN 10 2022 2021
1 000001.XSHE 2022-08-18 2021-12-31 2022-06-30 3.954480e+11 3.954480e+11 NaN 8 2022 2021
2 000001.XSHE 2022-04-27 2021-12-31 2022-03-31 3.954480e+11 3.954480e+11 NaN 4 2022 2021
3 000001.XSHE 2022-03-10 2021-12-31 2021-12-31 3.954480e+11 3.954480e+11 NaN 3 2022 2021
4 000001.XSHE 2022-03-10 2020-12-31 2021-12-31 3.641310e+11 3.641310e+11 NaN 3 2022 2020
5 000001.XSHE 2021-10-21 2020-12-31 2021-09-30 3.641310e+11 3.641310e+11 NaN 10 2021 2020
6 000001.XSHE 2021-08-20 2020-12-31 2021-06-30 3.641310e+11 3.641310e+11 NaN 8 2021 2020
7 000001.XSHE 2021-04-21 2020-12-31 2021-03-31 3.641310e+11 3.641310e+11 NaN 4 2021 2020
... ... ... ... ... ... ... ... ... ... ...
275342 900957.XSHG 2009-08-01 2008-12-31 2009-06-30 4.902596e+08 4.369354e+08 53324231.94 8 2009 2008
275343 900957.XSHG 2009-04-18 2008-12-31 2009-03-31 4.902596e+08 4.369354e+08 53324231.94 4 2009 2008
275344 900957.XSHG 2009-03-26 2008-12-31 2008-12-31 4.902596e+08 4.369354e+08 53324231.94 3 2009 2008
275345 900957.XSHG 2009-03-26 2007-12-31 2008-12-31 4.363166e+08 3.769447e+08 59371874.07 3 2009 2007
275346 900957.XSHG 2008-10-24 2007-12-31 2008-09-30 4.363166e+08 3.769447e+08 59371874.07 10 2008 2007
275347 900957.XSHG 2008-08-25 2007-12-31 2008-06-30 4.363166e+08 3.769447e+08 59371874.07 8 2008 2007
275348 900957.XSHG 2008-04-24 2007-12-31 2008-03-31 4.363166e+08 3.769447e+08 59371874.07 4 2008 2007
275349 900957.XSHG 2008-04-08 2007-12-31 2007-12-31 4.363166e+08 3.769447e+08 59371874.07 4 2008 2007

275350 rows × 10 columns

In [27]:
# 每年6月底计算时,只能看到publishDate在6月之前的数值。
# 取 endDate 相同时,publishDate 最晚(但小于等于6)的那个数值
# 同时pub_year - data_year 不能大于1
fundmen_df['pub_month'] = fundmen_df['publishDate'].dt.month
fundmen_df['pub_year'] = fundmen_df['publishDate'].dt.year
fundmen_df['data_year'] = fundmen_df['endDate'].dt.year
fundmen_df = fundmen_df[fundmen_df['pub_year'] - fundmen_df['data_year'] == 1]
fundmen_df = fundmen_df[fundmen_df['pub_month'] <= 6]
fundmen_df.sort_values(['secID','endDate','publishDate'],inplace=True)
In [28]:
fundmen_df[(fundmen_df['secID']=='300720.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
Out[28]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
147651 300720.XSHE 2020-04-24 2019-12-31 2019-12-31 4.783596e+08 4.783596e+08 NaN 4 2020 2019
147652 300720.XSHE 2020-04-24 2019-12-31 2020-03-31 4.783596e+08 4.783596e+08 NaN 4 2020 2019
In [29]:
fundmen_df = fundmen_df.groupby(['secID','endDate'],as_index=False).first()
In [30]:
fundmen_df['bm_date'] = fundmen_df['endDate'].dt.to_period('M')
In [31]:
fundmen_df
Out[31]:
secID endDate publishDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year bm_date
0 000001.XSHE 2007-12-31 2008-03-20 2007-12-31 1.300606e+10 1.300606e+10 NaN 3 2008 2007 2007-12
1 000001.XSHE 2008-12-31 2009-03-20 2008-12-31 1.640079e+10 1.640079e+10 NaN 3 2009 2008 2008-12
2 000001.XSHE 2009-12-31 2010-03-12 2009-12-31 2.046961e+10 2.046961e+10 NaN 3 2010 2009 2009-12
3 000001.XSHE 2010-12-31 2011-02-25 2010-12-31 3.351288e+10 3.351288e+10 NaN 2 2011 2010 2010-12
4 000001.XSHE 2011-12-31 2012-03-09 2011-12-31 7.538058e+10 7.331084e+10 2.069747e+09 3 2012 2011 2011-12
5 000001.XSHE 2012-12-31 2013-03-08 2012-12-31 8.479878e+10 8.479878e+10 NaN 3 2013 2012 2012-12
6 000001.XSHE 2013-12-31 2014-03-07 2013-12-31 1.120810e+11 1.120810e+11 NaN 3 2014 2013 2013-12
7 000001.XSHE 2014-12-31 2015-03-13 2014-12-31 1.309490e+11 1.309490e+11 NaN 3 2015 2014 2014-12
... ... ... ... ... ... ... ... ... ... ... ...
47857 900957.XSHG 2014-12-31 2015-04-10 2014-12-31 4.072107e+08 3.940309e+08 1.317981e+07 4 2015 2014 2014-12
47858 900957.XSHG 2015-12-31 2016-03-30 2015-12-31 4.106786e+08 3.973929e+08 1.328570e+07 3 2016 2015 2015-12
47859 900957.XSHG 2016-12-31 2017-03-25 2016-12-31 3.938268e+08 3.930721e+08 7.546643e+05 3 2017 2016 2016-12
47860 900957.XSHG 2017-12-31 2018-04-10 2017-12-31 4.238426e+08 4.231040e+08 7.386715e+05 4 2018 2017 2017-12
47861 900957.XSHG 2018-12-31 2019-03-30 2018-12-31 4.515278e+08 4.508051e+08 7.226781e+05 3 2019 2018 2018-12
47862 900957.XSHG 2019-12-31 2020-04-25 2019-12-31 4.768689e+08 4.761021e+08 7.667705e+05 4 2020 2019 2019-12
47863 900957.XSHG 2020-12-31 2021-04-09 2020-12-31 4.987276e+08 4.979110e+08 8.165551e+05 4 2021 2020 2020-12
47864 900957.XSHG 2021-12-31 2022-04-20 2021-12-31 5.263733e+08 5.255741e+08 7.991940e+05 4 2022 2021 2021-12

47865 rows × 11 columns

In [32]:
fundmen_df.fillna(0,inplace=True)
In [33]:
# fundmen_df['book'] = fundmen_df['TShEquity'] - fundmen_df['minorityInt']
fundmen_df['book'] = fundmen_df['TEquityAttrP']
In [34]:
fundmen_df = fundmen_df[fundmen_df['book'] > 0]
In [35]:
fundmen_df
Out[35]:
secID endDate publishDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year bm_date book
0 000001.XSHE 2007-12-31 2008-03-20 2007-12-31 1.300606e+10 1.300606e+10 0.000000e+00 3 2008 2007 2007-12 1.300606e+10
1 000001.XSHE 2008-12-31 2009-03-20 2008-12-31 1.640079e+10 1.640079e+10 0.000000e+00 3 2009 2008 2008-12 1.640079e+10
2 000001.XSHE 2009-12-31 2010-03-12 2009-12-31 2.046961e+10 2.046961e+10 0.000000e+00 3 2010 2009 2009-12 2.046961e+10
3 000001.XSHE 2010-12-31 2011-02-25 2010-12-31 3.351288e+10 3.351288e+10 0.000000e+00 2 2011 2010 2010-12 3.351288e+10
4 000001.XSHE 2011-12-31 2012-03-09 2011-12-31 7.538058e+10 7.331084e+10 2.069747e+09 3 2012 2011 2011-12 7.331084e+10
5 000001.XSHE 2012-12-31 2013-03-08 2012-12-31 8.479878e+10 8.479878e+10 0.000000e+00 3 2013 2012 2012-12 8.479878e+10
6 000001.XSHE 2013-12-31 2014-03-07 2013-12-31 1.120810e+11 1.120810e+11 0.000000e+00 3 2014 2013 2013-12 1.120810e+11
7 000001.XSHE 2014-12-31 2015-03-13 2014-12-31 1.309490e+11 1.309490e+11 0.000000e+00 3 2015 2014 2014-12 1.309490e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
47857 900957.XSHG 2014-12-31 2015-04-10 2014-12-31 4.072107e+08 3.940309e+08 1.317981e+07 4 2015 2014 2014-12 3.940309e+08
47858 900957.XSHG 2015-12-31 2016-03-30 2015-12-31 4.106786e+08 3.973929e+08 1.328570e+07 3 2016 2015 2015-12 3.973929e+08
47859 900957.XSHG 2016-12-31 2017-03-25 2016-12-31 3.938268e+08 3.930721e+08 7.546643e+05 3 2017 2016 2016-12 3.930721e+08
47860 900957.XSHG 2017-12-31 2018-04-10 2017-12-31 4.238426e+08 4.231040e+08 7.386715e+05 4 2018 2017 2017-12 4.231040e+08
47861 900957.XSHG 2018-12-31 2019-03-30 2018-12-31 4.515278e+08 4.508051e+08 7.226781e+05 3 2019 2018 2018-12 4.508051e+08
47862 900957.XSHG 2019-12-31 2020-04-25 2019-12-31 4.768689e+08 4.761021e+08 7.667705e+05 4 2020 2019 2019-12 4.761021e+08
47863 900957.XSHG 2020-12-31 2021-04-09 2020-12-31 4.987276e+08 4.979110e+08 8.165551e+05 4 2021 2020 2020-12 4.979110e+08
47864 900957.XSHG 2021-12-31 2022-04-20 2021-12-31 5.263733e+08 5.255741e+08 7.991940e+05 4 2022 2021 2021-12 5.255741e+08

46883 rows × 12 columns

In [36]:
np.allclose(fundmen_df['book'],fundmen_df['TEquityAttrP'])
Out[36]:
True
In [37]:
fundmen_df[fundmen_df['book']-fundmen_df['TEquityAttrP'] > 10]
Out[37]:
secID endDate publishDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year bm_date book

Risk free rate

In [38]:
pd.read_csv("./data/rf-monthly.csv")
Out[38]:
日期_Date 年份()_Year 月份_Month 月无风险收益率_MonRFRet Unnamed: 4
0 1989-02-01 1989 2 0.006300 NaN
1 1989-03-01 1989 3 0.006300 NaN
2 1989-04-01 1989 4 0.006300 NaN
3 1989-05-01 1989 5 0.006300 NaN
4 1989-06-01 1989 6 0.006300 NaN
5 1989-07-01 1989 7 0.006300 NaN
6 1989-08-01 1989 8 0.006300 NaN
7 1989-09-01 1989 9 0.006300 NaN
... ... ... ... ... ...
401 2022-07-01 2022 7 0.001620 NaN
402 2022-08-01 2022 8 0.001366 NaN
403 2022-09-01 2022 9 0.001342 NaN
404 2022-10-01 2022 10 0.001413 NaN
405 2022-11-01 2022 11 0.001676 NaN
406 2022-12-01 2022 12 0.001931 NaN
407 2023-01-01 2023 1 0.002013 NaN
408 2023-02-01 2023 2 0.002013 NaN

409 rows × 5 columns

In [39]:
rf = pd.read_csv("./data/rf-monthly.csv").drop(["Unnamed: 4", "年份()_Year", "月份_Month"],axis=1)
rf.columns = ['Date', 'rf']
rf['Date'] = pd.to_datetime(rf["Date"])
rf['Date'] = rf['Date'].dt.to_period('M')
rf.rename(columns={'Date':'ym'},inplace=True)
In [40]:
rf
Out[40]:
ym rf
0 1989-02 0.006300
1 1989-03 0.006300
2 1989-04 0.006300
3 1989-05 0.006300
4 1989-06 0.006300
5 1989-07 0.006300
6 1989-08 0.006300
7 1989-09 0.006300
... ... ...
401 2022-07 0.001620
402 2022-08 0.001366
403 2022-09 0.001342
404 2022-10 0.001413
405 2022-11 0.001676
406 2022-12 0.001931
407 2023-01 0.002013
408 2023-02 0.002013

409 rows × 2 columns

Beta

In [41]:
beta_df = pd.read_pickle('./data/beta_df.pkl')

beta_df

beta_df['tradeDate'] = pd.to_datetime(beta_df['tradeDate'], format="%Y-%m-%d")

beta_df['ym'] = beta_df['tradeDate'].dt.to_period('M')

beta_df[['Beta60','Beta120','Beta252']] = beta_df[['Beta60','Beta120','Beta252']].apply(pd.to_numeric)
In [42]:
# # Winsorization
# up_q = 0.99999
# lower_q = 0.00001
# beta_df['Beta60_winsor'] = beta_df['Beta60'].clip(lower=beta_df['Beta60'].quantile(lower_q),upper=beta_df['Beta60'].quantile(up_q))
# beta_df['Beta120_winsor'] = beta_df['Beta120'].clip(lower=beta_df['Beta120'].quantile(lower_q),upper=beta_df['Beta120'].quantile(up_q))

Monthly beta

In [43]:
beta_m_df = beta_df.groupby(['secID','ym'],as_index=False)['Beta252'].last()

beta_m_df.rename(columns={'Beta252':'beta'},inplace=True)

beta_m_df
Out[43]:
secID ym beta
0 000001.XSHE 2007-01 0.7949
1 000001.XSHE 2007-02 0.7880
2 000001.XSHE 2007-03 0.8512
3 000001.XSHE 2007-04 0.8642
4 000001.XSHE 2007-05 0.7715
5 000001.XSHE 2007-06 0.4614
6 000001.XSHE 2007-07 0.6423
7 000001.XSHE 2007-08 0.7722
... ... ... ...
554352 689009.XSHG 2022-07 0.7987
554353 689009.XSHG 2022-08 0.8589
554354 689009.XSHG 2022-09 0.9106
554355 689009.XSHG 2022-10 0.7083
554356 689009.XSHG 2022-11 0.7363
554357 689009.XSHG 2022-12 0.6919
554358 689009.XSHG 2023-01 0.7379
554359 689009.XSHG 2023-02 0.7453

554360 rows × 3 columns

Trading data

In [44]:
# stk_df = DataAPI.MktEqudAdjAfGet(secID=stk_id,beginDate=START,endDate=END,isOpen=1,
#                                            field=["secID","tradeDate",
#                                                   "closePrice",
#                                                   "negMarketValue"],pandas="1")

# stk_df.to_pickle('./data/stk_df.pkl')
In [45]:
stk_df = pd.read_pickle('./data/stk_df.pkl')
stk_df['closePrice'] = pd.to_numeric(stk_df['closePrice'])
stk_df['negMarketValue'] = pd.to_numeric(stk_df['negMarketValue'])
stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
stk_df['ym'] = stk_df['tradeDate'].dt.to_period('M')
stk_df.sort_values(['secID','tradeDate'],inplace=True)
In [46]:
stk_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 11600428 entries, 0 to 11600427
Data columns (total 5 columns):
secID             object
tradeDate         datetime64[ns]
closePrice        float64
negMarketValue    float64
ym                period[M]
dtypes: datetime64[ns](1), float64(2), object(1), period[M](1)
memory usage: 531.0+ MB

Exclude ST

In [47]:
stk_df.dropna().shape
Out[47]:
(11600428, 5)
In [48]:
stk_df.shape
Out[48]:
(11600428, 5)
In [49]:
stk_df = pd.merge(stk_df, st_df, on=['secID','tradeDate'],how='left')
In [50]:
stk_df = stk_df[stk_df['STflg'].isna()].copy()
In [51]:
stk_df.drop('STflg',axis=1,inplace=True)
In [52]:
stk_df.shape
Out[52]:
(11048078, 5)

Monthly trading df

In [53]:
stk_df_m = stk_df.groupby(['secID','ym'],as_index=False).tail(1)
In [54]:
stk_df_m
Out[54]:
secID tradeDate closePrice negMarketValue ym
116 000001.XSHE 2007-06-29 870.870 4.266117e+10 2007-06
138 000001.XSHE 2007-07-31 1146.498 5.616330e+10 2007-07
161 000001.XSHE 2007-08-31 1202.510 5.890714e+10 2007-08
181 000001.XSHE 2007-09-28 1265.167 6.197651e+10 2007-09
199 000001.XSHE 2007-10-31 1520.542 7.448652e+10 2007-10
221 000001.XSHE 2007-11-30 1141.751 5.593078e+10 2007-11
241 000001.XSHE 2007-12-28 1221.497 6.574629e+10 2007-12
263 000001.XSHE 2008-01-31 1053.778 5.850212e+10 2008-01
... ... ... ... ... ...
11600307 900957.XSHG 2022-08-31 0.644 1.175760e+08 2022-08
11600328 900957.XSHG 2022-09-30 0.592 1.080080e+08 2022-09
11600344 900957.XSHG 2022-10-31 0.603 1.100320e+08 2022-10
11600366 900957.XSHG 2022-11-30 0.617 1.126080e+08 2022-11
11600388 900957.XSHG 2022-12-30 0.568 1.037760e+08 2022-12
11600404 900957.XSHG 2023-01-31 0.590 1.076400e+08 2023-01
11600424 900957.XSHG 2023-02-28 0.578 1.054320e+08 2023-02
11600427 900957.XSHG 2023-03-03 0.568 1.037760e+08 2023-03

552758 rows × 5 columns

Fill na months

In [55]:
def fill_missing(df, full_dates, id_col='secID', date_col='ym'):
    """
    This function fills the missing dates for stocks.
    Parameters:
        df: The dataframe. Could be a sub-dataframe created by "groupby".
            The dataframe must be sorted on the "date_col".
        full_dates: the unique dates covering all securities in the full dataframe. 
                    Need to be sorted.
        id_col: the security id.
        date_col: the dates column for the security
    Returns:
        A dataframe with the missing dates filled with NA.
    """
    one_stk_id = df[id_col].unique()
    date_start = np.where(full_dates == df[date_col].min())[0][0] 
    date_end = np.where(full_dates == df[date_col].max())[0][0]
    dates = full_dates[date_start:date_end+1]
    idx = pd.MultiIndex.from_product([one_stk_id,dates],
                                     names=(id_col,date_col))
    df = df.set_index([id_col,date_col]).reindex(idx).reset_index()
    return df
In [56]:
full_dates = np.sort(stk_df['ym'].unique())
In [57]:
%%time
stk_df_m = stk_df_m.groupby('secID').apply(fill_missing, full_dates=full_dates)
CPU times: user 34.5 s, sys: 96 ms, total: 34.6 s
Wall time: 34.6 s
In [58]:
stk_df_m.reset_index(drop=True, inplace=True)
In [59]:
stk_df_m
Out[59]:
secID ym tradeDate closePrice negMarketValue
0 000001.XSHE 2007-06 2007-06-29 870.870 4.266117e+10
1 000001.XSHE 2007-07 2007-07-31 1146.498 5.616330e+10
2 000001.XSHE 2007-08 2007-08-31 1202.510 5.890714e+10
3 000001.XSHE 2007-09 2007-09-28 1265.167 6.197651e+10
4 000001.XSHE 2007-10 2007-10-31 1520.542 7.448652e+10
5 000001.XSHE 2007-11 2007-11-30 1141.751 5.593078e+10
6 000001.XSHE 2007-12 2007-12-28 1221.497 6.574629e+10
7 000001.XSHE 2008-01 2008-01-31 1053.778 5.850212e+10
... ... ... ... ... ...
567892 900957.XSHG 2022-08 2022-08-31 0.644 1.175760e+08
567893 900957.XSHG 2022-09 2022-09-30 0.592 1.080080e+08
567894 900957.XSHG 2022-10 2022-10-31 0.603 1.100320e+08
567895 900957.XSHG 2022-11 2022-11-30 0.617 1.126080e+08
567896 900957.XSHG 2022-12 2022-12-30 0.568 1.037760e+08
567897 900957.XSHG 2023-01 2023-01-31 0.590 1.076400e+08
567898 900957.XSHG 2023-02 2023-02-28 0.578 1.054320e+08
567899 900957.XSHG 2023-03 2023-03-03 0.568 1.037760e+08

567900 rows × 5 columns

In [60]:
stk_df_m.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 567900 entries, 0 to 567899
Data columns (total 5 columns):
secID             567900 non-null object
ym                567900 non-null period[M]
tradeDate         552758 non-null datetime64[ns]
closePrice        552758 non-null float64
negMarketValue    552758 non-null float64
dtypes: datetime64[ns](1), float64(2), object(1), period[M](1)
memory usage: 21.7+ MB
In [61]:
stk_df_m.drop('tradeDate',axis=1,inplace=True)
In [62]:
stk_df_m['ret'] = stk_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1)

# # Use last month's market cap for sorting
stk_df_m['mkt_cap'] = stk_df_m.groupby('secID')['negMarketValue'].shift()
stk_df_m['mkt_cap_date'] = stk_df_m.groupby('secID')['ym'].shift()

stk_df_m.drop(['closePrice','negMarketValue'],axis=1,inplace=True)

stk_df_m
Out[62]:
secID ym ret mkt_cap mkt_cap_date
0 000001.XSHE 2007-06 NaN NaN NaT
1 000001.XSHE 2007-07 0.316497 4.266117e+10 2007-06
2 000001.XSHE 2007-08 0.048855 5.616330e+10 2007-07
3 000001.XSHE 2007-09 0.052105 5.890714e+10 2007-08
4 000001.XSHE 2007-10 0.201851 6.197651e+10 2007-09
5 000001.XSHE 2007-11 -0.249116 7.448652e+10 2007-10
6 000001.XSHE 2007-12 0.069845 5.593078e+10 2007-11
7 000001.XSHE 2008-01 -0.137306 6.574629e+10 2007-12
... ... ... ... ... ...
567892 900957.XSHG 2022-08 0.093379 1.074560e+08 2022-07
567893 900957.XSHG 2022-09 -0.080745 1.175760e+08 2022-08
567894 900957.XSHG 2022-10 0.018581 1.080080e+08 2022-09
567895 900957.XSHG 2022-11 0.023217 1.100320e+08 2022-10
567896 900957.XSHG 2022-12 -0.079417 1.126080e+08 2022-11
567897 900957.XSHG 2023-01 0.038732 1.037760e+08 2022-12
567898 900957.XSHG 2023-02 -0.020339 1.076400e+08 2023-01
567899 900957.XSHG 2023-03 -0.017301 1.054320e+08 2023-02

567900 rows × 5 columns

In [63]:
stk_df_m[(stk_df_m['secID']=='000007.XSHE') & (stk_df_m['ym']>='2021-03') & (stk_df_m['ym']<='2022-08')]
Out[63]:
secID ym ret mkt_cap mkt_cap_date
1037 000007.XSHE 2021-03 -0.046801 1.254329e+09 2021-02
1038 000007.XSHE 2021-04 0.018085 1.195629e+09 2021-03
1039 000007.XSHE 2021-05 NaN 1.217255e+09 2021-04
1040 000007.XSHE 2021-06 NaN NaN 2021-05
1041 000007.XSHE 2021-07 NaN NaN 2021-06
1042 000007.XSHE 2021-08 NaN NaN 2021-07
1043 000007.XSHE 2021-09 NaN NaN 2021-08
1044 000007.XSHE 2021-10 NaN NaN 2021-09
... ... ... ... ... ...
1047 000007.XSHE 2022-01 NaN NaN 2021-12
1048 000007.XSHE 2022-02 NaN NaN 2022-01
1049 000007.XSHE 2022-03 NaN NaN 2022-02
1050 000007.XSHE 2022-04 NaN NaN 2022-03
1051 000007.XSHE 2022-05 NaN NaN 2022-04
1052 000007.XSHE 2022-06 NaN NaN 2022-05
1053 000007.XSHE 2022-07 NaN NaN 2022-06
1054 000007.XSHE 2022-08 0.090902 2.276947e+09 2022-07

18 rows × 5 columns

In [64]:
stk_df_m[stk_df_m['ret'].isna()]
Out[64]:
secID ym ret mkt_cap mkt_cap_date
0 000001.XSHE 2007-06 NaN NaN NaT
190 000002.XSHE 2007-01 NaN NaN NaT
385 000004.XSHE 2011-06 NaN NaN NaT
517 000005.XSHE 2008-06 NaN NaN NaT
672 000006.XSHE 2007-01 NaN NaN NaT
867 000007.XSHE 2007-01 NaN NaN NaT
871 000007.XSHE 2007-05 NaN 726805286.6 2007-04
872 000007.XSHE 2007-06 NaN NaN 2007-05
... ... ... ... ... ...
567531 900955.XSHG 2022-01 NaN NaN 2021-12
567532 900955.XSHG 2022-02 NaN NaN 2022-01
567533 900955.XSHG 2022-03 NaN NaN 2022-02
567534 900955.XSHG 2022-04 NaN NaN 2022-03
567535 900955.XSHG 2022-05 NaN NaN 2022-04
567536 900955.XSHG 2022-06 NaN NaN 2022-05
567538 900956.XSHG 2007-01 NaN NaN NaT
567705 900957.XSHG 2007-01 NaN NaN NaT

20973 rows × 5 columns

In [65]:
stk_df_m[stk_df_m['mkt_cap'].isna()]
Out[65]:
secID ym ret mkt_cap mkt_cap_date
0 000001.XSHE 2007-06 NaN NaN NaT
190 000002.XSHE 2007-01 NaN NaN NaT
385 000004.XSHE 2011-06 NaN NaN NaT
517 000005.XSHE 2008-06 NaN NaN NaT
672 000006.XSHE 2007-01 NaN NaN NaT
867 000007.XSHE 2007-01 NaN NaN NaT
872 000007.XSHE 2007-06 NaN NaN 2007-05
873 000007.XSHE 2007-07 NaN NaN 2007-06
... ... ... ... ... ...
567531 900955.XSHG 2022-01 NaN NaN 2021-12
567532 900955.XSHG 2022-02 NaN NaN 2022-01
567533 900955.XSHG 2022-03 NaN NaN 2022-02
567534 900955.XSHG 2022-04 NaN NaN 2022-03
567535 900955.XSHG 2022-05 NaN NaN 2022-04
567536 900955.XSHG 2022-06 NaN NaN 2022-05
567538 900956.XSHG 2007-01 NaN NaN NaT
567705 900957.XSHG 2007-01 NaN NaN NaT

20313 rows × 5 columns

In [66]:
stk_df_m.dropna(inplace=True)
In [67]:
stk_df_m
Out[67]:
secID ym ret mkt_cap mkt_cap_date
1 000001.XSHE 2007-07 0.316497 4.266117e+10 2007-06
2 000001.XSHE 2007-08 0.048855 5.616330e+10 2007-07
3 000001.XSHE 2007-09 0.052105 5.890714e+10 2007-08
4 000001.XSHE 2007-10 0.201851 6.197651e+10 2007-09
5 000001.XSHE 2007-11 -0.249116 7.448652e+10 2007-10
6 000001.XSHE 2007-12 0.069845 5.593078e+10 2007-11
7 000001.XSHE 2008-01 -0.137306 6.574629e+10 2007-12
8 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01
... ... ... ... ... ...
567892 900957.XSHG 2022-08 0.093379 1.074560e+08 2022-07
567893 900957.XSHG 2022-09 -0.080745 1.175760e+08 2022-08
567894 900957.XSHG 2022-10 0.018581 1.080080e+08 2022-09
567895 900957.XSHG 2022-11 0.023217 1.100320e+08 2022-10
567896 900957.XSHG 2022-12 -0.079417 1.126080e+08 2022-11
567897 900957.XSHG 2023-01 0.038732 1.037760e+08 2022-12
567898 900957.XSHG 2023-02 -0.020339 1.076400e+08 2023-01
567899 900957.XSHG 2023-03 -0.017301 1.054320e+08 2023-02

546927 rows × 5 columns

Merge Book and Market Cap data

In [68]:
fundmen_df
Out[68]:
secID endDate publishDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year bm_date book
0 000001.XSHE 2007-12-31 2008-03-20 2007-12-31 1.300606e+10 1.300606e+10 0.000000e+00 3 2008 2007 2007-12 1.300606e+10
1 000001.XSHE 2008-12-31 2009-03-20 2008-12-31 1.640079e+10 1.640079e+10 0.000000e+00 3 2009 2008 2008-12 1.640079e+10
2 000001.XSHE 2009-12-31 2010-03-12 2009-12-31 2.046961e+10 2.046961e+10 0.000000e+00 3 2010 2009 2009-12 2.046961e+10
3 000001.XSHE 2010-12-31 2011-02-25 2010-12-31 3.351288e+10 3.351288e+10 0.000000e+00 2 2011 2010 2010-12 3.351288e+10
4 000001.XSHE 2011-12-31 2012-03-09 2011-12-31 7.538058e+10 7.331084e+10 2.069747e+09 3 2012 2011 2011-12 7.331084e+10
5 000001.XSHE 2012-12-31 2013-03-08 2012-12-31 8.479878e+10 8.479878e+10 0.000000e+00 3 2013 2012 2012-12 8.479878e+10
6 000001.XSHE 2013-12-31 2014-03-07 2013-12-31 1.120810e+11 1.120810e+11 0.000000e+00 3 2014 2013 2013-12 1.120810e+11
7 000001.XSHE 2014-12-31 2015-03-13 2014-12-31 1.309490e+11 1.309490e+11 0.000000e+00 3 2015 2014 2014-12 1.309490e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
47857 900957.XSHG 2014-12-31 2015-04-10 2014-12-31 4.072107e+08 3.940309e+08 1.317981e+07 4 2015 2014 2014-12 3.940309e+08
47858 900957.XSHG 2015-12-31 2016-03-30 2015-12-31 4.106786e+08 3.973929e+08 1.328570e+07 3 2016 2015 2015-12 3.973929e+08
47859 900957.XSHG 2016-12-31 2017-03-25 2016-12-31 3.938268e+08 3.930721e+08 7.546643e+05 3 2017 2016 2016-12 3.930721e+08
47860 900957.XSHG 2017-12-31 2018-04-10 2017-12-31 4.238426e+08 4.231040e+08 7.386715e+05 4 2018 2017 2017-12 4.231040e+08
47861 900957.XSHG 2018-12-31 2019-03-30 2018-12-31 4.515278e+08 4.508051e+08 7.226781e+05 3 2019 2018 2018-12 4.508051e+08
47862 900957.XSHG 2019-12-31 2020-04-25 2019-12-31 4.768689e+08 4.761021e+08 7.667705e+05 4 2020 2019 2019-12 4.761021e+08
47863 900957.XSHG 2020-12-31 2021-04-09 2020-12-31 4.987276e+08 4.979110e+08 8.165551e+05 4 2021 2020 2020-12 4.979110e+08
47864 900957.XSHG 2021-12-31 2022-04-20 2021-12-31 5.263733e+08 5.255741e+08 7.991940e+05 4 2022 2021 2021-12 5.255741e+08

46883 rows × 12 columns

In [69]:
stk_df_m
Out[69]:
secID ym ret mkt_cap mkt_cap_date
1 000001.XSHE 2007-07 0.316497 4.266117e+10 2007-06
2 000001.XSHE 2007-08 0.048855 5.616330e+10 2007-07
3 000001.XSHE 2007-09 0.052105 5.890714e+10 2007-08
4 000001.XSHE 2007-10 0.201851 6.197651e+10 2007-09
5 000001.XSHE 2007-11 -0.249116 7.448652e+10 2007-10
6 000001.XSHE 2007-12 0.069845 5.593078e+10 2007-11
7 000001.XSHE 2008-01 -0.137306 6.574629e+10 2007-12
8 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01
... ... ... ... ... ...
567892 900957.XSHG 2022-08 0.093379 1.074560e+08 2022-07
567893 900957.XSHG 2022-09 -0.080745 1.175760e+08 2022-08
567894 900957.XSHG 2022-10 0.018581 1.080080e+08 2022-09
567895 900957.XSHG 2022-11 0.023217 1.100320e+08 2022-10
567896 900957.XSHG 2022-12 -0.079417 1.126080e+08 2022-11
567897 900957.XSHG 2023-01 0.038732 1.037760e+08 2022-12
567898 900957.XSHG 2023-02 -0.020339 1.076400e+08 2023-01
567899 900957.XSHG 2023-03 -0.017301 1.054320e+08 2023-02

546927 rows × 5 columns

In [70]:
bm_df = pd.merge(stk_df_m[['secID','mkt_cap','mkt_cap_date']], fundmen_df[['secID','book','bm_date']],
                 left_on=['secID','mkt_cap_date'],right_on=['secID','bm_date'])
bm_df['bm'] = bm_df['book'] / bm_df['mkt_cap']
bm_df.drop(['mkt_cap_date','mkt_cap','book'],axis=1,inplace=True)
In [71]:
bm_df
Out[71]:
secID bm_date bm
0 000001.XSHE 2007-12 0.197822
1 000001.XSHE 2008-12 0.622601
2 000001.XSHE 2009-12 0.287250
3 000001.XSHE 2010-12 0.683467
4 000001.XSHE 2011-12 1.514294
5 000001.XSHE 2012-12 1.704572
6 000001.XSHE 2013-12 1.640895
7 000001.XSHE 2014-12 0.840421
... ... ... ...
41551 900957.XSHG 2014-12 3.482069
41552 900957.XSHG 2015-12 1.465227
41553 900957.XSHG 2016-12 1.893849
41554 900957.XSHG 2017-12 2.373042
41555 900957.XSHG 2018-12 3.977318
41556 900957.XSHG 2019-12 4.653798
41557 900957.XSHG 2020-12 5.379798
41558 900957.XSHG 2021-12 4.526753

41559 rows × 3 columns

Merge data

Merge rf, ret, mktcap, beta

In [72]:
ret_df = pd.merge(stk_df_m, rf, on='ym')

ret_df['exret'] = ret_df['ret'] - ret_df['rf']

ret_df.sort_values(['secID','ym'],inplace=True)

ret_df.reset_index(drop=True,inplace=True)

display(ret_df)
secID ym ret mkt_cap mkt_cap_date rf exret
0 000001.XSHE 2007-07 0.316497 4.266117e+10 2007-06 0.002620 0.313877
1 000001.XSHE 2007-08 0.048855 5.616330e+10 2007-07 0.002682 0.046173
2 000001.XSHE 2007-09 0.052105 5.890714e+10 2007-08 0.002934 0.049171
3 000001.XSHE 2007-10 0.201851 6.197651e+10 2007-09 0.003250 0.198601
4 000001.XSHE 2007-11 -0.249116 7.448652e+10 2007-10 0.003545 -0.252661
5 000001.XSHE 2007-12 0.069845 5.593078e+10 2007-11 0.003643 0.066202
6 000001.XSHE 2008-01 -0.137306 6.574629e+10 2007-12 0.003731 -0.141037
7 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01 0.003753 -0.008257
... ... ... ... ... ... ... ...
542043 900957.XSHG 2022-07 -0.023217 1.100320e+08 2022-06 0.001620 -0.024837
542044 900957.XSHG 2022-08 0.093379 1.074560e+08 2022-07 0.001366 0.092013
542045 900957.XSHG 2022-09 -0.080745 1.175760e+08 2022-08 0.001342 -0.082087
542046 900957.XSHG 2022-10 0.018581 1.080080e+08 2022-09 0.001413 0.017168
542047 900957.XSHG 2022-11 0.023217 1.100320e+08 2022-10 0.001676 0.021541
542048 900957.XSHG 2022-12 -0.079417 1.126080e+08 2022-11 0.001931 -0.081348
542049 900957.XSHG 2023-01 0.038732 1.037760e+08 2022-12 0.002013 0.036719
542050 900957.XSHG 2023-02 -0.020339 1.076400e+08 2023-01 0.002013 -0.022352

542051 rows × 7 columns

In [73]:
# Use last month's beta for grouping
ret_df = pd.merge(ret_df,beta_m_df,left_on=['secID','mkt_cap_date'],right_on=['secID','ym'])
display(ret_df)
secID ym_x ret mkt_cap mkt_cap_date rf exret ym_y beta
0 000001.XSHE 2007-07 0.316497 4.266117e+10 2007-06 0.002620 0.313877 2007-06 0.4614
1 000001.XSHE 2007-08 0.048855 5.616330e+10 2007-07 0.002682 0.046173 2007-07 0.6423
2 000001.XSHE 2007-09 0.052105 5.890714e+10 2007-08 0.002934 0.049171 2007-08 0.7722
3 000001.XSHE 2007-10 0.201851 6.197651e+10 2007-09 0.003250 0.198601 2007-09 0.7596
4 000001.XSHE 2007-11 -0.249116 7.448652e+10 2007-10 0.003545 -0.252661 2007-10 0.7988
5 000001.XSHE 2007-12 0.069845 5.593078e+10 2007-11 0.003643 0.066202 2007-11 0.9560
6 000001.XSHE 2008-01 -0.137306 6.574629e+10 2007-12 0.003731 -0.141037 2007-12 0.9468
7 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01 0.003753 -0.008257 2008-01 0.9654
... ... ... ... ... ... ... ... ... ...
523663 689009.XSHG 2022-07 0.115056 2.264534e+10 2022-06 0.001620 0.113436 2022-06 0.9071
523664 689009.XSHG 2022-08 -0.112656 2.525082e+10 2022-07 0.001366 -0.114022 2022-07 0.7987
523665 689009.XSHG 2022-09 -0.129911 2.240616e+10 2022-08 0.001342 -0.131253 2022-08 0.8589
523666 689009.XSHG 2022-10 -0.164709 1.949535e+10 2022-09 0.001413 -0.166122 2022-09 0.9106
523667 689009.XSHG 2022-11 0.043125 1.637440e+10 2022-10 0.001676 0.041449 2022-10 0.7083
523668 689009.XSHG 2022-12 -0.086579 1.708055e+10 2022-11 0.001931 -0.088510 2022-11 0.7363
523669 689009.XSHG 2023-01 0.088554 1.560173e+10 2022-12 0.002013 0.086541 2022-12 0.6919
523670 689009.XSHG 2023-02 -0.005725 1.698332e+10 2023-01 0.002013 -0.007738 2023-01 0.7379

523671 rows × 9 columns

In [74]:
ret_df.drop(['ym_y'],axis=1,inplace=True)
In [75]:
ret_df.rename(columns={'ym_x':'ret_date',
                       'mkt_cap_date':'mktcap_beta_date'},inplace=True)
In [76]:
ret_df
Out[76]:
secID ret_date ret mkt_cap mktcap_beta_date rf exret beta
0 000001.XSHE 2007-07 0.316497 4.266117e+10 2007-06 0.002620 0.313877 0.4614
1 000001.XSHE 2007-08 0.048855 5.616330e+10 2007-07 0.002682 0.046173 0.6423
2 000001.XSHE 2007-09 0.052105 5.890714e+10 2007-08 0.002934 0.049171 0.7722
3 000001.XSHE 2007-10 0.201851 6.197651e+10 2007-09 0.003250 0.198601 0.7596
4 000001.XSHE 2007-11 -0.249116 7.448652e+10 2007-10 0.003545 -0.252661 0.7988
5 000001.XSHE 2007-12 0.069845 5.593078e+10 2007-11 0.003643 0.066202 0.9560
6 000001.XSHE 2008-01 -0.137306 6.574629e+10 2007-12 0.003731 -0.141037 0.9468
7 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01 0.003753 -0.008257 0.9654
... ... ... ... ... ... ... ... ...
523663 689009.XSHG 2022-07 0.115056 2.264534e+10 2022-06 0.001620 0.113436 0.9071
523664 689009.XSHG 2022-08 -0.112656 2.525082e+10 2022-07 0.001366 -0.114022 0.7987
523665 689009.XSHG 2022-09 -0.129911 2.240616e+10 2022-08 0.001342 -0.131253 0.8589
523666 689009.XSHG 2022-10 -0.164709 1.949535e+10 2022-09 0.001413 -0.166122 0.9106
523667 689009.XSHG 2022-11 0.043125 1.637440e+10 2022-10 0.001676 0.041449 0.7083
523668 689009.XSHG 2022-12 -0.086579 1.708055e+10 2022-11 0.001931 -0.088510 0.7363
523669 689009.XSHG 2023-01 0.088554 1.560173e+10 2022-12 0.002013 0.086541 0.6919
523670 689009.XSHG 2023-02 -0.005725 1.698332e+10 2023-01 0.002013 -0.007738 0.7379

523671 rows × 8 columns

In [77]:
ret_df = ret_df[['secID','ret_date','ret','rf','exret','mktcap_beta_date','mkt_cap','beta']]
In [78]:
ret_df
Out[78]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta
0 000001.XSHE 2007-07 0.316497 0.002620 0.313877 2007-06 4.266117e+10 0.4614
1 000001.XSHE 2007-08 0.048855 0.002682 0.046173 2007-07 5.616330e+10 0.6423
2 000001.XSHE 2007-09 0.052105 0.002934 0.049171 2007-08 5.890714e+10 0.7722
3 000001.XSHE 2007-10 0.201851 0.003250 0.198601 2007-09 6.197651e+10 0.7596
4 000001.XSHE 2007-11 -0.249116 0.003545 -0.252661 2007-10 7.448652e+10 0.7988
5 000001.XSHE 2007-12 0.069845 0.003643 0.066202 2007-11 5.593078e+10 0.9560
6 000001.XSHE 2008-01 -0.137306 0.003731 -0.141037 2007-12 6.574629e+10 0.9468
7 000001.XSHE 2008-02 -0.004504 0.003753 -0.008257 2008-01 5.850212e+10 0.9654
... ... ... ... ... ... ... ... ...
523663 689009.XSHG 2022-07 0.115056 0.001620 0.113436 2022-06 2.264534e+10 0.9071
523664 689009.XSHG 2022-08 -0.112656 0.001366 -0.114022 2022-07 2.525082e+10 0.7987
523665 689009.XSHG 2022-09 -0.129911 0.001342 -0.131253 2022-08 2.240616e+10 0.8589
523666 689009.XSHG 2022-10 -0.164709 0.001413 -0.166122 2022-09 1.949535e+10 0.9106
523667 689009.XSHG 2022-11 0.043125 0.001676 0.041449 2022-10 1.637440e+10 0.7083
523668 689009.XSHG 2022-12 -0.086579 0.001931 -0.088510 2022-11 1.708055e+10 0.7363
523669 689009.XSHG 2023-01 0.088554 0.002013 0.086541 2022-12 1.560173e+10 0.6919
523670 689009.XSHG 2023-02 -0.005725 0.002013 -0.007738 2023-01 1.698332e+10 0.7379

523671 rows × 8 columns

Merge all data with bm

调整return月份对应的bm月份

例:2007:12月的bm分组,对应的是 2008:07 -- 2009:06 的return

调整步骤:

  1. ret_year - 1, and set this variable as bm_date
  2. if ret_month is in [1,2,3,4,5,6], ret_year - 1 again
  3. convert bm_date to year-Dec format
In [79]:
ret_df['year'] = ret_df['ret_date'].dt.year
ret_df['month'] = ret_df['ret_date'].dt.month
ret_df['bm_date'] = ret_df['year'] - 1
idx = ret_df['month'].isin([1,2,3,4,5,6])
ret_df.loc[idx,'bm_date'] = ret_df.loc[idx,'bm_date'] - 1
In [80]:
ret_df
Out[80]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta year month bm_date
0 000001.XSHE 2007-07 0.316497 0.002620 0.313877 2007-06 4.266117e+10 0.4614 2007 7 2006
1 000001.XSHE 2007-08 0.048855 0.002682 0.046173 2007-07 5.616330e+10 0.6423 2007 8 2006
2 000001.XSHE 2007-09 0.052105 0.002934 0.049171 2007-08 5.890714e+10 0.7722 2007 9 2006
3 000001.XSHE 2007-10 0.201851 0.003250 0.198601 2007-09 6.197651e+10 0.7596 2007 10 2006
4 000001.XSHE 2007-11 -0.249116 0.003545 -0.252661 2007-10 7.448652e+10 0.7988 2007 11 2006
5 000001.XSHE 2007-12 0.069845 0.003643 0.066202 2007-11 5.593078e+10 0.9560 2007 12 2006
6 000001.XSHE 2008-01 -0.137306 0.003731 -0.141037 2007-12 6.574629e+10 0.9468 2008 1 2006
7 000001.XSHE 2008-02 -0.004504 0.003753 -0.008257 2008-01 5.850212e+10 0.9654 2008 2 2006
... ... ... ... ... ... ... ... ... ... ... ...
523663 689009.XSHG 2022-07 0.115056 0.001620 0.113436 2022-06 2.264534e+10 0.9071 2022 7 2021
523664 689009.XSHG 2022-08 -0.112656 0.001366 -0.114022 2022-07 2.525082e+10 0.7987 2022 8 2021
523665 689009.XSHG 2022-09 -0.129911 0.001342 -0.131253 2022-08 2.240616e+10 0.8589 2022 9 2021
523666 689009.XSHG 2022-10 -0.164709 0.001413 -0.166122 2022-09 1.949535e+10 0.9106 2022 10 2021
523667 689009.XSHG 2022-11 0.043125 0.001676 0.041449 2022-10 1.637440e+10 0.7083 2022 11 2021
523668 689009.XSHG 2022-12 -0.086579 0.001931 -0.088510 2022-11 1.708055e+10 0.7363 2022 12 2021
523669 689009.XSHG 2023-01 0.088554 0.002013 0.086541 2022-12 1.560173e+10 0.6919 2023 1 2021
523670 689009.XSHG 2023-02 -0.005725 0.002013 -0.007738 2023-01 1.698332e+10 0.7379 2023 2 2021

523671 rows × 11 columns

In [81]:
ret_df.loc[(ret_df['secID']=='300349.XSHE')&(ret_df['ret_date']>='2013-01')&(ret_df['ret_date']<='2013-12')]
Out[81]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta year month bm_date
261423 300349.XSHE 2013-01 -0.035197 0.003246 -0.038443 2012-12 7.245000e+08 0.6363 2013 1 2011
261424 300349.XSHE 2013-02 0.025751 0.003240 0.022511 2013-01 6.990000e+08 0.5292 2013 2 2011
261425 300349.XSHE 2013-03 -0.073640 0.003236 -0.076876 2013-02 7.170000e+08 0.6351 2013 3 2011
261426 300349.XSHE 2013-04 0.025519 0.003235 0.022284 2013-03 6.642000e+08 0.7784 2013 4 2011
261427 300349.XSHE 2013-05 0.351575 0.003235 0.348340 2013-04 6.811500e+08 0.8078 2013 5 2011
261428 300349.XSHE 2013-06 0.000244 0.004241 -0.003997 2013-05 9.180000e+08 0.7089 2013 6 2011
261429 300349.XSHE 2013-07 0.172520 0.003972 0.168548 2013-06 9.182250e+08 0.5040 2013 7 2012
261430 300349.XSHE 2013-08 0.028202 0.003880 0.024322 2013-07 1.076625e+09 0.5452 2013 8 2012
261431 300349.XSHE 2013-09 -0.087392 0.003884 -0.091276 2013-08 1.107000e+09 0.5464 2013 9 2012
261432 300349.XSHE 2013-10 -0.002221 0.003897 -0.006118 2013-09 1.010250e+09 0.4669 2013 10 2012
261433 300349.XSHE 2013-11 0.166738 0.003920 0.162818 2013-10 1.008000e+09 0.6522 2013 11 2012
261434 300349.XSHE 2013-12 -0.054534 0.004417 -0.058951 2013-11 1.176075e+09 0.6451 2013 12 2012
In [82]:
pd.to_datetime(ret_df['bm_date'].astype('str'),format='%Y') + pd.tseries.offsets.YearEnd()
Out[82]:
0        2006-12-31
1        2006-12-31
2        2006-12-31
3        2006-12-31
4        2006-12-31
5        2006-12-31
6        2006-12-31
7        2006-12-31
            ...    
523663   2021-12-31
523664   2021-12-31
523665   2021-12-31
523666   2021-12-31
523667   2021-12-31
523668   2021-12-31
523669   2021-12-31
523670   2021-12-31
Name: bm_date, Length: 523671, dtype: datetime64[ns]
In [83]:
ret_df['year'] = ret_df['ret_date'].dt.year
ret_df['month'] = ret_df['ret_date'].dt.month
ret_df['bm_date'] = ret_df['year'] - 1
idx = ret_df['month'].isin([1,2,3,4,5,6])
ret_df.loc[idx,'bm_date'] = ret_df.loc[idx,'bm_date'] - 1

ret_df['bm_date'] = pd.to_datetime(ret_df['bm_date'].astype('str'),format='%Y') + pd.tseries.offsets.YearEnd()

ret_df['bm_date'] = ret_df['bm_date'].dt.to_period('M')

ret_df.drop(['month','year'], axis=1, inplace=True)
In [84]:
ret_df
Out[84]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date
0 000001.XSHE 2007-07 0.316497 0.002620 0.313877 2007-06 4.266117e+10 0.4614 2006-12
1 000001.XSHE 2007-08 0.048855 0.002682 0.046173 2007-07 5.616330e+10 0.6423 2006-12
2 000001.XSHE 2007-09 0.052105 0.002934 0.049171 2007-08 5.890714e+10 0.7722 2006-12
3 000001.XSHE 2007-10 0.201851 0.003250 0.198601 2007-09 6.197651e+10 0.7596 2006-12
4 000001.XSHE 2007-11 -0.249116 0.003545 -0.252661 2007-10 7.448652e+10 0.7988 2006-12
5 000001.XSHE 2007-12 0.069845 0.003643 0.066202 2007-11 5.593078e+10 0.9560 2006-12
6 000001.XSHE 2008-01 -0.137306 0.003731 -0.141037 2007-12 6.574629e+10 0.9468 2006-12
7 000001.XSHE 2008-02 -0.004504 0.003753 -0.008257 2008-01 5.850212e+10 0.9654 2006-12
... ... ... ... ... ... ... ... ... ...
523663 689009.XSHG 2022-07 0.115056 0.001620 0.113436 2022-06 2.264534e+10 0.9071 2021-12
523664 689009.XSHG 2022-08 -0.112656 0.001366 -0.114022 2022-07 2.525082e+10 0.7987 2021-12
523665 689009.XSHG 2022-09 -0.129911 0.001342 -0.131253 2022-08 2.240616e+10 0.8589 2021-12
523666 689009.XSHG 2022-10 -0.164709 0.001413 -0.166122 2022-09 1.949535e+10 0.9106 2021-12
523667 689009.XSHG 2022-11 0.043125 0.001676 0.041449 2022-10 1.637440e+10 0.7083 2021-12
523668 689009.XSHG 2022-12 -0.086579 0.001931 -0.088510 2022-11 1.708055e+10 0.7363 2021-12
523669 689009.XSHG 2023-01 0.088554 0.002013 0.086541 2022-12 1.560173e+10 0.6919 2021-12
523670 689009.XSHG 2023-02 -0.005725 0.002013 -0.007738 2023-01 1.698332e+10 0.7379 2021-12

523671 rows × 9 columns

In [85]:
bm_df
Out[85]:
secID bm_date bm
0 000001.XSHE 2007-12 0.197822
1 000001.XSHE 2008-12 0.622601
2 000001.XSHE 2009-12 0.287250
3 000001.XSHE 2010-12 0.683467
4 000001.XSHE 2011-12 1.514294
5 000001.XSHE 2012-12 1.704572
6 000001.XSHE 2013-12 1.640895
7 000001.XSHE 2014-12 0.840421
... ... ... ...
41551 900957.XSHG 2014-12 3.482069
41552 900957.XSHG 2015-12 1.465227
41553 900957.XSHG 2016-12 1.893849
41554 900957.XSHG 2017-12 2.373042
41555 900957.XSHG 2018-12 3.977318
41556 900957.XSHG 2019-12 4.653798
41557 900957.XSHG 2020-12 5.379798
41558 900957.XSHG 2021-12 4.526753

41559 rows × 3 columns

In [86]:
ret_df = pd.merge(ret_df,bm_df,on=['secID','bm_date'])
In [87]:
ret_df[ret_df['ret_date']<='2010-07']
Out[87]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date bm
0 000001.XSHE 2008-07 0.076047 0.003682 0.072365 2008-06 4.140495e+10 1.0672 2007-12 0.197822
1 000001.XSHE 2008-08 -0.028846 0.003604 -0.032450 2008-07 4.455369e+10 1.0966 2007-12 0.197822
2 000001.XSHE 2008-09 -0.257922 0.003591 -0.261513 2008-08 4.326849e+10 1.0386 2007-12 0.197822
3 000001.XSHE 2008-10 -0.271959 0.003522 -0.275481 2008-09 3.210865e+10 1.1184 2007-12 0.197822
4 000001.XSHE 2008-11 0.074075 0.003063 0.071012 2008-10 2.330715e+10 1.1991 2007-12 0.197822
5 000001.XSHE 2008-12 0.052279 0.001908 0.050371 2008-11 2.503361e+10 1.2192 2007-12 0.197822
6 000001.XSHE 2009-01 0.230446 0.001256 0.229190 2008-12 2.634237e+10 1.2206 2007-12 0.197822
7 000001.XSHE 2009-02 0.185567 0.001088 0.184479 2009-01 3.241281e+10 1.2514 2007-12 0.197822
... ... ... ... ... ... ... ... ... ... ...
416073 601999.XSHG 2009-12 -0.025075 0.001516 -0.026591 2009-11 1.709400e+09 1.1428 2008-12 1.413367
416074 601999.XSHG 2010-01 0.145660 0.001553 0.144107 2009-12 1.666480e+09 1.1562 2008-12 1.413367
416075 601999.XSHG 2010-02 0.112405 0.001604 0.110801 2010-01 1.909200e+09 1.0657 2008-12 1.413367
416076 601999.XSHG 2010-03 0.045981 0.001619 0.044362 2010-02 2.123800e+09 1.0307 2008-12 1.413367
416077 601999.XSHG 2010-04 -0.105954 0.001616 -0.107570 2010-03 2.221480e+09 0.9831 2008-12 1.413367
416078 601999.XSHG 2010-05 -0.092406 0.001646 -0.094052 2010-04 1.986160e+09 0.9818 2008-12 1.413367
416079 601999.XSHG 2010-06 -0.162527 0.002004 -0.164531 2010-05 1.802640e+09 0.8813 2008-12 1.413367
416080 601999.XSHG 2010-07 0.161967 0.002134 0.159833 2010-06 1.509600e+09 0.9291 2009-12 0.880399

33946 rows × 10 columns

In [88]:
gc.collect()
Out[88]:
22

Sorting on BM

In [89]:
q = dict()
keys = ['q'+str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q.update(zip(keys,values))
In [90]:
quantile_df = pd.DataFrame()
for key, value in q.items():
    quantile_df[key] = ret_df.groupby(['bm_date'])['bm'].quantile(value)
In [91]:
quantile_df
Out[91]:
q1 q2 q3 q4 q5 q6 q7 q8 q9
bm_date
2007-12 0.164941 0.215784 0.269267 0.319476 0.377518 0.442953 0.541884 0.638771 0.832079
2008-12 0.407748 0.528942 0.642522 0.758567 0.904119 1.072294 1.271045 1.553693 2.021077
2009-12 0.155373 0.208979 0.253932 0.307731 0.363082 0.414168 0.518215 0.687670 0.926290
2010-12 0.145600 0.209428 0.265209 0.323351 0.408239 0.517388 0.679519 0.917926 1.266695
2011-12 0.248262 0.347720 0.446247 0.551224 0.684522 0.845515 1.093722 1.431439 1.936640
2012-12 0.267248 0.368493 0.467594 0.570420 0.693980 0.860729 1.073500 1.383808 1.837972
2013-12 0.209901 0.299102 0.391524 0.469051 0.560393 0.663369 0.795627 0.989655 1.332001
2014-12 0.176442 0.245748 0.307676 0.369487 0.427729 0.497832 0.589496 0.714211 0.907696
2015-12 0.114731 0.163949 0.203959 0.249819 0.295992 0.352860 0.414503 0.516854 0.712337
2016-12 0.164844 0.227940 0.285954 0.341008 0.395355 0.463995 0.543622 0.662794 0.880151
2017-12 0.219440 0.303450 0.386904 0.460945 0.538040 0.630824 0.743408 0.876399 1.140541
2018-12 0.317219 0.438059 0.544472 0.645481 0.761952 0.867876 1.028248 1.224002 1.564038
2019-12 0.234829 0.338785 0.430694 0.523064 0.616038 0.727110 0.875099 1.074392 1.391731
2020-12 0.189233 0.284824 0.369058 0.456295 0.554992 0.663582 0.812182 1.011078 1.379245
2021-12 0.167910 0.250540 0.328266 0.402552 0.496401 0.605125 0.749251 0.973459 1.324875
In [92]:
ret_df_q = pd.merge(ret_df, quantile_df, on='bm_date')
In [93]:
ret_df_q
Out[93]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date bm q1 q2 q3 q4 q5 q6 q7 q8 q9
0 000001.XSHE 2008-07 0.076047 0.003682 0.072365 2008-06 4.140495e+10 1.0672 2007-12 0.197822 0.164941 0.215784 0.269267 0.319476 0.377518 0.442953 0.541884 0.638771 0.832079
1 000001.XSHE 2008-08 -0.028846 0.003604 -0.032450 2008-07 4.455369e+10 1.0966 2007-12 0.197822 0.164941 0.215784 0.269267 0.319476 0.377518 0.442953 0.541884 0.638771 0.832079
2 000001.XSHE 2008-09 -0.257922 0.003591 -0.261513 2008-08 4.326849e+10 1.0386 2007-12 0.197822 0.164941 0.215784 0.269267 0.319476 0.377518 0.442953 0.541884 0.638771 0.832079
3 000001.XSHE 2008-10 -0.271959 0.003522 -0.275481 2008-09 3.210865e+10 1.1184 2007-12 0.197822 0.164941 0.215784 0.269267 0.319476 0.377518 0.442953 0.541884 0.638771 0.832079
4 000001.XSHE 2008-11 0.074075 0.003063 0.071012 2008-10 2.330715e+10 1.1991 2007-12 0.197822 0.164941 0.215784 0.269267 0.319476 0.377518 0.442953 0.541884 0.638771 0.832079
5 000001.XSHE 2008-12 0.052279 0.001908 0.050371 2008-11 2.503361e+10 1.2192 2007-12 0.197822 0.164941 0.215784 0.269267 0.319476 0.377518 0.442953 0.541884 0.638771 0.832079
6 000001.XSHE 2009-01 0.230446 0.001256 0.229190 2008-12 2.634237e+10 1.2206 2007-12 0.197822 0.164941 0.215784 0.269267 0.319476 0.377518 0.442953 0.541884 0.638771 0.832079
7 000001.XSHE 2009-02 0.185567 0.001088 0.184479 2009-01 3.241281e+10 1.2514 2007-12 0.197822 0.164941 0.215784 0.269267 0.319476 0.377518 0.442953 0.541884 0.638771 0.832079
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
455411 689009.XSHG 2022-07 0.115056 0.001620 0.113436 2022-06 2.264534e+10 0.9071 2021-12 0.137716 0.167910 0.250540 0.328266 0.402552 0.496401 0.605125 0.749251 0.973459 1.324875
455412 689009.XSHG 2022-08 -0.112656 0.001366 -0.114022 2022-07 2.525082e+10 0.7987 2021-12 0.137716 0.167910 0.250540 0.328266 0.402552 0.496401 0.605125 0.749251 0.973459 1.324875
455413 689009.XSHG 2022-09 -0.129911 0.001342 -0.131253 2022-08 2.240616e+10 0.8589 2021-12 0.137716 0.167910 0.250540 0.328266 0.402552 0.496401 0.605125 0.749251 0.973459 1.324875
455414 689009.XSHG 2022-10 -0.164709 0.001413 -0.166122 2022-09 1.949535e+10 0.9106 2021-12 0.137716 0.167910 0.250540 0.328266 0.402552 0.496401 0.605125 0.749251 0.973459 1.324875
455415 689009.XSHG 2022-11 0.043125 0.001676 0.041449 2022-10 1.637440e+10 0.7083 2021-12 0.137716 0.167910 0.250540 0.328266 0.402552 0.496401 0.605125 0.749251 0.973459 1.324875
455416 689009.XSHG 2022-12 -0.086579 0.001931 -0.088510 2022-11 1.708055e+10 0.7363 2021-12 0.137716 0.167910 0.250540 0.328266 0.402552 0.496401 0.605125 0.749251 0.973459 1.324875
455417 689009.XSHG 2023-01 0.088554 0.002013 0.086541 2022-12 1.560173e+10 0.6919 2021-12 0.137716 0.167910 0.250540 0.328266 0.402552 0.496401 0.605125 0.749251 0.973459 1.324875
455418 689009.XSHG 2023-02 -0.005725 0.002013 -0.007738 2023-01 1.698332e+10 0.7379 2021-12 0.137716 0.167910 0.250540 0.328266 0.402552 0.496401 0.605125 0.749251 0.973459 1.324875

455419 rows × 19 columns

In [94]:
portfolios = dict()
drop_cols = [col for col in ret_df_q.columns if col[0]=='q']

portfolios['p1'] = ret_df_q.loc[ret_df_q['bm'] <= ret_df_q['q1']].copy().drop(drop_cols, axis=1)
for i in range(2,10):
    idx = (ret_df_q[f'q{i-1}'] <= ret_df_q['bm']) & (ret_df_q['bm'] <= ret_df_q[f'q{i}'])
    portfolios[f'p{i}'] = ret_df_q.loc[idx].copy().drop(drop_cols, axis=1)
portfolios['p10'] = ret_df_q.loc[ret_df_q['bm'] >= ret_df_q['q9']].copy().drop(drop_cols, axis=1)
In [95]:
portfolios['p2']
Out[95]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date bm
0 000001.XSHE 2008-07 0.076047 0.003682 0.072365 2008-06 4.140495e+10 1.0672 2007-12 0.197822
1 000001.XSHE 2008-08 -0.028846 0.003604 -0.032450 2008-07 4.455369e+10 1.0966 2007-12 0.197822
2 000001.XSHE 2008-09 -0.257922 0.003591 -0.261513 2008-08 4.326849e+10 1.0386 2007-12 0.197822
3 000001.XSHE 2008-10 -0.271959 0.003522 -0.275481 2008-09 3.210865e+10 1.1184 2007-12 0.197822
4 000001.XSHE 2008-11 0.074075 0.003063 0.071012 2008-10 2.330715e+10 1.1991 2007-12 0.197822
5 000001.XSHE 2008-12 0.052279 0.001908 0.050371 2008-11 2.503361e+10 1.2192 2007-12 0.197822
6 000001.XSHE 2009-01 0.230446 0.001256 0.229190 2008-12 2.634237e+10 1.2206 2007-12 0.197822
7 000001.XSHE 2009-02 0.185567 0.001088 0.184479 2009-01 3.241281e+10 1.2514 2007-12 0.197822
... ... ... ... ... ... ... ... ... ... ...
455307 688777.XSHG 2022-07 0.097296 0.001620 0.095676 2022-06 1.979643e+10 0.9941 2021-12 0.219181
455308 688777.XSHG 2022-08 -0.002748 0.001366 -0.004114 2022-07 2.172275e+10 0.9504 2021-12 0.219181
455309 688777.XSHG 2022-09 -0.018837 0.001342 -0.020179 2022-08 2.159338e+10 0.8483 2021-12 0.219181
455310 688777.XSHG 2022-10 0.240007 0.001413 0.238594 2022-09 2.500795e+10 0.7970 2021-12 0.219181
455311 688777.XSHG 2022-11 -0.061317 0.001676 -0.062993 2022-10 3.101025e+10 0.7226 2021-12 0.219181
455312 688777.XSHG 2022-12 -0.001103 0.001931 -0.003034 2022-11 2.984336e+10 0.3677 2021-12 0.219181
455313 688777.XSHG 2023-01 0.013322 0.002013 0.011309 2022-12 2.981054e+10 0.3575 2021-12 0.219181
455314 688777.XSHG 2023-02 0.064761 0.002013 0.062748 2023-01 3.020767e+10 0.2920 2021-12 0.219181

45663 rows × 10 columns

return by portfolios

In [96]:
portfolios['p2'].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 45663 entries, 0 to 455314
Data columns (total 10 columns):
secID               45663 non-null object
ret_date            45663 non-null period[M]
ret                 45663 non-null float64
rf                  45663 non-null float64
exret               45663 non-null float64
mktcap_beta_date    45663 non-null period[M]
mkt_cap             45663 non-null float64
beta                44580 non-null float64
bm_date             45663 non-null period[M]
bm                  45663 non-null float64
dtypes: float64(6), object(1), period[M](3)
memory usage: 3.8+ MB
In [97]:
for k in portfolios.keys():
    print(portfolios[k].groupby(['ret_date'])['exret'].mean().mean())
0.0072350610306788
0.008844749962653749
0.010848977354697274
0.011225529519617812
0.011914979445716047
0.012396644493784302
0.012503499785698357
0.012803981513003439
0.011792982655524654
0.009439461559541758
In [98]:
portfolios_crs_mean = dict()
for k in portfolios.keys():
    portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()
In [99]:
portfolios_crs_mean['p1']
Out[99]:
ret_date
2008-07    0.066537
2008-08   -0.257046
2008-09   -0.075499
2008-10   -0.280811
2008-11    0.210002
2008-12    0.066290
2009-01    0.161217
2009-02    0.085603
             ...   
2022-07   -0.005150
2022-08   -0.045974
2022-09   -0.085512
2022-10    0.025376
2022-11    0.055098
2022-12   -0.030963
2023-01    0.081680
2023-02   -0.006816
Freq: M, Name: exret, Length: 176, dtype: float64

More robust by adjusting Newey-West Errors

In [100]:
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p9'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p9-p1'] = reg.params[0]
t_values['p9-p1'] = reg.tvalues[0]
In [101]:
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
             columns=mean_values.keys())
Out[101]:
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p9-p1
mean 0.007235 0.008845 0.010849 0.011226 0.011915 0.012397 0.012503 0.012804 0.011793 0.009439 0.004558
t-value 1.103164 1.368736 1.683617 1.744812 1.861352 1.843339 1.883647 1.968620 1.796847 1.499882 2.170096

既然一年调一次仓,年收益率呢?

In [102]:
portfolios['p1']
Out[102]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date bm
382 000060.XSHE 2008-07 -0.006425 0.003682 -0.010107 2008-06 9.792492e+09 1.4188 2007-12 0.161566
383 000060.XSHE 2008-08 -0.241201 0.003604 -0.244805 2008-07 9.729630e+09 1.4021 2007-12 0.161566
384 000060.XSHE 2008-09 -0.013250 0.003591 -0.016841 2008-08 7.382785e+09 1.3925 2007-12 0.161566
385 000060.XSHE 2008-10 -0.380633 0.003522 -0.384155 2008-09 7.285000e+09 1.3134 2007-12 0.161566
386 000060.XSHE 2008-11 0.145518 0.003063 0.142455 2008-10 4.512090e+09 1.3197 2007-12 0.161566
387 000060.XSHE 2008-12 0.054054 0.001908 0.052146 2008-11 5.511400e+09 1.2790 2007-12 0.161566
388 000060.XSHE 2009-01 0.352555 0.001256 0.351299 2008-12 5.809067e+09 1.2843 2007-12 0.161566
389 000060.XSHE 2009-02 0.054033 0.001088 0.052945 2009-01 7.857136e+09 1.3517 2007-12 0.161566
... ... ... ... ... ... ... ... ... ... ...
455411 689009.XSHG 2022-07 0.115056 0.001620 0.113436 2022-06 2.264534e+10 0.9071 2021-12 0.137716
455412 689009.XSHG 2022-08 -0.112656 0.001366 -0.114022 2022-07 2.525082e+10 0.7987 2021-12 0.137716
455413 689009.XSHG 2022-09 -0.129911 0.001342 -0.131253 2022-08 2.240616e+10 0.8589 2021-12 0.137716
455414 689009.XSHG 2022-10 -0.164709 0.001413 -0.166122 2022-09 1.949535e+10 0.9106 2021-12 0.137716
455415 689009.XSHG 2022-11 0.043125 0.001676 0.041449 2022-10 1.637440e+10 0.7083 2021-12 0.137716
455416 689009.XSHG 2022-12 -0.086579 0.001931 -0.088510 2022-11 1.708055e+10 0.7363 2021-12 0.137716
455417 689009.XSHG 2023-01 0.088554 0.002013 0.086541 2022-12 1.560173e+10 0.6919 2021-12 0.137716
455418 689009.XSHG 2023-02 -0.005725 0.002013 -0.007738 2023-01 1.698332e+10 0.7379 2021-12 0.137716

45634 rows × 10 columns

In [103]:
portfolios[k]['1+ret'] = portfolios[k]['ret']+1
portfolios[k]['1+rf'] = portfolios[k]['rf']+1
In [104]:
portfolios[k]
Out[104]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date bm 1+ret 1+rf
60 000016.XSHE 2008-07 0.077325 0.003682 0.073643 2008-06 2.249663e+09 1.0276 2007-12 1.425649 1.077325 1.003682
61 000016.XSHE 2008-08 -0.185635 0.003604 -0.189239 2008-07 2.423636e+09 1.0078 2007-12 1.425649 0.814365 1.003604
62 000016.XSHE 2008-09 -0.024326 0.003591 -0.027917 2008-08 1.973704e+09 1.0877 2007-12 1.425649 0.975674 1.003591
63 000016.XSHE 2008-10 -0.115267 0.003522 -0.118789 2008-09 1.925711e+09 1.0745 2007-12 1.425649 0.884733 1.003522
64 000016.XSHE 2008-11 0.151419 0.003063 0.148356 2008-10 1.703744e+09 1.0176 2007-12 1.425649 1.151419 1.003063
65 000016.XSHE 2008-12 -0.036712 0.001908 -0.038620 2008-11 1.961706e+09 1.0640 2007-12 1.425649 0.963288 1.001908
66 000016.XSHE 2009-01 0.085735 0.001256 0.084479 2008-12 1.889712e+09 1.0535 2007-12 1.425649 1.085735 1.001256
67 000016.XSHE 2009-02 0.266060 0.001088 0.264972 2009-01 2.051688e+09 1.0306 2007-12 1.425649 1.266060 1.001088
... ... ... ... ... ... ... ... ... ... ... ... ...
455395 688819.XSHG 2022-07 0.210540 0.001620 0.208920 2022-06 4.443776e+09 1.0191 2021-12 2.699231 1.210540 1.001620
455396 688819.XSHG 2022-08 -0.069037 0.001366 -0.070403 2022-07 5.379307e+09 1.1129 2021-12 2.699231 0.930963 1.001366
455397 688819.XSHG 2022-09 -0.155770 0.001342 -0.157112 2022-08 5.069038e+09 0.9753 2021-12 2.699231 0.844230 1.001342
455398 688819.XSHG 2022-10 0.246820 0.001413 0.245407 2022-09 4.279481e+09 1.1159 2021-12 2.699231 1.246820 1.001413
455399 688819.XSHG 2022-11 -0.070214 0.001676 -0.071890 2022-10 5.335693e+09 1.0272 2021-12 2.699231 0.929786 1.001676
455400 688819.XSHG 2022-12 -0.036704 0.001931 -0.038635 2022-11 4.961076e+09 0.6504 2021-12 2.699231 0.963296 1.001931
455401 688819.XSHG 2023-01 0.074307 0.002013 0.072294 2022-12 4.778970e+09 0.5877 2021-12 2.699231 1.074307 1.002013
455402 688819.XSHG 2023-02 -0.058785 0.002013 -0.060798 2023-01 5.166623e+09 0.5594 2021-12 2.699231 0.941215 1.002013

45646 rows × 12 columns

In [105]:
portfolios[k].groupby(['secID','bm_date'],as_index=False)['1+ret'].prod()
Out[105]:
secID bm_date 1+ret
0 000001.XSHE 2013-12 1.776255
1 000001.XSHE 2015-12 1.079310
2 000001.XSHE 2016-12 0.982198
3 000011.XSHE 2014-12 0.904518
4 000011.XSHE 2015-12 1.508421
5 000011.XSHE 2018-12 1.744011
6 000016.XSHE 2007-12 1.311983
7 000016.XSHE 2010-12 0.849273
... ... ... ...
3957 688722.XSHG 2021-12 1.234767
3958 688728.XSHG 2021-12 0.838615
3959 688737.XSHG 2021-12 0.996613
3960 688739.XSHG 2021-12 0.668178
3961 688767.XSHG 2021-12 0.752507
3962 688799.XSHG 2021-12 1.125962
3963 688819.XSHG 2021-12 1.074326
3964 688981.XSHG 2020-12 0.730670

3965 rows × 3 columns

In [106]:
pf_year_ret = {}
for k in portfolios.keys():
    portfolios[k]['1+ret'] = portfolios[k]['ret']+1
    portfolios[k]['1+rf'] = portfolios[k]['rf']+1
    pf_year_ret[k] = portfolios[k].groupby(['secID','bm_date'],as_index=False)['1+ret'].prod()
    pf_year_ret[k]['1+rf'] = portfolios[k].groupby(['secID','bm_date'],as_index=False)['1+rf'].prod()['1+rf']
    pf_year_ret[k]['ret'] = pf_year_ret[k]['1+ret'] - 1
    pf_year_ret[k]['rf'] = pf_year_ret[k]['1+rf'] - 1
    pf_year_ret[k]['exret'] = pf_year_ret[k]['ret'] - pf_year_ret[k]['rf']
In [107]:
pf_year_ret['p1']
Out[107]:
secID bm_date 1+ret 1+rf ret rf exret
0 000004.XSHE 2011-12 1.180620 1.039890 0.180620 0.039890 0.140730
1 000004.XSHE 2012-12 1.388007 1.052017 0.388007 0.052017 0.335990
2 000004.XSHE 2013-12 2.773781 1.045296 1.773781 0.045296 1.728485
3 000004.XSHE 2014-12 0.956622 1.030621 -0.043378 0.030621 -0.073999
4 000004.XSHE 2015-12 0.696445 1.036393 -0.303555 0.036393 -0.339948
5 000004.XSHE 2016-12 0.764129 1.045471 -0.235871 0.045471 -0.281341
6 000004.XSHE 2017-12 1.082653 1.030115 0.082653 0.030115 0.052539
7 000004.XSHE 2018-12 1.406224 1.024673 0.406224 0.024673 0.381552
... ... ... ... ... ... ... ...
4047 688390.XSHG 2021-12 1.241143 1.013452 0.241143 0.013452 0.227691
4048 688516.XSHG 2021-12 1.064073 1.013452 0.064073 0.013452 0.050621
4049 688518.XSHG 2021-12 1.040875 1.013452 0.040875 0.013452 0.027423
4050 688536.XSHG 2021-12 0.676111 1.013452 -0.323889 0.013452 -0.337341
4051 688556.XSHG 2021-12 0.915741 1.013452 -0.084259 0.013452 -0.097711
4052 688595.XSHG 2021-12 0.731415 1.013452 -0.268585 0.013452 -0.282037
4053 688598.XSHG 2021-12 0.733828 1.013452 -0.266172 0.013452 -0.279624
4054 689009.XSHG 2021-12 0.741573 1.013452 -0.258427 0.013452 -0.271879

4055 rows × 7 columns

In [108]:
portfolios_crs_mean = dict()
for k in pf_year_ret.keys():
    portfolios_crs_mean[k] = pf_year_ret[k].groupby(['bm_date'])['exret'].mean()
In [109]:
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
In [110]:
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
             columns=mean_values.keys())
Out[110]:
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p10-p1
mean 0.068864 0.087974 0.118135 0.126376 0.135423 0.151750 0.148870 0.158574 0.143195 0.117429 0.048565
t-value 1.284789 1.733457 2.183006 2.307970 2.753879 2.273473 2.394727 2.339713 2.098485 1.806868 1.290878

Portfolio characteristics other than return

In [111]:
# average beta in each portfolio
for key in portfolios.keys():
    print(portfolios[key].groupby('bm_date')['bm'].mean().mean())  
0.14615646005784907
0.2558344970352621
0.33475874587831866
0.41124253266416716
0.4935211173021713
0.5892265347229694
0.7088981617503399
0.8728986534940015
1.1189011915748628
2.143066708794292
In [112]:
portfolios['p1'].groupby('bm_date')['secID'].nunique()
Out[112]:
bm_date
2007-12    134
2008-12    143
2009-12    153
2010-12    189
2011-12    222
2012-12    237
2013-12    244
2014-12    261
2015-12    288
2016-12    303
2017-12    339
2018-12    346
2019-12    364
2020-12    394
2021-12    438
Freq: M, Name: secID, dtype: int64
In [113]:
pf_n_stks = pd.DataFrame()
for key, value in portfolios.items():
    pf_n_stks[key] = portfolios[key].groupby('bm_date')['secID'].nunique()

display(pf_n_stks)

pf_n_stks.plot()
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10
bm_date
2007-12 134 132 133 131 132 133 132 132 131 132
2008-12 143 142 144 142 144 141 142 142 142 141
2009-12 153 152 151 151 151 151 150 152 149 150
2010-12 189 186 187 186 187 185 186 185 185 185
2011-12 222 218 216 216 215 215 216 215 215 214
2012-12 237 240 234 235 237 236 235 234 234 235
2013-12 244 242 242 240 241 240 241 240 239 240
2014-12 261 254 252 252 254 251 254 253 252 251
2015-12 288 275 273 274 273 271 274 272 274 271
2016-12 303 294 294 292 298 294 293 294 293 292
2017-12 339 337 339 339 338 335 335 336 335 335
2018-12 346 344 340 340 342 343 341 341 341 342
2019-12 364 360 360 354 355 355 354 355 353 354
2020-12 394 389 388 388 386 389 387 387 387 386
2021-12 438 438 437 438 437 438 438 437 438 437
Out[113]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7ba20f1550>
In [114]:
portfolios['p10'].groupby('bm_date')['mkt_cap'].mean()/1e9
Out[114]:
bm_date
2007-12     9.520442
2008-12    24.801299
2009-12    23.330185
2010-12     7.387187
2011-12     4.310136
2012-12     6.852996
2013-12    24.818758
2014-12    30.266811
2015-12    46.521432
2016-12    49.633862
2017-12    25.116625
2018-12    23.554655
2019-12    25.433189
2020-12    27.416370
2021-12    23.515428
Freq: M, Name: mkt_cap, dtype: float64
In [115]:
pf_mktcap = pd.DataFrame()
for key, value in portfolios.items():
    pf_mktcap[key] = portfolios[key].groupby('bm_date')['mkt_cap'].mean()

display(pf_mktcap)

pf_mktcap.plot()
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10
bm_date
2007-12 5.715808e+09 4.795101e+09 2.882950e+09 3.434211e+09 3.840968e+09 3.304378e+09 2.571075e+09 2.325255e+09 1.692608e+09 9.520442e+09
2008-12 8.177124e+09 9.130319e+09 7.149702e+09 6.599234e+09 7.352643e+09 5.789497e+09 5.991050e+09 5.301314e+09 7.721629e+09 2.480130e+10
2009-12 1.067969e+10 8.379840e+09 7.375834e+09 8.714215e+09 9.299599e+09 1.347153e+10 1.388075e+10 1.177886e+10 4.367188e+09 2.333018e+10
2010-12 9.788895e+09 6.836670e+09 5.396383e+09 5.858096e+09 5.716337e+09 1.145075e+10 1.965841e+10 1.488090e+10 7.589464e+09 7.387187e+09
2011-12 8.593670e+09 5.343388e+09 5.859148e+09 8.330202e+09 1.316118e+10 8.513601e+09 1.432788e+10 7.445566e+09 3.744470e+09 4.310136e+09
2012-12 7.913350e+09 7.126788e+09 5.878258e+09 6.422015e+09 8.262964e+09 1.419814e+10 1.351916e+10 4.738759e+09 6.146973e+09 6.852996e+09
2013-12 1.039635e+10 9.947602e+09 8.807784e+09 8.933609e+09 9.016754e+09 7.248690e+09 1.388587e+10 1.694208e+10 1.861084e+10 2.481876e+10
2014-12 1.116183e+10 1.250609e+10 1.256581e+10 1.090602e+10 1.280498e+10 1.061681e+10 1.042628e+10 1.919891e+10 1.367151e+10 3.026681e+10
2015-12 9.341924e+09 9.016056e+09 1.004300e+10 9.985461e+09 9.288530e+09 1.068221e+10 1.002104e+10 1.043531e+10 1.542386e+10 4.652143e+10
2016-12 8.644937e+09 1.294780e+10 8.000102e+09 9.380542e+09 1.058725e+10 8.635930e+09 1.149724e+10 1.146498e+10 1.620886e+10 4.963386e+10
2017-12 1.783157e+10 9.153566e+09 7.178220e+09 9.078995e+09 8.969729e+09 8.541627e+09 8.122416e+09 1.129418e+10 1.205107e+10 2.511663e+10
2018-12 2.197334e+10 1.316444e+10 8.406058e+09 8.504631e+09 1.027894e+10 9.095954e+09 1.229830e+10 1.528553e+10 1.136744e+10 2.355466e+10
2019-12 4.097754e+10 1.884871e+10 1.227371e+10 8.953288e+09 1.343037e+10 1.333013e+10 1.564740e+10 1.098639e+10 1.182802e+10 2.543319e+10
2020-12 5.045492e+10 1.591886e+10 1.302870e+10 1.052350e+10 8.420125e+09 1.188646e+10 1.356596e+10 1.104882e+10 1.155407e+10 2.741637e+10
2021-12 3.762744e+10 1.515851e+10 1.245366e+10 1.022247e+10 9.673234e+09 8.130061e+09 8.767111e+09 1.278490e+10 1.199148e+10 2.351543e+10
Out[115]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7ba1cf1350>
In [116]:
pf_mktcap = pf_mktcap / 1e10
for i in range(10):
    print(pf_mktcap.mean()[i])
1.7285225763373466
1.0551583098923218
0.8486621563464765
0.8389765772572633
0.934024016924777
0.9659718376251994
1.1611995866399725
1.1060784470939953
1.0264631375974875
2.349862499307394

BM 1年调仓单排结论

排序方法:t年7月至t+1年6月,按照t-1年12月的BM排序,考察区间内每月平均收益率以及区间年平均收益率

结论:

  • 月、年平均收益率呈现微弱递增,但p10的收益率较差。年收益的显著性比较强。
  • 最大BM组(也即估值最低组)的market cap起伏很大
  • 最小BM组(也即估值最高组)的market cap在样本后期显著增大
  • BM的效应可能和market cap有关系

BM Point-in-Time

In [117]:
del portfolios, portfolios_crs_mean
In [118]:
# fundmen_df = DataAPI.FdmtBSGet(secID=stk_id,beginDate=START,endDate=END,publishDateEnd=u"",publishDateBegin=u"",endDateRep="",beginDateRep="",beginYear="",endYear="",fiscalPeriod="",field=["secID","publishDate","endDate","endDateRep","actPubtime","fiscalPeriod","TShEquity","TEquityAttrP","minorityInt"],pandas="1")

# fundmen_df.to_pickle('./data/fundmen_df_pit.pkl')
In [119]:
fundmen_df = pd.read_pickle('./data/fundmen_df_pit.pkl')
In [120]:
fundmen_df[['publishDate','endDate']] = fundmen_df[['publishDate','endDate']].apply(pd.to_datetime)
In [121]:
fundmen_df.sort_values(['secID','publishDate','endDate'],inplace=True)
In [122]:
fundmen_df
Out[122]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
121 000001.XSHE 2007-04-26 2007-03-31 2007-03-31 2007-04-25 18:00:00 3 7.106094e+09 7.106094e+09 NaN
120 000001.XSHE 2007-08-16 2007-06-30 2007-06-30 2007-08-15 18:00:00 6 7.698478e+09 7.698478e+09 NaN
119 000001.XSHE 2007-10-23 2007-09-30 2007-09-30 2007-10-22 18:00:00 9 8.363553e+09 8.363553e+09 NaN
118 000001.XSHE 2008-03-20 2007-12-31 2007-12-31 2008-03-19 18:00:00 12 1.300606e+10 1.300606e+10 NaN
117 000001.XSHE 2008-04-24 2007-12-31 2008-03-31 2008-04-23 18:00:00 12 1.300606e+10 1.300606e+10 NaN
113 000001.XSHE 2008-04-24 2008-03-31 2008-03-31 2008-04-23 18:00:00 3 1.404138e+10 1.404138e+10 NaN
116 000001.XSHE 2008-08-21 2007-12-31 2008-06-30 2008-08-20 18:00:00 12 1.300606e+10 1.300606e+10 NaN
112 000001.XSHE 2008-08-21 2008-06-30 2008-06-30 2008-08-20 18:00:00 6 1.694330e+10 1.694330e+10 NaN
... ... ... ... ... ... ... ... ... ...
433177 900957.XSHG 2022-04-20 2020-12-31 2021-12-31 2022-04-19 17:15:56 12 4.987276e+08 4.979110e+08 816555.06
433173 900957.XSHG 2022-04-20 2021-12-31 2021-12-31 2022-04-19 17:15:56 12 5.263733e+08 5.255741e+08 799194.04
433172 900957.XSHG 2022-04-30 2021-12-31 2022-03-31 2022-04-29 15:36:38 12 5.263733e+08 5.255741e+08 799194.04
433169 900957.XSHG 2022-04-30 2022-03-31 2022-03-31 2022-04-29 15:36:38 3 5.341491e+08 5.333509e+08 798170.28
433171 900957.XSHG 2022-08-16 2021-12-31 2022-06-30 2022-08-15 16:24:24 12 5.263733e+08 5.255741e+08 799194.04
433168 900957.XSHG 2022-08-16 2022-06-30 2022-06-30 2022-08-15 16:24:24 6 5.483870e+08 5.476224e+08 764620.52
433170 900957.XSHG 2022-10-28 2021-12-31 2022-09-30 2022-10-27 16:41:28 12 5.263733e+08 5.255741e+08 799194.04
433167 900957.XSHG 2022-10-28 2022-09-30 2022-09-30 2022-10-27 16:41:28 9 5.566301e+08 5.558669e+08 763140.90

433293 rows × 9 columns

In [123]:
fundmen_df.sort_values(['secID','publishDate'])
Out[123]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
121 000001.XSHE 2007-04-26 2007-03-31 2007-03-31 2007-04-25 18:00:00 3 7.106094e+09 7.106094e+09 NaN
120 000001.XSHE 2007-08-16 2007-06-30 2007-06-30 2007-08-15 18:00:00 6 7.698478e+09 7.698478e+09 NaN
119 000001.XSHE 2007-10-23 2007-09-30 2007-09-30 2007-10-22 18:00:00 9 8.363553e+09 8.363553e+09 NaN
118 000001.XSHE 2008-03-20 2007-12-31 2007-12-31 2008-03-19 18:00:00 12 1.300606e+10 1.300606e+10 NaN
117 000001.XSHE 2008-04-24 2007-12-31 2008-03-31 2008-04-23 18:00:00 12 1.300606e+10 1.300606e+10 NaN
113 000001.XSHE 2008-04-24 2008-03-31 2008-03-31 2008-04-23 18:00:00 3 1.404138e+10 1.404138e+10 NaN
116 000001.XSHE 2008-08-21 2007-12-31 2008-06-30 2008-08-20 18:00:00 12 1.300606e+10 1.300606e+10 NaN
112 000001.XSHE 2008-08-21 2008-06-30 2008-06-30 2008-08-20 18:00:00 6 1.694330e+10 1.694330e+10 NaN
... ... ... ... ... ... ... ... ... ...
433177 900957.XSHG 2022-04-20 2020-12-31 2021-12-31 2022-04-19 17:15:56 12 4.987276e+08 4.979110e+08 816555.06
433173 900957.XSHG 2022-04-20 2021-12-31 2021-12-31 2022-04-19 17:15:56 12 5.263733e+08 5.255741e+08 799194.04
433172 900957.XSHG 2022-04-30 2021-12-31 2022-03-31 2022-04-29 15:36:38 12 5.263733e+08 5.255741e+08 799194.04
433169 900957.XSHG 2022-04-30 2022-03-31 2022-03-31 2022-04-29 15:36:38 3 5.341491e+08 5.333509e+08 798170.28
433171 900957.XSHG 2022-08-16 2021-12-31 2022-06-30 2022-08-15 16:24:24 12 5.263733e+08 5.255741e+08 799194.04
433168 900957.XSHG 2022-08-16 2022-06-30 2022-06-30 2022-08-15 16:24:24 6 5.483870e+08 5.476224e+08 764620.52
433170 900957.XSHG 2022-10-28 2021-12-31 2022-09-30 2022-10-27 16:41:28 12 5.263733e+08 5.255741e+08 799194.04
433167 900957.XSHG 2022-10-28 2022-09-30 2022-09-30 2022-10-27 16:41:28 9 5.566301e+08 5.558669e+08 763140.90

433293 rows × 9 columns

In [124]:
fundmen_df = fundmen_df.groupby(['secID','publishDate'],as_index=False).last() #不涉及上上个报表的信息
In [125]:
fundmen_df
Out[125]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
0 000001.XSHE 2007-04-26 2007-03-31 2007-03-31 2007-04-25 18:00:00 3 7.106094e+09 7.106094e+09 NaN
1 000001.XSHE 2007-08-16 2007-06-30 2007-06-30 2007-08-15 18:00:00 6 7.698478e+09 7.698478e+09 NaN
2 000001.XSHE 2007-10-23 2007-09-30 2007-09-30 2007-10-22 18:00:00 9 8.363553e+09 8.363553e+09 NaN
3 000001.XSHE 2008-03-20 2007-12-31 2007-12-31 2008-03-19 18:00:00 12 1.300606e+10 1.300606e+10 NaN
4 000001.XSHE 2008-04-24 2008-03-31 2008-03-31 2008-04-23 18:00:00 3 1.404138e+10 1.404138e+10 NaN
5 000001.XSHE 2008-08-21 2008-06-30 2008-06-30 2008-08-20 18:00:00 6 1.694330e+10 1.694330e+10 NaN
6 000001.XSHE 2008-10-24 2008-09-30 2008-09-30 2008-10-23 18:00:00 9 1.837466e+10 1.837466e+10 NaN
7 000001.XSHE 2009-03-20 2008-12-31 2008-12-31 2009-03-19 18:00:00 12 1.640079e+10 1.640079e+10 NaN
... ... ... ... ... ... ... ... ... ...
195857 900957.XSHG 2021-04-09 2020-12-31 2020-12-31 2021-04-08 18:13:16 12 4.987276e+08 4.979110e+08 816555.06
195858 900957.XSHG 2021-04-27 2021-03-31 2021-03-31 2021-04-26 16:30:45 3 5.070935e+08 5.062701e+08 823373.23
195859 900957.XSHG 2021-08-12 2021-06-30 2021-06-30 2021-08-11 16:03:10 6 5.136414e+08 5.128208e+08 820511.29
195860 900957.XSHG 2021-10-29 2021-09-30 2021-09-30 2021-10-28 15:35:42 9 5.197039e+08 5.188844e+08 819528.99
195861 900957.XSHG 2022-04-20 2021-12-31 2021-12-31 2022-04-19 17:15:56 12 5.263733e+08 5.255741e+08 799194.04
195862 900957.XSHG 2022-04-30 2022-03-31 2022-03-31 2022-04-29 15:36:38 3 5.341491e+08 5.333509e+08 798170.28
195863 900957.XSHG 2022-08-16 2022-06-30 2022-06-30 2022-08-15 16:24:24 6 5.483870e+08 5.476224e+08 764620.52
195864 900957.XSHG 2022-10-28 2022-09-30 2022-09-30 2022-10-27 16:41:28 9 5.566301e+08 5.558669e+08 763140.90

195865 rows × 9 columns

In [126]:
fundmen_df = fundmen_df[(fundmen_df['secID'].str.endswith('XSHE')) | (fundmen_df['secID'].str.endswith('XSHG'))]
In [127]:
# # minorityInt 有时报告,有时不报告。空值时,假设就是上一次报告的值
# # fundmen_df['minorityInt'] = fundmen_df.groupby('secID')['minorityInt'].fillna(method='ffill')
# # 第一轮填完空值为有效数值后,剩下的空值再用0填充。
# fundmen_df['minorityInt'].fillna(0,inplace=True)
  • 假设是上一次报告的值可能出现误差,因为股权变动了(注意ffill的方法)
  • 直接用TEquityAttrP
In [128]:
# fundmen_df['book'] = fundmen_df['TShEquity'] - fundmen_df['minorityInt']
fundmen_df['book'] = fundmen_df['TEquityAttrP']
In [129]:
fundmen_df
Out[129]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt book
0 000001.XSHE 2007-04-26 2007-03-31 2007-03-31 2007-04-25 18:00:00 3 7.106094e+09 7.106094e+09 NaN 7.106094e+09
1 000001.XSHE 2007-08-16 2007-06-30 2007-06-30 2007-08-15 18:00:00 6 7.698478e+09 7.698478e+09 NaN 7.698478e+09
2 000001.XSHE 2007-10-23 2007-09-30 2007-09-30 2007-10-22 18:00:00 9 8.363553e+09 8.363553e+09 NaN 8.363553e+09
3 000001.XSHE 2008-03-20 2007-12-31 2007-12-31 2008-03-19 18:00:00 12 1.300606e+10 1.300606e+10 NaN 1.300606e+10
4 000001.XSHE 2008-04-24 2008-03-31 2008-03-31 2008-04-23 18:00:00 3 1.404138e+10 1.404138e+10 NaN 1.404138e+10
5 000001.XSHE 2008-08-21 2008-06-30 2008-06-30 2008-08-20 18:00:00 6 1.694330e+10 1.694330e+10 NaN 1.694330e+10
6 000001.XSHE 2008-10-24 2008-09-30 2008-09-30 2008-10-23 18:00:00 9 1.837466e+10 1.837466e+10 NaN 1.837466e+10
7 000001.XSHE 2009-03-20 2008-12-31 2008-12-31 2009-03-19 18:00:00 12 1.640079e+10 1.640079e+10 NaN 1.640079e+10
... ... ... ... ... ... ... ... ... ... ...
195857 900957.XSHG 2021-04-09 2020-12-31 2020-12-31 2021-04-08 18:13:16 12 4.987276e+08 4.979110e+08 816555.06 4.979110e+08
195858 900957.XSHG 2021-04-27 2021-03-31 2021-03-31 2021-04-26 16:30:45 3 5.070935e+08 5.062701e+08 823373.23 5.062701e+08
195859 900957.XSHG 2021-08-12 2021-06-30 2021-06-30 2021-08-11 16:03:10 6 5.136414e+08 5.128208e+08 820511.29 5.128208e+08
195860 900957.XSHG 2021-10-29 2021-09-30 2021-09-30 2021-10-28 15:35:42 9 5.197039e+08 5.188844e+08 819528.99 5.188844e+08
195861 900957.XSHG 2022-04-20 2021-12-31 2021-12-31 2022-04-19 17:15:56 12 5.263733e+08 5.255741e+08 799194.04 5.255741e+08
195862 900957.XSHG 2022-04-30 2022-03-31 2022-03-31 2022-04-29 15:36:38 3 5.341491e+08 5.333509e+08 798170.28 5.333509e+08
195863 900957.XSHG 2022-08-16 2022-06-30 2022-06-30 2022-08-15 16:24:24 6 5.483870e+08 5.476224e+08 764620.52 5.476224e+08
195864 900957.XSHG 2022-10-28 2022-09-30 2022-09-30 2022-10-27 16:41:28 9 5.566301e+08 5.558669e+08 763140.90 5.558669e+08

195800 rows × 10 columns

  • 考虑到报表公布时间可能在当天收市以后,以及报表解读可能需要时间,把publishDate往后加1
  • publishDate和 tradeDate merge,
  • publishDate可能是非交易日,所以merge时要outer,左右表格都不丢观测值。
  • 接着要把tradeDate为空值的设置为publishDate,便于排序,方便下面填充
  • 然后按照secID和tradeDate sort,因为publishDate非交易日的被放到merge表格的最后了。
  • 接着 book 空值由上面第一个非空值填充(当时已知的最新的book value)
  • 再把 na 都丢弃即可
In [130]:
# fundmen_df['publishDate+1'] = fundmen_df['publishDate'] + dt.timedelta(days=1)
In [131]:
stk_fundmen_df = pd.merge(stk_df, fundmen_df[['secID','publishDate','endDate','book']], 
                          left_on=['secID','tradeDate'], right_on=['secID','publishDate'],
                          how='outer')
In [132]:
stk_fundmen_df
Out[132]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
0 000001.XSHE 2007-06-20 987.007 4.835036e+10 2007-06 NaT NaT NaN
1 000001.XSHE 2007-06-21 1085.740 5.318694e+10 2007-06 NaT NaT NaN
2 000001.XSHE 2007-06-22 1120.233 5.487665e+10 2007-06 NaT NaT NaN
3 000001.XSHE 2007-06-25 1113.904 5.456661e+10 2007-06 NaT NaT NaN
4 000001.XSHE 2007-06-26 1113.904 5.456661e+10 2007-06 NaT NaT NaN
5 000001.XSHE 2007-06-27 1019.602 4.994705e+10 2007-06 NaT NaT NaN
6 000001.XSHE 2007-06-28 953.780 4.672266e+10 2007-06 NaT NaT NaN
7 000001.XSHE 2007-06-29 870.870 4.266117e+10 2007-06 NaT NaT NaN
... ... ... ... ... ... ... ... ...
11105027 900957.XSHG NaT NaN NaN NaT 2015-08-15 2015-06-30 3.952715e+08
11105028 900957.XSHG NaT NaN NaN NaT 2016-04-23 2016-03-31 4.005150e+08
11105029 900957.XSHG NaT NaN NaN NaT 2016-08-06 2016-06-30 3.906354e+08
11105030 900957.XSHG NaT NaN NaN NaT 2017-03-25 2016-12-31 3.930721e+08
11105031 900957.XSHG NaT NaN NaN NaT 2019-03-30 2018-12-31 4.508051e+08
11105032 900957.XSHG NaT NaN NaN NaT 2019-08-10 2019-06-30 4.618426e+08
11105033 900957.XSHG NaT NaN NaN NaT 2020-04-25 2019-12-31 4.761021e+08
11105034 900957.XSHG NaT NaN NaN NaT 2022-04-30 2022-03-31 5.333509e+08

11105035 rows × 8 columns

In [133]:
idx = stk_fundmen_df.loc[stk_fundmen_df['tradeDate'].isna()].index
In [134]:
stk_fundmen_df.loc[stk_fundmen_df['tradeDate'].isna(),'tradeDate'] = stk_fundmen_df.loc[stk_fundmen_df['tradeDate'].isna(),'publishDate']
In [135]:
stk_fundmen_df.loc[idx]
Out[135]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
11048078 000001.XSHE 2007-04-26 NaN NaN NaT 2007-04-26 2007-03-31 7.106094e+09
11048079 000001.XSHE 2017-04-22 NaN NaN NaT 2017-04-22 2017-03-31 2.077390e+11
11048080 000001.XSHE 2017-10-21 NaN NaN NaT 2017-10-21 2017-09-30 2.181110e+11
11048081 000002.XSHE 2022-10-29 NaN NaN NaT 2022-10-29 2022-09-30 2.411070e+11
11048082 000003.XSHE 2008-04-30 NaN NaN NaT 2008-04-30 2007-12-31 -2.889290e+09
11048083 000003.XSHE 2008-08-29 NaN NaN NaT 2008-08-29 2008-06-30 -2.872336e+09
11048084 000003.XSHE 2009-04-30 NaN NaN NaT 2009-04-30 2008-12-31 -2.983208e+09
11048085 000003.XSHE 2009-08-28 NaN NaN NaT 2009-08-28 2009-06-30 -2.964330e+09
... ... ... ... ... ... ... ... ...
11105027 900957.XSHG 2015-08-15 NaN NaN NaT 2015-08-15 2015-06-30 3.952715e+08
11105028 900957.XSHG 2016-04-23 NaN NaN NaT 2016-04-23 2016-03-31 4.005150e+08
11105029 900957.XSHG 2016-08-06 NaN NaN NaT 2016-08-06 2016-06-30 3.906354e+08
11105030 900957.XSHG 2017-03-25 NaN NaN NaT 2017-03-25 2016-12-31 3.930721e+08
11105031 900957.XSHG 2019-03-30 NaN NaN NaT 2019-03-30 2018-12-31 4.508051e+08
11105032 900957.XSHG 2019-08-10 NaN NaN NaT 2019-08-10 2019-06-30 4.618426e+08
11105033 900957.XSHG 2020-04-25 NaN NaN NaT 2020-04-25 2019-12-31 4.761021e+08
11105034 900957.XSHG 2022-04-30 NaN NaN NaT 2022-04-30 2022-03-31 5.333509e+08

56957 rows × 8 columns

In [136]:
stk_fundmen_df
Out[136]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
0 000001.XSHE 2007-06-20 987.007 4.835036e+10 2007-06 NaT NaT NaN
1 000001.XSHE 2007-06-21 1085.740 5.318694e+10 2007-06 NaT NaT NaN
2 000001.XSHE 2007-06-22 1120.233 5.487665e+10 2007-06 NaT NaT NaN
3 000001.XSHE 2007-06-25 1113.904 5.456661e+10 2007-06 NaT NaT NaN
4 000001.XSHE 2007-06-26 1113.904 5.456661e+10 2007-06 NaT NaT NaN
5 000001.XSHE 2007-06-27 1019.602 4.994705e+10 2007-06 NaT NaT NaN
6 000001.XSHE 2007-06-28 953.780 4.672266e+10 2007-06 NaT NaT NaN
7 000001.XSHE 2007-06-29 870.870 4.266117e+10 2007-06 NaT NaT NaN
... ... ... ... ... ... ... ... ...
11105027 900957.XSHG 2015-08-15 NaN NaN NaT 2015-08-15 2015-06-30 3.952715e+08
11105028 900957.XSHG 2016-04-23 NaN NaN NaT 2016-04-23 2016-03-31 4.005150e+08
11105029 900957.XSHG 2016-08-06 NaN NaN NaT 2016-08-06 2016-06-30 3.906354e+08
11105030 900957.XSHG 2017-03-25 NaN NaN NaT 2017-03-25 2016-12-31 3.930721e+08
11105031 900957.XSHG 2019-03-30 NaN NaN NaT 2019-03-30 2018-12-31 4.508051e+08
11105032 900957.XSHG 2019-08-10 NaN NaN NaT 2019-08-10 2019-06-30 4.618426e+08
11105033 900957.XSHG 2020-04-25 NaN NaN NaT 2020-04-25 2019-12-31 4.761021e+08
11105034 900957.XSHG 2022-04-30 NaN NaN NaT 2022-04-30 2022-03-31 5.333509e+08

11105035 rows × 8 columns

In [137]:
stk_fundmen_df.sort_values(['secID','tradeDate'],inplace=True)
In [138]:
stk_fundmen_df
Out[138]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
11048078 000001.XSHE 2007-04-26 NaN NaN NaT 2007-04-26 2007-03-31 7.106094e+09
0 000001.XSHE 2007-06-20 987.007 4.835036e+10 2007-06 NaT NaT NaN
1 000001.XSHE 2007-06-21 1085.740 5.318694e+10 2007-06 NaT NaT NaN
2 000001.XSHE 2007-06-22 1120.233 5.487665e+10 2007-06 NaT NaT NaN
3 000001.XSHE 2007-06-25 1113.904 5.456661e+10 2007-06 NaT NaT NaN
4 000001.XSHE 2007-06-26 1113.904 5.456661e+10 2007-06 NaT NaT NaN
5 000001.XSHE 2007-06-27 1019.602 4.994705e+10 2007-06 NaT NaT NaN
6 000001.XSHE 2007-06-28 953.780 4.672266e+10 2007-06 NaT NaT NaN
... ... ... ... ... ... ... ... ...
11048070 900957.XSHG 2023-02-22 0.577 1.052480e+08 2023-02 NaT NaT NaN
11048071 900957.XSHG 2023-02-23 0.583 1.063520e+08 2023-02 NaT NaT NaN
11048072 900957.XSHG 2023-02-24 0.579 1.056160e+08 2023-02 NaT NaT NaN
11048073 900957.XSHG 2023-02-27 0.578 1.054320e+08 2023-02 NaT NaT NaN
11048074 900957.XSHG 2023-02-28 0.578 1.054320e+08 2023-02 NaT NaT NaN
11048075 900957.XSHG 2023-03-01 0.579 1.056160e+08 2023-03 NaT NaT NaN
11048076 900957.XSHG 2023-03-02 0.573 1.045120e+08 2023-03 NaT NaT NaN
11048077 900957.XSHG 2023-03-03 0.568 1.037760e+08 2023-03 NaT NaT NaN

11105035 rows × 8 columns

In [139]:
temp = stk_fundmen_df[stk_fundmen_df['secID']=='000001.XSHE'].copy()
temp
Out[139]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
11048078 000001.XSHE 2007-04-26 NaN NaN NaT 2007-04-26 2007-03-31 7.106094e+09
0 000001.XSHE 2007-06-20 987.007 4.835036e+10 2007-06 NaT NaT NaN
1 000001.XSHE 2007-06-21 1085.740 5.318694e+10 2007-06 NaT NaT NaN
2 000001.XSHE 2007-06-22 1120.233 5.487665e+10 2007-06 NaT NaT NaN
3 000001.XSHE 2007-06-25 1113.904 5.456661e+10 2007-06 NaT NaT NaN
4 000001.XSHE 2007-06-26 1113.904 5.456661e+10 2007-06 NaT NaT NaN
5 000001.XSHE 2007-06-27 1019.602 4.994705e+10 2007-06 NaT NaT NaN
6 000001.XSHE 2007-06-28 953.780 4.672266e+10 2007-06 NaT NaT NaN
... ... ... ... ... ... ... ... ...
3813 000001.XSHE 2023-02-22 1834.143 2.720658e+11 2023-02 NaT NaT NaN
3814 000001.XSHE 2023-02-23 1838.068 2.726479e+11 2023-02 NaT NaT NaN
3815 000001.XSHE 2023-02-24 1813.212 2.689609e+11 2023-02 NaT NaT NaN
3816 000001.XSHE 2023-02-27 1790.972 2.656619e+11 2023-02 NaT NaT NaN
3817 000001.XSHE 2023-02-28 1802.746 2.674084e+11 2023-02 NaT NaT NaN
3818 000001.XSHE 2023-03-01 1853.767 2.749766e+11 2023-03 NaT NaT NaN
3819 000001.XSHE 2023-03-02 1862.924 2.763350e+11 2023-03 NaT NaT NaN
3820 000001.XSHE 2023-03-03 1869.466 2.773053e+11 2023-03 NaT NaT NaN

3824 rows × 8 columns

In [140]:
temp[~temp['book'].isna()]
Out[140]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
11048078 000001.XSHE 2007-04-26 NaN NaN NaT 2007-04-26 2007-03-31 7.106094e+09
41 000001.XSHE 2007-08-16 1147.131 5.619431e+10 2007-08 2007-08-16 2007-06-30 7.698478e+09
84 000001.XSHE 2007-10-23 1300.609 6.371272e+10 2007-10 2007-10-23 2007-09-30 8.363553e+09
184 000001.XSHE 2008-03-20 917.705 5.094780e+10 2008-03 2008-03-20 2007-12-31 1.300606e+10
208 000001.XSHE 2008-04-24 869.921 4.829500e+10 2008-04 2008-04-24 2008-03-31 1.404138e+10
290 000001.XSHE 2008-08-21 639.545 4.328991e+10 2008-08 2008-08-21 2008-06-30 1.694330e+10
330 000001.XSHE 2008-10-24 380.689 2.576831e+10 2008-10 2008-10-24 2008-09-30 1.837466e+10
428 000001.XSHE 2009-03-20 632.527 4.268801e+10 2009-03 2009-03-20 2008-12-31 1.640079e+10
... ... ... ... ... ... ... ... ...
3318 000001.XSHE 2021-02-02 2968.848 4.517660e+11 2021-02 2021-02-02 2020-12-31 3.641310e+11
3368 000001.XSHE 2021-04-21 2934.415 4.465264e+11 2021-04 2021-04-21 2021-03-31 3.726170e+11
3451 000001.XSHE 2021-08-20 2496.066 3.768598e+11 2021-08 2021-08-20 2021-06-30 3.771930e+11
3488 000001.XSHE 2021-10-21 2570.613 3.881151e+11 2021-10 2021-10-21 2021-09-30 3.888580e+11
3582 000001.XSHE 2022-03-10 1872.692 2.827385e+11 2022-03 2022-03-10 2021-12-31 3.954480e+11
3614 000001.XSHE 2022-04-27 2011.505 3.036964e+11 2022-04 2022-04-27 2022-03-31 4.061750e+11
3691 000001.XSHE 2022-08-18 1602.586 2.377180e+11 2022-08 2022-08-18 2022-06-30 4.120980e+11
3733 000001.XSHE 2022-10-25 1393.269 2.066691e+11 2022-10 2022-10-25 2022-09-30 4.253840e+11

63 rows × 8 columns

In [141]:
stk_fundmen_df[['secID','publishDate','endDate','book']].groupby('secID').fillna(method='ffill')
Out[141]:
publishDate endDate book
11048078 2007-04-26 2007-03-31 7.106094e+09
0 2007-04-26 2007-03-31 7.106094e+09
1 2007-04-26 2007-03-31 7.106094e+09
2 2007-04-26 2007-03-31 7.106094e+09
3 2007-04-26 2007-03-31 7.106094e+09
4 2007-04-26 2007-03-31 7.106094e+09
5 2007-04-26 2007-03-31 7.106094e+09
6 2007-04-26 2007-03-31 7.106094e+09
... ... ... ...
11048070 2022-10-28 2022-09-30 5.558669e+08
11048071 2022-10-28 2022-09-30 5.558669e+08
11048072 2022-10-28 2022-09-30 5.558669e+08
11048073 2022-10-28 2022-09-30 5.558669e+08
11048074 2022-10-28 2022-09-30 5.558669e+08
11048075 2022-10-28 2022-09-30 5.558669e+08
11048076 2022-10-28 2022-09-30 5.558669e+08
11048077 2022-10-28 2022-09-30 5.558669e+08

11105035 rows × 3 columns

In [142]:
stk_fundmen_df.loc[:,['publishDate','endDate','book']] = stk_fundmen_df[['secID','publishDate','endDate','book']].groupby('secID').fillna(method='ffill')
In [143]:
## 查看数据
idx = pd.IndexSlice
stk_fundmen_df.set_index(['secID','tradeDate'],inplace=True)
pd.options.display.max_rows = 20
In [144]:
stk_fundmen_df.loc[idx['000001.XSHE','2010-03'],:]
Out[144]:
closePrice negMarketValue ym publishDate endDate book
secID tradeDate
000001.XSHE 2010-03-01 926.303 6.564637e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-02 953.535 6.757628e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-03 961.374 6.813186e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-04 953.122 6.754704e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-05 960.137 6.804414e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-08 984.068 6.974013e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-09 982.830 6.965240e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-10 965.500 6.842427e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-11 976.228 6.918454e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-12 945.695 6.702070e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
... ... ... ... ... ... ...
2010-03-18 951.059 6.740083e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-19 950.647 6.737159e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-22 955.185 6.769325e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-23 941.982 6.675753e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-24 940.744 6.666981e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-25 920.114 6.520775e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-26 941.157 6.669905e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-29 972.102 6.889213e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-30 975.816 6.915530e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-31 957.248 6.783945e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10

23 rows × 6 columns

In [145]:
stk_fundmen_df.loc[idx['000001.XSHE','2007-04'],:]
Out[145]:
closePrice negMarketValue ym publishDate endDate book
secID tradeDate
000001.XSHE 2007-04-26 NaN NaN NaT 2007-04-26 2007-03-31 7.106094e+09
In [146]:
stk_fundmen_df.loc[idx['000001.XSHE','2007-04':'2007-06'],:]
Out[146]:
closePrice negMarketValue ym publishDate endDate book
secID tradeDate
000001.XSHE 2007-04-26 NaN NaN NaT 2007-04-26 2007-03-31 7.106094e+09
2007-06-20 987.007 4.835036e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
2007-06-21 1085.740 5.318694e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
2007-06-22 1120.233 5.487665e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
2007-06-25 1113.904 5.456661e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
2007-06-26 1113.904 5.456661e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
2007-06-27 1019.602 4.994705e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
2007-06-28 953.780 4.672266e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
2007-06-29 870.870 4.266117e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
In [147]:
stk_fundmen_df.reset_index(inplace=True)
In [148]:
stk_fundmen_df
Out[148]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
0 000001.XSHE 2007-04-26 NaN NaN NaT 2007-04-26 2007-03-31 7.106094e+09
1 000001.XSHE 2007-06-20 987.007 4.835036e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
2 000001.XSHE 2007-06-21 1085.740 5.318694e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
3 000001.XSHE 2007-06-22 1120.233 5.487665e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
4 000001.XSHE 2007-06-25 1113.904 5.456661e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
5 000001.XSHE 2007-06-26 1113.904 5.456661e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
6 000001.XSHE 2007-06-27 1019.602 4.994705e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
7 000001.XSHE 2007-06-28 953.780 4.672266e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
8 000001.XSHE 2007-06-29 870.870 4.266117e+10 2007-06 2007-04-26 2007-03-31 7.106094e+09
9 000001.XSHE 2007-07-02 867.073 4.247515e+10 2007-07 2007-04-26 2007-03-31 7.106094e+09
... ... ... ... ... ... ... ... ...
11105025 900957.XSHG 2023-02-20 0.582 1.061680e+08 2023-02 2022-10-28 2022-09-30 5.558669e+08
11105026 900957.XSHG 2023-02-21 0.582 1.061680e+08 2023-02 2022-10-28 2022-09-30 5.558669e+08
11105027 900957.XSHG 2023-02-22 0.577 1.052480e+08 2023-02 2022-10-28 2022-09-30 5.558669e+08
11105028 900957.XSHG 2023-02-23 0.583 1.063520e+08 2023-02 2022-10-28 2022-09-30 5.558669e+08
11105029 900957.XSHG 2023-02-24 0.579 1.056160e+08 2023-02 2022-10-28 2022-09-30 5.558669e+08
11105030 900957.XSHG 2023-02-27 0.578 1.054320e+08 2023-02 2022-10-28 2022-09-30 5.558669e+08
11105031 900957.XSHG 2023-02-28 0.578 1.054320e+08 2023-02 2022-10-28 2022-09-30 5.558669e+08
11105032 900957.XSHG 2023-03-01 0.579 1.056160e+08 2023-03 2022-10-28 2022-09-30 5.558669e+08
11105033 900957.XSHG 2023-03-02 0.573 1.045120e+08 2023-03 2022-10-28 2022-09-30 5.558669e+08
11105034 900957.XSHG 2023-03-03 0.568 1.037760e+08 2023-03 2022-10-28 2022-09-30 5.558669e+08

11105035 rows × 8 columns

In [149]:
stk_df_m = stk_fundmen_df.groupby(['secID','ym'],as_index=False).last()

stk_df_m['ret'] = stk_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1)
In [150]:
stk_df_m
Out[150]:
secID ym tradeDate closePrice negMarketValue publishDate endDate book ret
0 000001.XSHE 2007-06 2007-06-29 870.870 4.266117e+10 2007-04-26 2007-03-31 7.106094e+09 NaN
1 000001.XSHE 2007-07 2007-07-31 1146.498 5.616330e+10 2007-04-26 2007-03-31 7.106094e+09 0.316497
2 000001.XSHE 2007-08 2007-08-31 1202.510 5.890714e+10 2007-08-16 2007-06-30 7.698478e+09 0.048855
3 000001.XSHE 2007-09 2007-09-28 1265.167 6.197651e+10 2007-08-16 2007-06-30 7.698478e+09 0.052105
4 000001.XSHE 2007-10 2007-10-31 1520.542 7.448652e+10 2007-10-23 2007-09-30 8.363553e+09 0.201851
5 000001.XSHE 2007-11 2007-11-30 1141.751 5.593078e+10 2007-10-23 2007-09-30 8.363553e+09 -0.249116
6 000001.XSHE 2007-12 2007-12-28 1221.497 6.574629e+10 2007-10-23 2007-09-30 8.363553e+09 0.069845
7 000001.XSHE 2008-01 2008-01-31 1053.778 5.850212e+10 2007-10-23 2007-09-30 8.363553e+09 -0.137306
8 000001.XSHE 2008-02 2008-02-29 1049.032 5.823860e+10 2007-10-23 2007-09-30 8.363553e+09 -0.004504
9 000001.XSHE 2008-03 2008-03-31 892.389 4.954234e+10 2008-03-20 2007-12-31 1.300606e+10 -0.149321
... ... ... ... ... ... ... ... ... ...
552748 900957.XSHG 2022-06 2022-06-30 0.603 1.100320e+08 2022-04-30 2022-03-31 5.333509e+08 0.080645
552749 900957.XSHG 2022-07 2022-07-29 0.589 1.074560e+08 2022-04-30 2022-03-31 5.333509e+08 -0.023217
552750 900957.XSHG 2022-08 2022-08-31 0.644 1.175760e+08 2022-08-16 2022-06-30 5.476224e+08 0.093379
552751 900957.XSHG 2022-09 2022-09-30 0.592 1.080080e+08 2022-08-16 2022-06-30 5.476224e+08 -0.080745
552752 900957.XSHG 2022-10 2022-10-31 0.603 1.100320e+08 2022-10-28 2022-09-30 5.558669e+08 0.018581
552753 900957.XSHG 2022-11 2022-11-30 0.617 1.126080e+08 2022-10-28 2022-09-30 5.558669e+08 0.023217
552754 900957.XSHG 2022-12 2022-12-30 0.568 1.037760e+08 2022-10-28 2022-09-30 5.558669e+08 -0.079417
552755 900957.XSHG 2023-01 2023-01-31 0.590 1.076400e+08 2022-10-28 2022-09-30 5.558669e+08 0.038732
552756 900957.XSHG 2023-02 2023-02-28 0.578 1.054320e+08 2022-10-28 2022-09-30 5.558669e+08 -0.020339
552757 900957.XSHG 2023-03 2023-03-03 0.568 1.037760e+08 2022-10-28 2022-09-30 5.558669e+08 -0.017301

552758 rows × 9 columns

In [151]:
stk_df_m['ret'] = stk_df_m.groupby(['secID'])['ret'].shift(-1)
stk_df_m['ret_date'] = stk_df_m.groupby('secID')['ym'].shift(-1)
In [152]:
stk_df_m
Out[152]:
secID ym tradeDate closePrice negMarketValue publishDate endDate book ret ret_date
0 000001.XSHE 2007-06 2007-06-29 870.870 4.266117e+10 2007-04-26 2007-03-31 7.106094e+09 0.316497 2007-07
1 000001.XSHE 2007-07 2007-07-31 1146.498 5.616330e+10 2007-04-26 2007-03-31 7.106094e+09 0.048855 2007-08
2 000001.XSHE 2007-08 2007-08-31 1202.510 5.890714e+10 2007-08-16 2007-06-30 7.698478e+09 0.052105 2007-09
3 000001.XSHE 2007-09 2007-09-28 1265.167 6.197651e+10 2007-08-16 2007-06-30 7.698478e+09 0.201851 2007-10
4 000001.XSHE 2007-10 2007-10-31 1520.542 7.448652e+10 2007-10-23 2007-09-30 8.363553e+09 -0.249116 2007-11
5 000001.XSHE 2007-11 2007-11-30 1141.751 5.593078e+10 2007-10-23 2007-09-30 8.363553e+09 0.069845 2007-12
6 000001.XSHE 2007-12 2007-12-28 1221.497 6.574629e+10 2007-10-23 2007-09-30 8.363553e+09 -0.137306 2008-01
7 000001.XSHE 2008-01 2008-01-31 1053.778 5.850212e+10 2007-10-23 2007-09-30 8.363553e+09 -0.004504 2008-02
8 000001.XSHE 2008-02 2008-02-29 1049.032 5.823860e+10 2007-10-23 2007-09-30 8.363553e+09 -0.149321 2008-03
9 000001.XSHE 2008-03 2008-03-31 892.389 4.954234e+10 2008-03-20 2007-12-31 1.300606e+10 0.050355 2008-04
... ... ... ... ... ... ... ... ... ... ...
552748 900957.XSHG 2022-06 2022-06-30 0.603 1.100320e+08 2022-04-30 2022-03-31 5.333509e+08 -0.023217 2022-07
552749 900957.XSHG 2022-07 2022-07-29 0.589 1.074560e+08 2022-04-30 2022-03-31 5.333509e+08 0.093379 2022-08
552750 900957.XSHG 2022-08 2022-08-31 0.644 1.175760e+08 2022-08-16 2022-06-30 5.476224e+08 -0.080745 2022-09
552751 900957.XSHG 2022-09 2022-09-30 0.592 1.080080e+08 2022-08-16 2022-06-30 5.476224e+08 0.018581 2022-10
552752 900957.XSHG 2022-10 2022-10-31 0.603 1.100320e+08 2022-10-28 2022-09-30 5.558669e+08 0.023217 2022-11
552753 900957.XSHG 2022-11 2022-11-30 0.617 1.126080e+08 2022-10-28 2022-09-30 5.558669e+08 -0.079417 2022-12
552754 900957.XSHG 2022-12 2022-12-30 0.568 1.037760e+08 2022-10-28 2022-09-30 5.558669e+08 0.038732 2023-01
552755 900957.XSHG 2023-01 2023-01-31 0.590 1.076400e+08 2022-10-28 2022-09-30 5.558669e+08 -0.020339 2023-02
552756 900957.XSHG 2023-02 2023-02-28 0.578 1.054320e+08 2022-10-28 2022-09-30 5.558669e+08 -0.017301 2023-03
552757 900957.XSHG 2023-03 2023-03-03 0.568 1.037760e+08 2022-10-28 2022-09-30 5.558669e+08 NaN NaT

552758 rows × 10 columns

In [153]:
stk_df_m['ym_diff'] = stk_df_m['ret_date'].astype(int) - stk_df_m['ym'].astype(int)
stk_df_m.loc[stk_df_m['ym_diff'] != 1]
Out[153]:
secID ym tradeDate closePrice negMarketValue publishDate endDate book ret ret_date ym_diff
189 000001.XSHE 2023-03 2023-03-03 1869.466 2.773053e+11 2022-10-25 2022-09-30 4.253840e+11 NaN NaT 9223372036854775170
384 000002.XSHE 2023-03 2023-03-03 2656.185 1.633521e+11 2022-10-29 2022-09-30 2.411070e+11 NaN NaT 9223372036854775170
516 000004.XSHE 2022-05 2022-05-05 85.971 1.463441e+09 2022-04-30 2022-03-31 9.351158e+08 NaN NaT 9223372036854775180
671 000005.XSHE 2021-04 2021-04-30 21.814 2.348641e+09 2021-04-30 2021-03-31 1.248325e+09 NaN NaT 9223372036854775193
866 000006.XSHE 2023-03 2023-03-03 417.729 7.924471e+09 2022-10-28 2022-09-30 7.683314e+09 NaN NaT 9223372036854775170
870 000007.XSHE 2007-04 2007-04-20 48.203 7.268053e+08 NaT NaT NaN 1.612327 2012-05 61
978 000007.XSHE 2021-04 2021-04-29 47.401 1.217255e+09 2021-04-29 2021-03-31 6.318241e+07 0.870572 2022-07 15
987 000007.XSHE 2023-03 2023-03-03 93.359 2.397436e+09 2022-10-26 2022-09-30 9.238106e+07 NaN NaT 9223372036854775170
990 000008.XSHE 2007-03 2007-03-29 22.937 3.197319e+08 NaT NaT NaN 0.814579 2013-04 73
1110 000008.XSHE 2023-03 2023-03-03 69.660 6.811892e+09 2022-10-29 2022-09-30 5.032390e+09 NaN NaT 9223372036854775170
... ... ... ... ... ... ... ... ... ... ... ...
551865 900951.XSHG 2018-07 2018-07-30 0.537 5.370000e+07 2018-04-21 2018-03-31 1.635296e+08 -0.912477 2020-07 24
551867 900951.XSHG 2020-08 2020-08-26 0.069 6.900000e+06 2020-04-25 2020-03-31 -2.115164e+08 NaN NaT 9223372036854775201
552062 900952.XSHG 2023-03 2023-03-03 0.632 6.149473e+07 2022-10-29 2022-09-30 6.615382e+09 NaN NaT 9223372036854775170
552078 900953.XSHG 2008-04 2008-04-30 0.430 1.000800e+08 2008-04-30 2008-03-31 5.505128e+08 0.588372 2009-07 15
552243 900953.XSHG 2023-03 2023-03-03 0.457 1.063200e+08 2022-10-31 2022-09-30 4.893210e+08 NaN NaT 9223372036854775170
552320 900955.XSHG 2013-05 2013-05-02 1.251 1.188000e+08 2013-04-27 2013-03-31 1.602086e+09 -0.194245 2014-04 11
552393 900955.XSHG 2020-04 2020-04-30 0.507 4.818000e+07 2020-04-30 2020-03-31 1.031370e+09 -0.850099 2022-06 26
552395 900955.XSHG 2022-07 2022-07-12 0.073 6.930000e+06 2022-05-07 2021-12-31 6.172171e+08 NaN NaT 9223372036854775178
552562 900956.XSHG 2020-11 2020-11-20 3.424 3.530500e+08 2020-10-29 2020-09-30 1.430181e+09 NaN NaT 9223372036854775198
552757 900957.XSHG 2023-03 2023-03-03 0.568 1.037760e+08 2022-10-28 2022-09-30 5.558669e+08 NaN NaT 9223372036854775170

5831 rows × 11 columns

In [154]:
# 停牌删去
stk_df_m['ym_diff'] = stk_df_m['ret_date'].astype(int) - stk_df_m['ym'].astype(int)
stk_df_m.loc[stk_df_m['ym_diff'] != 1,'ret'] = np.nan
In [155]:
# 查看数据
temp = stk_df_m['secID'].unique()
display(stk_df_m[stk_df_m['secID'] == np.random.choice(temp,1)[0]])
secID ym tradeDate closePrice negMarketValue publishDate endDate book ret ret_date ym_diff
68321 000901.XSHE 2007-01 2007-01-31 19.923 5.894784e+08 NaT NaT NaN 0.158862 2007-02 1
68322 000901.XSHE 2007-02 2007-02-28 23.088 6.831014e+08 NaT NaT NaN 0.352781 2007-03 1
68323 000901.XSHE 2007-03 2007-03-30 31.233 9.240941e+08 NaT NaT NaN 0.163225 2007-04 1
68324 000901.XSHE 2007-04 2007-04-30 36.331 2.032149e+09 2007-04-25 2007-03-31 3.653547e+08 0.047590 2007-05 1
68325 000901.XSHE 2007-05 2007-05-31 38.060 2.128840e+09 2007-04-25 2007-03-31 3.653547e+08 -0.331792 2007-06 1
68326 000901.XSHE 2007-06 2007-06-29 25.432 1.422504e+09 2007-04-25 2007-03-31 3.653547e+08 0.165893 2007-07 1
68327 000901.XSHE 2007-07 2007-07-31 29.651 1.658495e+09 2007-04-25 2007-03-31 3.653547e+08 0.385350 2007-08 1
68328 000901.XSHE 2007-08 2007-08-31 41.077 2.297639e+09 2007-08-07 2007-06-30 3.506422e+08 0.087738 2007-09 1
68329 000901.XSHE 2007-09 2007-09-28 44.681 2.499215e+09 2007-08-07 2007-06-30 3.506422e+08 -0.277366 2007-10 1
68330 000901.XSHE 2007-10 2007-10-31 32.288 1.805990e+09 2007-10-30 2007-09-30 3.540876e+08 0.155166 2007-11 1
... ... ... ... ... ... ... ... ... ... ... ...
68506 000901.XSHE 2022-06 2022-06-30 59.489 7.255651e+09 2022-04-29 2022-03-31 4.193378e+09 0.055019 2022-07 1
68507 000901.XSHE 2022-07 2022-07-29 62.762 7.654751e+09 2022-04-29 2022-03-31 4.193378e+09 -0.118878 2022-08 1
68508 000901.XSHE 2022-08 2022-08-31 55.301 6.744802e+09 2022-08-29 2022-06-30 4.161902e+09 -0.081662 2022-09 1
68509 000901.XSHE 2022-09 2022-09-30 50.785 6.194043e+09 2022-08-29 2022-06-30 4.161902e+09 0.101802 2022-10 1
68510 000901.XSHE 2022-10 2022-10-31 55.955 6.824622e+09 2022-10-31 2022-09-30 4.165991e+09 0.011706 2022-11 1
68511 000901.XSHE 2022-11 2022-11-30 56.610 6.904442e+09 2022-10-31 2022-09-30 4.165991e+09 -0.056651 2022-12 1
68512 000901.XSHE 2022-12 2022-12-30 53.403 6.513323e+09 2022-10-31 2022-09-30 4.165991e+09 0.159317 2023-01 1
68513 000901.XSHE 2023-01 2023-01-31 61.911 7.550985e+09 2022-10-31 2022-09-30 4.165991e+09 0.152219 2023-02 1
68514 000901.XSHE 2023-02 2023-02-28 71.335 8.700395e+09 2022-10-31 2022-09-30 4.165991e+09 -0.022023 2023-03 1
68515 000901.XSHE 2023-03 2023-03-03 69.764 8.508827e+09 2022-10-31 2022-09-30 4.165991e+09 NaN NaT 9223372036854775170

195 rows × 11 columns

In [156]:
del temp
In [157]:
stk_df_m.drop(['tradeDate','closePrice','publishDate','endDate', 'ym_diff'],axis=1,inplace=True)

stk_df_m.rename(columns={'ym':'mktcap_book_date','negMarketValue':'mktcap'},inplace=True)

stk_df_m.dropna(inplace=True)

stk_df_m
Out[157]:
secID mktcap_book_date mktcap book ret ret_date
0 000001.XSHE 2007-06 4.266117e+10 7.106094e+09 0.316497 2007-07
1 000001.XSHE 2007-07 5.616330e+10 7.106094e+09 0.048855 2007-08
2 000001.XSHE 2007-08 5.890714e+10 7.698478e+09 0.052105 2007-09
3 000001.XSHE 2007-09 6.197651e+10 7.698478e+09 0.201851 2007-10
4 000001.XSHE 2007-10 7.448652e+10 8.363553e+09 -0.249116 2007-11
5 000001.XSHE 2007-11 5.593078e+10 8.363553e+09 0.069845 2007-12
6 000001.XSHE 2007-12 6.574629e+10 8.363553e+09 -0.137306 2008-01
7 000001.XSHE 2008-01 5.850212e+10 8.363553e+09 -0.004504 2008-02
8 000001.XSHE 2008-02 5.823860e+10 8.363553e+09 -0.149321 2008-03
9 000001.XSHE 2008-03 4.954234e+10 1.300606e+10 0.050355 2008-04
... ... ... ... ... ... ...
552747 900957.XSHG 2022-05 1.019360e+08 5.333509e+08 0.080645 2022-06
552748 900957.XSHG 2022-06 1.100320e+08 5.333509e+08 -0.023217 2022-07
552749 900957.XSHG 2022-07 1.074560e+08 5.333509e+08 0.093379 2022-08
552750 900957.XSHG 2022-08 1.175760e+08 5.476224e+08 -0.080745 2022-09
552751 900957.XSHG 2022-09 1.080080e+08 5.476224e+08 0.018581 2022-10
552752 900957.XSHG 2022-10 1.100320e+08 5.558669e+08 0.023217 2022-11
552753 900957.XSHG 2022-11 1.126080e+08 5.558669e+08 -0.079417 2022-12
552754 900957.XSHG 2022-12 1.037760e+08 5.558669e+08 0.038732 2023-01
552755 900957.XSHG 2023-01 1.076400e+08 5.558669e+08 -0.020339 2023-02
552756 900957.XSHG 2023-02 1.054320e+08 5.558669e+08 -0.017301 2023-03

542964 rows × 6 columns

Merge

In [158]:
ret_df = pd.merge(stk_df_m, rf, left_on='ret_date',right_on='ym')
ret_df.drop('ym',axis=1,inplace=True)
ret_df['exret'] = ret_df['ret'] - ret_df['rf']
ret_df.sort_values(['secID','ret_date'],inplace=True)
ret_df.reset_index(drop=True,inplace=True)
# Use last month's beta for grouping
ret_df = pd.merge(ret_df,beta_m_df,left_on=['secID','mktcap_book_date'],right_on=['secID','ym'])
# display(ret_df)
ret_df.drop(['ym','rf','ret'],axis=1,inplace=True)
ret_df.rename(columns={'mktcap_book_date':'grouping_date'},inplace=True)
ret_df['bm'] = ret_df['book'] / ret_df['mktcap']
ret_df['size'] = np.log(ret_df['mktcap'])
ret_df = ret_df[['secID','grouping_date','size','mktcap','bm',
                 'beta','ret_date','exret']]
In [159]:
ret_df
Out[159]:
secID grouping_date size mktcap bm beta ret_date exret
0 000001.XSHE 2007-06 24.476555 4.266117e+10 0.166571 0.4614 2007-07 0.313877
1 000001.XSHE 2007-07 24.751529 5.616330e+10 0.126526 0.6423 2007-08 0.046173
2 000001.XSHE 2007-08 24.799228 5.890714e+10 0.130688 0.7722 2007-09 0.049171
3 000001.XSHE 2007-09 24.850021 6.197651e+10 0.124216 0.7596 2007-10 0.198601
4 000001.XSHE 2007-10 25.033884 7.448652e+10 0.112283 0.7988 2007-11 -0.252661
5 000001.XSHE 2007-11 24.747381 5.593078e+10 0.149534 0.9560 2007-12 0.066202
6 000001.XSHE 2007-12 24.909069 6.574629e+10 0.127210 0.9468 2008-01 -0.141037
7 000001.XSHE 2008-01 24.792329 5.850212e+10 0.142962 0.9654 2008-02 -0.008257
8 000001.XSHE 2008-02 24.787814 5.823860e+10 0.143608 1.0292 2008-03 -0.153072
9 000001.XSHE 2008-03 24.626093 4.954234e+10 0.262524 1.0238 2008-04 0.046610
... ... ... ... ... ... ... ... ...
519992 689009.XSHG 2022-04 23.663502 1.892031e+10 0.225936 0.8108 2022-05 0.224708
519993 689009.XSHG 2022-05 23.867639 2.320511e+10 0.186935 0.9844 2022-06 -0.025790
519994 689009.XSHG 2022-06 23.843220 2.264534e+10 0.191556 0.9071 2022-07 0.113436
519995 689009.XSHG 2022-07 23.952125 2.525082e+10 0.171790 0.7987 2022-08 -0.114022
519996 689009.XSHG 2022-08 23.832602 2.240616e+10 0.205191 0.8589 2022-09 -0.131253
519997 689009.XSHG 2022-09 23.693442 1.949535e+10 0.235828 0.9106 2022-10 -0.166122
519998 689009.XSHG 2022-10 23.518985 1.637440e+10 0.292190 0.7083 2022-11 0.041449
519999 689009.XSHG 2022-11 23.561206 1.708055e+10 0.280110 0.7363 2022-12 -0.088510
520000 689009.XSHG 2022-12 23.470648 1.560173e+10 0.306660 0.6919 2023-01 0.086541
520001 689009.XSHG 2023-01 23.555498 1.698332e+10 0.281714 0.7379 2023-02 -0.007738

520002 rows × 8 columns

Sorting on BM poin-in-time

In [160]:
q = dict()
keys = ['q'+str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q.update(zip(keys,values))

quantile_df = pd.DataFrame()
for key, value in q.items():
    quantile_df[key] = ret_df.groupby(['grouping_date'])['bm'].quantile(value)

ret_df_q = pd.merge(ret_df, quantile_df, on='grouping_date')

portfolios = dict()
drop_cols = [col for col in ret_df_q.columns if col[0]=='q']

portfolios['p1'] = ret_df_q.loc[ret_df_q['bm'] <= ret_df_q['q1']].copy().drop(drop_cols, axis=1)
for i in range(2,10):
    idx = (ret_df_q[f'q{i-1}'] <= ret_df_q['bm']) & (ret_df_q['bm'] <= ret_df_q[f'q{i}'])
    portfolios[f'p{i}'] = ret_df_q.loc[idx].copy().drop(drop_cols, axis=1)
portfolios['p10'] = ret_df_q.loc[ret_df_q['bm'] >= ret_df_q['q9']].copy().drop(drop_cols, axis=1)

portfolios_crs_mean = dict()
for k in portfolios.keys():
    portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()

mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]

pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
             columns=mean_values.keys())
Out[160]:
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p10-p1
mean 0.002267 0.005305 0.007695 0.008414 0.010244 0.011246 0.012452 0.013310 0.014357 0.015746 0.013479
t-value 0.336678 0.789832 1.116008 1.204808 1.469645 1.600140 1.761497 1.870286 1.927962 2.008563 3.448444

结论:

  • 用最新的BM更新portfolio可以带来收益率的递增,但每个portfolio本身的收益率并不显著为正,除了p10
  • p10和p1的差距是显著为正的

Sorting on BM with data from Uqer

In [161]:
# %%time
# begin_ = 2007
# yesterday = dt.datetime.today() - dt.timedelta(days=1)
# yesterday.strftime('%Y%m%d')
# pb_df = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
#                                                      beginDate=f'{begin_}0101',
#                                                      endDate=yesterday,
#                                                      field=['secID','tradeDate','PB'],pandas="1")
In [162]:
# # # 从优矿下载 PB,时间较长。由于优矿的限制,每次下载3年的数据

# pb = {}
# begin_ = 2007
# end_ = 2010
# while begin_ <= 2023:
#     if begin_ == 2023:
#         yesterday = dt.datetime.today() - dt.timedelta(days=1)
#         yesterday.strftime('%Y%m%d')
#         pb[begin_] = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
#                                                      beginDate=f'{begin_}0101',
#                                                      endDate=yesterday,
#                                                      field=['secID','tradeDate','PB'],pandas="1")
#     else:
#         pb[begin_] = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
#                                                          beginDate=f'{begin_}0101',
#                                                          endDate=f'{end_}1231',
#                                                          field=['secID','tradeDate','PB'],pandas="1")
#     begin_ = end_ + 1
#     end_ = begin_ + 3
    
# for i in range(len(pb)):
#     pb_df = pd.DataFrame(np.vstack([_df for _df in pb.values()]),columns=['secID','tradeDate','PB'])
    
# pb_df.to_pickle('./data/pb_df.pkl')
In [164]:
pb_df
Out[164]:
secID tradeDate PB
0 000001.XSHE 2007-01-04 4.5351
1 000002.XSHE 2007-01-04 6.7999
2 000004.XSHE 2007-01-04 4.2535
3 000005.XSHE 2007-01-04 2.7093
4 000006.XSHE 2007-01-04 2.8088
5 000007.XSHE 2007-01-04 6.8244
6 000008.XSHE 2007-01-04 4.2374
7 000014.XSHE 2007-01-04 2.7436
8 000016.XSHE 2007-01-04 0.7852
9 000017.XSHE 2007-01-04 -0.717
... ... ... ...
11190970 688787.XSHG 2023-03-03 10.0327
11190971 688788.XSHG 2023-03-03 1.7234
11190972 688789.XSHG 2023-03-03 9.6548
11190973 688793.XSHG 2023-03-03 5.9281
11190974 688798.XSHG 2023-03-03 4.785
11190975 688799.XSHG 2023-03-03 2.1816
11190976 688800.XSHG 2023-03-03 5.8383
11190977 688819.XSHG 2023-03-03 2.698
11190978 688981.XSHG 2023-03-03 2.5813
11190979 689009.XSHG 2023-03-03 5.1622

11190980 rows × 3 columns

In [172]:
# pb_df = pd.read_pickle('./data/pb_df.pkl')
In [165]:
pb_df['tradeDate'] = pd.to_datetime(pb_df['tradeDate'])

pb_df['PB'] = pd.to_numeric(pb_df['PB'])

pb_df['grouping_date'] = pb_df['tradeDate'].dt.to_period('M')

pb_df = pb_df.groupby(['secID','grouping_date'],as_index=False).last()

pb_df['bm_uqer'] = 1 / pb_df['PB']

ret_df = pd.merge(ret_df,pb_df[['secID','grouping_date','bm_uqer']],on=['secID','grouping_date'])

del pb_df
In [166]:
ret_df
Out[166]:
secID grouping_date size mktcap bm beta ret_date exret bm_uqer
0 000001.XSHE 2007-06 24.476555 4.266117e+10 0.166571 0.4614 2007-07 0.313877 0.123739
1 000001.XSHE 2007-07 24.751529 5.616330e+10 0.126526 0.6423 2007-08 0.046173 0.093992
2 000001.XSHE 2007-08 24.799228 5.890714e+10 0.130688 0.7722 2007-09 0.049171 0.097085
3 000001.XSHE 2007-09 24.850021 6.197651e+10 0.124216 0.7596 2007-10 0.198601 0.092276
4 000001.XSHE 2007-10 25.033884 7.448652e+10 0.112283 0.7988 2007-11 -0.252661 0.083411
5 000001.XSHE 2007-11 24.747381 5.593078e+10 0.149534 0.9560 2007-12 0.066202 0.111084
6 000001.XSHE 2007-12 24.909069 6.574629e+10 0.127210 0.9468 2008-01 -0.141037 0.094476
7 000001.XSHE 2008-01 24.792329 5.850212e+10 0.142962 0.9654 2008-02 -0.008257 0.109513
8 000001.XSHE 2008-02 24.787814 5.823860e+10 0.143608 1.0292 2008-03 -0.153072 0.110009
9 000001.XSHE 2008-03 24.626093 4.954234e+10 0.262524 1.0238 2008-04 0.046610 0.201102
... ... ... ... ... ... ... ... ... ...
519992 689009.XSHG 2022-04 23.663502 1.892031e+10 0.225936 0.8108 2022-05 0.224708 0.162419
519993 689009.XSHG 2022-05 23.867639 2.320511e+10 0.186935 0.9844 2022-06 -0.025790 0.134383
519994 689009.XSHG 2022-06 23.843220 2.264534e+10 0.191556 0.9071 2022-07 0.113436 0.137005
519995 689009.XSHG 2022-07 23.952125 2.525082e+10 0.171790 0.7987 2022-08 -0.114022 0.122868
519996 689009.XSHG 2022-08 23.832602 2.240616e+10 0.205191 0.8589 2022-09 -0.131253 0.146757
519997 689009.XSHG 2022-09 23.693442 1.949535e+10 0.235828 0.9106 2022-10 -0.166122 0.168319
519998 689009.XSHG 2022-10 23.518985 1.637440e+10 0.292190 0.7083 2022-11 0.041449 0.209701
519999 689009.XSHG 2022-11 23.561206 1.708055e+10 0.280110 0.7363 2022-12 -0.088510 0.201033
520000 689009.XSHG 2022-12 23.470648 1.560173e+10 0.306660 0.6919 2023-01 0.086541 0.220085
520001 689009.XSHG 2023-01 23.555498 1.698332e+10 0.281714 0.7379 2023-02 -0.007738 0.201772

520002 rows × 9 columns

In [173]:
ret_df.loc[ret_df['secID']=='000001.XSHE',['grouping_date','bm','bm_uqer']].set_index('grouping_date').plot()
Out[173]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7bb0d97b50>
In [180]:
sample_id = np.random.choice(ret_df['secID'].unique(),1)
display(ret_df.loc[ret_df['secID'].isin(sample_id),['grouping_date','bm','bm_uqer']])
ret_df.loc[ret_df['secID'].isin(sample_id),['grouping_date','bm','bm_uqer']].set_index('grouping_date').plot()
grouping_date bm bm_uqer
119475 2008-02 0.186432 0.037286
119476 2008-03 0.259620 0.051924
119477 2008-04 0.775002 0.155000
119478 2008-05 0.613057 0.153264
119479 2008-06 0.816463 0.204115
119480 2008-07 0.675406 0.168850
119481 2008-08 0.746163 0.186539
119482 2008-09 0.856908 0.214229
119483 2008-10 1.115926 0.278979
119484 2008-11 1.006362 0.251591
... ... ... ...
119645 2022-04 0.788705 0.572082
119646 2022-05 0.723281 0.521621
119647 2022-06 0.723772 0.575738
119648 2022-07 0.686076 0.547345
119649 2022-08 0.667507 0.532538
119650 2022-09 0.758950 0.606465
119651 2022-10 0.779189 0.622665
119652 2022-11 0.725935 0.580080
119653 2022-12 0.765882 0.608643
119654 2023-01 0.719465 0.571755

180 rows × 3 columns

Out[180]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7ba2c4da10>
In [181]:
ret_df.loc[ret_df['secID'].isin(sample_id),['grouping_date','bm','bm_uqer']].set_index('grouping_date').plot(subplots=True)
Out[181]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f7ba2c54390>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f7ba6a65ed0>],
      dtype=object)
In [182]:
q = dict()
keys = ['q'+str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q.update(zip(keys,values))

quantile_df = pd.DataFrame()
for key, value in q.items():
    quantile_df[key] = ret_df.groupby(['grouping_date'])['bm_uqer'].quantile(value)

ret_df_q = pd.merge(ret_df, quantile_df, on='grouping_date')

portfolios = dict()
drop_cols = [col for col in ret_df_q.columns if col[0]=='q']

portfolios['p1'] = ret_df_q.loc[ret_df_q['bm_uqer'] <= ret_df_q['q1']].copy().drop(drop_cols, axis=1)
for i in range(2,10):
    idx = (ret_df_q[f'q{i-1}'] <= ret_df_q['bm_uqer']) & (ret_df_q['bm_uqer'] <= ret_df_q[f'q{i}'])
    portfolios[f'p{i}'] = ret_df_q.loc[idx].copy().drop(drop_cols, axis=1)
portfolios['p10'] = ret_df_q.loc[ret_df_q['bm_uqer'] >= ret_df_q['q9']].copy().drop(drop_cols, axis=1)

portfolios_crs_mean = dict()
for k in portfolios.keys():
    portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()

mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]

pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
             columns=mean_values.keys())
Out[182]:
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p10-p1
mean 0.000930 0.007560 0.008476 0.009993 0.011839 0.012490 0.012324 0.012705 0.012921 0.011716 0.010786
t-value 0.134959 1.080074 1.152986 1.415292 1.629982 1.776933 1.725170 1.785309 1.831983 1.675358 2.872140

Double Sorting on Size and BM

In [183]:
q_size = dict()
keys = ['q_size_1']
values = [0.5]
q_size.update(zip(keys,values))

q_bm = dict()
keys = ['q_bm_1','q_bm_2']
values = [0.3, 0.7]
q_bm.update(zip(keys,values))

q_size_df = pd.DataFrame()
for key, value in q_size.items():
    q_size_df[key] = ret_df.groupby(['grouping_date'])['size'].quantile(value)

q_bm_df = pd.DataFrame()
for key, value in q_bm.items():
    q_bm_df[key] = ret_df.groupby(['grouping_date'])['bm'].quantile(value)

ret_df_q = pd.merge(ret_df, q_size_df, on='grouping_date')
ret_df_q = pd.merge(ret_df_q, q_bm_df, on='grouping_date')

portfolios_size = dict()
portfolios_size['size1'] = ret_df_q.loc[ret_df_q['size'] <= ret_df_q['q_size_1'],
                                           ['secID','grouping_date','ret_date','exret','size','mktcap']]
portfolios_size['size2'] = ret_df_q.loc[ret_df_q['size'] >= ret_df_q['q_size_1'],
                                           ['secID','grouping_date','ret_date','exret','size','mktcap']]

portfolios_bm = dict()
portfolios_bm['bm1'] = ret_df_q.loc[ret_df_q['bm'] <= ret_df_q['q_bm_1'],
                                           ['secID','grouping_date','ret_date','exret','bm']]
portfolios_bm['bm2'] = ret_df_q.loc[(ret_df_q['bm'] >= ret_df_q['q_bm_1']) & \
                                           (ret_df_q['bm'] <= ret_df_q['q_bm_2']),
                                           ['secID','grouping_date','ret_date','exret','bm']]
portfolios_bm['bm3'] = ret_df_q.loc[ret_df_q['bm'] >= ret_df_q['q_bm_2'],
                                           ['secID','grouping_date','ret_date','exret','bm']]

portfolios = dict()
for bm_group in portfolios_bm.keys():
    for size_group in portfolios_size.keys():
        portfolios[f'{bm_group}_{size_group}'] = pd.merge(portfolios_size[size_group],
                                                              portfolios_bm[bm_group][['secID','ret_date','bm']],
                                                              on=['secID','ret_date'])

mean_portfolios_ret = dict()
for pf in portfolios.keys():
    mean_portfolios_ret[pf] = portfolios[pf].groupby('ret_date')['exret'].mean()
    print(mean_portfolios_ret[pf].shape) # print 看一下会不会存在某个月份上没有bm和size分组没有任何交叉

# Fast merge by stacking
mean_portfolios_ret_df = pd.DataFrame(np.vstack([pf for pf in mean_portfolios_ret.values()])).T
mean_portfolios_ret_df.columns = mean_portfolios_ret.keys()
mean_portfolios_ret_df.index = mean_portfolios_ret['bm1_size1'].index

# Newey-West adjustment
mean_values = {}
t_values = {}
for k in mean_portfolios_ret.keys():
    y = mean_portfolios_ret[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=4)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]

pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
(190,)
(190,)
(190,)
(190,)
(190,)
(190,)
Out[183]:
bm1_size1 bm1_size2 bm2_size1 bm2_size2 bm3_size1 bm3_size2
ret_mean 0.010139 0.00216 0.015918 0.005002 0.020509 0.005087
t_values 1.389654 0.32172 2.167410 0.732416 2.547421 0.757503

Fama MacBeth regression

In [184]:
# ret_df[(ret_df['ret_date'] >= '2008-02') & (ret_df['secID'] == '000001.XSHE')]
In [185]:
ret_df['exret100'] = ret_df['exret'] * 100

def fm_reg(df):
    df_  = df.dropna()
    if df_.shape[0] < 15:
        return None
    reg = LinearRegression().fit(y=df_.loc[:,'exret100'], X=df_.loc[:,['beta','size','bm']])
    return np.insert(reg.coef_, 0, reg.intercept_)

temp = ret_df.groupby('ret_date').apply(fm_reg)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept', 'beta_coef','size_coef', 'bm_coef']
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
    y = reg_result_df[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
Out[185]:
intercept beta_coef size_coef bm_coef
ret_mean 9.212739 0.405394 -0.399175 0.039288
t_values 2.619763 1.133055 -2.679284 0.714466
In [186]:
portfolios
Out[186]:
{'bm1_size1':              secID grouping_date ret_date     exret       size        mktcap  \
 0      000014.XSHE       2007-06  2007-07  0.547459  21.108547  1.470028e+09   
 1      000025.XSHE       2007-06  2007-07 -0.201773  20.485442  7.883413e+08   
 2      000033.XSHE       2007-06  2007-07  0.126208  20.845732  1.130281e+09   
 3      000049.XSHE       2007-06  2007-07  0.471941  20.141541  5.589333e+08   
 4      000056.XSHE       2007-06  2007-07  0.156088  20.566809  8.551683e+08   
 5      000411.XSHE       2007-06  2007-07  0.430651  19.585247  3.204531e+08   
 6      000415.XSHE       2007-06  2007-07  0.165811  21.115524  1.480319e+09   
 7      000421.XSHE       2007-06  2007-07  0.146840  21.013886  1.337256e+09   
 8      000515.XSHE       2007-06  2007-07  0.252712  20.838220  1.121822e+09   
 9      000532.XSHE       2007-06  2007-07  0.321315  21.090814  1.444189e+09   
 ...            ...           ...      ...       ...        ...           ...   
 58370  600817.XSHG       2007-05  2007-06 -0.259142  20.932604  1.232862e+09   
 58371  600842.XSHG       2007-05  2007-06 -0.203433  20.925434  1.224054e+09   
 58372  600846.XSHG       2007-05  2007-06 -0.330308  21.306837  1.792429e+09   
 58373  600848.XSHG       2007-05  2007-06 -0.029055  20.366862  7.001897e+08   
 58374  600850.XSHG       2007-05  2007-06 -0.410932  20.656770  9.356667e+08   
 58375  600856.XSHG       2007-05  2007-06 -0.368356  20.756396  1.033685e+09   
 58376  600869.XSHG       2007-05  2007-06 -0.292351  21.137896  1.513811e+09   
 58377  600870.XSHG       2007-05  2007-06 -0.292416  21.209669  1.626455e+09   
 58378  600883.XSHG       2007-05  2007-06 -0.122108  20.870365  1.158468e+09   
 58379  600885.XSHG       2007-05  2007-06 -0.306797  20.460659  7.690443e+08   
 
              bm  
 0      0.231605  
 1      0.171424  
 2      0.276225  
 3      0.276072  
 4      0.191719  
 5      0.270130  
 6      0.269020  
 7      0.317755  
 8      0.309840  
 9      0.323877  
 ...         ...  
 58370  0.250712  
 58371  0.208030  
 58372  0.273707  
 58373  0.204301  
 58374  0.231505  
 58375  0.173433  
 58376  0.097180  
 58377  0.202155  
 58378  0.258326  
 58379  0.184045  
 
 [58380 rows x 7 columns],
 'bm1_size2':              secID grouping_date ret_date     exret       size        mktcap  \
 0      000001.XSHE       2007-06  2007-07  0.313877  24.476555  4.266117e+10   
 1      000002.XSHE       2007-06  2007-07  0.477505  25.259434  9.333248e+10   
 2      000006.XSHE       2007-06  2007-07  0.282520  22.417635  5.443212e+09   
 3      000024.XSHE       2007-06  2007-07  0.221415  23.468754  1.557222e+10   
 4      000031.XSHE       2007-06  2007-07  0.500025  22.876698  8.614372e+09   
 5      000040.XSHE       2007-06  2007-07  0.255868  21.707787  2.676529e+09   
 6      000043.XSHE       2007-06  2007-07  0.353304  21.331385  1.836973e+09   
 7      000060.XSHE       2007-06  2007-07  0.393288  23.262121  1.266516e+10   
 8      000061.XSHE       2007-06  2007-07 -0.002620  22.475253  5.766049e+09   
 9      000069.XSHE       2007-06  2007-07  0.220995  23.693526  1.949699e+10   
 ...            ...           ...      ...       ...        ...           ...   
 97693  600879.XSHG       2007-05  2007-06  0.026076  22.975368  9.507705e+09   
 97694  600880.XSHG       2007-05  2007-06 -0.123901  21.893247  3.221933e+09   
 97695  600881.XSHG       2007-05  2007-06  0.002865  23.759889  2.083477e+10   
 97696  600887.XSHG       2007-05  2007-06 -0.117072  23.471224  1.561072e+10   
 97697  600888.XSHG       2007-05  2007-06  0.077841  21.528878  2.238067e+09   
 97698  600895.XSHG       2007-05  2007-06 -0.118853  23.162926  1.146914e+10   
 97699  600896.XSHG       2007-05  2007-06 -0.019787  22.244590  4.578283e+09   
 97700  600962.XSHG       2007-05  2007-06 -0.162432  21.543806  2.271728e+09   
 97701  600970.XSHG       2007-05  2007-06 -0.068962  22.173919  4.265900e+09   
 97702  600981.XSHG       2007-05  2007-06  0.099843  21.754868  2.805558e+09   
 
              bm  
 0      0.166571  
 1      0.167751  
 2      0.243786  
 3      0.236311  
 4      0.175485  
 5      0.329985  
 6      0.235918  
 7      0.270314  
 8      0.250229  
 9      0.155310  
 ...         ...  
 97693  0.142725  
 97694  0.142852  
 97695  0.127255  
 97696  0.176896  
 97697  0.237113  
 97698  0.235866  
 97699  0.248476  
 97700  0.224796  
 97701  0.199396  
 97702  0.251969  
 
 [97703 rows x 7 columns],
 'bm2_size1':               secID grouping_date ret_date     exret       size        mktcap  \
 0       000019.XSHE       2007-06  2007-07 -0.002620  20.290546  6.487424e+08   
 1       000023.XSHE       2007-06  2007-07  0.348274  20.128646  5.517721e+08   
 2       000028.XSHE       2007-06  2007-07  0.618998  20.868838  1.156702e+09   
 3       000055.XSHE       2007-06  2007-07  0.230079  20.441209  7.542311e+08   
 4       000065.XSHE       2007-06  2007-07  0.315912  20.563070  8.519768e+08   
 5       000159.XSHE       2007-06  2007-07  0.317826  21.009003  1.330742e+09   
 6       000404.XSHE       2007-06  2007-07  0.464207  20.603151  8.868187e+08   
 7       000502.XSHE       2007-06  2007-07  0.959733  20.244751  6.197031e+08   
 8       000504.XSHE       2007-06  2007-07  0.374538  20.710027  9.868487e+08   
 9       000516.XSHE       2007-06  2007-07  0.141541  20.788909  1.067846e+09   
 ...             ...           ...      ...       ...        ...           ...   
 107477  600967.XSHG       2007-05  2007-06 -0.271018  20.856611  1.142644e+09   
 107478  600976.XSHG       2007-05  2007-06 -0.307428  21.175786  1.572269e+09   
 107479  600979.XSHG       2007-05  2007-06 -0.303603  20.689486  9.667840e+08   
 107480  600980.XSHG       2007-05  2007-06 -0.366915  20.430975  7.465511e+08   
 107481  600982.XSHG       2007-05  2007-06  0.004987  20.497738  7.980946e+08   
 107482  600983.XSHG       2007-05  2007-06 -0.318684  20.749450  1.026530e+09   
 107483  600985.XSHG       2007-05  2007-06 -0.335514  20.341439  6.826134e+08   
 107484  600993.XSHG       2007-05  2007-06 -0.124224  21.216830  1.638144e+09   
 107485  600995.XSHG       2007-05  2007-06 -0.245821  21.113707  1.477632e+09   
 107486  601008.XSHG       2007-05  2007-06 -0.018060  21.299654  1.779600e+09   
 
               bm  
 0       0.427137  
 1       0.545605  
 2       0.369999  
 3       0.661388  
 4       0.564491  
 5       0.435299  
 6       0.446288  
 7       0.351514  
 8       0.452700  
 9       0.464874  
 ...          ...  
 107477  0.426070  
 107478  0.453182  
 107479  0.452036  
 107480  0.487037  
 107481  0.464610  
 107482  0.529070  
 107483  0.426462  
 107484  0.393184  
 107485  0.401229  
 107486  0.426846  
 
 [107487 rows x 7 columns],
 'bm2_size2':               secID grouping_date ret_date     exret       size        mktcap  \
 0       000012.XSHE       2007-06  2007-07  0.253637  22.351744  5.096114e+09   
 1       000027.XSHE       2007-06  2007-07  0.123433  23.328501  1.353441e+10   
 2       000029.XSHE       2007-06  2007-07  0.213347  21.363237  1.896427e+09   
 3       000036.XSHE       2007-06  2007-07  0.148847  22.376371  5.223174e+09   
 4       000039.XSHE       2007-06  2007-07 -0.006713  23.878072  2.344849e+10   
 5       000046.XSHE       2007-06  2007-07  0.519563  22.760060  7.665997e+09   
 6       000063.XSHE       2007-06  2007-07  0.047591  24.000356  2.649856e+10   
 7       000070.XSHE       2007-06  2007-07 -0.277638  21.328446  1.831582e+09   
 8       000088.XSHE       2007-06  2007-07  0.048321  22.642563  6.816168e+09   
 9       000089.XSHE       2007-06  2007-07  0.117510  22.598489  6.522272e+09   
 ...             ...           ...      ...       ...        ...           ...   
 100491  600867.XSHG       2007-05  2007-06 -0.354046  22.170913  4.253096e+09   
 100492  600875.XSHG       2007-05  2007-06  0.219101  22.364401  5.161023e+09   
 100493  600884.XSHG       2007-05  2007-06  0.111874  22.126345  4.067705e+09   
 100494  600886.XSHG       2007-05  2007-06  0.007688  22.675347  7.043329e+09   
 100495  600900.XSHG       2007-05  2007-06  0.082105  24.871565  6.332622e+10   
 100496  600971.XSHG       2007-05  2007-06 -0.124636  21.570288  2.332689e+09   
 100497  600973.XSHG       2007-05  2007-06  0.034460  21.483241  2.138222e+09   
 100498  600978.XSHG       2007-05  2007-06  0.037508  22.245342  4.581730e+09   
 100499  600997.XSHG       2007-05  2007-06 -0.072709  22.484789  5.821295e+09   
 100500  601001.XSHG       2007-05  2007-06  0.000842  22.683079  7.098000e+09   
 
               bm  
 0       0.527283  
 1       0.369063  
 2       0.589420  
 3       0.361454  
 4       0.541703  
 5       0.447062  
 6       0.406505  
 7       0.337101  
 8       0.485723  
 9       0.522970  
 ...          ...  
 100491  0.300867  
 100492  0.494309  
 100493  0.361535  
 100494  0.380774  
 100495  0.455506  
 100496  0.448427  
 100497  0.341000  
 100498  0.477266  
 100499  0.473105  
 100500  0.505101  
 
 [100501 rows x 7 columns],
 'bm3_size1':              secID grouping_date ret_date     exret       size        mktcap  \
 0      000018.XSHE       2007-06  2007-07  0.276978  19.237899  2.264193e+08   
 1      000032.XSHE       2007-06  2007-07  0.275006  20.558908  8.484381e+08   
 2      000037.XSHE       2007-06  2007-07  0.228398  20.788640  1.067558e+09   
 3      000045.XSHE       2007-06  2007-07  0.331251  19.445612  2.786903e+08   
 4      000050.XSHE       2007-06  2007-07  0.197368  20.984007  1.297892e+09   
 5      000062.XSHE       2007-06  2007-07  0.256482  21.249745  1.692961e+09   
 6      000096.XSHE       2007-06  2007-07  0.547056  20.818688  1.100124e+09   
 7      000151.XSHE       2007-06  2007-07  0.384384  20.913814  1.209912e+09   
 8      000153.XSHE       2007-06  2007-07  0.207465  20.542752  8.348414e+08   
 9      000155.XSHE       2007-06  2007-07  0.178528  21.155886  1.541290e+09   
 ...            ...           ...      ...       ...        ...           ...   
 94197  600960.XSHG       2007-05  2007-06 -0.236967  20.426873  7.434956e+08   
 94198  600966.XSHG       2007-05  2007-06 -0.187255  21.425218  2.017687e+09   
 94199  600969.XSHG       2007-05  2007-06 -0.239106  20.687282  9.646560e+08   
 94200  600975.XSHG       2007-05  2007-06  0.107920  20.166645  5.731425e+08   
 94201  600986.XSHG       2007-05  2007-06  0.106797  20.640347  9.204254e+08   
 94202  600987.XSHG       2007-05  2007-06 -0.264676  21.128655  1.499886e+09   
 94203  600990.XSHG       2007-05  2007-06 -0.219190  20.034617  5.022540e+08   
 94204  600991.XSHG       2007-05  2007-06 -0.231495  21.073762  1.419772e+09   
 94205  600992.XSHG       2007-05  2007-06 -0.244937  20.633606  9.142420e+08   
 94206  601007.XSHG       2007-05  2007-06 -0.039934  20.999552  1.318225e+09   
 
              bm  
 0      1.214773  
 1      0.746448  
 2      1.487667  
 3      1.241401  
 4      0.685461  
 5      0.764958  
 6      1.225683  
 7      0.740278  
 8      0.748933  
 9      1.093114  
 ...         ...  
 94197  0.792550  
 94198  0.718100  
 94199  0.626084  
 94200  0.811844  
 94201  0.622055  
 94202  0.699624  
 94203  0.627999  
 94204  1.418501  
 94205  0.814164  
 94206  0.629433  
 
 [94207 rows x 7 columns],
 'bm3_size2':              secID grouping_date ret_date     exret       size        mktcap  \
 0      000016.XSHE       2007-06  2007-07  0.160653  21.518036  2.213931e+09   
 1      000021.XSHE       2007-06  2007-07  0.136982  22.393984  5.315985e+09   
 2      000022.XSHE       2007-06  2007-07  0.095368  21.633381  2.484608e+09   
 3      000042.XSHE       2007-06  2007-07  0.319595  21.542105  2.267866e+09   
 4      000059.XSHE       2007-06  2007-07  0.195433  21.712219  2.688418e+09   
 5      000066.XSHE       2007-06  2007-07  0.280551  21.604846  2.414713e+09   
 6      000090.XSHE       2007-06  2007-07  0.152888  21.858640  3.112339e+09   
 7      000420.XSHE       2007-06  2007-07  0.430264  21.313973  1.805264e+09   
 8      000429.XSHE       2007-06  2007-07  0.250411  21.895636  3.229638e+09   
 9      000488.XSHE       2007-06  2007-07  0.109754  22.670157  7.006869e+09   
 ...            ...           ...      ...       ...        ...           ...   
 61860  601318.XSHG       2007-05  2007-06  0.174394  24.279770  3.504050e+10   
 61861  601333.XSHG       2007-05  2007-06 -0.172301  23.538647  1.669954e+10   
 61862  601398.XSHG       2007-05  2007-06 -0.085398  24.641483  5.031067e+10   
 61863  601588.XSHG       2007-05  2007-06 -0.327006  23.388652  1.437350e+10   
 61864  601628.XSHG       2007-05  2007-06  0.108530  24.228823  3.330000e+10   
 61865  601666.XSHG       2007-05  2007-06 -0.063740  22.691706  7.159500e+09   
 61866  601699.XSHG       2007-05  2007-06  0.106236  22.615594  6.634800e+09   
 61867  601872.XSHG       2007-05  2007-06  0.073035  22.892914  8.755200e+09   
 61868  601988.XSHG       2007-05  2007-06 -0.126460  24.119138  2.984066e+10   
 61869  601991.XSHG       2007-05  2007-06  0.470307  22.689538  7.143993e+09   
 
               bm  
 0       1.533636  
 1       0.673904  
 2       0.967896  
 3       0.776067  
 4       0.714057  
 5       0.715789  
 6       0.899783  
 7       0.669211  
 8       0.918247  
 9       0.960996  
 ...          ...  
 61860   2.508041  
 61861   1.269615  
 61862   9.647098  
 61863   0.564303  
 61864   3.816847  
 61865   0.747574  
 61866   0.597250  
 61867   0.996823  
 61868  13.359792  
 61869   3.589068  
 
 [61870 rows x 7 columns]}

Fama French 3 factors

Fama-French 3 factors 的构建:

  • Market return 就是 CAPM 里面的市场收益率
  • 另外还有 SMB,HML,也即 Small-Minus-Big, High-Minus-Low

构建方法:

  • mktcap1 也叫做 Small, mktcap2 Big. bm1 Low, bm2 Medium, bm3 High.
  • 因此对应的,我们的
    • bm1_mktcap1: SL
    • bm2_mktcap1: SM
    • bm3_mktcap1: SH
    • bm1_mktcap2: BL
    • bm2_mktcap2: BM
    • bm3_mktcap2: BH
  • 在 Fama French (1993) 的构建里,mktcap 是在t年6月形成并保持到t+1年5月不变。bm和我们这里的构建一样,t年6月按照t-1年的BM ratio构建,保持到t+1年5月不变。
  • Fama French 计算了这6组资产组合每一年从7月到下一年6月(资产形成期的第二个月的收益率)的 value-weighted excess return。weight 是t年6月的mktcap占所在portfolio 总的 mktcap 的比重。
  • SMB: (SL+SM+SH)/3 - (BL+BM+BH)/3。这样构建的意思是把BM的影响平均掉。
  • HML: (SH+BH)/2 - (SL+BL)/2

这里我们还是按照mktcap组合的构建日期,不改成和 Fama-French (1993) 原文一样的日期(t年6月)

In [187]:
portfolios_vwret = {}
for pf in portfolios.keys():
    temp = portfolios[pf].groupby('ret_date')['mktcap'].agg({'mktcapsum':np.sum})
    portfolios[pf] = pd.merge(portfolios[pf], temp, on='ret_date')
    portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcapsum']
    portfolios[pf]['weighted_exret'] = portfolios[pf]['exret'] * portfolios[pf]['weight']
    portfolios_vwret[pf] = portfolios[pf].groupby('ret_date')['weighted_exret'].sum()

portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
portfolios_vwret_df.index = portfolios_vwret['bm1_size1'].index
portfolios_vwret_df.columns = portfolios_vwret.keys()
portfolios_vwret_df.rename(columns={"bm1_size1": "SL",
                                    "bm2_size1": "SM",
                                    "bm3_size1": "SH",
                                    "bm1_size2": "BL",
                                    "bm2_size2": "BM",
                                    "bm3_size2": "BH"},
                          inplace=True) # vw: value weighted
In [188]:
portfolios_vwret_df
Out[188]:
SL BL SM BM SH BH
ret_date
2007-05 0.071362 0.111080 0.080949 0.099096 0.053438 0.082722
2007-06 -0.201392 -0.054136 -0.176249 -0.084690 -0.182697 -0.075832
2007-07 0.235563 0.186060 0.256953 0.189900 0.280729 0.192456
2007-08 0.070802 0.149930 0.102963 0.151911 0.109116 0.181585
2007-09 0.022462 0.019254 0.018286 0.042721 0.047119 0.103958
2007-10 -0.105863 -0.000321 -0.115498 -0.037586 -0.116933 0.017789
2007-11 -0.031662 -0.179535 -0.046609 -0.136686 -0.036438 -0.148290
2007-12 0.205897 0.137131 0.197296 0.165248 0.191610 0.106343
2008-01 -0.082404 -0.103462 -0.060240 -0.103317 -0.042012 -0.169955
2008-02 0.086033 0.016389 0.119168 0.033018 0.104149 0.014488
... ... ... ... ... ... ...
2022-05 0.141931 0.061183 0.118054 0.064728 0.112364 0.010173
2022-06 0.078373 0.128686 0.083706 0.069574 0.085063 0.038331
2022-07 0.046719 -0.046237 0.054177 -0.018071 0.040918 -0.035799
2022-08 -0.045451 -0.042070 -0.031431 -0.021325 -0.027525 0.008635
2022-09 -0.091620 -0.073494 -0.094059 -0.071770 -0.091325 -0.044946
2022-10 0.058868 -0.055479 0.063400 -0.021031 0.038831 -0.049913
2022-11 0.073383 0.051119 0.078023 0.095316 0.084293 0.122252
2022-12 -0.035519 0.000507 -0.036035 -0.035794 -0.053285 -0.027306
2023-01 0.071397 0.070189 0.091062 0.064948 0.085336 0.045333
2023-02 0.033873 -0.020676 0.042664 0.028767 0.039253 0.002038

190 rows × 6 columns

In [189]:
SMB = (portfolios_vwret_df['SL'] + portfolios_vwret_df['SM'] + portfolios_vwret_df['SH']) / 3 - \
      (portfolios_vwret_df['BL'] + portfolios_vwret_df['BM'] + portfolios_vwret_df['BH']) / 3 

HML = (portfolios_vwret_df['SH'] + portfolios_vwret_df['BH']) / 2 - \
      (portfolios_vwret_df['SL'] + portfolios_vwret_df['BL']) / 2 

factors_df = pd.DataFrame(np.vstack([SMB,HML])).T
factors_df.columns = ['SMB','HML']
factors_df.index = SMB.index
In [190]:
factors_df
Out[190]:
SMB HML
ret_date
2007-05 -0.029050 -0.023142
2007-06 -0.115226 -0.001500
2007-07 0.068276 0.025781
2007-08 -0.066848 0.034985
2007-09 -0.026022 0.054680
2007-10 -0.106058 0.003520
2007-11 0.116601 0.013235
2007-12 0.062027 -0.022538
2008-01 0.064026 -0.013051
2008-02 0.081818 0.008108
... ... ...
2022-05 0.078755 -0.040289
2022-06 0.003517 -0.041832
2022-07 0.080640 0.002318
2022-08 -0.016549 0.034316
2022-09 -0.028932 0.014421
2022-10 0.095840 -0.007235
2022-11 -0.010996 0.041022
2022-12 -0.020749 -0.022789
2023-01 0.022442 -0.005458
2023-02 0.035220 0.014046

190 rows × 2 columns

百度百科:中证800指数是由中证指数有限公司编制,其成份股是由中证500和沪深300成份股一起构成,中证800指数综合反映沪深证券市场内大中小市值公司的整体状况。

In [191]:
# index_info = DataAPI.SecIDGet(assetClass="IDX",pandas="1")

# 用中证800作为market return
sec_id = ['000906.ZICN']
index_df = DataAPI.MktIdxdGet(indexID=sec_id,beginDate=START,endDate=END,field=['indexID','secShortName','tradeDate','closeIndex','CHGPct'],pandas="1")
index_df['tradeDate'] = pd.to_datetime(index_df['tradeDate'])
index_df['ret_date'] = index_df['tradeDate'].dt.to_period('M')

index_df.sort_values('tradeDate',inplace=True)
index_df = index_df.groupby('ret_date',as_index=False).last()
index_df['mktret'] = index_df['closeIndex'] / index_df['closeIndex'].shift() - 1

index_df = pd.merge(index_df,rf,left_on=['ret_date'],right_on=['ym'])
index_df['exmktret'] = index_df['mktret'] - index_df['rf']

index_df.drop(['ym','mktret','rf','indexID','secShortName','tradeDate',
               'closeIndex','CHGPct'],axis=1,inplace=True)

index_df.dropna(inplace=True)

factors_df = pd.merge(index_df, factors_df, on='ret_date')

factors_df['ret_date'] = factors_df['ret_date'].dt.to_timestamp(how='end').dt.normalize()

factors_df.set_index('ret_date',inplace=True)

((1 + factors_df).cumprod()*100).plot()
Out[191]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7ba3d0a990>
In [192]:
((1 + factors_df.loc['2018':]).cumprod()*100).plot()
Out[192]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7ba3de9250>
In [193]:
factors_df
Out[193]:
exmktret SMB HML
ret_date
2007-05-31 0.098693 -0.029050 -0.023142
2007-06-30 -0.074622 -0.115226 -0.001500
2007-07-31 0.192240 0.068276 0.025781
2007-08-31 0.167193 -0.066848 0.034985
2007-09-30 0.047263 -0.026022 0.054680
2007-10-31 -0.010382 -0.106058 0.003520
2007-11-30 -0.157389 0.116601 0.013235
2007-12-31 0.137366 0.062027 -0.022538
2008-01-31 -0.123254 0.064026 -0.013051
2008-02-29 0.024010 0.081818 0.008108
... ... ... ...
2022-05-31 0.028896 0.078755 -0.040289
2022-06-30 0.088380 0.003517 -0.041832
2022-07-31 -0.060974 0.080640 0.002318
2022-08-31 -0.023276 -0.016549 0.034316
2022-09-30 -0.069642 -0.028932 0.014421
2022-10-31 -0.056037 0.095840 -0.007235
2022-11-30 0.086374 -0.010996 0.041022
2022-12-31 -0.010731 -0.020749 -0.022789
2023-01-31 0.071339 0.022442 -0.005458
2023-02-28 -0.014930 0.035220 0.014046

190 rows × 3 columns

In [194]:
factors_df.to_csv('./output_data/factors/ff3.csv')

Long-only factors

In [195]:
small_only = (portfolios_vwret_df['SL'] + portfolios_vwret_df['SM'] + portfolios_vwret_df['SH']) / 3 

high_only = (portfolios_vwret_df['SH'] + portfolios_vwret_df['BH']) / 2 

factors_long_df = pd.DataFrame(np.vstack([small_only,high_only])).T
factors_long_df.columns = ['small_only','high_only']
factors_long_df.index = small_only.index

factors_long_df = pd.merge(index_df, factors_long_df, on='ret_date')

factors_long_df['ret_date'] = factors_long_df['ret_date'].dt.to_timestamp(freq='day',how='end').dt.normalize()

factors_long_df.set_index('ret_date',inplace=True)

((1 + factors_long_df).cumprod()*100).plot()
Out[195]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7ba3c38f90>
In [196]:
((1 + factors_long_df.loc['2018':]).cumprod()*100).plot()
Out[196]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7ba3bc9e10>
In [197]:
factors_long_df
Out[197]:
exmktret small_only high_only
ret_date
2007-05-31 0.098693 0.068583 0.068080
2007-06-30 -0.074622 -0.186779 -0.129264
2007-07-31 0.192240 0.257748 0.236593
2007-08-31 0.167193 0.094294 0.145350
2007-09-30 0.047263 0.029289 0.075538
2007-10-31 -0.010382 -0.112765 -0.049572
2007-11-30 -0.157389 -0.038236 -0.092364
2007-12-31 0.137366 0.198268 0.148977
2008-01-31 -0.123254 -0.061552 -0.105984
2008-02-29 0.024010 0.103117 0.059319
... ... ... ...
2022-05-31 0.028896 0.124116 0.061268
2022-06-30 0.088380 0.082381 0.061697
2022-07-31 -0.060974 0.047271 0.002560
2022-08-31 -0.023276 -0.034803 -0.009445
2022-09-30 -0.069642 -0.092335 -0.068136
2022-10-31 -0.056037 0.053699 -0.005541
2022-11-30 0.086374 0.078566 0.103273
2022-12-31 -0.010731 -0.041613 -0.040295
2023-01-31 0.071339 0.082598 0.065335
2023-02-28 -0.014930 0.038597 0.020645

190 rows × 3 columns

In [198]:
factors_long_df.to_csv('./output_data/factors/ff3_long_only.csv')
In [ ]:
 
In [ ]: