In [1]:
import numpy as np
import pandas as pd
import datetime as dt

pd.set_option('display.max_rows', 16)

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)
import seaborn as sns

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

import gc
In [2]:
plt.rcParams['figure.figsize'] = (16.0, 9.0)

数据处理

财务数据

财务数据处理的难点在于“报表数据所处的时间”、“报表报告的时间”、“报表修改时间”带来的复杂性。两种处理方式比较合理:

  1. 预留充足的时间以便在使用报表数据的时间点上,报表数据是可用的(但不一定是最新的)
  2. 无论在哪个时间点上使用报表数据,都只用最新的数据(point-in-time)

如果仔细探究,会(惊讶)发现,很多的研究,都采用了“事后修改”过的数据(例如GDP数据)。如果做的是“预测”性质的研究,那么站在 t 时刻,是不可能知道在 t+5 时刻会修改 t-1 时刻的数据,来进行 t+1 时刻的预测的。因此,很多的研究都站不住脚。

交易数据

停牌

  • 停牌在某些时候是可以不处理的,比如计算动量的时候,停牌之后的价格和停牌前的价格计算收益率,可以作为动量的一种衡量
  • 但在有的时候,停牌不处理可能会有问题。
    • 比如计算beta,市场收益率每个交易日都是有的,但个股停牌的时候没有,此时如果设为0,直接回归会有大的偏差
    • 另外比如计算波动率,如果设为0,也有问题
    • 从收益率的角度看,如果我们关注点是月收益率,也应当去掉,因为停牌的股票无法交易,也无法调仓
  • 我们把停牌超过一个月的观测值删去

Data

In [3]:
START = '2008-01-01'
END = '2025-03-31'
In [4]:
# Security Id
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
cond1 = (stk_info['exchangeCD'] == 'XSHE') | (stk_info['exchangeCD'] == 'XSHG')
cond2 = (stk_info['listStatusCD'] == 'L') | (stk_info['listStatusCD'] == 'DE')
stk_info = stk_info[cond1 & cond2].copy()
stk_id = stk_info['secID'].unique()
In [5]:
stk_info
Out[5]:
secID ticker secShortName cnSpell exchangeCD assetClass listStatusCD listDate transCurrCD ISIN partyID delistDate
0 000001.XSHE 000001 平安银行 PAYH XSHE E L 1991-04-03 CNY CNE000000040 2.0 NaN
1 000002.XSHE 000002 万科A WKA XSHE E L 1991-01-29 CNY CNE0000000T2 3.0 NaN
2 000003.XSHE 000003 PT金田A PTJTA XSHE E DE 1991-07-03 CNY CNE1000031Y5 4.0 2002-06-14
3 000004.XSHE 000004 国华网安 GHWA XSHE E L 1990-12-01 CNY CNE0000000Y2 5.0 NaN
4 000005.XSHE 000005 ST星源 STXY XSHE E DE 1990-12-10 CNY CNE0000001L7 6.0 2024-04-26
5 000006.XSHE 000006 深振业A SZYA XSHE E L 1992-04-27 CNY CNE000000164 7.0 NaN
6 000007.XSHE 000007 全新好 QXH XSHE E L 1992-04-13 CNY CNE0000000P0 8.0 NaN
7 000008.XSHE 000008 神州高铁 SZGT XSHE E L 1992-05-07 CNY CNE0000001C6 9.0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
27284 900950.XSHG 900950 新城B股 XCBG XSHG E DE 1997-10-16 USD CNE000000TH1 1429.0 2015-11-23
27285 900951.XSHG 900951 退市大化 TSDH XSHG E DE 1997-10-21 USD CNE000000TJ7 1430.0 2020-08-27
27286 900952.XSHG 900952 ST锦港B STJGB XSHG E L 1998-05-19 USD CNE000000W88 763.0 NaN
27287 900953.XSHG 900953 凯马B KMB XSHG E DE 1998-06-24 USD CNE000000WP8 1431.0 2024-07-05
27288 900955.XSHG 900955 退市海B TSHB XSHG E DE 1999-01-18 USD CNE000000YC2 1063.0 2022-07-13
27289 900956.XSHG 900956 东贝B股 DBBG XSHG E DE 1999-07-15 USD CNE000000ZS5 1432.0 2020-11-23
27290 900957.XSHG 900957 凌云B股 LYBG XSHG E L 2000-07-28 USD CNE0000013W9 1433.0 NaN
32144 DY600018.XSHG DY600018 上港集箱 SGJX XSHG E DE 2000-07-19 CNY NaN 618.0 2006-10-20

5547 rows × 12 columns

In [6]:
len(stk_id)
Out[6]:
5547

ST

In [7]:
st_df = DataAPI.SecSTGet(beginDate=START,endDate=END,secID=stk_id,field=['secID','tradeDate','STflg'],pandas="1")
In [8]:
st_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 562914 entries, 0 to 562913
Data columns (total 3 columns):
secID        562914 non-null object
tradeDate    562914 non-null object
STflg        562914 non-null object
dtypes: object(3)
memory usage: 12.9+ MB
In [9]:
st_df
Out[9]:
secID tradeDate STflg
0 000004.XSHE 2008-01-02 ST
1 000004.XSHE 2008-01-03 ST
2 000004.XSHE 2008-01-04 ST
3 000004.XSHE 2008-01-07 ST
4 000004.XSHE 2008-01-08 ST
5 000004.XSHE 2008-01-09 ST
6 000004.XSHE 2008-01-10 ST
7 000004.XSHE 2008-01-11 ST
... ... ... ...
562906 900955.XSHG 2022-06-06 *ST
562907 900955.XSHG 2022-06-07 *ST
562908 900955.XSHG 2022-06-08 *ST
562909 900955.XSHG 2022-06-09 *ST
562910 900955.XSHG 2022-06-10 *ST
562911 900955.XSHG 2022-06-13 *ST
562912 900955.XSHG 2022-06-14 *ST
562913 900955.XSHG 2022-06-15 *ST

562914 rows × 3 columns

In [10]:
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'],format="%Y-%m-%d")

Book value

Book/Market ratio, 简称BM,也即价值因子,反映了公司的账面价值和市值的比值。Fama French (1993) 发现估值低(BM高)的股票和高的相比,预期收益为正。

BM ratio Fama-French(1993) 原文的构造方法:

  • 每年的12月底的 book equity
  • 每年12月最后一个交易日的mktcap
  • 上述二者相除,得到 BM ratio
  • 这个 BM ratio 作为下一年7月至下下一年6月的 portfolio 的 sorting variable

处理思路:

  • 优矿的数据有发布日期,数据日期
  • 这里book value比较简单,只取年报数据,也就是“数据日期”都是12月
  • 取发布日期最晚,也就是最新的(也许年报和1季报中数据不同,或者年报发布后马上有更改),但不晚于次年6月
In [11]:
# fundmen_df = DataAPI.FdmtBSGet(secID=stk_id,reportType="A",beginDate=START,endDate=END,publishDateEnd=u"",publishDateBegin=u"",endDateRep="",beginDateRep="",beginYear="",endYear="",fiscalPeriod="",field=["secID","publishDate","endDate","endDateRep","actPubtime","fiscalPeriod","TShEquity","TEquityAttrP","minorityInt"],pandas="1")

# fundmen_df.to_pickle('./data/fundmen_df.pkl')
In [12]:
fundmen_df = pd.read_pickle('./data/fundmen_df.pkl')
In [13]:
fundmen_df
Out[13]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
0 000001.XSHE 2025-03-15 2024-12-31 2024-12-31 2025-03-14 19:28:13 12 4.948420e+11 4.948420e+11 NaN
1 000001.XSHE 2025-03-15 2023-12-31 2024-12-31 2025-03-14 19:28:13 12 4.723280e+11 4.723280e+11 NaN
2 000001.XSHE 2024-10-19 2023-12-31 2024-09-30 2024-10-18 18:00:33 12 4.723280e+11 4.723280e+11 NaN
3 000001.XSHE 2024-08-16 2023-12-31 2024-06-30 2024-08-15 19:33:28 12 4.723280e+11 4.723280e+11 NaN
4 000001.XSHE 2024-04-20 2023-12-31 2024-03-31 2024-04-19 17:56:30 12 4.723280e+11 4.723280e+11 NaN
5 000001.XSHE 2024-03-15 2023-12-31 2023-12-31 2024-03-14 18:46:58 12 4.723280e+11 4.723280e+11 NaN
6 000001.XSHE 2024-03-15 2022-12-31 2023-12-31 2024-03-14 18:46:58 12 4.346800e+11 4.346800e+11 NaN
7 000001.XSHE 2023-10-25 2022-12-31 2023-09-30 2023-10-24 17:52:46 12 4.346800e+11 4.346800e+11 NaN
... ... ... ... ... ... ... ... ... ...
329413 900957.XSHG 2010-08-14 2009-12-31 2010-06-30 2010-08-13 18:00:00 12 4.782246e+08 4.221760e+08 56048539.97
329414 900957.XSHG 2010-04-21 2009-12-31 2010-03-31 2010-04-20 18:00:00 12 4.782246e+08 4.221760e+08 56048539.97
329415 900957.XSHG 2010-03-30 2009-12-31 2009-12-31 2010-03-29 18:00:00 12 4.782246e+08 4.221760e+08 56048539.97
329416 900957.XSHG 2010-03-30 2008-12-31 2009-12-31 2010-03-29 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
329417 900957.XSHG 2009-10-26 2008-12-31 2009-09-30 2009-10-25 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
329418 900957.XSHG 2009-08-01 2008-12-31 2009-06-30 2009-07-31 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
329419 900957.XSHG 2009-04-18 2008-12-31 2009-03-31 2009-04-17 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
329420 900957.XSHG 2009-03-26 2008-12-31 2008-12-31 2009-03-25 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94

329421 rows × 9 columns

In [14]:
fundmen_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329421 entries, 0 to 329420
Data columns (total 9 columns):
secID           329421 non-null object
publishDate     329421 non-null object
endDate         329421 non-null object
endDateRep      329421 non-null object
actPubtime      329421 non-null object
fiscalPeriod    329421 non-null object
TShEquity       327631 non-null float64
TEquityAttrP    327628 non-null float64
minorityInt     247471 non-null float64
dtypes: float64(3), object(6)
memory usage: 22.6+ MB
In [15]:
fundmen_df['fiscalPeriod'].unique()
Out[15]:
array(['12'], dtype=object)
In [16]:
fundmen_df[(fundmen_df['secID']=='000001.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
Out[16]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
21 000001.XSHE 2021-02-02 2019-12-31 2020-12-31 2021-02-01 18:58:35 12 3.129830e+11 3.129830e+11 NaN
22 000001.XSHE 2020-10-22 2019-12-31 2020-09-30 2020-10-21 19:21:43 12 3.129830e+11 3.129830e+11 NaN
23 000001.XSHE 2020-08-28 2019-12-31 2020-06-30 2020-08-27 17:50:41 12 3.129830e+11 3.129830e+11 NaN
24 000001.XSHE 2020-04-21 2019-12-31 2020-03-31 2020-04-20 18:42:38 12 3.129830e+11 3.129830e+11 NaN
25 000001.XSHE 2020-02-14 2019-12-31 2019-12-31 2020-02-13 19:02:36 12 3.129830e+11 3.129830e+11 NaN
In [17]:
fundmen_df[(fundmen_df['secID']=='300720.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
Out[17]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
168698 300720.XSHE 2021-04-27 2019-12-31 2020-12-31 2021-04-26 18:40:28 12 4.783596e+08 4.783596e+08 NaN
168699 300720.XSHE 2020-10-30 2019-12-31 2020-09-30 2020-10-29 19:58:45 12 4.783596e+08 4.783596e+08 NaN
168700 300720.XSHE 2020-08-28 2019-12-31 2020-06-30 2020-08-27 22:42:40 12 4.783596e+08 4.783596e+08 NaN
168701 300720.XSHE 2020-04-24 2019-12-31 2020-03-31 2020-04-23 21:04:35 12 4.783596e+08 4.783596e+08 NaN
168702 300720.XSHE 2020-04-24 2019-12-31 2019-12-31 2020-04-23 21:04:35 12 4.783596e+08 4.783596e+08 NaN
In [18]:
fundmen_df[(fundmen_df['secID'] == '300720.XSHE') & (fundmen_df['endDate']=='2018-12-31')]
Out[18]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
168703 300720.XSHE 2020-04-24 2018-12-31 2019-12-31 2020-04-23 21:04:35 12 4.555515e+08 4.555515e+08 NaN
168704 300720.XSHE 2019-10-30 2018-12-31 2019-09-30 2019-10-29 19:22:34 12 4.555515e+08 4.555515e+08 NaN
168705 300720.XSHE 2019-08-28 2018-12-31 2019-06-30 2019-08-27 19:42:06 12 4.555515e+08 4.555515e+08 NaN
168706 300720.XSHE 2019-04-26 2018-12-31 2019-03-31 2019-04-25 23:27:06 12 4.555515e+08 4.555515e+08 NaN
168707 300720.XSHE 2019-04-26 2018-12-31 2018-12-31 2019-04-25 23:27:06 12 4.555515e+08 4.555515e+08 NaN
  • publishDate: 实际公告日期
  • endDate:数值所在日期
  • endDateRep:数值所在报表日期(数值是哪个报表发布的)。03-31是一季报,06-30是半年报,09-30是三季报,12-31是年报。

后面的报表可能会对初始值做修改。

比如,300720.XSHE在2020-04-24公布了数据截止至2019-12-31的报告,里面包含了数据截止至2018-12-31的报表数据。

300720.XSHE在2019-08-28公布了数据截止至2019-06-30的报告,里面包含了数据截止至2018-12-31的报表数据。

在t年6月分组时,应当取最新更新过的t-1年12月31日的Book数值。

In [19]:
fundmen_df[['publishDate','endDate']] = fundmen_df[['publishDate','endDate']].apply(pd.to_datetime)
In [20]:
fundmen_df['pub_month'] = fundmen_df['publishDate'].dt.month
fundmen_df['pub_year'] = fundmen_df['publishDate'].dt.year
fundmen_df['data_year'] = fundmen_df['endDate'].dt.year
In [21]:
fundmen_df
Out[21]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
0 000001.XSHE 2025-03-15 2024-12-31 2024-12-31 2025-03-14 19:28:13 12 4.948420e+11 4.948420e+11 NaN 3 2025 2024
1 000001.XSHE 2025-03-15 2023-12-31 2024-12-31 2025-03-14 19:28:13 12 4.723280e+11 4.723280e+11 NaN 3 2025 2023
2 000001.XSHE 2024-10-19 2023-12-31 2024-09-30 2024-10-18 18:00:33 12 4.723280e+11 4.723280e+11 NaN 10 2024 2023
3 000001.XSHE 2024-08-16 2023-12-31 2024-06-30 2024-08-15 19:33:28 12 4.723280e+11 4.723280e+11 NaN 8 2024 2023
4 000001.XSHE 2024-04-20 2023-12-31 2024-03-31 2024-04-19 17:56:30 12 4.723280e+11 4.723280e+11 NaN 4 2024 2023
5 000001.XSHE 2024-03-15 2023-12-31 2023-12-31 2024-03-14 18:46:58 12 4.723280e+11 4.723280e+11 NaN 3 2024 2023
6 000001.XSHE 2024-03-15 2022-12-31 2023-12-31 2024-03-14 18:46:58 12 4.346800e+11 4.346800e+11 NaN 3 2024 2022
7 000001.XSHE 2023-10-25 2022-12-31 2023-09-30 2023-10-24 17:52:46 12 4.346800e+11 4.346800e+11 NaN 10 2023 2022
... ... ... ... ... ... ... ... ... ... ... ... ...
329413 900957.XSHG 2010-08-14 2009-12-31 2010-06-30 2010-08-13 18:00:00 12 4.782246e+08 4.221760e+08 56048539.97 8 2010 2009
329414 900957.XSHG 2010-04-21 2009-12-31 2010-03-31 2010-04-20 18:00:00 12 4.782246e+08 4.221760e+08 56048539.97 4 2010 2009
329415 900957.XSHG 2010-03-30 2009-12-31 2009-12-31 2010-03-29 18:00:00 12 4.782246e+08 4.221760e+08 56048539.97 3 2010 2009
329416 900957.XSHG 2010-03-30 2008-12-31 2009-12-31 2010-03-29 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94 3 2010 2008
329417 900957.XSHG 2009-10-26 2008-12-31 2009-09-30 2009-10-25 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94 10 2009 2008
329418 900957.XSHG 2009-08-01 2008-12-31 2009-06-30 2009-07-31 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94 8 2009 2008
329419 900957.XSHG 2009-04-18 2008-12-31 2009-03-31 2009-04-17 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94 4 2009 2008
329420 900957.XSHG 2009-03-26 2008-12-31 2008-12-31 2009-03-25 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94 3 2009 2008

329421 rows × 12 columns

In [22]:
fundmen_df[(fundmen_df['secID']=='300720.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
Out[22]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
168698 300720.XSHE 2021-04-27 2019-12-31 2020-12-31 2021-04-26 18:40:28 12 4.783596e+08 4.783596e+08 NaN 4 2021 2019
168699 300720.XSHE 2020-10-30 2019-12-31 2020-09-30 2020-10-29 19:58:45 12 4.783596e+08 4.783596e+08 NaN 10 2020 2019
168700 300720.XSHE 2020-08-28 2019-12-31 2020-06-30 2020-08-27 22:42:40 12 4.783596e+08 4.783596e+08 NaN 8 2020 2019
168701 300720.XSHE 2020-04-24 2019-12-31 2020-03-31 2020-04-23 21:04:35 12 4.783596e+08 4.783596e+08 NaN 4 2020 2019
168702 300720.XSHE 2020-04-24 2019-12-31 2019-12-31 2020-04-23 21:04:35 12 4.783596e+08 4.783596e+08 NaN 4 2020 2019
In [23]:
fundmen_df['pub_year'] - fundmen_df['data_year'] 
Out[23]:
0         1
1         2
2         1
3         1
4         1
5         1
6         2
7         1
         ..
329413    1
329414    1
329415    1
329416    2
329417    1
329418    1
329419    1
329420    1
Length: 329421, dtype: int64
In [24]:
(fundmen_df['pub_year'] - fundmen_df['data_year']).unique()
Out[24]:
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16])
In [25]:
(fundmen_df['pub_year'] - fundmen_df['data_year']).value_counts()
Out[25]:
1     242527
2      72935
3      11273
4       1906
5        362
6        191
7         98
8         48
9         27
10        17
11        13
12        11
13         7
14         3
15         2
16         1
dtype: int64
In [26]:
fundmen_df.loc[(fundmen_df['pub_year'] - fundmen_df['data_year'])==16]
Out[26]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
208880 600287.XSHG 2024-04-30 2008-12-31 2009-12-31 2024-04-29 20:52:15 12 1.137622e+09 8.267516e+08 3.108704e+08 4 2024 2008
In [27]:
fundmen_df.drop(['actPubtime','fiscalPeriod'],axis=1, inplace=True)
In [28]:
fundmen_df
Out[28]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
0 000001.XSHE 2025-03-15 2024-12-31 2024-12-31 4.948420e+11 4.948420e+11 NaN 3 2025 2024
1 000001.XSHE 2025-03-15 2023-12-31 2024-12-31 4.723280e+11 4.723280e+11 NaN 3 2025 2023
2 000001.XSHE 2024-10-19 2023-12-31 2024-09-30 4.723280e+11 4.723280e+11 NaN 10 2024 2023
3 000001.XSHE 2024-08-16 2023-12-31 2024-06-30 4.723280e+11 4.723280e+11 NaN 8 2024 2023
4 000001.XSHE 2024-04-20 2023-12-31 2024-03-31 4.723280e+11 4.723280e+11 NaN 4 2024 2023
5 000001.XSHE 2024-03-15 2023-12-31 2023-12-31 4.723280e+11 4.723280e+11 NaN 3 2024 2023
6 000001.XSHE 2024-03-15 2022-12-31 2023-12-31 4.346800e+11 4.346800e+11 NaN 3 2024 2022
7 000001.XSHE 2023-10-25 2022-12-31 2023-09-30 4.346800e+11 4.346800e+11 NaN 10 2023 2022
... ... ... ... ... ... ... ... ... ... ...
329413 900957.XSHG 2010-08-14 2009-12-31 2010-06-30 4.782246e+08 4.221760e+08 56048539.97 8 2010 2009
329414 900957.XSHG 2010-04-21 2009-12-31 2010-03-31 4.782246e+08 4.221760e+08 56048539.97 4 2010 2009
329415 900957.XSHG 2010-03-30 2009-12-31 2009-12-31 4.782246e+08 4.221760e+08 56048539.97 3 2010 2009
329416 900957.XSHG 2010-03-30 2008-12-31 2009-12-31 4.902596e+08 4.369354e+08 53324231.94 3 2010 2008
329417 900957.XSHG 2009-10-26 2008-12-31 2009-09-30 4.902596e+08 4.369354e+08 53324231.94 10 2009 2008
329418 900957.XSHG 2009-08-01 2008-12-31 2009-06-30 4.902596e+08 4.369354e+08 53324231.94 8 2009 2008
329419 900957.XSHG 2009-04-18 2008-12-31 2009-03-31 4.902596e+08 4.369354e+08 53324231.94 4 2009 2008
329420 900957.XSHG 2009-03-26 2008-12-31 2008-12-31 4.902596e+08 4.369354e+08 53324231.94 3 2009 2008

329421 rows × 10 columns

In [29]:
# 每年6月底计算时,只能看到publishDate在6月之前的数值。
# 取 endDate 相同时,publishDate 最晚(但小于等于6)的那个数值
# 同时pub_year - data_year 不能大于1(最近的报告)
fundmen_df['pub_month'] = fundmen_df['publishDate'].dt.month
fundmen_df['pub_year'] = fundmen_df['publishDate'].dt.year
fundmen_df['data_year'] = fundmen_df['endDate'].dt.year
fundmen_df = fundmen_df[(fundmen_df['pub_year'] - fundmen_df['data_year']) == 1]
fundmen_df = fundmen_df[fundmen_df['pub_month'] <= 6]
fundmen_df.sort_values(['secID','endDate','publishDate','endDateRep'],inplace=True)
In [30]:
fundmen_df[(fundmen_df['secID']=='300720.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
Out[30]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
168702 300720.XSHE 2020-04-24 2019-12-31 2019-12-31 4.783596e+08 4.783596e+08 NaN 4 2020 2019
168701 300720.XSHE 2020-04-24 2019-12-31 2020-03-31 4.783596e+08 4.783596e+08 NaN 4 2020 2019
In [31]:
fundmen_df.loc[fundmen_df.duplicated(['secID','endDate'], keep=False)]
Out[31]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
80 000001.XSHE 2009-03-20 2008-12-31 2008-12-31 1.640079e+10 1.640079e+10 NaN 3 2009 2008
79 000001.XSHE 2009-04-24 2008-12-31 2009-03-31 1.640079e+10 1.640079e+10 NaN 4 2009 2008
75 000001.XSHE 2010-03-12 2009-12-31 2009-12-31 2.046961e+10 2.046961e+10 NaN 3 2010 2009
74 000001.XSHE 2010-04-29 2009-12-31 2010-03-31 2.046961e+10 2.046961e+10 NaN 4 2010 2009
70 000001.XSHE 2011-02-25 2010-12-31 2010-12-31 3.351288e+10 3.351288e+10 NaN 2 2011 2010
69 000001.XSHE 2011-04-27 2010-12-31 2011-03-31 3.351288e+10 3.351288e+10 NaN 4 2011 2010
65 000001.XSHE 2012-03-09 2011-12-31 2011-12-31 7.538058e+10 7.331084e+10 2.069747e+09 3 2012 2011
64 000001.XSHE 2012-04-26 2011-12-31 2012-03-31 7.538058e+10 7.331084e+10 2.069747e+09 4 2012 2011
... ... ... ... ... ... ... ... ... ... ...
329358 900957.XSHG 2021-04-09 2020-12-31 2020-12-31 4.987276e+08 4.979110e+08 8.165551e+05 4 2021 2020
329357 900957.XSHG 2021-04-27 2020-12-31 2021-03-31 4.987276e+08 4.979110e+08 8.165551e+05 4 2021 2020
329353 900957.XSHG 2022-04-20 2021-12-31 2021-12-31 5.263733e+08 5.255741e+08 7.991940e+05 4 2022 2021
329352 900957.XSHG 2022-04-30 2021-12-31 2022-03-31 5.263733e+08 5.255741e+08 7.991940e+05 4 2022 2021
329348 900957.XSHG 2023-04-08 2022-12-31 2022-12-31 5.669258e+08 5.660700e+08 8.557882e+05 4 2023 2022
329347 900957.XSHG 2023-04-27 2022-12-31 2023-03-31 5.669258e+08 5.660700e+08 8.557882e+05 4 2023 2022
329343 900957.XSHG 2024-04-12 2023-12-31 2023-12-31 6.071647e+08 6.063475e+08 8.171545e+05 4 2024 2023
329342 900957.XSHG 2024-04-26 2023-12-31 2024-03-31 6.071647e+08 6.063475e+08 8.171545e+05 4 2024 2023

117967 rows × 10 columns

In [32]:
fundmen_df.loc[fundmen_df['pub_month'].idxmax()] 
# 观察最晚的披露时间是否超过了6月?
Out[32]:
secID                   000005.XSHE
publishDate     2013-06-15 00:00:00
endDate         2012-12-31 00:00:00
endDateRep               2012-12-31
TShEquity               6.82099e+08
TEquityAttrP            6.82099e+08
minorityInt                     NaN
pub_month                         6
pub_year                       2013
data_year                      2012
Name: 347, dtype: object
In [33]:
fundmen_df.shape
Out[33]:
(120140, 10)
In [34]:
fundmen_df.drop_duplicates(['secID','endDate'],keep='last')
Out[34]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
79 000001.XSHE 2009-04-24 2008-12-31 2009-03-31 1.640079e+10 1.640079e+10 NaN 4 2009 2008
74 000001.XSHE 2010-04-29 2009-12-31 2010-03-31 2.046961e+10 2.046961e+10 NaN 4 2010 2009
69 000001.XSHE 2011-04-27 2010-12-31 2011-03-31 3.351288e+10 3.351288e+10 NaN 4 2011 2010
64 000001.XSHE 2012-04-26 2011-12-31 2012-03-31 7.538058e+10 7.331084e+10 2.069747e+09 4 2012 2011
59 000001.XSHE 2013-04-24 2012-12-31 2013-03-31 8.479900e+10 8.479900e+10 NaN 4 2013 2012
54 000001.XSHE 2014-04-24 2013-12-31 2014-03-31 1.120810e+11 1.120810e+11 NaN 4 2014 2013
49 000001.XSHE 2015-04-24 2014-12-31 2015-03-31 1.309490e+11 1.309490e+11 NaN 4 2015 2014
44 000001.XSHE 2016-04-21 2015-12-31 2016-03-31 1.615000e+11 1.615000e+11 NaN 4 2016 2015
... ... ... ... ... ... ... ... ... ... ...
329377 900957.XSHG 2017-04-26 2016-12-31 2017-03-31 3.938268e+08 3.930721e+08 7.546643e+05 4 2017 2016
329372 900957.XSHG 2018-04-26 2017-12-31 2018-03-31 4.238426e+08 4.231040e+08 7.386715e+05 4 2018 2017
329367 900957.XSHG 2019-04-25 2018-12-31 2019-03-31 4.515278e+08 4.508051e+08 7.226781e+05 4 2019 2018
329362 900957.XSHG 2020-04-29 2019-12-31 2020-03-31 4.768689e+08 4.761021e+08 7.667705e+05 4 2020 2019
329357 900957.XSHG 2021-04-27 2020-12-31 2021-03-31 4.987276e+08 4.979110e+08 8.165551e+05 4 2021 2020
329352 900957.XSHG 2022-04-30 2021-12-31 2022-03-31 5.263733e+08 5.255741e+08 7.991940e+05 4 2022 2021
329347 900957.XSHG 2023-04-27 2022-12-31 2023-03-31 5.669258e+08 5.660700e+08 8.557882e+05 4 2023 2022
329342 900957.XSHG 2024-04-26 2023-12-31 2024-03-31 6.071647e+08 6.063475e+08 8.171545e+05 4 2024 2023

57446 rows × 10 columns

In [35]:
fundmen_df.groupby(['secID','endDate'],as_index=False).last()
Out[35]:
secID endDate publishDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year
0 000001.XSHE 2008-12-31 2009-04-24 2009-03-31 1.640079e+10 1.640079e+10 NaN 4 2009 2008
1 000001.XSHE 2009-12-31 2010-04-29 2010-03-31 2.046961e+10 2.046961e+10 NaN 4 2010 2009
2 000001.XSHE 2010-12-31 2011-04-27 2011-03-31 3.351288e+10 3.351288e+10 NaN 4 2011 2010
3 000001.XSHE 2011-12-31 2012-04-26 2012-03-31 7.538058e+10 7.331084e+10 2.069747e+09 4 2012 2011
4 000001.XSHE 2012-12-31 2013-04-24 2013-03-31 8.479900e+10 8.479900e+10 NaN 4 2013 2012
5 000001.XSHE 2013-12-31 2014-04-24 2014-03-31 1.120810e+11 1.120810e+11 NaN 4 2014 2013
6 000001.XSHE 2014-12-31 2015-04-24 2015-03-31 1.309490e+11 1.309490e+11 NaN 4 2015 2014
7 000001.XSHE 2015-12-31 2016-04-21 2016-03-31 1.615000e+11 1.615000e+11 NaN 4 2016 2015
... ... ... ... ... ... ... ... ... ... ...
57438 900957.XSHG 2016-12-31 2017-04-26 2017-03-31 3.938268e+08 3.930721e+08 7.546643e+05 4 2017 2016
57439 900957.XSHG 2017-12-31 2018-04-26 2018-03-31 4.238426e+08 4.231040e+08 7.386715e+05 4 2018 2017
57440 900957.XSHG 2018-12-31 2019-04-25 2019-03-31 4.515278e+08 4.508051e+08 7.226781e+05 4 2019 2018
57441 900957.XSHG 2019-12-31 2020-04-29 2020-03-31 4.768689e+08 4.761021e+08 7.667705e+05 4 2020 2019
57442 900957.XSHG 2020-12-31 2021-04-27 2021-03-31 4.987276e+08 4.979110e+08 8.165551e+05 4 2021 2020
57443 900957.XSHG 2021-12-31 2022-04-30 2022-03-31 5.263733e+08 5.255741e+08 7.991940e+05 4 2022 2021
57444 900957.XSHG 2022-12-31 2023-04-27 2023-03-31 5.669258e+08 5.660700e+08 8.557882e+05 4 2023 2022
57445 900957.XSHG 2023-12-31 2024-04-26 2024-03-31 6.071647e+08 6.063475e+08 8.171545e+05 4 2024 2023

57446 rows × 10 columns

In [36]:
fundmen_df.drop_duplicates(['secID','endDate'],keep='last', inplace=True)
In [37]:
fundmen_df.shape
Out[37]:
(57446, 10)
In [38]:
fundmen_df['bm_date'] = fundmen_df['endDate'].dt.to_period('M')
In [39]:
fundmen_df
Out[39]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year bm_date
79 000001.XSHE 2009-04-24 2008-12-31 2009-03-31 1.640079e+10 1.640079e+10 NaN 4 2009 2008 2008-12
74 000001.XSHE 2010-04-29 2009-12-31 2010-03-31 2.046961e+10 2.046961e+10 NaN 4 2010 2009 2009-12
69 000001.XSHE 2011-04-27 2010-12-31 2011-03-31 3.351288e+10 3.351288e+10 NaN 4 2011 2010 2010-12
64 000001.XSHE 2012-04-26 2011-12-31 2012-03-31 7.538058e+10 7.331084e+10 2.069747e+09 4 2012 2011 2011-12
59 000001.XSHE 2013-04-24 2012-12-31 2013-03-31 8.479900e+10 8.479900e+10 NaN 4 2013 2012 2012-12
54 000001.XSHE 2014-04-24 2013-12-31 2014-03-31 1.120810e+11 1.120810e+11 NaN 4 2014 2013 2013-12
49 000001.XSHE 2015-04-24 2014-12-31 2015-03-31 1.309490e+11 1.309490e+11 NaN 4 2015 2014 2014-12
44 000001.XSHE 2016-04-21 2015-12-31 2016-03-31 1.615000e+11 1.615000e+11 NaN 4 2016 2015 2015-12
... ... ... ... ... ... ... ... ... ... ... ...
329377 900957.XSHG 2017-04-26 2016-12-31 2017-03-31 3.938268e+08 3.930721e+08 7.546643e+05 4 2017 2016 2016-12
329372 900957.XSHG 2018-04-26 2017-12-31 2018-03-31 4.238426e+08 4.231040e+08 7.386715e+05 4 2018 2017 2017-12
329367 900957.XSHG 2019-04-25 2018-12-31 2019-03-31 4.515278e+08 4.508051e+08 7.226781e+05 4 2019 2018 2018-12
329362 900957.XSHG 2020-04-29 2019-12-31 2020-03-31 4.768689e+08 4.761021e+08 7.667705e+05 4 2020 2019 2019-12
329357 900957.XSHG 2021-04-27 2020-12-31 2021-03-31 4.987276e+08 4.979110e+08 8.165551e+05 4 2021 2020 2020-12
329352 900957.XSHG 2022-04-30 2021-12-31 2022-03-31 5.263733e+08 5.255741e+08 7.991940e+05 4 2022 2021 2021-12
329347 900957.XSHG 2023-04-27 2022-12-31 2023-03-31 5.669258e+08 5.660700e+08 8.557882e+05 4 2023 2022 2022-12
329342 900957.XSHG 2024-04-26 2023-12-31 2024-03-31 6.071647e+08 6.063475e+08 8.171545e+05 4 2024 2023 2023-12

57446 rows × 11 columns

In [40]:
# # minorityInt 有时报告,有时不报告。空值时,假设就是上一次报告的值
# # fundmen_df['minorityInt'] = fundmen_df.groupby('secID')['minorityInt'].fillna(method='ffill')
# # 第一轮填完空值为有效数值后,剩下的空值再用0填充。
# fundmen_df['minorityInt'].fillna(0,inplace=True)
  • 假设是上一次报告的值可能出现误差,因为股权变动了(注意ffill的方法)
  • 直接用TEquityAttrP
In [41]:
# fundmen_df['book'] = fundmen_df['TShEquity'] - fundmen_df['minorityInt']
fundmen_df['book'] = fundmen_df['TEquityAttrP']
In [42]:
# np.allclose(fundmen_df['book'],fundmen_df['TEquityAttrP'])
In [43]:
# fundmen_df[fundmen_df['book']-fundmen_df['TEquityAttrP'] > 10]
In [44]:
fundmen_df.loc[fundmen_df['TShEquity']<0]
Out[44]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year bm_date book
193 000003.XSHE 2009-04-30 2008-12-31 2008-12-31 -3.063484e+09 -2.983208e+09 -8.027613e+07 4 2009 2008 2008-12 -2.983208e+09
189 000003.XSHE 2011-04-28 2010-12-31 2010-12-31 -2.994943e+09 -2.994943e+09 NaN 4 2011 2010 2010-12 -2.994943e+09
186 000003.XSHE 2012-04-26 2011-12-31 2011-12-31 -3.277899e+09 -3.277899e+09 NaN 4 2012 2011 2011-12 -3.277899e+09
183 000003.XSHE 2013-04-26 2012-12-31 2012-12-31 -3.292266e+09 -3.292266e+09 NaN 4 2013 2012 2012-12 -3.292266e+09
180 000003.XSHE 2014-04-29 2013-12-31 2013-12-31 -3.286909e+09 -3.286909e+09 NaN 4 2014 2013 2013-12 -3.286909e+09
177 000003.XSHE 2015-04-29 2014-12-31 2014-12-31 -3.299867e+09 -3.299867e+09 NaN 4 2015 2014 2014-12 -3.299867e+09
550 000007.XSHE 2009-04-29 2008-12-31 2009-03-31 -7.487471e+07 -8.457082e+07 9.696114e+06 4 2009 2008 2008-12 -8.457082e+07
545 000007.XSHE 2010-04-27 2009-12-31 2010-03-31 -5.445400e+07 -5.844145e+07 3.987457e+06 4 2010 2009 2009-12 -5.844145e+07
... ... ... ... ... ... ... ... ... ... ... ... ...
327620 900931.XSHG 2017-04-26 2016-12-31 2017-03-31 -4.542747e+07 -4.769018e+07 2.262713e+06 4 2017 2016 2016-12 -4.769018e+07
328036 900937.XSHG 2022-04-28 2021-12-31 2022-03-31 -2.099971e+09 -2.108546e+09 8.574239e+06 4 2022 2021 2021-12 -2.108546e+09
328185 900938.XSHG 2009-04-30 2008-12-31 2009-03-31 -2.074566e+08 -2.091595e+08 1.702871e+06 4 2009 2008 2008-12 -2.091595e+08
328180 900938.XSHG 2010-04-28 2009-12-31 2010-03-31 -4.359970e+08 -4.376105e+08 1.613520e+06 4 2010 2009 2009-12 -4.376105e+08
328175 900938.XSHG 2011-04-27 2010-12-31 2011-03-31 -4.040346e+08 -3.947695e+08 -9.265102e+06 4 2011 2010 2010-12 -3.947695e+08
328170 900938.XSHG 2012-04-27 2011-12-31 2012-03-31 -5.609697e+08 -5.515713e+08 -9.398335e+06 4 2012 2011 2011-12 -5.515713e+08
328608 900945.XSHG 2021-04-30 2020-12-31 2021-03-31 -2.225391e+10 -2.837151e+10 6.117605e+09 4 2021 2020 2020-12 -2.837151e+10
328984 900951.XSHG 2020-04-25 2019-12-31 2020-03-31 -1.835457e+08 -1.835457e+08 NaN 4 2020 2019 2019-12 -1.835457e+08

689 rows × 12 columns

In [45]:
fundmen_df.loc[fundmen_df['book'] < 0]
Out[45]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year bm_date book
193 000003.XSHE 2009-04-30 2008-12-31 2008-12-31 -3.063484e+09 -2.983208e+09 -8.027613e+07 4 2009 2008 2008-12 -2.983208e+09
189 000003.XSHE 2011-04-28 2010-12-31 2010-12-31 -2.994943e+09 -2.994943e+09 NaN 4 2011 2010 2010-12 -2.994943e+09
186 000003.XSHE 2012-04-26 2011-12-31 2011-12-31 -3.277899e+09 -3.277899e+09 NaN 4 2012 2011 2011-12 -3.277899e+09
183 000003.XSHE 2013-04-26 2012-12-31 2012-12-31 -3.292266e+09 -3.292266e+09 NaN 4 2013 2012 2012-12 -3.292266e+09
180 000003.XSHE 2014-04-29 2013-12-31 2013-12-31 -3.286909e+09 -3.286909e+09 NaN 4 2014 2013 2013-12 -3.286909e+09
177 000003.XSHE 2015-04-29 2014-12-31 2014-12-31 -3.299867e+09 -3.299867e+09 NaN 4 2015 2014 2014-12 -3.299867e+09
550 000007.XSHE 2009-04-29 2008-12-31 2009-03-31 -7.487471e+07 -8.457082e+07 9.696114e+06 4 2009 2008 2008-12 -8.457082e+07
545 000007.XSHE 2010-04-27 2009-12-31 2010-03-31 -5.445400e+07 -5.844145e+07 3.987457e+06 4 2010 2009 2009-12 -5.844145e+07
... ... ... ... ... ... ... ... ... ... ... ... ...
327620 900931.XSHG 2017-04-26 2016-12-31 2017-03-31 -4.542747e+07 -4.769018e+07 2.262713e+06 4 2017 2016 2016-12 -4.769018e+07
328036 900937.XSHG 2022-04-28 2021-12-31 2022-03-31 -2.099971e+09 -2.108546e+09 8.574239e+06 4 2022 2021 2021-12 -2.108546e+09
328185 900938.XSHG 2009-04-30 2008-12-31 2009-03-31 -2.074566e+08 -2.091595e+08 1.702871e+06 4 2009 2008 2008-12 -2.091595e+08
328180 900938.XSHG 2010-04-28 2009-12-31 2010-03-31 -4.359970e+08 -4.376105e+08 1.613520e+06 4 2010 2009 2009-12 -4.376105e+08
328175 900938.XSHG 2011-04-27 2010-12-31 2011-03-31 -4.040346e+08 -3.947695e+08 -9.265102e+06 4 2011 2010 2010-12 -3.947695e+08
328170 900938.XSHG 2012-04-27 2011-12-31 2012-03-31 -5.609697e+08 -5.515713e+08 -9.398335e+06 4 2012 2011 2011-12 -5.515713e+08
328608 900945.XSHG 2021-04-30 2020-12-31 2021-03-31 -2.225391e+10 -2.837151e+10 6.117605e+09 4 2021 2020 2020-12 -2.837151e+10
328984 900951.XSHG 2020-04-25 2019-12-31 2020-03-31 -1.835457e+08 -1.835457e+08 NaN 4 2020 2019 2019-12 -1.835457e+08

695 rows × 12 columns

In [46]:
# fundmen_df = fundmen_df[fundmen_df['book'] > 0]
In [47]:
fundmen_df
Out[47]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year bm_date book
79 000001.XSHE 2009-04-24 2008-12-31 2009-03-31 1.640079e+10 1.640079e+10 NaN 4 2009 2008 2008-12 1.640079e+10
74 000001.XSHE 2010-04-29 2009-12-31 2010-03-31 2.046961e+10 2.046961e+10 NaN 4 2010 2009 2009-12 2.046961e+10
69 000001.XSHE 2011-04-27 2010-12-31 2011-03-31 3.351288e+10 3.351288e+10 NaN 4 2011 2010 2010-12 3.351288e+10
64 000001.XSHE 2012-04-26 2011-12-31 2012-03-31 7.538058e+10 7.331084e+10 2.069747e+09 4 2012 2011 2011-12 7.331084e+10
59 000001.XSHE 2013-04-24 2012-12-31 2013-03-31 8.479900e+10 8.479900e+10 NaN 4 2013 2012 2012-12 8.479900e+10
54 000001.XSHE 2014-04-24 2013-12-31 2014-03-31 1.120810e+11 1.120810e+11 NaN 4 2014 2013 2013-12 1.120810e+11
49 000001.XSHE 2015-04-24 2014-12-31 2015-03-31 1.309490e+11 1.309490e+11 NaN 4 2015 2014 2014-12 1.309490e+11
44 000001.XSHE 2016-04-21 2015-12-31 2016-03-31 1.615000e+11 1.615000e+11 NaN 4 2016 2015 2015-12 1.615000e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
329377 900957.XSHG 2017-04-26 2016-12-31 2017-03-31 3.938268e+08 3.930721e+08 7.546643e+05 4 2017 2016 2016-12 3.930721e+08
329372 900957.XSHG 2018-04-26 2017-12-31 2018-03-31 4.238426e+08 4.231040e+08 7.386715e+05 4 2018 2017 2017-12 4.231040e+08
329367 900957.XSHG 2019-04-25 2018-12-31 2019-03-31 4.515278e+08 4.508051e+08 7.226781e+05 4 2019 2018 2018-12 4.508051e+08
329362 900957.XSHG 2020-04-29 2019-12-31 2020-03-31 4.768689e+08 4.761021e+08 7.667705e+05 4 2020 2019 2019-12 4.761021e+08
329357 900957.XSHG 2021-04-27 2020-12-31 2021-03-31 4.987276e+08 4.979110e+08 8.165551e+05 4 2021 2020 2020-12 4.979110e+08
329352 900957.XSHG 2022-04-30 2021-12-31 2022-03-31 5.263733e+08 5.255741e+08 7.991940e+05 4 2022 2021 2021-12 5.255741e+08
329347 900957.XSHG 2023-04-27 2022-12-31 2023-03-31 5.669258e+08 5.660700e+08 8.557882e+05 4 2023 2022 2022-12 5.660700e+08
329342 900957.XSHG 2024-04-26 2023-12-31 2024-03-31 6.071647e+08 6.063475e+08 8.171545e+05 4 2024 2023 2023-12 6.063475e+08

57446 rows × 12 columns

Risk free rate

In [48]:
pd.read_csv("./data/rf-monthly-2025.csv", encoding='UTF')
Out[48]:
日期_Date 年份()_Year 月份_Month 月无风险收益率_MonRFRet
0 1989-02-01 1989 2 0.006300
1 1989-03-01 1989 3 0.006300
2 1989-04-01 1989 4 0.006300
3 1989-05-01 1989 5 0.006300
4 1989-06-01 1989 6 0.006300
5 1989-07-01 1989 7 0.006300
6 1989-08-01 1989 8 0.006300
7 1989-09-01 1989 9 0.006300
... ... ... ... ...
425 2024-07-01 2024 7 0.002068
426 2024-08-01 2024 8 0.002068
427 2024-09-01 2024 9 0.002068
428 2024-10-01 2024 10 0.002068
429 2024-11-01 2024 11 0.002068
430 2024-12-01 2024 12 0.002068
431 2025-01-01 2025 1 0.002068
432 2025-02-01 2025 2 0.002068

433 rows × 4 columns

In [49]:
rf = pd.read_csv("./data/rf-monthly-2025.csv", encoding='UTF').drop(["年份()_Year", "月份_Month"],axis=1)
rf.columns = ['Date', 'rf']
rf['Date'] = pd.to_datetime(rf["Date"])
rf['Date'] = rf['Date'].dt.to_period('M')
rf.rename(columns={'Date':'ym'},inplace=True)
In [50]:
rf
Out[50]:
ym rf
0 1989-02 0.006300
1 1989-03 0.006300
2 1989-04 0.006300
3 1989-05 0.006300
4 1989-06 0.006300
5 1989-07 0.006300
6 1989-08 0.006300
7 1989-09 0.006300
... ... ...
425 2024-07 0.002068
426 2024-08 0.002068
427 2024-09 0.002068
428 2024-10 0.002068
429 2024-11 0.002068
430 2024-12 0.002068
431 2025-01 0.002068
432 2025-02 0.002068

433 rows × 2 columns

Beta

In [51]:
beta_df = pd.read_pickle('./data/beta_df.pkl')

beta_df

beta_df['tradeDate'] = pd.to_datetime(beta_df['tradeDate'], format="%Y-%m-%d")

beta_df['ym'] = beta_df['tradeDate'].dt.to_period('M')

beta_df[['Beta60','Beta120','Beta252']] = beta_df[['Beta60','Beta120','Beta252']].apply(pd.to_numeric)
In [52]:
beta_df
Out[52]:
secID tradeDate Beta60 Beta120 Beta252 ym
0 000001.XSHE 2008-01-02 1.2063 1.2399 0.9433 2008-01
1 000002.XSHE 2008-01-02 0.9363 1.0889 1.0912 2008-01
2 000004.XSHE 2008-01-02 0.2562 0.5505 0.5230 2008-01
3 000005.XSHE 2008-01-02 0.7417 0.8581 0.6239 2008-01
4 000006.XSHE 2008-01-02 1.0481 0.9951 1.0443 2008-01
5 000007.XSHE 2008-01-02 0.3757 0.6156 0.5067 2008-01
6 000008.XSHE 2008-01-02 0.6072 0.7102 0.6607 2008-01
7 000009.XSHE 2008-01-02 0.9292 0.9433 0.8962 2008-01
... ... ... ... ... ... ...
13325728 688702.XSHG 2025-03-14 1.5236 1.8525 1.8662 2025-03
13325729 688707.XSHG 2025-03-14 1.3776 1.7874 1.7132 2025-03
13325730 688708.XSHG 2025-03-14 0.9792 0.4225 0.4291 2025-03
13325731 688709.XSHG 2025-03-14 2.5297 1.8813 1.9892 2025-03
13325732 688710.XSHG 2025-03-14 1.8830 1.2293 1.3642 2025-03
13325733 688711.XSHG 2025-03-14 1.8779 1.9532 1.9615 2025-03
13325734 688716.XSHG 2025-03-14 1.8888 1.5459 1.6813 2025-03
13325735 688717.XSHG 2025-03-14 1.1226 1.5987 1.5399 2025-03

13325736 rows × 6 columns

In [53]:
# # Winsorization
# up_q = 0.99999
# lower_q = 0.00001
# beta_df['Beta60_winsor'] = beta_df['Beta60'].clip(lower=beta_df['Beta60'].quantile(lower_q),upper=beta_df['Beta60'].quantile(up_q))
# beta_df['Beta120_winsor'] = beta_df['Beta120'].clip(lower=beta_df['Beta120'].quantile(lower_q),upper=beta_df['Beta120'].quantile(up_q))

Monthly beta

In [54]:
beta_m_df = beta_df.groupby(['secID','ym'],as_index=False)['Beta252'].last()

beta_m_df.rename(columns={'Beta252':'beta'},inplace=True)

beta_m_df
Out[54]:
secID ym beta
0 000001.XSHE 2008-01 0.9654
1 000001.XSHE 2008-02 1.0292
2 000001.XSHE 2008-03 1.0238
3 000001.XSHE 2008-04 1.0212
4 000001.XSHE 2008-05 1.0697
5 000001.XSHE 2008-06 1.0672
6 000001.XSHE 2008-07 1.0966
7 000001.XSHE 2008-08 1.0386
... ... ... ...
663635 689009.XSHG 2024-08 1.1651
663636 689009.XSHG 2024-09 1.0361
663637 689009.XSHG 2024-10 1.3022
663638 689009.XSHG 2024-11 1.2585
663639 689009.XSHG 2024-12 1.2255
663640 689009.XSHG 2025-01 1.1695
663641 689009.XSHG 2025-02 1.2526
663642 689009.XSHG 2025-03 1.2889

663643 rows × 3 columns

Trading data

In [55]:
# %%time
# # About 10 mins
# # # 从优矿下载股票信息,时间较长。由于优矿的限制,每次下载3年的数据

# stk_dict = {}
# begin_ = dt.datetime.strptime(START, '%Y-%m-%d').year
# end_ = dt.datetime.strptime(START, '%Y-%m-%d').year+3
# while begin_ <= 2025:
#     if begin_ == 2025:
#         yesterday = dt.datetime.today() - dt.timedelta(days=1)
#         yesterday.strftime('%Y%m%d')
#         stk_dict[begin_] = DataAPI.MktEqudAdjAfGet(secID=stk_id,
#                                                      beginDate=f'{begin_}0101',
#                                                      endDate=yesterday,
#                                                      field=['secID','tradeDate','closePrice','negMarketValue'],pandas="1")
#     else:
#         stk_dict[begin_] = DataAPI.MktEqudAdjAfGet(secID=stk_id,
#                                                          beginDate=f'{begin_}0101',
#                                                          endDate=f'{end_}1231',
#                                                          field=['secID','tradeDate','closePrice','negMarketValue'],pandas="1")
#     begin_ = end_ + 1
#     end_ = begin_ + 3
    
# for i in range(len(stk_dict)):
#     stk_df = pd.DataFrame(np.vstack([_df for _df in stk_dict.values()]),columns=['secID','tradeDate','closePrice','negMarketValue'])
    
# stk_df.to_pickle('./data/stk_df.pkl')
In [56]:
stk_df = pd.read_pickle('./data/stk_df.pkl')
stk_df['closePrice'] = pd.to_numeric(stk_df['closePrice'])
stk_df['negMarketValue'] = pd.to_numeric(stk_df['negMarketValue'])
stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
stk_df['ym'] = stk_df['tradeDate'].dt.to_period('M')
stk_df.sort_values(['secID','tradeDate'],inplace=True)
In [57]:
stk_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13784564 entries, 0 to 13784563
Data columns (total 5 columns):
secID             object
tradeDate         datetime64[ns]
closePrice        float64
negMarketValue    float64
ym                period[M]
dtypes: datetime64[ns](1), float64(2), object(1), period[M](1)
memory usage: 631.0+ MB

Exclude ST

In [58]:
stk_df
Out[58]:
secID tradeDate closePrice negMarketValue ym
0 000001.XSHE 2008-01-02 1201.877 6.672404e+10 2008-01
1 000001.XSHE 2008-01-03 1181.940 6.561725e+10 2008-01
2 000001.XSHE 2008-01-04 1214.218 6.740920e+10 2008-01
3 000001.XSHE 2008-01-07 1239.534 6.881466e+10 2008-01
4 000001.XSHE 2008-01-08 1278.141 7.095798e+10 2008-01
5 000001.XSHE 2008-01-09 1300.926 7.222289e+10 2008-01
6 000001.XSHE 2008-01-10 1327.507 7.369862e+10 2008-01
7 000001.XSHE 2008-01-11 1394.911 7.744065e+10 2008-01
... ... ... ... ... ...
13784556 900957.XSHG 2025-03-13 0.290 5.299200e+07 2025-03
13784557 900957.XSHG 2025-03-14 0.310 5.667200e+07 2025-03
13784558 900957.XSHG 2025-03-17 0.319 5.814400e+07 2025-03
13784559 900957.XSHG 2025-03-18 0.320 5.832800e+07 2025-03
13784560 900957.XSHG 2025-03-19 0.323 5.888000e+07 2025-03
13784561 900957.XSHG 2025-03-20 0.315 5.759200e+07 2025-03
13784562 900957.XSHG 2025-03-21 0.315 5.759200e+07 2025-03
13784563 900957.XSHG 2025-03-24 0.310 5.667200e+07 2025-03

13784564 rows × 5 columns

In [59]:
stk_df
Out[59]:
secID tradeDate closePrice negMarketValue ym
0 000001.XSHE 2008-01-02 1201.877 6.672404e+10 2008-01
1 000001.XSHE 2008-01-03 1181.940 6.561725e+10 2008-01
2 000001.XSHE 2008-01-04 1214.218 6.740920e+10 2008-01
3 000001.XSHE 2008-01-07 1239.534 6.881466e+10 2008-01
4 000001.XSHE 2008-01-08 1278.141 7.095798e+10 2008-01
5 000001.XSHE 2008-01-09 1300.926 7.222289e+10 2008-01
6 000001.XSHE 2008-01-10 1327.507 7.369862e+10 2008-01
7 000001.XSHE 2008-01-11 1394.911 7.744065e+10 2008-01
... ... ... ... ... ...
13784556 900957.XSHG 2025-03-13 0.290 5.299200e+07 2025-03
13784557 900957.XSHG 2025-03-14 0.310 5.667200e+07 2025-03
13784558 900957.XSHG 2025-03-17 0.319 5.814400e+07 2025-03
13784559 900957.XSHG 2025-03-18 0.320 5.832800e+07 2025-03
13784560 900957.XSHG 2025-03-19 0.323 5.888000e+07 2025-03
13784561 900957.XSHG 2025-03-20 0.315 5.759200e+07 2025-03
13784562 900957.XSHG 2025-03-21 0.315 5.759200e+07 2025-03
13784563 900957.XSHG 2025-03-24 0.310 5.667200e+07 2025-03

13784564 rows × 5 columns

In [60]:
stk_df.dropna().shape
Out[60]:
(13784564, 5)
In [61]:
stk_df.shape
Out[61]:
(13784564, 5)
In [62]:
stk_df = pd.merge(stk_df, st_df, on=['secID','tradeDate'],how='left')
In [63]:
stk_df = stk_df[stk_df['STflg'].isna()].copy()
In [64]:
stk_df.drop('STflg',axis=1,inplace=True)
In [65]:
stk_df.shape
Out[65]:
(13221779, 5)

Monthly trading df

In [66]:
stk_df_m = stk_df.groupby(['secID','ym'],as_index=False).tail(1)
In [67]:
stk_df_m
Out[67]:
secID tradeDate closePrice negMarketValue ym
21 000001.XSHE 2008-01-31 1053.778 5.850212e+10 2008-01
37 000001.XSHE 2008-02-29 1049.032 5.823860e+10 2008-02
58 000001.XSHE 2008-03-31 892.389 4.954234e+10 2008-03
79 000001.XSHE 2008-04-30 937.325 5.203702e+10 2008-04
99 000001.XSHE 2008-05-30 798.403 4.432458e+10 2008-05
119 000001.XSHE 2008-06-30 611.698 4.140495e+10 2008-06
142 000001.XSHE 2008-07-31 658.216 4.455369e+10 2008-07
163 000001.XSHE 2008-08-29 639.229 4.326849e+10 2008-08
... ... ... ... ... ...
13784431 900957.XSHG 2024-08-30 0.270 4.931200e+07 2024-08
13784450 900957.XSHG 2024-09-30 0.354 6.458400e+07 2024-09
13784468 900957.XSHG 2024-10-31 0.401 7.323200e+07 2024-10
13784489 900957.XSHG 2024-11-29 0.392 7.157600e+07 2024-11
13784511 900957.XSHG 2024-12-31 0.316 5.777600e+07 2024-12
13784529 900957.XSHG 2025-01-27 0.340 6.200800e+07 2025-01
13784547 900957.XSHG 2025-02-28 0.286 5.225600e+07 2025-02
13784563 900957.XSHG 2025-03-24 0.310 5.667200e+07 2025-03

657646 rows × 5 columns

Fill na months

In [68]:
def fill_missing(df, full_dates, id_col='secID', date_col='ym'):
    """
    This function fills the missing dates for stocks.
    Parameters:
        df: The dataframe. Could be a sub-dataframe created by "groupby".
            The dataframe must be sorted on the "date_col".
        full_dates: the unique dates covering all securities in the full dataframe. 
                    Need to be sorted.
        id_col: the security id.
        date_col: the dates column for the security
    Returns:
        A dataframe with the missing dates filled with NA.
    """
    one_stk_id = df[id_col].unique()
    date_start = np.where(full_dates == df[date_col].min())[0][0] 
    date_end = np.where(full_dates == df[date_col].max())[0][0]
    dates = full_dates[date_start:date_end+1]
    idx = pd.MultiIndex.from_product([one_stk_id,dates],
                                     names=(id_col,date_col))
    df = df.set_index([id_col,date_col]).reindex(idx).reset_index()
    return df
In [69]:
full_dates = np.sort(stk_df['ym'].unique())
In [70]:
%%time
stk_df_m = stk_df_m.groupby('secID').apply(fill_missing, full_dates=full_dates)
CPU times: user 41.2 s, sys: 192 ms, total: 41.4 s
Wall time: 41.4 s
In [71]:
stk_df_m.reset_index(drop=True, inplace=True)
In [72]:
stk_df_m
Out[72]:
secID ym tradeDate closePrice negMarketValue
0 000001.XSHE 2008-01 2008-01-31 1053.778 5.850212e+10
1 000001.XSHE 2008-02 2008-02-29 1049.032 5.823860e+10
2 000001.XSHE 2008-03 2008-03-31 892.389 4.954234e+10
3 000001.XSHE 2008-04 2008-04-30 937.325 5.203702e+10
4 000001.XSHE 2008-05 2008-05-30 798.403 4.432458e+10
5 000001.XSHE 2008-06 2008-06-30 611.698 4.140495e+10
6 000001.XSHE 2008-07 2008-07-31 658.216 4.455369e+10
7 000001.XSHE 2008-08 2008-08-29 639.229 4.326849e+10
... ... ... ... ... ...
672955 900957.XSHG 2024-08 2024-08-30 0.270 4.931200e+07
672956 900957.XSHG 2024-09 2024-09-30 0.354 6.458400e+07
672957 900957.XSHG 2024-10 2024-10-31 0.401 7.323200e+07
672958 900957.XSHG 2024-11 2024-11-29 0.392 7.157600e+07
672959 900957.XSHG 2024-12 2024-12-31 0.316 5.777600e+07
672960 900957.XSHG 2025-01 2025-01-27 0.340 6.200800e+07
672961 900957.XSHG 2025-02 2025-02-28 0.286 5.225600e+07
672962 900957.XSHG 2025-03 2025-03-24 0.310 5.667200e+07

672963 rows × 5 columns

In [73]:
stk_df_m.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672963 entries, 0 to 672962
Data columns (total 5 columns):
secID             672963 non-null object
ym                672963 non-null period[M]
tradeDate         657646 non-null datetime64[ns]
closePrice        657646 non-null float64
negMarketValue    657646 non-null float64
dtypes: datetime64[ns](1), float64(2), object(1), period[M](1)
memory usage: 25.7+ MB
In [74]:
stk_df_m.drop('tradeDate',axis=1,inplace=True)
In [75]:
stk_df_m['ret'] = stk_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1)

# # Use last month's market cap for sorting
stk_df_m['mkt_cap'] = stk_df_m.groupby('secID')['negMarketValue'].shift()
stk_df_m['mkt_cap_date'] = stk_df_m.groupby('secID')['ym'].shift()

stk_df_m.drop(['closePrice','negMarketValue'],axis=1,inplace=True)

stk_df_m
Out[75]:
secID ym ret mkt_cap mkt_cap_date
0 000001.XSHE 2008-01 NaN NaN NaT
1 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01
2 000001.XSHE 2008-03 -0.149321 5.823860e+10 2008-02
3 000001.XSHE 2008-04 0.050355 4.954234e+10 2008-03
4 000001.XSHE 2008-05 -0.148211 5.203702e+10 2008-04
5 000001.XSHE 2008-06 -0.233848 4.432458e+10 2008-05
6 000001.XSHE 2008-07 0.076047 4.140495e+10 2008-06
7 000001.XSHE 2008-08 -0.028846 4.455369e+10 2008-07
... ... ... ... ... ...
672955 900957.XSHG 2024-08 0.436170 3.440800e+07 2024-07
672956 900957.XSHG 2024-09 0.311111 4.931200e+07 2024-08
672957 900957.XSHG 2024-10 0.132768 6.458400e+07 2024-09
672958 900957.XSHG 2024-11 -0.022444 7.323200e+07 2024-10
672959 900957.XSHG 2024-12 -0.193878 7.157600e+07 2024-11
672960 900957.XSHG 2025-01 0.075949 5.777600e+07 2024-12
672961 900957.XSHG 2025-02 -0.158824 6.200800e+07 2025-01
672962 900957.XSHG 2025-03 0.083916 5.225600e+07 2025-02

672963 rows × 5 columns

In [76]:
stk_df_m[(stk_df_m['secID']=='000007.XSHE') & (stk_df_m['ym']>='2021-03') & (stk_df_m['ym']<='2022-08')]
Out[76]:
secID ym ret mkt_cap mkt_cap_date
1048 000007.XSHE 2021-03 -0.046801 1.254329e+09 2021-02
1049 000007.XSHE 2021-04 0.018085 1.195629e+09 2021-03
1050 000007.XSHE 2021-05 NaN 1.217255e+09 2021-04
1051 000007.XSHE 2021-06 NaN NaN 2021-05
1052 000007.XSHE 2021-07 NaN NaN 2021-06
1053 000007.XSHE 2021-08 NaN NaN 2021-07
1054 000007.XSHE 2021-09 NaN NaN 2021-08
1055 000007.XSHE 2021-10 NaN NaN 2021-09
... ... ... ... ... ...
1058 000007.XSHE 2022-01 NaN NaN 2021-12
1059 000007.XSHE 2022-02 NaN NaN 2022-01
1060 000007.XSHE 2022-03 NaN NaN 2022-02
1061 000007.XSHE 2022-04 NaN NaN 2022-03
1062 000007.XSHE 2022-05 NaN NaN 2022-04
1063 000007.XSHE 2022-06 NaN NaN 2022-05
1064 000007.XSHE 2022-07 NaN NaN 2022-06
1065 000007.XSHE 2022-08 0.090902 2.276947e+09 2022-07

18 rows × 5 columns

In [77]:
stk_df_m[stk_df_m['ret'].isna()]
Out[77]:
secID ym ret mkt_cap mkt_cap_date
0 000001.XSHE 2008-01 NaN NaN NaT
207 000002.XSHE 2008-01 NaN NaN NaT
414 000004.XSHE 2011-06 NaN NaN NaT
546 000004.XSHE 2022-06 NaN 1.463441e+09 2022-05
547 000004.XSHE 2022-07 NaN NaN 2022-06
548 000004.XSHE 2022-08 NaN NaN 2022-07
549 000004.XSHE 2022-09 NaN NaN 2022-08
550 000004.XSHE 2022-10 NaN NaN 2022-09
... ... ... ... ... ...
672594 900955.XSHG 2022-01 NaN NaN 2021-12
672595 900955.XSHG 2022-02 NaN NaN 2022-01
672596 900955.XSHG 2022-03 NaN NaN 2022-02
672597 900955.XSHG 2022-04 NaN NaN 2022-03
672598 900955.XSHG 2022-05 NaN NaN 2022-04
672599 900955.XSHG 2022-06 NaN NaN 2022-05
672601 900956.XSHG 2008-01 NaN NaN NaT
672756 900957.XSHG 2008-01 NaN NaN NaT

21510 rows × 5 columns

In [78]:
stk_df_m[stk_df_m['mkt_cap'].isna()]
Out[78]:
secID ym ret mkt_cap mkt_cap_date
0 000001.XSHE 2008-01 NaN NaN NaT
207 000002.XSHE 2008-01 NaN NaN NaT
414 000004.XSHE 2011-06 NaN NaN NaT
547 000004.XSHE 2022-07 NaN NaN 2022-06
548 000004.XSHE 2022-08 NaN NaN 2022-07
549 000004.XSHE 2022-09 NaN NaN 2022-08
550 000004.XSHE 2022-10 NaN NaN 2022-09
551 000004.XSHE 2022-11 NaN NaN 2022-10
... ... ... ... ... ...
672594 900955.XSHG 2022-01 NaN NaN 2021-12
672595 900955.XSHG 2022-02 NaN NaN 2022-01
672596 900955.XSHG 2022-03 NaN NaN 2022-02
672597 900955.XSHG 2022-04 NaN NaN 2022-03
672598 900955.XSHG 2022-05 NaN NaN 2022-04
672599 900955.XSHG 2022-06 NaN NaN 2022-05
672601 900956.XSHG 2008-01 NaN NaN NaT
672756 900957.XSHG 2008-01 NaN NaN NaT

20795 rows × 5 columns

In [79]:
stk_df_m.dropna(inplace=True)
In [80]:
stk_df_m
Out[80]:
secID ym ret mkt_cap mkt_cap_date
1 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01
2 000001.XSHE 2008-03 -0.149321 5.823860e+10 2008-02
3 000001.XSHE 2008-04 0.050355 4.954234e+10 2008-03
4 000001.XSHE 2008-05 -0.148211 5.203702e+10 2008-04
5 000001.XSHE 2008-06 -0.233848 4.432458e+10 2008-05
6 000001.XSHE 2008-07 0.076047 4.140495e+10 2008-06
7 000001.XSHE 2008-08 -0.028846 4.455369e+10 2008-07
8 000001.XSHE 2008-09 -0.257922 4.326849e+10 2008-08
... ... ... ... ... ...
672955 900957.XSHG 2024-08 0.436170 3.440800e+07 2024-07
672956 900957.XSHG 2024-09 0.311111 4.931200e+07 2024-08
672957 900957.XSHG 2024-10 0.132768 6.458400e+07 2024-09
672958 900957.XSHG 2024-11 -0.022444 7.323200e+07 2024-10
672959 900957.XSHG 2024-12 -0.193878 7.157600e+07 2024-11
672960 900957.XSHG 2025-01 0.075949 5.777600e+07 2024-12
672961 900957.XSHG 2025-02 -0.158824 6.200800e+07 2025-01
672962 900957.XSHG 2025-03 0.083916 5.225600e+07 2025-02

651453 rows × 5 columns

新股上市后首月交易不考虑

需要考虑2008年后上市的股票。因为数据从2008开始,有一些在此之前上市的股票无需处理。

思路:找到每只股票的上市日期。用最早的数据时间跟其比较

In [81]:
stk_info
Out[81]:
secID ticker secShortName cnSpell exchangeCD assetClass listStatusCD listDate transCurrCD ISIN partyID delistDate
0 000001.XSHE 000001 平安银行 PAYH XSHE E L 1991-04-03 CNY CNE000000040 2.0 NaN
1 000002.XSHE 000002 万科A WKA XSHE E L 1991-01-29 CNY CNE0000000T2 3.0 NaN
2 000003.XSHE 000003 PT金田A PTJTA XSHE E DE 1991-07-03 CNY CNE1000031Y5 4.0 2002-06-14
3 000004.XSHE 000004 国华网安 GHWA XSHE E L 1990-12-01 CNY CNE0000000Y2 5.0 NaN
4 000005.XSHE 000005 ST星源 STXY XSHE E DE 1990-12-10 CNY CNE0000001L7 6.0 2024-04-26
5 000006.XSHE 000006 深振业A SZYA XSHE E L 1992-04-27 CNY CNE000000164 7.0 NaN
6 000007.XSHE 000007 全新好 QXH XSHE E L 1992-04-13 CNY CNE0000000P0 8.0 NaN
7 000008.XSHE 000008 神州高铁 SZGT XSHE E L 1992-05-07 CNY CNE0000001C6 9.0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
27284 900950.XSHG 900950 新城B股 XCBG XSHG E DE 1997-10-16 USD CNE000000TH1 1429.0 2015-11-23
27285 900951.XSHG 900951 退市大化 TSDH XSHG E DE 1997-10-21 USD CNE000000TJ7 1430.0 2020-08-27
27286 900952.XSHG 900952 ST锦港B STJGB XSHG E L 1998-05-19 USD CNE000000W88 763.0 NaN
27287 900953.XSHG 900953 凯马B KMB XSHG E DE 1998-06-24 USD CNE000000WP8 1431.0 2024-07-05
27288 900955.XSHG 900955 退市海B TSHB XSHG E DE 1999-01-18 USD CNE000000YC2 1063.0 2022-07-13
27289 900956.XSHG 900956 东贝B股 DBBG XSHG E DE 1999-07-15 USD CNE000000ZS5 1432.0 2020-11-23
27290 900957.XSHG 900957 凌云B股 LYBG XSHG E L 2000-07-28 USD CNE0000013W9 1433.0 NaN
32144 DY600018.XSHG DY600018 上港集箱 SGJX XSHG E DE 2000-07-19 CNY NaN 618.0 2006-10-20

5547 rows × 12 columns

In [82]:
list_date = stk_info[['secID','listDate']].copy()
list_date['listDate'] = pd.to_datetime(list_date['listDate'],format="%Y-%m-%d")
list_date['listMonth'] = list_date['listDate'].dt.to_period('M')
# list_date

stk_df_m = pd.merge(stk_df_m,list_date[['secID','listMonth']],on='secID')
# stk_df_m

stk_listMonth = stk_df_m.groupby('secID')[['ym','listMonth']].first()
stk_listMonth['listm_dist'] = stk_listMonth['ym'] - stk_listMonth['listMonth']

# stk_listMonth

# pandas 版本太久
stk_listMonth['listm_dist'] = stk_listMonth['listm_dist'].apply(lambda x: x.n)

# stk_listMonth

# 上市日期在样本内的新上市股票
stk_listMonth.loc[stk_listMonth['listm_dist'] < 2]

newlist_secID = stk_listMonth.loc[stk_listMonth['listm_dist'] < 2].index
newlist_secID

stk_df_m.loc[stk_df_m['secID'].isin(newlist_secID)]

# 每只股票丢掉第一个观测值
temp = stk_df_m.loc[stk_df_m['secID'].isin(newlist_secID)].groupby('secID').apply(lambda x: x.iloc[1:]).drop('secID',axis=1).reset_index().drop('level_1',axis=1)

stk_df_m = pd.concat([stk_df_m.loc[~stk_df_m['secID'].isin(newlist_secID)],temp],axis=0)

# stk_df_m.drop('listMonth',axis=1,inplace=True)

stk_df_m
Out[82]:
secID ym ret mkt_cap mkt_cap_date listMonth
0 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01 1991-04
1 000001.XSHE 2008-03 -0.149321 5.823860e+10 2008-02 1991-04
2 000001.XSHE 2008-04 0.050355 4.954234e+10 2008-03 1991-04
3 000001.XSHE 2008-05 -0.148211 5.203702e+10 2008-04 1991-04
4 000001.XSHE 2008-06 -0.233848 4.432458e+10 2008-05 1991-04
5 000001.XSHE 2008-07 0.076047 4.140495e+10 2008-06 1991-04
6 000001.XSHE 2008-08 -0.028846 4.455369e+10 2008-07 1991-04
7 000001.XSHE 2008-09 -0.257922 4.326849e+10 2008-08 1991-04
... ... ... ... ... ... ...
343660 689009.XSHG 2024-08 0.090770 2.238688e+10 2024-07 2020-10
343661 689009.XSHG 2024-09 0.078307 2.442059e+10 2024-08 2020-10
343662 689009.XSHG 2024-10 -0.080923 2.633272e+10 2024-09 2020-10
343663 689009.XSHG 2024-11 0.004506 2.420206e+10 2024-10 2020-10
343664 689009.XSHG 2024-12 0.067436 2.431436e+10 2024-11 2020-10
343665 689009.XSHG 2025-01 0.080840 2.610548e+10 2024-12 2020-10
343666 689009.XSHG 2025-02 0.113168 2.822514e+10 2025-01 2020-10
343667 689009.XSHG 2025-03 0.095356 3.141930e+10 2025-02 2020-10

647621 rows × 6 columns

In [83]:
stk_df_m[stk_df_m['secID']==np.random.choice(stk_df_m['secID'],1)[0]]
Out[83]:
secID ym ret mkt_cap mkt_cap_date listMonth
401155 600166.XSHG 2008-02 0.020559 6.333465e+09 2008-01 1998-06
401156 600166.XSHG 2008-03 -0.121676 6.463676e+09 2008-02 1998-06
401157 600166.XSHG 2008-04 -0.002748 5.677202e+09 2008-03 1998-06
401158 600166.XSHG 2008-05 -0.134311 5.661576e+09 2008-04 1998-06
401159 600166.XSHG 2008-06 -0.308584 4.901144e+09 2008-05 1998-06
401160 600166.XSHG 2008-07 0.086963 3.390008e+09 2008-06 1998-06
401161 600166.XSHG 2008-08 -0.214302 3.834574e+09 2008-07 1998-06
401162 600166.XSHG 2008-09 -0.054544 3.012879e+09 2008-08 1998-06
... ... ... ... ... ... ...
401353 600166.XSHG 2024-08 -0.189483 1.873930e+10 2024-07 1998-06
401354 600166.XSHG 2024-09 0.181824 1.518869e+10 2024-08 1998-06
401355 600166.XSHG 2024-10 -0.014654 1.795027e+10 2024-09 1998-06
401356 600166.XSHG 2024-11 -0.044603 1.745495e+10 2024-10 1998-06
401357 600166.XSHG 2024-12 -0.023343 1.667629e+10 2024-11 1998-06
401358 600166.XSHG 2025-01 -0.067728 1.628696e+10 2024-12 1998-06
401359 600166.XSHG 2025-02 0.000000 1.518386e+10 2025-01 1998-06
401360 600166.XSHG 2025-03 0.034177 1.518386e+10 2025-02 1998-06

206 rows × 6 columns

Merge Book and Market Cap data

In [84]:
fundmen_df
Out[84]:
secID publishDate endDate endDateRep TShEquity TEquityAttrP minorityInt pub_month pub_year data_year bm_date book
79 000001.XSHE 2009-04-24 2008-12-31 2009-03-31 1.640079e+10 1.640079e+10 NaN 4 2009 2008 2008-12 1.640079e+10
74 000001.XSHE 2010-04-29 2009-12-31 2010-03-31 2.046961e+10 2.046961e+10 NaN 4 2010 2009 2009-12 2.046961e+10
69 000001.XSHE 2011-04-27 2010-12-31 2011-03-31 3.351288e+10 3.351288e+10 NaN 4 2011 2010 2010-12 3.351288e+10
64 000001.XSHE 2012-04-26 2011-12-31 2012-03-31 7.538058e+10 7.331084e+10 2.069747e+09 4 2012 2011 2011-12 7.331084e+10
59 000001.XSHE 2013-04-24 2012-12-31 2013-03-31 8.479900e+10 8.479900e+10 NaN 4 2013 2012 2012-12 8.479900e+10
54 000001.XSHE 2014-04-24 2013-12-31 2014-03-31 1.120810e+11 1.120810e+11 NaN 4 2014 2013 2013-12 1.120810e+11
49 000001.XSHE 2015-04-24 2014-12-31 2015-03-31 1.309490e+11 1.309490e+11 NaN 4 2015 2014 2014-12 1.309490e+11
44 000001.XSHE 2016-04-21 2015-12-31 2016-03-31 1.615000e+11 1.615000e+11 NaN 4 2016 2015 2015-12 1.615000e+11
... ... ... ... ... ... ... ... ... ... ... ... ...
329377 900957.XSHG 2017-04-26 2016-12-31 2017-03-31 3.938268e+08 3.930721e+08 7.546643e+05 4 2017 2016 2016-12 3.930721e+08
329372 900957.XSHG 2018-04-26 2017-12-31 2018-03-31 4.238426e+08 4.231040e+08 7.386715e+05 4 2018 2017 2017-12 4.231040e+08
329367 900957.XSHG 2019-04-25 2018-12-31 2019-03-31 4.515278e+08 4.508051e+08 7.226781e+05 4 2019 2018 2018-12 4.508051e+08
329362 900957.XSHG 2020-04-29 2019-12-31 2020-03-31 4.768689e+08 4.761021e+08 7.667705e+05 4 2020 2019 2019-12 4.761021e+08
329357 900957.XSHG 2021-04-27 2020-12-31 2021-03-31 4.987276e+08 4.979110e+08 8.165551e+05 4 2021 2020 2020-12 4.979110e+08
329352 900957.XSHG 2022-04-30 2021-12-31 2022-03-31 5.263733e+08 5.255741e+08 7.991940e+05 4 2022 2021 2021-12 5.255741e+08
329347 900957.XSHG 2023-04-27 2022-12-31 2023-03-31 5.669258e+08 5.660700e+08 8.557882e+05 4 2023 2022 2022-12 5.660700e+08
329342 900957.XSHG 2024-04-26 2023-12-31 2024-03-31 6.071647e+08 6.063475e+08 8.171545e+05 4 2024 2023 2023-12 6.063475e+08

57446 rows × 12 columns

In [85]:
stk_df_m
Out[85]:
secID ym ret mkt_cap mkt_cap_date listMonth
0 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01 1991-04
1 000001.XSHE 2008-03 -0.149321 5.823860e+10 2008-02 1991-04
2 000001.XSHE 2008-04 0.050355 4.954234e+10 2008-03 1991-04
3 000001.XSHE 2008-05 -0.148211 5.203702e+10 2008-04 1991-04
4 000001.XSHE 2008-06 -0.233848 4.432458e+10 2008-05 1991-04
5 000001.XSHE 2008-07 0.076047 4.140495e+10 2008-06 1991-04
6 000001.XSHE 2008-08 -0.028846 4.455369e+10 2008-07 1991-04
7 000001.XSHE 2008-09 -0.257922 4.326849e+10 2008-08 1991-04
... ... ... ... ... ... ...
343660 689009.XSHG 2024-08 0.090770 2.238688e+10 2024-07 2020-10
343661 689009.XSHG 2024-09 0.078307 2.442059e+10 2024-08 2020-10
343662 689009.XSHG 2024-10 -0.080923 2.633272e+10 2024-09 2020-10
343663 689009.XSHG 2024-11 0.004506 2.420206e+10 2024-10 2020-10
343664 689009.XSHG 2024-12 0.067436 2.431436e+10 2024-11 2020-10
343665 689009.XSHG 2025-01 0.080840 2.610548e+10 2024-12 2020-10
343666 689009.XSHG 2025-02 0.113168 2.822514e+10 2025-01 2020-10
343667 689009.XSHG 2025-03 0.095356 3.141930e+10 2025-02 2020-10

647621 rows × 6 columns

In [86]:
pd.merge(stk_df_m[['secID','mkt_cap','mkt_cap_date']], fundmen_df[['secID','book','bm_date']],
                 left_on=['secID','mkt_cap_date'],right_on=['secID','bm_date'])
Out[86]:
secID mkt_cap mkt_cap_date book bm_date
0 000001.XSHE 2.634237e+10 2008-12 1.640079e+10 2008-12
1 000001.XSHE 7.126066e+10 2009-12 2.046961e+10 2009-12
2 000001.XSHE 4.903361e+10 2010-12 3.351288e+10 2010-12
3 000001.XSHE 4.841254e+10 2011-12 7.331084e+10 2011-12
4 000001.XSHE 4.974784e+10 2012-12 8.479900e+10 2012-12
5 000001.XSHE 6.830480e+10 2013-12 1.120810e+11 2013-12
6 000001.XSHE 1.558135e+11 2014-12 1.309490e+11 2014-12
7 000001.XSHE 1.415306e+11 2015-12 1.615000e+11 2015-12
... ... ... ... ... ...
49866 688981.XSHG 6.007335e+10 2020-12 9.912804e+10 2020-12
49867 688981.XSHG 9.914631e+10 2021-12 1.091988e+11 2021-12
49868 688981.XSHG 8.052755e+10 2022-12 1.333719e+11 2022-12
49869 688981.XSHG 1.046408e+11 2023-12 1.424758e+11 2023-12
49870 689009.XSHG 4.948724e+09 2020-12 3.696778e+09 2020-12
49871 689009.XSHG 3.104066e+10 2021-12 4.274780e+09 2021-12
49872 689009.XSHG 1.560173e+10 2022-12 4.896933e+09 2022-12
49873 689009.XSHG 1.552630e+10 2023-12 5.456715e+09 2023-12

49874 rows × 5 columns

In [87]:
bm_df = pd.merge(stk_df_m[['secID','mkt_cap','mkt_cap_date']], fundmen_df[['secID','book','bm_date']],
                 left_on=['secID','mkt_cap_date'],right_on=['secID','bm_date'])
bm_df['bm'] = bm_df['book'] / bm_df['mkt_cap']
bm_df.drop(['mkt_cap_date','mkt_cap','book'],axis=1,inplace=True)
In [88]:
bm_df
Out[88]:
secID bm_date bm
0 000001.XSHE 2008-12 0.622601
1 000001.XSHE 2009-12 0.287250
2 000001.XSHE 2010-12 0.683467
3 000001.XSHE 2011-12 1.514294
4 000001.XSHE 2012-12 1.704577
5 000001.XSHE 2013-12 1.640895
6 000001.XSHE 2014-12 0.840421
7 000001.XSHE 2015-12 1.141096
... ... ... ...
49866 688981.XSHG 2020-12 1.650117
49867 688981.XSHG 2021-12 1.101390
49868 688981.XSHG 2022-12 1.656227
49869 688981.XSHG 2023-12 1.361571
49870 689009.XSHG 2020-12 0.747016
49871 689009.XSHG 2021-12 0.137716
49872 689009.XSHG 2022-12 0.313871
49873 689009.XSHG 2023-12 0.351450

49874 rows × 3 columns

Merge data

Merge rf, ret, mktcap, beta

In [89]:
ret_df = pd.merge(stk_df_m, rf, on='ym')

ret_df['exret'] = ret_df['ret'] - ret_df['rf']

ret_df.sort_values(['secID','ym'],inplace=True)

ret_df.reset_index(drop=True,inplace=True)

display(ret_df)
secID ym ret mkt_cap mkt_cap_date listMonth rf exret
0 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01 1991-04 0.003753 -0.008257
1 000001.XSHE 2008-03 -0.149321 5.823860e+10 2008-02 1991-04 0.003751 -0.153072
2 000001.XSHE 2008-04 0.050355 4.954234e+10 2008-03 1991-04 0.003745 0.046610
3 000001.XSHE 2008-05 -0.148211 5.203702e+10 2008-04 1991-04 0.003743 -0.151954
4 000001.XSHE 2008-06 -0.233848 4.432458e+10 2008-05 1991-04 0.003732 -0.237580
5 000001.XSHE 2008-07 0.076047 4.140495e+10 2008-06 1991-04 0.003682 0.072365
6 000001.XSHE 2008-08 -0.028846 4.455369e+10 2008-07 1991-04 0.003604 -0.032450
7 000001.XSHE 2008-09 -0.257922 4.326849e+10 2008-08 1991-04 0.003591 -0.261513
... ... ... ... ... ... ... ... ...
642532 900957.XSHG 2024-07 0.080460 3.183200e+07 2024-06 2000-07 0.002068 0.078392
642533 900957.XSHG 2024-08 0.436170 3.440800e+07 2024-07 2000-07 0.002068 0.434102
642534 900957.XSHG 2024-09 0.311111 4.931200e+07 2024-08 2000-07 0.002068 0.309043
642535 900957.XSHG 2024-10 0.132768 6.458400e+07 2024-09 2000-07 0.002068 0.130700
642536 900957.XSHG 2024-11 -0.022444 7.323200e+07 2024-10 2000-07 0.002068 -0.024512
642537 900957.XSHG 2024-12 -0.193878 7.157600e+07 2024-11 2000-07 0.002068 -0.195946
642538 900957.XSHG 2025-01 0.075949 5.777600e+07 2024-12 2000-07 0.002068 0.073881
642539 900957.XSHG 2025-02 -0.158824 6.200800e+07 2025-01 2000-07 0.002068 -0.160892

642540 rows × 8 columns

In [90]:
# Use last month's beta for grouping
ret_df = pd.merge(ret_df,beta_m_df,left_on=['secID','mkt_cap_date'],right_on=['secID','ym'])
display(ret_df)
secID ym_x ret mkt_cap mkt_cap_date listMonth rf exret ym_y beta
0 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01 1991-04 0.003753 -0.008257 2008-01 0.9654
1 000001.XSHE 2008-03 -0.149321 5.823860e+10 2008-02 1991-04 0.003751 -0.153072 2008-02 1.0292
2 000001.XSHE 2008-04 0.050355 4.954234e+10 2008-03 1991-04 0.003745 0.046610 2008-03 1.0238
3 000001.XSHE 2008-05 -0.148211 5.203702e+10 2008-04 1991-04 0.003743 -0.151954 2008-04 1.0212
4 000001.XSHE 2008-06 -0.233848 4.432458e+10 2008-05 1991-04 0.003732 -0.237580 2008-05 1.0697
5 000001.XSHE 2008-07 0.076047 4.140495e+10 2008-06 1991-04 0.003682 0.072365 2008-06 1.0672
6 000001.XSHE 2008-08 -0.028846 4.455369e+10 2008-07 1991-04 0.003604 -0.032450 2008-07 1.0966
7 000001.XSHE 2008-09 -0.257922 4.326849e+10 2008-08 1991-04 0.003591 -0.261513 2008-08 1.0386
... ... ... ... ... ... ... ... ... ... ...
623103 689009.XSHG 2024-07 0.113302 2.008689e+10 2024-06 2020-10 0.002068 0.111234 2024-06 1.5890
623104 689009.XSHG 2024-08 0.090770 2.238688e+10 2024-07 2020-10 0.002068 0.088702 2024-07 1.5046
623105 689009.XSHG 2024-09 0.078307 2.442059e+10 2024-08 2020-10 0.002068 0.076239 2024-08 1.1651
623106 689009.XSHG 2024-10 -0.080923 2.633272e+10 2024-09 2020-10 0.002068 -0.082991 2024-09 1.0361
623107 689009.XSHG 2024-11 0.004506 2.420206e+10 2024-10 2020-10 0.002068 0.002438 2024-10 1.3022
623108 689009.XSHG 2024-12 0.067436 2.431436e+10 2024-11 2020-10 0.002068 0.065368 2024-11 1.2585
623109 689009.XSHG 2025-01 0.080840 2.610548e+10 2024-12 2020-10 0.002068 0.078772 2024-12 1.2255
623110 689009.XSHG 2025-02 0.113168 2.822514e+10 2025-01 2020-10 0.002068 0.111100 2025-01 1.1695

623111 rows × 10 columns

In [91]:
ret_df.drop(['ym_y'],axis=1,inplace=True)
In [92]:
ret_df.rename(columns={'ym_x':'ret_date',
                       'mkt_cap_date':'mktcap_beta_date'},inplace=True)
In [93]:
ret_df
Out[93]:
secID ret_date ret mkt_cap mktcap_beta_date listMonth rf exret beta
0 000001.XSHE 2008-02 -0.004504 5.850212e+10 2008-01 1991-04 0.003753 -0.008257 0.9654
1 000001.XSHE 2008-03 -0.149321 5.823860e+10 2008-02 1991-04 0.003751 -0.153072 1.0292
2 000001.XSHE 2008-04 0.050355 4.954234e+10 2008-03 1991-04 0.003745 0.046610 1.0238
3 000001.XSHE 2008-05 -0.148211 5.203702e+10 2008-04 1991-04 0.003743 -0.151954 1.0212
4 000001.XSHE 2008-06 -0.233848 4.432458e+10 2008-05 1991-04 0.003732 -0.237580 1.0697
5 000001.XSHE 2008-07 0.076047 4.140495e+10 2008-06 1991-04 0.003682 0.072365 1.0672
6 000001.XSHE 2008-08 -0.028846 4.455369e+10 2008-07 1991-04 0.003604 -0.032450 1.0966
7 000001.XSHE 2008-09 -0.257922 4.326849e+10 2008-08 1991-04 0.003591 -0.261513 1.0386
... ... ... ... ... ... ... ... ... ...
623103 689009.XSHG 2024-07 0.113302 2.008689e+10 2024-06 2020-10 0.002068 0.111234 1.5890
623104 689009.XSHG 2024-08 0.090770 2.238688e+10 2024-07 2020-10 0.002068 0.088702 1.5046
623105 689009.XSHG 2024-09 0.078307 2.442059e+10 2024-08 2020-10 0.002068 0.076239 1.1651
623106 689009.XSHG 2024-10 -0.080923 2.633272e+10 2024-09 2020-10 0.002068 -0.082991 1.0361
623107 689009.XSHG 2024-11 0.004506 2.420206e+10 2024-10 2020-10 0.002068 0.002438 1.3022
623108 689009.XSHG 2024-12 0.067436 2.431436e+10 2024-11 2020-10 0.002068 0.065368 1.2585
623109 689009.XSHG 2025-01 0.080840 2.610548e+10 2024-12 2020-10 0.002068 0.078772 1.2255
623110 689009.XSHG 2025-02 0.113168 2.822514e+10 2025-01 2020-10 0.002068 0.111100 1.1695

623111 rows × 9 columns

In [94]:
ret_df = ret_df[['secID','ret_date','ret','rf','exret','mktcap_beta_date','mkt_cap','beta']]
In [95]:
ret_df
Out[95]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta
0 000001.XSHE 2008-02 -0.004504 0.003753 -0.008257 2008-01 5.850212e+10 0.9654
1 000001.XSHE 2008-03 -0.149321 0.003751 -0.153072 2008-02 5.823860e+10 1.0292
2 000001.XSHE 2008-04 0.050355 0.003745 0.046610 2008-03 4.954234e+10 1.0238
3 000001.XSHE 2008-05 -0.148211 0.003743 -0.151954 2008-04 5.203702e+10 1.0212
4 000001.XSHE 2008-06 -0.233848 0.003732 -0.237580 2008-05 4.432458e+10 1.0697
5 000001.XSHE 2008-07 0.076047 0.003682 0.072365 2008-06 4.140495e+10 1.0672
6 000001.XSHE 2008-08 -0.028846 0.003604 -0.032450 2008-07 4.455369e+10 1.0966
7 000001.XSHE 2008-09 -0.257922 0.003591 -0.261513 2008-08 4.326849e+10 1.0386
... ... ... ... ... ... ... ... ...
623103 689009.XSHG 2024-07 0.113302 0.002068 0.111234 2024-06 2.008689e+10 1.5890
623104 689009.XSHG 2024-08 0.090770 0.002068 0.088702 2024-07 2.238688e+10 1.5046
623105 689009.XSHG 2024-09 0.078307 0.002068 0.076239 2024-08 2.442059e+10 1.1651
623106 689009.XSHG 2024-10 -0.080923 0.002068 -0.082991 2024-09 2.633272e+10 1.0361
623107 689009.XSHG 2024-11 0.004506 0.002068 0.002438 2024-10 2.420206e+10 1.3022
623108 689009.XSHG 2024-12 0.067436 0.002068 0.065368 2024-11 2.431436e+10 1.2585
623109 689009.XSHG 2025-01 0.080840 0.002068 0.078772 2024-12 2.610548e+10 1.2255
623110 689009.XSHG 2025-02 0.113168 0.002068 0.111100 2025-01 2.822514e+10 1.1695

623111 rows × 8 columns

Merge all data with bm

调整return月份对应的bm月份

例:2007:12月的bm分组,对应的是 2008:07 -- 2009:06 的return

调整步骤:

  1. ret_year - 1, and set this variable as bm_date
  2. if ret_month is in [1,2,3,4,5,6], ret_year - 1 again
  3. convert bm_date to year-Dec format
In [96]:
ret_df['year'] = ret_df['ret_date'].dt.year
ret_df['month'] = ret_df['ret_date'].dt.month
ret_df['bm_date'] = ret_df['year'] - 1
idx = ret_df['month'].isin([1,2,3,4,5,6])
ret_df.loc[idx,'bm_date'] = ret_df.loc[idx,'bm_date'] - 1
In [97]:
ret_df
Out[97]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta year month bm_date
0 000001.XSHE 2008-02 -0.004504 0.003753 -0.008257 2008-01 5.850212e+10 0.9654 2008 2 2006
1 000001.XSHE 2008-03 -0.149321 0.003751 -0.153072 2008-02 5.823860e+10 1.0292 2008 3 2006
2 000001.XSHE 2008-04 0.050355 0.003745 0.046610 2008-03 4.954234e+10 1.0238 2008 4 2006
3 000001.XSHE 2008-05 -0.148211 0.003743 -0.151954 2008-04 5.203702e+10 1.0212 2008 5 2006
4 000001.XSHE 2008-06 -0.233848 0.003732 -0.237580 2008-05 4.432458e+10 1.0697 2008 6 2006
5 000001.XSHE 2008-07 0.076047 0.003682 0.072365 2008-06 4.140495e+10 1.0672 2008 7 2007
6 000001.XSHE 2008-08 -0.028846 0.003604 -0.032450 2008-07 4.455369e+10 1.0966 2008 8 2007
7 000001.XSHE 2008-09 -0.257922 0.003591 -0.261513 2008-08 4.326849e+10 1.0386 2008 9 2007
... ... ... ... ... ... ... ... ... ... ... ...
623103 689009.XSHG 2024-07 0.113302 0.002068 0.111234 2024-06 2.008689e+10 1.5890 2024 7 2023
623104 689009.XSHG 2024-08 0.090770 0.002068 0.088702 2024-07 2.238688e+10 1.5046 2024 8 2023
623105 689009.XSHG 2024-09 0.078307 0.002068 0.076239 2024-08 2.442059e+10 1.1651 2024 9 2023
623106 689009.XSHG 2024-10 -0.080923 0.002068 -0.082991 2024-09 2.633272e+10 1.0361 2024 10 2023
623107 689009.XSHG 2024-11 0.004506 0.002068 0.002438 2024-10 2.420206e+10 1.3022 2024 11 2023
623108 689009.XSHG 2024-12 0.067436 0.002068 0.065368 2024-11 2.431436e+10 1.2585 2024 12 2023
623109 689009.XSHG 2025-01 0.080840 0.002068 0.078772 2024-12 2.610548e+10 1.2255 2025 1 2023
623110 689009.XSHG 2025-02 0.113168 0.002068 0.111100 2025-01 2.822514e+10 1.1695 2025 2 2023

623111 rows × 11 columns

In [98]:
ret_df.loc[(ret_df['secID']=='300349.XSHE')&(ret_df['ret_date']>='2013-01')&(ret_df['ret_date']<='2013-12')]
Out[98]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta year month bm_date
295708 300349.XSHE 2013-01 -0.035197 0.003246 -0.038443 2012-12 7.245000e+08 0.6363 2013 1 2011
295709 300349.XSHE 2013-02 0.025751 0.003240 0.022511 2013-01 6.990000e+08 0.5292 2013 2 2011
295710 300349.XSHE 2013-03 -0.073640 0.003236 -0.076876 2013-02 7.170000e+08 0.6351 2013 3 2011
295711 300349.XSHE 2013-04 0.025519 0.003235 0.022284 2013-03 6.642000e+08 0.7784 2013 4 2011
295712 300349.XSHE 2013-05 0.351575 0.003235 0.348340 2013-04 6.811500e+08 0.8078 2013 5 2011
295713 300349.XSHE 2013-06 0.000244 0.004241 -0.003997 2013-05 9.180000e+08 0.7089 2013 6 2011
295714 300349.XSHE 2013-07 0.172520 0.003972 0.168548 2013-06 9.182250e+08 0.5040 2013 7 2012
295715 300349.XSHE 2013-08 0.028202 0.003880 0.024322 2013-07 1.076625e+09 0.5452 2013 8 2012
295716 300349.XSHE 2013-09 -0.087392 0.003884 -0.091276 2013-08 1.107000e+09 0.5464 2013 9 2012
295717 300349.XSHE 2013-10 -0.002221 0.003897 -0.006118 2013-09 1.010250e+09 0.4669 2013 10 2012
295718 300349.XSHE 2013-11 0.166738 0.003920 0.162818 2013-10 1.008000e+09 0.6522 2013 11 2012
295719 300349.XSHE 2013-12 -0.054534 0.004417 -0.058951 2013-11 1.176075e+09 0.6451 2013 12 2012
In [99]:
pd.to_datetime(ret_df['bm_date'].astype('str'),format='%Y') + pd.tseries.offsets.YearEnd()
Out[99]:
0        2006-12-31
1        2006-12-31
2        2006-12-31
3        2006-12-31
4        2006-12-31
5        2007-12-31
6        2007-12-31
7        2007-12-31
            ...    
623103   2023-12-31
623104   2023-12-31
623105   2023-12-31
623106   2023-12-31
623107   2023-12-31
623108   2023-12-31
623109   2023-12-31
623110   2023-12-31
Name: bm_date, Length: 623111, dtype: datetime64[ns]
In [100]:
ret_df['year'] = ret_df['ret_date'].dt.year
ret_df['month'] = ret_df['ret_date'].dt.month
ret_df['bm_date'] = ret_df['year'] - 1
idx = ret_df['month'].isin([1,2,3,4,5,6])
ret_df.loc[idx,'bm_date'] = ret_df.loc[idx,'bm_date'] - 1

ret_df['bm_date'] = pd.to_datetime(ret_df['bm_date'].astype('str'),format='%Y') + pd.tseries.offsets.YearEnd()

ret_df['bm_date'] = ret_df['bm_date'].dt.to_period('M')

ret_df.drop(['month','year'], axis=1, inplace=True)
In [101]:
ret_df
Out[101]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date
0 000001.XSHE 2008-02 -0.004504 0.003753 -0.008257 2008-01 5.850212e+10 0.9654 2006-12
1 000001.XSHE 2008-03 -0.149321 0.003751 -0.153072 2008-02 5.823860e+10 1.0292 2006-12
2 000001.XSHE 2008-04 0.050355 0.003745 0.046610 2008-03 4.954234e+10 1.0238 2006-12
3 000001.XSHE 2008-05 -0.148211 0.003743 -0.151954 2008-04 5.203702e+10 1.0212 2006-12
4 000001.XSHE 2008-06 -0.233848 0.003732 -0.237580 2008-05 4.432458e+10 1.0697 2006-12
5 000001.XSHE 2008-07 0.076047 0.003682 0.072365 2008-06 4.140495e+10 1.0672 2007-12
6 000001.XSHE 2008-08 -0.028846 0.003604 -0.032450 2008-07 4.455369e+10 1.0966 2007-12
7 000001.XSHE 2008-09 -0.257922 0.003591 -0.261513 2008-08 4.326849e+10 1.0386 2007-12
... ... ... ... ... ... ... ... ... ...
623103 689009.XSHG 2024-07 0.113302 0.002068 0.111234 2024-06 2.008689e+10 1.5890 2023-12
623104 689009.XSHG 2024-08 0.090770 0.002068 0.088702 2024-07 2.238688e+10 1.5046 2023-12
623105 689009.XSHG 2024-09 0.078307 0.002068 0.076239 2024-08 2.442059e+10 1.1651 2023-12
623106 689009.XSHG 2024-10 -0.080923 0.002068 -0.082991 2024-09 2.633272e+10 1.0361 2023-12
623107 689009.XSHG 2024-11 0.004506 0.002068 0.002438 2024-10 2.420206e+10 1.3022 2023-12
623108 689009.XSHG 2024-12 0.067436 0.002068 0.065368 2024-11 2.431436e+10 1.2585 2023-12
623109 689009.XSHG 2025-01 0.080840 0.002068 0.078772 2024-12 2.610548e+10 1.2255 2023-12
623110 689009.XSHG 2025-02 0.113168 0.002068 0.111100 2025-01 2.822514e+10 1.1695 2023-12

623111 rows × 9 columns

In [102]:
bm_df
Out[102]:
secID bm_date bm
0 000001.XSHE 2008-12 0.622601
1 000001.XSHE 2009-12 0.287250
2 000001.XSHE 2010-12 0.683467
3 000001.XSHE 2011-12 1.514294
4 000001.XSHE 2012-12 1.704577
5 000001.XSHE 2013-12 1.640895
6 000001.XSHE 2014-12 0.840421
7 000001.XSHE 2015-12 1.141096
... ... ... ...
49866 688981.XSHG 2020-12 1.650117
49867 688981.XSHG 2021-12 1.101390
49868 688981.XSHG 2022-12 1.656227
49869 688981.XSHG 2023-12 1.361571
49870 689009.XSHG 2020-12 0.747016
49871 689009.XSHG 2021-12 0.137716
49872 689009.XSHG 2022-12 0.313871
49873 689009.XSHG 2023-12 0.351450

49874 rows × 3 columns

In [103]:
ret_df = pd.merge(ret_df,bm_df,on=['secID','bm_date'])
In [104]:
ret_df
Out[104]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date bm
0 000001.XSHE 2009-07 0.199816 0.001227 0.198589 2009-06 6.379637e+10 1.1618 2008-12 0.622601
1 000001.XSHE 2009-08 -0.305195 0.001435 -0.306630 2009-07 7.654395e+10 1.0652 2008-12 0.622601
2 000001.XSHE 2009-09 0.100056 0.001459 0.098597 2009-08 5.318314e+10 1.0146 2008-12 0.622601
3 000001.XSHE 2009-10 0.123438 0.001481 0.121957 2009-09 5.850437e+10 1.0209 2008-12 0.622601
4 000001.XSHE 2009-11 0.079181 0.001499 0.077682 2009-10 6.573409e+10 0.9892 2008-12 0.622601
5 000001.XSHE 2009-12 0.004534 0.001516 0.003018 2009-11 7.093901e+10 1.0081 2008-12 0.622601
6 000001.XSHE 2010-01 -0.109561 0.001553 -0.111114 2009-12 7.126066e+10 1.0052 2008-12 0.622601
7 000001.XSHE 2010-02 0.034563 0.001604 0.032959 2010-01 6.345328e+10 1.0097 2008-12 0.622601
... ... ... ... ... ... ... ... ... ... ...
548056 689009.XSHG 2024-07 0.113302 0.002068 0.111234 2024-06 2.008689e+10 1.5890 2023-12 0.351450
548057 689009.XSHG 2024-08 0.090770 0.002068 0.088702 2024-07 2.238688e+10 1.5046 2023-12 0.351450
548058 689009.XSHG 2024-09 0.078307 0.002068 0.076239 2024-08 2.442059e+10 1.1651 2023-12 0.351450
548059 689009.XSHG 2024-10 -0.080923 0.002068 -0.082991 2024-09 2.633272e+10 1.0361 2023-12 0.351450
548060 689009.XSHG 2024-11 0.004506 0.002068 0.002438 2024-10 2.420206e+10 1.3022 2023-12 0.351450
548061 689009.XSHG 2024-12 0.067436 0.002068 0.065368 2024-11 2.431436e+10 1.2585 2023-12 0.351450
548062 689009.XSHG 2025-01 0.080840 0.002068 0.078772 2024-12 2.610548e+10 1.2255 2023-12 0.351450
548063 689009.XSHG 2025-02 0.113168 0.002068 0.111100 2025-01 2.822514e+10 1.1695 2023-12 0.351450

548064 rows × 10 columns

In [105]:
gc.collect()
Out[105]:
43

Sorting on BM

In [106]:
q = dict()
keys = ['q'+str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q.update(zip(keys,values))
In [107]:
quantile_df = pd.DataFrame()
for key, value in q.items():
    quantile_df[key] = ret_df.groupby(['bm_date'])['bm'].quantile(value)
In [108]:
quantile_df
Out[108]:
q1 q2 q3 q4 q5 q6 q7 q8 q9
bm_date
2008-12 0.407372 0.531445 0.642522 0.759532 0.904136 1.076160 1.273804 1.555919 2.029908
2009-12 0.153349 0.207738 0.250571 0.304024 0.358534 0.409447 0.504132 0.670160 0.878141
2010-12 0.144937 0.208690 0.262338 0.318854 0.399822 0.506119 0.648749 0.883376 1.223348
2011-12 0.246439 0.343578 0.444011 0.546351 0.678472 0.838870 1.075257 1.391082 1.899948
2012-12 0.267248 0.369360 0.467971 0.570465 0.693980 0.860729 1.076622 1.383808 1.828361
2013-12 0.209953 0.300281 0.391943 0.469116 0.561372 0.663863 0.795765 0.989655 1.327860
2014-12 0.176162 0.245337 0.306915 0.368283 0.425874 0.495094 0.586881 0.703513 0.891120
2015-12 0.114290 0.162872 0.202825 0.248851 0.295394 0.350959 0.412877 0.512192 0.695365
2016-12 0.163482 0.225193 0.284278 0.336331 0.389822 0.457543 0.535321 0.650701 0.864097
2017-12 0.219394 0.302661 0.385532 0.459889 0.536581 0.627518 0.742034 0.870825 1.135484
2018-12 0.316369 0.437954 0.543479 0.644450 0.761737 0.867746 1.030129 1.224285 1.566768
2019-12 0.233027 0.338262 0.429298 0.520926 0.613987 0.726117 0.869132 1.068601 1.389387
2020-12 0.188354 0.283266 0.366484 0.451673 0.549952 0.657634 0.802114 0.997374 1.353172
2021-12 0.166658 0.248605 0.325486 0.398606 0.491999 0.598281 0.739960 0.955527 1.307447
2022-12 0.212085 0.311705 0.404527 0.497167 0.594951 0.718764 0.901001 1.150651 1.591106
2023-12 0.213122 0.306827 0.383825 0.465522 0.558105 0.672072 0.830842 1.058015 1.480413
In [109]:
ret_df_q = pd.merge(ret_df, quantile_df, on='bm_date')
In [110]:
ret_df_q
Out[110]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date bm q1 q2 q3 q4 q5 q6 q7 q8 q9
0 000001.XSHE 2009-07 0.199816 0.001227 0.198589 2009-06 6.379637e+10 1.1618 2008-12 0.622601 0.407372 0.531445 0.642522 0.759532 0.904136 1.076160 1.273804 1.555919 2.029908
1 000001.XSHE 2009-08 -0.305195 0.001435 -0.306630 2009-07 7.654395e+10 1.0652 2008-12 0.622601 0.407372 0.531445 0.642522 0.759532 0.904136 1.076160 1.273804 1.555919 2.029908
2 000001.XSHE 2009-09 0.100056 0.001459 0.098597 2009-08 5.318314e+10 1.0146 2008-12 0.622601 0.407372 0.531445 0.642522 0.759532 0.904136 1.076160 1.273804 1.555919 2.029908
3 000001.XSHE 2009-10 0.123438 0.001481 0.121957 2009-09 5.850437e+10 1.0209 2008-12 0.622601 0.407372 0.531445 0.642522 0.759532 0.904136 1.076160 1.273804 1.555919 2.029908
4 000001.XSHE 2009-11 0.079181 0.001499 0.077682 2009-10 6.573409e+10 0.9892 2008-12 0.622601 0.407372 0.531445 0.642522 0.759532 0.904136 1.076160 1.273804 1.555919 2.029908
5 000001.XSHE 2009-12 0.004534 0.001516 0.003018 2009-11 7.093901e+10 1.0081 2008-12 0.622601 0.407372 0.531445 0.642522 0.759532 0.904136 1.076160 1.273804 1.555919 2.029908
6 000001.XSHE 2010-01 -0.109561 0.001553 -0.111114 2009-12 7.126066e+10 1.0052 2008-12 0.622601 0.407372 0.531445 0.642522 0.759532 0.904136 1.076160 1.273804 1.555919 2.029908
7 000001.XSHE 2010-02 0.034563 0.001604 0.032959 2010-01 6.345328e+10 1.0097 2008-12 0.622601 0.407372 0.531445 0.642522 0.759532 0.904136 1.076160 1.273804 1.555919 2.029908
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
548056 689009.XSHG 2024-07 0.113302 0.002068 0.111234 2024-06 2.008689e+10 1.5890 2023-12 0.351450 0.213122 0.306827 0.383825 0.465522 0.558105 0.672072 0.830842 1.058015 1.480413
548057 689009.XSHG 2024-08 0.090770 0.002068 0.088702 2024-07 2.238688e+10 1.5046 2023-12 0.351450 0.213122 0.306827 0.383825 0.465522 0.558105 0.672072 0.830842 1.058015 1.480413
548058 689009.XSHG 2024-09 0.078307 0.002068 0.076239 2024-08 2.442059e+10 1.1651 2023-12 0.351450 0.213122 0.306827 0.383825 0.465522 0.558105 0.672072 0.830842 1.058015 1.480413
548059 689009.XSHG 2024-10 -0.080923 0.002068 -0.082991 2024-09 2.633272e+10 1.0361 2023-12 0.351450 0.213122 0.306827 0.383825 0.465522 0.558105 0.672072 0.830842 1.058015 1.480413
548060 689009.XSHG 2024-11 0.004506 0.002068 0.002438 2024-10 2.420206e+10 1.3022 2023-12 0.351450 0.213122 0.306827 0.383825 0.465522 0.558105 0.672072 0.830842 1.058015 1.480413
548061 689009.XSHG 2024-12 0.067436 0.002068 0.065368 2024-11 2.431436e+10 1.2585 2023-12 0.351450 0.213122 0.306827 0.383825 0.465522 0.558105 0.672072 0.830842 1.058015 1.480413
548062 689009.XSHG 2025-01 0.080840 0.002068 0.078772 2024-12 2.610548e+10 1.2255 2023-12 0.351450 0.213122 0.306827 0.383825 0.465522 0.558105 0.672072 0.830842 1.058015 1.480413
548063 689009.XSHG 2025-02 0.113168 0.002068 0.111100 2025-01 2.822514e+10 1.1695 2023-12 0.351450 0.213122 0.306827 0.383825 0.465522 0.558105 0.672072 0.830842 1.058015 1.480413

548064 rows × 19 columns

In [111]:
portfolios = dict()
drop_cols = [col for col in ret_df_q.columns if col[0]=='q']

portfolios['p1'] = ret_df_q.loc[ret_df_q['bm'] <= ret_df_q['q1']].copy().drop(drop_cols, axis=1)
for i in range(2,10):
    idx = (ret_df_q[f'q{i-1}'] <= ret_df_q['bm']) & (ret_df_q['bm'] <= ret_df_q[f'q{i}'])
    portfolios[f'p{i}'] = ret_df_q.loc[idx].copy().drop(drop_cols, axis=1)
portfolios['p10'] = ret_df_q.loc[ret_df_q['bm'] >= ret_df_q['q9']].copy().drop(drop_cols, axis=1)
In [112]:
portfolios['p2']
Out[112]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date bm
12 000002.XSHE 2009-07 0.047843 0.001227 0.046616 2009-06 1.197492e+11 1.0946 2008-12 0.526557
13 000002.XSHE 2009-08 -0.296407 0.001435 -0.297842 2009-07 1.254784e+11 1.0656 2008-12 0.526557
14 000002.XSHE 2009-09 0.108511 0.001459 0.107052 2009-08 8.828569e+10 1.0563 2008-12 0.526557
15 000002.XSHE 2009-10 0.104607 0.001481 0.103126 2009-09 9.786563e+10 1.0992 2008-12 0.526557
16 000002.XSHE 2009-11 0.001737 0.001499 0.000238 2009-10 1.081030e+11 1.1257 2008-12 0.526557
17 000002.XSHE 2009-12 -0.062446 0.001516 -0.063962 2009-11 1.082909e+11 1.0625 2008-12 0.526557
18 000002.XSHE 2010-01 -0.135985 0.001553 -0.137538 2009-12 1.043824e+11 1.0717 2008-12 0.526557
19 000002.XSHE 2010-02 0.009635 0.001604 0.008031 2010-01 9.018793e+10 1.0262 2008-12 0.526557
... ... ... ... ... ... ... ... ... ... ...
547984 688787.XSHG 2024-07 -0.050479 0.002068 -0.052547 2024-06 2.263899e+09 2.1706 2023-12 0.258352
547985 688787.XSHG 2024-08 -0.167138 0.002068 -0.169206 2024-07 2.149631e+09 2.1062 2023-12 0.258352
547986 688787.XSHG 2024-09 0.384327 0.002068 0.382259 2024-08 2.618114e+09 2.2239 2023-12 0.258352
547987 688787.XSHG 2024-10 0.100198 0.002068 0.098130 2024-09 3.624338e+09 1.9278 2023-12 0.258352
547988 688787.XSHG 2024-11 0.278820 0.002068 0.276752 2024-10 3.987496e+09 2.0546 2023-12 0.258352
547989 688787.XSHG 2024-12 0.165498 0.002068 0.163430 2024-11 5.099289e+09 1.9667 2023-12 0.258352
547990 688787.XSHG 2025-01 0.359932 0.002068 0.357864 2024-12 5.943239e+09 1.9381 2023-12 0.258352
547991 688787.XSHG 2025-02 -0.260637 0.002068 -0.262705 2025-01 8.082370e+09 1.9190 2023-12 0.258352

54988 rows × 10 columns

return by portfolios

In [113]:
portfolios['p2'].info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 54988 entries, 12 to 547991
Data columns (total 10 columns):
secID               54988 non-null object
ret_date            54988 non-null period[M]
ret                 54988 non-null float64
rf                  54988 non-null float64
exret               54988 non-null float64
mktcap_beta_date    54988 non-null period[M]
mkt_cap             54988 non-null float64
beta                53915 non-null float64
bm_date             54988 non-null period[M]
bm                  54988 non-null float64
dtypes: float64(6), object(1), period[M](3)
memory usage: 4.6+ MB
In [114]:
for k in portfolios.keys():
    print(portfolios[k].groupby(['ret_date'])['exret'].mean().mean())
0.004510643630688633
0.0066368923106826065
0.008113699277954344
0.008768146599630147
0.009371341392898573
0.009428727732616847
0.009659117799666545
0.009650154493371397
0.009477718851978928
0.00735552612501565
In [115]:
portfolios_crs_mean = dict()
for k in portfolios.keys():
    portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()
In [116]:
portfolios_crs_mean['p1']
Out[116]:
ret_date
2009-07    0.107734
2009-08   -0.123265
2009-09    0.044444
2009-10    0.102250
2009-11    0.136572
2009-12    0.009681
2010-01   -0.010283
2010-02    0.051705
             ...   
2024-07    0.014740
2024-08   -0.038136
2024-09    0.289537
2024-10    0.142061
2024-11    0.033280
2024-12   -0.068850
2025-01   -0.035031
2025-02    0.108273
Freq: M, Name: exret, Length: 188, dtype: float64

More robust by adjusting Newey-West Errors

In [117]:
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
In [118]:
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
             columns=mean_values.keys())
Out[118]:
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p10-p1
mean 0.004511 0.006637 0.008114 0.008768 0.009371 0.009429 0.009659 0.009650 0.009478 0.007356 0.002845
t-value 0.830134 1.233923 1.574315 1.652492 1.804599 1.755822 1.841693 1.823293 1.822553 1.432717 1.022295

既然一年调一次仓,年收益率呢?

In [119]:
portfolios['p1']
Out[119]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date bm
72 000014.XSHE 2009-07 0.183851 0.001227 0.182624 2009-06 2.987235e+09 1.0526 2008-12 0.360541
73 000014.XSHE 2009-08 -0.200000 0.001435 -0.201435 2009-07 3.529819e+09 1.0720 2008-12 0.360541
74 000014.XSHE 2009-09 -0.042858 0.001459 -0.044317 2009-08 2.823855e+09 1.1185 2008-12 0.360541
75 000014.XSHE 2009-10 0.164173 0.001481 0.162692 2009-09 2.702833e+09 1.1010 2008-12 0.360541
76 000014.XSHE 2009-11 0.181413 0.001499 0.179914 2009-10 3.146581e+09 1.1338 2008-12 0.360541
77 000014.XSHE 2009-12 -0.158979 0.001516 -0.160495 2009-11 3.717403e+09 1.0180 2008-12 0.360541
78 000014.XSHE 2010-01 -0.159355 0.001553 -0.160908 2009-12 3.126411e+09 1.0471 2008-12 0.360541
79 000014.XSHE 2010-02 0.029165 0.001604 0.027561 2010-01 2.628202e+09 1.0838 2008-12 0.360541
... ... ... ... ... ... ... ... ... ... ...
547808 688700.XSHG 2024-07 -0.035094 0.002068 -0.037162 2024-06 7.821101e+09 1.8836 2023-12 0.193922
547809 688700.XSHG 2024-08 -0.195338 0.002068 -0.197406 2024-07 7.546571e+09 2.1171 2023-12 0.193922
547810 688700.XSHG 2024-09 0.222122 0.002068 0.220054 2024-08 6.072468e+09 2.3065 2023-12 0.193922
547811 688700.XSHG 2024-10 0.123439 0.002068 0.121371 2024-09 7.421243e+09 2.1054 2023-12 0.193922
547812 688700.XSHG 2024-11 0.102716 0.002068 0.100648 2024-10 8.337335e+09 2.1818 2023-12 0.193922
547813 688700.XSHG 2024-12 -0.043815 0.002068 -0.045883 2024-11 9.193747e+09 2.2430 2023-12 0.193922
547814 688700.XSHG 2025-01 -0.116095 0.002068 -0.118163 2024-12 8.790905e+09 2.2065 2023-12 0.193922
547815 688700.XSHG 2025-02 0.240793 0.002068 0.238725 2025-01 7.770372e+09 2.1005 2023-12 0.193922

54877 rows × 10 columns

In [120]:
portfolios[k]['1+ret'] = portfolios[k]['ret']+1
portfolios[k]['1+rf'] = portfolios[k]['rf']+1
In [121]:
portfolios[k]
Out[121]:
secID ret_date ret rf exret mktcap_beta_date mkt_cap beta bm_date bm 1+ret 1+rf
108 000021.XSHE 2009-07 0.117859 0.001227 0.116632 2009-06 9.840280e+09 1.1005 2008-12 2.270029 1.117859 1.001227
109 000021.XSHE 2009-08 -0.165337 0.001435 -0.166772 2009-07 1.100003e+10 0.9318 2008-12 2.270029 0.834663 1.001435
110 000021.XSHE 2009-09 0.073689 0.001459 0.072230 2009-08 9.181333e+09 0.9785 2008-12 2.270029 1.073689 1.001459
111 000021.XSHE 2009-10 0.089130 0.001481 0.087649 2009-09 9.857852e+09 1.0156 2008-12 2.270029 1.089130 1.001481
112 000021.XSHE 2009-11 0.101467 0.001499 0.099968 2009-10 1.073645e+10 0.9419 2008-12 2.270029 1.101467 1.001499
113 000021.XSHE 2009-12 -0.042349 0.001516 -0.043865 2009-11 1.182591e+10 0.9184 2008-12 2.270029 0.957651 1.001516
114 000021.XSHE 2010-01 0.011639 0.001553 0.010086 2009-12 1.132511e+10 0.8867 2008-12 2.270029 1.011639 1.001553
115 000021.XSHE 2010-02 0.000769 0.001604 -0.000835 2010-01 1.145690e+10 0.8967 2008-12 2.270029 1.000769 1.001604
... ... ... ... ... ... ... ... ... ... ... ... ...
548040 688819.XSHG 2024-07 -0.054507 0.002068 -0.056575 2024-06 2.319431e+10 1.6092 2023-12 4.219591 0.945493 1.002068
548041 688819.XSHG 2024-08 0.093103 0.002068 0.091035 2024-07 2.193058e+10 1.6297 2023-12 4.219591 1.093103 1.002068
548042 688819.XSHG 2024-09 0.374670 0.002068 0.372602 2024-08 2.397199e+10 1.5742 2023-12 4.219591 1.374670 1.002068
548043 688819.XSHG 2024-10 -0.130659 0.002068 -0.132727 2024-09 3.295419e+10 1.2118 2023-12 4.219591 0.869341 1.002068
548044 688819.XSHG 2024-11 -0.019354 0.002068 -0.021422 2024-10 2.864779e+10 1.5119 2023-12 4.219591 0.980646 1.002068
548045 688819.XSHG 2024-12 -0.058116 0.002068 -0.060184 2024-11 2.809369e+10 1.4465 2023-12 4.219591 0.941884 1.002068
548046 688819.XSHG 2025-01 -0.026483 0.002068 -0.028551 2024-12 2.646056e+10 1.4207 2023-12 4.219591 0.973517 1.002068
548047 688819.XSHG 2025-02 0.080032 0.002068 0.077964 2025-01 2.576065e+10 1.3536 2023-12 4.219591 1.080032 1.002068

54889 rows × 12 columns

In [122]:
portfolios[k].groupby(['secID','bm_date'],as_index=False)['1+ret'].prod()
Out[122]:
secID bm_date 1+ret
0 000001.XSHE 2013-12 1.776255
1 000001.XSHE 2015-12 1.079310
2 000001.XSHE 2016-12 0.982198
3 000001.XSHE 2022-12 0.968292
4 000001.XSHE 2023-12 1.160400
5 000002.XSHE 2023-12 1.118327
6 000011.XSHE 2014-12 0.904518
7 000011.XSHE 2015-12 1.508421
... ... ... ...
4745 688767.XSHG 2023-12 1.438224
4746 688778.XSHG 2023-12 1.734965
4747 688799.XSHG 2021-12 1.263452
4748 688819.XSHG 2021-12 1.083487
4749 688819.XSHG 2022-12 0.664141
4750 688819.XSHG 2023-12 1.199494
4751 688981.XSHG 2020-12 0.730670
4752 688981.XSHG 2022-12 0.912510

4753 rows × 3 columns

In [123]:
pf_year_ret = {}
for k in portfolios.keys():
    portfolios[k]['1+ret'] = portfolios[k]['ret']+1
    portfolios[k]['1+rf'] = portfolios[k]['rf']+1
    pf_year_ret[k] = portfolios[k].groupby(['secID','bm_date'],as_index=False)['1+ret'].prod()
    pf_year_ret[k]['1+rf'] = portfolios[k].groupby(['secID','bm_date'],as_index=False)['1+rf'].prod()['1+rf']
    pf_year_ret[k]['ret'] = pf_year_ret[k]['1+ret'] - 1
    pf_year_ret[k]['rf'] = pf_year_ret[k]['1+rf'] - 1
    pf_year_ret[k]['exret'] = pf_year_ret[k]['ret'] - pf_year_ret[k]['rf']
In [124]:
pf_year_ret['p1']
Out[124]:
secID bm_date 1+ret 1+rf ret rf exret
0 000004.XSHE 2011-12 1.180620 1.039890 0.180620 0.039890 0.140730
1 000004.XSHE 2012-12 1.388007 1.052017 0.388007 0.052017 0.335990
2 000004.XSHE 2013-12 2.773781 1.045296 1.773781 0.045296 1.728485
3 000004.XSHE 2014-12 0.956622 1.030621 -0.043378 0.030621 -0.073999
4 000004.XSHE 2015-12 0.696445 1.036393 -0.303555 0.036393 -0.339948
5 000004.XSHE 2016-12 0.764129 1.045471 -0.235871 0.045471 -0.281341
6 000004.XSHE 2017-12 1.082653 1.030115 0.082653 0.030115 0.052539
7 000004.XSHE 2018-12 1.406224 1.024673 0.406224 0.024673 0.381552
... ... ... ... ... ... ... ...
4877 688677.XSHG 2022-12 0.701338 1.024004 -0.298662 0.024004 -0.322666
4878 688690.XSHG 2022-12 0.448332 1.024004 -0.551668 0.024004 -0.575672
4879 688700.XSHG 2022-12 0.343769 1.024004 -0.656231 0.024004 -0.680235
4880 688700.XSHG 2023-12 1.232744 1.016664 0.232744 0.016664 0.216080
4881 688711.XSHG 2022-12 0.386785 1.024004 -0.613215 0.024004 -0.637219
4882 688768.XSHG 2022-12 0.294512 1.024004 -0.705488 0.024004 -0.729492
4883 688777.XSHG 2022-12 0.610677 1.024004 -0.389323 0.024004 -0.413327
4884 689009.XSHG 2021-12 0.828090 1.021321 -0.171910 0.021321 -0.193231

4885 rows × 7 columns

In [125]:
portfolios_crs_mean = dict()
for k in pf_year_ret.keys():
    portfolios_crs_mean[k] = pf_year_ret[k].groupby(['bm_date'])['exret'].mean()
In [126]:
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
In [127]:
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
             columns=mean_values.keys())
Out[127]:
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p10-p1
mean 0.052068 0.078005 0.094152 0.106622 0.114082 0.124210 0.123501 0.125792 0.124091 0.099119 0.047050
t-value 0.950304 1.491052 1.903178 2.012514 2.457102 1.933109 2.041182 1.871975 1.786847 1.466770 1.270679

Portfolio characteristics other than return

In [128]:
# average beta in each portfolio
for key in portfolios.keys():
    print(portfolios[key].groupby('bm_date')['bm'].mean().mean())  
0.1443215862765014
0.2596918481337941
0.34171210372356237
0.42024300830614564
0.5048602557450186
0.6031741676160136
0.7262496250720067
0.896256346913194
1.154773305679097
2.209371794746659
In [129]:
portfolios['p1'].groupby('bm_date')['secID'].nunique()
Out[129]:
bm_date
2008-12    144
2009-12    150
2010-12    185
2011-12    220
2012-12    238
2013-12    245
2014-12    264
2015-12    290
2016-12    298
2017-12    338
2018-12    346
2019-12    370
2020-12    392
2021-12    439
2022-12    475
2023-12    491
Freq: M, Name: secID, dtype: int64
In [130]:
pf_n_stks = pd.DataFrame()
for key, value in portfolios.items():
    pf_n_stks[key] = portfolios[key].groupby('bm_date')['secID'].nunique()

display(pf_n_stks)

pf_n_stks.plot()
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10
bm_date
2008-12 144 142 143 144 142 142 141 142 142 141
2009-12 150 148 149 149 148 148 147 149 148 147
2010-12 185 183 182 183 183 182 181 182 182 181
2011-12 220 217 214 214 213 213 214 213 213 212
2012-12 238 241 233 236 236 236 236 234 234 235
2013-12 245 242 243 240 241 239 240 240 239 240
2014-12 264 253 251 250 251 250 252 251 250 249
2015-12 290 272 270 270 270 269 271 270 271 268
2016-12 298 290 290 288 292 290 289 289 289 287
2017-12 338 336 337 335 335 333 332 334 333 332
2018-12 346 344 341 339 343 342 341 340 341 341
2019-12 370 356 356 352 353 353 352 352 351 351
2020-12 392 383 383 382 382 384 382 381 381 381
2021-12 439 436 433 436 433 434 433 434 433 432
2022-12 475 469 468 470 468 468 469 468 468 468
2023-12 491 488 489 488 489 489 489 491 488 488
Out[130]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f69baaf7f90>
In [131]:
portfolios['p10'].groupby('bm_date')['mkt_cap'].mean()/1e9
Out[131]:
bm_date
2008-12    24.809703
2009-12    24.776515
2010-12     7.609529
2011-12     4.429930
2012-12     6.856646
2013-12    24.809051
2014-12    30.583072
2015-12    48.396112
2016-12    52.408349
2017-12    25.692625
2018-12    23.570998
2019-12    25.605492
2020-12    28.209597
2021-12    25.285104
2022-12    26.822755
2023-12    31.160605
Freq: M, Name: mkt_cap, dtype: float64
In [132]:
pf_mktcap = pd.DataFrame()
for key, value in portfolios.items():
    pf_mktcap[key] = portfolios[key].groupby('bm_date')['mkt_cap'].mean()

display(pf_mktcap)

pf_mktcap.plot()
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10
bm_date
2008-12 8.178973e+09 9.146321e+09 7.169733e+09 6.792753e+09 7.133477e+09 5.638368e+09 6.113149e+09 5.203166e+09 7.719806e+09 2.480970e+10
2009-12 1.086000e+10 8.434011e+09 7.168742e+09 9.046941e+09 9.436160e+09 1.319322e+10 1.462286e+10 1.163539e+10 3.963020e+09 2.477652e+10
2010-12 9.894483e+09 6.872961e+09 5.115607e+09 6.050936e+09 5.913509e+09 1.177430e+10 1.982726e+10 1.530054e+10 7.867258e+09 7.609529e+09
2011-12 8.679166e+09 5.203752e+09 5.832281e+09 8.101809e+09 1.352585e+10 8.386892e+09 1.470455e+10 7.589739e+09 3.736540e+09 4.429930e+09
2012-12 7.933644e+09 7.124045e+09 5.767185e+09 6.531256e+09 8.280338e+09 1.419138e+10 1.341193e+10 4.797786e+09 6.157433e+09 6.856646e+09
2013-12 1.040011e+10 9.919660e+09 8.817553e+09 9.078055e+09 8.530569e+09 7.203076e+09 1.403315e+10 1.693147e+10 1.861945e+10 2.480905e+10
2014-12 1.112403e+10 1.257988e+10 1.256844e+10 1.088796e+10 1.273958e+10 1.071537e+10 1.053461e+10 1.917828e+10 1.393947e+10 3.058307e+10
2015-12 9.314497e+09 9.112856e+09 9.981568e+09 1.014135e+10 9.342858e+09 1.074729e+10 9.961342e+09 1.031605e+10 1.505106e+10 4.839611e+10
2016-12 8.715027e+09 1.307421e+10 8.205977e+09 9.324338e+09 1.049927e+10 8.515916e+09 1.140932e+10 1.189586e+10 1.531117e+10 5.240835e+10
2017-12 1.792234e+10 9.148023e+09 7.195643e+09 9.058651e+09 9.097544e+09 8.441077e+09 7.968389e+09 1.164186e+10 1.192391e+10 2.569262e+10
2018-12 2.195517e+10 1.315840e+10 8.365672e+09 8.507789e+09 1.024115e+10 9.084155e+09 1.217223e+10 1.472566e+10 1.137340e+10 2.357100e+10
2019-12 4.080461e+10 1.924495e+10 1.235015e+10 8.707788e+09 1.382985e+10 1.314332e+10 1.589170e+10 1.114371e+10 1.191487e+10 2.560549e+10
2020-12 5.099567e+10 1.613472e+10 1.336371e+10 1.052788e+10 8.391447e+09 1.192074e+10 1.180157e+10 1.307650e+10 1.198211e+10 2.820960e+10
2021-12 3.754853e+10 1.534978e+10 1.264296e+10 1.072759e+10 1.002252e+10 7.712885e+09 9.780267e+09 1.306829e+10 1.279645e+10 2.528510e+10
2022-12 2.674813e+10 1.460591e+10 1.147212e+10 1.128342e+10 1.104140e+10 8.851923e+09 9.776261e+09 1.066437e+10 1.341447e+10 2.682276e+10
2023-12 1.982568e+10 1.235109e+10 1.498992e+10 1.218861e+10 1.136951e+10 1.040118e+10 1.044852e+10 9.183531e+09 1.450266e+10 3.116060e+10
Out[132]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6997e97950>
In [133]:
pf_mktcap = pf_mktcap / 1e10
for i in range(10):
    print(pf_mktcap.mean()[i])
1.8806254216473737
1.134128557774694
0.9437953430402802
0.9184820617998768
0.9962189797041303
0.999506809253669
1.202857008823436
1.1647012512317052
1.1267068246488174
2.5689130236794204

BM 1年调仓单排结论

排序方法:t年7月至t+1年6月,按照t-1年12月的BM排序,考察区间内每月平均收益率以及区间年平均收益率

结论:

  • 月、年平均收益率呈现微弱递增,但p10的收益率较差。
  • 最大BM组(也即估值最低组)的market cap起伏很大
  • 最小BM组(也即估值最高组)的market cap在样本后期显著增大
  • BM的效应可能和market cap有关系

BM Point-in-Time

In [135]:
del portfolios, portfolios_crs_mean
In [136]:
# fundmen_df = DataAPI.FdmtBSGet(secID=stk_id,beginDate=START,endDate=END,publishDateEnd=u"",publishDateBegin=u"",endDateRep="",beginDateRep="",beginYear="",endYear="",fiscalPeriod="",field=["secID","publishDate","endDate","endDateRep","actPubtime","fiscalPeriod","TShEquity","TEquityAttrP","minorityInt"],pandas="1")

# fundmen_df.to_pickle('./data/fundmen_df_pit.pkl')
In [147]:
fundmen_df = pd.read_pickle('./data/fundmen_df_pit.pkl')
In [148]:
fundmen_df
Out[148]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
0 000001.XSHE 2025-03-15 2024-12-31 2024-12-31 2025-03-14 19:28:13 12 4.948420e+11 4.948420e+11 NaN
1 000001.XSHE 2024-10-19 2024-09-30 2024-09-30 2024-10-18 18:00:33 9 4.904690e+11 4.904690e+11 NaN
2 000001.XSHE 2024-08-16 2024-06-30 2024-06-30 2024-08-15 19:33:28 6 4.818690e+11 4.818690e+11 NaN
3 000001.XSHE 2024-04-20 2024-03-31 2024-03-31 2024-04-19 17:56:30 3 4.855760e+11 4.855760e+11 NaN
4 000001.XSHE 2025-03-15 2023-12-31 2024-12-31 2025-03-14 19:28:13 12 4.723280e+11 4.723280e+11 NaN
5 000001.XSHE 2024-10-19 2023-12-31 2024-09-30 2024-10-18 18:00:33 12 4.723280e+11 4.723280e+11 NaN
6 000001.XSHE 2024-08-16 2023-12-31 2024-06-30 2024-08-15 19:33:28 12 4.723280e+11 4.723280e+11 NaN
7 000001.XSHE 2024-04-20 2023-12-31 2024-03-31 2024-04-19 17:56:30 12 4.723280e+11 4.723280e+11 NaN
... ... ... ... ... ... ... ... ... ...
516386 900957.XSHG 2010-03-30 2008-12-31 2009-12-31 2010-03-29 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
516387 900957.XSHG 2009-10-26 2008-12-31 2009-09-30 2009-10-25 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
516388 900957.XSHG 2009-08-01 2008-12-31 2009-06-30 2009-07-31 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
516389 900957.XSHG 2009-04-18 2008-12-31 2009-03-31 2009-04-17 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
516390 900957.XSHG 2009-03-26 2008-12-31 2008-12-31 2009-03-25 18:00:00 12 4.902596e+08 4.369354e+08 53324231.94
516391 900957.XSHG 2008-10-24 2008-09-30 2008-09-30 2008-10-23 18:00:00 9 4.324493e+08 3.737685e+08 58680833.72
516392 900957.XSHG 2008-08-25 2008-06-30 2008-06-30 2008-08-24 18:00:00 6 4.394184e+08 3.780434e+08 61375035.95
516393 900957.XSHG 2008-04-24 2008-03-31 2008-03-31 2008-04-23 18:00:00 3 4.389698e+08 3.777832e+08 61186603.43

516394 rows × 9 columns

In [149]:
fundmen_df[['publishDate','endDate']] = fundmen_df[['publishDate','endDate']].apply(pd.to_datetime)
In [150]:
fundmen_df.sort_values(['secID','publishDate','endDate'],inplace=True)
In [151]:
fundmen_df
Out[151]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
131 000001.XSHE 2008-04-24 2008-03-31 2008-03-31 2008-04-23 18:00:00 3 1.404138e+10 1.404138e+10 NaN
130 000001.XSHE 2008-08-21 2008-06-30 2008-06-30 2008-08-20 18:00:00 6 1.694330e+10 1.694330e+10 NaN
129 000001.XSHE 2008-10-24 2008-09-30 2008-09-30 2008-10-23 18:00:00 9 1.837466e+10 1.837466e+10 NaN
128 000001.XSHE 2009-03-20 2008-12-31 2008-12-31 2009-03-19 18:00:00 12 1.640079e+10 1.640079e+10 NaN
127 000001.XSHE 2009-04-24 2008-12-31 2009-03-31 2009-04-23 18:00:00 12 1.640079e+10 1.640079e+10 NaN
123 000001.XSHE 2009-04-24 2009-03-31 2009-03-31 2009-04-23 18:00:00 3 1.707213e+10 1.707213e+10 NaN
126 000001.XSHE 2009-08-21 2008-12-31 2009-06-30 2009-08-20 18:00:00 12 1.640079e+10 1.640079e+10 NaN
122 000001.XSHE 2009-08-21 2009-06-30 2009-06-30 2009-08-20 18:00:00 6 1.798739e+10 1.798739e+10 NaN
... ... ... ... ... ... ... ... ... ...
516270 900957.XSHG 2024-04-12 2022-12-31 2023-12-31 2024-04-11 15:45:08 12 5.669258e+08 5.660700e+08 855788.18
516266 900957.XSHG 2024-04-12 2023-12-31 2023-12-31 2024-04-11 15:45:08 12 6.071647e+08 6.063475e+08 817154.47
516265 900957.XSHG 2024-04-26 2023-12-31 2024-03-31 2024-04-25 15:32:41 12 6.071647e+08 6.063475e+08 817154.47
516262 900957.XSHG 2024-04-26 2024-03-31 2024-03-31 2024-04-25 15:32:41 3 6.072715e+08 6.064550e+08 816561.17
516264 900957.XSHG 2024-08-26 2023-12-31 2024-06-30 2024-08-25 15:31:32 12 6.071647e+08 6.063475e+08 817154.47
516261 900957.XSHG 2024-08-26 2024-06-30 2024-06-30 2024-08-25 15:31:32 6 6.127041e+08 6.118875e+08 816562.93
516263 900957.XSHG 2024-10-29 2023-12-31 2024-09-30 2024-10-28 15:32:41 12 6.071647e+08 6.063475e+08 817154.47
516260 900957.XSHG 2024-10-29 2024-09-30 2024-09-30 2024-10-28 15:32:41 9 6.156863e+08 6.148702e+08 816129.15

516394 rows × 9 columns

In [152]:
fundmen_df.loc[fundmen_df.duplicated(['secID','publishDate'], keep=False)] # 同一报表中包含往期信息
Out[152]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
127 000001.XSHE 2009-04-24 2008-12-31 2009-03-31 2009-04-23 18:00:00 12 1.640079e+10 1.640079e+10 NaN
123 000001.XSHE 2009-04-24 2009-03-31 2009-03-31 2009-04-23 18:00:00 3 1.707213e+10 1.707213e+10 NaN
126 000001.XSHE 2009-08-21 2008-12-31 2009-06-30 2009-08-20 18:00:00 12 1.640079e+10 1.640079e+10 NaN
122 000001.XSHE 2009-08-21 2009-06-30 2009-06-30 2009-08-20 18:00:00 6 1.798739e+10 1.798739e+10 NaN
125 000001.XSHE 2009-10-29 2008-12-31 2009-09-30 2009-10-28 18:00:00 12 1.640079e+10 1.640079e+10 NaN
121 000001.XSHE 2009-10-29 2009-09-30 2009-09-30 2009-10-28 18:00:00 9 1.908844e+10 1.908844e+10 NaN
124 000001.XSHE 2010-03-12 2008-12-31 2009-12-31 2010-03-11 18:00:00 12 1.640079e+10 1.640079e+10 NaN
120 000001.XSHE 2010-03-12 2009-12-31 2009-12-31 2010-03-11 18:00:00 12 2.046961e+10 2.046961e+10 NaN
... ... ... ... ... ... ... ... ... ...
516270 900957.XSHG 2024-04-12 2022-12-31 2023-12-31 2024-04-11 15:45:08 12 5.669258e+08 5.660700e+08 855788.18
516266 900957.XSHG 2024-04-12 2023-12-31 2023-12-31 2024-04-11 15:45:08 12 6.071647e+08 6.063475e+08 817154.47
516265 900957.XSHG 2024-04-26 2023-12-31 2024-03-31 2024-04-25 15:32:41 12 6.071647e+08 6.063475e+08 817154.47
516262 900957.XSHG 2024-04-26 2024-03-31 2024-03-31 2024-04-25 15:32:41 3 6.072715e+08 6.064550e+08 816561.17
516264 900957.XSHG 2024-08-26 2023-12-31 2024-06-30 2024-08-25 15:31:32 12 6.071647e+08 6.063475e+08 817154.47
516261 900957.XSHG 2024-08-26 2024-06-30 2024-06-30 2024-08-25 15:31:32 6 6.127041e+08 6.118875e+08 816562.93
516263 900957.XSHG 2024-10-29 2023-12-31 2024-09-30 2024-10-28 15:32:41 12 6.071647e+08 6.063475e+08 817154.47
516260 900957.XSHG 2024-10-29 2024-09-30 2024-09-30 2024-10-28 15:32:41 9 6.156863e+08 6.148702e+08 816129.15

508542 rows × 9 columns

In [153]:
fundmen_df.loc[~fundmen_df.duplicated(['secID','publishDate'], keep=False)] # 只有当期信息
Out[153]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
131 000001.XSHE 2008-04-24 2008-03-31 2008-03-31 2008-04-23 18:00:00 3 1.404138e+10 1.404138e+10 NaN
130 000001.XSHE 2008-08-21 2008-06-30 2008-06-30 2008-08-20 18:00:00 6 1.694330e+10 1.694330e+10 NaN
129 000001.XSHE 2008-10-24 2008-09-30 2008-09-30 2008-10-23 18:00:00 9 1.837466e+10 1.837466e+10 NaN
128 000001.XSHE 2009-03-20 2008-12-31 2008-12-31 2009-03-19 18:00:00 12 1.640079e+10 1.640079e+10 NaN
265 000002.XSHE 2008-04-25 2008-03-31 2008-03-31 2008-04-24 18:00:00 3 3.547986e+10 3.020373e+10 5.276124e+09
264 000002.XSHE 2008-08-05 2008-06-30 2008-06-30 2008-08-04 18:00:00 6 3.582300e+10 3.001995e+10 5.803053e+09
263 000002.XSHE 2008-10-28 2008-09-30 2008-09-30 2008-10-27 18:00:00 9 3.685165e+10 3.008923e+10 6.762425e+09
262 000002.XSHE 2009-03-10 2008-12-31 2008-12-31 2009-03-09 18:00:00 12 3.881855e+10 3.189193e+10 6.926624e+09
... ... ... ... ... ... ... ... ... ...
516259 900956.XSHG 2008-04-26 2008-03-31 2008-03-31 2008-04-25 18:00:00 3 5.439410e+08 4.738823e+08 7.005878e+07
516258 900956.XSHG 2008-08-30 2008-06-30 2008-06-30 2008-08-29 18:00:00 6 5.500809e+08 4.765419e+08 7.353901e+07
516257 900956.XSHG 2008-10-25 2008-09-30 2008-09-30 2008-10-24 18:00:00 9 5.370822e+08 4.594925e+08 7.758972e+07
516256 900956.XSHG 2009-04-11 2008-12-31 2008-12-31 2009-04-10 18:00:00 12 5.743349e+08 4.774409e+08 9.689399e+07
516393 900957.XSHG 2008-04-24 2008-03-31 2008-03-31 2008-04-23 18:00:00 3 4.389698e+08 3.777832e+08 6.118660e+07
516392 900957.XSHG 2008-08-25 2008-06-30 2008-06-30 2008-08-24 18:00:00 6 4.394184e+08 3.780434e+08 6.137504e+07
516391 900957.XSHG 2008-10-24 2008-09-30 2008-09-30 2008-10-23 18:00:00 9 4.324493e+08 3.737685e+08 5.868083e+07
516390 900957.XSHG 2009-03-26 2008-12-31 2008-12-31 2009-03-25 18:00:00 12 4.902596e+08 4.369354e+08 5.332423e+07

7852 rows × 9 columns

In [154]:
fundmen_df = fundmen_df.groupby(['secID','publishDate'],as_index=False).last() #不涉及上上个报表的信息
In [155]:
fundmen_df
Out[155]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
0 000001.XSHE 2008-04-24 2008-03-31 2008-03-31 2008-04-23 18:00:00 3 1.404138e+10 1.404138e+10 NaN
1 000001.XSHE 2008-08-21 2008-06-30 2008-06-30 2008-08-20 18:00:00 6 1.694330e+10 1.694330e+10 NaN
2 000001.XSHE 2008-10-24 2008-09-30 2008-09-30 2008-10-23 18:00:00 9 1.837466e+10 1.837466e+10 NaN
3 000001.XSHE 2009-03-20 2008-12-31 2008-12-31 2009-03-19 18:00:00 12 1.640079e+10 1.640079e+10 NaN
4 000001.XSHE 2009-04-24 2009-03-31 2009-03-31 2009-04-23 18:00:00 3 1.707213e+10 1.707213e+10 NaN
5 000001.XSHE 2009-08-21 2009-06-30 2009-06-30 2009-08-20 18:00:00 6 1.798739e+10 1.798739e+10 NaN
6 000001.XSHE 2009-10-29 2009-09-30 2009-09-30 2009-10-28 18:00:00 9 1.908844e+10 1.908844e+10 NaN
7 000001.XSHE 2010-03-12 2009-12-31 2009-12-31 2010-03-11 18:00:00 12 2.046961e+10 2.046961e+10 NaN
... ... ... ... ... ... ... ... ... ...
229420 900957.XSHG 2023-04-08 2022-12-31 2022-12-31 2023-04-07 15:38:50 12 5.669258e+08 5.660700e+08 855788.18
229421 900957.XSHG 2023-04-27 2023-03-31 2023-03-31 2023-04-26 18:14:09 3 5.756460e+08 5.747912e+08 854765.57
229422 900957.XSHG 2023-08-08 2023-06-30 2023-06-30 2023-08-07 15:32:40 6 5.862225e+08 5.853687e+08 853798.86
229423 900957.XSHG 2023-10-28 2023-09-30 2023-09-30 2023-10-27 15:36:39 9 5.983664e+08 5.975140e+08 852427.19
229424 900957.XSHG 2024-04-12 2023-12-31 2023-12-31 2024-04-11 15:45:08 12 6.071647e+08 6.063475e+08 817154.47
229425 900957.XSHG 2024-04-26 2024-03-31 2024-03-31 2024-04-25 15:32:41 3 6.072715e+08 6.064550e+08 816561.17
229426 900957.XSHG 2024-08-26 2024-06-30 2024-06-30 2024-08-25 15:31:32 6 6.127041e+08 6.118875e+08 816562.93
229427 900957.XSHG 2024-10-29 2024-09-30 2024-09-30 2024-10-28 15:32:41 9 6.156863e+08 6.148702e+08 816129.15

229428 rows × 9 columns

In [156]:
fundmen_df['secID'].str[-4:].unique()
Out[156]:
array(['XSHE', 'SHE2', 'XSHG'], dtype=object)
In [157]:
fundmen_df.loc[fundmen_df['secID'].str.endswith('SHE2')]
Out[157]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
2527 000043.XSHE2 2008-04-22 2008-03-31 2008-03-31 2008-04-21 18:00:00 3 1.754624e+09 1.497749e+09 2.568750e+08
2528 000043.XSHE2 2008-08-19 2008-06-30 2008-06-30 2008-08-18 18:00:00 6 1.660061e+09 1.441032e+09 2.190291e+08
2529 000043.XSHE2 2008-10-27 2008-09-30 2008-09-30 2008-10-26 18:00:00 9 1.797560e+09 1.429033e+09 3.685271e+08
2530 000043.XSHE2 2009-04-10 2008-12-31 2008-12-31 2009-04-09 18:00:00 12 1.743831e+09 1.446562e+09 2.972697e+08
2531 000043.XSHE2 2009-04-28 2009-03-31 2009-03-31 2009-04-27 18:00:00 3 2.031279e+09 1.673443e+09 3.578353e+08
2532 000043.XSHE2 2009-07-23 2009-06-30 2009-06-30 2009-07-22 18:00:00 6 2.048404e+09 1.679233e+09 3.691707e+08
2533 000043.XSHE2 2009-10-27 2009-09-30 2009-09-30 2009-10-26 18:00:00 9 2.041002e+09 1.669816e+09 3.711865e+08
2534 000043.XSHE2 2010-03-16 2009-12-31 2009-12-31 2010-03-15 18:00:00 12 2.077269e+09 1.716617e+09 3.606526e+08
... ... ... ... ... ... ... ... ... ...
2587 000043.XSHE2 2023-04-22 2023-03-31 2023-03-31 2023-04-21 18:09:31 3 9.503130e+09 9.334366e+09 1.687640e+08
2588 000043.XSHE2 2023-08-25 2023-06-30 2023-06-30 2023-08-24 16:04:57 6 9.746350e+09 9.569769e+09 1.765801e+08
2589 000043.XSHE2 2023-10-27 2023-09-30 2023-09-30 2023-10-26 18:24:59 9 9.809555e+09 9.618211e+09 1.913436e+08
2590 000043.XSHE2 2024-03-16 2023-12-31 2023-12-31 2024-03-15 20:29:31 12 9.910374e+09 9.759359e+09 1.510153e+08
2591 000043.XSHE2 2024-04-26 2024-03-31 2024-03-31 2024-04-25 19:42:48 3 1.011193e+10 9.950632e+09 1.612996e+08
2592 000043.XSHE2 2024-08-30 2024-06-30 2024-06-30 2024-08-29 19:28:21 6 1.018513e+10 1.001503e+10 1.701023e+08
2593 000043.XSHE2 2024-10-31 2024-09-30 2024-09-30 2024-10-30 19:42:35 9 1.036198e+10 1.020686e+10 1.551216e+08
2594 000043.XSHE2 2025-03-17 2024-12-31 2024-12-31 2025-03-16 15:32:30 12 1.057032e+10 1.041897e+10 1.513532e+08

68 rows × 9 columns

In [158]:
DataAPI.FdmtBSGet(secID='000043.XSHE2',beginDate=START,endDate=END,publishDateEnd=u"",publishDateBegin=u"",endDateRep="",beginDateRep="",beginYear="",endYear="",fiscalPeriod="",field=["secID","secShortName","publishDate","endDate","endDateRep","actPubtime","fiscalPeriod","TShEquity","TEquityAttrP","minorityInt"],pandas="1")
Out[158]:
secID secShortName publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt
0 000043.XSHE2 中航善达 2025-03-17 2024-12-31 2024-12-31 2025-03-16 15:32:30 12 1.057032e+10 1.041897e+10 1.513532e+08
1 000043.XSHE2 中航善达 2024-10-31 2024-09-30 2024-09-30 2024-10-30 19:42:35 9 1.036198e+10 1.020686e+10 1.551216e+08
2 000043.XSHE2 中航善达 2024-08-30 2024-06-30 2024-06-30 2024-08-29 19:28:21 6 1.018513e+10 1.001503e+10 1.701023e+08
3 000043.XSHE2 中航善达 2024-04-26 2024-03-31 2024-03-31 2024-04-25 19:42:48 3 1.011193e+10 9.950632e+09 1.612996e+08
4 000043.XSHE2 中航善达 2025-03-17 2023-12-31 2024-12-31 2025-03-16 15:32:30 12 9.910374e+09 9.759359e+09 1.510153e+08
5 000043.XSHE2 中航善达 2024-10-31 2023-12-31 2024-09-30 2024-10-30 19:42:35 12 9.910374e+09 9.759359e+09 1.510153e+08
6 000043.XSHE2 中航善达 2024-08-30 2023-12-31 2024-06-30 2024-08-29 19:28:21 12 9.910374e+09 9.759359e+09 1.510153e+08
7 000043.XSHE2 中航善达 2024-04-26 2023-12-31 2024-03-31 2024-04-25 19:42:48 12 9.910374e+09 9.759359e+09 1.510153e+08
... ... ... ... ... ... ... ... ... ... ...
124 000043.XSHE2 中航善达 2010-03-16 2008-12-31 2009-12-31 2010-03-15 18:00:00 12 2.014681e+09 1.697758e+09 3.169230e+08
125 000043.XSHE2 中航善达 2009-10-27 2008-12-31 2009-09-30 2009-10-26 18:00:00 12 2.014681e+09 1.697758e+09 3.169230e+08
126 000043.XSHE2 中航善达 2009-07-23 2008-12-31 2009-06-30 2009-07-22 18:00:00 12 2.014681e+09 1.697758e+09 3.169230e+08
127 000043.XSHE2 中航善达 2009-04-28 2008-12-31 2009-03-31 2009-04-27 18:00:00 12 2.014681e+09 1.697758e+09 3.169230e+08
128 000043.XSHE2 中航善达 2009-04-10 2008-12-31 2008-12-31 2009-04-09 18:00:00 12 1.743831e+09 1.446562e+09 2.972697e+08
129 000043.XSHE2 中航善达 2008-10-27 2008-09-30 2008-09-30 2008-10-26 18:00:00 9 1.797560e+09 1.429033e+09 3.685271e+08
130 000043.XSHE2 中航善达 2008-08-19 2008-06-30 2008-06-30 2008-08-18 18:00:00 6 1.660061e+09 1.441032e+09 2.190291e+08
131 000043.XSHE2 中航善达 2008-04-22 2008-03-31 2008-03-31 2008-04-21 18:00:00 3 1.754624e+09 1.497749e+09 2.568750e+08

132 rows × 10 columns

In [159]:
fundmen_df = fundmen_df[(fundmen_df['secID'].str.endswith('XSHE')) | (fundmen_df['secID'].str.endswith('XSHG'))]
In [160]:
# # minorityInt 有时报告,有时不报告。空值时,假设就是上一次报告的值
# # fundmen_df['minorityInt'] = fundmen_df.groupby('secID')['minorityInt'].fillna(method='ffill')
# # 第一轮填完空值为有效数值后,剩下的空值再用0填充。
# fundmen_df['minorityInt'].fillna(0,inplace=True)
  • 假设是上一次报告的值可能出现误差,因为股权变动了(注意ffill的方法)
  • 直接用TEquityAttrP
In [161]:
# fundmen_df['book'] = fundmen_df['TShEquity'] - fundmen_df['minorityInt']
fundmen_df['book'] = fundmen_df['TEquityAttrP']
In [162]:
fundmen_df
Out[162]:
secID publishDate endDate endDateRep actPubtime fiscalPeriod TShEquity TEquityAttrP minorityInt book
0 000001.XSHE 2008-04-24 2008-03-31 2008-03-31 2008-04-23 18:00:00 3 1.404138e+10 1.404138e+10 NaN 1.404138e+10
1 000001.XSHE 2008-08-21 2008-06-30 2008-06-30 2008-08-20 18:00:00 6 1.694330e+10 1.694330e+10 NaN 1.694330e+10
2 000001.XSHE 2008-10-24 2008-09-30 2008-09-30 2008-10-23 18:00:00 9 1.837466e+10 1.837466e+10 NaN 1.837466e+10
3 000001.XSHE 2009-03-20 2008-12-31 2008-12-31 2009-03-19 18:00:00 12 1.640079e+10 1.640079e+10 NaN 1.640079e+10
4 000001.XSHE 2009-04-24 2009-03-31 2009-03-31 2009-04-23 18:00:00 3 1.707213e+10 1.707213e+10 NaN 1.707213e+10
5 000001.XSHE 2009-08-21 2009-06-30 2009-06-30 2009-08-20 18:00:00 6 1.798739e+10 1.798739e+10 NaN 1.798739e+10
6 000001.XSHE 2009-10-29 2009-09-30 2009-09-30 2009-10-28 18:00:00 9 1.908844e+10 1.908844e+10 NaN 1.908844e+10
7 000001.XSHE 2010-03-12 2009-12-31 2009-12-31 2010-03-11 18:00:00 12 2.046961e+10 2.046961e+10 NaN 2.046961e+10
... ... ... ... ... ... ... ... ... ... ...
229420 900957.XSHG 2023-04-08 2022-12-31 2022-12-31 2023-04-07 15:38:50 12 5.669258e+08 5.660700e+08 855788.18 5.660700e+08
229421 900957.XSHG 2023-04-27 2023-03-31 2023-03-31 2023-04-26 18:14:09 3 5.756460e+08 5.747912e+08 854765.57 5.747912e+08
229422 900957.XSHG 2023-08-08 2023-06-30 2023-06-30 2023-08-07 15:32:40 6 5.862225e+08 5.853687e+08 853798.86 5.853687e+08
229423 900957.XSHG 2023-10-28 2023-09-30 2023-09-30 2023-10-27 15:36:39 9 5.983664e+08 5.975140e+08 852427.19 5.975140e+08
229424 900957.XSHG 2024-04-12 2023-12-31 2023-12-31 2024-04-11 15:45:08 12 6.071647e+08 6.063475e+08 817154.47 6.063475e+08
229425 900957.XSHG 2024-04-26 2024-03-31 2024-03-31 2024-04-25 15:32:41 3 6.072715e+08 6.064550e+08 816561.17 6.064550e+08
229426 900957.XSHG 2024-08-26 2024-06-30 2024-06-30 2024-08-25 15:31:32 6 6.127041e+08 6.118875e+08 816562.93 6.118875e+08
229427 900957.XSHG 2024-10-29 2024-09-30 2024-09-30 2024-10-28 15:32:41 9 6.156863e+08 6.148702e+08 816129.15 6.148702e+08

229360 rows × 10 columns

  • 考虑到报表公布时间可能在当天收市以后,以及报表解读可能需要时间,把publishDate往后加1
  • publishDate和 tradeDate merge,
  • publishDate可能是非交易日,所以merge时要outer,左右表格都不丢观测值。
  • 接着要把tradeDate为空值的设置为publishDate,便于排序,方便下面填充
  • 然后按照secID和tradeDate sort,因为publishDate非交易日的被放到merge表格的最后了。
  • 接着 book 空值由上面第一个非空值填充(当时已知的最新的book value)
  • 再把 na 都丢弃即可
In [163]:
# fundmen_df['publishDate+1'] = fundmen_df['publishDate'] + dt.timedelta(days=1)
In [164]:
stk_fundmen_df = pd.merge(stk_df, fundmen_df[['secID','publishDate','endDate','book']], 
                          left_on=['secID','tradeDate'], right_on=['secID','publishDate'],
                          how='outer')
In [165]:
stk_fundmen_df
Out[165]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
0 000001.XSHE 2008-01-02 1201.877 6.672404e+10 2008-01 NaT NaT NaN
1 000001.XSHE 2008-01-03 1181.940 6.561725e+10 2008-01 NaT NaT NaN
2 000001.XSHE 2008-01-04 1214.218 6.740920e+10 2008-01 NaT NaT NaN
3 000001.XSHE 2008-01-07 1239.534 6.881466e+10 2008-01 NaT NaT NaN
4 000001.XSHE 2008-01-08 1278.141 7.095798e+10 2008-01 NaT NaT NaN
5 000001.XSHE 2008-01-09 1300.926 7.222289e+10 2008-01 NaT NaT NaN
6 000001.XSHE 2008-01-10 1327.507 7.369862e+10 2008-01 NaT NaT NaN
7 000001.XSHE 2008-01-11 1394.911 7.744065e+10 2008-01 NaT NaT NaN
... ... ... ... ... ... ... ... ...
13288440 900957.XSHG NaT NaN NaN NaT 2016-08-06 2016-06-30 3.906354e+08
13288441 900957.XSHG NaT NaN NaN NaT 2017-03-25 2016-12-31 3.930721e+08
13288442 900957.XSHG NaT NaN NaN NaT 2019-03-30 2018-12-31 4.508051e+08
13288443 900957.XSHG NaT NaN NaN NaT 2019-08-10 2019-06-30 4.618426e+08
13288444 900957.XSHG NaT NaN NaN NaT 2020-04-25 2019-12-31 4.761021e+08
13288445 900957.XSHG NaT NaN NaN NaT 2022-04-30 2022-03-31 5.333509e+08
13288446 900957.XSHG NaT NaN NaN NaT 2023-04-08 2022-12-31 5.660700e+08
13288447 900957.XSHG NaT NaN NaN NaT 2023-10-28 2023-09-30 5.975140e+08

13288448 rows × 8 columns

In [166]:
stk_fundmen_df.loc[(stk_fundmen_df['secID']=='900957.XSHG')&(stk_fundmen_df['tradeDate']<='2016-08-10')&(stk_fundmen_df['tradeDate']>='2016-08-03')]
Out[166]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
13219683 900957.XSHG 2016-08-03 1.299 237176000.0 2016-08 NaT NaT NaN
13219684 900957.XSHG 2016-08-04 1.334 243432000.0 2016-08 NaT NaT NaN
13219685 900957.XSHG 2016-08-05 1.328 242512000.0 2016-08 NaT NaT NaN
13219686 900957.XSHG 2016-08-08 1.328 242512000.0 2016-08 NaT NaT NaN
13219687 900957.XSHG 2016-08-09 1.334 243432000.0 2016-08 NaT NaT NaN
13219688 900957.XSHG 2016-08-10 1.323 241592000.0 2016-08 NaT NaT NaN
In [168]:
stk_fundmen_df.loc[(stk_fundmen_df['secID']=='000001.XSHE')&(stk_fundmen_df['tradeDate']<='2009')&(stk_fundmen_df['tradeDate']>='2008-08-10')]
Out[168]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
149 000001.XSHE 2008-08-11 610.748 4.134069e+10 2008-08 NaT NaT NaN
150 000001.XSHE 2008-08-12 610.748 4.134069e+10 2008-08 NaT NaT NaN
151 000001.XSHE 2008-08-13 610.432 4.131927e+10 2008-08 NaT NaT NaN
152 000001.XSHE 2008-08-14 610.432 4.131927e+10 2008-08 NaT NaT NaN
153 000001.XSHE 2008-08-15 620.242 4.198329e+10 2008-08 NaT NaT NaN
154 000001.XSHE 2008-08-18 602.204 4.076234e+10 2008-08 NaT NaT NaN
155 000001.XSHE 2008-08-19 609.166 4.123358e+10 2008-08 NaT NaT NaN
156 000001.XSHE 2008-08-20 659.798 4.466079e+10 2008-08 NaT NaT NaN
... ... ... ... ... ... ... ... ...
238 000001.XSHE 2008-12-22 424.985 2.868144e+10 2008-12 NaT NaT NaN
239 000001.XSHE 2008-12-23 420.034 2.834729e+10 2008-12 NaT NaT NaN
240 000001.XSHE 2008-12-24 417.558 2.818021e+10 2008-12 NaT NaT NaN
241 000001.XSHE 2008-12-25 403.530 2.723344e+10 2008-12 NaT NaT NaN
242 000001.XSHE 2008-12-26 401.054 2.706637e+10 2008-12 NaT NaT NaN
243 000001.XSHE 2008-12-29 390.739 2.637022e+10 2008-12 NaT NaT NaN
244 000001.XSHE 2008-12-30 391.977 2.645375e+10 2008-12 NaT NaT NaN
245 000001.XSHE 2008-12-31 390.326 2.634237e+10 2008-12 NaT NaT NaN

97 rows × 8 columns

In [169]:
stk_fundmen_df.loc[stk_fundmen_df['tradeDate'].isna()]
Out[169]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
13221779 000001.XSHE NaT NaN NaN NaT 2017-04-22 2017-03-31 2.077390e+11
13221780 000001.XSHE NaT NaN NaN NaT 2017-10-21 2017-09-30 2.181110e+11
13221781 000001.XSHE NaT NaN NaN NaT 2024-04-20 2024-03-31 4.855760e+11
13221782 000001.XSHE NaT NaN NaN NaT 2024-10-19 2024-09-30 4.904690e+11
13221783 000001.XSHE NaT NaN NaN NaT 2025-03-15 2024-12-31 4.948420e+11
13221784 000002.XSHE NaT NaN NaN NaT 2022-10-29 2022-09-30 2.411070e+11
13221785 000002.XSHE NaT NaN NaN NaT 2023-04-29 2023-03-31 2.460123e+11
13221786 000002.XSHE NaT NaN NaN NaT 2023-10-28 2023-09-30 2.529078e+11
... ... ... ... ... ... ... ... ...
13288440 900957.XSHG NaT NaN NaN NaT 2016-08-06 2016-06-30 3.906354e+08
13288441 900957.XSHG NaT NaN NaN NaT 2017-03-25 2016-12-31 3.930721e+08
13288442 900957.XSHG NaT NaN NaN NaT 2019-03-30 2018-12-31 4.508051e+08
13288443 900957.XSHG NaT NaN NaN NaT 2019-08-10 2019-06-30 4.618426e+08
13288444 900957.XSHG NaT NaN NaN NaT 2020-04-25 2019-12-31 4.761021e+08
13288445 900957.XSHG NaT NaN NaN NaT 2022-04-30 2022-03-31 5.333509e+08
13288446 900957.XSHG NaT NaN NaN NaT 2023-04-08 2022-12-31 5.660700e+08
13288447 900957.XSHG NaT NaN NaN NaT 2023-10-28 2023-09-30 5.975140e+08

66669 rows × 8 columns

In [170]:
print(stk_fundmen_df.loc[(stk_fundmen_df['secID']=='000001.XSHE')&(stk_fundmen_df['ym']=='2017-04')].to_string())
            secID  tradeDate  closePrice  negMarketValue       ym publishDate endDate  book
2249  000001.XSHE 2017-04-05    1108.473    1.558147e+11  2017-04         NaT     NaT   NaN
2250  000001.XSHE 2017-04-06    1107.270    1.556455e+11  2017-04         NaT     NaT   NaN
2251  000001.XSHE 2017-04-07    1107.270    1.556455e+11  2017-04         NaT     NaT   NaN
2252  000001.XSHE 2017-04-10    1104.863    1.553071e+11  2017-04         NaT     NaT   NaN
2253  000001.XSHE 2017-04-11    1101.252    1.547996e+11  2017-04         NaT     NaT   NaN
2254  000001.XSHE 2017-04-12    1097.641    1.542921e+11  2017-04         NaT     NaT   NaN
2255  000001.XSHE 2017-04-13    1097.641    1.542921e+11  2017-04         NaT     NaT   NaN
2256  000001.XSHE 2017-04-14    1092.827    1.536153e+11  2017-04         NaT     NaT   NaN
2257  000001.XSHE 2017-04-17    1095.234    1.539537e+11  2017-04         NaT     NaT   NaN
2258  000001.XSHE 2017-04-18    1089.216    1.531078e+11  2017-04         NaT     NaT   NaN
2259  000001.XSHE 2017-04-19    1072.367    1.507393e+11  2017-04         NaT     NaT   NaN
2260  000001.XSHE 2017-04-20    1073.570    1.509085e+11  2017-04         NaT     NaT   NaN
2261  000001.XSHE 2017-04-21    1079.588    1.517544e+11  2017-04         NaT     NaT   NaN
2262  000001.XSHE 2017-04-24    1074.774    1.510776e+11  2017-04         NaT     NaT   NaN
2263  000001.XSHE 2017-04-25    1083.199    1.522619e+11  2017-04         NaT     NaT   NaN
2264  000001.XSHE 2017-04-26    1081.995    1.520927e+11  2017-04         NaT     NaT   NaN
2265  000001.XSHE 2017-04-27    1079.588    1.517544e+11  2017-04         NaT     NaT   NaN
2266  000001.XSHE 2017-04-28    1081.995    1.520927e+11  2017-04         NaT     NaT   NaN
In [171]:
idx = stk_fundmen_df.loc[stk_fundmen_df['tradeDate'].isna()].index
In [172]:
stk_fundmen_df.loc[stk_fundmen_df['tradeDate'].isna(),'tradeDate'] = stk_fundmen_df.loc[stk_fundmen_df['tradeDate'].isna(),'publishDate']
In [173]:
stk_fundmen_df.loc[idx]
Out[173]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
13221779 000001.XSHE 2017-04-22 NaN NaN NaT 2017-04-22 2017-03-31 2.077390e+11
13221780 000001.XSHE 2017-10-21 NaN NaN NaT 2017-10-21 2017-09-30 2.181110e+11
13221781 000001.XSHE 2024-04-20 NaN NaN NaT 2024-04-20 2024-03-31 4.855760e+11
13221782 000001.XSHE 2024-10-19 NaN NaN NaT 2024-10-19 2024-09-30 4.904690e+11
13221783 000001.XSHE 2025-03-15 NaN NaN NaT 2025-03-15 2024-12-31 4.948420e+11
13221784 000002.XSHE 2022-10-29 NaN NaN NaT 2022-10-29 2022-09-30 2.411070e+11
13221785 000002.XSHE 2023-04-29 NaN NaN NaT 2023-04-29 2023-03-31 2.460123e+11
13221786 000002.XSHE 2023-10-28 NaN NaN NaT 2023-10-28 2023-09-30 2.529078e+11
... ... ... ... ... ... ... ... ...
13288440 900957.XSHG 2016-08-06 NaN NaN NaT 2016-08-06 2016-06-30 3.906354e+08
13288441 900957.XSHG 2017-03-25 NaN NaN NaT 2017-03-25 2016-12-31 3.930721e+08
13288442 900957.XSHG 2019-03-30 NaN NaN NaT 2019-03-30 2018-12-31 4.508051e+08
13288443 900957.XSHG 2019-08-10 NaN NaN NaT 2019-08-10 2019-06-30 4.618426e+08
13288444 900957.XSHG 2020-04-25 NaN NaN NaT 2020-04-25 2019-12-31 4.761021e+08
13288445 900957.XSHG 2022-04-30 NaN NaN NaT 2022-04-30 2022-03-31 5.333509e+08
13288446 900957.XSHG 2023-04-08 NaN NaN NaT 2023-04-08 2022-12-31 5.660700e+08
13288447 900957.XSHG 2023-10-28 NaN NaN NaT 2023-10-28 2023-09-30 5.975140e+08

66669 rows × 8 columns

In [174]:
print(stk_fundmen_df.loc[(stk_fundmen_df['secID']=='000001.XSHE')&(stk_fundmen_df['tradeDate']<'2017-05')
                         &(stk_fundmen_df['tradeDate']>='2017-04')].to_string())
                secID  tradeDate  closePrice  negMarketValue       ym publishDate    endDate          book
2249      000001.XSHE 2017-04-05    1108.473    1.558147e+11  2017-04         NaT        NaT           NaN
2250      000001.XSHE 2017-04-06    1107.270    1.556455e+11  2017-04         NaT        NaT           NaN
2251      000001.XSHE 2017-04-07    1107.270    1.556455e+11  2017-04         NaT        NaT           NaN
2252      000001.XSHE 2017-04-10    1104.863    1.553071e+11  2017-04         NaT        NaT           NaN
2253      000001.XSHE 2017-04-11    1101.252    1.547996e+11  2017-04         NaT        NaT           NaN
2254      000001.XSHE 2017-04-12    1097.641    1.542921e+11  2017-04         NaT        NaT           NaN
2255      000001.XSHE 2017-04-13    1097.641    1.542921e+11  2017-04         NaT        NaT           NaN
2256      000001.XSHE 2017-04-14    1092.827    1.536153e+11  2017-04         NaT        NaT           NaN
2257      000001.XSHE 2017-04-17    1095.234    1.539537e+11  2017-04         NaT        NaT           NaN
2258      000001.XSHE 2017-04-18    1089.216    1.531078e+11  2017-04         NaT        NaT           NaN
2259      000001.XSHE 2017-04-19    1072.367    1.507393e+11  2017-04         NaT        NaT           NaN
2260      000001.XSHE 2017-04-20    1073.570    1.509085e+11  2017-04         NaT        NaT           NaN
2261      000001.XSHE 2017-04-21    1079.588    1.517544e+11  2017-04         NaT        NaT           NaN
2262      000001.XSHE 2017-04-24    1074.774    1.510776e+11  2017-04         NaT        NaT           NaN
2263      000001.XSHE 2017-04-25    1083.199    1.522619e+11  2017-04         NaT        NaT           NaN
2264      000001.XSHE 2017-04-26    1081.995    1.520927e+11  2017-04         NaT        NaT           NaN
2265      000001.XSHE 2017-04-27    1079.588    1.517544e+11  2017-04         NaT        NaT           NaN
2266      000001.XSHE 2017-04-28    1081.995    1.520927e+11  2017-04         NaT        NaT           NaN
13221779  000001.XSHE 2017-04-22         NaN             NaN      NaT  2017-04-22 2017-03-31  2.077390e+11
In [175]:
stk_fundmen_df
Out[175]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
0 000001.XSHE 2008-01-02 1201.877 6.672404e+10 2008-01 NaT NaT NaN
1 000001.XSHE 2008-01-03 1181.940 6.561725e+10 2008-01 NaT NaT NaN
2 000001.XSHE 2008-01-04 1214.218 6.740920e+10 2008-01 NaT NaT NaN
3 000001.XSHE 2008-01-07 1239.534 6.881466e+10 2008-01 NaT NaT NaN
4 000001.XSHE 2008-01-08 1278.141 7.095798e+10 2008-01 NaT NaT NaN
5 000001.XSHE 2008-01-09 1300.926 7.222289e+10 2008-01 NaT NaT NaN
6 000001.XSHE 2008-01-10 1327.507 7.369862e+10 2008-01 NaT NaT NaN
7 000001.XSHE 2008-01-11 1394.911 7.744065e+10 2008-01 NaT NaT NaN
... ... ... ... ... ... ... ... ...
13288440 900957.XSHG 2016-08-06 NaN NaN NaT 2016-08-06 2016-06-30 3.906354e+08
13288441 900957.XSHG 2017-03-25 NaN NaN NaT 2017-03-25 2016-12-31 3.930721e+08
13288442 900957.XSHG 2019-03-30 NaN NaN NaT 2019-03-30 2018-12-31 4.508051e+08
13288443 900957.XSHG 2019-08-10 NaN NaN NaT 2019-08-10 2019-06-30 4.618426e+08
13288444 900957.XSHG 2020-04-25 NaN NaN NaT 2020-04-25 2019-12-31 4.761021e+08
13288445 900957.XSHG 2022-04-30 NaN NaN NaT 2022-04-30 2022-03-31 5.333509e+08
13288446 900957.XSHG 2023-04-08 NaN NaN NaT 2023-04-08 2022-12-31 5.660700e+08
13288447 900957.XSHG 2023-10-28 NaN NaN NaT 2023-10-28 2023-09-30 5.975140e+08

13288448 rows × 8 columns

In [176]:
stk_fundmen_df.sort_values(['secID','tradeDate'],inplace=True)
In [177]:
stk_fundmen_df
Out[177]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
0 000001.XSHE 2008-01-02 1201.877 6.672404e+10 2008-01 NaT NaT NaN
1 000001.XSHE 2008-01-03 1181.940 6.561725e+10 2008-01 NaT NaT NaN
2 000001.XSHE 2008-01-04 1214.218 6.740920e+10 2008-01 NaT NaT NaN
3 000001.XSHE 2008-01-07 1239.534 6.881466e+10 2008-01 NaT NaT NaN
4 000001.XSHE 2008-01-08 1278.141 7.095798e+10 2008-01 NaT NaT NaN
5 000001.XSHE 2008-01-09 1300.926 7.222289e+10 2008-01 NaT NaT NaN
6 000001.XSHE 2008-01-10 1327.507 7.369862e+10 2008-01 NaT NaT NaN
7 000001.XSHE 2008-01-11 1394.911 7.744065e+10 2008-01 NaT NaT NaN
... ... ... ... ... ... ... ... ...
13221771 900957.XSHG 2025-03-13 0.290 5.299200e+07 2025-03 NaT NaT NaN
13221772 900957.XSHG 2025-03-14 0.310 5.667200e+07 2025-03 NaT NaT NaN
13221773 900957.XSHG 2025-03-17 0.319 5.814400e+07 2025-03 NaT NaT NaN
13221774 900957.XSHG 2025-03-18 0.320 5.832800e+07 2025-03 NaT NaT NaN
13221775 900957.XSHG 2025-03-19 0.323 5.888000e+07 2025-03 NaT NaT NaN
13221776 900957.XSHG 2025-03-20 0.315 5.759200e+07 2025-03 NaT NaT NaN
13221777 900957.XSHG 2025-03-21 0.315 5.759200e+07 2025-03 NaT NaT NaN
13221778 900957.XSHG 2025-03-24 0.310 5.667200e+07 2025-03 NaT NaT NaN

13288448 rows × 8 columns

In [178]:
temp = stk_fundmen_df[stk_fundmen_df['secID']=='000001.XSHE'].copy()
temp
Out[178]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
0 000001.XSHE 2008-01-02 1201.877 6.672404e+10 2008-01 NaT NaT NaN
1 000001.XSHE 2008-01-03 1181.940 6.561725e+10 2008-01 NaT NaT NaN
2 000001.XSHE 2008-01-04 1214.218 6.740920e+10 2008-01 NaT NaT NaN
3 000001.XSHE 2008-01-07 1239.534 6.881466e+10 2008-01 NaT NaT NaN
4 000001.XSHE 2008-01-08 1278.141 7.095798e+10 2008-01 NaT NaT NaN
5 000001.XSHE 2008-01-09 1300.926 7.222289e+10 2008-01 NaT NaT NaN
6 000001.XSHE 2008-01-10 1327.507 7.369862e+10 2008-01 NaT NaT NaN
7 000001.XSHE 2008-01-11 1394.911 7.744065e+10 2008-01 NaT NaT NaN
... ... ... ... ... ... ... ... ...
4178 000001.XSHE 2025-03-14 1756.263 2.322852e+11 2025-03 NaT NaT NaN
13221783 000001.XSHE 2025-03-15 NaN NaN NaT 2025-03-15 2024-12-31 4.948420e+11
4179 000001.XSHE 2025-03-17 1687.304 2.231641e+11 2025-03 NaT NaT NaN
4180 000001.XSHE 2025-03-18 1685.837 2.229700e+11 2025-03 NaT NaT NaN
4181 000001.XSHE 2025-03-19 1690.238 2.235522e+11 2025-03 NaT NaT NaN
4182 000001.XSHE 2025-03-20 1685.837 2.229700e+11 2025-03 NaT NaT NaN
4183 000001.XSHE 2025-03-21 1675.566 2.216116e+11 2025-03 NaT NaT NaN
4184 000001.XSHE 2025-03-24 1669.697 2.208354e+11 2025-03 NaT NaT NaN

4190 rows × 8 columns

In [179]:
temp[~temp['book'].isna()]
Out[179]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
75 000001.XSHE 2008-04-24 869.921 4.829500e+10 2008-04 2008-04-24 2008-03-31 1.404138e+10
157 000001.XSHE 2008-08-21 639.545 4.328991e+10 2008-08 2008-08-21 2008-06-30 1.694330e+10
197 000001.XSHE 2008-10-24 380.689 2.576831e+10 2008-10 2008-10-24 2008-09-30 1.837466e+10
295 000001.XSHE 2009-03-20 632.527 4.268801e+10 2009-03 2009-03-20 2008-12-31 1.640079e+10
319 000001.XSHE 2009-04-24 632.527 4.268801e+10 2009-04 2009-04-24 2009-03-31 1.707213e+10
401 000001.XSHE 2009-08-21 910.624 6.452731e+10 2009-08 2009-08-21 2009-06-30 1.798739e+10
444 000001.XSHE 2009-10-29 916.400 6.494458e+10 2009-10 2009-10-29 2009-09-30 1.908844e+10
534 000001.XSHE 2010-03-12 945.695 6.702070e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
... ... ... ... ... ... ... ... ...
3723 000001.XSHE 2023-04-25 1606.511 2.383001e+11 2023-04 2023-04-25 2023-03-31 4.467450e+11
3805 000001.XSHE 2023-08-24 1492.196 2.159837e+11 2023-08 2023-08-24 2023-06-30 4.520730e+11
3843 000001.XSHE 2023-10-25 1391.644 2.014296e+11 2023-10 2023-10-25 2023-09-30 4.658600e+11
3938 000001.XSHE 2024-03-15 1421.139 2.056988e+11 2024-03 2024-03-15 2023-12-31 4.723280e+11
13221781 000001.XSHE 2024-04-20 NaN NaN NaT 2024-04-20 2024-03-31 4.855760e+11
4042 000001.XSHE 2024-08-16 1454.991 1.965789e+11 2024-08 2024-08-16 2024-06-30 4.818690e+11
13221782 000001.XSHE 2024-10-19 NaN NaN NaT 2024-10-19 2024-09-30 4.904690e+11
13221783 000001.XSHE 2025-03-15 NaN NaN NaT 2025-03-15 2024-12-31 4.948420e+11

68 rows × 8 columns

In [180]:
stk_fundmen_df[['secID','publishDate','endDate','book']].groupby('secID').fillna(method='ffill')
Out[180]:
publishDate endDate book
0 NaT NaT NaN
1 NaT NaT NaN
2 NaT NaT NaN
3 NaT NaT NaN
4 NaT NaT NaN
5 NaT NaT NaN
6 NaT NaT NaN
7 NaT NaT NaN
... ... ... ...
13221771 2024-10-29 2024-09-30 6.148702e+08
13221772 2024-10-29 2024-09-30 6.148702e+08
13221773 2024-10-29 2024-09-30 6.148702e+08
13221774 2024-10-29 2024-09-30 6.148702e+08
13221775 2024-10-29 2024-09-30 6.148702e+08
13221776 2024-10-29 2024-09-30 6.148702e+08
13221777 2024-10-29 2024-09-30 6.148702e+08
13221778 2024-10-29 2024-09-30 6.148702e+08

13288448 rows × 3 columns

In [181]:
stk_fundmen_df.shape
Out[181]:
(13288448, 8)
In [182]:
stk_fundmen_df.loc[:,['publishDate','endDate','book']] = stk_fundmen_df[['secID','publishDate','endDate','book']].groupby('secID').fillna(method='ffill')
In [183]:
## 查看数据
idx = pd.IndexSlice
stk_fundmen_df.set_index(['secID','tradeDate'],inplace=True)
pd.options.display.max_rows = 20
In [184]:
stk_fundmen_df.loc[idx['000001.XSHE','2010-03'],:]
Out[184]:
closePrice negMarketValue ym publishDate endDate book
secID tradeDate
000001.XSHE 2010-03-01 926.303 6.564637e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-02 953.535 6.757628e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-03 961.374 6.813186e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-04 953.122 6.754704e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-05 960.137 6.804414e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-08 984.068 6.974013e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-09 982.830 6.965240e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-10 965.500 6.842427e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-11 976.228 6.918454e+10 2010-03 2009-10-29 2009-09-30 1.908844e+10
2010-03-12 945.695 6.702070e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
... ... ... ... ... ... ...
2010-03-18 951.059 6.740083e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-19 950.647 6.737159e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-22 955.185 6.769325e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-23 941.982 6.675753e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-24 940.744 6.666981e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-25 920.114 6.520775e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-26 941.157 6.669905e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-29 972.102 6.889213e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-30 975.816 6.915530e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10
2010-03-31 957.248 6.783945e+10 2010-03 2010-03-12 2009-12-31 2.046961e+10

23 rows × 6 columns

In [187]:
stk_fundmen_df.loc[idx['000001.XSHE','2008-04'],:]
Out[187]:
closePrice negMarketValue ym publishDate endDate book
secID tradeDate
000001.XSHE 2008-04-01 883.212 4.903286e+10 2008-04 NaT NaT NaN
2008-04-02 914.540 5.077211e+10 2008-04 NaT NaT NaN
2008-04-03 925.932 5.140457e+10 2008-04 NaT NaT NaN
2008-04-07 949.350 5.270462e+10 2008-04 NaT NaT NaN
2008-04-08 938.907 5.212487e+10 2008-04 NaT NaT NaN
2008-04-09 877.516 4.871663e+10 2008-04 NaT NaT NaN
2008-04-10 887.642 4.927882e+10 2008-04 NaT NaT NaN
2008-04-11 901.882 5.006939e+10 2008-04 NaT NaT NaN
2008-04-14 811.694 4.506245e+10 2008-04 NaT NaT NaN
2008-04-15 810.112 4.497461e+10 2008-04 NaT NaT NaN
... ... ... ... ... ... ...
2008-04-17 733.214 4.070553e+10 2008-04 NaT NaT NaN
2008-04-18 723.405 4.016092e+10 2008-04 NaT NaT NaN
2008-04-21 709.164 3.937035e+10 2008-04 NaT NaT NaN
2008-04-22 745.240 4.137312e+10 2008-04 NaT NaT NaN
2008-04-23 790.808 4.390295e+10 2008-04 NaT NaT NaN
2008-04-24 869.921 4.829500e+10 2008-04 2008-04-24 2008-03-31 1.404138e+10
2008-04-25 884.478 4.910313e+10 2008-04 2008-04-24 2008-03-31 1.404138e+10
2008-04-28 881.313 4.892745e+10 2008-04 2008-04-24 2008-03-31 1.404138e+10
2008-04-29 884.478 4.910313e+10 2008-04 2008-04-24 2008-03-31 1.404138e+10
2008-04-30 937.325 5.203702e+10 2008-04 2008-04-24 2008-03-31 1.404138e+10

21 rows × 6 columns

In [190]:
stk_fundmen_df.loc[idx['000001.XSHE','2008-06':'2008-08'],:]
Out[190]:
closePrice negMarketValue ym publishDate endDate book
secID tradeDate
000001.XSHE 2008-06-02 818.656 4.544895e+10 2008-06 2008-04-24 2008-03-31 1.404138e+10
2008-06-03 804.732 4.467595e+10 2008-06 2008-04-24 2008-03-31 1.404138e+10
2008-06-04 809.795 4.495704e+10 2008-06 2008-04-24 2008-03-31 1.404138e+10
2008-06-05 797.770 4.428945e+10 2008-06 2008-04-24 2008-03-31 1.404138e+10
2008-06-06 783.214 4.348131e+10 2008-06 2008-04-24 2008-03-31 1.404138e+10
2008-06-10 705.050 3.914196e+10 2008-06 2008-04-24 2008-03-31 1.404138e+10
2008-06-11 670.874 3.724460e+10 2008-06 2008-04-24 2008-03-31 1.404138e+10
2008-06-12 670.874 3.724460e+10 2008-06 2008-04-24 2008-03-31 1.404138e+10
2008-06-13 670.241 3.720946e+10 2008-06 2008-04-24 2008-03-31 1.404138e+10
2008-06-16 682.899 3.791219e+10 2008-06 2008-04-24 2008-03-31 1.404138e+10
... ... ... ... ... ... ...
2008-08-18 602.204 4.076234e+10 2008-08 2008-04-24 2008-03-31 1.404138e+10
2008-08-19 609.166 4.123358e+10 2008-08 2008-04-24 2008-03-31 1.404138e+10
2008-08-20 659.798 4.466079e+10 2008-08 2008-04-24 2008-03-31 1.404138e+10
2008-08-21 639.545 4.328991e+10 2008-08 2008-08-21 2008-06-30 1.694330e+10
2008-08-22 629.735 4.262589e+10 2008-08 2008-08-21 2008-06-30 1.694330e+10
2008-08-25 637.330 4.313997e+10 2008-08 2008-08-21 2008-06-30 1.694330e+10
2008-08-26 633.216 4.286151e+10 2008-08 2008-08-21 2008-06-30 1.694330e+10
2008-08-27 631.634 4.275441e+10 2008-08 2008-08-21 2008-06-30 1.694330e+10
2008-08-28 635.115 4.299003e+10 2008-08 2008-08-21 2008-06-30 1.694330e+10
2008-08-29 639.229 4.326849e+10 2008-08 2008-08-21 2008-06-30 1.694330e+10

64 rows × 6 columns

In [191]:
stk_fundmen_df.reset_index(inplace=True)
In [192]:
stk_fundmen_df
Out[192]:
secID tradeDate closePrice negMarketValue ym publishDate endDate book
0 000001.XSHE 2008-01-02 1201.877 6.672404e+10 2008-01 NaT NaT NaN
1 000001.XSHE 2008-01-03 1181.940 6.561725e+10 2008-01 NaT NaT NaN
2 000001.XSHE 2008-01-04 1214.218 6.740920e+10 2008-01 NaT NaT NaN
3 000001.XSHE 2008-01-07 1239.534 6.881466e+10 2008-01 NaT NaT NaN
4 000001.XSHE 2008-01-08 1278.141 7.095798e+10 2008-01 NaT NaT NaN
5 000001.XSHE 2008-01-09 1300.926 7.222289e+10 2008-01 NaT NaT NaN
6 000001.XSHE 2008-01-10 1327.507 7.369862e+10 2008-01 NaT NaT NaN
7 000001.XSHE 2008-01-11 1394.911 7.744065e+10 2008-01 NaT NaT NaN
8 000001.XSHE 2008-01-14 1374.975 7.633385e+10 2008-01 NaT NaT NaN
9 000001.XSHE 2008-01-15 1330.039 7.383917e+10 2008-01 NaT NaT NaN
... ... ... ... ... ... ... ... ...
13288438 900957.XSHG 2025-03-11 0.295 5.391200e+07 2025-03 2024-10-29 2024-09-30 6.148702e+08
13288439 900957.XSHG 2025-03-12 0.295 5.391200e+07 2025-03 2024-10-29 2024-09-30 6.148702e+08
13288440 900957.XSHG 2025-03-13 0.290 5.299200e+07 2025-03 2024-10-29 2024-09-30 6.148702e+08
13288441 900957.XSHG 2025-03-14 0.310 5.667200e+07 2025-03 2024-10-29 2024-09-30 6.148702e+08
13288442 900957.XSHG 2025-03-17 0.319 5.814400e+07 2025-03 2024-10-29 2024-09-30 6.148702e+08
13288443 900957.XSHG 2025-03-18 0.320 5.832800e+07 2025-03 2024-10-29 2024-09-30 6.148702e+08
13288444 900957.XSHG 2025-03-19 0.323 5.888000e+07 2025-03 2024-10-29 2024-09-30 6.148702e+08
13288445 900957.XSHG 2025-03-20 0.315 5.759200e+07 2025-03 2024-10-29 2024-09-30 6.148702e+08
13288446 900957.XSHG 2025-03-21 0.315 5.759200e+07 2025-03 2024-10-29 2024-09-30 6.148702e+08
13288447 900957.XSHG 2025-03-24 0.310 5.667200e+07 2025-03 2024-10-29 2024-09-30 6.148702e+08

13288448 rows × 8 columns

In [193]:
stk_df_m = stk_fundmen_df.groupby(['secID','ym'],as_index=False).last()

stk_df_m['ret'] = stk_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1)
In [194]:
stk_df_m
Out[194]:
secID ym tradeDate closePrice negMarketValue publishDate endDate book ret
0 000001.XSHE 2008-01 2008-01-31 1053.778 5.850212e+10 NaT NaT NaN NaN
1 000001.XSHE 2008-02 2008-02-29 1049.032 5.823860e+10 NaT NaT NaN -0.004504
2 000001.XSHE 2008-03 2008-03-31 892.389 4.954234e+10 NaT NaT NaN -0.149321
3 000001.XSHE 2008-04 2008-04-30 937.325 5.203702e+10 2008-04-24 2008-03-31 1.404138e+10 0.050355
4 000001.XSHE 2008-05 2008-05-30 798.403 4.432458e+10 2008-04-24 2008-03-31 1.404138e+10 -0.148211
5 000001.XSHE 2008-06 2008-06-30 611.698 4.140495e+10 2008-04-24 2008-03-31 1.404138e+10 -0.233848
6 000001.XSHE 2008-07 2008-07-31 658.216 4.455369e+10 2008-04-24 2008-03-31 1.404138e+10 0.076047
7 000001.XSHE 2008-08 2008-08-29 639.229 4.326849e+10 2008-08-21 2008-06-30 1.694330e+10 -0.028846
8 000001.XSHE 2008-09 2008-09-26 474.358 3.210865e+10 2008-08-21 2008-06-30 1.694330e+10 -0.257922
9 000001.XSHE 2008-10 2008-10-31 345.352 2.330715e+10 2008-10-24 2008-09-30 1.837466e+10 -0.271959
... ... ... ... ... ... ... ... ... ...
657636 900957.XSHG 2024-06 2024-06-28 0.174 3.183200e+07 2024-04-26 2024-03-31 6.064550e+08 -0.163462
657637 900957.XSHG 2024-07 2024-07-31 0.188 3.440800e+07 2024-04-26 2024-03-31 6.064550e+08 0.080460
657638 900957.XSHG 2024-08 2024-08-30 0.270 4.931200e+07 2024-08-26 2024-06-30 6.118875e+08 0.436170
657639 900957.XSHG 2024-09 2024-09-30 0.354 6.458400e+07 2024-08-26 2024-06-30 6.118875e+08 0.311111
657640 900957.XSHG 2024-10 2024-10-31 0.401 7.323200e+07 2024-10-29 2024-09-30 6.148702e+08 0.132768
657641 900957.XSHG 2024-11 2024-11-29 0.392 7.157600e+07 2024-10-29 2024-09-30 6.148702e+08 -0.022444
657642 900957.XSHG 2024-12 2024-12-31 0.316 5.777600e+07 2024-10-29 2024-09-30 6.148702e+08 -0.193878
657643 900957.XSHG 2025-01 2025-01-27 0.340 6.200800e+07 2024-10-29 2024-09-30 6.148702e+08 0.075949
657644 900957.XSHG 2025-02 2025-02-28 0.286 5.225600e+07 2024-10-29 2024-09-30 6.148702e+08 -0.158824
657645 900957.XSHG 2025-03 2025-03-24 0.310 5.667200e+07 2024-10-29 2024-09-30 6.148702e+08 0.083916

657646 rows × 9 columns

In [195]:
stk_df_m['ret'] = stk_df_m.groupby(['secID'])['ret'].shift(-1)
stk_df_m['ret_date'] = stk_df_m.groupby('secID')['ym'].shift(-1) # 上一期的BM影响下一期ret
In [196]:
stk_df_m
Out[196]:
secID ym tradeDate closePrice negMarketValue publishDate endDate book ret ret_date
0 000001.XSHE 2008-01 2008-01-31 1053.778 5.850212e+10 NaT NaT NaN -0.004504 2008-02
1 000001.XSHE 2008-02 2008-02-29 1049.032 5.823860e+10 NaT NaT NaN -0.149321 2008-03
2 000001.XSHE 2008-03 2008-03-31 892.389 4.954234e+10 NaT NaT NaN 0.050355 2008-04
3 000001.XSHE 2008-04 2008-04-30 937.325 5.203702e+10 2008-04-24 2008-03-31 1.404138e+10 -0.148211 2008-05
4 000001.XSHE 2008-05 2008-05-30 798.403 4.432458e+10 2008-04-24 2008-03-31 1.404138e+10 -0.233848 2008-06
5 000001.XSHE 2008-06 2008-06-30 611.698 4.140495e+10 2008-04-24 2008-03-31 1.404138e+10 0.076047 2008-07
6 000001.XSHE 2008-07 2008-07-31 658.216 4.455369e+10 2008-04-24 2008-03-31 1.404138e+10 -0.028846 2008-08
7 000001.XSHE 2008-08 2008-08-29 639.229 4.326849e+10 2008-08-21 2008-06-30 1.694330e+10 -0.257922 2008-09
8 000001.XSHE 2008-09 2008-09-26 474.358 3.210865e+10 2008-08-21 2008-06-30 1.694330e+10 -0.271959 2008-10
9 000001.XSHE 2008-10 2008-10-31 345.352 2.330715e+10 2008-10-24 2008-09-30 1.837466e+10 0.074075 2008-11
... ... ... ... ... ... ... ... ... ... ...
657636 900957.XSHG 2024-06 2024-06-28 0.174 3.183200e+07 2024-04-26 2024-03-31 6.064550e+08 0.080460 2024-07
657637 900957.XSHG 2024-07 2024-07-31 0.188 3.440800e+07 2024-04-26 2024-03-31 6.064550e+08 0.436170 2024-08
657638 900957.XSHG 2024-08 2024-08-30 0.270 4.931200e+07 2024-08-26 2024-06-30 6.118875e+08 0.311111 2024-09
657639 900957.XSHG 2024-09 2024-09-30 0.354 6.458400e+07 2024-08-26 2024-06-30 6.118875e+08 0.132768 2024-10
657640 900957.XSHG 2024-10 2024-10-31 0.401 7.323200e+07 2024-10-29 2024-09-30 6.148702e+08 -0.022444 2024-11
657641 900957.XSHG 2024-11 2024-11-29 0.392 7.157600e+07 2024-10-29 2024-09-30 6.148702e+08 -0.193878 2024-12
657642 900957.XSHG 2024-12 2024-12-31 0.316 5.777600e+07 2024-10-29 2024-09-30 6.148702e+08 0.075949 2025-01
657643 900957.XSHG 2025-01 2025-01-27 0.340 6.200800e+07 2024-10-29 2024-09-30 6.148702e+08 -0.158824 2025-02
657644 900957.XSHG 2025-02 2025-02-28 0.286 5.225600e+07 2024-10-29 2024-09-30 6.148702e+08 0.083916 2025-03
657645 900957.XSHG 2025-03 2025-03-24 0.310 5.667200e+07 2024-10-29 2024-09-30 6.148702e+08 NaN NaT

657646 rows × 10 columns

这里处理停牌仍然可以用“填充NA日期”的办法,但需要在日度数据先填。日度数据填充可能会使数据变得很大,但应该更稳妥。

In [197]:
stk_df_m['ym_diff'] = stk_df_m['ret_date'].astype(int) - stk_df_m['ym'].astype(int)
stk_df_m.loc[stk_df_m['ym_diff'] != 1]
Out[197]:
secID ym tradeDate closePrice negMarketValue publishDate endDate book ret ret_date ym_diff
206 000001.XSHE 2025-03 2025-03-24 1669.697 2.208354e+11 2025-03-15 2024-12-31 4.948420e+11 NaN NaT 9223372036854775146
413 000002.XSHE 2025-03 2025-03-24 1192.820 6.976760e+10 2024-10-31 2024-09-30 2.341199e+11 NaN NaT 9223372036854775146
545 000004.XSHE 2022-05 2022-05-05 85.971 1.463441e+09 2022-04-30 2022-03-31 9.351158e+08 0.212246 2023-06 13
567 000004.XSHE 2025-03 2025-03-24 71.756 1.326025e+09 2024-10-26 2024-09-30 1.597473e+08 NaN NaT 9223372036854775146
722 000005.XSHE 2021-04 2021-04-30 21.814 2.348641e+09 2021-04-30 2021-03-31 1.248325e+09 NaN NaT 9223372036854775193
929 000006.XSHE 2025-03 2025-03-24 471.044 8.747918e+09 2024-10-31 2024-09-30 6.448604e+09 NaN NaT 9223372036854775146
1037 000007.XSHE 2021-04 2021-04-29 47.401 1.217255e+09 2021-04-29 2021-03-31 6.318241e+07 0.870572 2022-07 15
1048 000007.XSHE 2023-05 2023-05-04 65.568 1.683767e+09 2023-04-29 2023-03-31 9.028576e+07 -0.066054 2024-07 14
1057 000007.XSHE 2025-03 2025-03-24 72.185 2.078688e+09 2024-10-31 2024-09-30 1.289361e+08 NaN NaT 9223372036854775146
1201 000008.XSHE 2025-03 2025-03-24 71.274 7.117965e+09 2024-10-31 2024-09-30 3.376579e+09 NaN NaT 9223372036854775146
... ... ... ... ... ... ... ... ... ... ... ...
656758 900951.XSHG 2018-07 2018-07-30 0.537 5.370000e+07 2018-04-21 2018-03-31 1.635296e+08 -0.912477 2020-07 24
656760 900951.XSHG 2020-08 2020-08-26 0.069 6.900000e+06 2020-04-25 2020-03-31 -2.115164e+08 NaN NaT 9223372036854775201
656958 900952.XSHG 2024-06 2024-06-03 0.387 3.765438e+07 2024-04-30 2024-03-31 6.770680e+09 NaN NaT 9223372036854775155
656962 900953.XSHG 2008-04 2008-04-30 0.430 1.000800e+08 2008-04-30 2008-03-31 5.505128e+08 0.588372 2009-07 15
657143 900953.XSHG 2024-07 2024-07-04 0.031 7.200000e+06 2024-04-26 2024-03-31 2.434969e+08 NaN NaT 9223372036854775154
657208 900955.XSHG 2013-05 2013-05-02 1.251 1.188000e+08 2013-04-27 2013-03-31 1.602086e+09 -0.194245 2014-04 11
657281 900955.XSHG 2020-04 2020-04-30 0.507 4.818000e+07 2020-04-30 2020-03-31 1.031370e+09 -0.850099 2022-06 26
657283 900955.XSHG 2022-07 2022-07-12 0.073 6.930000e+06 2022-05-07 2021-12-31 6.172171e+08 NaN NaT 9223372036854775178
657438 900956.XSHG 2020-11 2020-11-20 3.424 3.530500e+08 2020-10-29 2020-09-30 1.430181e+09 NaN NaT 9223372036854775198
657645 900957.XSHG 2025-03 2025-03-24 0.310 5.667200e+07 2024-10-29 2024-09-30 6.148702e+08 NaN NaT 9223372036854775146

6193 rows × 11 columns

In [198]:
stk_df.loc[(stk_df['secID']=='000004.XSHE')&(stk_df['ym']>='2022-05')]
Out[198]:
secID tradeDate closePrice negMarketValue ym
11854 000004.XSHE 2022-05-05 85.971 1.463441e+09 2022-05
12134 000004.XSHE 2023-06-28 103.466 1.800402e+09 2023-06
12135 000004.XSHE 2023-06-29 109.275 1.901481e+09 2023-06
12136 000004.XSHE 2023-06-30 104.218 1.813483e+09 2023-06
12137 000004.XSHE 2023-07-03 94.513 1.644621e+09 2023-07
12138 000004.XSHE 2023-07-04 101.074 1.758781e+09 2023-07
12139 000004.XSHE 2023-07-05 111.188 1.934778e+09 2023-07
12140 000004.XSHE 2023-07-06 122.328 2.128613e+09 2023-07
12141 000004.XSHE 2023-07-07 116.040 2.019209e+09 2023-07
12142 000004.XSHE 2023-07-10 104.423 1.817050e+09 2023-07
... ... ... ... ... ...
12545 000004.XSHE 2025-03-11 83.579 1.544503e+09 2025-03
12546 000004.XSHE 2025-03-12 83.716 1.547029e+09 2025-03
12547 000004.XSHE 2025-03-13 82.144 1.517983e+09 2025-03
12548 000004.XSHE 2025-03-14 83.784 1.548292e+09 2025-03
12549 000004.XSHE 2025-03-17 84.946 1.569761e+09 2025-03
12550 000004.XSHE 2025-03-18 83.511 1.543241e+09 2025-03
12551 000004.XSHE 2025-03-19 82.007 1.515457e+09 2025-03
12552 000004.XSHE 2025-03-20 81.734 1.510406e+09 2025-03
12553 000004.XSHE 2025-03-21 79.615 1.471256e+09 2025-03
12554 000004.XSHE 2025-03-24 71.756 1.326025e+09 2025-03

422 rows × 5 columns

In [199]:
# 停牌删去
stk_df_m['ym_diff'] = stk_df_m['ret_date'].astype(int) - stk_df_m['ym'].astype(int)
stk_df_m.loc[stk_df_m['ym_diff'] != 1,'ret'] = np.nan
In [200]:
# 查看数据
temp = stk_df_m['secID'].unique()
display(stk_df_m[stk_df_m['secID'] == np.random.choice(temp,1)[0]])
secID ym tradeDate closePrice negMarketValue publishDate endDate book ret ret_date ym_diff
574809 603156.XSHG 2018-02 2018-02-28 81.910 3.526226e+09 2018-02-09 2017-09-30 6.551854e+09 -0.120986 2018-03 1
574810 603156.XSHG 2018-03 2018-03-30 72.000 3.099600e+09 2018-02-09 2017-09-30 6.551854e+09 0.160417 2018-04 1
574811 603156.XSHG 2018-04 2018-04-27 83.550 3.596828e+09 2018-04-14 2018-03-31 1.154757e+10 0.252663 2018-05 1
574812 603156.XSHG 2018-05 2018-05-31 104.660 4.383437e+09 2018-05-19 2017-12-31 7.336907e+09 -0.146704 2018-06 1
574813 603156.XSHG 2018-06 2018-06-29 89.306 3.740356e+09 2018-05-19 2017-12-31 7.336907e+09 -0.096679 2018-07 1
574814 603156.XSHG 2018-07 2018-07-31 80.672 3.378736e+09 2018-05-19 2017-12-31 7.336907e+09 -0.152692 2018-08 1
574815 603156.XSHG 2018-08 2018-08-31 68.354 2.862825e+09 2018-08-16 2018-06-30 1.054963e+10 0.119993 2018-09 1
574816 603156.XSHG 2018-09 2018-09-28 76.556 3.206364e+09 2018-08-16 2018-06-30 1.054963e+10 -0.210526 2018-10 1
574817 603156.XSHG 2018-10 2018-10-31 60.439 2.531340e+09 2018-10-26 2018-09-30 1.095350e+10 -0.004997 2018-11 1
574818 603156.XSHG 2018-11 2018-11-30 60.137 2.518683e+09 2018-10-26 2018-09-30 1.095350e+10 -0.005022 2018-12 1
... ... ... ... ... ... ... ... ... ... ... ...
574885 603156.XSHG 2024-06 2024-06-28 75.799 2.691705e+10 2024-04-24 2024-03-31 1.190245e+10 -0.005647 2024-07 1
574886 603156.XSHG 2024-07 2024-07-31 75.371 2.676519e+10 2024-04-24 2024-03-31 1.190245e+10 -0.122925 2024-08 1
574887 603156.XSHG 2024-08 2024-08-30 66.106 2.347491e+10 2024-08-26 2024-06-30 9.714359e+09 0.247436 2024-09 1
574888 603156.XSHG 2024-09 2024-09-30 82.463 2.928352e+10 2024-09-10 2024-03-31 1.160645e+10 -0.106314 2024-10 1
574889 603156.XSHG 2024-10 2024-10-31 73.696 2.617041e+10 2024-10-26 2024-09-30 9.871796e+09 0.036759 2024-11 1
574890 603156.XSHG 2024-11 2024-11-29 76.405 2.713218e+10 2024-10-26 2024-09-30 9.871796e+09 0.065297 2024-12 1
574891 603156.XSHG 2024-12 2024-12-31 81.394 2.890387e+10 2024-10-26 2024-09-30 9.871796e+09 -0.035899 2025-01 1
574892 603156.XSHG 2025-01 2025-01-27 78.472 2.775131e+10 2024-10-26 2024-09-30 9.871796e+09 -0.043149 2025-02 1
574893 603156.XSHG 2025-02 2025-02-28 75.086 2.655405e+10 2024-10-26 2024-09-30 9.871796e+09 0.050782 2025-03 1
574894 603156.XSHG 2025-03 2025-03-24 78.899 2.790255e+10 2024-10-26 2024-09-30 9.871796e+09 NaN NaT 9223372036854775146

86 rows × 11 columns

In [201]:
del temp
In [202]:
stk_df_m.drop(['tradeDate','closePrice','publishDate','endDate', 'ym_diff'],axis=1,inplace=True)

stk_df_m.rename(columns={'ym':'mktcap_book_date','negMarketValue':'mktcap'},inplace=True)

stk_df_m.dropna(inplace=True)

stk_df_m
Out[202]:
secID mktcap_book_date mktcap book ret ret_date
3 000001.XSHE 2008-04 5.203702e+10 1.404138e+10 -0.148211 2008-05
4 000001.XSHE 2008-05 4.432458e+10 1.404138e+10 -0.233848 2008-06
5 000001.XSHE 2008-06 4.140495e+10 1.404138e+10 0.076047 2008-07
6 000001.XSHE 2008-07 4.455369e+10 1.404138e+10 -0.028846 2008-08
7 000001.XSHE 2008-08 4.326849e+10 1.694330e+10 -0.257922 2008-09
8 000001.XSHE 2008-09 3.210865e+10 1.694330e+10 -0.271959 2008-10
9 000001.XSHE 2008-10 2.330715e+10 1.837466e+10 0.074075 2008-11
10 000001.XSHE 2008-11 2.503361e+10 1.837466e+10 0.052279 2008-12
11 000001.XSHE 2008-12 2.634237e+10 1.837466e+10 0.230446 2009-01
12 000001.XSHE 2009-01 3.241281e+10 1.837466e+10 0.185567 2009-02
... ... ... ... ... ... ...
657635 900957.XSHG 2024-05 3.790400e+07 6.064550e+08 -0.163462 2024-06
657636 900957.XSHG 2024-06 3.183200e+07 6.064550e+08 0.080460 2024-07
657637 900957.XSHG 2024-07 3.440800e+07 6.064550e+08 0.436170 2024-08
657638 900957.XSHG 2024-08 4.931200e+07 6.118875e+08 0.311111 2024-09
657639 900957.XSHG 2024-09 6.458400e+07 6.118875e+08 0.132768 2024-10
657640 900957.XSHG 2024-10 7.323200e+07 6.148702e+08 -0.022444 2024-11
657641 900957.XSHG 2024-11 7.157600e+07 6.148702e+08 -0.193878 2024-12
657642 900957.XSHG 2024-12 5.777600e+07 6.148702e+08 0.075949 2025-01
657643 900957.XSHG 2025-01 6.200800e+07 6.148702e+08 -0.158824 2025-02
657644 900957.XSHG 2025-02 5.225600e+07 6.148702e+08 0.083916 2025-03

647090 rows × 6 columns

Merge

In [203]:
ret_df = pd.merge(stk_df_m, rf, left_on='ret_date',right_on='ym')
ret_df.drop('ym',axis=1,inplace=True)
ret_df['exret'] = ret_df['ret'] - ret_df['rf']
ret_df.sort_values(['secID','ret_date'],inplace=True)
ret_df.reset_index(drop=True,inplace=True)
# Use last month's beta for grouping
ret_df = pd.merge(ret_df,beta_m_df,left_on=['secID','mktcap_book_date'],right_on=['secID','ym'])
# display(ret_df)
ret_df.drop(['ym','rf','ret'],axis=1,inplace=True)
ret_df.rename(columns={'mktcap_book_date':'grouping_date'},inplace=True)
ret_df['bm'] = ret_df['book'] / ret_df['mktcap']
ret_df['size'] = np.log(ret_df['mktcap'])
ret_df = ret_df[['secID','grouping_date','size','mktcap','bm',
                 'beta','ret_date','exret']]
In [204]:
ret_df
Out[204]:
secID grouping_date size mktcap bm beta ret_date exret
0 000001.XSHE 2008-04 24.675221 5.203702e+10 0.269834 1.0212 2008-05 -0.151954
1 000001.XSHE 2008-05 24.514805 4.432458e+10 0.316785 1.0697 2008-06 -0.237580
2 000001.XSHE 2008-06 24.446666 4.140495e+10 0.339123 1.0672 2008-07 0.072365
3 000001.XSHE 2008-07 24.519961 4.455369e+10 0.315156 1.0966 2008-08 -0.032450
4 000001.XSHE 2008-08 24.490690 4.326849e+10 0.391585 1.0386 2008-09 -0.261513
5 000001.XSHE 2008-09 24.192391 3.210865e+10 0.527686 1.1184 2008-10 -0.275481
6 000001.XSHE 2008-10 23.872026 2.330715e+10 0.788370 1.1991 2008-11 0.071012
7 000001.XSHE 2008-11 23.943485 2.503361e+10 0.734000 1.2192 2008-12 0.050371
8 000001.XSHE 2008-12 23.994445 2.634237e+10 0.697533 1.2206 2009-01 0.229190
9 000001.XSHE 2009-01 24.201820 3.241281e+10 0.566895 1.2514 2009-02 0.184479
... ... ... ... ... ... ... ... ...
622849 689009.XSHG 2024-04 23.677179 1.918086e+10 0.278208 1.7081 2024-05 0.013089
622850 689009.XSHG 2024-05 23.685209 1.933550e+10 0.275983 1.5877 2024-06 -0.020732
622851 689009.XSHG 2024-06 23.723333 2.008689e+10 0.265660 1.5890 2024-07 0.111234
622852 689009.XSHG 2024-07 23.831741 2.238688e+10 0.238366 1.5046 2024-08 0.088702
622853 689009.XSHG 2024-08 23.918693 2.442059e+10 0.230185 1.1651 2024-09 0.076239
622854 689009.XSHG 2024-09 23.994078 2.633272e+10 0.213470 1.0361 2024-10 -0.082991
622855 689009.XSHG 2024-10 23.909704 2.420206e+10 0.250256 1.3022 2024-11 0.002438
622856 689009.XSHG 2024-11 23.914333 2.431436e+10 0.249100 1.2585 2024-12 0.065368
622857 689009.XSHG 2024-12 23.985411 2.610548e+10 0.232009 1.2255 2025-01 0.078772
622858 689009.XSHG 2025-01 24.063479 2.822514e+10 0.214585 1.1695 2025-02 0.111100

622859 rows × 8 columns

Sorting on BM poin-in-time

In [205]:
q = dict()
keys = ['q'+str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q.update(zip(keys,values))

quantile_df = pd.DataFrame()
for key, value in q.items():
    quantile_df[key] = ret_df.groupby(['grouping_date'])['bm'].quantile(value)

ret_df_q = pd.merge(ret_df, quantile_df, on='grouping_date')

portfolios = dict()
drop_cols = [col for col in ret_df_q.columns if col[0]=='q']

portfolios['p1'] = ret_df_q.loc[ret_df_q['bm'] <= ret_df_q['q1']].copy().drop(drop_cols, axis=1)
for i in range(2,10):
    idx = (ret_df_q[f'q{i-1}'] <= ret_df_q['bm']) & (ret_df_q['bm'] <= ret_df_q[f'q{i}'])
    portfolios[f'p{i}'] = ret_df_q.loc[idx].copy().drop(drop_cols, axis=1)
portfolios['p10'] = ret_df_q.loc[ret_df_q['bm'] >= ret_df_q['q9']].copy().drop(drop_cols, axis=1)

portfolios_crs_mean = dict()
for k in portfolios.keys():
    portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()

mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]

pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
             columns=mean_values.keys())
Out[205]:
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p10-p1
mean 0.001817 0.004890 0.006940 0.007792 0.009464 0.010692 0.011430 0.01215 0.013274 0.014184 0.012367
t-value 0.292357 0.788418 1.095306 1.233720 1.479918 1.670487 1.775515 1.89019 1.949025 1.986427 3.257382

结论:

  • 用最新的BM更新portfolio可以带来收益率的递增,但每个portfolio本身的收益率并不显著为正,除了p10
  • p10和p1的差距是显著为正的

Sorting on BM with data from Uqer

In [206]:
# %%time
# begin_ = 2008
# yesterday = dt.datetime.today() - dt.timedelta(days=1)
# yesterday.strftime('%Y%m%d')
# pb_df = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
#                                                      beginDate=f'{begin_}0101',
#                                                      endDate=yesterday,
#                                                      field=['secID','tradeDate','PB'],pandas="1")
In [209]:
# # # 从优矿下载 PB,时间较长。由于优矿的限制,每次下载3年的数据
# # %%time
# pb = {}
# begin_ = 2008
# end_ = 2011
# while begin_ <= 2025:
#     if begin_ == 2025:
#         yesterday = dt.datetime.today() - dt.timedelta(days=1)
#         yesterday.strftime('%Y%m%d')
#         pb[begin_] = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
#                                                      beginDate=f'{begin_}0101',
#                                                      endDate=yesterday,
#                                                      field=['secID','tradeDate','PB'],pandas="1")
#     else:
#         pb[begin_] = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
#                                                          beginDate=f'{begin_}0101',
#                                                          endDate=f'{end_}1231',
#                                                          field=['secID','tradeDate','PB'],pandas="1")
#     begin_ = end_ + 1
#     end_ = begin_ + 3
    
# for i in range(len(pb)):
#     pb_df = pd.DataFrame(np.vstack([_df for _df in pb.values()]),columns=['secID','tradeDate','PB'])
    
# pb_df.to_pickle('./data/pb_df.pkl')

# # About 5mins
In [211]:
pb_df = pd.read_pickle('./data/pb_df.pkl')
In [212]:
pb_df['tradeDate'] = pd.to_datetime(pb_df['tradeDate'])

pb_df['PB'] = pd.to_numeric(pb_df['PB'])

pb_df['grouping_date'] = pb_df['tradeDate'].dt.to_period('M')

pb_df = pb_df.groupby(['secID','grouping_date'],as_index=False).last()

pb_df['bm_uqer'] = 1 / pb_df['PB']

ret_df = pd.merge(ret_df,pb_df[['secID','grouping_date','bm_uqer']],on=['secID','grouping_date'])

del pb_df
In [213]:
ret_df
Out[213]:
secID grouping_date size mktcap bm beta ret_date exret bm_uqer
0 000001.XSHE 2008-04 24.675221 5.203702e+10 0.269834 1.0212 2008-05 -0.151954 0.206701
1 000001.XSHE 2008-05 24.514805 4.432458e+10 0.316785 1.0697 2008-06 -0.237580 0.242665
2 000001.XSHE 2008-06 24.446666 4.140495e+10 0.339123 1.0672 2008-07 0.072365 0.304090
3 000001.XSHE 2008-07 24.519961 4.455369e+10 0.315156 1.0966 2008-08 -0.032450 0.282598
4 000001.XSHE 2008-08 24.490690 4.326849e+10 0.391585 1.0386 2008-09 -0.261513 0.351136
5 000001.XSHE 2008-09 24.192391 3.210865e+10 0.527686 1.1184 2008-10 -0.275481 0.473171
6 000001.XSHE 2008-10 23.872026 2.330715e+10 0.788370 1.1991 2008-11 0.071012 0.706914
7 000001.XSHE 2008-11 23.943485 2.503361e+10 0.734000 1.2192 2008-12 0.050371 0.658155
8 000001.XSHE 2008-12 23.994445 2.634237e+10 0.697533 1.2206 2009-01 0.229190 0.625469
9 000001.XSHE 2009-01 24.201820 3.241281e+10 0.566895 1.2514 2009-02 0.184479 0.508337
... ... ... ... ... ... ... ... ... ...
622849 689009.XSHG 2024-04 23.677179 1.918086e+10 0.278208 1.7081 2024-05 0.013089 0.200610
622850 689009.XSHG 2024-05 23.685209 1.933550e+10 0.275983 1.5877 2024-06 -0.020732 0.199005
622851 689009.XSHG 2024-06 23.723333 2.008689e+10 0.265660 1.5890 2024-07 0.111234 0.202319
622852 689009.XSHG 2024-07 23.831741 2.238688e+10 0.238366 1.5046 2024-08 0.088702 0.181729
622853 689009.XSHG 2024-08 23.918693 2.442059e+10 0.230185 1.1651 2024-09 0.076239 0.175497
622854 689009.XSHG 2024-09 23.994078 2.633272e+10 0.213470 1.0361 2024-10 -0.082991 0.162713
622855 689009.XSHG 2024-10 23.909704 2.420206e+10 0.250256 1.3022 2024-11 0.002438 0.190752
622856 689009.XSHG 2024-11 23.914333 2.431436e+10 0.249100 1.2585 2024-12 0.065368 0.189876
622857 689009.XSHG 2024-12 23.985411 2.610548e+10 0.232009 1.2255 2025-01 0.078772 0.177806
622858 689009.XSHG 2025-01 24.063479 2.822514e+10 0.214585 1.1695 2025-02 0.111100 0.164468

622859 rows × 9 columns

In [214]:
ret_df.loc[ret_df['secID']=='000001.XSHE',['grouping_date','bm','bm_uqer']].set_index('grouping_date').plot()
Out[214]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f69ba56ec10>
In [216]:
sample_id = np.random.choice(ret_df['secID'].unique(),1)
display(ret_df.loc[ret_df['secID'].isin(sample_id),['grouping_date','bm','bm_uqer']])
ret_df.loc[ret_df['secID'].isin(sample_id),['grouping_date','bm','bm_uqer']].set_index('grouping_date').plot()
grouping_date bm bm_uqer
589660 2019-02 1.472720 0.147273
589661 2019-03 0.688224 0.068823
589662 2019-04 1.178440 0.117844
589663 2019-05 1.062270 0.106227
589664 2019-06 1.362319 0.136232
589665 2019-07 1.571769 0.157178
589666 2019-08 1.423675 0.142367
589667 2019-09 1.072778 0.107278
589668 2019-10 1.527213 0.152721
589669 2019-11 1.605082 0.160508
... ... ... ...
589722 2024-04 0.566053 0.539025
589723 2024-05 0.624323 0.594495
589724 2024-06 0.719559 0.685166
589725 2024-07 0.765703 0.729129
589726 2024-08 0.742583 0.705020
589727 2024-09 0.677102 0.642839
589728 2024-10 0.614226 0.581564
589729 2024-11 0.601869 0.569866
589730 2024-12 0.655242 0.620424
589731 2025-01 0.645494 0.611172

72 rows × 3 columns

Out[216]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f688a948a10>
In [217]:
ret_df.loc[ret_df['secID'].isin(sample_id),['grouping_date','bm','bm_uqer']].set_index('grouping_date').plot(subplots=True)
Out[217]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f685da61050>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f685da41d50>],
      dtype=object)
In [218]:
q = dict()
keys = ['q'+str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q.update(zip(keys,values))

quantile_df = pd.DataFrame()
for key, value in q.items():
    quantile_df[key] = ret_df.groupby(['grouping_date'])['bm_uqer'].quantile(value)

ret_df_q = pd.merge(ret_df, quantile_df, on='grouping_date')

portfolios = dict()
drop_cols = [col for col in ret_df_q.columns if col[0]=='q']

portfolios['p1'] = ret_df_q.loc[ret_df_q['bm_uqer'] <= ret_df_q['q1']].copy().drop(drop_cols, axis=1)
for i in range(2,10):
    idx = (ret_df_q[f'q{i-1}'] <= ret_df_q['bm_uqer']) & (ret_df_q['bm_uqer'] <= ret_df_q[f'q{i}'])
    portfolios[f'p{i}'] = ret_df_q.loc[idx].copy().drop(drop_cols, axis=1)
portfolios['p10'] = ret_df_q.loc[ret_df_q['bm_uqer'] >= ret_df_q['q9']].copy().drop(drop_cols, axis=1)

portfolios_crs_mean = dict()
for k in portfolios.keys():
    portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()

mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]

pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
             columns=mean_values.keys())
Out[218]:
p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p10-p1
mean 0.000755 0.006287 0.008066 0.009590 0.010874 0.011532 0.011324 0.011776 0.011783 0.010721 0.009966
t-value 0.118962 0.967509 1.203991 1.488412 1.647019 1.795274 1.739060 1.826312 1.835725 1.692520 2.832128

Double Sorting on Size and BM

In [219]:
q_size = dict()
keys = ['q_size_1']
values = [0.5]
q_size.update(zip(keys,values))

q_bm = dict()
keys = ['q_bm_1','q_bm_2']
values = [0.3, 0.7]
q_bm.update(zip(keys,values))

q_size_df = pd.DataFrame()
for key, value in q_size.items():
    q_size_df[key] = ret_df.groupby(['grouping_date'])['size'].quantile(value)

q_bm_df = pd.DataFrame()
for key, value in q_bm.items():
    q_bm_df[key] = ret_df.groupby(['grouping_date'])['bm'].quantile(value)

ret_df_q = pd.merge(ret_df, q_size_df, on='grouping_date')
ret_df_q = pd.merge(ret_df_q, q_bm_df, on='grouping_date')

portfolios_size = dict()
portfolios_size['size1'] = ret_df_q.loc[ret_df_q['size'] <= ret_df_q['q_size_1'],
                                           ['secID','grouping_date','ret_date','exret','size','mktcap']]
portfolios_size['size2'] = ret_df_q.loc[ret_df_q['size'] >= ret_df_q['q_size_1'],
                                           ['secID','grouping_date','ret_date','exret','size','mktcap']]

portfolios_bm = dict()
portfolios_bm['bm1'] = ret_df_q.loc[ret_df_q['bm'] <= ret_df_q['q_bm_1'],
                                           ['secID','grouping_date','ret_date','exret','bm']]
portfolios_bm['bm2'] = ret_df_q.loc[(ret_df_q['bm'] >= ret_df_q['q_bm_1']) & \
                                           (ret_df_q['bm'] <= ret_df_q['q_bm_2']),
                                           ['secID','grouping_date','ret_date','exret','bm']]
portfolios_bm['bm3'] = ret_df_q.loc[ret_df_q['bm'] >= ret_df_q['q_bm_2'],
                                           ['secID','grouping_date','ret_date','exret','bm']]

portfolios = dict()
for bm_group in portfolios_bm.keys():
    for size_group in portfolios_size.keys():
        portfolios[f'{bm_group}_{size_group}'] = pd.merge(portfolios_size[size_group],
                                                              portfolios_bm[bm_group][['secID','ret_date','bm']],
                                                              on=['secID','ret_date'])

mean_portfolios_ret = dict()
for pf in portfolios.keys():
    mean_portfolios_ret[pf] = portfolios[pf].groupby('ret_date')['exret'].mean()
    print(mean_portfolios_ret[pf].shape) # print 看一下会不会存在某个月份上没有bm和size分组没有任何交叉

# Fast merge by stacking
mean_portfolios_ret_df = pd.DataFrame(np.vstack([pf for pf in mean_portfolios_ret.values()])).T
mean_portfolios_ret_df.columns = mean_portfolios_ret.keys()
mean_portfolios_ret_df.index = mean_portfolios_ret['bm1_size1'].index

# Newey-West adjustment
mean_values = {}
t_values = {}
for k in mean_portfolios_ret.keys():
    y = mean_portfolios_ret[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=4)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]

pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
(202,)
(202,)
(202,)
(202,)
(202,)
(202,)
Out[219]:
bm1_size1 bm1_size2 bm2_size1 bm2_size2 bm3_size1 bm3_size2
ret_mean 0.010345 0.001183 0.015280 0.004157 0.018916 0.004365
t_values 1.493421 0.190115 2.226204 0.667641 2.514287 0.724760

Fama MacBeth regression

In [220]:
# ret_df[(ret_df['ret_date'] >= '2008-02') & (ret_df['secID'] == '000001.XSHE')]
In [221]:
ret_df['exret100'] = ret_df['exret'] * 100

def fm_reg(df):
    df_  = df.dropna()
    if df_.shape[0] < 15:
        return None
    reg = LinearRegression().fit(y=df_.loc[:,'exret100'], X=df_.loc[:,['beta','size','bm']])
    return np.insert(reg.coef_, 0, reg.intercept_)

temp = ret_df.groupby('ret_date').apply(fm_reg)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept', 'beta_coef','size_coef', 'bm_coef']
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
    y = reg_result_df[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
Out[221]:
intercept beta_coef size_coef bm_coef
ret_mean 9.514657 0.316134 -0.412188 0.020470
t_values 2.881077 0.926961 -2.972695 0.405328
In [222]:
portfolios
Out[222]:
{'bm1_size1':              secID grouping_date ret_date     exret       size        mktcap  \
 0      000014.XSHE       2008-04  2008-05  0.285851  21.064186  1.406241e+09   
 1      000019.XSHE       2008-04  2008-05 -0.099410  20.835595  1.118882e+09   
 2      000028.XSHE       2008-04  2008-05  0.003525  21.411577  1.990351e+09   
 3      000049.XSHE       2008-04  2008-05  0.000660  20.182470  5.822848e+08   
 4      000056.XSHE       2008-04  2008-05  0.059926  20.705572  9.824619e+08   
 5      000411.XSHE       2008-04  2008-05  0.028104  20.589655  8.749305e+08   
 6      000415.XSHE       2008-04  2008-05 -0.088174  21.297845  1.776383e+09   
 7      000421.XSHE       2008-04  2008-05 -0.041262  21.173249  1.568286e+09   
 8      000502.XSHE       2008-04  2008-05 -0.056025  20.652852  9.320083e+08   
 9      000504.XSHE       2008-04  2008-05 -0.025740  21.063705  1.405565e+09   
 ...            ...           ...      ...       ...        ...           ...   
 70256  688636.XSHG       2025-01  2025-02  0.238270  21.849751  3.084794e+09   
 70257  688662.XSHG       2025-01  2025-02  0.189509  21.867917  3.141344e+09   
 70258  688665.XSHG       2025-01  2025-02  0.180831  21.928738  3.338335e+09   
 70259  688682.XSHG       2025-01  2025-02  0.011077  21.589806  2.378667e+09   
 70260  688683.XSHG       2025-01  2025-02  0.041667  21.778934  2.873895e+09   
 70261  688699.XSHG       2025-01  2025-02  0.099571  22.026630  3.681661e+09   
 70262  688711.XSHG       2025-01  2025-02  0.103312  21.951345  3.414663e+09   
 70263  688768.XSHG       2025-01  2025-02  0.168741  21.923641  3.321362e+09   
 70264  688786.XSHG       2025-01  2025-02  0.076382  21.809629  2.963478e+09   
 70265  688793.XSHG       2025-01  2025-02  0.120302  21.533072  2.247472e+09   
 
              bm  
 0      0.235730  
 1      0.286621  
 2      0.262335  
 3      0.260120  
 4      0.246871  
 5      0.139386  
 6      0.231220  
 7      0.341337  
 8      0.214120  
 9      0.319674  
 ...         ...  
 70256  0.349945  
 70257  0.217974  
 70258  0.292719  
 70259  0.274688  
 70260  0.356041  
 70261  0.335746  
 70262  0.328296  
 70263  0.241562  
 70264  0.238720  
 70265  0.168434  
 
 [70266 rows x 7 columns],
 'bm1_size2':               secID grouping_date ret_date     exret       size        mktcap  \
 0       000001.XSHE       2008-04  2008-05 -0.151954  24.675221  5.203702e+10   
 1       000002.XSHE       2008-04  2008-05 -0.193320  25.688758  1.433791e+11   
 2       000006.XSHE       2008-04  2008-05 -0.157475  22.566844  6.319107e+09   
 3       000028.XSHE       2008-04  2008-05  0.003525  21.411577  1.990351e+09   
 4       000060.XSHE       2008-04  2008-05 -0.076426  23.414869  1.475531e+10   
 5       000061.XSHE       2008-04  2008-05 -0.126181  22.681420  7.086236e+09   
 6       000069.XSHE       2008-04  2008-05 -0.169790  23.775237  2.115700e+10   
 7       000078.XSHE       2008-04  2008-05  0.034607  21.651166  2.529193e+09   
 8       000100.XSHE       2008-04  2008-05 -0.013063  23.325486  1.349367e+10   
 9       000157.XSHE       2008-04  2008-05 -0.003743  23.590576  1.758965e+10   
 ...             ...           ...      ...       ...        ...           ...   
 116664  688700.XSHG       2025-01  2025-02  0.238725  22.773584  7.770372e+09   
 116665  688702.XSHG       2025-01  2025-02 -0.138993  23.535269  1.664322e+10   
 116666  688728.XSHG       2025-01  2025-02  0.158480  23.720902  2.003811e+10   
 116667  688766.XSHG       2025-01  2025-02  0.095634  23.160074  1.143647e+10   
 116668  688776.XSHG       2025-01  2025-02  0.228926  22.372595  5.203487e+09   
 116669  688777.XSHG       2025-01  2025-02  0.078721  24.395173  3.932683e+10   
 116670  688787.XSHG       2025-01  2025-02 -0.262705  22.812951  8.082370e+09   
 116671  688789.XSHG       2025-01  2025-02 -0.010725  23.243859  1.243598e+10   
 116672  688800.XSHG       2025-01  2025-02 -0.243304  23.094516  1.071077e+10   
 116673  689009.XSHG       2025-01  2025-02  0.111100  24.063479  2.822514e+10   
 
               bm  
 0       0.269834  
 1       0.210656  
 2       0.309010  
 3       0.262335  
 4       0.252189  
 5       0.241808  
 6       0.241221  
 7       0.282157  
 8       0.290794  
 9       0.226938  
 ...          ...  
 116664  0.222991  
 116665  0.139247  
 116666  0.380056  
 116667  0.188675  
 116668  0.358599  
 116669  0.253764  
 116670  0.091190  
 116671  0.243896  
 116672  0.189438  
 116673  0.214585  
 
 [116674 rows x 7 columns],
 'bm2_size1':               secID grouping_date ret_date     exret       size        mktcap  \
 0       000023.XSHE       2008-04  2008-05 -0.087917  19.991081  4.808574e+08   
 1       000025.XSHE       2008-04  2008-05 -0.059969  19.810400  4.013723e+08   
 2       000032.XSHE       2008-04  2008-05 -0.041783  21.128654  1.499885e+09   
 3       000033.XSHE       2008-04  2008-05 -0.045282  20.547741  8.390168e+08   
 4       000055.XSHE       2008-04  2008-05 -0.138600  21.000251  1.319146e+09   
 5       000065.XSHE       2008-04  2008-05  0.019470  20.404494  7.270412e+08   
 6       000070.XSHE       2008-04  2008-05 -0.043537  20.680975  9.585913e+08   
 7       000151.XSHE       2008-04  2008-05 -0.022690  21.000098  1.318945e+09   
 8       000153.XSHE       2008-04  2008-05  0.036690  20.698422  9.754621e+08   
 9       000404.XSHE       2008-04  2008-05 -0.010185  20.810400  1.091043e+09   
 ...             ...           ...      ...       ...        ...           ...   
 128608  688691.XSHG       2025-01  2025-02  0.186214  21.337776  1.848750e+09   
 128609  688701.XSHG       2025-01  2025-02  0.064576  20.181178  5.815329e+08   
 128610  688716.XSHG       2025-01  2025-02  0.048406  21.691853  2.634221e+09   
 128611  688718.XSHG       2025-01  2025-02  0.089782  21.276738  1.739282e+09   
 128612  688720.XSHG       2025-01  2025-02  0.022322  21.541543  2.266591e+09   
 128613  688722.XSHG       2025-01  2025-02  0.239206  21.761323  2.823726e+09   
 128614  688733.XSHG       2025-01  2025-02  0.046331  21.819397  2.992565e+09   
 128615  688767.XSHG       2025-01  2025-02 -0.010739  21.977753  3.506039e+09   
 128616  688788.XSHG       2025-01  2025-02  0.860445  21.942304  3.383933e+09   
 128617  688799.XSHG       2025-01  2025-02 -0.050065  22.091723  3.929282e+09   
 
               bm  
 0       0.599751  
 1       0.378219  
 2       0.547492  
 3       0.361216  
 4       0.428528  
 5       0.499997  
 6       0.652727  
 7       0.672036  
 8       0.651200  
 9       0.397495  
 ...          ...  
 128608  0.744005  
 128609  0.410440  
 128610  0.450630  
 128611  0.438686  
 128612  0.447260  
 128613  0.457573  
 128614  0.737121  
 128615  0.677626  
 128616  0.666130  
 128617  0.478947  
 
 [128618 rows x 7 columns],
 'bm2_size2':               secID grouping_date ret_date     exret       size        mktcap  \
 0       000012.XSHE       2008-04  2008-05 -0.040189  22.744516  7.547756e+09   
 1       000031.XSHE       2008-04  2008-05 -0.125112  23.006117  9.804597e+09   
 2       000036.XSHE       2008-04  2008-05  0.066895  22.000617  3.587125e+09   
 3       000040.XSHE       2008-04  2008-05 -0.007356  21.500034  2.174433e+09   
 4       000063.XSHE       2008-04  2008-05  0.117195  24.223222  3.311399e+10   
 5       000155.XSHE       2008-04  2008-05 -0.003743  21.981573  3.519460e+09   
 6       000159.XSHE       2008-04  2008-05  0.059433  21.493075  2.159354e+09   
 7       000301.XSHE       2008-04  2008-05 -0.116094  22.597464  6.515594e+09   
 8       000338.XSHE       2008-04  2008-05 -0.152567  23.233858  1.231222e+10   
 9       000410.XSHE       2008-04  2008-05 -0.153743  21.997159  3.574743e+09   
 ...             ...           ...      ...       ...        ...           ...   
 120510  688626.XSHG       2025-01  2025-02  0.196791  22.277445  4.731200e+09   
 120511  688658.XSHG       2025-01  2025-02  0.013547  22.611925  6.610500e+09   
 120512  688660.XSHG       2025-01  2025-02 -0.085815  23.202601  1.193333e+10   
 120513  688692.XSHG       2025-01  2025-02 -0.046837  22.448957  5.616400e+09   
 120514  688772.XSHG       2025-01  2025-02  0.132951  23.577447  1.736021e+10   
 120515  688778.XSHG       2025-01  2025-02  0.260458  23.617170  1.806370e+10   
 120516  688779.XSHG       2025-01  2025-02  0.092038  22.946911  9.240959e+09   
 120517  688798.XSHG       2025-01  2025-02  0.030891  23.002465  9.768850e+09   
 120518  688819.XSHG       2025-01  2025-02  0.077964  23.972114  2.576065e+10   
 120519  688981.XSHG       2025-01  2025-02  0.038916  25.953232  1.867865e+11   
 
               bm  
 0       0.556775  
 1       0.346021  
 2       0.445921  
 3       0.431583  
 4       0.386975  
 5       0.475067  
 6       0.476546  
 7       0.355928  
 8       0.576120  
 9       0.362070  
 ...          ...  
 120510  0.428142  
 120511  0.554215  
 120512  0.452216  
 120513  0.547989  
 120514  0.402526  
 120515  0.477779  
 120516  0.798369  
 120517  0.395175  
 120518  0.616450  
 120519  0.773773  
 
 [120520 rows x 7 columns],
 'bm3_size1':               secID grouping_date ret_date     exret       size        mktcap  \
 0       000026.XSHE       2008-04  2008-05 -0.043301  20.645446  9.251313e+08   
 1       000029.XSHE       2008-04  2008-05  0.016215  21.120014  1.486981e+09   
 2       000037.XSHE       2008-04  2008-05 -0.108884  21.006348  1.327215e+09   
 3       000043.XSHE       2008-04  2008-05 -0.147016  21.172493  1.567101e+09   
 4       000045.XSHE       2008-04  2008-05  0.100362  19.650435  3.420390e+08   
 5       000050.XSHE       2008-04  2008-05 -0.067383  21.245501  1.685791e+09   
 6       000062.XSHE       2008-04  2008-05  0.008342  21.044204  1.378420e+09   
 7       000066.XSHE       2008-04  2008-05 -0.052229  21.402978  1.973310e+09   
 8       000068.XSHE       2008-04  2008-05 -0.031456  21.229588  1.659178e+09   
 9       000096.XSHE       2008-04  2008-05  0.009034  21.189470  1.593933e+09   
 ...             ...           ...      ...       ...        ...           ...   
 112606  688708.XSHG       2025-01  2025-02 -0.028212  21.434009  2.035504e+09   
 112607  688709.XSHG       2025-01  2025-02  0.306174  21.414694  1.996564e+09   
 112608  688710.XSHG       2025-01  2025-02  0.128962  20.606692  8.899640e+08   
 112609  688717.XSHG       2025-01  2025-02  0.035921  21.853852  3.097471e+09   
 112610  688719.XSHG       2025-01  2025-02  0.257954  21.357338  1.885272e+09   
 112611  688721.XSHG       2025-01  2025-02  0.054163  20.953210  1.258529e+09   
 112612  688726.XSHG       2025-01  2025-02  0.188408  21.071609  1.416719e+09   
 112613  688737.XSHG       2025-01  2025-02  0.125799  20.860001  1.146524e+09   
 112614  688750.XSHG       2025-01  2025-02  0.074161  21.157487  1.543761e+09   
 112615  688758.XSHG       2025-01  2025-02  0.133159  20.185871  5.842686e+08   
 
               bm  
 0       0.704629  
 1       0.788133  
 2       1.329284  
 3       0.955745  
 4       1.069320  
 5       0.909661  
 6       1.089287  
 7       0.890952  
 8       1.136797  
 9       0.941514  
 ...          ...  
 112606  0.987554  
 112607  1.384621  
 112608  2.665413  
 112609  1.454006  
 112610  0.935433  
 112611  0.939431  
 112612  1.923382  
 112613  1.603089  
 112614  1.178151  
 112615  1.803284  
 
 [112616 rows x 7 columns],
 'bm3_size2':              secID grouping_date ret_date     exret       size        mktcap  \
 0      000016.XSHE       2008-04  2008-05 -0.039679  21.860817  3.119121e+09   
 1      000021.XSHE       2008-04  2008-05 -0.086401  22.132387  4.092359e+09   
 2      000022.XSHE       2008-04  2008-05 -0.122478  21.489667  2.152008e+09   
 3      000024.XSHE       2008-04  2008-05 -0.234741  23.145074  1.126621e+10   
 4      000027.XSHE       2008-04  2008-05 -0.105240  22.752579  7.608860e+09   
 5      000039.XSHE       2008-04  2008-05 -0.111948  23.475675  1.568036e+10   
 6      000042.XSHE       2008-04  2008-05 -0.192470  21.490942  2.154753e+09   
 7      000046.XSHE       2008-04  2008-05 -0.188475  22.485113  5.823186e+09   
 8      000059.XSHE       2008-04  2008-05  0.149748  21.852859  3.094396e+09   
 9      000088.XSHE       2008-04  2008-05 -0.097924  22.204275  4.397380e+09   
 ...            ...           ...      ...       ...        ...           ...   
 74298  688388.XSHG       2025-01  2025-02  0.188311  22.562430  6.291277e+09   
 74299  688425.XSHG       2025-01  2025-02  0.007770  22.520024  6.030065e+09   
 74300  688439.XSHG       2025-01  2025-02  0.183238  22.408130  5.391720e+09   
 74301  688472.XSHG       2025-01  2025-02 -0.017242  23.461753  1.546358e+10   
 74302  688538.XSHG       2025-01  2025-02  0.033806  23.274856  1.282749e+10   
 74303  688548.XSHG       2025-01  2025-02  0.067872  22.599896  6.531458e+09   
 74304  688553.XSHG       2025-01  2025-02  0.159177  22.221270  4.472755e+09   
 74305  688563.XSHG       2025-01  2025-02 -0.084046  22.569600  6.336547e+09   
 74306  688707.XSHG       2025-01  2025-02  0.229402  22.348031  5.077227e+09   
 74307  688739.XSHG       2025-01  2025-02  0.093664  22.310369  4.889563e+09   
 
              bm  
 0      1.144066  
 1      0.869745  
 2      1.279411  
 3      0.702736  
 4      1.504914  
 5      1.043355  
 6      0.864991  
 7      1.378619  
 8      1.875600  
 9      0.853762  
 ...         ...  
 74298  1.111662  
 74299  2.832698  
 74300  0.897079  
 74301  1.469365  
 74302  0.841930  
 74303  0.890568  
 74304  0.867490  
 74305  1.612417  
 74306  0.908934  
 74307  1.953006  
 
 [74308 rows x 7 columns]}

Fama French 3 factors

Fama-French 3 factors 的构建:

  • Market return 就是 CAPM 里面的市场收益率
  • 另外还有 SMB,HML,也即 Small-Minus-Big, High-Minus-Low

构建方法:

  • mktcap1 也叫做 Small, mktcap2 Big. bm1 Low, bm2 Medium, bm3 High.
  • 因此对应的,我们的
    • bm1_mktcap1: SL
    • bm2_mktcap1: SM
    • bm3_mktcap1: SH
    • bm1_mktcap2: BL
    • bm2_mktcap2: BM
    • bm3_mktcap2: BH
  • 在 Fama French (1993) 的构建里,mktcap 是在t年6月形成并保持到t+1年5月不变。bm和我们这里的构建一样,t年6月按照t-1年的BM ratio构建,保持到t+1年5月不变。
  • Fama French 计算了这6组资产组合每一年从7月到下一年6月(资产形成期的第二个月的收益率)的 value-weighted excess return。weight 是t年6月的mktcap占所在portfolio 总的 mktcap 的比重。
  • SMB: (SL+SM+SH)/3 - (BL+BM+BH)/3。这样构建的意思是把BM的影响平均掉。
  • HML: (SH+BH)/2 - (SL+BL)/2

这里我们还是按照mktcap组合的构建日期,不改成和 Fama-French (1993) 原文一样的日期(t年6月)

In [223]:
portfolios_vwret = {}
for pf in portfolios.keys():
    temp = portfolios[pf].groupby('ret_date')['mktcap'].agg({'mktcapsum':np.sum})
    portfolios[pf] = pd.merge(portfolios[pf], temp, on='ret_date')
    portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcapsum']
    portfolios[pf]['weighted_exret'] = portfolios[pf]['exret'] * portfolios[pf]['weight']
    portfolios_vwret[pf] = portfolios[pf].groupby('ret_date')['weighted_exret'].sum()

portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
portfolios_vwret_df.index = portfolios_vwret['bm1_size1'].index
portfolios_vwret_df.columns = portfolios_vwret.keys()
portfolios_vwret_df.rename(columns={"bm1_size1": "SL",
                                    "bm2_size1": "SM",
                                    "bm3_size1": "SH",
                                    "bm1_size2": "BL",
                                    "bm2_size2": "BM",
                                    "bm3_size2": "BH"},
                          inplace=True) # vw: value weighted
In [224]:
portfolios_vwret_df
Out[224]:
SL BL SM BM SH BH
ret_date
2008-05 -0.007284 -0.093577 -0.021644 -0.083619 -0.031219 -0.080907
2008-06 -0.255728 -0.216611 -0.266144 -0.233029 -0.250379 -0.227270
2008-07 0.102245 0.016148 0.103231 0.014999 0.121348 0.023211
2008-08 -0.244917 -0.197939 -0.226835 -0.148665 -0.226308 -0.147940
2008-09 -0.097522 -0.101487 -0.084565 -0.008817 -0.085767 -0.050331
2008-10 -0.269919 -0.252138 -0.241156 -0.267498 -0.249338 -0.271709
2008-11 0.204277 0.105977 0.214681 0.115202 0.212281 0.136421
2008-12 0.085073 0.012350 0.085156 0.023353 0.088594 -0.041147
2009-01 0.150153 0.109005 0.163256 0.153720 0.170672 0.117782
2009-02 0.071565 0.041163 0.077957 0.055364 0.106767 0.062305
... ... ... ... ... ... ...
2024-05 -0.024621 -0.028705 -0.007923 -0.003470 -0.001954 0.008990
2024-06 -0.099800 -0.045947 -0.091276 -0.048324 -0.097653 -0.019732
2024-07 0.010875 -0.014399 0.001214 -0.016881 0.010684 0.028977
2024-08 -0.021947 -0.046669 -0.031756 -0.037900 -0.039994 -0.021551
2024-09 0.249538 0.217921 0.228259 0.204215 0.226664 0.162202
2024-10 0.137455 -0.007024 0.086243 -0.002198 0.043261 -0.019062
2024-11 0.066066 0.003194 0.069243 -0.002622 0.081645 0.012220
2024-12 -0.065450 -0.024252 -0.048462 -0.015071 -0.042595 0.027961
2025-01 -0.025295 -0.025044 -0.009194 -0.028698 -0.016838 -0.031702
2025-02 0.097281 0.052045 0.082729 0.024387 0.068273 0.001713

202 rows × 6 columns

In [225]:
SMB = (portfolios_vwret_df['SL'] + portfolios_vwret_df['SM'] + portfolios_vwret_df['SH']) / 3 - \
      (portfolios_vwret_df['BL'] + portfolios_vwret_df['BM'] + portfolios_vwret_df['BH']) / 3 

HML = (portfolios_vwret_df['SH'] + portfolios_vwret_df['BH']) / 2 - \
      (portfolios_vwret_df['SL'] + portfolios_vwret_df['BL']) / 2 

factors_df = pd.DataFrame(np.vstack([SMB,HML])).T
factors_df.columns = ['SMB','HML']
factors_df.index = SMB.index
In [226]:
factors_df
Out[226]:
SMB HML
ret_date
2008-05 0.065986 -0.005633
2008-06 -0.031780 -0.002655
2008-07 0.090822 0.013083
2008-08 -0.067838 0.034304
2008-09 -0.035740 0.031455
2008-10 0.010311 0.000505
2008-11 0.091213 0.019224
2008-12 0.088089 -0.024988
2009-01 0.034524 0.014648
2009-02 0.032486 0.028171
... ... ...
2024-05 -0.003771 0.030181
2024-06 -0.058242 0.014181
2024-07 0.008358 0.021593
2024-08 0.004141 0.003535
2024-09 0.040041 -0.039297
2024-10 0.098415 -0.053116
2024-11 0.068054 0.012303
2024-12 -0.048381 0.037534
2025-01 0.011372 0.000900
2025-02 0.056713 -0.039669

202 rows × 2 columns

百度百科:中证800指数是由中证指数有限公司编制,其成份股是由中证500和沪深300成份股一起构成,中证800指数综合反映沪深证券市场内大中市值公司的整体状况。

In [227]:
# index_info = DataAPI.SecIDGet(assetClass="IDX",pandas="1")

# 用中证800作为market return
sec_id = ['000906.ZICN']
index_df = DataAPI.MktIdxdGet(indexID=sec_id,beginDate=START,endDate=END,field=['indexID','secShortName','tradeDate','closeIndex','CHGPct'],pandas="1")
index_df['tradeDate'] = pd.to_datetime(index_df['tradeDate'])
index_df['ret_date'] = index_df['tradeDate'].dt.to_period('M')

index_df.sort_values('tradeDate',inplace=True)
index_df = index_df.groupby('ret_date',as_index=False).last()
index_df['mktret'] = index_df['closeIndex'] / index_df['closeIndex'].shift() - 1

index_df = pd.merge(index_df,rf,left_on=['ret_date'],right_on=['ym'])
index_df['exmktret'] = index_df['mktret'] - index_df['rf']

index_df.drop(['ym','mktret','rf','indexID','secShortName','tradeDate',
               'closeIndex','CHGPct'],axis=1,inplace=True)

index_df.dropna(inplace=True)

factors_df = pd.merge(index_df, factors_df, on='ret_date')

factors_df['ret_date'] = factors_df['ret_date'].dt.to_timestamp(how='end').dt.normalize()

factors_df.set_index('ret_date',inplace=True)

((1 + factors_df).cumprod()*100).plot()
Out[227]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f69ba8811d0>
In [228]:
((1 + factors_df.loc['2018':]).cumprod()*100).plot()
Out[228]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f685da568d0>
In [229]:
factors_df
Out[229]:
exmktret SMB HML
ret_date
2008-05-31 -0.080798 0.065986 -0.005633
2008-06-30 -0.236014 -0.031780 -0.002655
2008-07-31 0.014972 0.090822 0.013083
2008-08-31 -0.170063 -0.067838 0.034304
2008-09-30 -0.067913 -0.035740 0.031455
2008-10-31 -0.264035 0.010311 0.000505
2008-11-30 0.112223 0.091213 0.019224
2008-12-31 0.003717 0.088089 -0.024988
2009-01-31 0.124356 0.034524 0.014648
2009-02-28 0.057622 0.032486 0.028171
... ... ... ...
2024-05-31 -0.013421 -0.003771 0.030181
2024-06-30 -0.044333 -0.058242 0.014181
2024-07-31 -0.009161 0.008358 0.021593
2024-08-31 -0.040999 0.004141 0.003535
2024-09-30 0.214585 0.040041 -0.039297
2024-10-31 -0.018813 0.098415 -0.053116
2024-11-30 0.000586 0.068054 0.012303
2024-12-31 -0.004188 -0.048381 0.037534
2025-01-31 -0.030455 0.011372 0.000900
2025-02-28 0.024531 0.056713 -0.039669

202 rows × 3 columns

In [230]:
factors_df.to_csv('./output_data/factors/ff3.csv')

Long-only factors

In [231]:
small_only = (portfolios_vwret_df['SL'] + portfolios_vwret_df['SM'] + portfolios_vwret_df['SH']) / 3 

high_only = (portfolios_vwret_df['SH'] + portfolios_vwret_df['BH']) / 2 

factors_long_df = pd.DataFrame(np.vstack([small_only,high_only])).T
factors_long_df.columns = ['small_only','high_only']
factors_long_df.index = small_only.index

factors_long_df = pd.merge(index_df, factors_long_df, on='ret_date')

factors_long_df['ret_date'] = factors_long_df['ret_date'].dt.to_timestamp(freq='day',how='end').dt.normalize()

factors_long_df.set_index('ret_date',inplace=True)

((1 + factors_long_df).cumprod()*100).plot()
Out[231]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f68de244610>
In [232]:
((1 + factors_long_df.loc['2018':]).cumprod()*100).plot()
Out[232]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f69baf22110>
In [233]:
factors_long_df
Out[233]:
exmktret small_only high_only
ret_date
2008-05-31 -0.080798 -0.020049 -0.056063
2008-06-30 -0.236014 -0.257417 -0.238824
2008-07-31 0.014972 0.108941 0.072280
2008-08-31 -0.170063 -0.232686 -0.187124
2008-09-30 -0.067913 -0.089285 -0.068049
2008-10-31 -0.264035 -0.253471 -0.260523
2008-11-30 0.112223 0.210413 0.174351
2008-12-31 0.003717 0.086274 0.023723
2009-01-31 0.124356 0.161360 0.144227
2009-02-28 0.057622 0.085430 0.084536
... ... ... ...
2024-05-31 -0.013421 -0.011499 0.003518
2024-06-30 -0.044333 -0.096243 -0.058692
2024-07-31 -0.009161 0.007591 0.019831
2024-08-31 -0.040999 -0.031232 -0.030772
2024-09-30 0.214585 0.234821 0.194433
2024-10-31 -0.018813 0.088987 0.012100
2024-11-30 0.000586 0.072318 0.046932
2024-12-31 -0.004188 -0.052169 -0.007317
2025-01-31 -0.030455 -0.017109 -0.024270
2025-02-28 0.024531 0.082761 0.034993

202 rows × 3 columns

In [234]:
factors_long_df.to_csv('./output_data/factors/ff3_long_only.csv')
In [ ]:
 
In [ ]: