In [1]:
import numpy as np
import pandas as pd
import datetime as dt

pd.set_option('display.max_rows', 16)

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)
import seaborn as sns

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

import gc
In [2]:
plt.rcParams['figure.figsize'] = (16.0, 9.0)

Data

In [3]:
START = '2007-01-01'
END = '2024-04-15'
# Security Id
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
cond1 = (stk_info['exchangeCD'] == 'XSHE') | (stk_info['exchangeCD'] == 'XSHG')
cond2 = (stk_info['listStatusCD'] == 'L') | (stk_info['listStatusCD'] == 'DE')
cond3 = stk_info['transCurrCD']=='CNY'
stk_info = stk_info[cond1 & cond2 & cond3].copy()
stk_id = stk_info['secID']

ST

In [4]:
st_df = DataAPI.SecSTGet(beginDate=START,endDate=END,secID=stk_id,field=['secID','tradeDate','STflg'],pandas="1")
In [5]:
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'],format="%Y-%m-%d")

Risk-free rate

In [6]:
shibor_df = DataAPI.MktIborGet(secID="Shibor1M.IRCN",beginDate=START,endDate=END,field=['secID','tradeDate','rate'],pandas="1")
shibor_df['rate'] = shibor_df['rate']*0.01/12
shibor_df['tradeDate'] = pd.to_datetime(shibor_df['tradeDate'])
shibor_df.drop('secID',axis=1,inplace=True)
shibor_df.rename(columns={'rate':'rf'},inplace=True)
shibor_df['ym'] = shibor_df['tradeDate'].dt.to_period('M')
shibor_df.sort_values('tradeDate',inplace=True)
shibor_df_m = shibor_df.groupby('ym',as_index=False).last()
shibor_df_m.drop('tradeDate',axis=1,inplace=True)
shibor_df_m
Out[6]:
ym rf
0 2007-01 0.002100
1 2007-02 0.002110
2 2007-03 0.002234
3 2007-04 0.002125
4 2007-05 0.002527
5 2007-06 0.002542
6 2007-07 0.002481
7 2007-08 0.002404
... ... ...
200 2023-09 0.001904
201 2023-10 0.001890
202 2023-11 0.001914
203 2023-12 0.002068
204 2024-01 0.001902
205 2024-02 0.001749
206 2024-03 0.001783
207 2024-04 0.001687

208 rows × 2 columns
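
As a quick sanity check on the conversion above (a minimal sketch, not part of the pipeline): Shibor is quoted as an annualized percentage, so dividing by 100 and by 12 gives a simple monthly decimal rate.

# Sketch: an annualized quote of 2.52% (hypothetical) becomes a simple monthly decimal rate
quoted_annual_pct = 2.52
monthly_rf = quoted_annual_pct * 0.01 / 12
print(round(monthly_rf, 6))   # 0.0021, the order of magnitude seen in shibor_df_m['rf']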

Beta

In [7]:
beta_df = pd.read_pickle('./data/beta_df.pkl')
beta_df['tradeDate'] = pd.to_datetime(beta_df['tradeDate'], format="%Y-%m-%d")
beta_df['ym'] = beta_df['tradeDate'].dt.to_period('M')
beta_df.drop(['Beta60','Beta120'],axis=1,inplace=True)
beta_df['Beta252'] = pd.to_numeric(beta_df['Beta252'])
# Winsorization
# up_q = 0.99999
# lower_q = 0.00001
# beta_df['Beta252_winsor'] = beta_df['Beta252'].clip(lower=beta_df['Beta252'].quantile(lower_q),upper=beta_df['Beta252'].quantile(up_q))
# Monthly
beta_df_m = beta_df.groupby(['secID','ym'],as_index=False)['Beta252'].last()
beta_df_m.rename(columns={'Beta252':'beta'},inplace=True)
beta_df_m
Out[7]:
secID ym beta
0 000001.XSHE 2007-01 0.7949
1 000001.XSHE 2007-02 0.7880
2 000001.XSHE 2007-03 0.8512
3 000001.XSHE 2007-04 0.8642
4 000001.XSHE 2007-05 0.7715
5 000001.XSHE 2007-06 0.4614
6 000001.XSHE 2007-07 0.6423
7 000001.XSHE 2007-08 0.7722
... ... ... ...
619996 689009.XSHG 2023-08 0.8234
619997 689009.XSHG 2023-09 0.9152
619998 689009.XSHG 2023-10 0.9247
619999 689009.XSHG 2023-11 0.9541
620000 689009.XSHG 2023-12 1.0448
620001 689009.XSHG 2024-01 1.2314
620002 689009.XSHG 2024-02 1.4905
620003 689009.XSHG 2024-03 1.5477

620004 rows × 3 columns

BM (book-to-market)

In [8]:
pb_df = pd.read_pickle('./data/pb_df.pkl')
pb_df['tradeDate'] = pd.to_datetime(pb_df['tradeDate'])
pb_df['PB'] = pd.to_numeric(pb_df['PB'])
pb_df['ym'] = pb_df['tradeDate'].dt.to_period('M')
pb_df.sort_values(['secID','tradeDate'],inplace=True)
pb_df = pb_df.groupby(['secID','ym'],as_index=False).last()
pb_df['bm'] = 1 / pb_df['PB']
pb_df.drop(['tradeDate','PB'],axis=1,inplace=True)
pb_df = pb_df[pb_df['bm'] >= 0]
pb_df
Out[8]:
secID ym bm
0 000001.XSHE 2007-01 0.162639
1 000001.XSHE 2007-02 0.163321
2 000001.XSHE 2007-03 0.176236
3 000001.XSHE 2007-04 0.140732
4 000001.XSHE 2007-05 0.127291
5 000001.XSHE 2007-06 0.123739
6 000001.XSHE 2007-07 0.093992
7 000001.XSHE 2007-08 0.097085
... ... ... ...
625109 689009.XSHG 2023-09 0.209367
625110 689009.XSHG 2023-10 0.223899
625111 689009.XSHG 2023-11 0.221803
625112 689009.XSHG 2023-12 0.247525
625113 689009.XSHG 2024-01 0.313607
625114 689009.XSHG 2024-02 0.241569
625115 689009.XSHG 2024-03 0.247127
625116 689009.XSHG 2024-04 0.260342

617335 rows × 3 columns
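
A minimal illustration of the two steps above, with made-up PB values: bm is the reciprocal of PB, and a negative book value shows up as a negative PB, so the bm >= 0 filter removes it.

# Toy example (hypothetical numbers)
toy = pd.DataFrame({'secID': ['A', 'B', 'C'], 'PB': [4.0, 0.8, -2.5]})
toy['bm'] = 1 / toy['PB']
toy = toy[toy['bm'] >= 0]
print(toy)   # only A (bm = 0.25) and B (bm = 1.25) remain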

Trading data

In [9]:
# %%time
# # About 8 mins
# # Download stock data from Uqer (优矿); this takes a while. Due to Uqer's API limits, download 3 years at a time.

# stk_dict = {}
# begin_ = dt.datetime.strptime(START, '%Y-%m-%d').year
# end_ = dt.datetime.strptime(START, '%Y-%m-%d').year+3
# field = ['secID','tradeDate','preClosePrice','closePrice','negMarketValue','turnoverValue','turnoverRate']
# while begin_ <= 2024:
#     if begin_ == 2024:
#         yesterday = (dt.datetime.today() - dt.timedelta(days=1)).strftime('%Y%m%d')
#         stk_dict[begin_] = DataAPI.MktEqudAdjAfGet(secID=stk_id,
#                                                      beginDate=f'{begin_}0101',
#                                                      endDate=yesterday,
#                                                      field=field,pandas="1")
#     else:
#         stk_dict[begin_] = DataAPI.MktEqudAdjAfGet(secID=stk_id,
#                                                          beginDate=f'{begin_}0101',
#                                                          endDate=f'{end_}1231',
#                                                          field=field,pandas="1")
#     begin_ = end_ + 1
#     end_ = begin_ + 3
    
# stk_df = pd.DataFrame(np.vstack([_df for _df in stk_dict.values()]), columns=field)
    
# stk_df.to_pickle('./data/stk_df.pkl')
In [10]:
# stk_df = pd.read_pickle('./data/stk_df.pkl')
# stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
# stk_df.sort_values(['secID','tradeDate'],inplace=True)
# # drop ST stocks
# print(stk_df.shape)
# stk_df = pd.merge(stk_df, st_df, on=['secID','tradeDate'],how='left')
# stk_df = stk_df[stk_df['STflg'].isna()].copy()
# stk_df.drop('STflg',axis=1,inplace=True)
# print(stk_df.shape)
In [11]:
# num_cols = ['preClosePrice','closePrice','negMarketValue','turnoverValue','turnoverRate']
# for col in num_cols:
#     stk_df[col] = pd.to_numeric(stk_df[col])
In [12]:
# # If the trading days need to be consecutive, fill in the missing dates first. With daily data this can
# # produce a much larger dataframe, especially when some securities have many missing dates.

# def fill_missing(df, full_dates, id_col='secID', date_col='tradeDate'):
#     """
#     This function fills the missing dates for stocks.
#     Parameters:
#         df: The dataframe. Could be a sub-dataframe created by "groupby".
#             The dataframe must be sorted on the "date_col".
#         full_dates: the unique dates covering all securities in the full dataframe. 
#                     Need to be sorted.
#         id_col: the security id.
#         date_col: the dates column for the security
#     Returns:
#         A dataframe with the missing dates filled with NA.
#     """
#     stk_id = df[id_col].unique()
#     # Newer version of pandas will allow comparison between "Timestamp" and "datetime64"
# #     date_start = np.where(full_dates == df[date_col].min())[0][0] 
# #     date_end = np.where(full_dates == df[date_col].max())[0][0]
#     date_start = np.where(full_dates == df[date_col].min().to_datetime64())[0][0] 
#     date_end = np.where(full_dates == df[date_col].max().to_datetime64())[0][0]
#     dates = full_dates[date_start:date_end+1]
#     idx = pd.MultiIndex.from_product([stk_id,dates],
#                                      names=(id_col,date_col))
#     df = df.set_index([id_col,date_col]).reindex(idx).reset_index()
#     return df
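
The core of fill_missing is a reindex on a (secID, tradeDate) MultiIndex spanning the security's own first and last trading dates. A self-contained toy sketch of that idea (made-up IDs and dates):

# Toy example: stock 'A' is missing 2020-01-03; reindexing over the full date range inserts a NaN row
full_dates = pd.to_datetime(['2020-01-02', '2020-01-03', '2020-01-06'])
toy = pd.DataFrame({'secID': ['A', 'A'],
                    'tradeDate': pd.to_datetime(['2020-01-02', '2020-01-06']),
                    'closePrice': [10.0, 10.5]})
idx = pd.MultiIndex.from_product([['A'], full_dates], names=('secID', 'tradeDate'))
filled = toy.set_index(['secID', 'tradeDate']).reindex(idx).reset_index()
print(filled)   # the 2020-01-03 row appears with closePrice NaN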
In [13]:
# full_dates = np.sort(stk_df['tradeDate'].unique())
In [14]:
# %%time
# stk_df = stk_df.groupby('secID').apply(fill_missing, full_dates=full_dates)
In [15]:
# stk_df.reset_index(drop=True, inplace=True)

# stk_df['ret_daily'] = stk_df['closePrice'] / stk_df['preClosePrice'] - 1
# stk_df['illiq_daily'] = abs(stk_df['ret_daily'].values) / stk_df['turnoverValue'].values
# stk_df.loc[np.isinf(stk_df['illiq_daily']),'illiq_daily'] = np.nan
# stk_df['ym'] = stk_df['tradeDate'].dt.to_period('M')

# stk_df.to_pickle('./data/stk_df_filled.pkl')
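
The commented-out illiq_daily line is an Amihud-style daily illiquidity proxy: the absolute daily return per unit of trading value, with zero-turnover days (which produce inf) set to NaN. A small sketch with hypothetical numbers:

# Sketch: |daily return| per yuan traded; same order of magnitude as illiq_daily in stk_df
ret_daily = -0.02
turnover_value = 5e8            # CNY traded that day (hypothetical)
illiq_daily = abs(ret_daily) / turnover_value
print(illiq_daily)              # 4e-11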
In [16]:
stk_df = pd.read_pickle('./data/stk_df_filled.pkl')
In [17]:
stk_df
Out[17]:
secID tradeDate preClosePrice closePrice negMarketValue turnoverValue turnoverRate ret_daily illiq_daily ym
0 000001.XSHE 2007-06-20 824.193 987.007 4.835036e+10 4.182345e+09 0.0840 0.197544 4.723272e-11 2007-06
1 000001.XSHE 2007-06-21 987.007 1085.740 5.318694e+10 2.285485e+09 0.0440 0.100033 4.376871e-11 2007-06
2 000001.XSHE 2007-06-22 1085.740 1120.233 5.487665e+10 2.761567e+09 0.0510 0.031769 1.150402e-11 2007-06
3 000001.XSHE 2007-06-25 1120.233 1113.904 5.456661e+10 2.324186e+09 0.0426 -0.005650 2.430837e-12 2007-06
4 000001.XSHE 2007-06-26 1113.904 1113.904 5.456661e+10 0.000000e+00 0.0000 0.000000 NaN 2007-06
5 000001.XSHE 2007-06-27 1113.904 1019.602 4.994705e+10 2.446556e+09 0.0489 -0.084659 3.460334e-11 2007-06
6 000001.XSHE 2007-06-28 1019.602 953.780 4.672266e+10 1.617434e+09 0.0336 -0.064557 3.991295e-11 2007-06
7 000001.XSHE 2007-06-29 953.780 870.870 4.266117e+10 1.410758e+09 0.0316 -0.086928 6.161781e-11 2007-06
... ... ... ... ... ... ... ... ... ... ...
12395394 689009.XSHG 2024-04-01 29.950 31.050 1.600553e+10 1.891675e+08 0.0120 0.036728 1.941554e-10 2024-04
12395395 689009.XSHG 2024-04-02 31.050 30.370 1.565501e+10 1.320106e+08 0.0084 -0.021900 1.658970e-10 2024-04
12395396 689009.XSHG 2024-04-03 30.370 29.320 1.511376e+10 1.629004e+08 0.0107 -0.034574 2.122376e-10 2024-04
12395397 689009.XSHG 2024-04-08 29.320 27.500 1.417559e+10 2.572971e+08 0.0179 -0.062074 2.412529e-10 2024-04
12395398 689009.XSHG 2024-04-09 27.500 28.070 1.446941e+10 1.401048e+08 0.0098 0.020727 1.479413e-10 2024-04
12395399 689009.XSHG 2024-04-10 28.070 27.570 1.421167e+10 1.128337e+08 0.0080 -0.017813 1.578660e-10 2024-04
12395400 689009.XSHG 2024-04-11 27.570 27.330 1.408796e+10 1.264233e+08 0.0089 -0.008705 6.885686e-11 2024-04
12395401 689009.XSHG 2024-04-12 27.330 27.800 1.433023e+10 1.743296e+08 0.0121 0.017197 9.864775e-11 2024-04

12395402 rows × 10 columns

Monthly trading data

In [18]:
stk_df_m = stk_df.groupby(['secID','ym'],as_index=False).last()
stk_df_m['ret'] = stk_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1)
stk_df_m['size'] = np.log(stk_df_m['negMarketValue'])
stk_df_m.drop(['tradeDate','preClosePrice'],axis=1,inplace=True)
stk_df_m = pd.merge(stk_df_m, shibor_df_m, on='ym')
stk_df_m['exret'] = stk_df_m['ret'] - stk_df_m['rf']
stk_df_m.sort_values(['secID','ym'],inplace=True)
stk_df_m.rename(columns={'negMarketValue':'mktcap'},inplace=True)
stk_df_m
Out[18]:
secID ym closePrice mktcap turnoverValue turnoverRate ret_daily illiq_daily ret size rf exret
0 000001.XSHE 2007-06 870.870 4.266117e+10 1.410758e+09 0.0316 -0.086928 6.161781e-11 NaN 24.476555 0.002542 NaN
1293 000001.XSHE 2007-07 1146.498 5.616330e+10 1.479466e+09 0.0270 0.059356 4.012016e-11 0.316497 24.751529 0.002481 0.314016
2595 000001.XSHE 2007-08 1202.510 5.890714e+10 6.552881e+08 0.0112 0.007958 1.214425e-11 0.048855 24.799228 0.002404 0.046451
3922 000001.XSHE 2007-09 1265.167 6.197651e+10 1.408136e+09 0.0228 0.030147 2.140927e-11 0.052105 24.850021 0.002621 0.049484
5264 000001.XSHE 2007-10 1520.542 7.448652e+10 1.440425e+09 0.0200 0.065410 4.541049e-11 0.201851 25.033884 0.003095 0.198756
6612 000001.XSHE 2007-11 1141.751 5.593078e+10 5.452159e+08 0.0096 -0.026181 4.801976e-11 -0.249116 24.747381 0.003780 -0.252896
7976 000001.XSHE 2007-12 1221.497 6.574629e+10 1.019671e+09 0.0154 -0.010256 1.005854e-11 0.069845 24.909069 0.003011 0.066834
9362 000001.XSHE 2008-01 1053.778 5.850212e+10 5.328429e+08 0.0089 -0.019146 3.593101e-11 -0.137306 24.792329 0.002949 -0.140255
... ... ... ... ... ... ... ... ... ... ... ... ...
575773 689009.XSHG 2023-09 34.830 1.785208e+10 8.745210e+07 0.0049 0.003746 4.283943e-11 0.042502 23.605386 0.001904 0.040598
580846 689009.XSHG 2023-10 32.790 1.716478e+10 6.562302e+07 0.0038 -0.006364 9.697263e-11 -0.058570 23.566125 0.001890 -0.060460
585927 689009.XSHG 2023-11 33.100 1.732706e+10 1.497706e+08 0.0087 -0.005110 3.411690e-11 0.009454 23.575535 0.001914 0.007540
591023 689009.XSHG 2023-12 29.660 1.552630e+10 1.169338e+08 0.0075 0.015753 1.347209e-10 -0.103927 23.465801 0.002068 -0.105996
596129 689009.XSHG 2024-01 23.340 1.221793e+10 1.255953e+08 0.0100 -0.033940 2.702363e-10 -0.213082 23.226170 0.001902 -0.214983
601238 689009.XSHG 2024-02 30.300 1.586132e+10 2.444204e+08 0.0156 0.044828 1.834036e-10 0.298201 23.487149 0.001749 0.296451
606353 689009.XSHG 2024-03 29.950 1.543851e+10 1.180638e+08 0.0077 -0.000334 2.827102e-12 -0.011551 23.460131 0.001783 -0.013334
611468 689009.XSHG 2024-04 27.800 1.433023e+10 1.743296e+08 0.0121 0.017197 9.864775e-11 -0.071786 23.385637 0.001687 -0.073474

617652 rows × 12 columns
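
A quick check of the size variable (a sketch using the first row above): size is the natural log of the free-float market cap (negMarketValue).

# Sanity check: ln(4.266117e10) should match the size column of the first row above
print(np.log(4.266117e10))   # ≈ 24.4766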

In [19]:
stk_df_m[stk_df_m['secID'] == '000001.XSHE'].set_index('ym').loc['2010-01':'2010-12']
# Suspension periods are filled with NaN. The monthly return for the first month after trading resumes is also NaN.
Out[19]:
secID closePrice mktcap turnoverValue turnoverRate ret_daily illiq_daily ret size rf exret
ym
2010-01 000001.XSHE 895.357 6.345328e+10 5.284377e+08 0.0082 -0.002299 4.350204e-12 -0.109561 24.873570 0.001511 -0.111072
2010-02 000001.XSHE 926.303 6.564637e+10 1.008966e+09 0.0154 0.013544 1.342342e-11 0.034563 24.907548 0.001547 0.033016
2010-03 000001.XSHE 957.248 6.783945e+10 7.105142e+08 0.0104 -0.019028 2.678085e-11 0.033407 24.940410 0.001450 0.031957
2010-04 000001.XSHE 848.320 6.011979e+10 6.628292e+08 0.0111 0.019336 2.917199e-11 -0.113793 24.819605 0.001464 -0.115256
2010-05 000001.XSHE 722.475 5.120124e+10 4.806838e+08 0.0092 -0.031527 6.558795e-11 -0.148346 24.659030 0.002081 -0.150427
2010-06 000001.XSHE 722.475 5.437499e+10 0.000000e+00 0.0000 0.000000 9.651729e-11 0.000000 24.719170 0.003236 -0.003236
2010-07 000001.XSHE 722.475 5.437499e+10 0.000000e+00 0.0000 0.000000 NaN 0.000000 24.719170 0.001798 -0.001798
2010-08 000001.XSHE 722.475 5.437499e+10 0.000000e+00 0.0000 0.000000 NaN 0.000000 24.719170 0.002369 -0.002369
2010-09 000001.XSHE 669.249 5.036906e+10 0.000000e+00 0.0000 0.000000 7.082247e-12 -0.073672 24.642643 0.002922 -0.076594
2010-10 000001.XSHE 759.610 5.716982e+10 6.635094e+08 0.0115 -0.019178 2.890457e-11 0.135019 24.769292 0.002214 0.132805
2010-11 000001.XSHE 676.263 5.089697e+10 4.642922e+08 0.0091 -0.019737 4.250943e-11 -0.109723 24.653069 0.003092 -0.112815
2010-12 000001.XSHE 651.507 4.903361e+10 3.783178e+08 0.0078 0.007659 2.024503e-11 -0.036607 24.615772 0.005141 -0.041748

Momentum and reversal

Using unfilled data

In [20]:
stk_unfilled_df = pd.read_pickle('./data/stk_df.pkl')
stk_unfilled_df['tradeDate'] = pd.to_datetime(stk_unfilled_df['tradeDate'], format='%Y-%m-%d')
stk_unfilled_df['ym'] = stk_unfilled_df['tradeDate'].dt.to_period('M')
stk_unfilled_df.sort_values(['secID','tradeDate'],inplace=True)
# drop ST stocks
print(stk_unfilled_df.shape)
stk_unfilled_df = pd.merge(stk_unfilled_df, st_df, on=['secID','tradeDate'],how='left')
stk_unfilled_df = stk_unfilled_df[stk_unfilled_df['STflg'].isna()].copy()
stk_unfilled_df.drop('STflg',axis=1,inplace=True)
print(stk_unfilled_df.shape)
# Monthly
stk_unfilled_df_m = stk_unfilled_df.groupby(['secID','ym'],as_index=False).last()
(12553162, 8)
(12001762, 8)
In [21]:
stk_unfilled_df_m['ret_mom'] = stk_unfilled_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1) # ret_mom is only used for the momentum calculation; the monthly ret computed earlier is kept for later use
stk_unfilled_df_m.sort_values(['secID','ym'],inplace=True)
stk_unfilled_df_m['1+ret_mom'] = stk_unfilled_df_m['ret_mom'] + 1
In [22]:
stk_unfilled_df_m['mom'] = stk_unfilled_df_m.groupby('secID').rolling(11,min_periods=9)['1+ret_mom'].apply(np.prod, raw=True).values - 1
In [23]:
stk_df_m = pd.merge(stk_df_m, stk_unfilled_df_m[['secID','ym','1+ret_mom']],on=['secID','ym'],how='left')
stk_df_m.loc[stk_df_m['1+ret_mom'].isna(),'1+ret_mom'] = 1 # Fill missing values with 1 so the rolling product can be taken.
stk_df_m['mom'] = stk_df_m.groupby('secID').rolling(11,min_periods=11)['1+ret_mom'].apply(np.prod, raw=True).values - 1 
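
The rolling 11-month product of (1 + monthly return), minus 1, is the cumulative return over the trailing 11 months. A toy check with made-up returns:

# Toy check: 1% per month for 12 months; the last 11-month window gives 1.01**11 - 1
r = pd.Series([0.01] * 12)
mom_toy = (r + 1).rolling(11, min_periods=11).apply(np.prod, raw=True) - 1
print(mom_toy.iloc[-1])   # ≈ 0.1157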
In [24]:
stk_df_m['rev'] = stk_df_m['exret'].values                        # reversal: excess return of the current month ym
stk_df_m['ret'] = stk_df_m.groupby(['secID'])['ret'].shift(-1)    # returns become next month's returns
stk_df_m['rf'] = stk_df_m.groupby(['secID'])['rf'].shift(-1)
stk_df_m['exret'] = stk_df_m.groupby(['secID'])['exret'].shift(-1)
stk_df_m['ret_date'] = stk_df_m.groupby('secID')['ym'].shift(-1)  # month in which the return is earned

stk_df_m['mom'] = stk_df_m.groupby(['secID'])['mom'].shift()      # momentum known at the end of the previous month
stk_df_m['mom_date'] = stk_df_m.groupby('secID')['ym'].shift()
In [25]:
stk_df_m.drop(['ret_daily','turnoverValue','turnoverRate','illiq_daily','1+ret_mom'],axis=1,inplace=True)
In [26]:
stk_df_m
Out[26]:
secID ym closePrice mktcap ret size rf exret mom rev ret_date mom_date
0 000001.XSHE 2007-06 870.870 4.266117e+10 0.316497 24.476555 0.002481 0.314016 NaN NaN 2007-07 NaT
1 000001.XSHE 2007-07 1146.498 5.616330e+10 0.048855 24.751529 0.002404 0.046451 NaN 0.314016 2007-08 2007-06
2 000001.XSHE 2007-08 1202.510 5.890714e+10 0.052105 24.799228 0.002621 0.049484 NaN 0.046451 2007-09 2007-07
3 000001.XSHE 2007-09 1265.167 6.197651e+10 0.201851 24.850021 0.003095 0.198756 NaN 0.049484 2007-10 2007-08
4 000001.XSHE 2007-10 1520.542 7.448652e+10 -0.249116 25.033884 0.003780 -0.252896 NaN 0.198756 2007-11 2007-09
5 000001.XSHE 2007-11 1141.751 5.593078e+10 0.069845 24.747381 0.003011 0.066834 NaN -0.252896 2007-12 2007-10
6 000001.XSHE 2007-12 1221.497 6.574629e+10 -0.137306 24.909069 0.002949 -0.140255 NaN 0.066834 2008-01 2007-11
7 000001.XSHE 2008-01 1053.778 5.850212e+10 -0.004504 24.792329 0.002946 -0.007450 NaN -0.140255 2008-02 2007-12
... ... ... ... ... ... ... ... ... ... ... ... ...
617644 689009.XSHG 2023-09 34.830 1.785208e+10 -0.058570 23.605386 0.001890 -0.060460 -0.127904 0.040598 2023-10 2023-08
617645 689009.XSHG 2023-10 32.790 1.716478e+10 0.009454 23.566125 0.001914 0.007540 0.088437 -0.060460 2023-11 2023-09
617646 689009.XSHG 2023-11 33.100 1.732706e+10 -0.103927 23.575535 0.002068 -0.105996 -0.017675 0.007540 2023-12 2023-10
617647 689009.XSHG 2023-12 29.660 1.552630e+10 -0.213082 23.465801 0.001902 -0.214983 0.085602 -0.105996 2024-01 2023-11
617648 689009.XSHG 2024-01 23.340 1.221793e+10 0.298201 23.226170 0.001749 0.296451 -0.106357 -0.214983 2024-02 2023-12
617649 689009.XSHG 2024-02 30.300 1.586132e+10 -0.011551 23.487149 0.001783 -0.013334 -0.292727 0.296451 2024-03 2024-01
617650 689009.XSHG 2024-03 29.950 1.543851e+10 -0.071786 23.460131 0.001687 -0.073474 -0.195005 -0.013334 2024-04 2024-02
617651 689009.XSHG 2024-04 27.800 1.433023e+10 NaN 23.385637 NaN NaN -0.104366 -0.073474 NaT 2024-03

617652 rows × 12 columns

Merge

In [27]:
ret_df = pd.merge(stk_df_m[['secID','ret_date','ret','rf','exret','ym','mktcap','size','rev','mom_date','mom']], 
                  beta_df_m[['secID','ym','beta']], on=['secID','ym'], how='left')
In [28]:
ret_df
Out[28]:
secID ret_date ret rf exret ym mktcap size rev mom_date mom beta
0 000001.XSHE 2007-07 0.316497 0.002481 0.314016 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614
1 000001.XSHE 2007-08 0.048855 0.002404 0.046451 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423
2 000001.XSHE 2007-09 0.052105 0.002621 0.049484 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722
3 000001.XSHE 2007-10 0.201851 0.003095 0.198756 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596
4 000001.XSHE 2007-11 -0.249116 0.003780 -0.252896 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988
5 000001.XSHE 2007-12 0.069845 0.003011 0.066834 2007-11 5.593078e+10 24.747381 -0.252896 2007-10 NaN 0.9560
6 000001.XSHE 2008-01 -0.137306 0.002949 -0.140255 2007-12 6.574629e+10 24.909069 0.066834 2007-11 NaN 0.9468
7 000001.XSHE 2008-02 -0.004504 0.002946 -0.007450 2008-01 5.850212e+10 24.792329 -0.140255 2007-12 NaN 0.9654
... ... ... ... ... ... ... ... ... ... ... ... ...
617644 689009.XSHG 2023-10 -0.058570 0.001890 -0.060460 2023-09 1.785208e+10 23.605386 0.040598 2023-08 -0.127904 0.9152
617645 689009.XSHG 2023-11 0.009454 0.001914 0.007540 2023-10 1.716478e+10 23.566125 -0.060460 2023-09 0.088437 0.9247
617646 689009.XSHG 2023-12 -0.103927 0.002068 -0.105996 2023-11 1.732706e+10 23.575535 0.007540 2023-10 -0.017675 0.9541
617647 689009.XSHG 2024-01 -0.213082 0.001902 -0.214983 2023-12 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448
617648 689009.XSHG 2024-02 0.298201 0.001749 0.296451 2024-01 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314
617649 689009.XSHG 2024-03 -0.011551 0.001783 -0.013334 2024-02 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905
617650 689009.XSHG 2024-04 -0.071786 0.001687 -0.073474 2024-03 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477
617651 689009.XSHG NaT NaN NaN NaN 2024-04 1.433023e+10 23.385637 -0.073474 2024-03 -0.104366 NaN

617652 rows × 12 columns

In [29]:
ret_df = pd.merge(ret_df, pb_df, on=['secID','ym'],how='left')
In [30]:
ret_df
Out[30]:
secID ret_date ret rf exret ym mktcap size rev mom_date mom beta bm
0 000001.XSHE 2007-07 0.316497 0.002481 0.314016 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614 0.123739
1 000001.XSHE 2007-08 0.048855 0.002404 0.046451 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 0.093992
2 000001.XSHE 2007-09 0.052105 0.002621 0.049484 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 0.097085
3 000001.XSHE 2007-10 0.201851 0.003095 0.198756 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 0.092276
4 000001.XSHE 2007-11 -0.249116 0.003780 -0.252896 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 0.083411
5 000001.XSHE 2007-12 0.069845 0.003011 0.066834 2007-11 5.593078e+10 24.747381 -0.252896 2007-10 NaN 0.9560 0.111084
6 000001.XSHE 2008-01 -0.137306 0.002949 -0.140255 2007-12 6.574629e+10 24.909069 0.066834 2007-11 NaN 0.9468 0.094476
7 000001.XSHE 2008-02 -0.004504 0.002946 -0.007450 2008-01 5.850212e+10 24.792329 -0.140255 2007-12 NaN 0.9654 0.109513
... ... ... ... ... ... ... ... ... ... ... ... ... ...
617644 689009.XSHG 2023-10 -0.058570 0.001890 -0.060460 2023-09 1.785208e+10 23.605386 0.040598 2023-08 -0.127904 0.9152 0.209367
617645 689009.XSHG 2023-11 0.009454 0.001914 0.007540 2023-10 1.716478e+10 23.566125 -0.060460 2023-09 0.088437 0.9247 0.223899
617646 689009.XSHG 2023-12 -0.103927 0.002068 -0.105996 2023-11 1.732706e+10 23.575535 0.007540 2023-10 -0.017675 0.9541 0.221803
617647 689009.XSHG 2024-01 -0.213082 0.001902 -0.214983 2023-12 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 0.247525
617648 689009.XSHG 2024-02 0.298201 0.001749 0.296451 2024-01 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 0.313607
617649 689009.XSHG 2024-03 -0.011551 0.001783 -0.013334 2024-02 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 0.241569
617650 689009.XSHG 2024-04 -0.071786 0.001687 -0.073474 2024-03 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 0.247127
617651 689009.XSHG NaT NaN NaN NaN 2024-04 1.433023e+10 23.385637 -0.073474 2024-03 -0.104366 NaN 0.260342

617652 rows × 13 columns

Monthly factors

In [31]:
def double_sort(df, sort1, sort2='size', group_date='ym', merge_cols=['secID','ret_date']):
    """
    Double sorting. 
    Arguments:
        sort1: variable 1 for sorting into 3 groups
        sort2: default is "size", sorting into 2 groups
    returns:
        portfolios containing 2*3 groups
    """
    q1 = dict()
    keys = [f'q_{sort1}_1',f'q_{sort1}_2']
    values = [0.3, 0.7]
    q1.update(zip(keys,values))

    q2 = dict()
    keys = [f'q_{sort2}_1']
    values = [0.5]
    q2.update(zip(keys,values))

    q1_df = pd.DataFrame()
    for key, value in q1.items():
        q1_df[key] = df.groupby([group_date])[sort1].quantile(value)
    
    q2_df = pd.DataFrame()
    for key, value in q2.items():
        q2_df[key] = df.groupby([group_date])[sort2].quantile(value)

    ret_df_q = pd.merge(df, q2_df, on=group_date)
    ret_df_q = pd.merge(ret_df_q, q1_df, on=group_date)

    portfolios1 = dict()
    portfolios1[f'{sort1}1'] = ret_df_q.loc[ret_df_q[f'{sort1}'] <= ret_df_q[f'q_{sort1}_1']]
    portfolios1[f'{sort1}2'] = ret_df_q.loc[(ret_df_q[f'{sort1}'] >= ret_df_q[f'q_{sort1}_1']) & \
                                            (ret_df_q[f'{sort1}'] <= ret_df_q[f'q_{sort1}_2'])]
    portfolios1[f'{sort1}3'] = ret_df_q.loc[ret_df_q[f'{sort1}'] >= ret_df_q[f'q_{sort1}_2']]

    portfolios2 = dict()
    portfolios2[f'{sort2}1'] = ret_df_q.loc[ret_df_q[f'{sort2}'] <= ret_df_q[f'q_{sort2}_1'],
                                            merge_cols+[group_date]+['ret','exret','size','mktcap']]
    portfolios2[f'{sort2}2'] = ret_df_q.loc[ret_df_q[f'{sort2}'] >= ret_df_q[f'q_{sort2}_1'],
                                            merge_cols+[group_date]+['ret','exret','size','mktcap']]

    portfolios = dict()
    for group1 in portfolios1.keys():
        for group2 in portfolios2.keys():
            portfolios[f'{group1}_{group2}'] = pd.merge(portfolios2[group2],
                                                        portfolios1[group1][merge_cols+[f'{sort1}']],
                                                        on=merge_cols)
    return portfolios
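
A toy illustration of double_sort (hypothetical data): six stocks in one month, sorted independently into three bm groups (30%/70% breakpoints) and two size groups (median), then intersected into the 2x3 portfolios.

toy = pd.DataFrame({
    'secID':    [f'S{i}' for i in range(1, 7)],
    'ym':       [pd.Period('2020-01', 'M')] * 6,
    'ret_date': [pd.Period('2020-02', 'M')] * 6,
    'bm':       [0.1, 0.3, 0.5, 0.7, 0.9, 1.1],
    'size':     [10.0, 11.0, 12.0, 13.0, 14.0, 15.0],
    'mktcap':   [1e9, 2e9, 3e9, 4e9, 5e9, 6e9],
    'ret':      [0.01, 0.02, 0.03, 0.04, 0.05, 0.06],
    'exret':    [0.008, 0.018, 0.028, 0.038, 0.048, 0.058],
})
pfs = double_sort(toy, sort1='bm')
for name, pf in pfs.items():
    print(name, pf['secID'].tolist())   # e.g. bm1_size1 -> ['S1', 'S2'], bm3_size2 -> ['S5', 'S6']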
In [32]:
def factor(df, sort1, sort2='size', long_high=True, long_only=True):
    portfolios = double_sort(df=df, sort1=sort1, sort2=sort2)
    portfolios_vwret = {}
    for pf in portfolios.keys():
    #     portfolios[pf].dropna(inplace=True) # Do not dropna here: if a stock is suspended in some month, that was not known at formation time in the previous month.
        temp = portfolios[pf].groupby('ym')['mktcap'].agg({'mktcap_sum':np.sum})
        portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
        portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
        portfolios[pf]['weighted_ret'] = portfolios[pf]['ret'] * portfolios[pf]['weight']
        portfolios_vwret[pf] = portfolios[pf].groupby('ret_date')['weighted_ret'].sum()

    portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
    portfolios_vwret_df.index = portfolios_vwret[f'{sort1}1_size1'].index
    portfolios_vwret_df.columns = portfolios_vwret.keys()
    if long_only:
        if long_high:
            factor = (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2
        else:
            factor = (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2
    else:
        if long_high:
            factor = (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2 - \
                     (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2
        else:
            factor = (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2 - \
                     (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2
    factor.name = sort1
    return factor
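
Within each formation month ym, every portfolio's next-month return is value weighted with w_i = mktcap_i / Σ_j mktcap_j computed at formation. For the long-short case with long_high=True this is the familiar 2x3 construction (in the spirit of Fama-French HML), e.g. for bm:

    factor_t = (R_bm3_size1,t + R_bm3_size2,t) / 2 - (R_bm1_size1,t + R_bm1_size2,t) / 2

i.e. the average return of the two high-bm portfolios minus the average of the two low-bm portfolios. The long-only version keeps only the first term, and long_high=False swaps high and low (used for reversal).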
In [33]:
# factors excluding size
factors_dict = {}
for f in ['bm','mom','rev']:
    if f == 'rev':
        factors_dict[f] = factor(df=ret_df,sort1=f,long_high=False)
    else:
        factors_dict[f] = factor(df=ret_df,sort1=f)
In [34]:
factors_df = pd.DataFrame(factors_dict)
In [35]:
# Size
sort1 = 'bm'
portfolios = double_sort(ret_df,sort1=sort1)
portfolios_vwret = {}
for pf in portfolios.keys():
    temp = portfolios[pf].groupby('ym')['mktcap'].agg({'mktcap_sum':np.sum})
    portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
    portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
    portfolios[pf]['weighted_ret'] = portfolios[pf]['ret'] * portfolios[pf]['weight']
    portfolios_vwret[pf] = portfolios[pf].groupby('ret_date')['weighted_ret'].sum()

portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
portfolios_vwret_df.index = portfolios_vwret[f'{sort1}1_size1'].index
portfolios_vwret_df.columns = portfolios_vwret.keys()

factors_df['size'] = (portfolios_vwret_df['bm1_size1'] + portfolios_vwret_df['bm2_size1'] + portfolios_vwret_df['bm3_size1']) / 3 
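
The long-only size leg is the average value-weighted return of the three small-cap portfolios across the bm bins, size_long_t = (R_bm1_size1,t + R_bm2_size1,t + R_bm3_size1,t) / 3; the long-short version built below (In [45]) subtracts the corresponding average of the three big-cap portfolios, in the spirit of Fama-French SMB.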
In [36]:
factors_df
Out[36]:
bm mom rev size
ret_date
2007-02 0.226465 NaN NaN 0.197491
2007-03 0.180999 NaN 0.130172 0.195448
2007-04 0.365264 NaN 0.275759 0.331512
2007-05 0.081306 NaN 0.106637 0.071441
2007-06 -0.165555 NaN -0.134289 -0.176355
2007-07 0.250519 NaN 0.306722 0.254432
2007-08 0.152717 NaN 0.127764 0.091415
2007-09 0.052709 NaN 0.026173 0.028093
... ... ... ... ...
2023-09 0.011770 -0.005246 -0.008466 0.000998
2023-10 -0.013951 -0.026426 -0.029171 -0.000202
2023-11 0.021145 0.032624 0.026052 0.051803
2023-12 -0.016989 -0.011377 -0.017220 -0.014700
2024-01 -0.081903 -0.151439 -0.183637 -0.204120
2024-02 0.026499 0.072404 0.121448 0.023174
2024-03 0.035934 0.051984 0.051934 0.069830
2024-04 -0.001237 -0.020692 -0.030567 -0.027777

207 rows × 4 columns

Market return

In [37]:
shibor_df
Out[37]:
tradeDate rf ym
0 2007-01-04 0.002148 2007-01
1 2007-01-05 0.002143 2007-01
2 2007-01-08 0.002132 2007-01
3 2007-01-09 0.002131 2007-01
4 2007-01-10 0.002122 2007-01
5 2007-01-11 0.002112 2007-01
6 2007-01-12 0.002105 2007-01
7 2007-01-15 0.002101 2007-01
... ... ... ...
4306 2024-04-02 0.001758 2024-04
4307 2024-04-03 0.001747 2024-04
4308 2024-04-07 0.001735 2024-04
4309 2024-04-08 0.001718 2024-04
4310 2024-04-09 0.001708 2024-04
4311 2024-04-10 0.001708 2024-04
4312 2024-04-11 0.001699 2024-04
4313 2024-04-12 0.001687 2024-04

4314 rows × 3 columns

In [38]:
# Use the CSI 800 index (中证800) as the market return
sec_id = ['000906.ZICN']
index_df = DataAPI.MktIdxdGet(indexID=sec_id,beginDate=START,endDate=END,field=['indexID','secShortName','tradeDate','closeIndex','CHGPct'],pandas="1")
index_df['tradeDate'] = pd.to_datetime(index_df['tradeDate'])
index_df['ret_date'] = index_df['tradeDate'].dt.to_period('M')
index_df.sort_values('tradeDate',inplace=True)
index_df = index_df.groupby('ret_date',as_index=False).last()
index_df['mktret'] = index_df['closeIndex'] / index_df['closeIndex'].shift() - 1
index_df = pd.merge(index_df,shibor_df_m,left_on=['ret_date'],right_on=['ym'])
index_df['exmktret'] = index_df['mktret'] - index_df['rf']
index_df.drop(['ym','mktret','indexID','secShortName','tradeDate',
               'closeIndex','CHGPct'],axis=1,inplace=True)
index_df.dropna(inplace=True)
In [39]:
index_df
Out[39]:
ret_date rf exmktret
1 2007-02 0.002110 0.089095
2 2007-03 0.002234 0.108569
3 2007-04 0.002125 0.291347
4 2007-05 0.002527 0.098653
5 2007-06 0.002542 -0.074613
6 2007-07 0.002481 0.192378
7 2007-08 0.002404 0.167471
8 2007-09 0.002621 0.047576
... ... ... ...
200 2023-09 0.001904 -0.019002
201 2023-10 0.001890 -0.032780
202 2023-11 0.001914 -0.016876
203 2023-12 0.002068 -0.021342
204 2024-01 0.001902 -0.084132
205 2024-02 0.001749 0.103123
206 2024-03 0.001783 -0.000241
207 2024-04 0.001687 -0.015058

207 rows × 3 columns

In [40]:
factors_df = pd.merge(index_df, factors_df, on='ret_date')
factors_df.set_index('ret_date',inplace=True)
factors_df = factors_df[['rf','exmktret','size','bm','mom','rev']]
factors_df.columns = ['rf','exmktret','size_long','bm_long','mom_long','rev_long']
factors_df
Out[40]:
rf exmktret size_long bm_long mom_long rev_long
ret_date
2007-02 0.002110 0.089095 0.197491 0.226465 NaN NaN
2007-03 0.002234 0.108569 0.195448 0.180999 NaN 0.130172
2007-04 0.002125 0.291347 0.331512 0.365264 NaN 0.275759
2007-05 0.002527 0.098653 0.071441 0.081306 NaN 0.106637
2007-06 0.002542 -0.074613 -0.176355 -0.165555 NaN -0.134289
2007-07 0.002481 0.192378 0.254432 0.250519 NaN 0.306722
2007-08 0.002404 0.167471 0.091415 0.152717 NaN 0.127764
2007-09 0.002621 0.047576 0.028093 0.052709 NaN 0.026173
... ... ... ... ... ... ...
2023-09 0.001904 -0.019002 0.000998 0.011770 -0.005246 -0.008466
2023-10 0.001890 -0.032780 -0.000202 -0.013951 -0.026426 -0.029171
2023-11 0.001914 -0.016876 0.051803 0.021145 0.032624 0.026052
2023-12 0.002068 -0.021342 -0.014700 -0.016989 -0.011377 -0.017220
2024-01 0.001902 -0.084132 -0.204120 -0.081903 -0.151439 -0.183637
2024-02 0.001749 0.103123 0.023174 0.026499 0.072404 0.121448
2024-03 0.001783 -0.000241 0.069830 0.035934 0.051984 0.051934
2024-04 0.001687 -0.015058 -0.027777 -0.001237 -0.020692 -0.030567

207 rows × 6 columns

In [41]:
((factors_df+1).cumprod()*100).plot()
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feda66c98d0>
In [42]:
factors_df.to_pickle('./output_data/factors/factors_long_only.pkl')

Long-short factors

In [43]:
# factors excluding size
factors_dict = {}
for f in ['bm','mom','rev']:
    if f == 'rev':
        factors_dict[f] = factor(df=ret_df,sort1=f,long_only=False,long_high=False)
    else:
        factors_dict[f] = factor(df=ret_df,sort1=f,long_only=False)
In [44]:
factors_df = pd.DataFrame(factors_dict)
In [45]:
# Size
sort1 = 'bm'
portfolios = double_sort(ret_df,sort1=sort1)
portfolios_vwret = {}
for pf in portfolios.keys():
    temp = portfolios[pf].groupby('ym')['mktcap'].agg({'mktcap_sum':np.sum})
    portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
    portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
    portfolios[pf]['weighted_ret'] = portfolios[pf]['ret'] * portfolios[pf]['weight']
    portfolios_vwret[pf] = portfolios[pf].groupby('ret_date')['weighted_ret'].sum()

portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
portfolios_vwret_df.index = portfolios_vwret[f'{sort1}1_size1'].index
portfolios_vwret_df.columns = portfolios_vwret.keys()

factors_df['size'] = (portfolios_vwret_df['bm1_size1'] + portfolios_vwret_df['bm2_size1'] + portfolios_vwret_df['bm3_size1']) / 3 - \
      (portfolios_vwret_df['bm1_size2'] + portfolios_vwret_df['bm2_size2'] + portfolios_vwret_df['bm3_size2']) / 3 
In [46]:
factors_df = pd.merge(index_df, factors_df, on='ret_date')
factors_df.set_index('ret_date',inplace=True)
factors_df = factors_df[['rf','exmktret','size','bm','mom','rev']]
factors_df
Out[46]:
rf exmktret size bm mom rev
ret_date
2007-02 0.002110 0.089095 0.077335 0.140293 NaN NaN
2007-03 0.002234 0.108569 0.081013 0.050888 NaN -0.042893
2007-04 0.002125 0.291347 0.037910 0.102383 NaN -0.070701
2007-05 0.002527 0.098653 -0.025365 0.005711 NaN 0.038247
2007-06 0.002542 -0.074613 -0.102133 -0.065699 NaN -0.001760
2007-07 0.002481 0.192378 0.060536 0.056698 NaN 0.124766
2007-08 0.002404 0.167471 -0.073206 0.054124 NaN -0.006058
2007-09 0.002621 0.047576 -0.025461 0.029599 NaN -0.018872
... ... ... ... ... ... ...
2023-09 0.001904 -0.019002 0.010934 0.035308 0.000937 0.001036
2023-10 0.001890 -0.032780 0.027223 -0.000641 -0.017343 -0.024244
2023-11 0.001914 -0.016876 0.052893 -0.005449 0.011578 -0.001833
2023-12 0.002068 -0.021342 0.006422 0.004467 0.012018 0.000934
2024-01 0.001902 -0.084132 -0.106333 0.116617 0.015056 -0.028987
2024-02 0.001749 0.103123 -0.077025 -0.063077 0.001893 0.096214
2024-03 0.001783 -0.000241 0.058634 -0.000580 0.029318 0.032941
2024-04 0.001687 -0.015058 -0.016812 0.035735 0.012141 -0.013324

207 rows × 6 columns

In [47]:
((factors_df+1).cumprod()*100).plot()
Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feda65b0f50>
In [48]:
factors_df.to_pickle('./output_data/factors/factors.pkl')

Daily factors

In [49]:
shibor1d = DataAPI.MktIborGet(secID="Shibor1D.IRCN",beginDate=START,endDate=END,field=['tradeDate','rate'],pandas="1")
shibor1d['tradeDate'] = pd.to_datetime(shibor1d['tradeDate'])
shibor1d['rate'] = shibor1d['rate'] * 0.01 / 365
shibor1d.rename(columns={'rate':'rf'},inplace=True)
In [50]:
def daily_factor(df, sort1, sort2='size', long_high=True, long_only=True):
    
    portfolios = double_sort(df=df, sort1=sort1, sort2=sort2)
    portfolios_vwret = {}
    for pf in portfolios.keys():
        temp = portfolios[pf].groupby('ym')['mktcap'].agg({'mktcap_sum':np.sum})
        portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
        portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
        df_ = pd.merge(portfolios[pf][['secID','ret_date','weight']],
                       stk_df[['secID','tradeDate','ym','ret_daily']],
                       left_on=['secID','ret_date'],
                       right_on=['secID','ym'])
        df_['weighted_ret_daily'] = df_['ret_daily'] * df_['weight']
        portfolios_vwret[pf] = df_.groupby('tradeDate')['weighted_ret_daily'].sum()

    portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
    portfolios_vwret_df.index = portfolios_vwret[list(portfolios_vwret.keys())[0]].index
    portfolios_vwret_df.columns = portfolios_vwret.keys()
    if long_only:
        if long_high:
            factor = (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2
        else:
            factor = (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2
    else:
        if long_high:
            factor = (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2 - \
                     (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2
        else:
            factor = (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2 - \
                     (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2
    return factor
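
Compared with the monthly factor function, daily_factor keeps the monthly portfolio membership and the formation-month value weights, then joins them to the member stocks' daily returns in the matching return month (stk_df['ym'] merged against ret_date). Each portfolio's daily return is therefore Σ_i w_i · ret_daily_i with the weights held fixed over the whole month rather than rebalanced day by day.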
In [51]:
# Daily factors excluding size
factors_dict = {}
for f in ['bm','mom','rev']:
    if f == 'rev':
        factors_dict[f] = daily_factor(df=ret_df,sort1=f,long_high=False)
    else:
        factors_dict[f] = daily_factor(df=ret_df,sort1=f)
In [52]:
factors_daily = pd.DataFrame(factors_dict)
In [53]:
# The size daily factor
portfolios = double_sort(df=ret_df,sort1='bm')
portfolios_vwret = {}
for pf in portfolios.keys():
    temp = portfolios[pf].groupby('ym')['mktcap'].agg({'mktcap_sum':np.sum})
    portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
    portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
    df_ = pd.merge(portfolios[pf][['secID','ret_date','weight']],
                   stk_df[['secID','tradeDate','ym','ret_daily']],
                   left_on=['secID','ret_date'],
                   right_on=['secID','ym'])
    df_['weighted_ret_daily'] = df_['ret_daily'] * df_['weight']
    portfolios_vwret[pf] = df_.groupby('tradeDate')['weighted_ret_daily'].sum()

portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
portfolios_vwret_df.index = portfolios_vwret[list(portfolios_vwret.keys())[0]].index
portfolios_vwret_df.columns = portfolios_vwret.keys()

factors_daily['size'] = (portfolios_vwret_df['bm1_size1'] + portfolios_vwret_df['bm2_size1'] + portfolios_vwret_df['bm3_size1']) / 3
In [54]:
factors_daily
Out[54]:
bm mom rev size
tradeDate
2007-02-01 0.013241 NaN NaN 0.009567
2007-02-02 -0.018668 NaN NaN -0.015228
2007-02-05 0.009517 NaN NaN 0.015056
2007-02-06 0.028519 NaN NaN 0.028173
2007-02-07 0.013860 NaN NaN 0.013953
2007-02-08 0.021072 NaN NaN 0.017962
2007-02-09 -0.001536 NaN NaN 0.000616
2007-02-12 0.025930 NaN NaN 0.023638
... ... ... ... ...
2024-04-01 0.018484 0.015003 0.023777 0.023897
2024-04-02 0.006944 -0.003919 -0.006005 0.000343
2024-04-03 -0.000831 -0.005276 -0.008295 -0.009897
2024-04-08 -0.016334 -0.011751 -0.023685 -0.028268
2024-04-09 0.009110 0.001992 0.014453 0.017553
2024-04-10 -0.013312 -0.014439 -0.019970 -0.023923
2024-04-11 0.003150 0.002058 0.000186 0.000954
2024-04-12 -0.008132 -0.004441 -0.010733 -0.008053

4179 rows × 4 columns

In [55]:
sec_id = ['000906.ZICN']
index_df = DataAPI.MktIdxdGet(indexID=sec_id,beginDate=START,endDate=END,field=['indexID','secShortName','tradeDate','closeIndex','CHGPct'],pandas="1")
index_df['tradeDate'] = pd.to_datetime(index_df['tradeDate'])

index_df = pd.merge(shibor1d,index_df[['tradeDate','CHGPct']],on='tradeDate')
index_df.rename(columns={'CHGPct':'mktret'},inplace=True)
index_df['exmktret'] = index_df['mktret'] - index_df['rf']
In [56]:
index_df
Out[56]:
tradeDate rf mktret exmktret
0 2007-01-04 0.000039 0.009605 0.009566
1 2007-01-05 0.000038 0.008121 0.008083
2 2007-01-08 0.000038 0.032249 0.032211
3 2007-01-09 0.000037 0.029767 0.029730
4 2007-01-10 0.000037 0.025520 0.025483
5 2007-01-11 0.000037 -0.007034 -0.007071
6 2007-01-12 0.000036 -0.028233 -0.028269
7 2007-01-15 0.000036 0.053024 0.052988
... ... ... ... ...
4191 2024-04-01 0.000047 0.017813 0.017766
4192 2024-04-02 0.000047 -0.004044 -0.004091
4193 2024-04-03 0.000046 -0.002984 -0.003030
4194 2024-04-08 0.000047 -0.010185 -0.010232
4195 2024-04-09 0.000047 0.001547 0.001500
4196 2024-04-10 0.000047 -0.009308 -0.009355
4197 2024-04-11 0.000047 0.001196 0.001149
4198 2024-04-12 0.000047 -0.007204 -0.007251

4199 rows × 4 columns

In [57]:
factors_daily = pd.merge(index_df[['tradeDate','rf','exmktret']],factors_daily, on='tradeDate')

factors_daily.set_index('tradeDate',inplace=True)

factors_daily = factors_daily[['rf','exmktret','size','bm','mom','rev']]
In [58]:
factors_daily.columns = ['rf','exmktret','size_long','bm_long','mom_long','rev_long']
In [59]:
factors_daily
Out[59]:
rf exmktret size_long bm_long mom_long rev_long
tradeDate
2007-02-01 0.000048 0.005542 0.009567 0.013241 NaN NaN
2007-02-02 0.000048 -0.035335 -0.015228 -0.018668 NaN NaN
2007-02-05 0.000048 -0.005993 0.015056 0.009517 NaN NaN
2007-02-06 0.000049 0.019879 0.028173 0.028519 NaN NaN
2007-02-07 0.000050 0.021923 0.013953 0.013860 NaN NaN
2007-02-08 0.000052 0.017380 0.017962 0.021072 NaN NaN
2007-02-09 0.000076 -0.003355 0.000616 -0.001536 NaN NaN
2007-02-12 0.000110 0.033455 0.023638 0.025930 NaN NaN
... ... ... ... ... ... ...
2024-04-01 0.000047 0.017766 0.023897 0.018484 0.015003 0.023777
2024-04-02 0.000047 -0.004091 0.000343 0.006944 -0.003919 -0.006005
2024-04-03 0.000046 -0.003030 -0.009897 -0.000831 -0.005276 -0.008295
2024-04-08 0.000047 -0.010232 -0.028268 -0.016334 -0.011751 -0.023685
2024-04-09 0.000047 0.001500 0.017553 0.009110 0.001992 0.014453
2024-04-10 0.000047 -0.009355 -0.023923 -0.013312 -0.014439 -0.019970
2024-04-11 0.000047 0.001149 0.000954 0.003150 0.002058 0.000186
2024-04-12 0.000047 -0.007251 -0.008053 -0.008132 -0.004441 -0.010733

4179 rows × 6 columns

In [60]:
((factors_daily+1).cumprod()*100).plot()
Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feda132bb50>
In [61]:
((factors_daily.loc['2020':]+1).cumprod()*100).plot()
Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fedac125790>
In [62]:
factors_daily.to_pickle('./output_data/factors/factors_daily_long_only.pkl')

Long-short factors

In [63]:
# Daily factors excluding size
factors_dict = {}
for f in ['bm','mom','rev']:
    if f == 'rev':
        factors_dict[f] = daily_factor(df=ret_df,sort1=f,long_only=False,long_high=False)
    else:
        factors_dict[f] = daily_factor(df=ret_df,sort1=f,long_only=False)
In [64]:
factors_daily = pd.DataFrame(factors_dict)
In [65]:
# The size daily factor
portfolios = double_sort(df=ret_df,sort1='bm')
portfolios_vwret = {}
for pf in portfolios.keys():
    temp = portfolios[pf].groupby('ym')['mktcap'].agg({'mktcap_sum':np.sum})
    portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
    portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
    df_ = pd.merge(portfolios[pf][['secID','ret_date','weight']],
                   stk_df[['secID','tradeDate','ym','ret_daily']],
                   left_on=['secID','ret_date'],
                   right_on=['secID','ym'])
    df_['weighted_ret_daily'] = df_['ret_daily'] * df_['weight']
    portfolios_vwret[pf] = df_.groupby('tradeDate')['weighted_ret_daily'].sum()

portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
portfolios_vwret_df.index = portfolios_vwret[list(portfolios_vwret.keys())[0]].index
portfolios_vwret_df.columns = portfolios_vwret.keys()

factors_daily['size'] = (portfolios_vwret_df['bm1_size1'] + portfolios_vwret_df['bm2_size1'] + portfolios_vwret_df['bm3_size1']) / 3 - \
      (portfolios_vwret_df['bm1_size2'] + portfolios_vwret_df['bm2_size2'] + portfolios_vwret_df['bm3_size2']) / 3 
In [66]:
factors_daily
Out[66]:
bm mom rev size
tradeDate
2007-02-01 0.007141 NaN NaN 0.002193
2007-02-02 0.010887 NaN NaN 0.018346
2007-02-05 0.009071 NaN NaN 0.019852
2007-02-06 0.005678 NaN NaN 0.006771
2007-02-07 -0.004263 NaN NaN -0.007161
2007-02-08 0.004951 NaN NaN -0.000822
2007-02-09 0.001221 NaN NaN 0.003797
2007-02-12 -0.005880 NaN NaN -0.009410
... ... ... ... ...
2024-04-01 -0.001234 -0.013083 0.007626 0.008277
2024-04-02 0.015831 -0.002311 -0.006347 0.003097
2024-04-03 0.011228 0.005311 -0.002610 -0.007240
2024-04-08 0.002853 0.019388 -0.005804 -0.018479
2024-04-09 0.000616 -0.019958 0.008483 0.015436
2024-04-10 0.005866 0.009825 -0.004603 -0.014298
2024-04-11 0.003639 0.004611 -0.000285 -0.000967
2024-04-12 -0.002438 0.008133 -0.009208 -0.002197

4179 rows × 4 columns

In [67]:
factors_daily = pd.merge(index_df[['tradeDate','rf','exmktret']],factors_daily, on='tradeDate')

factors_daily.set_index('tradeDate',inplace=True)

factors_daily = factors_daily[['rf','exmktret','size','bm','mom','rev']]
In [68]:
factors_daily
Out[68]:
rf exmktret size bm mom rev
tradeDate
2007-02-01 0.000048 0.005542 0.002193 0.007141 NaN NaN
2007-02-02 0.000048 -0.035335 0.018346 0.010887 NaN NaN
2007-02-05 0.000048 -0.005993 0.019852 0.009071 NaN NaN
2007-02-06 0.000049 0.019879 0.006771 0.005678 NaN NaN
2007-02-07 0.000050 0.021923 -0.007161 -0.004263 NaN NaN
2007-02-08 0.000052 0.017380 -0.000822 0.004951 NaN NaN
2007-02-09 0.000076 -0.003355 0.003797 0.001221 NaN NaN
2007-02-12 0.000110 0.033455 -0.009410 -0.005880 NaN NaN
... ... ... ... ... ... ...
2024-04-01 0.000047 0.017766 0.008277 -0.001234 -0.013083 0.007626
2024-04-02 0.000047 -0.004091 0.003097 0.015831 -0.002311 -0.006347
2024-04-03 0.000046 -0.003030 -0.007240 0.011228 0.005311 -0.002610
2024-04-08 0.000047 -0.010232 -0.018479 0.002853 0.019388 -0.005804
2024-04-09 0.000047 0.001500 0.015436 0.000616 -0.019958 0.008483
2024-04-10 0.000047 -0.009355 -0.014298 0.005866 0.009825 -0.004603
2024-04-11 0.000047 0.001149 -0.000967 0.003639 0.004611 -0.000285
2024-04-12 0.000047 -0.007251 -0.002197 -0.002438 0.008133 -0.009208

4179 rows × 6 columns

In [69]:
((factors_daily+1).cumprod()*100).plot()
Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feda66b9190>
In [70]:
((factors_daily.loc['2020':]+1).cumprod()*100).plot()
Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feda66a8250>
In [71]:
factors_daily.to_pickle('./output_data/factors/factors_daily.pkl')