import numpy as np
import pandas as pd
import datetime as dt
pd.set_option('display.max_rows', 16)
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import gc
plt.rcParams['figure.figsize'] = (16.0, 9.0)
财务数据处理的难点在于“报表数据所处的时间”、“报表报告的时间”、“报表修改时间”带来的复杂性。两种处理方式比较合理:
START = '2007-01-01'
END = '2024-03-31'
# Security Id
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
cond1 = (stk_info['exchangeCD'] == 'XSHE') | (stk_info['exchangeCD'] == 'XSHG')
cond2 = (stk_info['listStatusCD'] == 'L') | (stk_info['listStatusCD'] == 'DE')
stk_info = stk_info[cond1 & cond2].copy()
stk_id = stk_info['secID'].unique()
stk_info
len(stk_id)
st_df = DataAPI.SecSTGet(beginDate=START,endDate=END,secID=stk_id,field=['secID','tradeDate','STflg'],pandas="1")
st_df.info()
st_df
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'],format="%Y-%m-%d")
Book/Market ratio, 简称BM,也即价值因子,反映了公司的账面价值和市值的比值。Fama French (1993) 发现估值低(BM高)的股票和高的相比,预期收益为正。
BM ratio Fama-French(1993) 原文的构造方法:
处理思路:
# fundmen_df = DataAPI.FdmtBSGet(secID=stk_id,reportType="A",beginDate=START,endDate=END,publishDateEnd=u"",publishDateBegin=u"",endDateRep="",beginDateRep="",beginYear="",endYear="",fiscalPeriod="",field=["secID","publishDate","endDate","endDateRep","actPubtime","fiscalPeriod","TShEquity","TEquityAttrP","minorityInt"],pandas="1")
# fundmen_df.to_pickle('./data/fundmen_df.pkl')
fundmen_df = pd.read_pickle('./data/fundmen_df.pkl')
fundmen_df
fundmen_df.info()
fundmen_df['fiscalPeriod'].unique()
fundmen_df[(fundmen_df['secID']=='000001.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
fundmen_df[(fundmen_df['secID']=='300720.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
fundmen_df[(fundmen_df['secID'] == '300720.XSHE') & (fundmen_df['endDate']=='2018-12-31')]
publishDate
: 实际公告日期endDate
:数值所在日期endDateRep
:数值所在报表日期。03-31是一季报,06-30是半年报,09-30是三季报,12-31是年报。后面的报表可能会对初始值做修改。比如,300720.XSHE在2020-04-24公布了数据截止至2019-12-31的报告,里面包含了数据截止至2018-12-31的报表数据。
300720.XSHE在2019-08-28公布了数据截止至2019-06-30的报告,里面包含了数据截止至2018-12-31的报表数据。
在t年6月分组时,应当取最新更新过的t-1年12月31日的Book数值。
fundmen_df[['publishDate','endDate']] = fundmen_df[['publishDate','endDate']].apply(pd.to_datetime)
fundmen_df['pub_month'] = fundmen_df['publishDate'].dt.month
fundmen_df['pub_year'] = fundmen_df['publishDate'].dt.year
fundmen_df['data_year'] = fundmen_df['endDate'].dt.year
fundmen_df
fundmen_df[(fundmen_df['secID']=='300720.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
fundmen_df['pub_year'] - fundmen_df['data_year']
(fundmen_df['pub_year'] - fundmen_df['data_year']).unique()
(fundmen_df['pub_year'] - fundmen_df['data_year']).value_counts()
fundmen_df.loc[(fundmen_df['pub_year'] - fundmen_df['data_year'])==16]
fundmen_df.drop(['actPubtime','fiscalPeriod'],axis=1, inplace=True)
fundmen_df
# 每年6月底计算时,只能看到publishDate在6月之前的数值。
# 取 endDate 相同时,publishDate 最晚(但小于等于6)的那个数值
# 同时pub_year - data_year 不能大于1(最近的报告)
fundmen_df['pub_month'] = fundmen_df['publishDate'].dt.month
fundmen_df['pub_year'] = fundmen_df['publishDate'].dt.year
fundmen_df['data_year'] = fundmen_df['endDate'].dt.year
fundmen_df = fundmen_df[fundmen_df['pub_year'] - fundmen_df['data_year'] == 1]
fundmen_df = fundmen_df[fundmen_df['pub_month'] <= 6]
fundmen_df.sort_values(['secID','endDate','publishDate'],inplace=True)
fundmen_df[(fundmen_df['secID']=='300720.XSHE') & (fundmen_df['endDate']=='2019-12-31')]
fundmen_df.loc[fundmen_df.duplicated(['secID','endDate'], keep=False)]
fundmen_df.loc[fundmen_df['pub_month'].idxmax()]
fundmen_df.loc[(fundmen_df['secID']=='000005.XSHE')&(fundmen_df['endDate']=='2007-12-31')]
fundmen_df.shape
fundmen_df.drop_duplicates(['secID','endDate'],keep='last')
fundmen_df.groupby(['secID','endDate'],as_index=False).last()
# fundmen_df = fundmen_df.groupby(['secID','endDate'],as_index=False).first()
fundmen_df.drop_duplicates(['secID','endDate'],keep='last', inplace=True)
fundmen_df.shape
fundmen_df['bm_date'] = fundmen_df['endDate'].dt.to_period('M')
fundmen_df
# # minorityInt 有时报告,有时不报告。空值时,假设就是上一次报告的值
# # fundmen_df['minorityInt'] = fundmen_df.groupby('secID')['minorityInt'].fillna(method='ffill')
# # 第一轮填完空值为有效数值后,剩下的空值再用0填充。
# fundmen_df['minorityInt'].fillna(0,inplace=True)
# fundmen_df['book'] = fundmen_df['TShEquity'] - fundmen_df['minorityInt']
fundmen_df['book'] = fundmen_df['TEquityAttrP']
np.allclose(fundmen_df['book'],fundmen_df['TEquityAttrP'])
fundmen_df[fundmen_df['book']-fundmen_df['TEquityAttrP'] > 10]
fundmen_df.loc[fundmen_df['TShEquity']<0]
fundmen_df.loc[fundmen_df['book'] < 0]
# fundmen_df = fundmen_df[fundmen_df['book'] > 0]
fundmen_df
pd.read_csv("./data/rf-monthly-2024.csv", encoding='GBK')
rf = pd.read_csv("./data/rf-monthly-2024.csv", encoding='GBK').drop(["Unnamed: 4", "年份()_Year", "月份_Month"],axis=1)
rf.columns = ['Date', 'rf']
rf['Date'] = pd.to_datetime(rf["Date"])
rf['Date'] = rf['Date'].dt.to_period('M')
rf.rename(columns={'Date':'ym'},inplace=True)
rf
beta_df = pd.read_pickle('./data/beta_df.pkl')
beta_df
beta_df['tradeDate'] = pd.to_datetime(beta_df['tradeDate'], format="%Y-%m-%d")
beta_df['ym'] = beta_df['tradeDate'].dt.to_period('M')
beta_df[['Beta60','Beta120','Beta252']] = beta_df[['Beta60','Beta120','Beta252']].apply(pd.to_numeric)
beta_df
# # Winsorization
# up_q = 0.99999
# lower_q = 0.00001
# beta_df['Beta60_winsor'] = beta_df['Beta60'].clip(lower=beta_df['Beta60'].quantile(lower_q),upper=beta_df['Beta60'].quantile(up_q))
# beta_df['Beta120_winsor'] = beta_df['Beta120'].clip(lower=beta_df['Beta120'].quantile(lower_q),upper=beta_df['Beta120'].quantile(up_q))
beta_m_df = beta_df.groupby(['secID','ym'],as_index=False)['Beta252'].last()
beta_m_df.rename(columns={'Beta252':'beta'},inplace=True)
beta_m_df
# stk_df = DataAPI.MktEqudAdjAfGet(secID=stk_id,beginDate=START,endDate=END,isOpen=1,
# field=["secID","tradeDate",
# "closePrice",
# "negMarketValue"],pandas="1")
# stk_df.to_pickle('./data/stk_df.pkl')
stk_df = pd.read_pickle('./data/stk_df.pkl')
stk_df['closePrice'] = pd.to_numeric(stk_df['closePrice'])
stk_df['negMarketValue'] = pd.to_numeric(stk_df['negMarketValue'])
stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
stk_df['ym'] = stk_df['tradeDate'].dt.to_period('M')
stk_df.sort_values(['secID','tradeDate'],inplace=True)
stk_df.info()
stk_df
stk_df.dropna().shape
stk_df.shape
stk_df = pd.merge(stk_df, st_df, on=['secID','tradeDate'],how='left')
stk_df = stk_df[stk_df['STflg'].isna()].copy()
stk_df.drop('STflg',axis=1,inplace=True)
stk_df.shape
stk_df_m = stk_df.groupby(['secID','ym'],as_index=False).tail(1)
stk_df_m
def fill_missing(df, full_dates, id_col='secID', date_col='ym'):
"""
This function fills the missing dates for stocks.
Parameters:
df: The dataframe. Could be a sub-dataframe created by "groupby".
The dataframe must be sorted on the "date_col".
full_dates: the unique dates covering all securities in the full dataframe.
Need to be sorted.
id_col: the security id.
date_col: the dates column for the security
Returns:
A dataframe with the missing dates filled with NA.
"""
one_stk_id = df[id_col].unique()
date_start = np.where(full_dates == df[date_col].min())[0][0]
date_end = np.where(full_dates == df[date_col].max())[0][0]
dates = full_dates[date_start:date_end+1]
idx = pd.MultiIndex.from_product([one_stk_id,dates],
names=(id_col,date_col))
df = df.set_index([id_col,date_col]).reindex(idx).reset_index()
return df
full_dates = np.sort(stk_df['ym'].unique())
%%time
stk_df_m = stk_df_m.groupby('secID').apply(fill_missing, full_dates=full_dates)
stk_df_m.reset_index(drop=True, inplace=True)
stk_df_m
stk_df_m.info()
stk_df_m.drop('tradeDate',axis=1,inplace=True)
stk_df_m['ret'] = stk_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1)
# # Use last month's market cap for sorting
stk_df_m['mkt_cap'] = stk_df_m.groupby('secID')['negMarketValue'].shift()
stk_df_m['mkt_cap_date'] = stk_df_m.groupby('secID')['ym'].shift()
stk_df_m.drop(['closePrice','negMarketValue'],axis=1,inplace=True)
stk_df_m
stk_df_m[(stk_df_m['secID']=='000007.XSHE') & (stk_df_m['ym']>='2021-03') & (stk_df_m['ym']<='2022-08')]
stk_df_m[stk_df_m['ret'].isna()]
stk_df_m[stk_df_m['mkt_cap'].isna()]
stk_df_m.dropna(inplace=True)
stk_df_m
fundmen_df
stk_df_m
bm_df = pd.merge(stk_df_m[['secID','mkt_cap','mkt_cap_date']], fundmen_df[['secID','book','bm_date']],
left_on=['secID','mkt_cap_date'],right_on=['secID','bm_date'])
bm_df['bm'] = bm_df['book'] / bm_df['mkt_cap']
bm_df.drop(['mkt_cap_date','mkt_cap','book'],axis=1,inplace=True)
bm_df
ret_df = pd.merge(stk_df_m, rf, on='ym')
ret_df['exret'] = ret_df['ret'] - ret_df['rf']
ret_df.sort_values(['secID','ym'],inplace=True)
ret_df.reset_index(drop=True,inplace=True)
display(ret_df)
# Use last month's beta for grouping
ret_df = pd.merge(ret_df,beta_m_df,left_on=['secID','mkt_cap_date'],right_on=['secID','ym'])
display(ret_df)
ret_df.drop(['ym_y'],axis=1,inplace=True)
ret_df.rename(columns={'ym_x':'ret_date',
'mkt_cap_date':'mktcap_beta_date'},inplace=True)
ret_df
ret_df = ret_df[['secID','ret_date','ret','rf','exret','mktcap_beta_date','mkt_cap','beta']]
ret_df
调整return月份对应的bm月份
例:2007:12月的bm分组,对应的是 2008:07 -- 2009:06 的return
调整步骤:
ret_df['year'] = ret_df['ret_date'].dt.year
ret_df['month'] = ret_df['ret_date'].dt.month
ret_df['bm_date'] = ret_df['year'] - 1
idx = ret_df['month'].isin([1,2,3,4,5,6])
ret_df.loc[idx,'bm_date'] = ret_df.loc[idx,'bm_date'] - 1
ret_df
ret_df.loc[(ret_df['secID']=='300349.XSHE')&(ret_df['ret_date']>='2013-01')&(ret_df['ret_date']<='2013-12')]
pd.to_datetime(ret_df['bm_date'].astype('str'),format='%Y') + pd.tseries.offsets.YearEnd()
ret_df['year'] = ret_df['ret_date'].dt.year
ret_df['month'] = ret_df['ret_date'].dt.month
ret_df['bm_date'] = ret_df['year'] - 1
idx = ret_df['month'].isin([1,2,3,4,5,6])
ret_df.loc[idx,'bm_date'] = ret_df.loc[idx,'bm_date'] - 1
ret_df['bm_date'] = pd.to_datetime(ret_df['bm_date'].astype('str'),format='%Y') + pd.tseries.offsets.YearEnd()
ret_df['bm_date'] = ret_df['bm_date'].dt.to_period('M')
ret_df.drop(['month','year'], axis=1, inplace=True)
ret_df
bm_df
ret_df = pd.merge(ret_df,bm_df,on=['secID','bm_date'])
ret_df[ret_df['ret_date']<='2010-07']
gc.collect()
q = dict()
keys = ['q'+str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q.update(zip(keys,values))
quantile_df = pd.DataFrame()
for key, value in q.items():
quantile_df[key] = ret_df.groupby(['bm_date'])['bm'].quantile(value)
quantile_df
ret_df_q = pd.merge(ret_df, quantile_df, on='bm_date')
ret_df_q
portfolios = dict()
drop_cols = [col for col in ret_df_q.columns if col[0]=='q']
portfolios['p1'] = ret_df_q.loc[ret_df_q['bm'] <= ret_df_q['q1']].copy().drop(drop_cols, axis=1)
for i in range(2,10):
idx = (ret_df_q[f'q{i-1}'] <= ret_df_q['bm']) & (ret_df_q['bm'] <= ret_df_q[f'q{i}'])
portfolios[f'p{i}'] = ret_df_q.loc[idx].copy().drop(drop_cols, axis=1)
portfolios['p10'] = ret_df_q.loc[ret_df_q['bm'] >= ret_df_q['q9']].copy().drop(drop_cols, axis=1)
portfolios['p2']
portfolios['p2'].info()
for k in portfolios.keys():
print(portfolios[k].groupby(['ret_date'])['exret'].mean().mean())
portfolios_crs_mean = dict()
for k in portfolios.keys():
portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()
portfolios_crs_mean['p1']
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
y = portfolios_crs_mean[k]
const = np.full(shape=len(y),fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values[k] = reg.params[0]
t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
columns=mean_values.keys())
portfolios['p1']
portfolios[k]['1+ret'] = portfolios[k]['ret']+1
portfolios[k]['1+rf'] = portfolios[k]['rf']+1
portfolios[k]
portfolios[k].groupby(['secID','bm_date'],as_index=False)['1+ret'].prod()
pf_year_ret = {}
for k in portfolios.keys():
portfolios[k]['1+ret'] = portfolios[k]['ret']+1
portfolios[k]['1+rf'] = portfolios[k]['rf']+1
pf_year_ret[k] = portfolios[k].groupby(['secID','bm_date'],as_index=False)['1+ret'].prod()
pf_year_ret[k]['1+rf'] = portfolios[k].groupby(['secID','bm_date'],as_index=False)['1+rf'].prod()['1+rf']
pf_year_ret[k]['ret'] = pf_year_ret[k]['1+ret'] - 1
pf_year_ret[k]['rf'] = pf_year_ret[k]['1+rf'] - 1
pf_year_ret[k]['exret'] = pf_year_ret[k]['ret'] - pf_year_ret[k]['rf']
pf_year_ret['p1']
portfolios_crs_mean = dict()
for k in pf_year_ret.keys():
portfolios_crs_mean[k] = pf_year_ret[k].groupby(['bm_date'])['exret'].mean()
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
y = portfolios_crs_mean[k]
const = np.full(shape=len(y),fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values[k] = reg.params[0]
t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
columns=mean_values.keys())
# average beta in each portfolio
for key in portfolios.keys():
print(portfolios[key].groupby('bm_date')['bm'].mean().mean())
portfolios['p1'].groupby('bm_date')['secID'].nunique()
pf_n_stks = pd.DataFrame()
for key, value in portfolios.items():
pf_n_stks[key] = portfolios[key].groupby('bm_date')['secID'].nunique()
display(pf_n_stks)
pf_n_stks.plot()
portfolios['p10'].groupby('bm_date')['mkt_cap'].mean()/1e9
pf_mktcap = pd.DataFrame()
for key, value in portfolios.items():
pf_mktcap[key] = portfolios[key].groupby('bm_date')['mkt_cap'].mean()
display(pf_mktcap)
pf_mktcap.plot()
pf_mktcap = pf_mktcap / 1e10
for i in range(10):
print(pf_mktcap.mean()[i])
排序方法:t年7月至t+1年6月,按照t-1年12月的BM排序,考察区间内每月平均收益率以及区间年平均收益率
结论:
del portfolios, portfolios_crs_mean
# fundmen_df = DataAPI.FdmtBSGet(secID=stk_id,beginDate=START,endDate=END,publishDateEnd=u"",publishDateBegin=u"",endDateRep="",beginDateRep="",beginYear="",endYear="",fiscalPeriod="",field=["secID","publishDate","endDate","endDateRep","actPubtime","fiscalPeriod","TShEquity","TEquityAttrP","minorityInt"],pandas="1")
# fundmen_df.to_pickle('./data/fundmen_df_pit.pkl')
fundmen_df = pd.read_pickle('./data/fundmen_df_pit.pkl')
fundmen_df
fundmen_df[['publishDate','endDate']] = fundmen_df[['publishDate','endDate']].apply(pd.to_datetime)
fundmen_df.sort_values(['secID','publishDate','endDate'],inplace=True)
fundmen_df
fundmen_df.loc[fundmen_df.duplicated(['secID','publishDate'], keep=False)] # 同一报表中包含往期信息
fundmen_df.loc[~fundmen_df.duplicated(['secID','publishDate'], keep=False)] # 只有当期信息
fundmen_df = fundmen_df.groupby(['secID','publishDate'],as_index=False).last() #不涉及上上个报表的信息
fundmen_df
fundmen_df['secID'].str[-4:].unique()
fundmen_df.loc[fundmen_df['secID'].str.endswith('SHE2')]
DataAPI.FdmtBSGet(secID='000043.XSHE2',beginDate=START,endDate=END,publishDateEnd=u"",publishDateBegin=u"",endDateRep="",beginDateRep="",beginYear="",endYear="",fiscalPeriod="",field=["secID","secShortName","publishDate","endDate","endDateRep","actPubtime","fiscalPeriod","TShEquity","TEquityAttrP","minorityInt"],pandas="1")
fundmen_df = fundmen_df[(fundmen_df['secID'].str.endswith('XSHE')) | (fundmen_df['secID'].str.endswith('XSHG'))]
# # minorityInt 有时报告,有时不报告。空值时,假设就是上一次报告的值
# # fundmen_df['minorityInt'] = fundmen_df.groupby('secID')['minorityInt'].fillna(method='ffill')
# # 第一轮填完空值为有效数值后,剩下的空值再用0填充。
# fundmen_df['minorityInt'].fillna(0,inplace=True)
# fundmen_df['book'] = fundmen_df['TShEquity'] - fundmen_df['minorityInt']
fundmen_df['book'] = fundmen_df['TEquityAttrP']
fundmen_df
# fundmen_df['publishDate+1'] = fundmen_df['publishDate'] + dt.timedelta(days=1)
stk_fundmen_df = pd.merge(stk_df, fundmen_df[['secID','publishDate','endDate','book']],
left_on=['secID','tradeDate'], right_on=['secID','publishDate'],
how='outer')
stk_fundmen_df
stk_fundmen_df.loc[(stk_fundmen_df['secID']=='900957.XSHG')&(stk_fundmen_df['tradeDate']<='2016-08-10')&(stk_fundmen_df['tradeDate']>='2016-08-03')]
stk_fundmen_df.loc[(stk_fundmen_df['secID']=='000001.XSHE')&(stk_fundmen_df['tradeDate']<='2008')&(stk_fundmen_df['tradeDate']>='2007-08-10')]
idx = stk_fundmen_df.loc[stk_fundmen_df['tradeDate'].isna()].index
stk_fundmen_df.loc[stk_fundmen_df['tradeDate'].isna(),'tradeDate'] = stk_fundmen_df.loc[stk_fundmen_df['tradeDate'].isna(),'publishDate']
stk_fundmen_df.loc[idx]
stk_fundmen_df
stk_fundmen_df.sort_values(['secID','tradeDate'],inplace=True)
stk_fundmen_df
temp = stk_fundmen_df[stk_fundmen_df['secID']=='000001.XSHE'].copy()
temp
temp[~temp['book'].isna()]
stk_fundmen_df[['secID','publishDate','endDate','book']].groupby('secID').fillna(method='ffill')
stk_fundmen_df.shape
stk_fundmen_df.loc[:,['publishDate','endDate','book']] = stk_fundmen_df[['secID','publishDate','endDate','book']].groupby('secID').fillna(method='ffill')
## 查看数据
idx = pd.IndexSlice
stk_fundmen_df.set_index(['secID','tradeDate'],inplace=True)
pd.options.display.max_rows = 20
stk_fundmen_df.loc[idx['000001.XSHE','2010-03'],:]
stk_fundmen_df.loc[idx['000001.XSHE','2007-04'],:]
stk_fundmen_df.loc[idx['000001.XSHE','2007-04':'2007-06'],:]
stk_fundmen_df.reset_index(inplace=True)
stk_fundmen_df
stk_df_m = stk_fundmen_df.groupby(['secID','ym'],as_index=False).last()
stk_df_m['ret'] = stk_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1)
stk_df_m
stk_df_m['ret'] = stk_df_m.groupby(['secID'])['ret'].shift(-1)
stk_df_m['ret_date'] = stk_df_m.groupby('secID')['ym'].shift(-1) # 上一期的BM影响下一期
stk_df_m
这里处理停牌仍然可以用“填充NA日期”的办法,但需要在日度数据先填。日度数据填充可能会使数据变得很大,但应该更稳妥。
stk_df_m['ym_diff'] = stk_df_m['ret_date'].astype(int) - stk_df_m['ym'].astype(int)
stk_df_m.loc[stk_df_m['ym_diff'] != 1]
stk_df.loc[(stk_df['secID']=='000004.XSHE')&(stk_df['ym']>='2022-05')]
# 停牌删去
stk_df_m['ym_diff'] = stk_df_m['ret_date'].astype(int) - stk_df_m['ym'].astype(int)
stk_df_m.loc[stk_df_m['ym_diff'] != 1,'ret'] = np.nan
# 查看数据
temp = stk_df_m['secID'].unique()
display(stk_df_m[stk_df_m['secID'] == np.random.choice(temp,1)[0]])
del temp
stk_df_m.drop(['tradeDate','closePrice','publishDate','endDate', 'ym_diff'],axis=1,inplace=True)
stk_df_m.rename(columns={'ym':'mktcap_book_date','negMarketValue':'mktcap'},inplace=True)
stk_df_m.dropna(inplace=True)
stk_df_m
ret_df = pd.merge(stk_df_m, rf, left_on='ret_date',right_on='ym')
ret_df.drop('ym',axis=1,inplace=True)
ret_df['exret'] = ret_df['ret'] - ret_df['rf']
ret_df.sort_values(['secID','ret_date'],inplace=True)
ret_df.reset_index(drop=True,inplace=True)
# Use last month's beta for grouping
ret_df = pd.merge(ret_df,beta_m_df,left_on=['secID','mktcap_book_date'],right_on=['secID','ym'])
# display(ret_df)
ret_df.drop(['ym','rf','ret'],axis=1,inplace=True)
ret_df.rename(columns={'mktcap_book_date':'grouping_date'},inplace=True)
ret_df['bm'] = ret_df['book'] / ret_df['mktcap']
ret_df['size'] = np.log(ret_df['mktcap'])
ret_df = ret_df[['secID','grouping_date','size','mktcap','bm',
'beta','ret_date','exret']]
ret_df
q = dict()
keys = ['q'+str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q.update(zip(keys,values))
quantile_df = pd.DataFrame()
for key, value in q.items():
quantile_df[key] = ret_df.groupby(['grouping_date'])['bm'].quantile(value)
ret_df_q = pd.merge(ret_df, quantile_df, on='grouping_date')
portfolios = dict()
drop_cols = [col for col in ret_df_q.columns if col[0]=='q']
portfolios['p1'] = ret_df_q.loc[ret_df_q['bm'] <= ret_df_q['q1']].copy().drop(drop_cols, axis=1)
for i in range(2,10):
idx = (ret_df_q[f'q{i-1}'] <= ret_df_q['bm']) & (ret_df_q['bm'] <= ret_df_q[f'q{i}'])
portfolios[f'p{i}'] = ret_df_q.loc[idx].copy().drop(drop_cols, axis=1)
portfolios['p10'] = ret_df_q.loc[ret_df_q['bm'] >= ret_df_q['q9']].copy().drop(drop_cols, axis=1)
portfolios_crs_mean = dict()
for k in portfolios.keys():
portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
y = portfolios_crs_mean[k]
const = np.full(shape=len(y),fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values[k] = reg.params[0]
t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
columns=mean_values.keys())
结论:
# %%time
# begin_ = 2007
# yesterday = dt.datetime.today() - dt.timedelta(days=1)
# yesterday.strftime('%Y%m%d')
# pb_df = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
# beginDate=f'{begin_}0101',
# endDate=yesterday,
# field=['secID','tradeDate','PB'],pandas="1")
# # # 从优矿下载 PB,时间较长。由于优矿的限制,每次下载3年的数据
# %%time
# pb = {}
# begin_ = 2007
# end_ = 2010
# while begin_ <= 2024:
# if begin_ == 2024:
# yesterday = dt.datetime.today() - dt.timedelta(days=1)
# yesterday.strftime('%Y%m%d')
# pb[begin_] = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
# beginDate=f'{begin_}0101',
# endDate=yesterday,
# field=['secID','tradeDate','PB'],pandas="1")
# else:
# pb[begin_] = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
# beginDate=f'{begin_}0101',
# endDate=f'{end_}1231',
# field=['secID','tradeDate','PB'],pandas="1")
# begin_ = end_ + 1
# end_ = begin_ + 3
# for i in range(len(pb)):
# pb_df = pd.DataFrame(np.vstack([_df for _df in pb.values()]),columns=['secID','tradeDate','PB'])
# pb_df.to_pickle('./data/pb_df.pkl')
# # About 5mins
pb_df = pd.read_pickle('./data/pb_df.pkl')
pb_df['tradeDate'] = pd.to_datetime(pb_df['tradeDate'])
pb_df['PB'] = pd.to_numeric(pb_df['PB'])
pb_df['grouping_date'] = pb_df['tradeDate'].dt.to_period('M')
pb_df = pb_df.groupby(['secID','grouping_date'],as_index=False).last()
pb_df['bm_uqer'] = 1 / pb_df['PB']
ret_df = pd.merge(ret_df,pb_df[['secID','grouping_date','bm_uqer']],on=['secID','grouping_date'])
del pb_df
ret_df
ret_df.loc[ret_df['secID']=='000001.XSHE',['grouping_date','bm','bm_uqer']].set_index('grouping_date').plot()
sample_id = np.random.choice(ret_df['secID'].unique(),1)
display(ret_df.loc[ret_df['secID'].isin(sample_id),['grouping_date','bm','bm_uqer']])
ret_df.loc[ret_df['secID'].isin(sample_id),['grouping_date','bm','bm_uqer']].set_index('grouping_date').plot()
ret_df.loc[ret_df['secID'].isin(sample_id),['grouping_date','bm','bm_uqer']].set_index('grouping_date').plot(subplots=True)
q = dict()
keys = ['q'+str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q.update(zip(keys,values))
quantile_df = pd.DataFrame()
for key, value in q.items():
quantile_df[key] = ret_df.groupby(['grouping_date'])['bm_uqer'].quantile(value)
ret_df_q = pd.merge(ret_df, quantile_df, on='grouping_date')
portfolios = dict()
drop_cols = [col for col in ret_df_q.columns if col[0]=='q']
portfolios['p1'] = ret_df_q.loc[ret_df_q['bm_uqer'] <= ret_df_q['q1']].copy().drop(drop_cols, axis=1)
for i in range(2,10):
idx = (ret_df_q[f'q{i-1}'] <= ret_df_q['bm_uqer']) & (ret_df_q['bm_uqer'] <= ret_df_q[f'q{i}'])
portfolios[f'p{i}'] = ret_df_q.loc[idx].copy().drop(drop_cols, axis=1)
portfolios['p10'] = ret_df_q.loc[ret_df_q['bm_uqer'] >= ret_df_q['q9']].copy().drop(drop_cols, axis=1)
portfolios_crs_mean = dict()
for k in portfolios.keys():
portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
y = portfolios_crs_mean[k]
const = np.full(shape=len(y),fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values[k] = reg.params[0]
t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
columns=mean_values.keys())
q_size = dict()
keys = ['q_size_1']
values = [0.5]
q_size.update(zip(keys,values))
q_bm = dict()
keys = ['q_bm_1','q_bm_2']
values = [0.3, 0.7]
q_bm.update(zip(keys,values))
q_size_df = pd.DataFrame()
for key, value in q_size.items():
q_size_df[key] = ret_df.groupby(['grouping_date'])['size'].quantile(value)
q_bm_df = pd.DataFrame()
for key, value in q_bm.items():
q_bm_df[key] = ret_df.groupby(['grouping_date'])['bm'].quantile(value)
ret_df_q = pd.merge(ret_df, q_size_df, on='grouping_date')
ret_df_q = pd.merge(ret_df_q, q_bm_df, on='grouping_date')
portfolios_size = dict()
portfolios_size['size1'] = ret_df_q.loc[ret_df_q['size'] <= ret_df_q['q_size_1'],
['secID','grouping_date','ret_date','exret','size','mktcap']]
portfolios_size['size2'] = ret_df_q.loc[ret_df_q['size'] >= ret_df_q['q_size_1'],
['secID','grouping_date','ret_date','exret','size','mktcap']]
portfolios_bm = dict()
portfolios_bm['bm1'] = ret_df_q.loc[ret_df_q['bm'] <= ret_df_q['q_bm_1'],
['secID','grouping_date','ret_date','exret','bm']]
portfolios_bm['bm2'] = ret_df_q.loc[(ret_df_q['bm'] >= ret_df_q['q_bm_1']) & \
(ret_df_q['bm'] <= ret_df_q['q_bm_2']),
['secID','grouping_date','ret_date','exret','bm']]
portfolios_bm['bm3'] = ret_df_q.loc[ret_df_q['bm'] >= ret_df_q['q_bm_2'],
['secID','grouping_date','ret_date','exret','bm']]
portfolios = dict()
for bm_group in portfolios_bm.keys():
for size_group in portfolios_size.keys():
portfolios[f'{bm_group}_{size_group}'] = pd.merge(portfolios_size[size_group],
portfolios_bm[bm_group][['secID','ret_date','bm']],
on=['secID','ret_date'])
mean_portfolios_ret = dict()
for pf in portfolios.keys():
mean_portfolios_ret[pf] = portfolios[pf].groupby('ret_date')['exret'].mean()
print(mean_portfolios_ret[pf].shape) # print 看一下会不会存在某个月份上没有bm和size分组没有任何交叉
# Fast merge by stacking
mean_portfolios_ret_df = pd.DataFrame(np.vstack([pf for pf in mean_portfolios_ret.values()])).T
mean_portfolios_ret_df.columns = mean_portfolios_ret.keys()
mean_portfolios_ret_df.index = mean_portfolios_ret['bm1_size1'].index
# Newey-West adjustment
mean_values = {}
t_values = {}
for k in mean_portfolios_ret.keys():
y = mean_portfolios_ret[k]
const = np.full(shape=len(y),fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=4)
mean_values[k] = reg.params[0]
t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
# ret_df[(ret_df['ret_date'] >= '2008-02') & (ret_df['secID'] == '000001.XSHE')]
ret_df['exret100'] = ret_df['exret'] * 100
def fm_reg(df):
df_ = df.dropna()
if df_.shape[0] < 15:
return None
reg = LinearRegression().fit(y=df_.loc[:,'exret100'], X=df_.loc[:,['beta','size','bm']])
return np.insert(reg.coef_, 0, reg.intercept_)
temp = ret_df.groupby('ret_date').apply(fm_reg)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept', 'beta_coef','size_coef', 'bm_coef']
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
y = reg_result_df[k]
const = np.full(shape=len(y),fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values[k] = reg.params[0]
t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
portfolios
Fama-French 3 factors 的构建:
构建方法:
这里我们还是按照mktcap组合的构建日期,不改成和 Fama-French (1993) 原文一样的日期(t年6月)
portfolios_vwret = {}
for pf in portfolios.keys():
temp = portfolios[pf].groupby('ret_date')['mktcap'].agg({'mktcapsum':np.sum})
portfolios[pf] = pd.merge(portfolios[pf], temp, on='ret_date')
portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcapsum']
portfolios[pf]['weighted_exret'] = portfolios[pf]['exret'] * portfolios[pf]['weight']
portfolios_vwret[pf] = portfolios[pf].groupby('ret_date')['weighted_exret'].sum()
portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
portfolios_vwret_df.index = portfolios_vwret['bm1_size1'].index
portfolios_vwret_df.columns = portfolios_vwret.keys()
portfolios_vwret_df.rename(columns={"bm1_size1": "SL",
"bm2_size1": "SM",
"bm3_size1": "SH",
"bm1_size2": "BL",
"bm2_size2": "BM",
"bm3_size2": "BH"},
inplace=True) # vw: value weighted
portfolios_vwret_df
SMB = (portfolios_vwret_df['SL'] + portfolios_vwret_df['SM'] + portfolios_vwret_df['SH']) / 3 - \
(portfolios_vwret_df['BL'] + portfolios_vwret_df['BM'] + portfolios_vwret_df['BH']) / 3
HML = (portfolios_vwret_df['SH'] + portfolios_vwret_df['BH']) / 2 - \
(portfolios_vwret_df['SL'] + portfolios_vwret_df['BL']) / 2
factors_df = pd.DataFrame(np.vstack([SMB,HML])).T
factors_df.columns = ['SMB','HML']
factors_df.index = SMB.index
factors_df
百度百科:中证800指数是由中证指数有限公司编制,其成份股是由中证500和沪深300成份股一起构成,中证800指数综合反映沪深证券市场内大中市值公司的整体状况。
# index_info = DataAPI.SecIDGet(assetClass="IDX",pandas="1")
# 用中证800作为market return
sec_id = ['000906.ZICN']
index_df = DataAPI.MktIdxdGet(indexID=sec_id,beginDate=START,endDate=END,field=['indexID','secShortName','tradeDate','closeIndex','CHGPct'],pandas="1")
index_df['tradeDate'] = pd.to_datetime(index_df['tradeDate'])
index_df['ret_date'] = index_df['tradeDate'].dt.to_period('M')
index_df.sort_values('tradeDate',inplace=True)
index_df = index_df.groupby('ret_date',as_index=False).last()
index_df['mktret'] = index_df['closeIndex'] / index_df['closeIndex'].shift() - 1
index_df = pd.merge(index_df,rf,left_on=['ret_date'],right_on=['ym'])
index_df['exmktret'] = index_df['mktret'] - index_df['rf']
index_df.drop(['ym','mktret','rf','indexID','secShortName','tradeDate',
'closeIndex','CHGPct'],axis=1,inplace=True)
index_df.dropna(inplace=True)
factors_df = pd.merge(index_df, factors_df, on='ret_date')
factors_df['ret_date'] = factors_df['ret_date'].dt.to_timestamp(how='end').dt.normalize()
factors_df.set_index('ret_date',inplace=True)
((1 + factors_df).cumprod()*100).plot()
((1 + factors_df.loc['2018':]).cumprod()*100).plot()
factors_df
factors_df.to_csv('./output_data/factors/ff3.csv')
small_only = (portfolios_vwret_df['SL'] + portfolios_vwret_df['SM'] + portfolios_vwret_df['SH']) / 3
high_only = (portfolios_vwret_df['SH'] + portfolios_vwret_df['BH']) / 2
factors_long_df = pd.DataFrame(np.vstack([small_only,high_only])).T
factors_long_df.columns = ['small_only','high_only']
factors_long_df.index = small_only.index
factors_long_df = pd.merge(index_df, factors_long_df, on='ret_date')
factors_long_df['ret_date'] = factors_long_df['ret_date'].dt.to_timestamp(freq='day',how='end').dt.normalize()
factors_long_df.set_index('ret_date',inplace=True)
((1 + factors_long_df).cumprod()*100).plot()
((1 + factors_long_df.loc['2018':]).cumprod()*100).plot()
factors_long_df
factors_long_df.to_csv('./output_data/factors/ff3_long_only.csv')