import numpy as np
import pandas as pd
import datetime as dt
pd.set_option('display.max_rows', 20)
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import gc
plt.rcParams['figure.figsize'] = (16.0, 9.0)
START = '2007-01-01'
END = '2022-03-31'
# Inspect the vendor code table for the 'UN' value (notebook-style exploration).
DataAPI.SysCodeGet(codeTypeID=u"",valueCD='UN',field=u"",pandas="1")
# Security IDs for all equities ("E").
# Fix: the original fetched this identical table twice; the redundant
# second API call is removed.
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
stk_info['listStatusCD'].unique()
# Keep Shenzhen/Shanghai exchanges and listed ('L') or delisted ('DE') stocks.
cond1 = (stk_info['exchangeCD'] == 'XSHE') | (stk_info['exchangeCD'] == 'XSHG')
cond2 = (stk_info['listStatusCD'] == 'L') | (stk_info['listStatusCD'] == 'DE')
stk_info = stk_info[cond1 & cond2].copy()
stk_id = stk_info['secID']
# ST (special treatment) flags; used later to drop ST stock-days.
st_df = DataAPI.SecSTGet(beginDate=START,endDate=END,secID=stk_id,field=['secID','tradeDate','STflg'],pandas="1")
st_df.info()
st_df
st_df['STflg'].unique()
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'],format="%Y-%m-%d")
# Peek at the raw monthly risk-free-rate file, then build a clean table:
# one row per year-month period ('ym') with the monthly rf rate.
pd.read_csv("rf-monthly.csv").head()
rf = pd.read_csv("rf-monthly.csv").drop(["Unnamed: 4", "年份_Year", "月份_Month"], axis=1)
rf.columns = ['Date', 'rf']
# Parse dates and collapse them to monthly periods in one chain.
rf['Date'] = pd.to_datetime(rf["Date"]).dt.to_period('M')
rf = rf.rename(columns={'Date': 'ym'})
rf
rf.set_index('ym').plot()
Beta本身不是一个可以直接获取的公司特征,因此,需要计算:
计算量很大,因为涉及到几千只股票,每只个股要算比较长的一段时间(比如2007-2020有$14*12 = 168$个月,也即168个回归)。3000只股票,则回归的数目是 $168*3000 = 504,000$。
因此,从实际操作的角度来讲,第一步的时序回归计算factor exposure本身计算量很大。所以,对于公司特征类因子,直接使用公司特征作为因子暴露,在经济上也是一个合理的选择。
# ---------------------------------------------------------------------------
# Beta factor exposures. Beta60/Beta120/Beta252 are presumably rolling betas
# over 60/120/252 trading days -- TODO confirm with the vendor docs.
# The commented-out code below is the one-off download script; the data are
# cached to ./data/beta_df.pkl and reloaded from disk.
# ---------------------------------------------------------------------------
# %time
# begin_ = 2007
# yesterday = dt.datetime.today() - dt.timedelta(days=1)
# yesterday.strftime('%Y%m%d')
# beta_df = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
# beginDate=f'{begin_}0101',
# endDate=yesterday,
# field=['secID','tradeDate','Beta60','Beta120','Beta252'],pandas="1")
# # # Download betas from Uqer; slow. Due to the API's limits, fetch about 3 years per request.
# beta = {}
# begin_ = 2007
# end_ = 2010
# i = 0
# while end_ <= 2022:
# if end_ == 2022:
# yesterday = dt.datetime.today() - dt.timedelta(days=1)
# yesterday.strftime('%Y%m%d')
# beta[i] = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
# beginDate=f'{begin_}0101',
# endDate=yesterday,
# field=['secID','tradeDate','Beta60','Beta120','Beta252'],pandas="1")
# else:
# beta[i] = DataAPI.MktStockFactorsDateRangeProGet(secID=stk_id,
# beginDate=f'{begin_}0101',
# endDate=f'{end_}1231',
# field=['secID','tradeDate','Beta60','Beta120','Beta252'],pandas="1")
# begin_ = end_ + 1
# end_ = begin_ + 3
# i = i+1
# for i in range(4):
# beta_df = pd.DataFrame(np.vstack([_df for _df in beta.values()]),columns=['secID','tradeDate','Beta60','Beta120','Beta252'])
# beta_df.to_pickle('./data/beta_df.pkl')
# Load the cached beta panel.
beta_df = pd.read_pickle('./data/beta_df.pkl')
beta_df
beta_df.info()
beta_df.isna().sum()
beta_df['tradeDate'] = pd.to_datetime(beta_df['tradeDate'], format="%Y-%m-%d")
# Year-month period column; monthly grouping key used throughout the notebook.
beta_df['ym'] = beta_df['tradeDate'].dt.to_period('M')
beta_df
beta_df.info()
# The cached frame was assembled with np.vstack (see above), which yields
# object-dtype columns; convert the betas back to numeric.
beta_df[['Beta60','Beta120','Beta252']] = beta_df[['Beta60','Beta120','Beta252']].apply(pd.to_numeric)
做以下观察:
# Coverage check: fraction of stocks each month with a non-missing Beta252,
# relative to all stocks present in the beta panel that month.
n_stk_avail = beta_df[['secID','tradeDate','ym','Beta252']].dropna().groupby('ym')['secID'].nunique()
n_stk = beta_df[['secID','tradeDate','ym','Beta252']].groupby('ym')['secID'].nunique()
(n_stk_avail / n_stk).plot()
# Same coverage check for Beta60 (shorter estimation window).
n_stk_avail = beta_df[['secID','tradeDate','ym','Beta60']].dropna().groupby('ym')['secID'].nunique()
n_stk = beta_df[['secID','tradeDate','ym','Beta60']].groupby('ym')['secID'].nunique()
(n_stk_avail / n_stk).plot()
# Distribution diagnostics: the extreme tails of Beta60 found here motivate
# the winsorization step below.
beta_df[['Beta60','Beta120','Beta252']].describe().round(2)
beta_df[['Beta60','Beta120','Beta252']].min()
beta_df[['Beta60','Beta120','Beta252']].max()
beta_df['Beta60'].quantile(0.99)
beta_df['Beta60'].quantile(0.9999)
beta_df['Beta60'].quantile(1e-4)
beta_df['Beta60'].quantile(0.99999)
beta_df['Beta60'].quantile(1e-5)
beta_df['Beta60'].nlargest(20)
beta_df['Beta60'].nsmallest(20)
# Rows holding the single most extreme Beta60 values.
# beta_df[beta_df['Beta60'] == beta_df['Beta60'].min()]
beta_df.loc[[beta_df['Beta60'].idxmin()]]
beta_df.loc[[beta_df['Beta60'].idxmax()]]
# Beta60 time series for two specific stocks (presumably the extreme cases
# identified above -- verify against the displayed idxmin/idxmax rows).
beta_df.loc[beta_df['secID']=='002499.XSHE',['Beta60','tradeDate']].set_index('tradeDate').plot()
beta_df.loc[beta_df['secID']=='688112.XSHG',['Beta60','tradeDate']].set_index('tradeDate').plot()
# Per-stock time-series variance of each beta, summarized across stocks.
num_cols = beta_df.select_dtypes(np.number).columns
beta_df.groupby('secID')[num_cols].var().describe().round(2)
# # Winsorization
# beta_df.loc[beta_df['Beta60'] > beta_df['Beta60'].quantile(0.99999),'Beta60'] = beta_df['Beta60'].quantile(0.99999)
# beta_df.loc[beta_df['Beta60'] < beta_df['Beta60'].quantile(0.00001),'Beta60'] = beta_df['Beta60'].quantile(0.00001)
# beta_df.loc[beta_df['Beta120'] > beta_df['Beta120'].quantile(0.99999),'Beta120'] = beta_df['Beta120'].quantile(0.99999)
# beta_df.loc[beta_df['Beta120'] < beta_df['Beta120'].quantile(0.00001),'Beta120'] = beta_df['Beta120'].quantile(0.00001)
# clip() is the vectorized equivalent of the commented .loc assignments above:
# cap Beta60/Beta120 at their pooled 0.001% / 99.999% quantiles.
beta_df['Beta60_winsor'] = beta_df['Beta60'].clip(lower=beta_df['Beta60'].quantile(0.00001),upper=beta_df['Beta60'].quantile(0.99999))
beta_df['Beta120_winsor'] = beta_df['Beta120'].clip(lower=beta_df['Beta120'].quantile(0.00001),upper=beta_df['Beta120'].quantile(0.99999))
# NOTE(review): Beta252 is not winsorized; the sorts below use raw Beta252 --
# confirm this is intentional.
num_cols = beta_df.select_dtypes(np.number).columns
beta_df.groupby('secID')[['Beta60_winsor','Beta120_winsor','Beta252']].var().describe().round(2)
# Eyeball beta paths for 10 randomly chosen stocks.
stk_id_beta = beta_df['secID'].unique()
stk_picked = np.random.choice(stk_id_beta, 10)
print(stk_picked)
beta_plot = beta_df.loc[beta_df['secID'].isin(stk_picked)]
beta_plot.pivot(index='tradeDate', columns='secID', values='Beta252').plot()
beta_plot.pivot(index='tradeDate', columns='secID', values='Beta60').plot()
# Zoom in on two of the sampled stocks.
stk_ = np.random.choice(stk_picked,2)
beta_plot.pivot(index='tradeDate', columns='secID', values='Beta60')[[stk_[0],stk_[1]]].plot()
beta_plot.pivot(index='tradeDate', columns='secID', values='Beta120')[[stk_[0],stk_[1]]].plot()
# Correlation of the three beta horizons, computed stock by stock.
beta_corr = beta_df.groupby('secID')[['Beta60','Beta120','Beta252']].corr()
beta_corr
beta_corr['Beta252'].min()
beta_corr.loc[[beta_corr['Beta252'].idxmin()]]
beta_corr.loc['301155.XSHE']
beta_df.loc[beta_df['secID']=='301155.XSHE',['tradeDate','Beta60','Beta120','Beta252']].set_index('tradeDate').plot()
beta_df.loc[beta_df['secID']=='605183.XSHG',['tradeDate','Beta60','Beta120','Beta252']].set_index('tradeDate').plot()
# Average correlation of Beta60/Beta120 with Beta252 across stocks.
beta_corr.loc[pd.IndexSlice[:,['Beta60','Beta120']],'Beta252'].mean()
# Month-end beta: last daily Beta252 observation within each (stock, month).
beta_df.groupby(['secID','ym'])['Beta252'].last()
beta_m_df = beta_df.groupby(['secID','ym'],as_index=False)['Beta252'].last()
beta_m_df
# One-off download of the daily equity panel (cached to pickle below).
# stk_df = DataAPI.MktEqudAdjAfGet(secID=stk_id,beginDate=START,endDate=END,isOpen=1,
# field=["secID","tradeDate",
# "closePrice",
# "negMarketValue"],pandas="1")
# stk_df.to_pickle('./data/stk_df.pkl')
# Load the cached daily panel: adjusted close ("AdjAf" suggests
# after-the-fact/forward adjustment -- TODO confirm) and negMarketValue
# (per vendor naming, negotiable/free-float market cap -- verify).
stk_df = pd.read_pickle('./data/stk_df.pkl')
stk_df
stk_df.info()
stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
stk_df['ym'] = stk_df['tradeDate'].dt.to_period('M')
stk_df.sort_values(['secID','tradeDate'],inplace=True)
# NOTE(review): st_df['tradeDate'] was already converted earlier; this repeat
# conversion is a harmless no-op.
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'], format='%Y-%m-%d')
stk_df.dropna().shape
stk_df.shape
# Exclude ST (special treatment) stock-days: left-join the ST flags and keep
# only rows with no flag.
stk_df = pd.merge(stk_df, st_df, on=['secID','tradeDate'],how='left')
stk_df = stk_df[stk_df['STflg'].isna()].copy()
stk_df.drop('STflg',axis=1,inplace=True)
stk_df.shape
# Collapse daily data to monthly: take the last observation within each
# (stock, year-month) group.
stk_df_m = stk_df.groupby(['secID','ym'],as_index=False).last()
# Some databases set a stock's monthly return to NaN when it has no trade on
# the month's last trading day. We do not: any trading day within the month
# qualifies, and .last() picks the last non-missing value within the group.
# Small demo that groupby().last() skips NaN values:
from myutils import utils as ut
temp = ut.makeStockFrame()
temp
temp.loc[5,'date'] = np.nan
temp.drop(5).groupby('variable').last()
# Spot-check a stock over a window that contains missing months.
stk_df_m[(stk_df_m['secID']=='000001.XSHE') & (stk_df_m['tradeDate']>='2010-05') & (stk_df_m['tradeDate']<='2010-10')]
# Fix: removed a stray, incomplete scratch line `np.where(full_dates == )`
# that was here -- it was a SyntaxError and referenced `full_dates` before
# its definition.
def fill_missing(df, full_dates, id_col='secID', date_col='ym'):
    """Reindex one security's rows onto a contiguous run of dates.

    Parameters:
        df: dataframe for a single security (e.g. one groupby group),
            sorted on `date_col`.
        full_dates: sorted array of the unique dates spanning the full panel.
        id_col: name of the security-id column.
        date_col: name of the date column.
    Returns:
        A dataframe covering every date between the security's first and
        last observation; dates it never traded on appear as NaN rows.
    """
    sec_ids = df[id_col].unique()
    # Positions of the security's first and last observed dates in the
    # panel-wide date vector.
    first = np.flatnonzero(full_dates == df[date_col].min())[0]
    last = np.flatnonzero(full_dates == df[date_col].max())[0]
    span = full_dates[first:last + 1]
    # Full (security, date) grid over the span; reindexing onto it inserts
    # NaN rows for the missing dates.
    full_index = pd.MultiIndex.from_product([sec_ids, span],
                                            names=(id_col, date_col))
    return df.set_index([id_col, date_col]).reindex(full_index).reset_index()
# All unique year-months in the panel, sorted: the reindex target for fill_missing.
full_dates = np.sort(stk_df['ym'].unique())
full_dates
# Minimal illustration of MultiIndex.from_product + reindex.
x = ['a','b','c']
y = full_dates[0:3]
idx = pd.MultiIndex.from_product([x, y])
idx
temp = pd.DataFrame({'x':['a','a','b','c'],'y':[full_dates[0],full_dates[1],full_dates[1],full_dates[2]],'value':[1,2,3,4]}).set_index(['x','y'])
temp
temp.reindex(idx).reset_index()
# Walk through the fill_missing logic step by step for one stock.
temp = stk_df_m[stk_df_m['secID']=='300432.XSHE'].copy()
date_start = np.where(full_dates == temp['ym'].min())[0][0]
date_end = np.where(full_dates == temp['ym'].max())[0][0]
dates = full_dates[date_start:date_end+1]
len(dates)
temp['ym'].shape
# Months inside the stock's listing span with no observation.
np.setdiff1d(dates,temp['ym'])
one_stk_id = temp['secID'].unique()
idx = pd.MultiIndex.from_product([one_stk_id, dates],names=('secID','ym'))
temp = temp.set_index(['secID','ym']).reindex(idx)
temp
%%time
# Insert NaN rows for every stock's missing months so that the shift()-based
# return below never spans a gap.
stk_df_m = stk_df_m.groupby('secID').apply(fill_missing, full_dates=full_dates)
stk_df_m[stk_df_m['secID'] == one_stk_id[0]]
stk_df_m.reset_index(drop=True, inplace=True)
stk_df_m.drop('tradeDate',axis=1,inplace=True)
stk_df_m
# Simple monthly return from adjusted close: P_t / P_{t-1} - 1. A NaN close
# (a filled missing month) makes both that month's and the following month's
# return NaN.
stk_df_m['ret'] = stk_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1)
stk_df_m[stk_df_m['closePrice'].isna()]
stk_df_m[stk_df_m['ret'].isna()]
# # Example of groupby apply
# import pandas.util.testing as tm
# def unpivot(frame):
# N, K = frame.shape
# data = {
# "value": frame.to_numpy().ravel("F"),
# "variable": np.asarray(frame.columns).repeat(N),
# "date": np.tile(np.asarray(frame.index), K),
# }
# return pd.DataFrame(data, columns=["date", "variable", "value"])
# temp = tm.makeTimeDataFrame(3)
# temp = unpivot(temp)
# temp
# temp.groupby('variable')['value'].apply(lambda x: x / x.shift() - 1)
stk_df_m
# Use last month's market cap for sorting
stk_df_m['mkt_cap'] = stk_df_m.groupby('secID')['negMarketValue'].shift()
# Label the month the sorting characteristic was measured in (t-1).
stk_df_m['mkt_cap_month'] = stk_df_m.ym - 1
stk_df_m[stk_df_m['ret'].isna()]
stk_df_m.drop(['closePrice','negMarketValue'],axis=1,inplace=True)
stk_df_m.dropna(inplace=True)
stk_df_m
# Excess return: monthly return minus the monthly risk-free rate.
ret_df = pd.merge(stk_df_m, rf, on='ym')
ret_df['exret'] = ret_df['ret'] - ret_df['rf']
ret_df.sort_values(['secID','ym'],inplace=True)
ret_df.reset_index(drop=True,inplace=True)
ret_df
beta_m_df
# Use last month's beta for grouping
# Join return month t (via mkt_cap_month = t-1 on the left) with the
# month-end Beta252 measured in t-1 (ym on the right).
ret_df = pd.merge(ret_df,beta_m_df,left_on=['secID','mkt_cap_month'],right_on=['secID','ym'])
ret_df
ret_df.drop(['ym_y','rf','ret'],axis=1,inplace=True)
# ym_x is the return month; mkt_cap_month is the formation (sorting) month.
ret_df.rename(columns={'ym_x':'ret_month',
                       'mkt_cap_month':'group_month'},inplace=True)
ret_df = ret_df[['secID','ret_month','exret','group_month','mkt_cap','Beta252']]
ret_df
gc.collect()
# Monthly decile breakpoints (q1..q9) of Beta252 within each formation month.
keys = ['q' + str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q = dict(zip(keys, values))
q
beta_by_month = ret_df.groupby(['group_month'])['Beta252']
quantile_df = pd.DataFrame({name: beta_by_month.quantile(level)
                            for name, level in q.items()})
quantile_df
# Attach each month's breakpoints to every stock-month row.
ret_df_q = pd.merge(ret_df, quantile_df, on='group_month')
ret_df_q
# Form beta-sorted decile portfolios.
# Fix: interior bins now use a strict lower bound (q_{i-1} < beta <= q_i) and
# p10 uses beta > q9. The original used <= / >= on both sides, so a stock whose
# Beta252 equaled a breakpoint was double-counted in two adjacent portfolios.
portfolios = dict()
cols = ['secID','group_month','Beta252','mkt_cap','ret_month','exret']
portfolios['p1'] = ret_df_q.loc[ret_df_q['Beta252'] <= ret_df_q['q1'], cols].copy()
for i in range(2,10):
    idx = (ret_df_q[f'q{i-1}'] < ret_df_q['Beta252']) & (ret_df_q['Beta252'] <= ret_df_q[f'q{i}'])
    portfolios[f'p{i}'] = ret_df_q.loc[idx, cols].copy()
portfolios['p10'] = ret_df_q.loc[ret_df_q['Beta252'] > ret_df_q['q9'], cols].copy()
portfolios['p2']
# Monthly cross-sectional mean excess return of p1, then the time-series
# average of the monthly means for every decile.
portfolios['p1'].groupby(['ret_month'])['exret'].mean()
for k in portfolios.keys():
    print(portfolios[k].groupby(['ret_month'])['exret'].mean().mean())
# Time-series mean of each decile's monthly mean excess return, with
# Newey-West (HAC, 6 lags) t-stats obtained by regressing the series on a
# constant -- the constant's coefficient equals the sample mean.
portfolios_crs_mean = dict()
for k in portfolios.keys():
    portfolios_crs_mean[k] = portfolios[k].groupby(['ret_month'])['exret'].mean()
portfolios_crs_mean['p1']
# Worked example for p1: OLS on a constant reproduces y.mean().
y = portfolios_crs_mean['p1']
const = np.full(shape=len(y),fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
y.mean()
reg.params
reg.tvalues
# Mean and HAC t-stat for every decile.
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
# High-minus-low spread (long p10, short p1).
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
# Summary table: one column per portfolio, rows = mean and t-value.
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
             columns=mean_values.keys())
portfolios['p1']
# average beta in each portfolio
# Time-series average of each decile's mean Beta252 -- checks the sort is monotone.
for key in portfolios.keys():
    print(portfolios[key].groupby('group_month')['Beta252'].mean().mean())
# Number of distinct stocks per month in each decile.
portfolios['p1'].groupby('group_month')['secID'].nunique()
pf_n_stks = pd.DataFrame()
for key, value in portfolios.items():
    pf_n_stks[key] = portfolios[key].groupby('group_month')['secID'].nunique()
display(pf_n_stks)
pf_n_stks.plot()
# Average market cap per decile per month.
portfolios['p1'].groupby('group_month')['mkt_cap'].mean()
pf_mktcap = pd.DataFrame()
for key, value in portfolios.items():
    pf_mktcap[key] = portfolios[key].groupby('group_month')['mkt_cap'].mean()
display(pf_mktcap)
pf_mktcap.plot()
# Rescale (units of 1e10) for readability and print each decile's average.
pf_mktcap = pf_mktcap / 1e10
for i in range(10):
    print(pf_mktcap.mean()[i])
排序方法:按照上月末 beta252 分十组,考察下月各组合简单平均收益率
结论:
size-effect 指的是小市值股票的收益率比大市值股票更好. Fama and French (1992, 1993) 是最著名的两篇文章. 但这个现象更早就被发现了, 例如 Banz (1981), Shapiro (1986). Fama and French (2012) 说在全球市场都有类似的现象.
FF (1992) 总结了这个现象的已有文献并进一步做了检验, FF(1993)在此基础上构建了SMB因子, 并发现所谓Fama French 3 因子模型比 CAPM 表现更好.
FF 的市值是这样计算的: t年6月底的股价 * t年6月底的股本数, 也即:
$$MktCap_{i,t}^{FF} = \frac{ShareOut_{i, June} \times Price_{i, June}}{1000}$$作为t年至t+1年的市值分类标准. 这样做是为了避免价格的频繁变动本身带来的市值和收益率之间的相关性.
对于美国数据, ShareOut是按照1000股来衡量的, 上式再除以1000, 则$MktCap$的量纲就是"百万美元".
另外一种算法更直接:
$$MktCap_{i,t} = \frac{ShareOut_{i, t} \times Price_{i, t}}{1000}$$对于美国市场, 两种算法得到的结果差别不大.
在做回归分析的时候, 一般还会把$MktCap$取log, 因为一般来讲会有一小部分股票有极大的市值, 而大部分股票的市值与其相比很小. 回归会受到这个影响. 而sorting不会.
我们在中国市场看看小市值股票是否比大市值股票表现更好. 简单起见, 我们用后一种MktCap算法.
# Scale market cap to millions and define size = log(mkt_cap); the log tames
# the heavy right tail of the market-cap distribution.
ret_df['mkt_cap'] = ret_df['mkt_cap'] / 1e6
ret_df['size'] = np.log(ret_df['mkt_cap'])
# Monthly boxplots of log-size (2018 onward) ...
mktcap_group = ret_df.loc[ret_df['group_month'] >= '2018',['group_month','size']].groupby('group_month')
mktcap_group.boxplot(subplots=False, showmeans=True)
# ... versus raw market cap, to show the skew the log removes.
mktcap_group = ret_df.loc[ret_df['group_month'] >= '2018',['group_month','mkt_cap']].groupby('group_month')
mktcap_group.boxplot(subplots=False, showmeans=True)
# Monthly mean and median market cap over the full sample.
mktcap_group = ret_df.loc[:,['group_month','mkt_cap']].groupby('group_month')
mktcap_group.agg([np.mean, np.median]).plot()
# Monthly decile breakpoints (q1..q9) of lagged market cap.
keys = ['q' + str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q = dict(zip(keys, values))
mktcap_by_month = ret_df.groupby(['group_month'])['mkt_cap']
quantile_df = pd.DataFrame({name: mktcap_by_month.quantile(level)
                            for name, level in q.items()})
display(quantile_df)
# Attach each month's breakpoints to every stock-month row.
ret_df_q = pd.merge(ret_df, quantile_df, on='group_month')
display(ret_df_q)
# Size-sorted decile portfolios.
# Fix: interior bins use a strict lower bound (q_{i-1} < cap <= q_i) and p10
# uses cap > q9; the original's <= / >= on both sides double-counted stocks
# sitting exactly on a breakpoint.
portfolios = dict()
cols = ['secID','group_month','mkt_cap','Beta252','ret_month','exret']
portfolios['p1'] = ret_df_q.loc[ret_df_q['mkt_cap'] <= ret_df_q['q1'], cols].copy()
for i in range(2,10):
    idx = (ret_df_q[f'q{i-1}'] < ret_df_q['mkt_cap']) & (ret_df_q['mkt_cap'] <= ret_df_q[f'q{i}'])
    portfolios[f'p{i}'] = ret_df_q.loc[idx, cols].copy()
portfolios['p10'] = ret_df_q.loc[ret_df_q['mkt_cap'] > ret_df_q['q9'], cols].copy()
# Time-series average of each decile's monthly mean excess return.
for k in portfolios.keys():
    print(portfolios[k].groupby(['ret_month'])['exret'].mean().mean())
# Monthly cross-sectional mean excess return per size decile, then time-series
# means with Newey-West (HAC, 6 lags) t-stats and the 10-1 spread.
portfolios_crs_mean = dict()
for k in portfolios.keys():
    portfolios_crs_mean[k] = portfolios[k].groupby(['ret_month'])['exret'].mean()
portfolios_crs_mean['p1']
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    # OLS on a constant: coefficient = mean; HAC gives the NW t-stat.
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
mean_values
t_values
# Summary table: mean and t-value per size decile plus the spread.
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
             columns=mean_values.keys())
# average mktcap in each portfolio
# Time-series average of each decile's mean market cap -- monotonicity check.
for key in portfolios.keys():
    print(portfolios[key].groupby('group_month')['mkt_cap'].mean().mean())
# Number of distinct stocks per month in each size decile.
portfolios['p1'].groupby('group_month')['secID'].nunique()
pf_n_stks = pd.DataFrame()
for key, value in portfolios.items():
    pf_n_stks[key] = portfolios[key].groupby('group_month')['secID'].nunique()
pf_n_stks
pf_n_stks.plot()
portfolios['p1']
# Average Beta252 within each size decile (size-beta relation check).
pf_beta = pd.DataFrame()
for key, value in portfolios.items():
    pf_beta[key] = portfolios[key].groupby('group_month')['Beta252'].mean()
for col in pf_beta.columns:
    print(pf_beta.loc[:,col].mean())
排序方法:上月末市值按大小分成10组,考察下月收益率
# Repeat the size sort on the 2016-2020 sub-sample only.
ret_df_q_ = ret_df_q[(ret_df_q['group_month']>='2016') & (ret_df_q['group_month']<='2020')].copy()
# Fix: strict lower bounds on interior bins and on p10 (the original's
# <= / >= on both sides double-counted stocks exactly on a breakpoint).
portfolios = dict()
cols = ['secID','group_month','mkt_cap','Beta252','ret_month','exret']
portfolios['p1'] = ret_df_q_.loc[ret_df_q_['mkt_cap'] <= ret_df_q_['q1'], cols].copy()
for i in range(2,10):
    idx = (ret_df_q_[f'q{i-1}'] < ret_df_q_['mkt_cap']) & (ret_df_q_['mkt_cap'] <= ret_df_q_[f'q{i}'])
    portfolios[f'p{i}'] = ret_df_q_.loc[idx, cols].copy()
portfolios['p10'] = ret_df_q_.loc[ret_df_q_['mkt_cap'] > ret_df_q_['q9'], cols].copy()
# Monthly cross-sectional mean excess return per decile.
portfolios_crs_mean = dict()
for k in portfolios.keys():
    portfolios_crs_mean[k] = portfolios[k].groupby(['ret_month'])['exret'].mean()
# Newey-West (HAC, 6 lags) means and t-stats for the sub-period size deciles
# and the 10-1 spread.
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
             columns=mean_values.keys())
# Free the sub-sample copy.
del ret_df_q_
gc.collect()
# Look up Kweichow Moutai's secID and check its membership in the
# largest-cap decile p10.
stk_info[stk_info['secShortName']=='贵州茅台']
portfolios['p10'][portfolios['p10']['secID']=='600519.XSHG']
我们做 independent sort, mktcap 分2组,beta 分3组。mktcap的分位点是0.5,beta的分位点是0.3,0.7
# Breakpoints for the independent 2x3 double sort:
# market cap split at the median (0.5); beta split at 0.3 / 0.7.
keys = ['q_mktcap_1']
values = [0.5]
q_mktcap = dict(zip(keys, values))
keys = ['q_beta_1','q_beta_2']
values = [0.3, 0.7]
q_beta = dict(zip(keys, values))
# Monthly breakpoint tables, one column per quantile level.
mktcap_by_month = ret_df.groupby(['group_month'])['mkt_cap']
q_mktcap_df = pd.DataFrame({name: mktcap_by_month.quantile(level)
                            for name, level in q_mktcap.items()})
beta_by_month = ret_df.groupby(['group_month'])['Beta252']
q_beta_df = pd.DataFrame({name: beta_by_month.quantile(level)
                          for name, level in q_beta.items()})
q_mktcap_df
q_beta_df
# Attach both breakpoint sets to every stock-month row.
ret_df_q = pd.merge(ret_df, q_mktcap_df, on='group_month')
ret_df_q = pd.merge(ret_df_q, q_beta_df, on='group_month')
ret_df_q
# 2 (size) x 3 (beta) marginal groups for the independent double sort.
# Fix: upper groups now use strict lower bounds. The original used >= / <= on
# both sides of every breakpoint, so a stock exactly at the median market cap
# fell into BOTH size groups, and a stock at q_beta_1 / q_beta_2 fell into
# two beta groups.
portfolios_mktcap = dict()
portfolios_mktcap['mktcap1'] = ret_df_q.loc[ret_df_q['mkt_cap'] <= ret_df_q['q_mktcap_1'],
                                            ['secID','group_month','ret_month','exret','mkt_cap']]
portfolios_mktcap['mktcap2'] = ret_df_q.loc[ret_df_q['mkt_cap'] > ret_df_q['q_mktcap_1'],
                                            ['secID','group_month','ret_month','exret','mkt_cap']]
portfolios_beta = dict()
portfolios_beta['beta1'] = ret_df_q.loc[ret_df_q['Beta252'] <= ret_df_q['q_beta_1'],
                                        ['secID','group_month','ret_month','exret','Beta252']]
portfolios_beta['beta2'] = ret_df_q.loc[(ret_df_q['Beta252'] > ret_df_q['q_beta_1']) &
                                        (ret_df_q['Beta252'] <= ret_df_q['q_beta_2']),
                                        ['secID','group_month','ret_month','exret','Beta252']]
portfolios_beta['beta3'] = ret_df_q.loc[ret_df_q['Beta252'] > ret_df_q['q_beta_2'],
                                        ['secID','group_month','ret_month','exret','Beta252']]
portfolios_mktcap
portfolios_beta
# Intersect each beta group with each size group (independent double sort):
# 3 x 2 = 6 portfolios keyed 'beta{i}_mktcap{j}'.
portfolios = dict()
for beta_group in portfolios_beta.keys():
    for mktcap_group in portfolios_mktcap.keys():
        portfolios[f'{beta_group}_{mktcap_group}'] = pd.merge(portfolios_mktcap[mktcap_group],
                                                              portfolios_beta[beta_group][['secID','ret_month','Beta252']],
                                                              on=['secID','ret_month'])
portfolios
mean_portfolios_ret = dict()
for pf in portfolios.keys():
    mean_portfolios_ret[pf] = portfolios[pf].groupby('ret_month')['exret'].mean()
    # Print the shapes to check whether some month has an empty beta x mktcap intersection.
    print(mean_portfolios_ret[pf].shape)
mean_portfolios_ret
# Fast merge by stacking
# NOTE(review): np.vstack assumes all six series share one month index; the
# shape printout above is the only check of that assumption.
mean_portfolios_ret_df = pd.DataFrame(np.vstack([pf for pf in mean_portfolios_ret.values()])).T
mean_portfolios_ret_df.columns = mean_portfolios_ret.keys()
mean_portfolios_ret_df.index = mean_portfolios_ret['beta1_mktcap1'].index
mean_portfolios_ret_df
mean_portfolios_ret_df.plot()
# Within mktcap1, any difference in beta groups?
# Select the columns whose name ends in 'mktcap1' (the small-cap half).
pfs = mean_portfolios_ret_df.columns
cols = list(pfs[pfs.str[-7:] == 'mktcap1'])
mean_portfolios_ret_df[cols].plot()
# Same question within the large-cap half.
pfs = mean_portfolios_ret_df.columns
cols = list(pfs[pfs.str[-7:] == 'mktcap2'])
mean_portfolios_ret_df[cols].plot()
没有明显区别。下面看看在beta组内,不同的market cap有没有什么区别
# Within beta1, any difference in mktcap groups?
pfs = mean_portfolios_ret_df.columns
cols = list(pfs[pfs.str[:5] == 'beta1'])
mean_portfolios_ret_df[cols].plot()
小盘股上涨多,下跌也多
# Repeat for the middle and high beta groups.
pfs = mean_portfolios_ret_df.columns
cols = list(pfs[pfs.str[:5] == 'beta2'])
mean_portfolios_ret_df[cols].plot()
pfs = mean_portfolios_ret_df.columns
cols = list(pfs[pfs.str[:5] == 'beta3'])
mean_portfolios_ret_df[cols].plot()
# Newey-West adjustment
# Means and HAC t-stats for the six double-sorted portfolios.
# NOTE(review): maxlags=4 here, while every other HAC block in this file uses
# maxlags=6 -- confirm whether the difference is intentional.
mean_values = {}
t_values = {}
for k in mean_portfolios_ret.keys():
    y = mean_portfolios_ret[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=4)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
# Excess return in percent. NOTE(review): fm_reg below regresses 'exret',
# not 'exret100' -- this column appears unused in the visible code.
ret_df['exret100'] = ret_df['exret'] * 100
def fm_reg(df):
    """One monthly cross-sectional (Fama-MacBeth first-stage) regression.

    Regresses excess returns on Beta252 and size for a single month's
    cross-section.

    Parameters:
        df: dataframe for one ret_month containing at least the columns
            'exret', 'Beta252' and 'size'.
    Returns:
        np.ndarray [intercept, beta_coef, size_coef], or None when fewer
        than 15 complete observations are available.
    """
    # Fix: drop a row only when a column actually used in the regression is
    # missing. The original df.dropna() also dropped rows with NaNs in
    # unrelated columns, silently shrinking the cross-section.
    df_ = df.dropna(subset=['exret', 'Beta252', 'size'])
    # Too few observations for a meaningful cross-sectional estimate.
    if df_.shape[0] < 15:
        return None
    reg = LinearRegression().fit(y=df_.loc[:, 'exret'], X=df_.loc[:, ['Beta252', 'size']])
    return np.insert(reg.coef_, 0, reg.intercept_)
# Run the cross-sectional regression month by month; temp is a Series of
# coefficient arrays (or None) indexed by ret_month.
temp = ret_df.groupby('ret_month').apply(fm_reg)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept', 'beta_coef', 'size_coef']
reg_result_df
# Sub-sample: keep months from 2017 on.
reg_result_df_ = reg_result_df.loc["2017":].copy()
reg_result_df_
# Mean of coefs with NW adjustment
# Fama-MacBeth second stage: time-series mean of each monthly coefficient,
# with Newey-West (HAC, 6 lags) t-stats.
mean_values = {}
t_values = {}
for k in reg_result_df_.columns:
    y = reg_result_df_[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
# Winsorize the regressors cross-sectionally at the 0.5% / 99.5% quantiles
# within each month, then re-run the Fama-MacBeth regressions.
winsor_top = ret_df.groupby(['ret_month'])[['Beta252','size']].quantile(0.995)
winsor_bottom = ret_df.groupby(['ret_month'])[['Beta252','size']].quantile(0.005)
winsor_top.columns = ['beta_q995','size_q995']
winsor_bottom.columns = ['beta_q005','size_q005']
# Attach each month's bounds to every row, then cap/floor the regressors.
ret_df_q = pd.merge(ret_df, winsor_top, on='ret_month')
ret_df_q = pd.merge(ret_df_q, winsor_bottom, on='ret_month')
ret_df_q.loc[ret_df_q['Beta252'] > ret_df_q['beta_q995'],'Beta252'] = ret_df_q['beta_q995']
ret_df_q.loc[ret_df_q['Beta252'] < ret_df_q['beta_q005'],'Beta252'] = ret_df_q['beta_q005']
ret_df_q.loc[ret_df_q['size'] > ret_df_q['size_q995'],'size'] = ret_df_q['size_q995']
ret_df_q.loc[ret_df_q['size'] < ret_df_q['size_q005'],'size'] = ret_df_q['size_q005']
ret_df_q
# Monthly cross-sectional regressions on the winsorized data.
temp = ret_df_q.groupby('ret_month').apply(fm_reg)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept', 'beta_coef', 'size_coef']
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
    y = reg_result_df[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
样本:2007:01-2022:02, 全部A股
单排、双排、FM回归结论一致: