import numpy as np
import pandas as pd
import datetime as dt
pd.set_option('display.max_rows', 16)
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import gc
START = '2007-01-01'
END = '2022-03-31'
# Security Id
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
cond1 = (stk_info['exchangeCD'] == 'XSHE') | (stk_info['exchangeCD'] == 'XSHG')
cond2 = (stk_info['listStatusCD'] == 'L') | (stk_info['listStatusCD'] == 'DE')
stk_info = stk_info[cond1 & cond2].copy()
stk_id = stk_info['secID']
stk_info
st_df = DataAPI.SecSTGet(beginDate=START,endDate=END,secID=stk_id,field=['secID','tradeDate','STflg'],pandas="1")
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'],format="%Y-%m-%d")
shibor_df = DataAPI.MktIborGet(secID="Shibor1M.IRCN",beginDate=START,endDate=END,field=['secID','tradeDate','rate'],pandas="1")
shibor_df['rate'] = shibor_df['rate']*0.01/12 # percent p.a. -> monthly decimal rate
shibor_df['tradeDate'] = pd.to_datetime(shibor_df['tradeDate'])
shibor_df.drop('secID',axis=1,inplace=True)
shibor_df.rename(columns={'rate':'rf'},inplace=True)
shibor_df['ym'] = shibor_df['tradeDate'].dt.to_period('M')
shibor_df.sort_values('tradeDate',inplace=True)
shibor_df_m = shibor_df.groupby('ym',as_index=False).last()
shibor_df_m.drop('tradeDate',axis=1,inplace=True)
shibor_df_m
beta_df = pd.read_pickle('./data/beta_df.pkl')
beta_df['tradeDate'] = pd.to_datetime(beta_df['tradeDate'], format="%Y-%m-%d")
beta_df['ym'] = beta_df['tradeDate'].dt.to_period('M')
beta_df.drop(['Beta60','Beta120'],axis=1,inplace=True)
beta_df['Beta252'] = pd.to_numeric(beta_df['Beta252'])
# Winsorization
# up_q = 0.99999
# lower_q = 0.00001
# beta_df['Beta252_winsor'] = beta_df['Beta252'].clip(lower=beta_df['Beta252'].quantile(lower_q),upper=beta_df['Beta252'].quantile(up_q))
# Monthly
beta_df_m = beta_df.groupby(['secID','ym'],as_index=False)['Beta252'].last()
beta_df_m.rename(columns={'Beta252':'beta'},inplace=True)
beta_df_m
pb_df = pd.read_pickle('./data/pb_df.pkl')
pb_df['tradeDate'] = pd.to_datetime(pb_df['tradeDate'])
pb_df['PB'] = pd.to_numeric(pb_df['PB'])
pb_df['ym'] = pb_df['tradeDate'].dt.to_period('M')
pb_df.sort_values(['secID','tradeDate'],inplace=True)
pb_df = pb_df.groupby(['secID','ym'],as_index=False).last()
pb_df['bm'] = 1 / pb_df['PB'] # book-to-market is the inverse of price-to-book
pb_df.drop(['tradeDate','PB'],axis=1,inplace=True)
pb_df = pb_df[pb_df['bm'] >= 0]
pb_df
# stk_df = DataAPI.MktEqudAdjAfGet(secID=stk_id,beginDate=START,endDate=END,isOpen=1,
#                                  field=["secID","tradeDate",
#                                         'preClosePrice',"closePrice",
#                                         "negMarketValue",
#                                         "turnoverValue",'turnoverRate'],pandas="1")
# stk_df.to_pickle('./data/stk_df.pkl')
stk_df = pd.read_pickle('./data/stk_df.pkl')
stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
stk_df.sort_values(['secID','tradeDate'],inplace=True)
# drop ST stocks
print(stk_df.shape)
stk_df = pd.merge(stk_df, st_df, on=['secID','tradeDate'],how='left')
stk_df = stk_df[stk_df['STflg'].isna()].copy()
stk_df.drop('STflg',axis=1,inplace=True)
print(stk_df.shape)
stk_df
# # If the trading days are required to be consecutive, fill the missing days first. Note that this can
# # produce a much larger dataframe with daily data, especially if many dates are missing for some securities.
def fill_missing(df, full_dates, id_col='secID', date_col='tradeDate'):
    """
    Fill in the missing dates for stocks.
    Parameters:
        df: The dataframe. Could be a sub-dataframe created by "groupby".
            The dataframe must be sorted on the "date_col".
        full_dates: the unique dates covering all securities in the full dataframe.
            Needs to be sorted.
        id_col: the security id column.
        date_col: the dates column for the security.
    Returns:
        A dataframe with the missing dates filled with NA.
    """
    stk_id = df[id_col].unique()
    # Newer versions of pandas allow comparison between "Timestamp" and "datetime64":
    # date_start = np.where(full_dates == df[date_col].min())[0][0]
    # date_end = np.where(full_dates == df[date_col].max())[0][0]
    date_start = np.where(full_dates == df[date_col].min().to_datetime64())[0][0]
    date_end = np.where(full_dates == df[date_col].max().to_datetime64())[0][0]
    dates = full_dates[date_start:date_end+1]
    idx = pd.MultiIndex.from_product([stk_id,dates],
                                     names=(id_col,date_col))
    df = df.set_index([id_col,date_col]).reindex(idx).reset_index()
    return df
full_dates = np.sort(stk_df['tradeDate'].unique())
full_dates
# %%time
# stk_df = stk_df.groupby('secID').apply(fill_missing, full_dates=full_dates)
Amihud's (2002) illiquidity measure is perhaps the most widely known:

$$ illiq_i = \frac{1}{D} \sum^{D}_{d=1} \frac{|R_{i,d}|}{VOLD_{i,d}} $$

where $VOLD_{i,d}$ is the dollar volume, measured as "close_price * shares_traded".

Intuition: $illiq$ captures the price impact of trading volume. If a small volume moves the price significantly, the stock is quite illiquid; if even a large volume barely moves the price, it is liquid.

Period choice: Amihud (2002) uses daily data from month $t-11$ through month $t$ (12 months) as the measure for month $t$.

Below we construct both a one-month and a twelve-month version of the measure, using the actual dollar volume within each day.
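As a quick check on units, here is a minimal sketch with hypothetical numbers (not from the data above): a stock moving around 1% a day on the order of $10^7$ CNY of daily volume has $illiq$ on the order of $10^{-9}$, which is why the measure is rescaled by $10^6$ further below.
# Hypothetical daily returns and dollar volume (CNY) for one stock
ret_daily_toy = np.array([0.012, -0.008, 0.005, -0.015])
vold_daily_toy = np.array([2.0e7, 1.5e7, 3.0e7, 1.0e7])
# Amihud illiquidity: mean of |return| / dollar volume over the window
illiq_toy = np.mean(np.abs(ret_daily_toy) / vold_daily_toy)
print(illiq_toy)        # ~7.0e-10
print(illiq_toy * 1e6)  # ~7.0e-04, the scale used below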
stk_df
# stk_df.reset_index(drop=True, inplace=True)
# stk_df['ret_daily'] = stk_df['closePrice'] / stk_df['preClosePrice'] - 1
# stk_df['illiq_daily'] = abs(stk_df['ret_daily']) / stk_df['turnoverValue']
# stk_df['ym'] = stk_df['tradeDate'].dt.to_period('M')
# stk_df.to_pickle('./data/stk_df_filled.pkl')
stk_df = pd.read_pickle('./data/stk_df_filled.pkl')
stk_df
stk_df[(stk_df['secID']=='000001.XSHE') & (stk_df['tradeDate']>='2010-06-01') & (stk_df['tradeDate']<='2010-09-05')]
stk_df_m = stk_df.groupby(['secID','ym'],as_index=False).last()
stk_df_m['ret'] = stk_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1)
stk_df_m['size'] = np.log(stk_df_m['negMarketValue'])
stk_df_m.drop(['tradeDate','preClosePrice'],axis=1,inplace=True)
stk_df_m = pd.merge(stk_df_m, shibor_df_m, on='ym')
stk_df_m['exret'] = stk_df_m['ret'] - stk_df_m['rf']
stk_df_m.sort_values(['secID','ym'],inplace=True)
stk_df_m.rename(columns={'negMarketValue':'mktcap'},inplace=True)
stk_df_m
stk_unfilled_df = pd.read_pickle('./data/stk_df.pkl')
stk_unfilled_df['tradeDate'] = pd.to_datetime(stk_unfilled_df['tradeDate'], format='%Y-%m-%d')
stk_unfilled_df['ym'] = stk_unfilled_df['tradeDate'].dt.to_period('M')
stk_unfilled_df.sort_values(['secID','tradeDate'],inplace=True)
# drop ST stocks
print(stk_unfilled_df.shape)
stk_unfilled_df = pd.merge(stk_unfilled_df, st_df, on=['secID','tradeDate'],how='left')
stk_unfilled_df = stk_unfilled_df[stk_unfilled_df['STflg'].isna()].copy()
stk_unfilled_df.drop('STflg',axis=1,inplace=True)
print(stk_unfilled_df.shape)
# Monthly
stk_unfilled_df_m = stk_unfilled_df.groupby(['secID','ym'],as_index=False).last()
stk_unfilled_df_m['ret_mom'] = stk_unfilled_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1) # ret_mom is used only for the momentum calculation; the monthly ret computed earlier is kept for returns
stk_unfilled_df_m.sort_values(['secID','ym'],inplace=True)
stk_unfilled_df_m['1+ret_mom'] = stk_unfilled_df_m['ret_mom'] + 1
stk_unfilled_df_m
stk_unfilled_df_m['mom'] = stk_unfilled_df_m.groupby('secID').rolling(11,min_periods=9)['1+ret_mom'].apply(np.prod, raw=True).values - 1
stk_unfilled_df_m
stk_df_m = pd.merge(stk_df_m, stk_unfilled_df_m[['secID','ym','1+ret_mom']],on=['secID','ym'],how='left')
stk_df_m[(stk_df_m['secID']=='000001.XSHE') & (stk_df_m['ym']>='2010-06') & (stk_df_m['ym']<='2010-10')]
stk_df_m.loc[stk_df_m['1+ret_mom'].isna(),'1+ret_mom'] = 1 # fill missing months with 1 so the cumulative product is unaffected
stk_df_m['mom'] = stk_df_m.groupby('secID').rolling(11,min_periods=11)['1+ret_mom'].apply(np.prod, raw=True).values - 1
stk_df_m
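# Align timing: characteristics observed at month t are matched with the return over month t+1.
# 'rev' (short-term reversal) keeps month t's excess return; ret/rf/exret are shifted back one month;
# 'mom' is lagged one month, so the momentum window (months t-11 to t-1) skips the reversal month.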
stk_df_m['rev'] = stk_df_m['exret'].values
stk_df_m['ret'] = stk_df_m.groupby(['secID'])['ret'].shift(-1)
stk_df_m['rf'] = stk_df_m.groupby(['secID'])['rf'].shift(-1)
stk_df_m['exret'] = stk_df_m.groupby(['secID'])['exret'].shift(-1)
stk_df_m['ret_date'] = stk_df_m.groupby('secID')['ym'].shift(-1)
stk_df_m['mom'] = stk_df_m.groupby(['secID'])['mom'].shift()
stk_df_m['mom_date'] = stk_df_m.groupby('secID')['ym'].shift()
stk_df_m.drop(['ret_daily','turnoverValue','turnoverRate','illiq_daily','1+ret_mom'],axis=1,inplace=True)
stk_df_m
MIN_NOBS = 15
illiq_df = stk_df.groupby(['secID','ym'],as_index=False)['illiq_daily'].mean()
idx = stk_df.groupby(['secID','ym'],as_index=False)['ret_daily'].count()['ret_daily'] < MIN_NOBS
illiq_df.loc[idx, 'illiq_daily'] = np.nan
illiq_df.rename(columns={'illiq_daily':'illiq_1m'},inplace=True)
illiq_df
illiq_df['illiq_1m'] = illiq_df['illiq_1m'] * 1e6 # rescale for readability: illiq per million CNY traded
illiq_df
illiq_df['illiq_1m'].describe()
# # This code chooses 12 months, instead of trading days.
# def amihud_illiq(df, len_periods='12m', min_nobs=200):
#     """
#     df: year_month is set as the index, this is faster than using mask.
#     """
#     year_months = df.index.unique()
#     n = len(year_months)
#     illiq = np.full(shape=n, fill_value=np.nan)
#     start_notnan_month = int(len_periods[:-1])-1
#     for i in range(start_notnan_month,n):
#         df_ = df[str(year_months[i]-int(len_periods[:-1])+1) : str(year_months[i])].to_numpy()
#         if len(df_) < min_nobs:
#             continue
#         else:
#             illiq[i] = df_[:,-1].mean()
#     illiq_df = pd.DataFrame({'year_month': year_months, f'illiq{len_periods}': illiq})
#     return illiq_df
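# The 250-trading-day rolling mean below (min. 200 obs) approximates Amihud's 12-month window in
# trading days; the commented-out amihud_illiq above does the same thing with calendar months.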
stk_df['illiq_250d'] = stk_df.groupby(['secID']).rolling(250, min_periods=200)['illiq_daily'].mean().values
illiq_df['illiq_12m'] = stk_df.groupby(['secID','ym'])['illiq_250d'].last().values
illiq_df['illiq_12m'] = illiq_df['illiq_12m']*1e6
illiq_df
ret_df = pd.merge(stk_df_m[['secID','ret_date','ret','rf','exret','ym','mktcap','size','rev','mom_date','mom']],
beta_df_m[['secID','ym','beta']], on=['secID','ym'],how='left')
ret_df = pd.merge(ret_df, pb_df, on=['secID','ym'],how='left')
ret_df = pd.merge(ret_df, illiq_df, on=['secID','ym'],how='left')
ret_df
ret_df.columns.drop(['mom_date','mom'])
cols = list(ret_df.columns.drop(['mom_date','mom']))
cols = cols+['mom_date','mom']
cols
ret_df = ret_df[cols].copy()
ret_df[ret_df['secID']=='000004.XSHE']
gc.collect()
# Decile breakpoints: q1..q9 are the 10%..90% quantiles of illiq_12m within each month
q = dict()
keys = ['q'+str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q.update(zip(keys,values))
quantile_df = pd.DataFrame()
for key, value in q.items():
    quantile_df[key] = ret_df.groupby(['ym'])['illiq_12m'].quantile(value)
    # quantile_df[key] = ret_df.dropna().groupby(['ym'])['illiq_12m'].quantile(value)
ret_df_q = pd.merge(ret_df, quantile_df, on='ym')
# ret_df_q = pd.merge(ret_df.dropna(), quantile_df, on='ym')
portfolios = dict()
drop_cols = [col for col in ret_df_q.columns if col[0]=='q']
portfolios['p1'] = ret_df_q.loc[ret_df_q['illiq_12m'] <= ret_df_q['q1']].copy().drop(drop_cols, axis=1)
for i in range(2,10):
    # half-open interval so boundary observations fall into exactly one portfolio
    idx = (ret_df_q[f'q{i-1}'] < ret_df_q['illiq_12m']) & (ret_df_q['illiq_12m'] <= ret_df_q[f'q{i}'])
    portfolios[f'p{i}'] = ret_df_q.loc[idx].copy().drop(drop_cols, axis=1)
portfolios['p10'] = ret_df_q.loc[ret_df_q['illiq_12m'] > ret_df_q['q9']].copy().drop(drop_cols, axis=1)
portfolios_crs_mean = dict()
for k in portfolios.keys():
    portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()
mean_values = {}
t_values = {}
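# Regressing a return series on a constant recovers its time-series mean; HAC (Newey-West)
# standard errors make the t-statistic robust to autocorrelation and heteroskedasticity.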
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
columns=mean_values.keys())
q = dict()
keys = ['q'+str(i) for i in range(1, 10)]
values = np.arange(0.1, 1.0, 0.1)
q.update(zip(keys,values))
quantile_df = pd.DataFrame()
for key, value in q.items():
    quantile_df[key] = ret_df.groupby(['ym'])['illiq_1m'].quantile(value)
ret_df_q = pd.merge(ret_df, quantile_df, on='ym')
portfolios = dict()
drop_cols = [col for col in ret_df_q.columns if col[0]=='q']
portfolios['p1'] = ret_df_q.loc[ret_df_q['illiq_1m'] <= ret_df_q['q1']].copy().drop(drop_cols, axis=1)
for i in range(2,10):
    # half-open interval so boundary observations fall into exactly one portfolio
    idx = (ret_df_q[f'q{i-1}'] < ret_df_q['illiq_1m']) & (ret_df_q['illiq_1m'] <= ret_df_q[f'q{i}'])
    portfolios[f'p{i}'] = ret_df_q.loc[idx].copy().drop(drop_cols, axis=1)
portfolios['p10'] = ret_df_q.loc[ret_df_q['illiq_1m'] > ret_df_q['q9']].copy().drop(drop_cols, axis=1)
portfolios_crs_mean = dict()
for k in portfolios.keys():
    portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()
mean_values = {}
t_values = {}
for k in portfolios_crs_mean.keys():
    y = portfolios_crs_mean[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
# Portfolio 10-1
y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
const = np.full(shape=len(y), fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
mean_values['p10-p1'] = reg.params[0]
t_values['p10-p1'] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
columns=mean_values.keys())
pd.DataFrame(portfolios_crs_mean)[['p2','p9']].plot()
ret_df.rename(columns={'illiq_1m':'illiq'},inplace=True)
from myutils.factor_func import double_sort, factor, daily_factor
double_sort?
portfolios = double_sort(ret_df, sort1='illiq')
portfolios
mean_portfolios_ret = dict()
for pf in portfolios.keys():
    mean_portfolios_ret[pf] = portfolios[pf].groupby('ret_date')['exret'].mean()
    # print(mean_portfolios_ret[pf].shape) # check whether any month has no stocks in some illiq-size intersection
# Fast merge by stacking
mean_portfolios_ret_df = pd.DataFrame(np.vstack([pf for pf in mean_portfolios_ret.values()])).T
mean_portfolios_ret_df.columns = mean_portfolios_ret.keys()
mean_portfolios_ret_df.index = mean_portfolios_ret['illiq1_size1'].index
# Newey-West adjustment
mean_values = {}
t_values = {}
for k in mean_portfolios_ret.keys():
    y = mean_portfolios_ret[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=4)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
mean_portfolios_ret_df.plot()
# Within mktcap1, any difference in illiq groups?
pfs = mean_portfolios_ret_df.columns
cols = list(pfs[pfs.str[-5:] == 'size1'])
mean_portfolios_ret_df[cols].plot()
# Within mktcap2, any difference in illiq groups?
pfs = mean_portfolios_ret_df.columns
cols = list(pfs[pfs.str[-5:] == 'size2'])
mean_portfolios_ret_df[cols].plot()
def fm_reg(df,cols):
    """Fama-MacBeth step 1: cross-sectional regression of excess returns on `cols` for one month."""
    df_ = df.dropna()
    if df_.shape[0] < 15:
        return [None]*(len(cols)+1)
    reg = LinearRegression(fit_intercept=True).fit(y=df_.loc[:,'exret'], X=df_.loc[:,cols])
    return np.insert(reg.coef_, 0, reg.intercept_)
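# Fama-MacBeth step 2 (below): run the cross-sectional regression month by month, then average
# the monthly coefficients over time and compute Newey-West t-statistics on the averages.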
cols = ['illiq']
temp = ret_df.groupby('ret_date').apply(fm_reg, cols=cols)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept'] + cols
reg_result_df.dropna(inplace=True)
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
    y = reg_result_df[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
cols = ['beta','size','bm','mom','rev','illiq']
temp = ret_df.groupby('ret_date').apply(fm_reg, cols=cols)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept'] + cols
reg_result_df.dropna(inplace=True)
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
    y = reg_result_df[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
Previously, the analysis above was done without first excluding ST stocks; they were removed only at this point. The conclusions are similar either way.
# # st_df = DataAPI.SecSTGet(beginDate=START,endDate=END,secID=stk_id,field=['secID','tradeDate','STflg'],pandas="1")
# # st_df.to_pickle('./data/st_df.pkl')
# ret_df
# st_df = pd.read_pickle('./data/st_df.pkl')
# st_df['ym'] = pd.to_datetime(st_df['tradeDate']).dt.to_period('M')
# st_df.sort_values(['secID','tradeDate'],inplace=True)
# st_df = st_df.groupby(['secID','ym'],as_index=False).last()
# st_df.drop('tradeDate',axis=1,inplace=True)
# st_df
# st_df.loc[st_df['secID']=='000004.XSHE','STflg']
# st_df['STflg'].unique()
# st_df[st_df['STflg'] == '*']
# st_df[~st_df['STflg'].isin(['S'])]
# st_df = st_df[~st_df['STflg'].isin(['S'])]
# st_df
# ret_df[ret_df['secID']=='000004.XSHE']
# ret_df = pd.merge(ret_df,st_df,on=['secID','ym'],how='left')
# ret_df[(ret_df['secID'] =='000004.XSHE') & (ret_df['ym']=='2007-02')]
# ret_df[~ret_df['STflg'].isna()]
# ret_df_exST = ret_df[ret_df['STflg'].isna()]
# ret_df_ST = ret_df[~ret_df['STflg'].isna()]
# ret_df_exST.drop('STflg',axis=1,inplace=True)
# ret_df_ST.drop('STflg',axis=1,inplace=True)
# ret_df.drop('STflg',axis=1,inplace=True)
# q = dict()
# keys = ['q'+str(i) for i in range(1, 10)]
# values = np.arange(0.1, 1.0, 0.1)
# q.update(zip(keys,values))
# quantile_df = pd.DataFrame()
# for key, value in q.items():
# quantile_df[key] = ret_df_exST.groupby(['ym'])['illiq'].quantile(value)
# ret_df_exST_q = pd.merge(ret_df_exST, quantile_df, on='ym')
# portfolios = dict()
# drop_cols = [col for col in ret_df_exST_q.columns if col[0]=='q']
# portfolios['p1'] = ret_df_exST_q.loc[ret_df_exST_q['illiq'] <= ret_df_exST_q['q1']].copy().drop(drop_cols, axis=1)
# for i in range(2,10):
# idx = (ret_df_exST_q[f'q{i-1}'] <= ret_df_exST_q['illiq']) & (ret_df_exST_q['illiq'] <= ret_df_exST_q[f'q{i}'])
# portfolios[f'p{i}'] = ret_df_exST_q.loc[idx].copy().drop(drop_cols, axis=1)
# portfolios['p10'] = ret_df_exST_q.loc[ret_df_exST_q['illiq'] >= ret_df_exST_q['q9']].copy().drop(drop_cols, axis=1)
# portfolios_crs_mean = dict()
# for k in portfolios.keys():
# portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()
# mean_values = {}
# t_values = {}
# for k in portfolios_crs_mean.keys():
# y = portfolios_crs_mean[k]
# const = np.full(shape=len(y),fill_value=1)
# reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
# mean_values[k] = reg.params[0]
# t_values[k] = reg.tvalues[0]
# # Portfolio 10-1
# y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
# const = np.full(shape=len(y), fill_value=1)
# reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
# mean_values['p10-p1'] = reg.params[0]
# t_values['p10-p1'] = reg.tvalues[0]
# pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
# columns=mean_values.keys())
# portfolios = double_sort(ret_df_exST, sort1='illiq')
# mean_portfolios_ret = dict()
# for pf in portfolios.keys():
# mean_portfolios_ret[pf] = portfolios[pf].groupby('ret_date')['exret'].mean()
# print(mean_portfolios_ret[pf].shape) # check whether any month has no stocks in some illiq-size intersection
# # Fast merge by stacking
# mean_portfolios_ret_df = pd.DataFrame(np.vstack([pf for pf in mean_portfolios_ret.values()])).T
# mean_portfolios_ret_df.columns = mean_portfolios_ret.keys()
# mean_portfolios_ret_df.index = mean_portfolios_ret['illiq1_size1'].index
# # Newey-West adjustment
# mean_values = {}
# t_values = {}
# for k in mean_portfolios_ret.keys():
# y = mean_portfolios_ret[k]
# const = np.full(shape=len(y),fill_value=1)
# reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=4)
# mean_values[k] = reg.params[0]
# t_values[k] = reg.tvalues[0]
# pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
# cols = ['beta','size','bm','mom','rev','illiq']
# temp = ret_df_exST.groupby('ret_date').apply(fm_reg, cols=cols)
# reg_result_df = pd.DataFrame(temp.values.tolist())
# reg_result_df.index=temp.index
# reg_result_df.columns = ['intercept'] + cols
# reg_result_df.dropna(inplace=True)
# # Mean of coefs with NW adjustment
# mean_values = {}
# t_values = {}
# for k in reg_result_df.columns:
# y = reg_result_df[k]
# const = np.full(shape=len(y),fill_value=1)
# reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
# mean_values[k] = reg.params[0]
# t_values[k] = reg.tvalues[0]
# pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
illiq_df = factor(ret_df, sort1='illiq', long_only=False)
illiq_df
factors_df = pd.read_pickle('./data/factors/ff3_rev.pkl')
factors_df.index = factors_df.index.to_period('M')
factors_df = pd.merge(factors_df,illiq_df,on='ret_date')
factors_df
rf_df = ret_df[['ret_date','rf']].drop_duplicates().sort_values('ret_date').dropna()
factors_df = pd.merge(rf_df,factors_df,on='ret_date').set_index('ret_date')
((factors_df+1).cumprod()*100).plot()
((factors_df['2015':]+1).cumprod()*100).plot()
(np.log(1 + factors_df).cumsum()*100).plot()
illiq_long_df = factor(ret_df, sort1='illiq')
illiq_long_df
factors_long_df = pd.read_pickle('./data/factors/ff3_rev_long_only.pkl')
factors_long_df.index = factors_long_df.index.to_period('M')
factors_long_df = pd.merge(factors_long_df,illiq_long_df,on='ret_date')
factors_long_df = pd.merge(rf_df,factors_long_df,on='ret_date').set_index('ret_date')
factors_long_df.rename(columns={'illiq':'illiq_long'},inplace=True)
factors_long_df
((factors_long_df+1).cumprod()*100).plot()
((1 + factors_long_df['2018':]).cumprod()*100).plot()
stk_df
vol_df = stk_df.groupby(['secID','ym'])['ret_daily'].std()
vol_df
vol_df = vol_df.to_frame()
vol_df.columns = ['vol']
vol_df.reset_index(inplace=True)
MIN_NOBS = 15
idx = stk_df.groupby(['secID','ym'],as_index=False)['tradeDate'].count()['tradeDate'] < MIN_NOBS
vol_df.loc[idx, 'vol'] = np.nan
vol_df
Idiosyncratic volatility (ivol) is the volatility of the residuals left after regressing returns on existing factor returns. Traditional theory treats the residual as diversifiable risk that should not be priced; even if the factor model is imperfect and the residual still contains some unexplained systematic risk, the risk premium on that part should be positive. Yet US data show that lower volatility predicts higher future returns, which is a very interesting anomaly.
The best-known reference is Ang, Hodrick, Xing, and Zhang (2006).
Each month we estimate ivol with a factor model and use it for sorting, as defined below.
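Concretely, for stock $i$ in month $m$, ivol is the standard deviation of the daily residuals from the factor regression (this is what the `idiovol` functions below compute):

$$ r_{i,d} - r_{f,d} = \alpha_i + \boldsymbol{\beta}_i^{\prime} \boldsymbol{f}_d + \varepsilon_{i,d}, \qquad ivol_{i,m} = \mathrm{std}\big(\hat{\varepsilon}_{i,d}\big), \quad d \in m $$

where $\boldsymbol{f}_d$ collects the daily factor returns (here: exmktret, size, bm, mom, rev, illiq).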
pd.read_pickle('./data/factors/factors_daily.pkl')
shibor1d = DataAPI.MktIborGet(secID="Shibor1D.IRCN",beginDate=START,endDate=END,field=['tradeDate','rate'],pandas="1")
shibor1d['tradeDate'] = pd.to_datetime(shibor1d['tradeDate'])
shibor1d['rate'] = shibor1d['rate'] * 0.01 / 365
shibor1d.rename(columns={'rate':'rf'},inplace=True)
factors_daily = pd.read_pickle('./data/factors/factors_daily.pkl')
illiq_daily = daily_factor(ret_df, stock_df=stk_df,sort1='illiq',long_only=False)
illiq_daily
illiq_daily.name = 'illiq'
factors_daily = pd.merge(factors_daily, illiq_daily, on='tradeDate')
factors_daily
def idiovol_np(df, factor_cols, len_periods='1m', min_nobs=15):
    """
    df: year_month is set as index
    """
    year_months = df.index.unique()
    n = len(year_months)
    idiovol = np.full(shape=n, fill_value=np.nan)
    start_notnan_month = int(len_periods[:-1])-1
    for i in range(start_notnan_month,n):
        df_ = df.loc[str(year_months[i]-int(len_periods[:-1])+1) : str(year_months[i]),['exret']+factor_cols].dropna().to_numpy()
        if len(df_) < min_nobs:
            continue
        else:
            y_ = df_[:,0]
            X_ = df_[:,1:]
            reg = LinearRegression().fit(y=y_, X=X_)
            res = y_ - reg.predict(X_)
            idiovol[i] = np.std(res)
    idiovol_df = pd.DataFrame({'ym': year_months, f'idiovol{len_periods}': idiovol})
    return idiovol_df
def idiovol(df,factor_cols,min_nobs=15):
    df = df.dropna()
    y = df.loc[:,'exret']
    X = df.loc[:,factor_cols]
    nobs = len(X)
    if nobs < min_nobs:
        idiovol = np.nan
    else:
        reg = LinearRegression().fit(y=y,X=X)
        res = y - reg.predict(X)
        idiovol = np.std(res)
    return idiovol
reg_df = pd.merge(stk_df[['secID','tradeDate','ret_daily']],factors_daily,
on=['tradeDate'])
reg_df
reg_df.sort_values(['secID','tradeDate'],inplace=True)
reg_df
reg_df['exret'] = reg_df['ret_daily'] - reg_df['rf']
reg_df['ym'] = reg_df['tradeDate'].dt.to_period('M')
# Timing test: compare the two implementations on a random subsample of 50 stocks
ids = reg_df['secID'].unique()
ids_ = np.random.choice(ids, 50)
temp = reg_df[reg_df['secID'].isin(ids_)]
temp.set_index('ym',inplace=True)
%%time
result1 = temp.groupby(['secID','ym']).apply(idiovol, factor_cols=['exmktret','size','bm','mom','rev','illiq'])
%%time
result2 = temp.groupby(['secID']).apply(idiovol_np, factor_cols=['exmktret','size','bm','mom','rev','illiq'])
result1
result2
result1 = result1.to_frame().reset_index()
result1.columns = ['secID','ym','r1']
result2 = result2.reset_index().drop('level_1',axis=1)
result = pd.merge(result1, result2, on=['secID','ym'])
result
(result['r1'] - result['idiovol1m']).sum()
result['r1'].to_numpy()
result['idiovol1m'].to_numpy()
np.array_equal(result['r1'].to_numpy(), result['idiovol1m'].to_numpy())
np.allclose(result['r1'].to_numpy(), result['idiovol1m'].to_numpy())
result.fillna(0, inplace=True)
np.array_equal(result['r1'].to_numpy(), result['idiovol1m'].to_numpy())
np.allclose(result['r1'].to_numpy(), result['idiovol1m'].to_numpy())
reg_df.set_index('ym',inplace=True)
reg_df
# %%time
# ivol = reg_df.groupby(['secID']).apply(idiovol_np, factor_cols=['exmktret','size','bm','mom','rev','illiq'])
# # Takes about 26min 40s to execute
# ivol = ivol.reset_index().drop('level_1',axis=1)
# ivol.to_pickle('./data/idiovol1m.pkl')
ivol = pd.read_pickle('./data/idiovol1m.pkl')
vol_df = pd.merge(vol_df,ivol,on=['secID','ym'])
vol_df
vol_df.loc[vol_df['ym']>='2018',['ym','vol']].boxplot(by='ym',showmeans=True)
vol_df['vol_clip'] = vol_df['vol'].clip(lower=vol_df['vol'].quantile(0.001),
upper=vol_df['vol'].quantile(0.999))
vol_df['ivol_clip'] = vol_df['idiovol1m'].clip(lower=vol_df['idiovol1m'].quantile(0.001),
upper=vol_df['idiovol1m'].quantile(0.999))
vol_df.loc[vol_df['ym']>='2018',['ym','vol_clip']].boxplot(by='ym',showmeans=True,rot=45)
vol_df.loc[vol_df['ym']>='2018',['ym','ivol_clip']].boxplot(by='ym',showmeans=True,rot=45)
vol_df.loc[:,'vol':'ivol_clip'].describe()
vol_df.rename(columns={'idiovol1m':'ivol'},inplace=True)
ret_df = pd.merge(ret_df,vol_df,
on=['secID','ym'])
ret_df
def single_sort(df, sort):
    """Decile sort on `sort` within each month; reports mean excess returns and NW t-stats, including the 10-1 spread."""
    q = dict()
    keys = ['q'+str(i) for i in range(1, 10)]
    values = np.arange(0.1, 1.0, 0.1)
    q.update(zip(keys,values))
    quantile_df = pd.DataFrame()
    for key, value in q.items():
        quantile_df[key] = df.groupby(['ym'])[sort].quantile(value)
    df_q = pd.merge(df, quantile_df, on='ym')
    portfolios = dict()
    drop_cols = [col for col in df_q.columns if col[0]=='q']
    portfolios['p1'] = df_q.loc[df_q[sort] <= df_q['q1']].copy().drop(drop_cols, axis=1)
    for i in range(2,10):
        # half-open interval so boundary observations fall into exactly one portfolio
        idx = (df_q[f'q{i-1}'] < df_q[sort]) & (df_q[sort] <= df_q[f'q{i}'])
        portfolios[f'p{i}'] = df_q.loc[idx].copy().drop(drop_cols, axis=1)
    portfolios['p10'] = df_q.loc[df_q[sort] > df_q['q9']].copy().drop(drop_cols, axis=1)
    portfolios_crs_mean = dict()
    for k in portfolios.keys():
        portfolios_crs_mean[k] = portfolios[k].groupby(['ret_date'])['exret'].mean()
    mean_values = {}
    t_values = {}
    for k in portfolios_crs_mean.keys():
        y = portfolios_crs_mean[k]
        const = np.full(shape=len(y),fill_value=1)
        reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
        mean_values[k] = reg.params[0]
        t_values[k] = reg.tvalues[0]
    # Portfolio 10-1
    y = portfolios_crs_mean['p10'] - portfolios_crs_mean['p1']
    const = np.full(shape=len(y), fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values['p10-p1'] = reg.params[0]
    t_values['p10-p1'] = reg.tvalues[0]
    display(pd.DataFrame([mean_values.values(),t_values.values()],index=['mean','t-value'],
                         columns=mean_values.keys()))
single_sort(ret_df, 'vol')
portfolios = double_sort(ret_df, sort1='vol')
mean_portfolios_ret = dict()
for pf in portfolios.keys():
    mean_portfolios_ret[pf] = portfolios[pf].groupby('ret_date')['exret'].mean()
    print(mean_portfolios_ret[pf].shape) # check whether any month has no stocks in some vol-size intersection
# Fast merge by stacking
mean_portfolios_ret_df = pd.DataFrame(np.vstack([pf for pf in mean_portfolios_ret.values()])).T
mean_portfolios_ret_df.columns = mean_portfolios_ret.keys()
mean_portfolios_ret_df.index = mean_portfolios_ret['vol1_size1'].index
# Newey-West adjustment
mean_values = {}
t_values = {}
for k in mean_portfolios_ret.keys():
    y = mean_portfolios_ret[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=4)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
cols = ['beta','size','bm','mom','rev','illiq','vol']
temp = ret_df.groupby('ret_date').apply(fm_reg, cols=cols)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept'] + cols
reg_result_df.dropna(inplace=True)
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
    y = reg_result_df[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
cols = ['beta','size','bm','mom','rev','illiq','vol_clip']
temp = ret_df.groupby('ret_date').apply(fm_reg, cols=cols)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept'] + cols
reg_result_df.dropna(inplace=True)
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
    y = reg_result_df[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
single_sort(ret_df, 'ivol')
portfolios = double_sort(ret_df, sort1='ivol')
mean_portfolios_ret = dict()
for pf in portfolios.keys():
    mean_portfolios_ret[pf] = portfolios[pf].groupby('ret_date')['exret'].mean()
    print(mean_portfolios_ret[pf].shape) # check whether any month has no stocks in some ivol-size intersection
# Fast merge by stacking
mean_portfolios_ret_df = pd.DataFrame(np.vstack([pf for pf in mean_portfolios_ret.values()])).T
mean_portfolios_ret_df.columns = mean_portfolios_ret.keys()
mean_portfolios_ret_df.index = mean_portfolios_ret['ivol1_size1'].index
# Newey-West adjustment
mean_values = {}
t_values = {}
for k in mean_portfolios_ret.keys():
    y = mean_portfolios_ret[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=4)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
cols = ['beta','size','bm','mom','rev','illiq','ivol_clip']
temp = ret_df.groupby('ret_date').apply(fm_reg, cols=cols)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept'] + cols
reg_result_df.dropna(inplace=True)
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
    y = reg_result_df[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
cols = ['beta','size','bm','mom','rev','illiq','ivol_clip']
temp = ret_df[ret_df['ret_date']>='2015'].groupby('ret_date').apply(fm_reg, cols=cols)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept'] + cols
reg_result_df.dropna(inplace=True)
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
    y = reg_result_df[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
cols = ['beta','size','bm','mom','rev','illiq','ivol_clip']
temp = ret_df[ret_df['ret_date']>='2018'].groupby('ret_date').apply(fm_reg, cols=cols)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept'] + cols
reg_result_df.dropna(inplace=True)
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
    y = reg_result_df[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
ivol_factor = factor(ret_df, sort1='ivol',long_high=False,long_only=False)
ivol_factor
factors_df = pd.merge(factors_df,ivol_factor,on='ret_date',how='left')
((factors_df+1).cumprod()*100).plot()
((factors_df['2018':]+1).cumprod()*100).plot()
factors_df.to_pickle('./data/factors/factors_all.pkl')
ret_df
ret_df.to_pickle('./data/factor_exposure/all_exposure.pkl')
cols = ['size','bm','rev','illiq','ivol']
monthend = stk_df[['secID','tradeDate','ym']].groupby(['secID','ym'],as_index=False).last()
monthend.rename(columns={'tradeDate':'month_end'},inplace=True)
ret_df = pd.merge(ret_df, monthend,on=['secID','ym'],how='left')
ret_df['month_end'] = ret_df['ym'].dt.to_timestamp() + pd.tseries.offsets.MonthEnd(0) # use the calendar month end (overwrites the trading-day month end merged above)
ret_df[['secID','month_end','rev']].dropna().pivot(index='month_end',
columns='secID',values='rev')
temp = ret_df[['secID','month_end','size']].dropna().pivot(index='month_end',
columns='secID',values='size').reset_index()
temp['month_end'] = temp['month_end'].dt.strftime('%Y/%m/%d')
temp.set_index('month_end')
for c in cols:
    temp = ret_df[['secID','month_end',c]].dropna().pivot(index='month_end',columns='secID',values=c)
    temp.reset_index(inplace=True)
    temp['month_end'] = temp['month_end'].dt.strftime('%Y/%m/%d')
    temp.set_index('month_end',inplace=True)
    temp.to_csv(f'./data/factor_exposure/{c}_exposure.csv')
factors_daily = pd.read_pickle('./data/factors/factors_daily_long_only.pkl')
illiq_daily = daily_factor(ret_df, stock_df=stk_df,sort1='illiq',long_only=True)
illiq_daily.name = 'illiq_long'
factors_daily = pd.merge(factors_daily, illiq_daily, on='tradeDate')
reg_df = pd.merge(stk_df[['secID','tradeDate','ret_daily']],factors_daily,
on=['tradeDate'])
reg_df.sort_values(['secID','tradeDate'],inplace=True)
reg_df['exret'] = reg_df['ret_daily'] - reg_df['rf']
reg_df['ym'] = reg_df['tradeDate'].dt.to_period('M')
reg_df.set_index('ym',inplace=True)
reg_df
gc.collect()
# import sys
# def sizeof_fmt(num, suffix='B'):
# ''' by Fred Cirera, https://stackoverflow.com/a/1094933/1870254, modified'''
# for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
# if abs(num) < 1024.0:
# return "%3.1f %s%s" % (num, unit, suffix)
# num /= 1024.0
# return "%.1f %s%s" % (num, 'Yi', suffix)
# for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
# key= lambda x: -x[1])[:10]:
# print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))
# del stk_unfilled_df, beta_df
# %%time
# ivol = reg_df.groupby(['secID']).apply(idiovol_np, factor_cols=['exmktret','size_long','bm_long','mom_long','rev_long','illiq_long'])
# # Takes about 27min 25s to execute
# ivol = ivol.reset_index().drop('level_1',axis=1)
# display(ivol)
# ivol.to_pickle('./data/idiovol1m_long_only.pkl')
vol_df = stk_df.groupby(['secID','ym'])['ret_daily'].std()
vol_df = vol_df.to_frame()
vol_df.columns = ['vol']
vol_df.reset_index(inplace=True)
MIN_NOBS = 15
idx = stk_df.groupby(['secID','ym'],as_index=False)['tradeDate'].count()['tradeDate'] < MIN_NOBS
vol_df.loc[idx, 'vol'] = np.nan
ivol = pd.read_pickle('./data/idiovol1m_long_only.pkl')
vol_df = pd.merge(vol_df,ivol,on=['secID','ym'])
vol_df['vol_clip'] = vol_df['vol'].clip(lower=vol_df['vol'].quantile(0.001),
upper=vol_df['vol'].quantile(0.999))
vol_df['ivol_clip'] = vol_df['idiovol1m'].clip(lower=vol_df['idiovol1m'].quantile(0.001),
upper=vol_df['idiovol1m'].quantile(0.999))
vol_df.rename(columns={'idiovol1m':'ivol'},inplace=True)
try:
    ret_df.drop(['vol','ivol','vol_clip','ivol_clip'],axis=1,inplace=True)
except KeyError:
    pass
ret_df = pd.merge(ret_df,vol_df,
on=['secID','ym'])
ret_df
single_sort(ret_df, 'ivol')
cols = ['beta','size','bm','mom','rev','illiq','ivol_clip']
temp = ret_df.groupby('ret_date').apply(fm_reg, cols=cols)
reg_result_df = pd.DataFrame(temp.values.tolist())
reg_result_df.index=temp.index
reg_result_df.columns = ['intercept'] + cols
reg_result_df.dropna(inplace=True)
# Mean of coefs with NW adjustment
mean_values = {}
t_values = {}
for k in reg_result_df.columns:
    y = reg_result_df[k]
    const = np.full(shape=len(y),fill_value=1)
    reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HAC', maxlags=6)
    mean_values[k] = reg.params[0]
    t_values[k] = reg.tvalues[0]
pd.DataFrame([mean_values.values(),t_values.values()],index=['ret_mean','t_values'],columns=mean_values.keys())
ivol_factor = factor(ret_df, sort1='ivol',long_high=False,long_only=True)
factors_long_df = pd.merge(factors_long_df,ivol_factor,on='ret_date',how='left')
((factors_long_df+1).cumprod()*100).plot()
((factors_long_df['2018':]+1).cumprod()*100).plot()
factors_long_df.rename(columns={'ivol':'ivol_long'},inplace=True)
factors_long_df
factors_long_df.to_pickle('./data/factors/factors_all_long_only.pkl')