import numpy as np
import pandas as pd
import datetime as dt
pd.set_option('display.max_rows', 16)
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import gc
plt.rcParams['figure.figsize'] = (16.0, 9.0)
START = '2007-01-01'
END = '2022-03-31'
# Security Id
# Equity security master from the data vendor; keep only Shenzhen (XSHE) and
# Shanghai (XSHG) stocks that are listed ('L') or delisted ('DE').
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
cond1 = (stk_info['exchangeCD'] == 'XSHE') | (stk_info['exchangeCD'] == 'XSHG')
cond2 = (stk_info['listStatusCD'] == 'L') | (stk_info['listStatusCD'] == 'DE')
stk_info = stk_info[cond1 & cond2].copy()
stk_id = stk_info['secID']
# ST (special treatment) flags per stock/day; used later to drop ST stocks.
st_df = DataAPI.SecSTGet(beginDate=START,endDate=END,secID=stk_id,field=['secID','tradeDate','STflg'],pandas="1")
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'],format="%Y-%m-%d")
# 1-month Shibor as the risk-free rate: annualized percent -> monthly decimal.
shibor_df = DataAPI.MktIborGet(secID="Shibor1M.IRCN",beginDate=START,endDate=END,field=['secID','tradeDate','rate'],pandas="1")
shibor_df['rate'] = shibor_df['rate']*0.01/12
shibor_df['tradeDate'] = pd.to_datetime(shibor_df['tradeDate'])
shibor_df.drop('secID',axis=1,inplace=True)
shibor_df.rename(columns={'rate':'rf'},inplace=True)
shibor_df['ym'] = shibor_df['tradeDate'].dt.to_period('M')
shibor_df.sort_values('tradeDate',inplace=True)
# Monthly risk-free rate: last daily observation within each year-month.
shibor_df_m = shibor_df.groupby('ym',as_index=False).last()
shibor_df_m.drop('tradeDate',axis=1,inplace=True)
shibor_df_m
# Pre-downloaded market beta estimates per stock/day.
beta_df = pd.read_pickle('./data/beta_df.pkl')
beta_df['tradeDate'] = pd.to_datetime(beta_df['tradeDate'], format="%Y-%m-%d")
beta_df['ym'] = beta_df['tradeDate'].dt.to_period('M')
# Keep only the 252-day beta; drop the shorter estimation windows.
beta_df.drop(['Beta60','Beta120'],axis=1,inplace=True)
beta_df['Beta252'] = pd.to_numeric(beta_df['Beta252'])
# Winsorization
# up_q = 0.99999
# lower_q = 0.00001
# beta_df['Beta252_winsor'] = beta_df['Beta252'].clip(lower=beta_df['Beta252'].quantile(lower_q),upper=beta_df['Beta252'].quantile(up_q))
# Monthly
# Last daily beta within each stock-month becomes the monthly beta.
beta_df_m = beta_df.groupby(['secID','ym'],as_index=False)['Beta252'].last()
beta_df_m.rename(columns={'Beta252':'beta'},inplace=True)
beta_df_m
# Price-to-book panel -> monthly book-to-market (bm = 1/PB).
pb_df = pd.read_pickle('./data/pb_df.pkl')
pb_df['tradeDate'] = pd.to_datetime(pb_df['tradeDate'])
pb_df['PB'] = pd.to_numeric(pb_df['PB'])
pb_df['ym'] = pb_df['tradeDate'].dt.to_period('M')
pb_df.sort_values(['secID','tradeDate'],inplace=True)
pb_df = pb_df.groupby(['secID','ym'],as_index=False).last()
pb_df['bm'] = 1 / pb_df['PB']
pb_df.drop(['tradeDate','PB'],axis=1,inplace=True)
# Drop negative book equity (negative bm) observations.
pb_df = pb_df[pb_df['bm'] >= 0]
pb_df
# stk_df = DataAPI.MktEqudAdjAfGet(secID=stk_id,beginDate=START,endDate=END,isOpen=1,
# field=["secID","tradeDate",
# 'preClosePrice',"closePrice",
# "negMarketValue",
# "turnoverValue",'turnoverRate'],pandas="1")
# stk_df.to_pickle('./data/stk_df.pkl')
# stk_df = pd.read_pickle('./data/stk_df.pkl')
# stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
# stk_df.sort_values(['secID','tradeDate'],inplace=True)
# # drop ST stocks
# print(stk_df.shape)
# stk_df = pd.merge(stk_df, st_df, on=['secID','tradeDate'],how='left')
# stk_df = stk_df[stk_df['STflg'].isna()].copy()
# stk_df.drop('STflg',axis=1,inplace=True)
# print(stk_df.shape)
# # # If the trading days are required to be consecutive, fill missing days first. This could possibly produce a much larger df when using
# ## daily data, and if the missing dates are a lot for some securities
# def fill_missing(df, full_dates, id_col='secID', date_col='tradeDate'):
# """
# This function fills the missing dates for stocks.
# Parameters:
# df: The dataframe. Could be a sub-dataframe created by "groupby".
# The dataframe must be sorted on the "date_col".
# full_dates: the unique dates covering all securities in the full dataframe.
# Need to be sorted.
# id_col: the security id.
# date_col: the dates column for the security
# Returns:
# A dataframe with the missing dates filled with NA.
# """
# stk_id = df[id_col].unique()
# # Newer version of pandas will allow comparison between "Timestamp" and "datetime64"
# # date_start = np.where(full_dates == df[date_col].min())[0][0]
# # date_end = np.where(full_dates == df[date_col].max())[0][0]
# date_start = np.where(full_dates == df[date_col].min().to_datetime64())[0][0]
# date_end = np.where(full_dates == df[date_col].max().to_datetime64())[0][0]
# dates = full_dates[date_start:date_end+1]
# idx = pd.MultiIndex.from_product([stk_id,dates],
# names=(id_col,date_col))
# df = df.set_index([id_col,date_col]).reindex(idx).reset_index()
# return df
# full_dates = np.sort(stk_df['tradeDate'].unique())
# %%time
# stk_df = stk_df.groupby('secID').apply(fill_missing, full_dates=full_dates)
# stk_df.reset_index(drop=True, inplace=True)
# stk_df['ret_daily'] = stk_df['closePrice'] / stk_df['preClosePrice'] - 1
# stk_df['illiq_daily'] = abs(stk_df['ret_daily']) / stk_df['turnoverValue']
# stk_df['ym'] = stk_df['tradeDate'].dt.to_period('M')
# stk_df.to_pickle('./data/stk_df_filled.pkl')
# Daily stock panel with suspension dates filled (NaN rows), built by the
# commented-out preprocessing above.
stk_df = pd.read_pickle('./data/stk_df_filled.pkl')
stk_df
# Monthly panel: last daily record within each stock-month.
stk_df_m = stk_df.groupby(['secID','ym'],as_index=False).last()
# Monthly return from month-end closes (NaN for each stock's first month).
# NOTE(review): relies on the groupby-apply result aligning back onto
# stk_df_m's index — confirm under the installed pandas version.
stk_df_m['ret'] = stk_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1)
# Log free-float market cap as the size characteristic.
stk_df_m['size'] = np.log(stk_df_m['negMarketValue'])
stk_df_m.drop(['tradeDate','preClosePrice'],axis=1,inplace=True)
# Attach the monthly risk-free rate; excess return = ret - rf.
stk_df_m = pd.merge(stk_df_m, shibor_df_m, on='ym')
stk_df_m['exret'] = stk_df_m['ret'] - stk_df_m['rf']
stk_df_m.sort_values(['secID','ym'],inplace=True)
stk_df_m.rename(columns={'negMarketValue':'mktcap'},inplace=True)
stk_df_m
# Sanity check: one stock over 2010.
stk_df_m[stk_df_m['secID'] == '000001.XSHE'].set_index('ym').loc['2010-01':'2010-12']
# Suspension periods are filled with NaN; the first month after trading
# resumes also has a NaN monthly return.
# Reload the UNFILLED daily panel (no suspension padding) for momentum inputs.
stk_unfilled_df = pd.read_pickle('./data/stk_df.pkl')
stk_unfilled_df['tradeDate'] = pd.to_datetime(stk_unfilled_df['tradeDate'], format='%Y-%m-%d')
stk_unfilled_df['ym'] = stk_unfilled_df['tradeDate'].dt.to_period('M')
stk_unfilled_df.sort_values(['secID','tradeDate'],inplace=True)
# drop ST stocks
print(stk_unfilled_df.shape)
stk_unfilled_df = pd.merge(stk_unfilled_df, st_df, on=['secID','tradeDate'],how='left')
stk_unfilled_df = stk_unfilled_df[stk_unfilled_df['STflg'].isna()].copy()
stk_unfilled_df.drop('STflg',axis=1,inplace=True)
print(stk_unfilled_df.shape)
# Monthly
stk_unfilled_df_m = stk_unfilled_df.groupby(['secID','ym'],as_index=False).last()
stk_unfilled_df_m['ret_mom'] = stk_unfilled_df_m.groupby('secID')['closePrice'].apply(lambda x: x / x.shift() - 1) # ret_mom feeds only the momentum signal; the monthly 'ret' computed earlier is kept for returns
stk_unfilled_df_m.sort_values(['secID','ym'],inplace=True)
stk_unfilled_df_m['1+ret_mom'] = stk_unfilled_df_m['ret_mom'] + 1
# 11-month cumulative gross return (momentum signal); up to 2 missing months tolerated.
stk_unfilled_df_m['mom'] = stk_unfilled_df_m.groupby('secID').rolling(11,min_periods=9)['1+ret_mom'].apply(np.prod, raw=True).values - 1
stk_df_m = pd.merge(stk_df_m, stk_unfilled_df_m[['secID','ym','1+ret_mom']],on=['secID','ym'],how='left')
stk_df_m.loc[stk_df_m['1+ret_mom'].isna(),'1+ret_mom'] = 1 # fill missing with 1 so the rolling product is unaffected
stk_df_m['mom'] = stk_df_m.groupby('secID').rolling(11,min_periods=11)['1+ret_mom'].apply(np.prod, raw=True).values - 1
# Short-term reversal signal: current-month excess return.
stk_df_m['rev'] = stk_df_m['exret'].values
# Shift returns back one month so each row pairs THIS month's characteristics
# with NEXT month's return (no look-ahead at formation).
stk_df_m['ret'] = stk_df_m.groupby(['secID'])['ret'].shift(-1)
stk_df_m['rf'] = stk_df_m.groupby(['secID'])['rf'].shift(-1)
stk_df_m['exret'] = stk_df_m.groupby(['secID'])['exret'].shift(-1)
stk_df_m['ret_date'] = stk_df_m.groupby('secID')['ym'].shift(-1)
# Lag momentum one extra month (skip the most recent month).
stk_df_m['mom'] = stk_df_m.groupby(['secID'])['mom'].shift()
stk_df_m['mom_date'] = stk_df_m.groupby('secID')['ym'].shift()
stk_df_m.drop(['ret_daily','turnoverValue','turnoverRate','illiq_daily','1+ret_mom'],axis=1,inplace=True)
stk_df_m
# Assemble the sorting panel: returns + characteristics + beta + book-to-market.
ret_df = pd.merge(stk_df_m[['secID','ret_date','ret','rf','exret','ym','mktcap','size','rev','mom_date','mom']],
beta_df_m[['secID','ym','beta']], on=['secID','ym'], how='left')
ret_df
ret_df = pd.merge(ret_df, pb_df, on=['secID','ym'],how='left')
ret_df
def double_sort(df, sort1, sort2='size', group_date='ym', merge_cols=None):
    """
    Independent 2x3 double sort within each cross-section.

    Arguments:
        df: panel DataFrame; must contain `sort1`, `sort2`, `group_date`,
            the `merge_cols` keys, and 'ret','exret','size','mktcap'.
        sort1: variable splitting each cross-section into 3 groups
            (breakpoints at the 30% / 70% quantiles).
        sort2: variable splitting into 2 groups at the median (default 'size').
        group_date: column defining the cross-section (default 'ym').
        merge_cols: key columns used to intersect the two sorts; defaults to
            ['secID','ret_date']. A None sentinel replaces the previous
            mutable-list default (shared-mutable-default pitfall).

    Returns:
        dict of 2*3 DataFrames keyed like f'{sort1}1_{sort2}1'.

    NOTE: an observation sitting exactly on a breakpoint satisfies both
    adjacent masks (<= and >=) and lands in two groups; this reproduces the
    original behavior.
    """
    if merge_cols is None:
        merge_cols = ['secID', 'ret_date']
    else:
        merge_cols = list(merge_cols)
    # Per-cross-section breakpoints: 30%/70% for sort1, median for sort2.
    q1 = dict(zip([f'q_{sort1}_1', f'q_{sort1}_2'], [0.3, 0.7]))
    q2 = {f'q_{sort2}_1': 0.5}
    q1_df = pd.DataFrame()
    for key, value in q1.items():
        q1_df[key] = df.groupby([group_date])[sort1].quantile(value)
    q2_df = pd.DataFrame()
    for key, value in q2.items():
        q2_df[key] = df.groupby([group_date])[sort2].quantile(value)
    # Attach breakpoints to each row (merges on the group_date index level).
    ret_df_q = pd.merge(df, q2_df, on=group_date)
    ret_df_q = pd.merge(ret_df_q, q1_df, on=group_date)
    # Three groups on sort1.
    portfolios1 = dict()
    portfolios1[f'{sort1}1'] = ret_df_q.loc[ret_df_q[f'{sort1}'] <= ret_df_q[f'q_{sort1}_1']]
    portfolios1[f'{sort1}2'] = ret_df_q.loc[(ret_df_q[f'{sort1}'] >= ret_df_q[f'q_{sort1}_1']) &
                                            (ret_df_q[f'{sort1}'] <= ret_df_q[f'q_{sort1}_2'])]
    portfolios1[f'{sort1}3'] = ret_df_q.loc[ret_df_q[f'{sort1}'] >= ret_df_q[f'q_{sort1}_2']]
    # Two groups on sort2; keep only the columns needed downstream.
    portfolios2 = dict()
    portfolios2[f'{sort2}1'] = ret_df_q.loc[ret_df_q[f'{sort2}'] <= ret_df_q[f'q_{sort2}_1'],
                                            merge_cols+[group_date]+['ret','exret','size','mktcap']]
    portfolios2[f'{sort2}2'] = ret_df_q.loc[ret_df_q[f'{sort2}'] >= ret_df_q[f'q_{sort2}_1'],
                                            merge_cols+[group_date]+['ret','exret','size','mktcap']]
    # Intersect the two independent sorts into the 2*3 portfolios.
    portfolios = dict()
    for group1 in portfolios1.keys():
        for group2 in portfolios2.keys():
            portfolios[f'{group1}_{group2}'] = pd.merge(portfolios2[group2],
                                                        portfolios1[group1][merge_cols+[f'{sort1}']],
                                                        on=merge_cols)
    return portfolios
def factor(df, sort1, sort2='size', long_high=True, long_only=True):
    """
    Build a monthly value-weighted factor return from a 2x3 double sort.

    Arguments:
        df: panel passed through to `double_sort`.
        sort1: sorting characteristic of interest.
        sort2: control characteristic (default 'size').
        long_high: long the high-sort1 leg if True, else the low leg.
        long_only: if True return the long leg only; otherwise the
            long-minus-short spread.

    Returns:
        pd.Series of factor returns indexed by 'ret_date', named `sort1`.
    """
    portfolios = double_sort(df=df, sort1=sort1, sort2=sort2)
    portfolios_vwret = {}
    for pf in portfolios.keys():
        # portfolios[pf].dropna(inplace=True)  # Do NOT dropna: a suspension next
        # month is not known at portfolio-formation time.
        # Monthly total market cap for value weights. Named-column sum replaces
        # dict-style SeriesGroupBy.agg, which raises SpecificationError in pandas >= 1.0.
        temp = portfolios[pf].groupby('ym', as_index=False)['mktcap'].sum().rename(columns={'mktcap': 'mktcap_sum'})
        portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
        portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
        portfolios[pf]['weighted_ret'] = portfolios[pf]['ret'] * portfolios[pf]['weight']
        portfolios_vwret[pf] = portfolios[pf].groupby('ret_date')['weighted_ret'].sum()
    # Stack the six portfolio return series into one DataFrame.
    portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
    portfolios_vwret_df.index = portfolios_vwret[f'{sort1}1_size1'].index
    portfolios_vwret_df.columns = portfolios_vwret.keys()
    if long_only:
        if long_high:
            factor = (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2
        else:
            factor = (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2
    else:
        if long_high:
            factor = (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2 - \
                     (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2
        else:
            factor = (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2 - \
                     (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2
    factor.name = sort1
    return factor
# Long-only monthly factors excluding size: value (bm), momentum (mom), reversal (rev).
factors_dict = {}
for f in ['bm','mom','rev']:
    if f == 'rev':
        # Reversal longs the LOW past-return leg.
        factors_dict[f] = factor(df=ret_df,sort1=f,long_high=False)
    else:
        factors_dict[f] = factor(df=ret_df,sort1=f)
factors_df = pd.DataFrame(factors_dict)
# Size
# Long-only size leg built from the bm double sort.
sort1 = 'bm'
portfolios = double_sort(ret_df,sort1=sort1)
portfolios_vwret = {}
for pf in portfolios.keys():
    # Monthly value weights. Named-column sum replaces dict-style
    # SeriesGroupBy.agg, removed in pandas >= 1.0 (SpecificationError).
    temp = portfolios[pf].groupby('ym', as_index=False)['mktcap'].sum().rename(columns={'mktcap': 'mktcap_sum'})
    portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
    portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
    portfolios[pf]['weighted_ret'] = portfolios[pf]['ret'] * portfolios[pf]['weight']
    portfolios_vwret[pf] = portfolios[pf].groupby('ret_date')['weighted_ret'].sum()
portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
portfolios_vwret_df.index = portfolios_vwret[f'{sort1}1_size1'].index
portfolios_vwret_df.columns = portfolios_vwret.keys()
# Long-only size factor: average of the three small-cap portfolios.
factors_df['size'] = (portfolios_vwret_df['bm1_size1'] + portfolios_vwret_df['bm2_size1'] + portfolios_vwret_df['bm3_size1']) / 3
factors_df
shibor_df
# Use the CSI 800 index as the market-return proxy.
sec_id = ['000906.ZICN']
index_df = DataAPI.MktIdxdGet(indexID=sec_id,beginDate=START,endDate=END,field=['indexID','secShortName','tradeDate','closeIndex','CHGPct'],pandas="1")
index_df['tradeDate'] = pd.to_datetime(index_df['tradeDate'])
index_df['ret_date'] = index_df['tradeDate'].dt.to_period('M')
index_df.sort_values('tradeDate',inplace=True)
# Month-end index level -> monthly market return.
index_df = index_df.groupby('ret_date',as_index=False).last()
index_df['mktret'] = index_df['closeIndex'] / index_df['closeIndex'].shift() - 1
index_df = pd.merge(index_df,shibor_df_m,left_on=['ret_date'],right_on=['ym'])
# Excess market return over the monthly risk-free rate.
index_df['exmktret'] = index_df['mktret'] - index_df['rf']
index_df.drop(['ym','mktret','indexID','secShortName','tradeDate',
'closeIndex','CHGPct'],axis=1,inplace=True)
index_df.dropna(inplace=True)
index_df
# Combine market and characteristic factors; suffix names to mark long-only legs.
factors_df = pd.merge(index_df, factors_df, on='ret_date')
factors_df.set_index('ret_date',inplace=True)
factors_df = factors_df[['rf','exmktret','size','bm','mom','rev']]
factors_df.columns = ['rf','exmktret','size_long','bm_long','mom_long','rev_long']
factors_df
# Cumulative growth of 100 invested in each long-only factor.
((factors_df+1).cumprod()*100).plot()
factors_df.to_pickle('./data/factors/factors_long_only.pkl')
# Long-short (hedged) monthly factors excluding size.
factors_dict = {}
for f in ['bm','mom','rev']:
    if f == 'rev':
        factors_dict[f] = factor(df=ret_df,sort1=f,long_only=False,long_high=False)
    else:
        factors_dict[f] = factor(df=ret_df,sort1=f,long_only=False)
factors_df = pd.DataFrame(factors_dict)
# Size
# SMB-style size factor: small-minus-big, averaged across the three bm groups.
sort1 = 'bm'
portfolios = double_sort(ret_df,sort1=sort1)
portfolios_vwret = {}
for pf in portfolios.keys():
    # Monthly value weights. Named-column sum replaces dict-style
    # SeriesGroupBy.agg, removed in pandas >= 1.0 (SpecificationError).
    temp = portfolios[pf].groupby('ym', as_index=False)['mktcap'].sum().rename(columns={'mktcap': 'mktcap_sum'})
    portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
    portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
    portfolios[pf]['weighted_ret'] = portfolios[pf]['ret'] * portfolios[pf]['weight']
    portfolios_vwret[pf] = portfolios[pf].groupby('ret_date')['weighted_ret'].sum()
portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
portfolios_vwret_df.index = portfolios_vwret[f'{sort1}1_size1'].index
portfolios_vwret_df.columns = portfolios_vwret.keys()
factors_df['size'] = (portfolios_vwret_df['bm1_size1'] + portfolios_vwret_df['bm2_size1'] + portfolios_vwret_df['bm3_size1']) / 3 - \
                     (portfolios_vwret_df['bm1_size2'] + portfolios_vwret_df['bm2_size2'] + portfolios_vwret_df['bm3_size2']) / 3
# Attach market excess return; save the long-short factor table.
factors_df = pd.merge(index_df, factors_df, on='ret_date')
factors_df.set_index('ret_date',inplace=True)
factors_df = factors_df[['rf','exmktret','size','bm','mom','rev']]
factors_df
((factors_df+1).cumprod()*100).plot()
factors_df.to_pickle('./data/factors/factors.pkl')
# 1-day Shibor as the daily risk-free rate: annualized percent -> daily decimal.
shibor1d = DataAPI.MktIborGet(secID="Shibor1D.IRCN",beginDate=START,endDate=END,field=['tradeDate','rate'],pandas="1")
shibor1d['tradeDate'] = pd.to_datetime(shibor1d['tradeDate'])
shibor1d['rate'] = shibor1d['rate'] * 0.01 / 365
shibor1d.rename(columns={'rate':'rf'},inplace=True)
def daily_factor(df, sort1, sort2='size', long_high=True, long_only=True):
    """
    Build a DAILY value-weighted factor return from the monthly 2x3 double sort.

    Portfolio membership and value weights are fixed at monthly formation;
    the monthly weights are applied to each stock's daily returns within the
    holding month, read from the module-level `stk_df`.

    Arguments mirror `factor` (sort1/sort2/long_high/long_only).

    Returns:
        pd.Series of daily factor returns indexed by 'tradeDate'.
    """
    portfolios = double_sort(df=df, sort1=sort1, sort2=sort2)
    portfolios_vwret = {}
    for pf in portfolios.keys():
        # Monthly value weights. Named-column sum replaces dict-style
        # SeriesGroupBy.agg, which raises SpecificationError in pandas >= 1.0.
        temp = portfolios[pf].groupby('ym', as_index=False)['mktcap'].sum().rename(columns={'mktcap': 'mktcap_sum'})
        portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
        portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
        # Attach the holding month's (ret_date) daily returns to the weights.
        df_ = pd.merge(portfolios[pf][['secID','ret_date','weight']],
                       stk_df[['secID','tradeDate','ym','ret_daily']],
                       left_on=['secID','ret_date'],
                       right_on=['secID','ym'])
        df_['weighted_ret_daily'] = df_['ret_daily'] * df_['weight']
        portfolios_vwret[pf] = df_.groupby('tradeDate')['weighted_ret_daily'].sum()
    portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
    portfolios_vwret_df.index = portfolios_vwret[list(portfolios_vwret.keys())[0]].index
    portfolios_vwret_df.columns = portfolios_vwret.keys()
    if long_only:
        if long_high:
            factor = (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2
        else:
            factor = (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2
    else:
        if long_high:
            factor = (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2 - \
                     (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2
        else:
            factor = (portfolios_vwret_df[f'{sort1}1_{sort2}1'] + portfolios_vwret_df[f'{sort1}1_{sort2}2']) / 2 - \
                     (portfolios_vwret_df[f'{sort1}3_{sort2}1'] + portfolios_vwret_df[f'{sort1}3_{sort2}2']) / 2
    return factor
# Daily factors excluding size (long-only legs).
factors_dict = {}
for f in ['bm','mom','rev']:
    if f == 'rev':
        factors_dict[f] = daily_factor(df=ret_df,sort1=f,long_high=False)
    else:
        factors_dict[f] = daily_factor(df=ret_df,sort1=f)
factors_daily = pd.DataFrame(factors_dict)
# The size daily factor (long-only small leg), from the bm double sort.
portfolios = double_sort(df=ret_df,sort1='bm')
portfolios_vwret = {}
for pf in portfolios.keys():
    # Monthly value weights applied to daily returns within the holding month.
    # Named-column sum replaces dict-style SeriesGroupBy.agg (removed in pandas >= 1.0).
    temp = portfolios[pf].groupby('ym', as_index=False)['mktcap'].sum().rename(columns={'mktcap': 'mktcap_sum'})
    portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
    portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
    df_ = pd.merge(portfolios[pf][['secID','ret_date','weight']],
                   stk_df[['secID','tradeDate','ym','ret_daily']],
                   left_on=['secID','ret_date'],
                   right_on=['secID','ym'])
    df_['weighted_ret_daily'] = df_['ret_daily'] * df_['weight']
    portfolios_vwret[pf] = df_.groupby('tradeDate')['weighted_ret_daily'].sum()
portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
portfolios_vwret_df.index = portfolios_vwret[list(portfolios_vwret.keys())[0]].index
portfolios_vwret_df.columns = portfolios_vwret.keys()
# Long-only daily size factor: average of the three small-cap portfolios.
factors_daily['size'] = (portfolios_vwret_df['bm1_size1'] + portfolios_vwret_df['bm2_size1'] + portfolios_vwret_df['bm3_size1']) / 3
factors_daily
# Daily market excess return: CSI 800 daily change minus 1-day Shibor.
sec_id = ['000906.ZICN']
index_df = DataAPI.MktIdxdGet(indexID=sec_id,beginDate=START,endDate=END,field=['indexID','secShortName','tradeDate','closeIndex','CHGPct'],pandas="1")
index_df['tradeDate'] = pd.to_datetime(index_df['tradeDate'])
index_df = pd.merge(shibor1d,index_df[['tradeDate','CHGPct']],on='tradeDate')
index_df.rename(columns={'CHGPct':'mktret'},inplace=True)
index_df['exmktret'] = index_df['mktret'] - index_df['rf']
index_df
# Assemble the long-only DAILY factor table; suffix names to mark long-only legs.
factors_daily = pd.merge(index_df[['tradeDate','rf','exmktret']],factors_daily, on='tradeDate')
factors_daily.set_index('tradeDate',inplace=True)
factors_daily = factors_daily[['rf','exmktret','size','bm','mom','rev']]
factors_daily.columns = ['rf','exmktret','size_long','bm_long','mom_long','rev_long']
factors_daily
# Cumulative growth plots: full sample, then 2020 onward.
((factors_daily+1).cumprod()*100).plot()
((factors_daily.loc['2020':]+1).cumprod()*100).plot()
# factors_daily.to_pickle('./data/factors/factors_daily_long_only.pkl')
# Daily factors excluding size (long-short spreads).
factors_dict = {}
for f in ['bm','mom','rev']:
    if f == 'rev':
        factors_dict[f] = daily_factor(df=ret_df,sort1=f,long_only=False,long_high=False)
    else:
        factors_dict[f] = daily_factor(df=ret_df,sort1=f,long_only=False)
factors_daily = pd.DataFrame(factors_dict)
# The size daily factor: small-minus-big across the three bm groups.
portfolios = double_sort(df=ret_df,sort1='bm')
portfolios_vwret = {}
for pf in portfolios.keys():
    # Monthly value weights applied to daily returns within the holding month.
    # Named-column sum replaces dict-style SeriesGroupBy.agg (removed in pandas >= 1.0).
    temp = portfolios[pf].groupby('ym', as_index=False)['mktcap'].sum().rename(columns={'mktcap': 'mktcap_sum'})
    portfolios[pf] = pd.merge(portfolios[pf], temp, on='ym')
    portfolios[pf]['weight'] = portfolios[pf]['mktcap'] / portfolios[pf]['mktcap_sum']
    df_ = pd.merge(portfolios[pf][['secID','ret_date','weight']],
                   stk_df[['secID','tradeDate','ym','ret_daily']],
                   left_on=['secID','ret_date'],
                   right_on=['secID','ym'])
    df_['weighted_ret_daily'] = df_['ret_daily'] * df_['weight']
    portfolios_vwret[pf] = df_.groupby('tradeDate')['weighted_ret_daily'].sum()
portfolios_vwret_df = pd.DataFrame(np.vstack([pf for pf in portfolios_vwret.values()])).T
portfolios_vwret_df.index = portfolios_vwret[list(portfolios_vwret.keys())[0]].index
portfolios_vwret_df.columns = portfolios_vwret.keys()
factors_daily['size'] = (portfolios_vwret_df['bm1_size1'] + portfolios_vwret_df['bm2_size1'] + portfolios_vwret_df['bm3_size1']) / 3 - \
                        (portfolios_vwret_df['bm1_size2'] + portfolios_vwret_df['bm2_size2'] + portfolios_vwret_df['bm3_size2']) / 3
factors_daily
# Attach the daily market excess return; save the daily long-short factors.
factors_daily = pd.merge(index_df[['tradeDate','rf','exmktret']],factors_daily, on='tradeDate')
factors_daily.set_index('tradeDate',inplace=True)
factors_daily = factors_daily[['rf','exmktret','size','bm','mom','rev']]
factors_daily
((factors_daily+1).cumprod()*100).plot()
((factors_daily.loc['2020':]+1).cumprod()*100).plot()
factors_daily.to_pickle('./data/factors/factors_daily.pkl')