import pandas as pd
import numpy as np
import tqdm
# import gc
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16.0, 9.0)
import seaborn as sns
plt.rcParams['figure.figsize'] = (16.0, 9.0)
Risk free rate is from RESSET database. Specifically,
# Data is uploaded into uqer, but where is it?
! ls
! ls ..
# 👇 There it is!
# ! ls ../datas
rf_df = pd.read_csv('./data/rf-daily-2024.csv')
rf_df.head()
rf_df.tail()
rf_df.info()
rf_df.drop('Unnamed: 2',axis=1,inplace=True)
rf_df.rename(columns={'日无风险收益率_DRFRet':'rf','日期_Date':'date'},inplace=True)
# 另一种改名方式
# rf_df.columns = ['a','b']
rf_df['date'] = pd.to_datetime(rf_df['date'], format='%Y-%m-%d')
rf_df.head()
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
stk_info
stk_info['exchangeCD'].unique()
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
stk_id = stk_info['secID']
index_info = DataAPI.SecIDGet(assetClass="IDX",pandas="1")
stk_info.head()
index_info.head()
index_info['secID'].nunique()
index_info[index_info['secShortName'].isin(['上证综指','深证综指','创业板指','沪深300','中证500','中证1000'])]
index_info[index_info['secShortName'].isin(['上证综指','深证综指','创业板指','沪深300','中证500','中证1000'])].drop_duplicates('secShortName').secID.values
sec_id = index_info[index_info['secShortName'].isin(['上证综指','深证综指','创业板指','沪深300','中证500','中证1000'])].drop_duplicates('secShortName').secID.values
start = '20000101'
end = '20241231'
index_df = DataAPI.MktIdxdGet(indexID=sec_id,beginDate=start,endDate=end,field=['indexID','secShortName','tradeDate','closeIndex','turnoverVol','turnoverValue','CHGPct'],pandas="1")
index_df.info()
index_df['tradeDate'] = pd.to_datetime(index_df['tradeDate'],format='%Y-%m-%d')
index_df.head()
index_ret_df = index_df.pivot(index='tradeDate',columns='secShortName',values='CHGPct')
index_ret_df.head()
rf_df.set_index('date',inplace=True)
rf_df.head()
index_ret_df = pd.merge(index_ret_df,rf_df,left_index=True,right_index=True)
index_ret_df.head()
index_retx_df = index_ret_df.iloc[:,:-1].sub(index_ret_df['rf'], axis=0) # 计算超额收益率
index_retx_df.head()
index_retx_df.tail()
# sec_id = index_info[index_info['secShortName'].isin(['上证综指','深证综指','创业板指','沪深300','中证500','中证1000'])].drop_duplicates('secShortName').secID.values
# start = '20000101'
# end = '20241231'
# index_df = DataAPI.MktIdxdGet(indexID=sec_id,beginDate=start,endDate=end,field=['indexID','secShortName','tradeDate','closeIndex','turnoverVol','turnoverValue','CHGPct'],pandas="1")
# index_df['tradeDate'] = pd.to_datetime(index_df['tradeDate'],format='%Y-%m-%d')
# index_ret_df = index_df.pivot(index='tradeDate',columns='secShortName',values='CHGPct')
# index_ret_df = pd.merge(index_ret_df,rf_df,left_index=True,right_index=True)
# index_retx_df = index_ret_df.iloc[:,:-1].sub(index_ret_df['rf'], axis=0)
# Raw index plot
index_df.pivot(index='tradeDate',columns='secShortName',values='closeIndex').plot()
((1 + index_retx_df).cumprod()*100).plot()
早期的平稳可能是一种假象:数据到后面会越来越大。log return会看得更清楚
# log return
(np.log(1 + index_retx_df).cumsum()*100).plot()
((1 + index_retx_df.loc['2006':'2024']).cumprod()*100).plot()
((1 + index_retx_df.loc['2010':'2024']).cumprod()*100).plot()
((1 + index_retx_df.loc['2018':'2024']).cumprod()*100).plot()
index_retx_df.describe()
0.019465*np.sqrt(240) # 2023计算
0.015047*np.sqrt(240) # 2023计算
0.019188*np.sqrt(240) # 2024 中证1000波动率
0.015883*np.sqrt(240) # 2024 沪深300波动率
us_df = pd.read_csv('./data/F-F_Research_Data_Factors_daily_2024.csv',skiprows=3)
us_df.drop(us_df.index.values[-1],inplace=True)
us_df.head()
us_df.rename(columns={'Unnamed: 0': 'date'},inplace=True)
us_df['date'] = pd.to_datetime(us_df['date'],format='%Y%m%d')
us_df['Mkt-RF'] = us_df['Mkt-RF']*0.01
us_df.set_index('date',inplace=True)
((1 + us_df.loc['2000':'2024','Mkt-RF']).cumprod()*100).plot()
(np.log(1 + us_df.loc['2000':'2024','Mkt-RF']).cumsum()*100).plot()
us_df.loc['2000':'2024','Mkt-RF'].describe()
0.012682*np.sqrt(250) # 2023 波动率
0.012525*np.sqrt(250)
# 2024, https://www.swsresearch.com/institute_sw/allIndex/downloadCenter/industryType
shenwan_industry_idx1 = ["农林牧渔","基础化工","钢铁","有色金属","电子","家用电器","食品饮料","纺织服饰","轻工制造","医药生物","公用事业","交通运输","房地产","商贸零售","社会服务","综合","建筑材料","建筑装饰","电力设备","国防军工","计算机","传媒","通信","银行","非银金融","汽车","机械设备","煤炭","石油石化","环保","美容护理"]
# shenwan_industry_idx1 = ['传媒','电力设备','电子','房地产','纺织服饰','非银金融','钢铁','公用事业','国防军工','环保','机械设备','基础化工','计算机','家用电器','建筑材料','建筑装饰','交通运输','煤炭','美容护理','农林牧渔','社会服务','石油石化','食品饮料','汽车','轻工制造','通信','综合','医药生物','银行','有色金属','商贸零售']
# shenwan_industry_idx1 = ["农林牧渔","基础化工","钢铁","有色金属","电子","家用电器","食品饮料","纺织服饰","轻工制造","医药生物","公用事业","交通运输","房地产","商贸零售","社会服务","综合","建筑材料","建筑装饰","电力设备","国防军工","计算机","传媒","通信","银行","非银金融","汽车","机械设备","煤炭","石油石化","环保","美容护理"]
len(shenwan_industry_idx1)
index_info['secShortName'][(index_info['secShortName'].str.contains('申万'))]
shenwan_industry_idx1 = ['申万' + indry for indry in shenwan_industry_idx1]
shenwan_industry_idx1
len(shenwan_industry_idx1)
index_info['secShortName'][(index_info['secShortName'].str.contains('申万')) &
(~index_info['secShortName'].str.contains('Ⅱ$'))]
index_info['secShortName'][index_info['secShortName'].str.contains('.*^申万Ⅰ$')]
这个正则表达式 .*^申万Ⅰ$
可以分解如下:
.*
: 表示匹配任意字符(除了行终止符)零次或多次。.
匹配任意单个字符,*
表示前一个字符或组的零次或多次出现。
^
: 这是一个插入符号,表示行的开头。
申万Ⅰ
: 这部分表示精确匹配字符序列 "申万Ⅰ"。
$
: 这个美元符号表示行的结尾。
因此,将所有部分放在一起,这个正则表达式匹配以任意字符开头,接着是精确的序列 "申万Ⅰ",然后紧跟着行结束。
index_info[index_info['secShortName'].isin(shenwan_industry_idx1)]
len(index_info[index_info['secShortName'].isin(shenwan_industry_idx1)])
index_info[index_info['secShortName'].isin(shenwan_industry_idx1)].duplicated('secShortName',keep=False)
set(shenwan_industry_idx1) - set(index_info[index_info['secShortName'].isin(shenwan_industry_idx1)]['secShortName'])
set(shenwan_industry_idx1).symmetric_difference(set(index_info[index_info['secShortName'].isin(shenwan_industry_idx1)]['secShortName']))
index_info.loc[index_info['secShortName'].isin(shenwan_industry_idx1)].sort_values('secShortName')
industry_id = index_info.loc[index_info['secShortName'].isin(shenwan_industry_idx1)].drop_duplicates(keep='last').loc[:,'secID']
industry_id
index_df = DataAPI.MktIdxdGet(indexID=list(industry_id) + ['000001.ZICN'],beginDate="20031015",endDate="20241231",
field=['indexID','secShortName','tradeDate','preCloseIndex',
'closeIndex','turnoverVol','turnoverValue','CHG','CHGPct'],pandas="1")
index_df['tradeDate'] = pd.to_datetime(index_df['tradeDate'],format='%Y-%m-%d')
index_ret_df = index_df.pivot(index='tradeDate',columns='indexID',values='CHGPct')
index_ret_df = pd.merge(index_ret_df,rf_df,left_index=True,right_index=True)
index_retx_df = index_ret_df.iloc[:,:-1].sub(index_ret_df['rf'], axis=0)
industry_id_name = dict(zip(index_df['indexID'],index_df['secShortName']))
industry_id_name
index_df.head()
index_retx_df.head()
index_retx_df.rename(mapper=industry_id_name, axis=1, inplace=True) # 将column名称改变
((1 + index_retx_df).cumprod()*100).plot()
industry_id_name_list = list(industry_id_name.values())
((1 + index_retx_df.loc[:,industry_id_name_list[0:10]]).cumprod()*100).plot()
食品饮料,家用电器
((1 + index_retx_df.loc[:,industry_id_name_list[10:20]]).cumprod()*100).plot()
社会服务,医药生物
((1 + index_retx_df.loc[:,industry_id_name_list[20:32]]).cumprod()*100).plot()
电力设备
((1 + index_retx_df['2020':'2024']).cumprod()*100).plot()
cols = ['上证综指','申万食品饮料','申万家用电器','申万社会服务','申万医药生物','申万电力设备']
((1 + index_retx_df.loc['2003':'2010',cols]).cumprod()*100).plot()
((1 + index_retx_df.loc['2008':'2010',cols]).cumprod()*100).plot()
((1 + index_retx_df.loc['2006':'2009',cols]).cumprod()*100).plot()
((1 + index_retx_df.loc['2011':'2020',cols]).cumprod()*100).plot()
((1 + index_retx_df.loc['2018':'2020',cols]).cumprod()*100).plot()
((1 + index_retx_df.loc['2020':'2024',cols]).cumprod()*100).plot()
观察以下几个指标:
按照以下几种分组:
# %%time
# # Need about 5 mins to read. Data size is about 490 MB.
# stk_df = DataAPI.MktEqudAdjAfGet(secID=stk_id,beginDate="20160101",endDate="20241231",isOpen=1,
# field=["secID","secShortName","exchangeCD","tradeDate",
# "preClosePrice","closePrice","turnoverVol",
# "turnoverValue","dealAmount","turnoverRate",
# "negMarketValue","marketValue"],pandas="1")
# stk_df.to_pickle('./data/stk_df.pkl')
stk_df = pd.read_pickle('./data/stk_df.pkl')
stk_df.info()
# stk_df.loc[(stk_df['tradeDate']>='2022-01-01') & (stk_df['tradeDate']<='2023-03-01')].to_pickle('./data/stk_df_22_23.pkl') # for exercise 1
stk_df.head()
sec_sector = DataAPI.EquGet(secID=stk_id,field=["secID","ListSector"],pandas="1")
# industry = DataAPI.EquIndustryGet(secID=stk_id,industryVersionCD="010303",field=['secID','industryName1',''],pandas="1")
# industry['industryName1'].unique()
# industry.drop_duplicates()
stk_df = pd.merge(stk_df, sec_sector, on="secID")
stk_df
stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
stk_df.set_index(['secID','tradeDate'],inplace=True)
stk_df['yearMonth'] = stk_df.index.get_level_values(1).to_period('M') # Change to Month
q33 = stk_df.groupby(['yearMonth'])['negMarketValue'].quantile(0.33)
q67 = stk_df.groupby(['yearMonth'])['negMarketValue'].quantile(0.67)
q33.name = 'negMarketValue_q33'
q67.name = 'negMarketValue_q67'
stk_df.reset_index(inplace=True)
stk_df = pd.merge(stk_df,q33,on='yearMonth')
stk_df = pd.merge(stk_df,q67,on='yearMonth')
stk_df
stk_df['size'] = 'small'
stk_df.loc[(stk_df['negMarketValue'] > stk_df['negMarketValue_q33']) & (stk_df['negMarketValue'] <= stk_df['negMarketValue_q67']), 'size'] = 'medium'
stk_df.loc[stk_df['negMarketValue'] > stk_df['negMarketValue_q67'], 'size'] = 'large'
stk_df.head()
stk_df.groupby('yearMonth')['secShortName'].nunique().plot()
n_stk_by_list = stk_df.groupby(['ListSector','yearMonth'])['secShortName'].nunique().reset_index().pivot(index='yearMonth',columns='ListSector',values='secShortName')
n_stk_by_list
n_stk_by_list.plot()
stk_df.groupby(['exchangeCD','yearMonth'])['secShortName'].nunique().reset_index().pivot(index='yearMonth',columns='exchangeCD',values='secShortName').plot()
stk_df.groupby(['size','yearMonth'])['secShortName'].nunique().reset_index().pivot(index='yearMonth',columns='size',values='secShortName').plot()
stk_df.groupby(['yearMonth'])['negMarketValue'].sum()
(stk_df.groupby(['yearMonth'])['negMarketValue'].sum() / 1e12).plot()
mkt_value_by_sector = stk_df.groupby(['ListSector','yearMonth'])['negMarketValue'].sum().reset_index().pivot(index='yearMonth',columns='ListSector',values='negMarketValue') / 1e12
mkt_value_by_sector.plot()
mkt_value_by_exch = stk_df.groupby(['exchangeCD','yearMonth'])['negMarketValue'].sum().reset_index().pivot(index='yearMonth',columns='exchangeCD',values='negMarketValue') / 1e12
mkt_value_by_exch.plot()
mkt_value_by_exch = stk_df.groupby(['size','yearMonth'])['negMarketValue'].sum().reset_index().pivot(index='yearMonth',columns='size',values='negMarketValue') / 1e12
mkt_value_by_exch.plot()
mkt_value_by_exch.plot(subplots=True)
stk_df.groupby(['yearMonth'])['turnoverValue'].sum()
(stk_df.groupby(['yearMonth'])['turnoverValue'].sum() / 1e12).plot()
turnover_value_by_sector = stk_df.groupby(['ListSector','yearMonth'])['turnoverValue'].sum().reset_index().pivot(index='yearMonth',columns='ListSector',values='turnoverValue') / 1e12
turnover_value_by_sector.plot()
turnover_value_by_sector = stk_df.groupby(['exchangeCD','yearMonth'])['turnoverValue'].sum().reset_index().pivot(index='yearMonth',columns='exchangeCD',values='turnoverValue') / 1e12
turnover_value_by_sector.plot()
turnover_value_by_sector = stk_df.groupby(['size','yearMonth'])['turnoverValue'].sum().reset_index().pivot(index='yearMonth',columns='size',values='turnoverValue') / 1e12
turnover_value_by_sector.plot()
turnover_vol_by_sector = stk_df.groupby(['size','yearMonth'])['turnoverVol'].sum().reset_index().pivot(index='yearMonth',columns='size',values='turnoverVol') / 1e12
turnover_vol_by_sector.plot()
stk_df.groupby(['yearMonth'])['turnoverRate'].mean()
(stk_df.groupby(['yearMonth'])['turnoverRate'].mean() / 1e12).plot()
turnover_by_sector = stk_df.groupby(['ListSector','yearMonth'])['turnoverRate'].mean().reset_index().pivot(index='yearMonth',columns='ListSector',values='turnoverRate') / 1e12
turnover_by_sector.plot()
turnover_by_sector = stk_df.groupby(['exchangeCD','yearMonth'])['turnoverRate'].mean().reset_index().pivot(index='yearMonth',columns='exchangeCD',values='turnoverRate') / 1e12
turnover_by_sector.plot()
turnover_by_size = stk_df.groupby(['size','yearMonth'])['turnoverRate'].mean().reset_index().pivot(index='yearMonth',columns='size',values='turnoverRate') / 1e12
turnover_by_size.plot()
stk_df
stk_df['ret'] = stk_df['closePrice'] / stk_df['preClosePrice'] - 1
stk_df.groupby(['yearMonth'])['ret'].mean()
(stk_df.groupby(['yearMonth'])['ret'].mean() * 100).plot()
ret_by_sector = stk_df.groupby(['ListSector','yearMonth'])['ret'].mean().reset_index().pivot(index='yearMonth',columns='ListSector',values='ret') * 100
ret_by_sector.plot()
ret_by_exch = stk_df.groupby(['exchangeCD','yearMonth'])['ret'].mean().reset_index().pivot(index='yearMonth',columns='exchangeCD',values='ret') * 100
ret_by_exch.plot()
ret_by_size = stk_df.groupby(['size','yearMonth'])['ret'].mean().reset_index().pivot(index='yearMonth',columns='size',values='ret') * 100
ret_by_size.plot()
stk_df.groupby(['yearMonth','secShortName'])['ret'].std().groupby('yearMonth').mean()
(stk_df.groupby(['yearMonth','secShortName'])['ret'].std().groupby('yearMonth').mean() * 100).plot()
(stk_df.groupby(['yearMonth','secShortName'])['ret'].std().groupby('yearMonth').mean() * 100).plot()
vol = stk_df.groupby(['yearMonth','secShortName'])['ret'].std()
temp = stk_df[['secShortName','yearMonth','ListSector','exchangeCD','size']].drop_duplicates(['secShortName','yearMonth'])
vol = pd.merge(vol,temp,on=['secShortName','yearMonth'])
del temp
vol.rename(columns={'ret':'vol'},inplace=True)
vol.groupby(['ListSector','yearMonth'])['vol'].mean().reset_index().pivot(index='yearMonth',columns='ListSector',values='vol')
vol_by_sector = vol.groupby(['ListSector','yearMonth'])['vol'].mean().reset_index().pivot(index='yearMonth',columns='ListSector',values='vol') * np.sqrt(240) * 100
vol_by_sector.plot()
vol_by_sector = vol.groupby(['exchangeCD','yearMonth'])['vol'].std().reset_index().pivot(index='yearMonth',columns='exchangeCD',values='vol') * np.sqrt(240) * 100
vol_by_sector.plot()
vol_by_sector = vol.groupby(['size','yearMonth'])['vol'].std().reset_index().pivot(index='yearMonth',columns='size',values='vol') * np.sqrt(240) * 100
vol_by_sector.plot()
group = stk_df.groupby(['yearMonth'])[['negMarketValue','turnoverValue','turnoverRate','ret']].agg({'negMarketValue':'sum','turnoverValue':'sum',
'turnoverRate':'mean','ret':'mean'})
vol_mean = vol.groupby('yearMonth')['vol'].mean()
group = pd.merge(group,vol_mean,on='yearMonth')
# Standardize
from sklearn.preprocessing import StandardScaler
group
group[['negMarketValue','turnoverValue','turnoverRate']] = StandardScaler().fit(group[['negMarketValue','turnoverValue','turnoverRate']]).transform(group[['negMarketValue','turnoverValue','turnoverRate']])
group
group.plot()
group.plot(subplots=True)
group[group.columns] = StandardScaler().fit(group).transform(group)
group.plot()
group.plot(subplots=True)
group[['turnoverRate','ret']].corr()
group[['turnoverValue','ret']].corr()
group.corr()
corr = group.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, annot=True,
square=True, linewidths=.5, annot_kws={"size": 10},cbar_kws={"shrink": .5})