In [1]:
import pandas as pd
import numpy as np
import tqdm
import gc
import matplotlib.pyplot as plt
import talib as ta
import datetime as dt
import statsmodels.api as sm
pd.set_option('display.max_rows', 16)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
In [2]:
plt.rcParams['figure.figsize'] = (16.0, 9.0)

Data

In [3]:
START = '20070101'
END = '20231231'
In [4]:
# Security Id
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
cond1 = (stk_info['exchangeCD'] == 'XSHE') | (stk_info['exchangeCD'] == 'XSHG')
cond2 = (stk_info['listStatusCD'] == 'L') | (stk_info['listStatusCD'] == 'DE')
cond3 = stk_info['transCurrCD']=='CNY'
stk_info = stk_info[cond1 & cond2 & cond3].copy()
stk_id = stk_info['secID']
# ST
st_df = DataAPI.SecSTGet(beginDate=START,endDate=END,secID=stk_id,field=['secID','tradeDate','STflg'],pandas="1")
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'],format="%Y-%m-%d")
In [5]:
# %%time
# # About 8 mins
# # # 从优矿下载股票信息,时间较长。由于优矿的限制,每次下载3年的数据

# stk_dict = {}
# begin_ = dt.datetime.strptime(START, '%Y%m%d').year
# end_ = dt.datetime.strptime(START, '%Y%m%d').year+3
# field = ["secID","tradeDate",'preClosePrice',"closePrice",'openPrice','highestPrice','lowestPrice',"negMarketValue","turnoverValue",'turnoverVol']
# while begin_ <= 2023:
#     if begin_ == 2023:
#         yesterday = dt.datetime.today() - dt.timedelta(days=1)
#         yesterday.strftime('%Y%m%d')
#         stk_dict[begin_] = DataAPI.MktEqudAdjAfGet(secID=stk_id,
#                                                      beginDate=f'{begin_}0101',
#                                                      endDate=yesterday,
#                                                      field=field,pandas="1")
#     else:
#         stk_dict[begin_] = DataAPI.MktEqudAdjAfGet(secID=stk_id,
#                                                          beginDate=f'{begin_}0101',
#                                                          endDate=f'{end_}1231',
#                                                          field=field,pandas="1")
#     begin_ = end_ + 1
#     end_ = begin_ + 3
    
# for i in range(len(stk_dict)):
#     stk_df = pd.DataFrame(np.vstack([_df for _df in stk_dict.values()]),columns=field)
    
# stk_df.to_pickle('./data/stk_df.pkl')
In [6]:
# %%time
# stk_df = DataAPI.MktEqudAdjAfGet(secID=stk_id,beginDate=START,endDate=END,isOpen=1,
#                                            field=["secID","tradeDate",
#                                                   'preClosePrice',"closePrice",
#                                                   'openPrice','highestPrice','lowestPrice',
#                                                   "negMarketValue",
#                                                   "turnoverValue",'turnoverVol'],pandas="1")
# stk_df.to_pickle('./data/stk_df.pkl')
# # Takes about 6 mins
In [7]:
stk_df = pd.read_pickle('./data/stk_df.pkl')
In [8]:
stk_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11294684 entries, 0 to 11294683
Data columns (total 10 columns):
secID             object
tradeDate         object
preClosePrice     object
closePrice        object
openPrice         object
highestPrice      object
lowestPrice       object
negMarketValue    object
turnoverValue     object
turnoverVol       object
dtypes: object(10)
memory usage: 861.7+ MB
In [9]:
for col in stk_df.columns.drop(['secID','tradeDate']).tolist():
    stk_df[col] = pd.to_numeric(stk_df[col])
In [10]:
stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
stk_df.sort_values(['secID','tradeDate'],inplace=True)
# drop ST stocks
print(stk_df.shape)
stk_df = pd.merge(stk_df, st_df, on=['secID','tradeDate'],how='left')
stk_df = stk_df[stk_df['STflg'].isna()].copy()
stk_df.drop('STflg',axis=1,inplace=True)
print(stk_df.shape)
(11294684, 10)
(10774842, 10)
In [11]:
stk_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10774842 entries, 109 to 11294683
Data columns (total 10 columns):
secID             object
tradeDate         datetime64[ns]
preClosePrice     float64
closePrice        float64
openPrice         float64
highestPrice      float64
lowestPrice       float64
negMarketValue    float64
turnoverValue     float64
turnoverVol       int64
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 904.3+ MB

不填充停牌值比较合理,因为技术分析只看量价,直接计算量价关系较为合适

In [12]:
random_stkid = np.random.choice(stk_df['secID'].unique(),1000)

stk_df = stk_df[stk_df['secID'].isin(random_stkid)].copy()
In [13]:
stk_df.drop(stk_df.loc[stk_df['openPrice']==0].index,inplace=True)
In [14]:
stk_df['open_ret'] = stk_df.groupby('secID')['openPrice'].apply(lambda x: x / x.shift() - 1)
stk_df['close_ret'] = stk_df['closePrice']/stk_df['preClosePrice'] - 1
In [15]:
def rule_return(df, demean=True, open_ret=True):
    """
    df should contain these columns:
        signal: the signal generated by the rule
        close_ret: return calculated by close price
        open_ret: return calculated by open price
    close_ret_demean is demeaned return of close_ret, i.e. close_ret - close_ret.mean.
    open_ret_demean is similarly defined. The use of demeaned return series is to adjust the
    bias created by bullish or bearish markets.
    """
    df['close_ret_demean'] = df['close_ret'] - df['close_ret'].mean()
    df['open_ret_demean'] = df['open_ret'] - df['open_ret'].mean()
    df['position_close'] = df['signal']
    df['position_open'] = df['signal'].shift()
    df['position_close_ret'] = df['position_close'].shift() * df['close_ret']
    df['position_open_ret'] = df['position_open'].shift() * df['open_ret']
    df['position_close_ret_demean'] = df['position_close'].shift() * df['close_ret_demean']
    df['position_open_ret_demean'] = df['position_open'].shift() * df['open_ret_demean']
    df['close_cumret'] = (df['position_close_ret']+1).cumprod()
    df['open_cumret'] = (df['position_open_ret']+1).cumprod()
    if open_ret:
        if demean:
            return pd.DataFrame({'position_open_ret_demean':df['position_open_ret_demean'].values, 
                                 'open_cumret':df['open_cumret'].values})
        else:
            return pd.DataFrame({'position_open_ret':df['position_open_ret'].values, 
                                 'open_cumret':df['open_cumret'].values})
    else:
        if demean:
            return pd.DataFrame({'position_close_ret_demean':df['position_close_ret_demean'].values, 
                                 'close_cumret':df['close_cumret'].values})
        else:
            return pd.DataFrame({'position_close_ret':df['position_close_ret'].values, 
                                 'close_cumret':df['close_cumret'].values})

A bunch of TA signals

In [16]:
# EMA
stk_df['EMA'] = stk_df.groupby('secID')['closePrice'].apply(ta.EMA, 20)
stk_df['EM_signal'] = np.where(stk_df['closePrice']>stk_df['EMA'],1,np.nan)
stk_df.loc[stk_df['closePrice']<=stk_df['EMA'],'EM_signal'] = 0
stk_df['EM_signal'] = stk_df.groupby('secID')['EM_signal'].fillna(method='ffill')
In [17]:
stk_df['EM_signal'].value_counts()
Out[17]:
0.0    954836
1.0    912242
Name: EM_signal, dtype: int64
In [18]:
# MACD
stk_df['MACD'] = stk_df.groupby('secID')['closePrice'].apply(lambda x: ta.MACD(x)[0])
stk_df['MACD_rawsignal'] = stk_df.groupby('secID')['closePrice'].apply(lambda x: ta.MACD(x)[1])
stk_df['MACD_signal'] = np.where(stk_df['MACD']>stk_df['MACD_rawsignal'],1,np.nan)
stk_df.loc[stk_df['MACD']<=stk_df['MACD_rawsignal'],'MACD_signal'] = 0
stk_df['MACD_signal'] = stk_df.groupby('secID')['MACD_signal'].fillna(method='ffill')
In [19]:
stk_df['MACD_signal'].value_counts()
Out[19]:
1.0    971195
0.0    883327
Name: MACD_signal, dtype: int64
In [20]:
# # Example
# temp = stk_df[stk_df['secID']=='900957.XSHG'].copy()
# ta.OBV(temp['closePrice'],temp['turnoverValue'])
# stk_df.loc[9968491:]
In [21]:
# OBV
stk_df['OBV'] = stk_df.groupby('secID')[['closePrice','turnoverVol']].apply(lambda x: ta.OBV(real=x['closePrice'],volume=x['turnoverVol']).to_frame('OBV')).values
stk_df.groupby('secID')['OBV'].apply(lambda x: x/x.shift()-1)
stk_df['OBV_change'] = stk_df.groupby('secID')['OBV'].apply(lambda x: x/x.shift()-1)
buy1_OBV = (stk_df['close_ret']<0)&(stk_df['OBV_change']>0)
buy2_OBV = (stk_df['close_ret']>0)&(stk_df['OBV_change']>0)
sell1_OBV = (stk_df['close_ret']>0)&(stk_df['OBV_change']<0)
sell2_OBV = (stk_df['close_ret']<0)&(stk_df['OBV_change']<0)
stk_df['OBV_signal'] = 0
stk_df.loc[buy1_OBV | buy2_OBV,'OBV_signal']=1
In [22]:
stk_df['OBV_signal'].value_counts()
Out[22]:
0    947182
1    936943
Name: OBV_signal, dtype: int64
In [23]:
# AROON
N = 26
high_threshold = 70  # 强趋势
low_threshold = 30  # 弱趋势

stk_df['AROON_up'] = stk_df.groupby('secID')[['highestPrice','lowestPrice']].apply(lambda x:ta.AROON(high=x['highestPrice'],low=x['lowestPrice'])[0].to_frame('AROON_up')).values
stk_df['AROON_down'] = stk_df.groupby('secID')[['highestPrice','lowestPrice']].apply(lambda x:ta.AROON(high=x['highestPrice'],low=x['lowestPrice'])[1].to_frame('AROON_down')).values
buy_AROON = (stk_df['AROON_up']>high_threshold)&(stk_df['AROON_down']<low_threshold)
sell_AROON = (stk_df['AROON_down']>high_threshold)&(stk_df['AROON_up']<low_threshold)
stk_df['AROON_signal'] = np.where(buy_AROON,1,np.nan)
stk_df.loc[sell_AROON,'AROON_signal'] = 0
stk_df['AROON_signal'] = stk_df.groupby('secID')['AROON_signal'].fillna(method='ffill')
In [24]:
stk_df['AROON_signal'].value_counts()
Out[24]:
0.0    958018
1.0    912001
Name: AROON_signal, dtype: int64
In [25]:
# BOLL
stk_df['BOLL_up'] = stk_df.groupby('secID')[['closePrice']].apply(lambda x:ta.BBANDS(real=x['closePrice'])[0].to_frame('BOLL_up')).values
stk_df['BOLL_mid'] = stk_df.groupby('secID')[['closePrice']].apply(lambda x:ta.BBANDS(real=x['closePrice'])[1].to_frame('BOLL_mid')).values
stk_df['BOLL_low'] = stk_df.groupby('secID')[['closePrice']].apply(lambda x:ta.BBANDS(real=x['closePrice'])[2].to_frame('BOLL_low')).values
stk_df['BOLL_up_lag'] = stk_df.groupby('secID')['BOLL_up'].shift()
stk_df['BOLL_mid_lag'] = stk_df.groupby('secID')['BOLL_mid'].shift()
stk_df['BOLL_low_lag'] = stk_df.groupby('secID')['BOLL_low'].shift()

buy_BOLL = (stk_df['preClosePrice'] < stk_df['BOLL_up_lag'])&(stk_df['closePrice']>stk_df['BOLL_up'])
sell_BOLL = (stk_df['preClosePrice'] > stk_df['BOLL_mid_lag'])&(stk_df['closePrice']<stk_df['BOLL_mid'])
stk_df['BOLL_signal'] = np.where(buy_BOLL, 1, np.nan)
stk_df.loc[sell_BOLL,'BOLL_signal'] = 0
stk_df['BOLL_signal'] = stk_df.groupby('secID')['BOLL_signal'].fillna(method='ffill')
In [26]:
stk_df['BOLL_signal'].value_counts()
Out[26]:
0.0    1874052
1.0        282
Name: BOLL_signal, dtype: int64
In [27]:
# CCI
buy_threshold = -100  # 买入阈值
sell_threshold = 100  # 卖出阈值
stk_df['CCI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x:ta.CCI(high=x['highestPrice'],low=x['lowestPrice'],close=x['closePrice']).to_frame('CCI')).values
In [28]:
buy_CCI = stk_df['CCI']<buy_threshold
sell_CCI = stk_df['CCI']>sell_threshold
stk_df['CCI_signal'] = np.where(buy_CCI,1,np.nan)
stk_df.loc[sell_CCI,'CCI_signal'] = 0
stk_df['CCI_signal'] = stk_df.groupby('secID')['CCI_signal'].fillna(method='ffill')
In [29]:
stk_df['CCI_signal'].value_counts()
Out[29]:
0.0    946907
1.0    922819
Name: CCI_signal, dtype: int64
In [30]:
# CMO
buy_threshold = -50  # 买入阈值
sell_threshold = 50
stk_df['CMO'] = stk_df.groupby('secID')['closePrice'].apply(ta.CMO)
In [31]:
buy_CMO = stk_df['CMO'] < buy_threshold
sell_CMO = stk_df['CMO'] > sell_threshold
stk_df['CMO_signal'] = np.where(buy_CMO, 1, np.nan)
stk_df.loc[sell_CMO,'CMO_signal'] = 0
stk_df['CMO_signal'] = stk_df.groupby('secID')['CMO_signal'].fillna(method='ffill')
In [32]:
stk_df['CMO_signal'].value_counts()
Out[32]:
0.0    1115935
1.0     712274
Name: CMO_signal, dtype: int64
In [33]:
gc.collect()
Out[33]:
72
In [34]:
# DMI
stk_df['plus_DI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x: ta.PLUS_DI(high=x['highestPrice'], low=x['lowestPrice'], close=x['closePrice']).to_frame()).values
stk_df['minus_DI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x: ta.MINUS_DI(high=x['highestPrice'], low=x['lowestPrice'], close=x['closePrice']).to_frame()).values
stk_df['ADX'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x: ta.ADX(high=x['highestPrice'], low=x['lowestPrice'], close=x['closePrice']).to_frame()).values
In [35]:
buy_threshold = 50 
sell_threshold = 20
buy_DMI = (stk_df['ADX']>buy_threshold)&(stk_df['plus_DI']>stk_df['minus_DI'])
sell_DMI = (stk_df['ADX']<sell_threshold)|(stk_df['plus_DI']<stk_df['minus_DI'])
stk_df['DMI_signal'] = np.where(buy_DMI,1,np.nan)
stk_df.loc[sell_DMI,'DMI_signal'] = 0
stk_df['DMI_signal'] = stk_df.groupby('secID')['DMI_signal'].fillna(method='ffill')
In [36]:
stk_df['DMI_signal'].value_counts()
Out[36]:
0.0    1785627
1.0      71675
Name: DMI_signal, dtype: int64
In [37]:
# MFI
stk_df['MFI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice','turnoverVol']].apply(lambda x:ta.MFI(high=x['highestPrice'],low=x['lowestPrice'],close=x['closePrice'],volume=x['turnoverVol']).to_frame()).values
In [38]:
buy_threshold = 20  
sell_threshold = 80 
buy_MFI = stk_df['MFI']<buy_threshold
sell_MFI = stk_df['MFI']>sell_threshold
stk_df['MFI_signal'] = np.where(buy_MFI,1,np.nan)
stk_df.loc[sell_MFI,'MFI_signal'] = 0
stk_df['MFI_signal'] = stk_df.groupby('secID')['MFI_signal'].fillna(method='ffill')
In [39]:
stk_df['MFI_signal'].value_counts()
Out[39]:
0.0    1268015
1.0     577057
Name: MFI_signal, dtype: int64
In [40]:
# RSI
stk_df['RSI'] = stk_df.groupby('secID')['closePrice'].apply(ta.RSI)
buy_threshold = 30
sell_threshold = 70
buy_RSI = stk_df['RSI'] < buy_threshold
sell_RSI = stk_df['RSI'] > sell_threshold
stk_df['RSI_signal'] = np.where(buy_RSI,1,np.nan)
stk_df.loc[sell_RSI,'RSI_signal'] = 0
stk_df['RSI_signal'] = stk_df.groupby('secID')['RSI_signal'].fillna(method='ffill')
In [41]:
# %%time
# WVAD
# def wvad(df):
#     return sum((df[:,0] - df[:,1]) / (df[:,2] - df[:,3]) * df[:,4])

# stk_df.groupby('secID')[['closePrice','openPrice','highestPrice',
#                          'lowestPrice','turnoverVol']].apply(lambda x: x.rolling(24).apply(wvad,raw=False))
####### The above code is not working ##########

# temp = stk_df[stk_df['secID'].isin(np.random.choice(stk_df['secID'].unique(),10))].copy()
# def wvad(closePrice):
#     close = closePrice
#     open_ = stk_df.loc[closePrice.index, 'openPrice'].to_numpy()
#     high = stk_df.loc[closePrice.index, 'highestPrice'].to_numpy()
#     low = stk_df.loc[closePrice.index, 'lowestPrice'].to_numpy()
#     volume = stk_df.loc[closePrice.index, 'turnoverVol'].to_numpy()
#     return sum((closePrice - open_) / (high - low) * volume)
# temp2 = temp.groupby('secID')['closePrice'].rolling(24).apply(wvad, raw=False)
In [42]:
stk_df.reset_index(inplace=True, drop=True)
In [43]:
cols = ['secID','tradeDate','open_ret'] + [col for col in stk_df.columns if col[-6:]=='signal']

ret_df = stk_df[cols].copy()

ret_df['signal_date'] = ret_df['tradeDate']

ret_df[['tradeDate','open_ret']] = ret_df.groupby('secID')[['tradeDate','open_ret']].shift(-2)

ret_df.dropna(inplace=True)

ret_df.drop('MACD_rawsignal',axis=1,inplace=True)

ret_df['ret_sign'] = np.where(ret_df['open_ret']>0,1,-1)

ret_df
Out[43]:
secID tradeDate open_ret EM_signal MACD_signal OBV_signal AROON_signal BOLL_signal CCI_signal CMO_signal DMI_signal MFI_signal RSI_signal signal_date ret_sign
81 000002.XSHE 2007-05-16 -0.055868 1.0 1.0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-05-14 -1
82 000002.XSHE 2007-05-17 0.059722 1.0 1.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-05-15 1
83 000002.XSHE 2007-05-18 0.033187 1.0 1.0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-05-16 1
84 000002.XSHE 2007-05-21 -0.037576 1.0 1.0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-05-17 -1
85 000002.XSHE 2007-05-22 0.045341 1.0 1.0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-05-18 1
86 000002.XSHE 2007-05-23 0.047590 1.0 1.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-05-21 1
87 000002.XSHE 2007-05-24 0.062104 1.0 1.0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-05-22 1
88 000002.XSHE 2007-05-25 -0.016784 1.0 1.0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-05-23 -1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1884115 688786.XSHG 2023-03-22 -0.006118 1.0 0.0 1 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2023-03-20 -1
1884116 688786.XSHG 2023-03-23 0.002392 1.0 0.0 0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2023-03-21 1
1884117 688786.XSHG 2023-03-24 0.025579 1.0 1.0 1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2023-03-22 1
1884118 688786.XSHG 2023-03-27 -0.033846 1.0 1.0 1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2023-03-23 -1
1884119 688786.XSHG 2023-03-28 -0.021217 1.0 1.0 0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2023-03-24 -1
1884120 688786.XSHG 2023-03-29 -0.020870 0.0 0.0 0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2023-03-27 -1
1884121 688786.XSHG 2023-03-30 0.006878 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2023-03-28 1
1884122 688786.XSHG 2023-03-31 -0.017835 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2023-03-29 -1

1805093 rows × 15 columns

In [44]:
ret_df.sort_values('tradeDate',inplace=True)

ret_df.reset_index(inplace=True,drop=True)
In [45]:
ret_df
Out[45]:
secID tradeDate open_ret EM_signal MACD_signal OBV_signal AROON_signal BOLL_signal CCI_signal CMO_signal DMI_signal MFI_signal RSI_signal signal_date ret_sign
0 002089.XSHE 2007-03-01 0.063924 1.0 1.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-02-27 1
1 600355.XSHG 2007-03-01 0.110109 1.0 0.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1
2 600476.XSHG 2007-03-01 0.068561 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1
3 600489.XSHG 2007-03-01 0.087017 1.0 1.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-02-27 1
4 600306.XSHG 2007-03-01 0.056576 1.0 0.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1
5 600446.XSHG 2007-03-01 0.039526 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-02-27 1
6 600367.XSHG 2007-03-01 0.078061 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1
7 600850.XSHG 2007-03-01 0.090906 1.0 0.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1805085 300662.XSHE 2023-03-31 0.010190 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 1.0 1.0 2023-03-29 1
1805086 600256.XSHG 2023-03-31 0.029704 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 1.0 1.0 2023-03-29 1
1805087 603026.XSHG 2023-03-31 0.003213 0.0 1.0 1 1.0 0.0 1.0 1.0 0.0 1.0 1.0 2023-03-29 1
1805088 003022.XSHE 2023-03-31 -0.009392 0.0 0.0 1 1.0 0.0 1.0 0.0 0.0 1.0 1.0 2023-03-29 -1
1805089 000883.XSHE 2023-03-31 -0.002366 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 2023-03-29 -1
1805090 603833.XSHG 2023-03-31 0.026994 0.0 0.0 1 1.0 0.0 1.0 0.0 0.0 1.0 0.0 2023-03-29 1
1805091 600373.XSHG 2023-03-31 -0.046232 1.0 1.0 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2023-03-29 -1
1805092 688786.XSHG 2023-03-31 -0.017835 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2023-03-29 -1

1805093 rows × 15 columns

Random Forest

Train, test split

In [46]:
ret_df['year'] = ret_df['tradeDate'].dt.year
In [47]:
time_idx = [value for (key, value) in sorted(ret_df.groupby('year').groups.items())]
In [48]:
def list_flat(list_):
    return [item for sublist in list_ for item in sublist]
# This is the same as:
# def list_flat2(list_):
#     result = []
#     for sublist in list_:
#         for item in sublist:
#             result.append(item)
#     return result
In [49]:
list_flat([[1,2,3],[3,4,5]])
Out[49]:
[1, 2, 3, 3, 4, 5]
In [50]:
np.array([[1,2,3],[3,4,5]]).flatten()
Out[50]:
array([1, 2, 3, 3, 4, 5])
In [51]:
# training, validation, testing scheme:
# 1. [2007-2010], [2011-2014], [2015]
# 2. [2008-2011], [2012-2015], [2016]
# ...
# last. [2015-2018], [2019-2022], [2023]
fulltrain_idx = []
cv_idx = []
test_idx = []
for i in range(4,len(time_idx)-4):
    train_idx = list_flat(time_idx[i-4:i])
    val_idx = list_flat(time_idx[i:i+4])
    fulltrain_idx.append(train_idx + val_idx)
    cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0], 
                   np.where(np.isin(fulltrain_idx[-1], val_idx))[0])) # GridSearchCV 内部用 array 操作,不能带着pandas的index,
                                                                          # 因此cv_idx需要用fulltrain_idx的编号从0开始
    test_idx.append(time_idx[i+4])
In [52]:
len(fulltrain_idx)
Out[52]:
9
In [53]:
len(fulltrain_idx[0])
Out[53]:
611705
In [54]:
len(fulltrain_idx[1])
Out[54]:
673381
In [55]:
cv_idx
Out[55]:
[(array([     0,      1,      2, ..., 232926, 232927, 232928]),
  array([232929, 232930, 232931, ..., 611702, 611703, 611704])),
 (array([     0,      1,      2, ..., 278923, 278924, 278925]),
  array([278926, 278927, 278928, ..., 673378, 673379, 673380])),
 (array([     0,      1,      2, ..., 312439, 312440, 312441]),
  array([312442, 312443, 312444, ..., 725928, 725929, 725930])),
 (array([     0,      1,      2, ..., 347643, 347644, 347645]),
  array([347646, 347647, 347648, ..., 786923, 786924, 786925])),
 (array([     0,      1,      2, ..., 378773, 378774, 378775]),
  array([378776, 378777, 378778, ..., 857328, 857329, 857330])),
 (array([     0,      1,      2, ..., 394452, 394453, 394454]),
  array([394455, 394456, 394457, ..., 924559, 924560, 924561])),
 (array([     0,      1,      2, ..., 413486, 413487, 413488]),
  array([413489, 413490, 413491, ..., 987490, 987491, 987492])),
 (array([     0,      1,      2, ..., 439277, 439278, 439279]),
  array([ 439280,  439281,  439282, ..., 1057939, 1057940, 1057941])),
 (array([     0,      1,      2, ..., 478552, 478553, 478554]),
  array([ 478555,  478556,  478557, ..., 1144981, 1144982, 1144983]))]
In [56]:
test_years = list(range(2015, 2024))
test_years
Out[56]:
[2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
In [57]:
X_cols = [col for col in ret_df.columns if col[-6:]=='signal']
In [58]:
X_cols
Out[58]:
['EM_signal',
 'MACD_signal',
 'OBV_signal',
 'AROON_signal',
 'BOLL_signal',
 'CCI_signal',
 'CMO_signal',
 'DMI_signal',
 'MFI_signal',
 'RSI_signal']

Evaluation metrics

用 accuracy, f1_score 作为好坏评价的标准

  • accuracy: 正确数/总数。比如预测了100次,对了99次,则accuracy = 99%. $accuracy = \frac{TP+TN}{TP+TN+FP+FN}$
  • $precision = \frac{TP}{TP+FP}$, 在预测为Positive时,预测对的次数
  • $recall = \frac{TP}{TP+FN}$,在真实为Positive时,预测对的次数
  • $f1 = \frac{2}{\frac{1}{precision}+\frac{1}{recall}} $

Random forest

In [59]:
hyperparam_grid = [
    {'n_estimators': [50], 'max_depth': [3,5,7], 
     'max_features': [5,8]}
]
In [60]:
model = RandomForestClassifier()
In [61]:
# Cross validation for period 0, i.e.
# train: [2008-2011], val: [2012-2015], test: [2016]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]])
In [62]:
X_fulltrain = ret_df.loc[fulltrain_idx[0], X_cols]
y_fulltrain = ret_df.loc[fulltrain_idx[0], 'ret_sign']
X_test = ret_df.loc[test_idx[0], X_cols]
y_test = ret_df.loc[test_idx[0], 'ret_sign']
In [63]:
ret_df.loc[fulltrain_idx[0]]
Out[63]:
secID tradeDate open_ret EM_signal MACD_signal OBV_signal AROON_signal BOLL_signal CCI_signal CMO_signal DMI_signal MFI_signal RSI_signal signal_date ret_sign year
0 002089.XSHE 2007-03-01 0.063924 1.0 1.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-02-27 1 2007
1 600355.XSHG 2007-03-01 0.110109 1.0 0.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
2 600476.XSHG 2007-03-01 0.068561 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
3 600489.XSHG 2007-03-01 0.087017 1.0 1.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-02-27 1 2007
4 600306.XSHG 2007-03-01 0.056576 1.0 0.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
5 600446.XSHG 2007-03-01 0.039526 0.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2007-02-27 1 2007
6 600367.XSHG 2007-03-01 0.078061 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
7 600850.XSHG 2007-03-01 0.090906 1.0 0.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
611697 600992.XSHG 2014-12-31 -0.050900 1.0 1.0 1 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2014-12-29 -1 2014
611698 002556.XSHE 2014-12-31 0.000773 0.0 0.0 0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2014-12-29 1 2014
611699 002497.XSHE 2014-12-31 0.021491 0.0 0.0 1 1.0 0.0 1.0 0.0 0.0 1.0 0.0 2014-12-29 1 2014
611700 002167.XSHE 2014-12-31 -0.035023 0.0 0.0 0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 2014-12-29 -1 2014
611701 600312.XSHG 2014-12-31 -0.053678 1.0 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2014-12-29 -1 2014
611702 600513.XSHG 2014-12-31 0.014551 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 2014-12-29 1 2014
611703 002686.XSHE 2014-12-31 -0.027924 0.0 0.0 1 1.0 0.0 1.0 0.0 0.0 0.0 1.0 2014-12-29 -1 2014
611704 600192.XSHG 2014-12-31 -0.027934 0.0 0.0 0 1.0 0.0 1.0 1.0 0.0 0.0 1.0 2014-12-29 -1 2014

611705 rows × 16 columns

In [64]:
%%time
grid_search.fit(X_fulltrain, y_fulltrain)
CPU times: user 26.9 s, sys: 252 ms, total: 27.1 s
Wall time: 27.1 s
Out[64]:
GridSearchCV(cv=[(array([     0,      1,      2, ..., 232926, 232927, 232928]),
                  array([232929, 232930, 232931, ..., 611702, 611703, 611704]))],
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_depth': [3, 5, 7], 'max_features': [5, 8],
                          'n_estimators': [50]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
In [65]:
grid_search.best_params_
Out[65]:
{'max_depth': 3, 'max_features': 8, 'n_estimators': 50}
In [66]:
pd.DataFrame({"features":X_cols,"feature_importance":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',ascending=False)
Out[66]:
features feature_importance
1 MACD_signal 0.338913
2 OBV_signal 0.318480
0 EM_signal 0.099250
3 AROON_signal 0.087101
9 RSI_signal 0.069541
6 CMO_signal 0.048105
5 CCI_signal 0.034266
8 MFI_signal 0.004345
4 BOLL_signal 0.000000
7 DMI_signal 0.000000
In [67]:
y_pred = grid_search.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
0.5366211182618649
In [68]:
# Benchmark all-positive
all_ones = np.ones_like(y_pred)
accuracy_score(y_true=y_test, y_pred=all_ones)
Out[68]:
0.5544623045953354
In [69]:
y_pred = grid_search.predict(X_test)
print(f1_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=all_ones))
0.6231269852935172
0.7133814733959413
In [70]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = ret_df.loc[fulltrain_idx[i], X_cols]
    y_fulltrain = ret_df.loc[fulltrain_idx[i], 'ret_sign']
    X_test = ret_df.loc[test_idx[i], X_cols]
    y_test = ret_df.loc[test_idx[i], 'ret_sign']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]])
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    
    all_ones = np.ones_like(y_pred)
    print("Test year", test_years[i],"Benchmark","accuracy:",accuracy_score(y_true=y_test, y_pred=all_ones))
    print("Test year", test_years[i],"Model","accuracy:",accuracy_score(y_true=y_test, y_pred=y_pred))
    print("Test year", test_years[i],"Benchmark","f1:",f1_score(y_true=y_test, y_pred=all_ones))
    print("Test year", test_years[i],"Model","f1:",f1_score(y_true=y_test, y_pred=y_pred))
    print("====================我是分割线========================")
Test year 2015 Benchmark accuracy: 0.5544623045953354
Test year 2015 Model accuracy: 0.5366311583217036
Test year 2015 Benchmark f1: 0.7133814733959413
Test year 2015 Model f1: 0.623488717388112
====================我是分割线========================
Test year 2016 Benchmark accuracy: 0.4951232032854209
Test year 2016 Model accuracy: 0.5233041846633152
Test year 2016 Benchmark f1: 0.6623175965665236
Test year 2016 Model f1: 0.6244377967909964
====================我是分割线========================
Test year 2017 Benchmark accuracy: 0.47530937885480085
Test year 2017 Model accuracy: 0.5031872985555689
Test year 2017 Benchmark f1: 0.6443521415470924
Test year 2017 Model f1: 0.6112090280070002
====================我是分割线========================
Test year 2018 Benchmark accuracy: 0.4650108684032356
Test year 2018 Model accuracy: 0.49314043402344726
Test year 2018 Benchmark f1: 0.6348224145395815
Test year 2018 Model f1: 0.5884783187325687
====================我是分割线========================
Test year 2019 Benchmark accuracy: 0.4913895192288608
Test year 2019 Model accuracy: 0.49926895265062554
Test year 2019 Benchmark f1: 0.6589687172844544
Test year 2019 Model f1: 0.5386936143498852
====================我是分割线========================
Test year 2020 Benchmark accuracy: 0.4803768461445299
Test year 2020 Model accuracy: 0.5119931667952142
Test year 2020 Benchmark f1: 0.6489926499399336
Test year 2020 Model f1: 0.5384647845718867
====================我是分割线========================
Test year 2021 Benchmark accuracy: 0.4792000610640409
Test year 2021 Model accuracy: 0.5150927997275604
Test year 2021 Benchmark f1: 0.6479178492182258
Test year 2021 Model f1: 0.46911199393167996
====================我是分割线========================
Test year 2022 Benchmark accuracy: 0.46862538679937477
Test year 2022 Model accuracy: 0.5144245595006434
Test year 2022 Benchmark f1: 0.6381823316028412
Test year 2022 Model f1: 0.4546670488201012
====================我是分割线========================
Test year 2023 Benchmark accuracy: 0.4826460623089001
Test year 2023 Model accuracy: 0.5181803156763903
Test year 2023 Benchmark f1: 0.6510603907142658
Test year 2023 Model f1: 0.2271853668235138
====================我是分割线========================
CPU times: user 6min 56s, sys: 4.9 s, total: 7min 1s
Wall time: 7min

Return

In [71]:
len(test_years)
Out[71]:
9
In [72]:
i = 7
print(test_years[i])
X_fulltrain = ret_df.loc[fulltrain_idx[i], X_cols]
y_fulltrain = ret_df.loc[fulltrain_idx[i], 'ret_sign']
X_test = ret_df.loc[test_idx[i], X_cols]

grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]])
grid_search.fit(X_fulltrain, y_fulltrain)
rule = grid_search.predict(X=X_test)
2022
In [73]:
rule
Out[73]:
array([-1,  1, -1, ...,  1, -1,  1])
In [74]:
rule[rule==-1] = 0
In [75]:
rule
Out[75]:
array([0, 1, 0, ..., 1, 0, 1])
In [76]:
len(rule)
Out[76]:
188082
In [77]:
ret_df.loc[test_idx[i],'open_ret']
Out[77]:
1568607    0.001204
1568608   -0.004623
1568609    0.006788
1568610    0.006907
1568611    0.013603
1568612    0.017220
1568613    0.011306
1568614   -0.002069
             ...   
1756681    0.006943
1756682   -0.010061
1756683    0.001595
1756684   -0.007683
1756685    0.008315
1756686   -0.023044
1756687   -0.005321
1756688    0.086302
Name: open_ret, Length: 188082, dtype: float64
In [78]:
rule_ret = ret_df.loc[test_idx[i],'open_ret'].values * rule
In [79]:
ret_df_2022 = ret_df.loc[test_idx[i],['secID','tradeDate','open_ret']].copy()
ret_df_2022['rule_ret'] = rule_ret
In [80]:
ret_df_2022
Out[80]:
secID tradeDate open_ret rule_ret
1568607 300791.XSHE 2022-01-04 0.001204 0.000000
1568608 002636.XSHE 2022-01-04 -0.004623 -0.004623
1568609 600720.XSHG 2022-01-04 0.006788 0.000000
1568610 002677.XSHE 2022-01-04 0.006907 0.000000
1568611 300383.XSHE 2022-01-04 0.013603 0.000000
1568612 002693.XSHE 2022-01-04 0.017220 0.000000
1568613 002489.XSHE 2022-01-04 0.011306 0.000000
1568614 001218.XSHE 2022-01-04 -0.002069 -0.002069
... ... ... ... ...
1756681 301055.XSHE 2022-12-30 0.006943 0.006943
1756682 002686.XSHE 2022-12-30 -0.010061 -0.000000
1756683 603079.XSHG 2022-12-30 0.001595 0.001595
1756684 603895.XSHG 2022-12-30 -0.007683 -0.007683
1756685 003022.XSHE 2022-12-30 0.008315 0.000000
1756686 002892.XSHE 2022-12-30 -0.023044 -0.023044
1756687 601107.XSHG 2022-12-30 -0.005321 -0.000000
1756688 002701.XSHE 2022-12-30 0.086302 0.086302

188082 rows × 4 columns

In [81]:
ret_df_2022.sort_values(['secID','tradeDate'],inplace=True)
In [82]:
# time-series mean of daily return
rule_tsmean_ret_by_crs = ret_df_2022.groupby('secID')['rule_ret'].mean()
rule_tsmean_ret_by_crs
Out[82]:
secID
000002.XSHE   -0.000540
000008.XSHE   -0.000559
000009.XSHE   -0.001586
000025.XSHE   -0.000187
000030.XSHE   -0.000868
000038.XSHE   -0.002357
000056.XSHE   -0.002405
000062.XSHE   -0.000188
                 ...   
688625.XSHG   -0.000670
688628.XSHG   -0.000712
688659.XSHG    0.003131
688660.XSHG   -0.002257
688661.XSHG   -0.000788
688700.XSHG    0.001938
688776.XSHG    0.000317
688786.XSHG   -0.001114
Name: rule_ret, Length: 831, dtype: float64
In [83]:
rule_tsmean_ret_by_crs.dropna(inplace=True)
y = rule_tsmean_ret_by_crs.values
const = np.full(shape=len(y),fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HC0')
mean_values = reg.params[0]
t_values = reg.tvalues[0]
In [84]:
pd.DataFrame([mean_values,t_values],index=['ret_mean','t_values'],columns=['rule_daily_ret'])
Out[84]:
rule_daily_ret
ret_mean -0.000122
t_values -0.799003
In [85]:
ret_df_2022['1+rule_ret'] = ret_df_2022['rule_ret'] + 1
In [86]:
ret_df_2022['cum_rule_ret'] = ret_df_2022.groupby('secID')['1+rule_ret'].cumprod()
In [87]:
ret_df_2022['1+open_ret'] = ret_df_2022['open_ret'] + 1
ret_df_2022['cum_open_ret'] = ret_df_2022.groupby('secID')['1+open_ret'].cumprod()
In [88]:
ret_df_2022
Out[88]:
secID tradeDate open_ret rule_ret 1+rule_ret cum_rule_ret 1+open_ret cum_open_ret
1569312 000002.XSHE 2022-01-04 0.018286 0.000000 1.000000 1.000000 1.018286 1.018286
1569771 000002.XSHE 2022-01-05 0.048230 0.000000 1.000000 1.000000 1.048230 1.067398
1570410 000002.XSHE 2022-01-06 0.033774 0.000000 1.000000 1.000000 1.033774 1.103448
1571285 000002.XSHE 2022-01-07 0.012310 0.000000 1.000000 1.000000 1.012310 1.117032
1572079 000002.XSHE 2022-01-10 0.019645 0.000000 1.000000 1.000000 1.019645 1.138976
1572623 000002.XSHE 2022-01-11 0.014220 0.000000 1.000000 1.000000 1.014220 1.155172
1573370 000002.XSHE 2022-01-12 0.001809 0.000000 1.000000 1.000000 1.001809 1.157262
1573862 000002.XSHE 2022-01-13 -0.006772 -0.000000 1.000000 1.000000 0.993228 1.149425
... ... ... ... ... ... ... ... ...
1750218 688786.XSHG 2022-12-21 -0.002262 -0.000000 1.000000 0.723008 0.997738 1.007237
1751230 688786.XSHG 2022-12-22 -0.020498 -0.000000 1.000000 0.723008 0.979502 0.986590
1752398 688786.XSHG 2022-12-23 -0.037454 -0.000000 1.000000 0.723008 0.962546 0.949638
1752743 688786.XSHG 2022-12-26 0.006763 0.006763 1.006763 0.727898 1.006763 0.956061
1753560 688786.XSHG 2022-12-27 0.027131 0.027131 1.027131 0.747646 1.027131 0.981999
1754288 688786.XSHG 2022-12-28 -0.022891 -0.022891 0.977109 0.730532 0.977109 0.959521
1755284 688786.XSHG 2022-12-29 -0.017464 -0.017464 0.982536 0.717774 0.982536 0.942763
1756484 688786.XSHG 2022-12-30 0.018494 0.018494 1.018494 0.731048 1.018494 0.960199

188082 rows × 8 columns

In [89]:
final_ret_2022 = ret_df_2022.groupby('secID').last()
In [90]:
final_ret_2022.sort_values('cum_rule_ret')
Out[90]:
tradeDate open_ret rule_ret 1+rule_ret cum_rule_ret 1+open_ret cum_open_ret
secID
002464.XSHE 2022-06-27 -0.038462 -0.038462 0.961538 0.097306 0.961538 0.097306
600091.XSHG 2022-06-15 -0.026223 -0.026223 0.973777 0.104188 0.973777 0.100244
600695.XSHG 2022-06-07 0.041337 0.041337 1.041337 0.141306 1.041337 0.141306
600652.XSHG 2022-06-15 -0.107438 -0.000000 1.000000 0.257113 0.892562 0.288000
688608.XSHG 2022-12-30 0.005591 0.005591 1.005591 0.459943 1.005591 0.605369
000056.XSHE 2022-12-30 0.004424 0.004424 1.004424 0.490503 1.004424 0.857958
300482.XSHE 2022-12-30 -0.023919 -0.023919 0.976081 0.496035 0.976081 0.826127
300589.XSHE 2022-12-30 0.020557 0.020557 1.020557 0.496537 1.020557 0.740127
... ... ... ... ... ... ... ...
300351.XSHE 2022-12-30 -0.026662 -0.026662 0.973338 1.827223 0.973338 1.058658
601975.XSHG 2022-12-30 0.015666 0.015666 1.015666 1.885278 1.015666 1.945000
003029.XSHE 2022-12-30 0.007866 0.000000 1.000000 1.922016 1.007866 1.645743
600387.XSHG 2022-12-30 -0.011278 -0.000000 1.000000 1.923154 0.988722 1.290026
300071.XSHE 2022-12-30 -0.015997 -0.000000 1.000000 1.970213 0.984003 1.873131
000411.XSHE 2022-12-30 0.003396 0.003396 1.003396 2.294187 1.003396 0.975191
600188.XSHG 2022-12-30 -0.007548 -0.007548 0.992452 2.397460 0.992452 1.550306
002219.XSHE 2022-12-30 -0.005022 -0.000000 1.000000 3.316126 0.994978 2.954720

831 rows × 7 columns

In [91]:
best_sec = final_ret_2022.sort_values('cum_rule_ret').index[-5:]
In [92]:
ret_df_2022.loc[ret_df_2022['secID'].isin(best_sec)].pivot(index='tradeDate',columns='secID',values='cum_rule_ret').plot()
Out[92]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f46bda9eed0>
In [93]:
ret_df_2022.loc[ret_df_2022['secID'].isin(best_sec)].pivot(index='tradeDate',columns='secID',values='cum_open_ret').plot()
Out[93]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f46bdc85190>
In [94]:
final_ret_2022.sort_values('cum_rule_ret', inplace=True)
worst_sec = final_ret_2022.loc[final_ret_2022['tradeDate']=='2022-12-30'].index[0:10]
In [95]:
final_ret_2022
Out[95]:
tradeDate open_ret rule_ret 1+rule_ret cum_rule_ret 1+open_ret cum_open_ret
secID
002464.XSHE 2022-06-27 -0.038462 -0.038462 0.961538 0.097306 0.961538 0.097306
600091.XSHG 2022-06-15 -0.026223 -0.026223 0.973777 0.104188 0.973777 0.100244
600695.XSHG 2022-06-07 0.041337 0.041337 1.041337 0.141306 1.041337 0.141306
600652.XSHG 2022-06-15 -0.107438 -0.000000 1.000000 0.257113 0.892562 0.288000
688608.XSHG 2022-12-30 0.005591 0.005591 1.005591 0.459943 1.005591 0.605369
000056.XSHE 2022-12-30 0.004424 0.004424 1.004424 0.490503 1.004424 0.857958
300482.XSHE 2022-12-30 -0.023919 -0.023919 0.976081 0.496035 0.976081 0.826127
300589.XSHE 2022-12-30 0.020557 0.020557 1.020557 0.496537 1.020557 0.740127
... ... ... ... ... ... ... ...
300351.XSHE 2022-12-30 -0.026662 -0.026662 0.973338 1.827223 0.973338 1.058658
601975.XSHG 2022-12-30 0.015666 0.015666 1.015666 1.885278 1.015666 1.945000
003029.XSHE 2022-12-30 0.007866 0.000000 1.000000 1.922016 1.007866 1.645743
600387.XSHG 2022-12-30 -0.011278 -0.000000 1.000000 1.923154 0.988722 1.290026
300071.XSHE 2022-12-30 -0.015997 -0.000000 1.000000 1.970213 0.984003 1.873131
000411.XSHE 2022-12-30 0.003396 0.003396 1.003396 2.294187 1.003396 0.975191
600188.XSHG 2022-12-30 -0.007548 -0.007548 0.992452 2.397460 0.992452 1.550306
002219.XSHE 2022-12-30 -0.005022 -0.000000 1.000000 3.316126 0.994978 2.954720

831 rows × 7 columns

In [96]:
worst_sec
Out[96]:
Index(['688608.XSHG', '000056.XSHE', '300482.XSHE', '300589.XSHE',
       '300543.XSHE', '688339.XSHG', '603176.XSHG', '300545.XSHE',
       '300977.XSHE', '601908.XSHG'],
      dtype='object', name='secID')
In [97]:
ret_df_2022.loc[ret_df_2022['secID'].isin(worst_sec)].pivot(index='tradeDate',columns='secID',values='cum_rule_ret').plot()
Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f46bda5ea10>
In [98]:
ret_df_2022.loc[ret_df_2022['secID'].isin(worst_sec)].pivot(index='tradeDate',columns='secID',values='cum_open_ret').plot()
Out[98]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f46bda5e790>
In [ ]: