In [201]:
import pandas as pd
import numpy as np
import tqdm
import gc
import matplotlib.pyplot as plt
import talib as ta
pd.set_option('display.max_rows', 16)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
In [202]:
plt.rcParams['figure.figsize'] = (16.0, 9.0)

Data

In [203]:
START = '20070101'
END = '20221231'
In [204]:
# Security Id
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
cond1 = (stk_info['exchangeCD'] == 'XSHE') | (stk_info['exchangeCD'] == 'XSHG')
cond2 = (stk_info['listStatusCD'] == 'L') | (stk_info['listStatusCD'] == 'DE')
stk_info = stk_info[cond1 & cond2].copy()
stk_id = stk_info['secID']
# ST
st_df = DataAPI.SecSTGet(beginDate=START,endDate=END,secID=stk_id,field=['secID','tradeDate','STflg'],pandas="1")
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'],format="%Y-%m-%d")
In [205]:
# %%time
# stk_df = DataAPI.MktEqudAdjAfGet(secID=stk_id,beginDate=START,endDate=END,isOpen=1,
#                                            field=["secID","tradeDate",
#                                                   'preClosePrice',"closePrice",
#                                                   'openPrice','highestPrice','lowestPrice',
#                                                   "negMarketValue",
#                                                   "turnoverValue",'turnoverVol'],pandas="1")
# stk_df.to_pickle('./data/stk_df.pkl')
# # Takes about 6 mins
In [206]:
stk_df = pd.read_pickle('./data/stk_df.pkl')
In [207]:
stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
stk_df.sort_values(['secID','tradeDate'],inplace=True)
# drop ST stocks
print(stk_df.shape)
stk_df = pd.merge(stk_df, st_df, on=['secID','tradeDate'],how='left')
stk_df = stk_df[stk_df['STflg'].isna()].copy()
stk_df.drop('STflg',axis=1,inplace=True)
print(stk_df.shape)
(10019767, 10)
(9618086, 10)

不填充停牌值比较合理,因为技术分析只看量价,直接计算量价关系较为合适

In [208]:
random_stkid = np.random.choice(stk_df['secID'].unique(),1000)

stk_df = stk_df[stk_df['secID'].isin(random_stkid)].copy()
In [209]:
stk_df['open_ret'] = stk_df.groupby('secID')['openPrice'].apply(lambda x: x / x.shift() - 1)
stk_df['close_ret'] = stk_df['closePrice']/stk_df['preClosePrice'] - 1
In [210]:
def rule_return(df, demean=True, open_ret=True):
    """
    df should contain these columns:
        signal: the signal generated by the rule
        close_ret: return calculated by close price
        open_ret: return calculated by open price
    close_ret_demean is demeaned return of close_ret, i.e. close_ret - close_ret.mean.
    open_ret_demean is similarly defined. The use of demeaned return series is to adjust the
    bias created by bullish or bearish markets.
    """
    df['close_ret_demean'] = df['close_ret'] - df['close_ret'].mean()
    df['open_ret_demean'] = df['open_ret'] - df['open_ret'].mean()
    df['position_close'] = df['signal']
    df['position_open'] = df['signal'].shift()
    df['position_close_ret'] = df['position_close'].shift() * df['close_ret']
    df['position_open_ret'] = df['position_open'].shift() * df['open_ret']
    df['position_close_ret_demean'] = df['position_close'].shift() * df['close_ret_demean']
    df['position_open_ret_demean'] = df['position_open'].shift() * df['open_ret_demean']
    df['close_cumret'] = (df['position_close_ret']+1).cumprod()
    df['open_cumret'] = (df['position_open_ret']+1).cumprod()
    if open_ret:
        if demean:
            return pd.DataFrame({'position_open_ret_demean':df['position_open_ret_demean'].values, 
                                 'open_cumret':df['open_cumret'].values})
        else:
            return pd.DataFrame({'position_open_ret':df['position_open_ret'].values, 
                                 'open_cumret':df['open_cumret'].values})
    else:
        if demean:
            return pd.DataFrame({'position_close_ret_demean':df['position_close_ret_demean'].values, 
                                 'close_cumret':df['close_cumret'].values})
        else:
            return pd.DataFrame({'position_close_ret':df['position_close_ret'].values, 
                                 'close_cumret':df['close_cumret'].values})

A bunch of TA signals

In [211]:
# EMA
stk_df['EMA'] = stk_df.groupby('secID')['closePrice'].apply(ta.EMA, 20)
stk_df['EM_signal'] = np.where(stk_df['closePrice']>stk_df['EMA'],1,np.nan)
stk_df.loc[stk_df['closePrice']<=stk_df['EMA'],'EM_signal'] = 0
stk_df['EM_signal'] = stk_df.groupby('secID')['EM_signal'].fillna(method='ffill')
In [212]:
stk_df['EM_signal'].value_counts()
Out[212]:
0.0    878997
1.0    835361
Name: EM_signal, dtype: int64
In [213]:
# MACD
stk_df['MACD'] = stk_df.groupby('secID')['closePrice'].apply(lambda x: ta.MACD(x)[0])
stk_df['MACD_rawsignal'] = stk_df.groupby('secID')['closePrice'].apply(lambda x: ta.MACD(x)[1])
stk_df['MACD_signal'] = np.where(stk_df['MACD']>stk_df['MACD_rawsignal'],1,np.nan)
stk_df.loc[stk_df['MACD']<=stk_df['MACD_rawsignal'],'MACD_signal'] = 0
stk_df['MACD_signal'] = stk_df.groupby('secID')['MACD_signal'].fillna(method='ffill')
In [214]:
stk_df['MACD_signal'].value_counts()
Out[214]:
1.0    893456
0.0    808470
Name: MACD_signal, dtype: int64
In [215]:
# # Example
# temp = stk_df[stk_df['secID']=='900957.XSHG'].copy()
# ta.OBV(temp['closePrice'],temp['turnoverValue'])
# stk_df.loc[9968491:]
In [216]:
# OBV
stk_df['OBV'] = stk_df.groupby('secID')[['closePrice','turnoverVol']].apply(lambda x: ta.OBV(real=x['closePrice'],volume=x['turnoverVol']).to_frame('OBV')).values
stk_df.groupby('secID')['OBV'].apply(lambda x: x/x.shift()-1)
stk_df['OBV_change'] = stk_df.groupby('secID')['OBV'].apply(lambda x: x/x.shift()-1)
buy1_OBV = (stk_df['close_ret']<0)&(stk_df['OBV_change']>0)
buy2_OBV = (stk_df['close_ret']>0)&(stk_df['OBV_change']>0)
sell1_OBV = (stk_df['close_ret']>0)&(stk_df['OBV_change']<0)
sell2_OBV = (stk_df['close_ret']<0)&(stk_df['OBV_change']<0)
stk_df['OBV_signal'] = 0
stk_df.loc[buy1_OBV | buy2_OBV,'OBV_signal']=1
In [217]:
stk_df['OBV_signal'].value_counts()
Out[217]:
0    869554
1    861756
Name: OBV_signal, dtype: int64
In [218]:
# AROON
N = 26
high_threshold = 70  # 强趋势
low_threshold = 30  # 弱趋势

stk_df['AROON_up'] = stk_df.groupby('secID')[['highestPrice','lowestPrice']].apply(lambda x:ta.AROON(high=x['highestPrice'],low=x['lowestPrice'])[0].to_frame('AROON_up')).values
stk_df['AROON_down'] = stk_df.groupby('secID')[['highestPrice','lowestPrice']].apply(lambda x:ta.AROON(high=x['highestPrice'],low=x['lowestPrice'])[1].to_frame('AROON_down')).values
buy_AROON = (stk_df['AROON_up']>high_threshold)&(stk_df['AROON_down']<low_threshold)
sell_AROON = (stk_df['AROON_down']>high_threshold)&(stk_df['AROON_up']<low_threshold)
stk_df['AROON_signal'] = np.where(buy_AROON,1,np.nan)
stk_df.loc[sell_AROON,'AROON_signal'] = 0
stk_df['AROON_signal'] = stk_df.groupby('secID')['AROON_signal'].fillna(method='ffill')
In [219]:
stk_df['AROON_signal'].value_counts()
Out[219]:
0.0    880619
1.0    836843
Name: AROON_signal, dtype: int64
In [220]:
# BOLL
stk_df['BOLL_up'] = stk_df.groupby('secID')[['closePrice']].apply(lambda x:ta.BBANDS(real=x['closePrice'])[0].to_frame('BOLL_up')).values
stk_df['BOLL_mid'] = stk_df.groupby('secID')[['closePrice']].apply(lambda x:ta.BBANDS(real=x['closePrice'])[1].to_frame('BOLL_mid')).values
stk_df['BOLL_low'] = stk_df.groupby('secID')[['closePrice']].apply(lambda x:ta.BBANDS(real=x['closePrice'])[2].to_frame('BOLL_low')).values
stk_df['BOLL_up_lag'] = stk_df.groupby('secID')['BOLL_up'].shift()
stk_df['BOLL_mid_lag'] = stk_df.groupby('secID')['BOLL_mid'].shift()
stk_df['BOLL_low_lag'] = stk_df.groupby('secID')['BOLL_low'].shift()

buy_BOLL = (stk_df['preClosePrice'] < stk_df['BOLL_up_lag'])&(stk_df['closePrice']>stk_df['BOLL_up'])
sell_BOLL = (stk_df['preClosePrice'] > stk_df['BOLL_mid_lag'])&(stk_df['closePrice']<stk_df['BOLL_mid'])
stk_df['BOLL_signal'] = np.where(buy_BOLL, 1, np.nan)
stk_df.loc[sell_BOLL,'BOLL_signal'] = 0
stk_df['BOLL_signal'] = stk_df.groupby('secID')['BOLL_signal'].fillna(method='ffill')
In [221]:
stk_df['BOLL_signal'].value_counts()
Out[221]:
0.0    1721544
1.0        243
Name: BOLL_signal, dtype: int64
In [222]:
# CCI
buy_threshold = -100  # 买入阈值
sell_threshold = 100  # 卖出阈值
stk_df['CCI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x:ta.CCI(high=x['highestPrice'],low=x['lowestPrice'],close=x['closePrice']).to_frame('CCI')).values
In [223]:
buy_CCI = stk_df['CCI']<buy_threshold
sell_CCI = stk_df['CCI']>sell_threshold
stk_df['CCI_signal'] = np.where(buy_CCI,1,np.nan)
stk_df.loc[sell_CCI,'CCI_signal'] = 0
stk_df['CCI_signal'] = stk_df.groupby('secID')['CCI_signal'].fillna(method='ffill')
In [224]:
stk_df['CCI_signal'].value_counts()
Out[224]:
0.0    866663
1.0    850144
Name: CCI_signal, dtype: int64
In [225]:
# CMO
buy_threshold = -50  # 买入阈值
sell_threshold = 50
stk_df['CMO'] = stk_df.groupby('secID')['closePrice'].apply(ta.CMO)
In [226]:
buy_CMO = stk_df['CMO'] < buy_threshold
sell_CMO = stk_df['CMO'] > sell_threshold
stk_df['CMO_signal'] = np.where(buy_CMO, 1, np.nan)
stk_df.loc[sell_CMO,'CMO_signal'] = 0
stk_df['CMO_signal'] = stk_df.groupby('secID')['CMO_signal'].fillna(method='ffill')
In [227]:
stk_df['CMO_signal'].value_counts()
Out[227]:
0.0    1020933
1.0     658627
Name: CMO_signal, dtype: int64
In [228]:
gc.collect()
Out[228]:
86
In [229]:
# DMI
stk_df['plus_DI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x: ta.PLUS_DI(high=x['highestPrice'], low=x['lowestPrice'], close=x['closePrice']).to_frame()).values
stk_df['minus_DI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x: ta.MINUS_DI(high=x['highestPrice'], low=x['lowestPrice'], close=x['closePrice']).to_frame()).values
stk_df['ADX'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x: ta.ADX(high=x['highestPrice'], low=x['lowestPrice'], close=x['closePrice']).to_frame()).values
In [230]:
buy_threshold = 50 
sell_threshold = 20
buy_DMI = (stk_df['ADX']>buy_threshold)&(stk_df['plus_DI']>stk_df['minus_DI'])
sell_DMI = (stk_df['ADX']<sell_threshold)|(stk_df['plus_DI']<stk_df['minus_DI'])
stk_df['DMI_signal'] = np.where(buy_DMI,1,np.nan)
stk_df.loc[sell_DMI,'DMI_signal'] = 0
stk_df['DMI_signal'] = stk_df.groupby('secID')['DMI_signal'].fillna(method='ffill')
In [231]:
stk_df['DMI_signal'].value_counts()
Out[231]:
0.0    1637030
1.0      66483
Name: DMI_signal, dtype: int64
In [232]:
# MFI
stk_df['MFI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice','turnoverVol']].apply(lambda x:ta.MFI(high=x['highestPrice'],low=x['lowestPrice'],close=x['closePrice'],volume=x['turnoverVol']).to_frame()).values
In [233]:
buy_threshold = 20  
sell_threshold = 80 
buy_MFI = stk_df['MFI']<buy_threshold
sell_MFI = stk_df['MFI']>sell_threshold
stk_df['MFI_signal'] = np.where(buy_MFI,1,np.nan)
stk_df.loc[sell_MFI,'MFI_signal'] = 0
stk_df['MFI_signal'] = stk_df.groupby('secID')['MFI_signal'].fillna(method='ffill')
In [234]:
stk_df['MFI_signal'].value_counts()
Out[234]:
0.0    1161754
1.0     531563
Name: MFI_signal, dtype: int64
In [235]:
# RSI
stk_df['RSI'] = stk_df.groupby('secID')['closePrice'].apply(ta.RSI)
buy_threshold = 30
sell_threshold = 70
buy_RSI = stk_df['RSI'] < buy_threshold
sell_RSI = stk_df['RSI'] < sell_threshold
stk_df['RSI_signal'] = np.where(buy_RSI,1,np.nan)
stk_df.loc[sell_RSI,'RSI_signal'] = 0
stk_df['RSI_signal'] = stk_df.groupby('secID')['RSI_signal'].fillna(method='ffill')
In [236]:
# %%time
# WVAD
# def wvad(df):
#     return sum((df[:,0] - df[:,1]) / (df[:,2] - df[:,3]) * df[:,4])

# stk_df.groupby('secID')[['closePrice','openPrice','highestPrice',
#                          'lowestPrice','turnoverVol']].apply(lambda x: x.rolling(24).apply(wvad,raw=False))
######## The above code is not working ##########

# temp = stk_df[stk_df['secID'].isin(np.random.choice(stk_df['secID'].unique(),10))].copy()
# def wvad(closePrice):
#     close = closePrice
#     open_ = stk_df.loc[closePrice.index, 'openPrice'].to_numpy()
#     high = stk_df.loc[closePrice.index, 'highestPrice'].to_numpy()
#     low = stk_df.loc[closePrice.index, 'lowestPrice'].to_numpy()
#     volume = stk_df.loc[closePrice.index, 'turnoverVol'].to_numpy()
#     return sum((closePrice - open_) / (high - low) * volume)
# temp2 = temp.groupby('secID')['closePrice'].rolling(24).apply(wvad, raw=False)
In [237]:
stk_df.reset_index(inplace=True, drop=True)
In [254]:
cols = ['secID','tradeDate','open_ret'] + [col for col in stk_df.columns if col[-6:]=='signal']

ret_df = stk_df[cols].copy()

ret_df['signal_date'] = ret_df['tradeDate']

ret_df[['tradeDate','open_ret']] = ret_df.groupby('secID')[['tradeDate','open_ret']].shift(-2)

ret_df.dropna(inplace=True)

ret_df.drop('MACD_rawsignal',axis=1,inplace=True)

ret_df['ret_sign'] = np.where(ret_df['open_ret']>0,1,-1)

ret_df
Out[254]:
secID tradeDate open_ret EM_signal MACD_signal OBV_signal AROON_signal BOLL_signal CCI_signal CMO_signal DMI_signal MFI_signal RSI_signal signal_date ret_sign
72 000016.XSHE 2007-04-25 0.027783 1.0 1.0 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-04-23 1
73 000016.XSHE 2007-04-26 -0.020546 1.0 1.0 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-04-24 -1
74 000016.XSHE 2007-04-27 -0.003311 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-04-25 -1
75 000016.XSHE 2007-04-30 -0.033220 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-04-26 -1
76 000016.XSHE 2007-05-08 0.046975 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-04-27 1
77 000016.XSHE 2007-05-09 0.031718 1.0 1.0 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-04-30 1
78 000016.XSHE 2007-05-10 -0.023314 1.0 1.0 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-05-08 -1
79 000016.XSHE 2007-05-11 0.015192 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-05-09 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1731300 900955.XSHG 2020-04-20 0.017639 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2020-04-16 1
1731301 900955.XSHG 2020-04-21 -0.041333 1.0 0.0 1 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2020-04-17 -1
1731302 900955.XSHG 2020-04-22 -0.052851 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2020-04-20 -1
1731303 900955.XSHG 2020-04-23 0.000000 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2020-04-21 -1
1731304 900955.XSHG 2020-04-24 0.004405 0.0 0.0 1 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2020-04-22 1
1731305 900955.XSHG 2020-04-27 -0.014620 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2020-04-23 -1
1731306 900955.XSHG 2020-04-28 -0.154303 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2020-04-24 -1
1731307 900955.XSHG 2020-04-29 -0.024561 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2020-04-27 -1

1655780 rows × 15 columns

In [255]:
ret_df.sort_values('tradeDate',inplace=True)

ret_df.reset_index(inplace=True,drop=True)
In [256]:
ret_df
Out[256]:
secID tradeDate open_ret EM_signal MACD_signal OBV_signal AROON_signal BOLL_signal CCI_signal CMO_signal DMI_signal MFI_signal RSI_signal signal_date ret_sign
0 000997.XSHE 2007-03-01 0.042649 1.0 0.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1
1 600831.XSHG 2007-03-01 0.020545 0.0 0.0 0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 2007-02-27 1
2 600162.XSHG 2007-03-01 0.048594 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1
3 600099.XSHG 2007-03-01 0.009170 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1
4 000667.XSHE 2007-03-01 0.171178 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1
5 600058.XSHG 2007-03-01 0.066308 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1
6 600055.XSHG 2007-03-01 0.017825 1.0 0.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1
7 600223.XSHG 2007-03-01 -0.011986 1.0 1.0 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 -1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1655772 300600.XSHE 2022-04-11 -0.028164 0.0 1.0 0 1.0 0.0 1.0 1.0 0.0 1.0 0.0 2022-04-07 -1
1655773 603595.XSHG 2022-04-11 0.004298 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 2022-04-07 1
1655774 300605.XSHE 2022-04-11 -0.035209 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2022-04-07 -1
1655775 600977.XSHG 2022-04-11 -0.023079 0.0 1.0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2022-04-07 -1
1655776 000566.XSHE 2022-04-11 -0.023436 1.0 1.0 0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 2022-04-07 -1
1655777 002247.XSHE 2022-04-11 -0.028664 1.0 1.0 0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2022-04-07 -1
1655778 002264.XSHE 2022-04-11 0.014036 1.0 1.0 0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 2022-04-07 1
1655779 603619.XSHG 2022-04-11 -0.023817 0.0 0.0 0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2022-04-07 -1

1655780 rows × 15 columns

Random Forest

Train, test split

In [257]:
ret_df['year'] = ret_df['tradeDate'].dt.year
In [258]:
time_idx = [value for (key, value) in sorted(ret_df.groupby('year').groups.items())]
In [259]:
time_idx
Out[259]:
[Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                 9,
             ...
             38047, 38048, 38049, 38050, 38051, 38052, 38053, 38054, 38055,
             38056],
            dtype='int64', length=38057),
 Int64Index([38057, 38058, 38059, 38060, 38061, 38062, 38063, 38064, 38065,
             38066,
             ...
             97910, 97911, 97912, 97913, 97914, 97915, 97916, 97917, 97918,
             97919],
            dtype='int64', length=59863),
 Int64Index([ 97920,  97921,  97922,  97923,  97924,  97925,  97926,  97927,
              97928,  97929,
             ...
             160530, 160531, 160532, 160533, 160534, 160535, 160536, 160537,
             160538, 160539],
            dtype='int64', length=62620),
 Int64Index([160540, 160541, 160542, 160543, 160544, 160545, 160546, 160547,
             160548, 160549,
             ...
             227555, 227556, 227557, 227558, 227559, 227560, 227561, 227562,
             227563, 227564],
            dtype='int64', length=67025),
 Int64Index([227565, 227566, 227567, 227568, 227569, 227570, 227571, 227572,
             227573, 227574,
             ...
             307810, 307811, 307812, 307813, 307814, 307815, 307816, 307817,
             307818, 307819],
            dtype='int64', length=80255),
 Int64Index([307820, 307821, 307822, 307823, 307824, 307825, 307826, 307827,
             307828, 307829,
             ...
             401506, 401507, 401508, 401509, 401510, 401511, 401512, 401513,
             401514, 401515],
            dtype='int64', length=93696),
 Int64Index([401516, 401517, 401518, 401519, 401520, 401521, 401522, 401523,
             401524, 401525,
             ...
             499957, 499958, 499959, 499960, 499961, 499962, 499963, 499964,
             499965, 499966],
            dtype='int64', length=98451),
 Int64Index([499967, 499968, 499969, 499970, 499971, 499972, 499973, 499974,
             499975, 499976,
             ...
             600980, 600981, 600982, 600983, 600984, 600985, 600986, 600987,
             600988, 600989],
            dtype='int64', length=101023),
 Int64Index([600990, 600991, 600992, 600993, 600994, 600995, 600996, 600997,
             600998, 600999,
             ...
             702686, 702687, 702688, 702689, 702690, 702691, 702692, 702693,
             702694, 702695],
            dtype='int64', length=101706),
 Int64Index([702696, 702697, 702698, 702699, 702700, 702701, 702702, 702703,
             702704, 702705,
             ...
             815447, 815448, 815449, 815450, 815451, 815452, 815453, 815454,
             815455, 815456],
            dtype='int64', length=112761),
 Int64Index([815457, 815458, 815459, 815460, 815461, 815462, 815463, 815464,
             815465, 815466,
             ...
             944537, 944538, 944539, 944540, 944541, 944542, 944543, 944544,
             944545, 944546],
            dtype='int64', length=129090),
 Int64Index([ 944547,  944548,  944549,  944550,  944551,  944552,  944553,
              944554,  944555,  944556,
             ...
             1091347, 1091348, 1091349, 1091350, 1091351, 1091352, 1091353,
             1091354, 1091355, 1091356],
            dtype='int64', length=146810),
 Int64Index([1091357, 1091358, 1091359, 1091360, 1091361, 1091362, 1091363,
             1091364, 1091365, 1091366,
             ...
             1250873, 1250874, 1250875, 1250876, 1250877, 1250878, 1250879,
             1250880, 1250881, 1250882],
            dtype='int64', length=159526),
 Int64Index([1250883, 1250884, 1250885, 1250886, 1250887, 1250888, 1250889,
             1250890, 1250891, 1250892,
             ...
             1418480, 1418481, 1418482, 1418483, 1418484, 1418485, 1418486,
             1418487, 1418488, 1418489],
            dtype='int64', length=167607),
 Int64Index([1418490, 1418491, 1418492, 1418493, 1418494, 1418495, 1418496,
             1418497, 1418498, 1418499,
             ...
             1604588, 1604589, 1604590, 1604591, 1604592, 1604593, 1604594,
             1604595, 1604596, 1604597],
            dtype='int64', length=186108),
 Int64Index([1604598, 1604599, 1604600, 1604601, 1604602, 1604603, 1604604,
             1604605, 1604606, 1604607,
             ...
             1655770, 1655771, 1655772, 1655773, 1655774, 1655775, 1655776,
             1655777, 1655778, 1655779],
            dtype='int64', length=51182)]
In [260]:
def list_flat(list_):
    return [item for sublist in list_ for item in sublist]
# This is the same as:
# def list_flat2(list_):
#     result = []
#     for sublist in list_:
#         for item in sublist:
#             result.append(item)
#     return result
In [261]:
list_flat([[1,2,3],[3,4,5]])
Out[261]:
[1, 2, 3, 3, 4, 5]
In [262]:
# training, validation, testing scheme:
# 1. [2007-2010], [2011-2014], [2015]
# 2. [2007-2011], [2012-2015], [2016]
# ...
# last. [2008-2017], [2018-2021], [2022]
fulltrain_idx = []
cv_idx = []
test_idx = []
for i in range(4,len(time_idx)-4):
    train_idx = list_flat(time_idx[0:i])
    val_idx = list_flat(time_idx[i:i+4])
    fulltrain_idx.append(train_idx + val_idx)
    cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0], 
                   np.where(np.isin(fulltrain_idx[-1], val_idx))[0])) # GridSearchCV 内部用 array 操作,不能带着pandas的index,
                                                                          # 因此cv_idx需要用fulltrain_idx的编号从0开始
    test_idx.append(time_idx[i+4])
In [263]:
cv_idx[0]
Out[263]:
(array([     0,      1,      2, ..., 227562, 227563, 227564]),
 array([227565, 227566, 227567, ..., 600987, 600988, 600989]))
In [264]:
test_years = list(range(2015, 2023))
test_years
Out[264]:
[2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
In [265]:
X_cols = [col for col in ret_df.columns if col[-6:]=='signal']
In [266]:
X_cols
Out[266]:
['EM_signal',
 'MACD_signal',
 'OBV_signal',
 'AROON_signal',
 'BOLL_signal',
 'CCI_signal',
 'CMO_signal',
 'DMI_signal',
 'MFI_signal',
 'RSI_signal']

Evaluation metrics

用 accuracy, f1_score 作为好坏评价的标准

Random forest

In [267]:
hyperparam_grid = [
    {'n_estimators': [50], 'max_depth': [1,3,5], 
     'max_features': [3,5,len(X_cols)]}
]
In [268]:
model = RandomForestClassifier()
In [269]:
# Cross validation for period 0, i.e.
# train: [2008-2011], val: [2012-2015], test: [2016]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]])
In [270]:
X_fulltrain = ret_df.loc[fulltrain_idx[0], X_cols]
y_fulltrain = ret_df.loc[fulltrain_idx[0], 'ret_sign']
X_test = ret_df.loc[test_idx[0], X_cols]
y_test = ret_df.loc[test_idx[0], 'ret_sign']
In [271]:
ret_df.loc[fulltrain_idx[0]]
Out[271]:
secID tradeDate open_ret EM_signal MACD_signal OBV_signal AROON_signal BOLL_signal CCI_signal CMO_signal DMI_signal MFI_signal RSI_signal signal_date ret_sign year
0 000997.XSHE 2007-03-01 0.042649 1.0 0.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
1 600831.XSHG 2007-03-01 0.020545 0.0 0.0 0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
2 600162.XSHG 2007-03-01 0.048594 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
3 600099.XSHG 2007-03-01 0.009170 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
4 000667.XSHE 2007-03-01 0.171178 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
5 600058.XSHG 2007-03-01 0.066308 1.0 1.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
6 600055.XSHG 2007-03-01 0.017825 1.0 0.0 0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 1 2007
7 600223.XSHG 2007-03-01 -0.011986 1.0 1.0 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2007-02-27 -1 2007
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
600982 900941.XSHG 2014-12-31 -0.010254 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2014-12-29 -1 2014
600983 600528.XSHG 2014-12-31 0.016396 1.0 1.0 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2014-12-29 1 2014
600984 000018.XSHE 2014-12-31 -0.139968 1.0 1.0 1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 2014-12-29 -1 2014
600985 002713.XSHE 2014-12-31 0.022068 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2014-12-29 1 2014
600986 002077.XSHE 2014-12-31 -0.037028 0.0 0.0 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 2014-12-29 -1 2014
600987 300152.XSHE 2014-12-31 -0.017756 0.0 0.0 0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2014-12-29 -1 2014
600988 601118.XSHG 2014-12-31 0.010373 0.0 0.0 0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 2014-12-29 1 2014
600989 002568.XSHE 2014-12-31 0.014278 0.0 0.0 0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2014-12-29 1 2014

600990 rows × 16 columns

In [272]:
%%time
grid_search.fit(X_fulltrain, y_fulltrain)
CPU times: user 37.7 s, sys: 84 ms, total: 37.7 s
Wall time: 37.7 s
Out[272]:
GridSearchCV(cv=[(array([     0,      1,      2, ..., 227562, 227563, 227564]),
                  array([227565, 227566, 227567, ..., 600987, 600988, 600989]))],
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_depth': [1, 3, 5], 'max_features': [3, 5, 10],
                          'n_estimators': [50]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
In [273]:
grid_search.best_params_
Out[273]:
{'max_depth': 3, 'max_features': 10, 'n_estimators': 50}
In [274]:
pd.DataFrame({"features":X_cols,"feature_importance":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',ascending=False)
Out[274]:
features feature_importance
2 OBV_signal 0.343076
1 MACD_signal 0.336498
3 AROON_signal 0.137118
0 EM_signal 0.110952
5 CCI_signal 0.051342
6 CMO_signal 0.015486
8 MFI_signal 0.005530
4 BOLL_signal 0.000000
7 DMI_signal 0.000000
9 RSI_signal 0.000000
In [275]:
y_pred = grid_search.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
0.5340786187638881
In [276]:
# Benchmark all-positive
all_ones = np.ones_like(y_pred)
accuracy_score(y_true=y_test, y_pred=all_ones)
Out[276]:
0.5530155546378778
In [277]:
y_pred = grid_search.predict(X_test)
print(f1_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=all_ones))
0.6173066828184939
0.7121828921627593
In [278]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = ret_df.loc[fulltrain_idx[i], X_cols]
    y_fulltrain = ret_df.loc[fulltrain_idx[i], 'ret_sign']
    X_test = ret_df.loc[test_idx[i], X_cols]
    y_test = ret_df.loc[test_idx[i], 'ret_sign']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]])
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    
    print("Test year", test_years[i],":",accuracy_score(y_true=y_test, y_pred=y_pred))
Test year 2015 : 0.5337541541305331
Test year 2016 : 0.5238601998918065
Test year 2017 : 0.4740336199550701
Test year 2018 : 0.5116545194469042
Test year 2019 : 0.49923523438185624
Test year 2020 : 0.5083200582314581
Test year 2021 : 0.5099404646764244
Test year 2022 : 0.513598530733461
CPU times: user 10min 45s, sys: 15.2 s, total: 11min
Wall time: 11min

Return

In [297]:
i = 6
print(test_years[i])
X_fulltrain = ret_df.loc[fulltrain_idx[i], X_cols]
y_fulltrain = ret_df.loc[fulltrain_idx[i], 'ret_sign']
X_test = ret_df.loc[test_idx[i], X_cols]

grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]])
grid_search.fit(X_fulltrain, y_fulltrain)
rule = grid_search.predict(X=X_test)
2021
In [298]:
rule
Out[298]:
array([ 1, -1,  1, ...,  1, -1,  1])
In [299]:
rule[rule==-1] = 0
In [300]:
rule
Out[300]:
array([1, 0, 1, ..., 1, 0, 1])
In [301]:
rule_ret = ret_df.loc[test_idx[i],'open_ret'].values * rule
In [302]:
ret_df_2021 = ret_df.loc[test_idx[i],['secID','tradeDate','open_ret']].copy()
ret_df_2021['rule_ret'] = rule_ret
In [303]:
ret_df_2021
Out[303]:
secID tradeDate open_ret rule_ret
1418490 603086.XSHG 2021-01-04 0.011064 0.011064
1418491 002223.XSHE 2021-01-04 0.001062 0.000000
1418492 000881.XSHE 2021-01-04 -0.059139 -0.059139
1418493 300376.XSHE 2021-01-04 0.029376 0.029376
1418494 300860.XSHE 2021-01-04 0.056164 0.056164
1418495 002683.XSHE 2021-01-04 0.049054 0.049054
1418496 600096.XSHG 2021-01-04 -0.001603 -0.000000
1418497 300278.XSHE 2021-01-04 0.051413 0.051413
... ... ... ... ...
1604590 002413.XSHE 2021-12-31 0.012368 0.000000
1604591 603985.XSHG 2021-12-31 0.014632 0.000000
1604592 603955.XSHG 2021-12-31 0.019876 0.000000
1604593 603967.XSHG 2021-12-31 -0.005791 -0.000000
1604594 003040.XSHE 2021-12-31 0.001535 0.001535
1604595 603223.XSHG 2021-12-31 0.003763 0.003763
1604596 200530.XSHE 2021-12-31 0.060237 0.000000
1604597 688619.XSHG 2021-12-31 0.011786 0.011786

186108 rows × 4 columns

In [306]:
ret_df_2021.sort_values(['secID','tradeDate'],inplace=True)
In [307]:
# time-series mean of daily return
rule_tsmean_ret_by_crs = ret_df_2021.groupby('secID')['rule_ret'].mean()
rule_tsmean_ret_by_crs
Out[307]:
secID
000016.XSHE    0.000530
000027.XSHE    0.001752
000048.XSHE    0.000666
000056.XSHE    0.000988
000058.XSHE   -0.000555
000066.XSHE    0.000622
000070.XSHE   -0.000059
000099.XSHE    0.001105
                 ...   
900906.XSHG    0.004440
900910.XSHG    0.000701
900913.XSHG    0.000938
900920.XSHG    0.000142
900921.XSHG    0.002484
900940.XSHG   -0.000584
900941.XSHG   -0.000029
900946.XSHG   -0.002478
Name: rule_ret, Length: 821, dtype: float64
In [308]:
rule_tsmean_ret_by_crs.dropna(inplace=True)
y = rule_tsmean_ret_by_crs.values
const = np.full(shape=len(y),fill_value=1)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HC0')
mean_values = reg.params[0]
t_values = reg.tvalues[0]
In [309]:
pd.DataFrame([mean_values,t_values],index=['ret_mean','t_values'],columns=['rule_daily_ret'])
Out[309]:
rule_daily_ret
ret_mean 0.000497
t_values 5.203033
In [310]:
0.0005 *250
Out[310]:
0.125
In [311]:
(1+0.0005)**250 - 1
Out[311]:
0.13311305452977829
In [ ]: