import pandas as pd
import numpy as np
import tqdm
import gc
import matplotlib.pyplot as plt
import talib as ta
import datetime as dt
import statsmodels.api as sm
pd.set_option('display.max_rows', 16)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

plt.rcParams['figure.figsize'] = (16.0, 9.0)

Data

# Sample window, as YYYYMMDD strings handed to the DataAPI calls below.
START = '20070101'
END = '20231231'

# Security universe: keep equities quoted on XSHE/XSHG whose list status is
# 'L' or 'DE' and whose trading currency is CNY.
stk_info = DataAPI.SecIDGet(assetClass="E", pandas="1")
cond1 = stk_info['exchangeCD'].isin(['XSHE', 'XSHG'])
cond2 = stk_info['listStatusCD'].isin(['L', 'DE'])
cond3 = stk_info['transCurrCD'] == 'CNY'
stk_info = stk_info[cond1 & cond2 & cond3].copy()
stk_id = stk_info['secID']

# ST flags per (secID, tradeDate); tradeDate is parsed so it can later be
# merged against the price table's datetime column.
st_df = DataAPI.SecSTGet(
    beginDate=START,
    endDate=END,
    secID=stk_id,
    field=['secID', 'tradeDate', 'STflg'],
    pandas="1",
)
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'], format="%Y-%m-%d")

# %%time
# # About 8 mins
# # # 从优矿下载股票信息，时间较长。由于优矿的限制，每次下载3年的数据

# stk_dict = {}
# begin_ = dt.datetime.strptime(START, '%Y%m%d').year
# end_ = dt.datetime.strptime(START, '%Y%m%d').year+3
# field = ["secID","tradeDate",'preClosePrice',"closePrice",'openPrice','highestPrice','lowestPrice',"negMarketValue","turnoverValue",'turnoverVol']
# while begin_ <= 2023:
#     if begin_ == 2023:
#         yesterday = dt.datetime.today() - dt.timedelta(days=1)
#         yesterday.strftime('%Y%m%d')
#         stk_dict[begin_] = DataAPI.MktEqudAdjAfGet(secID=stk_id,
#                                                      beginDate=f'{begin_}0101',
#                                                      endDate=yesterday,
#                                                      field=field,pandas="1")
#     else:
#         stk_dict[begin_] = DataAPI.MktEqudAdjAfGet(secID=stk_id,
#                                                          beginDate=f'{begin_}0101',
#                                                          endDate=f'{end_}1231',
#                                                          field=field,pandas="1")
#     begin_ = end_ + 1
#     end_ = begin_ + 3
    
# for i in range(len(stk_dict)):
#     stk_df = pd.DataFrame(np.vstack([_df for _df in stk_dict.values()]),columns=field)
    
# stk_df.to_pickle('./data/stk_df.pkl')

# %%time
# stk_df = DataAPI.MktEqudAdjAfGet(secID=stk_id,beginDate=START,endDate=END,isOpen=1,
#                                            field=["secID","tradeDate",
#                                                   'preClosePrice',"closePrice",
#                                                   'openPrice','highestPrice','lowestPrice',
#                                                   "negMarketValue",
#                                                   "turnoverValue",'turnoverVol'],pandas="1")
# stk_df.to_pickle('./data/stk_df.pkl')
# # Takes about 6 mins

stk_df = pd.read_pickle('./data/stk_df.pkl')

stk_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11294684 entries, 0 to 11294683
Data columns (total 10 columns):
secID             object
tradeDate         object
preClosePrice     object
closePrice        object
openPrice         object
highestPrice      object
lowestPrice       object
negMarketValue    object
turnoverValue     object
turnoverVol       object
dtypes: object(10)
memory usage: 861.7+ MB

# The pickled frame stores every column as object dtype; cast everything
# except the identifier and the date to a numeric dtype.
for col in stk_df.columns.drop(['secID', 'tradeDate']):
    stk_df[col] = pd.to_numeric(stk_df[col])

stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
stk_df = stk_df.sort_values(['secID', 'tradeDate'])

# Drop ST observations: after a left merge with st_df, any row that picked up
# an STflg value is removed. Shapes are printed before and after the filter.
print(stk_df.shape)
stk_df = stk_df.merge(st_df, on=['secID', 'tradeDate'], how='left')
stk_df = stk_df[stk_df['STflg'].isna()].drop('STflg', axis=1)
print(stk_df.shape)

(11294684, 10)
(10774842, 10)

stk_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10774842 entries, 109 to 11294683
Data columns (total 10 columns):
secID             object
tradeDate         datetime64[ns]
preClosePrice     float64
closePrice        float64
openPrice         float64
highestPrice      float64
lowestPrice       float64
negMarketValue    float64
turnoverValue     float64
turnoverVol       int64
dtypes: datetime64[ns](1), float64(7), int64(1), object(1)
memory usage: 904.3+ MB

不填充停牌值比较合理，因为技术分析只看量价，直接计算量价关系较为合适

# Work on a random subsample of (up to) 1000 *distinct* stocks.
# replace=False is required: np.random.choice samples with replacement by
# default, so the draw would contain duplicate IDs and the subsequent isin()
# filter would keep fewer than 1000 stocks. The size is capped at the number
# of available IDs because sampling without replacement raises otherwise.
all_stkid = stk_df['secID'].unique()
random_stkid = np.random.choice(all_stkid, size=min(1000, len(all_stkid)), replace=False)

stk_df = stk_df[stk_df['secID'].isin(random_stkid)].copy()

# Remove rows whose open price is exactly 0 before forming open-to-open
# ratios (a zero open would put a 0 in the denominator of the next return).
stk_df.drop(stk_df.loc[stk_df['openPrice']==0].index,inplace=True)

# open_ret: open-to-open simple return within each stock.
stk_df['open_ret'] = stk_df.groupby('secID')['openPrice'].apply(lambda x: x / x.shift() - 1)
# close_ret: close relative to the previous close supplied by the data source.
stk_df['close_ret'] = stk_df['closePrice']/stk_df['preClosePrice'] - 1

def rule_return(df, demean=True, open_ret=True):
    """
    Convert a trading signal into realised rule returns.

    df must provide these columns:
        signal: the signal generated by the rule
        close_ret: return calculated by close price
        open_ret: return calculated by open price

    The helper columns (``*_ret_demean``, ``position_*``, ``*_cumret``) are
    written onto ``df`` itself, so the caller's frame is modified in place.
    Demeaned returns (return minus its own sample mean) are used to adjust for
    the bias created by bullish or bearish markets.

    Close-based positions equal the signal and earn the next row's close_ret;
    open-based positions are the signal lagged one row and earn the open_ret
    one further row later.

    Returns a new two-column DataFrame (fresh 0..n-1 index):
        position_{open|close}_ret[_demean]  -- per-row rule return
        {open|close}_cumret                 -- cumulative product of 1 + the
                                               non-demeaned rule return
    where open/close is chosen by ``open_ret`` and the ``_demean`` suffix by
    ``demean``.
    """
    sides = ('close', 'open')

    for side in sides:
        raw = df[f'{side}_ret']
        df[f'{side}_ret_demean'] = raw - raw.mean()

    df['position_close'] = df['signal']
    df['position_open'] = df['signal'].shift()

    # Plain returns first, then the demeaned ones, close before open each time
    # (this keeps the column order of the helper columns stable).
    for suffix in ('', '_demean'):
        for side in sides:
            held = df[f'position_{side}'].shift()
            df[f'position_{side}_ret{suffix}'] = held * df[f'{side}_ret{suffix}']

    for side in sides:
        df[f'{side}_cumret'] = (df[f'position_{side}_ret'] + 1).cumprod()

    chosen = 'open' if open_ret else 'close'
    ret_col = f'position_{chosen}_ret' + ('_demean' if demean else '')
    cum_col = f'{chosen}_cumret'
    return pd.DataFrame({ret_col: df[ret_col].values,
                         cum_col: df[cum_col].values})

A bunch of TA signals

# EMA: signal is 1 while the close is above its 20-period EMA and 0 while it
# is at or below it. Rows where the EMA is not yet available stay NaN until
# the first defined value, after which the last state is carried forward
# within each stock.
stk_df['EMA'] = stk_df.groupby('secID')['closePrice'].apply(ta.EMA, 20)
close_above_ema = (stk_df['closePrice'] > stk_df['EMA']).values
close_not_above_ema = (stk_df['closePrice'] <= stk_df['EMA']).values
stk_df['EM_signal'] = np.select([close_above_ema, close_not_above_ema], [1.0, 0.0], default=np.nan)
stk_df['EM_signal'] = stk_df.groupby('secID')['EM_signal'].ffill()

stk_df['EM_signal'].value_counts()

0.0    954836
1.0    912242
Name: EM_signal, dtype: int64

# MACD: elements 0 and 1 of ta.MACD's output are stored as 'MACD' and
# 'MACD_rawsignal'. The trading signal is 1 while MACD is above
# MACD_rawsignal and 0 while it is at or below; undefined rows are NaN and the
# last state is carried forward per stock.
close_by_stock = stk_df.groupby('secID')['closePrice']
stk_df['MACD'] = close_by_stock.apply(lambda px: ta.MACD(px)[0])
stk_df['MACD_rawsignal'] = close_by_stock.apply(lambda px: ta.MACD(px)[1])
macd_above = (stk_df['MACD'] > stk_df['MACD_rawsignal']).values
macd_not_above = (stk_df['MACD'] <= stk_df['MACD_rawsignal']).values
stk_df['MACD_signal'] = np.select([macd_above, macd_not_above], [1.0, 0.0], default=np.nan)
stk_df['MACD_signal'] = stk_df.groupby('secID')['MACD_signal'].ffill()

stk_df['MACD_signal'].value_counts()

1.0    971195
0.0    883327
Name: MACD_signal, dtype: int64

# # Example
# temp = stk_df[stk_df['secID']=='900957.XSHG'].copy()
# ta.OBV(temp['closePrice'],temp['turnoverValue'])
# stk_df.loc[9968491:]

# OBV (on-balance volume), computed per stock from close price and volume.
# The per-group frames are concatenated in group order and assigned by
# position via .values, which relies on stk_df already being ordered by secID.
stk_df['OBV'] = stk_df.groupby('secID')[['closePrice','turnoverVol']].apply(lambda x: ta.OBV(real=x['closePrice'],volume=x['turnoverVol']).to_frame('OBV')).values
# Row-over-row relative change of OBV within each stock. (This used to be
# computed twice, with the first result thrown away; one pass is enough.)
stk_df['OBV_change'] = stk_df.groupby('secID')['OBV'].apply(lambda x: x/x.shift()-1)
buy1_OBV = (stk_df['close_ret']<0)&(stk_df['OBV_change']>0)
buy2_OBV = (stk_df['close_ret']>0)&(stk_df['OBV_change']>0)
# The sell masks are kept for reference; the signal below does not use them
# because every non-buy row is already 0.
sell1_OBV = (stk_df['close_ret']>0)&(stk_df['OBV_change']<0)
sell2_OBV = (stk_df['close_ret']<0)&(stk_df['OBV_change']<0)
# Unlike the other indicators this signal is not forward-filled: it is 1 only
# on rows where OBV rose and the close return was non-zero, else 0.
stk_df['OBV_signal'] = 0
stk_df.loc[buy1_OBV | buy2_OBV,'OBV_signal']=1

stk_df['OBV_signal'].value_counts()

0    947182
1    936943
Name: OBV_signal, dtype: int64

# AROON
N = 26               # look-back window handed to ta.AROON (it was declared but
                     # never passed before, so TA-Lib's default was used silently)
high_threshold = 70  # strong trend
low_threshold = 30   # weak trend


def _aroon_up_down(group):
    """Return one stock's Aroon-up / Aroon-down as a two-column frame."""
    # TA-Lib returns the pair as (aroondown, aroonup) -- DOWN first. Reading
    # element 0 as "up" swaps the two lines and inverts the trading rule.
    aroon_down, aroon_up = ta.AROON(high=group['highestPrice'],
                                    low=group['lowestPrice'],
                                    timeperiod=N)
    return pd.concat([aroon_up, aroon_down], axis=1, keys=['AROON_up', 'AROON_down'])


# One pass per stock; rows come back in group order and are assigned by
# position, which relies on stk_df already being ordered by secID.
_aroon = stk_df.groupby('secID')[['highestPrice','lowestPrice']].apply(_aroon_up_down)
stk_df['AROON_up'] = _aroon['AROON_up'].values
stk_df['AROON_down'] = _aroon['AROON_down'].values

# Buy when the up line is strong and the down line weak; sell on the mirror
# image; otherwise keep the previous state within each stock.
buy_AROON = (stk_df['AROON_up']>high_threshold)&(stk_df['AROON_down']<low_threshold)
sell_AROON = (stk_df['AROON_down']>high_threshold)&(stk_df['AROON_up']<low_threshold)
stk_df['AROON_signal'] = np.where(buy_AROON,1,np.nan)
stk_df.loc[sell_AROON,'AROON_signal'] = 0
stk_df['AROON_signal'] = stk_df.groupby('secID')['AROON_signal'].fillna(method='ffill')

stk_df['AROON_signal'].value_counts()

0.0    958018
1.0    912001
Name: AROON_signal, dtype: int64

# BOLL
# Use the conventional 20-period window. With TA-Lib's defaults (5 periods,
# 2 standard deviations) the buy rule below can essentially never fire: the
# close is itself one of the 5 points in the window, and with a population
# standard deviation a single point out of n can be at most sqrt(n-1) = 2
# deviations above the mean, so "close > upper band" is only hit through
# floating-point noise.
BOLL_N = 20


def _bollinger_bands(group):
    """Return one stock's upper / middle / lower bands as a three-column frame."""
    upper, middle, lower = ta.BBANDS(real=group['closePrice'], timeperiod=BOLL_N)
    return pd.concat([upper, middle, lower], axis=1, keys=['BOLL_up', 'BOLL_mid', 'BOLL_low'])


# Single BBANDS pass per stock (instead of one pass per band); rows come back
# in group order and are assigned by position, which relies on stk_df already
# being ordered by secID.
_bands = stk_df.groupby('secID')[['closePrice']].apply(_bollinger_bands)
for _band in ['BOLL_up', 'BOLL_mid', 'BOLL_low']:
    stk_df[_band] = _bands[_band].values
stk_df['BOLL_up_lag'] = stk_df.groupby('secID')['BOLL_up'].shift()
stk_df['BOLL_mid_lag'] = stk_df.groupby('secID')['BOLL_mid'].shift()
stk_df['BOLL_low_lag'] = stk_df.groupby('secID')['BOLL_low'].shift()

# Buy: the close crosses above the upper band (previous close was below the
# previous upper band). Sell: the close crosses below the middle band.
# Otherwise keep the previous state within each stock.
buy_BOLL = (stk_df['preClosePrice'] < stk_df['BOLL_up_lag'])&(stk_df['closePrice']>stk_df['BOLL_up'])
sell_BOLL = (stk_df['preClosePrice'] > stk_df['BOLL_mid_lag'])&(stk_df['closePrice']<stk_df['BOLL_mid'])
stk_df['BOLL_signal'] = np.where(buy_BOLL, 1, np.nan)
stk_df.loc[sell_BOLL,'BOLL_signal'] = 0
stk_df['BOLL_signal'] = stk_df.groupby('secID')['BOLL_signal'].fillna(method='ffill')

stk_df['BOLL_signal'].value_counts()

0.0    1874052
1.0        282
Name: BOLL_signal, dtype: int64

# CCI: signal becomes 1 when CCI falls below buy_threshold and 0 when it rises
# above sell_threshold; in between, the previous state is kept per stock.
buy_threshold = -100   # buy threshold
sell_threshold = 100   # sell threshold
hlc_by_stock = stk_df.groupby('secID')[['highestPrice', 'lowestPrice', 'closePrice']]
stk_df['CCI'] = hlc_by_stock.apply(
    lambda g: ta.CCI(high=g['highestPrice'], low=g['lowestPrice'], close=g['closePrice']).to_frame('CCI')
).values

buy_CCI = stk_df['CCI'] < buy_threshold
sell_CCI = stk_df['CCI'] > sell_threshold
# Sell is listed first so it wins if both masks were ever true on a row.
stk_df['CCI_signal'] = np.select([sell_CCI.values, buy_CCI.values], [0.0, 1.0], default=np.nan)
stk_df['CCI_signal'] = stk_df.groupby('secID')['CCI_signal'].ffill()

stk_df['CCI_signal'].value_counts()

0.0    946907
1.0    922819
Name: CCI_signal, dtype: int64

# CMO: signal becomes 1 when CMO falls below buy_threshold and 0 when it rises
# above sell_threshold; in between, the previous state is kept per stock.
buy_threshold = -50   # buy threshold
sell_threshold = 50   # sell threshold
stk_df['CMO'] = stk_df.groupby('secID')['closePrice'].apply(ta.CMO)

buy_CMO = stk_df['CMO'] < buy_threshold
sell_CMO = stk_df['CMO'] > sell_threshold
# Sell is listed first so it wins if both masks were ever true on a row.
stk_df['CMO_signal'] = np.select([sell_CMO.values, buy_CMO.values], [0.0, 1.0], default=np.nan)
stk_df['CMO_signal'] = stk_df.groupby('secID')['CMO_signal'].ffill()

stk_df['CMO_signal'].value_counts()

0.0    1115935
1.0     712274
Name: CMO_signal, dtype: int64

gc.collect()

72

# DMI: +DI, -DI and ADX are computed per stock from high / low / close.
# Each result is assigned by position, which relies on stk_df already being
# ordered by secID.
hlc_cols = ['highestPrice', 'lowestPrice', 'closePrice']
for dmi_col, dmi_func in (('plus_DI', ta.PLUS_DI), ('minus_DI', ta.MINUS_DI), ('ADX', ta.ADX)):
    stk_df[dmi_col] = stk_df.groupby('secID')[hlc_cols].apply(
        lambda g, fn=dmi_func: fn(high=g['highestPrice'], low=g['lowestPrice'], close=g['closePrice']).to_frame()
    ).values

# Buy: ADX above buy_threshold with +DI above -DI.
# Sell: ADX below sell_threshold, or +DI below -DI.
# Otherwise the previous state is kept per stock.
buy_threshold = 50
sell_threshold = 20
buy_DMI = (stk_df['ADX'] > buy_threshold) & (stk_df['plus_DI'] > stk_df['minus_DI'])
sell_DMI = (stk_df['ADX'] < sell_threshold) | (stk_df['plus_DI'] < stk_df['minus_DI'])
# Sell is listed first so it wins if both masks were ever true on a row.
stk_df['DMI_signal'] = np.select([sell_DMI.values, buy_DMI.values], [0.0, 1.0], default=np.nan)
stk_df['DMI_signal'] = stk_df.groupby('secID')['DMI_signal'].ffill()

stk_df['DMI_signal'].value_counts()

0.0    1785627
1.0      71675
Name: DMI_signal, dtype: int64

# MFI: computed per stock from high / low / close / volume and assigned by
# position (relies on stk_df already being ordered by secID).
ohlcv_by_stock = stk_df.groupby('secID')[['highestPrice', 'lowestPrice', 'closePrice', 'turnoverVol']]
stk_df['MFI'] = ohlcv_by_stock.apply(
    lambda g: ta.MFI(high=g['highestPrice'], low=g['lowestPrice'],
                     close=g['closePrice'], volume=g['turnoverVol']).to_frame()
).values

# Signal becomes 1 when MFI falls below buy_threshold and 0 when it rises
# above sell_threshold; in between, the previous state is kept per stock.
buy_threshold = 20
sell_threshold = 80
buy_MFI = stk_df['MFI'] < buy_threshold
sell_MFI = stk_df['MFI'] > sell_threshold
# Sell is listed first so it wins if both masks were ever true on a row.
stk_df['MFI_signal'] = np.select([sell_MFI.values, buy_MFI.values], [0.0, 1.0], default=np.nan)
stk_df['MFI_signal'] = stk_df.groupby('secID')['MFI_signal'].ffill()

stk_df['MFI_signal'].value_counts()

0.0    1268015
1.0     577057
Name: MFI_signal, dtype: int64

# RSI: signal becomes 1 when RSI falls below buy_threshold and 0 when it rises
# above sell_threshold; in between, the previous state is kept per stock.
stk_df['RSI'] = stk_df.groupby('secID')['closePrice'].apply(ta.RSI)
buy_threshold, sell_threshold = 30, 70
buy_RSI = stk_df['RSI'] < buy_threshold
sell_RSI = stk_df['RSI'] > sell_threshold
# Sell is listed first so it wins if both masks were ever true on a row.
stk_df['RSI_signal'] = np.select([sell_RSI.values, buy_RSI.values], [0.0, 1.0], default=np.nan)
stk_df['RSI_signal'] = stk_df.groupby('secID')['RSI_signal'].ffill()

# %%time
# WVAD
# def wvad(df):
#     return sum((df[:,0] - df[:,1]) / (df[:,2] - df[:,3]) * df[:,4])

# stk_df.groupby('secID')[['closePrice','openPrice','highestPrice',
#                          'lowestPrice','turnoverVol']].apply(lambda x: x.rolling(24).apply(wvad,raw=False))
####### The above code is not working ##########

# temp = stk_df[stk_df['secID'].isin(np.random.choice(stk_df['secID'].unique(),10))].copy()
# def wvad(closePrice):
#     close = closePrice
#     open_ = stk_df.loc[closePrice.index, 'openPrice'].to_numpy()
#     high = stk_df.loc[closePrice.index, 'highestPrice'].to_numpy()
#     low = stk_df.loc[closePrice.index, 'lowestPrice'].to_numpy()
#     volume = stk_df.loc[closePrice.index, 'turnoverVol'].to_numpy()
#     return sum((closePrice - open_) / (high - low) * volume)
# temp2 = temp.groupby('secID')['closePrice'].rolling(24).apply(wvad, raw=False)

stk_df = stk_df.reset_index(drop=True)

# Every column whose name ends in 'signal' is carried over. That also matches
# the helper column 'MACD_rawsignal', which is removed again further down
# (after the dropna, so it still takes part in the NaN filter).
signal_like_cols = [name for name in stk_df.columns if name.endswith('signal')]
cols = ['secID', 'tradeDate', 'open_ret'] + signal_like_cols

ret_df = stk_df[cols].copy()

# Keep the day the signal was observed, then pull tradeDate/open_ret from two
# rows ahead within each stock so that each signal row is paired with the
# open-to-open return ending two rows later.
ret_df['signal_date'] = ret_df['tradeDate']
ret_df[['tradeDate', 'open_ret']] = ret_df.groupby('secID')[['tradeDate', 'open_ret']].shift(-2)

ret_df.dropna(inplace=True)
ret_df = ret_df.drop('MACD_rawsignal', axis=1)

# Classification target: +1 for a strictly positive return, -1 otherwise.
ret_df['ret_sign'] = np.where(ret_df['open_ret'] > 0, 1, -1)

ret_df

# Order chronologically by the (shifted) trade date and renumber from 0, so
# row labels and row positions coincide from here on.
ret_df = ret_df.sort_values('tradeDate')
ret_df = ret_df.reset_index(drop=True)

ret_df

Random Forest

Train, test split

ret_df['year'] = ret_df['tradeDate'].dt.year

# One entry per calendar year, in ascending year order; each entry holds the
# ret_df row labels that belong to that year.
rows_by_year = ret_df.groupby('year').groups
time_idx = [rows_by_year[year] for year in sorted(rows_by_year)]

def list_flat(list_):
    """Flatten one level of nesting: an iterable of iterables becomes one list."""
    flattened = []
    for sublist in list_:
        flattened.extend(sublist)
    return flattened
# This is the same as the nested comprehension:
#     [item for sublist in list_ for item in sublist]

list_flat([[1,2,3],[3,4,5]])

[1, 2, 3, 3, 4, 5]

np.array([[1,2,3],[3,4,5]]).flatten()

array([1, 2, 3, 3, 4, 5])

# Rolling training / validation / testing scheme (4 years, 4 years, 1 year):
# 1. [2007-2010], [2011-2014], [2015]
# 2. [2008-2011], [2012-2015], [2016]
# ...
# last. [2015-2018], [2019-2022], [2023]
fulltrain_idx, cv_idx, test_idx = [], [], []
for i in range(4, len(time_idx) - 4):
    train_idx = list_flat(time_idx[i - 4:i])
    val_idx = list_flat(time_idx[i:i + 4])
    train_plus_val = train_idx + val_idx

    # GridSearchCV works on plain arrays internally and cannot use the pandas
    # index, so each fold is expressed as 0-based positions *within*
    # train_plus_val rather than as ret_df row labels.
    train_pos = np.where(np.isin(train_plus_val, train_idx))[0]
    val_pos = np.where(np.isin(train_plus_val, val_idx))[0]

    fulltrain_idx.append(train_plus_val)
    cv_idx.append((train_pos, val_pos))
    test_idx.append(time_idx[i + 4])

len(fulltrain_idx)

9

len(fulltrain_idx[0])

611705

len(fulltrain_idx[1])

673381

cv_idx

[(array([     0,      1,      2, ..., 232926, 232927, 232928]),
  array([232929, 232930, 232931, ..., 611702, 611703, 611704])),
 (array([     0,      1,      2, ..., 278923, 278924, 278925]),
  array([278926, 278927, 278928, ..., 673378, 673379, 673380])),
 (array([     0,      1,      2, ..., 312439, 312440, 312441]),
  array([312442, 312443, 312444, ..., 725928, 725929, 725930])),
 (array([     0,      1,      2, ..., 347643, 347644, 347645]),
  array([347646, 347647, 347648, ..., 786923, 786924, 786925])),
 (array([     0,      1,      2, ..., 378773, 378774, 378775]),
  array([378776, 378777, 378778, ..., 857328, 857329, 857330])),
 (array([     0,      1,      2, ..., 394452, 394453, 394454]),
  array([394455, 394456, 394457, ..., 924559, 924560, 924561])),
 (array([     0,      1,      2, ..., 413486, 413487, 413488]),
  array([413489, 413490, 413491, ..., 987490, 987491, 987492])),
 (array([     0,      1,      2, ..., 439277, 439278, 439279]),
  array([ 439280,  439281,  439282, ..., 1057939, 1057940, 1057941])),
 (array([     0,      1,      2, ..., 478552, 478553, 478554]),
  array([ 478555,  478556,  478557, ..., 1144981, 1144982, 1144983]))]

test_years = list(range(2015, 2024))
test_years

[2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

X_cols = [col for col in ret_df.columns if col[-6:]=='signal']

X_cols

['EM_signal',
 'MACD_signal',
 'OBV_signal',
 'AROON_signal',
 'BOLL_signal',
 'CCI_signal',
 'CMO_signal',
 'DMI_signal',
 'MFI_signal',
 'RSI_signal']

Evaluation metrics

用 accuracy, f1_score 作为好坏评价的标准

accuracy: 正确数/总数。比如预测了100次，对了99次，则 accuracy = 99%。$accuracy = \frac{TP+TN}{TP+TN+FP+FN}$
$precision = \frac{TP}{TP+FP}$：在预测为 Positive 的样本中，预测正确的比例
$recall = \frac{TP}{TP+FN}$：在真实为 Positive 的样本中，被正确预测出来的比例
$f1 = \frac{2}{\frac{1}{precision}+\frac{1}{recall}}$

Random forest

# Hyper-parameter grid searched for every window: fixed forest size, three
# tree depths and two feature-subset sizes.
hyperparam_grid = [
    {'n_estimators': [50], 'max_depth': [3,5,7], 
     'max_features': [5,8]}
]

model = RandomForestClassifier()

# Cross validation for period 0, i.e.
# train: [2007-2010], val: [2011-2014], test: [2015]
# cv is a single (train positions, validation positions) pair, so every
# candidate is scored on exactly one predefined split.
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]])

# Rows are selected by label; the positions in cv_idx[0] refer to the row
# order of X_fulltrain / y_fulltrain built here.
X_fulltrain = ret_df.loc[fulltrain_idx[0], X_cols]
y_fulltrain = ret_df.loc[fulltrain_idx[0], 'ret_sign']
X_test = ret_df.loc[test_idx[0], X_cols]
y_test = ret_df.loc[test_idx[0], 'ret_sign']

ret_df.loc[fulltrain_idx[0]]

%%time
grid_search.fit(X_fulltrain, y_fulltrain)

CPU times: user 26.9 s, sys: 252 ms, total: 27.1 s
Wall time: 27.1 s

GridSearchCV(cv=[(array([     0,      1,      2, ..., 232926, 232927, 232928]),
                  array([232929, 232930, 232931, ..., 611702, 611703, 611704]))],
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_depth': [3, 5, 7], 'max_features': [5, 8],
                          'n_estimators': [50]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

grid_search.best_params_

{'max_depth': 3, 'max_features': 8, 'n_estimators': 50}

pd.DataFrame({"features":X_cols,"feature_importance":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',ascending=False)

y_pred = grid_search.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.5366211182618649

# Benchmark all-positive
all_ones = np.ones_like(y_pred)
accuracy_score(y_true=y_test, y_pred=all_ones)

0.5544623045953354

y_pred = grid_search.predict(X_test)
print(f1_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=all_ones))

0.6231269852935172
0.7133814733959413

%%time
# Walk forward over every window: tune on train+validation, predict the test
# year, and print accuracy / F1 next to an "always predict +1" benchmark.
for i, (fit_rows, test_rows, split) in enumerate(zip(fulltrain_idx, test_idx, cv_idx)):
    X_fulltrain = ret_df.loc[fit_rows, X_cols]
    y_fulltrain = ret_df.loc[fit_rows, 'ret_sign']
    X_test = ret_df.loc[test_rows, X_cols]
    y_test = ret_df.loc[test_rows, 'ret_sign']

    grid_search = GridSearchCV(model, hyperparam_grid, cv=[split])
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)

    all_ones = np.ones_like(y_pred)
    for metric_label, metric in (("accuracy:", accuracy_score), ("f1:", f1_score)):
        print("Test year", test_years[i], "Benchmark", metric_label, metric(y_true=y_test, y_pred=all_ones))
        print("Test year", test_years[i], "Model", metric_label, metric(y_true=y_test, y_pred=y_pred))
    print("====================我是分割线========================")

Test year 2015 Benchmark accuracy: 0.5544623045953354
Test year 2015 Model accuracy: 0.5366311583217036
Test year 2015 Benchmark f1: 0.7133814733959413
Test year 2015 Model f1: 0.623488717388112
====================我是分割线========================
Test year 2016 Benchmark accuracy: 0.4951232032854209
Test year 2016 Model accuracy: 0.5233041846633152
Test year 2016 Benchmark f1: 0.6623175965665236
Test year 2016 Model f1: 0.6244377967909964
====================我是分割线========================
Test year 2017 Benchmark accuracy: 0.47530937885480085
Test year 2017 Model accuracy: 0.5031872985555689
Test year 2017 Benchmark f1: 0.6443521415470924
Test year 2017 Model f1: 0.6112090280070002
====================我是分割线========================
Test year 2018 Benchmark accuracy: 0.4650108684032356
Test year 2018 Model accuracy: 0.49314043402344726
Test year 2018 Benchmark f1: 0.6348224145395815
Test year 2018 Model f1: 0.5884783187325687
====================我是分割线========================
Test year 2019 Benchmark accuracy: 0.4913895192288608
Test year 2019 Model accuracy: 0.49926895265062554
Test year 2019 Benchmark f1: 0.6589687172844544
Test year 2019 Model f1: 0.5386936143498852
====================我是分割线========================
Test year 2020 Benchmark accuracy: 0.4803768461445299
Test year 2020 Model accuracy: 0.5119931667952142
Test year 2020 Benchmark f1: 0.6489926499399336
Test year 2020 Model f1: 0.5384647845718867
====================我是分割线========================
Test year 2021 Benchmark accuracy: 0.4792000610640409
Test year 2021 Model accuracy: 0.5150927997275604
Test year 2021 Benchmark f1: 0.6479178492182258
Test year 2021 Model f1: 0.46911199393167996
====================我是分割线========================
Test year 2022 Benchmark accuracy: 0.46862538679937477
Test year 2022 Model accuracy: 0.5144245595006434
Test year 2022 Benchmark f1: 0.6381823316028412
Test year 2022 Model f1: 0.4546670488201012
====================我是分割线========================
Test year 2023 Benchmark accuracy: 0.4826460623089001
Test year 2023 Model accuracy: 0.5181803156763903
Test year 2023 Benchmark f1: 0.6510603907142658
Test year 2023 Model f1: 0.2271853668235138
====================我是分割线========================
CPU times: user 6min 56s, sys: 4.9 s, total: 7min 1s
Wall time: 7min

Return

len(test_years)

9

# Refit one window (selected by position i) and keep its test-year predictions
# as the trading rule.
i = 7
print(test_years[i])
fit_rows = fulltrain_idx[i]
X_fulltrain = ret_df.loc[fit_rows, X_cols]
y_fulltrain = ret_df.loc[fit_rows, 'ret_sign']
X_test = ret_df.loc[test_idx[i], X_cols]

grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]]).fit(X_fulltrain, y_fulltrain)
rule = grid_search.predict(X=X_test)

2022

rule

array([-1,  1, -1, ...,  1, -1,  1])

rule[rule==-1] = 0

rule

array([0, 1, 0, ..., 1, 0, 1])

len(rule)

188082

ret_df.loc[test_idx[i],'open_ret']

1568607    0.001204
1568608   -0.004623
1568609    0.006788
1568610    0.006907
1568611    0.013603
1568612    0.017220
1568613    0.011306
1568614   -0.002069
             ...   
1756681    0.006943
1756682   -0.010061
1756683    0.001595
1756684   -0.007683
1756685    0.008315
1756686   -0.023044
1756687   -0.005321
1756688    0.086302
Name: open_ret, Length: 188082, dtype: float64

# Test-window rows with the realised open return and the rule's return
# (open_ret multiplied element-wise by the predicted position in `rule`).
ret_df_2022 = ret_df.loc[test_idx[i], ['secID', 'tradeDate', 'open_ret']].copy()
rule_ret = ret_df_2022['open_ret'].values * rule
ret_df_2022['rule_ret'] = rule_ret

ret_df_2022

ret_df_2022 = ret_df_2022.sort_values(['secID', 'tradeDate'])

# time-series mean of daily return
rule_tsmean_ret_by_crs = ret_df_2022.groupby('secID')['rule_ret'].mean()
rule_tsmean_ret_by_crs

secID
000002.XSHE   -0.000540
000008.XSHE   -0.000559
000009.XSHE   -0.001586
000025.XSHE   -0.000187
000030.XSHE   -0.000868
000038.XSHE   -0.002357
000056.XSHE   -0.002405
000062.XSHE   -0.000188
                 ...   
688625.XSHG   -0.000670
688628.XSHG   -0.000712
688659.XSHG    0.003131
688660.XSHG   -0.002257
688661.XSHG   -0.000788
688700.XSHG    0.001938
688776.XSHG    0.000317
688786.XSHG   -0.001114
Name: rule_ret, Length: 831, dtype: float64

# Cross-sectional mean of the per-stock average rule return and its t-value,
# obtained by regressing on a constant with HC0 robust standard errors.
rule_tsmean_ret_by_crs = rule_tsmean_ret_by_crs.dropna()
y = rule_tsmean_ret_by_crs.values
const = np.ones(len(y), dtype=int)
reg = sm.OLS(y, const).fit().get_robustcov_results(cov_type='HC0')
mean_values, t_values = reg.params[0], reg.tvalues[0]

pd.DataFrame({'rule_daily_ret': [mean_values, t_values]}, index=['ret_mean', 't_values'])

# For the rule return and the raw return alike: build the gross daily return
# (1 + r) and compound it within each stock to get a cumulative value path.
for ret_col, gross_col, cum_col in (
    ('rule_ret', '1+rule_ret', 'cum_rule_ret'),
    ('open_ret', '1+open_ret', 'cum_open_ret'),
):
    ret_df_2022[gross_col] = ret_df_2022[ret_col] + 1
    ret_df_2022[cum_col] = ret_df_2022.groupby('secID')[gross_col].cumprod()

ret_df_2022

# Final record of each stock (ret_df_2022 is sorted by secID and tradeDate above).
# NOTE(review): GroupBy.last() takes the last non-null value per column, so this
# is the literal last row only when the columns contain no NaN -- confirm.
final_ret_2022 = ret_df_2022.groupby('secID').last()

final_ret_2022.sort_values(by='cum_rule_ret')

# The five stocks with the highest final cumulative rule return.
best_sec = final_ret_2022.sort_values(by='cum_rule_ret').tail(5).index

# Cumulative rule return paths of those stocks, one line per stock.
best_mask = ret_df_2022['secID'].isin(best_sec)
best_rule_paths = ret_df_2022[best_mask].pivot(index='tradeDate', columns='secID', values='cum_rule_ret')
best_rule_paths.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f46bda9eed0>

# Same best_sec stocks, but compounding the raw open_ret (always invested) for comparison.
best_rows = ret_df_2022[ret_df_2022['secID'].isin(best_sec)]
best_rows.pivot(index='tradeDate', columns='secID', values='cum_open_ret').plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f46bdc85190>

# Rank stocks from lowest to highest final cumulative rule return.
final_ret_2022.sort_values('cum_rule_ret', inplace=True)

# Only keep stocks whose final record falls on the last trading day present in
# the data, so that stocks whose series end earlier in the year do not fill the
# "worst" list. The date is derived from the data instead of being hard-coded
# (it was '2022-12-30'), so this cell still works when another test year is
# selected.
last_trade_date = final_ret_2022['tradeDate'].max()
worst_sec = final_ret_2022.loc[final_ret_2022['tradeDate'] == last_trade_date].index[0:10]

final_ret_2022

worst_sec

Index(['688608.XSHG', '000056.XSHE', '300482.XSHE', '300589.XSHE',
       '300543.XSHE', '688339.XSHG', '603176.XSHG', '300545.XSHE',
       '300977.XSHE', '601908.XSHG'],
      dtype='object', name='secID')

# Cumulative rule return paths of the worst_sec stocks, one line per stock.
worst_rows = ret_df_2022[ret_df_2022['secID'].isin(worst_sec)]
worst_rows.pivot(index='tradeDate', columns='secID', values='cum_rule_ret').plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f46bda5ea10>

# Same worst_sec stocks, compounding the raw open_ret (always invested) for comparison.
worst_rows = ret_df_2022[ret_df_2022['secID'].isin(worst_sec)]
worst_rows.pivot(index='tradeDate', columns='secID', values='cum_open_ret').plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f46bda5e790>

	secID	tradeDate	open_ret	rule_ret	1+rule_ret	cum_rule_ret	1+open_ret	cum_open_ret
1569312	000002.XSHE	2022-01-04	0.018286	0.000000	1.000000	1.000000	1.018286	1.018286
1569771	000002.XSHE	2022-01-05	0.048230	0.000000	1.000000	1.000000	1.048230	1.067398
1570410	000002.XSHE	2022-01-06	0.033774	0.000000	1.000000	1.000000	1.033774	1.103448
1571285	000002.XSHE	2022-01-07	0.012310	0.000000	1.000000	1.000000	1.012310	1.117032
1572079	000002.XSHE	2022-01-10	0.019645	0.000000	1.000000	1.000000	1.019645	1.138976
1572623	000002.XSHE	2022-01-11	0.014220	0.000000	1.000000	1.000000	1.014220	1.155172
1573370	000002.XSHE	2022-01-12	0.001809	0.000000	1.000000	1.000000	1.001809	1.157262
1573862	000002.XSHE	2022-01-13	-0.006772	-0.000000	1.000000	1.000000	0.993228	1.149425
...	...	...	...	...	...	...	...	...
1750218	688786.XSHG	2022-12-21	-0.002262	-0.000000	1.000000	0.723008	0.997738	1.007237
1751230	688786.XSHG	2022-12-22	-0.020498	-0.000000	1.000000	0.723008	0.979502	0.986590
1752398	688786.XSHG	2022-12-23	-0.037454	-0.000000	1.000000	0.723008	0.962546	0.949638
1752743	688786.XSHG	2022-12-26	0.006763	0.006763	1.006763	0.727898	1.006763	0.956061
1753560	688786.XSHG	2022-12-27	0.027131	0.027131	1.027131	0.747646	1.027131	0.981999
1754288	688786.XSHG	2022-12-28	-0.022891	-0.022891	0.977109	0.730532	0.977109	0.959521
1755284	688786.XSHG	2022-12-29	-0.017464	-0.017464	0.982536	0.717774	0.982536	0.942763
1756484	688786.XSHG	2022-12-30	0.018494	0.018494	1.018494	0.731048	1.018494	0.960199

	tradeDate	open_ret	rule_ret	1+rule_ret	cum_rule_ret	1+open_ret	cum_open_ret
secID
002464.XSHE	2022-06-27	-0.038462	-0.038462	0.961538	0.097306	0.961538	0.097306
600091.XSHG	2022-06-15	-0.026223	-0.026223	0.973777	0.104188	0.973777	0.100244
600695.XSHG	2022-06-07	0.041337	0.041337	1.041337	0.141306	1.041337	0.141306
600652.XSHG	2022-06-15	-0.107438	-0.000000	1.000000	0.257113	0.892562	0.288000
688608.XSHG	2022-12-30	0.005591	0.005591	1.005591	0.459943	1.005591	0.605369
000056.XSHE	2022-12-30	0.004424	0.004424	1.004424	0.490503	1.004424	0.857958
300482.XSHE	2022-12-30	-0.023919	-0.023919	0.976081	0.496035	0.976081	0.826127
300589.XSHE	2022-12-30	0.020557	0.020557	1.020557	0.496537	1.020557	0.740127
...	...	...	...	...	...	...	...
300351.XSHE	2022-12-30	-0.026662	-0.026662	0.973338	1.827223	0.973338	1.058658
601975.XSHG	2022-12-30	0.015666	0.015666	1.015666	1.885278	1.015666	1.945000
003029.XSHE	2022-12-30	0.007866	0.000000	1.000000	1.922016	1.007866	1.645743
600387.XSHG	2022-12-30	-0.011278	-0.000000	1.000000	1.923154	0.988722	1.290026
300071.XSHE	2022-12-30	-0.015997	-0.000000	1.000000	1.970213	0.984003	1.873131
000411.XSHE	2022-12-30	0.003396	0.003396	1.003396	2.294187	1.003396	0.975191
600188.XSHG	2022-12-30	-0.007548	-0.007548	0.992452	2.397460	0.992452	1.550306
002219.XSHE	2022-12-30	-0.005022	-0.000000	1.000000	3.316126	0.994978	2.954720

	tradeDate	open_ret	rule_ret	1+rule_ret	cum_rule_ret	1+open_ret	cum_open_ret
secID
002464.XSHE	2022-06-27	-0.038462	-0.038462	0.961538	0.097306	0.961538	0.097306
600091.XSHG	2022-06-15	-0.026223	-0.026223	0.973777	0.104188	0.973777	0.100244
600695.XSHG	2022-06-07	0.041337	0.041337	1.041337	0.141306	1.041337	0.141306
600652.XSHG	2022-06-15	-0.107438	-0.000000	1.000000	0.257113	0.892562	0.288000
688608.XSHG	2022-12-30	0.005591	0.005591	1.005591	0.459943	1.005591	0.605369
000056.XSHE	2022-12-30	0.004424	0.004424	1.004424	0.490503	1.004424	0.857958
300482.XSHE	2022-12-30	-0.023919	-0.023919	0.976081	0.496035	0.976081	0.826127
300589.XSHE	2022-12-30	0.020557	0.020557	1.020557	0.496537	1.020557	0.740127
...	...	...	...	...	...	...	...
300351.XSHE	2022-12-30	-0.026662	-0.026662	0.973338	1.827223	0.973338	1.058658
601975.XSHG	2022-12-30	0.015666	0.015666	1.015666	1.885278	1.015666	1.945000
003029.XSHE	2022-12-30	0.007866	0.000000	1.000000	1.922016	1.007866	1.645743
600387.XSHG	2022-12-30	-0.011278	-0.000000	1.000000	1.923154	0.988722	1.290026
300071.XSHE	2022-12-30	-0.015997	-0.000000	1.000000	1.970213	0.984003	1.873131
000411.XSHE	2022-12-30	0.003396	0.003396	1.003396	2.294187	1.003396	0.975191
600188.XSHG	2022-12-30	-0.007548	-0.007548	0.992452	2.397460	0.992452	1.550306
002219.XSHE	2022-12-30	-0.005022	-0.000000	1.000000	3.316126	0.994978	2.954720

	secID	tradeDate	open_ret	EM_signal	MACD_signal	OBV_signal	AROON_signal	BOLL_signal	CCI_signal	CMO_signal	DMI_signal	MFI_signal	RSI_signal	signal_date	ret_sign
81	000002.XSHE	2007-05-16	-0.055868	1.0	1.0	1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2007-05-14	-1
82	000002.XSHE	2007-05-17	0.059722	1.0	1.0	0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2007-05-15	1
83	000002.XSHE	2007-05-18	0.033187	1.0	1.0	1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2007-05-16	1
84	000002.XSHE	2007-05-21	-0.037576	1.0	1.0	1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2007-05-17	-1
85	000002.XSHE	2007-05-22	0.045341	1.0	1.0	1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2007-05-18	1
86	000002.XSHE	2007-05-23	0.047590	1.0	1.0	0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2007-05-21	1
87	000002.XSHE	2007-05-24	0.062104	1.0	1.0	1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2007-05-22	1
88	000002.XSHE	2007-05-25	-0.016784	1.0	1.0	1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2007-05-23	-1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1884115	688786.XSHG	2023-03-22	-0.006118	1.0	0.0	1	1.0	0.0	1.0	0.0	0.0	0.0	0.0	2023-03-20	-1
1884116	688786.XSHG	2023-03-23	0.002392	1.0	0.0	0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	2023-03-21	1
1884117	688786.XSHG	2023-03-24	0.025579	1.0	1.0	1	1.0	0.0	0.0	0.0	0.0	0.0	0.0	2023-03-22	1
1884118	688786.XSHG	2023-03-27	-0.033846	1.0	1.0	1	1.0	0.0	0.0	0.0	0.0	0.0	0.0	2023-03-23	-1
1884119	688786.XSHG	2023-03-28	-0.021217	1.0	1.0	0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	2023-03-24	-1
1884120	688786.XSHG	2023-03-29	-0.020870	0.0	0.0	0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	2023-03-27	-1
1884121	688786.XSHG	2023-03-30	0.006878	0.0	0.0	0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	2023-03-28	1
1884122	688786.XSHG	2023-03-31	-0.017835	0.0	0.0	0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	2023-03-29	-1

	secID	tradeDate	open_ret	EM_signal	MACD_signal	OBV_signal	AROON_signal	BOLL_signal	CCI_signal	CMO_signal	DMI_signal	MFI_signal	RSI_signal	signal_date	ret_sign
0	002089.XSHE	2007-03-01	0.063924	1.0	1.0	0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2007-02-27	1
1	600355.XSHG	2007-03-01	0.110109	1.0	0.0	0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	2007-02-27	1
2	600476.XSHG	2007-03-01	0.068561	1.0	1.0	0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	2007-02-27	1
3	600489.XSHG	2007-03-01	0.087017	1.0	1.0	0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2007-02-27	1
4	600306.XSHG	2007-03-01	0.056576	1.0	0.0	0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	2007-02-27	1
5	600446.XSHG	2007-03-01	0.039526	0.0	0.0	0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2007-02-27	1
6	600367.XSHG	2007-03-01	0.078061	1.0	1.0	0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	2007-02-27	1
7	600850.XSHG	2007-03-01	0.090906	1.0	0.0	0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	2007-02-27	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1805085	300662.XSHE	2023-03-31	0.010190	0.0	0.0	0	1.0	0.0	1.0	0.0	0.0	1.0	1.0	2023-03-29	1
1805086	600256.XSHG	2023-03-31	0.029704	0.0	0.0	0	1.0	0.0	1.0	0.0	0.0	1.0	1.0	2023-03-29	1
1805087	603026.XSHG	2023-03-31	0.003213	0.0	1.0	1	1.0	0.0	1.0	1.0	0.0	1.0	1.0	2023-03-29	1
1805088	003022.XSHE	2023-03-31	-0.009392	0.0	0.0	1	1.0	0.0	1.0	0.0	0.0	1.0	1.0	2023-03-29	-1
1805089	000883.XSHE	2023-03-31	-0.002366	0.0	0.0	0	1.0	0.0	1.0	0.0	0.0	1.0	0.0	2023-03-29	-1
1805090	603833.XSHG	2023-03-31	0.026994	0.0	0.0	1	1.0	0.0	1.0	0.0	0.0	1.0	0.0	2023-03-29	1
1805091	600373.XSHG	2023-03-31	-0.046232	1.0	1.0	1	0.0	0.0	0.0	0.0	1.0	0.0	0.0	2023-03-29	-1
1805092	688786.XSHG	2023-03-31	-0.017835	0.0	0.0	0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	2023-03-29	-1

	features	feature_importance
1	MACD_signal	0.338913
2	OBV_signal	0.318480
0	EM_signal	0.099250
3	AROON_signal	0.087101
9	RSI_signal	0.069541
6	CMO_signal	0.048105
5	CCI_signal	0.034266
8	MFI_signal	0.004345
4	BOLL_signal	0.000000
7	DMI_signal	0.000000

	secID	tradeDate	open_ret	rule_ret
1568607	300791.XSHE	2022-01-04	0.001204	0.000000
1568608	002636.XSHE	2022-01-04	-0.004623	-0.004623
1568609	600720.XSHG	2022-01-04	0.006788	0.000000
1568610	002677.XSHE	2022-01-04	0.006907	0.000000
1568611	300383.XSHE	2022-01-04	0.013603	0.000000
1568612	002693.XSHE	2022-01-04	0.017220	0.000000
1568613	002489.XSHE	2022-01-04	0.011306	0.000000
1568614	001218.XSHE	2022-01-04	-0.002069	-0.002069
...	...	...	...	...
1756681	301055.XSHE	2022-12-30	0.006943	0.006943
1756682	002686.XSHE	2022-12-30	-0.010061	-0.000000
1756683	603079.XSHG	2022-12-30	0.001595	0.001595
1756684	603895.XSHG	2022-12-30	-0.007683	-0.007683
1756685	003022.XSHE	2022-12-30	0.008315	0.000000
1756686	002892.XSHE	2022-12-30	-0.023044	-0.023044
1756687	601107.XSHG	2022-12-30	-0.005321	-0.000000
1756688	002701.XSHE	2022-12-30	0.086302	0.086302

	rule_daily_ret
ret_mean	-0.000122
t_values	-0.799003