# Technical-indicator rule mining on Chinese A-share daily data
# (notebook-style script; DataAPI is provided by the hosting research platform).
import pandas as pd
import numpy as np
import tqdm
import gc
import matplotlib.pyplot as plt
import talib as ta
pd.set_option('display.max_rows', 16)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
plt.rcParams['figure.figsize'] = (16.0, 9.0)
# Sample window for all data pulls, as YYYYMMDD strings.
START = '20070101'
END = '20221231'
# Security Id
# Pull the security master (equities only) from the platform's DataAPI.
stk_info = DataAPI.SecIDGet(assetClass="E",pandas="1")
# Keep Shenzhen (XSHE) and Shanghai (XSHG) listings only.
cond1 = (stk_info['exchangeCD'] == 'XSHE') | (stk_info['exchangeCD'] == 'XSHG')
# Keep both listed ('L') and delisted ('DE') names to avoid survivorship bias.
cond2 = (stk_info['listStatusCD'] == 'L') | (stk_info['listStatusCD'] == 'DE')
stk_info = stk_info[cond1 & cond2].copy()
stk_id = stk_info['secID']
# ST
# Special-treatment (ST) flag history; used below to drop ST trading days.
st_df = DataAPI.SecSTGet(beginDate=START,endDate=END,secID=stk_id,field=['secID','tradeDate','STflg'],pandas="1")
st_df['tradeDate'] = pd.to_datetime(st_df['tradeDate'],format="%Y-%m-%d")
# %%time
# Daily after-adjustment OHLCV pull (commented out: takes ~6 minutes; the
# result was cached to a pickle and is re-loaded below).
# stk_df = DataAPI.MktEqudAdjAfGet(secID=stk_id,beginDate=START,endDate=END,isOpen=1,
# field=["secID","tradeDate",
# 'preClosePrice',"closePrice",
# 'openPrice','highestPrice','lowestPrice',
# "negMarketValue",
# "turnoverValue",'turnoverVol'],pandas="1")
# stk_df.to_pickle('./data/stk_df.pkl')
# # Takes about 6 mins
# Load the cached daily bars.
stk_df = pd.read_pickle('./data/stk_df.pkl')
stk_df['tradeDate'] = pd.to_datetime(stk_df['tradeDate'], format='%Y-%m-%d')
# A stable (secID, tradeDate) ordering is assumed by the positional
# (.values-based) column assignments later in the script.
stk_df.sort_values(['secID','tradeDate'],inplace=True)
# drop ST stocks
# Left-merge marks ST days; rows where STflg is NaN are the non-ST days we keep.
print(stk_df.shape)
stk_df = pd.merge(stk_df, st_df, on=['secID','tradeDate'],how='left')
stk_df = stk_df[stk_df['STflg'].isna()].copy()
stk_df.drop('STflg',axis=1,inplace=True)
print(stk_df.shape)
# 不填充停牌值比较合理,因为技术分析只看量价,直接计算量价关系较为合适
# (It is reasonable not to fill in values for suspended-trading days: technical
# analysis only looks at price and volume, so computing the price-volume
# relations directly on the observed bars is appropriate.)
# Down-sample to 1000 distinct stocks to keep the indicator computations tractable.
# BUG FIX: np.random.choice samples WITH replacement by default, so the original
# call silently produced duplicate secIDs (i.e. fewer than 1000 unique stocks);
# replace=False draws 1000 distinct names.
random_stkid = np.random.choice(stk_df['secID'].unique(), 1000, replace=False)
stk_df = stk_df[stk_df['secID'].isin(random_stkid)].copy()
# Open-to-open daily return per stock (first observation per stock is NaN).
stk_df['open_ret'] = stk_df.groupby('secID')['openPrice'].apply(lambda x: x / x.shift() - 1)
# Close-to-close daily return using the vendor-supplied previous close.
stk_df['close_ret'] = stk_df['closePrice']/stk_df['preClosePrice'] - 1
def rule_return(df, demean=True, open_ret=True):
    """
    Compute the return series implied by a trading rule.

    df must contain these columns:
        signal    : the position signal generated by the rule
        close_ret : daily return computed from close prices
        open_ret  : daily return computed from open prices

    The demeaned variants (close_ret - close_ret.mean(), and similarly for
    open_ret) are used to adjust for the bias created by broadly bullish or
    bearish markets.

    Returns a two-column DataFrame holding the selected per-period strategy
    return and its cumulative compounded return. Note: df is mutated in place
    (intermediate columns are added).
    """
    df['close_ret_demean'] = df['close_ret'] - df['close_ret'].mean()
    df['open_ret_demean'] = df['open_ret'] - df['open_ret'].mean()
    # Trading at the close uses the same-day signal; trading at the next
    # open uses yesterday's signal.
    df['position_close'] = df['signal']
    df['position_open'] = df['signal'].shift()
    # Returns are earned on the position held *entering* the period.
    lag_close = df['position_close'].shift()
    lag_open = df['position_open'].shift()
    df['position_close_ret'] = lag_close * df['close_ret']
    df['position_open_ret'] = lag_open * df['open_ret']
    df['position_close_ret_demean'] = lag_close * df['close_ret_demean']
    df['position_open_ret_demean'] = lag_open * df['open_ret_demean']
    df['close_cumret'] = (df['position_close_ret'] + 1).cumprod()
    df['open_cumret'] = (df['position_open_ret'] + 1).cumprod()
    # Select which pair of columns to report.
    which = 'open' if open_ret else 'close'
    suffix = '_demean' if demean else ''
    ret_col = 'position_{}_ret{}'.format(which, suffix)
    cum_col = '{}_cumret'.format(which)
    return pd.DataFrame({ret_col: df[ret_col].values,
                         cum_col: df[cum_col].values})
# EMA
# 20-day exponential moving average per stock: long (1) while the close is
# above its EMA, flat (0) while at or below.
stk_df['EMA'] = stk_df.groupby('secID')['closePrice'].apply(ta.EMA, 20)
stk_df['EM_signal'] = np.where(stk_df['closePrice']>stk_df['EMA'],1,np.nan)
stk_df.loc[stk_df['closePrice']<=stk_df['EMA'],'EM_signal'] = 0
# ffill carries the last valid state through the EMA warm-up NaNs per stock.
stk_df['EM_signal'] = stk_df.groupby('secID')['EM_signal'].fillna(method='ffill')
stk_df['EM_signal'].value_counts()
# MACD
# ta.MACD returns (macd, signal_line, histogram). It is called twice per group
# (once per output) — wasteful but kept as-is. Long (1) while the MACD line is
# above its signal line.
stk_df['MACD'] = stk_df.groupby('secID')['closePrice'].apply(lambda x: ta.MACD(x)[0])
stk_df['MACD_rawsignal'] = stk_df.groupby('secID')['closePrice'].apply(lambda x: ta.MACD(x)[1])
stk_df['MACD_signal'] = np.where(stk_df['MACD']>stk_df['MACD_rawsignal'],1,np.nan)
stk_df.loc[stk_df['MACD']<=stk_df['MACD_rawsignal'],'MACD_signal'] = 0
# Forward-fill through the MACD warm-up NaNs per stock.
stk_df['MACD_signal'] = stk_df.groupby('secID')['MACD_signal'].fillna(method='ffill')
stk_df['MACD_signal'].value_counts()
# # Example
# temp = stk_df[stk_df['secID']=='900957.XSHG'].copy()
# ta.OBV(temp['closePrice'],temp['turnoverValue'])
# stk_df.loc[9968491:]
# OBV
# On-balance volume per stock. The .values trick strips the groupby MultiIndex
# so the result aligns positionally with stk_df (sorted by secID, tradeDate).
stk_df['OBV'] = stk_df.groupby('secID')[['closePrice','turnoverVol']].apply(lambda x: ta.OBV(real=x['closePrice'],volume=x['turnoverVol']).to_frame('OBV')).values
# Day-over-day OBV change.
# FIX: the original computed this groupby expression twice, discarding the
# first result; the dead duplicate statement has been removed.
stk_df['OBV_change'] = stk_df.groupby('secID')['OBV'].apply(lambda x: x/x.shift()-1)
# Classic OBV reading: rising OBV is bullish whether price fell (buy1) or
# rose (buy2) on the day.
buy1_OBV = (stk_df['close_ret']<0)&(stk_df['OBV_change']>0)
buy2_OBV = (stk_df['close_ret']>0)&(stk_df['OBV_change']>0)
# Falling-OBV (sell) conditions are listed for clarity; they are covered by
# the default signal of 0 below.
sell1_OBV = (stk_df['close_ret']>0)&(stk_df['OBV_change']<0)
sell2_OBV = (stk_df['close_ret']<0)&(stk_df['OBV_change']<0)
stk_df['OBV_signal'] = 0
stk_df.loc[buy1_OBV | buy2_OBV,'OBV_signal']=1
stk_df['OBV_signal'].value_counts()
# AROON
# NOTE(review): N is defined but never passed to ta.AROON, which therefore
# uses its default timeperiod — confirm whether N=26 was intended.
N = 26
high_threshold = 70 # strong trend
low_threshold = 30 # weak trend
stk_df['AROON_up'] = stk_df.groupby('secID')[['highestPrice','lowestPrice']].apply(lambda x:ta.AROON(high=x['highestPrice'],low=x['lowestPrice'])[0].to_frame('AROON_up')).values
stk_df['AROON_down'] = stk_df.groupby('secID')[['highestPrice','lowestPrice']].apply(lambda x:ta.AROON(high=x['highestPrice'],low=x['lowestPrice'])[1].to_frame('AROON_down')).values
# Long when the up-trend dominates (AROON up strong, down weak); exit on the
# mirror-image condition. In-between states inherit the previous signal.
buy_AROON = (stk_df['AROON_up']>high_threshold)&(stk_df['AROON_down']<low_threshold)
sell_AROON = (stk_df['AROON_down']>high_threshold)&(stk_df['AROON_up']<low_threshold)
stk_df['AROON_signal'] = np.where(buy_AROON,1,np.nan)
stk_df.loc[sell_AROON,'AROON_signal'] = 0
stk_df['AROON_signal'] = stk_df.groupby('secID')['AROON_signal'].fillna(method='ffill')
stk_df['AROON_signal'].value_counts()
# BOLL
# Bollinger bands (upper / middle / lower). BBANDS is called three times per
# group, once per band, with positional alignment via .values.
stk_df['BOLL_up'] = stk_df.groupby('secID')[['closePrice']].apply(lambda x:ta.BBANDS(real=x['closePrice'])[0].to_frame('BOLL_up')).values
stk_df['BOLL_mid'] = stk_df.groupby('secID')[['closePrice']].apply(lambda x:ta.BBANDS(real=x['closePrice'])[1].to_frame('BOLL_mid')).values
stk_df['BOLL_low'] = stk_df.groupby('secID')[['closePrice']].apply(lambda x:ta.BBANDS(real=x['closePrice'])[2].to_frame('BOLL_low')).values
# Yesterday's bands, used to detect crossings.
stk_df['BOLL_up_lag'] = stk_df.groupby('secID')['BOLL_up'].shift()
stk_df['BOLL_mid_lag'] = stk_df.groupby('secID')['BOLL_mid'].shift()
stk_df['BOLL_low_lag'] = stk_df.groupby('secID')['BOLL_low'].shift()
# Buy on an upward breakout through the upper band; sell on a downward cross
# of the middle band.
buy_BOLL = (stk_df['preClosePrice'] < stk_df['BOLL_up_lag'])&(stk_df['closePrice']>stk_df['BOLL_up'])
sell_BOLL = (stk_df['preClosePrice'] > stk_df['BOLL_mid_lag'])&(stk_df['closePrice']<stk_df['BOLL_mid'])
stk_df['BOLL_signal'] = np.where(buy_BOLL, 1, np.nan)
stk_df.loc[sell_BOLL,'BOLL_signal'] = 0
stk_df['BOLL_signal'] = stk_df.groupby('secID')['BOLL_signal'].fillna(method='ffill')
stk_df['BOLL_signal'].value_counts()
# CCI
# Commodity channel index, used contrarian-style: long after an oversold
# reading, flat after an overbought one.
buy_threshold = -100 # buy threshold (oversold)
sell_threshold = 100 # sell threshold (overbought)
stk_df['CCI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x:ta.CCI(high=x['highestPrice'],low=x['lowestPrice'],close=x['closePrice']).to_frame('CCI')).values
buy_CCI = stk_df['CCI']<buy_threshold
sell_CCI = stk_df['CCI']>sell_threshold
stk_df['CCI_signal'] = np.where(buy_CCI,1,np.nan)
stk_df.loc[sell_CCI,'CCI_signal'] = 0
stk_df['CCI_signal'] = stk_df.groupby('secID')['CCI_signal'].fillna(method='ffill')
stk_df['CCI_signal'].value_counts()
# CMO
# Chande momentum oscillator, same contrarian pattern as CCI.
buy_threshold = -50 # buy threshold (oversold)
sell_threshold = 50
stk_df['CMO'] = stk_df.groupby('secID')['closePrice'].apply(ta.CMO)
buy_CMO = stk_df['CMO'] < buy_threshold
sell_CMO = stk_df['CMO'] > sell_threshold
stk_df['CMO_signal'] = np.where(buy_CMO, 1, np.nan)
stk_df.loc[sell_CMO,'CMO_signal'] = 0
stk_df['CMO_signal'] = stk_df.groupby('secID')['CMO_signal'].fillna(method='ffill')
stk_df['CMO_signal'].value_counts()
# Reclaim memory accumulated by the repeated groupby-apply calls.
gc.collect()
# DMI
# Directional movement: +DI / -DI measure upward / downward pressure; ADX
# measures trend strength.
stk_df['plus_DI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x: ta.PLUS_DI(high=x['highestPrice'], low=x['lowestPrice'], close=x['closePrice']).to_frame()).values
stk_df['minus_DI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x: ta.MINUS_DI(high=x['highestPrice'], low=x['lowestPrice'], close=x['closePrice']).to_frame()).values
stk_df['ADX'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice']].apply(lambda x: ta.ADX(high=x['highestPrice'], low=x['lowestPrice'], close=x['closePrice']).to_frame()).values
buy_threshold = 50
sell_threshold = 20
# Long only in a strong uptrend (high ADX with +DI above -DI); exit when the
# trend weakens OR the direction flips.
buy_DMI = (stk_df['ADX']>buy_threshold)&(stk_df['plus_DI']>stk_df['minus_DI'])
sell_DMI = (stk_df['ADX']<sell_threshold)|(stk_df['plus_DI']<stk_df['minus_DI'])
stk_df['DMI_signal'] = np.where(buy_DMI,1,np.nan)
stk_df.loc[sell_DMI,'DMI_signal'] = 0
stk_df['DMI_signal'] = stk_df.groupby('secID')['DMI_signal'].fillna(method='ffill')
stk_df['DMI_signal'].value_counts()
# MFI
# Money flow index (volume-weighted RSI analogue), used contrarian-style.
stk_df['MFI'] = stk_df.groupby('secID')[['highestPrice','lowestPrice','closePrice','turnoverVol']].apply(lambda x:ta.MFI(high=x['highestPrice'],low=x['lowestPrice'],close=x['closePrice'],volume=x['turnoverVol']).to_frame()).values
buy_threshold = 20
sell_threshold = 80
buy_MFI = stk_df['MFI']<buy_threshold
sell_MFI = stk_df['MFI']>sell_threshold
stk_df['MFI_signal'] = np.where(buy_MFI,1,np.nan)
stk_df.loc[sell_MFI,'MFI_signal'] = 0
stk_df['MFI_signal'] = stk_df.groupby('secID')['MFI_signal'].fillna(method='ffill')
stk_df['MFI_signal'].value_counts()
# RSI
# Relative strength index, contrarian rule: buy when oversold (RSI < 30),
# sell when overbought (RSI > 70).
stk_df['RSI'] = stk_df.groupby('secID')['closePrice'].apply(ta.RSI)
buy_threshold = 30
sell_threshold = 70
buy_RSI = stk_df['RSI'] < buy_threshold
# BUG FIX: the sell condition previously used '<' (RSI < 70), which flagged
# nearly every observation as a sell and immediately overwrote the buys.
sell_RSI = stk_df['RSI'] > sell_threshold
stk_df['RSI_signal'] = np.where(buy_RSI,1,np.nan)
stk_df.loc[sell_RSI,'RSI_signal'] = 0
# Carry the last state through in-between and warm-up NaNs per stock.
stk_df['RSI_signal'] = stk_df.groupby('secID')['RSI_signal'].fillna(method='ffill')
# %%time
# WVAD
# Abandoned attempts at Williams' variable accumulation/distribution:
# def wvad(df):
# return sum((df[:,0] - df[:,1]) / (df[:,2] - df[:,3]) * df[:,4])
# stk_df.groupby('secID')[['closePrice','openPrice','highestPrice',
# 'lowestPrice','turnoverVol']].apply(lambda x: x.rolling(24).apply(wvad,raw=False))
######## The above code is not working ##########
# (rolling().apply passes one column at a time, so a multi-column WVAD cannot
# be expressed this way; the workaround below was also abandoned.)
# temp = stk_df[stk_df['secID'].isin(np.random.choice(stk_df['secID'].unique(),10))].copy()
# def wvad(closePrice):
# close = closePrice
# open_ = stk_df.loc[closePrice.index, 'openPrice'].to_numpy()
# high = stk_df.loc[closePrice.index, 'highestPrice'].to_numpy()
# low = stk_df.loc[closePrice.index, 'lowestPrice'].to_numpy()
# volume = stk_df.loc[closePrice.index, 'turnoverVol'].to_numpy()
# return sum((closePrice - open_) / (high - low) * volume)
# temp2 = temp.groupby('secID')['closePrice'].rolling(24).apply(wvad, raw=False)
# Reset to a clean 0..n-1 RangeIndex so positional and label indexing agree below.
stk_df.reset_index(inplace=True, drop=True)
# Assemble the ML dataset: one row per (stock, signal date) with every
# column whose name ends in 'signal' as a feature.
cols = ['secID','tradeDate','open_ret'] + [col for col in stk_df.columns if col[-6:]=='signal']
ret_df = stk_df[cols].copy()
ret_df['signal_date'] = ret_df['tradeDate']
# Shift the target forward: a signal observed at the close of day t is traded
# at the open of t+1 and evaluated with the open-to-open return ending at t+2,
# hence shift(-2) within each stock.
ret_df[['tradeDate','open_ret']] = ret_df.groupby('secID')[['tradeDate','open_ret']].shift(-2)
ret_df.dropna(inplace=True)
# MACD_rawsignal is the raw signal *line*, not a 0/1 trading signal — drop it.
ret_df.drop('MACD_rawsignal',axis=1,inplace=True)
# Binary label: sign of the future open-to-open return.
ret_df['ret_sign'] = np.where(ret_df['open_ret']>0,1,-1)
ret_df
# Chronological order so the year-based splits below are contiguous.
ret_df.sort_values('tradeDate',inplace=True)
ret_df.reset_index(inplace=True,drop=True)
ret_df
ret_df['year'] = ret_df['tradeDate'].dt.year
# Row indices grouped by calendar year, in ascending year order.
time_idx = [value for (key, value) in sorted(ret_df.groupby('year').groups.items())]
time_idx
def list_flat(list_):
    """Flatten one level of nesting: [[1, 2], [3]] -> [1, 2, 3]."""
    flattened = []
    for sublist in list_:
        flattened.extend(sublist)
    return flattened
# quick sanity check of the helper
list_flat([[1,2,3],[3,4,5]])
# training, validation, testing scheme:
# 1. [2007-2010], [2011-2014], [2015]
# 2. [2007-2011], [2012-2015], [2016]
# ...
# last. [2008-2017], [2018-2021], [2022]
# Expanding-window walk-forward: train on everything before a rolling 4-year
# validation window; test on the single following year.
fulltrain_idx = []
cv_idx = []
test_idx = []
for i in range(4,len(time_idx)-4):
    train_idx = list_flat(time_idx[0:i])
    val_idx = list_flat(time_idx[i:i+4])
    fulltrain_idx.append(train_idx + val_idx)
    cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0],
                   np.where(np.isin(fulltrain_idx[-1], val_idx))[0])) # GridSearchCV operates on positional arrays internally and cannot carry a pandas index,
    # so cv_idx must hold 0-based positions *within* fulltrain_idx
    test_idx.append(time_idx[i+4])
cv_idx[0]
test_years = list(range(2015, 2023))
test_years
# Feature columns: every indicator signal built above.
X_cols = [col for col in ret_df.columns if col[-6:]=='signal']
X_cols
# 用 accuracy, f1_score 作为好坏评价的标准
# (Use accuracy and f1_score as the evaluation criteria.)
# Hyper-parameter grid for the random forest; accuracy (GridSearchCV default
# scorer) selects the best combination on the validation split.
hyperparam_grid = [
    {'n_estimators': [50], 'max_depth': [1,3,5],
     'max_features': [3,5,len(X_cols)]}
]
model = RandomForestClassifier()
# Cross validation for period 0, i.e.
# train: [2007-2010], val: [2011-2014], test: [2015]
# cv=[...] supplies the single predefined (train, val) split instead of k-fold.
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]])
X_fulltrain = ret_df.loc[fulltrain_idx[0], X_cols]
y_fulltrain = ret_df.loc[fulltrain_idx[0], 'ret_sign']
X_test = ret_df.loc[test_idx[0], X_cols]
y_test = ret_df.loc[test_idx[0], 'ret_sign']
ret_df.loc[fulltrain_idx[0]]
%%time
# Fit the grid search on train+validation for period 0 and evaluate out of sample.
grid_search.fit(X_fulltrain, y_fulltrain)
grid_search.best_params_
# Feature importances of the selected model, largest first.
pd.DataFrame({"features":X_cols,"feature_importance":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',ascending=False)
y_pred = grid_search.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
# Benchmark all-positive
# Naive baseline: always predict "up" (+1).
all_ones = np.ones_like(y_pred)
accuracy_score(y_true=y_test, y_pred=all_ones)
y_pred = grid_search.predict(X_test)
print(f1_score(y_true=y_test, y_pred=y_pred))
print(f1_score(y_true=y_test, y_pred=all_ones))
%%time
# Full walk-forward evaluation: refit the grid search for every period and
# report out-of-sample accuracy on each held-out test year.
for i in range(len(fulltrain_idx)):
    X_fulltrain = ret_df.loc[fulltrain_idx[i], X_cols]
    y_fulltrain = ret_df.loc[fulltrain_idx[i], 'ret_sign']
    X_test = ret_df.loc[test_idx[i], X_cols]
    y_test = ret_df.loc[test_idx[i], 'ret_sign']
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]])
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    print("Test year", test_years[i],":",accuracy_score(y_true=y_test, y_pred=y_pred))
# Inspect a single period in detail: i = 6 -> test year 2021.
i = 6
print(test_years[i])
X_fulltrain = ret_df.loc[fulltrain_idx[i], X_cols]
y_fulltrain = ret_df.loc[fulltrain_idx[i], 'ret_sign']
X_test = ret_df.loc[test_idx[i], X_cols]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]])
grid_search.fit(X_fulltrain, y_fulltrain)
# Use the classifier's prediction as a long/flat trading rule.
rule = grid_search.predict(X=X_test)
rule
# Map predicted -1 ("down") to 0, i.e. stay out of the market.
rule[rule==-1] = 0
rule
# Realized rule return: position (0/1) times the future open-to-open return.
rule_ret = ret_df.loc[test_idx[i],'open_ret'].values * rule
ret_df_2021 = ret_df.loc[test_idx[i],['secID','tradeDate','open_ret']].copy()
ret_df_2021['rule_ret'] = rule_ret
ret_df_2021
ret_df_2021.sort_values(['secID','tradeDate'],inplace=True)
# time-series mean of daily return, per stock, over the test year
rule_tsmean_ret_by_crs = ret_df_2021.groupby('secID')['rule_ret'].mean()
rule_tsmean_ret_by_crs
rule_tsmean_ret_by_crs.dropna(inplace=True)
# Test whether the cross-sectional mean of the per-stock average daily rule
# return differs significantly from zero.
# BUG FIX: the original called sm.OLS but statsmodels ('sm') was never
# imported, raising NameError. For a constant-only regression the OLS
# coefficient is the sample mean, and the HC0 robust standard error reduces to
# sqrt(sum(resid**2)) / n, so both are computed directly with numpy.
y = rule_tsmean_ret_by_crs.values
n = len(y)
mean_values = y.mean()
resid = y - mean_values
se_hc0 = np.sqrt((resid ** 2).sum()) / n
t_values = mean_values / se_hc0
pd.DataFrame([mean_values,t_values],index=['ret_mean','t_values'],columns=['rule_daily_ret'])
# Quick annualization sanity checks for a 5 bp average daily return
# (simple vs. compounded over ~250 trading days):
0.0005 *250
(1+0.0005)**250 - 1