In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, HuberRegressor, SGDRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline

# import lightgbm as lgb

import tensorflow as tf
from tensorflow import keras

- [Data](#data)
  - [NA 值处理](#na-值处理)
  - [Use rank instead of numerical values](#use-rank-instead-of-numerical-values)
- [Train, Validation, Test split](#train-validation-test-split)
- [Evaluation metrics](#evaluation-metrics)
- [Models](#models)
  - [Linear regression](#linear-regression)
  - [Huber regressor](#huber-regressor)
  - [Random Forest](#random-forest)
  - [Partial Least Squares](#partial-least-squares)
  - [Principal Component Regression](#principal-component-regression)
    - [PCA transform](#pca-transform)
    - [PCA regression](#pca-regression)
  - [Pipeline](#pipeline)
  - [Elastic Net](#elastic-net)
  - [Gradient Boosted Regression Trees](#gradient-boosted-regression-trees)
  - [Neural Nets](#neural-nets)
    - [GridSeachCV Neural Nets](#gridseachcv-neural-nets)
- [Transformation pipeline example](#transformation-pipeline-example)

# Data

In [38]:
df = pd.read_pickle('../../data/factor_exposure/all_exposure.pkl')

In [39]:
df

Unnamed: 0,secID,ret_date,tradeDate,ret,rf,exret,ym,mktcap,size,rev,beta,bm,illiq,illiq_12m,mom_date,mom,vol,ivol,vol_clip,ivol_clip
0,000001.XSHE,2007-07,2007-06-29,0.316497,0.002481,0.314016,2007-06,4.266117e+10,24.476555,,0.4614,0.123739,,,NaT,,,,,
1,000001.XSHE,2007-08,2007-07-31,0.048855,0.002404,0.046451,2007-07,5.616330e+10,24.751529,0.314016,0.6423,0.093992,0.000040,,2007-06,,0.042521,,0.042521,
2,000001.XSHE,2007-09,2007-08-31,0.052105,0.002621,0.049484,2007-08,5.890714e+10,24.799228,0.046451,0.7722,0.097085,0.000020,,2007-07,,0.033926,,0.033926,
3,000001.XSHE,2007-10,2007-09-28,0.201851,0.003095,0.198756,2007-09,6.197651e+10,24.850021,0.049484,0.7596,0.092276,0.000025,,2007-08,,0.023872,,0.023872,
4,000001.XSHE,2007-11,2007-10-31,-0.249116,0.003780,-0.252896,2007-10,7.448652e+10,25.033884,0.198756,0.7988,0.083411,0.000030,,2007-09,,0.035921,,0.035921,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504875,900957.XSHG,2021-12,2021-11-30,0.035831,0.002026,0.033805,2021-11,1.120560e+08,18.534509,-0.042588,,,0.070056,0.062884,2021-10,0.216730,0.009639,0.007046,0.009639,0.007046
504876,900957.XSHG,2022-01,2021-12-31,-0.022013,0.002014,-0.024027,2021-12,1.161040e+08,18.569997,0.033805,,,0.078037,0.059672,2021-11,0.211045,0.010961,0.008692,0.010961,0.008692
504877,900957.XSHG,2022-02,2022-01-28,-0.011254,0.001921,-0.013175,2022-01,1.135280e+08,18.547560,-0.024027,,,0.044515,0.058502,2021-12,-0.059172,0.010559,0.008409,0.010559,0.008409
504878,900957.XSHG,2022-03,2022-02-28,-0.034146,0.001919,-0.036066,2022-02,1.122400e+08,18.536150,-0.013175,,,0.057218,0.060208,2022-01,-0.157182,0.006517,0.004195,0.006517,0.004195


## NA 值处理

In [40]:
for col in df.columns:
    print(col, df[col].isna().sum())

secID 0
ret_date 4853
tradeDate 0
ret 31888
rf 4853
exret 31888
ym 0
mktcap 23011
size 23011
rev 30586
beta 40704
bm 21512
illiq 41579
illiq_12m 91808
mom_date 3547
mom 49225
vol 30782
ivol 54678
vol_clip 30782
ivol_clip 54678


ret_date 为 NA 的删除，已到最新数据处

In [41]:
df = df[~df['ret_date'].isna()].copy()

In [42]:
df

Unnamed: 0,secID,ret_date,tradeDate,ret,rf,exret,ym,mktcap,size,rev,beta,bm,illiq,illiq_12m,mom_date,mom,vol,ivol,vol_clip,ivol_clip
0,000001.XSHE,2007-07,2007-06-29,0.316497,0.002481,0.314016,2007-06,4.266117e+10,24.476555,,0.4614,0.123739,,,NaT,,,,,
1,000001.XSHE,2007-08,2007-07-31,0.048855,0.002404,0.046451,2007-07,5.616330e+10,24.751529,0.314016,0.6423,0.093992,0.000040,,2007-06,,0.042521,,0.042521,
2,000001.XSHE,2007-09,2007-08-31,0.052105,0.002621,0.049484,2007-08,5.890714e+10,24.799228,0.046451,0.7722,0.097085,0.000020,,2007-07,,0.033926,,0.033926,
3,000001.XSHE,2007-10,2007-09-28,0.201851,0.003095,0.198756,2007-09,6.197651e+10,24.850021,0.049484,0.7596,0.092276,0.000025,,2007-08,,0.023872,,0.023872,
4,000001.XSHE,2007-11,2007-10-31,-0.249116,0.003780,-0.252896,2007-10,7.448652e+10,25.033884,0.198756,0.7988,0.083411,0.000030,,2007-09,,0.035921,,0.035921,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504874,900957.XSHG,2021-11,2021-10-29,-0.040625,0.001963,-0.042588,2021-10,1.168400e+08,18.576316,-0.042478,,,0.058457,0.067646,2021-09,0.285164,0.011663,0.007700,0.011663,0.007700
504875,900957.XSHG,2021-12,2021-11-30,0.035831,0.002026,0.033805,2021-11,1.120560e+08,18.534509,-0.042588,,,0.070056,0.062884,2021-10,0.216730,0.009639,0.007046,0.009639,0.007046
504876,900957.XSHG,2022-01,2021-12-31,-0.022013,0.002014,-0.024027,2021-12,1.161040e+08,18.569997,0.033805,,,0.078037,0.059672,2021-11,0.211045,0.010961,0.008692,0.010961,0.008692
504877,900957.XSHG,2022-02,2022-01-28,-0.011254,0.001921,-0.013175,2022-01,1.135280e+08,18.547560,-0.024027,,,0.044515,0.058502,2021-12,-0.059172,0.010559,0.008409,0.010559,0.008409


momentum 从 2008-01 开始。简单起见，把所有数据调整为从2008-01开始。

In [43]:
df.loc[~df['mom'].isna(),'ret_date'].min()

Period('2008-01', 'M')

In [44]:
df = df[df['ret_date'] >= '2008-01'].copy()

In [45]:
for col in df.columns:
    print(col, df[col].isna().sum())

secID 0
ret_date 0
tradeDate 0
ret 26378
rf 0
exret 26378
ym 0
mktcap 22483
size 22483
rev 29845
beta 39159
bm 20363
illiq 36007
illiq_12m 79832
mom_date 3381
mom 36211
vol 25455
ivol 37368
vol_clip 25455
ivol_clip 37368


剩余的NA值有至少三个来源：
- 由于停牌日期填充造成，
- 由于计算时要求最低样本数造成，
- 由优矿直接给出了NA值

return 的 NA 值直接删除

In [46]:
df = df[~df['ret'].isna()].copy()

In [47]:
df

Unnamed: 0,secID,ret_date,tradeDate,ret,rf,exret,ym,mktcap,size,rev,beta,bm,illiq,illiq_12m,mom_date,mom,vol,ivol,vol_clip,ivol_clip
6,000001.XSHE,2008-01,2007-12-28,-0.137306,0.002949,-0.140255,2007-12,6.574629e+10,24.909069,0.066834,0.9468,0.094476,0.000025,,2007-11,,0.027254,,0.027254,
7,000001.XSHE,2008-02,2008-01-31,-0.004504,0.002946,-0.007450,2008-01,5.850212e+10,24.792329,-0.140255,0.9654,0.109513,0.000039,,2007-12,,0.037722,0.013266,0.037722,0.013266
8,000001.XSHE,2008-03,2008-02-29,-0.149321,0.002746,-0.152068,2008-02,5.823860e+10,24.787814,-0.007450,1.0292,0.110009,0.000064,,2008-01,,0.041448,0.009474,0.041448,0.009474
9,000001.XSHE,2008-04,2008-03-31,0.050355,0.002862,0.047493,2008-03,4.954234e+10,24.626093,-0.152068,1.0238,0.201102,0.000043,,2008-02,,0.045109,0.021746,0.045109,0.021746
10,000001.XSHE,2008-05,2008-04-30,-0.148211,0.002953,-0.151164,2008-04,5.203702e+10,24.675221,0.047493,1.0212,0.206701,0.000051,0.000038,2008-03,,0.046323,0.014474,0.046323,0.014474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504874,900957.XSHG,2021-11,2021-10-29,-0.040625,0.001963,-0.042588,2021-10,1.168400e+08,18.576316,-0.042478,,,0.058457,0.067646,2021-09,0.285164,0.011663,0.007700,0.011663,0.007700
504875,900957.XSHG,2021-12,2021-11-30,0.035831,0.002026,0.033805,2021-11,1.120560e+08,18.534509,-0.042588,,,0.070056,0.062884,2021-10,0.216730,0.009639,0.007046,0.009639,0.007046
504876,900957.XSHG,2022-01,2021-12-31,-0.022013,0.002014,-0.024027,2021-12,1.161040e+08,18.569997,0.033805,,,0.078037,0.059672,2021-11,0.211045,0.010961,0.008692,0.010961,0.008692
504877,900957.XSHG,2022-02,2022-01-28,-0.011254,0.001921,-0.013175,2022-01,1.135280e+08,18.547560,-0.024027,,,0.044515,0.058502,2021-12,-0.059172,0.010559,0.008409,0.010559,0.008409


In [48]:
for col in df.columns:
    print(col, df[col].isna().sum())

secID 0
ret_date 0
tradeDate 0
ret 0
rf 0
exret 0
ym 0
mktcap 0
size 0
rev 7328
beta 25845
bm 16422
illiq 11127
illiq_12m 62624
mom_date 3381
mom 35755
vol 2799
ivol 12482
vol_clip 2799
ivol_clip 12482


In [49]:
df.drop(['mom_date','mktcap','vol_clip','ivol_clip'],axis=1,inplace=True)

In [50]:
df.drop(['ret','rf'],axis=1,inplace=True)

In [51]:
df.reset_index(inplace=True,drop=True)

In [52]:
df

Unnamed: 0,secID,ret_date,tradeDate,exret,ym,size,rev,beta,bm,illiq,illiq_12m,mom,vol,ivol
0,000001.XSHE,2008-01,2007-12-28,-0.140255,2007-12,24.909069,0.066834,0.9468,0.094476,0.000025,,,0.027254,
1,000001.XSHE,2008-02,2008-01-31,-0.007450,2008-01,24.792329,-0.140255,0.9654,0.109513,0.000039,,,0.037722,0.013266
2,000001.XSHE,2008-03,2008-02-29,-0.152068,2008-02,24.787814,-0.007450,1.0292,0.110009,0.000064,,,0.041448,0.009474
3,000001.XSHE,2008-04,2008-03-31,0.047493,2008-03,24.626093,-0.152068,1.0238,0.201102,0.000043,,,0.045109,0.021746
4,000001.XSHE,2008-05,2008-04-30,-0.151164,2008-04,24.675221,0.047493,1.0212,0.206701,0.000051,0.000038,,0.046323,0.014474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461016,900957.XSHG,2021-11,2021-10-29,-0.042588,2021-10,18.576316,-0.042478,,,0.058457,0.067646,0.285164,0.011663,0.007700
461017,900957.XSHG,2021-12,2021-11-30,0.033805,2021-11,18.534509,-0.042588,,,0.070056,0.062884,0.216730,0.009639,0.007046
461018,900957.XSHG,2022-01,2021-12-31,-0.024027,2021-12,18.569997,0.033805,,,0.078037,0.059672,0.211045,0.010961,0.008692
461019,900957.XSHG,2022-02,2022-01-28,-0.013175,2022-01,18.547560,-0.024027,,,0.044515,0.058502,-0.059172,0.010559,0.008409


- reversal 的 NA 是由于在对应的return date，上个月停牌所以没有上个月的return。
- beta, bm 是优矿的NA。可以用当月的横截面上的中值填充
- illiq, ivol, vol 也可用当月的横截面上的中值填充.

In [53]:
for col in df.columns:
    print(col, df[col].isna().sum())

secID 0
ret_date 0
tradeDate 0
exret 0
ym 0
size 0
rev 7328
beta 25845
bm 16422
illiq 11127
illiq_12m 62624
mom 35755
vol 2799
ivol 12482


In [54]:
# Reversal 的空值丢掉，其他的用 median 填充
df = df[~df['rev'].isna()].copy()

In [55]:
cols = ['mom','beta','bm','illiq','illiq_12m','vol','ivol']

In [56]:
df

Unnamed: 0,secID,ret_date,tradeDate,exret,ym,size,rev,beta,bm,illiq,illiq_12m,mom,vol,ivol
0,000001.XSHE,2008-01,2007-12-28,-0.140255,2007-12,24.909069,0.066834,0.9468,0.094476,0.000025,,,0.027254,
1,000001.XSHE,2008-02,2008-01-31,-0.007450,2008-01,24.792329,-0.140255,0.9654,0.109513,0.000039,,,0.037722,0.013266
2,000001.XSHE,2008-03,2008-02-29,-0.152068,2008-02,24.787814,-0.007450,1.0292,0.110009,0.000064,,,0.041448,0.009474
3,000001.XSHE,2008-04,2008-03-31,0.047493,2008-03,24.626093,-0.152068,1.0238,0.201102,0.000043,,,0.045109,0.021746
4,000001.XSHE,2008-05,2008-04-30,-0.151164,2008-04,24.675221,0.047493,1.0212,0.206701,0.000051,0.000038,,0.046323,0.014474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461016,900957.XSHG,2021-11,2021-10-29,-0.042588,2021-10,18.576316,-0.042478,,,0.058457,0.067646,0.285164,0.011663,0.007700
461017,900957.XSHG,2021-12,2021-11-30,0.033805,2021-11,18.534509,-0.042588,,,0.070056,0.062884,0.216730,0.009639,0.007046
461018,900957.XSHG,2022-01,2021-12-31,-0.024027,2021-12,18.569997,0.033805,,,0.078037,0.059672,0.211045,0.010961,0.008692
461019,900957.XSHG,2022-02,2022-01-28,-0.013175,2022-01,18.547560,-0.024027,,,0.044515,0.058502,-0.059172,0.010559,0.008409


In [57]:
temp = df.groupby('ret_date',as_index=False)[cols].transform(lambda x: x.fillna(x.median()))

In [58]:
temp.fillna(0, inplace=True)

In [59]:
df[cols] = temp.copy()

In [60]:
for col in df.columns:
    print(col, df[col].isna().sum())

secID 0
ret_date 0
tradeDate 0
exret 0
ym 0
size 0
rev 0
beta 0
bm 0
illiq 0
illiq_12m 0
mom 0
vol 0
ivol 0


In [61]:
df

Unnamed: 0,secID,ret_date,tradeDate,exret,ym,size,rev,beta,bm,illiq,illiq_12m,mom,vol,ivol
0,000001.XSHE,2008-01,2007-12-28,-0.140255,2007-12,24.909069,0.066834,0.94680,0.094476,0.000025,0.000536,0.777814,0.027254,0.000000
1,000001.XSHE,2008-02,2008-01-31,-0.007450,2008-01,24.792329,-0.140255,0.96540,0.109513,0.000039,0.000524,1.119102,0.037722,0.013266
2,000001.XSHE,2008-03,2008-02-29,-0.152068,2008-02,24.787814,-0.007450,1.02920,0.110009,0.000064,0.000527,0.656120,0.041448,0.009474
3,000001.XSHE,2008-04,2008-03-31,0.047493,2008-03,24.626093,-0.152068,1.02380,0.201102,0.000043,0.000565,0.545260,0.045109,0.021746
4,000001.XSHE,2008-05,2008-04-30,-0.151164,2008-04,24.675221,0.047493,1.02120,0.206701,0.000051,0.000038,-0.055889,0.046323,0.014474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461016,900957.XSHG,2021-11,2021-10-29,-0.042588,2021-10,18.576316,-0.042478,0.47010,0.375432,0.058457,0.067646,0.285164,0.011663,0.007700
461017,900957.XSHG,2021-12,2021-11-30,0.033805,2021-11,18.534509,-0.042588,0.46980,0.332403,0.070056,0.062884,0.216730,0.009639,0.007046
461018,900957.XSHG,2022-01,2021-12-31,-0.024027,2021-12,18.569997,0.033805,0.46910,0.324354,0.078037,0.059672,0.211045,0.010961,0.008692
461019,900957.XSHG,2022-02,2022-01-28,-0.013175,2022-01,18.547560,-0.024027,0.55830,0.356716,0.044515,0.058502,-0.059172,0.010559,0.008409


## Use rank instead of numerical values

$$c_{i,t} = \frac{2}{N+1}CSrank(c^r_{i,t}) - 1$$

$c^r_{i,t}$ is the original value, $CSrank$ ranks the value with other firms in the same month t

In [62]:
def csrank(df):
    return df.rank() * 2 / (len(df) + 1) - 1

In [63]:
num_X_cols = df.select_dtypes('number').columns.drop('exret').tolist()

In [64]:
num_X_cols

['size', 'rev', 'beta', 'bm', 'illiq', 'illiq_12m', 'mom', 'vol', 'ivol']

In [65]:
temp = df[['ret_date']+num_X_cols].groupby('ret_date').apply(csrank)

In [66]:
temp

Unnamed: 0,ret_date,size,rev,beta,bm,illiq,illiq_12m,mom,vol,ivol
0,0.0,0.970696,-0.853480,0.123810,-0.696703,-0.979487,0.000000,0.000000,0.321612,0.000000
1,0.0,0.972444,-0.412618,-0.219724,-0.641769,-0.960841,0.000000,0.000000,-0.601160,-0.718637
2,0.0,0.968481,-0.664756,0.375358,-0.588825,-0.958453,0.000000,0.000000,0.415473,-0.616046
3,0.0,0.969936,0.483178,-0.079456,-0.176807,-0.975662,0.000000,0.000000,0.218325,0.400143
4,0.0,0.965567,0.522238,-0.301291,-0.318508,-0.971306,-0.977044,0.000000,-0.707317,-0.519369
...,...,...,...,...,...,...,...,...,...,...
461016,0.0,-0.990901,-0.011374,-0.000227,0.000000,0.989536,0.989991,0.556415,-0.897179,-0.702457
461017,0.0,-0.990967,-0.783198,0.000226,0.000000,0.988708,0.989612,0.585818,-0.900181,-0.775068
461018,0.0,-0.991033,0.121722,0.000000,0.000000,0.993275,0.989240,0.392961,-0.862811,-0.724277
461019,0.0,-0.991125,0.656091,0.000000,0.000000,0.988462,0.989350,-0.645440,-0.958731,-0.672953


In [67]:
df_rank = pd.merge(df.drop(num_X_cols, axis=1),
                   temp.drop('ret_date',axis=1),
                   left_index=True, right_index=True)

In [68]:
del temp

In [69]:
df_rank

Unnamed: 0,secID,ret_date,tradeDate,exret,ym,size,rev,beta,bm,illiq,illiq_12m,mom,vol,ivol
0,000001.XSHE,2008-01,2007-12-28,-0.140255,2007-12,0.970696,-0.853480,0.123810,-0.696703,-0.979487,0.000000,0.000000,0.321612,0.000000
1,000001.XSHE,2008-02,2008-01-31,-0.007450,2008-01,0.972444,-0.412618,-0.219724,-0.641769,-0.960841,0.000000,0.000000,-0.601160,-0.718637
2,000001.XSHE,2008-03,2008-02-29,-0.152068,2008-02,0.968481,-0.664756,0.375358,-0.588825,-0.958453,0.000000,0.000000,0.415473,-0.616046
3,000001.XSHE,2008-04,2008-03-31,0.047493,2008-03,0.969936,0.483178,-0.079456,-0.176807,-0.975662,0.000000,0.000000,0.218325,0.400143
4,000001.XSHE,2008-05,2008-04-30,-0.151164,2008-04,0.965567,0.522238,-0.301291,-0.318508,-0.971306,-0.977044,0.000000,-0.707317,-0.519369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461016,900957.XSHG,2021-11,2021-10-29,-0.042588,2021-10,-0.990901,-0.011374,-0.000227,0.000000,0.989536,0.989991,0.556415,-0.897179,-0.702457
461017,900957.XSHG,2021-12,2021-11-30,0.033805,2021-11,-0.990967,-0.783198,0.000226,0.000000,0.988708,0.989612,0.585818,-0.900181,-0.775068
461018,900957.XSHG,2022-01,2021-12-31,-0.024027,2021-12,-0.991033,0.121722,0.000000,0.000000,0.993275,0.989240,0.392961,-0.862811,-0.724277
461019,900957.XSHG,2022-02,2022-01-28,-0.013175,2022-01,-0.991125,0.656091,0.000000,0.000000,0.988462,0.989350,-0.645440,-0.958731,-0.672953


# Train, Validation, Test split

In [70]:
df_rank['year'] = df_rank['ret_date'].dt.year

In [74]:
time_idx = [value for (key, value) in sorted(df_rank.groupby('year').groups.items())]

In [75]:
time_idx

[Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                  8,      9,
             ...
             460852, 460853, 460854, 460855, 460856, 460857, 460858, 460859,
             460860, 460861],
            dtype='int64', length=17347),
 Int64Index([    12,     13,     14,     15,     16,     17,     18,     19,
                 20,     21,
             ...
             460864, 460865, 460866, 460867, 460868, 460869, 460870, 460871,
             460872, 460873],
            dtype='int64', length=18345),
 Int64Index([    24,     25,     26,     27,     28,     29,     31,     32,
                192,    193,
             ...
             460876, 460877, 460878, 460879, 460880, 460881, 460882, 460883,
             460884, 460885],
            dtype='int64', length=20770),
 Int64Index([    33,     34,     35,     36,     37,     38,     39,     40,
                 41,     42,
             ...
             460888, 460889, 460890, 460891, 460892, 460893, 

In [76]:
df_rank.groupby('year')['secID'].nunique()

year
2008    1559
2009    1627
2010    1934
2011    2231
2012    2477
2013    2530
2014    2649
2015    2863
2016    3028
2017    3471
2018    3605
2019    3739
2020    4045
2021    4509
2022    4538
Name: secID, dtype: int64

In [77]:
df_rank.groupby('year')['secID'].count()

year
2008    17347
2009    18345
2010    20770
2011    24588
2012    27649
2013    28885
2014    28408
2015    28331
2016    31459
2017    36050
2018    40026
2019    43017
2020    45124
2021    50192
2022    13502
Name: secID, dtype: int64

In [79]:
def list_flat(list_):
    return [item for sublist in list_ for item in sublist]
# This is the same as:
# def list_flat2(list_):
#     result = []
#     for sublist in list_:
#         for item in sublist:
#             result.append(item)
#     return result

In [80]:
list_flat([[1,2,3],[3,4,5]])

[1, 2, 3, 3, 4, 5]

In [81]:
df_rank

Unnamed: 0,secID,ret_date,tradeDate,exret,ym,size,rev,beta,bm,illiq,illiq_12m,mom,vol,ivol,year
0,000001.XSHE,2008-01,2007-12-28,-0.140255,2007-12,0.970696,-0.853480,0.123810,-0.696703,-0.979487,0.000000,0.000000,0.321612,0.000000,2008
1,000001.XSHE,2008-02,2008-01-31,-0.007450,2008-01,0.972444,-0.412618,-0.219724,-0.641769,-0.960841,0.000000,0.000000,-0.601160,-0.718637,2008
2,000001.XSHE,2008-03,2008-02-29,-0.152068,2008-02,0.968481,-0.664756,0.375358,-0.588825,-0.958453,0.000000,0.000000,0.415473,-0.616046,2008
3,000001.XSHE,2008-04,2008-03-31,0.047493,2008-03,0.969936,0.483178,-0.079456,-0.176807,-0.975662,0.000000,0.000000,0.218325,0.400143,2008
4,000001.XSHE,2008-05,2008-04-30,-0.151164,2008-04,0.965567,0.522238,-0.301291,-0.318508,-0.971306,-0.977044,0.000000,-0.707317,-0.519369,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461016,900957.XSHG,2021-11,2021-10-29,-0.042588,2021-10,-0.990901,-0.011374,-0.000227,0.000000,0.989536,0.989991,0.556415,-0.897179,-0.702457,2021
461017,900957.XSHG,2021-12,2021-11-30,0.033805,2021-11,-0.990967,-0.783198,0.000226,0.000000,0.988708,0.989612,0.585818,-0.900181,-0.775068,2021
461018,900957.XSHG,2022-01,2021-12-31,-0.024027,2021-12,-0.991033,0.121722,0.000000,0.000000,0.993275,0.989240,0.392961,-0.862811,-0.724277,2022
461019,900957.XSHG,2022-02,2022-01-28,-0.013175,2022-01,-0.991125,0.656091,0.000000,0.000000,0.988462,0.989350,-0.645440,-0.958731,-0.672953,2022


In [45]:
# training, validation, testing scheme:
# 1. [2008-2011], [2012-2015], [2016]
# 2. [2008-2012], [2013-2016], [2017]
# ...
# last. [2008-2016], [2017-2020], [2021]
fulltrain_idx = []
cv_idx = []
test_idx = []
for i in range(4,len(time_idx)-4):
    train_idx = list_flat(time_idx[0:i])
    val_idx = list_flat(time_idx[i:i+4])
    fulltrain_idx.append(train_idx + val_idx)
    cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0], 
                   np.where(np.isin(fulltrain_idx[-1], val_idx))[0])) # GridSearchCV 内部用 array 操作，不能带着pandas的index，
                                                                          # 因此cv_idx需要用fulltrain_idx的编号从0开始
    test_idx.append(time_idx[i+4])

In [46]:
# Example
a = [0,1,4,5,3000]
np.where(np.isin(a, [0,3000]))[0]

array([0, 4])

In [47]:
test_years = list(range(2016, 2022))
test_years

[2016, 2017, 2018, 2019, 2020, 2021]

# Evaluation metrics

In [48]:
def r2_oos(y_true, y_pred):
    return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)

In [49]:
r2_oos_scorer = make_scorer(r2_oos)

# Models

## Linear regression

In [50]:
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']

In [51]:
cols

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']

In [52]:
model = LinearRegression()

In [54]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.0117860555614957
Test year 2017 : -0.08186440331616396
Test year 2018 : -0.045722182188346894
Test year 2019 : 0.0053533305241438844
Test year 2020 : -0.0006830141045139904
Test year 2021 : -0.01959896422561891


In [55]:
cols = ['size','rev','illiq','ivol']

In [56]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.011951287753103168
Test year 2017 : -0.08295671126910364
Test year 2018 : -0.045758754149365366
Test year 2019 : 0.006047261470831122
Test year 2020 : -0.0011655147730460502
Test year 2021 : -0.020928820173429674


## Huber regressor

In [57]:
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']

In [58]:
model = HuberRegressor(alpha=0.01,epsilon=1.05)

In [59]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : 0.0014038503898485821
Test year 2017 : -0.04070584873067329
Test year 2018 : 0.0016444781377967788
Test year 2019 : -0.016957932821754618
Test year 2020 : -0.013372340329569798
Test year 2021 : 0.008736696659550569


## Random Forest

In [60]:
cols = num_X_cols
cols

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']

In [66]:
hyperparam_grid = [
    {'n_estimators': [100], 'max_depth': [1,3,7], 
     'max_features': [3,5,len(cols)]}
]

In [67]:
model = RandomForestRegressor(random_state=42)

In [68]:
# Cross validation for period 0, i.e.
# train: [2008-2011], val: [2012-2015], test: [2016]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)

In [69]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0], cols]
y_test = df_rank.loc[test_idx[0], 'exret']

In [59]:
%%time
grid_search.fit(X_fulltrain, y_fulltrain)

CPU times: user 1min 48s, sys: 385 ms, total: 1min 49s
Wall time: 1min 49s


GridSearchCV(cv=[(array([    0,     1,     2, ..., 86282, 86283, 86284]),
                  array([ 86285,  86286,  86287, ..., 203045, 203046, 203047]))],
             estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_depth': [1, 3, 7], 'max_features': [3, 5, 9],
                          'n_estimators': [100]}],
             return_train_score=True, scoring=make_scorer(r2_oos))

In [60]:
grid_search.best_params_

{'max_depth': 3, 'max_features': 3, 'n_estimators': 100}

In [61]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)

0.05127266281889475 {'max_depth': 1, 'max_features': 3, 'n_estimators': 100}
0.0534381698663029 {'max_depth': 1, 'max_features': 5, 'n_estimators': 100}
0.05949245846715574 {'max_depth': 1, 'max_features': 9, 'n_estimators': 100}
0.06188021686062709 {'max_depth': 3, 'max_features': 3, 'n_estimators': 100}
0.05804954897281175 {'max_depth': 3, 'max_features': 5, 'n_estimators': 100}
0.06081729711883632 {'max_depth': 3, 'max_features': 9, 'n_estimators': 100}
0.05614456342229886 {'max_depth': 7, 'max_features': 3, 'n_estimators': 100}
0.043036296006267676 {'max_depth': 7, 'max_features': 5, 'n_estimators': 100}
0.04356663192832121 {'max_depth': 7, 'max_features': 9, 'n_estimators': 100}


In [62]:
pd.DataFrame({"features":num_X_cols,"feature_importance":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',
                                                                                                                        ascending=False)

Unnamed: 0,features,feature_importance
5,illiq,0.302548
0,size,0.211411
1,rev,0.198549
6,illiq_12m,0.113615
8,ivol,0.092814
7,vol,0.052963
2,mom,0.010744
4,bm,0.010535
3,beta,0.00682


In [63]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.012832337545194417

In [64]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.012832337545194417
Test year 2017 : -0.0820561067657255
Test year 2018 : -0.04409182586886584
Test year 2019 : 0.007830509088117443
Test year 2020 : 0.003591662917594607
Test year 2021 : -0.015391690896670474
CPU times: user 22min 37s, sys: 4.99 s, total: 22min 42s
Wall time: 22min 50s


## Partial Least Squares

In [61]:
cols = num_X_cols
cols

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']

In [62]:
model = PLSRegression(n_components=4)

In [64]:
y_pred.reshape(-1).shape

(8378,)

In [76]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X_fulltrain, y_fulltrain)
    y_pred = model.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.01123119284077867
Test year 2017 : -0.08973582085167675
Test year 2018 : -0.0454224712689455
Test year 2019 : 0.00484391775863835
Test year 2020 : -0.0007496252971606054
Test year 2021 : -0.02132040728407225
CPU times: user 7.04 s, sys: 224 ms, total: 7.27 s
Wall time: 2.41 s


## Principal Component Regression

### PCA transform

In [65]:
cols = num_X_cols
cols

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']

In [66]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']

In [67]:
pca = PCA(3, random_state=42)

In [68]:
pca.fit(X_fulltrain)

PCA(n_components=3, random_state=42)

In [69]:
pca.components_

array([[ 0.54714595, -0.01385498,  0.04325358,  0.17598213,  0.04193536,
        -0.57860615, -0.57126752,  0.06380787,  0.02524075],
       [-0.06032312,  0.30002583,  0.26579743, -0.01213907, -0.39538595,
        -0.04921141,  0.06139761,  0.56294733,  0.59675692],
       [ 0.12342608, -0.20637066,  0.50242834, -0.61520441, -0.44211959,
        -0.07749277, -0.02191892, -0.30373145, -0.13060767]])

In [70]:
pca.components_.shape

(3, 9)

In [71]:
X_fulltrain.shape

(203048, 9)

In [84]:
pca.components_.T.shape

(9, 3)

In [72]:
np.matmul(X_fulltrain.values,pca.components_.T)

array([[ 1.5880652 , -0.1593774 ,  0.92155463],
       [ 1.74711817,  0.03463427,  0.4903537 ],
       [ 1.72960462,  0.78025716,  0.22764192],
       ...,
       [-1.74314491, -0.50257968, -0.34461461],
       [-1.66920021,  0.2734256 , -0.03379416],
       [-1.65589407,  0.43992714, -0.01309807]])

In [73]:
pca.fit_transform(X_fulltrain)

array([[ 1.5880652 , -0.1593774 ,  0.92155463],
       [ 1.74711817,  0.03463427,  0.4903537 ],
       [ 1.72960462,  0.78025716,  0.22764192],
       ...,
       [-1.74314491, -0.50257968, -0.34461461],
       [-1.66920021,  0.2734256 , -0.03379416],
       [-1.65589407,  0.43992714, -0.01309807]])

### PCA regression

sklearn 是 duck typing，因此无需继承，只需在定义类的时候包括对应的方法，`fit()`(return self)，`transform()`，`fit_transform()`即可。

但直接用继承，可以更方便。
- `BaseEstimator`是sklearn里最基本的类，其他的类都从这个类继承而来，包括了`set_params()`和`get_params()`的方法。
- `TransformerMixin`包括了`fit_transform()`方法。因此由这个类继承而来的话，就不用自定义 `fit_transform` 了
- 类似的，`RegressorMixin`包括了`predict()`方法

In [74]:
class PCARegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_components=3):
        self.n_components = n_components
    
    def fit(self, X, y):
        self.pca_ = PCA(n_components=self.n_components).fit(X)
        self.X_ = self.pca_.transform(X)
        self.reg_ = LinearRegression().fit(self.X_,y)
        return self
    
    def predict(self, X):
        self.pred_ = self.reg_.predict(self.pca_.transform(X))
        return self.pred_

In [75]:
model = PCARegressor()

In [76]:
model.fit(X=X_fulltrain, y=y_fulltrain)

PCARegressor()

In [77]:
model.X_

array([[ 1.5880652 , -0.1593774 ,  0.92155463],
       [ 1.74711817,  0.03463427,  0.4903537 ],
       [ 1.72960462,  0.78025716,  0.22764192],
       ...,
       [-1.74314491, -0.50257968, -0.34461461],
       [-1.66920021,  0.2734256 , -0.03379416],
       [-1.65589407,  0.43992714, -0.01309807]])

In [78]:
hyperparam_grid = [
    {'n_components': range(1, len(cols)+1)}
]

In [79]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)

In [80]:
grid_search.fit(X=X_fulltrain, y=y_fulltrain)

GridSearchCV(cv=[(array([    0,     1,     2, ..., 86282, 86283, 86284]),
                  array([ 86285,  86286,  86287, ..., 203045, 203046, 203047]))],
             estimator=PCARegressor(),
             param_grid=[{'n_components': range(1, 10)}],
             return_train_score=True, scoring=make_scorer(r2_oos))

In [81]:
grid_search.best_params_

{'n_components': 9}

In [82]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)

0.06047645010846184 {'n_components': 1}
0.05648442569524177 {'n_components': 2}
0.05505885643926543 {'n_components': 3}
0.06827088360625753 {'n_components': 4}
0.07112802051912726 {'n_components': 5}
0.07107450847527214 {'n_components': 6}
0.06906175137901698 {'n_components': 7}
0.07715717389016781 {'n_components': 8}
0.07815893155696621 {'n_components': 9}


In [83]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test,y_pred=y_pred)

-0.01117901387512199

In [84]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.01117901387512199
Test year 2017 : -0.08825633439790259
Test year 2018 : -0.04484009526192567
Test year 2019 : 0.005339549758387907
Test year 2020 : -0.00026464526848823944
Test year 2021 : -0.021433913684697492
CPU times: user 53.8 s, sys: 4.24 s, total: 58.1 s
Wall time: 15.5 s


## Pipeline

In [85]:
pca = PCA()
linear_reg = LinearRegression()
pipeline = Pipeline(steps=[('pca',pca),
                           ('linear_regression', linear_reg)])
hyperparam_grid = {'pca__n_components': range(1,len(cols)+1)}
grid_search = GridSearchCV(pipeline, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)

In [86]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']

In [87]:
%%time
grid_search.fit(X=X_fulltrain,y=y_fulltrain)

CPU times: user 4 s, sys: 345 ms, total: 4.34 s
Wall time: 1.13 s


GridSearchCV(cv=[(array([    0,     1,     2, ..., 86282, 86283, 86284]),
                  array([ 86285,  86286,  86287, ..., 203045, 203046, 203047]))],
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('linear_regression',
                                        LinearRegression())]),
             param_grid={'pca__n_components': range(1, 10)},
             return_train_score=True, scoring=make_scorer(r2_oos))

In [88]:
grid_search.best_params_

{'pca__n_components': 9}

In [89]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)

0.06047645010846184 {'pca__n_components': 1}
0.05648442569524177 {'pca__n_components': 2}
0.05505885643926543 {'pca__n_components': 3}
0.06827088360625753 {'pca__n_components': 4}
0.07112802051912726 {'pca__n_components': 5}
0.07107450847527291 {'pca__n_components': 6}
0.06906175137901698 {'pca__n_components': 7}
0.07715717389016709 {'pca__n_components': 8}
0.07815893155696621 {'pca__n_components': 9}


In [90]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.01117901387512199

## Elastic Net

In [98]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']

In [99]:
model = SGDRegressor(penalty='elasticnet')

In [100]:
hyperparam_grid = [{'alpha':[0.0001, 0.001, 0.01, 0.1],
                    'l1_ratio':[0.15, 0.30, 0.5]}]

In [101]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)

In [102]:
grid_search.fit(X=X_fulltrain, y=y_fulltrain)

GridSearchCV(cv=[(array([    0,     1,     2, ..., 86282, 86283, 86284]),
                  array([ 86285,  86286,  86287, ..., 203045, 203046, 203047]))],
             estimator=SGDRegressor(penalty='elasticnet'),
             param_grid=[{'alpha': [0.0001, 0.001, 0.01, 0.1],
                          'l1_ratio': [0.15, 0.3, 0.5]}],
             return_train_score=True, scoring=make_scorer(r2_oos))

In [103]:
grid_search.best_params_

{'alpha': 0.001, 'l1_ratio': 0.15}

In [104]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.010007949151975781

In [105]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.01183600331821233
Test year 2017 : -0.0754301490157141
Test year 2018 : -0.045485803118642476
Test year 2019 : 0.003638036962217317
Test year 2020 : 0.006039585416782511
Test year 2021 : -0.013272185902106548
CPU times: user 1min 3s, sys: 1.49 s, total: 1min 4s
Wall time: 22.7 s


## Gradient Boosted Regression Trees

In [113]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']

In [105]:
hyperparam_grid = [
    {'max_depth': [1,2,3,4,5,6], 
     'learning_rate': [0.1, 0.05, 0.01]}
]

In [106]:
model = GradientBoostingRegressor()

In [107]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)

In [108]:
%%time
grid_search.fit(X=X_fulltrain, y=y_fulltrain)

CPU times: user 8min 28s, sys: 1.86 s, total: 8min 30s
Wall time: 8min 33s


GridSearchCV(cv=[(array([    0,     1,     2, ..., 86282, 86283, 86284]),
                  array([ 86285,  86286,  86287, ..., 203045, 203046, 203047]))],
             estimator=GradientBoostingRegressor(),
             param_grid=[{'learning_rate': [0.1, 0.05, 0.01],
                          'max_depth': [1, 2, 3, 4, 5, 6]}],
             return_train_score=True, scoring=make_scorer(r2_oos))

In [109]:
grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 1}

In [110]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(mean_score, params)

0.06543277039809901 {'learning_rate': 0.1, 'max_depth': 1}
0.02000841809692018 {'learning_rate': 0.1, 'max_depth': 2}
nan {'learning_rate': 0.1, 'max_depth': 3}
nan {'learning_rate': 0.1, 'max_depth': 4}
nan {'learning_rate': 0.1, 'max_depth': 5}
nan {'learning_rate': 0.1, 'max_depth': 6}
0.06399724146424032 {'learning_rate': 0.05, 'max_depth': 1}
0.05965646878537504 {'learning_rate': 0.05, 'max_depth': 2}
0.04302709526703145 {'learning_rate': 0.05, 'max_depth': 3}
nan {'learning_rate': 0.05, 'max_depth': 4}
nan {'learning_rate': 0.05, 'max_depth': 5}
nan {'learning_rate': 0.05, 'max_depth': 6}
0.05053331785726358 {'learning_rate': 0.01, 'max_depth': 1}
0.05778650929940647 {'learning_rate': 0.01, 'max_depth': 2}
0.05718740382416022 {'learning_rate': 0.01, 'max_depth': 3}
0.057024598216398666 {'learning_rate': 0.01, 'max_depth': 4}
0.05472527792527635 {'learning_rate': 0.01, 'max_depth': 5}
0.05257540418160582 {'learning_rate': 0.01, 'max_depth': 6}


  print(np.sqrt(mean_score), params)


In [111]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.01074611143132742

## Neural Nets

In [106]:
tf.__version__

'2.4.1'

In [107]:
keras.__version__

'2.4.0'

In [108]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_train = X_fulltrain.values[cv_idx[0][0]]
y_train = y_fulltrain.values[cv_idx[0][0]]
X_val = X_fulltrain.values[cv_idx[0][1]]
y_val = y_fulltrain.values[cv_idx[0][1]]
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']

In [109]:
X_train.shape

(86285, 9)

In [110]:
X_val.shape

(116763, 9)

In [114]:
nn_model = keras.models.Sequential()
nn_model.add(keras.layers.InputLayer(input_shape=[X_fulltrain.shape[1]]))
nn_model.add(keras.layers.Dense(32, activation='relu'))
nn_model.add(keras.layers.Dense(16, activation='relu'))
nn_model.add(keras.layers.Dense(1))

In [115]:
nn_model.compile(loss='mse',optimizer='sgd')

In [116]:
nn_model.fit(X_train, y_train, epochs=10,
             validation_data=(X_val,y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f800887e640>

In [117]:
y_pred = nn_model.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.010190283642224518

### GridSeachCV Neural Nets

In [119]:
def build_model(learning_rate=0.003):
    nn_model = keras.models.Sequential()
    nn_model.add(keras.layers.InputLayer(input_shape=[9]))
    nn_model.add(keras.layers.Dense(32, activation='relu'))
    nn_model.add(keras.layers.Dense(16, activation='relu'))
    nn_model.add(keras.layers.Dense(1))
    optimizer = keras.optimizers.SGD(lr=learning_rate) 
    nn_model.compile(loss="mse", optimizer=optimizer)
    return nn_model

In [120]:
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)

In [121]:
hyperparams_grid = {
    'learning_rate':[0.003,0.001]
}

In [122]:
nn_search_cv = GridSearchCV(keras_reg, hyperparams_grid, cv=[cv_idx[0]])

In [123]:
nn_search_cv.fit(X_fulltrain, y_fulltrain, epochs=10,
                 validation_data=(X_val,y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [124]:
y_pred = nn_search_cv.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

# Transformation pipeline example

In [125]:
df_rank

Unnamed: 0,secID,ret_date,exret,ym,size,rev,mom,beta,bm,illiq,illiq_12m,vol,ivol,year
0,000001.XSHE,2008-02,-0.007450,2008-01,0.974984,-0.415405,0.242923,-0.133641,-0.608953,-0.964450,-0.980250,-0.479921,0.000000,2008
1,000001.XSHE,2008-03,-0.152068,2008-02,0.971391,-0.657997,0.462939,0.431730,-0.560468,-0.962289,-0.980494,0.465540,-0.499350,2008
2,000001.XSHE,2008-04,0.047493,2008-03,0.972745,0.484750,0.587281,0.161583,-0.162881,-0.977936,-0.979234,0.280986,0.550941,2008
3,000001.XSHE,2008-05,-0.151164,2008-04,0.968709,0.543677,0.601043,-0.211213,-0.294654,-0.973924,-0.979140,-0.580183,-0.830508,2008
4,000001.XSHE,2008-06,-0.236961,2008-05,0.967617,-0.800518,0.669689,0.246114,-0.137306,-0.950777,-0.979275,-0.155440,-0.604922,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419502,900957.XSHG,2020-10,0.003573,2020-09,-0.990615,0.225488,-0.664609,0.000000,0.000000,0.986663,0.987157,-0.045690,0.094097,2020
419503,900957.XSHG,2020-11,0.011202,2020-10,-0.989315,0.238951,-0.655658,0.000243,0.000000,0.986887,0.987373,-0.600291,-0.668286,2020
419504,900957.XSHG,2020-12,-0.038373,2020-11,-0.989375,-0.100700,-0.490461,0.000000,0.000000,0.989375,0.987443,-0.325767,-0.320454,2020
419505,900957.XSHG,2021-01,0.330973,2020-12,-0.989914,0.217099,-0.491354,0.000000,0.000000,0.989433,0.987512,-0.973103,-0.847743,2021


In [126]:
X_fulltrain.columns.tolist().index('illiq')

5

In [127]:
X_fulltrain.columns.tolist().index('illiq_12m')

6

In [128]:
illiq_idx = 5
illiq_12m_idx = 6

In [129]:
class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_avg_illiq=True):
        self.add_avg_illiq = add_avg_illiq
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        avg_illiq = (X[:,illiq_idx] + X[:, illiq_12m_idx]) / 2
        return np.c_[X, avg_illiq]

feature_adder = FeatureAdder()

In [130]:
X_fulltrain.values.shape

(203048, 9)

In [131]:
X_fulltrain_new = feature_adder.transform(X_fulltrain.values)

In [132]:
X_fulltrain_new

array([[ 0.97498354, -0.41540487,  0.24292298, ..., -0.479921  ,
         0.        , -0.97235023],
       [ 0.97139142, -0.6579974 ,  0.46293888, ...,  0.46553966,
        -0.4993498 , -0.97139142],
       [ 0.97274497,  0.48475016,  0.58728099, ...,  0.28098637,
         0.55094095, -0.97858533],
       ...,
       [-0.98812049,  0.42299533, -0.4900297 , ..., -0.46457361,
        -0.51887993,  0.98515062],
       [-0.98844884,  0.3019802 ,  0.42079208, ..., -0.20379538,
         0.19059406,  0.98638614],
       [-0.98793727,  0.68234821,  0.61640531, ..., -0.20305589,
         0.19018898,  0.977885  ]])

In [133]:
X_fulltrain_new.shape

(203048, 10)

In [134]:
# This can be added to a pipeline
pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('std_scaler', StandardScaler())
])

In [135]:
pipeline.fit_transform(X_fulltrain.values)

array([[ 1.68951131e+00, -7.19839143e-01,  4.22125848e-01, ...,
        -8.31636558e-01, -1.94790357e-18, -1.76971464e+00],
       [ 1.68328665e+00, -1.14021841e+00,  8.04446213e-01, ...,
         8.06715692e-01, -8.68622094e-01, -1.76796957e+00],
       [ 1.68563218e+00,  8.40004933e-01,  1.02051477e+00, ...,
         4.86910428e-01,  9.58365208e-01, -1.78106277e+00],
       ...,
       [-1.71227582e+00,  7.32992362e-01, -8.51521769e-01, ...,
        -8.05041659e-01, -9.02594872e-01,  1.79301184e+00],
       [-1.71284481e+00,  5.23289884e-01,  7.31207958e-01, ...,
        -3.53149139e-01,  3.31539553e-01,  1.79526054e+00],
       [-1.71195833e+00,  1.18241500e+00,  1.07112393e+00, ...,
        -3.51867707e-01,  3.30834919e-01,  1.77978814e+00]])