In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, HuberRegressor, SGDRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline

# import lightgbm as lgb

import tensorflow as tf
from tensorflow import keras
  • Data
    • NA 值处理
    • Use rank instead of numerical values
  • Train, Validation, Test split
  • Evaluation metrics
  • Models
    • Linear regression
    • Huber regressor
    • Random Forest
    • Partial Least Squares
    • Principal Component Regression
      • PCA transform
      • PCA regression
    • Pipeline
    • Elastic Net
    • Gradient Boosted Regression Trees
    • Neural Nets
      • GridSeachCV Neural Nets
  • Transformation pipeline example

Data¶

In [2]:
df = pd.read_pickle('../../data/factor_exposure/all_exposure_2023.pkl')
In [3]:
df
Out[3]:
secID ret_date tradeDate ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
0 000001.XSHE 2007-07 2007-06-29 0.316497 0.002481 0.314016 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614 0.123739 NaN NaN NaN NaN NaN NaN
1 000001.XSHE 2007-08 2007-07-31 0.048855 0.002404 0.046451 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN
2 000001.XSHE 2007-09 2007-08-31 0.052105 0.002621 0.049484 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN
3 000001.XSHE 2007-10 2007-09-28 0.201851 0.003095 0.198756 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN
4 000001.XSHE 2007-11 2007-10-31 -0.249116 0.003780 -0.252896 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
547253 689009.XSHG 2022-12 2022-11-30 -0.086579 0.001955 -0.088534 2022-11 1.708055e+10 23.561206 0.041529 2022-10 -0.474030 0.7363 0.201033 0.000122 0.000202 0.017044 0.010108 0.017044 0.010108
547254 689009.XSHG 2023-01 2022-12-30 0.088554 0.001856 0.086698 2022-12 1.560173e+10 23.470648 -0.088534 2022-11 -0.523619 0.6919 0.220085 0.000194 0.000210 0.019017 0.013598 0.019017 0.013598
547255 689009.XSHG 2023-02 2023-01-31 -0.005725 0.001910 -0.007635 2023-01 1.698332e+10 23.555498 0.086698 2022-12 -0.498602 0.7379 0.201772 0.000100 0.000210 0.014183 0.009679 0.014183 0.009679
547256 689009.XSHG 2023-03 2023-02-28 -0.011818 0.001982 -0.013800 2023-02 1.688610e+10 23.549757 -0.007635 2023-01 -0.418230 0.7453 0.202930 0.000091 0.000202 0.024867 0.012591 0.024867 0.012591
547257 689009.XSHG NaT 2023-03-10 NaN NaN NaN 2023-03 1.668654e+10 23.537868 -0.013800 2023-02 -0.246403 NaN 0.193716 NaN 0.000201 NaN NaN NaN NaN

547258 rows × 20 columns

In [4]:
df.drop('tradeDate',axis=1,inplace=True)
In [5]:
df
Out[5]:
secID ret_date ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
0 000001.XSHE 2007-07 0.316497 0.002481 0.314016 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614 0.123739 NaN NaN NaN NaN NaN NaN
1 000001.XSHE 2007-08 0.048855 0.002404 0.046451 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN
2 000001.XSHE 2007-09 0.052105 0.002621 0.049484 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN
3 000001.XSHE 2007-10 0.201851 0.003095 0.198756 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN
4 000001.XSHE 2007-11 -0.249116 0.003780 -0.252896 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
547253 689009.XSHG 2022-12 -0.086579 0.001955 -0.088534 2022-11 1.708055e+10 23.561206 0.041529 2022-10 -0.474030 0.7363 0.201033 0.000122 0.000202 0.017044 0.010108 0.017044 0.010108
547254 689009.XSHG 2023-01 0.088554 0.001856 0.086698 2022-12 1.560173e+10 23.470648 -0.088534 2022-11 -0.523619 0.6919 0.220085 0.000194 0.000210 0.019017 0.013598 0.019017 0.013598
547255 689009.XSHG 2023-02 -0.005725 0.001910 -0.007635 2023-01 1.698332e+10 23.555498 0.086698 2022-12 -0.498602 0.7379 0.201772 0.000100 0.000210 0.014183 0.009679 0.014183 0.009679
547256 689009.XSHG 2023-03 -0.011818 0.001982 -0.013800 2023-02 1.688610e+10 23.549757 -0.007635 2023-01 -0.418230 0.7453 0.202930 0.000091 0.000202 0.024867 0.012591 0.024867 0.012591
547257 689009.XSHG NaT NaN NaN NaN 2023-03 1.668654e+10 23.537868 -0.013800 2023-02 -0.246403 NaN 0.193716 NaN 0.000201 NaN NaN NaN NaN

547258 rows × 19 columns

NA 值处理¶

In [6]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 5068
ret 19730
rf 5068
exret 19730
ym 0
mktcap 14043
size 14043
rev 18541
mom_date 3879
mom 53422
beta 28578
bm 4198
illiq 32683
illiq_12m 96646
vol 21868
ivol 36221
vol_clip 21868
ivol_clip 36221

ret_date 为 NA 的删除,已到最新数据处

In [7]:
df = df[~df['ret_date'].isna()].copy()
In [8]:
df
Out[8]:
secID ret_date ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
0 000001.XSHE 2007-07 0.316497 0.002481 0.314016 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614 0.123739 NaN NaN NaN NaN NaN NaN
1 000001.XSHE 2007-08 0.048855 0.002404 0.046451 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN
2 000001.XSHE 2007-09 0.052105 0.002621 0.049484 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN
3 000001.XSHE 2007-10 0.201851 0.003095 0.198756 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN
4 000001.XSHE 2007-11 -0.249116 0.003780 -0.252896 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
547252 689009.XSHG 2022-11 0.043125 0.001596 0.041529 2022-10 1.637440e+10 23.518985 -0.166109 2022-09 -0.401406 0.7083 0.209701 0.000264 0.000206 0.059961 0.051851 0.059961 0.051851
547253 689009.XSHG 2022-12 -0.086579 0.001955 -0.088534 2022-11 1.708055e+10 23.561206 0.041529 2022-10 -0.474030 0.7363 0.201033 0.000122 0.000202 0.017044 0.010108 0.017044 0.010108
547254 689009.XSHG 2023-01 0.088554 0.001856 0.086698 2022-12 1.560173e+10 23.470648 -0.088534 2022-11 -0.523619 0.6919 0.220085 0.000194 0.000210 0.019017 0.013598 0.019017 0.013598
547255 689009.XSHG 2023-02 -0.005725 0.001910 -0.007635 2023-01 1.698332e+10 23.555498 0.086698 2022-12 -0.498602 0.7379 0.201772 0.000100 0.000210 0.014183 0.009679 0.014183 0.009679
547256 689009.XSHG 2023-03 -0.011818 0.001982 -0.013800 2023-02 1.688610e+10 23.549757 -0.007635 2023-01 -0.418230 0.7453 0.202930 0.000091 0.000202 0.024867 0.012591 0.024867 0.012591

542190 rows × 19 columns

momentum 从 2008-01 开始。简单起见,把所有数据调整为从2008-01开始。

In [9]:
df.loc[~df['mom'].isna(),'ret_date'].min()
Out[9]:
Period('2008-01', 'M')
In [10]:
df = df[df['ret_date'] >= '2008-01'].copy()
In [11]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
ret 14262
rf 0
exret 14262
ym 0
mktcap 13693
size 13693
rev 17990
mom_date 3693
mom 40190
beta 23245
bm 4024
illiq 27119
illiq_12m 84325
vol 16457
ivol 18316
vol_clip 16457
ivol_clip 18316

剩余的NA值有至少三个来源:

  • 由于停牌日期填充造成,
  • 由于计算时要求最低样本数造成,
  • 由优矿直接给出了NA值

return 的 NA 值直接删除

In [12]:
df = df[~df['ret'].isna()].copy()
In [13]:
df
Out[13]:
secID ret_date ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
6 000001.XSHE 2008-01 -0.137306 0.002949 -0.140255 2007-12 6.574629e+10 24.909069 0.066834 2007-11 NaN 0.9468 0.094476 0.000025 NaN 0.026541 NaN 0.026541 NaN
7 000001.XSHE 2008-02 -0.004504 0.002946 -0.007450 2008-01 5.850212e+10 24.792329 -0.140255 2007-12 NaN 0.9654 0.109513 0.000039 NaN 0.037722 0.012909 0.037722 0.012909
8 000001.XSHE 2008-03 -0.149321 0.002746 -0.152068 2008-02 5.823860e+10 24.787814 -0.007450 2008-01 NaN 1.0292 0.110009 0.000064 NaN 0.041448 0.009032 0.041448 0.009032
9 000001.XSHE 2008-04 0.050355 0.002862 0.047493 2008-03 4.954234e+10 24.626093 -0.152068 2008-02 NaN 1.0238 0.201102 0.000043 NaN 0.045109 0.021484 0.045109 0.021484
10 000001.XSHE 2008-05 -0.148211 0.002953 -0.151164 2008-04 5.203702e+10 24.675221 0.047493 2008-03 NaN 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
547252 689009.XSHG 2022-11 0.043125 0.001596 0.041529 2022-10 1.637440e+10 23.518985 -0.166109 2022-09 -0.401406 0.7083 0.209701 0.000264 0.000206 0.059961 0.051851 0.059961 0.051851
547253 689009.XSHG 2022-12 -0.086579 0.001955 -0.088534 2022-11 1.708055e+10 23.561206 0.041529 2022-10 -0.474030 0.7363 0.201033 0.000122 0.000202 0.017044 0.010108 0.017044 0.010108
547254 689009.XSHG 2023-01 0.088554 0.001856 0.086698 2022-12 1.560173e+10 23.470648 -0.088534 2022-11 -0.523619 0.6919 0.220085 0.000194 0.000210 0.019017 0.013598 0.019017 0.013598
547255 689009.XSHG 2023-02 -0.005725 0.001910 -0.007635 2023-01 1.698332e+10 23.555498 0.086698 2022-12 -0.498602 0.7379 0.201772 0.000100 0.000210 0.014183 0.009679 0.014183 0.009679
547256 689009.XSHG 2023-03 -0.011818 0.001982 -0.013800 2023-02 1.688610e+10 23.549757 -0.007635 2023-01 -0.418230 0.7453 0.202930 0.000091 0.000202 0.024867 0.012591 0.024867 0.012591

514959 rows × 19 columns

In [14]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
ret 0
rf 0
exret 0
ym 0
mktcap 0
size 0
rev 4297
mom_date 3693
mom 40162
beta 21372
bm 1004
illiq 13186
illiq_12m 71577
vol 2716
ivol 4413
vol_clip 2716
ivol_clip 4413
In [15]:
df.drop(['mom_date','mktcap','vol_clip','ivol_clip'],axis=1,inplace=True)
In [16]:
df.drop(['ret','rf'],axis=1,inplace=True)
In [17]:
df.reset_index(inplace=True,drop=True)
In [18]:
df
Out[18]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 NaN 0.9468 0.094476 0.000025 NaN 0.026541 NaN
1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 NaN 0.9654 0.109513 0.000039 NaN 0.037722 0.012909
2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 NaN 1.0292 0.110009 0.000064 NaN 0.041448 0.009032
3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 NaN 1.0238 0.201102 0.000043 NaN 0.045109 0.021484
4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 NaN 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ...
514954 689009.XSHG 2022-11 0.041529 2022-10 23.518985 -0.166109 -0.401406 0.7083 0.209701 0.000264 0.000206 0.059961 0.051851
514955 689009.XSHG 2022-12 -0.088534 2022-11 23.561206 0.041529 -0.474030 0.7363 0.201033 0.000122 0.000202 0.017044 0.010108
514956 689009.XSHG 2023-01 0.086698 2022-12 23.470648 -0.088534 -0.523619 0.6919 0.220085 0.000194 0.000210 0.019017 0.013598
514957 689009.XSHG 2023-02 -0.007635 2023-01 23.555498 0.086698 -0.498602 0.7379 0.201772 0.000100 0.000210 0.014183 0.009679
514958 689009.XSHG 2023-03 -0.013800 2023-02 23.549757 -0.007635 -0.418230 0.7453 0.202930 0.000091 0.000202 0.024867 0.012591

514959 rows × 13 columns

  • reversal 的 NA 是由于在对应的return date,上个月停牌所以没有上个月的return。
  • beta, bm 是优矿的NA。可以用当月的横截面上的中值填充
  • illiq, ivol, vol 也可用当月的横截面上的中值填充.
In [19]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
exret 0
ym 0
size 0
rev 4297
mom 40162
beta 21372
bm 1004
illiq 13186
illiq_12m 71577
vol 2716
ivol 4413
In [20]:
# Reversal 的空值丢掉,其他的用 median 填充
df = df[~df['rev'].isna()].copy()
In [21]:
cols = ['mom','beta','bm','illiq','illiq_12m','vol','ivol']
In [22]:
df
Out[22]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 NaN 0.9468 0.094476 0.000025 NaN 0.026541 NaN
1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 NaN 0.9654 0.109513 0.000039 NaN 0.037722 0.012909
2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 NaN 1.0292 0.110009 0.000064 NaN 0.041448 0.009032
3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 NaN 1.0238 0.201102 0.000043 NaN 0.045109 0.021484
4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 NaN 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ...
514954 689009.XSHG 2022-11 0.041529 2022-10 23.518985 -0.166109 -0.401406 0.7083 0.209701 0.000264 0.000206 0.059961 0.051851
514955 689009.XSHG 2022-12 -0.088534 2022-11 23.561206 0.041529 -0.474030 0.7363 0.201033 0.000122 0.000202 0.017044 0.010108
514956 689009.XSHG 2023-01 0.086698 2022-12 23.470648 -0.088534 -0.523619 0.6919 0.220085 0.000194 0.000210 0.019017 0.013598
514957 689009.XSHG 2023-02 -0.007635 2023-01 23.555498 0.086698 -0.498602 0.7379 0.201772 0.000100 0.000210 0.014183 0.009679
514958 689009.XSHG 2023-03 -0.013800 2023-02 23.549757 -0.007635 -0.418230 0.7453 0.202930 0.000091 0.000202 0.024867 0.012591

510662 rows × 13 columns

In [23]:
temp = df.groupby('ret_date',as_index=False)[cols].transform(lambda x: x.fillna(x.median()))
In [24]:
temp.fillna(0, inplace=True)
In [25]:
df[cols] = temp.copy()
In [26]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
exret 0
ym 0
size 0
rev 0
mom 0
beta 0
bm 0
illiq 0
illiq_12m 0
vol 0
ivol 0
In [27]:
df
Out[27]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 0.796305 0.9468 0.094476 0.000025 0.000505 0.026541 0.000000
1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 1.145639 0.9654 0.109513 0.000039 0.000494 0.037722 0.012909
2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 0.693690 1.0292 0.110009 0.000064 0.000490 0.041448 0.009032
3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 0.558575 1.0238 0.201102 0.000043 0.000526 0.045109 0.021484
4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 -0.048874 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ...
514954 689009.XSHG 2022-11 0.041529 2022-10 23.518985 -0.166109 -0.401406 0.7083 0.209701 0.000264 0.000206 0.059961 0.051851
514955 689009.XSHG 2022-12 -0.088534 2022-11 23.561206 0.041529 -0.474030 0.7363 0.201033 0.000122 0.000202 0.017044 0.010108
514956 689009.XSHG 2023-01 0.086698 2022-12 23.470648 -0.088534 -0.523619 0.6919 0.220085 0.000194 0.000210 0.019017 0.013598
514957 689009.XSHG 2023-02 -0.007635 2023-01 23.555498 0.086698 -0.498602 0.7379 0.201772 0.000100 0.000210 0.014183 0.009679
514958 689009.XSHG 2023-03 -0.013800 2023-02 23.549757 -0.007635 -0.418230 0.7453 0.202930 0.000091 0.000202 0.024867 0.012591

510662 rows × 13 columns

In [28]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 510662 entries, 0 to 514958
Data columns (total 13 columns):
 #   Column     Non-Null Count   Dtype    
---  ------     --------------   -----    
 0   secID      510662 non-null  object   
 1   ret_date   510662 non-null  period[M]
 2   exret      510662 non-null  float64  
 3   ym         510662 non-null  period[M]
 4   size       510662 non-null  float64  
 5   rev        510662 non-null  float64  
 6   mom        510662 non-null  float64  
 7   beta       510662 non-null  float64  
 8   bm         510662 non-null  float64  
 9   illiq      510662 non-null  float64  
 10  illiq_12m  510662 non-null  float64  
 11  vol        510662 non-null  float64  
 12  ivol       510662 non-null  float64  
dtypes: float64(10), object(1), period[M](2)
memory usage: 54.5+ MB

Use rank instead of numerical values¶

$$c_{i,t} = \frac{2}{N+1}CSrank(c^r_{i,t}) - 1$$

$c^r_{i,t}$ is the original value, $CSrank$ ranks the value with other firms in the same month t

In [29]:
def csrank(df):
    return df.rank() * 2 / (len(df) + 1) - 1
In [30]:
num_X_cols = df.select_dtypes('number').columns.drop('exret').tolist()
In [31]:
num_X_cols
Out[31]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [32]:
df[['ret_date']+num_X_cols]
Out[32]:
ret_date size rev mom beta bm illiq illiq_12m vol ivol
0 2008-01 24.909069 0.066834 0.796305 0.9468 0.094476 0.000025 0.000505 0.026541 0.000000
1 2008-02 24.792329 -0.140255 1.145639 0.9654 0.109513 0.000039 0.000494 0.037722 0.012909
2 2008-03 24.787814 -0.007450 0.693690 1.0292 0.110009 0.000064 0.000490 0.041448 0.009032
3 2008-04 24.626093 -0.152068 0.558575 1.0238 0.201102 0.000043 0.000526 0.045109 0.021484
4 2008-05 24.675221 0.047493 -0.048874 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ...
514954 2022-11 23.518985 -0.166109 -0.401406 0.7083 0.209701 0.000264 0.000206 0.059961 0.051851
514955 2022-12 23.561206 0.041529 -0.474030 0.7363 0.201033 0.000122 0.000202 0.017044 0.010108
514956 2023-01 23.470648 -0.088534 -0.523619 0.6919 0.220085 0.000194 0.000210 0.019017 0.013598
514957 2023-02 23.555498 0.086698 -0.498602 0.7379 0.201772 0.000100 0.000210 0.014183 0.009679
514958 2023-03 23.549757 -0.007635 -0.418230 0.7453 0.202930 0.000091 0.000202 0.024867 0.012591

510662 rows × 10 columns

In [123]:
df[['ret_date','size']].groupby('ret_date',group_keys=True).apply(csrank)
Out[123]:
ret_date size
ret_date
2008-01 0 0.0 0.969559
183 0.0 0.990868
651 0.0 0.522070
1504 0.0 0.678843
1687 0.0 -0.231355
... ... ... ...
2023-03 514851 0.0 -0.743772
514871 0.0 0.278208
514897 0.0 0.041658
514929 0.0 0.943898
514958 0.0 0.672179

510662 rows × 2 columns

In [33]:
temp = df[['ret_date']+num_X_cols].groupby('ret_date',group_keys=True).apply(csrank)
In [34]:
temp
Out[34]:
ret_date size rev mom beta bm illiq illiq_12m vol ivol
ret_date
2008-01 0 0.0 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
183 0.0 0.990868 -0.990868 0.852359 0.662100 -0.375951 -0.996956 -0.989346 0.745814 0.000000
651 0.0 0.522070 -0.972603 0.552511 0.523592 0.283105 -0.223744 -0.595129 0.814307 0.000000
1504 0.0 0.678843 -0.506849 -0.517504 0.775495 -0.636225 -0.698630 -0.517504 0.493151 0.000000
1687 0.0 -0.231355 -0.945967 0.709285 0.000000 -0.403349 0.000000 0.000000 -0.982496 0.000000
... ... ... ... ... ... ... ... ... ... ... ...
2023-03 514851 0.0 -0.743772 0.284488 0.184844 -0.557254 0.279464 0.348964 0.836718 -0.684739 -0.466192
514871 0.0 0.278208 -0.802805 0.209964 0.550345 -0.735817 -0.220431 0.083944 0.504291 0.274440
514897 0.0 0.041658 -0.816203 0.559975 -0.490057 0.016956 -0.027842 -0.103203 -0.330961 -0.108227
514929 0.0 0.943898 -0.279883 -0.469960 -0.014026 0.134603 -0.969018 -0.969018 -0.486707 -0.643291
514958 0.0 0.672179 -0.409253 -0.964831 0.139418 -0.605610 -0.406322 -0.243458 0.471216 0.117856

510662 rows × 10 columns

In [35]:
temp.drop('ret_date',axis=1).reset_index()
Out[35]:
ret_date level_1 size rev mom beta bm illiq illiq_12m vol ivol
0 2008-01 0 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
1 2008-01 183 0.990868 -0.990868 0.852359 0.662100 -0.375951 -0.996956 -0.989346 0.745814 0.000000
2 2008-01 651 0.522070 -0.972603 0.552511 0.523592 0.283105 -0.223744 -0.595129 0.814307 0.000000
3 2008-01 1504 0.678843 -0.506849 -0.517504 0.775495 -0.636225 -0.698630 -0.517504 0.493151 0.000000
4 2008-01 1687 -0.231355 -0.945967 0.709285 0.000000 -0.403349 0.000000 0.000000 -0.982496 0.000000
... ... ... ... ... ... ... ... ... ... ... ...
510657 2023-03 514851 -0.743772 0.284488 0.184844 -0.557254 0.279464 0.348964 0.836718 -0.684739 -0.466192
510658 2023-03 514871 0.278208 -0.802805 0.209964 0.550345 -0.735817 -0.220431 0.083944 0.504291 0.274440
510659 2023-03 514897 0.041658 -0.816203 0.559975 -0.490057 0.016956 -0.027842 -0.103203 -0.330961 -0.108227
510660 2023-03 514929 0.943898 -0.279883 -0.469960 -0.014026 0.134603 -0.969018 -0.969018 -0.486707 -0.643291
510661 2023-03 514958 0.672179 -0.409253 -0.964831 0.139418 -0.605610 -0.406322 -0.243458 0.471216 0.117856

510662 rows × 11 columns

In [36]:
temp = temp.drop('ret_date',axis=1).reset_index().set_index('level_1')
temp
Out[36]:
ret_date size rev mom beta bm illiq illiq_12m vol ivol
level_1
0 2008-01 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
183 2008-01 0.990868 -0.990868 0.852359 0.662100 -0.375951 -0.996956 -0.989346 0.745814 0.000000
651 2008-01 0.522070 -0.972603 0.552511 0.523592 0.283105 -0.223744 -0.595129 0.814307 0.000000
1504 2008-01 0.678843 -0.506849 -0.517504 0.775495 -0.636225 -0.698630 -0.517504 0.493151 0.000000
1687 2008-01 -0.231355 -0.945967 0.709285 0.000000 -0.403349 0.000000 0.000000 -0.982496 0.000000
... ... ... ... ... ... ... ... ... ... ...
514851 2023-03 -0.743772 0.284488 0.184844 -0.557254 0.279464 0.348964 0.836718 -0.684739 -0.466192
514871 2023-03 0.278208 -0.802805 0.209964 0.550345 -0.735817 -0.220431 0.083944 0.504291 0.274440
514897 2023-03 0.041658 -0.816203 0.559975 -0.490057 0.016956 -0.027842 -0.103203 -0.330961 -0.108227
514929 2023-03 0.943898 -0.279883 -0.469960 -0.014026 0.134603 -0.969018 -0.969018 -0.486707 -0.643291
514958 2023-03 0.672179 -0.409253 -0.964831 0.139418 -0.605610 -0.406322 -0.243458 0.471216 0.117856

510662 rows × 10 columns

In [37]:
df_rank = pd.merge(df.drop(num_X_cols, axis=1),
                   temp.drop('ret_date',axis=1),
                   left_index=True, right_index=True)
In [38]:
del temp
In [39]:
df_rank
Out[39]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784
... ... ... ... ... ... ... ... ... ... ... ... ... ...
514954 689009.XSHG 2022-11 0.041529 2022-10 0.700640 -0.936034 -0.827719 -0.674414 -0.684009 -0.107036 -0.260981 0.953092 0.987207
514955 689009.XSHG 2022-12 -0.088534 2022-11 0.696713 -0.218240 -0.865111 -0.164157 -0.676776 -0.345917 -0.257688 -0.380276 -0.379003
514956 689009.XSHG 2023-01 0.086698 2022-12 0.682479 -0.431583 -0.970483 -0.223066 -0.635252 -0.150327 -0.242673 -0.109846 0.041113
514957 689009.XSHG 2023-02 -0.007635 2023-01 0.682247 0.145253 -0.983232 0.037099 -0.625865 -0.477258 -0.247956 -0.309998 -0.089499
514958 689009.XSHG 2023-03 -0.013800 2023-02 0.672179 -0.409253 -0.964831 0.139418 -0.605610 -0.406322 -0.243458 0.471216 0.117856

510662 rows × 13 columns

Train, Validation, Test split¶

In [40]:
df_rank['year'] = df_rank['ret_date'].dt.year
In [41]:
time_idx = [value for (key, value) in sorted(df_rank.groupby('year').groups.items())]
In [42]:
# sorted(df_rank.groupby('year').groups.items())
In [43]:
time_idx
Out[43]:
[Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                  8,      9,
             ...
             461853, 461854, 461855, 461856, 461857, 461858, 461859, 461860,
             461861, 461862],
            dtype='int64', length=16621),
 Int64Index([    12,     13,     14,     15,     16,     17,     18,     19,
                 20,     21,
             ...
             461865, 461866, 461867, 461868, 461869, 461870, 461871, 461872,
             461873, 461874],
            dtype='int64', length=17335),
 Int64Index([    24,     25,     26,     27,     28,     29,     30,     31,
                 32,     33,
             ...
             461877, 461878, 461879, 461880, 461881, 461882, 461883, 461884,
             461885, 461886],
            dtype='int64', length=19823),
 Int64Index([    36,     37,     38,     39,     40,     41,     42,     43,
                 44,     45,
             ...
             461889, 461890, 461891, 461892, 461893, 461894, 461895, 461896,
             461897, 461898],
            dtype='int64', length=23864),
 Int64Index([    48,     49,     50,     51,     52,     53,     54,     55,
                 56,     57,
             ...
             478263, 478264, 478265, 479640, 479641, 479642, 490582, 490583,
             490584, 499706],
            dtype='int64', length=26912),
 Int64Index([    60,     61,     62,     63,     64,     65,     66,     67,
                 68,     69,
             ...
             499709, 499710, 499711, 499712, 499713, 499714, 499715, 499716,
             499717, 499718],
            dtype='int64', length=28592),
 Int64Index([    72,     73,     74,     75,     76,     77,     78,     79,
                 80,     81,
             ...
             499721, 499722, 499723, 499724, 499725, 499726, 499727, 499728,
             499729, 499730],
            dtype='int64', length=29539),
 Int64Index([    84,     85,     86,     87,     88,     89,     90,     91,
                 92,     93,
             ...
             500009, 500010, 500011, 500012, 500013, 500014, 500015, 500016,
             500017, 500018],
            dtype='int64', length=31727),
 Int64Index([    96,     97,     98,     99,    100,    101,    102,    103,
                104,    105,
             ...
             500108, 500109, 500110, 500111, 500112, 500113, 500114, 500115,
             500116, 500117],
            dtype='int64', length=33468),
 Int64Index([   108,    109,    110,    111,    112,    113,    114,    115,
                116,    117,
             ...
             500120, 500121, 500122, 500123, 500124, 500125, 500126, 500127,
             500128, 500129],
            dtype='int64', length=37665),
 Int64Index([   120,    121,    122,    123,    124,    125,    126,    127,
                128,    129,
             ...
             500132, 500133, 500134, 500135, 500136, 500137, 500138, 500139,
             500140, 500141],
            dtype='int64', length=41103),
 Int64Index([   132,    133,    134,    135,    136,    137,    138,    139,
                140,    141,
             ...
             510143, 510144, 510145, 510667, 510729, 510770, 511041, 511042,
             511043, 511044],
            dtype='int64', length=41992),
 Int64Index([   144,    145,    146,    147,    148,    149,    150,    151,
                152,    153,
             ...
             513137, 513138, 513139, 513140, 514745, 514899, 514900, 514901,
             514902, 514931],
            dtype='int64', length=44134),
 Int64Index([   156,    157,    158,    159,    160,    161,    162,    163,
                164,    165,
             ...
             514934, 514935, 514936, 514937, 514938, 514939, 514940, 514941,
             514942, 514943],
            dtype='int64', length=49181),
 Int64Index([   168,    169,    170,    171,    172,    173,    174,    175,
                176,    177,
             ...
             514946, 514947, 514948, 514949, 514950, 514951, 514952, 514953,
             514954, 514955],
            dtype='int64', length=54418),
 Int64Index([   180,    181,    182,    363,    364,    365,    831,    832,
                833,    946,
             ...
             514871, 514895, 514896, 514897, 514927, 514928, 514929, 514956,
             514957, 514958],
            dtype='int64', length=14288)]
In [44]:
df_rank.groupby('year')['secID'].nunique()
Out[44]:
year
2008    1463
2009    1530
2010    1841
2011    2142
2012    2383
2013    2432
2014    2549
2015    2772
2016    2941
2017    3392
2018    3522
2019    3648
2020    3961
2021    4422
2022    4770
2023    4777
Name: secID, dtype: int64
In [45]:
df_rank.groupby('year')['secID'].count()
Out[45]:
year
2008    16621
2009    17335
2010    19823
2011    23864
2012    26912
2013    28592
2014    29539
2015    31727
2016    33468
2017    37665
2018    41103
2019    41992
2020    44134
2021    49181
2022    54418
2023    14288
Name: secID, dtype: int64
In [46]:
def list_flat(list_):
    return [item for sublist in list_ for item in sublist]
# This is the same as:
# def list_flat2(list_):
#     result = []
#     for sublist in list_:
#         for item in sublist:
#             result.append(item)
#     return result
In [47]:
list_flat([[1,2,3],[3,4,5]])
Out[47]:
[1, 2, 3, 3, 4, 5]
In [48]:
np.array([[1,2,3],[3,4,5]]).flatten()
Out[48]:
array([1, 2, 3, 3, 4, 5])
In [49]:
df_rank
Out[49]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol year
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
514954 689009.XSHG 2022-11 0.041529 2022-10 0.700640 -0.936034 -0.827719 -0.674414 -0.684009 -0.107036 -0.260981 0.953092 0.987207 2022
514955 689009.XSHG 2022-12 -0.088534 2022-11 0.696713 -0.218240 -0.865111 -0.164157 -0.676776 -0.345917 -0.257688 -0.380276 -0.379003 2022
514956 689009.XSHG 2023-01 0.086698 2022-12 0.682479 -0.431583 -0.970483 -0.223066 -0.635252 -0.150327 -0.242673 -0.109846 0.041113 2023
514957 689009.XSHG 2023-02 -0.007635 2023-01 0.682247 0.145253 -0.983232 0.037099 -0.625865 -0.477258 -0.247956 -0.309998 -0.089499 2023
514958 689009.XSHG 2023-03 -0.013800 2023-02 0.672179 -0.409253 -0.964831 0.139418 -0.605610 -0.406322 -0.243458 0.471216 0.117856 2023

510662 rows × 14 columns

In [51]:
# training, validation, testing scheme:
# 1. [2008-2011], [2012-2015], [2016]
# 2. [2008-2012], [2013-2016], [2017]
# ...
# last. [2008-2018], [2019-2022], [2023]
fulltrain_idx = []
cv_idx = []
test_idx = []
for i in range(4,len(time_idx)-4):
    train_idx = list_flat(time_idx[0:i])
    val_idx = list_flat(time_idx[i:i+4])
    fulltrain_idx.append(train_idx + val_idx)
    cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0], 
                   np.where(np.isin(fulltrain_idx[-1], val_idx))[0])) # GridSearchCV 内部用 array 操作,不能带着pandas的index,
                                                                          # 因此cv_idx需要用fulltrain_idx的编号从0开始
    test_idx.append(time_idx[i+4])
In [52]:
df_rank.loc[fulltrain_idx[-1]]
Out[52]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol year
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
514951 689009.XSHG 2022-08 -0.113907 2022-07 0.784744 0.646208 -0.844805 -0.314336 -0.831653 -0.137221 -0.210872 0.667251 0.736081 2022
514952 689009.XSHG 2022-09 -0.131337 2022-08 0.763169 -0.625599 -0.814976 -0.165869 -0.809752 -0.322159 -0.227253 0.639965 0.803222 2022
514953 689009.XSHG 2022-10 -0.166109 2022-09 0.750215 -0.393626 -0.954350 -0.143626 -0.779500 -0.217054 -0.260551 -0.252799 0.253230 2022
514954 689009.XSHG 2022-11 0.041529 2022-10 0.700640 -0.936034 -0.827719 -0.674414 -0.684009 -0.107036 -0.260981 0.953092 0.987207 2022
514955 689009.XSHG 2022-12 -0.088534 2022-11 0.696713 -0.218240 -0.865111 -0.164157 -0.676776 -0.345917 -0.257688 -0.380276 -0.379003 2022

496374 rows × 14 columns

In [53]:
# Example
a = [0,1,4,5,3000]
np.where(np.isin(a, [0,3000,4]))[0]
Out[53]:
array([0, 2, 4])
In [54]:
test_years = list(range(2016, 2024))
test_years
Out[54]:
[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

Evaluation metrics¶

In [55]:
def r2_oos(y_true, y_pred):
    return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)
In [56]:
r2_oos_scorer = make_scorer(r2_oos)

Sklearn¶

Scikit-learn (sklearn) 的设计理念:

  • Estimators: 可以基于数据估计出参数的东西。用fit()估计。比如填充空值(imputer),linear regression,等等。
  • Transformers (不是神经网络里的那个,不是 Vaswani et al. (2017)): 可以把数据转换成新数据的东西。用transform()转换。一般可以直接用 fit_transform()
  • Predictors: 可以基于数据做预测,比如linear regression

统一的命名规范:

  • hyperparameter可以由model.\<hyperparameter>取出,比如model.n_estimators
  • estimated parameters可以由 model.\<estimate>取出,比如model.featureimportances

数据用np.array保存,或者SciPy的稀疏矩阵。避免各类其他包的自定义(比如pandas)

给出了大量的机器学习模型,同时很容易自定义进行拓展。自定义的模型可以很方便的融入到sklearn自带的模型当中

Transformation pipeline example¶

In [171]:
df_rank
Out[171]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol year
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
514954 689009.XSHG 2022-11 0.041529 2022-10 0.700640 -0.936034 -0.827719 -0.674414 -0.684009 -0.107036 -0.260981 0.953092 0.987207 2022
514955 689009.XSHG 2022-12 -0.088534 2022-11 0.696713 -0.218240 -0.865111 -0.164157 -0.676776 -0.345917 -0.257688 -0.380276 -0.379003 2022
514956 689009.XSHG 2023-01 0.086698 2022-12 0.682479 -0.431583 -0.970483 -0.223066 -0.635252 -0.150327 -0.242673 -0.109846 0.041113 2023
514957 689009.XSHG 2023-02 -0.007635 2023-01 0.682247 0.145253 -0.983232 0.037099 -0.625865 -0.477258 -0.247956 -0.309998 -0.089499 2023
514958 689009.XSHG 2023-03 -0.013800 2023-02 0.672179 -0.409253 -0.964831 0.139418 -0.605610 -0.406322 -0.243458 0.471216 0.117856 2023

510662 rows × 14 columns

In [172]:
X_fulltrain.columns.tolist()
Out[172]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [173]:
X_fulltrain.columns.tolist().index('illiq')
Out[173]:
5
In [174]:
X_fulltrain.columns.tolist().index('illiq_12m')
Out[174]:
6
In [175]:
illiq_idx = 4
illiq_12m_idx = 5
In [176]:
class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_avg_illiq=True):
        self.add_avg_illiq = add_avg_illiq
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        avg_illiq = (X[:,illiq_idx] + X[:, illiq_12m_idx]) / 2
        return np.c_[X, avg_illiq]

feature_adder = FeatureAdder()
In [177]:
X_fulltrain.values.shape
Out[177]:
(194413, 9)
In [178]:
X_fulltrain_new = feature_adder.transform(X_fulltrain.values)
In [179]:
X_fulltrain_new
Out[179]:
array([[ 0.9695586 , -0.85844749,  0.        , ...,  0.27549467,
         0.        , -0.82572298],
       [ 0.97153558, -0.45168539,  0.        , ..., -0.63595506,
        -0.7917603 , -0.78651685],
       [ 0.96733482, -0.70007424,  0.        , ...,  0.437268  ,
        -0.62583519, -0.75575353],
       ...,
       [-0.90249267,  0.521261  ,  0.        , ...,  0.33284457,
         0.83577713,  0.0164956 ],
       [-0.90905757,  0.02090209,  0.        , ..., -0.41620829,
        -0.3186652 ,  0.12431243],
       [-0.88114453,  0.87160675,  0.21129861, ...,  0.09024211,
         0.57446809,  0.09941306]])
In [180]:
X_fulltrain_new.shape
Out[180]:
(194413, 10)
In [181]:
# This can be added to a pipeline
pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('std_scaler', StandardScaler())
])
In [182]:
pipeline.fit_transform(X_fulltrain.values)
Out[182]:
array([[ 1.68015418e+00, -1.48771512e+00, -7.60380513e-19, ...,
         4.77437532e-01, -7.62641454e-19, -1.98928732e+00],
       [ 1.68358010e+00, -7.82784266e-01, -7.60380513e-19, ...,
        -1.10212226e+00, -1.37679088e+00, -1.89483403e+00],
       [ 1.67630057e+00, -1.21324955e+00, -7.60380513e-19, ...,
         7.57793805e-01, -1.08826394e+00, -1.82072068e+00],
       ...,
       [-1.56393521e+00,  9.03360864e-01, -7.60380513e-19, ...,
         5.76826008e-01,  1.45333168e+00,  3.97403136e-02],
       [-1.57531157e+00,  3.62239461e-02, -7.60380513e-19, ...,
        -7.21296915e-01, -5.54126473e-01,  2.99486812e-01],
       [-1.52694089e+00,  1.51052051e+00,  3.66337572e-01, ...,
         1.56391306e-01,  9.98941754e-01,  2.39500587e-01]])

Models¶

Linear regression¶

In [57]:
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
In [58]:
cols
Out[58]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']
In [59]:
model = LinearRegression()
In [60]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.009412304211971145
Test year 2017 : -0.08839594863078148
Test year 2018 : -0.04979526421788871
Test year 2019 : 0.006463809562448852
Test year 2020 : -0.001544287862749627
Test year 2021 : 0.011488412068509812
Test year 2022 : -0.0009306275137825892
Test year 2023 : 0.0538460552856318
In [61]:
cols = ['size','rev','illiq','ivol']
In [62]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.01021329786011016
Test year 2017 : -0.08793262026819404
Test year 2018 : -0.04979850705536615
Test year 2019 : 0.007779285918034451
Test year 2020 : -0.0007569573338341851
Test year 2021 : 0.01083296171623438
Test year 2022 : -0.0017994744447327182
Test year 2023 : 0.0567680782563359

Huber regressor¶

In [63]:
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols
Out[63]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']
In [64]:
model = HuberRegressor(alpha=0.01,epsilon=1.05)
In [65]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : 0.0068762585146247
Test year 2017 : -0.02917724503186392
Test year 2018 : 0.00904631578299342
Test year 2019 : -0.018436209926423253
Test year 2020 : -0.01373519780133825
Test year 2021 : -0.008464097203231491
Test year 2022 : 0.010896689656339609
Test year 2023 : -0.025851837667542954

Random Forest¶

In [66]:
cols = num_X_cols
cols
Out[66]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [69]:
hyperparam_grid = [
    {'n_estimators': [50], 'max_depth': [3,5,7], 
     'max_features': [3,5]}
]
In [70]:
model = RandomForestRegressor(random_state=42)
In [71]:
# Cross validation for period 0, i.e.
# train: [2008-2011], val: [2012-2015], test: [2016]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [72]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0], cols]
y_test = df_rank.loc[test_idx[0], 'exret']
In [73]:
%%time
grid_search.fit(X_fulltrain, y_fulltrain)
CPU times: user 44.2 s, sys: 255 ms, total: 44.5 s
Wall time: 45 s
Out[73]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_depth': [3, 5, 7], 'max_features': [3, 5],
                          'n_estimators': [50]}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [74]:
grid_search.best_params_
Out[74]:
{'max_depth': 7, 'max_features': 3, 'n_estimators': 50}
In [75]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
0.03288083772090255 {'max_depth': 3, 'max_features': 3, 'n_estimators': 50}
0.03268911472458363 {'max_depth': 3, 'max_features': 5, 'n_estimators': 50}
0.05166632659254923 {'max_depth': 5, 'max_features': 3, 'n_estimators': 50}
0.05452611070305944 {'max_depth': 5, 'max_features': 5, 'n_estimators': 50}
0.05636006091771797 {'max_depth': 7, 'max_features': 3, 'n_estimators': 50}
0.05020334277634201 {'max_depth': 7, 'max_features': 5, 'n_estimators': 50}
In [76]:
pd.DataFrame({"features":num_X_cols,"feature_importance":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',
                                                                                                                        ascending=False)
Out[76]:
features feature_importance
5 illiq 0.198897
1 rev 0.158943
0 size 0.132347
7 vol 0.116127
8 ivol 0.114038
2 mom 0.093076
4 bm 0.079133
6 illiq_12m 0.063776
3 beta 0.043663
In [77]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[77]:
-0.027635441051083953
In [78]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.027635441051083953
Test year 2017 : -0.07734369016377896
Test year 2018 : -0.03959447233848512
Test year 2019 : 0.007431052632130841
Test year 2020 : 0.004284335274858608
Test year 2021 : 0.011873444443791237
Test year 2022 : -0.0027118213183869866
Test year 2023 : 0.05356830744986352
CPU times: user 13min 53s, sys: 6.11 s, total: 13min 59s
Wall time: 14min 13s

Partial Least Squares¶

In [79]:
cols = num_X_cols
cols
Out[79]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [80]:
model = PLSRegression(n_components=4)
In [81]:
y_pred.reshape(-1).shape
Out[81]:
(14288,)
In [82]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X_fulltrain, y_fulltrain)
    y_pred = model.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.008666088091823676
Test year 2017 : -0.09331774541749294
Test year 2018 : -0.049216423103574325
Test year 2019 : 0.006125528081381337
Test year 2020 : -0.0015378783009631913
Test year 2021 : 0.011202972263563482
Test year 2022 : -0.0006573216211354094
Test year 2023 : 0.05334143576809536
CPU times: user 9.06 s, sys: 621 ms, total: 9.68 s
Wall time: 2.68 s

Principal Component Regression¶

PCA transform¶

In [83]:
cols = num_X_cols
cols
Out[83]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [84]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [85]:
pca = PCA(3, random_state=42)
In [86]:
pca.fit(X_fulltrain)
Out[86]:
PCA(n_components=3, random_state=42)
In [87]:
pca.components_
Out[87]:
array([[ 0.5393208 , -0.10042879, -0.02121698,  0.13077125,  0.11124688,
        -0.53998128, -0.55702611, -0.17478931, -0.19160066],
       [ 0.13101866,  0.27952024,  0.28856617,  0.04523341, -0.37628468,
        -0.23483181, -0.13341238,  0.53814814,  0.56146916],
       [ 0.06685249, -0.20225271,  0.52662786, -0.60773615, -0.43785557,
        -0.02737757,  0.022656  , -0.30870296, -0.14023771]])
In [88]:
pca.components_.shape
Out[88]:
(3, 9)
In [89]:
X_fulltrain.shape
Out[89]:
(194413, 9)
In [90]:
pca.components_.T.shape
Out[90]:
(9, 3)
In [91]:
np.matmul(X_fulltrain.values,pca.components_.T)
Out[91]:
array([[ 1.02594082,  0.52223329,  0.42203243],
       [ 1.25984674, -0.33726701,  0.86188511],
       [ 1.13373865,  0.26441179,  0.21948021],
       ...,
       [-1.01876888,  0.6986476 , -0.07366154],
       [-0.74036858, -0.54089609,  0.40091094],
       [-1.76902279,  0.43974171,  0.27114652]])
In [92]:
pca.fit_transform(X_fulltrain)
Out[92]:
array([[ 1.02594082,  0.52223329,  0.42203243],
       [ 1.25984674, -0.33726701,  0.86188511],
       [ 1.13373865,  0.26441179,  0.21948021],
       ...,
       [-1.01876888,  0.6986476 , -0.07366154],
       [-0.74036858, -0.54089609,  0.40091094],
       [-1.76902279,  0.43974171,  0.27114652]])

PCA regression¶

sklearn 是 duck typing,因此无需继承,只需在定义类的时候包括对应的方法,fit()(return self),transform(),fit_transform()即可。

但直接用继承,可以更方便。

  • BaseEstimator是sklearn里最基本的类,其他的类都从这个类继承而来,包括了set_params()和get_params()的方法。
  • TransformerMixin包括了fit_transform()方法。因此由这个类继承而来的话,就不用自定义 fit_transform 了
  • 类似的,RegressorMixin包括了predict()方法
In [95]:
class PCARegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_components=3):
        self.n_components = n_components
    
    def fit(self, X, y):
        self.pca_ = PCA(n_components=self.n_components).fit(X)
        self.X_ = self.pca_.transform(X)
        self.reg_ = LinearRegression().fit(self.X_,y)
        return self
    
    def predict(self, X):
        self.pred_ = self.reg_.predict(self.pca_.transform(X))
        return self.pred_
In [96]:
model = PCARegressor()
In [97]:
model.fit(X=X_fulltrain, y=y_fulltrain)
Out[97]:
PCARegressor()
In [98]:
model.X_
Out[98]:
array([[ 1.02594082,  0.52223329,  0.42203243],
       [ 1.25984674, -0.33726701,  0.86188511],
       [ 1.13373865,  0.26441179,  0.21948021],
       ...,
       [-1.01876888,  0.6986476 , -0.07366154],
       [-0.74036858, -0.54089609,  0.40091094],
       [-1.76902279,  0.43974171,  0.27114652]])
In [99]:
hyperparam_grid = [
    {'n_components': range(1, len(cols)+1)}
]
In [100]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [101]:
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
Out[101]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=PCARegressor(),
             param_grid=[{'n_components': range(1, 10)}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [102]:
grid_search.best_params_
Out[102]:
{'n_components': 6}
In [103]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
nan {'n_components': 1}
0.03978386586015345 {'n_components': 2}
0.03867864707593599 {'n_components': 3}
0.05065681706795535 {'n_components': 4}
0.050715696965028 {'n_components': 5}
0.052339724870998625 {'n_components': 6}
0.043948248157652296 {'n_components': 7}
0.05198899108126847 {'n_components': 8}
0.04966417067338399 {'n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_32890/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)
In [104]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test,y_pred=y_pred)
Out[104]:
-0.010497492168772826
In [105]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.010497492168772826
Test year 2017 : -0.0892205268426225
Test year 2018 : -0.04907431002684648
Test year 2019 : 0.006466449764656601
Test year 2020 : -0.0005226873925128217
Test year 2021 : 0.009468277374521827
Test year 2022 : -0.006204075783554419
Test year 2023 : 0.05486550995882933
CPU times: user 1min 21s, sys: 7.58 s, total: 1min 29s
Wall time: 22.9 s

Pipeline¶

In [106]:
pca = PCA()
linear_reg = LinearRegression()
pipeline = Pipeline(steps=[('pca',pca),
                           ('linear_regression', linear_reg)])
hyperparam_grid = {'pca__n_components': range(1,len(cols)+1)}
grid_search = GridSearchCV(pipeline, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [107]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [108]:
%%time
grid_search.fit(X=X_fulltrain,y=y_fulltrain)
CPU times: user 4.38 s, sys: 447 ms, total: 4.82 s
Wall time: 1.27 s
Out[108]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('linear_regression',
                                        LinearRegression())]),
             param_grid={'pca__n_components': range(1, 10)},
             return_train_score=True, scoring=make_scorer(r2_oos))
In [109]:
grid_search.best_params_
Out[109]:
{'pca__n_components': 6}
In [110]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
nan {'pca__n_components': 1}
0.03978386586015345 {'pca__n_components': 2}
0.03867864707593599 {'pca__n_components': 3}
0.05065681706795535 {'pca__n_components': 4}
0.050715696965028 {'pca__n_components': 5}
0.052339724870998625 {'pca__n_components': 6}
0.043948248157652296 {'pca__n_components': 7}
0.05198899108126847 {'pca__n_components': 8}
0.04966417067338399 {'pca__n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_32890/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)
In [111]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[111]:
-0.010497492168772826

Elastic Net¶

In [112]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [113]:
model = SGDRegressor(penalty='elasticnet')
In [115]:
hyperparam_grid = [{'alpha':[0.0001, 0.001, 0.01, 0.1],
                    'l1_ratio':[0.15, 0.30, 0.5, 0.7]}]
In [116]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [117]:
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
Out[117]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=SGDRegressor(penalty='elasticnet'),
             param_grid=[{'alpha': [0.0001, 0.001, 0.01, 0.1],
                          'l1_ratio': [0.15, 0.3, 0.5, 0.7]}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [118]:
grid_search.best_params_
Out[118]:
{'alpha': 0.01, 'l1_ratio': 0.5}
In [119]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[119]:
-0.0311148263765626
In [120]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.00864821027186724
Test year 2017 : -0.10212855467368231
Test year 2018 : -0.0360399221201777
Test year 2019 : 0.009780619233734189
Test year 2020 : -0.0013783134494498306
Test year 2021 : 0.007371679077704529
Test year 2022 : -0.010387142856184584
Test year 2023 : 0.0358911891731738
CPU times: user 1min 49s, sys: 3.03 s, total: 1min 52s
Wall time: 39.5 s

Gradient Boosted Regression Trees¶

In [128]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [129]:
hyperparam_grid = [
    {'max_depth': [1,2,3,4,5,6], 
     'learning_rate': [0.1, 0.05, 0.01]}
]
In [130]:
model = GradientBoostingRegressor()
In [131]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [132]:
%%time
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
CPU times: user 8min 4s, sys: 1.33 s, total: 8min 6s
Wall time: 8min 10s
Out[132]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=GradientBoostingRegressor(),
             param_grid=[{'learning_rate': [0.1, 0.05, 0.01],
                          'max_depth': [1, 2, 3, 4, 5, 6]}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [133]:
grid_search.best_params_
Out[133]:
{'learning_rate': 0.1, 'max_depth': 3}
In [134]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(mean_score, params)
0.0016364787061411423 {'learning_rate': 0.1, 'max_depth': 1}
0.0031692481731780964 {'learning_rate': 0.1, 'max_depth': 2}
0.005036926182540924 {'learning_rate': 0.1, 'max_depth': 3}
0.0014929783765171845 {'learning_rate': 0.1, 'max_depth': 4}
0.002093148803113287 {'learning_rate': 0.1, 'max_depth': 5}
-0.002867794489446185 {'learning_rate': 0.1, 'max_depth': 6}
0.0009616462813958337 {'learning_rate': 0.05, 'max_depth': 1}
0.002077392481220186 {'learning_rate': 0.05, 'max_depth': 2}
0.004847425597103383 {'learning_rate': 0.05, 'max_depth': 3}
0.004754069888309176 {'learning_rate': 0.05, 'max_depth': 4}
0.004074091843459637 {'learning_rate': 0.05, 'max_depth': 5}
0.0021864643800394434 {'learning_rate': 0.05, 'max_depth': 6}
-0.001052554576351561 {'learning_rate': 0.01, 'max_depth': 1}
-0.00010459917823935072 {'learning_rate': 0.01, 'max_depth': 2}
0.0014224899839351268 {'learning_rate': 0.01, 'max_depth': 3}
0.0030169482746364995 {'learning_rate': 0.01, 'max_depth': 4}
0.003678689341579111 {'learning_rate': 0.01, 'max_depth': 5}
0.003458896012854762 {'learning_rate': 0.01, 'max_depth': 6}
In [135]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[135]:
-0.045627159691304264

Neural Nets¶

In [136]:
tf.__version__
Out[136]:
'2.8.0'
In [137]:
keras.__version__
Out[137]:
'2.8.0'
In [138]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_train = X_fulltrain.values[cv_idx[0][0]]
y_train = y_fulltrain.values[cv_idx[0][0]]
X_val = X_fulltrain.values[cv_idx[0][1]]
y_val = y_fulltrain.values[cv_idx[0][1]]
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [139]:
X_train.shape
Out[139]:
(77643, 9)
In [140]:
X_val.shape
Out[140]:
(116770, 9)
In [141]:
nn_model = keras.models.Sequential()
nn_model.add(keras.layers.InputLayer(input_shape=[X_fulltrain.shape[1]]))
nn_model.add(keras.layers.Dense(8, activation='relu'))
nn_model.add(keras.layers.Dense(4, activation='relu'))
nn_model.add(keras.layers.Dense(1))
2023-04-03 09:33:33.820524: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
In [142]:
nn_model.compile(loss='mse',optimizer='sgd')
In [143]:
nn_model.fit(X_train, y_train, epochs=10,
             validation_data=(X_val,y_val))
Epoch 1/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0265 - val_loss: 0.0278
Epoch 2/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0226 - val_loss: 0.0278
Epoch 3/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0225 - val_loss: 0.0276
Epoch 4/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0277
Epoch 5/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0277
Epoch 6/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0276
Epoch 7/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0272
Epoch 8/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0275
Epoch 9/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0274
Epoch 10/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0275
Out[143]:
<keras.callbacks.History at 0x7f9a581a7e20>
In [144]:
y_pred = nn_model.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[144]:
0.005978663942921458

GridSeachCV Neural Nets¶

In [159]:
def build_model(learning_rate=0.003):
    nn_model = keras.models.Sequential()
    nn_model.add(keras.layers.InputLayer(input_shape=[9]))
    nn_model.add(keras.layers.Dense(8, activation='relu'))
    nn_model.add(keras.layers.Dense(4, activation='relu'))
    nn_model.add(keras.layers.Dense(1))
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate) 
    nn_model.compile(loss="mse", optimizer=optimizer)
    return nn_model
In [160]:
# from scikeras.wrappers import KerasRegressor
# keras_reg = KerasRegressor(build_model)
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_32890/2335962656.py:3: DeprecationWarning: KerasRegressor is deprecated, use Sci-Keras (https://github.com/adriangb/scikeras) instead. See https://www.adriangb.com/scikeras/stable/migration.html for help migrating.
  keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
In [167]:
hyperparams_grid = {
    'learning_rate':[0.003]
}
In [168]:
nn_search_cv = GridSearchCV(keras_reg, hyperparams_grid, cv=[cv_idx[0]])
In [169]:
nn_search_cv.fit(X_fulltrain, y_fulltrain, epochs=10,
                 validation_data=(X_val,y_val))
Epoch 1/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0326 - val_loss: 0.0286
Epoch 2/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0230 - val_loss: 0.0278
Epoch 3/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0226 - val_loss: 0.0279
Epoch 4/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0225 - val_loss: 0.0276
Epoch 5/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0225 - val_loss: 0.0277
Epoch 6/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0275
Epoch 7/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0277
Epoch 8/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0276
Epoch 9/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0276
Epoch 10/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0276
3650/3650 [==============================] - 3s 906us/step - loss: 0.0276
Epoch 1/10
6076/6076 [==============================] - 6s 975us/step - loss: 0.0333 - val_loss: 0.0272
Epoch 2/10
6076/6076 [==============================] - 6s 954us/step - loss: 0.0253 - val_loss: 0.0268
Epoch 3/10
6076/6076 [==============================] - 6s 956us/step - loss: 0.0252 - val_loss: 0.0267
Epoch 4/10
6076/6076 [==============================] - 6s 963us/step - loss: 0.0251 - val_loss: 0.0267
Epoch 5/10
6076/6076 [==============================] - 6s 949us/step - loss: 0.0251 - val_loss: 0.0268
Epoch 6/10
6076/6076 [==============================] - 6s 966us/step - loss: 0.0251 - val_loss: 0.0267
Epoch 7/10
6076/6076 [==============================] - 6s 1ms/step - loss: 0.0251 - val_loss: 0.0268
Epoch 8/10
6076/6076 [==============================] - 6s 1ms/step - loss: 0.0251 - val_loss: 0.0267
Epoch 9/10
6076/6076 [==============================] - 6s 1ms/step - loss: 0.0251 - val_loss: 0.0267
Epoch 10/10
6076/6076 [==============================] - 6s 963us/step - loss: 0.0251 - val_loss: 0.0267
Out[169]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x7f9a4ac096a0>,
             param_grid={'learning_rate': [0.003]})
In [170]:
y_pred = nn_search_cv.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[170]:
-0.023086233917368748
In [ ]: