In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, HuberRegressor, SGDRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline

# import lightgbm as lgb

import tensorflow as tf
from tensorflow import keras
  • Data
    • Handling NA values
    • Use rank instead of numerical values
  • Train, Validation, Test split
  • Evaluation metrics
  • Sklearn
  • Models
    • Linear regression
    • Huber regressor
    • Random Forest
    • Partial Least Squares
    • Principal Component Regression
      • PCA transform
      • PCA regression
    • Pipeline
    • Elastic Net
    • Gradient Boosted Regression Trees
    • Neural Nets
      • GridSearchCV Neural Nets
  • Transformation pipeline example

Data¶

In [ ]:
df = pd.read_pickle('../../../data/factor_exposure/all_exposure_2024.pkl')
In [ ]:
df
Out[ ]:
secID ret_date tradeDate ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
0 000001.XSHE 2007-07 2007-06-29 0.316497 0.002481 0.314016 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614 0.123739 NaN NaN NaN NaN NaN NaN
1 000001.XSHE 2007-08 2007-07-31 0.048855 0.002404 0.046451 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN
2 000001.XSHE 2007-09 2007-08-31 0.052105 0.002621 0.049484 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN
3 000001.XSHE 2007-10 2007-09-28 0.201851 0.003095 0.198756 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN
4 000001.XSHE 2007-11 2007-10-31 -0.249116 0.003780 -0.252896 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
616458 689009.XSHG 2024-01 2023-12-29 -0.213082 0.001902 -0.214983 2023-12 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228
616459 689009.XSHG 2024-02 2024-01-31 0.298201 0.001749 0.296451 2024-01 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890
616460 689009.XSHG 2024-03 2024-02-29 -0.011551 0.001783 -0.013334 2024-02 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755
616461 689009.XSHG 2024-04 2024-03-29 -0.071786 0.001687 -0.073474 2024-03 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928
616462 689009.XSHG NaT 2024-04-12 NaN NaN NaN 2024-04 1.433023e+10 23.385637 -0.073474 2024-03 -0.104366 NaN 0.260342 NaN 0.000121 NaN NaN NaN NaN

616463 rows × 20 columns

In [ ]:
df.drop('tradeDate',axis=1,inplace=True)
In [ ]:
df
Out[ ]:
secID ret_date ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
0 000001.XSHE 2007-07 0.316497 0.002481 0.314016 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614 0.123739 NaN NaN NaN NaN NaN NaN
1 000001.XSHE 2007-08 0.048855 0.002404 0.046451 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN
2 000001.XSHE 2007-09 0.052105 0.002621 0.049484 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN
3 000001.XSHE 2007-10 0.201851 0.003095 0.198756 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN
4 000001.XSHE 2007-11 -0.249116 0.003780 -0.252896 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
616458 689009.XSHG 2024-01 -0.213082 0.001902 -0.214983 2023-12 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228
616459 689009.XSHG 2024-02 0.298201 0.001749 0.296451 2024-01 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890
616460 689009.XSHG 2024-03 -0.011551 0.001783 -0.013334 2024-02 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755
616461 689009.XSHG 2024-04 -0.071786 0.001687 -0.073474 2024-03 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928
616462 689009.XSHG NaT NaN NaN NaN 2024-04 1.433023e+10 23.385637 -0.073474 2024-03 -0.104366 NaN 0.260342 NaN 0.000121 NaN NaN NaN NaN

616463 rows × 19 columns

Handling NA values¶

In [ ]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 5299
ret 23412
rf 5299
exret 23412
ym 0
mktcap 17365
size 17365
rev 22223
mom_date 4110
mom 56595
beta 29710
bm 5210
illiq 36680
illiq_12m 107758
vol 25669
ivol 40175
vol_clip 25669
ivol_clip 40175

Drop the rows where ret_date is NA: these sit at the most recent end of the data, where the next-month return does not exist yet.

In [ ]:
df = df[~df['ret_date'].isna()].copy()
In [ ]:
df
Out[ ]:
secID ret_date ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
0 000001.XSHE 2007-07 0.316497 0.002481 0.314016 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614 0.123739 NaN NaN NaN NaN NaN NaN
1 000001.XSHE 2007-08 0.048855 0.002404 0.046451 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN
2 000001.XSHE 2007-09 0.052105 0.002621 0.049484 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN
3 000001.XSHE 2007-10 0.201851 0.003095 0.198756 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN
4 000001.XSHE 2007-11 -0.249116 0.003780 -0.252896 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
616457 689009.XSHG 2023-12 -0.103927 0.002068 -0.105996 2023-11 1.732706e+10 23.575535 0.007540 2023-10 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128 0.017594 0.015128
616458 689009.XSHG 2024-01 -0.213082 0.001902 -0.214983 2023-12 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228
616459 689009.XSHG 2024-02 0.298201 0.001749 0.296451 2024-01 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890
616460 689009.XSHG 2024-03 -0.011551 0.001783 -0.013334 2024-02 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755
616461 689009.XSHG 2024-04 -0.071786 0.001687 -0.073474 2024-03 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928

611164 rows × 19 columns

Momentum only starts in 2008-01. For simplicity, restrict all data to start from 2008-01.

In [ ]:
df.loc[~df['mom'].isna(),'ret_date'].min()
Out[ ]:
Period('2008-01', 'M')
In [ ]:
df = df[df['ret_date'] >= '2008-01'].copy()
In [ ]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
ret 17713
rf 0
exret 17713
ym 0
mktcap 17015
size 17015
rev 21679
mom_date 3931
mom 43462
beta 24044
bm 5017
illiq 30823
illiq_12m 95560
vol 19966
ivol 21978
vol_clip 19966
ivol_clip 21978

The remaining NA values have at least three sources:

  • rows created when filling in suspended-trading dates,
  • minimum-sample-size requirements when computing the characteristics,
  • NA values given directly by the data provider 优矿 (Uqer)

Rows where the return is NA are simply dropped.

In [ ]:
df = df[~df['ret'].isna()].copy()
In [ ]:
df
Out[ ]:
secID ret_date ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
6 000001.XSHE 2008-01 -0.137306 0.002949 -0.140255 2007-12 6.574629e+10 24.909069 0.066834 2007-11 NaN 0.9468 0.094476 0.000025 NaN 0.026541 NaN 0.026541 NaN
7 000001.XSHE 2008-02 -0.004504 0.002946 -0.007450 2008-01 5.850212e+10 24.792329 -0.140255 2007-12 NaN 0.9654 0.109513 0.000039 NaN 0.037722 0.012909 0.037722 0.012909
8 000001.XSHE 2008-03 -0.149321 0.002746 -0.152068 2008-02 5.823860e+10 24.787814 -0.007450 2008-01 NaN 1.0292 0.110009 0.000064 NaN 0.041448 0.009032 0.041448 0.009032
9 000001.XSHE 2008-04 0.050355 0.002862 0.047493 2008-03 4.954234e+10 24.626093 -0.152068 2008-02 NaN 1.0238 0.201102 0.000043 NaN 0.045109 0.021484 0.045109 0.021484
10 000001.XSHE 2008-05 -0.148211 0.002953 -0.151164 2008-04 5.203702e+10 24.675221 0.047493 2008-03 NaN 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
616457 689009.XSHG 2023-12 -0.103927 0.002068 -0.105996 2023-11 1.732706e+10 23.575535 0.007540 2023-10 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128 0.017594 0.015128
616458 689009.XSHG 2024-01 -0.213082 0.001902 -0.214983 2023-12 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228
616459 689009.XSHG 2024-02 0.298201 0.001749 0.296451 2024-01 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890
616460 689009.XSHG 2024-03 -0.011551 0.001783 -0.013334 2024-02 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755
616461 689009.XSHG 2024-04 -0.071786 0.001687 -0.073474 2024-03 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928

580482 rows × 19 columns

In [ ]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
ret 0
rf 0
exret 0
ym 0
mktcap 0
size 0
rev 4664
mom_date 3931
mom 43434
beta 22149
bm 1463
illiq 13508
illiq_12m 79369
vol 2874
ivol 4699
vol_clip 2874
ivol_clip 4699
In [ ]:
df.drop(['mom_date','mktcap','vol_clip','ivol_clip'],axis=1,inplace=True)
In [ ]:
df.drop(['ret','rf'],axis=1,inplace=True)
In [ ]:
df.reset_index(inplace=True,drop=True)
In [ ]:
df
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 NaN 0.9468 0.094476 0.000025 NaN 0.026541 NaN
1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 NaN 0.9654 0.109513 0.000039 NaN 0.037722 0.012909
2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 NaN 1.0292 0.110009 0.000064 NaN 0.041448 0.009032
3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 NaN 1.0238 0.201102 0.000043 NaN 0.045109 0.021484
4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 NaN 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 23.575535 0.007540 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128
580478 689009.XSHG 2024-01 -0.214983 2023-12 23.465801 -0.105996 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228
580479 689009.XSHG 2024-02 0.296451 2024-01 23.226170 -0.214983 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890
580480 689009.XSHG 2024-03 -0.013334 2024-02 23.487149 0.296451 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755
580481 689009.XSHG 2024-04 -0.073474 2024-03 23.460131 -0.013334 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928

580482 rows × 13 columns

  • The NAs in rev (short-term reversal) arise because the stock was suspended in the previous month, so the previous-month return needed at that return date does not exist.
  • The NAs in beta and bm come directly from Uqer; they can be filled with the cross-sectional median of the month.
  • illiq, ivol, and vol can likewise be filled with the cross-sectional median of the month.
In [ ]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
exret 0
ym 0
size 0
rev 4664
mom 43434
beta 22149
bm 1463
illiq 13508
illiq_12m 79369
vol 2874
ivol 4699
In [ ]:
# Drop the NAs in rev (reversal); fill the rest with the cross-sectional median
df = df[~df['rev'].isna()].copy()
In [ ]:
cols = ['mom','beta','bm','illiq','illiq_12m','vol','ivol']
In [ ]:
df
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 NaN 0.9468 0.094476 0.000025 NaN 0.026541 NaN
1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 NaN 0.9654 0.109513 0.000039 NaN 0.037722 0.012909
2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 NaN 1.0292 0.110009 0.000064 NaN 0.041448 0.009032
3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 NaN 1.0238 0.201102 0.000043 NaN 0.045109 0.021484
4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 NaN 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 23.575535 0.007540 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128
580478 689009.XSHG 2024-01 -0.214983 2023-12 23.465801 -0.105996 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228
580479 689009.XSHG 2024-02 0.296451 2024-01 23.226170 -0.214983 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890
580480 689009.XSHG 2024-03 -0.013334 2024-02 23.487149 0.296451 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755
580481 689009.XSHG 2024-04 -0.073474 2024-03 23.460131 -0.013334 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928

575818 rows × 13 columns

In [ ]:
temp = df.groupby('ret_date',as_index=False)[cols].transform(lambda x: x.fillna(x.median()))
In [ ]:
temp.fillna(0, inplace=True)
In [ ]:
temp
Out[ ]:
mom beta bm illiq illiq_12m vol ivol
0 0.796305 0.9468 0.094476 0.000025 0.000502 0.026541 0.000000
1 1.145639 0.9654 0.109513 0.000039 0.000478 0.037722 0.012909
2 0.693690 1.0292 0.110009 0.000064 0.000474 0.041448 0.009032
3 0.558575 1.0238 0.201102 0.000043 0.000528 0.045109 0.021484
4 -0.048874 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ...
580477 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128
580478 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228
580479 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890
580480 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755
580481 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928

575818 rows × 7 columns

In [ ]:
df[cols] = temp.copy()
In [ ]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
exret 0
ym 0
size 0
rev 0
mom 0
beta 0
bm 0
illiq 0
illiq_12m 0
vol 0
ivol 0
In [ ]:
df
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 0.796305 0.9468 0.094476 0.000025 0.000502 0.026541 0.000000
1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 1.145639 0.9654 0.109513 0.000039 0.000478 0.037722 0.012909
2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 0.693690 1.0292 0.110009 0.000064 0.000474 0.041448 0.009032
3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 0.558575 1.0238 0.201102 0.000043 0.000528 0.045109 0.021484
4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 -0.048874 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 23.575535 0.007540 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128
580478 689009.XSHG 2024-01 -0.214983 2023-12 23.465801 -0.105996 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228
580479 689009.XSHG 2024-02 0.296451 2024-01 23.226170 -0.214983 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890
580480 689009.XSHG 2024-03 -0.013334 2024-02 23.487149 0.296451 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755
580481 689009.XSHG 2024-04 -0.073474 2024-03 23.460131 -0.013334 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928

575818 rows × 13 columns

In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 575818 entries, 0 to 580481
Data columns (total 13 columns):
 #   Column     Non-Null Count   Dtype    
---  ------     --------------   -----    
 0   secID      575818 non-null  object   
 1   ret_date   575818 non-null  period[M]
 2   exret      575818 non-null  float64  
 3   ym         575818 non-null  period[M]
 4   size       575818 non-null  float64  
 5   rev        575818 non-null  float64  
 6   mom        575818 non-null  float64  
 7   beta       575818 non-null  float64  
 8   bm         575818 non-null  float64  
 9   illiq      575818 non-null  float64  
 10  illiq_12m  575818 non-null  float64  
 11  vol        575818 non-null  float64  
 12  ivol       575818 non-null  float64  
dtypes: float64(10), object(1), period[M](2)
memory usage: 61.5+ MB

Use rank instead of numerical values¶

$$c_{i,t} = \frac{2}{N+1}CSrank(c^r_{i,t}) - 1$$

$c^r_{i,t}$ is the raw characteristic value, and $CSrank$ ranks it against all other firms in the same month $t$; the transformation maps each characteristic into the interval $(-1, 1)$. For example, with $N = 3$ firms, ranks $1, 2, 3$ map to $-1/2, 0, 1/2$.

In [ ]:
def csrank(df):
    return df.rank() * 2 / (len(df) + 1) - 1
In [ ]:
num_X_cols = df.select_dtypes('number').columns.drop('exret').tolist()
In [ ]:
num_X_cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [ ]:
df[['ret_date']+num_X_cols]
Out[ ]:
ret_date size rev mom beta bm illiq illiq_12m vol ivol
0 2008-01 24.909069 0.066834 0.796305 0.9468 0.094476 0.000025 0.000502 0.026541 0.000000
1 2008-02 24.792329 -0.140255 1.145639 0.9654 0.109513 0.000039 0.000478 0.037722 0.012909
2 2008-03 24.787814 -0.007450 0.693690 1.0292 0.110009 0.000064 0.000474 0.041448 0.009032
3 2008-04 24.626093 -0.152068 0.558575 1.0238 0.201102 0.000043 0.000528 0.045109 0.021484
4 2008-05 24.675221 0.047493 -0.048874 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ...
580477 2023-12 23.575535 0.007540 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128
580478 2024-01 23.465801 -0.105996 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228
580479 2024-02 23.226170 -0.214983 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890
580480 2024-03 23.487149 0.296451 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755
580481 2024-04 23.460131 -0.013334 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928

575818 rows × 10 columns

In [ ]:
df[['ret_date','size']].groupby('ret_date',group_keys=True).apply(csrank)
Out[ ]:
ret_date size
ret_date
2008-01 0 0.0 0.969559
196 0.0 0.990868
701 0.0 0.522070
1632 0.0 0.678843
1828 0.0 -0.231355
... ... ... ...
2024-04 580322 0.0 -0.224050
580355 0.0 -0.121426
580394 0.0 0.845672
580439 0.0 0.952213
580481 0.0 0.705445

575818 rows × 2 columns

In [ ]:
temp = df[['ret_date']+num_X_cols].groupby('ret_date',group_keys=True).apply(csrank)
In [ ]:
temp
Out[ ]:
ret_date size rev mom beta bm illiq illiq_12m vol ivol
ret_date
2008-01 0 0.0 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
196 0.0 0.990868 -0.990868 0.852359 0.662100 -0.375951 -0.996956 -0.989346 0.745814 0.000000
701 0.0 0.522070 -0.972603 0.552511 0.523592 0.283105 -0.223744 -0.595129 0.814307 0.000000
1632 0.0 0.678843 -0.506849 -0.517504 0.775495 -0.636225 -0.698630 -0.511416 0.493151 0.000000
1828 0.0 -0.231355 -0.945967 0.709285 0.000000 -0.403349 0.000000 0.000000 -0.982496 0.000000
... ... ... ... ... ... ... ... ... ... ... ...
2024-04 580322 0.0 -0.224050 0.788876 0.872699 0.518997 -0.316882 0.229926 0.310223 -0.073247 0.164121
580355 0.0 -0.121426 0.336859 -0.942812 0.849197 -0.210732 0.381120 0.228750 0.623580 0.095574
580394 0.0 0.845672 -0.481394 0.279279 0.686251 0.255778 -0.156287 -0.175872 -0.578143 -0.592244
580439 0.0 0.952213 -0.869957 0.562867 0.264787 -0.171563 -0.974540 -0.984332 -0.707795 -0.609871
580481 0.0 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143

575818 rows × 10 columns

In [ ]:
temp.drop('ret_date',axis=1).reset_index()
Out[ ]:
ret_date level_1 size rev mom beta bm illiq illiq_12m vol ivol
0 2008-01 0 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
1 2008-01 196 0.990868 -0.990868 0.852359 0.662100 -0.375951 -0.996956 -0.989346 0.745814 0.000000
2 2008-01 701 0.522070 -0.972603 0.552511 0.523592 0.283105 -0.223744 -0.595129 0.814307 0.000000
3 2008-01 1632 0.678843 -0.506849 -0.517504 0.775495 -0.636225 -0.698630 -0.511416 0.493151 0.000000
4 2008-01 1828 -0.231355 -0.945967 0.709285 0.000000 -0.403349 0.000000 0.000000 -0.982496 0.000000
... ... ... ... ... ... ... ... ... ... ... ...
575813 2024-04 580322 -0.224050 0.788876 0.872699 0.518997 -0.316882 0.229926 0.310223 -0.073247 0.164121
575814 2024-04 580355 -0.121426 0.336859 -0.942812 0.849197 -0.210732 0.381120 0.228750 0.623580 0.095574
575815 2024-04 580394 0.845672 -0.481394 0.279279 0.686251 0.255778 -0.156287 -0.175872 -0.578143 -0.592244
575816 2024-04 580439 0.952213 -0.869957 0.562867 0.264787 -0.171563 -0.974540 -0.984332 -0.707795 -0.609871
575817 2024-04 580481 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143

575818 rows × 11 columns

In [ ]:
temp = temp.drop('ret_date',axis=1).reset_index().set_index('level_1')
temp
Out[ ]:
ret_date size rev mom beta bm illiq illiq_12m vol ivol
level_1
0 2008-01 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
196 2008-01 0.990868 -0.990868 0.852359 0.662100 -0.375951 -0.996956 -0.989346 0.745814 0.000000
701 2008-01 0.522070 -0.972603 0.552511 0.523592 0.283105 -0.223744 -0.595129 0.814307 0.000000
1632 2008-01 0.678843 -0.506849 -0.517504 0.775495 -0.636225 -0.698630 -0.511416 0.493151 0.000000
1828 2008-01 -0.231355 -0.945967 0.709285 0.000000 -0.403349 0.000000 0.000000 -0.982496 0.000000
... ... ... ... ... ... ... ... ... ... ...
580322 2024-04 -0.224050 0.788876 0.872699 0.518997 -0.316882 0.229926 0.310223 -0.073247 0.164121
580355 2024-04 -0.121426 0.336859 -0.942812 0.849197 -0.210732 0.381120 0.228750 0.623580 0.095574
580394 2024-04 0.845672 -0.481394 0.279279 0.686251 0.255778 -0.156287 -0.175872 -0.578143 -0.592244
580439 2024-04 0.952213 -0.869957 0.562867 0.264787 -0.171563 -0.974540 -0.984332 -0.707795 -0.609871
580481 2024-04 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143

575818 rows × 10 columns

In [ ]:
df_rank = pd.merge(df.drop(num_X_cols, axis=1),
                   temp.drop('ret_date',axis=1),
                   left_index=True, right_index=True)
In [ ]:
del temp
In [ ]:
df_rank
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784
... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365
580478 689009.XSHG 2024-01 -0.214983 2023-12 0.691854 -0.841401 0.168831 0.515545 -0.574577 -0.515152 -0.553325 0.515545 0.622983
580479 689009.XSHG 2024-02 0.296451 2024-01 0.674975 -0.066928 -0.269872 0.369774 -0.590579 -0.492836 -0.555643 -0.341315 0.285574
580480 689009.XSHG 2024-03 -0.013334 2024-02 0.712774 0.947884 -0.253527 0.546042 -0.709444 -0.564655 -0.563480 -0.303292 0.719436
580481 689009.XSHG 2024-04 -0.073474 2024-03 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143

575818 rows × 13 columns

In [ ]:
df_rank['size'].describe()
Out[ ]:
count    5.758180e+05
mean     7.502544e-18
std      5.771543e-01
min     -9.996083e-01
25%     -4.998116e-01
50%      0.000000e+00
75%      4.998176e-01
max      9.996083e-01
Name: size, dtype: float64

Train, Validation, Test split¶

In [ ]:
df_rank['year'] = df_rank['ret_date'].dt.year
In [ ]:
time_idx = [value for (key, value) in sorted(df_rank.groupby('year').groups.items())]
In [ ]:
df_rank.groupby('year')['secID'].nunique()
Out[ ]:
year
2008    1463
2009    1530
2010    1841
2011    2142
2012    2383
2013    2432
2014    2549
2015    2772
2016    2941
2017    3392
2018    3522
2019    3648
2020    3961
2021    4422
2022    4770
2023    5110
2024    5111
Name: secID, dtype: int64
In [ ]:
df_rank.groupby('year')['secID'].count()
Out[ ]:
year
2008    16621
2009    17335
2010    19823
2011    23864
2012    26912
2013    28592
2014    29539
2015    31727
2016    33468
2017    37665
2018    41103
2019    41992
2020    44134
2021    49181
2022    54418
2023    59061
2024    20383
Name: secID, dtype: int64
In [ ]:
def list_flat(list_):
    return [item for sublist in list_ for item in sublist]
# This is the same as:
# def list_flat2(list_):
#     result = []
#     for sublist in list_:
#         for item in sublist:
#             result.append(item)
#     return result
In [ ]:
list_flat([[1,2,3],[3,4,5]])
Out[ ]:
[1, 2, 3, 3, 4, 5]
In [ ]:
np.array([[1,2,3],[3,4,5]]).flatten()
Out[ ]:
array([1, 2, 3, 3, 4, 5])
In [ ]:
df_rank
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol year
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365 2023
580478 689009.XSHG 2024-01 -0.214983 2023-12 0.691854 -0.841401 0.168831 0.515545 -0.574577 -0.515152 -0.553325 0.515545 0.622983 2024
580479 689009.XSHG 2024-02 0.296451 2024-01 0.674975 -0.066928 -0.269872 0.369774 -0.590579 -0.492836 -0.555643 -0.341315 0.285574 2024
580480 689009.XSHG 2024-03 -0.013334 2024-02 0.712774 0.947884 -0.253527 0.546042 -0.709444 -0.564655 -0.563480 -0.303292 0.719436 2024
580481 689009.XSHG 2024-04 -0.073474 2024-03 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143 2024

575818 rows × 14 columns

In [ ]:
# training, validation, testing scheme:
# 1. [2008-2011], [2012-2015], [2016]
# 2. [2008-2012], [2013-2016], [2017]
# ...
# last. [2008-2019], [2020-2023], [2024]
fulltrain_idx = []
cv_idx = []
test_idx = []
for i in range(4,len(time_idx)-4):
    train_idx = list_flat(time_idx[0:i])
    val_idx = list_flat(time_idx[i:i+4])
    fulltrain_idx.append(train_idx + val_idx)
    cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0], 
                   np.where(np.isin(fulltrain_idx[-1], val_idx))[0])) # GridSearchCV operates on positional arrays internally and drops the pandas index,
                                                                          # so cv_idx must hold positions within fulltrain_idx, numbered from 0
    test_idx.append(time_idx[i+4])
In [ ]:
df_rank.loc[fulltrain_idx[-1]]
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol year
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
580473 689009.XSHG 2023-08 -0.040985 2023-07 0.701600 -0.517200 -0.731600 0.584200 -0.678400 -0.577600 -0.471200 -0.330000 -0.473600 2023
580474 689009.XSHG 2023-09 0.040598 2023-08 0.708997 0.125000 -0.755175 0.343949 -0.653264 -0.599920 -0.489650 -0.016720 0.261545 2023
580475 689009.XSHG 2023-10 -0.060460 2023-09 0.720055 0.669768 -0.649574 0.496535 -0.684815 -0.497129 -0.520887 -0.263512 -0.524055 2023
580476 689009.XSHG 2023-11 0.007540 2023-10 0.716881 -0.502468 0.138006 0.430207 -0.665153 -0.103653 -0.522606 -0.249358 -0.039289 2023
580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365 2023

555435 rows × 14 columns

In [ ]:
# Example
a = [0,1,4,5,3000]
np.where(np.isin(a, [0,3000,4]))[0]
Out[ ]:
array([0, 2, 4])
In [ ]:
test_years = list(range(2016, 2025))
test_years
Out[ ]:
[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

Evaluation metrics¶

Clark and West (2007)
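
The out-of-sample $R^2$ used below benchmarks the predictions against a naive forecast of zero excess return (rather than against the historical mean):

$$R^2_{oos} = 1 - \frac{\sum_{i,t}\left(r_{i,t} - \hat{r}_{i,t}\right)^2}{\sum_{i,t} r_{i,t}^2}$$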

In [ ]:
def r2_oos(y_true, y_pred):
    return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)
In [ ]:
r2_oos_scorer = make_scorer(r2_oos)

Sklearn¶

The design philosophy of Scikit-learn (sklearn):

  • Estimators: anything that can estimate parameters from data, via fit(). Examples: imputers that fill missing values, linear regression, and so on.
  • Transformers (not the neural-network kind, not Vaswani et al. (2017)): anything that can transform data into new data, via transform(). Usually fit_transform() can be called directly.
  • Predictors: anything that can make predictions from data via predict(), e.g. linear regression.

Consistent naming conventions:

  • Hyperparameters are exposed as model.<hyperparameter>, e.g. model.n_estimators
  • Estimated parameters are exposed as model.<estimate>_ with a trailing underscore, e.g. model.feature_importances_

Data are stored as np.array or SciPy sparse matrices, avoiding the custom containers of other packages (such as pandas).

sklearn ships a large collection of machine-learning models and is easy to extend: custom models plug directly into the surrounding machinery (pipelines, grid search, and so on).
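
A toy sketch of these conventions (made-up data, not part of the pipeline below):

In [ ]:
# Estimator + predictor: LinearRegression learns coef_ and intercept_ via fit()
toy_X = np.array([[0.0], [1.0], [2.0]])
toy_y = np.array([1.0, 3.0, 5.0])
toy_reg = LinearRegression(fit_intercept=True)  # hyperparameter: toy_reg.fit_intercept
toy_reg.fit(toy_X, toy_y)
print(toy_reg.coef_, toy_reg.intercept_)        # estimated parameters: trailing underscore
print(toy_reg.predict(np.array([[3.0]])))       # predictor interface

# Transformer: StandardScaler learns mean_ and scale_ in fit() and applies them in transform()
scaler = StandardScaler()
print(scaler.fit_transform(toy_X))              # fit() followed by transform() in one call
print(scaler.mean_, scaler.scale_)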

Models¶

Linear regression¶

In [ ]:
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
In [ ]:
cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']
In [ ]:
model = LinearRegression()
In [ ]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.009411576536744626
Test year 2017 : -0.08839433918218265
Test year 2018 : -0.04979412952068807
Test year 2019 : 0.006460501095753468
Test year 2020 : -0.001548658826626248
Test year 2021 : 0.011487385386933058
Test year 2022 : -0.0009344902234940111
Test year 2023 : 0.009191975684269216
Test year 2024 : -0.0203421002634514
In [ ]:
cols = ['size','rev','illiq','ivol']
In [ ]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.009411576536744626
Test year 2017 : -0.08839433918218265
Test year 2018 : -0.04979412952068807
Test year 2019 : 0.006460501095753468
Test year 2020 : -0.001548658826626248
Test year 2021 : 0.011487385386933058
Test year 2022 : -0.0009344902234940111
Test year 2023 : 0.009191975684269216
Test year 2024 : -0.0203421002634514

Huber regressor¶

In [ ]:
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']
In [ ]:
model = HuberRegressor(alpha=0.01,epsilon=1.05)
In [ ]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : 0.006876742676993675
Test year 2017 : -0.029175894616541687
Test year 2018 : 0.009048190975758708
Test year 2019 : -0.018440250950227055
Test year 2020 : -0.013739947817012599
Test year 2021 : -0.008465853786439048
Test year 2022 : 0.010892561021420222
Test year 2023 : -0.002345351762501169
Test year 2024 : 0.015101267852206668

Random Forest¶

In [ ]:
cols = num_X_cols
cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [ ]:
hyperparam_grid = [
    {'n_estimators': [50], 'max_depth': [3,5,7], 
     'max_features': [3,5]}
]
In [ ]:
model = RandomForestRegressor(random_state=42)
In [ ]:
# Cross validation for period 0, i.e.
# train: [2008-2011], val: [2012-2015], test: [2016]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0], cols]
y_test = df_rank.loc[test_idx[0], 'exret']
In [ ]:
%%time
grid_search.fit(X_fulltrain, y_fulltrain)
CPU times: user 48.4 s, sys: 456 ms, total: 48.9 s
Wall time: 50.8 s
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_depth': [3, 5, 7], 'max_features': [3, 5],
                          'n_estimators': [50]}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [ ]:
grid_search.best_params_
Out[ ]:
{'max_depth': 5, 'max_features': 5, 'n_estimators': 50}
In [ ]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
0.027084789556580172 {'max_depth': 3, 'max_features': 3, 'n_estimators': 50}
0.028556999189124684 {'max_depth': 3, 'max_features': 5, 'n_estimators': 50}
0.05042647627227969 {'max_depth': 5, 'max_features': 3, 'n_estimators': 50}
0.05530932712278905 {'max_depth': 5, 'max_features': 5, 'n_estimators': 50}
0.04464716807967209 {'max_depth': 7, 'max_features': 3, 'n_estimators': 50}
0.05370787731186078 {'max_depth': 7, 'max_features': 5, 'n_estimators': 50}
In [ ]:
pd.DataFrame({"features":num_X_cols,"feature_importance":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',
                                                                                                                        ascending=False)
Out[ ]:
features feature_importance
5 illiq 0.260451
1 rev 0.212757
7 vol 0.120819
8 ivol 0.115962
0 size 0.102099
2 mom 0.067074
4 bm 0.049063
6 illiq_12m 0.044348
3 beta 0.027427
In [ ]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
-0.022154653336818875
In [ ]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.022154653336818875
Test year 2017 : -0.08063812247577906
Test year 2018 : -0.03875517775769799
Test year 2019 : 0.00843188559327912
Test year 2020 : 0.00426710793978613
Test year 2021 : 0.011844755800809903
Test year 2022 : -0.002779929058047914
Test year 2023 : 0.009093821371165545
Test year 2024 : -0.017035974030240375
CPU times: user 18min 26s, sys: 10.6 s, total: 18min 37s
Wall time: 19min 19s

Partial Least Squares¶

In [ ]:
cols = num_X_cols
cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [ ]:
model = PLSRegression(n_components=4)
In [ ]:
y_pred.reshape(-1).shape
Out[ ]:
(20383,)
In [ ]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X_fulltrain, y_fulltrain)
    y_pred = model.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.008579624491225069
Test year 2017 : -0.0932107045146382
Test year 2018 : -0.04912393839310547
Test year 2019 : 0.006105302409689872
Test year 2020 : -0.0015629368636205232
Test year 2021 : 0.011195966741407215
Test year 2022 : -0.000636538680146348
Test year 2023 : 0.009706227073746132
Test year 2024 : -0.02135918156321326
CPU times: user 10.3 s, sys: 783 ms, total: 11.1 s
Wall time: 3.4 s

Principal Component Regression¶

PCA transform¶

In [ ]:
cols = num_X_cols
cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [ ]:
pca = PCA(3, random_state=42)
In [ ]:
pca.fit(X_fulltrain)
Out[ ]:
PCA(n_components=3, random_state=42)
In [ ]:
pca.components_
Out[ ]:
array([[ 0.54095879, -0.09832092, -0.01437701,  0.13052989,  0.10593014,
        -0.54353902, -0.55735791, -0.16921922, -0.18577673],
       [ 0.1230896 ,  0.28038463,  0.28945179,  0.04301217, -0.37797118,
        -0.22714271, -0.12955155,  0.5400808 ,  0.56362587],
       [ 0.06274676, -0.20182262,  0.52534126, -0.60948758, -0.43840652,
        -0.02333879,  0.01991072, -0.30854347, -0.13975493]])
In [ ]:
pca.components_.shape
Out[ ]:
(3, 9)
In [ ]:
X_fulltrain.shape
Out[ ]:
(194413, 9)
In [ ]:
pca.components_.T.shape
Out[ ]:
(9, 3)
In [ ]:
np.matmul(X_fulltrain.values,pca.components_.T)
Out[ ]:
array([[ 1.03429184,  0.50775265,  0.41399241],
       [ 1.25904914, -0.35426324,  0.85397871],
       [ 1.13891321,  0.24847025,  0.21082479],
       ...,
       [-1.01192038,  0.71256279, -0.06717374],
       [-0.74623422, -0.52959361,  0.40740754],
       [-1.76547939,  0.46317872,  0.27584235]])
In [ ]:
pca.fit_transform(X_fulltrain)
Out[ ]:
array([[ 1.03429184,  0.50775265,  0.41399241],
       [ 1.25904914, -0.35426324,  0.85397871],
       [ 1.13891321,  0.24847025,  0.21082479],
       ...,
       [-1.01192038,  0.71256279, -0.06717374],
       [-0.74623422, -0.52959361,  0.40740754],
       [-1.76547939,  0.46317872,  0.27584235]])
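
Note that the manual projection matches pca.fit_transform here only because the rank-transformed features already have essentially zero cross-sectional mean (recall the describe() of size above); in general PCA centers the data first, i.e. fit_transform computes (X - pca.mean_) @ pca.components_.T.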

PCA regression¶

sklearn uses duck typing, so inheritance is not required: it is enough for the class to define the right methods, fit() (returning self), transform(), and fit_transform().

Inheriting directly is more convenient, though:

  • BaseEstimator is the most basic class in sklearn; the other classes all inherit from it. It provides the set_params() and get_params() methods.
  • TransformerMixin provides the fit_transform() method, so a class inheriting from it does not need to define fit_transform itself (see the toy sketch below).
  • Similarly, RegressorMixin provides a default score() method; fit() and predict() themselves still have to be defined, as in the PCARegressor below.
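
As a toy illustration of what the mixins provide (a sketch, not used elsewhere in this notebook):

In [ ]:
class DemeanTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, copy=True):
        self.copy = copy                 # hyperparameter, visible to get_params()
    def fit(self, X, y=None):
        self.mean_ = X.mean(axis=0)      # learned parameter: trailing underscore
        return self
    def transform(self, X):
        return X - self.mean_

demean = DemeanTransformer()
print(demean.get_params())               # from BaseEstimator: {'copy': True}
print(demean.fit_transform(np.eye(3)))   # from TransformerMixin: fit() then transform()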
In [ ]:
class PCARegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_components=3):
        self.n_components = n_components
    
    def fit(self, X, y):
        self.pca_ = PCA(n_components=self.n_components).fit(X)  # fit PCA on the features
        self.X_ = self.pca_.transform(X)                        # principal components
        self.reg_ = LinearRegression().fit(self.X_,y)           # OLS on the components
        return self
    
    def predict(self, X):
        self.pred_ = self.reg_.predict(self.pca_.transform(X))
        return self.pred_
In [ ]:
model = PCARegressor()
In [ ]:
model.fit(X=X_fulltrain, y=y_fulltrain)
Out[ ]:
PCARegressor()
In [ ]:
model.X_
Out[ ]:
array([[ 1.03429184,  0.50775265,  0.41399241],
       [ 1.25904914, -0.35426324,  0.85397871],
       [ 1.13891321,  0.24847025,  0.21082479],
       ...,
       [-1.01192038,  0.71256279, -0.06717374],
       [-0.74623422, -0.52959361,  0.40740754],
       [-1.76547939,  0.46317872,  0.27584235]])
In [ ]:
hyperparam_grid = [
    {'n_components': range(1, len(cols)+1)}
]
In [ ]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [ ]:
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=PCARegressor(),
             param_grid=[{'n_components': range(1, 10)}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [ ]:
grid_search.best_params_
Out[ ]:
{'n_components': 6}
In [ ]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
nan {'n_components': 1}
0.03874473370176844 {'n_components': 2}
0.037587746458808476 {'n_components': 3}
0.04997889197284674 {'n_components': 4}
0.04997497095219836 {'n_components': 5}
0.05161556457239314 {'n_components': 6}
0.041827129620731866 {'n_components': 7}
0.050300834943468646 {'n_components': 8}
0.0487322179924835 {'n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)
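
The nan for n_components=1 (and the RuntimeWarning) is simply np.sqrt applied to a negative validation score.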
In [ ]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test,y_pred=y_pred)
Out[ ]:
-0.01057358710785472
In [ ]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.01057358710785472
Test year 2017 : -0.08914596959361498
Test year 2018 : -0.04897289891375012
Test year 2019 : 0.006451234792761107
Test year 2020 : -0.00048657810249452815
Test year 2021 : 0.0095202141605224
Test year 2022 : -0.0061009049213132105
Test year 2023 : 0.008028420859995777
Test year 2024 : -0.022204856045419552
CPU times: user 1min 41s, sys: 9.21 s, total: 1min 50s
Wall time: 29.1 s

Pipeline¶

In [ ]:
pca = PCA()
linear_reg = LinearRegression()
pipeline = Pipeline(steps=[('pca',pca),
                           ('linear_regression', linear_reg)])
hyperparam_grid = {'pca__n_components': range(1,len(cols)+1)}
grid_search = GridSearchCV(pipeline, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [ ]:
%%time
grid_search.fit(X=X_fulltrain,y=y_fulltrain)
CPU times: user 4.1 s, sys: 430 ms, total: 4.53 s
Wall time: 1.54 s
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('linear_regression',
                                        LinearRegression())]),
             param_grid={'pca__n_components': range(1, 10)},
             return_train_score=True, scoring=make_scorer(r2_oos))
In [ ]:
grid_search.best_params_
Out[ ]:
{'pca__n_components': 6}
In [ ]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
nan {'pca__n_components': 1}
0.03874473370176844 {'pca__n_components': 2}
0.037587746458808476 {'pca__n_components': 3}
0.04997889197284674 {'pca__n_components': 4}
0.04997497095219836 {'pca__n_components': 5}
0.05161556457239422 {'pca__n_components': 6}
0.041827129620731866 {'pca__n_components': 7}
0.050300834943468646 {'pca__n_components': 8}
0.0487322179924835 {'pca__n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)
In [ ]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
-0.01057358710785472

Elastic Net¶

In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [ ]:
model = SGDRegressor(penalty='elasticnet')
In [ ]:
hyperparam_grid = [{'alpha':[0.001, 0.01, 0.1],
                    'l1_ratio':[0.15, 0.30, 0.5, 0.7]}]
In [ ]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [ ]:
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=SGDRegressor(penalty='elasticnet'),
             param_grid=[{'alpha': [0.001, 0.01, 0.1],
                          'l1_ratio': [0.15, 0.3, 0.5, 0.7]}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [ ]:
grid_search.best_params_
Out[ ]:
{'alpha': 0.001, 'l1_ratio': 0.7}
In [ ]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
-0.0110463976016415
In [ ]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.012516057005940606
Test year 2017 : -0.10558980314809396
Test year 2018 : -0.038706726553651816
Test year 2019 : 0.008269590365803325
Test year 2020 : 0.006087524412165979
Test year 2021 : 0.009994658939758372
Test year 2022 : -0.005719992513779193
Test year 2023 : 0.010637533125030907
Test year 2024 : -0.013783028576732859
CPU times: user 1min 41s, sys: 3.04 s, total: 1min 44s
Wall time: 43.1 s

Gradient Boosted Regression Trees¶

In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [ ]:
hyperparam_grid = [
    {'max_depth': [1,2,3,4,5,6], 
     'learning_rate': [0.1, 0.05, 0.01]}
]
In [ ]:
model = GradientBoostingRegressor()
In [ ]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [ ]:
%%time
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
CPU times: user 9min 37s, sys: 11 s, total: 9min 48s
Wall time: 10min 36s
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=GradientBoostingRegressor(),
             param_grid=[{'learning_rate': [0.1, 0.05, 0.01],
                          'max_depth': [1, 2, 3, 4, 5, 6]}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [ ]:
grid_search.best_params_
Out[ ]:
{'learning_rate': 0.1, 'max_depth': 3}
In [ ]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(mean_score, params)
0.0010920514714541918 {'learning_rate': 0.1, 'max_depth': 1}
0.0024649624915880075 {'learning_rate': 0.1, 'max_depth': 2}
0.005179742214071581 {'learning_rate': 0.1, 'max_depth': 3}
0.0027306548772843 {'learning_rate': 0.1, 'max_depth': 4}
0.0002629712278338081 {'learning_rate': 0.1, 'max_depth': 5}
-0.0016882062465084502 {'learning_rate': 0.1, 'max_depth': 6}
0.0005808239346339894 {'learning_rate': 0.05, 'max_depth': 1}
0.0018532340083319276 {'learning_rate': 0.05, 'max_depth': 2}
0.003954208330685383 {'learning_rate': 0.05, 'max_depth': 3}
0.004945961987173897 {'learning_rate': 0.05, 'max_depth': 4}
0.003509540482510065 {'learning_rate': 0.05, 'max_depth': 5}
0.0011919994497078257 {'learning_rate': 0.05, 'max_depth': 6}
-0.0012377392747824345 {'learning_rate': 0.01, 'max_depth': 1}
-0.00038711061142104874 {'learning_rate': 0.01, 'max_depth': 2}
0.0010593345400554677 {'learning_rate': 0.01, 'max_depth': 3}
0.003114775494288402 {'learning_rate': 0.01, 'max_depth': 4}
0.003665138899025089 {'learning_rate': 0.01, 'max_depth': 5}
0.003195682240093589 {'learning_rate': 0.01, 'max_depth': 6}
In [ ]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
-0.04906362476677084

Neural Nets¶

In [ ]:
tf.__version__
Out[ ]:
'2.8.0'
In [ ]:
keras.__version__
Out[ ]:
'2.8.0'
In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_train = X_fulltrain.values[cv_idx[0][0]]
y_train = y_fulltrain.values[cv_idx[0][0]]
X_val = X_fulltrain.values[cv_idx[0][1]]
y_val = y_fulltrain.values[cv_idx[0][1]]
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [ ]:
X_train.shape
Out[ ]:
(77643, 9)
In [ ]:
X_val.shape
Out[ ]:
(116770, 9)
In [ ]:
nn_model = keras.models.Sequential()
nn_model.add(keras.layers.InputLayer(input_shape=[X_fulltrain.shape[1]]))
nn_model.add(keras.layers.Dense(8, activation='relu'))
nn_model.add(keras.layers.Dense(4, activation='relu'))
nn_model.add(keras.layers.Dense(1))
2024-05-13 10:01:17.440490: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
In [ ]:
nn_model.compile(loss='mse',optimizer='sgd')
In [ ]:
nn_model.fit(X_train, y_train, epochs=10,
             validation_data=(X_val,y_val))
Epoch 1/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0275
Epoch 2/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0277
Epoch 3/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0278
Epoch 4/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0277
Epoch 5/10
2427/2427 [==============================] - 4s 2ms/step - loss: 0.0223 - val_loss: 0.0274
Epoch 6/10
2427/2427 [==============================] - 7s 3ms/step - loss: 0.0223 - val_loss: 0.0273
Epoch 7/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276
Epoch 8/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276
Epoch 9/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276
Epoch 10/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276
Out[ ]:
<keras.callbacks.History at 0x7f7d85059d90>
In [ ]:
y_pred = nn_model.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
0.003361474358112737

GridSearchCV Neural Nets¶

In [ ]:
def build_model(learning_rate=0.003):
    nn_model = keras.models.Sequential()
    nn_model.add(keras.layers.InputLayer(input_shape=[9]))
    nn_model.add(keras.layers.Dense(8, activation='relu'))
    nn_model.add(keras.layers.Dense(4, activation='relu'))
    nn_model.add(keras.layers.Dense(1))
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate) 
    nn_model.compile(loss="mse", optimizer=optimizer)
    return nn_model
In [ ]:
# from scikeras.wrappers import KerasRegressor
# keras_reg = KerasRegressor(build_model)
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/3997882518.py:3: DeprecationWarning: KerasRegressor is deprecated, use Sci-Keras (https://github.com/adriangb/scikeras) instead. See https://www.adriangb.com/scikeras/stable/migration.html for help migrating.
  keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
In [ ]:
hyperparams_grid = {
    'learning_rate':[0.003,0.001]
}
In [ ]:
nn_search_cv = GridSearchCV(keras_reg, hyperparams_grid, cv=[cv_idx[0]])
In [ ]:
nn_search_cv.fit(X_fulltrain, y_fulltrain, epochs=10,
                 validation_data=(X_val,y_val))
Epoch 1/10
2427/2427 [==============================] - 6s 2ms/step - loss: 0.0346 - val_loss: 0.0313
Epoch 2/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0249 - val_loss: 0.0295
Epoch 3/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0235 - val_loss: 0.0282
Epoch 4/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - val_loss: 0.0279
Epoch 5/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0227 - val_loss: 0.0279
Epoch 6/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0226 - val_loss: 0.0277
Epoch 7/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0225 - val_loss: 0.0277
Epoch 8/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0224 - val_loss: 0.0275
Epoch 9/10
2427/2427 [==============================] - 7s 3ms/step - loss: 0.0224 - val_loss: 0.0278
Epoch 10/10
2427/2427 [==============================] - 6s 3ms/step - loss: 0.0224 - val_loss: 0.0275
3650/3650 [==============================] - 5s 1ms/step - loss: 0.0275
Epoch 1/10
2427/2427 [==============================] - 6s 2ms/step - loss: 0.0432 - val_loss: 0.0342
Epoch 2/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0278 - val_loss: 0.0303
Epoch 3/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0251 - val_loss: 0.0291
Epoch 4/10
2427/2427 [==============================] - 6s 2ms/step - loss: 0.0241 - val_loss: 0.0286
Epoch 5/10
2427/2427 [==============================] - 6s 3ms/step - loss: 0.0236 - val_loss: 0.0283
Epoch 6/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0233 - val_loss: 0.0281
Epoch 7/10
2427/2427 [==============================] - 6s 2ms/step - loss: 0.0232 - val_loss: 0.0280
Epoch 8/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0230 - val_loss: 0.0280
Epoch 9/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - val_loss: 0.0279
Epoch 10/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - val_loss: 0.0279
3650/3650 [==============================] - 5s 1ms/step - loss: 0.0279
Epoch 1/10
6076/6076 [==============================] - 10s 2ms/step - loss: 0.0297 - val_loss: 0.0273
Epoch 2/10
6076/6076 [==============================] - 10s 2ms/step - loss: 0.0255 - val_loss: 0.0270
Epoch 3/10
6076/6076 [==============================] - 9s 1ms/step - loss: 0.0253 - val_loss: 0.0269
Epoch 4/10
6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0268
Epoch 5/10
6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0268
Epoch 6/10
6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0268
Epoch 7/10
6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0268
Epoch 8/10
6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0269
Epoch 9/10
6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0269
Epoch 10/10
6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0267
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x7f7d851852b0>,
             param_grid={'learning_rate': [0.003, 0.001]})
In [ ]:
y_pred = nn_search_cv.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
-0.02629758156221018

Transformation pipeline example¶

In [ ]:
df_rank
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol year
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365 2023
580478 689009.XSHG 2024-01 -0.214983 2023-12 0.691854 -0.841401 0.168831 0.515545 -0.574577 -0.515152 -0.553325 0.515545 0.622983 2024
580479 689009.XSHG 2024-02 0.296451 2024-01 0.674975 -0.066928 -0.269872 0.369774 -0.590579 -0.492836 -0.555643 -0.341315 0.285574 2024
580480 689009.XSHG 2024-03 -0.013334 2024-02 0.712774 0.947884 -0.253527 0.546042 -0.709444 -0.564655 -0.563480 -0.303292 0.719436 2024
580481 689009.XSHG 2024-04 -0.073474 2024-03 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143 2024

575818 rows × 14 columns

In [ ]:
illiq_idx = 5      # position of 'illiq' in cols
illiq_12m_idx = 6  # position of 'illiq_12m' in cols
In [ ]:
X_fulltrain
Out[ ]:
size rev mom beta bm illiq illiq_12m vol ivol
0 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
1 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760
2 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835
3 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785
4 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784
... ... ... ... ... ... ... ... ... ...
556879 -0.911797 -0.975744 0.000000 -0.550165 -0.223815 0.953693 0.000000 0.351709 -0.339214
556880 -0.890682 -0.083639 0.000000 -0.468452 -0.264123 0.271460 0.000000 -0.356566 -0.264123
556881 -0.902493 0.521261 0.000000 -0.303519 -0.313050 0.346041 0.000000 0.332845 0.835777
556882 -0.909058 0.020902 0.000000 -0.273194 -0.325266 0.573891 0.000000 -0.416208 -0.318665
556883 -0.881145 0.871607 0.211299 -0.427733 -0.551724 0.750550 0.995598 0.090242 0.579604

194413 rows × 9 columns

In [ ]:
class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_avg_illiq=True):
        self.add_avg_illiq = add_avg_illiq
    def fit(self, X, y=None):
        return self  # stateless: nothing to learn from the data
    def transform(self, X, y=None):
        # append the average of the illiq and illiq_12m rank columns as a new feature
        avg_illiq = (X[:,illiq_idx] + X[:, illiq_12m_idx]) / 2
        return np.c_[X, avg_illiq]

feature_adder = FeatureAdder()
In [ ]:
X_fulltrain.values.shape
Out[ ]:
(194413, 9)
In [ ]:
X_fulltrain_new = feature_adder.transform(X_fulltrain.values)
In [ ]:
X_fulltrain_new
Out[ ]:
array([[ 0.9695586 , -0.85844749,  0.        , ...,  0.27549467,
         0.        , -0.48934551],
       [ 0.97153558, -0.45168539,  0.        , ..., -0.63595506,
        -0.7917603 , -0.47977528],
       [ 0.96733482, -0.70007424,  0.        , ...,  0.437268  ,
        -0.62583519, -0.47698589],
       ...,
       [-0.90249267,  0.521261  ,  0.        , ...,  0.33284457,
         0.83577713,  0.17302053],
       [-0.90905757,  0.02090209,  0.        , ..., -0.41620829,
        -0.3186652 ,  0.28694536],
       [-0.88114453,  0.87160675,  0.21129861, ...,  0.09024211,
         0.57960382,  0.8730741 ]])
In [ ]:
X_fulltrain_new.shape
Out[ ]:
(194413, 10)
In [ ]:
# This can be added to a pipeline
pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('std_scaler', StandardScaler())
])
In [ ]:
pipeline.fit_transform(X_fulltrain.values)
Out[ ]:
array([[ 1.68015418e+00, -1.48771512e+00, -7.60380513e-19, ...,
         4.77437532e-01,  4.06742109e-18, -9.01081929e-01],
       [ 1.68358010e+00, -7.82784266e-01, -7.60380513e-19, ...,
        -1.10212226e+00, -1.37679088e+00, -8.83459288e-01],
       [ 1.67630057e+00, -1.21324955e+00, -7.60380513e-19, ...,
         7.57793805e-01, -1.08826394e+00, -8.78322906e-01],
       ...,
       [-1.56393521e+00,  9.03360864e-01, -7.60380513e-19, ...,
         5.76826008e-01,  1.45333168e+00,  3.18600392e-01],
       [-1.57531157e+00,  3.62239461e-02, -7.60380513e-19, ...,
        -7.21296915e-01, -5.54126473e-01,  5.28381837e-01],
       [-1.52694089e+00,  1.51052051e+00,  3.66337572e-01, ...,
         1.56391306e-01,  1.00787227e+00,  1.60768063e+00]])