import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, HuberRegressor, SGDRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline
# import lightgbm as lgb
import tensorflow as tf
from tensorflow import keras
df = pd.read_pickle('../../data/factor_exposure/all_exposure.pkl')
df.drop('tradeDate',axis=1,inplace=True)
df
 | secID | ret_date | ret | rf | exret | ym | mktcap | size | rev | beta | bm | illiq | illiq_12m | mom_date | mom | vol | ivol | vol_clip | ivol_clip
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 000001.XSHE | 2007-07 | 0.316497 | 0.002481 | 0.314016 | 2007-06 | 4.266117e+10 | 24.476555 | NaN | 0.4614 | 0.123739 | NaN | NaN | NaT | NaN | NaN | NaN | NaN | NaN |
1 | 000001.XSHE | 2007-08 | 0.048855 | 0.002404 | 0.046451 | 2007-07 | 5.616330e+10 | 24.751529 | 0.314016 | 0.6423 | 0.093992 | 0.000040 | NaN | 2007-06 | NaN | 0.042521 | NaN | 0.042521 | NaN |
2 | 000001.XSHE | 2007-09 | 0.052105 | 0.002621 | 0.049484 | 2007-08 | 5.890714e+10 | 24.799228 | 0.046451 | 0.7722 | 0.097085 | 0.000020 | NaN | 2007-07 | NaN | 0.033926 | NaN | 0.033926 | NaN |
3 | 000001.XSHE | 2007-10 | 0.201851 | 0.003095 | 0.198756 | 2007-09 | 6.197651e+10 | 24.850021 | 0.049484 | 0.7596 | 0.092276 | 0.000025 | NaN | 2007-08 | NaN | 0.023872 | NaN | 0.023872 | NaN |
4 | 000001.XSHE | 2007-11 | -0.249116 | 0.003780 | -0.252896 | 2007-10 | 7.448652e+10 | 25.033884 | 0.198756 | 0.7988 | 0.083411 | 0.000030 | NaN | 2007-09 | NaN | 0.035921 | NaN | 0.035921 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
504875 | 900957.XSHG | 2021-12 | 0.035831 | 0.002026 | 0.033805 | 2021-11 | 1.120560e+08 | 18.534509 | -0.042588 | NaN | NaN | 0.070056 | 0.062884 | 2021-10 | 0.216730 | 0.009639 | 0.007046 | 0.009639 | 0.007046 |
504876 | 900957.XSHG | 2022-01 | -0.022013 | 0.002014 | -0.024027 | 2021-12 | 1.161040e+08 | 18.569997 | 0.033805 | NaN | NaN | 0.078037 | 0.059672 | 2021-11 | 0.211045 | 0.010961 | 0.008692 | 0.010961 | 0.008692 |
504877 | 900957.XSHG | 2022-02 | -0.011254 | 0.001921 | -0.013175 | 2022-01 | 1.135280e+08 | 18.547560 | -0.024027 | NaN | NaN | 0.044515 | 0.058502 | 2021-12 | -0.059172 | 0.010559 | 0.008409 | 0.010559 | 0.008409 |
504878 | 900957.XSHG | 2022-03 | -0.034146 | 0.001919 | -0.036066 | 2022-02 | 1.122400e+08 | 18.536150 | -0.013175 | NaN | NaN | 0.057218 | 0.060208 | 2022-01 | -0.157182 | 0.006517 | 0.004195 | 0.006517 | 0.004195 |
504879 | 900957.XSHG | NaT | NaN | NaN | NaN | 2022-03 | 1.083760e+08 | 18.501117 | -0.036066 | NaN | NaN | NaN | 0.062442 | 2022-02 | -0.117647 | NaN | NaN | NaN | NaN |
504880 rows × 19 columns
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 4853
ret 31888
rf 4853
exret 31888
ym 0
mktcap 23011
size 23011
rev 30586
beta 40704
bm 21512
illiq 41579
illiq_12m 91808
mom_date 3547
mom 49225
vol 30782
ivol 54678
vol_clip 30782
ivol_clip 54678
Rows where ret_date is NA are dropped; they sit at the most recent edge of the data, where the next-month return does not exist yet.
df = df[~df['ret_date'].isna()].copy()
df
 | secID | ret_date | ret | rf | exret | ym | mktcap | size | rev | beta | bm | illiq | illiq_12m | mom_date | mom | vol | ivol | vol_clip | ivol_clip
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 000001.XSHE | 2007-07 | 0.316497 | 0.002481 | 0.314016 | 2007-06 | 4.266117e+10 | 24.476555 | NaN | 0.4614 | 0.123739 | NaN | NaN | NaT | NaN | NaN | NaN | NaN | NaN |
1 | 000001.XSHE | 2007-08 | 0.048855 | 0.002404 | 0.046451 | 2007-07 | 5.616330e+10 | 24.751529 | 0.314016 | 0.6423 | 0.093992 | 0.000040 | NaN | 2007-06 | NaN | 0.042521 | NaN | 0.042521 | NaN |
2 | 000001.XSHE | 2007-09 | 0.052105 | 0.002621 | 0.049484 | 2007-08 | 5.890714e+10 | 24.799228 | 0.046451 | 0.7722 | 0.097085 | 0.000020 | NaN | 2007-07 | NaN | 0.033926 | NaN | 0.033926 | NaN |
3 | 000001.XSHE | 2007-10 | 0.201851 | 0.003095 | 0.198756 | 2007-09 | 6.197651e+10 | 24.850021 | 0.049484 | 0.7596 | 0.092276 | 0.000025 | NaN | 2007-08 | NaN | 0.023872 | NaN | 0.023872 | NaN |
4 | 000001.XSHE | 2007-11 | -0.249116 | 0.003780 | -0.252896 | 2007-10 | 7.448652e+10 | 25.033884 | 0.198756 | 0.7988 | 0.083411 | 0.000030 | NaN | 2007-09 | NaN | 0.035921 | NaN | 0.035921 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
504874 | 900957.XSHG | 2021-11 | -0.040625 | 0.001963 | -0.042588 | 2021-10 | 1.168400e+08 | 18.576316 | -0.042478 | NaN | NaN | 0.058457 | 0.067646 | 2021-09 | 0.285164 | 0.011663 | 0.007700 | 0.011663 | 0.007700 |
504875 | 900957.XSHG | 2021-12 | 0.035831 | 0.002026 | 0.033805 | 2021-11 | 1.120560e+08 | 18.534509 | -0.042588 | NaN | NaN | 0.070056 | 0.062884 | 2021-10 | 0.216730 | 0.009639 | 0.007046 | 0.009639 | 0.007046 |
504876 | 900957.XSHG | 2022-01 | -0.022013 | 0.002014 | -0.024027 | 2021-12 | 1.161040e+08 | 18.569997 | 0.033805 | NaN | NaN | 0.078037 | 0.059672 | 2021-11 | 0.211045 | 0.010961 | 0.008692 | 0.010961 | 0.008692 |
504877 | 900957.XSHG | 2022-02 | -0.011254 | 0.001921 | -0.013175 | 2022-01 | 1.135280e+08 | 18.547560 | -0.024027 | NaN | NaN | 0.044515 | 0.058502 | 2021-12 | -0.059172 | 0.010559 | 0.008409 | 0.010559 | 0.008409 |
504878 | 900957.XSHG | 2022-03 | -0.034146 | 0.001919 | -0.036066 | 2022-02 | 1.122400e+08 | 18.536150 | -0.013175 | NaN | NaN | 0.057218 | 0.060208 | 2022-01 | -0.157182 | 0.006517 | 0.004195 | 0.006517 | 0.004195 |
500027 rows × 19 columns
Momentum starts from 2008-01. For simplicity, restrict the whole sample to start from 2008-01.
df.loc[~df['mom'].isna(),'ret_date'].min()
Period('2008-01', 'M')
df = df[df['ret_date'] >= '2008-01'].copy()
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
ret 26378
rf 0
exret 26378
ym 0
mktcap 22483
size 22483
rev 29845
beta 39159
bm 20363
illiq 36007
illiq_12m 79832
mom_date 3381
mom 36211
vol 25455
ivol 37368
vol_clip 25455
ivol_clip 37368
The remaining NA values have at least three sources.
NA values in returns are dropped directly.
df = df[~df['ret'].isna()].copy()
df
 | secID | ret_date | ret | rf | exret | ym | mktcap | size | rev | beta | bm | illiq | illiq_12m | mom_date | mom | vol | ivol | vol_clip | ivol_clip
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
6 | 000001.XSHE | 2008-01 | -0.137306 | 0.002949 | -0.140255 | 2007-12 | 6.574629e+10 | 24.909069 | 0.066834 | 0.9468 | 0.094476 | 0.000025 | NaN | 2007-11 | NaN | 0.027254 | NaN | 0.027254 | NaN |
7 | 000001.XSHE | 2008-02 | -0.004504 | 0.002946 | -0.007450 | 2008-01 | 5.850212e+10 | 24.792329 | -0.140255 | 0.9654 | 0.109513 | 0.000039 | NaN | 2007-12 | NaN | 0.037722 | 0.013266 | 0.037722 | 0.013266 |
8 | 000001.XSHE | 2008-03 | -0.149321 | 0.002746 | -0.152068 | 2008-02 | 5.823860e+10 | 24.787814 | -0.007450 | 1.0292 | 0.110009 | 0.000064 | NaN | 2008-01 | NaN | 0.041448 | 0.009474 | 0.041448 | 0.009474 |
9 | 000001.XSHE | 2008-04 | 0.050355 | 0.002862 | 0.047493 | 2008-03 | 4.954234e+10 | 24.626093 | -0.152068 | 1.0238 | 0.201102 | 0.000043 | NaN | 2008-02 | NaN | 0.045109 | 0.021746 | 0.045109 | 0.021746 |
10 | 000001.XSHE | 2008-05 | -0.148211 | 0.002953 | -0.151164 | 2008-04 | 5.203702e+10 | 24.675221 | 0.047493 | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 2008-03 | NaN | 0.046323 | 0.014474 | 0.046323 | 0.014474 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
504874 | 900957.XSHG | 2021-11 | -0.040625 | 0.001963 | -0.042588 | 2021-10 | 1.168400e+08 | 18.576316 | -0.042478 | NaN | NaN | 0.058457 | 0.067646 | 2021-09 | 0.285164 | 0.011663 | 0.007700 | 0.011663 | 0.007700 |
504875 | 900957.XSHG | 2021-12 | 0.035831 | 0.002026 | 0.033805 | 2021-11 | 1.120560e+08 | 18.534509 | -0.042588 | NaN | NaN | 0.070056 | 0.062884 | 2021-10 | 0.216730 | 0.009639 | 0.007046 | 0.009639 | 0.007046 |
504876 | 900957.XSHG | 2022-01 | -0.022013 | 0.002014 | -0.024027 | 2021-12 | 1.161040e+08 | 18.569997 | 0.033805 | NaN | NaN | 0.078037 | 0.059672 | 2021-11 | 0.211045 | 0.010961 | 0.008692 | 0.010961 | 0.008692 |
504877 | 900957.XSHG | 2022-02 | -0.011254 | 0.001921 | -0.013175 | 2022-01 | 1.135280e+08 | 18.547560 | -0.024027 | NaN | NaN | 0.044515 | 0.058502 | 2021-12 | -0.059172 | 0.010559 | 0.008409 | 0.010559 | 0.008409 |
504878 | 900957.XSHG | 2022-03 | -0.034146 | 0.001919 | -0.036066 | 2022-02 | 1.122400e+08 | 18.536150 | -0.013175 | NaN | NaN | 0.057218 | 0.060208 | 2022-01 | -0.157182 | 0.006517 | 0.004195 | 0.006517 | 0.004195 |
461021 rows × 19 columns
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
ret 0
rf 0
exret 0
ym 0
mktcap 0
size 0
rev 7328
beta 25845
bm 16422
illiq 11127
illiq_12m 62624
mom_date 3381
mom 35755
vol 2799
ivol 12482
vol_clip 2799
ivol_clip 12482
df.drop(['mom_date','mktcap','vol_clip','ivol_clip'],axis=1,inplace=True)
df.drop(['ret','rf'],axis=1,inplace=True)
df.reset_index(inplace=True,drop=True)
df
 | secID | ret_date | exret | ym | size | rev | beta | bm | illiq | illiq_12m | mom | vol | ivol
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 24.909069 | 0.066834 | 0.9468 | 0.094476 | 0.000025 | NaN | NaN | 0.027254 | NaN |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 24.792329 | -0.140255 | 0.9654 | 0.109513 | 0.000039 | NaN | NaN | 0.037722 | 0.013266 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 24.787814 | -0.007450 | 1.0292 | 0.110009 | 0.000064 | NaN | NaN | 0.041448 | 0.009474 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 24.626093 | -0.152068 | 1.0238 | 0.201102 | 0.000043 | NaN | NaN | 0.045109 | 0.021746 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 24.675221 | 0.047493 | 1.0212 | 0.206701 | 0.000051 | 0.000038 | NaN | 0.046323 | 0.014474 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
461016 | 900957.XSHG | 2021-11 | -0.042588 | 2021-10 | 18.576316 | -0.042478 | NaN | NaN | 0.058457 | 0.067646 | 0.285164 | 0.011663 | 0.007700 |
461017 | 900957.XSHG | 2021-12 | 0.033805 | 2021-11 | 18.534509 | -0.042588 | NaN | NaN | 0.070056 | 0.062884 | 0.216730 | 0.009639 | 0.007046 |
461018 | 900957.XSHG | 2022-01 | -0.024027 | 2021-12 | 18.569997 | 0.033805 | NaN | NaN | 0.078037 | 0.059672 | 0.211045 | 0.010961 | 0.008692 |
461019 | 900957.XSHG | 2022-02 | -0.013175 | 2022-01 | 18.547560 | -0.024027 | NaN | NaN | 0.044515 | 0.058502 | -0.059172 | 0.010559 | 0.008409 |
461020 | 900957.XSHG | 2022-03 | -0.036066 | 2022-02 | 18.536150 | -0.013175 | NaN | NaN | 0.057218 | 0.060208 | -0.157182 | 0.006517 | 0.004195 |
461021 rows × 13 columns
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
exret 0
ym 0
size 0
rev 7328
beta 25845
bm 16422
illiq 11127
illiq_12m 62624
mom 35755
vol 2799
ivol 12482
# Drop observations where reversal (rev) is NA; fill the other characteristics with the monthly median
df = df[~df['rev'].isna()].copy()
cols = ['mom','beta','bm','illiq','illiq_12m','vol','ivol']
df
 | secID | ret_date | exret | ym | size | rev | beta | bm | illiq | illiq_12m | mom | vol | ivol
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 24.909069 | 0.066834 | 0.9468 | 0.094476 | 0.000025 | NaN | NaN | 0.027254 | NaN |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 24.792329 | -0.140255 | 0.9654 | 0.109513 | 0.000039 | NaN | NaN | 0.037722 | 0.013266 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 24.787814 | -0.007450 | 1.0292 | 0.110009 | 0.000064 | NaN | NaN | 0.041448 | 0.009474 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 24.626093 | -0.152068 | 1.0238 | 0.201102 | 0.000043 | NaN | NaN | 0.045109 | 0.021746 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 24.675221 | 0.047493 | 1.0212 | 0.206701 | 0.000051 | 0.000038 | NaN | 0.046323 | 0.014474 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
461016 | 900957.XSHG | 2021-11 | -0.042588 | 2021-10 | 18.576316 | -0.042478 | NaN | NaN | 0.058457 | 0.067646 | 0.285164 | 0.011663 | 0.007700 |
461017 | 900957.XSHG | 2021-12 | 0.033805 | 2021-11 | 18.534509 | -0.042588 | NaN | NaN | 0.070056 | 0.062884 | 0.216730 | 0.009639 | 0.007046 |
461018 | 900957.XSHG | 2022-01 | -0.024027 | 2021-12 | 18.569997 | 0.033805 | NaN | NaN | 0.078037 | 0.059672 | 0.211045 | 0.010961 | 0.008692 |
461019 | 900957.XSHG | 2022-02 | -0.013175 | 2022-01 | 18.547560 | -0.024027 | NaN | NaN | 0.044515 | 0.058502 | -0.059172 | 0.010559 | 0.008409 |
461020 | 900957.XSHG | 2022-03 | -0.036066 | 2022-02 | 18.536150 | -0.013175 | NaN | NaN | 0.057218 | 0.060208 | -0.157182 | 0.006517 | 0.004195 |
453693 rows × 13 columns
temp = df.groupby('ret_date',as_index=False)[cols].transform(lambda x: x.fillna(x.median()))
temp.fillna(0, inplace=True)
df[cols] = temp.copy()
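Why two steps: the groupby-transform fills NAs with the within-month median, but if a column is entirely NaN in some month, the median itself is NaN and nothing gets filled; the second fillna(0) catches those cases. A toy illustration with made-up data:
toy = pd.DataFrame({'g': ['a', 'a', 'b', 'b'],
                    'x': [1.0, np.nan, np.nan, np.nan]})
toy.groupby('g')['x'].transform(lambda s: s.fillna(s.median()))
# group 'a': the NaN becomes the group median 1.0
# group 'b': all values are NaN, so the median is NaN and the NaNs survive;
# that is exactly what the subsequent temp.fillna(0) handles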
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
exret 0
ym 0
size 0
rev 0
beta 0
bm 0
illiq 0
illiq_12m 0
mom 0
vol 0
ivol 0
df
 | secID | ret_date | exret | ym | size | rev | beta | bm | illiq | illiq_12m | mom | vol | ivol
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 24.909069 | 0.066834 | 0.94680 | 0.094476 | 0.000025 | 0.000536 | 0.777814 | 0.027254 | 0.000000 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 24.792329 | -0.140255 | 0.96540 | 0.109513 | 0.000039 | 0.000524 | 1.119102 | 0.037722 | 0.013266 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 24.787814 | -0.007450 | 1.02920 | 0.110009 | 0.000064 | 0.000527 | 0.656120 | 0.041448 | 0.009474 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 24.626093 | -0.152068 | 1.02380 | 0.201102 | 0.000043 | 0.000565 | 0.545260 | 0.045109 | 0.021746 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 24.675221 | 0.047493 | 1.02120 | 0.206701 | 0.000051 | 0.000038 | -0.055889 | 0.046323 | 0.014474 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
461016 | 900957.XSHG | 2021-11 | -0.042588 | 2021-10 | 18.576316 | -0.042478 | 0.47010 | 0.375432 | 0.058457 | 0.067646 | 0.285164 | 0.011663 | 0.007700 |
461017 | 900957.XSHG | 2021-12 | 0.033805 | 2021-11 | 18.534509 | -0.042588 | 0.46980 | 0.332403 | 0.070056 | 0.062884 | 0.216730 | 0.009639 | 0.007046 |
461018 | 900957.XSHG | 2022-01 | -0.024027 | 2021-12 | 18.569997 | 0.033805 | 0.46910 | 0.324354 | 0.078037 | 0.059672 | 0.211045 | 0.010961 | 0.008692 |
461019 | 900957.XSHG | 2022-02 | -0.013175 | 2022-01 | 18.547560 | -0.024027 | 0.55830 | 0.356716 | 0.044515 | 0.058502 | -0.059172 | 0.010559 | 0.008409 |
461020 | 900957.XSHG | 2022-03 | -0.036066 | 2022-02 | 18.536150 | -0.013175 | 0.63515 | 0.342607 | 0.057218 | 0.060208 | -0.157182 | 0.006517 | 0.004195 |
453693 rows × 13 columns
$c^r_{i,t}$ is the raw characteristic value, and $CSrank$ ranks it against all other firms in the same month $t$. The ranks are then mapped into the interval $(-1, 1)$:

$$c_{i,t} = \frac{2 \, CSrank(c^r_{i,t})}{N_t + 1} - 1,$$

where $N_t$ is the number of firms in month $t$. This is exactly what csrank below implements.
def csrank(df):
    return df.rank() * 2 / (len(df) + 1) - 1
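A quick check on a toy series (made-up numbers): the four values get ranks 2, 1, 4, 3, which map to evenly spaced points inside $(-1, 1)$.
csrank(pd.Series([0.3, 0.1, 0.7, 0.5]))
# 0   -0.2
# 1   -0.6
# 2    0.6
# 3    0.2
# dtype: float64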
num_X_cols = df.select_dtypes('number').columns.drop('exret').tolist()
num_X_cols
['size', 'rev', 'beta', 'bm', 'illiq', 'illiq_12m', 'mom', 'vol', 'ivol']
temp = df[['ret_date']+num_X_cols].groupby('ret_date').apply(csrank)
temp
 | ret_date | size | rev | beta | bm | illiq | illiq_12m | mom | vol | ivol
---|---|---|---|---|---|---|---|---|---|---
0 | 0.0 | 0.970696 | -0.853480 | 0.123810 | -0.696703 | -0.979487 | 0.000000 | 0.000000 | 0.321612 | 0.000000 |
1 | 0.0 | 0.972444 | -0.412618 | -0.219724 | -0.641769 | -0.960841 | 0.000000 | 0.000000 | -0.601160 | -0.718637 |
2 | 0.0 | 0.968481 | -0.664756 | 0.375358 | -0.588825 | -0.958453 | 0.000000 | 0.000000 | 0.415473 | -0.616046 |
3 | 0.0 | 0.969936 | 0.483178 | -0.079456 | -0.176807 | -0.975662 | 0.000000 | 0.000000 | 0.218325 | 0.400143 |
4 | 0.0 | 0.965567 | 0.522238 | -0.301291 | -0.318508 | -0.971306 | -0.977044 | 0.000000 | -0.707317 | -0.519369 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
461016 | 0.0 | -0.990901 | -0.011374 | -0.000227 | 0.000000 | 0.989536 | 0.989991 | 0.556415 | -0.897179 | -0.702457 |
461017 | 0.0 | -0.990967 | -0.783198 | 0.000226 | 0.000000 | 0.988708 | 0.989612 | 0.585818 | -0.900181 | -0.775068 |
461018 | 0.0 | -0.991033 | 0.121722 | 0.000000 | 0.000000 | 0.993275 | 0.989240 | 0.392961 | -0.862811 | -0.724277 |
461019 | 0.0 | -0.991125 | 0.656091 | 0.000000 | 0.000000 | 0.988462 | 0.989350 | -0.645440 | -0.958731 | -0.672953 |
461020 | 0.0 | -0.991184 | -0.546396 | 0.000000 | 0.000000 | 0.988980 | 0.990302 | -0.648667 | -0.990743 | -0.902138 |
453693 rows × 10 columns
df_rank = pd.merge(df.drop(num_X_cols, axis=1),
                   temp.drop('ret_date', axis=1),
                   left_index=True, right_index=True)
del temp
df_rank
 | secID | ret_date | exret | ym | size | rev | beta | bm | illiq | illiq_12m | mom | vol | ivol
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.970696 | -0.853480 | 0.123810 | -0.696703 | -0.979487 | 0.000000 | 0.000000 | 0.321612 | 0.000000 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.972444 | -0.412618 | -0.219724 | -0.641769 | -0.960841 | 0.000000 | 0.000000 | -0.601160 | -0.718637 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.968481 | -0.664756 | 0.375358 | -0.588825 | -0.958453 | 0.000000 | 0.000000 | 0.415473 | -0.616046 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969936 | 0.483178 | -0.079456 | -0.176807 | -0.975662 | 0.000000 | 0.000000 | 0.218325 | 0.400143 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.965567 | 0.522238 | -0.301291 | -0.318508 | -0.971306 | -0.977044 | 0.000000 | -0.707317 | -0.519369 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
461016 | 900957.XSHG | 2021-11 | -0.042588 | 2021-10 | -0.990901 | -0.011374 | -0.000227 | 0.000000 | 0.989536 | 0.989991 | 0.556415 | -0.897179 | -0.702457 |
461017 | 900957.XSHG | 2021-12 | 0.033805 | 2021-11 | -0.990967 | -0.783198 | 0.000226 | 0.000000 | 0.988708 | 0.989612 | 0.585818 | -0.900181 | -0.775068 |
461018 | 900957.XSHG | 2022-01 | -0.024027 | 2021-12 | -0.991033 | 0.121722 | 0.000000 | 0.000000 | 0.993275 | 0.989240 | 0.392961 | -0.862811 | -0.724277 |
461019 | 900957.XSHG | 2022-02 | -0.013175 | 2022-01 | -0.991125 | 0.656091 | 0.000000 | 0.000000 | 0.988462 | 0.989350 | -0.645440 | -0.958731 | -0.672953 |
461020 | 900957.XSHG | 2022-03 | -0.036066 | 2022-02 | -0.991184 | -0.546396 | 0.000000 | 0.000000 | 0.988980 | 0.990302 | -0.648667 | -0.990743 | -0.902138 |
453693 rows × 13 columns
df_rank.sort_values('ret_date')
 | secID | ret_date | exret | ym | size | rev | beta | bm | illiq | illiq_12m | mom | vol | ivol
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.970696 | -0.853480 | 0.123810 | -0.696703 | -0.979487 | 0.000000 | 0.000000 | 0.321612 | 0.000000 |
408212 | 601808.XSHG | 2008-01 | -0.131953 | 2007-12 | 0.872527 | -0.838828 | 0.000000 | -0.598535 | -0.919414 | 0.000000 | 0.000000 | 0.623443 | 0.000000 |
316829 | 600377.XSHG | 2008-01 | -0.135120 | 2007-12 | 0.302564 | -0.484249 | -0.454945 | 0.752381 | -0.311355 | -0.570696 | -0.372894 | 0.135531 | 0.000000 |
64314 | 000959.XSHE | 2008-01 | -0.122610 | 2007-12 | 0.765568 | 0.208791 | 0.657143 | 0.786081 | -0.739194 | -0.854945 | -0.238095 | 0.126740 | 0.000000 |
8011 | 000070.XSHE | 2008-01 | 0.015180 | 2007-12 | -0.452015 | 0.028571 | -0.768498 | 0.516484 | 0.815385 | 0.894505 | 0.229304 | 0.265934 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
375762 | 600831.XSHG | 2022-03 | 0.057854 | 2022-02 | -0.010800 | 0.136434 | 0.509808 | 0.813974 | -0.038572 | 0.194622 | -0.159797 | 0.881419 | 0.798104 |
24688 | 000576.XSHE | 2022-03 | 0.001405 | 2022-02 | 0.100727 | 0.000661 | -0.745867 | -0.113511 | -0.176108 | -0.410183 | 0.349791 | -0.778268 | -0.056645 |
24569 | 000573.XSHE | 2022-03 | -0.028663 | 2022-02 | -0.462200 | 0.071633 | -0.938726 | 0.701565 | -0.346264 | 0.639850 | 0.622658 | 0.865991 | 0.682169 |
25039 | 000582.XSHE | 2022-03 | -0.059102 | 2022-02 | 0.460877 | -0.300419 | -0.486886 | 0.803394 | 0.105136 | 0.099846 | -0.689663 | -0.427375 | -0.777827 |
461020 | 900957.XSHG | 2022-03 | -0.036066 | 2022-02 | -0.991184 | -0.546396 | 0.000000 | 0.000000 | 0.988980 | 0.990302 | -0.648667 | -0.990743 | -0.902138 |
453693 rows × 13 columns
df_rank['year'] = df_rank['ret_date'].dt.year
time_idx = [value for (key, value) in sorted(df_rank.groupby('year').groups.items())]
time_idx
[Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 460852, 460853, 460854, 460855, 460856, 460857, 460858, 460859, 460860, 460861], dtype='int64', length=17347),
 Int64Index([ 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ... 460864, 460865, 460866, 460867, 460868, 460869, 460870, 460871, 460872, 460873], dtype='int64', length=18345),
 Int64Index([ 24, 25, 26, 27, 28, 29, 31, 32, 192, 193, ... 460876, 460877, 460878, 460879, 460880, 460881, 460882, 460883, 460884, 460885], dtype='int64', length=20770),
 Int64Index([ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, ... 460888, 460889, 460890, 460891, 460892, 460893, 460894, 460895, 460896, 460897], dtype='int64', length=24588),
 Int64Index([ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, ... 460900, 460901, 460902, 460903, 460904, 460905, 460906, 460907, 460908, 460909], dtype='int64', length=27649),
 Int64Index([ 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, ... 460912, 460913, 460914, 460915, 460916, 460917, 460918, 460919, 460920, 460921], dtype='int64', length=28885),
 Int64Index([ 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, ... 460924, 460925, 460926, 460927, 460928, 460929, 460930, 460931, 460932, 460933], dtype='int64', length=28408),
 Int64Index([ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, ... 460936, 460937, 460938, 460939, 460940, 460941, 460942, 460943, 460944, 460945], dtype='int64', length=28331),
 Int64Index([ 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, ... 460948, 460949, 460950, 460951, 460952, 460953, 460954, 460955, 460956, 460957], dtype='int64', length=31459),
 Int64Index([ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, ... 460960, 460961, 460962, 460963, 460964, 460965, 460966, 460967, 460968, 460969], dtype='int64', length=36050),
 Int64Index([ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, ... 460972, 460973, 460974, 460975, 460976, 460977, 460978, 460979, 460980, 460981], dtype='int64', length=40026),
 Int64Index([ 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, ... 460984, 460985, 460986, 460987, 460988, 460989, 460990, 460991, 460992, 460993], dtype='int64', length=43017),
 Int64Index([ 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, ... 460996, 460997, 460998, 460999, 461000, 461001, 461002, 461003, 461004, 461005], dtype='int64', length=45124),
 Int64Index([ 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, ... 461008, 461009, 461010, 461011, 461012, 461013, 461014, 461015, 461016, 461017], dtype='int64', length=50192),
 Int64Index([ 165, 166, 167, 329, 330, 331, 449, 450, 451, 751, ... 459999, 460408, 460409, 460410, 460564, 460565, 460566, 461018, 461019, 461020], dtype='int64', length=13502)]
df_rank.groupby('year')['secID'].nunique()
year
2008    1559
2009    1627
2010    1934
2011    2231
2012    2477
2013    2530
2014    2649
2015    2863
2016    3028
2017    3471
2018    3605
2019    3739
2020    4045
2021    4509
2022    4538
Name: secID, dtype: int64
df_rank.groupby('year')['secID'].count()
year
2008    17347
2009    18345
2010    20770
2011    24588
2012    27649
2013    28885
2014    28408
2015    28331
2016    31459
2017    36050
2018    40026
2019    43017
2020    45124
2021    50192
2022    13502
Name: secID, dtype: int64
def list_flat(list_):
    return [item for sublist in list_ for item in sublist]
# This is the same as:
# def list_flat2(list_):
#     result = []
#     for sublist in list_:
#         for item in sublist:
#             result.append(item)
#     return result
list_flat([[1,2,3],[3,4,5]])
[1, 2, 3, 3, 4, 5]
df_rank
 | secID | ret_date | exret | ym | size | rev | beta | bm | illiq | illiq_12m | mom | vol | ivol | year
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.970696 | -0.853480 | 0.123810 | -0.696703 | -0.979487 | 0.000000 | 0.000000 | 0.321612 | 0.000000 | 2008 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.972444 | -0.412618 | -0.219724 | -0.641769 | -0.960841 | 0.000000 | 0.000000 | -0.601160 | -0.718637 | 2008 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.968481 | -0.664756 | 0.375358 | -0.588825 | -0.958453 | 0.000000 | 0.000000 | 0.415473 | -0.616046 | 2008 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969936 | 0.483178 | -0.079456 | -0.176807 | -0.975662 | 0.000000 | 0.000000 | 0.218325 | 0.400143 | 2008 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.965567 | 0.522238 | -0.301291 | -0.318508 | -0.971306 | -0.977044 | 0.000000 | -0.707317 | -0.519369 | 2008 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
461016 | 900957.XSHG | 2021-11 | -0.042588 | 2021-10 | -0.990901 | -0.011374 | -0.000227 | 0.000000 | 0.989536 | 0.989991 | 0.556415 | -0.897179 | -0.702457 | 2021 |
461017 | 900957.XSHG | 2021-12 | 0.033805 | 2021-11 | -0.990967 | -0.783198 | 0.000226 | 0.000000 | 0.988708 | 0.989612 | 0.585818 | -0.900181 | -0.775068 | 2021 |
461018 | 900957.XSHG | 2022-01 | -0.024027 | 2021-12 | -0.991033 | 0.121722 | 0.000000 | 0.000000 | 0.993275 | 0.989240 | 0.392961 | -0.862811 | -0.724277 | 2022 |
461019 | 900957.XSHG | 2022-02 | -0.013175 | 2022-01 | -0.991125 | 0.656091 | 0.000000 | 0.000000 | 0.988462 | 0.989350 | -0.645440 | -0.958731 | -0.672953 | 2022 |
461020 | 900957.XSHG | 2022-03 | -0.036066 | 2022-02 | -0.991184 | -0.546396 | 0.000000 | 0.000000 | 0.988980 | 0.990302 | -0.648667 | -0.990743 | -0.902138 | 2022 |
453693 rows × 14 columns
Cross-validation indexing note: GridSearchCV works with positional (0-based) indices into the arrays it is given, not with pandas labels. For example, if the 2008 rows carry the DataFrame labels [0, 1, 100, 1000; 5, 10, 300], the corresponding cv_idx entries are the positions [0, 1, 2, 3; 4, 5, 6] (see the np.isin example below).
# training, validation, testing scheme:
# 1. [2008-2011], [2012-2015], [2016]
# 2. [2008-2012], [2013-2016], [2017]
# ...
# last. [2008-2017], [2018-2021], [2022]
fulltrain_idx = []
cv_idx = []
test_idx = []
for i in range(4, len(time_idx)-4):
    train_idx = list_flat(time_idx[0:i])
    val_idx = list_flat(time_idx[i:i+4])
    fulltrain_idx.append(train_idx + val_idx)
    cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0],
                   np.where(np.isin(fulltrain_idx[-1], val_idx))[0]))
    # GridSearchCV operates on arrays internally and cannot carry the pandas index,
    # so cv_idx must hold positions within fulltrain_idx, numbered from 0
    test_idx.append(time_idx[i+4])
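A quick sanity check on the first split: the train positions cover 2008-2011 and the validation positions 2012-2015, matching the yearly row counts above (17347 + 18345 + 20770 + 24588 = 81050 and 27649 + 28885 + 28408 + 28331 = 113273).
len(cv_idx[0][0]), len(cv_idx[0][1])
# expected: (81050, 113273)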
df_rank.loc[fulltrain_idx[4]]
 | secID | ret_date | exret | ym | size | rev | beta | bm | illiq | illiq_12m | mom | vol | ivol | year
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.970696 | -0.853480 | 0.123810 | -0.696703 | -0.979487 | 0.000000 | 0.000000 | 0.321612 | 0.000000 | 2008 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.972444 | -0.412618 | -0.219724 | -0.641769 | -0.960841 | 0.000000 | 0.000000 | -0.601160 | -0.718637 | 2008 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.968481 | -0.664756 | 0.375358 | -0.588825 | -0.958453 | 0.000000 | 0.000000 | 0.415473 | -0.616046 | 2008 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969936 | 0.483178 | -0.079456 | -0.176807 | -0.975662 | 0.000000 | 0.000000 | 0.218325 | 0.400143 | 2008 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.965567 | 0.522238 | -0.301291 | -0.318508 | -0.971306 | -0.977044 | 0.000000 | -0.707317 | -0.519369 | 2008 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
460989 | 900957.XSHG | 2019-08 | -0.114800 | 2019-07 | -0.990484 | 0.104954 | 0.000000 | 0.000000 | 0.991604 | 0.986566 | 0.048419 | -0.968094 | -0.889169 | 2019 |
460990 | 900957.XSHG | 2019-09 | 0.096579 | 2019-08 | -0.991126 | -0.888519 | 0.000277 | 0.000000 | 0.990017 | 0.986689 | -0.458680 | -0.567388 | -0.728785 | 2019 |
460991 | 900957.XSHG | 2019-10 | -0.019290 | 2019-09 | -0.991160 | 0.750276 | 0.000000 | 0.000000 | 0.988950 | 0.987845 | -0.747514 | -0.218232 | 0.052486 | 2019 |
460992 | 900957.XSHG | 2019-11 | -0.080039 | 2019-10 | -0.990634 | -0.045730 | 0.000275 | 0.000000 | 0.981267 | 0.987328 | -0.653444 | -0.671625 | -0.627548 | 2019 |
460993 | 900957.XSHG | 2019-12 | 0.046207 | 2019-11 | -0.991228 | -0.588268 | 0.000000 | 0.000000 | 0.980263 | 0.986294 | -0.555921 | 0.112939 | 0.321272 | 2019 |
344875 rows × 14 columns
# Example
a = [0,1,4,5,3000]
np.where(np.isin(a, [0,3000,4]))[0]
array([0, 2, 4])
test_years = list(range(2016, 2023))
test_years
[2016, 2017, 2018, 2019, 2020, 2021, 2022]
def r2_oos(y_true, y_pred):
    return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)
r2_oos_scorer = make_scorer(r2_oos)
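Note the denominator benchmarks against a zero forecast rather than the historical mean (a common convention for out-of-sample $R^2$ in return prediction): predicting all zeros gives exactly $R^2_{oos} = 0$, so a positive value means beating the naive zero forecast. A minimal check with made-up numbers:
y_demo = np.array([0.05, -0.02, 0.03])
r2_oos(y_demo, np.zeros_like(y_demo))
# 0.0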
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols
['size', 'rev', 'beta', 'bm', 'illiq', 'mom', 'ivol']
model = LinearRegression()
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    print("Test year", test_years[i], ":", r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.011040105913657339
Test year 2017 : -0.08027893343542303
Test year 2018 : -0.04274052247156712
Test year 2019 : 0.005845892928012408
Test year 2020 : 0.000447862723551129
Test year 2021 : 0.011070563639831277
Test year 2022 : -0.039843884519424
cols = ['size','rev','illiq','ivol']
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    print("Test year", test_years[i], ":", r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.011177166966521046
Test year 2017 : -0.08141130184048362
Test year 2018 : -0.042766248969311915
Test year 2019 : 0.006650509413867134
Test year 2020 : 0.00018448663790970876
Test year 2021 : 0.01024341147435115
Test year 2022 : -0.040448065360397356
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols
['size', 'rev', 'beta', 'bm', 'illiq', 'mom', 'ivol']
model = HuberRegressor(alpha=0.01,epsilon=1.05)
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    print("Test year", test_years[i], ":", r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : 0.0023583262241427816
Test year 2017 : -0.036809073349359345
Test year 2018 : 0.0055923630272862734
Test year 2019 : -0.019272124325766526
Test year 2020 : -0.013786542186777107
Test year 2021 : -0.008908423774469876
Test year 2022 : 0.0314098472397073
cols = num_X_cols
cols
['size', 'rev', 'beta', 'bm', 'illiq', 'illiq_12m', 'mom', 'vol', 'ivol']
hyperparam_grid = [
    {'n_estimators': [20], 'max_depth': [1,3,5],
     'max_features': [3]}
]
model = RandomForestRegressor(random_state=42)
# Cross validation for period 0, i.e.
# train: [2008-2011], val: [2012-2015], test: [2016]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0], cols]
y_test = df_rank.loc[test_idx[0], 'exret']
%%time
grid_search.fit(X_fulltrain, y_fulltrain)
CPU times: user 13.6 s, sys: 299 ms, total: 13.9 s
Wall time: 14.5 s
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 81047, 81048, 81049]), array([ 81050, 81051, 81052, ..., 194320, 194321, 194322]))], estimator=RandomForestRegressor(random_state=42), param_grid=[{'max_depth': [1, 3, 5], 'max_features': [3], 'n_estimators': [20]}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'max_depth': 5, 'max_features': 3, 'n_estimators': 20}
cv_results = grid_search.cv_results_
# Note: the scorer here is r2_oos (an R-squared, not an MSE), so taking np.sqrt
# is dubious; negative mean scores produce nan, hence the RuntimeWarning below
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
nan {'max_depth': 1, 'max_features': 3, 'n_estimators': 20}
0.02645160793818769 {'max_depth': 3, 'max_features': 3, 'n_estimators': 20}
0.028981572726202026 {'max_depth': 5, 'max_features': 3, 'n_estimators': 20}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_83836/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)
pd.DataFrame({"features": num_X_cols,
              "feature_importance": grid_search.best_estimator_.feature_importances_}
            ).sort_values('feature_importance', ascending=False)
 | features | feature_importance
---|---|---
1 | rev | 0.217390 |
4 | illiq | 0.189291 |
0 | size | 0.161213 |
5 | illiq_12m | 0.140252 |
8 | ivol | 0.082429 |
7 | vol | 0.079622 |
6 | mom | 0.045475 |
2 | beta | 0.043145 |
3 | bm | 0.041184 |
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.012291646126052935
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]],
                               scoring=r2_oos_scorer,
                               return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    print("Test year", test_years[i], ":", r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.012832337545194417
Test year 2017 : -0.0820561067657255
Test year 2018 : -0.04409182586886584
Test year 2019 : 0.007830509088117443
Test year 2020 : 0.003591662917594607
Test year 2021 : -0.015391690896670474
CPU times: user 22min 37s, sys: 4.99 s, total: 22min 42s
Wall time: 22min 50s
cols = num_X_cols
cols
['size', 'rev', 'beta', 'bm', 'illiq', 'illiq_12m', 'mom', 'vol', 'ivol']
model = PLSRegression(n_components=4)
y_pred.reshape(-1).shape
(31459,)
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    model.fit(X_fulltrain, y_fulltrain)
    y_pred = model.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i], ":", r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.009957481993730699
Test year 2017 : -0.08789715100343876
Test year 2018 : -0.04246315607293405
Test year 2019 : 0.005660472134352501
Test year 2020 : 0.0005198826566897852
Test year 2021 : 0.010895831760366126
Test year 2022 : -0.03793343273496119
CPU times: user 9.45 s, sys: 775 ms, total: 10.2 s
Wall time: 3.68 s
cols = num_X_cols
cols
['size', 'rev', 'beta', 'bm', 'illiq', 'illiq_12m', 'mom', 'vol', 'ivol']
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
pca = PCA(3, random_state=42)
pca.fit(X_fulltrain)
PCA(n_components=3, random_state=42)
pca.components_
array([[ 0.55218289, -0.00712661,  0.12947152, -0.00753788, -0.58554178, -0.57211516,  0.06107138,  0.05366476,  0.03793427],
       [-0.07942897,  0.29144874, -0.01245223, -0.4038553 , -0.03172494,  0.07656302,  0.28628117,  0.55863802,  0.58702027],
       [ 0.08360049, -0.23390857, -0.62087249, -0.41365348, -0.04029952,  0.00788101,  0.52037796, -0.29728935, -0.14437987]])
pca.components_.shape
(3, 9)
X_fulltrain.shape
(194323, 9)
pca.components_.T.shape
(9, 3)
np.matmul(X_fulltrain.values,pca.components_.T)
array([[ 1.15415548,  0.16471731,  0.43597185],
       [ 1.0193875 , -0.66278179,  0.90089929],
       [ 1.15269411, -0.13666742,  0.25103196],
       ...,
       [-1.76038395, -0.48358324, -0.17121601],
       [-1.69942192,  0.02958169, -0.01992473],
       [-1.66692222,  0.40290614, -0.05801819]])
pca.fit_transform(X_fulltrain)
array([[ 1.15415548,  0.16471731,  0.43597185],
       [ 1.0193875 , -0.66278179,  0.90089929],
       [ 1.15269411, -0.13666742,  0.25103196],
       ...,
       [-1.76038395, -0.48358324, -0.17121601],
       [-1.69942192,  0.02958169, -0.01992473],
       [-1.66692222,  0.40290614, -0.05801819]])
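The raw matmul and fit_transform agree here only because the rank-transformed features have essentially zero mean each month: PCA actually centers X by the fitted mean before projecting, and transform is defined as (X - mean_) @ components_.T. A quick check (assuming the objects above are still in scope):
np.allclose(np.matmul(X_fulltrain.values - pca.mean_, pca.components_.T),
            pca.transform(X_fulltrain))
# True: once the mean is removed, the manual projection matches exactly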
sklearn uses duck typing, so inheritance is not required: it is enough for a class to define the corresponding methods, i.e. fit() (returning self), transform(), and fit_transform(). Inheriting directly is more convenient, though.

BaseEstimator is the most basic class in sklearn; the other classes derive from it. It provides the set_params() and get_params() methods.

TransformerMixin provides fit_transform(), so a class that inherits from it does not need to define fit_transform itself.

RegressorMixin provides a default score() method; predict() itself still has to be implemented by the estimator, as PCARegressor does below.

class PCARegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_components=3):
        self.n_components = n_components

    def fit(self, X, y):
        self.pca_ = PCA(n_components=self.n_components).fit(X)
        self.X_ = self.pca_.transform(X)
        self.reg_ = LinearRegression().fit(self.X_, y)
        return self

    def predict(self, X):
        self.pred_ = self.reg_.predict(self.pca_.transform(X))
        return self.pred_
model = PCARegressor()
model.fit(X=X_fulltrain, y=y_fulltrain)
PCARegressor()
model.X_
array([[ 1.15415548,  0.16471731,  0.43597185],
       [ 1.0193875 , -0.66278179,  0.90089929],
       [ 1.15269411, -0.13666742,  0.25103196],
       ...,
       [-1.76038395, -0.48358324, -0.17121601],
       [-1.69942192,  0.02958169, -0.01992473],
       [-1.66692222,  0.40290614, -0.05801819]])
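Because PCARegressor inherits from BaseEstimator, get_params() and set_params() come for free, which is exactly what GridSearchCV needs in order to clone the estimator with different hyperparameters:
model.get_params()
# {'n_components': 3}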
hyperparam_grid = [
    {'n_components': range(1, len(cols)+1)}
]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 81047, 81048, 81049]), array([ 81050, 81051, 81052, ..., 194320, 194321, 194322]))], estimator=PCARegressor(), param_grid=[{'n_components': range(1, 10)}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'n_components': 9}
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
0.008866500079819884 {'n_components': 1}
nan {'n_components': 2}
nan {'n_components': 3}
0.027970921235605007 {'n_components': 4}
0.03313896020831564 {'n_components': 5}
0.03412866151593205 {'n_components': 6}
0.027344385016598305 {'n_components': 7}
0.042214846044480614 {'n_components': 8}
0.043720734317050605 {'n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_83836/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test,y_pred=y_pred)
-0.010083320861536516
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]],
                               scoring=r2_oos_scorer,
                               return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i], ":", r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.010083320861536516
Test year 2017 : -0.0861213984121596
Test year 2018 : -0.04199121460584965
Test year 2019 : 0.006116823960447437
Test year 2020 : 0.0008257380622559429
Test year 2021 : 0.009637627897886802
Test year 2022 : -0.041629032594069804
CPU times: user 1min 45s, sys: 11 s, total: 1min 56s
Wall time: 34.5 s
pca = PCA()
linear_reg = LinearRegression()
pipeline = Pipeline(steps=[('pca', pca),
                           ('linear_regression', linear_reg)])
hyperparam_grid = {'pca__n_components': range(1,len(cols)+1)}
grid_search = GridSearchCV(pipeline, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
%%time
grid_search.fit(X=X_fulltrain,y=y_fulltrain)
CPU times: user 6.5 s, sys: 761 ms, total: 7.26 s
Wall time: 2.44 s
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 81047, 81048, 81049]), array([ 81050, 81051, 81052, ..., 194320, 194321, 194322]))], estimator=Pipeline(steps=[('pca', PCA()), ('linear_regression', LinearRegression())]), param_grid={'pca__n_components': range(1, 10)}, return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'pca__n_components': 9}
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
0.008866500079819884 {'pca__n_components': 1}
nan {'pca__n_components': 2}
nan {'pca__n_components': 3}
0.027970921235601038 {'pca__n_components': 4}
0.03313896020831564 {'pca__n_components': 5}
0.03412866151593205 {'pca__n_components': 6}
0.027344385016598305 {'pca__n_components': 7}
0.042214846044480614 {'pca__n_components': 8}
0.043720734317050605 {'pca__n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_83836/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.010083320861536516
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
model = SGDRegressor(penalty='elasticnet')
hyperparam_grid = [{'alpha': [0.0001, 0.001, 0.01, 0.1],
                    'l1_ratio': [0.15, 0.30, 0.5]}]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 81047, 81048, 81049]), array([ 81050, 81051, 81052, ..., 194320, 194321, 194322]))], estimator=SGDRegressor(penalty='elasticnet'), param_grid=[{'alpha': [0.0001, 0.001, 0.01, 0.1], 'l1_ratio': [0.15, 0.3, 0.5]}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'alpha': 0.0001, 'l1_ratio': 0.15}
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.01649354448289997
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]],
                               scoring=r2_oos_scorer,
                               return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i], ":", r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.01794839855231989
Test year 2017 : -0.0889158386492046
Test year 2018 : -0.049128472181301674
Test year 2019 : 0.007448471572534254
Test year 2020 : 0.00131514690209944
Test year 2021 : 0.007089595884243738
Test year 2022 : -0.036756966095477184
CPU times: user 1min 10s, sys: 1.5 s, total: 1min 12s
Wall time: 24.5 s
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
hyperparam_grid = [
    {'max_depth': [1,2,3,4,5,6],
     'learning_rate': [0.1, 0.05, 0.01]}
]
model = GradientBoostingRegressor()
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
%%time
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
CPU times: user 9min 1s, sys: 2.38 s, total: 9min 3s
Wall time: 9min 8s
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 81047, 81048, 81049]), array([ 81050, 81051, 81052, ..., 194320, 194321, 194322]))], estimator=GradientBoostingRegressor(), param_grid=[{'learning_rate': [0.1, 0.05, 0.01], 'max_depth': [1, 2, 3, 4, 5, 6]}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'learning_rate': 0.05, 'max_depth': 4}
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(mean_score, params)
0.0014027564804455217 {'learning_rate': 0.1, 'max_depth': 1}
0.0021418471187519827 {'learning_rate': 0.1, 'max_depth': 2}
0.002198125424866415 {'learning_rate': 0.1, 'max_depth': 3}
0.000531469891437597 {'learning_rate': 0.1, 'max_depth': 4}
-0.0027277709656639004 {'learning_rate': 0.1, 'max_depth': 5}
-0.006314742607181678 {'learning_rate': 0.1, 'max_depth': 6}
0.0008960738128721557 {'learning_rate': 0.05, 'max_depth': 1}
0.001329288644544957 {'learning_rate': 0.05, 'max_depth': 2}
0.0032426540517009883 {'learning_rate': 0.05, 'max_depth': 3}
0.0039906633223752985 {'learning_rate': 0.05, 'max_depth': 4}
0.002606190722552637 {'learning_rate': 0.05, 'max_depth': 5}
-0.00022434254377290408 {'learning_rate': 0.05, 'max_depth': 6}
-0.0009877248534237992 {'learning_rate': 0.01, 'max_depth': 1}
-0.000191156503541956 {'learning_rate': 0.01, 'max_depth': 2}
0.0012339578747196933 {'learning_rate': 0.01, 'max_depth': 3}
0.0026258496548985377 {'learning_rate': 0.01, 'max_depth': 4}
0.003356999919468695 {'learning_rate': 0.01, 'max_depth': 5}
0.0038975361950893683 {'learning_rate': 0.01, 'max_depth': 6}
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.017481311295932223
tf.__version__
'2.4.1'
keras.__version__
'2.4.0'
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_train = X_fulltrain.values[cv_idx[0][0]]
y_train = y_fulltrain.values[cv_idx[0][0]]
X_val = X_fulltrain.values[cv_idx[0][1]]
y_val = y_fulltrain.values[cv_idx[0][1]]
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
X_train.shape
(81050, 9)
X_val.shape
(113273, 9)
nn_model = keras.models.Sequential()
nn_model.add(keras.layers.InputLayer(input_shape=[X_fulltrain.shape[1]]))
nn_model.add(keras.layers.Dense(8, activation='relu'))
nn_model.add(keras.layers.Dense(4, activation='relu'))
nn_model.add(keras.layers.Dense(1))
nn_model.compile(loss='mse',optimizer='sgd')
nn_model.fit(X_train, y_train, epochs=10,
             validation_data=(X_val, y_val))
Epoch 1/10 - 2533/2533 - 8s 3ms/step - loss: 0.0394 - val_loss: 0.0239
Epoch 2/10 - 2533/2533 - 7s 3ms/step - loss: 0.0227 - val_loss: 0.0239
Epoch 3/10 - 2533/2533 - 7s 3ms/step - loss: 0.0225 - val_loss: 0.0243
Epoch 4/10 - 2533/2533 - 8s 3ms/step - loss: 0.0220 - val_loss: 0.0239
Epoch 5/10 - 2533/2533 - 8s 3ms/step - loss: 0.0220 - val_loss: 0.0238
Epoch 6/10 - 2533/2533 - 8s 3ms/step - loss: 0.0221 - val_loss: 0.0238
Epoch 7/10 - 2533/2533 - 8s 3ms/step - loss: 0.0224 - val_loss: 0.0237
Epoch 8/10 - 2533/2533 - 7s 3ms/step - loss: 0.0220 - val_loss: 0.0242
Epoch 9/10 - 2533/2533 - 7s 3ms/step - loss: 0.0221 - val_loss: 0.0239
Epoch 10/10 - 2533/2533 - 7s 3ms/step - loss: 0.0222 - val_loss: 0.0242
<tensorflow.python.keras.callbacks.History at 0x7fd17d1a86a0>
y_pred = nn_model.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
0.00047489305309966756
def build_model(learning_rate=0.003):
    nn_model = keras.models.Sequential()
    nn_model.add(keras.layers.InputLayer(input_shape=[9]))
    nn_model.add(keras.layers.Dense(8, activation='relu'))
    nn_model.add(keras.layers.Dense(4, activation='relu'))
    nn_model.add(keras.layers.Dense(1))
    optimizer = keras.optimizers.SGD(lr=learning_rate)  # 'lr' is the older alias of 'learning_rate'
    nn_model.compile(loss="mse", optimizer=optimizer)
    return nn_model
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
hyperparams_grid = {
    'learning_rate': [0.003, 0.001]
}
nn_search_cv = GridSearchCV(keras_reg, hyperparams_grid, cv=[cv_idx[0]])
nn_search_cv.fit(X_fulltrain, y_fulltrain, epochs=10,
                 validation_data=(X_val, y_val))
(first candidate, learning_rate=0.003, fit on the training fold)
Epoch 1/10 - 2533/2533 - 8s 3ms/step - loss: 0.0481 - val_loss: 0.0291
Epoch 2/10 - 2533/2533 - 7s 3ms/step - loss: 0.0258 - val_loss: 0.0260
Epoch 3/10 - 2533/2533 - 7s 3ms/step - loss: 0.0237 - val_loss: 0.0251
Epoch 4/10 - 2533/2533 - 7s 3ms/step - loss: 0.0230 - val_loss: 0.0251
Epoch 5/10 - 2533/2533 - 7s 3ms/step - loss: 0.0226 - val_loss: 0.0246
Epoch 6/10 - 2533/2533 - 7s 3ms/step - loss: 0.0223 - val_loss: 0.0246
Epoch 7/10 - 2533/2533 - 8s 3ms/step - loss: 0.0226 - val_loss: 0.0244
Epoch 8/10 - 2533/2533 - 8s 3ms/step - loss: 0.0226 - val_loss: 0.0243
Epoch 9/10 - 2533/2533 - 8s 3ms/step - loss: 0.0223 - val_loss: 0.0244
Epoch 10/10 - 2533/2533 - 8s 3ms/step - loss: 0.0222 - val_loss: 0.0242
3540/3540 - 8s 2ms/step - loss: 0.0242
(second candidate, learning_rate=0.001, fit on the training fold)
Epoch 1/10 - 2533/2533 - 7s 3ms/step - loss: 0.1056 - val_loss: 0.0348
Epoch 2/10 - 2533/2533 - 7s 3ms/step - loss: 0.0303 - val_loss: 0.0291
Epoch 3/10 - 2533/2533 - 7s 3ms/step - loss: 0.0259 - val_loss: 0.0269
Epoch 4/10 - 2533/2533 - 7s 3ms/step - loss: 0.0245 - val_loss: 0.0258
Epoch 5/10 - 2533/2533 - 7s 3ms/step - loss: 0.0233 - val_loss: 0.0252
Epoch 6/10 - 2533/2533 - 7s 3ms/step - loss: 0.0227 - val_loss: 0.0249
Epoch 7/10 - 2533/2533 - 7s 3ms/step - loss: 0.0228 - val_loss: 0.0246
Epoch 8/10 - 2533/2533 - 7s 3ms/step - loss: 0.0222 - val_loss: 0.0244
Epoch 9/10 - 2533/2533 - 7s 3ms/step - loss: 0.0224 - val_loss: 0.0243
Epoch 10/10 - 2533/2533 - 7s 3ms/step - loss: 0.0221 - val_loss: 0.0243
3540/3540 - 8s 2ms/step - loss: 0.0243
(refit of the best candidate on the full training data)
Epoch 1/10 - 6073/6073 - 14s 2ms/step - loss: 0.0409 - val_loss: 0.0233
Epoch 2/10 - 6073/6073 - 12s 2ms/step - loss: 0.0230 - val_loss: 0.0232
Epoch 3/10 - 6073/6073 - 13s 2ms/step - loss: 0.0230 - val_loss: 0.0232
Epoch 4/10 - 6073/6073 - 14s 2ms/step - loss: 0.0229 - val_loss: 0.0232
Epoch 5/10 - 6073/6073 - 13s 2ms/step - loss: 0.0232 - val_loss: 0.0232
Epoch 6/10 - 6073/6073 - 13s 2ms/step - loss: 0.0233 - val_loss: 0.0232
Epoch 7/10 - 6073/6073 - 13s 2ms/step - loss: 0.0227 - val_loss: 0.0232
Epoch 8/10 - 6073/6073 - 13s 2ms/step - loss: 0.0227 - val_loss: 0.0233
Epoch 9/10 - 6073/6073 - 14s 2ms/step - loss: 0.0229 - val_loss: 0.0232
Epoch 10/10 - 6073/6073 - 13s 2ms/step - loss: 0.0228 - val_loss: 0.0231
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 81047, 81048, 81049]), array([ 81050, 81051, 81052, ..., 194320, 194321, 194322]))], estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x7fd17d1ae880>, param_grid={'learning_rate': [0.003, 0.001]})
y_pred = nn_search_cv.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.025505656518980402
df_rank
 | secID | ret_date | exret | ym | size | rev | beta | bm | illiq | illiq_12m | mom | vol | ivol | year
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.970696 | -0.853480 | 0.123810 | -0.696703 | -0.979487 | 0.000000 | 0.000000 | 0.321612 | 0.000000 | 2008 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.972444 | -0.412618 | -0.219724 | -0.641769 | -0.960841 | 0.000000 | 0.000000 | -0.601160 | -0.718637 | 2008 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.968481 | -0.664756 | 0.375358 | -0.588825 | -0.958453 | 0.000000 | 0.000000 | 0.415473 | -0.616046 | 2008 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969936 | 0.483178 | -0.079456 | -0.176807 | -0.975662 | 0.000000 | 0.000000 | 0.218325 | 0.400143 | 2008 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.965567 | 0.522238 | -0.301291 | -0.318508 | -0.971306 | -0.977044 | 0.000000 | -0.707317 | -0.519369 | 2008 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
461016 | 900957.XSHG | 2021-11 | -0.042588 | 2021-10 | -0.990901 | -0.011374 | -0.000227 | 0.000000 | 0.989536 | 0.989991 | 0.556415 | -0.897179 | -0.702457 | 2021 |
461017 | 900957.XSHG | 2021-12 | 0.033805 | 2021-11 | -0.990967 | -0.783198 | 0.000226 | 0.000000 | 0.988708 | 0.989612 | 0.585818 | -0.900181 | -0.775068 | 2021 |
461018 | 900957.XSHG | 2022-01 | -0.024027 | 2021-12 | -0.991033 | 0.121722 | 0.000000 | 0.000000 | 0.993275 | 0.989240 | 0.392961 | -0.862811 | -0.724277 | 2022 |
461019 | 900957.XSHG | 2022-02 | -0.013175 | 2022-01 | -0.991125 | 0.656091 | 0.000000 | 0.000000 | 0.988462 | 0.989350 | -0.645440 | -0.958731 | -0.672953 | 2022 |
461020 | 900957.XSHG | 2022-03 | -0.036066 | 2022-02 | -0.991184 | -0.546396 | 0.000000 | 0.000000 | 0.988980 | 0.990302 | -0.648667 | -0.990743 | -0.902138 | 2022 |
453693 rows × 14 columns
X_fulltrain.columns.tolist()
['size', 'rev', 'beta', 'bm', 'illiq', 'illiq_12m', 'mom', 'vol', 'ivol']
X_fulltrain.columns.tolist().index('illiq')
4
X_fulltrain.columns.tolist().index('illiq_12m')
5
illiq_idx = 4
illiq_12m_idx = 5
class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_avg_illiq=True):
        self.add_avg_illiq = add_avg_illiq

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # honor the switch, so add_avg_illiq can actually be turned off
        # (relies on the module-level illiq_idx / illiq_12m_idx defined above)
        if not self.add_avg_illiq:
            return X
        avg_illiq = (X[:, illiq_idx] + X[:, illiq_12m_idx]) / 2
        return np.c_[X, avg_illiq]
feature_adder = FeatureAdder()
X_fulltrain.values.shape
(194323, 9)
X_fulltrain_new = feature_adder.transform(X_fulltrain.values)
X_fulltrain_new
array([[ 0.97069597, -0.85347985,  0.12380952, ...,  0.32161172,  0.        , -0.48974359],
       [ 0.9724438 , -0.41261784, -0.21972444, ..., -0.60116026, -0.71863669, -0.48042059],
       [ 0.96848138, -0.66475645,  0.37535817, ...,  0.41547278, -0.61604585, -0.47922636],
       ...,
       [-0.98794143,  0.42204996,  0.        , ..., -0.48578811, -0.60292851,  0.98535745],
       [-0.98826488,  0.30008382,  0.        , ..., -0.22212909, -0.16680637,  0.98658843],
       [-0.98773006,  0.68179959,  0.        , ..., -0.2196319 ,  0.18200409,  0.97832311]])
X_fulltrain_new.shape
(194323, 10)
# This can be added to a pipeline
pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('std_scaler', StandardScaler())
])
pipeline.fit_transform(X_fulltrain.values)
array([[ 1.68212553e+00, -1.47900112e+00,  2.14580503e-01, ...,  5.57323099e-01,  5.08734687e-19, -8.93034985e-01],
       [ 1.68515435e+00, -7.15028295e-01, -3.80815457e-01, ..., -1.04175463e+00, -1.24981219e+00, -8.76034741e-01],
       [ 1.67828784e+00, -1.15196102e+00,  6.50552087e-01, ...,  7.19975553e-01, -1.07139200e+00, -8.73857086e-01],
       ...,
       [-1.71201030e+00,  7.31373277e-01, -4.05585273e-18, ..., -8.41825465e-01, -1.04857907e+00,  1.79677426e+00],
       [-1.71257081e+00,  5.20017322e-01, -4.05585273e-18, ..., -3.84928977e-01, -2.90100180e-01,  1.79901892e+00],
       [-1.71164402e+00,  1.18149521e+00, -4.05585273e-18, ..., -3.80601589e-01,  3.16531192e-01,  1.78394732e+00]])
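Since transform() honors the flag, add_avg_illiq itself becomes a tunable hyperparameter once the transformer sits in a full pipeline. A sketch (hypothetical, reusing the objects defined above):
pipe = Pipeline([('feature_adder', FeatureAdder()),
                 ('std_scaler', StandardScaler()),
                 ('reg', LinearRegression())])
grid = GridSearchCV(pipe,
                    {'feature_adder__add_avg_illiq': [True, False]},
                    cv=[cv_idx[0]], scoring=r2_oos_scorer)
# grid.fit(X_fulltrain.values, y_fulltrain) would then pick whichever
# setting scores better on the 2012-2015 validation block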