import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, HuberRegressor, SGDRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline
# import lightgbm as lgb
import tensorflow as tf
from tensorflow import keras
Data¶
# Load the pre-computed monthly factor-exposure panel (one row per stock-month).
df = pd.read_pickle('../../../data/factor_exposure/all_exposure_2024.pkl')
df
secID | ret_date | tradeDate | ret | rf | exret | ym | mktcap | size | rev | mom_date | mom | beta | bm | illiq | illiq_12m | vol | ivol | vol_clip | ivol_clip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2007-07 | 2007-06-29 | 0.316497 | 0.002481 | 0.314016 | 2007-06 | 4.266117e+10 | 24.476555 | NaN | NaT | NaN | 0.4614 | 0.123739 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 000001.XSHE | 2007-08 | 2007-07-31 | 0.048855 | 0.002404 | 0.046451 | 2007-07 | 5.616330e+10 | 24.751529 | 0.314016 | 2007-06 | NaN | 0.6423 | 0.093992 | 0.000040 | NaN | 0.041604 | NaN | 0.041604 | NaN |
2 | 000001.XSHE | 2007-09 | 2007-08-31 | 0.052105 | 0.002621 | 0.049484 | 2007-08 | 5.890714e+10 | 24.799228 | 0.046451 | 2007-07 | NaN | 0.7722 | 0.097085 | 0.000020 | NaN | 0.033926 | NaN | 0.033926 | NaN |
3 | 000001.XSHE | 2007-10 | 2007-09-28 | 0.201851 | 0.003095 | 0.198756 | 2007-09 | 6.197651e+10 | 24.850021 | 0.049484 | 2007-08 | NaN | 0.7596 | 0.092276 | 0.000025 | NaN | 0.023872 | NaN | 0.023872 | NaN |
4 | 000001.XSHE | 2007-11 | 2007-10-31 | -0.249116 | 0.003780 | -0.252896 | 2007-10 | 7.448652e+10 | 25.033884 | 0.198756 | 2007-09 | NaN | 0.7988 | 0.083411 | 0.000030 | NaN | 0.035921 | NaN | 0.035921 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
616458 | 689009.XSHG | 2024-01 | 2023-12-29 | -0.213082 | 0.001902 | -0.214983 | 2023-12 | 1.552630e+10 | 23.465801 | -0.105996 | 2023-11 | 0.085602 | 1.0448 | 0.247525 | 0.000110 | 0.000107 | 0.024634 | 0.018228 | 0.024634 | 0.018228 |
616459 | 689009.XSHG | 2024-02 | 2024-01-31 | 0.298201 | 0.001749 | 0.296451 | 2024-01 | 1.221793e+10 | 23.226170 | -0.214983 | 2023-12 | -0.106357 | 1.2314 | 0.313607 | 0.000184 | 0.000116 | 0.024607 | 0.013890 | 0.024607 | 0.013890 |
616460 | 689009.XSHG | 2024-03 | 2024-02-29 | -0.011551 | 0.001783 | -0.013334 | 2024-02 | 1.586132e+10 | 23.487149 | 0.296451 | 2024-01 | -0.292727 | 1.4905 | 0.241569 | 0.000164 | 0.000120 | 0.044243 | 0.024755 | 0.044243 | 0.024755 |
616461 | 689009.XSHG | 2024-04 | 2024-03-29 | -0.071786 | 0.001687 | -0.073474 | 2024-03 | 1.543851e+10 | 23.460131 | -0.013334 | 2024-02 | -0.195005 | 1.5477 | 0.247127 | 0.000085 | 0.000118 | 0.030206 | 0.022928 | 0.030206 | 0.022928 |
616462 | 689009.XSHG | NaT | 2024-04-12 | NaN | NaN | NaN | 2024-04 | 1.433023e+10 | 23.385637 | -0.073474 | 2024-03 | -0.104366 | NaN | 0.260342 | NaN | 0.000121 | NaN | NaN | NaN | NaN |
616463 rows × 20 columns
# 'ret_date' already identifies the month; the exact trade date is redundant.
df.drop('tradeDate',axis=1,inplace=True)
df
secID | ret_date | ret | rf | exret | ym | mktcap | size | rev | mom_date | mom | beta | bm | illiq | illiq_12m | vol | ivol | vol_clip | ivol_clip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2007-07 | 0.316497 | 0.002481 | 0.314016 | 2007-06 | 4.266117e+10 | 24.476555 | NaN | NaT | NaN | 0.4614 | 0.123739 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 000001.XSHE | 2007-08 | 0.048855 | 0.002404 | 0.046451 | 2007-07 | 5.616330e+10 | 24.751529 | 0.314016 | 2007-06 | NaN | 0.6423 | 0.093992 | 0.000040 | NaN | 0.041604 | NaN | 0.041604 | NaN |
2 | 000001.XSHE | 2007-09 | 0.052105 | 0.002621 | 0.049484 | 2007-08 | 5.890714e+10 | 24.799228 | 0.046451 | 2007-07 | NaN | 0.7722 | 0.097085 | 0.000020 | NaN | 0.033926 | NaN | 0.033926 | NaN |
3 | 000001.XSHE | 2007-10 | 0.201851 | 0.003095 | 0.198756 | 2007-09 | 6.197651e+10 | 24.850021 | 0.049484 | 2007-08 | NaN | 0.7596 | 0.092276 | 0.000025 | NaN | 0.023872 | NaN | 0.023872 | NaN |
4 | 000001.XSHE | 2007-11 | -0.249116 | 0.003780 | -0.252896 | 2007-10 | 7.448652e+10 | 25.033884 | 0.198756 | 2007-09 | NaN | 0.7988 | 0.083411 | 0.000030 | NaN | 0.035921 | NaN | 0.035921 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
616458 | 689009.XSHG | 2024-01 | -0.213082 | 0.001902 | -0.214983 | 2023-12 | 1.552630e+10 | 23.465801 | -0.105996 | 2023-11 | 0.085602 | 1.0448 | 0.247525 | 0.000110 | 0.000107 | 0.024634 | 0.018228 | 0.024634 | 0.018228 |
616459 | 689009.XSHG | 2024-02 | 0.298201 | 0.001749 | 0.296451 | 2024-01 | 1.221793e+10 | 23.226170 | -0.214983 | 2023-12 | -0.106357 | 1.2314 | 0.313607 | 0.000184 | 0.000116 | 0.024607 | 0.013890 | 0.024607 | 0.013890 |
616460 | 689009.XSHG | 2024-03 | -0.011551 | 0.001783 | -0.013334 | 2024-02 | 1.586132e+10 | 23.487149 | 0.296451 | 2024-01 | -0.292727 | 1.4905 | 0.241569 | 0.000164 | 0.000120 | 0.044243 | 0.024755 | 0.044243 | 0.024755 |
616461 | 689009.XSHG | 2024-04 | -0.071786 | 0.001687 | -0.073474 | 2024-03 | 1.543851e+10 | 23.460131 | -0.013334 | 2024-02 | -0.195005 | 1.5477 | 0.247127 | 0.000085 | 0.000118 | 0.030206 | 0.022928 | 0.030206 | 0.022928 |
616462 | 689009.XSHG | NaT | NaN | NaN | NaN | 2024-04 | 1.433023e+10 | 23.385637 | -0.073474 | 2024-03 | -0.104366 | NaN | 0.260342 | NaN | 0.000121 | NaN | NaN | NaN | NaN |
616463 rows × 19 columns
NA 值处理¶
# Report the number of missing values in each column.
for column in df:
    print(column, df[column].isna().sum())
secID 0 ret_date 5299 ret 23412 rf 5299 exret 23412 ym 0 mktcap 17365 size 17365 rev 22223 mom_date 4110 mom 56595 beta 29710 bm 5210 illiq 36680 illiq_12m 107758 vol 25669 ivol 40175 vol_clip 25669 ivol_clip 40175
ret_date 为 NA 的删除,已到最新数据处
# Keep only rows whose return month is known (NaT means no next-month return yet).
df = df.loc[df['ret_date'].notna()].copy()
df
secID | ret_date | ret | rf | exret | ym | mktcap | size | rev | mom_date | mom | beta | bm | illiq | illiq_12m | vol | ivol | vol_clip | ivol_clip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2007-07 | 0.316497 | 0.002481 | 0.314016 | 2007-06 | 4.266117e+10 | 24.476555 | NaN | NaT | NaN | 0.4614 | 0.123739 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 000001.XSHE | 2007-08 | 0.048855 | 0.002404 | 0.046451 | 2007-07 | 5.616330e+10 | 24.751529 | 0.314016 | 2007-06 | NaN | 0.6423 | 0.093992 | 0.000040 | NaN | 0.041604 | NaN | 0.041604 | NaN |
2 | 000001.XSHE | 2007-09 | 0.052105 | 0.002621 | 0.049484 | 2007-08 | 5.890714e+10 | 24.799228 | 0.046451 | 2007-07 | NaN | 0.7722 | 0.097085 | 0.000020 | NaN | 0.033926 | NaN | 0.033926 | NaN |
3 | 000001.XSHE | 2007-10 | 0.201851 | 0.003095 | 0.198756 | 2007-09 | 6.197651e+10 | 24.850021 | 0.049484 | 2007-08 | NaN | 0.7596 | 0.092276 | 0.000025 | NaN | 0.023872 | NaN | 0.023872 | NaN |
4 | 000001.XSHE | 2007-11 | -0.249116 | 0.003780 | -0.252896 | 2007-10 | 7.448652e+10 | 25.033884 | 0.198756 | 2007-09 | NaN | 0.7988 | 0.083411 | 0.000030 | NaN | 0.035921 | NaN | 0.035921 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
616457 | 689009.XSHG | 2023-12 | -0.103927 | 0.002068 | -0.105996 | 2023-11 | 1.732706e+10 | 23.575535 | 0.007540 | 2023-10 | -0.017675 | 0.9541 | 0.221803 | 0.000086 | 0.000115 | 0.017594 | 0.015128 | 0.017594 | 0.015128 |
616458 | 689009.XSHG | 2024-01 | -0.213082 | 0.001902 | -0.214983 | 2023-12 | 1.552630e+10 | 23.465801 | -0.105996 | 2023-11 | 0.085602 | 1.0448 | 0.247525 | 0.000110 | 0.000107 | 0.024634 | 0.018228 | 0.024634 | 0.018228 |
616459 | 689009.XSHG | 2024-02 | 0.298201 | 0.001749 | 0.296451 | 2024-01 | 1.221793e+10 | 23.226170 | -0.214983 | 2023-12 | -0.106357 | 1.2314 | 0.313607 | 0.000184 | 0.000116 | 0.024607 | 0.013890 | 0.024607 | 0.013890 |
616460 | 689009.XSHG | 2024-03 | -0.011551 | 0.001783 | -0.013334 | 2024-02 | 1.586132e+10 | 23.487149 | 0.296451 | 2024-01 | -0.292727 | 1.4905 | 0.241569 | 0.000164 | 0.000120 | 0.044243 | 0.024755 | 0.044243 | 0.024755 |
616461 | 689009.XSHG | 2024-04 | -0.071786 | 0.001687 | -0.073474 | 2024-03 | 1.543851e+10 | 23.460131 | -0.013334 | 2024-02 | -0.195005 | 1.5477 | 0.247127 | 0.000085 | 0.000118 | 0.030206 | 0.022928 | 0.030206 | 0.022928 |
611164 rows × 19 columns
momentum 从 2008-01 开始。简单起见,把所有数据调整为从2008-01开始。
df.loc[~df['mom'].isna(),'ret_date'].min()
Period('2008-01', 'M')
# Restrict the panel to 2008-01 onward (first month with momentum), then
# re-check missingness.
df = df.loc[df['ret_date'] >= '2008-01'].copy()
for column in df:
    print(column, df[column].isna().sum())
secID 0 ret_date 0 ret 17713 rf 0 exret 17713 ym 0 mktcap 17015 size 17015 rev 21679 mom_date 3931 mom 43462 beta 24044 bm 5017 illiq 30823 illiq_12m 95560 vol 19966 ivol 21978 vol_clip 19966 ivol_clip 21978
剩余的NA值有至少三个来源:
- 由于停牌日期填充造成,
- 由于计算时要求最低样本数造成,
- 由优矿直接给出了NA值
return 的 NA 值直接删除
# Drop stock-months with no realized return.
df = df.loc[df['ret'].notna()].copy()
df
secID | ret_date | ret | rf | exret | ym | mktcap | size | rev | mom_date | mom | beta | bm | illiq | illiq_12m | vol | ivol | vol_clip | ivol_clip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6 | 000001.XSHE | 2008-01 | -0.137306 | 0.002949 | -0.140255 | 2007-12 | 6.574629e+10 | 24.909069 | 0.066834 | 2007-11 | NaN | 0.9468 | 0.094476 | 0.000025 | NaN | 0.026541 | NaN | 0.026541 | NaN |
7 | 000001.XSHE | 2008-02 | -0.004504 | 0.002946 | -0.007450 | 2008-01 | 5.850212e+10 | 24.792329 | -0.140255 | 2007-12 | NaN | 0.9654 | 0.109513 | 0.000039 | NaN | 0.037722 | 0.012909 | 0.037722 | 0.012909 |
8 | 000001.XSHE | 2008-03 | -0.149321 | 0.002746 | -0.152068 | 2008-02 | 5.823860e+10 | 24.787814 | -0.007450 | 2008-01 | NaN | 1.0292 | 0.110009 | 0.000064 | NaN | 0.041448 | 0.009032 | 0.041448 | 0.009032 |
9 | 000001.XSHE | 2008-04 | 0.050355 | 0.002862 | 0.047493 | 2008-03 | 4.954234e+10 | 24.626093 | -0.152068 | 2008-02 | NaN | 1.0238 | 0.201102 | 0.000043 | NaN | 0.045109 | 0.021484 | 0.045109 | 0.021484 |
10 | 000001.XSHE | 2008-05 | -0.148211 | 0.002953 | -0.151164 | 2008-04 | 5.203702e+10 | 24.675221 | 0.047493 | 2008-03 | NaN | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 0.046323 | 0.015098 | 0.046323 | 0.015098 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
616457 | 689009.XSHG | 2023-12 | -0.103927 | 0.002068 | -0.105996 | 2023-11 | 1.732706e+10 | 23.575535 | 0.007540 | 2023-10 | -0.017675 | 0.9541 | 0.221803 | 0.000086 | 0.000115 | 0.017594 | 0.015128 | 0.017594 | 0.015128 |
616458 | 689009.XSHG | 2024-01 | -0.213082 | 0.001902 | -0.214983 | 2023-12 | 1.552630e+10 | 23.465801 | -0.105996 | 2023-11 | 0.085602 | 1.0448 | 0.247525 | 0.000110 | 0.000107 | 0.024634 | 0.018228 | 0.024634 | 0.018228 |
616459 | 689009.XSHG | 2024-02 | 0.298201 | 0.001749 | 0.296451 | 2024-01 | 1.221793e+10 | 23.226170 | -0.214983 | 2023-12 | -0.106357 | 1.2314 | 0.313607 | 0.000184 | 0.000116 | 0.024607 | 0.013890 | 0.024607 | 0.013890 |
616460 | 689009.XSHG | 2024-03 | -0.011551 | 0.001783 | -0.013334 | 2024-02 | 1.586132e+10 | 23.487149 | 0.296451 | 2024-01 | -0.292727 | 1.4905 | 0.241569 | 0.000164 | 0.000120 | 0.044243 | 0.024755 | 0.044243 | 0.024755 |
616461 | 689009.XSHG | 2024-04 | -0.071786 | 0.001687 | -0.073474 | 2024-03 | 1.543851e+10 | 23.460131 | -0.013334 | 2024-02 | -0.195005 | 1.5477 | 0.247127 | 0.000085 | 0.000118 | 0.030206 | 0.022928 | 0.030206 | 0.022928 |
580482 rows × 19 columns
# Re-count missing values after the return filter.
for column in df:
    print(column, df[column].isna().sum())
secID 0 ret_date 0 ret 0 rf 0 exret 0 ym 0 mktcap 0 size 0 rev 4664 mom_date 3931 mom 43434 beta 22149 bm 1463 illiq 13508 illiq_12m 79369 vol 2874 ivol 4699 vol_clip 2874 ivol_clip 4699
# Discard columns that duplicate kept information (size covers mktcap,
# unclipped vol/ivol are kept instead of the clipped versions).
df.drop(columns=['mom_date','mktcap','vol_clip','ivol_clip'], inplace=True)
# Excess return is the target, so the raw return and risk-free rate can go.
df.drop(columns=['ret','rf'], inplace=True)
df.reset_index(drop=True, inplace=True)
df
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 24.909069 | 0.066834 | NaN | 0.9468 | 0.094476 | 0.000025 | NaN | 0.026541 | NaN |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 24.792329 | -0.140255 | NaN | 0.9654 | 0.109513 | 0.000039 | NaN | 0.037722 | 0.012909 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 24.787814 | -0.007450 | NaN | 1.0292 | 0.110009 | 0.000064 | NaN | 0.041448 | 0.009032 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 24.626093 | -0.152068 | NaN | 1.0238 | 0.201102 | 0.000043 | NaN | 0.045109 | 0.021484 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 24.675221 | 0.047493 | NaN | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 0.046323 | 0.015098 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
580477 | 689009.XSHG | 2023-12 | -0.105996 | 2023-11 | 23.575535 | 0.007540 | -0.017675 | 0.9541 | 0.221803 | 0.000086 | 0.000115 | 0.017594 | 0.015128 |
580478 | 689009.XSHG | 2024-01 | -0.214983 | 2023-12 | 23.465801 | -0.105996 | 0.085602 | 1.0448 | 0.247525 | 0.000110 | 0.000107 | 0.024634 | 0.018228 |
580479 | 689009.XSHG | 2024-02 | 0.296451 | 2024-01 | 23.226170 | -0.214983 | -0.106357 | 1.2314 | 0.313607 | 0.000184 | 0.000116 | 0.024607 | 0.013890 |
580480 | 689009.XSHG | 2024-03 | -0.013334 | 2024-02 | 23.487149 | 0.296451 | -0.292727 | 1.4905 | 0.241569 | 0.000164 | 0.000120 | 0.044243 | 0.024755 |
580481 | 689009.XSHG | 2024-04 | -0.073474 | 2024-03 | 23.460131 | -0.013334 | -0.195005 | 1.5477 | 0.247127 | 0.000085 | 0.000118 | 0.030206 | 0.022928 |
580482 rows × 13 columns
- reversal 的 NA 是由于在对应的return date,上个月停牌所以没有上个月的return。
- beta, bm 是优矿的NA。可以用当月的横截面上的中值填充
- illiq, ivol, vol 也可用当月的横截面上的中值填充.
# Missing-value counts for the remaining predictor columns.
for column in df:
    print(column, df[column].isna().sum())
secID 0 ret_date 0 exret 0 ym 0 size 0 rev 4664 mom 43434 beta 22149 bm 1463 illiq 13508 illiq_12m 79369 vol 2874 ivol 4699
# Drop rows where reversal is missing; the other factor NAs below are filled
# with cross-sectional medians.
df = df.loc[df['rev'].notna()].copy()
cols = ['mom','beta','bm','illiq','illiq_12m','vol','ivol']
df
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 24.909069 | 0.066834 | NaN | 0.9468 | 0.094476 | 0.000025 | NaN | 0.026541 | NaN |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 24.792329 | -0.140255 | NaN | 0.9654 | 0.109513 | 0.000039 | NaN | 0.037722 | 0.012909 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 24.787814 | -0.007450 | NaN | 1.0292 | 0.110009 | 0.000064 | NaN | 0.041448 | 0.009032 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 24.626093 | -0.152068 | NaN | 1.0238 | 0.201102 | 0.000043 | NaN | 0.045109 | 0.021484 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 24.675221 | 0.047493 | NaN | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 0.046323 | 0.015098 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
580477 | 689009.XSHG | 2023-12 | -0.105996 | 2023-11 | 23.575535 | 0.007540 | -0.017675 | 0.9541 | 0.221803 | 0.000086 | 0.000115 | 0.017594 | 0.015128 |
580478 | 689009.XSHG | 2024-01 | -0.214983 | 2023-12 | 23.465801 | -0.105996 | 0.085602 | 1.0448 | 0.247525 | 0.000110 | 0.000107 | 0.024634 | 0.018228 |
580479 | 689009.XSHG | 2024-02 | 0.296451 | 2024-01 | 23.226170 | -0.214983 | -0.106357 | 1.2314 | 0.313607 | 0.000184 | 0.000116 | 0.024607 | 0.013890 |
580480 | 689009.XSHG | 2024-03 | -0.013334 | 2024-02 | 23.487149 | 0.296451 | -0.292727 | 1.4905 | 0.241569 | 0.000164 | 0.000120 | 0.044243 | 0.024755 |
580481 | 689009.XSHG | 2024-04 | -0.073474 | 2024-03 | 23.460131 | -0.013334 | -0.195005 | 1.5477 | 0.247127 | 0.000085 | 0.000118 | 0.030206 | 0.022928 |
575818 rows × 13 columns
# Within each return month, fill factor NAs with that month's cross-sectional
# median.  (NOTE(review): `as_index=False` has no effect on `transform`, which
# always returns a frame aligned to df's index — harmless but superfluous.)
temp = df.groupby('ret_date',as_index=False)[cols].transform(lambda x: x.fillna(x.median()))
# Months where an entire column is NA have no median to use; fall back to 0,
# which is also the cross-sectional "neutral" value after rank-mapping below.
temp.fillna(0, inplace=True)
temp
mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|
0 | 0.796305 | 0.9468 | 0.094476 | 0.000025 | 0.000502 | 0.026541 | 0.000000 |
1 | 1.145639 | 0.9654 | 0.109513 | 0.000039 | 0.000478 | 0.037722 | 0.012909 |
2 | 0.693690 | 1.0292 | 0.110009 | 0.000064 | 0.000474 | 0.041448 | 0.009032 |
3 | 0.558575 | 1.0238 | 0.201102 | 0.000043 | 0.000528 | 0.045109 | 0.021484 |
4 | -0.048874 | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 0.046323 | 0.015098 |
... | ... | ... | ... | ... | ... | ... | ... |
580477 | -0.017675 | 0.9541 | 0.221803 | 0.000086 | 0.000115 | 0.017594 | 0.015128 |
580478 | 0.085602 | 1.0448 | 0.247525 | 0.000110 | 0.000107 | 0.024634 | 0.018228 |
580479 | -0.106357 | 1.2314 | 0.313607 | 0.000184 | 0.000116 | 0.024607 | 0.013890 |
580480 | -0.292727 | 1.4905 | 0.241569 | 0.000164 | 0.000120 | 0.044243 | 0.024755 |
580481 | -0.195005 | 1.5477 | 0.247127 | 0.000085 | 0.000118 | 0.030206 | 0.022928 |
575818 rows × 7 columns
# Write the median-filled factor columns back into df; temp carries df's own
# row index, so this assignment aligns one-to-one.
df[cols] = temp.copy()
# Verify that no missing values remain in any column.
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0 ret_date 0 exret 0 ym 0 size 0 rev 0 mom 0 beta 0 bm 0 illiq 0 illiq_12m 0 vol 0 ivol 0
df
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 24.909069 | 0.066834 | 0.796305 | 0.9468 | 0.094476 | 0.000025 | 0.000502 | 0.026541 | 0.000000 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 24.792329 | -0.140255 | 1.145639 | 0.9654 | 0.109513 | 0.000039 | 0.000478 | 0.037722 | 0.012909 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 24.787814 | -0.007450 | 0.693690 | 1.0292 | 0.110009 | 0.000064 | 0.000474 | 0.041448 | 0.009032 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 24.626093 | -0.152068 | 0.558575 | 1.0238 | 0.201102 | 0.000043 | 0.000528 | 0.045109 | 0.021484 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 24.675221 | 0.047493 | -0.048874 | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 0.046323 | 0.015098 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
580477 | 689009.XSHG | 2023-12 | -0.105996 | 2023-11 | 23.575535 | 0.007540 | -0.017675 | 0.9541 | 0.221803 | 0.000086 | 0.000115 | 0.017594 | 0.015128 |
580478 | 689009.XSHG | 2024-01 | -0.214983 | 2023-12 | 23.465801 | -0.105996 | 0.085602 | 1.0448 | 0.247525 | 0.000110 | 0.000107 | 0.024634 | 0.018228 |
580479 | 689009.XSHG | 2024-02 | 0.296451 | 2024-01 | 23.226170 | -0.214983 | -0.106357 | 1.2314 | 0.313607 | 0.000184 | 0.000116 | 0.024607 | 0.013890 |
580480 | 689009.XSHG | 2024-03 | -0.013334 | 2024-02 | 23.487149 | 0.296451 | -0.292727 | 1.4905 | 0.241569 | 0.000164 | 0.000120 | 0.044243 | 0.024755 |
580481 | 689009.XSHG | 2024-04 | -0.073474 | 2024-03 | 23.460131 | -0.013334 | -0.195005 | 1.5477 | 0.247127 | 0.000085 | 0.000118 | 0.030206 | 0.022928 |
575818 rows × 13 columns
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 575818 entries, 0 to 580481 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 secID 575818 non-null object 1 ret_date 575818 non-null period[M] 2 exret 575818 non-null float64 3 ym 575818 non-null period[M] 4 size 575818 non-null float64 5 rev 575818 non-null float64 6 mom 575818 non-null float64 7 beta 575818 non-null float64 8 bm 575818 non-null float64 9 illiq 575818 non-null float64 10 illiq_12m 575818 non-null float64 11 vol 575818 non-null float64 12 ivol 575818 non-null float64 dtypes: float64(10), object(1), period[M](2) memory usage: 61.5+ MB
Use rank instead of numerical values¶
$c^r_{i,t}$ is the original value, $CSrank$ ranks the value with other firms in the same month t
def csrank(df):
    """Cross-sectional rank mapped linearly into the open interval (-1, 1).

    Rank r out of n observations becomes 2*r/(n+1) - 1, so the smallest
    value maps near -1, the median to 0, and the largest near +1.
    """
    n = len(df)
    return df.rank().mul(2).div(n + 1).sub(1)
# Predictor columns: every numeric column except the target excess return.
num_X_cols = [c for c in df.select_dtypes('number').columns if c != 'exret']
num_X_cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
df[['ret_date']+num_X_cols]
ret_date | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2008-01 | 24.909069 | 0.066834 | 0.796305 | 0.9468 | 0.094476 | 0.000025 | 0.000502 | 0.026541 | 0.000000 |
1 | 2008-02 | 24.792329 | -0.140255 | 1.145639 | 0.9654 | 0.109513 | 0.000039 | 0.000478 | 0.037722 | 0.012909 |
2 | 2008-03 | 24.787814 | -0.007450 | 0.693690 | 1.0292 | 0.110009 | 0.000064 | 0.000474 | 0.041448 | 0.009032 |
3 | 2008-04 | 24.626093 | -0.152068 | 0.558575 | 1.0238 | 0.201102 | 0.000043 | 0.000528 | 0.045109 | 0.021484 |
4 | 2008-05 | 24.675221 | 0.047493 | -0.048874 | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 0.046323 | 0.015098 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
580477 | 2023-12 | 23.575535 | 0.007540 | -0.017675 | 0.9541 | 0.221803 | 0.000086 | 0.000115 | 0.017594 | 0.015128 |
580478 | 2024-01 | 23.465801 | -0.105996 | 0.085602 | 1.0448 | 0.247525 | 0.000110 | 0.000107 | 0.024634 | 0.018228 |
580479 | 2024-02 | 23.226170 | -0.214983 | -0.106357 | 1.2314 | 0.313607 | 0.000184 | 0.000116 | 0.024607 | 0.013890 |
580480 | 2024-03 | 23.487149 | 0.296451 | -0.292727 | 1.4905 | 0.241569 | 0.000164 | 0.000120 | 0.044243 | 0.024755 |
580481 | 2024-04 | 23.460131 | -0.013334 | -0.195005 | 1.5477 | 0.247127 | 0.000085 | 0.000118 | 0.030206 | 0.022928 |
575818 rows × 10 columns
df[['ret_date','size']].groupby('ret_date',group_keys=True).apply(csrank)
ret_date | size | ||
---|---|---|---|
ret_date | |||
2008-01 | 0 | 0.0 | 0.969559 |
196 | 0.0 | 0.990868 | |
701 | 0.0 | 0.522070 | |
1632 | 0.0 | 0.678843 | |
1828 | 0.0 | -0.231355 | |
... | ... | ... | ... |
2024-04 | 580322 | 0.0 | -0.224050 |
580355 | 0.0 | -0.121426 | |
580394 | 0.0 | 0.845672 | |
580439 | 0.0 | 0.952213 | |
580481 | 0.0 | 0.705445 |
575818 rows × 2 columns
# Rank every predictor cross-sectionally within each return month.
# group_keys=True puts 'ret_date' into level 0 of the result's MultiIndex.
temp = df[['ret_date']+num_X_cols].groupby('ret_date',group_keys=True).apply(csrank)
temp
ret_date | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | ||
---|---|---|---|---|---|---|---|---|---|---|---|
ret_date | |||||||||||
2008-01 | 0 | 0.0 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 |
196 | 0.0 | 0.990868 | -0.990868 | 0.852359 | 0.662100 | -0.375951 | -0.996956 | -0.989346 | 0.745814 | 0.000000 | |
701 | 0.0 | 0.522070 | -0.972603 | 0.552511 | 0.523592 | 0.283105 | -0.223744 | -0.595129 | 0.814307 | 0.000000 | |
1632 | 0.0 | 0.678843 | -0.506849 | -0.517504 | 0.775495 | -0.636225 | -0.698630 | -0.511416 | 0.493151 | 0.000000 | |
1828 | 0.0 | -0.231355 | -0.945967 | 0.709285 | 0.000000 | -0.403349 | 0.000000 | 0.000000 | -0.982496 | 0.000000 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2024-04 | 580322 | 0.0 | -0.224050 | 0.788876 | 0.872699 | 0.518997 | -0.316882 | 0.229926 | 0.310223 | -0.073247 | 0.164121 |
580355 | 0.0 | -0.121426 | 0.336859 | -0.942812 | 0.849197 | -0.210732 | 0.381120 | 0.228750 | 0.623580 | 0.095574 | |
580394 | 0.0 | 0.845672 | -0.481394 | 0.279279 | 0.686251 | 0.255778 | -0.156287 | -0.175872 | -0.578143 | -0.592244 | |
580439 | 0.0 | 0.952213 | -0.869957 | 0.562867 | 0.264787 | -0.171563 | -0.974540 | -0.984332 | -0.707795 | -0.609871 | |
580481 | 0.0 | 0.705445 | -0.363494 | 0.056404 | 0.515864 | -0.662554 | -0.621622 | -0.574226 | 0.361535 | 0.578143 |
575818 rows × 10 columns
temp.drop('ret_date',axis=1).reset_index()
ret_date | level_1 | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2008-01 | 0 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 |
1 | 2008-01 | 196 | 0.990868 | -0.990868 | 0.852359 | 0.662100 | -0.375951 | -0.996956 | -0.989346 | 0.745814 | 0.000000 |
2 | 2008-01 | 701 | 0.522070 | -0.972603 | 0.552511 | 0.523592 | 0.283105 | -0.223744 | -0.595129 | 0.814307 | 0.000000 |
3 | 2008-01 | 1632 | 0.678843 | -0.506849 | -0.517504 | 0.775495 | -0.636225 | -0.698630 | -0.511416 | 0.493151 | 0.000000 |
4 | 2008-01 | 1828 | -0.231355 | -0.945967 | 0.709285 | 0.000000 | -0.403349 | 0.000000 | 0.000000 | -0.982496 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
575813 | 2024-04 | 580322 | -0.224050 | 0.788876 | 0.872699 | 0.518997 | -0.316882 | 0.229926 | 0.310223 | -0.073247 | 0.164121 |
575814 | 2024-04 | 580355 | -0.121426 | 0.336859 | -0.942812 | 0.849197 | -0.210732 | 0.381120 | 0.228750 | 0.623580 | 0.095574 |
575815 | 2024-04 | 580394 | 0.845672 | -0.481394 | 0.279279 | 0.686251 | 0.255778 | -0.156287 | -0.175872 | -0.578143 | -0.592244 |
575816 | 2024-04 | 580439 | 0.952213 | -0.869957 | 0.562867 | 0.264787 | -0.171563 | -0.974540 | -0.984332 | -0.707795 | -0.609871 |
575817 | 2024-04 | 580481 | 0.705445 | -0.363494 | 0.056404 | 0.515864 | -0.662554 | -0.621622 | -0.574226 | 0.361535 | 0.578143 |
575818 rows × 11 columns
# Drop the ranked 'ret_date' column (constant within a month, so it ranked to
# 0), pull the group keys out of the MultiIndex, and restore the original row
# labels ('level_1') as the index so temp aligns with df for the merge below.
temp = temp.drop('ret_date',axis=1).reset_index().set_index('level_1')
temp
ret_date | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|
level_1 | ||||||||||
0 | 2008-01 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 |
196 | 2008-01 | 0.990868 | -0.990868 | 0.852359 | 0.662100 | -0.375951 | -0.996956 | -0.989346 | 0.745814 | 0.000000 |
701 | 2008-01 | 0.522070 | -0.972603 | 0.552511 | 0.523592 | 0.283105 | -0.223744 | -0.595129 | 0.814307 | 0.000000 |
1632 | 2008-01 | 0.678843 | -0.506849 | -0.517504 | 0.775495 | -0.636225 | -0.698630 | -0.511416 | 0.493151 | 0.000000 |
1828 | 2008-01 | -0.231355 | -0.945967 | 0.709285 | 0.000000 | -0.403349 | 0.000000 | 0.000000 | -0.982496 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
580322 | 2024-04 | -0.224050 | 0.788876 | 0.872699 | 0.518997 | -0.316882 | 0.229926 | 0.310223 | -0.073247 | 0.164121 |
580355 | 2024-04 | -0.121426 | 0.336859 | -0.942812 | 0.849197 | -0.210732 | 0.381120 | 0.228750 | 0.623580 | 0.095574 |
580394 | 2024-04 | 0.845672 | -0.481394 | 0.279279 | 0.686251 | 0.255778 | -0.156287 | -0.175872 | -0.578143 | -0.592244 |
580439 | 2024-04 | 0.952213 | -0.869957 | 0.562867 | 0.264787 | -0.171563 | -0.974540 | -0.984332 | -0.707795 | -0.609871 |
580481 | 2024-04 | 0.705445 | -0.363494 | 0.056404 | 0.515864 | -0.662554 | -0.621622 | -0.574226 | 0.361535 | 0.578143 |
575818 rows × 10 columns
# Join the ranked predictors back onto the non-predictor columns by row label.
df_rank = df.drop(num_X_cols, axis=1).merge(temp.drop('ret_date', axis=1),
                                            left_index=True, right_index=True)
del temp
df_rank
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.971536 | -0.451685 | 0.000000 | -0.170037 | -0.613483 | -0.959551 | 0.000000 | -0.635955 | -0.791760 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.967335 | -0.700074 | 0.000000 | 0.345212 | -0.557535 | -0.953972 | 0.000000 | 0.437268 | -0.625835 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969027 | 0.443953 | 0.000000 | 0.048673 | -0.112094 | -0.974926 | 0.000000 | 0.241888 | 0.306785 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.964549 | 0.545052 | 0.000000 | -0.264402 | -0.258493 | -0.970458 | -0.976366 | -0.704579 | -0.497784 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
580477 | 689009.XSHG | 2023-12 | -0.105996 | 2023-11 | 0.719298 | -0.186280 | 0.045535 | 0.381234 | -0.645969 | -0.519022 | -0.530455 | 0.114528 | 0.413365 |
580478 | 689009.XSHG | 2024-01 | -0.214983 | 2023-12 | 0.691854 | -0.841401 | 0.168831 | 0.515545 | -0.574577 | -0.515152 | -0.553325 | 0.515545 | 0.622983 |
580479 | 689009.XSHG | 2024-02 | 0.296451 | 2024-01 | 0.674975 | -0.066928 | -0.269872 | 0.369774 | -0.590579 | -0.492836 | -0.555643 | -0.341315 | 0.285574 |
580480 | 689009.XSHG | 2024-03 | -0.013334 | 2024-02 | 0.712774 | 0.947884 | -0.253527 | 0.546042 | -0.709444 | -0.564655 | -0.563480 | -0.303292 | 0.719436 |
580481 | 689009.XSHG | 2024-04 | -0.073474 | 2024-03 | 0.705445 | -0.363494 | 0.056404 | 0.515864 | -0.662554 | -0.621622 | -0.574226 | 0.361535 | 0.578143 |
575818 rows × 13 columns
df_rank['size'].describe()
count 5.758180e+05 mean 7.502544e-18 std 5.771543e-01 min -9.996083e-01 25% -4.998116e-01 50% 0.000000e+00 75% 4.998176e-01 max 9.996083e-01 Name: size, dtype: float64
Train, Validation, Test split¶
# Calendar year of each return month, used to build the expanding windows.
df_rank['year'] = df_rank['ret_date'].dt.year
# time_idx[k] holds the row labels belonging to the k-th year, ascending.
time_idx = [value for (key, value) in sorted(df_rank.groupby('year').groups.items())]
# Number of distinct stocks per year (sanity check on cross-section size).
df_rank.groupby('year')['secID'].nunique()
year 2008 1463 2009 1530 2010 1841 2011 2142 2012 2383 2013 2432 2014 2549 2015 2772 2016 2941 2017 3392 2018 3522 2019 3648 2020 3961 2021 4422 2022 4770 2023 5110 2024 5111 Name: secID, dtype: int64
df_rank.groupby('year')['secID'].count()
year 2008 16621 2009 17335 2010 19823 2011 23864 2012 26912 2013 28592 2014 29539 2015 31727 2016 33468 2017 37665 2018 41103 2019 41992 2020 44134 2021 49181 2022 54418 2023 59061 2024 20383 Name: secID, dtype: int64
def list_flat(list_):
    """Flatten one level of nesting: [[a, b], [c]] -> [a, b, c]."""
    flat = []
    for sublist in list_:
        flat.extend(sublist)
    return flat
# This is the same as:
# def list_flat2(list_):
# result = []
# for sublist in list_:
# for item in sublist:
# result.append(item)
# return result
list_flat([[1,2,3],[3,4,5]])
[1, 2, 3, 3, 4, 5]
np.array([[1,2,3],[3,4,5]]).flatten()
array([1, 2, 3, 3, 4, 5])
df_rank
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 | 2008 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.971536 | -0.451685 | 0.000000 | -0.170037 | -0.613483 | -0.959551 | 0.000000 | -0.635955 | -0.791760 | 2008 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.967335 | -0.700074 | 0.000000 | 0.345212 | -0.557535 | -0.953972 | 0.000000 | 0.437268 | -0.625835 | 2008 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969027 | 0.443953 | 0.000000 | 0.048673 | -0.112094 | -0.974926 | 0.000000 | 0.241888 | 0.306785 | 2008 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.964549 | 0.545052 | 0.000000 | -0.264402 | -0.258493 | -0.970458 | -0.976366 | -0.704579 | -0.497784 | 2008 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
580477 | 689009.XSHG | 2023-12 | -0.105996 | 2023-11 | 0.719298 | -0.186280 | 0.045535 | 0.381234 | -0.645969 | -0.519022 | -0.530455 | 0.114528 | 0.413365 | 2023 |
580478 | 689009.XSHG | 2024-01 | -0.214983 | 2023-12 | 0.691854 | -0.841401 | 0.168831 | 0.515545 | -0.574577 | -0.515152 | -0.553325 | 0.515545 | 0.622983 | 2024 |
580479 | 689009.XSHG | 2024-02 | 0.296451 | 2024-01 | 0.674975 | -0.066928 | -0.269872 | 0.369774 | -0.590579 | -0.492836 | -0.555643 | -0.341315 | 0.285574 | 2024 |
580480 | 689009.XSHG | 2024-03 | -0.013334 | 2024-02 | 0.712774 | 0.947884 | -0.253527 | 0.546042 | -0.709444 | -0.564655 | -0.563480 | -0.303292 | 0.719436 | 2024 |
580481 | 689009.XSHG | 2024-04 | -0.073474 | 2024-03 | 0.705445 | -0.363494 | 0.056404 | 0.515864 | -0.662554 | -0.621622 | -0.574226 | 0.361535 | 0.578143 | 2024 |
575818 rows × 14 columns
# Expanding-window training / validation / testing scheme:
# 1. train [2008-2011], validate [2012-2015], test [2016]
# 2. train [2008-2012], validate [2013-2016], test [2017]
# ...
# last. train [2008-2019], validate [2020-2023], test [2024]
# (training always starts at the first sample year, 2008.)
fulltrain_idx = []
cv_idx = []
test_idx = []
for i in range(4,len(time_idx)-4):
    train_idx = list_flat(time_idx[0:i])
    val_idx = list_flat(time_idx[i:i+4])
    fulltrain_idx.append(train_idx + val_idx)
    cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0],
                   np.where(np.isin(fulltrain_idx[-1], val_idx))[0]))
    # GridSearchCV operates on plain arrays internally and ignores the pandas
    # index, so cv_idx must hold 0-based positions within fulltrain_idx,
    # not the DataFrame row labels themselves.
    test_idx.append(time_idx[i+4])
df_rank.loc[fulltrain_idx[-1]]
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 | 2008 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.971536 | -0.451685 | 0.000000 | -0.170037 | -0.613483 | -0.959551 | 0.000000 | -0.635955 | -0.791760 | 2008 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.967335 | -0.700074 | 0.000000 | 0.345212 | -0.557535 | -0.953972 | 0.000000 | 0.437268 | -0.625835 | 2008 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969027 | 0.443953 | 0.000000 | 0.048673 | -0.112094 | -0.974926 | 0.000000 | 0.241888 | 0.306785 | 2008 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.964549 | 0.545052 | 0.000000 | -0.264402 | -0.258493 | -0.970458 | -0.976366 | -0.704579 | -0.497784 | 2008 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
580473 | 689009.XSHG | 2023-08 | -0.040985 | 2023-07 | 0.701600 | -0.517200 | -0.731600 | 0.584200 | -0.678400 | -0.577600 | -0.471200 | -0.330000 | -0.473600 | 2023 |
580474 | 689009.XSHG | 2023-09 | 0.040598 | 2023-08 | 0.708997 | 0.125000 | -0.755175 | 0.343949 | -0.653264 | -0.599920 | -0.489650 | -0.016720 | 0.261545 | 2023 |
580475 | 689009.XSHG | 2023-10 | -0.060460 | 2023-09 | 0.720055 | 0.669768 | -0.649574 | 0.496535 | -0.684815 | -0.497129 | -0.520887 | -0.263512 | -0.524055 | 2023 |
580476 | 689009.XSHG | 2023-11 | 0.007540 | 2023-10 | 0.716881 | -0.502468 | 0.138006 | 0.430207 | -0.665153 | -0.103653 | -0.522606 | -0.249358 | -0.039289 | 2023 |
580477 | 689009.XSHG | 2023-12 | -0.105996 | 2023-11 | 0.719298 | -0.186280 | 0.045535 | 0.381234 | -0.645969 | -0.519022 | -0.530455 | 0.114528 | 0.413365 | 2023 |
555435 rows × 14 columns
# Example
a = [0,1,4,5,3000]
np.where(np.isin(a, [0,3000,4]))[0]
array([0, 2, 4])
test_years = list(range(2016, 2025))
test_years
[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Evaluation metrics¶
Clark and West (2007)
def r2_oos(y_true, y_pred):
    """Out-of-sample R-squared in the style of Clark and West (2007).

    Benchmarks the model's squared prediction errors against a naive
    zero forecast (the natural benchmark for excess returns) instead of
    the historical mean, so the statistic can be negative when the model
    underperforms the zero forecast.

    Parameters
    ----------
    y_true, y_pred : array-like
        Realized and predicted excess returns.  Inputs are coerced with
        np.asarray, so plain lists and pandas Series both work.

    Returns
    -------
    float
        1 - SSE(model) / SSE(zero forecast).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)
r2_oos_scorer = make_scorer(r2_oos)
Sklearn¶
Scikit-learn (sklearn) 的设计理念:
- Estimators: 可以基于数据估计出参数的东西。用fit()估计。比如填充空值(imputer),linear regression,等等。
- Transformers (不是神经网络里的那个,不是 Vaswani et al. (2017)): 可以把数据转换成新数据的东西。用transform()转换。一般可以直接用 fit_transform()
- Predictors: 可以基于数据做预测,比如linear regression
统一的命名规范:
- hyperparameter可以由model.<hyperparameter>取出,比如model.n_estimators
- estimated parameters可以由 model.<estimate>取出,比如model.feature_importances_
数据用np.array保存,或者SciPy的稀疏矩阵。避免各类其他包的自定义(比如pandas)
给出了大量的机器学习模型,同时很容易自定义进行拓展。自定义的模型可以很方便的融入到sklearn自带的模型当中
Models¶
Linear regression¶
# Feature subset for the linear models: drop 'illiq_12m' and 'vol'
# (presumably near-duplicates of 'illiq' and 'ivol' — TODO confirm).
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']
# OLS baseline: refit once per test year on that year's full training
# sample (train + validation periods), then score out of sample.
model = LinearRegression()
for i in range(len(fulltrain_idx)):
    # Row indices of the training sample and the held-out year i.
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.009411576536744626 Test year 2017 : -0.08839433918218265 Test year 2018 : -0.04979412952068807 Test year 2019 : 0.006460501095753468 Test year 2020 : -0.001548658826626248 Test year 2021 : 0.011487385386933058 Test year 2022 : -0.0009344902234940111 Test year 2023 : 0.009191975684269216 Test year 2024 : -0.0203421002634514
# Same OLS exercise on a reduced 4-feature set.  `model` is reused
# (LinearRegression from the previous cell) and refit each iteration.
# NOTE(review): the recorded output below is identical to the 7-feature
# run above — likely a stale cell output; re-run to confirm.
cols = ['size','rev','illiq','ivol']
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.009411576536744626 Test year 2017 : -0.08839433918218265 Test year 2018 : -0.04979412952068807 Test year 2019 : 0.006460501095753468 Test year 2020 : -0.001548658826626248 Test year 2021 : 0.011487385386933058 Test year 2022 : -0.0009344902234940111 Test year 2023 : 0.009191975684269216 Test year 2024 : -0.0203421002634514
Huber regressor¶
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']
# Huber regression: robust to outliers in returns.  epsilon near 1
# behaves close to least absolute deviations; alpha is the L2 penalty.
model = HuberRegressor(alpha=0.01,epsilon=1.05)
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : 0.006876742676993675 Test year 2017 : -0.029175894616541687 Test year 2018 : 0.009048190975758708 Test year 2019 : -0.018440250950227055 Test year 2020 : -0.013739947817012599 Test year 2021 : -0.008465853786439048 Test year 2022 : 0.010892561021420222 Test year 2023 : -0.002345351762501169 Test year 2024 : 0.015101267852206668
Random Forest¶
cols = num_X_cols
cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
# Random-forest hyper-parameter grid (kept small for runtime).
hyperparam_grid = [
    {'n_estimators': [50], 'max_depth': [3,5,7],
     'max_features': [3,5]}
]
model = RandomForestRegressor(random_state=42)
# Cross validation for period 0, i.e.
# train: [2008-2011], val: [2012-2015], test: [2016]
# cv=[cv_idx[0]] supplies ONE predefined (train, val) split instead of
# k-fold, preserving the temporal ordering of the panel.
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
# Period-0 data: full training sample plus the 2016 test year.
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0], cols]
y_test = df_rank.loc[test_idx[0], 'exret']
%%time
grid_search.fit(X_fulltrain, y_fulltrain)
CPU times: user 48.4 s, sys: 456 ms, total: 48.9 s Wall time: 50.8 s
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=RandomForestRegressor(random_state=42), param_grid=[{'max_depth': [3, 5, 7], 'max_features': [3, 5], 'n_estimators': [50]}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'max_depth': 5, 'max_features': 5, 'n_estimators': 50}
# Report the validation score of every hyper-parameter combination.
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    # The scorer is r2_oos (can be negative), not a squared error, so the
    # score is printed directly; np.sqrt was a leftover RMSE idiom and
    # yields NaN for negative scores (see the RuntimeWarning later on).
    print(mean_score, params)
0.027084789556580172 {'max_depth': 3, 'max_features': 3, 'n_estimators': 50} 0.028556999189124684 {'max_depth': 3, 'max_features': 5, 'n_estimators': 50} 0.05042647627227969 {'max_depth': 5, 'max_features': 3, 'n_estimators': 50} 0.05530932712278905 {'max_depth': 5, 'max_features': 5, 'n_estimators': 50} 0.04464716807967209 {'max_depth': 7, 'max_features': 3, 'n_estimators': 50} 0.05370787731186078 {'max_depth': 7, 'max_features': 5, 'n_estimators': 50}
pd.DataFrame({"features":num_X_cols,"feature_importance":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',
ascending=False)
features | feature_importance | |
---|---|---|
5 | illiq | 0.260451 |
1 | rev | 0.212757 |
7 | vol | 0.120819 |
8 | ivol | 0.115962 |
0 | size | 0.102099 |
2 | mom | 0.067074 |
4 | bm | 0.049063 |
6 | illiq_12m | 0.044348 |
3 | beta | 0.027427 |
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.022154653336818875
%%time
# Walk-forward evaluation: re-run the grid search for each test year,
# each time with that year's predefined train/validation split.
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]],
                               scoring=r2_oos_scorer,
                               return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    # predict() uses the best estimator, refit on the full training sample.
    y_pred = grid_search.predict(X=X_test)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.022154653336818875 Test year 2017 : -0.08063812247577906 Test year 2018 : -0.03875517775769799 Test year 2019 : 0.00843188559327912 Test year 2020 : 0.00426710793978613 Test year 2021 : 0.011844755800809903 Test year 2022 : -0.002779929058047914 Test year 2023 : 0.009093821371165545 Test year 2024 : -0.017035974030240375 CPU times: user 18min 26s, sys: 10.6 s, total: 18min 37s Wall time: 19min 19s
Partial Least Squares¶
cols = num_X_cols
cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
model = PLSRegression(n_components=4)
y_pred.reshape(-1).shape
(20383,)
%%time
# Yearly out-of-sample evaluation of the fixed 4-component PLS model.
for i in range(len(fulltrain_idx)):
    rows_train, rows_test = fulltrain_idx[i], test_idx[i]
    X_fulltrain = df_rank.loc[rows_train, cols]
    y_fulltrain = df_rank.loc[rows_train, 'exret']
    X_test = df_rank.loc[rows_test, cols]
    y_test = df_rank.loc[rows_test, 'exret']
    model.fit(X_fulltrain, y_fulltrain)
    # PLSRegression.predict returns an (n, 1) array; flatten for r2_oos.
    y_pred = model.predict(X=X_test).reshape(-1)
    print("Test year", test_years[i], ":", r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.008579624491225069 Test year 2017 : -0.0932107045146382 Test year 2018 : -0.04912393839310547 Test year 2019 : 0.006105302409689872 Test year 2020 : -0.0015629368636205232 Test year 2021 : 0.011195966741407215 Test year 2022 : -0.000636538680146348 Test year 2023 : 0.009706227073746132 Test year 2024 : -0.02135918156321326 CPU times: user 10.3 s, sys: 783 ms, total: 11.1 s Wall time: 3.4 s
Principal Component Regression¶
PCA transform¶
cols = num_X_cols
cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
# Period-0 data again, this time for the PCA walkthrough.
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
# Keep the first 3 principal components of the 9 rank-transformed features.
pca = PCA(3, random_state=42)
pca.fit(X_fulltrain)
PCA(n_components=3, random_state=42)
pca.components_
array([[ 0.54095879, -0.09832092, -0.01437701, 0.13052989, 0.10593014, -0.54353902, -0.55735791, -0.16921922, -0.18577673], [ 0.1230896 , 0.28038463, 0.28945179, 0.04301217, -0.37797118, -0.22714271, -0.12955155, 0.5400808 , 0.56362587], [ 0.06274676, -0.20182262, 0.52534126, -0.60948758, -0.43840652, -0.02333879, 0.01991072, -0.30854347, -0.13975493]])
pca.components_.shape
(3, 9)
X_fulltrain.shape
(194413, 9)
pca.components_.T.shape
(9, 3)
np.matmul(X_fulltrain.values,pca.components_.T)
array([[ 1.03429184, 0.50775265, 0.41399241], [ 1.25904914, -0.35426324, 0.85397871], [ 1.13891321, 0.24847025, 0.21082479], ..., [-1.01192038, 0.71256279, -0.06717374], [-0.74623422, -0.52959361, 0.40740754], [-1.76547939, 0.46317872, 0.27584235]])
pca.fit_transform(X_fulltrain)
array([[ 1.03429184, 0.50775265, 0.41399241], [ 1.25904914, -0.35426324, 0.85397871], [ 1.13891321, 0.24847025, 0.21082479], ..., [-1.01192038, 0.71256279, -0.06717374], [-0.74623422, -0.52959361, 0.40740754], [-1.76547939, 0.46317872, 0.27584235]])
PCA regression¶
sklearn 使用 duck typing，因此无需继承，只需在定义类时实现对应的方法：fit()（返回 self）、transform()、fit_transform() 即可。但直接用继承，可以更方便：

- BaseEstimator 是 sklearn 里最基本的类，其他类都从它继承而来，提供了 set_params() 和 get_params() 方法。
- TransformerMixin 提供了 fit_transform() 方法，因此由它继承而来的话，就不用自定义 fit_transform 了。
- 类似的，RegressorMixin 提供了 score() 方法（predict() 由各模型自己实现）。
model = PCARegressor()
model.fit(X=X_fulltrain, y=y_fulltrain)
PCARegressor()
model.X_
array([[ 1.03429184, 0.50775265, 0.41399241], [ 1.25904914, -0.35426324, 0.85397871], [ 1.13891321, 0.24847025, 0.21082479], ..., [-1.01192038, 0.71256279, -0.06717374], [-0.74623422, -0.52959361, 0.40740754], [-1.76547939, 0.46317872, 0.27584235]])
# Tune the number of components of the custom PCARegressor (defined in an
# earlier cell) over 1..9, using the period-0 train/validation split.
hyperparam_grid = [
    {'n_components': range(1, len(cols)+1)}
]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=PCARegressor(), param_grid=[{'n_components': range(1, 10)}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'n_components': 6}
# Report the validation score of every n_components setting.
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    # Print the r2_oos score directly: it can be negative, and np.sqrt of
    # a negative score produced the NaN / RuntimeWarning seen below.
    print(mean_score, params)
nan {'n_components': 1} 0.03874473370176844 {'n_components': 2} 0.037587746458808476 {'n_components': 3} 0.04997889197284674 {'n_components': 4} 0.04997497095219836 {'n_components': 5} 0.05161556457239314 {'n_components': 6} 0.041827129620731866 {'n_components': 7} 0.050300834943468646 {'n_components': 8} 0.0487322179924835 {'n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt print(np.sqrt(mean_score), params)
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test,y_pred=y_pred)
-0.01057358710785472
%%time
# Walk-forward evaluation of PCARegressor with per-year component tuning.
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]],
                               scoring=r2_oos_scorer,
                               return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    # Flatten in case the underlying regressor returns a 2-D (n, 1) array.
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.01057358710785472 Test year 2017 : -0.08914596959361498 Test year 2018 : -0.04897289891375012 Test year 2019 : 0.006451234792761107 Test year 2020 : -0.00048657810249452815 Test year 2021 : 0.0095202141605224 Test year 2022 : -0.0061009049213132105 Test year 2023 : 0.008028420859995777 Test year 2024 : -0.022204856045419552 CPU times: user 1min 41s, sys: 9.21 s, total: 1min 50s Wall time: 29.1 s
Pipeline¶
# Same PCA-then-OLS model expressed as an sklearn Pipeline, so the number
# of components can be tuned via the 'step__param' naming convention.
pca = PCA()
linear_reg = LinearRegression()
pipeline = Pipeline(steps=[('pca',pca),
                           ('linear_regression', linear_reg)])
# 'pca__n_components' addresses the n_components parameter of the 'pca' step.
hyperparam_grid = {'pca__n_components': range(1,len(cols)+1)}
grid_search = GridSearchCV(pipeline, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
%%time
grid_search.fit(X=X_fulltrain,y=y_fulltrain)
CPU times: user 4.1 s, sys: 430 ms, total: 4.53 s Wall time: 1.54 s
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=Pipeline(steps=[('pca', PCA()), ('linear_regression', LinearRegression())]), param_grid={'pca__n_components': range(1, 10)}, return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'pca__n_components': 6}
# Report the validation score of every pipeline n_components setting.
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    # Print the r2_oos score directly: it can be negative, and np.sqrt of
    # a negative score produced the NaN / RuntimeWarning seen below.
    print(mean_score, params)
nan {'pca__n_components': 1} 0.03874473370176844 {'pca__n_components': 2} 0.037587746458808476 {'pca__n_components': 3} 0.04997889197284674 {'pca__n_components': 4} 0.04997497095219836 {'pca__n_components': 5} 0.05161556457239422 {'pca__n_components': 6} 0.041827129620731866 {'pca__n_components': 7} 0.050300834943468646 {'pca__n_components': 8} 0.0487322179924835 {'pca__n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt print(np.sqrt(mean_score), params)
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.01057358710785472
Elastic Net¶
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
# Elastic net via SGD: alpha scales the total penalty, l1_ratio mixes
# L1 vs L2.  NOTE(review): no random_state is set, so results vary
# slightly across runs.
model = SGDRegressor(penalty='elasticnet')
hyperparam_grid = [{'alpha':[0.001, 0.01, 0.1],
                    'l1_ratio':[0.15, 0.30, 0.5, 0.7]}]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=SGDRegressor(penalty='elasticnet'), param_grid=[{'alpha': [0.001, 0.01, 0.1], 'l1_ratio': [0.15, 0.3, 0.5, 0.7]}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'alpha': 0.001, 'l1_ratio': 0.7}
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.0110463976016415
%%time
# Walk-forward evaluation of the elastic net, re-tuned for each test year.
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]],
                               scoring=r2_oos_scorer,
                               return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    # Flatten defensively; SGDRegressor.predict is already 1-D.
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.012516057005940606 Test year 2017 : -0.10558980314809396 Test year 2018 : -0.038706726553651816 Test year 2019 : 0.008269590365803325 Test year 2020 : 0.006087524412165979 Test year 2021 : 0.009994658939758372 Test year 2022 : -0.005719992513779193 Test year 2023 : 0.010637533125030907 Test year 2024 : -0.013783028576732859 CPU times: user 1min 41s, sys: 3.04 s, total: 1min 44s Wall time: 43.1 s
Gradient Boosted Regression Trees¶
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
# Gradient boosting: tune tree depth and learning rate jointly
# (lower learning rates typically want deeper/more trees).
hyperparam_grid = [
    {'max_depth': [1,2,3,4,5,6],
     'learning_rate': [0.1, 0.05, 0.01]}
]
model = GradientBoostingRegressor()
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
%%time
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
CPU times: user 9min 37s, sys: 11 s, total: 9min 48s Wall time: 10min 36s
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=GradientBoostingRegressor(), param_grid=[{'learning_rate': [0.1, 0.05, 0.01], 'max_depth': [1, 2, 3, 4, 5, 6]}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'learning_rate': 0.1, 'max_depth': 3}
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
cv_results['params']):
print(mean_score, params)
0.0010920514714541918 {'learning_rate': 0.1, 'max_depth': 1} 0.0024649624915880075 {'learning_rate': 0.1, 'max_depth': 2} 0.005179742214071581 {'learning_rate': 0.1, 'max_depth': 3} 0.0027306548772843 {'learning_rate': 0.1, 'max_depth': 4} 0.0002629712278338081 {'learning_rate': 0.1, 'max_depth': 5} -0.0016882062465084502 {'learning_rate': 0.1, 'max_depth': 6} 0.0005808239346339894 {'learning_rate': 0.05, 'max_depth': 1} 0.0018532340083319276 {'learning_rate': 0.05, 'max_depth': 2} 0.003954208330685383 {'learning_rate': 0.05, 'max_depth': 3} 0.004945961987173897 {'learning_rate': 0.05, 'max_depth': 4} 0.003509540482510065 {'learning_rate': 0.05, 'max_depth': 5} 0.0011919994497078257 {'learning_rate': 0.05, 'max_depth': 6} -0.0012377392747824345 {'learning_rate': 0.01, 'max_depth': 1} -0.00038711061142104874 {'learning_rate': 0.01, 'max_depth': 2} 0.0010593345400554677 {'learning_rate': 0.01, 'max_depth': 3} 0.003114775494288402 {'learning_rate': 0.01, 'max_depth': 4} 0.003665138899025089 {'learning_rate': 0.01, 'max_depth': 5} 0.003195682240093589 {'learning_rate': 0.01, 'max_depth': 6}
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.04906362476677084
Neural Nets¶
tf.__version__
'2.8.0'
keras.__version__
'2.8.0'
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
# Split the full training sample into train/validation with the same
# predefined period-0 index pair used by GridSearchCV (cv_idx[0] is a
# (train_positions, val_positions) tuple).
X_train = X_fulltrain.values[cv_idx[0][0]]
y_train = y_fulltrain.values[cv_idx[0][0]]
X_val = X_fulltrain.values[cv_idx[0][1]]
y_val = y_fulltrain.values[cv_idx[0][1]]
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
X_train.shape
(77643, 9)
X_val.shape
(116770, 9)
# Small feed-forward net: 9 inputs -> 8 -> 4 -> 1 (linear output for
# excess-return regression).
nn_model = keras.models.Sequential()
nn_model.add(keras.layers.InputLayer(input_shape=[X_fulltrain.shape[1]]))
nn_model.add(keras.layers.Dense(8, activation='relu'))
nn_model.add(keras.layers.Dense(4, activation='relu'))
nn_model.add(keras.layers.Dense(1))
2024-05-13 10:01:17.440490: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
# MSE loss with plain SGD (keras default learning rate); validation loss
# is monitored on the held-out period-0 validation years.
nn_model.compile(loss='mse',optimizer='sgd')
nn_model.fit(X_train, y_train, epochs=10,
             validation_data=(X_val,y_val))
Epoch 1/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0275 Epoch 2/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0277 Epoch 3/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0278 Epoch 4/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0277 Epoch 5/10 2427/2427 [==============================] - 4s 2ms/step - loss: 0.0223 - val_loss: 0.0274 Epoch 6/10 2427/2427 [==============================] - 7s 3ms/step - loss: 0.0223 - val_loss: 0.0273 Epoch 7/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276 Epoch 8/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276 Epoch 9/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276 Epoch 10/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276
<keras.callbacks.History at 0x7f7d85059d90>
y_pred = nn_model.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
0.003361474358112737
GridSeachCV Neural Nets¶
def build_model(learning_rate=0.003, n_features=9):
    """Build and compile the small feed-forward net used above.

    Factory function in the form expected by KerasRegressor, so that
    `learning_rate` can be tuned with GridSearchCV.

    Parameters
    ----------
    learning_rate : float, default 0.003
        Learning rate of the SGD optimizer.
    n_features : int, default 9
        Width of the input layer (was hard-coded to 9; the default keeps
        existing callers unchanged).

    Returns
    -------
    A compiled keras Sequential model (MSE loss, SGD optimizer).
    """
    nn_model = keras.models.Sequential()
    nn_model.add(keras.layers.InputLayer(input_shape=[n_features]))
    nn_model.add(keras.layers.Dense(8, activation='relu'))
    nn_model.add(keras.layers.Dense(4, activation='relu'))
    nn_model.add(keras.layers.Dense(1))
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    nn_model.compile(loss="mse", optimizer=optimizer)
    return nn_model
# from scikeras.wrappers import KerasRegressor
# keras_reg = KerasRegressor(build_model)
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/3997882518.py:3: DeprecationWarning: KerasRegressor is deprecated, use Sci-Keras (https://github.com/adriangb/scikeras) instead. See https://www.adriangb.com/scikeras/stable/migration.html for help migrating. keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
# Tune only the learning rate; epochs and validation_data are forwarded
# to keras fit() for every candidate.
hyperparams_grid = {
    'learning_rate':[0.003,0.001]
}
nn_search_cv = GridSearchCV(keras_reg, hyperparams_grid, cv=[cv_idx[0]])
# NOTE(review): X_val/y_val here overlap GridSearchCV's own validation
# fold (both come from cv_idx[0]) — it only drives keras's val_loss
# printout, not the model selection.
nn_search_cv.fit(X_fulltrain, y_fulltrain, epochs=10,
                 validation_data=(X_val,y_val))
Epoch 1/10 2427/2427 [==============================] - 6s 2ms/step - loss: 0.0346 - val_loss: 0.0313 Epoch 2/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0249 - val_loss: 0.0295 Epoch 3/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0235 - val_loss: 0.0282 Epoch 4/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - val_loss: 0.0279 Epoch 5/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0227 - val_loss: 0.0279 Epoch 6/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0226 - val_loss: 0.0277 Epoch 7/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0225 - val_loss: 0.0277 Epoch 8/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0224 - val_loss: 0.0275 Epoch 9/10 2427/2427 [==============================] - 7s 3ms/step - loss: 0.0224 - val_loss: 0.0278 Epoch 10/10 2427/2427 [==============================] - 6s 3ms/step - loss: 0.0224 - val_loss: 0.0275 3650/3650 [==============================] - 5s 1ms/step - loss: 0.0275 Epoch 1/10 2427/2427 [==============================] - 6s 2ms/step - loss: 0.0432 - val_loss: 0.0342 Epoch 2/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0278 - val_loss: 0.0303 Epoch 3/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0251 - val_loss: 0.0291 Epoch 4/10 2427/2427 [==============================] - 6s 2ms/step - loss: 0.0241 - val_loss: 0.0286 Epoch 5/10 2427/2427 [==============================] - 6s 3ms/step - loss: 0.0236 - val_loss: 0.0283 Epoch 6/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0233 - val_loss: 0.0281 Epoch 7/10 2427/2427 [==============================] - 6s 2ms/step - loss: 0.0232 - val_loss: 0.0280 Epoch 8/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0230 - val_loss: 0.0280 Epoch 9/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - 
val_loss: 0.0279 Epoch 10/10 2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - val_loss: 0.0279 3650/3650 [==============================] - 5s 1ms/step - loss: 0.0279 Epoch 1/10 6076/6076 [==============================] - 10s 2ms/step - loss: 0.0297 - val_loss: 0.0273 Epoch 2/10 6076/6076 [==============================] - 10s 2ms/step - loss: 0.0255 - val_loss: 0.0270 Epoch 3/10 6076/6076 [==============================] - 9s 1ms/step - loss: 0.0253 - val_loss: 0.0269 Epoch 4/10 6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0268 Epoch 5/10 6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0268 Epoch 6/10 6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0268 Epoch 7/10 6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0268 Epoch 8/10 6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0269 Epoch 9/10 6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0269 Epoch 10/10 6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0267
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x7f7d851852b0>, param_grid={'learning_rate': [0.003, 0.001]})
y_pred = nn_search_cv.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.02629758156221018
Transformation pipeline example¶
df_rank
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 | 2008 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.971536 | -0.451685 | 0.000000 | -0.170037 | -0.613483 | -0.959551 | 0.000000 | -0.635955 | -0.791760 | 2008 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.967335 | -0.700074 | 0.000000 | 0.345212 | -0.557535 | -0.953972 | 0.000000 | 0.437268 | -0.625835 | 2008 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969027 | 0.443953 | 0.000000 | 0.048673 | -0.112094 | -0.974926 | 0.000000 | 0.241888 | 0.306785 | 2008 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.964549 | 0.545052 | 0.000000 | -0.264402 | -0.258493 | -0.970458 | -0.976366 | -0.704579 | -0.497784 | 2008 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
580477 | 689009.XSHG | 2023-12 | -0.105996 | 2023-11 | 0.719298 | -0.186280 | 0.045535 | 0.381234 | -0.645969 | -0.519022 | -0.530455 | 0.114528 | 0.413365 | 2023 |
580478 | 689009.XSHG | 2024-01 | -0.214983 | 2023-12 | 0.691854 | -0.841401 | 0.168831 | 0.515545 | -0.574577 | -0.515152 | -0.553325 | 0.515545 | 0.622983 | 2024 |
580479 | 689009.XSHG | 2024-02 | 0.296451 | 2024-01 | 0.674975 | -0.066928 | -0.269872 | 0.369774 | -0.590579 | -0.492836 | -0.555643 | -0.341315 | 0.285574 | 2024 |
580480 | 689009.XSHG | 2024-03 | -0.013334 | 2024-02 | 0.712774 | 0.947884 | -0.253527 | 0.546042 | -0.709444 | -0.564655 | -0.563480 | -0.303292 | 0.719436 | 2024 |
580481 | 689009.XSHG | 2024-04 | -0.073474 | 2024-03 | 0.705445 | -0.363494 | 0.056404 | 0.515864 | -0.662554 | -0.621622 | -0.574226 | 0.361535 | 0.578143 | 2024 |
575818 rows × 14 columns
# Column positions of 'illiq' and 'illiq_12m' inside the feature matrix
# (see the X_fulltrain column order above).
illiq_idx = 5
illiq_12m_idx = 6
X_fulltrain
size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|
0 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 |
1 | 0.971536 | -0.451685 | 0.000000 | -0.170037 | -0.613483 | -0.959551 | 0.000000 | -0.635955 | -0.791760 |
2 | 0.967335 | -0.700074 | 0.000000 | 0.345212 | -0.557535 | -0.953972 | 0.000000 | 0.437268 | -0.625835 |
3 | 0.969027 | 0.443953 | 0.000000 | 0.048673 | -0.112094 | -0.974926 | 0.000000 | 0.241888 | 0.306785 |
4 | 0.964549 | 0.545052 | 0.000000 | -0.264402 | -0.258493 | -0.970458 | -0.976366 | -0.704579 | -0.497784 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
556879 | -0.911797 | -0.975744 | 0.000000 | -0.550165 | -0.223815 | 0.953693 | 0.000000 | 0.351709 | -0.339214 |
556880 | -0.890682 | -0.083639 | 0.000000 | -0.468452 | -0.264123 | 0.271460 | 0.000000 | -0.356566 | -0.264123 |
556881 | -0.902493 | 0.521261 | 0.000000 | -0.303519 | -0.313050 | 0.346041 | 0.000000 | 0.332845 | 0.835777 |
556882 | -0.909058 | 0.020902 | 0.000000 | -0.273194 | -0.325266 | 0.573891 | 0.000000 | -0.416208 | -0.318665 |
556883 | -0.881145 | 0.871607 | 0.211299 | -0.427733 | -0.551724 | 0.750550 | 0.995598 | 0.090242 | 0.579604 |
194413 rows × 9 columns
class FeatureAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends the average of the monthly and 12-month
    illiquidity columns as an extra feature.

    Parameters
    ----------
    add_avg_illiq : bool, default True
        If False, the input passes through unchanged.  (Bug fix: the
        original stored this flag but ignored it in transform().)
    illiq_idx : int, default 5
        Column index of 'illiq' in the input array.
    illiq_12m_idx : int, default 6
        Column index of 'illiq_12m' in the input array.  Defaults match
        the module-level constants the original read implicitly, so that
        get_params()/set_params() (and GridSearchCV) now see them too.
    """

    def __init__(self, add_avg_illiq=True, illiq_idx=5, illiq_12m_idx=6):
        # sklearn convention: store constructor arguments verbatim.
        self.add_avg_illiq = add_avg_illiq
        self.illiq_idx = illiq_idx
        self.illiq_12m_idx = illiq_12m_idx

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        """Return X with the average-illiquidity column appended
        (or X unchanged when add_avg_illiq is False)."""
        if not self.add_avg_illiq:
            return X
        avg_illiq = (X[:, self.illiq_idx] + X[:, self.illiq_12m_idx]) / 2
        return np.c_[X, avg_illiq]
feature_adder = FeatureAdder()
X_fulltrain.values.shape
(194413, 9)
X_fulltrain_new = feature_adder.transform(X_fulltrain.values)
X_fulltrain_new
array([[ 0.9695586 , -0.85844749, 0. , ..., 0.27549467, 0. , -0.48934551], [ 0.97153558, -0.45168539, 0. , ..., -0.63595506, -0.7917603 , -0.47977528], [ 0.96733482, -0.70007424, 0. , ..., 0.437268 , -0.62583519, -0.47698589], ..., [-0.90249267, 0.521261 , 0. , ..., 0.33284457, 0.83577713, 0.17302053], [-0.90905757, 0.02090209, 0. , ..., -0.41620829, -0.3186652 , 0.28694536], [-0.88114453, 0.87160675, 0.21129861, ..., 0.09024211, 0.57960382, 0.8730741 ]])
X_fulltrain_new.shape
(194413, 10)
# This can be added to a pipeline: custom transformers compose with
# built-in ones, so the derived feature is standardized along with the rest.
pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('std_scaler', StandardScaler())
])
pipeline.fit_transform(X_fulltrain.values)
array([[ 1.68015418e+00, -1.48771512e+00, -7.60380513e-19, ..., 4.77437532e-01, 4.06742109e-18, -9.01081929e-01], [ 1.68358010e+00, -7.82784266e-01, -7.60380513e-19, ..., -1.10212226e+00, -1.37679088e+00, -8.83459288e-01], [ 1.67630057e+00, -1.21324955e+00, -7.60380513e-19, ..., 7.57793805e-01, -1.08826394e+00, -8.78322906e-01], ..., [-1.56393521e+00, 9.03360864e-01, -7.60380513e-19, ..., 5.76826008e-01, 1.45333168e+00, 3.18600392e-01], [-1.57531157e+00, 3.62239461e-02, -7.60380513e-19, ..., -7.21296915e-01, -5.54126473e-01, 5.28381837e-01], [-1.52694089e+00, 1.51052051e+00, 3.66337572e-01, ..., 1.56391306e-01, 1.00787227e+00, 1.60768063e+00]])