import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, HuberRegressor, SGDRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline
# import lightgbm as lgb
import tensorflow as tf
from tensorflow import keras
df = pd.read_pickle('../../data/factor_exposure/all_exposure_2023.pkl')
df
secID | ret_date | tradeDate | ret | rf | exret | ym | mktcap | size | rev | mom_date | mom | beta | bm | illiq | illiq_12m | vol | ivol | vol_clip | ivol_clip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2007-07 | 2007-06-29 | 0.316497 | 0.002481 | 0.314016 | 2007-06 | 4.266117e+10 | 24.476555 | NaN | NaT | NaN | 0.4614 | 0.123739 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 000001.XSHE | 2007-08 | 2007-07-31 | 0.048855 | 0.002404 | 0.046451 | 2007-07 | 5.616330e+10 | 24.751529 | 0.314016 | 2007-06 | NaN | 0.6423 | 0.093992 | 0.000040 | NaN | 0.041604 | NaN | 0.041604 | NaN |
2 | 000001.XSHE | 2007-09 | 2007-08-31 | 0.052105 | 0.002621 | 0.049484 | 2007-08 | 5.890714e+10 | 24.799228 | 0.046451 | 2007-07 | NaN | 0.7722 | 0.097085 | 0.000020 | NaN | 0.033926 | NaN | 0.033926 | NaN |
3 | 000001.XSHE | 2007-10 | 2007-09-28 | 0.201851 | 0.003095 | 0.198756 | 2007-09 | 6.197651e+10 | 24.850021 | 0.049484 | 2007-08 | NaN | 0.7596 | 0.092276 | 0.000025 | NaN | 0.023872 | NaN | 0.023872 | NaN |
4 | 000001.XSHE | 2007-11 | 2007-10-31 | -0.249116 | 0.003780 | -0.252896 | 2007-10 | 7.448652e+10 | 25.033884 | 0.198756 | 2007-09 | NaN | 0.7988 | 0.083411 | 0.000030 | NaN | 0.035921 | NaN | 0.035921 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
547253 | 689009.XSHG | 2022-12 | 2022-11-30 | -0.086579 | 0.001955 | -0.088534 | 2022-11 | 1.708055e+10 | 23.561206 | 0.041529 | 2022-10 | -0.474030 | 0.7363 | 0.201033 | 0.000122 | 0.000202 | 0.017044 | 0.010108 | 0.017044 | 0.010108 |
547254 | 689009.XSHG | 2023-01 | 2022-12-30 | 0.088554 | 0.001856 | 0.086698 | 2022-12 | 1.560173e+10 | 23.470648 | -0.088534 | 2022-11 | -0.523619 | 0.6919 | 0.220085 | 0.000194 | 0.000210 | 0.019017 | 0.013598 | 0.019017 | 0.013598 |
547255 | 689009.XSHG | 2023-02 | 2023-01-31 | -0.005725 | 0.001910 | -0.007635 | 2023-01 | 1.698332e+10 | 23.555498 | 0.086698 | 2022-12 | -0.498602 | 0.7379 | 0.201772 | 0.000100 | 0.000210 | 0.014183 | 0.009679 | 0.014183 | 0.009679 |
547256 | 689009.XSHG | 2023-03 | 2023-02-28 | -0.011818 | 0.001982 | -0.013800 | 2023-02 | 1.688610e+10 | 23.549757 | -0.007635 | 2023-01 | -0.418230 | 0.7453 | 0.202930 | 0.000091 | 0.000202 | 0.024867 | 0.012591 | 0.024867 | 0.012591 |
547257 | 689009.XSHG | NaT | 2023-03-10 | NaN | NaN | NaN | 2023-03 | 1.668654e+10 | 23.537868 | -0.013800 | 2023-02 | -0.246403 | NaN | 0.193716 | NaN | 0.000201 | NaN | NaN | NaN | NaN |
547258 rows × 20 columns
df.drop('tradeDate',axis=1,inplace=True)
df
secID | ret_date | ret | rf | exret | ym | mktcap | size | rev | mom_date | mom | beta | bm | illiq | illiq_12m | vol | ivol | vol_clip | ivol_clip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2007-07 | 0.316497 | 0.002481 | 0.314016 | 2007-06 | 4.266117e+10 | 24.476555 | NaN | NaT | NaN | 0.4614 | 0.123739 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 000001.XSHE | 2007-08 | 0.048855 | 0.002404 | 0.046451 | 2007-07 | 5.616330e+10 | 24.751529 | 0.314016 | 2007-06 | NaN | 0.6423 | 0.093992 | 0.000040 | NaN | 0.041604 | NaN | 0.041604 | NaN |
2 | 000001.XSHE | 2007-09 | 0.052105 | 0.002621 | 0.049484 | 2007-08 | 5.890714e+10 | 24.799228 | 0.046451 | 2007-07 | NaN | 0.7722 | 0.097085 | 0.000020 | NaN | 0.033926 | NaN | 0.033926 | NaN |
3 | 000001.XSHE | 2007-10 | 0.201851 | 0.003095 | 0.198756 | 2007-09 | 6.197651e+10 | 24.850021 | 0.049484 | 2007-08 | NaN | 0.7596 | 0.092276 | 0.000025 | NaN | 0.023872 | NaN | 0.023872 | NaN |
4 | 000001.XSHE | 2007-11 | -0.249116 | 0.003780 | -0.252896 | 2007-10 | 7.448652e+10 | 25.033884 | 0.198756 | 2007-09 | NaN | 0.7988 | 0.083411 | 0.000030 | NaN | 0.035921 | NaN | 0.035921 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
547253 | 689009.XSHG | 2022-12 | -0.086579 | 0.001955 | -0.088534 | 2022-11 | 1.708055e+10 | 23.561206 | 0.041529 | 2022-10 | -0.474030 | 0.7363 | 0.201033 | 0.000122 | 0.000202 | 0.017044 | 0.010108 | 0.017044 | 0.010108 |
547254 | 689009.XSHG | 2023-01 | 0.088554 | 0.001856 | 0.086698 | 2022-12 | 1.560173e+10 | 23.470648 | -0.088534 | 2022-11 | -0.523619 | 0.6919 | 0.220085 | 0.000194 | 0.000210 | 0.019017 | 0.013598 | 0.019017 | 0.013598 |
547255 | 689009.XSHG | 2023-02 | -0.005725 | 0.001910 | -0.007635 | 2023-01 | 1.698332e+10 | 23.555498 | 0.086698 | 2022-12 | -0.498602 | 0.7379 | 0.201772 | 0.000100 | 0.000210 | 0.014183 | 0.009679 | 0.014183 | 0.009679 |
547256 | 689009.XSHG | 2023-03 | -0.011818 | 0.001982 | -0.013800 | 2023-02 | 1.688610e+10 | 23.549757 | -0.007635 | 2023-01 | -0.418230 | 0.7453 | 0.202930 | 0.000091 | 0.000202 | 0.024867 | 0.012591 | 0.024867 | 0.012591 |
547257 | 689009.XSHG | NaT | NaN | NaN | NaN | 2023-03 | 1.668654e+10 | 23.537868 | -0.013800 | 2023-02 | -0.246403 | NaN | 0.193716 | NaN | 0.000201 | NaN | NaN | NaN | NaN |
547258 rows × 19 columns
for col in df.columns:
print(col, df[col].isna().sum())
secID 0
ret_date 5068
ret 19730
rf 5068
exret 19730
ym 0
mktcap 14043
size 14043
rev 18541
mom_date 3879
mom 53422
beta 28578
bm 4198
illiq 32683
illiq_12m 96646
vol 21868
ivol 36221
vol_clip 21868
ivol_clip 36221
Drop the rows where ret_date is NA: these are the most recent observations, for which the next month's return does not yet exist.
df = df[~df['ret_date'].isna()].copy()
df
secID | ret_date | ret | rf | exret | ym | mktcap | size | rev | mom_date | mom | beta | bm | illiq | illiq_12m | vol | ivol | vol_clip | ivol_clip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2007-07 | 0.316497 | 0.002481 | 0.314016 | 2007-06 | 4.266117e+10 | 24.476555 | NaN | NaT | NaN | 0.4614 | 0.123739 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 000001.XSHE | 2007-08 | 0.048855 | 0.002404 | 0.046451 | 2007-07 | 5.616330e+10 | 24.751529 | 0.314016 | 2007-06 | NaN | 0.6423 | 0.093992 | 0.000040 | NaN | 0.041604 | NaN | 0.041604 | NaN |
2 | 000001.XSHE | 2007-09 | 0.052105 | 0.002621 | 0.049484 | 2007-08 | 5.890714e+10 | 24.799228 | 0.046451 | 2007-07 | NaN | 0.7722 | 0.097085 | 0.000020 | NaN | 0.033926 | NaN | 0.033926 | NaN |
3 | 000001.XSHE | 2007-10 | 0.201851 | 0.003095 | 0.198756 | 2007-09 | 6.197651e+10 | 24.850021 | 0.049484 | 2007-08 | NaN | 0.7596 | 0.092276 | 0.000025 | NaN | 0.023872 | NaN | 0.023872 | NaN |
4 | 000001.XSHE | 2007-11 | -0.249116 | 0.003780 | -0.252896 | 2007-10 | 7.448652e+10 | 25.033884 | 0.198756 | 2007-09 | NaN | 0.7988 | 0.083411 | 0.000030 | NaN | 0.035921 | NaN | 0.035921 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
547252 | 689009.XSHG | 2022-11 | 0.043125 | 0.001596 | 0.041529 | 2022-10 | 1.637440e+10 | 23.518985 | -0.166109 | 2022-09 | -0.401406 | 0.7083 | 0.209701 | 0.000264 | 0.000206 | 0.059961 | 0.051851 | 0.059961 | 0.051851 |
547253 | 689009.XSHG | 2022-12 | -0.086579 | 0.001955 | -0.088534 | 2022-11 | 1.708055e+10 | 23.561206 | 0.041529 | 2022-10 | -0.474030 | 0.7363 | 0.201033 | 0.000122 | 0.000202 | 0.017044 | 0.010108 | 0.017044 | 0.010108 |
547254 | 689009.XSHG | 2023-01 | 0.088554 | 0.001856 | 0.086698 | 2022-12 | 1.560173e+10 | 23.470648 | -0.088534 | 2022-11 | -0.523619 | 0.6919 | 0.220085 | 0.000194 | 0.000210 | 0.019017 | 0.013598 | 0.019017 | 0.013598 |
547255 | 689009.XSHG | 2023-02 | -0.005725 | 0.001910 | -0.007635 | 2023-01 | 1.698332e+10 | 23.555498 | 0.086698 | 2022-12 | -0.498602 | 0.7379 | 0.201772 | 0.000100 | 0.000210 | 0.014183 | 0.009679 | 0.014183 | 0.009679 |
547256 | 689009.XSHG | 2023-03 | -0.011818 | 0.001982 | -0.013800 | 2023-02 | 1.688610e+10 | 23.549757 | -0.007635 | 2023-01 | -0.418230 | 0.7453 | 0.202930 | 0.000091 | 0.000202 | 0.024867 | 0.012591 | 0.024867 | 0.012591 |
542190 rows × 19 columns
Momentum only starts in 2008-01. For simplicity, restrict all data to start from 2008-01.
df.loc[~df['mom'].isna(),'ret_date'].min()
Period('2008-01', 'M')
df = df[df['ret_date'] >= '2008-01'].copy()
for col in df.columns:
print(col, df[col].isna().sum())
secID 0
ret_date 0
ret 14262
rf 0
exret 14262
ym 0
mktcap 13693
size 13693
rev 17990
mom_date 3693
mom 40190
beta 23245
bm 4024
illiq 27119
illiq_12m 84325
vol 16457
ivol 18316
vol_clip 16457
ivol_clip 18316
The remaining NA values have at least three sources.

Rows where ret is NA are dropped directly:
df = df[~df['ret'].isna()].copy()
df
secID | ret_date | ret | rf | exret | ym | mktcap | size | rev | mom_date | mom | beta | bm | illiq | illiq_12m | vol | ivol | vol_clip | ivol_clip | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
6 | 000001.XSHE | 2008-01 | -0.137306 | 0.002949 | -0.140255 | 2007-12 | 6.574629e+10 | 24.909069 | 0.066834 | 2007-11 | NaN | 0.9468 | 0.094476 | 0.000025 | NaN | 0.026541 | NaN | 0.026541 | NaN |
7 | 000001.XSHE | 2008-02 | -0.004504 | 0.002946 | -0.007450 | 2008-01 | 5.850212e+10 | 24.792329 | -0.140255 | 2007-12 | NaN | 0.9654 | 0.109513 | 0.000039 | NaN | 0.037722 | 0.012909 | 0.037722 | 0.012909 |
8 | 000001.XSHE | 2008-03 | -0.149321 | 0.002746 | -0.152068 | 2008-02 | 5.823860e+10 | 24.787814 | -0.007450 | 2008-01 | NaN | 1.0292 | 0.110009 | 0.000064 | NaN | 0.041448 | 0.009032 | 0.041448 | 0.009032 |
9 | 000001.XSHE | 2008-04 | 0.050355 | 0.002862 | 0.047493 | 2008-03 | 4.954234e+10 | 24.626093 | -0.152068 | 2008-02 | NaN | 1.0238 | 0.201102 | 0.000043 | NaN | 0.045109 | 0.021484 | 0.045109 | 0.021484 |
10 | 000001.XSHE | 2008-05 | -0.148211 | 0.002953 | -0.151164 | 2008-04 | 5.203702e+10 | 24.675221 | 0.047493 | 2008-03 | NaN | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 0.046323 | 0.015098 | 0.046323 | 0.015098 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
547252 | 689009.XSHG | 2022-11 | 0.043125 | 0.001596 | 0.041529 | 2022-10 | 1.637440e+10 | 23.518985 | -0.166109 | 2022-09 | -0.401406 | 0.7083 | 0.209701 | 0.000264 | 0.000206 | 0.059961 | 0.051851 | 0.059961 | 0.051851 |
547253 | 689009.XSHG | 2022-12 | -0.086579 | 0.001955 | -0.088534 | 2022-11 | 1.708055e+10 | 23.561206 | 0.041529 | 2022-10 | -0.474030 | 0.7363 | 0.201033 | 0.000122 | 0.000202 | 0.017044 | 0.010108 | 0.017044 | 0.010108 |
547254 | 689009.XSHG | 2023-01 | 0.088554 | 0.001856 | 0.086698 | 2022-12 | 1.560173e+10 | 23.470648 | -0.088534 | 2022-11 | -0.523619 | 0.6919 | 0.220085 | 0.000194 | 0.000210 | 0.019017 | 0.013598 | 0.019017 | 0.013598 |
547255 | 689009.XSHG | 2023-02 | -0.005725 | 0.001910 | -0.007635 | 2023-01 | 1.698332e+10 | 23.555498 | 0.086698 | 2022-12 | -0.498602 | 0.7379 | 0.201772 | 0.000100 | 0.000210 | 0.014183 | 0.009679 | 0.014183 | 0.009679 |
547256 | 689009.XSHG | 2023-03 | -0.011818 | 0.001982 | -0.013800 | 2023-02 | 1.688610e+10 | 23.549757 | -0.007635 | 2023-01 | -0.418230 | 0.7453 | 0.202930 | 0.000091 | 0.000202 | 0.024867 | 0.012591 | 0.024867 | 0.012591 |
514959 rows × 19 columns
for col in df.columns:
print(col, df[col].isna().sum())
secID 0
ret_date 0
ret 0
rf 0
exret 0
ym 0
mktcap 0
size 0
rev 4297
mom_date 3693
mom 40162
beta 21372
bm 1004
illiq 13186
illiq_12m 71577
vol 2716
ivol 4413
vol_clip 2716
ivol_clip 4413
df.drop(['mom_date','mktcap','vol_clip','ivol_clip'],axis=1,inplace=True)
df.drop(['ret','rf'],axis=1,inplace=True)
df.reset_index(inplace=True,drop=True)
df
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 24.909069 | 0.066834 | NaN | 0.9468 | 0.094476 | 0.000025 | NaN | 0.026541 | NaN |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 24.792329 | -0.140255 | NaN | 0.9654 | 0.109513 | 0.000039 | NaN | 0.037722 | 0.012909 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 24.787814 | -0.007450 | NaN | 1.0292 | 0.110009 | 0.000064 | NaN | 0.041448 | 0.009032 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 24.626093 | -0.152068 | NaN | 1.0238 | 0.201102 | 0.000043 | NaN | 0.045109 | 0.021484 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 24.675221 | 0.047493 | NaN | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 0.046323 | 0.015098 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
514954 | 689009.XSHG | 2022-11 | 0.041529 | 2022-10 | 23.518985 | -0.166109 | -0.401406 | 0.7083 | 0.209701 | 0.000264 | 0.000206 | 0.059961 | 0.051851 |
514955 | 689009.XSHG | 2022-12 | -0.088534 | 2022-11 | 23.561206 | 0.041529 | -0.474030 | 0.7363 | 0.201033 | 0.000122 | 0.000202 | 0.017044 | 0.010108 |
514956 | 689009.XSHG | 2023-01 | 0.086698 | 2022-12 | 23.470648 | -0.088534 | -0.523619 | 0.6919 | 0.220085 | 0.000194 | 0.000210 | 0.019017 | 0.013598 |
514957 | 689009.XSHG | 2023-02 | -0.007635 | 2023-01 | 23.555498 | 0.086698 | -0.498602 | 0.7379 | 0.201772 | 0.000100 | 0.000210 | 0.014183 | 0.009679 |
514958 | 689009.XSHG | 2023-03 | -0.013800 | 2023-02 | 23.549757 | -0.007635 | -0.418230 | 0.7453 | 0.202930 | 0.000091 | 0.000202 | 0.024867 | 0.012591 |
514959 rows × 13 columns
for col in df.columns:
print(col, df[col].isna().sum())
secID 0
ret_date 0
exret 0
ym 0
size 0
rev 4297
mom 40162
beta 21372
bm 1004
illiq 13186
illiq_12m 71577
vol 2716
ivol 4413
# Drop rows where reversal (rev) is NA; fill the other characteristics with the monthly median
df = df[~df['rev'].isna()].copy()
cols = ['mom','beta','bm','illiq','illiq_12m','vol','ivol']
df
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 24.909069 | 0.066834 | NaN | 0.9468 | 0.094476 | 0.000025 | NaN | 0.026541 | NaN |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 24.792329 | -0.140255 | NaN | 0.9654 | 0.109513 | 0.000039 | NaN | 0.037722 | 0.012909 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 24.787814 | -0.007450 | NaN | 1.0292 | 0.110009 | 0.000064 | NaN | 0.041448 | 0.009032 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 24.626093 | -0.152068 | NaN | 1.0238 | 0.201102 | 0.000043 | NaN | 0.045109 | 0.021484 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 24.675221 | 0.047493 | NaN | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 0.046323 | 0.015098 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
514954 | 689009.XSHG | 2022-11 | 0.041529 | 2022-10 | 23.518985 | -0.166109 | -0.401406 | 0.7083 | 0.209701 | 0.000264 | 0.000206 | 0.059961 | 0.051851 |
514955 | 689009.XSHG | 2022-12 | -0.088534 | 2022-11 | 23.561206 | 0.041529 | -0.474030 | 0.7363 | 0.201033 | 0.000122 | 0.000202 | 0.017044 | 0.010108 |
514956 | 689009.XSHG | 2023-01 | 0.086698 | 2022-12 | 23.470648 | -0.088534 | -0.523619 | 0.6919 | 0.220085 | 0.000194 | 0.000210 | 0.019017 | 0.013598 |
514957 | 689009.XSHG | 2023-02 | -0.007635 | 2023-01 | 23.555498 | 0.086698 | -0.498602 | 0.7379 | 0.201772 | 0.000100 | 0.000210 | 0.014183 | 0.009679 |
514958 | 689009.XSHG | 2023-03 | -0.013800 | 2023-02 | 23.549757 | -0.007635 | -0.418230 | 0.7453 | 0.202930 | 0.000091 | 0.000202 | 0.024867 | 0.012591 |
510662 rows × 13 columns
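# Fill NAs with the same-month (cross-sectional) median of each characteristic;
# the fillna(0) afterwards covers months in which a column is entirely NA,
# where the monthly median itself is still NA.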
temp = df.groupby('ret_date',as_index=False)[cols].transform(lambda x: x.fillna(x.median()))
temp.fillna(0, inplace=True)
df[cols] = temp.copy()
for col in df.columns:
print(col, df[col].isna().sum())
secID 0
ret_date 0
exret 0
ym 0
size 0
rev 0
mom 0
beta 0
bm 0
illiq 0
illiq_12m 0
vol 0
ivol 0
df
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 24.909069 | 0.066834 | 0.796305 | 0.9468 | 0.094476 | 0.000025 | 0.000505 | 0.026541 | 0.000000 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 24.792329 | -0.140255 | 1.145639 | 0.9654 | 0.109513 | 0.000039 | 0.000494 | 0.037722 | 0.012909 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 24.787814 | -0.007450 | 0.693690 | 1.0292 | 0.110009 | 0.000064 | 0.000490 | 0.041448 | 0.009032 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 24.626093 | -0.152068 | 0.558575 | 1.0238 | 0.201102 | 0.000043 | 0.000526 | 0.045109 | 0.021484 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 24.675221 | 0.047493 | -0.048874 | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 0.046323 | 0.015098 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
514954 | 689009.XSHG | 2022-11 | 0.041529 | 2022-10 | 23.518985 | -0.166109 | -0.401406 | 0.7083 | 0.209701 | 0.000264 | 0.000206 | 0.059961 | 0.051851 |
514955 | 689009.XSHG | 2022-12 | -0.088534 | 2022-11 | 23.561206 | 0.041529 | -0.474030 | 0.7363 | 0.201033 | 0.000122 | 0.000202 | 0.017044 | 0.010108 |
514956 | 689009.XSHG | 2023-01 | 0.086698 | 2022-12 | 23.470648 | -0.088534 | -0.523619 | 0.6919 | 0.220085 | 0.000194 | 0.000210 | 0.019017 | 0.013598 |
514957 | 689009.XSHG | 2023-02 | -0.007635 | 2023-01 | 23.555498 | 0.086698 | -0.498602 | 0.7379 | 0.201772 | 0.000100 | 0.000210 | 0.014183 | 0.009679 |
514958 | 689009.XSHG | 2023-03 | -0.013800 | 2023-02 | 23.549757 | -0.007635 | -0.418230 | 0.7453 | 0.202930 | 0.000091 | 0.000202 | 0.024867 | 0.012591 |
510662 rows × 13 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 510662 entries, 0 to 514958
Data columns (total 13 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   secID      510662 non-null  object
 1   ret_date   510662 non-null  period[M]
 2   exret      510662 non-null  float64
 3   ym         510662 non-null  period[M]
 4   size       510662 non-null  float64
 5   rev        510662 non-null  float64
 6   mom        510662 non-null  float64
 7   beta       510662 non-null  float64
 8   bm         510662 non-null  float64
 9   illiq      510662 non-null  float64
 10  illiq_12m  510662 non-null  float64
 11  vol        510662 non-null  float64
 12  ivol       510662 non-null  float64
dtypes: float64(10), object(1), period[M](2)
memory usage: 54.5+ MB
$c^r_{i,t}$ is the original characteristic value; $CSrank$ ranks it against all other firms in the same month $t$ and maps the ranks into the open interval $(-1, 1)$:
$$CSrank(c^r_{i,t}) = \frac{2\,\mathrm{rank}(c^r_{i,t})}{N_t + 1} - 1,$$
where $N_t$ is the number of firms in month $t$.
def csrank(df):
return df.rank() * 2 / (len(df) + 1) - 1
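A quick sanity check of csrank on a toy Series (values made up here): the five ranks map evenly into (-1, 1).
csrank(pd.Series([10., 40., 20., 50., 30.]))
# -> [-0.666667, 0.333333, -0.333333, 0.666667, 0.000000]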
num_X_cols = df.select_dtypes('number').columns.drop('exret').tolist()
num_X_cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
df[['ret_date']+num_X_cols]
ret_date | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2008-01 | 24.909069 | 0.066834 | 0.796305 | 0.9468 | 0.094476 | 0.000025 | 0.000505 | 0.026541 | 0.000000 |
1 | 2008-02 | 24.792329 | -0.140255 | 1.145639 | 0.9654 | 0.109513 | 0.000039 | 0.000494 | 0.037722 | 0.012909 |
2 | 2008-03 | 24.787814 | -0.007450 | 0.693690 | 1.0292 | 0.110009 | 0.000064 | 0.000490 | 0.041448 | 0.009032 |
3 | 2008-04 | 24.626093 | -0.152068 | 0.558575 | 1.0238 | 0.201102 | 0.000043 | 0.000526 | 0.045109 | 0.021484 |
4 | 2008-05 | 24.675221 | 0.047493 | -0.048874 | 1.0212 | 0.206701 | 0.000051 | 0.000038 | 0.046323 | 0.015098 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
514954 | 2022-11 | 23.518985 | -0.166109 | -0.401406 | 0.7083 | 0.209701 | 0.000264 | 0.000206 | 0.059961 | 0.051851 |
514955 | 2022-12 | 23.561206 | 0.041529 | -0.474030 | 0.7363 | 0.201033 | 0.000122 | 0.000202 | 0.017044 | 0.010108 |
514956 | 2023-01 | 23.470648 | -0.088534 | -0.523619 | 0.6919 | 0.220085 | 0.000194 | 0.000210 | 0.019017 | 0.013598 |
514957 | 2023-02 | 23.555498 | 0.086698 | -0.498602 | 0.7379 | 0.201772 | 0.000100 | 0.000210 | 0.014183 | 0.009679 |
514958 | 2023-03 | 23.549757 | -0.007635 | -0.418230 | 0.7453 | 0.202930 | 0.000091 | 0.000202 | 0.024867 | 0.012591 |
510662 rows × 10 columns
df[['ret_date','size']].groupby('ret_date',group_keys=True).apply(csrank)
ret_date | size | ||
---|---|---|---|
ret_date | |||
2008-01 | 0 | 0.0 | 0.969559 |
183 | 0.0 | 0.990868 | |
651 | 0.0 | 0.522070 | |
1504 | 0.0 | 0.678843 | |
1687 | 0.0 | -0.231355 | |
... | ... | ... | ... |
2023-03 | 514851 | 0.0 | -0.743772 |
514871 | 0.0 | 0.278208 | |
514897 | 0.0 | 0.041658 | |
514929 | 0.0 | 0.943898 | |
514958 | 0.0 | 0.672179 |
510662 rows × 2 columns
temp = df[['ret_date']+num_X_cols].groupby('ret_date',group_keys=True).apply(csrank)
temp
ret_date | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | ||
---|---|---|---|---|---|---|---|---|---|---|---|
ret_date | |||||||||||
2008-01 | 0 | 0.0 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 |
183 | 0.0 | 0.990868 | -0.990868 | 0.852359 | 0.662100 | -0.375951 | -0.996956 | -0.989346 | 0.745814 | 0.000000 | |
651 | 0.0 | 0.522070 | -0.972603 | 0.552511 | 0.523592 | 0.283105 | -0.223744 | -0.595129 | 0.814307 | 0.000000 | |
1504 | 0.0 | 0.678843 | -0.506849 | -0.517504 | 0.775495 | -0.636225 | -0.698630 | -0.517504 | 0.493151 | 0.000000 | |
1687 | 0.0 | -0.231355 | -0.945967 | 0.709285 | 0.000000 | -0.403349 | 0.000000 | 0.000000 | -0.982496 | 0.000000 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2023-03 | 514851 | 0.0 | -0.743772 | 0.284488 | 0.184844 | -0.557254 | 0.279464 | 0.348964 | 0.836718 | -0.684739 | -0.466192 |
514871 | 0.0 | 0.278208 | -0.802805 | 0.209964 | 0.550345 | -0.735817 | -0.220431 | 0.083944 | 0.504291 | 0.274440 | |
514897 | 0.0 | 0.041658 | -0.816203 | 0.559975 | -0.490057 | 0.016956 | -0.027842 | -0.103203 | -0.330961 | -0.108227 | |
514929 | 0.0 | 0.943898 | -0.279883 | -0.469960 | -0.014026 | 0.134603 | -0.969018 | -0.969018 | -0.486707 | -0.643291 | |
514958 | 0.0 | 0.672179 | -0.409253 | -0.964831 | 0.139418 | -0.605610 | -0.406322 | -0.243458 | 0.471216 | 0.117856 |
510662 rows × 10 columns
temp.drop('ret_date',axis=1).reset_index()
ret_date | level_1 | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2008-01 | 0 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 |
1 | 2008-01 | 183 | 0.990868 | -0.990868 | 0.852359 | 0.662100 | -0.375951 | -0.996956 | -0.989346 | 0.745814 | 0.000000 |
2 | 2008-01 | 651 | 0.522070 | -0.972603 | 0.552511 | 0.523592 | 0.283105 | -0.223744 | -0.595129 | 0.814307 | 0.000000 |
3 | 2008-01 | 1504 | 0.678843 | -0.506849 | -0.517504 | 0.775495 | -0.636225 | -0.698630 | -0.517504 | 0.493151 | 0.000000 |
4 | 2008-01 | 1687 | -0.231355 | -0.945967 | 0.709285 | 0.000000 | -0.403349 | 0.000000 | 0.000000 | -0.982496 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
510657 | 2023-03 | 514851 | -0.743772 | 0.284488 | 0.184844 | -0.557254 | 0.279464 | 0.348964 | 0.836718 | -0.684739 | -0.466192 |
510658 | 2023-03 | 514871 | 0.278208 | -0.802805 | 0.209964 | 0.550345 | -0.735817 | -0.220431 | 0.083944 | 0.504291 | 0.274440 |
510659 | 2023-03 | 514897 | 0.041658 | -0.816203 | 0.559975 | -0.490057 | 0.016956 | -0.027842 | -0.103203 | -0.330961 | -0.108227 |
510660 | 2023-03 | 514929 | 0.943898 | -0.279883 | -0.469960 | -0.014026 | 0.134603 | -0.969018 | -0.969018 | -0.486707 | -0.643291 |
510661 | 2023-03 | 514958 | 0.672179 | -0.409253 | -0.964831 | 0.139418 | -0.605610 | -0.406322 | -0.243458 | 0.471216 | 0.117856 |
510662 rows × 11 columns
temp = temp.drop('ret_date',axis=1).reset_index().set_index('level_1')
temp
ret_date | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|
level_1 | ||||||||||
0 | 2008-01 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 |
183 | 2008-01 | 0.990868 | -0.990868 | 0.852359 | 0.662100 | -0.375951 | -0.996956 | -0.989346 | 0.745814 | 0.000000 |
651 | 2008-01 | 0.522070 | -0.972603 | 0.552511 | 0.523592 | 0.283105 | -0.223744 | -0.595129 | 0.814307 | 0.000000 |
1504 | 2008-01 | 0.678843 | -0.506849 | -0.517504 | 0.775495 | -0.636225 | -0.698630 | -0.517504 | 0.493151 | 0.000000 |
1687 | 2008-01 | -0.231355 | -0.945967 | 0.709285 | 0.000000 | -0.403349 | 0.000000 | 0.000000 | -0.982496 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
514851 | 2023-03 | -0.743772 | 0.284488 | 0.184844 | -0.557254 | 0.279464 | 0.348964 | 0.836718 | -0.684739 | -0.466192 |
514871 | 2023-03 | 0.278208 | -0.802805 | 0.209964 | 0.550345 | -0.735817 | -0.220431 | 0.083944 | 0.504291 | 0.274440 |
514897 | 2023-03 | 0.041658 | -0.816203 | 0.559975 | -0.490057 | 0.016956 | -0.027842 | -0.103203 | -0.330961 | -0.108227 |
514929 | 2023-03 | 0.943898 | -0.279883 | -0.469960 | -0.014026 | 0.134603 | -0.969018 | -0.969018 | -0.486707 | -0.643291 |
514958 | 2023-03 | 0.672179 | -0.409253 | -0.964831 | 0.139418 | -0.605610 | -0.406322 | -0.243458 | 0.471216 | 0.117856 |
510662 rows × 10 columns
df_rank = pd.merge(df.drop(num_X_cols, axis=1),
temp.drop('ret_date',axis=1),
left_index=True, right_index=True)
del temp
df_rank
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.971536 | -0.451685 | 0.000000 | -0.170037 | -0.613483 | -0.959551 | 0.000000 | -0.635955 | -0.791760 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.967335 | -0.700074 | 0.000000 | 0.345212 | -0.557535 | -0.953972 | 0.000000 | 0.437268 | -0.625835 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969027 | 0.443953 | 0.000000 | 0.048673 | -0.112094 | -0.974926 | 0.000000 | 0.241888 | 0.306785 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.964549 | 0.545052 | 0.000000 | -0.264402 | -0.258493 | -0.970458 | -0.976366 | -0.704579 | -0.497784 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
514954 | 689009.XSHG | 2022-11 | 0.041529 | 2022-10 | 0.700640 | -0.936034 | -0.827719 | -0.674414 | -0.684009 | -0.107036 | -0.260981 | 0.953092 | 0.987207 |
514955 | 689009.XSHG | 2022-12 | -0.088534 | 2022-11 | 0.696713 | -0.218240 | -0.865111 | -0.164157 | -0.676776 | -0.345917 | -0.257688 | -0.380276 | -0.379003 |
514956 | 689009.XSHG | 2023-01 | 0.086698 | 2022-12 | 0.682479 | -0.431583 | -0.970483 | -0.223066 | -0.635252 | -0.150327 | -0.242673 | -0.109846 | 0.041113 |
514957 | 689009.XSHG | 2023-02 | -0.007635 | 2023-01 | 0.682247 | 0.145253 | -0.983232 | 0.037099 | -0.625865 | -0.477258 | -0.247956 | -0.309998 | -0.089499 |
514958 | 689009.XSHG | 2023-03 | -0.013800 | 2023-02 | 0.672179 | -0.409253 | -0.964831 | 0.139418 | -0.605610 | -0.406322 | -0.243458 | 0.471216 | 0.117856 |
510662 rows × 13 columns
df_rank['year'] = df_rank['ret_date'].dt.year
time_idx = [value for (key, value) in sorted(df_rank.groupby('year').groups.items())]
# sorted(df_rank.groupby('year').groups.items())
time_idx
[Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 461853, 461854, 461855, 461856, 461857, 461858, 461859, 461860, 461861, 461862], dtype='int64', length=16621),
 Int64Index([ 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ... 461865, 461866, 461867, 461868, 461869, 461870, 461871, 461872, 461873, 461874], dtype='int64', length=17335),
 Int64Index([ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, ... 461877, 461878, 461879, 461880, 461881, 461882, 461883, 461884, 461885, 461886], dtype='int64', length=19823),
 Int64Index([ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, ... 461889, 461890, 461891, 461892, 461893, 461894, 461895, 461896, 461897, 461898], dtype='int64', length=23864),
 Int64Index([ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, ... 478263, 478264, 478265, 479640, 479641, 479642, 490582, 490583, 490584, 499706], dtype='int64', length=26912),
 Int64Index([ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, ... 499709, 499710, 499711, 499712, 499713, 499714, 499715, 499716, 499717, 499718], dtype='int64', length=28592),
 Int64Index([ 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, ... 499721, 499722, 499723, 499724, 499725, 499726, 499727, 499728, 499729, 499730], dtype='int64', length=29539),
 Int64Index([ 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, ... 500009, 500010, 500011, 500012, 500013, 500014, 500015, 500016, 500017, 500018], dtype='int64', length=31727),
 Int64Index([ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, ... 500108, 500109, 500110, 500111, 500112, 500113, 500114, 500115, 500116, 500117], dtype='int64', length=33468),
 Int64Index([ 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, ... 500120, 500121, 500122, 500123, 500124, 500125, 500126, 500127, 500128, 500129], dtype='int64', length=37665),
 Int64Index([ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, ... 500132, 500133, 500134, 500135, 500136, 500137, 500138, 500139, 500140, 500141], dtype='int64', length=41103),
 Int64Index([ 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, ... 510143, 510144, 510145, 510667, 510729, 510770, 511041, 511042, 511043, 511044], dtype='int64', length=41992),
 Int64Index([ 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, ... 513137, 513138, 513139, 513140, 514745, 514899, 514900, 514901, 514902, 514931], dtype='int64', length=44134),
 Int64Index([ 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, ... 514934, 514935, 514936, 514937, 514938, 514939, 514940, 514941, 514942, 514943], dtype='int64', length=49181),
 Int64Index([ 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, ... 514946, 514947, 514948, 514949, 514950, 514951, 514952, 514953, 514954, 514955], dtype='int64', length=54418),
 Int64Index([ 180, 181, 182, 363, 364, 365, 831, 832, 833, 946, ... 514871, 514895, 514896, 514897, 514927, 514928, 514929, 514956, 514957, 514958], dtype='int64', length=14288)]
df_rank.groupby('year')['secID'].nunique()
year
2008    1463
2009    1530
2010    1841
2011    2142
2012    2383
2013    2432
2014    2549
2015    2772
2016    2941
2017    3392
2018    3522
2019    3648
2020    3961
2021    4422
2022    4770
2023    4777
Name: secID, dtype: int64
df_rank.groupby('year')['secID'].count()
year
2008    16621
2009    17335
2010    19823
2011    23864
2012    26912
2013    28592
2014    29539
2015    31727
2016    33468
2017    37665
2018    41103
2019    41992
2020    44134
2021    49181
2022    54418
2023    14288
Name: secID, dtype: int64
def list_flat(list_):
return [item for sublist in list_ for item in sublist]
# This is the same as:
# def list_flat2(list_):
# result = []
# for sublist in list_:
# for item in sublist:
# result.append(item)
# return result
list_flat([[1,2,3],[3,4,5]])
[1, 2, 3, 3, 4, 5]
np.array([[1,2,3],[3,4,5]]).flatten()
array([1, 2, 3, 3, 4, 5])
df_rank
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 | 2008 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.971536 | -0.451685 | 0.000000 | -0.170037 | -0.613483 | -0.959551 | 0.000000 | -0.635955 | -0.791760 | 2008 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.967335 | -0.700074 | 0.000000 | 0.345212 | -0.557535 | -0.953972 | 0.000000 | 0.437268 | -0.625835 | 2008 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969027 | 0.443953 | 0.000000 | 0.048673 | -0.112094 | -0.974926 | 0.000000 | 0.241888 | 0.306785 | 2008 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.964549 | 0.545052 | 0.000000 | -0.264402 | -0.258493 | -0.970458 | -0.976366 | -0.704579 | -0.497784 | 2008 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
514954 | 689009.XSHG | 2022-11 | 0.041529 | 2022-10 | 0.700640 | -0.936034 | -0.827719 | -0.674414 | -0.684009 | -0.107036 | -0.260981 | 0.953092 | 0.987207 | 2022 |
514955 | 689009.XSHG | 2022-12 | -0.088534 | 2022-11 | 0.696713 | -0.218240 | -0.865111 | -0.164157 | -0.676776 | -0.345917 | -0.257688 | -0.380276 | -0.379003 | 2022 |
514956 | 689009.XSHG | 2023-01 | 0.086698 | 2022-12 | 0.682479 | -0.431583 | -0.970483 | -0.223066 | -0.635252 | -0.150327 | -0.242673 | -0.109846 | 0.041113 | 2023 |
514957 | 689009.XSHG | 2023-02 | -0.007635 | 2023-01 | 0.682247 | 0.145253 | -0.983232 | 0.037099 | -0.625865 | -0.477258 | -0.247956 | -0.309998 | -0.089499 | 2023 |
514958 | 689009.XSHG | 2023-03 | -0.013800 | 2023-02 | 0.672179 | -0.409253 | -0.964831 | 0.139418 | -0.605610 | -0.406322 | -0.243458 | 0.471216 | 0.117856 | 2023 |
510662 rows × 14 columns
# training, validation, testing scheme:
# 1. [2008-2011], [2012-2015], [2016]
# 2. [2008-2012], [2013-2016], [2017]
# ...
# last. [2008-2018], [2019-2022], [2023]
fulltrain_idx = []
cv_idx = []
test_idx = []
for i in range(4,len(time_idx)-4):
train_idx = list_flat(time_idx[0:i])
val_idx = list_flat(time_idx[i:i+4])
fulltrain_idx.append(train_idx + val_idx)
    cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0],
                   np.where(np.isin(fulltrain_idx[-1], val_idx))[0]))
    # GridSearchCV operates on arrays internally and ignores the pandas index,
    # so cv_idx must hold 0-based positions within fulltrain_idx
test_idx.append(time_idx[i+4])
df_rank.loc[fulltrain_idx[-1]]
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 | 2008 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.971536 | -0.451685 | 0.000000 | -0.170037 | -0.613483 | -0.959551 | 0.000000 | -0.635955 | -0.791760 | 2008 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.967335 | -0.700074 | 0.000000 | 0.345212 | -0.557535 | -0.953972 | 0.000000 | 0.437268 | -0.625835 | 2008 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969027 | 0.443953 | 0.000000 | 0.048673 | -0.112094 | -0.974926 | 0.000000 | 0.241888 | 0.306785 | 2008 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.964549 | 0.545052 | 0.000000 | -0.264402 | -0.258493 | -0.970458 | -0.976366 | -0.704579 | -0.497784 | 2008 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
514951 | 689009.XSHG | 2022-08 | -0.113907 | 2022-07 | 0.784744 | 0.646208 | -0.844805 | -0.314336 | -0.831653 | -0.137221 | -0.210872 | 0.667251 | 0.736081 | 2022 |
514952 | 689009.XSHG | 2022-09 | -0.131337 | 2022-08 | 0.763169 | -0.625599 | -0.814976 | -0.165869 | -0.809752 | -0.322159 | -0.227253 | 0.639965 | 0.803222 | 2022 |
514953 | 689009.XSHG | 2022-10 | -0.166109 | 2022-09 | 0.750215 | -0.393626 | -0.954350 | -0.143626 | -0.779500 | -0.217054 | -0.260551 | -0.252799 | 0.253230 | 2022 |
514954 | 689009.XSHG | 2022-11 | 0.041529 | 2022-10 | 0.700640 | -0.936034 | -0.827719 | -0.674414 | -0.684009 | -0.107036 | -0.260981 | 0.953092 | 0.987207 | 2022 |
514955 | 689009.XSHG | 2022-12 | -0.088534 | 2022-11 | 0.696713 | -0.218240 | -0.865111 | -0.164157 | -0.676776 | -0.345917 | -0.257688 | -0.380276 | -0.379003 | 2022 |
496374 rows × 14 columns
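A quick illustrative check that each (train, val) pair in cv_idx is disjoint and together covers the corresponding fulltrain_idx:
tr0, va0 = cv_idx[0]
assert len(np.intersect1d(tr0, va0)) == 0            # no overlap between train and validation
assert len(tr0) + len(va0) == len(fulltrain_idx[0])  # positions cover the full training set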
# Example
a = [0,1,4,5,3000]
np.where(np.isin(a, [0,3000,4]))[0]
array([0, 2, 4])
test_years = list(range(2016, 2024))
test_years
[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
def r2_oos(y_true, y_pred):
return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)
r2_oos_scorer = make_scorer(r2_oos)
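This is an out-of-sample $R^2$ with a zero-forecast benchmark in the denominator,
$$R^2_{oos} = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i y_i^2},$$
so predicting all zeros scores exactly 0 and a perfect forecast scores 1. A tiny sanity check with made-up numbers:
y_toy = np.array([0.02, -0.01, 0.03])
print(r2_oos(y_true=y_toy, y_pred=np.zeros_like(y_toy)))  # 0.0: the zero forecast is the benchmark
print(r2_oos(y_true=y_toy, y_pred=y_toy))                 # 1.0: perfect forecast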
Scikit-learn (sklearn) design philosophy:

- A unified naming convention and estimator interface.
- Data is stored as np.array, or as SciPy sparse matrices, avoiding other packages' custom containers (such as pandas).
- A large collection of machine-learning models that is also easy to extend with custom ones; custom models plug straight into sklearn's built-in machinery. (A minimal illustration follows.)
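A minimal sketch of that unified interface (toy data made up here): every estimator exposes the same fit()/predict() pair, which is what lets the loops below swap models freely.
toy_X = np.arange(6, dtype=float).reshape(-1, 1)   # hypothetical inputs
toy_y = 2 * toy_X.ravel()                          # hypothetical targets
for est in [LinearRegression(), RandomForestRegressor(n_estimators=10, random_state=0)]:
    est.fit(toy_X, toy_y)
    print(type(est).__name__, est.predict([[6.0]]))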
df_rank
secID | ret_date | exret | ym | size | rev | mom | beta | bm | illiq | illiq_12m | vol | ivol | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 000001.XSHE | 2008-01 | -0.140255 | 2007-12 | 0.969559 | -0.858447 | 0.000000 | 0.086758 | -0.672755 | -0.978691 | 0.000000 | 0.275495 | 0.000000 | 2008 |
1 | 000001.XSHE | 2008-02 | -0.007450 | 2008-01 | 0.971536 | -0.451685 | 0.000000 | -0.170037 | -0.613483 | -0.959551 | 0.000000 | -0.635955 | -0.791760 | 2008 |
2 | 000001.XSHE | 2008-03 | -0.152068 | 2008-02 | 0.967335 | -0.700074 | 0.000000 | 0.345212 | -0.557535 | -0.953972 | 0.000000 | 0.437268 | -0.625835 | 2008 |
3 | 000001.XSHE | 2008-04 | 0.047493 | 2008-03 | 0.969027 | 0.443953 | 0.000000 | 0.048673 | -0.112094 | -0.974926 | 0.000000 | 0.241888 | 0.306785 | 2008 |
4 | 000001.XSHE | 2008-05 | -0.151164 | 2008-04 | 0.964549 | 0.545052 | 0.000000 | -0.264402 | -0.258493 | -0.970458 | -0.976366 | -0.704579 | -0.497784 | 2008 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
514954 | 689009.XSHG | 2022-11 | 0.041529 | 2022-10 | 0.700640 | -0.936034 | -0.827719 | -0.674414 | -0.684009 | -0.107036 | -0.260981 | 0.953092 | 0.987207 | 2022 |
514955 | 689009.XSHG | 2022-12 | -0.088534 | 2022-11 | 0.696713 | -0.218240 | -0.865111 | -0.164157 | -0.676776 | -0.345917 | -0.257688 | -0.380276 | -0.379003 | 2022 |
514956 | 689009.XSHG | 2023-01 | 0.086698 | 2022-12 | 0.682479 | -0.431583 | -0.970483 | -0.223066 | -0.635252 | -0.150327 | -0.242673 | -0.109846 | 0.041113 | 2023 |
514957 | 689009.XSHG | 2023-02 | -0.007635 | 2023-01 | 0.682247 | 0.145253 | -0.983232 | 0.037099 | -0.625865 | -0.477258 | -0.247956 | -0.309998 | -0.089499 | 2023 |
514958 | 689009.XSHG | 2023-03 | -0.013800 | 2023-02 | 0.672179 | -0.409253 | -0.964831 | 0.139418 | -0.605610 | -0.406322 | -0.243458 | 0.471216 | 0.117856 | 2023 |
510662 rows × 14 columns
X_fulltrain.columns.tolist()
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
X_fulltrain.columns.tolist().index('illiq')
5
X_fulltrain.columns.tolist().index('illiq_12m')
6
illiq_idx = 5      # = num_X_cols.index('illiq'), as computed above
illiq_12m_idx = 6  # = num_X_cols.index('illiq_12m')
class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_avg_illiq=True):
        self.add_avg_illiq = add_avg_illiq
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        if not self.add_avg_illiq:  # respect the switch set in __init__
            return X
        avg_illiq = (X[:, illiq_idx] + X[:, illiq_12m_idx]) / 2
        return np.c_[X, avg_illiq]
feature_adder = FeatureAdder()
X_fulltrain.values.shape
(194413, 9)
X_fulltrain_new = feature_adder.transform(X_fulltrain.values)
X_fulltrain_new
array([[ 0.9695586 , -0.85844749,  0.        , ...,  0.27549467,  0.        , -0.82572298],
       [ 0.97153558, -0.45168539,  0.        , ..., -0.63595506, -0.7917603 , -0.78651685],
       [ 0.96733482, -0.70007424,  0.        , ...,  0.437268  , -0.62583519, -0.75575353],
       ...,
       [-0.90249267,  0.521261  ,  0.        , ...,  0.33284457,  0.83577713,  0.0164956 ],
       [-0.90905757,  0.02090209,  0.        , ..., -0.41620829, -0.3186652 ,  0.12431243],
       [-0.88114453,  0.87160675,  0.21129861, ...,  0.09024211,  0.57446809,  0.09941306]])
X_fulltrain_new.shape
(194413, 10)
# This can be added to a pipeline
pipeline = Pipeline([
('feature_adder', FeatureAdder()),
('std_scaler', StandardScaler())
])
pipeline.fit_transform(X_fulltrain.values)
array([[ 1.68015418e+00, -1.48771512e+00, -7.60380513e-19, ...,  4.77437532e-01, -7.62641454e-19, -1.98928732e+00],
       [ 1.68358010e+00, -7.82784266e-01, -7.60380513e-19, ..., -1.10212226e+00, -1.37679088e+00, -1.89483403e+00],
       [ 1.67630057e+00, -1.21324955e+00, -7.60380513e-19, ...,  7.57793805e-01, -1.08826394e+00, -1.82072068e+00],
       ...,
       [-1.56393521e+00,  9.03360864e-01, -7.60380513e-19, ...,  5.76826008e-01,  1.45333168e+00,  3.97403136e-02],
       [-1.57531157e+00,  3.62239461e-02, -7.60380513e-19, ..., -7.21296915e-01, -5.54126473e-01,  2.99486812e-01],
       [-1.52694089e+00,  1.51052051e+00,  3.66337572e-01, ...,  1.56391306e-01,  9.98941754e-01,  2.39500587e-01]])
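Storing add_avg_illiq in __init__ (rather than hard-coding it) is what makes it tunable: with a regressor appended to the pipeline, a hypothetical grid search could toggle the extra feature on and off.
pipe = Pipeline([('feature_adder', FeatureAdder()),
                 ('std_scaler', StandardScaler()),
                 ('reg', LinearRegression())])
gs = GridSearchCV(pipe, {'feature_adder__add_avg_illiq': [True, False]},
                  cv=[cv_idx[0]], scoring=r2_oos_scorer)
# gs.fit(X_fulltrain.values, y_fulltrain) would then compare both settings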
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']
model = LinearRegression()
for i in range(len(fulltrain_idx)):
X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
X_test = df_rank.loc[test_idx[i], cols]
y_test = df_rank.loc[test_idx[i], 'exret']
model.fit(X=X_fulltrain, y=y_fulltrain)
y_pred = model.predict(X=X_test)
print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.009412304211971145
Test year 2017 : -0.08839594863078148
Test year 2018 : -0.04979526421788871
Test year 2019 : 0.006463809562448852
Test year 2020 : -0.001544287862749627
Test year 2021 : 0.011488412068509812
Test year 2022 : -0.0009306275137825892
Test year 2023 : 0.0538460552856318
cols = ['size','rev','illiq','ivol']
for i in range(len(fulltrain_idx)):
X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
X_test = df_rank.loc[test_idx[i], cols]
y_test = df_rank.loc[test_idx[i], 'exret']
model.fit(X=X_fulltrain, y=y_fulltrain)
y_pred = model.predict(X=X_test)
print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.01021329786011016
Test year 2017 : -0.08793262026819404
Test year 2018 : -0.04979850705536615
Test year 2019 : 0.007779285918034451
Test year 2020 : -0.0007569573338341851
Test year 2021 : 0.01083296171623438
Test year 2022 : -0.0017994744447327182
Test year 2023 : 0.0567680782563359
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']
model = HuberRegressor(alpha=0.01,epsilon=1.05)
for i in range(len(fulltrain_idx)):
X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
X_test = df_rank.loc[test_idx[i], cols]
y_test = df_rank.loc[test_idx[i], 'exret']
model.fit(X=X_fulltrain, y=y_fulltrain)
y_pred = model.predict(X=X_test)
print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : 0.0068762585146247
Test year 2017 : -0.02917724503186392
Test year 2018 : 0.00904631578299342
Test year 2019 : -0.018436209926423253
Test year 2020 : -0.01373519780133825
Test year 2021 : -0.008464097203231491
Test year 2022 : 0.010896689656339609
Test year 2023 : -0.025851837667542954
cols = num_X_cols
cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
hyperparam_grid = [
{'n_estimators': [50], 'max_depth': [3,5,7],
'max_features': [3,5]}
]
model = RandomForestRegressor(random_state=42)
# Cross validation for period 0, i.e.
# train: [2008-2011], val: [2012-2015], test: [2016]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
scoring=r2_oos_scorer,
return_train_score=True)
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0], cols]
y_test = df_rank.loc[test_idx[0], 'exret']
%%time
grid_search.fit(X_fulltrain, y_fulltrain)
CPU times: user 44.2 s, sys: 255 ms, total: 44.5 s
Wall time: 45 s
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=RandomForestRegressor(random_state=42), param_grid=[{'max_depth': [3, 5, 7], 'max_features': [3, 5], 'n_estimators': [50]}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'max_depth': 7, 'max_features': 3, 'n_estimators': 50}
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
cv_results['params']):
print(np.sqrt(mean_score), params)
0.03288083772090255 {'max_depth': 3, 'max_features': 3, 'n_estimators': 50}
0.03268911472458363 {'max_depth': 3, 'max_features': 5, 'n_estimators': 50}
0.05166632659254923 {'max_depth': 5, 'max_features': 3, 'n_estimators': 50}
0.05452611070305944 {'max_depth': 5, 'max_features': 5, 'n_estimators': 50}
0.05636006091771797 {'max_depth': 7, 'max_features': 3, 'n_estimators': 50}
0.05020334277634201 {'max_depth': 7, 'max_features': 5, 'n_estimators': 50}
pd.DataFrame({"features":num_X_cols,"feature_importance":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',
ascending=False)
features | feature_importance | |
---|---|---|
5 | illiq | 0.198897 |
1 | rev | 0.158943 |
0 | size | 0.132347 |
7 | vol | 0.116127 |
8 | ivol | 0.114038 |
2 | mom | 0.093076 |
4 | bm | 0.079133 |
6 | illiq_12m | 0.063776 |
3 | beta | 0.043663 |
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.027635441051083953
%%time
for i in range(len(fulltrain_idx)):
X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
X_test = df_rank.loc[test_idx[i], cols]
y_test = df_rank.loc[test_idx[i], 'exret']
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]],
scoring=r2_oos_scorer,
return_train_score=True)
grid_search.fit(X_fulltrain, y_fulltrain)
y_pred = grid_search.predict(X=X_test)
print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.027635441051083953
Test year 2017 : -0.07734369016377896
Test year 2018 : -0.03959447233848512
Test year 2019 : 0.007431052632130841
Test year 2020 : 0.004284335274858608
Test year 2021 : 0.011873444443791237
Test year 2022 : -0.0027118213183869866
Test year 2023 : 0.05356830744986352
CPU times: user 13min 53s, sys: 6.11 s, total: 13min 59s
Wall time: 14min 13s
cols = num_X_cols
cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
model = PLSRegression(n_components=4)
y_pred.reshape(-1).shape
(14288,)
%%time
for i in range(len(fulltrain_idx)):
X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
X_test = df_rank.loc[test_idx[i], cols]
y_test = df_rank.loc[test_idx[i], 'exret']
model.fit(X_fulltrain, y_fulltrain)
y_pred = model.predict(X=X_test)
y_pred = y_pred.reshape(-1)
print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.008666088091823676
Test year 2017 : -0.09331774541749294
Test year 2018 : -0.049216423103574325
Test year 2019 : 0.006125528081381337
Test year 2020 : -0.0015378783009631913
Test year 2021 : 0.011202972263563482
Test year 2022 : -0.0006573216211354094
Test year 2023 : 0.05334143576809536
CPU times: user 9.06 s, sys: 621 ms, total: 9.68 s
Wall time: 2.68 s
cols = num_X_cols
cols
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
pca = PCA(3, random_state=42)
pca.fit(X_fulltrain)
PCA(n_components=3, random_state=42)
pca.components_
array([[ 0.5393208 , -0.10042879, -0.02121698,  0.13077125,  0.11124688, -0.53998128, -0.55702611, -0.17478931, -0.19160066],
       [ 0.13101866,  0.27952024,  0.28856617,  0.04523341, -0.37628468, -0.23483181, -0.13341238,  0.53814814,  0.56146916],
       [ 0.06685249, -0.20225271,  0.52662786, -0.60773615, -0.43785557, -0.02737757,  0.022656  , -0.30870296, -0.14023771]])
pca.components_.shape
(3, 9)
X_fulltrain.shape
(194413, 9)
pca.components_.T.shape
(9, 3)
np.matmul(X_fulltrain.values,pca.components_.T)
array([[ 1.02594082,  0.52223329,  0.42203243],
       [ 1.25984674, -0.33726701,  0.86188511],
       [ 1.13373865,  0.26441179,  0.21948021],
       ...,
       [-1.01876888,  0.6986476 , -0.07366154],
       [-0.74036858, -0.54089609,  0.40091094],
       [-1.76902279,  0.43974171,  0.27114652]])
pca.fit_transform(X_fulltrain)
array([[ 1.02594082,  0.52223329,  0.42203243],
       [ 1.25984674, -0.33726701,  0.86188511],
       [ 1.13373865,  0.26441179,  0.21948021],
       ...,
       [-1.01876888,  0.6986476 , -0.07366154],
       [-0.74036858, -0.54089609,  0.40091094],
       [-1.76902279,  0.43974171,  0.27114652]])
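The plain matrix product matches fit_transform here only because the rank-transformed columns are (essentially) mean-zero; in general PCA centers the data first, so the transform is (X - pca.mean_) @ pca.components_.T, which always holds:
np.allclose(pca.transform(X_fulltrain),
            (X_fulltrain.values - pca.mean_) @ pca.components_.T)  # True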
sklearn uses duck typing: no inheritance is required, as long as the class defines the corresponding methods, namely fit() (returning self), transform(), and fit_transform(). Inheriting directly is simply more convenient:

- BaseEstimator is the most basic class in sklearn, from which the other classes derive; it provides the set_params() and get_params() methods.
- TransformerMixin provides fit_transform(), so a class inheriting from it does not need to define fit_transform itself.
- RegressorMixin provides the score() method (R² by default); predict() still has to be defined by the estimator itself. (A duck-typed sketch without the mixins follows the class below.)

class PCARegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_components=3):
        self.n_components = n_components
    def fit(self, X, y):
        self.pca_ = PCA(n_components=self.n_components).fit(X)
        self.X_ = self.pca_.transform(X)
        self.reg_ = LinearRegression().fit(self.X_, y)
        return self
    def predict(self, X):
        self.pred_ = self.reg_.predict(self.pca_.transform(X))
        return self.pred_
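For contrast, the duck-typed version mentioned above would spell the plumbing out itself (a sketch; this class is hypothetical and not used below, and score() would no longer come for free):
class PCARegressorDuck:
    def __init__(self, n_components=3):
        self.n_components = n_components
    def fit(self, X, y):
        self.pca_ = PCA(n_components=self.n_components).fit(X)
        self.reg_ = LinearRegression().fit(self.pca_.transform(X), y)
        return self
    def predict(self, X):
        return self.reg_.predict(self.pca_.transform(X))
    # These come from BaseEstimator when inheriting; GridSearchCV needs them
    def get_params(self, deep=True):
        return {'n_components': self.n_components}
    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self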
model = PCARegressor()
model.fit(X=X_fulltrain, y=y_fulltrain)
PCARegressor()
model.X_
array([[ 1.02594082,  0.52223329,  0.42203243],
       [ 1.25984674, -0.33726701,  0.86188511],
       [ 1.13373865,  0.26441179,  0.21948021],
       ...,
       [-1.01876888,  0.6986476 , -0.07366154],
       [-0.74036858, -0.54089609,  0.40091094],
       [-1.76902279,  0.43974171,  0.27114652]])
hyperparam_grid = [
{'n_components': range(1, len(cols)+1)}
]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
scoring=r2_oos_scorer,
return_train_score=True)
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=PCARegressor(), param_grid=[{'n_components': range(1, 10)}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'n_components': 6}
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
cv_results['params']):
print(np.sqrt(mean_score), params)
nan {'n_components': 1}
0.03978386586015345 {'n_components': 2}
0.03867864707593599 {'n_components': 3}
0.05065681706795535 {'n_components': 4}
0.050715696965028 {'n_components': 5}
0.052339724870998625 {'n_components': 6}
0.043948248157652296 {'n_components': 7}
0.05198899108126847 {'n_components': 8}
0.04966417067338399 {'n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_32890/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test,y_pred=y_pred)
-0.010497492168772826
%%time
for i in range(len(fulltrain_idx)):
X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
X_test = df_rank.loc[test_idx[i], cols]
y_test = df_rank.loc[test_idx[i], 'exret']
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]],
scoring=r2_oos_scorer,
return_train_score=True)
grid_search.fit(X_fulltrain, y_fulltrain)
y_pred = grid_search.predict(X=X_test)
y_pred = y_pred.reshape(-1)
print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.010497492168772826
Test year 2017 : -0.0892205268426225
Test year 2018 : -0.04907431002684648
Test year 2019 : 0.006466449764656601
Test year 2020 : -0.0005226873925128217
Test year 2021 : 0.009468277374521827
Test year 2022 : -0.006204075783554419
Test year 2023 : 0.05486550995882933
CPU times: user 1min 21s, sys: 7.58 s, total: 1min 29s
Wall time: 22.9 s
pca = PCA()
linear_reg = LinearRegression()
pipeline = Pipeline(steps=[('pca',pca),
('linear_regression', linear_reg)])
hyperparam_grid = {'pca__n_components': range(1,len(cols)+1)}
grid_search = GridSearchCV(pipeline, hyperparam_grid, cv=[cv_idx[0]],
scoring=r2_oos_scorer,
return_train_score=True)
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
%%time
grid_search.fit(X=X_fulltrain,y=y_fulltrain)
CPU times: user 4.38 s, sys: 447 ms, total: 4.82 s
Wall time: 1.27 s
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=Pipeline(steps=[('pca', PCA()), ('linear_regression', LinearRegression())]), param_grid={'pca__n_components': range(1, 10)}, return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'pca__n_components': 6}
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
cv_results['params']):
print(np.sqrt(mean_score), params)
nan {'pca__n_components': 1}
0.03978386586015345 {'pca__n_components': 2}
0.03867864707593599 {'pca__n_components': 3}
0.05065681706795535 {'pca__n_components': 4}
0.050715696965028 {'pca__n_components': 5}
0.052339724870998625 {'pca__n_components': 6}
0.043948248157652296 {'pca__n_components': 7}
0.05198899108126847 {'pca__n_components': 8}
0.04966417067338399 {'pca__n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_32890/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.010497492168772826
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
model = SGDRegressor(penalty='elasticnet')
hyperparam_grid = [{'alpha':[0.0001, 0.001, 0.01, 0.1],
'l1_ratio':[0.15, 0.30, 0.5, 0.7]}]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
scoring=r2_oos_scorer,
return_train_score=True)
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=SGDRegressor(penalty='elasticnet'), param_grid=[{'alpha': [0.0001, 0.001, 0.01, 0.1], 'l1_ratio': [0.15, 0.3, 0.5, 0.7]}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'alpha': 0.01, 'l1_ratio': 0.5}
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.0311148263765626
%%time
for i in range(len(fulltrain_idx)):
X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
X_test = df_rank.loc[test_idx[i], cols]
y_test = df_rank.loc[test_idx[i], 'exret']
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]],
scoring=r2_oos_scorer,
return_train_score=True)
grid_search.fit(X_fulltrain, y_fulltrain)
y_pred = grid_search.predict(X=X_test)
y_pred = y_pred.reshape(-1)
print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.00864821027186724
Test year 2017 : -0.10212855467368231
Test year 2018 : -0.0360399221201777
Test year 2019 : 0.009780619233734189
Test year 2020 : -0.0013783134494498306
Test year 2021 : 0.007371679077704529
Test year 2022 : -0.010387142856184584
Test year 2023 : 0.0358911891731738
CPU times: user 1min 49s, sys: 3.03 s, total: 1min 52s
Wall time: 39.5 s
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
hyperparam_grid = [
{'max_depth': [1,2,3,4,5,6],
'learning_rate': [0.1, 0.05, 0.01]}
]
model = GradientBoostingRegressor()
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]],
scoring=r2_oos_scorer,
return_train_score=True)
%%time
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
CPU times: user 8min 4s, sys: 1.33 s, total: 8min 6s
Wall time: 8min 10s
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=GradientBoostingRegressor(), param_grid=[{'learning_rate': [0.1, 0.05, 0.01], 'max_depth': [1, 2, 3, 4, 5, 6]}], return_train_score=True, scoring=make_scorer(r2_oos))
grid_search.best_params_
{'learning_rate': 0.1, 'max_depth': 3}
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
cv_results['params']):
print(mean_score, params)
0.0016364787061411423 {'learning_rate': 0.1, 'max_depth': 1}
0.0031692481731780964 {'learning_rate': 0.1, 'max_depth': 2}
0.005036926182540924 {'learning_rate': 0.1, 'max_depth': 3}
0.0014929783765171845 {'learning_rate': 0.1, 'max_depth': 4}
0.002093148803113287 {'learning_rate': 0.1, 'max_depth': 5}
-0.002867794489446185 {'learning_rate': 0.1, 'max_depth': 6}
0.0009616462813958337 {'learning_rate': 0.05, 'max_depth': 1}
0.002077392481220186 {'learning_rate': 0.05, 'max_depth': 2}
0.004847425597103383 {'learning_rate': 0.05, 'max_depth': 3}
0.004754069888309176 {'learning_rate': 0.05, 'max_depth': 4}
0.004074091843459637 {'learning_rate': 0.05, 'max_depth': 5}
0.0021864643800394434 {'learning_rate': 0.05, 'max_depth': 6}
-0.001052554576351561 {'learning_rate': 0.01, 'max_depth': 1}
-0.00010459917823935072 {'learning_rate': 0.01, 'max_depth': 2}
0.0014224899839351268 {'learning_rate': 0.01, 'max_depth': 3}
0.0030169482746364995 {'learning_rate': 0.01, 'max_depth': 4}
0.003678689341579111 {'learning_rate': 0.01, 'max_depth': 5}
0.003458896012854762 {'learning_rate': 0.01, 'max_depth': 6}
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.045627159691304264
tf.__version__
'2.8.0'
keras.__version__
'2.8.0'
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_train = X_fulltrain.values[cv_idx[0][0]]
y_train = y_fulltrain.values[cv_idx[0][0]]
X_val = X_fulltrain.values[cv_idx[0][1]]
y_val = y_fulltrain.values[cv_idx[0][1]]
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
X_train.shape
(77643, 9)
X_val.shape
(116770, 9)
nn_model = keras.models.Sequential()
nn_model.add(keras.layers.InputLayer(input_shape=[X_fulltrain.shape[1]]))
nn_model.add(keras.layers.Dense(8, activation='relu'))
nn_model.add(keras.layers.Dense(4, activation='relu'))
nn_model.add(keras.layers.Dense(1))
2023-04-03 09:33:33.820524: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
nn_model.compile(loss='mse',optimizer='sgd')
nn_model.fit(X_train, y_train, epochs=10,
validation_data=(X_val,y_val))
Epoch 1/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0265 - val_loss: 0.0278
Epoch 2/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0226 - val_loss: 0.0278
Epoch 3/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0225 - val_loss: 0.0276
Epoch 4/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0277
Epoch 5/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0277
Epoch 6/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0276
Epoch 7/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0272
Epoch 8/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0275
Epoch 9/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0274
Epoch 10/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0275
<keras.callbacks.History at 0x7f9a581a7e20>
y_pred = nn_model.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
0.005978663942921458
def build_model(learning_rate=0.003):
nn_model = keras.models.Sequential()
nn_model.add(keras.layers.InputLayer(input_shape=[9]))
nn_model.add(keras.layers.Dense(8, activation='relu'))
nn_model.add(keras.layers.Dense(4, activation='relu'))
nn_model.add(keras.layers.Dense(1))
optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
nn_model.compile(loss="mse", optimizer=optimizer)
return nn_model
# from scikeras.wrappers import KerasRegressor
# keras_reg = KerasRegressor(build_model)
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_32890/2335962656.py:3: DeprecationWarning: KerasRegressor is deprecated, use Sci-Keras (https://github.com/adriangb/scikeras) instead. See https://www.adriangb.com/scikeras/stable/migration.html for help migrating.
  keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
hyperparams_grid = {
'learning_rate':[0.003]
}
nn_search_cv = GridSearchCV(keras_reg, hyperparams_grid, cv=[cv_idx[0]])
nn_search_cv.fit(X_fulltrain, y_fulltrain, epochs=10,
validation_data=(X_val,y_val))
Epoch 1/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0326 - val_loss: 0.0286
Epoch 2/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0230 - val_loss: 0.0278
Epoch 3/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0226 - val_loss: 0.0279
Epoch 4/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0225 - val_loss: 0.0276
Epoch 5/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0225 - val_loss: 0.0277
Epoch 6/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0275
Epoch 7/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0277
Epoch 8/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0276
Epoch 9/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0276
Epoch 10/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0276
3650/3650 [==============================] - 3s 906us/step - loss: 0.0276
Epoch 1/10
6076/6076 [==============================] - 6s 975us/step - loss: 0.0333 - val_loss: 0.0272
Epoch 2/10
6076/6076 [==============================] - 6s 954us/step - loss: 0.0253 - val_loss: 0.0268
Epoch 3/10
6076/6076 [==============================] - 6s 956us/step - loss: 0.0252 - val_loss: 0.0267
Epoch 4/10
6076/6076 [==============================] - 6s 963us/step - loss: 0.0251 - val_loss: 0.0267
Epoch 5/10
6076/6076 [==============================] - 6s 949us/step - loss: 0.0251 - val_loss: 0.0268
Epoch 6/10
6076/6076 [==============================] - 6s 966us/step - loss: 0.0251 - val_loss: 0.0267
Epoch 7/10
6076/6076 [==============================] - 6s 1ms/step - loss: 0.0251 - val_loss: 0.0268
Epoch 8/10
6076/6076 [==============================] - 6s 1ms/step - loss: 0.0251 - val_loss: 0.0267
Epoch 9/10
6076/6076 [==============================] - 6s 1ms/step - loss: 0.0251 - val_loss: 0.0267
Epoch 10/10
6076/6076 [==============================] - 6s 963us/step - loss: 0.0251 - val_loss: 0.0267
GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]), array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))], estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x7f9a4ac096a0>, param_grid={'learning_rate': [0.003]})
y_pred = nn_search_cv.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
-0.023086233917368748