In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, HuberRegressor, SGDRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline

# import lightgbm as lgb

import tensorflow as tf
from tensorflow import keras
  • Data
    • Handling NA values
    • Use rank instead of numerical values
  • Train, Validation, Test split
  • Evaluation metrics
  • Sklearn
  • Models
    • Linear regression
    • Huber regressor
    • Random Forest
    • Partial Least Squares
    • Principal Component Regression
      • PCA transform
      • PCA regression
    • Pipeline
    • Elastic Net
    • Gradient Boosted Regression Trees
    • Neural Nets
      • GridSearchCV Neural Nets
  • Transformation pipeline example

Data¶

In [ ]:
df = pd.read_pickle('../../../data/factor_exposure/all_exposure_2024.pkl')
In [ ]:
df
Out[ ]:
secID ret_date tradeDate ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
0 000001.XSHE 2007-07 2007-06-29 0.316497 0.002481 0.314016 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614 0.123739 NaN NaN NaN NaN NaN NaN
1 000001.XSHE 2007-08 2007-07-31 0.048855 0.002404 0.046451 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN
2 000001.XSHE 2007-09 2007-08-31 0.052105 0.002621 0.049484 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN
3 000001.XSHE 2007-10 2007-09-28 0.201851 0.003095 0.198756 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN
4 000001.XSHE 2007-11 2007-10-31 -0.249116 0.003780 -0.252896 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
616458 689009.XSHG 2024-01 2023-12-29 -0.213082 0.001902 -0.214983 2023-12 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228
616459 689009.XSHG 2024-02 2024-01-31 0.298201 0.001749 0.296451 2024-01 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890
616460 689009.XSHG 2024-03 2024-02-29 -0.011551 0.001783 -0.013334 2024-02 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755
616461 689009.XSHG 2024-04 2024-03-29 -0.071786 0.001687 -0.073474 2024-03 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928
616462 689009.XSHG NaT 2024-04-12 NaN NaN NaN 2024-04 1.433023e+10 23.385637 -0.073474 2024-03 -0.104366 NaN 0.260342 NaN 0.000121 NaN NaN NaN NaN

616463 rows × 20 columns

In [ ]:
df.drop('tradeDate',axis=1,inplace=True)
In [ ]:
df
Out[ ]:
secID ret_date ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
0 000001.XSHE 2007-07 0.316497 0.002481 0.314016 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614 0.123739 NaN NaN NaN NaN NaN NaN
1 000001.XSHE 2007-08 0.048855 0.002404 0.046451 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN
2 000001.XSHE 2007-09 0.052105 0.002621 0.049484 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN
3 000001.XSHE 2007-10 0.201851 0.003095 0.198756 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN
4 000001.XSHE 2007-11 -0.249116 0.003780 -0.252896 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
616458 689009.XSHG 2024-01 -0.213082 0.001902 -0.214983 2023-12 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228
616459 689009.XSHG 2024-02 0.298201 0.001749 0.296451 2024-01 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890
616460 689009.XSHG 2024-03 -0.011551 0.001783 -0.013334 2024-02 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755
616461 689009.XSHG 2024-04 -0.071786 0.001687 -0.073474 2024-03 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928
616462 689009.XSHG NaT NaN NaN NaN 2024-04 1.433023e+10 23.385637 -0.073474 2024-03 -0.104366 NaN 0.260342 NaN 0.000121 NaN NaN NaN NaN

616463 rows × 19 columns

Handling NA values¶

In [ ]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 5299
ret 23412
rf 5299
exret 23412
ym 0
mktcap 17365
size 17365
rev 22223
mom_date 4110
mom 56595
beta 29710
bm 5210
illiq 36680
illiq_12m 107758
vol 25669
ivol 40175
vol_clip 25669
ivol_clip 40175

Drop the rows where ret_date is NA: these sit at the most recent end of the data, where the next-month return does not exist yet.

In [ ]:
df = df[~df['ret_date'].isna()].copy()
In [ ]:
df
Out[ ]:
secID ret_date ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
0 000001.XSHE 2007-07 0.316497 0.002481 0.314016 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614 0.123739 NaN NaN NaN NaN NaN NaN
1 000001.XSHE 2007-08 0.048855 0.002404 0.046451 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN
2 000001.XSHE 2007-09 0.052105 0.002621 0.049484 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN
3 000001.XSHE 2007-10 0.201851 0.003095 0.198756 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN
4 000001.XSHE 2007-11 -0.249116 0.003780 -0.252896 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
616457 689009.XSHG 2023-12 -0.103927 0.002068 -0.105996 2023-11 1.732706e+10 23.575535 0.007540 2023-10 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128 0.017594 0.015128
616458 689009.XSHG 2024-01 -0.213082 0.001902 -0.214983 2023-12 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228
616459 689009.XSHG 2024-02 0.298201 0.001749 0.296451 2024-01 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890
616460 689009.XSHG 2024-03 -0.011551 0.001783 -0.013334 2024-02 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755
616461 689009.XSHG 2024-04 -0.071786 0.001687 -0.073474 2024-03 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928

611164 rows × 19 columns

Momentum only starts in 2008-01. For simplicity, restrict all data to start from 2008-01.

In [ ]:
df.loc[~df['mom'].isna(),'ret_date'].min()
Out[ ]:
Period('2008-01', 'M')
In [ ]:
df = df[df['ret_date'] >= '2008-01'].copy()
In [ ]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
ret 17713
rf 0
exret 17713
ym 0
mktcap 17015
size 17015
rev 21679
mom_date 3931
mom 43462
beta 24044
bm 5017
illiq 30823
illiq_12m 95560
vol 19966
ivol 21978
vol_clip 19966
ivol_clip 21978

The remaining NA values have at least three sources:

  • rows created when filling in suspended-trading dates,
  • minimum-sample-size requirements when computing the characteristics,
  • NA values given directly by the data provider 优矿 (Uqer)

Rows where the return is NA are simply dropped.

In [ ]:
df = df[~df['ret'].isna()].copy()
In [ ]:
df
Out[ ]:
secID ret_date ret rf exret ym mktcap size rev mom_date mom beta bm illiq illiq_12m vol ivol vol_clip ivol_clip
6 000001.XSHE 2008-01 -0.137306 0.002949 -0.140255 2007-12 6.574629e+10 24.909069 0.066834 2007-11 NaN 0.9468 0.094476 0.000025 NaN 0.026541 NaN 0.026541 NaN
7 000001.XSHE 2008-02 -0.004504 0.002946 -0.007450 2008-01 5.850212e+10 24.792329 -0.140255 2007-12 NaN 0.9654 0.109513 0.000039 NaN 0.037722 0.012909 0.037722 0.012909
8 000001.XSHE 2008-03 -0.149321 0.002746 -0.152068 2008-02 5.823860e+10 24.787814 -0.007450 2008-01 NaN 1.0292 0.110009 0.000064 NaN 0.041448 0.009032 0.041448 0.009032
9 000001.XSHE 2008-04 0.050355 0.002862 0.047493 2008-03 4.954234e+10 24.626093 -0.152068 2008-02 NaN 1.0238 0.201102 0.000043 NaN 0.045109 0.021484 0.045109 0.021484
10 000001.XSHE 2008-05 -0.148211 0.002953 -0.151164 2008-04 5.203702e+10 24.675221 0.047493 2008-03 NaN 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
616457 689009.XSHG 2023-12 -0.103927 0.002068 -0.105996 2023-11 1.732706e+10 23.575535 0.007540 2023-10 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128 0.017594 0.015128
616458 689009.XSHG 2024-01 -0.213082 0.001902 -0.214983 2023-12 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228
616459 689009.XSHG 2024-02 0.298201 0.001749 0.296451 2024-01 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890
616460 689009.XSHG 2024-03 -0.011551 0.001783 -0.013334 2024-02 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755
616461 689009.XSHG 2024-04 -0.071786 0.001687 -0.073474 2024-03 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928

580482 rows × 19 columns

In [ ]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
ret 0
rf 0
exret 0
ym 0
mktcap 0
size 0
rev 4664
mom_date 3931
mom 43434
beta 22149
bm 1463
illiq 13508
illiq_12m 79369
vol 2874
ivol 4699
vol_clip 2874
ivol_clip 4699
In [ ]:
df.drop(['mom_date','mktcap','vol_clip','ivol_clip'],axis=1,inplace=True)
In [ ]:
df.drop(['ret','rf'],axis=1,inplace=True)
In [ ]:
df.reset_index(inplace=True,drop=True)
In [ ]:
df
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 NaN 0.9468 0.094476 0.000025 NaN 0.026541 NaN
1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 NaN 0.9654 0.109513 0.000039 NaN 0.037722 0.012909
2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 NaN 1.0292 0.110009 0.000064 NaN 0.041448 0.009032
3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 NaN 1.0238 0.201102 0.000043 NaN 0.045109 0.021484
4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 NaN 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 23.575535 0.007540 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128
580478 689009.XSHG 2024-01 -0.214983 2023-12 23.465801 -0.105996 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228
580479 689009.XSHG 2024-02 0.296451 2024-01 23.226170 -0.214983 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890
580480 689009.XSHG 2024-03 -0.013334 2024-02 23.487149 0.296451 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755
580481 689009.XSHG 2024-04 -0.073474 2024-03 23.460131 -0.013334 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928

580482 rows × 13 columns

  • The NAs in rev (short-term reversal) arise because the stock was suspended in the previous month, so the previous-month return needed at that return date does not exist.
  • The NAs in beta and bm come directly from Uqer; they can be filled with the cross-sectional median of the month.
  • illiq, ivol, and vol can likewise be filled with the cross-sectional median of the month.
In [ ]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
exret 0
ym 0
size 0
rev 4664
mom 43434
beta 22149
bm 1463
illiq 13508
illiq_12m 79369
vol 2874
ivol 4699
In [ ]:
# Drop the NAs in rev (reversal); fill the rest with the cross-sectional median
df = df[~df['rev'].isna()].copy()
In [ ]:
cols = ['mom','beta','bm','illiq','illiq_12m','vol','ivol']
In [ ]:
df
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 NaN 0.9468 0.094476 0.000025 NaN 0.026541 NaN
1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 NaN 0.9654 0.109513 0.000039 NaN 0.037722 0.012909
2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 NaN 1.0292 0.110009 0.000064 NaN 0.041448 0.009032
3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 NaN 1.0238 0.201102 0.000043 NaN 0.045109 0.021484
4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 NaN 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 23.575535 0.007540 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128
580478 689009.XSHG 2024-01 -0.214983 2023-12 23.465801 -0.105996 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228
580479 689009.XSHG 2024-02 0.296451 2024-01 23.226170 -0.214983 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890
580480 689009.XSHG 2024-03 -0.013334 2024-02 23.487149 0.296451 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755
580481 689009.XSHG 2024-04 -0.073474 2024-03 23.460131 -0.013334 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928

575818 rows × 13 columns

In [ ]:
temp = df.groupby('ret_date',as_index=False)[cols].transform(lambda x: x.fillna(x.median()))
In [ ]:
temp.fillna(0, inplace=True)
In [ ]:
temp
Out[ ]:
mom beta bm illiq illiq_12m vol ivol
0 0.796305 0.9468 0.094476 0.000025 0.000502 0.026541 0.000000
1 1.145639 0.9654 0.109513 0.000039 0.000478 0.037722 0.012909
2 0.693690 1.0292 0.110009 0.000064 0.000474 0.041448 0.009032
3 0.558575 1.0238 0.201102 0.000043 0.000528 0.045109 0.021484
4 -0.048874 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ...
580477 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128
580478 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228
580479 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890
580480 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755
580481 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928

575818 rows × 7 columns

In [ ]:
df[cols] = temp.copy()
In [ ]:
for col in df.columns:
    print(col, df[col].isna().sum())
secID 0
ret_date 0
exret 0
ym 0
size 0
rev 0
mom 0
beta 0
bm 0
illiq 0
illiq_12m 0
vol 0
ivol 0
In [ ]:
df
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 0.796305 0.9468 0.094476 0.000025 0.000502 0.026541 0.000000
1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 1.145639 0.9654 0.109513 0.000039 0.000478 0.037722 0.012909
2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 0.693690 1.0292 0.110009 0.000064 0.000474 0.041448 0.009032
3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 0.558575 1.0238 0.201102 0.000043 0.000528 0.045109 0.021484
4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 -0.048874 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 23.575535 0.007540 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128
580478 689009.XSHG 2024-01 -0.214983 2023-12 23.465801 -0.105996 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228
580479 689009.XSHG 2024-02 0.296451 2024-01 23.226170 -0.214983 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890
580480 689009.XSHG 2024-03 -0.013334 2024-02 23.487149 0.296451 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755
580481 689009.XSHG 2024-04 -0.073474 2024-03 23.460131 -0.013334 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928

575818 rows × 13 columns

In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 575818 entries, 0 to 580481
Data columns (total 13 columns):
 #   Column     Non-Null Count   Dtype    
---  ------     --------------   -----    
 0   secID      575818 non-null  object   
 1   ret_date   575818 non-null  period[M]
 2   exret      575818 non-null  float64  
 3   ym         575818 non-null  period[M]
 4   size       575818 non-null  float64  
 5   rev        575818 non-null  float64  
 6   mom        575818 non-null  float64  
 7   beta       575818 non-null  float64  
 8   bm         575818 non-null  float64  
 9   illiq      575818 non-null  float64  
 10  illiq_12m  575818 non-null  float64  
 11  vol        575818 non-null  float64  
 12  ivol       575818 non-null  float64  
dtypes: float64(10), object(1), period[M](2)
memory usage: 61.5+ MB

Use rank instead of numerical values¶

$$c_{i,t} = \frac{2}{N+1}CSrank(c^r_{i,t}) - 1$$

$c^r_{i,t}$ is the raw characteristic value, and $CSrank$ ranks it against all other firms in the same month $t$; the transformation maps each characteristic into the interval $(-1, 1)$. For example, with $N = 3$ firms, ranks $1, 2, 3$ map to $-1/2, 0, 1/2$.

In [ ]:
def csrank(df):
    return df.rank() * 2 / (len(df) + 1) - 1
In [ ]:
num_X_cols = df.select_dtypes('number').columns.drop('exret').tolist()
In [ ]:
num_X_cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [ ]:
df[['ret_date']+num_X_cols]
Out[ ]:
ret_date size rev mom beta bm illiq illiq_12m vol ivol
0 2008-01 24.909069 0.066834 0.796305 0.9468 0.094476 0.000025 0.000502 0.026541 0.000000
1 2008-02 24.792329 -0.140255 1.145639 0.9654 0.109513 0.000039 0.000478 0.037722 0.012909
2 2008-03 24.787814 -0.007450 0.693690 1.0292 0.110009 0.000064 0.000474 0.041448 0.009032
3 2008-04 24.626093 -0.152068 0.558575 1.0238 0.201102 0.000043 0.000528 0.045109 0.021484
4 2008-05 24.675221 0.047493 -0.048874 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098
... ... ... ... ... ... ... ... ... ... ...
580477 2023-12 23.575535 0.007540 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128
580478 2024-01 23.465801 -0.105996 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228
580479 2024-02 23.226170 -0.214983 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890
580480 2024-03 23.487149 0.296451 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755
580481 2024-04 23.460131 -0.013334 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928

575818 rows × 10 columns

In [ ]:
df[['ret_date','size']].groupby('ret_date',group_keys=True).apply(csrank)
Out[ ]:
ret_date size
ret_date
2008-01 0 0.0 0.969559
196 0.0 0.990868
701 0.0 0.522070
1632 0.0 0.678843
1828 0.0 -0.231355
... ... ... ...
2024-04 580322 0.0 -0.224050
580355 0.0 -0.121426
580394 0.0 0.845672
580439 0.0 0.952213
580481 0.0 0.705445

575818 rows × 2 columns

In [ ]:
temp = df[['ret_date']+num_X_cols].groupby('ret_date',group_keys=True).apply(csrank)
In [ ]:
temp
Out[ ]:
ret_date size rev mom beta bm illiq illiq_12m vol ivol
ret_date
2008-01 0 0.0 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
196 0.0 0.990868 -0.990868 0.852359 0.662100 -0.375951 -0.996956 -0.989346 0.745814 0.000000
701 0.0 0.522070 -0.972603 0.552511 0.523592 0.283105 -0.223744 -0.595129 0.814307 0.000000
1632 0.0 0.678843 -0.506849 -0.517504 0.775495 -0.636225 -0.698630 -0.511416 0.493151 0.000000
1828 0.0 -0.231355 -0.945967 0.709285 0.000000 -0.403349 0.000000 0.000000 -0.982496 0.000000
... ... ... ... ... ... ... ... ... ... ... ...
2024-04 580322 0.0 -0.224050 0.788876 0.872699 0.518997 -0.316882 0.229926 0.310223 -0.073247 0.164121
580355 0.0 -0.121426 0.336859 -0.942812 0.849197 -0.210732 0.381120 0.228750 0.623580 0.095574
580394 0.0 0.845672 -0.481394 0.279279 0.686251 0.255778 -0.156287 -0.175872 -0.578143 -0.592244
580439 0.0 0.952213 -0.869957 0.562867 0.264787 -0.171563 -0.974540 -0.984332 -0.707795 -0.609871
580481 0.0 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143

575818 rows × 10 columns

In [ ]:
temp.drop('ret_date',axis=1).reset_index()
Out[ ]:
ret_date level_1 size rev mom beta bm illiq illiq_12m vol ivol
0 2008-01 0 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
1 2008-01 196 0.990868 -0.990868 0.852359 0.662100 -0.375951 -0.996956 -0.989346 0.745814 0.000000
2 2008-01 701 0.522070 -0.972603 0.552511 0.523592 0.283105 -0.223744 -0.595129 0.814307 0.000000
3 2008-01 1632 0.678843 -0.506849 -0.517504 0.775495 -0.636225 -0.698630 -0.511416 0.493151 0.000000
4 2008-01 1828 -0.231355 -0.945967 0.709285 0.000000 -0.403349 0.000000 0.000000 -0.982496 0.000000
... ... ... ... ... ... ... ... ... ... ... ...
575813 2024-04 580322 -0.224050 0.788876 0.872699 0.518997 -0.316882 0.229926 0.310223 -0.073247 0.164121
575814 2024-04 580355 -0.121426 0.336859 -0.942812 0.849197 -0.210732 0.381120 0.228750 0.623580 0.095574
575815 2024-04 580394 0.845672 -0.481394 0.279279 0.686251 0.255778 -0.156287 -0.175872 -0.578143 -0.592244
575816 2024-04 580439 0.952213 -0.869957 0.562867 0.264787 -0.171563 -0.974540 -0.984332 -0.707795 -0.609871
575817 2024-04 580481 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143

575818 rows × 11 columns

In [ ]:
temp = temp.drop('ret_date',axis=1).reset_index().set_index('level_1')
temp
Out[ ]:
ret_date size rev mom beta bm illiq illiq_12m vol ivol
level_1
0 2008-01 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
196 2008-01 0.990868 -0.990868 0.852359 0.662100 -0.375951 -0.996956 -0.989346 0.745814 0.000000
701 2008-01 0.522070 -0.972603 0.552511 0.523592 0.283105 -0.223744 -0.595129 0.814307 0.000000
1632 2008-01 0.678843 -0.506849 -0.517504 0.775495 -0.636225 -0.698630 -0.511416 0.493151 0.000000
1828 2008-01 -0.231355 -0.945967 0.709285 0.000000 -0.403349 0.000000 0.000000 -0.982496 0.000000
... ... ... ... ... ... ... ... ... ... ...
580322 2024-04 -0.224050 0.788876 0.872699 0.518997 -0.316882 0.229926 0.310223 -0.073247 0.164121
580355 2024-04 -0.121426 0.336859 -0.942812 0.849197 -0.210732 0.381120 0.228750 0.623580 0.095574
580394 2024-04 0.845672 -0.481394 0.279279 0.686251 0.255778 -0.156287 -0.175872 -0.578143 -0.592244
580439 2024-04 0.952213 -0.869957 0.562867 0.264787 -0.171563 -0.974540 -0.984332 -0.707795 -0.609871
580481 2024-04 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143

575818 rows × 10 columns

In [ ]:
df_rank = pd.merge(df.drop(num_X_cols, axis=1),
                   temp.drop('ret_date',axis=1),
                   left_index=True, right_index=True)
In [ ]:
del temp
In [ ]:
df_rank
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784
... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365
580478 689009.XSHG 2024-01 -0.214983 2023-12 0.691854 -0.841401 0.168831 0.515545 -0.574577 -0.515152 -0.553325 0.515545 0.622983
580479 689009.XSHG 2024-02 0.296451 2024-01 0.674975 -0.066928 -0.269872 0.369774 -0.590579 -0.492836 -0.555643 -0.341315 0.285574
580480 689009.XSHG 2024-03 -0.013334 2024-02 0.712774 0.947884 -0.253527 0.546042 -0.709444 -0.564655 -0.563480 -0.303292 0.719436
580481 689009.XSHG 2024-04 -0.073474 2024-03 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143

575818 rows × 13 columns

In [ ]:
df_rank['size'].describe()
Out[ ]:
count    5.758180e+05
mean     7.502544e-18
std      5.771543e-01
min     -9.996083e-01
25%     -4.998116e-01
50%      0.000000e+00
75%      4.998176e-01
max      9.996083e-01
Name: size, dtype: float64

Train, Validation, Test split¶

In [ ]:
df_rank['year'] = df_rank['ret_date'].dt.year
In [ ]:
time_idx = [value for (key, value) in sorted(df_rank.groupby('year').groups.items())]
In [ ]:
df_rank.groupby('year')['secID'].nunique()
Out[ ]:
year
2008    1463
2009    1530
2010    1841
2011    2142
2012    2383
2013    2432
2014    2549
2015    2772
2016    2941
2017    3392
2018    3522
2019    3648
2020    3961
2021    4422
2022    4770
2023    5110
2024    5111
Name: secID, dtype: int64
In [ ]:
df_rank.groupby('year')['secID'].count()
Out[ ]:
year
2008    16621
2009    17335
2010    19823
2011    23864
2012    26912
2013    28592
2014    29539
2015    31727
2016    33468
2017    37665
2018    41103
2019    41992
2020    44134
2021    49181
2022    54418
2023    59061
2024    20383
Name: secID, dtype: int64
In [ ]:
def list_flat(list_):
    return [item for sublist in list_ for item in sublist]
# This is the same as:
# def list_flat2(list_):
#     result = []
#     for sublist in list_:
#         for item in sublist:
#             result.append(item)
#     return result
In [ ]:
list_flat([[1,2,3],[3,4,5]])
Out[ ]:
[1, 2, 3, 3, 4, 5]
In [ ]:
np.array([[1,2,3],[3,4,5]]).flatten()
Out[ ]:
array([1, 2, 3, 3, 4, 5])
In [ ]:
df_rank
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol year
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365 2023
580478 689009.XSHG 2024-01 -0.214983 2023-12 0.691854 -0.841401 0.168831 0.515545 -0.574577 -0.515152 -0.553325 0.515545 0.622983 2024
580479 689009.XSHG 2024-02 0.296451 2024-01 0.674975 -0.066928 -0.269872 0.369774 -0.590579 -0.492836 -0.555643 -0.341315 0.285574 2024
580480 689009.XSHG 2024-03 -0.013334 2024-02 0.712774 0.947884 -0.253527 0.546042 -0.709444 -0.564655 -0.563480 -0.303292 0.719436 2024
580481 689009.XSHG 2024-04 -0.073474 2024-03 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143 2024

575818 rows × 14 columns

In [ ]:
# training, validation, testing scheme:
# 1. [2008-2011], [2012-2015], [2016]
# 2. [2008-2012], [2013-2016], [2017]
# ...
# last. [2008-2019], [2020-2023], [2024]
fulltrain_idx = []
cv_idx = []
test_idx = []
for i in range(4,len(time_idx)-4):
    train_idx = list_flat(time_idx[0:i])
    val_idx = list_flat(time_idx[i:i+4])
    fulltrain_idx.append(train_idx + val_idx)
    cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0], 
                   np.where(np.isin(fulltrain_idx[-1], val_idx))[0])) # GridSearchCV operates on positional arrays internally and drops the pandas index,
                                                                          # so cv_idx must hold positions within fulltrain_idx, numbered from 0
    test_idx.append(time_idx[i+4])
In [ ]:
df_rank.loc[fulltrain_idx[-1]]
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol year
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
580473 689009.XSHG 2023-08 -0.040985 2023-07 0.701600 -0.517200 -0.731600 0.584200 -0.678400 -0.577600 -0.471200 -0.330000 -0.473600 2023
580474 689009.XSHG 2023-09 0.040598 2023-08 0.708997 0.125000 -0.755175 0.343949 -0.653264 -0.599920 -0.489650 -0.016720 0.261545 2023
580475 689009.XSHG 2023-10 -0.060460 2023-09 0.720055 0.669768 -0.649574 0.496535 -0.684815 -0.497129 -0.520887 -0.263512 -0.524055 2023
580476 689009.XSHG 2023-11 0.007540 2023-10 0.716881 -0.502468 0.138006 0.430207 -0.665153 -0.103653 -0.522606 -0.249358 -0.039289 2023
580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365 2023

555435 rows × 14 columns

In [ ]:
# Example
a = [0,1,4,5,3000]
np.where(np.isin(a, [0,3000,4]))[0]
Out[ ]:
array([0, 2, 4])
In [ ]:
test_years = list(range(2016, 2025))
test_years
Out[ ]:
[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

Evaluation metrics¶

Clark and West (2007)
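
The out-of-sample $R^2$ used below benchmarks the predictions against a naive forecast of zero excess return (rather than against the historical mean):

$$R^2_{oos} = 1 - \frac{\sum_{i,t}\left(r_{i,t} - \hat{r}_{i,t}\right)^2}{\sum_{i,t} r_{i,t}^2}$$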

In [ ]:
def r2_oos(y_true, y_pred):
    return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)
In [ ]:
r2_oos_scorer = make_scorer(r2_oos)

Sklearn¶

The design philosophy of Scikit-learn (sklearn):

  • Estimators: anything that can estimate parameters from data, via fit(). Examples: imputers that fill missing values, linear regression, and so on.
  • Transformers (not the neural-network kind, not Vaswani et al. (2017)): anything that can transform data into new data, via transform(). Usually fit_transform() can be called directly.
  • Predictors: anything that can make predictions from data via predict(), e.g. linear regression.

Consistent naming conventions:

  • Hyperparameters are exposed as model.<hyperparameter>, e.g. model.n_estimators
  • Estimated parameters are exposed as model.<estimate>_ with a trailing underscore, e.g. model.feature_importances_

Data are stored as np.array or SciPy sparse matrices, avoiding the custom containers of other packages (such as pandas).

sklearn ships a large collection of machine-learning models and is easy to extend: custom models plug directly into the surrounding machinery (pipelines, grid search, and so on).
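
A toy sketch of these conventions (made-up data, not part of the pipeline below):

In [ ]:
# Estimator + predictor: LinearRegression learns coef_ and intercept_ via fit()
toy_X = np.array([[0.0], [1.0], [2.0]])
toy_y = np.array([1.0, 3.0, 5.0])
toy_reg = LinearRegression(fit_intercept=True)  # hyperparameter: toy_reg.fit_intercept
toy_reg.fit(toy_X, toy_y)
print(toy_reg.coef_, toy_reg.intercept_)        # estimated parameters: trailing underscore
print(toy_reg.predict(np.array([[3.0]])))       # predictor interface

# Transformer: StandardScaler learns mean_ and scale_ in fit() and applies them in transform()
scaler = StandardScaler()
print(scaler.fit_transform(toy_X))              # fit() followed by transform() in one call
print(scaler.mean_, scaler.scale_)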

Models¶

Linear regression¶

In [ ]:
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
In [ ]:
cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']
In [ ]:
model = LinearRegression()
In [ ]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.009411576536744626
Test year 2017 : -0.08839433918218265
Test year 2018 : -0.04979412952068807
Test year 2019 : 0.006460501095753468
Test year 2020 : -0.001548658826626248
Test year 2021 : 0.011487385386933058
Test year 2022 : -0.0009344902234940111
Test year 2023 : 0.009191975684269216
Test year 2024 : -0.0203421002634514
In [ ]:
cols = ['size','rev','illiq','ivol']
In [ ]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.009411576536744626
Test year 2017 : -0.08839433918218265
Test year 2018 : -0.04979412952068807
Test year 2019 : 0.006460501095753468
Test year 2020 : -0.001548658826626248
Test year 2021 : 0.011487385386933058
Test year 2022 : -0.0009344902234940111
Test year 2023 : 0.009191975684269216
Test year 2024 : -0.0203421002634514

Huber regressor¶

In [ ]:
cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']
In [ ]:
model = HuberRegressor(alpha=0.01,epsilon=1.05)
In [ ]:
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : 0.006876742676993675
Test year 2017 : -0.029175894616541687
Test year 2018 : 0.009048190975758708
Test year 2019 : -0.018440250950227055
Test year 2020 : -0.013739947817012599
Test year 2021 : -0.008465853786439048
Test year 2022 : 0.010892561021420222
Test year 2023 : -0.002345351762501169
Test year 2024 : 0.015101267852206668

Random Forest¶

In [ ]:
cols = num_X_cols
cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [ ]:
hyperparam_grid = [
    {'n_estimators': [50], 'max_depth': [3,5,7], 
     'max_features': [3,5]}
]
In [ ]:
model = RandomForestRegressor(random_state=42)
In [ ]:
# Cross validation for period 0, i.e.
# train: [2008-2011], val: [2012-2015], test: [2016]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0], cols]
y_test = df_rank.loc[test_idx[0], 'exret']
In [ ]:
%%time
grid_search.fit(X_fulltrain, y_fulltrain)
CPU times: user 48.4 s, sys: 456 ms, total: 48.9 s
Wall time: 50.8 s
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_depth': [3, 5, 7], 'max_features': [3, 5],
                          'n_estimators': [50]}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [ ]:
grid_search.best_params_
Out[ ]:
{'max_depth': 5, 'max_features': 5, 'n_estimators': 50}
In [ ]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
0.027084789556580172 {'max_depth': 3, 'max_features': 3, 'n_estimators': 50}
0.028556999189124684 {'max_depth': 3, 'max_features': 5, 'n_estimators': 50}
0.05042647627227969 {'max_depth': 5, 'max_features': 3, 'n_estimators': 50}
0.05530932712278905 {'max_depth': 5, 'max_features': 5, 'n_estimators': 50}
0.04464716807967209 {'max_depth': 7, 'max_features': 3, 'n_estimators': 50}
0.05370787731186078 {'max_depth': 7, 'max_features': 5, 'n_estimators': 50}
In [ ]:
pd.DataFrame({"features":num_X_cols,"feature_importance":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',
                                                                                                                        ascending=False)
Out[ ]:
features feature_importance
5 illiq 0.260451
1 rev 0.212757
7 vol 0.120819
8 ivol 0.115962
0 size 0.102099
2 mom 0.067074
4 bm 0.049063
6 illiq_12m 0.044348
3 beta 0.027427
In [ ]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
-0.022154653336818875
In [ ]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.022154653336818875
Test year 2017 : -0.08063812247577906
Test year 2018 : -0.03875517775769799
Test year 2019 : 0.00843188559327912
Test year 2020 : 0.00426710793978613
Test year 2021 : 0.011844755800809903
Test year 2022 : -0.002779929058047914
Test year 2023 : 0.009093821371165545
Test year 2024 : -0.017035974030240375
CPU times: user 18min 26s, sys: 10.6 s, total: 18min 37s
Wall time: 19min 19s

Partial Least Squares¶

In [ ]:
cols = num_X_cols
cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [ ]:
model = PLSRegression(n_components=4)
In [ ]:
y_pred.reshape(-1).shape
Out[ ]:
(20383,)
In [ ]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X_fulltrain, y_fulltrain)
    y_pred = model.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.008579624491225069
Test year 2017 : -0.0932107045146382
Test year 2018 : -0.04912393839310547
Test year 2019 : 0.006105302409689872
Test year 2020 : -0.0015629368636205232
Test year 2021 : 0.011195966741407215
Test year 2022 : -0.000636538680146348
Test year 2023 : 0.009706227073746132
Test year 2024 : -0.02135918156321326
CPU times: user 10.3 s, sys: 783 ms, total: 11.1 s
Wall time: 3.4 s

Principal Component Regression¶

PCA transform¶

In [ ]:
cols = num_X_cols
cols
Out[ ]:
['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']
In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [ ]:
pca = PCA(3, random_state=42)
In [ ]:
pca.fit(X_fulltrain)
Out[ ]:
PCA(n_components=3, random_state=42)
In [ ]:
pca.components_
Out[ ]:
array([[ 0.54095879, -0.09832092, -0.01437701,  0.13052989,  0.10593014,
        -0.54353902, -0.55735791, -0.16921922, -0.18577673],
       [ 0.1230896 ,  0.28038463,  0.28945179,  0.04301217, -0.37797118,
        -0.22714271, -0.12955155,  0.5400808 ,  0.56362587],
       [ 0.06274676, -0.20182262,  0.52534126, -0.60948758, -0.43840652,
        -0.02333879,  0.01991072, -0.30854347, -0.13975493]])
In [ ]:
pca.components_.shape
Out[ ]:
(3, 9)
In [ ]:
X_fulltrain.shape
Out[ ]:
(194413, 9)
In [ ]:
pca.components_.T.shape
Out[ ]:
(9, 3)
In [ ]:
np.matmul(X_fulltrain.values,pca.components_.T)
Out[ ]:
array([[ 1.03429184,  0.50775265,  0.41399241],
       [ 1.25904914, -0.35426324,  0.85397871],
       [ 1.13891321,  0.24847025,  0.21082479],
       ...,
       [-1.01192038,  0.71256279, -0.06717374],
       [-0.74623422, -0.52959361,  0.40740754],
       [-1.76547939,  0.46317872,  0.27584235]])
In [ ]:
pca.fit_transform(X_fulltrain)
Out[ ]:
array([[ 1.03429184,  0.50775265,  0.41399241],
       [ 1.25904914, -0.35426324,  0.85397871],
       [ 1.13891321,  0.24847025,  0.21082479],
       ...,
       [-1.01192038,  0.71256279, -0.06717374],
       [-0.74623422, -0.52959361,  0.40740754],
       [-1.76547939,  0.46317872,  0.27584235]])
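
Note that the manual projection matches pca.fit_transform here only because the rank-transformed features already have essentially zero cross-sectional mean (recall the describe() of size above); in general PCA centers the data first, i.e. fit_transform computes (X - pca.mean_) @ pca.components_.T.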

PCA regression¶

sklearn uses duck typing, so inheritance is not required: it is enough for the class to define the right methods, fit() (returning self), transform(), and fit_transform().

Inheriting directly is more convenient, though:

  • BaseEstimator is the most basic class in sklearn; the other classes all inherit from it. It provides the set_params() and get_params() methods.
  • TransformerMixin provides the fit_transform() method, so a class inheriting from it does not need to define fit_transform itself (see the toy sketch below).
  • Similarly, RegressorMixin provides a default score() method; fit() and predict() themselves still have to be defined, as in the PCARegressor below.
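
As a toy illustration of what the mixins provide (a sketch, not used elsewhere in this notebook):

In [ ]:
class DemeanTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, copy=True):
        self.copy = copy                 # hyperparameter, visible to get_params()
    def fit(self, X, y=None):
        self.mean_ = X.mean(axis=0)      # learned parameter: trailing underscore
        return self
    def transform(self, X):
        return X - self.mean_

demean = DemeanTransformer()
print(demean.get_params())               # from BaseEstimator: {'copy': True}
print(demean.fit_transform(np.eye(3)))   # from TransformerMixin: fit() then transform()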
In [ ]:
class PCARegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_components=3):
        self.n_components = n_components
    
    def fit(self, X, y):
        self.pca_ = PCA(n_components=self.n_components).fit(X)  # fit PCA on the features
        self.X_ = self.pca_.transform(X)                        # principal components
        self.reg_ = LinearRegression().fit(self.X_,y)           # OLS on the components
        return self
    
    def predict(self, X):
        self.pred_ = self.reg_.predict(self.pca_.transform(X))
        return self.pred_
In [ ]:
model = PCARegressor()
In [ ]:
model.fit(X=X_fulltrain, y=y_fulltrain)
Out[ ]:
PCARegressor()
In [ ]:
model.X_
Out[ ]:
array([[ 1.03429184,  0.50775265,  0.41399241],
       [ 1.25904914, -0.35426324,  0.85397871],
       [ 1.13891321,  0.24847025,  0.21082479],
       ...,
       [-1.01192038,  0.71256279, -0.06717374],
       [-0.74623422, -0.52959361,  0.40740754],
       [-1.76547939,  0.46317872,  0.27584235]])
In [ ]:
hyperparam_grid = [
    {'n_components': range(1, len(cols)+1)}
]
In [ ]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [ ]:
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=PCARegressor(),
             param_grid=[{'n_components': range(1, 10)}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [ ]:
grid_search.best_params_
Out[ ]:
{'n_components': 6}
In [ ]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
nan {'n_components': 1}
0.03874473370176844 {'n_components': 2}
0.037587746458808476 {'n_components': 3}
0.04997889197284674 {'n_components': 4}
0.04997497095219836 {'n_components': 5}
0.05161556457239314 {'n_components': 6}
0.041827129620731866 {'n_components': 7}
0.050300834943468646 {'n_components': 8}
0.0487322179924835 {'n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)
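
The nan for n_components=1 (and the RuntimeWarning) is simply np.sqrt applied to a negative validation score.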
In [ ]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test,y_pred=y_pred)
Out[ ]:
-0.01057358710785472
In [ ]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.01057358710785472
Test year 2017 : -0.08914596959361498
Test year 2018 : -0.04897289891375012
Test year 2019 : 0.006451234792761107
Test year 2020 : -0.00048657810249452815
Test year 2021 : 0.0095202141605224
Test year 2022 : -0.0061009049213132105
Test year 2023 : 0.008028420859995777
Test year 2024 : -0.022204856045419552
CPU times: user 1min 41s, sys: 9.21 s, total: 1min 50s
Wall time: 29.1 s

Pipeline¶

In [ ]:
pca = PCA()
linear_reg = LinearRegression()
pipeline = Pipeline(steps=[('pca',pca),
                           ('linear_regression', linear_reg)])
hyperparam_grid = {'pca__n_components': range(1,len(cols)+1)}
grid_search = GridSearchCV(pipeline, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [ ]:
%%time
grid_search.fit(X=X_fulltrain,y=y_fulltrain)
CPU times: user 4.1 s, sys: 430 ms, total: 4.53 s
Wall time: 1.54 s
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('linear_regression',
                                        LinearRegression())]),
             param_grid={'pca__n_components': range(1, 10)},
             return_train_score=True, scoring=make_scorer(r2_oos))
In [ ]:
grid_search.best_params_
Out[ ]:
{'pca__n_components': 6}
In [ ]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)
nan {'pca__n_components': 1}
0.03874473370176844 {'pca__n_components': 2}
0.037587746458808476 {'pca__n_components': 3}
0.04997889197284674 {'pca__n_components': 4}
0.04997497095219836 {'pca__n_components': 5}
0.05161556457239422 {'pca__n_components': 6}
0.041827129620731866 {'pca__n_components': 7}
0.050300834943468646 {'pca__n_components': 8}
0.0487322179924835 {'pca__n_components': 9}
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)
In [ ]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
-0.01057358710785472

Elastic Net¶

In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [ ]:
model = SGDRegressor(penalty='elasticnet')
In [ ]:
hyperparam_grid = [{'alpha':[0.001, 0.01, 0.1],
                    'l1_ratio':[0.15, 0.30, 0.5, 0.7]}]
In [ ]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [ ]:
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=SGDRegressor(penalty='elasticnet'),
             param_grid=[{'alpha': [0.001, 0.01, 0.1],
                          'l1_ratio': [0.15, 0.3, 0.5, 0.7]}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [ ]:
grid_search.best_params_
Out[ ]:
{'alpha': 0.001, 'l1_ratio': 0.7}
In [ ]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
-0.0110463976016415
In [ ]:
%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))
Test year 2016 : -0.012516057005940606
Test year 2017 : -0.10558980314809396
Test year 2018 : -0.038706726553651816
Test year 2019 : 0.008269590365803325
Test year 2020 : 0.006087524412165979
Test year 2021 : 0.009994658939758372
Test year 2022 : -0.005719992513779193
Test year 2023 : 0.010637533125030907
Test year 2024 : -0.013783028576732859
CPU times: user 1min 41s, sys: 3.04 s, total: 1min 44s
Wall time: 43.1 s

Gradient Boosted Regression Trees¶

In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [ ]:
hyperparam_grid = [
    {'max_depth': [1,2,3,4,5,6], 
     'learning_rate': [0.1, 0.05, 0.01]}
]
In [ ]:
model = GradientBoostingRegressor()
In [ ]:
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
In [ ]:
%%time
grid_search.fit(X=X_fulltrain, y=y_fulltrain)
CPU times: user 9min 37s, sys: 11 s, total: 9min 48s
Wall time: 10min 36s
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=GradientBoostingRegressor(),
             param_grid=[{'learning_rate': [0.1, 0.05, 0.01],
                          'max_depth': [1, 2, 3, 4, 5, 6]}],
             return_train_score=True, scoring=make_scorer(r2_oos))
In [ ]:
grid_search.best_params_
Out[ ]:
{'learning_rate': 0.1, 'max_depth': 3}
In [ ]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(mean_score, params)
0.0010920514714541918 {'learning_rate': 0.1, 'max_depth': 1}
0.0024649624915880075 {'learning_rate': 0.1, 'max_depth': 2}
0.005179742214071581 {'learning_rate': 0.1, 'max_depth': 3}
0.0027306548772843 {'learning_rate': 0.1, 'max_depth': 4}
0.0002629712278338081 {'learning_rate': 0.1, 'max_depth': 5}
-0.0016882062465084502 {'learning_rate': 0.1, 'max_depth': 6}
0.0005808239346339894 {'learning_rate': 0.05, 'max_depth': 1}
0.0018532340083319276 {'learning_rate': 0.05, 'max_depth': 2}
0.003954208330685383 {'learning_rate': 0.05, 'max_depth': 3}
0.004945961987173897 {'learning_rate': 0.05, 'max_depth': 4}
0.003509540482510065 {'learning_rate': 0.05, 'max_depth': 5}
0.0011919994497078257 {'learning_rate': 0.05, 'max_depth': 6}
-0.0012377392747824345 {'learning_rate': 0.01, 'max_depth': 1}
-0.00038711061142104874 {'learning_rate': 0.01, 'max_depth': 2}
0.0010593345400554677 {'learning_rate': 0.01, 'max_depth': 3}
0.003114775494288402 {'learning_rate': 0.01, 'max_depth': 4}
0.003665138899025089 {'learning_rate': 0.01, 'max_depth': 5}
0.003195682240093589 {'learning_rate': 0.01, 'max_depth': 6}
In [ ]:
y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
-0.04906362476677084

Neural Nets¶

In [ ]:
tf.__version__
Out[ ]:
'2.8.0'
In [ ]:
keras.__version__
Out[ ]:
'2.8.0'
In [ ]:
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_train = X_fulltrain.values[cv_idx[0][0]]
y_train = y_fulltrain.values[cv_idx[0][0]]
X_val = X_fulltrain.values[cv_idx[0][1]]
y_val = y_fulltrain.values[cv_idx[0][1]]
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']
In [ ]:
X_train.shape
Out[ ]:
(77643, 9)
In [ ]:
X_val.shape
Out[ ]:
(116770, 9)
In [ ]:
nn_model = keras.models.Sequential()
nn_model.add(keras.layers.InputLayer(input_shape=[X_fulltrain.shape[1]]))
nn_model.add(keras.layers.Dense(8, activation='relu'))
nn_model.add(keras.layers.Dense(4, activation='relu'))
nn_model.add(keras.layers.Dense(1))
2024-05-13 10:01:17.440490: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
In [ ]:
nn_model.compile(loss='mse',optimizer='sgd')
In [ ]:
nn_model.fit(X_train, y_train, epochs=10,
             validation_data=(X_val,y_val))
Epoch 1/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0275
Epoch 2/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0277
Epoch 3/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0278
Epoch 4/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0277
Epoch 5/10
2427/2427 [==============================] - 4s 2ms/step - loss: 0.0223 - val_loss: 0.0274
Epoch 6/10
2427/2427 [==============================] - 7s 3ms/step - loss: 0.0223 - val_loss: 0.0273
Epoch 7/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276
Epoch 8/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276
Epoch 9/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276
Epoch 10/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276
Out[ ]:
<keras.callbacks.History at 0x7f7d85059d90>
In [ ]:
y_pred = nn_model.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
0.003361474358112737

GridSearchCV Neural Nets¶

In [ ]:
def build_model(learning_rate=0.003):
    nn_model = keras.models.Sequential()
    nn_model.add(keras.layers.InputLayer(input_shape=[9]))
    nn_model.add(keras.layers.Dense(8, activation='relu'))
    nn_model.add(keras.layers.Dense(4, activation='relu'))
    nn_model.add(keras.layers.Dense(1))
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate) 
    nn_model.compile(loss="mse", optimizer=optimizer)
    return nn_model
In [ ]:
# from scikeras.wrappers import KerasRegressor
# keras_reg = KerasRegressor(build_model)
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/3997882518.py:3: DeprecationWarning: KerasRegressor is deprecated, use Sci-Keras (https://github.com/adriangb/scikeras) instead. See https://www.adriangb.com/scikeras/stable/migration.html for help migrating.
  keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
In [ ]:
hyperparams_grid = {
    'learning_rate':[0.003,0.001]
}
In [ ]:
nn_search_cv = GridSearchCV(keras_reg, hyperparams_grid, cv=[cv_idx[0]])
In [ ]:
nn_search_cv.fit(X_fulltrain, y_fulltrain, epochs=10,
                 validation_data=(X_val,y_val))
Epoch 1/10
2427/2427 [==============================] - 6s 2ms/step - loss: 0.0346 - val_loss: 0.0313
Epoch 2/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0249 - val_loss: 0.0295
Epoch 3/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0235 - val_loss: 0.0282
Epoch 4/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - val_loss: 0.0279
Epoch 5/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0227 - val_loss: 0.0279
Epoch 6/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0226 - val_loss: 0.0277
Epoch 7/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0225 - val_loss: 0.0277
Epoch 8/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0224 - val_loss: 0.0275
Epoch 9/10
2427/2427 [==============================] - 7s 3ms/step - loss: 0.0224 - val_loss: 0.0278
Epoch 10/10
2427/2427 [==============================] - 6s 3ms/step - loss: 0.0224 - val_loss: 0.0275
3650/3650 [==============================] - 5s 1ms/step - loss: 0.0275
Epoch 1/10
2427/2427 [==============================] - 6s 2ms/step - loss: 0.0432 - val_loss: 0.0342
Epoch 2/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0278 - val_loss: 0.0303
Epoch 3/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0251 - val_loss: 0.0291
Epoch 4/10
2427/2427 [==============================] - 6s 2ms/step - loss: 0.0241 - val_loss: 0.0286
Epoch 5/10
2427/2427 [==============================] - 6s 3ms/step - loss: 0.0236 - val_loss: 0.0283
Epoch 6/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0233 - val_loss: 0.0281
Epoch 7/10
2427/2427 [==============================] - 6s 2ms/step - loss: 0.0232 - val_loss: 0.0280
Epoch 8/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0230 - val_loss: 0.0280
Epoch 9/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - val_loss: 0.0279
Epoch 10/10
2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - val_loss: 0.0279
3650/3650 [==============================] - 5s 1ms/step - loss: 0.0279
Epoch 1/10
6076/6076 [==============================] - 10s 2ms/step - loss: 0.0297 - val_loss: 0.0273
Epoch 2/10
6076/6076 [==============================] - 10s 2ms/step - loss: 0.0255 - val_loss: 0.0270
Epoch 3/10
6076/6076 [==============================] - 9s 1ms/step - loss: 0.0253 - val_loss: 0.0269
Epoch 4/10
6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0268
Epoch 5/10
6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0268
Epoch 6/10
6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0268
Epoch 7/10
6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0268
Epoch 8/10
6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0269
Epoch 9/10
6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0269
Epoch 10/10
6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0267
Out[ ]:
GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x7f7d851852b0>,
             param_grid={'learning_rate': [0.003, 0.001]})
In [ ]:
y_pred = nn_search_cv.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)
Out[ ]:
-0.02629758156221018

Transformation pipeline example¶

In [ ]:
df_rank
Out[ ]:
secID ret_date exret ym size rev mom beta bm illiq illiq_12m vol ivol year
0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008
1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008
2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008
3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008
4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365 2023
580478 689009.XSHG 2024-01 -0.214983 2023-12 0.691854 -0.841401 0.168831 0.515545 -0.574577 -0.515152 -0.553325 0.515545 0.622983 2024
580479 689009.XSHG 2024-02 0.296451 2024-01 0.674975 -0.066928 -0.269872 0.369774 -0.590579 -0.492836 -0.555643 -0.341315 0.285574 2024
580480 689009.XSHG 2024-03 -0.013334 2024-02 0.712774 0.947884 -0.253527 0.546042 -0.709444 -0.564655 -0.563480 -0.303292 0.719436 2024
580481 689009.XSHG 2024-04 -0.073474 2024-03 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143 2024

575818 rows × 14 columns

In [ ]:
illiq_idx = 5      # position of 'illiq' in cols
illiq_12m_idx = 6  # position of 'illiq_12m' in cols
In [ ]:
X_fulltrain
Out[ ]:
size rev mom beta bm illiq illiq_12m vol ivol
0 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000
1 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760
2 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835
3 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785
4 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784
... ... ... ... ... ... ... ... ... ...
556879 -0.911797 -0.975744 0.000000 -0.550165 -0.223815 0.953693 0.000000 0.351709 -0.339214
556880 -0.890682 -0.083639 0.000000 -0.468452 -0.264123 0.271460 0.000000 -0.356566 -0.264123
556881 -0.902493 0.521261 0.000000 -0.303519 -0.313050 0.346041 0.000000 0.332845 0.835777
556882 -0.909058 0.020902 0.000000 -0.273194 -0.325266 0.573891 0.000000 -0.416208 -0.318665
556883 -0.881145 0.871607 0.211299 -0.427733 -0.551724 0.750550 0.995598 0.090242 0.579604

194413 rows × 9 columns

In [ ]:
class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_avg_illiq=True):
        self.add_avg_illiq = add_avg_illiq
    def fit(self, X, y=None):
        return self  # stateless: nothing to learn from the data
    def transform(self, X, y=None):
        # append the average of the illiq and illiq_12m rank columns as a new feature
        avg_illiq = (X[:,illiq_idx] + X[:, illiq_12m_idx]) / 2
        return np.c_[X, avg_illiq]

feature_adder = FeatureAdder()
In [ ]:
X_fulltrain.values.shape
Out[ ]:
(194413, 9)
In [ ]:
X_fulltrain_new = feature_adder.transform(X_fulltrain.values)
In [ ]:
X_fulltrain_new
Out[ ]:
array([[ 0.9695586 , -0.85844749,  0.        , ...,  0.27549467,
         0.        , -0.48934551],
       [ 0.97153558, -0.45168539,  0.        , ..., -0.63595506,
        -0.7917603 , -0.47977528],
       [ 0.96733482, -0.70007424,  0.        , ...,  0.437268  ,
        -0.62583519, -0.47698589],
       ...,
       [-0.90249267,  0.521261  ,  0.        , ...,  0.33284457,
         0.83577713,  0.17302053],
       [-0.90905757,  0.02090209,  0.        , ..., -0.41620829,
        -0.3186652 ,  0.28694536],
       [-0.88114453,  0.87160675,  0.21129861, ...,  0.09024211,
         0.57960382,  0.8730741 ]])
In [ ]:
X_fulltrain_new.shape
Out[ ]:
(194413, 10)
In [ ]:
# This can be added to a pipeline
pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('std_scaler', StandardScaler())
])
In [ ]:
pipeline.fit_transform(X_fulltrain.values)
Out[ ]:
array([[ 1.68015418e+00, -1.48771512e+00, -7.60380513e-19, ...,
         4.77437532e-01,  4.06742109e-18, -9.01081929e-01],
       [ 1.68358010e+00, -7.82784266e-01, -7.60380513e-19, ...,
        -1.10212226e+00, -1.37679088e+00, -8.83459288e-01],
       [ 1.67630057e+00, -1.21324955e+00, -7.60380513e-19, ...,
         7.57793805e-01, -1.08826394e+00, -8.78322906e-01],
       ...,
       [-1.56393521e+00,  9.03360864e-01, -7.60380513e-19, ...,
         5.76826008e-01,  1.45333168e+00,  3.18600392e-01],
       [-1.57531157e+00,  3.62239461e-02, -7.60380513e-19, ...,
        -7.21296915e-01, -5.54126473e-01,  5.28381837e-01],
       [-1.52694089e+00,  1.51052051e+00,  3.66337572e-01, ...,
         1.56391306e-01,  1.00787227e+00,  1.60768063e+00]])