import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, HuberRegressor, SGDRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline

# import lightgbm as lgb

import tensorflow as tf
from tensorflow import keras


df = pd.read_pickle('../../data/factor_exposure/all_exposure.pkl')


df.drop('tradeDate',axis=1,inplace=True)

df


# Report the number of missing values in every column of df.
for col, n_missing in df.isna().sum().items():
    print(col, n_missing)

secID 0
ret_date 4853
ret 31888
rf 4853
exret 31888
ym 0
mktcap 23011
size 23011
rev 30586
beta 40704
bm 21512
illiq 41579
illiq_12m 91808
mom_date 3547
mom 49225
vol 30782
ivol 54678
vol_clip 30782
ivol_clip 54678


df = df[~df['ret_date'].isna()].copy()

df


df.loc[~df['mom'].isna(),'ret_date'].min()

Period('2008-01', 'M')


df = df[df['ret_date'] >= '2008-01'].copy()


# Report the number of missing values in every column of df.
for col, n_missing in df.isna().sum().items():
    print(col, n_missing)

secID 0
ret_date 0
ret 26378
rf 0
exret 26378
ym 0
mktcap 22483
size 22483
rev 29845
beta 39159
bm 20363
illiq 36007
illiq_12m 79832
mom_date 3381
mom 36211
vol 25455
ivol 37368
vol_clip 25455
ivol_clip 37368


df = df[~df['ret'].isna()].copy()

df


# Report the number of missing values in every column of df.
for col, n_missing in df.isna().sum().items():
    print(col, n_missing)

secID 0
ret_date 0
ret 0
rf 0
exret 0
ym 0
mktcap 0
size 0
rev 7328
beta 25845
bm 16422
illiq 11127
illiq_12m 62624
mom_date 3381
mom 35755
vol 2799
ivol 12482
vol_clip 2799
ivol_clip 12482


df.drop(['mom_date','mktcap','vol_clip','ivol_clip'],axis=1,inplace=True)


df.drop(['ret','rf'],axis=1,inplace=True)


df.reset_index(inplace=True,drop=True)

df


# Report the number of missing values in every column of df.
for col, n_missing in df.isna().sum().items():
    print(col, n_missing)

secID 0
ret_date 0
exret 0
ym 0
size 0
rev 7328
beta 25845
bm 16422
illiq 11127
illiq_12m 62624
mom 35755
vol 2799
ivol 12482


# Drop rows whose reversal ('rev') value is missing; the other factors are filled with the median.
df = df[~df['rev'].isna()].copy()


cols = ['mom','beta','bm','illiq','illiq_12m','vol','ivol']

df


# Fill missing factor values with the median of the same 'ret_date' group.
temp = (
    df.groupby('ret_date', as_index=False)[cols]
      .transform(lambda col_values: col_values.fillna(col_values.median()))
)


# Anything still missing after the median fill is set to 0.
temp = temp.fillna(0)


df[cols] = temp.copy()


# Report the number of missing values in every column of df.
for col, n_missing in df.isna().sum().items():
    print(col, n_missing)

secID 0
ret_date 0
exret 0
ym 0
size 0
rev 0
beta 0
bm 0
illiq 0
illiq_12m 0
mom 0
vol 0
ivol 0

df


def csrank(df):
    """Rescale the ranks of ``df`` so that they lie strictly between -1 and 1.

    Every value is replaced by ``rank * 2 / (n + 1) - 1``, where ``rank`` is
    taken from ``df.rank()`` and ``n`` is ``len(df)``.
    """
    n_obs = len(df)
    ranks = df.rank()
    return ranks * 2 / (n_obs + 1) - 1


num_X_cols = df.select_dtypes('number').columns.drop('exret').tolist()


num_X_cols

['size', 'rev', 'beta', 'bm', 'illiq', 'illiq_12m', 'mom', 'vol', 'ivol']


# Rank every numeric factor within each 'ret_date' group via csrank.
# 'ret_date' is included so it can serve as the grouping key; its column is dropped again before the merge.
temp = df[['ret_date']+num_X_cols].groupby('ret_date').apply(csrank)


temp


# Replace the raw factor columns with their ranked versions, joining the two frames on the row index.
df_rank = pd.merge(df.drop(num_X_cols, axis=1),
                   temp.drop('ret_date',axis=1),
                   left_index=True, right_index=True)


# The intermediate ranked frame is no longer needed.
del temp


df_rank


df_rank.sort_values('ret_date')


df_rank['year'] = df_rank['ret_date'].dt.year


# One index of row labels per calendar year, ordered by year.
time_idx = [
    row_labels
    for _, row_labels in sorted(df_rank.groupby('year').groups.items(),
                                key=lambda item: item[0])
]


time_idx

[Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                  8,      9,
             ...
             460852, 460853, 460854, 460855, 460856, 460857, 460858, 460859,
             460860, 460861],
            dtype='int64', length=17347),
 Int64Index([    12,     13,     14,     15,     16,     17,     18,     19,
                 20,     21,
             ...
             460864, 460865, 460866, 460867, 460868, 460869, 460870, 460871,
             460872, 460873],
            dtype='int64', length=18345),
 Int64Index([    24,     25,     26,     27,     28,     29,     31,     32,
                192,    193,
             ...
             460876, 460877, 460878, 460879, 460880, 460881, 460882, 460883,
             460884, 460885],
            dtype='int64', length=20770),
 Int64Index([    33,     34,     35,     36,     37,     38,     39,     40,
                 41,     42,
             ...
             460888, 460889, 460890, 460891, 460892, 460893, 460894, 460895,
             460896, 460897],
            dtype='int64', length=24588),
 Int64Index([    45,     46,     47,     48,     49,     50,     51,     52,
                 53,     54,
             ...
             460900, 460901, 460902, 460903, 460904, 460905, 460906, 460907,
             460908, 460909],
            dtype='int64', length=27649),
 Int64Index([    57,     58,     59,     60,     61,     62,     63,     64,
                 65,     66,
             ...
             460912, 460913, 460914, 460915, 460916, 460917, 460918, 460919,
             460920, 460921],
            dtype='int64', length=28885),
 Int64Index([    69,     70,     71,     72,     73,     74,     75,     76,
                 77,     78,
             ...
             460924, 460925, 460926, 460927, 460928, 460929, 460930, 460931,
             460932, 460933],
            dtype='int64', length=28408),
 Int64Index([    81,     82,     83,     84,     85,     86,     87,     88,
                 89,     90,
             ...
             460936, 460937, 460938, 460939, 460940, 460941, 460942, 460943,
             460944, 460945],
            dtype='int64', length=28331),
 Int64Index([    93,     94,     95,     96,     97,     98,     99,    100,
                101,    102,
             ...
             460948, 460949, 460950, 460951, 460952, 460953, 460954, 460955,
             460956, 460957],
            dtype='int64', length=31459),
 Int64Index([   105,    106,    107,    108,    109,    110,    111,    112,
                113,    114,
             ...
             460960, 460961, 460962, 460963, 460964, 460965, 460966, 460967,
             460968, 460969],
            dtype='int64', length=36050),
 Int64Index([   117,    118,    119,    120,    121,    122,    123,    124,
                125,    126,
             ...
             460972, 460973, 460974, 460975, 460976, 460977, 460978, 460979,
             460980, 460981],
            dtype='int64', length=40026),
 Int64Index([   129,    130,    131,    132,    133,    134,    135,    136,
                137,    138,
             ...
             460984, 460985, 460986, 460987, 460988, 460989, 460990, 460991,
             460992, 460993],
            dtype='int64', length=43017),
 Int64Index([   141,    142,    143,    144,    145,    146,    147,    148,
                149,    150,
             ...
             460996, 460997, 460998, 460999, 461000, 461001, 461002, 461003,
             461004, 461005],
            dtype='int64', length=45124),
 Int64Index([   153,    154,    155,    156,    157,    158,    159,    160,
                161,    162,
             ...
             461008, 461009, 461010, 461011, 461012, 461013, 461014, 461015,
             461016, 461017],
            dtype='int64', length=50192),
 Int64Index([   165,    166,    167,    329,    330,    331,    449,    450,
                451,    751,
             ...
             459999, 460408, 460409, 460410, 460564, 460565, 460566, 461018,
             461019, 461020],
            dtype='int64', length=13502)]


df_rank.groupby('year')['secID'].nunique()

year
2008    1559
2009    1627
2010    1934
2011    2231
2012    2477
2013    2530
2014    2649
2015    2863
2016    3028
2017    3471
2018    3605
2019    3739
2020    4045
2021    4509
2022    4538
Name: secID, dtype: int64


df_rank.groupby('year')['secID'].count()

year
2008    17347
2009    18345
2010    20770
2011    24588
2012    27649
2013    28885
2014    28408
2015    28331
2016    31459
2017    36050
2018    40026
2019    43017
2020    45124
2021    50192
2022    13502
Name: secID, dtype: int64


def list_flat(list_):
    """Concatenate the items of every sub-iterable in ``list_`` into one flat list."""
    flattened = []
    for sublist in list_:
        flattened.extend(sublist)
    return flattened
# This is the same as:
# def list_flat2(list_):
#     result = []
#     for sublist in list_:
#         for item in sublist:
#             result.append(item)
#     return result


list_flat([[1,2,3],[3,4,5]])

[1, 2, 3, 3, 4, 5]


df_rank


cross validation


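Example (2008): suppose the training rows carry the DataFrame index labels [0, 1, 100, 1000] and the validation rows carry the labels [5, 10, 300].
cv_idx must then hold their positions inside the concatenated train + validation array: train -> [0, 1, 2, 3], validation -> [4, 5, 6].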


# Expanding-window training / validation / testing scheme (one entry per test year):
# 1. train [2008-2011], validation [2012-2015], test [2016]
# 2. train [2008-2012], validation [2013-2016], test [2017]
# ...
# last. train [2008-2017], validation [2018-2021], test [2022]
fulltrain_idx = []  # per period: DataFrame row labels of train + validation
cv_idx = []         # per period: (train positions, validation positions) inside fulltrain_idx[k]
test_idx = []       # per period: DataFrame row labels of the test year
for i in range(4,len(time_idx)-4):
    train_idx = list_flat(time_idx[0:i])
    val_idx = list_flat(time_idx[i:i+4])
    fulltrain_idx.append(train_idx + val_idx)
    # GridSearchCV works on plain arrays and cannot use the pandas index, so the
    # split has to be given as 0-based positions inside the full-train array.
    # The full-train list is train followed by validation, hence the positions
    # are simply two consecutive ranges (no membership scan is needed).
    n_train = len(train_idx)
    cv_idx.append((np.arange(n_train),
                   np.arange(n_train, n_train + len(val_idx))))
    test_idx.append(time_idx[i+4])


df_rank.loc[fulltrain_idx[4]]


# Example
a = [0,1,4,5,3000]
np.where(np.isin(a, [0,3000,4]))[0]

array([0, 2, 4])


test_years = list(range(2016, 2023))
test_years

[2016, 2017, 2018, 2019, 2020, 2021, 2022]


def r2_oos(y_true, y_pred):
    """Out-of-sample R^2 measured against a zero forecast.

    Computes ``1 - sum((y_true - y_pred)**2) / sum(y_true**2)``. The
    denominator uses the raw (not demeaned) ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Realised values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        The score; 1 is a perfect fit, 0 matches an all-zero forecast.

    Notes
    -----
    Both inputs are flattened to 1-D NumPy arrays before the subtraction.
    Without this, a column-vector prediction of shape (n, 1) broadcasts
    against a 1-D ``y_true`` into an (n, n) matrix and produces a wrong
    score without raising. Values are compared by position, and NaNs
    propagate into the result.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)


r2_oos_scorer = make_scorer(r2_oos)


cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']


cols

['size', 'rev', 'beta', 'bm', 'illiq', 'mom', 'ivol']


model = LinearRegression()


# Walk-forward evaluation: fit on the train + validation rows of each period,
# then score the predictions for the test year that follows.
for i, (fit_rows, eval_rows) in enumerate(zip(fulltrain_idx, test_idx)):
    X_fulltrain, y_fulltrain = df_rank.loc[fit_rows, cols], df_rank.loc[fit_rows, 'exret']
    X_test, y_test = df_rank.loc[eval_rows, cols], df_rank.loc[eval_rows, 'exret']

    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)

    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.011040105913657339
Test year 2017 : -0.08027893343542303
Test year 2018 : -0.04274052247156712
Test year 2019 : 0.005845892928012408
Test year 2020 : 0.000447862723551129
Test year 2021 : 0.011070563639831277
Test year 2022 : -0.039843884519424


cols = ['size','rev','illiq','ivol']


# Walk-forward evaluation: fit on the train + validation rows of each period,
# then score the predictions for the test year that follows.
for i, (fit_rows, eval_rows) in enumerate(zip(fulltrain_idx, test_idx)):
    X_fulltrain, y_fulltrain = df_rank.loc[fit_rows, cols], df_rank.loc[fit_rows, 'exret']
    X_test, y_test = df_rank.loc[eval_rows, cols], df_rank.loc[eval_rows, 'exret']

    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)

    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.011177166966521046
Test year 2017 : -0.08141130184048362
Test year 2018 : -0.042766248969311915
Test year 2019 : 0.006650509413867134
Test year 2020 : 0.00018448663790970876
Test year 2021 : 0.01024341147435115
Test year 2022 : -0.040448065360397356


cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols

['size', 'rev', 'beta', 'bm', 'illiq', 'mom', 'ivol']


model = HuberRegressor(alpha=0.01,epsilon=1.05)


# Walk-forward evaluation: fit on the train + validation rows of each period,
# then score the predictions for the test year that follows.
for i, (fit_rows, eval_rows) in enumerate(zip(fulltrain_idx, test_idx)):
    X_fulltrain, y_fulltrain = df_rank.loc[fit_rows, cols], df_rank.loc[fit_rows, 'exret']
    X_test, y_test = df_rank.loc[eval_rows, cols], df_rank.loc[eval_rows, 'exret']

    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)

    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : 0.0023583262241427816
Test year 2017 : -0.036809073349359345
Test year 2018 : 0.0055923630272862734
Test year 2019 : -0.019272124325766526
Test year 2020 : -0.013786542186777107
Test year 2021 : -0.008908423774469876
Test year 2022 : 0.0314098472397073


cols = num_X_cols
cols

['size', 'rev', 'beta', 'bm', 'illiq', 'illiq_12m', 'mom', 'vol', 'ivol']


hyperparam_grid = [
    {'n_estimators': [20], 'max_depth': [1,3,5], 
     'max_features': [3]}
]


model = RandomForestRegressor(random_state=42)


# Cross validation for period 0, i.e.
# train: [2008-2011], val: [2012-2015], test: [2016]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)


X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0], cols]
y_test = df_rank.loc[test_idx[0], 'exret']


%%time
grid_search.fit(X_fulltrain, y_fulltrain)

CPU times: user 13.6 s, sys: 299 ms, total: 13.9 s
Wall time: 14.5 s

GridSearchCV(cv=[(array([    0,     1,     2, ..., 81047, 81048, 81049]),
                  array([ 81050,  81051,  81052, ..., 194320, 194321, 194322]))],
             estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_depth': [1, 3, 5], 'max_features': [3],
                          'n_estimators': [20]}],
             return_train_score=True, scoring=make_scorer(r2_oos))


grid_search.best_params_

{'max_depth': 5, 'max_features': 3, 'n_estimators': 20}


# Validation score of every hyper-parameter candidate.
# The search is scored with r2_oos, which can be negative, so the raw score is
# printed. A square root (as one would take for an MSE) has no meaning for an
# R^2 and turns negative scores into NaN with a RuntimeWarning.
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(mean_score, params)

nan {'max_depth': 1, 'max_features': 3, 'n_estimators': 20}
0.02645160793818769 {'max_depth': 3, 'max_features': 3, 'n_estimators': 20}
0.028981572726202026 {'max_depth': 5, 'max_features': 3, 'n_estimators': 20}

/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_83836/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)


# Feature importances of the best estimator, largest first.
# The names are taken from the frame the search was fitted on, so they always
# line up with feature_importances_ even if a different column list is in use.
pd.DataFrame({"features": list(X_fulltrain.columns),
              "feature_importance": grid_search.best_estimator_.feature_importances_}
             ).sort_values('feature_importance', ascending=False)


y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.012291646126052935


%%time
# Walk-forward evaluation with a fresh hyper-parameter search for every period.
for i, (fit_rows, eval_rows) in enumerate(zip(fulltrain_idx, test_idx)):
    X_fulltrain = df_rank.loc[fit_rows, cols]
    y_fulltrain = df_rank.loc[fit_rows, 'exret']
    X_test = df_rank.loc[eval_rows, cols]
    y_test = df_rank.loc[eval_rows, 'exret']

    # A single predefined train/validation split is used for this period.
    grid_search = GridSearchCV(
        model,
        hyperparam_grid,
        cv=[cv_idx[i]],
        scoring=r2_oos_scorer,
        return_train_score=True,
    )
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)

    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.012832337545194417
Test year 2017 : -0.0820561067657255
Test year 2018 : -0.04409182586886584
Test year 2019 : 0.007830509088117443
Test year 2020 : 0.003591662917594607
Test year 2021 : -0.015391690896670474
CPU times: user 22min 37s, sys: 4.99 s, total: 22min 42s
Wall time: 22min 50s


cols = num_X_cols
cols

['size', 'rev', 'beta', 'bm', 'illiq', 'illiq_12m', 'mom', 'vol', 'ivol']


model = PLSRegression(n_components=4)


y_pred.reshape(-1).shape

(31459,)


%%time
# Walk-forward evaluation of the PLS model, one test year per period.
for i, (fit_rows, eval_rows) in enumerate(zip(fulltrain_idx, test_idx)):
    X_fulltrain = df_rank.loc[fit_rows, cols]
    y_fulltrain = df_rank.loc[fit_rows, 'exret']
    X_test = df_rank.loc[eval_rows, cols]
    y_test = df_rank.loc[eval_rows, 'exret']

    model.fit(X_fulltrain, y_fulltrain)
    # Flatten the prediction to 1-D before it is compared with y_test.
    y_pred = model.predict(X=X_test).reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.009957481993730699
Test year 2017 : -0.08789715100343876
Test year 2018 : -0.04246315607293405
Test year 2019 : 0.005660472134352501
Test year 2020 : 0.0005198826566897852
Test year 2021 : 0.010895831760366126
Test year 2022 : -0.03793343273496119
CPU times: user 9.45 s, sys: 775 ms, total: 10.2 s
Wall time: 3.68 s


cols = num_X_cols
cols

['size', 'rev', 'beta', 'bm', 'illiq', 'illiq_12m', 'mom', 'vol', 'ivol']


X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']


pca = PCA(3, random_state=42)


pca.fit(X_fulltrain)

PCA(n_components=3, random_state=42)


pca.components_

array([[ 0.55218289, -0.00712661,  0.12947152, -0.00753788, -0.58554178,
        -0.57211516,  0.06107138,  0.05366476,  0.03793427],
       [-0.07942897,  0.29144874, -0.01245223, -0.4038553 , -0.03172494,
         0.07656302,  0.28628117,  0.55863802,  0.58702027],
       [ 0.08360049, -0.23390857, -0.62087249, -0.41365348, -0.04029952,
         0.00788101,  0.52037796, -0.29728935, -0.14437987]])


pca.components_.shape

(3, 9)


X_fulltrain.shape

(194323, 9)


pca.components_.T.shape

(9, 3)


np.matmul(X_fulltrain.values,pca.components_.T)

array([[ 1.15415548,  0.16471731,  0.43597185],
       [ 1.0193875 , -0.66278179,  0.90089929],
       [ 1.15269411, -0.13666742,  0.25103196],
       ...,
       [-1.76038395, -0.48358324, -0.17121601],
       [-1.69942192,  0.02958169, -0.01992473],
       [-1.66692222,  0.40290614, -0.05801819]])


pca.fit_transform(X_fulltrain)

array([[ 1.15415548,  0.16471731,  0.43597185],
       [ 1.0193875 , -0.66278179,  0.90089929],
       [ 1.15269411, -0.13666742,  0.25103196],
       ...,
       [-1.76038395, -0.48358324, -0.17121601],
       [-1.69942192,  0.02958169, -0.01992473],
       [-1.66692222,  0.40290614, -0.05801819]])


class PCARegressor(BaseEstimator, RegressorMixin):
    """Linear regression on a PCA projection of the features.

    Parameters
    ----------
    n_components : int, default=3
        Value passed to ``PCA(n_components=...)``.
    """

    def __init__(self, n_components=3):
        self.n_components = n_components

    def fit(self, X, y):
        """Fit the PCA on ``X`` and regress ``y`` on the projected ``X``.

        Stores the fitted PCA as ``pca_``, the projected training features as
        ``X_`` and the fitted ``LinearRegression`` as ``reg_``. Returns ``self``.
        """
        projector = PCA(n_components=self.n_components)
        projector.fit(X)
        self.pca_ = projector
        self.X_ = projector.transform(X)

        regressor = LinearRegression()
        regressor.fit(self.X_, y)
        self.reg_ = regressor
        return self

    def predict(self, X):
        """Project ``X`` with the fitted PCA and return the regression prediction.

        The latest prediction is also kept on the instance as ``pred_``.
        """
        projected = self.pca_.transform(X)
        self.pred_ = self.reg_.predict(projected)
        return self.pred_


model = PCARegressor()


model.fit(X=X_fulltrain, y=y_fulltrain)

PCARegressor()


model.X_

array([[ 1.15415548,  0.16471731,  0.43597185],
       [ 1.0193875 , -0.66278179,  0.90089929],
       [ 1.15269411, -0.13666742,  0.25103196],
       ...,
       [-1.76038395, -0.48358324, -0.17121601],
       [-1.69942192,  0.02958169, -0.01992473],
       [-1.66692222,  0.40290614, -0.05801819]])


hyperparam_grid = [
    {'n_components': range(1, len(cols)+1)}
]


grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)


grid_search.fit(X=X_fulltrain, y=y_fulltrain)

GridSearchCV(cv=[(array([    0,     1,     2, ..., 81047, 81048, 81049]),
                  array([ 81050,  81051,  81052, ..., 194320, 194321, 194322]))],
             estimator=PCARegressor(),
             param_grid=[{'n_components': range(1, 10)}],
             return_train_score=True, scoring=make_scorer(r2_oos))


grid_search.best_params_

{'n_components': 9}


# Validation score of every hyper-parameter candidate.
# The search is scored with r2_oos, which can be negative, so the raw score is
# printed. A square root (as one would take for an MSE) has no meaning for an
# R^2 and turns negative scores into NaN with a RuntimeWarning.
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(mean_score, params)

0.008866500079819884 {'n_components': 1}
nan {'n_components': 2}
nan {'n_components': 3}
0.027970921235605007 {'n_components': 4}
0.03313896020831564 {'n_components': 5}
0.03412866151593205 {'n_components': 6}
0.027344385016598305 {'n_components': 7}
0.042214846044480614 {'n_components': 8}
0.043720734317050605 {'n_components': 9}

/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_83836/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)


y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test,y_pred=y_pred)

-0.010083320861536516


%%time
# Walk-forward evaluation with a fresh hyper-parameter search for every period.
for i, (fit_rows, eval_rows) in enumerate(zip(fulltrain_idx, test_idx)):
    X_fulltrain = df_rank.loc[fit_rows, cols]
    y_fulltrain = df_rank.loc[fit_rows, 'exret']
    X_test = df_rank.loc[eval_rows, cols]
    y_test = df_rank.loc[eval_rows, 'exret']

    # A single predefined train/validation split is used for this period.
    grid_search = GridSearchCV(
        model,
        hyperparam_grid,
        cv=[cv_idx[i]],
        scoring=r2_oos_scorer,
        return_train_score=True,
    )
    grid_search.fit(X_fulltrain, y_fulltrain)
    # Flatten the prediction to 1-D before it is compared with y_test.
    y_pred = grid_search.predict(X=X_test).reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.010083320861536516
Test year 2017 : -0.0861213984121596
Test year 2018 : -0.04199121460584965
Test year 2019 : 0.006116823960447437
Test year 2020 : 0.0008257380622559429
Test year 2021 : 0.009637627897886802
Test year 2022 : -0.041629032594069804
CPU times: user 1min 45s, sys: 11 s, total: 1min 56s
Wall time: 34.5 s


# PCA followed by a linear regression, expressed as an sklearn Pipeline.
pca = PCA()
linear_reg = LinearRegression()
pipeline = Pipeline(steps=[('pca',pca),
                           ('linear_regression', linear_reg)])
# Pipeline parameters are addressed as '<step name>__<parameter>'; the grid
# runs from 1 component up to the number of feature columns in cols.
hyperparam_grid = {'pca__n_components': range(1,len(cols)+1)}
# One predefined train/validation split (cv_idx[0]), scored with r2_oos_scorer.
grid_search = GridSearchCV(pipeline, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)


X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']


%%time
grid_search.fit(X=X_fulltrain,y=y_fulltrain)

CPU times: user 6.5 s, sys: 761 ms, total: 7.26 s
Wall time: 2.44 s

GridSearchCV(cv=[(array([    0,     1,     2, ..., 81047, 81048, 81049]),
                  array([ 81050,  81051,  81052, ..., 194320, 194321, 194322]))],
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('linear_regression',
                                        LinearRegression())]),
             param_grid={'pca__n_components': range(1, 10)},
             return_train_score=True, scoring=make_scorer(r2_oos))


grid_search.best_params_

{'pca__n_components': 9}


# Validation score of every hyper-parameter candidate.
# The search is scored with r2_oos, which can be negative, so the raw score is
# printed. A square root (as one would take for an MSE) has no meaning for an
# R^2 and turns negative scores into NaN with a RuntimeWarning.
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(mean_score, params)

0.008866500079819884 {'pca__n_components': 1}
nan {'pca__n_components': 2}
nan {'pca__n_components': 3}
0.027970921235601038 {'pca__n_components': 4}
0.03313896020831564 {'pca__n_components': 5}
0.03412866151593205 {'pca__n_components': 6}
0.027344385016598305 {'pca__n_components': 7}
0.042214846044480614 {'pca__n_components': 8}
0.043720734317050605 {'pca__n_components': 9}

/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_83836/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)


y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.010083320861536516


X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']


model = SGDRegressor(penalty='elasticnet')


hyperparam_grid = [{'alpha':[0.0001, 0.001, 0.01, 0.1],
                    'l1_ratio':[0.15, 0.30, 0.5]}]


grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)


grid_search.fit(X=X_fulltrain, y=y_fulltrain)

GridSearchCV(cv=[(array([    0,     1,     2, ..., 81047, 81048, 81049]),
                  array([ 81050,  81051,  81052, ..., 194320, 194321, 194322]))],
             estimator=SGDRegressor(penalty='elasticnet'),
             param_grid=[{'alpha': [0.0001, 0.001, 0.01, 0.1],
                          'l1_ratio': [0.15, 0.3, 0.5]}],
             return_train_score=True, scoring=make_scorer(r2_oos))


grid_search.best_params_

{'alpha': 0.0001, 'l1_ratio': 0.15}


y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.01649354448289997


%%time
# Walk-forward evaluation with a fresh hyper-parameter search for every period.
for i, (fit_rows, eval_rows) in enumerate(zip(fulltrain_idx, test_idx)):
    X_fulltrain = df_rank.loc[fit_rows, cols]
    y_fulltrain = df_rank.loc[fit_rows, 'exret']
    X_test = df_rank.loc[eval_rows, cols]
    y_test = df_rank.loc[eval_rows, 'exret']

    # A single predefined train/validation split is used for this period.
    grid_search = GridSearchCV(
        model,
        hyperparam_grid,
        cv=[cv_idx[i]],
        scoring=r2_oos_scorer,
        return_train_score=True,
    )
    grid_search.fit(X_fulltrain, y_fulltrain)
    # Flatten the prediction to 1-D before it is compared with y_test.
    y_pred = grid_search.predict(X=X_test).reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.01794839855231989
Test year 2017 : -0.0889158386492046
Test year 2018 : -0.049128472181301674
Test year 2019 : 0.007448471572534254
Test year 2020 : 0.00131514690209944
Test year 2021 : 0.007089595884243738
Test year 2022 : -0.036756966095477184
CPU times: user 1min 10s, sys: 1.5 s, total: 1min 12s
Wall time: 24.5 s


X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']


hyperparam_grid = [
    {'max_depth': [1,2,3,4,5,6], 
     'learning_rate': [0.1, 0.05, 0.01]}
]


model = GradientBoostingRegressor()


grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)


%%time
grid_search.fit(X=X_fulltrain, y=y_fulltrain)

CPU times: user 9min 1s, sys: 2.38 s, total: 9min 3s
Wall time: 9min 8s

GridSearchCV(cv=[(array([    0,     1,     2, ..., 81047, 81048, 81049]),
                  array([ 81050,  81051,  81052, ..., 194320, 194321, 194322]))],
             estimator=GradientBoostingRegressor(),
             param_grid=[{'learning_rate': [0.1, 0.05, 0.01],
                          'max_depth': [1, 2, 3, 4, 5, 6]}],
             return_train_score=True, scoring=make_scorer(r2_oos))


grid_search.best_params_

{'learning_rate': 0.05, 'max_depth': 4}


cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(mean_score, params)

0.0014027564804455217 {'learning_rate': 0.1, 'max_depth': 1}
0.0021418471187519827 {'learning_rate': 0.1, 'max_depth': 2}
0.002198125424866415 {'learning_rate': 0.1, 'max_depth': 3}
0.000531469891437597 {'learning_rate': 0.1, 'max_depth': 4}
-0.0027277709656639004 {'learning_rate': 0.1, 'max_depth': 5}
-0.006314742607181678 {'learning_rate': 0.1, 'max_depth': 6}
0.0008960738128721557 {'learning_rate': 0.05, 'max_depth': 1}
0.001329288644544957 {'learning_rate': 0.05, 'max_depth': 2}
0.0032426540517009883 {'learning_rate': 0.05, 'max_depth': 3}
0.0039906633223752985 {'learning_rate': 0.05, 'max_depth': 4}
0.002606190722552637 {'learning_rate': 0.05, 'max_depth': 5}
-0.00022434254377290408 {'learning_rate': 0.05, 'max_depth': 6}
-0.0009877248534237992 {'learning_rate': 0.01, 'max_depth': 1}
-0.000191156503541956 {'learning_rate': 0.01, 'max_depth': 2}
0.0012339578747196933 {'learning_rate': 0.01, 'max_depth': 3}
0.0026258496548985377 {'learning_rate': 0.01, 'max_depth': 4}
0.003356999919468695 {'learning_rate': 0.01, 'max_depth': 5}
0.0038975361950893683 {'learning_rate': 0.01, 'max_depth': 6}


y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.017481311295932223


tf.__version__

'2.4.1'


keras.__version__

'2.4.0'


# Period 0 data for the neural network: train + validation rows, then the test rows.
X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
# cv_idx[0] holds 0-based positions (train, validation) inside the full-train
# data, so the underlying NumPy arrays (.values) are indexed, not the labels.
X_train = X_fulltrain.values[cv_idx[0][0]]
y_train = y_fulltrain.values[cv_idx[0][0]]
X_val = X_fulltrain.values[cv_idx[0][1]]
y_val = y_fulltrain.values[cv_idx[0][1]]
# Test rows are selected by label straight from df_rank.
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']


X_train.shape

(81050, 9)


X_val.shape

(113273, 9)


nn_model = keras.models.Sequential()
nn_model.add(keras.layers.InputLayer(input_shape=[X_fulltrain.shape[1]]))
nn_model.add(keras.layers.Dense(8, activation='relu'))
nn_model.add(keras.layers.Dense(4, activation='relu'))
nn_model.add(keras.layers.Dense(1))


nn_model.compile(loss='mse',optimizer='sgd')


nn_model.fit(X_train, y_train, epochs=10,
             validation_data=(X_val,y_val))

Epoch 1/10
2533/2533 [==============================] - 8s 3ms/step - loss: 0.0394 - val_loss: 0.0239
Epoch 2/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0227 - val_loss: 0.0239
Epoch 3/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0225 - val_loss: 0.0243
Epoch 4/10
2533/2533 [==============================] - 8s 3ms/step - loss: 0.0220 - val_loss: 0.0239
Epoch 5/10
2533/2533 [==============================] - 8s 3ms/step - loss: 0.0220 - val_loss: 0.0238
Epoch 6/10
2533/2533 [==============================] - 8s 3ms/step - loss: 0.0221 - val_loss: 0.0238
Epoch 7/10
2533/2533 [==============================] - 8s 3ms/step - loss: 0.0224 - val_loss: 0.0237
Epoch 8/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0220 - val_loss: 0.0242
Epoch 9/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0221 - val_loss: 0.0239
Epoch 10/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0222 - val_loss: 0.0242

<tensorflow.python.keras.callbacks.History at 0x7fd17d1a86a0>


y_pred = nn_model.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)

0.00047489305309966756


def build_model(learning_rate=0.003, n_features=9):
    """Build and compile a small feed-forward regression network.

    The network is: input of ``n_features`` values -> Dense(8, relu) ->
    Dense(4, relu) -> Dense(1) with no activation. It is compiled with
    mean-squared-error loss and a plain SGD optimizer.

    Parameters
    ----------
    learning_rate : float, default=0.003
        Step size handed to the SGD optimizer.
    n_features : int, default=9
        Width of the input layer. The default matches the nine feature
        columns used in this notebook; pass a different value when the
        feature matrix has been widened (e.g. by an extra engineered column).

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    nn_model = keras.models.Sequential()
    nn_model.add(keras.layers.InputLayer(input_shape=[n_features]))
    nn_model.add(keras.layers.Dense(8, activation='relu'))
    nn_model.add(keras.layers.Dense(4, activation='relu'))
    nn_model.add(keras.layers.Dense(1))
    # `learning_rate` is the documented keyword; `lr` is only a deprecated
    # alias that newer Keras releases no longer accept.
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    nn_model.compile(loss="mse", optimizer=optimizer)
    return nn_model


# Wrap the Keras builder so scikit-learn tooling can treat it as a regressor.
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_fn=build_model)


# Candidate values for build_model's `learning_rate` argument.
hyperparams_grid = {'learning_rate': [0.003, 0.001]}


# A single predefined (train, validation) split is used instead of k-fold CV.
nn_search_cv = GridSearchCV(
    estimator=keras_reg,
    param_grid=hyperparams_grid,
    cv=[cv_idx[0]],
)


# The extra keyword arguments are forwarded to every underlying Keras fit call.
# NOTE: with the default refit=True the best setting is trained once more on
# all of X_fulltrain, which contains the validation rows, so the val_loss
# printed for that last run is not an out-of-sample figure.
nn_search_cv.fit(
    X_fulltrain,
    y_fulltrain,
    epochs=10,
    validation_data=(X_val, y_val),
)

Epoch 1/10
2533/2533 [==============================] - 8s 3ms/step - loss: 0.0481 - val_loss: 0.0291
Epoch 2/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0258 - val_loss: 0.0260
Epoch 3/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0237 - val_loss: 0.0251
Epoch 4/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0230 - val_loss: 0.0251
Epoch 5/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0226 - val_loss: 0.0246
Epoch 6/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0223 - val_loss: 0.0246
Epoch 7/10
2533/2533 [==============================] - 8s 3ms/step - loss: 0.0226 - val_loss: 0.0244
Epoch 8/10
2533/2533 [==============================] - 8s 3ms/step - loss: 0.0226 - val_loss: 0.0243
Epoch 9/10
2533/2533 [==============================] - 8s 3ms/step - loss: 0.0223 - val_loss: 0.0244
Epoch 10/10
2533/2533 [==============================] - 8s 3ms/step - loss: 0.0222 - val_loss: 0.0242
3540/3540 [==============================] - 8s 2ms/step - loss: 0.0242
Epoch 1/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.1056 - val_loss: 0.0348
Epoch 2/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0303 - val_loss: 0.0291
Epoch 3/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0259 - val_loss: 0.0269
Epoch 4/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0245 - val_loss: 0.0258
Epoch 5/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0233 - val_loss: 0.0252
Epoch 6/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0227 - val_loss: 0.0249
Epoch 7/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0228 - val_loss: 0.0246
Epoch 8/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0222 - val_loss: 0.0244
Epoch 9/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0224 - val_loss: 0.0243
Epoch 10/10
2533/2533 [==============================] - 7s 3ms/step - loss: 0.0221 - val_loss: 0.0243
3540/3540 [==============================] - 8s 2ms/step - loss: 0.0243
Epoch 1/10
6073/6073 [==============================] - 14s 2ms/step - loss: 0.0409 - val_loss: 0.0233
Epoch 2/10
6073/6073 [==============================] - 12s 2ms/step - loss: 0.0230 - val_loss: 0.0232
Epoch 3/10
6073/6073 [==============================] - 13s 2ms/step - loss: 0.0230 - val_loss: 0.0232
Epoch 4/10
6073/6073 [==============================] - 14s 2ms/step - loss: 0.0229 - val_loss: 0.0232
Epoch 5/10
6073/6073 [==============================] - 13s 2ms/step - loss: 0.0232 - val_loss: 0.0232
Epoch 6/10
6073/6073 [==============================] - 13s 2ms/step - loss: 0.0233 - val_loss: 0.0232
Epoch 7/10
6073/6073 [==============================] - 13s 2ms/step - loss: 0.0227 - val_loss: 0.0232
Epoch 8/10
6073/6073 [==============================] - 13s 2ms/step - loss: 0.0227 - val_loss: 0.0233
Epoch 9/10
6073/6073 [==============================] - 14s 2ms/step - loss: 0.0229 - val_loss: 0.0232
Epoch 10/10
6073/6073 [==============================] - 13s 2ms/step - loss: 0.0228 - val_loss: 0.0231

GridSearchCV(cv=[(array([    0,     1,     2, ..., 81047, 81048, 81049]),
                  array([ 81050,  81051,  81052, ..., 194320, 194321, 194322]))],
             estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x7fd17d1ae880>,
             param_grid={'learning_rate': [0.003, 0.001]})


# Predict with the refitted best estimator, force a 1-D vector, then score
# against the hold-out target.
y_pred = nn_search_cv.predict(X_test).ravel()
r2_oos(y_true=y_test, y_pred=y_pred)

-0.025505656518980402


df_rank


X_fulltrain.columns.tolist()

['size', 'rev', 'beta', 'bm', 'illiq', 'illiq_12m', 'mom', 'vol', 'ivol']


X_fulltrain.columns.tolist().index('illiq')

4


X_fulltrain.columns.tolist().index('illiq_12m')

5


# Column positions of the two illiquidity features in the feature matrix.
# They are looked up by name instead of being hard-coded so they stay correct
# if the feature list or its order changes (with the current columns they
# evaluate to 4 and 5).
illiq_idx = X_fulltrain.columns.tolist().index('illiq')
illiq_12m_idx = X_fulltrain.columns.tolist().index('illiq_12m')


class FeatureAdder(BaseEstimator, TransformerMixin):
    """Optionally append the average of the two illiquidity columns.

    Parameters
    ----------
    add_avg_illiq : bool, default=True
        When true, ``transform`` appends one extra column holding the
        element-wise mean of the columns at the module-level positions
        ``illiq_idx`` and ``illiq_12m_idx``. When false, the input is passed
        through without the extra column, so the feature can be switched
        on and off as a hyperparameter.
    """

    def __init__(self, add_avg_illiq=True):
        self.add_avg_illiq = add_avg_illiq

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as an array, with the averaged column appended if enabled.

        ``X`` is coerced with ``np.asarray`` so both NumPy arrays and
        DataFrames can be sliced positionally. ``y`` is accepted for API
        compatibility and ignored.
        """
        X = np.asarray(X)
        # Honour the constructor flag; previously it was stored but never read.
        if not self.add_avg_illiq:
            return X
        avg_illiq = (X[:, illiq_idx] + X[:, illiq_12m_idx]) / 2
        return np.c_[X, avg_illiq]

feature_adder = FeatureAdder()


X_fulltrain.values.shape

(194323, 9)


# Pass the underlying ndarray: transform slices columns positionally (X[:, idx]).
X_fulltrain_new = feature_adder.transform(X_fulltrain.values)


X_fulltrain_new

array([[ 0.97069597, -0.85347985,  0.12380952, ...,  0.32161172,
         0.        , -0.48974359],
       [ 0.9724438 , -0.41261784, -0.21972444, ..., -0.60116026,
        -0.71863669, -0.48042059],
       [ 0.96848138, -0.66475645,  0.37535817, ...,  0.41547278,
        -0.61604585, -0.47922636],
       ...,
       [-0.98794143,  0.42204996,  0.        , ..., -0.48578811,
        -0.60292851,  0.98535745],
       [-0.98826488,  0.30008382,  0.        , ..., -0.22212909,
        -0.16680637,  0.98658843],
       [-0.98773006,  0.68179959,  0.        , ..., -0.2196319 ,
         0.18200409,  0.97832311]])


X_fulltrain_new.shape

(194323, 10)


# The custom transformer can be chained with other steps: first append the
# averaged-illiquidity column, then standardise every column.
pipeline = Pipeline(
    steps=[
        ('feature_adder', FeatureAdder()),
        ('std_scaler', StandardScaler()),
    ]
)


# Fit both steps on the full-training features and show the transformed array.
pipeline.fit_transform(X_fulltrain.values)

array([[ 1.68212553e+00, -1.47900112e+00,  2.14580503e-01, ...,
         5.57323099e-01,  5.08734687e-19, -8.93034985e-01],
       [ 1.68515435e+00, -7.15028295e-01, -3.80815457e-01, ...,
        -1.04175463e+00, -1.24981219e+00, -8.76034741e-01],
       [ 1.67828784e+00, -1.15196102e+00,  6.50552087e-01, ...,
         7.19975553e-01, -1.07139200e+00, -8.73857086e-01],
       ...,
       [-1.71201030e+00,  7.31373277e-01, -4.05585273e-18, ...,
        -8.41825465e-01, -1.04857907e+00,  1.79677426e+00],
       [-1.71257081e+00,  5.20017322e-01, -4.05585273e-18, ...,
        -3.84928977e-01, -2.90100180e-01,  1.79901892e+00],
       [-1.71164402e+00,  1.18149521e+00, -4.05585273e-18, ...,
        -3.80601589e-01,  3.16531192e-01,  1.78394732e+00]])

	secID	ret_date	ret	rf	exret	ym	mktcap	size	rev	beta	bm	illiq	illiq_12m	mom_date	mom	vol	ivol	vol_clip	ivol_clip
0	000001.XSHE	2007-07	0.316497	0.002481	0.314016	2007-06	4.266117e+10	24.476555	NaN	0.4614	0.123739	NaN	NaN	NaT	NaN	NaN	NaN	NaN	NaN
1	000001.XSHE	2007-08	0.048855	0.002404	0.046451	2007-07	5.616330e+10	24.751529	0.314016	0.6423	0.093992	0.000040	NaN	2007-06	NaN	0.042521	NaN	0.042521	NaN
2	000001.XSHE	2007-09	0.052105	0.002621	0.049484	2007-08	5.890714e+10	24.799228	0.046451	0.7722	0.097085	0.000020	NaN	2007-07	NaN	0.033926	NaN	0.033926	NaN
3	000001.XSHE	2007-10	0.201851	0.003095	0.198756	2007-09	6.197651e+10	24.850021	0.049484	0.7596	0.092276	0.000025	NaN	2007-08	NaN	0.023872	NaN	0.023872	NaN
4	000001.XSHE	2007-11	-0.249116	0.003780	-0.252896	2007-10	7.448652e+10	25.033884	0.198756	0.7988	0.083411	0.000030	NaN	2007-09	NaN	0.035921	NaN	0.035921	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
504875	900957.XSHG	2021-12	0.035831	0.002026	0.033805	2021-11	1.120560e+08	18.534509	-0.042588	NaN	NaN	0.070056	0.062884	2021-10	0.216730	0.009639	0.007046	0.009639	0.007046
504876	900957.XSHG	2022-01	-0.022013	0.002014	-0.024027	2021-12	1.161040e+08	18.569997	0.033805	NaN	NaN	0.078037	0.059672	2021-11	0.211045	0.010961	0.008692	0.010961	0.008692
504877	900957.XSHG	2022-02	-0.011254	0.001921	-0.013175	2022-01	1.135280e+08	18.547560	-0.024027	NaN	NaN	0.044515	0.058502	2021-12	-0.059172	0.010559	0.008409	0.010559	0.008409
504878	900957.XSHG	2022-03	-0.034146	0.001919	-0.036066	2022-02	1.122400e+08	18.536150	-0.013175	NaN	NaN	0.057218	0.060208	2022-01	-0.157182	0.006517	0.004195	0.006517	0.004195
504879	900957.XSHG	NaT	NaN	NaN	NaN	2022-03	1.083760e+08	18.501117	-0.036066	NaN	NaN	NaN	0.062442	2022-02	-0.117647	NaN	NaN	NaN	NaN

	secID	ret_date	ret	rf	exret	ym	mktcap	size	rev	beta	bm	illiq	illiq_12m	mom_date	mom	vol	ivol	vol_clip	ivol_clip
0	000001.XSHE	2007-07	0.316497	0.002481	0.314016	2007-06	4.266117e+10	24.476555	NaN	0.4614	0.123739	NaN	NaN	NaT	NaN	NaN	NaN	NaN	NaN
1	000001.XSHE	2007-08	0.048855	0.002404	0.046451	2007-07	5.616330e+10	24.751529	0.314016	0.6423	0.093992	0.000040	NaN	2007-06	NaN	0.042521	NaN	0.042521	NaN
2	000001.XSHE	2007-09	0.052105	0.002621	0.049484	2007-08	5.890714e+10	24.799228	0.046451	0.7722	0.097085	0.000020	NaN	2007-07	NaN	0.033926	NaN	0.033926	NaN
3	000001.XSHE	2007-10	0.201851	0.003095	0.198756	2007-09	6.197651e+10	24.850021	0.049484	0.7596	0.092276	0.000025	NaN	2007-08	NaN	0.023872	NaN	0.023872	NaN
4	000001.XSHE	2007-11	-0.249116	0.003780	-0.252896	2007-10	7.448652e+10	25.033884	0.198756	0.7988	0.083411	0.000030	NaN	2007-09	NaN	0.035921	NaN	0.035921	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
504874	900957.XSHG	2021-11	-0.040625	0.001963	-0.042588	2021-10	1.168400e+08	18.576316	-0.042478	NaN	NaN	0.058457	0.067646	2021-09	0.285164	0.011663	0.007700	0.011663	0.007700
504875	900957.XSHG	2021-12	0.035831	0.002026	0.033805	2021-11	1.120560e+08	18.534509	-0.042588	NaN	NaN	0.070056	0.062884	2021-10	0.216730	0.009639	0.007046	0.009639	0.007046
504876	900957.XSHG	2022-01	-0.022013	0.002014	-0.024027	2021-12	1.161040e+08	18.569997	0.033805	NaN	NaN	0.078037	0.059672	2021-11	0.211045	0.010961	0.008692	0.010961	0.008692
504877	900957.XSHG	2022-02	-0.011254	0.001921	-0.013175	2022-01	1.135280e+08	18.547560	-0.024027	NaN	NaN	0.044515	0.058502	2021-12	-0.059172	0.010559	0.008409	0.010559	0.008409
504878	900957.XSHG	2022-03	-0.034146	0.001919	-0.036066	2022-02	1.122400e+08	18.536150	-0.013175	NaN	NaN	0.057218	0.060208	2022-01	-0.157182	0.006517	0.004195	0.006517	0.004195

	secID	ret_date	ret	rf	exret	ym	mktcap	size	rev	beta	bm	illiq	illiq_12m	mom_date	mom	vol	ivol	vol_clip	ivol_clip
6	000001.XSHE	2008-01	-0.137306	0.002949	-0.140255	2007-12	6.574629e+10	24.909069	0.066834	0.9468	0.094476	0.000025	NaN	2007-11	NaN	0.027254	NaN	0.027254	NaN
7	000001.XSHE	2008-02	-0.004504	0.002946	-0.007450	2008-01	5.850212e+10	24.792329	-0.140255	0.9654	0.109513	0.000039	NaN	2007-12	NaN	0.037722	0.013266	0.037722	0.013266
8	000001.XSHE	2008-03	-0.149321	0.002746	-0.152068	2008-02	5.823860e+10	24.787814	-0.007450	1.0292	0.110009	0.000064	NaN	2008-01	NaN	0.041448	0.009474	0.041448	0.009474
9	000001.XSHE	2008-04	0.050355	0.002862	0.047493	2008-03	4.954234e+10	24.626093	-0.152068	1.0238	0.201102	0.000043	NaN	2008-02	NaN	0.045109	0.021746	0.045109	0.021746
10	000001.XSHE	2008-05	-0.148211	0.002953	-0.151164	2008-04	5.203702e+10	24.675221	0.047493	1.0212	0.206701	0.000051	0.000038	2008-03	NaN	0.046323	0.014474	0.046323	0.014474
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
504874	900957.XSHG	2021-11	-0.040625	0.001963	-0.042588	2021-10	1.168400e+08	18.576316	-0.042478	NaN	NaN	0.058457	0.067646	2021-09	0.285164	0.011663	0.007700	0.011663	0.007700
504875	900957.XSHG	2021-12	0.035831	0.002026	0.033805	2021-11	1.120560e+08	18.534509	-0.042588	NaN	NaN	0.070056	0.062884	2021-10	0.216730	0.009639	0.007046	0.009639	0.007046
504876	900957.XSHG	2022-01	-0.022013	0.002014	-0.024027	2021-12	1.161040e+08	18.569997	0.033805	NaN	NaN	0.078037	0.059672	2021-11	0.211045	0.010961	0.008692	0.010961	0.008692
504877	900957.XSHG	2022-02	-0.011254	0.001921	-0.013175	2022-01	1.135280e+08	18.547560	-0.024027	NaN	NaN	0.044515	0.058502	2021-12	-0.059172	0.010559	0.008409	0.010559	0.008409
504878	900957.XSHG	2022-03	-0.034146	0.001919	-0.036066	2022-02	1.122400e+08	18.536150	-0.013175	NaN	NaN	0.057218	0.060208	2022-01	-0.157182	0.006517	0.004195	0.006517	0.004195

	secID	ret_date	exret	ym	size	rev	beta	bm	illiq	illiq_12m	mom	vol	ivol
0	000001.XSHE	2008-01	-0.140255	2007-12	24.909069	0.066834	0.9468	0.094476	0.000025	NaN	NaN	0.027254	NaN
1	000001.XSHE	2008-02	-0.007450	2008-01	24.792329	-0.140255	0.9654	0.109513	0.000039	NaN	NaN	0.037722	0.013266
2	000001.XSHE	2008-03	-0.152068	2008-02	24.787814	-0.007450	1.0292	0.110009	0.000064	NaN	NaN	0.041448	0.009474
3	000001.XSHE	2008-04	0.047493	2008-03	24.626093	-0.152068	1.0238	0.201102	0.000043	NaN	NaN	0.045109	0.021746
4	000001.XSHE	2008-05	-0.151164	2008-04	24.675221	0.047493	1.0212	0.206701	0.000051	0.000038	NaN	0.046323	0.014474
...	...	...	...	...	...	...	...	...	...	...	...	...	...
461016	900957.XSHG	2021-11	-0.042588	2021-10	18.576316	-0.042478	NaN	NaN	0.058457	0.067646	0.285164	0.011663	0.007700
461017	900957.XSHG	2021-12	0.033805	2021-11	18.534509	-0.042588	NaN	NaN	0.070056	0.062884	0.216730	0.009639	0.007046
461018	900957.XSHG	2022-01	-0.024027	2021-12	18.569997	0.033805	NaN	NaN	0.078037	0.059672	0.211045	0.010961	0.008692
461019	900957.XSHG	2022-02	-0.013175	2022-01	18.547560	-0.024027	NaN	NaN	0.044515	0.058502	-0.059172	0.010559	0.008409
461020	900957.XSHG	2022-03	-0.036066	2022-02	18.536150	-0.013175	NaN	NaN	0.057218	0.060208	-0.157182	0.006517	0.004195

	secID	ret_date	exret	ym	size	rev	beta	bm	illiq	illiq_12m	mom	vol	ivol
0	000001.XSHE	2008-01	-0.140255	2007-12	24.909069	0.066834	0.9468	0.094476	0.000025	NaN	NaN	0.027254	NaN
1	000001.XSHE	2008-02	-0.007450	2008-01	24.792329	-0.140255	0.9654	0.109513	0.000039	NaN	NaN	0.037722	0.013266
2	000001.XSHE	2008-03	-0.152068	2008-02	24.787814	-0.007450	1.0292	0.110009	0.000064	NaN	NaN	0.041448	0.009474
3	000001.XSHE	2008-04	0.047493	2008-03	24.626093	-0.152068	1.0238	0.201102	0.000043	NaN	NaN	0.045109	0.021746
4	000001.XSHE	2008-05	-0.151164	2008-04	24.675221	0.047493	1.0212	0.206701	0.000051	0.000038	NaN	0.046323	0.014474
...	...	...	...	...	...	...	...	...	...	...	...	...	...
461016	900957.XSHG	2021-11	-0.042588	2021-10	18.576316	-0.042478	NaN	NaN	0.058457	0.067646	0.285164	0.011663	0.007700
461017	900957.XSHG	2021-12	0.033805	2021-11	18.534509	-0.042588	NaN	NaN	0.070056	0.062884	0.216730	0.009639	0.007046
461018	900957.XSHG	2022-01	-0.024027	2021-12	18.569997	0.033805	NaN	NaN	0.078037	0.059672	0.211045	0.010961	0.008692
461019	900957.XSHG	2022-02	-0.013175	2022-01	18.547560	-0.024027	NaN	NaN	0.044515	0.058502	-0.059172	0.010559	0.008409
461020	900957.XSHG	2022-03	-0.036066	2022-02	18.536150	-0.013175	NaN	NaN	0.057218	0.060208	-0.157182	0.006517	0.004195

Data¶

NA 值处理¶

Use rank instead of numerical values¶

Train, Validation, Test split¶

Evaluation metrics¶

Models¶

Linear regression¶

Huber regressor¶

Random Forest¶

Partial Least Squares¶

Principal Component Regression¶

PCA transform¶

PCA regression¶

Pipeline¶

Elastic Net¶

Gradient Boosted Regression Trees¶

Neural Nets¶

GridSearchCV Neural Nets¶

Transformation pipeline example¶

	ret_date	size	rev	beta	bm	illiq	illiq_12m	mom	vol	ivol
0	0.0	0.970696	-0.853480	0.123810	-0.696703	-0.979487	0.000000	0.000000	0.321612	0.000000
1	0.0	0.972444	-0.412618	-0.219724	-0.641769	-0.960841	0.000000	0.000000	-0.601160	-0.718637
2	0.0	0.968481	-0.664756	0.375358	-0.588825	-0.958453	0.000000	0.000000	0.415473	-0.616046
3	0.0	0.969936	0.483178	-0.079456	-0.176807	-0.975662	0.000000	0.000000	0.218325	0.400143
4	0.0	0.965567	0.522238	-0.301291	-0.318508	-0.971306	-0.977044	0.000000	-0.707317	-0.519369
...	...	...	...	...	...	...	...	...	...	...
461016	0.0	-0.990901	-0.011374	-0.000227	0.000000	0.989536	0.989991	0.556415	-0.897179	-0.702457
461017	0.0	-0.990967	-0.783198	0.000226	0.000000	0.988708	0.989612	0.585818	-0.900181	-0.775068
461018	0.0	-0.991033	0.121722	0.000000	0.000000	0.993275	0.989240	0.392961	-0.862811	-0.724277
461019	0.0	-0.991125	0.656091	0.000000	0.000000	0.988462	0.989350	-0.645440	-0.958731	-0.672953
461020	0.0	-0.991184	-0.546396	0.000000	0.000000	0.988980	0.990302	-0.648667	-0.990743	-0.902138

	features	feature_importance
1	rev	0.217390
4	illiq	0.189291
0	size	0.161213
5	illiq_12m	0.140252
8	ivol	0.082429
7	vol	0.079622
6	mom	0.045475
2	beta	0.043145
3	bm	0.041184

	secID	ret_date	exret	ym	size	rev	beta	bm	illiq	illiq_12m	mom	vol	ivol
0	000001.XSHE	2008-01	-0.140255	2007-12	0.970696	-0.853480	0.123810	-0.696703	-0.979487	0.000000	0.000000	0.321612	0.000000
408212	601808.XSHG	2008-01	-0.131953	2007-12	0.872527	-0.838828	0.000000	-0.598535	-0.919414	0.000000	0.000000	0.623443	0.000000
316829	600377.XSHG	2008-01	-0.135120	2007-12	0.302564	-0.484249	-0.454945	0.752381	-0.311355	-0.570696	-0.372894	0.135531	0.000000
64314	000959.XSHE	2008-01	-0.122610	2007-12	0.765568	0.208791	0.657143	0.786081	-0.739194	-0.854945	-0.238095	0.126740	0.000000
8011	000070.XSHE	2008-01	0.015180	2007-12	-0.452015	0.028571	-0.768498	0.516484	0.815385	0.894505	0.229304	0.265934	0.000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...
375762	600831.XSHG	2022-03	0.057854	2022-02	-0.010800	0.136434	0.509808	0.813974	-0.038572	0.194622	-0.159797	0.881419	0.798104
24688	000576.XSHE	2022-03	0.001405	2022-02	0.100727	0.000661	-0.745867	-0.113511	-0.176108	-0.410183	0.349791	-0.778268	-0.056645
24569	000573.XSHE	2022-03	-0.028663	2022-02	-0.462200	0.071633	-0.938726	0.701565	-0.346264	0.639850	0.622658	0.865991	0.682169
25039	000582.XSHE	2022-03	-0.059102	2022-02	0.460877	-0.300419	-0.486886	0.803394	0.105136	0.099846	-0.689663	-0.427375	-0.777827
461020	900957.XSHG	2022-03	-0.036066	2022-02	-0.991184	-0.546396	0.000000	0.000000	0.988980	0.990302	-0.648667	-0.990743	-0.902138