import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16.0, 9.0)

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, HuberRegressor, SGDRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline

# import lightgbm as lgb

import tensorflow as tf
from tensorflow import keras


df = pd.read_pickle('../../data/factor_exposure/all_exposure_2023.pkl')

df


df.drop('tradeDate',axis=1,inplace=True)

df


for col in df.columns:
    print(col, df[col].isna().sum())

secID 0
ret_date 5068
ret 19730
rf 5068
exret 19730
ym 0
mktcap 14043
size 14043
rev 18541
mom_date 3879
mom 53422
beta 28578
bm 4198
illiq 32683
illiq_12m 96646
vol 21868
ivol 36221
vol_clip 21868
ivol_clip 36221


df = df[~df['ret_date'].isna()].copy()

df


df.loc[~df['mom'].isna(),'ret_date'].min()

Period('2008-01', 'M')


df = df[df['ret_date'] >= '2008-01'].copy()


for col in df.columns:
    print(col, df[col].isna().sum())

secID 0
ret_date 0
ret 14262
rf 0
exret 14262
ym 0
mktcap 13693
size 13693
rev 17990
mom_date 3693
mom 40190
beta 23245
bm 4024
illiq 27119
illiq_12m 84325
vol 16457
ivol 18316
vol_clip 16457
ivol_clip 18316


df = df[~df['ret'].isna()].copy()

df


for col in df.columns:
    print(col, df[col].isna().sum())

secID 0
ret_date 0
ret 0
rf 0
exret 0
ym 0
mktcap 0
size 0
rev 4297
mom_date 3693
mom 40162
beta 21372
bm 1004
illiq 13186
illiq_12m 71577
vol 2716
ivol 4413
vol_clip 2716
ivol_clip 4413


df.drop(['mom_date','mktcap','vol_clip','ivol_clip'],axis=1,inplace=True)


df.drop(['ret','rf'],axis=1,inplace=True)


df.reset_index(inplace=True,drop=True)

df


for col in df.columns:
    print(col, df[col].isna().sum())

secID 0
ret_date 0
exret 0
ym 0
size 0
rev 4297
mom 40162
beta 21372
bm 1004
illiq 13186
illiq_12m 71577
vol 2716
ivol 4413


# Reversal 的空值丢掉，其他的用 median 填充
df = df[~df['rev'].isna()].copy()


cols = ['mom','beta','bm','illiq','illiq_12m','vol','ivol']

df


temp = df.groupby('ret_date',as_index=False)[cols].transform(lambda x: x.fillna(x.median()))


temp.fillna(0, inplace=True)


df[cols] = temp.copy()


for col in df.columns:
    print(col, df[col].isna().sum())

secID 0
ret_date 0
exret 0
ym 0
size 0
rev 0
mom 0
beta 0
bm 0
illiq 0
illiq_12m 0
vol 0
ivol 0

df


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510662 entries, 0 to 514958
Data columns (total 13 columns):
 #   Column     Non-Null Count   Dtype    
---  ------     --------------   -----    
 0   secID      510662 non-null  object   
 1   ret_date   510662 non-null  period[M]
 2   exret      510662 non-null  float64  
 3   ym         510662 non-null  period[M]
 4   size       510662 non-null  float64  
 5   rev        510662 non-null  float64  
 6   mom        510662 non-null  float64  
 7   beta       510662 non-null  float64  
 8   bm         510662 non-null  float64  
 9   illiq      510662 non-null  float64  
 10  illiq_12m  510662 non-null  float64  
 11  vol        510662 non-null  float64  
 12  ivol       510662 non-null  float64  
dtypes: float64(10), object(1), period[M](2)
memory usage: 54.5+ MB


def csrank(df):
    return df.rank() * 2 / (len(df) + 1) - 1


num_X_cols = df.select_dtypes('number').columns.drop('exret').tolist()


num_X_cols

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']


df[['ret_date']+num_X_cols]


df[['ret_date','size']].groupby('ret_date',group_keys=True).apply(csrank)


temp = df[['ret_date']+num_X_cols].groupby('ret_date',group_keys=True).apply(csrank)


temp


temp.drop('ret_date',axis=1).reset_index()


temp = temp.drop('ret_date',axis=1).reset_index().set_index('level_1')
temp


df_rank = pd.merge(df.drop(num_X_cols, axis=1),
                   temp.drop('ret_date',axis=1),
                   left_index=True, right_index=True)


del temp


df_rank


df_rank['year'] = df_rank['ret_date'].dt.year


time_idx = [value for (key, value) in sorted(df_rank.groupby('year').groups.items())]


# sorted(df_rank.groupby('year').groups.items())


time_idx

[Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                  8,      9,
             ...
             461853, 461854, 461855, 461856, 461857, 461858, 461859, 461860,
             461861, 461862],
            dtype='int64', length=16621),
 Int64Index([    12,     13,     14,     15,     16,     17,     18,     19,
                 20,     21,
             ...
             461865, 461866, 461867, 461868, 461869, 461870, 461871, 461872,
             461873, 461874],
            dtype='int64', length=17335),
 Int64Index([    24,     25,     26,     27,     28,     29,     30,     31,
                 32,     33,
             ...
             461877, 461878, 461879, 461880, 461881, 461882, 461883, 461884,
             461885, 461886],
            dtype='int64', length=19823),
 Int64Index([    36,     37,     38,     39,     40,     41,     42,     43,
                 44,     45,
             ...
             461889, 461890, 461891, 461892, 461893, 461894, 461895, 461896,
             461897, 461898],
            dtype='int64', length=23864),
 Int64Index([    48,     49,     50,     51,     52,     53,     54,     55,
                 56,     57,
             ...
             478263, 478264, 478265, 479640, 479641, 479642, 490582, 490583,
             490584, 499706],
            dtype='int64', length=26912),
 Int64Index([    60,     61,     62,     63,     64,     65,     66,     67,
                 68,     69,
             ...
             499709, 499710, 499711, 499712, 499713, 499714, 499715, 499716,
             499717, 499718],
            dtype='int64', length=28592),
 Int64Index([    72,     73,     74,     75,     76,     77,     78,     79,
                 80,     81,
             ...
             499721, 499722, 499723, 499724, 499725, 499726, 499727, 499728,
             499729, 499730],
            dtype='int64', length=29539),
 Int64Index([    84,     85,     86,     87,     88,     89,     90,     91,
                 92,     93,
             ...
             500009, 500010, 500011, 500012, 500013, 500014, 500015, 500016,
             500017, 500018],
            dtype='int64', length=31727),
 Int64Index([    96,     97,     98,     99,    100,    101,    102,    103,
                104,    105,
             ...
             500108, 500109, 500110, 500111, 500112, 500113, 500114, 500115,
             500116, 500117],
            dtype='int64', length=33468),
 Int64Index([   108,    109,    110,    111,    112,    113,    114,    115,
                116,    117,
             ...
             500120, 500121, 500122, 500123, 500124, 500125, 500126, 500127,
             500128, 500129],
            dtype='int64', length=37665),
 Int64Index([   120,    121,    122,    123,    124,    125,    126,    127,
                128,    129,
             ...
             500132, 500133, 500134, 500135, 500136, 500137, 500138, 500139,
             500140, 500141],
            dtype='int64', length=41103),
 Int64Index([   132,    133,    134,    135,    136,    137,    138,    139,
                140,    141,
             ...
             510143, 510144, 510145, 510667, 510729, 510770, 511041, 511042,
             511043, 511044],
            dtype='int64', length=41992),
 Int64Index([   144,    145,    146,    147,    148,    149,    150,    151,
                152,    153,
             ...
             513137, 513138, 513139, 513140, 514745, 514899, 514900, 514901,
             514902, 514931],
            dtype='int64', length=44134),
 Int64Index([   156,    157,    158,    159,    160,    161,    162,    163,
                164,    165,
             ...
             514934, 514935, 514936, 514937, 514938, 514939, 514940, 514941,
             514942, 514943],
            dtype='int64', length=49181),
 Int64Index([   168,    169,    170,    171,    172,    173,    174,    175,
                176,    177,
             ...
             514946, 514947, 514948, 514949, 514950, 514951, 514952, 514953,
             514954, 514955],
            dtype='int64', length=54418),
 Int64Index([   180,    181,    182,    363,    364,    365,    831,    832,
                833,    946,
             ...
             514871, 514895, 514896, 514897, 514927, 514928, 514929, 514956,
             514957, 514958],
            dtype='int64', length=14288)]


df_rank.groupby('year')['secID'].nunique()

year
2008    1463
2009    1530
2010    1841
2011    2142
2012    2383
2013    2432
2014    2549
2015    2772
2016    2941
2017    3392
2018    3522
2019    3648
2020    3961
2021    4422
2022    4770
2023    4777
Name: secID, dtype: int64


df_rank.groupby('year')['secID'].count()

year
2008    16621
2009    17335
2010    19823
2011    23864
2012    26912
2013    28592
2014    29539
2015    31727
2016    33468
2017    37665
2018    41103
2019    41992
2020    44134
2021    49181
2022    54418
2023    14288
Name: secID, dtype: int64


def list_flat(list_):
    return [item for sublist in list_ for item in sublist]
# This is the same as:
# def list_flat2(list_):
#     result = []
#     for sublist in list_:
#         for item in sublist:
#             result.append(item)
#     return result


list_flat([[1,2,3],[3,4,5]])

[1, 2, 3, 3, 4, 5]


np.array([[1,2,3],[3,4,5]]).flatten()

array([1, 2, 3, 3, 4, 5])


df_rank


# training, validation, testing scheme:
# 1. [2008-2011], [2012-2015], [2016]
# 2. [2008-2012], [2013-2016], [2017]
# ...
# last. [2008-2018], [2019-2022], [2023]
fulltrain_idx = []
cv_idx = []
test_idx = []
for i in range(4,len(time_idx)-4):
    train_idx = list_flat(time_idx[0:i])
    val_idx = list_flat(time_idx[i:i+4])
    fulltrain_idx.append(train_idx + val_idx)
    cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0], 
                   np.where(np.isin(fulltrain_idx[-1], val_idx))[0])) # GridSearchCV 内部用 array 操作，不能带着pandas的index，
                                                                          # 因此cv_idx需要用fulltrain_idx的编号从0开始
    test_idx.append(time_idx[i+4])


df_rank.loc[fulltrain_idx[-1]]


# Example
a = [0,1,4,5,3000]
np.where(np.isin(a, [0,3000,4]))[0]

array([0, 2, 4])


test_years = list(range(2016, 2024))
test_years

[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]


def r2_oos(y_true, y_pred):
    return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)


r2_oos_scorer = make_scorer(r2_oos)


df_rank


X_fulltrain.columns.tolist()

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']


X_fulltrain.columns.tolist().index('illiq')

5


X_fulltrain.columns.tolist().index('illiq_12m')

6


illiq_idx = 4
illiq_12m_idx = 5


class FeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_avg_illiq=True):
        self.add_avg_illiq = add_avg_illiq
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        avg_illiq = (X[:,illiq_idx] + X[:, illiq_12m_idx]) / 2
        return np.c_[X, avg_illiq]

feature_adder = FeatureAdder()


X_fulltrain.values.shape

(194413, 9)


X_fulltrain_new = feature_adder.transform(X_fulltrain.values)


X_fulltrain_new

array([[ 0.9695586 , -0.85844749,  0.        , ...,  0.27549467,
         0.        , -0.82572298],
       [ 0.97153558, -0.45168539,  0.        , ..., -0.63595506,
        -0.7917603 , -0.78651685],
       [ 0.96733482, -0.70007424,  0.        , ...,  0.437268  ,
        -0.62583519, -0.75575353],
       ...,
       [-0.90249267,  0.521261  ,  0.        , ...,  0.33284457,
         0.83577713,  0.0164956 ],
       [-0.90905757,  0.02090209,  0.        , ..., -0.41620829,
        -0.3186652 ,  0.12431243],
       [-0.88114453,  0.87160675,  0.21129861, ...,  0.09024211,
         0.57446809,  0.09941306]])


X_fulltrain_new.shape

(194413, 10)


# This can be added to a pipeline
pipeline = Pipeline([
    ('feature_adder', FeatureAdder()),
    ('std_scaler', StandardScaler())
])


pipeline.fit_transform(X_fulltrain.values)

array([[ 1.68015418e+00, -1.48771512e+00, -7.60380513e-19, ...,
         4.77437532e-01, -7.62641454e-19, -1.98928732e+00],
       [ 1.68358010e+00, -7.82784266e-01, -7.60380513e-19, ...,
        -1.10212226e+00, -1.37679088e+00, -1.89483403e+00],
       [ 1.67630057e+00, -1.21324955e+00, -7.60380513e-19, ...,
         7.57793805e-01, -1.08826394e+00, -1.82072068e+00],
       ...,
       [-1.56393521e+00,  9.03360864e-01, -7.60380513e-19, ...,
         5.76826008e-01,  1.45333168e+00,  3.97403136e-02],
       [-1.57531157e+00,  3.62239461e-02, -7.60380513e-19, ...,
        -7.21296915e-01, -5.54126473e-01,  2.99486812e-01],
       [-1.52694089e+00,  1.51052051e+00,  3.66337572e-01, ...,
         1.56391306e-01,  9.98941754e-01,  2.39500587e-01]])


cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']


cols

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']


model = LinearRegression()


for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.009412304211971145
Test year 2017 : -0.08839594863078148
Test year 2018 : -0.04979526421788871
Test year 2019 : 0.006463809562448852
Test year 2020 : -0.001544287862749627
Test year 2021 : 0.011488412068509812
Test year 2022 : -0.0009306275137825892
Test year 2023 : 0.0538460552856318


cols = ['size','rev','illiq','ivol']


for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.01021329786011016
Test year 2017 : -0.08793262026819404
Test year 2018 : -0.04979850705536615
Test year 2019 : 0.007779285918034451
Test year 2020 : -0.0007569573338341851
Test year 2021 : 0.01083296171623438
Test year 2022 : -0.0017994744447327182
Test year 2023 : 0.0567680782563359


cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']
cols

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']


model = HuberRegressor(alpha=0.01,epsilon=1.05)


for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X=X_fulltrain, y=y_fulltrain)
    y_pred = model.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : 0.0068762585146247
Test year 2017 : -0.02917724503186392
Test year 2018 : 0.00904631578299342
Test year 2019 : -0.018436209926423253
Test year 2020 : -0.01373519780133825
Test year 2021 : -0.008464097203231491
Test year 2022 : 0.010896689656339609
Test year 2023 : -0.025851837667542954


cols = num_X_cols
cols

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']


hyperparam_grid = [
    {'n_estimators': [50], 'max_depth': [3,5,7], 
     'max_features': [3,5]}
]


model = RandomForestRegressor(random_state=42)


# Cross validation for period 0, i.e.
# train: [2008-2011], val: [2012-2015], test: [2016]
grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)


X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0], cols]
y_test = df_rank.loc[test_idx[0], 'exret']


%%time
grid_search.fit(X_fulltrain, y_fulltrain)

CPU times: user 44.2 s, sys: 255 ms, total: 44.5 s
Wall time: 45 s

GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_depth': [3, 5, 7], 'max_features': [3, 5],
                          'n_estimators': [50]}],
             return_train_score=True, scoring=make_scorer(r2_oos))


grid_search.best_params_

{'max_depth': 7, 'max_features': 3, 'n_estimators': 50}


cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)

0.03288083772090255 {'max_depth': 3, 'max_features': 3, 'n_estimators': 50}
0.03268911472458363 {'max_depth': 3, 'max_features': 5, 'n_estimators': 50}
0.05166632659254923 {'max_depth': 5, 'max_features': 3, 'n_estimators': 50}
0.05452611070305944 {'max_depth': 5, 'max_features': 5, 'n_estimators': 50}
0.05636006091771797 {'max_depth': 7, 'max_features': 3, 'n_estimators': 50}
0.05020334277634201 {'max_depth': 7, 'max_features': 5, 'n_estimators': 50}


pd.DataFrame({"features":num_X_cols,"feature_importance":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',
                                                                                                                        ascending=False)


y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.027635441051083953


%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.027635441051083953
Test year 2017 : -0.07734369016377896
Test year 2018 : -0.03959447233848512
Test year 2019 : 0.007431052632130841
Test year 2020 : 0.004284335274858608
Test year 2021 : 0.011873444443791237
Test year 2022 : -0.0027118213183869866
Test year 2023 : 0.05356830744986352
CPU times: user 13min 53s, sys: 6.11 s, total: 13min 59s
Wall time: 14min 13s


cols = num_X_cols
cols

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']


model = PLSRegression(n_components=4)


y_pred.reshape(-1).shape

(14288,)


%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    model.fit(X_fulltrain, y_fulltrain)
    y_pred = model.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.008666088091823676
Test year 2017 : -0.09331774541749294
Test year 2018 : -0.049216423103574325
Test year 2019 : 0.006125528081381337
Test year 2020 : -0.0015378783009631913
Test year 2021 : 0.011202972263563482
Test year 2022 : -0.0006573216211354094
Test year 2023 : 0.05334143576809536
CPU times: user 9.06 s, sys: 621 ms, total: 9.68 s
Wall time: 2.68 s


cols = num_X_cols
cols

['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']


X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']


pca = PCA(3, random_state=42)


pca.fit(X_fulltrain)

PCA(n_components=3, random_state=42)


pca.components_

array([[ 0.5393208 , -0.10042879, -0.02121698,  0.13077125,  0.11124688,
        -0.53998128, -0.55702611, -0.17478931, -0.19160066],
       [ 0.13101866,  0.27952024,  0.28856617,  0.04523341, -0.37628468,
        -0.23483181, -0.13341238,  0.53814814,  0.56146916],
       [ 0.06685249, -0.20225271,  0.52662786, -0.60773615, -0.43785557,
        -0.02737757,  0.022656  , -0.30870296, -0.14023771]])


pca.components_.shape

(3, 9)


X_fulltrain.shape

(194413, 9)


pca.components_.T.shape

(9, 3)


np.matmul(X_fulltrain.values,pca.components_.T)

array([[ 1.02594082,  0.52223329,  0.42203243],
       [ 1.25984674, -0.33726701,  0.86188511],
       [ 1.13373865,  0.26441179,  0.21948021],
       ...,
       [-1.01876888,  0.6986476 , -0.07366154],
       [-0.74036858, -0.54089609,  0.40091094],
       [-1.76902279,  0.43974171,  0.27114652]])


pca.fit_transform(X_fulltrain)

array([[ 1.02594082,  0.52223329,  0.42203243],
       [ 1.25984674, -0.33726701,  0.86188511],
       [ 1.13373865,  0.26441179,  0.21948021],
       ...,
       [-1.01876888,  0.6986476 , -0.07366154],
       [-0.74036858, -0.54089609,  0.40091094],
       [-1.76902279,  0.43974171,  0.27114652]])


class PCARegressor(BaseEstimator, RegressorMixin):
    def __init__(self, n_components=3):
        self.n_components = n_components
    
    def fit(self, X, y):
        self.pca_ = PCA(n_components=self.n_components).fit(X)
        self.X_ = self.pca_.transform(X)
        self.reg_ = LinearRegression().fit(self.X_,y)
        return self
    
    def predict(self, X):
        self.pred_ = self.reg_.predict(self.pca_.transform(X))
        return self.pred_


model = PCARegressor()


model.fit(X=X_fulltrain, y=y_fulltrain)

PCARegressor()


model.X_

array([[ 1.02594082,  0.52223329,  0.42203243],
       [ 1.25984674, -0.33726701,  0.86188511],
       [ 1.13373865,  0.26441179,  0.21948021],
       ...,
       [-1.01876888,  0.6986476 , -0.07366154],
       [-0.74036858, -0.54089609,  0.40091094],
       [-1.76902279,  0.43974171,  0.27114652]])


hyperparam_grid = [
    {'n_components': range(1, len(cols)+1)}
]


grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)


grid_search.fit(X=X_fulltrain, y=y_fulltrain)

GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=PCARegressor(),
             param_grid=[{'n_components': range(1, 10)}],
             return_train_score=True, scoring=make_scorer(r2_oos))


grid_search.best_params_

{'n_components': 6}


cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)

nan {'n_components': 1}
0.03978386586015345 {'n_components': 2}
0.03867864707593599 {'n_components': 3}
0.05065681706795535 {'n_components': 4}
0.050715696965028 {'n_components': 5}
0.052339724870998625 {'n_components': 6}
0.043948248157652296 {'n_components': 7}
0.05198899108126847 {'n_components': 8}
0.04966417067338399 {'n_components': 9}

/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_32890/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)


y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test,y_pred=y_pred)

-0.010497492168772826


%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.010497492168772826
Test year 2017 : -0.0892205268426225
Test year 2018 : -0.04907431002684648
Test year 2019 : 0.006466449764656601
Test year 2020 : -0.0005226873925128217
Test year 2021 : 0.009468277374521827
Test year 2022 : -0.006204075783554419
Test year 2023 : 0.05486550995882933
CPU times: user 1min 21s, sys: 7.58 s, total: 1min 29s
Wall time: 22.9 s


pca = PCA()
linear_reg = LinearRegression()
pipeline = Pipeline(steps=[('pca',pca),
                           ('linear_regression', linear_reg)])
hyperparam_grid = {'pca__n_components': range(1,len(cols)+1)}
grid_search = GridSearchCV(pipeline, hyperparam_grid, cv=[cv_idx[0]],
                           scoring=r2_oos_scorer,
                           return_train_score=True)


X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']


%%time
grid_search.fit(X=X_fulltrain,y=y_fulltrain)

CPU times: user 4.38 s, sys: 447 ms, total: 4.82 s
Wall time: 1.27 s

GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=Pipeline(steps=[('pca', PCA()),
                                       ('linear_regression',
                                        LinearRegression())]),
             param_grid={'pca__n_components': range(1, 10)},
             return_train_score=True, scoring=make_scorer(r2_oos))


grid_search.best_params_

{'pca__n_components': 6}


cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(mean_score), params)

nan {'pca__n_components': 1}
0.03978386586015345 {'pca__n_components': 2}
0.03867864707593599 {'pca__n_components': 3}
0.05065681706795535 {'pca__n_components': 4}
0.050715696965028 {'pca__n_components': 5}
0.052339724870998625 {'pca__n_components': 6}
0.043948248157652296 {'pca__n_components': 7}
0.05198899108126847 {'pca__n_components': 8}
0.04966417067338399 {'pca__n_components': 9}

/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_32890/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt
  print(np.sqrt(mean_score), params)


y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.010497492168772826


X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']


model = SGDRegressor(penalty='elasticnet')


hyperparam_grid = [{'alpha':[0.0001, 0.001, 0.01, 0.1],
                    'l1_ratio':[0.15, 0.30, 0.5, 0.7]}]


grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)


grid_search.fit(X=X_fulltrain, y=y_fulltrain)

GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=SGDRegressor(penalty='elasticnet'),
             param_grid=[{'alpha': [0.0001, 0.001, 0.01, 0.1],
                          'l1_ratio': [0.15, 0.3, 0.5, 0.7]}],
             return_train_score=True, scoring=make_scorer(r2_oos))


grid_search.best_params_

{'alpha': 0.01, 'l1_ratio': 0.5}


y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.0311148263765626


%%time
for i in range(len(fulltrain_idx)):
    X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]
    y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']
    X_test = df_rank.loc[test_idx[i], cols]
    y_test = df_rank.loc[test_idx[i], 'exret']
    
    grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)
    grid_search.fit(X_fulltrain, y_fulltrain)
    y_pred = grid_search.predict(X=X_test)
    y_pred = y_pred.reshape(-1)
    print("Test year", test_years[i],":",r2_oos(y_true=y_test, y_pred=y_pred))

Test year 2016 : -0.00864821027186724
Test year 2017 : -0.10212855467368231
Test year 2018 : -0.0360399221201777
Test year 2019 : 0.009780619233734189
Test year 2020 : -0.0013783134494498306
Test year 2021 : 0.007371679077704529
Test year 2022 : -0.010387142856184584
Test year 2023 : 0.0358911891731738
CPU times: user 1min 49s, sys: 3.03 s, total: 1min 52s
Wall time: 39.5 s


X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']


hyperparam_grid = [
    {'max_depth': [1,2,3,4,5,6], 
     'learning_rate': [0.1, 0.05, 0.01]}
]


model = GradientBoostingRegressor()


grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], 
                           scoring=r2_oos_scorer,
                           return_train_score=True)


%%time
grid_search.fit(X=X_fulltrain, y=y_fulltrain)

CPU times: user 8min 4s, sys: 1.33 s, total: 8min 6s
Wall time: 8min 10s

GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=GradientBoostingRegressor(),
             param_grid=[{'learning_rate': [0.1, 0.05, 0.01],
                          'max_depth': [1, 2, 3, 4, 5, 6]}],
             return_train_score=True, scoring=make_scorer(r2_oos))


grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 3}


cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(mean_score, params)

0.0016364787061411423 {'learning_rate': 0.1, 'max_depth': 1}
0.0031692481731780964 {'learning_rate': 0.1, 'max_depth': 2}
0.005036926182540924 {'learning_rate': 0.1, 'max_depth': 3}
0.0014929783765171845 {'learning_rate': 0.1, 'max_depth': 4}
0.002093148803113287 {'learning_rate': 0.1, 'max_depth': 5}
-0.002867794489446185 {'learning_rate': 0.1, 'max_depth': 6}
0.0009616462813958337 {'learning_rate': 0.05, 'max_depth': 1}
0.002077392481220186 {'learning_rate': 0.05, 'max_depth': 2}
0.004847425597103383 {'learning_rate': 0.05, 'max_depth': 3}
0.004754069888309176 {'learning_rate': 0.05, 'max_depth': 4}
0.004074091843459637 {'learning_rate': 0.05, 'max_depth': 5}
0.0021864643800394434 {'learning_rate': 0.05, 'max_depth': 6}
-0.001052554576351561 {'learning_rate': 0.01, 'max_depth': 1}
-0.00010459917823935072 {'learning_rate': 0.01, 'max_depth': 2}
0.0014224899839351268 {'learning_rate': 0.01, 'max_depth': 3}
0.0030169482746364995 {'learning_rate': 0.01, 'max_depth': 4}
0.003678689341579111 {'learning_rate': 0.01, 'max_depth': 5}
0.003458896012854762 {'learning_rate': 0.01, 'max_depth': 6}


y_pred = grid_search.predict(X_test)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.045627159691304264


tf.__version__

'2.8.0'


keras.__version__

'2.8.0'


X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]
y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']
X_train = X_fulltrain.values[cv_idx[0][0]]
y_train = y_fulltrain.values[cv_idx[0][0]]
X_val = X_fulltrain.values[cv_idx[0][1]]
y_val = y_fulltrain.values[cv_idx[0][1]]
X_test = df_rank.loc[test_idx[0],cols]
y_test = df_rank.loc[test_idx[0],'exret']


X_train.shape

(77643, 9)


X_val.shape

(116770, 9)


nn_model = keras.models.Sequential()
nn_model.add(keras.layers.InputLayer(input_shape=[X_fulltrain.shape[1]]))
nn_model.add(keras.layers.Dense(8, activation='relu'))
nn_model.add(keras.layers.Dense(4, activation='relu'))
nn_model.add(keras.layers.Dense(1))

2023-04-03 09:33:33.820524: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


nn_model.compile(loss='mse',optimizer='sgd')


nn_model.fit(X_train, y_train, epochs=10,
             validation_data=(X_val,y_val))

Epoch 1/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0265 - val_loss: 0.0278
Epoch 2/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0226 - val_loss: 0.0278
Epoch 3/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0225 - val_loss: 0.0276
Epoch 4/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0277
Epoch 5/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0277
Epoch 6/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0276
Epoch 7/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0272
Epoch 8/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0275
Epoch 9/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0274
Epoch 10/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0275

<keras.callbacks.History at 0x7f9a581a7e20>


y_pred = nn_model.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)

0.005978663942921458


def build_model(learning_rate=0.003):
    nn_model = keras.models.Sequential()
    nn_model.add(keras.layers.InputLayer(input_shape=[9]))
    nn_model.add(keras.layers.Dense(8, activation='relu'))
    nn_model.add(keras.layers.Dense(4, activation='relu'))
    nn_model.add(keras.layers.Dense(1))
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate) 
    nn_model.compile(loss="mse", optimizer=optimizer)
    return nn_model


# from scikeras.wrappers import KerasRegressor
# keras_reg = KerasRegressor(build_model)
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)

/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_32890/2335962656.py:3: DeprecationWarning: KerasRegressor is deprecated, use Sci-Keras (https://github.com/adriangb/scikeras) instead. See https://www.adriangb.com/scikeras/stable/migration.html for help migrating.
  keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)


hyperparams_grid = {
    'learning_rate':[0.003]
}


nn_search_cv = GridSearchCV(keras_reg, hyperparams_grid, cv=[cv_idx[0]])


nn_search_cv.fit(X_fulltrain, y_fulltrain, epochs=10,
                 validation_data=(X_val,y_val))

Epoch 1/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0326 - val_loss: 0.0286
Epoch 2/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0230 - val_loss: 0.0278
Epoch 3/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0226 - val_loss: 0.0279
Epoch 4/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0225 - val_loss: 0.0276
Epoch 5/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0225 - val_loss: 0.0277
Epoch 6/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0275
Epoch 7/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0277
Epoch 8/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0276
Epoch 9/10
2427/2427 [==============================] - 3s 1ms/step - loss: 0.0224 - val_loss: 0.0276
Epoch 10/10
2427/2427 [==============================] - 4s 1ms/step - loss: 0.0224 - val_loss: 0.0276
3650/3650 [==============================] - 3s 906us/step - loss: 0.0276
Epoch 1/10
6076/6076 [==============================] - 6s 975us/step - loss: 0.0333 - val_loss: 0.0272
Epoch 2/10
6076/6076 [==============================] - 6s 954us/step - loss: 0.0253 - val_loss: 0.0268
Epoch 3/10
6076/6076 [==============================] - 6s 956us/step - loss: 0.0252 - val_loss: 0.0267
Epoch 4/10
6076/6076 [==============================] - 6s 963us/step - loss: 0.0251 - val_loss: 0.0267
Epoch 5/10
6076/6076 [==============================] - 6s 949us/step - loss: 0.0251 - val_loss: 0.0268
Epoch 6/10
6076/6076 [==============================] - 6s 966us/step - loss: 0.0251 - val_loss: 0.0267
Epoch 7/10
6076/6076 [==============================] - 6s 1ms/step - loss: 0.0251 - val_loss: 0.0268
Epoch 8/10
6076/6076 [==============================] - 6s 1ms/step - loss: 0.0251 - val_loss: 0.0267
Epoch 9/10
6076/6076 [==============================] - 6s 1ms/step - loss: 0.0251 - val_loss: 0.0267
Epoch 10/10
6076/6076 [==============================] - 6s 963us/step - loss: 0.0251 - val_loss: 0.0267

GridSearchCV(cv=[(array([    0,     1,     2, ..., 77640, 77641, 77642]),
                  array([ 77643,  77644,  77645, ..., 194410, 194411, 194412]))],
             estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x7f9a4ac096a0>,
             param_grid={'learning_rate': [0.003]})


y_pred = nn_search_cv.predict(X_test).reshape(-1)
r2_oos(y_true=y_test, y_pred=y_pred)

-0.023086233917368748

	secID	ret_date	tradeDate	ret	rf	exret	ym	mktcap	size	rev	mom_date	mom	beta	bm	illiq	illiq_12m	vol	ivol	vol_clip	ivol_clip
0	000001.XSHE	2007-07	2007-06-29	0.316497	0.002481	0.314016	2007-06	4.266117e+10	24.476555	NaN	NaT	NaN	0.4614	0.123739	NaN	NaN	NaN	NaN	NaN	NaN
1	000001.XSHE	2007-08	2007-07-31	0.048855	0.002404	0.046451	2007-07	5.616330e+10	24.751529	0.314016	2007-06	NaN	0.6423	0.093992	0.000040	NaN	0.041604	NaN	0.041604	NaN
2	000001.XSHE	2007-09	2007-08-31	0.052105	0.002621	0.049484	2007-08	5.890714e+10	24.799228	0.046451	2007-07	NaN	0.7722	0.097085	0.000020	NaN	0.033926	NaN	0.033926	NaN
3	000001.XSHE	2007-10	2007-09-28	0.201851	0.003095	0.198756	2007-09	6.197651e+10	24.850021	0.049484	2007-08	NaN	0.7596	0.092276	0.000025	NaN	0.023872	NaN	0.023872	NaN
4	000001.XSHE	2007-11	2007-10-31	-0.249116	0.003780	-0.252896	2007-10	7.448652e+10	25.033884	0.198756	2007-09	NaN	0.7988	0.083411	0.000030	NaN	0.035921	NaN	0.035921	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
547253	689009.XSHG	2022-12	2022-11-30	-0.086579	0.001955	-0.088534	2022-11	1.708055e+10	23.561206	0.041529	2022-10	-0.474030	0.7363	0.201033	0.000122	0.000202	0.017044	0.010108	0.017044	0.010108
547254	689009.XSHG	2023-01	2022-12-30	0.088554	0.001856	0.086698	2022-12	1.560173e+10	23.470648	-0.088534	2022-11	-0.523619	0.6919	0.220085	0.000194	0.000210	0.019017	0.013598	0.019017	0.013598
547255	689009.XSHG	2023-02	2023-01-31	-0.005725	0.001910	-0.007635	2023-01	1.698332e+10	23.555498	0.086698	2022-12	-0.498602	0.7379	0.201772	0.000100	0.000210	0.014183	0.009679	0.014183	0.009679
547256	689009.XSHG	2023-03	2023-02-28	-0.011818	0.001982	-0.013800	2023-02	1.688610e+10	23.549757	-0.007635	2023-01	-0.418230	0.7453	0.202930	0.000091	0.000202	0.024867	0.012591	0.024867	0.012591
547257	689009.XSHG	NaT	2023-03-10	NaN	NaN	NaN	2023-03	1.668654e+10	23.537868	-0.013800	2023-02	-0.246403	NaN	0.193716	NaN	0.000201	NaN	NaN	NaN	NaN

	secID	ret_date	ret	rf	exret	ym	mktcap	size	rev	mom_date	mom	beta	bm	illiq	illiq_12m	vol	ivol	vol_clip	ivol_clip
0	000001.XSHE	2007-07	0.316497	0.002481	0.314016	2007-06	4.266117e+10	24.476555	NaN	NaT	NaN	0.4614	0.123739	NaN	NaN	NaN	NaN	NaN	NaN
1	000001.XSHE	2007-08	0.048855	0.002404	0.046451	2007-07	5.616330e+10	24.751529	0.314016	2007-06	NaN	0.6423	0.093992	0.000040	NaN	0.041604	NaN	0.041604	NaN
2	000001.XSHE	2007-09	0.052105	0.002621	0.049484	2007-08	5.890714e+10	24.799228	0.046451	2007-07	NaN	0.7722	0.097085	0.000020	NaN	0.033926	NaN	0.033926	NaN
3	000001.XSHE	2007-10	0.201851	0.003095	0.198756	2007-09	6.197651e+10	24.850021	0.049484	2007-08	NaN	0.7596	0.092276	0.000025	NaN	0.023872	NaN	0.023872	NaN
4	000001.XSHE	2007-11	-0.249116	0.003780	-0.252896	2007-10	7.448652e+10	25.033884	0.198756	2007-09	NaN	0.7988	0.083411	0.000030	NaN	0.035921	NaN	0.035921	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
547253	689009.XSHG	2022-12	-0.086579	0.001955	-0.088534	2022-11	1.708055e+10	23.561206	0.041529	2022-10	-0.474030	0.7363	0.201033	0.000122	0.000202	0.017044	0.010108	0.017044	0.010108
547254	689009.XSHG	2023-01	0.088554	0.001856	0.086698	2022-12	1.560173e+10	23.470648	-0.088534	2022-11	-0.523619	0.6919	0.220085	0.000194	0.000210	0.019017	0.013598	0.019017	0.013598
547255	689009.XSHG	2023-02	-0.005725	0.001910	-0.007635	2023-01	1.698332e+10	23.555498	0.086698	2022-12	-0.498602	0.7379	0.201772	0.000100	0.000210	0.014183	0.009679	0.014183	0.009679
547256	689009.XSHG	2023-03	-0.011818	0.001982	-0.013800	2023-02	1.688610e+10	23.549757	-0.007635	2023-01	-0.418230	0.7453	0.202930	0.000091	0.000202	0.024867	0.012591	0.024867	0.012591
547257	689009.XSHG	NaT	NaN	NaN	NaN	2023-03	1.668654e+10	23.537868	-0.013800	2023-02	-0.246403	NaN	0.193716	NaN	0.000201	NaN	NaN	NaN	NaN

	secID	ret_date	ret	rf	exret	ym	mktcap	size	rev	mom_date	mom	beta	bm	illiq	illiq_12m	vol	ivol	vol_clip	ivol_clip
0	000001.XSHE	2007-07	0.316497	0.002481	0.314016	2007-06	4.266117e+10	24.476555	NaN	NaT	NaN	0.4614	0.123739	NaN	NaN	NaN	NaN	NaN	NaN
1	000001.XSHE	2007-08	0.048855	0.002404	0.046451	2007-07	5.616330e+10	24.751529	0.314016	2007-06	NaN	0.6423	0.093992	0.000040	NaN	0.041604	NaN	0.041604	NaN
2	000001.XSHE	2007-09	0.052105	0.002621	0.049484	2007-08	5.890714e+10	24.799228	0.046451	2007-07	NaN	0.7722	0.097085	0.000020	NaN	0.033926	NaN	0.033926	NaN
3	000001.XSHE	2007-10	0.201851	0.003095	0.198756	2007-09	6.197651e+10	24.850021	0.049484	2007-08	NaN	0.7596	0.092276	0.000025	NaN	0.023872	NaN	0.023872	NaN
4	000001.XSHE	2007-11	-0.249116	0.003780	-0.252896	2007-10	7.448652e+10	25.033884	0.198756	2007-09	NaN	0.7988	0.083411	0.000030	NaN	0.035921	NaN	0.035921	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
547252	689009.XSHG	2022-11	0.043125	0.001596	0.041529	2022-10	1.637440e+10	23.518985	-0.166109	2022-09	-0.401406	0.7083	0.209701	0.000264	0.000206	0.059961	0.051851	0.059961	0.051851
547253	689009.XSHG	2022-12	-0.086579	0.001955	-0.088534	2022-11	1.708055e+10	23.561206	0.041529	2022-10	-0.474030	0.7363	0.201033	0.000122	0.000202	0.017044	0.010108	0.017044	0.010108
547254	689009.XSHG	2023-01	0.088554	0.001856	0.086698	2022-12	1.560173e+10	23.470648	-0.088534	2022-11	-0.523619	0.6919	0.220085	0.000194	0.000210	0.019017	0.013598	0.019017	0.013598
547255	689009.XSHG	2023-02	-0.005725	0.001910	-0.007635	2023-01	1.698332e+10	23.555498	0.086698	2022-12	-0.498602	0.7379	0.201772	0.000100	0.000210	0.014183	0.009679	0.014183	0.009679
547256	689009.XSHG	2023-03	-0.011818	0.001982	-0.013800	2023-02	1.688610e+10	23.549757	-0.007635	2023-01	-0.418230	0.7453	0.202930	0.000091	0.000202	0.024867	0.012591	0.024867	0.012591

	secID	ret_date	ret	rf	exret	ym	mktcap	size	rev	mom_date	mom	beta	bm	illiq	illiq_12m	vol	ivol	vol_clip	ivol_clip
6	000001.XSHE	2008-01	-0.137306	0.002949	-0.140255	2007-12	6.574629e+10	24.909069	0.066834	2007-11	NaN	0.9468	0.094476	0.000025	NaN	0.026541	NaN	0.026541	NaN
7	000001.XSHE	2008-02	-0.004504	0.002946	-0.007450	2008-01	5.850212e+10	24.792329	-0.140255	2007-12	NaN	0.9654	0.109513	0.000039	NaN	0.037722	0.012909	0.037722	0.012909
8	000001.XSHE	2008-03	-0.149321	0.002746	-0.152068	2008-02	5.823860e+10	24.787814	-0.007450	2008-01	NaN	1.0292	0.110009	0.000064	NaN	0.041448	0.009032	0.041448	0.009032
9	000001.XSHE	2008-04	0.050355	0.002862	0.047493	2008-03	4.954234e+10	24.626093	-0.152068	2008-02	NaN	1.0238	0.201102	0.000043	NaN	0.045109	0.021484	0.045109	0.021484
10	000001.XSHE	2008-05	-0.148211	0.002953	-0.151164	2008-04	5.203702e+10	24.675221	0.047493	2008-03	NaN	1.0212	0.206701	0.000051	0.000038	0.046323	0.015098	0.046323	0.015098
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
547252	689009.XSHG	2022-11	0.043125	0.001596	0.041529	2022-10	1.637440e+10	23.518985	-0.166109	2022-09	-0.401406	0.7083	0.209701	0.000264	0.000206	0.059961	0.051851	0.059961	0.051851
547253	689009.XSHG	2022-12	-0.086579	0.001955	-0.088534	2022-11	1.708055e+10	23.561206	0.041529	2022-10	-0.474030	0.7363	0.201033	0.000122	0.000202	0.017044	0.010108	0.017044	0.010108
547254	689009.XSHG	2023-01	0.088554	0.001856	0.086698	2022-12	1.560173e+10	23.470648	-0.088534	2022-11	-0.523619	0.6919	0.220085	0.000194	0.000210	0.019017	0.013598	0.019017	0.013598
547255	689009.XSHG	2023-02	-0.005725	0.001910	-0.007635	2023-01	1.698332e+10	23.555498	0.086698	2022-12	-0.498602	0.7379	0.201772	0.000100	0.000210	0.014183	0.009679	0.014183	0.009679
547256	689009.XSHG	2023-03	-0.011818	0.001982	-0.013800	2023-02	1.688610e+10	23.549757	-0.007635	2023-01	-0.418230	0.7453	0.202930	0.000091	0.000202	0.024867	0.012591	0.024867	0.012591

	secID	ret_date	exret	ym	size	rev	mom	beta	bm	illiq	illiq_12m	vol	ivol
0	000001.XSHE	2008-01	-0.140255	2007-12	24.909069	0.066834	NaN	0.9468	0.094476	0.000025	NaN	0.026541	NaN
1	000001.XSHE	2008-02	-0.007450	2008-01	24.792329	-0.140255	NaN	0.9654	0.109513	0.000039	NaN	0.037722	0.012909
2	000001.XSHE	2008-03	-0.152068	2008-02	24.787814	-0.007450	NaN	1.0292	0.110009	0.000064	NaN	0.041448	0.009032
3	000001.XSHE	2008-04	0.047493	2008-03	24.626093	-0.152068	NaN	1.0238	0.201102	0.000043	NaN	0.045109	0.021484
4	000001.XSHE	2008-05	-0.151164	2008-04	24.675221	0.047493	NaN	1.0212	0.206701	0.000051	0.000038	0.046323	0.015098
...	...	...	...	...	...	...	...	...	...	...	...	...	...
514954	689009.XSHG	2022-11	0.041529	2022-10	23.518985	-0.166109	-0.401406	0.7083	0.209701	0.000264	0.000206	0.059961	0.051851
514955	689009.XSHG	2022-12	-0.088534	2022-11	23.561206	0.041529	-0.474030	0.7363	0.201033	0.000122	0.000202	0.017044	0.010108
514956	689009.XSHG	2023-01	0.086698	2022-12	23.470648	-0.088534	-0.523619	0.6919	0.220085	0.000194	0.000210	0.019017	0.013598
514957	689009.XSHG	2023-02	-0.007635	2023-01	23.555498	0.086698	-0.498602	0.7379	0.201772	0.000100	0.000210	0.014183	0.009679
514958	689009.XSHG	2023-03	-0.013800	2023-02	23.549757	-0.007635	-0.418230	0.7453	0.202930	0.000091	0.000202	0.024867	0.012591

Data¶

NA 值处理¶

Use rank instead of numerical values¶

Train, Validation, Test split¶

Evaluation metrics¶

Sklearn¶

Transformation pipeline example¶

Models¶

Linear regression¶

Huber regressor¶

Random Forest¶

Partial Least Squares¶

Principal Component Regression¶

PCA transform¶

PCA regression¶

Pipeline¶

Elastic Net¶

Gradient Boosted Regression Trees¶

Neural Nets¶

GridSeachCV Neural Nets¶

		ret_date	size
ret_date
2008-01	0	0.0	0.969559
	183	0.0	0.990868
	651	0.0	0.522070
	1504	0.0	0.678843
	1687	0.0	-0.231355
...	...	...	...
2023-03	514851	0.0	-0.743772
	514871	0.0	0.278208
	514897	0.0	0.041658
	514929	0.0	0.943898
	514958	0.0	0.672179

	ret_date	level_1	size	rev	mom	beta	bm	illiq	illiq_12m	vol	ivol
0	2008-01	0	0.969559	-0.858447	0.000000	0.086758	-0.672755	-0.978691	0.000000	0.275495	0.000000
1	2008-01	183	0.990868	-0.990868	0.852359	0.662100	-0.375951	-0.996956	-0.989346	0.745814	0.000000
2	2008-01	651	0.522070	-0.972603	0.552511	0.523592	0.283105	-0.223744	-0.595129	0.814307	0.000000
3	2008-01	1504	0.678843	-0.506849	-0.517504	0.775495	-0.636225	-0.698630	-0.517504	0.493151	0.000000
4	2008-01	1687	-0.231355	-0.945967	0.709285	0.000000	-0.403349	0.000000	0.000000	-0.982496	0.000000
...	...	...	...	...	...	...	...	...	...	...	...
510657	2023-03	514851	-0.743772	0.284488	0.184844	-0.557254	0.279464	0.348964	0.836718	-0.684739	-0.466192
510658	2023-03	514871	0.278208	-0.802805	0.209964	0.550345	-0.735817	-0.220431	0.083944	0.504291	0.274440
510659	2023-03	514897	0.041658	-0.816203	0.559975	-0.490057	0.016956	-0.027842	-0.103203	-0.330961	-0.108227
510660	2023-03	514929	0.943898	-0.279883	-0.469960	-0.014026	0.134603	-0.969018	-0.969018	-0.486707	-0.643291
510661	2023-03	514958	0.672179	-0.409253	-0.964831	0.139418	-0.605610	-0.406322	-0.243458	0.471216	0.117856

	ret_date	size	rev	mom	beta	bm	illiq	illiq_12m	vol	ivol
level_1
0	2008-01	0.969559	-0.858447	0.000000	0.086758	-0.672755	-0.978691	0.000000	0.275495	0.000000
183	2008-01	0.990868	-0.990868	0.852359	0.662100	-0.375951	-0.996956	-0.989346	0.745814	0.000000
651	2008-01	0.522070	-0.972603	0.552511	0.523592	0.283105	-0.223744	-0.595129	0.814307	0.000000
1504	2008-01	0.678843	-0.506849	-0.517504	0.775495	-0.636225	-0.698630	-0.517504	0.493151	0.000000
1687	2008-01	-0.231355	-0.945967	0.709285	0.000000	-0.403349	0.000000	0.000000	-0.982496	0.000000
...	...	...	...	...	...	...	...	...	...	...
514851	2023-03	-0.743772	0.284488	0.184844	-0.557254	0.279464	0.348964	0.836718	-0.684739	-0.466192
514871	2023-03	0.278208	-0.802805	0.209964	0.550345	-0.735817	-0.220431	0.083944	0.504291	0.274440
514897	2023-03	0.041658	-0.816203	0.559975	-0.490057	0.016956	-0.027842	-0.103203	-0.330961	-0.108227
514929	2023-03	0.943898	-0.279883	-0.469960	-0.014026	0.134603	-0.969018	-0.969018	-0.486707	-0.643291
514958	2023-03	0.672179	-0.409253	-0.964831	0.139418	-0.605610	-0.406322	-0.243458	0.471216	0.117856

	features	feature_importance
5	illiq	0.198897
1	rev	0.158943
0	size	0.132347
7	vol	0.116127
8	ivol	0.114038
2	mom	0.093076
4	bm	0.079133
6	illiq_12m	0.063776
3	beta	0.043663