{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "hourly-fever", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "plt.rcParams['figure.figsize'] = (16.0, 9.0)\n", "\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.metrics import mean_squared_error, make_scorer\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "from sklearn.linear_model import LinearRegression, HuberRegressor, SGDRegressor\n", "from sklearn.cross_decomposition import PLSRegression\n", "from sklearn.decomposition import PCA\n", "# from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n", "from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin\n", "from sklearn.pipeline import Pipeline\n", "\n", "# import lightgbm as lgb\n", "\n", "import tensorflow as tf\n", "from tensorflow import keras" ] }, { "cell_type": "markdown", "id": "under-restaurant", "metadata": {}, "source": [ "- [Data](#data)\n", " - [NA 值处理](#na-值处理)\n", " - [Use rank instead of numerical values](#use-rank-instead-of-numerical-values)\n", "- [Train, Validation, Test split](#train-validation-test-split)\n", "- [Evaluation metrics](#evaluation-metrics)\n", "- [Models](#models)\n", " - [Linear regression](#linear-regression)\n", " - [Huber regressor](#huber-regressor)\n", " - [Random Forest](#random-forest)\n", " - [Partial Least Squares](#partial-least-squares)\n", " - [Principal Component Regression](#principal-component-regression)\n", " - [PCA transform](#pca-transform)\n", " - [PCA regression](#pca-regression)\n", " - [Pipeline](#pipeline)\n", " - [Elastic Net](#elastic-net)\n", " - [Gradient Boosted Regression Trees](#gradient-boosted-regression-trees)\n", " - [Neural Nets](#neural-nets)\n", " - [GridSeachCV Neural Nets](#gridseachcv-neural-nets)\n", "- [Transformation pipeline example](#transformation-pipeline-example)" ] }, { "cell_type": "markdown", "id": "intended-belize", "metadata": {}, "source": [ "# Data" ] }, { "cell_type": "code", "execution_count": 6, "id": "broken-matthew", "metadata": {}, "outputs": [], "source": [ "df = pd.read_pickle('../../../data/factor_exposure/all_exposure_2024.pkl')" ] }, { "cell_type": "code", "execution_count": 7, "id": "ee6796e4-a80e-466d-a2d2-71329f02b4ef", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
secIDret_datetradeDateretrfexretymmktcapsizerevmom_datemombetabmilliqilliq_12mvolivolvol_clipivol_clip
0000001.XSHE2007-072007-06-290.3164970.0024810.3140162007-064.266117e+1024.476555NaNNaTNaN0.46140.123739NaNNaNNaNNaNNaNNaN
1000001.XSHE2007-082007-07-310.0488550.0024040.0464512007-075.616330e+1024.7515290.3140162007-06NaN0.64230.0939920.000040NaN0.041604NaN0.041604NaN
2000001.XSHE2007-092007-08-310.0521050.0026210.0494842007-085.890714e+1024.7992280.0464512007-07NaN0.77220.0970850.000020NaN0.033926NaN0.033926NaN
3000001.XSHE2007-102007-09-280.2018510.0030950.1987562007-096.197651e+1024.8500210.0494842007-08NaN0.75960.0922760.000025NaN0.023872NaN0.023872NaN
4000001.XSHE2007-112007-10-31-0.2491160.003780-0.2528962007-107.448652e+1025.0338840.1987562007-09NaN0.79880.0834110.000030NaN0.035921NaN0.035921NaN
...............................................................
616458689009.XSHG2024-012023-12-29-0.2130820.001902-0.2149832023-121.552630e+1023.465801-0.1059962023-110.0856021.04480.2475250.0001100.0001070.0246340.0182280.0246340.018228
616459689009.XSHG2024-022024-01-310.2982010.0017490.2964512024-011.221793e+1023.226170-0.2149832023-12-0.1063571.23140.3136070.0001840.0001160.0246070.0138900.0246070.013890
616460689009.XSHG2024-032024-02-29-0.0115510.001783-0.0133342024-021.586132e+1023.4871490.2964512024-01-0.2927271.49050.2415690.0001640.0001200.0442430.0247550.0442430.024755
616461689009.XSHG2024-042024-03-29-0.0717860.001687-0.0734742024-031.543851e+1023.460131-0.0133342024-02-0.1950051.54770.2471270.0000850.0001180.0302060.0229280.0302060.022928
616462689009.XSHGNaT2024-04-12NaNNaNNaN2024-041.433023e+1023.385637-0.0734742024-03-0.104366NaN0.260342NaN0.000121NaNNaNNaNNaN
\n", "

616463 rows × 20 columns

\n", "
" ], "text/plain": [ " secID ret_date tradeDate ret rf exret \\\n", "0 000001.XSHE 2007-07 2007-06-29 0.316497 0.002481 0.314016 \n", "1 000001.XSHE 2007-08 2007-07-31 0.048855 0.002404 0.046451 \n", "2 000001.XSHE 2007-09 2007-08-31 0.052105 0.002621 0.049484 \n", "3 000001.XSHE 2007-10 2007-09-28 0.201851 0.003095 0.198756 \n", "4 000001.XSHE 2007-11 2007-10-31 -0.249116 0.003780 -0.252896 \n", "... ... ... ... ... ... ... \n", "616458 689009.XSHG 2024-01 2023-12-29 -0.213082 0.001902 -0.214983 \n", "616459 689009.XSHG 2024-02 2024-01-31 0.298201 0.001749 0.296451 \n", "616460 689009.XSHG 2024-03 2024-02-29 -0.011551 0.001783 -0.013334 \n", "616461 689009.XSHG 2024-04 2024-03-29 -0.071786 0.001687 -0.073474 \n", "616462 689009.XSHG NaT 2024-04-12 NaN NaN NaN \n", "\n", " ym mktcap size rev mom_date mom beta \\\n", "0 2007-06 4.266117e+10 24.476555 NaN NaT NaN 0.4614 \n", "1 2007-07 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 \n", "2 2007-08 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 \n", "3 2007-09 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 \n", "4 2007-10 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 \n", "... ... ... ... ... ... ... ... \n", "616458 2023-12 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 \n", "616459 2024-01 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 \n", "616460 2024-02 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 \n", "616461 2024-03 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 \n", "616462 2024-04 1.433023e+10 23.385637 -0.073474 2024-03 -0.104366 NaN \n", "\n", " bm illiq illiq_12m vol ivol vol_clip ivol_clip \n", "0 0.123739 NaN NaN NaN NaN NaN NaN \n", "1 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN \n", "2 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN \n", "3 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN \n", "4 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN \n", "... ... ... ... ... ... ... ... \n", "616458 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228 \n", "616459 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890 \n", "616460 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755 \n", "616461 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928 \n", "616462 0.260342 NaN 0.000121 NaN NaN NaN NaN \n", "\n", "[616463 rows x 20 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 8, "id": "c5f2f0ad-7408-4afb-8461-2ecdbd1eef49", "metadata": {}, "outputs": [], "source": [ "df.drop('tradeDate',axis=1,inplace=True)" ] }, { "cell_type": "code", "execution_count": 9, "id": "postal-medicaid", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
secIDret_dateretrfexretymmktcapsizerevmom_datemombetabmilliqilliq_12mvolivolvol_clipivol_clip
0000001.XSHE2007-070.3164970.0024810.3140162007-064.266117e+1024.476555NaNNaTNaN0.46140.123739NaNNaNNaNNaNNaNNaN
1000001.XSHE2007-080.0488550.0024040.0464512007-075.616330e+1024.7515290.3140162007-06NaN0.64230.0939920.000040NaN0.041604NaN0.041604NaN
2000001.XSHE2007-090.0521050.0026210.0494842007-085.890714e+1024.7992280.0464512007-07NaN0.77220.0970850.000020NaN0.033926NaN0.033926NaN
3000001.XSHE2007-100.2018510.0030950.1987562007-096.197651e+1024.8500210.0494842007-08NaN0.75960.0922760.000025NaN0.023872NaN0.023872NaN
4000001.XSHE2007-11-0.2491160.003780-0.2528962007-107.448652e+1025.0338840.1987562007-09NaN0.79880.0834110.000030NaN0.035921NaN0.035921NaN
............................................................
616458689009.XSHG2024-01-0.2130820.001902-0.2149832023-121.552630e+1023.465801-0.1059962023-110.0856021.04480.2475250.0001100.0001070.0246340.0182280.0246340.018228
616459689009.XSHG2024-020.2982010.0017490.2964512024-011.221793e+1023.226170-0.2149832023-12-0.1063571.23140.3136070.0001840.0001160.0246070.0138900.0246070.013890
616460689009.XSHG2024-03-0.0115510.001783-0.0133342024-021.586132e+1023.4871490.2964512024-01-0.2927271.49050.2415690.0001640.0001200.0442430.0247550.0442430.024755
616461689009.XSHG2024-04-0.0717860.001687-0.0734742024-031.543851e+1023.460131-0.0133342024-02-0.1950051.54770.2471270.0000850.0001180.0302060.0229280.0302060.022928
616462689009.XSHGNaTNaNNaNNaN2024-041.433023e+1023.385637-0.0734742024-03-0.104366NaN0.260342NaN0.000121NaNNaNNaNNaN
\n", "

616463 rows × 19 columns

\n", "
" ], "text/plain": [ " secID ret_date ret rf exret ym \\\n", "0 000001.XSHE 2007-07 0.316497 0.002481 0.314016 2007-06 \n", "1 000001.XSHE 2007-08 0.048855 0.002404 0.046451 2007-07 \n", "2 000001.XSHE 2007-09 0.052105 0.002621 0.049484 2007-08 \n", "3 000001.XSHE 2007-10 0.201851 0.003095 0.198756 2007-09 \n", "4 000001.XSHE 2007-11 -0.249116 0.003780 -0.252896 2007-10 \n", "... ... ... ... ... ... ... \n", "616458 689009.XSHG 2024-01 -0.213082 0.001902 -0.214983 2023-12 \n", "616459 689009.XSHG 2024-02 0.298201 0.001749 0.296451 2024-01 \n", "616460 689009.XSHG 2024-03 -0.011551 0.001783 -0.013334 2024-02 \n", "616461 689009.XSHG 2024-04 -0.071786 0.001687 -0.073474 2024-03 \n", "616462 689009.XSHG NaT NaN NaN NaN 2024-04 \n", "\n", " mktcap size rev mom_date mom beta \\\n", "0 4.266117e+10 24.476555 NaN NaT NaN 0.4614 \n", "1 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 \n", "2 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 \n", "3 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 \n", "4 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 \n", "... ... ... ... ... ... ... \n", "616458 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 \n", "616459 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 \n", "616460 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 \n", "616461 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 \n", "616462 1.433023e+10 23.385637 -0.073474 2024-03 -0.104366 NaN \n", "\n", " bm illiq illiq_12m vol ivol vol_clip ivol_clip \n", "0 0.123739 NaN NaN NaN NaN NaN NaN \n", "1 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN \n", "2 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN \n", "3 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN \n", "4 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN \n", "... ... ... ... ... ... ... ... \n", "616458 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228 \n", "616459 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890 \n", "616460 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755 \n", "616461 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928 \n", "616462 0.260342 NaN 0.000121 NaN NaN NaN NaN \n", "\n", "[616463 rows x 19 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "about-wesley", "metadata": {}, "source": [ "## NA 值处理" ] }, { "cell_type": "code", "execution_count": 10, "id": "68ceb981-e3b0-4946-a23a-b9a17c85aee0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "secID 0\n", "ret_date 5299\n", "ret 23412\n", "rf 5299\n", "exret 23412\n", "ym 0\n", "mktcap 17365\n", "size 17365\n", "rev 22223\n", "mom_date 4110\n", "mom 56595\n", "beta 29710\n", "bm 5210\n", "illiq 36680\n", "illiq_12m 107758\n", "vol 25669\n", "ivol 40175\n", "vol_clip 25669\n", "ivol_clip 40175\n" ] } ], "source": [ "for col in df.columns:\n", " print(col, df[col].isna().sum())" ] }, { "cell_type": "markdown", "id": "equipped-meaning", "metadata": {}, "source": [ "ret_date 为 NA 的删除,已到最新数据处" ] }, { "cell_type": "code", "execution_count": 11, "id": "periodic-london", "metadata": {}, "outputs": [], "source": [ "df = df[~df['ret_date'].isna()].copy()" ] }, { "cell_type": "code", "execution_count": 12, "id": "cubic-console", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
secIDret_dateretrfexretymmktcapsizerevmom_datemombetabmilliqilliq_12mvolivolvol_clipivol_clip
0000001.XSHE2007-070.3164970.0024810.3140162007-064.266117e+1024.476555NaNNaTNaN0.46140.123739NaNNaNNaNNaNNaNNaN
1000001.XSHE2007-080.0488550.0024040.0464512007-075.616330e+1024.7515290.3140162007-06NaN0.64230.0939920.000040NaN0.041604NaN0.041604NaN
2000001.XSHE2007-090.0521050.0026210.0494842007-085.890714e+1024.7992280.0464512007-07NaN0.77220.0970850.000020NaN0.033926NaN0.033926NaN
3000001.XSHE2007-100.2018510.0030950.1987562007-096.197651e+1024.8500210.0494842007-08NaN0.75960.0922760.000025NaN0.023872NaN0.023872NaN
4000001.XSHE2007-11-0.2491160.003780-0.2528962007-107.448652e+1025.0338840.1987562007-09NaN0.79880.0834110.000030NaN0.035921NaN0.035921NaN
............................................................
616457689009.XSHG2023-12-0.1039270.002068-0.1059962023-111.732706e+1023.5755350.0075402023-10-0.0176750.95410.2218030.0000860.0001150.0175940.0151280.0175940.015128
616458689009.XSHG2024-01-0.2130820.001902-0.2149832023-121.552630e+1023.465801-0.1059962023-110.0856021.04480.2475250.0001100.0001070.0246340.0182280.0246340.018228
616459689009.XSHG2024-020.2982010.0017490.2964512024-011.221793e+1023.226170-0.2149832023-12-0.1063571.23140.3136070.0001840.0001160.0246070.0138900.0246070.013890
616460689009.XSHG2024-03-0.0115510.001783-0.0133342024-021.586132e+1023.4871490.2964512024-01-0.2927271.49050.2415690.0001640.0001200.0442430.0247550.0442430.024755
616461689009.XSHG2024-04-0.0717860.001687-0.0734742024-031.543851e+1023.460131-0.0133342024-02-0.1950051.54770.2471270.0000850.0001180.0302060.0229280.0302060.022928
\n", "

611164 rows × 19 columns

\n", "
" ], "text/plain": [ " secID ret_date ret rf exret ym \\\n", "0 000001.XSHE 2007-07 0.316497 0.002481 0.314016 2007-06 \n", "1 000001.XSHE 2007-08 0.048855 0.002404 0.046451 2007-07 \n", "2 000001.XSHE 2007-09 0.052105 0.002621 0.049484 2007-08 \n", "3 000001.XSHE 2007-10 0.201851 0.003095 0.198756 2007-09 \n", "4 000001.XSHE 2007-11 -0.249116 0.003780 -0.252896 2007-10 \n", "... ... ... ... ... ... ... \n", "616457 689009.XSHG 2023-12 -0.103927 0.002068 -0.105996 2023-11 \n", "616458 689009.XSHG 2024-01 -0.213082 0.001902 -0.214983 2023-12 \n", "616459 689009.XSHG 2024-02 0.298201 0.001749 0.296451 2024-01 \n", "616460 689009.XSHG 2024-03 -0.011551 0.001783 -0.013334 2024-02 \n", "616461 689009.XSHG 2024-04 -0.071786 0.001687 -0.073474 2024-03 \n", "\n", " mktcap size rev mom_date mom beta \\\n", "0 4.266117e+10 24.476555 NaN NaT NaN 0.4614 \n", "1 5.616330e+10 24.751529 0.314016 2007-06 NaN 0.6423 \n", "2 5.890714e+10 24.799228 0.046451 2007-07 NaN 0.7722 \n", "3 6.197651e+10 24.850021 0.049484 2007-08 NaN 0.7596 \n", "4 7.448652e+10 25.033884 0.198756 2007-09 NaN 0.7988 \n", "... ... ... ... ... ... ... \n", "616457 1.732706e+10 23.575535 0.007540 2023-10 -0.017675 0.9541 \n", "616458 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 \n", "616459 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 \n", "616460 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 \n", "616461 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 \n", "\n", " bm illiq illiq_12m vol ivol vol_clip ivol_clip \n", "0 0.123739 NaN NaN NaN NaN NaN NaN \n", "1 0.093992 0.000040 NaN 0.041604 NaN 0.041604 NaN \n", "2 0.097085 0.000020 NaN 0.033926 NaN 0.033926 NaN \n", "3 0.092276 0.000025 NaN 0.023872 NaN 0.023872 NaN \n", "4 0.083411 0.000030 NaN 0.035921 NaN 0.035921 NaN \n", "... ... ... ... ... ... ... ... \n", "616457 0.221803 0.000086 0.000115 0.017594 0.015128 0.017594 0.015128 \n", "616458 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228 \n", "616459 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890 \n", "616460 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755 \n", "616461 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928 \n", "\n", "[611164 rows x 19 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "biological-wayne", "metadata": {}, "source": [ "momentum 从 2008-01 开始。简单起见,把所有数据调整为从2008-01开始。" ] }, { "cell_type": "code", "execution_count": 13, "id": "crazy-flash", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Period('2008-01', 'M')" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[~df['mom'].isna(),'ret_date'].min()" ] }, { "cell_type": "code", "execution_count": 14, "id": "little-evaluation", "metadata": {}, "outputs": [], "source": [ "df = df[df['ret_date'] >= '2008-01'].copy()" ] }, { "cell_type": "code", "execution_count": 15, "id": "likely-estimate", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "secID 0\n", "ret_date 0\n", "ret 17713\n", "rf 0\n", "exret 17713\n", "ym 0\n", "mktcap 17015\n", "size 17015\n", "rev 21679\n", "mom_date 3931\n", "mom 43462\n", "beta 24044\n", "bm 5017\n", "illiq 30823\n", "illiq_12m 95560\n", "vol 19966\n", "ivol 21978\n", "vol_clip 19966\n", "ivol_clip 21978\n" ] } ], "source": [ "for col in df.columns:\n", " print(col, df[col].isna().sum())" ] }, { "cell_type": "markdown", "id": "liable-agreement", "metadata": {}, "source": [ "剩余的NA值有至少三个来源:\n", "- 由于停牌日期填充造成,\n", "- 由于计算时要求最低样本数造成,\n", "- 由优矿直接给出了NA值" ] }, { "cell_type": "markdown", "id": "minor-pressing", "metadata": {}, "source": [ "return 的 NA 值直接删除" ] }, { "cell_type": "code", "execution_count": 16, "id": "elementary-sixth", "metadata": {}, "outputs": [], "source": [ "df = df[~df['ret'].isna()].copy()" ] }, { "cell_type": "code", "execution_count": 17, "id": "agreed-poison", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
secIDret_dateretrfexretymmktcapsizerevmom_datemombetabmilliqilliq_12mvolivolvol_clipivol_clip
6000001.XSHE2008-01-0.1373060.002949-0.1402552007-126.574629e+1024.9090690.0668342007-11NaN0.94680.0944760.000025NaN0.026541NaN0.026541NaN
7000001.XSHE2008-02-0.0045040.002946-0.0074502008-015.850212e+1024.792329-0.1402552007-12NaN0.96540.1095130.000039NaN0.0377220.0129090.0377220.012909
8000001.XSHE2008-03-0.1493210.002746-0.1520682008-025.823860e+1024.787814-0.0074502008-01NaN1.02920.1100090.000064NaN0.0414480.0090320.0414480.009032
9000001.XSHE2008-040.0503550.0028620.0474932008-034.954234e+1024.626093-0.1520682008-02NaN1.02380.2011020.000043NaN0.0451090.0214840.0451090.021484
10000001.XSHE2008-05-0.1482110.002953-0.1511642008-045.203702e+1024.6752210.0474932008-03NaN1.02120.2067010.0000510.0000380.0463230.0150980.0463230.015098
............................................................
616457689009.XSHG2023-12-0.1039270.002068-0.1059962023-111.732706e+1023.5755350.0075402023-10-0.0176750.95410.2218030.0000860.0001150.0175940.0151280.0175940.015128
616458689009.XSHG2024-01-0.2130820.001902-0.2149832023-121.552630e+1023.465801-0.1059962023-110.0856021.04480.2475250.0001100.0001070.0246340.0182280.0246340.018228
616459689009.XSHG2024-020.2982010.0017490.2964512024-011.221793e+1023.226170-0.2149832023-12-0.1063571.23140.3136070.0001840.0001160.0246070.0138900.0246070.013890
616460689009.XSHG2024-03-0.0115510.001783-0.0133342024-021.586132e+1023.4871490.2964512024-01-0.2927271.49050.2415690.0001640.0001200.0442430.0247550.0442430.024755
616461689009.XSHG2024-04-0.0717860.001687-0.0734742024-031.543851e+1023.460131-0.0133342024-02-0.1950051.54770.2471270.0000850.0001180.0302060.0229280.0302060.022928
\n", "

580482 rows × 19 columns

\n", "
" ], "text/plain": [ " secID ret_date ret rf exret ym \\\n", "6 000001.XSHE 2008-01 -0.137306 0.002949 -0.140255 2007-12 \n", "7 000001.XSHE 2008-02 -0.004504 0.002946 -0.007450 2008-01 \n", "8 000001.XSHE 2008-03 -0.149321 0.002746 -0.152068 2008-02 \n", "9 000001.XSHE 2008-04 0.050355 0.002862 0.047493 2008-03 \n", "10 000001.XSHE 2008-05 -0.148211 0.002953 -0.151164 2008-04 \n", "... ... ... ... ... ... ... \n", "616457 689009.XSHG 2023-12 -0.103927 0.002068 -0.105996 2023-11 \n", "616458 689009.XSHG 2024-01 -0.213082 0.001902 -0.214983 2023-12 \n", "616459 689009.XSHG 2024-02 0.298201 0.001749 0.296451 2024-01 \n", "616460 689009.XSHG 2024-03 -0.011551 0.001783 -0.013334 2024-02 \n", "616461 689009.XSHG 2024-04 -0.071786 0.001687 -0.073474 2024-03 \n", "\n", " mktcap size rev mom_date mom beta \\\n", "6 6.574629e+10 24.909069 0.066834 2007-11 NaN 0.9468 \n", "7 5.850212e+10 24.792329 -0.140255 2007-12 NaN 0.9654 \n", "8 5.823860e+10 24.787814 -0.007450 2008-01 NaN 1.0292 \n", "9 4.954234e+10 24.626093 -0.152068 2008-02 NaN 1.0238 \n", "10 5.203702e+10 24.675221 0.047493 2008-03 NaN 1.0212 \n", "... ... ... ... ... ... ... \n", "616457 1.732706e+10 23.575535 0.007540 2023-10 -0.017675 0.9541 \n", "616458 1.552630e+10 23.465801 -0.105996 2023-11 0.085602 1.0448 \n", "616459 1.221793e+10 23.226170 -0.214983 2023-12 -0.106357 1.2314 \n", "616460 1.586132e+10 23.487149 0.296451 2024-01 -0.292727 1.4905 \n", "616461 1.543851e+10 23.460131 -0.013334 2024-02 -0.195005 1.5477 \n", "\n", " bm illiq illiq_12m vol ivol vol_clip ivol_clip \n", "6 0.094476 0.000025 NaN 0.026541 NaN 0.026541 NaN \n", "7 0.109513 0.000039 NaN 0.037722 0.012909 0.037722 0.012909 \n", "8 0.110009 0.000064 NaN 0.041448 0.009032 0.041448 0.009032 \n", "9 0.201102 0.000043 NaN 0.045109 0.021484 0.045109 0.021484 \n", "10 0.206701 0.000051 0.000038 0.046323 0.015098 0.046323 0.015098 \n", "... ... ... ... ... ... ... ... \n", "616457 0.221803 0.000086 0.000115 0.017594 0.015128 0.017594 0.015128 \n", "616458 0.247525 0.000110 0.000107 0.024634 0.018228 0.024634 0.018228 \n", "616459 0.313607 0.000184 0.000116 0.024607 0.013890 0.024607 0.013890 \n", "616460 0.241569 0.000164 0.000120 0.044243 0.024755 0.044243 0.024755 \n", "616461 0.247127 0.000085 0.000118 0.030206 0.022928 0.030206 0.022928 \n", "\n", "[580482 rows x 19 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 18, "id": "enhanced-garden", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "secID 0\n", "ret_date 0\n", "ret 0\n", "rf 0\n", "exret 0\n", "ym 0\n", "mktcap 0\n", "size 0\n", "rev 4664\n", "mom_date 3931\n", "mom 43434\n", "beta 22149\n", "bm 1463\n", "illiq 13508\n", "illiq_12m 79369\n", "vol 2874\n", "ivol 4699\n", "vol_clip 2874\n", "ivol_clip 4699\n" ] } ], "source": [ "for col in df.columns:\n", " print(col, df[col].isna().sum())" ] }, { "cell_type": "code", "execution_count": 19, "id": "based-advertiser", "metadata": {}, "outputs": [], "source": [ "df.drop(['mom_date','mktcap','vol_clip','ivol_clip'],axis=1,inplace=True)" ] }, { "cell_type": "code", "execution_count": 20, "id": "backed-sharing", "metadata": {}, "outputs": [], "source": [ "df.drop(['ret','rf'],axis=1,inplace=True)" ] }, { "cell_type": "code", "execution_count": 21, "id": "appropriate-minority", "metadata": {}, "outputs": [], "source": [ "df.reset_index(inplace=True,drop=True)" ] }, { "cell_type": "code", "execution_count": 22, "id": "drawn-prompt", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
secIDret_dateexretymsizerevmombetabmilliqilliq_12mvolivol
0000001.XSHE2008-01-0.1402552007-1224.9090690.066834NaN0.94680.0944760.000025NaN0.026541NaN
1000001.XSHE2008-02-0.0074502008-0124.792329-0.140255NaN0.96540.1095130.000039NaN0.0377220.012909
2000001.XSHE2008-03-0.1520682008-0224.787814-0.007450NaN1.02920.1100090.000064NaN0.0414480.009032
3000001.XSHE2008-040.0474932008-0324.626093-0.152068NaN1.02380.2011020.000043NaN0.0451090.021484
4000001.XSHE2008-05-0.1511642008-0424.6752210.047493NaN1.02120.2067010.0000510.0000380.0463230.015098
..........................................
580477689009.XSHG2023-12-0.1059962023-1123.5755350.007540-0.0176750.95410.2218030.0000860.0001150.0175940.015128
580478689009.XSHG2024-01-0.2149832023-1223.465801-0.1059960.0856021.04480.2475250.0001100.0001070.0246340.018228
580479689009.XSHG2024-020.2964512024-0123.226170-0.214983-0.1063571.23140.3136070.0001840.0001160.0246070.013890
580480689009.XSHG2024-03-0.0133342024-0223.4871490.296451-0.2927271.49050.2415690.0001640.0001200.0442430.024755
580481689009.XSHG2024-04-0.0734742024-0323.460131-0.013334-0.1950051.54770.2471270.0000850.0001180.0302060.022928
\n", "

580482 rows × 13 columns

\n", "
" ], "text/plain": [ " secID ret_date exret ym size rev \\\n", "0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 \n", "1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 \n", "2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 \n", "3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 \n", "4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 \n", "... ... ... ... ... ... ... \n", "580477 689009.XSHG 2023-12 -0.105996 2023-11 23.575535 0.007540 \n", "580478 689009.XSHG 2024-01 -0.214983 2023-12 23.465801 -0.105996 \n", "580479 689009.XSHG 2024-02 0.296451 2024-01 23.226170 -0.214983 \n", "580480 689009.XSHG 2024-03 -0.013334 2024-02 23.487149 0.296451 \n", "580481 689009.XSHG 2024-04 -0.073474 2024-03 23.460131 -0.013334 \n", "\n", " mom beta bm illiq illiq_12m vol ivol \n", "0 NaN 0.9468 0.094476 0.000025 NaN 0.026541 NaN \n", "1 NaN 0.9654 0.109513 0.000039 NaN 0.037722 0.012909 \n", "2 NaN 1.0292 0.110009 0.000064 NaN 0.041448 0.009032 \n", "3 NaN 1.0238 0.201102 0.000043 NaN 0.045109 0.021484 \n", "4 NaN 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098 \n", "... ... ... ... ... ... ... ... \n", "580477 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128 \n", "580478 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228 \n", "580479 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890 \n", "580480 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755 \n", "580481 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928 \n", "\n", "[580482 rows x 13 columns]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "actual-standard", "metadata": {}, "source": [ "- reversal 的 NA 是由于在对应的return date,上个月停牌所以没有上个月的return。\n", "- beta, bm 是优矿的NA。可以用当月的横截面上的中值填充\n", "- illiq, ivol, vol 也可用当月的横截面上的中值填充." ] }, { "cell_type": "code", "execution_count": 23, "id": "nuclear-chassis", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "secID 0\n", "ret_date 0\n", "exret 0\n", "ym 0\n", "size 0\n", "rev 4664\n", "mom 43434\n", "beta 22149\n", "bm 1463\n", "illiq 13508\n", "illiq_12m 79369\n", "vol 2874\n", "ivol 4699\n" ] } ], "source": [ "for col in df.columns:\n", " print(col, df[col].isna().sum())" ] }, { "cell_type": "code", "execution_count": 24, "id": "declared-blake", "metadata": {}, "outputs": [], "source": [ "# Reversal 的空值丢掉,其他的用 median 填充\n", "df = df[~df['rev'].isna()].copy()" ] }, { "cell_type": "code", "execution_count": 25, "id": "functional-finland", "metadata": {}, "outputs": [], "source": [ "cols = ['mom','beta','bm','illiq','illiq_12m','vol','ivol']" ] }, { "cell_type": "code", "execution_count": 26, "id": "personal-stylus", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
secIDret_dateexretymsizerevmombetabmilliqilliq_12mvolivol
0000001.XSHE2008-01-0.1402552007-1224.9090690.066834NaN0.94680.0944760.000025NaN0.026541NaN
1000001.XSHE2008-02-0.0074502008-0124.792329-0.140255NaN0.96540.1095130.000039NaN0.0377220.012909
2000001.XSHE2008-03-0.1520682008-0224.787814-0.007450NaN1.02920.1100090.000064NaN0.0414480.009032
3000001.XSHE2008-040.0474932008-0324.626093-0.152068NaN1.02380.2011020.000043NaN0.0451090.021484
4000001.XSHE2008-05-0.1511642008-0424.6752210.047493NaN1.02120.2067010.0000510.0000380.0463230.015098
..........................................
580477689009.XSHG2023-12-0.1059962023-1123.5755350.007540-0.0176750.95410.2218030.0000860.0001150.0175940.015128
580478689009.XSHG2024-01-0.2149832023-1223.465801-0.1059960.0856021.04480.2475250.0001100.0001070.0246340.018228
580479689009.XSHG2024-020.2964512024-0123.226170-0.214983-0.1063571.23140.3136070.0001840.0001160.0246070.013890
580480689009.XSHG2024-03-0.0133342024-0223.4871490.296451-0.2927271.49050.2415690.0001640.0001200.0442430.024755
580481689009.XSHG2024-04-0.0734742024-0323.460131-0.013334-0.1950051.54770.2471270.0000850.0001180.0302060.022928
\n", "

575818 rows × 13 columns

\n", "
" ], "text/plain": [ " secID ret_date exret ym size rev \\\n", "0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 \n", "1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 \n", "2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 \n", "3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 \n", "4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 \n", "... ... ... ... ... ... ... \n", "580477 689009.XSHG 2023-12 -0.105996 2023-11 23.575535 0.007540 \n", "580478 689009.XSHG 2024-01 -0.214983 2023-12 23.465801 -0.105996 \n", "580479 689009.XSHG 2024-02 0.296451 2024-01 23.226170 -0.214983 \n", "580480 689009.XSHG 2024-03 -0.013334 2024-02 23.487149 0.296451 \n", "580481 689009.XSHG 2024-04 -0.073474 2024-03 23.460131 -0.013334 \n", "\n", " mom beta bm illiq illiq_12m vol ivol \n", "0 NaN 0.9468 0.094476 0.000025 NaN 0.026541 NaN \n", "1 NaN 0.9654 0.109513 0.000039 NaN 0.037722 0.012909 \n", "2 NaN 1.0292 0.110009 0.000064 NaN 0.041448 0.009032 \n", "3 NaN 1.0238 0.201102 0.000043 NaN 0.045109 0.021484 \n", "4 NaN 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098 \n", "... ... ... ... ... ... ... ... \n", "580477 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128 \n", "580478 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228 \n", "580479 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890 \n", "580480 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755 \n", "580481 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928 \n", "\n", "[575818 rows x 13 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 27, "id": "competitive-cream", "metadata": {}, "outputs": [], "source": [ "temp = df.groupby('ret_date',as_index=False)[cols].transform(lambda x: x.fillna(x.median()))" ] }, { "cell_type": "code", "execution_count": 28, "id": "exceptional-efficiency", "metadata": {}, "outputs": [], "source": [ "temp.fillna(0, inplace=True)" ] }, { "cell_type": "code", "execution_count": 29, "id": "0c9634da-4035-411a-b071-ff9c07937033", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mombetabmilliqilliq_12mvolivol
00.7963050.94680.0944760.0000250.0005020.0265410.000000
11.1456390.96540.1095130.0000390.0004780.0377220.012909
20.6936901.02920.1100090.0000640.0004740.0414480.009032
30.5585751.02380.2011020.0000430.0005280.0451090.021484
4-0.0488741.02120.2067010.0000510.0000380.0463230.015098
........................
580477-0.0176750.95410.2218030.0000860.0001150.0175940.015128
5804780.0856021.04480.2475250.0001100.0001070.0246340.018228
580479-0.1063571.23140.3136070.0001840.0001160.0246070.013890
580480-0.2927271.49050.2415690.0001640.0001200.0442430.024755
580481-0.1950051.54770.2471270.0000850.0001180.0302060.022928
\n", "

575818 rows × 7 columns

\n", "
" ], "text/plain": [ " mom beta bm illiq illiq_12m vol ivol\n", "0 0.796305 0.9468 0.094476 0.000025 0.000502 0.026541 0.000000\n", "1 1.145639 0.9654 0.109513 0.000039 0.000478 0.037722 0.012909\n", "2 0.693690 1.0292 0.110009 0.000064 0.000474 0.041448 0.009032\n", "3 0.558575 1.0238 0.201102 0.000043 0.000528 0.045109 0.021484\n", "4 -0.048874 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098\n", "... ... ... ... ... ... ... ...\n", "580477 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128\n", "580478 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228\n", "580479 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890\n", "580480 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755\n", "580481 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928\n", "\n", "[575818 rows x 7 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp" ] }, { "cell_type": "code", "execution_count": 30, "id": "inclusive-singing", "metadata": {}, "outputs": [], "source": [ "df[cols] = temp.copy()" ] }, { "cell_type": "code", "execution_count": 31, "id": "valid-wilson", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "secID 0\n", "ret_date 0\n", "exret 0\n", "ym 0\n", "size 0\n", "rev 0\n", "mom 0\n", "beta 0\n", "bm 0\n", "illiq 0\n", "illiq_12m 0\n", "vol 0\n", "ivol 0\n" ] } ], "source": [ "for col in df.columns:\n", " print(col, df[col].isna().sum())" ] }, { "cell_type": "code", "execution_count": 32, "id": "caroline-simon", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
secIDret_dateexretymsizerevmombetabmilliqilliq_12mvolivol
0000001.XSHE2008-01-0.1402552007-1224.9090690.0668340.7963050.94680.0944760.0000250.0005020.0265410.000000
1000001.XSHE2008-02-0.0074502008-0124.792329-0.1402551.1456390.96540.1095130.0000390.0004780.0377220.012909
2000001.XSHE2008-03-0.1520682008-0224.787814-0.0074500.6936901.02920.1100090.0000640.0004740.0414480.009032
3000001.XSHE2008-040.0474932008-0324.626093-0.1520680.5585751.02380.2011020.0000430.0005280.0451090.021484
4000001.XSHE2008-05-0.1511642008-0424.6752210.047493-0.0488741.02120.2067010.0000510.0000380.0463230.015098
..........................................
580477689009.XSHG2023-12-0.1059962023-1123.5755350.007540-0.0176750.95410.2218030.0000860.0001150.0175940.015128
580478689009.XSHG2024-01-0.2149832023-1223.465801-0.1059960.0856021.04480.2475250.0001100.0001070.0246340.018228
580479689009.XSHG2024-020.2964512024-0123.226170-0.214983-0.1063571.23140.3136070.0001840.0001160.0246070.013890
580480689009.XSHG2024-03-0.0133342024-0223.4871490.296451-0.2927271.49050.2415690.0001640.0001200.0442430.024755
580481689009.XSHG2024-04-0.0734742024-0323.460131-0.013334-0.1950051.54770.2471270.0000850.0001180.0302060.022928
\n", "

575818 rows × 13 columns

\n", "
" ], "text/plain": [ " secID ret_date exret ym size rev \\\n", "0 000001.XSHE 2008-01 -0.140255 2007-12 24.909069 0.066834 \n", "1 000001.XSHE 2008-02 -0.007450 2008-01 24.792329 -0.140255 \n", "2 000001.XSHE 2008-03 -0.152068 2008-02 24.787814 -0.007450 \n", "3 000001.XSHE 2008-04 0.047493 2008-03 24.626093 -0.152068 \n", "4 000001.XSHE 2008-05 -0.151164 2008-04 24.675221 0.047493 \n", "... ... ... ... ... ... ... \n", "580477 689009.XSHG 2023-12 -0.105996 2023-11 23.575535 0.007540 \n", "580478 689009.XSHG 2024-01 -0.214983 2023-12 23.465801 -0.105996 \n", "580479 689009.XSHG 2024-02 0.296451 2024-01 23.226170 -0.214983 \n", "580480 689009.XSHG 2024-03 -0.013334 2024-02 23.487149 0.296451 \n", "580481 689009.XSHG 2024-04 -0.073474 2024-03 23.460131 -0.013334 \n", "\n", " mom beta bm illiq illiq_12m vol ivol \n", "0 0.796305 0.9468 0.094476 0.000025 0.000502 0.026541 0.000000 \n", "1 1.145639 0.9654 0.109513 0.000039 0.000478 0.037722 0.012909 \n", "2 0.693690 1.0292 0.110009 0.000064 0.000474 0.041448 0.009032 \n", "3 0.558575 1.0238 0.201102 0.000043 0.000528 0.045109 0.021484 \n", "4 -0.048874 1.0212 0.206701 0.000051 0.000038 0.046323 0.015098 \n", "... ... ... ... ... ... ... ... \n", "580477 -0.017675 0.9541 0.221803 0.000086 0.000115 0.017594 0.015128 \n", "580478 0.085602 1.0448 0.247525 0.000110 0.000107 0.024634 0.018228 \n", "580479 -0.106357 1.2314 0.313607 0.000184 0.000116 0.024607 0.013890 \n", "580480 -0.292727 1.4905 0.241569 0.000164 0.000120 0.044243 0.024755 \n", "580481 -0.195005 1.5477 0.247127 0.000085 0.000118 0.030206 0.022928 \n", "\n", "[575818 rows x 13 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 33, "id": "50c1b68b-dcd0-4e80-ae68-c0ea4869b994", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 575818 entries, 0 to 580481\n", "Data columns (total 13 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 secID 575818 non-null object \n", " 1 ret_date 575818 non-null period[M]\n", " 2 exret 575818 non-null float64 \n", " 3 ym 575818 non-null period[M]\n", " 4 size 575818 non-null float64 \n", " 5 rev 575818 non-null float64 \n", " 6 mom 575818 non-null float64 \n", " 7 beta 575818 non-null float64 \n", " 8 bm 575818 non-null float64 \n", " 9 illiq 575818 non-null float64 \n", " 10 illiq_12m 575818 non-null float64 \n", " 11 vol 575818 non-null float64 \n", " 12 ivol 575818 non-null float64 \n", "dtypes: float64(10), object(1), period[M](2)\n", "memory usage: 61.5+ MB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "markdown", "id": "latin-intranet", "metadata": {}, "source": [ "## Use rank instead of numerical values" ] }, { "cell_type": "markdown", "id": "metropolitan-tumor", "metadata": {}, "source": [ "$$c_{i,t} = \\frac{2}{N+1}CSrank(c^r_{i,t}) - 1$$\n", "\n", "$c^r_{i,t}$ is the original value, $CSrank$ ranks the value with other firms in the same month t" ] }, { "cell_type": "code", "execution_count": 34, "id": "intimate-cooperative", "metadata": {}, "outputs": [], "source": [ "def csrank(df):\n", " return df.rank() * 2 / (len(df) + 1) - 1" ] }, { "cell_type": "code", "execution_count": 35, "id": "precise-vegetation", "metadata": {}, "outputs": [], "source": [ "num_X_cols = df.select_dtypes('number').columns.drop('exret').tolist()" ] }, { "cell_type": "code", "execution_count": 36, "id": "allied-senior", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "num_X_cols" ] }, { "cell_type": "code", "execution_count": 37, "id": "de4d12c2-c7f3-4e1d-b903-6bb96b41683a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ret_datesizerevmombetabmilliqilliq_12mvolivol
02008-0124.9090690.0668340.7963050.94680.0944760.0000250.0005020.0265410.000000
12008-0224.792329-0.1402551.1456390.96540.1095130.0000390.0004780.0377220.012909
22008-0324.787814-0.0074500.6936901.02920.1100090.0000640.0004740.0414480.009032
32008-0424.626093-0.1520680.5585751.02380.2011020.0000430.0005280.0451090.021484
42008-0524.6752210.047493-0.0488741.02120.2067010.0000510.0000380.0463230.015098
.................................
5804772023-1223.5755350.007540-0.0176750.95410.2218030.0000860.0001150.0175940.015128
5804782024-0123.465801-0.1059960.0856021.04480.2475250.0001100.0001070.0246340.018228
5804792024-0223.226170-0.214983-0.1063571.23140.3136070.0001840.0001160.0246070.013890
5804802024-0323.4871490.296451-0.2927271.49050.2415690.0001640.0001200.0442430.024755
5804812024-0423.460131-0.013334-0.1950051.54770.2471270.0000850.0001180.0302060.022928
\n", "

575818 rows × 10 columns

\n", "
" ], "text/plain": [ " ret_date size rev mom beta bm illiq \\\n", "0 2008-01 24.909069 0.066834 0.796305 0.9468 0.094476 0.000025 \n", "1 2008-02 24.792329 -0.140255 1.145639 0.9654 0.109513 0.000039 \n", "2 2008-03 24.787814 -0.007450 0.693690 1.0292 0.110009 0.000064 \n", "3 2008-04 24.626093 -0.152068 0.558575 1.0238 0.201102 0.000043 \n", "4 2008-05 24.675221 0.047493 -0.048874 1.0212 0.206701 0.000051 \n", "... ... ... ... ... ... ... ... \n", "580477 2023-12 23.575535 0.007540 -0.017675 0.9541 0.221803 0.000086 \n", "580478 2024-01 23.465801 -0.105996 0.085602 1.0448 0.247525 0.000110 \n", "580479 2024-02 23.226170 -0.214983 -0.106357 1.2314 0.313607 0.000184 \n", "580480 2024-03 23.487149 0.296451 -0.292727 1.4905 0.241569 0.000164 \n", "580481 2024-04 23.460131 -0.013334 -0.195005 1.5477 0.247127 0.000085 \n", "\n", " illiq_12m vol ivol \n", "0 0.000502 0.026541 0.000000 \n", "1 0.000478 0.037722 0.012909 \n", "2 0.000474 0.041448 0.009032 \n", "3 0.000528 0.045109 0.021484 \n", "4 0.000038 0.046323 0.015098 \n", "... ... ... ... \n", "580477 0.000115 0.017594 0.015128 \n", "580478 0.000107 0.024634 0.018228 \n", "580479 0.000116 0.024607 0.013890 \n", "580480 0.000120 0.044243 0.024755 \n", "580481 0.000118 0.030206 0.022928 \n", "\n", "[575818 rows x 10 columns]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[['ret_date']+num_X_cols]" ] }, { "cell_type": "code", "execution_count": 38, "id": "d946c3ba-0b0c-42a1-a748-f8e861616a17", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ret_datesize
ret_date
2008-0100.00.969559
1960.00.990868
7010.00.522070
16320.00.678843
18280.0-0.231355
............
2024-045803220.0-0.224050
5803550.0-0.121426
5803940.00.845672
5804390.00.952213
5804810.00.705445
\n", "

575818 rows × 2 columns

\n", "
" ], "text/plain": [ " ret_date size\n", "ret_date \n", "2008-01 0 0.0 0.969559\n", " 196 0.0 0.990868\n", " 701 0.0 0.522070\n", " 1632 0.0 0.678843\n", " 1828 0.0 -0.231355\n", "... ... ...\n", "2024-04 580322 0.0 -0.224050\n", " 580355 0.0 -0.121426\n", " 580394 0.0 0.845672\n", " 580439 0.0 0.952213\n", " 580481 0.0 0.705445\n", "\n", "[575818 rows x 2 columns]" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[['ret_date','size']].groupby('ret_date',group_keys=True).apply(csrank)" ] }, { "cell_type": "code", "execution_count": 39, "id": "boring-humidity", "metadata": {}, "outputs": [], "source": [ "temp = df[['ret_date']+num_X_cols].groupby('ret_date',group_keys=True).apply(csrank)" ] }, { "cell_type": "code", "execution_count": 40, "id": "83220672-a4a4-41ba-ae01-0d6347c46e8e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ret_datesizerevmombetabmilliqilliq_12mvolivol
ret_date
2008-0100.00.969559-0.8584470.0000000.086758-0.672755-0.9786910.0000000.2754950.000000
1960.00.990868-0.9908680.8523590.662100-0.375951-0.996956-0.9893460.7458140.000000
7010.00.522070-0.9726030.5525110.5235920.283105-0.223744-0.5951290.8143070.000000
16320.00.678843-0.506849-0.5175040.775495-0.636225-0.698630-0.5114160.4931510.000000
18280.0-0.231355-0.9459670.7092850.000000-0.4033490.0000000.000000-0.9824960.000000
....................................
2024-045803220.0-0.2240500.7888760.8726990.518997-0.3168820.2299260.310223-0.0732470.164121
5803550.0-0.1214260.336859-0.9428120.849197-0.2107320.3811200.2287500.6235800.095574
5803940.00.845672-0.4813940.2792790.6862510.255778-0.156287-0.175872-0.578143-0.592244
5804390.00.952213-0.8699570.5628670.264787-0.171563-0.974540-0.984332-0.707795-0.609871
5804810.00.705445-0.3634940.0564040.515864-0.662554-0.621622-0.5742260.3615350.578143
\n", "

575818 rows × 10 columns

\n", "
" ], "text/plain": [ " ret_date size rev mom beta bm \\\n", "ret_date \n", "2008-01 0 0.0 0.969559 -0.858447 0.000000 0.086758 -0.672755 \n", " 196 0.0 0.990868 -0.990868 0.852359 0.662100 -0.375951 \n", " 701 0.0 0.522070 -0.972603 0.552511 0.523592 0.283105 \n", " 1632 0.0 0.678843 -0.506849 -0.517504 0.775495 -0.636225 \n", " 1828 0.0 -0.231355 -0.945967 0.709285 0.000000 -0.403349 \n", "... ... ... ... ... ... ... \n", "2024-04 580322 0.0 -0.224050 0.788876 0.872699 0.518997 -0.316882 \n", " 580355 0.0 -0.121426 0.336859 -0.942812 0.849197 -0.210732 \n", " 580394 0.0 0.845672 -0.481394 0.279279 0.686251 0.255778 \n", " 580439 0.0 0.952213 -0.869957 0.562867 0.264787 -0.171563 \n", " 580481 0.0 0.705445 -0.363494 0.056404 0.515864 -0.662554 \n", "\n", " illiq illiq_12m vol ivol \n", "ret_date \n", "2008-01 0 -0.978691 0.000000 0.275495 0.000000 \n", " 196 -0.996956 -0.989346 0.745814 0.000000 \n", " 701 -0.223744 -0.595129 0.814307 0.000000 \n", " 1632 -0.698630 -0.511416 0.493151 0.000000 \n", " 1828 0.000000 0.000000 -0.982496 0.000000 \n", "... ... ... ... ... \n", "2024-04 580322 0.229926 0.310223 -0.073247 0.164121 \n", " 580355 0.381120 0.228750 0.623580 0.095574 \n", " 580394 -0.156287 -0.175872 -0.578143 -0.592244 \n", " 580439 -0.974540 -0.984332 -0.707795 -0.609871 \n", " 580481 -0.621622 -0.574226 0.361535 0.578143 \n", "\n", "[575818 rows x 10 columns]" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp" ] }, { "cell_type": "code", "execution_count": 41, "id": "94072912-a6f9-4059-8ee7-285d3b87bb5d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ret_datelevel_1sizerevmombetabmilliqilliq_12mvolivol
02008-0100.969559-0.8584470.0000000.086758-0.672755-0.9786910.0000000.2754950.000000
12008-011960.990868-0.9908680.8523590.662100-0.375951-0.996956-0.9893460.7458140.000000
22008-017010.522070-0.9726030.5525110.5235920.283105-0.223744-0.5951290.8143070.000000
32008-0116320.678843-0.506849-0.5175040.775495-0.636225-0.698630-0.5114160.4931510.000000
42008-011828-0.231355-0.9459670.7092850.000000-0.4033490.0000000.000000-0.9824960.000000
....................................
5758132024-04580322-0.2240500.7888760.8726990.518997-0.3168820.2299260.310223-0.0732470.164121
5758142024-04580355-0.1214260.336859-0.9428120.849197-0.2107320.3811200.2287500.6235800.095574
5758152024-045803940.845672-0.4813940.2792790.6862510.255778-0.156287-0.175872-0.578143-0.592244
5758162024-045804390.952213-0.8699570.5628670.264787-0.171563-0.974540-0.984332-0.707795-0.609871
5758172024-045804810.705445-0.3634940.0564040.515864-0.662554-0.621622-0.5742260.3615350.578143
\n", "

575818 rows × 11 columns

\n", "
" ], "text/plain": [ " ret_date level_1 size rev mom beta bm \\\n", "0 2008-01 0 0.969559 -0.858447 0.000000 0.086758 -0.672755 \n", "1 2008-01 196 0.990868 -0.990868 0.852359 0.662100 -0.375951 \n", "2 2008-01 701 0.522070 -0.972603 0.552511 0.523592 0.283105 \n", "3 2008-01 1632 0.678843 -0.506849 -0.517504 0.775495 -0.636225 \n", "4 2008-01 1828 -0.231355 -0.945967 0.709285 0.000000 -0.403349 \n", "... ... ... ... ... ... ... ... \n", "575813 2024-04 580322 -0.224050 0.788876 0.872699 0.518997 -0.316882 \n", "575814 2024-04 580355 -0.121426 0.336859 -0.942812 0.849197 -0.210732 \n", "575815 2024-04 580394 0.845672 -0.481394 0.279279 0.686251 0.255778 \n", "575816 2024-04 580439 0.952213 -0.869957 0.562867 0.264787 -0.171563 \n", "575817 2024-04 580481 0.705445 -0.363494 0.056404 0.515864 -0.662554 \n", "\n", " illiq illiq_12m vol ivol \n", "0 -0.978691 0.000000 0.275495 0.000000 \n", "1 -0.996956 -0.989346 0.745814 0.000000 \n", "2 -0.223744 -0.595129 0.814307 0.000000 \n", "3 -0.698630 -0.511416 0.493151 0.000000 \n", "4 0.000000 0.000000 -0.982496 0.000000 \n", "... ... ... ... ... \n", "575813 0.229926 0.310223 -0.073247 0.164121 \n", "575814 0.381120 0.228750 0.623580 0.095574 \n", "575815 -0.156287 -0.175872 -0.578143 -0.592244 \n", "575816 -0.974540 -0.984332 -0.707795 -0.609871 \n", "575817 -0.621622 -0.574226 0.361535 0.578143 \n", "\n", "[575818 rows x 11 columns]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp.drop('ret_date',axis=1).reset_index()" ] }, { "cell_type": "code", "execution_count": 42, "id": "3a5e3ad0-f26e-41d8-bac9-2f3cb366de67", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ret_datesizerevmombetabmilliqilliq_12mvolivol
level_1
02008-010.969559-0.8584470.0000000.086758-0.672755-0.9786910.0000000.2754950.000000
1962008-010.990868-0.9908680.8523590.662100-0.375951-0.996956-0.9893460.7458140.000000
7012008-010.522070-0.9726030.5525110.5235920.283105-0.223744-0.5951290.8143070.000000
16322008-010.678843-0.506849-0.5175040.775495-0.636225-0.698630-0.5114160.4931510.000000
18282008-01-0.231355-0.9459670.7092850.000000-0.4033490.0000000.000000-0.9824960.000000
.................................
5803222024-04-0.2240500.7888760.8726990.518997-0.3168820.2299260.310223-0.0732470.164121
5803552024-04-0.1214260.336859-0.9428120.849197-0.2107320.3811200.2287500.6235800.095574
5803942024-040.845672-0.4813940.2792790.6862510.255778-0.156287-0.175872-0.578143-0.592244
5804392024-040.952213-0.8699570.5628670.264787-0.171563-0.974540-0.984332-0.707795-0.609871
5804812024-040.705445-0.3634940.0564040.515864-0.662554-0.621622-0.5742260.3615350.578143
\n", "

575818 rows × 10 columns

\n", "
" ], "text/plain": [ " ret_date size rev mom beta bm illiq \\\n", "level_1 \n", "0 2008-01 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 \n", "196 2008-01 0.990868 -0.990868 0.852359 0.662100 -0.375951 -0.996956 \n", "701 2008-01 0.522070 -0.972603 0.552511 0.523592 0.283105 -0.223744 \n", "1632 2008-01 0.678843 -0.506849 -0.517504 0.775495 -0.636225 -0.698630 \n", "1828 2008-01 -0.231355 -0.945967 0.709285 0.000000 -0.403349 0.000000 \n", "... ... ... ... ... ... ... ... \n", "580322 2024-04 -0.224050 0.788876 0.872699 0.518997 -0.316882 0.229926 \n", "580355 2024-04 -0.121426 0.336859 -0.942812 0.849197 -0.210732 0.381120 \n", "580394 2024-04 0.845672 -0.481394 0.279279 0.686251 0.255778 -0.156287 \n", "580439 2024-04 0.952213 -0.869957 0.562867 0.264787 -0.171563 -0.974540 \n", "580481 2024-04 0.705445 -0.363494 0.056404 0.515864 -0.662554 -0.621622 \n", "\n", " illiq_12m vol ivol \n", "level_1 \n", "0 0.000000 0.275495 0.000000 \n", "196 -0.989346 0.745814 0.000000 \n", "701 -0.595129 0.814307 0.000000 \n", "1632 -0.511416 0.493151 0.000000 \n", "1828 0.000000 -0.982496 0.000000 \n", "... ... ... ... \n", "580322 0.310223 -0.073247 0.164121 \n", "580355 0.228750 0.623580 0.095574 \n", "580394 -0.175872 -0.578143 -0.592244 \n", "580439 -0.984332 -0.707795 -0.609871 \n", "580481 -0.574226 0.361535 0.578143 \n", "\n", "[575818 rows x 10 columns]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp = temp.drop('ret_date',axis=1).reset_index().set_index('level_1')\n", "temp" ] }, { "cell_type": "code", "execution_count": 43, "id": "altered-contrary", "metadata": {}, "outputs": [], "source": [ "df_rank = pd.merge(df.drop(num_X_cols, axis=1),\n", " temp.drop('ret_date',axis=1),\n", " left_index=True, right_index=True)" ] }, { "cell_type": "code", "execution_count": 44, "id": "differential-israel", "metadata": {}, "outputs": [], "source": [ "del temp" ] }, { "cell_type": "code", "execution_count": 45, "id": "adaptive-customer", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
secIDret_dateexretymsizerevmombetabmilliqilliq_12mvolivol
0000001.XSHE2008-01-0.1402552007-120.969559-0.8584470.0000000.086758-0.672755-0.9786910.0000000.2754950.000000
1000001.XSHE2008-02-0.0074502008-010.971536-0.4516850.000000-0.170037-0.613483-0.9595510.000000-0.635955-0.791760
2000001.XSHE2008-03-0.1520682008-020.967335-0.7000740.0000000.345212-0.557535-0.9539720.0000000.437268-0.625835
3000001.XSHE2008-040.0474932008-030.9690270.4439530.0000000.048673-0.112094-0.9749260.0000000.2418880.306785
4000001.XSHE2008-05-0.1511642008-040.9645490.5450520.000000-0.264402-0.258493-0.970458-0.976366-0.704579-0.497784
..........................................
580477689009.XSHG2023-12-0.1059962023-110.719298-0.1862800.0455350.381234-0.645969-0.519022-0.5304550.1145280.413365
580478689009.XSHG2024-01-0.2149832023-120.691854-0.8414010.1688310.515545-0.574577-0.515152-0.5533250.5155450.622983
580479689009.XSHG2024-020.2964512024-010.674975-0.066928-0.2698720.369774-0.590579-0.492836-0.555643-0.3413150.285574
580480689009.XSHG2024-03-0.0133342024-020.7127740.947884-0.2535270.546042-0.709444-0.564655-0.563480-0.3032920.719436
580481689009.XSHG2024-04-0.0734742024-030.705445-0.3634940.0564040.515864-0.662554-0.621622-0.5742260.3615350.578143
\n", "

575818 rows × 13 columns

\n", "
" ], "text/plain": [ " secID ret_date exret ym size rev mom \\\n", "0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 \n", "1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 \n", "2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 \n", "3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 \n", "4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 \n", "... ... ... ... ... ... ... ... \n", "580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 \n", "580478 689009.XSHG 2024-01 -0.214983 2023-12 0.691854 -0.841401 0.168831 \n", "580479 689009.XSHG 2024-02 0.296451 2024-01 0.674975 -0.066928 -0.269872 \n", "580480 689009.XSHG 2024-03 -0.013334 2024-02 0.712774 0.947884 -0.253527 \n", "580481 689009.XSHG 2024-04 -0.073474 2024-03 0.705445 -0.363494 0.056404 \n", "\n", " beta bm illiq illiq_12m vol ivol \n", "0 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 \n", "1 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 \n", "2 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 \n", "3 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 \n", "4 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 \n", "... ... ... ... ... ... ... \n", "580477 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365 \n", "580478 0.515545 -0.574577 -0.515152 -0.553325 0.515545 0.622983 \n", "580479 0.369774 -0.590579 -0.492836 -0.555643 -0.341315 0.285574 \n", "580480 0.546042 -0.709444 -0.564655 -0.563480 -0.303292 0.719436 \n", "580481 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143 \n", "\n", "[575818 rows x 13 columns]" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rank" ] }, { "cell_type": "code", "execution_count": 46, "id": "e6380c15-5278-4a05-9ce9-59a29a28d1e4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 5.758180e+05\n", "mean 7.502544e-18\n", "std 5.771543e-01\n", "min -9.996083e-01\n", "25% -4.998116e-01\n", "50% 0.000000e+00\n", "75% 4.998176e-01\n", "max 9.996083e-01\n", "Name: size, dtype: float64" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rank['size'].describe()" ] }, { "cell_type": "markdown", "id": "irish-plant", "metadata": {}, "source": [ "# Train, Validation, Test split" ] }, { "cell_type": "code", "execution_count": 47, "id": "overhead-bullet", "metadata": {}, "outputs": [], "source": [ "df_rank['year'] = df_rank['ret_date'].dt.year" ] }, { "cell_type": "code", "execution_count": 48, "id": "forbidden-glass", "metadata": {}, "outputs": [], "source": [ "time_idx = [value for (key, value) in sorted(df_rank.groupby('year').groups.items())]" ] }, { "cell_type": "code", "execution_count": 49, "id": "posted-tackle", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "year\n", "2008 1463\n", "2009 1530\n", "2010 1841\n", "2011 2142\n", "2012 2383\n", "2013 2432\n", "2014 2549\n", "2015 2772\n", "2016 2941\n", "2017 3392\n", "2018 3522\n", "2019 3648\n", "2020 3961\n", "2021 4422\n", "2022 4770\n", "2023 5110\n", "2024 5111\n", "Name: secID, dtype: int64" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rank.groupby('year')['secID'].nunique()" ] }, { "cell_type": "code", "execution_count": 50, "id": "tribal-humanitarian", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "year\n", "2008 16621\n", "2009 17335\n", "2010 19823\n", "2011 23864\n", "2012 26912\n", "2013 28592\n", "2014 29539\n", "2015 31727\n", "2016 33468\n", "2017 37665\n", "2018 41103\n", "2019 41992\n", "2020 44134\n", "2021 49181\n", "2022 54418\n", "2023 59061\n", "2024 20383\n", "Name: secID, dtype: int64" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rank.groupby('year')['secID'].count()" ] }, { "cell_type": "code", "execution_count": 51, "id": "ordinary-twins", "metadata": {}, "outputs": [], "source": [ "def list_flat(list_):\n", " return [item for sublist in list_ for item in sublist]\n", "# This is the same as:\n", "# def list_flat2(list_):\n", "# result = []\n", "# for sublist in list_:\n", "# for item in sublist:\n", "# result.append(item)\n", "# return result" ] }, { "cell_type": "code", "execution_count": 52, "id": "micro-department", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[1, 2, 3, 3, 4, 5]" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list_flat([[1,2,3],[3,4,5]])" ] }, { "cell_type": "code", "execution_count": 53, "id": "9647949d-41d8-41ba-916a-82818b4c0d3b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 2, 3, 3, 4, 5])" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.array([[1,2,3],[3,4,5]]).flatten()" ] }, { "cell_type": "code", "execution_count": 54, "id": "543e2266-a55b-4023-9acf-58303998eff8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
secIDret_dateexretymsizerevmombetabmilliqilliq_12mvolivolyear
0000001.XSHE2008-01-0.1402552007-120.969559-0.8584470.0000000.086758-0.672755-0.9786910.0000000.2754950.0000002008
1000001.XSHE2008-02-0.0074502008-010.971536-0.4516850.000000-0.170037-0.613483-0.9595510.000000-0.635955-0.7917602008
2000001.XSHE2008-03-0.1520682008-020.967335-0.7000740.0000000.345212-0.557535-0.9539720.0000000.437268-0.6258352008
3000001.XSHE2008-040.0474932008-030.9690270.4439530.0000000.048673-0.112094-0.9749260.0000000.2418880.3067852008
4000001.XSHE2008-05-0.1511642008-040.9645490.5450520.000000-0.264402-0.258493-0.970458-0.976366-0.704579-0.4977842008
.............................................
580477689009.XSHG2023-12-0.1059962023-110.719298-0.1862800.0455350.381234-0.645969-0.519022-0.5304550.1145280.4133652023
580478689009.XSHG2024-01-0.2149832023-120.691854-0.8414010.1688310.515545-0.574577-0.515152-0.5533250.5155450.6229832024
580479689009.XSHG2024-020.2964512024-010.674975-0.066928-0.2698720.369774-0.590579-0.492836-0.555643-0.3413150.2855742024
580480689009.XSHG2024-03-0.0133342024-020.7127740.947884-0.2535270.546042-0.709444-0.564655-0.563480-0.3032920.7194362024
580481689009.XSHG2024-04-0.0734742024-030.705445-0.3634940.0564040.515864-0.662554-0.621622-0.5742260.3615350.5781432024
\n", "

575818 rows × 14 columns

\n", "
" ], "text/plain": [ " secID ret_date exret ym size rev mom \\\n", "0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 \n", "1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 \n", "2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 \n", "3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 \n", "4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 \n", "... ... ... ... ... ... ... ... \n", "580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 \n", "580478 689009.XSHG 2024-01 -0.214983 2023-12 0.691854 -0.841401 0.168831 \n", "580479 689009.XSHG 2024-02 0.296451 2024-01 0.674975 -0.066928 -0.269872 \n", "580480 689009.XSHG 2024-03 -0.013334 2024-02 0.712774 0.947884 -0.253527 \n", "580481 689009.XSHG 2024-04 -0.073474 2024-03 0.705445 -0.363494 0.056404 \n", "\n", " beta bm illiq illiq_12m vol ivol year \n", "0 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008 \n", "1 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008 \n", "2 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008 \n", "3 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008 \n", "4 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008 \n", "... ... ... ... ... ... ... ... \n", "580477 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365 2023 \n", "580478 0.515545 -0.574577 -0.515152 -0.553325 0.515545 0.622983 2024 \n", "580479 0.369774 -0.590579 -0.492836 -0.555643 -0.341315 0.285574 2024 \n", "580480 0.546042 -0.709444 -0.564655 -0.563480 -0.303292 0.719436 2024 \n", "580481 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143 2024 \n", "\n", "[575818 rows x 14 columns]" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rank" ] }, { "cell_type": "code", "execution_count": 55, "id": "crazy-pleasure", "metadata": {}, "outputs": [], "source": [ "# training, validation, testing scheme:\n", "# 1. [2008-2011], [2012-2015], [2016]\n", "# 2. [2008-2012], [2013-2016], [2017]\n", "# ...\n", "# last. [2009-2019], [2020-2023], [2024]\n", "fulltrain_idx = []\n", "cv_idx = []\n", "test_idx = []\n", "for i in range(4,len(time_idx)-4):\n", " train_idx = list_flat(time_idx[0:i])\n", " val_idx = list_flat(time_idx[i:i+4])\n", " fulltrain_idx.append(train_idx + val_idx)\n", " cv_idx.append((np.where(np.isin(fulltrain_idx[-1], train_idx))[0], \n", " np.where(np.isin(fulltrain_idx[-1], val_idx))[0])) # GridSearchCV 内部用 array 操作,不能带着pandas的index,\n", " # 因此cv_idx需要用fulltrain_idx的编号从0开始\n", " test_idx.append(time_idx[i+4])" ] }, { "cell_type": "code", "execution_count": 56, "id": "1cbf7150-82b5-4467-ba53-413e7ced32a8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
secIDret_dateexretymsizerevmombetabmilliqilliq_12mvolivolyear
0000001.XSHE2008-01-0.1402552007-120.969559-0.8584470.0000000.086758-0.672755-0.9786910.0000000.2754950.0000002008
1000001.XSHE2008-02-0.0074502008-010.971536-0.4516850.000000-0.170037-0.613483-0.9595510.000000-0.635955-0.7917602008
2000001.XSHE2008-03-0.1520682008-020.967335-0.7000740.0000000.345212-0.557535-0.9539720.0000000.437268-0.6258352008
3000001.XSHE2008-040.0474932008-030.9690270.4439530.0000000.048673-0.112094-0.9749260.0000000.2418880.3067852008
4000001.XSHE2008-05-0.1511642008-040.9645490.5450520.000000-0.264402-0.258493-0.970458-0.976366-0.704579-0.4977842008
.............................................
580473689009.XSHG2023-08-0.0409852023-070.701600-0.517200-0.7316000.584200-0.678400-0.577600-0.471200-0.330000-0.4736002023
580474689009.XSHG2023-090.0405982023-080.7089970.125000-0.7551750.343949-0.653264-0.599920-0.489650-0.0167200.2615452023
580475689009.XSHG2023-10-0.0604602023-090.7200550.669768-0.6495740.496535-0.684815-0.497129-0.520887-0.263512-0.5240552023
580476689009.XSHG2023-110.0075402023-100.716881-0.5024680.1380060.430207-0.665153-0.103653-0.522606-0.249358-0.0392892023
580477689009.XSHG2023-12-0.1059962023-110.719298-0.1862800.0455350.381234-0.645969-0.519022-0.5304550.1145280.4133652023
\n", "

555435 rows × 14 columns

\n", "
" ], "text/plain": [ " secID ret_date exret ym size rev mom \\\n", "0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 \n", "1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 \n", "2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 \n", "3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 \n", "4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 \n", "... ... ... ... ... ... ... ... \n", "580473 689009.XSHG 2023-08 -0.040985 2023-07 0.701600 -0.517200 -0.731600 \n", "580474 689009.XSHG 2023-09 0.040598 2023-08 0.708997 0.125000 -0.755175 \n", "580475 689009.XSHG 2023-10 -0.060460 2023-09 0.720055 0.669768 -0.649574 \n", "580476 689009.XSHG 2023-11 0.007540 2023-10 0.716881 -0.502468 0.138006 \n", "580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 \n", "\n", " beta bm illiq illiq_12m vol ivol year \n", "0 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008 \n", "1 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008 \n", "2 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008 \n", "3 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008 \n", "4 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008 \n", "... ... ... ... ... ... ... ... \n", "580473 0.584200 -0.678400 -0.577600 -0.471200 -0.330000 -0.473600 2023 \n", "580474 0.343949 -0.653264 -0.599920 -0.489650 -0.016720 0.261545 2023 \n", "580475 0.496535 -0.684815 -0.497129 -0.520887 -0.263512 -0.524055 2023 \n", "580476 0.430207 -0.665153 -0.103653 -0.522606 -0.249358 -0.039289 2023 \n", "580477 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365 2023 \n", "\n", "[555435 rows x 14 columns]" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rank.loc[fulltrain_idx[-1]]" ] }, { "cell_type": "code", "execution_count": 57, "id": "amino-endorsement", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 2, 4])" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Example\n", "a = [0,1,4,5,3000]\n", "np.where(np.isin(a, [0,3000,4]))[0]" ] }, { "cell_type": "code", "execution_count": 59, "id": "relative-circuit", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_years = list(range(2016, 2025))\n", "test_years" ] }, { "cell_type": "markdown", "id": "innocent-george", "metadata": { "tags": [] }, "source": [ "# Evaluation metrics" ] }, { "cell_type": "markdown", "id": "57e7f542-e6fe-4ef1-bda7-4dddebd2f789", "metadata": {}, "source": [ "Clark Watson, West (2007)" ] }, { "cell_type": "code", "execution_count": 60, "id": "alpha-steel", "metadata": {}, "outputs": [], "source": [ "def r2_oos(y_true, y_pred):\n", " return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)" ] }, { "cell_type": "code", "execution_count": 61, "id": "clinical-allergy", "metadata": {}, "outputs": [], "source": [ "r2_oos_scorer = make_scorer(r2_oos)" ] }, { "cell_type": "markdown", "id": "5ac4438e-4204-4812-b7f8-6d6c88b3f0dd", "metadata": { "tags": [] }, "source": [ "# Sklearn" ] }, { "cell_type": "markdown", "id": "204550c0-81d2-4ee3-bc36-c67f6f1d5b0f", "metadata": {}, "source": [ "Scikit-learn (sklearn) 的设计理念:\n", "- Estimators: 可以基于数据估计出参数的东西。用fit()估计。比如填充空值(imputer),linear regression,等等。\n", "- Transformers (不是神经网络里的那个,不是 Vaswani et al. (2017)): 可以把数据转换成新数据的东西。用transform()转换。一般可以直接用 fit_transform()\n", "- Predictors: 可以基于数据做预测,比如linear regression\n", "\n", "统一的命名规范:\n", "- hyperparameter可以由model.\\取出,比如model.n_estimators\n", "- estimated parameters可以由 model.\\取出,比如model.feature_importances_\n", "\n", "数据用np.array保存,或者SciPy的稀疏矩阵。避免各类其他包的自定义(比如pandas)\n", "\n", "给出了大量的机器学习模型,同时很容易自定义进行拓展。自定义的模型可以很方便的融入到sklearn自带的模型当中" ] }, { "cell_type": "markdown", "id": "periodic-harrison", "metadata": {}, "source": [ "# Models" ] }, { "cell_type": "markdown", "id": "preceding-humanitarian", "metadata": {}, "source": [ "## Linear regression" ] }, { "cell_type": "code", "execution_count": 62, "id": "spiritual-university", "metadata": {}, "outputs": [], "source": [ "cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']" ] }, { "cell_type": "code", "execution_count": 63, "id": "uniform-ebony", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cols" ] }, { "cell_type": "code", "execution_count": 64, "id": "cultural-blocking", "metadata": {}, "outputs": [], "source": [ "model = LinearRegression()" ] }, { "cell_type": "code", "execution_count": 65, "id": "breeding-announcement", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test year 2016 : -0.009411576536744626\n", "Test year 2017 : -0.08839433918218265\n", "Test year 2018 : -0.04979412952068807\n", "Test year 2019 : 0.006460501095753468\n", "Test year 2020 : -0.001548658826626248\n", "Test year 2021 : 0.011487385386933058\n", "Test year 2022 : -0.0009344902234940111\n", "Test year 2023 : 0.009191975684269216\n", "Test year 2024 : -0.0203421002634514\n" ] } ], "source": [ "for i in range(len(fulltrain_idx)):\n", " X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]\n", " y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']\n", " X_test = df_rank.loc[test_idx[i], cols]\n", " y_test = df_rank.loc[test_idx[i], 'exret']\n", " \n", " model.fit(X=X_fulltrain, y=y_fulltrain)\n", " y_pred = model.predict(X=X_test)\n", " \n", " print(\"Test year\", test_years[i],\":\",r2_oos(y_true=y_test, y_pred=y_pred))" ] }, { "cell_type": "code", "execution_count": 63, "id": "superb-standard", "metadata": {}, "outputs": [], "source": [ "cols = ['size','rev','illiq','ivol']" ] }, { "cell_type": "code", "execution_count": 66, "id": "raising-denver", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test year 2016 : -0.009411576536744626\n", "Test year 2017 : -0.08839433918218265\n", "Test year 2018 : -0.04979412952068807\n", "Test year 2019 : 0.006460501095753468\n", "Test year 2020 : -0.001548658826626248\n", "Test year 2021 : 0.011487385386933058\n", "Test year 2022 : -0.0009344902234940111\n", "Test year 2023 : 0.009191975684269216\n", "Test year 2024 : -0.0203421002634514\n" ] } ], "source": [ "for i in range(len(fulltrain_idx)):\n", " X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]\n", " y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']\n", " X_test = df_rank.loc[test_idx[i], cols]\n", " y_test = df_rank.loc[test_idx[i], 'exret']\n", " \n", " model.fit(X=X_fulltrain, y=y_fulltrain)\n", " y_pred = model.predict(X=X_test)\n", " \n", " print(\"Test year\", test_years[i],\":\",r2_oos(y_true=y_test, y_pred=y_pred))" ] }, { "cell_type": "markdown", "id": "prescription-campus", "metadata": {}, "source": [ "## Huber regressor" ] }, { "cell_type": "code", "execution_count": 67, "id": "bright-wound", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'ivol']" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cols = [col for col in num_X_cols if col != 'illiq_12m' and col!='vol']\n", "cols" ] }, { "cell_type": "code", "execution_count": 68, "id": "cheap-paste", "metadata": {}, "outputs": [], "source": [ "model = HuberRegressor(alpha=0.01,epsilon=1.05)" ] }, { "cell_type": "code", "execution_count": 69, "id": "printable-farming", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test year 2016 : 0.006876742676993675\n", "Test year 2017 : -0.029175894616541687\n", "Test year 2018 : 0.009048190975758708\n", "Test year 2019 : -0.018440250950227055\n", "Test year 2020 : -0.013739947817012599\n", "Test year 2021 : -0.008465853786439048\n", "Test year 2022 : 0.010892561021420222\n", "Test year 2023 : -0.002345351762501169\n", "Test year 2024 : 0.015101267852206668\n" ] } ], "source": [ "for i in range(len(fulltrain_idx)):\n", " X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]\n", " y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']\n", " X_test = df_rank.loc[test_idx[i], cols]\n", " y_test = df_rank.loc[test_idx[i], 'exret']\n", " \n", " model.fit(X=X_fulltrain, y=y_fulltrain)\n", " y_pred = model.predict(X=X_test)\n", " \n", " print(\"Test year\", test_years[i],\":\",r2_oos(y_true=y_test, y_pred=y_pred))" ] }, { "cell_type": "markdown", "id": "actual-brunswick", "metadata": { "tags": [] }, "source": [ "## Random Forest" ] }, { "cell_type": "code", "execution_count": 70, "id": "elder-cheese", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cols = num_X_cols\n", "cols" ] }, { "cell_type": "code", "execution_count": 71, "id": "liable-corps", "metadata": {}, "outputs": [], "source": [ "hyperparam_grid = [\n", " {'n_estimators': [50], 'max_depth': [3,5,7], \n", " 'max_features': [3,5]}\n", "]" ] }, { "cell_type": "code", "execution_count": 72, "id": "tight-helena", "metadata": {}, "outputs": [], "source": [ "model = RandomForestRegressor(random_state=42)" ] }, { "cell_type": "code", "execution_count": 73, "id": "historic-chorus", "metadata": {}, "outputs": [], "source": [ "# Cross validation for period 0, i.e.\n", "# train: [2008-2011], val: [2012-2015], test: [2016]\n", "grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], \n", " scoring=r2_oos_scorer,\n", " return_train_score=True)" ] }, { "cell_type": "code", "execution_count": 74, "id": "chinese-terry", "metadata": {}, "outputs": [], "source": [ "X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]\n", "y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']\n", "X_test = df_rank.loc[test_idx[0], cols]\n", "y_test = df_rank.loc[test_idx[0], 'exret']" ] }, { "cell_type": "code", "execution_count": 76, "id": "wanted-aruba", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 48.4 s, sys: 456 ms, total: 48.9 s\n", "Wall time: 50.8 s\n" ] }, { "data": { "text/plain": [ "GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]),\n", " array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))],\n", " estimator=RandomForestRegressor(random_state=42),\n", " param_grid=[{'max_depth': [3, 5, 7], 'max_features': [3, 5],\n", " 'n_estimators': [50]}],\n", " return_train_score=True, scoring=make_scorer(r2_oos))" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "grid_search.fit(X_fulltrain, y_fulltrain)" ] }, { "cell_type": "code", "execution_count": 77, "id": "special-translation", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'max_depth': 5, 'max_features': 5, 'n_estimators': 50}" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_search.best_params_" ] }, { "cell_type": "code", "execution_count": 78, "id": "after-construction", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.027084789556580172 {'max_depth': 3, 'max_features': 3, 'n_estimators': 50}\n", "0.028556999189124684 {'max_depth': 3, 'max_features': 5, 'n_estimators': 50}\n", "0.05042647627227969 {'max_depth': 5, 'max_features': 3, 'n_estimators': 50}\n", "0.05530932712278905 {'max_depth': 5, 'max_features': 5, 'n_estimators': 50}\n", "0.04464716807967209 {'max_depth': 7, 'max_features': 3, 'n_estimators': 50}\n", "0.05370787731186078 {'max_depth': 7, 'max_features': 5, 'n_estimators': 50}\n" ] } ], "source": [ "cv_results = grid_search.cv_results_\n", "for mean_score, params in zip(cv_results['mean_test_score'],\n", " cv_results['params']):\n", " print(np.sqrt(mean_score), params)" ] }, { "cell_type": "code", "execution_count": 79, "id": "tired-abraham", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
featuresfeature_importance
5illiq0.260451
1rev0.212757
7vol0.120819
8ivol0.115962
0size0.102099
2mom0.067074
4bm0.049063
6illiq_12m0.044348
3beta0.027427
\n", "
" ], "text/plain": [ " features feature_importance\n", "5 illiq 0.260451\n", "1 rev 0.212757\n", "7 vol 0.120819\n", "8 ivol 0.115962\n", "0 size 0.102099\n", "2 mom 0.067074\n", "4 bm 0.049063\n", "6 illiq_12m 0.044348\n", "3 beta 0.027427" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame({\"features\":num_X_cols,\"feature_importance\":grid_search.best_estimator_.feature_importances_}).sort_values('feature_importance',\n", " ascending=False)" ] }, { "cell_type": "code", "execution_count": 80, "id": "polished-sequence", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-0.022154653336818875" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred = grid_search.predict(X_test)\n", "r2_oos(y_true=y_test, y_pred=y_pred)" ] }, { "cell_type": "code", "execution_count": 81, "id": "satisfactory-bonus", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test year 2016 : -0.022154653336818875\n", "Test year 2017 : -0.08063812247577906\n", "Test year 2018 : -0.03875517775769799\n", "Test year 2019 : 0.00843188559327912\n", "Test year 2020 : 0.00426710793978613\n", "Test year 2021 : 0.011844755800809903\n", "Test year 2022 : -0.002779929058047914\n", "Test year 2023 : 0.009093821371165545\n", "Test year 2024 : -0.017035974030240375\n", "CPU times: user 18min 26s, sys: 10.6 s, total: 18min 37s\n", "Wall time: 19min 19s\n" ] } ], "source": [ "%%time\n", "for i in range(len(fulltrain_idx)):\n", " X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]\n", " y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']\n", " X_test = df_rank.loc[test_idx[i], cols]\n", " y_test = df_rank.loc[test_idx[i], 'exret']\n", " \n", " grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], \n", " scoring=r2_oos_scorer,\n", " return_train_score=True)\n", " grid_search.fit(X_fulltrain, y_fulltrain)\n", " y_pred = grid_search.predict(X=X_test)\n", " \n", " print(\"Test year\", test_years[i],\":\",r2_oos(y_true=y_test, y_pred=y_pred))" ] }, { "cell_type": "markdown", "id": "homeless-civilian", "metadata": {}, "source": [ "## Partial Least Squares" ] }, { "cell_type": "code", "execution_count": 82, "id": "trying-assistant", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cols = num_X_cols\n", "cols" ] }, { "cell_type": "code", "execution_count": 83, "id": "cultural-routine", "metadata": {}, "outputs": [], "source": [ "model = PLSRegression(n_components=4)" ] }, { "cell_type": "code", "execution_count": 84, "id": "rising-transcription", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(20383,)" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred.reshape(-1).shape" ] }, { "cell_type": "code", "execution_count": 85, "id": "greatest-falls", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test year 2016 : -0.008579624491225069\n", "Test year 2017 : -0.0932107045146382\n", "Test year 2018 : -0.04912393839310547\n", "Test year 2019 : 0.006105302409689872\n", "Test year 2020 : -0.0015629368636205232\n", "Test year 2021 : 0.011195966741407215\n", "Test year 2022 : -0.000636538680146348\n", "Test year 2023 : 0.009706227073746132\n", "Test year 2024 : -0.02135918156321326\n", "CPU times: user 10.3 s, sys: 783 ms, total: 11.1 s\n", "Wall time: 3.4 s\n" ] } ], "source": [ "%%time\n", "for i in range(len(fulltrain_idx)):\n", " X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]\n", " y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']\n", " X_test = df_rank.loc[test_idx[i], cols]\n", " y_test = df_rank.loc[test_idx[i], 'exret']\n", " \n", " model.fit(X_fulltrain, y_fulltrain)\n", " y_pred = model.predict(X=X_test)\n", " y_pred = y_pred.reshape(-1)\n", " print(\"Test year\", test_years[i],\":\",r2_oos(y_true=y_test, y_pred=y_pred))" ] }, { "cell_type": "markdown", "id": "enabling-worker", "metadata": {}, "source": [ "## Principal Component Regression" ] }, { "cell_type": "markdown", "id": "derived-insurance", "metadata": {}, "source": [ "### PCA transform" ] }, { "cell_type": "code", "execution_count": 86, "id": "favorite-display", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['size', 'rev', 'mom', 'beta', 'bm', 'illiq', 'illiq_12m', 'vol', 'ivol']" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cols = num_X_cols\n", "cols" ] }, { "cell_type": "code", "execution_count": 87, "id": "limited-february", "metadata": {}, "outputs": [], "source": [ "X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]\n", "y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']\n", "X_test = df_rank.loc[test_idx[0],cols]\n", "y_test = df_rank.loc[test_idx[0],'exret']" ] }, { "cell_type": "code", "execution_count": 88, "id": "reliable-gabriel", "metadata": {}, "outputs": [], "source": [ "pca = PCA(3, random_state=42)" ] }, { "cell_type": "code", "execution_count": 89, "id": "advisory-table", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "PCA(n_components=3, random_state=42)" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca.fit(X_fulltrain)" ] }, { "cell_type": "code", "execution_count": 90, "id": "removed-swiss", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0.54095879, -0.09832092, -0.01437701, 0.13052989, 0.10593014,\n", " -0.54353902, -0.55735791, -0.16921922, -0.18577673],\n", " [ 0.1230896 , 0.28038463, 0.28945179, 0.04301217, -0.37797118,\n", " -0.22714271, -0.12955155, 0.5400808 , 0.56362587],\n", " [ 0.06274676, -0.20182262, 0.52534126, -0.60948758, -0.43840652,\n", " -0.02333879, 0.01991072, -0.30854347, -0.13975493]])" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca.components_" ] }, { "cell_type": "code", "execution_count": 91, "id": "committed-costa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3, 9)" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca.components_.shape" ] }, { "cell_type": "code", "execution_count": 92, "id": "equivalent-equipment", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(194413, 9)" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_fulltrain.shape" ] }, { "cell_type": "code", "execution_count": 93, "id": "annoying-cincinnati", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(9, 3)" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca.components_.T.shape" ] }, { "cell_type": "code", "execution_count": 94, "id": "underlying-reducing", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 1.03429184, 0.50775265, 0.41399241],\n", " [ 1.25904914, -0.35426324, 0.85397871],\n", " [ 1.13891321, 0.24847025, 0.21082479],\n", " ...,\n", " [-1.01192038, 0.71256279, -0.06717374],\n", " [-0.74623422, -0.52959361, 0.40740754],\n", " [-1.76547939, 0.46317872, 0.27584235]])" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.matmul(X_fulltrain.values,pca.components_.T)" ] }, { "cell_type": "code", "execution_count": 95, "id": "aerial-mills", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 1.03429184, 0.50775265, 0.41399241],\n", " [ 1.25904914, -0.35426324, 0.85397871],\n", " [ 1.13891321, 0.24847025, 0.21082479],\n", " ...,\n", " [-1.01192038, 0.71256279, -0.06717374],\n", " [-0.74623422, -0.52959361, 0.40740754],\n", " [-1.76547939, 0.46317872, 0.27584235]])" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca.fit_transform(X_fulltrain)" ] }, { "cell_type": "markdown", "id": "connected-awareness", "metadata": {}, "source": [ "### PCA regression" ] }, { "cell_type": "markdown", "id": "cubic-drilling", "metadata": {}, "source": [ "sklearn 是 duck typing,因此无需继承,只需在定义类的时候包括对应的方法,`fit()`(return self),`transform()`,`fit_transform()`即可。\n", "\n", "但直接用继承,可以更方便。\n", "- `BaseEstimator`是sklearn里最基本的类,其他的类都从这个类继承而来,包括了`set_params()`和`get_params()`的方法。\n", "- `TransformerMixin`包括了`fit_transform()`方法。因此由这个类继承而来的话,就不用自定义 `fit_transform` 了\n", "- 类似的,`RegressorMixin`包括了`predict()`方法" ] }, { "cell_type": "code", "execution_count": 96, "id": "suitable-shareware", "metadata": {}, "outputs": [], "source": [ "class PCARegressor(BaseEstimator, RegressorMixin):\n", " def __init__(self, n_components=3):\n", " self.n_components = n_components\n", " \n", " def fit(self, X, y):\n", " self.pca_ = PCA(n_components=self.n_components).fit(X)\n", " self.X_ = self.pca_.transform(X)\n", " self.reg_ = LinearRegression().fit(self.X_,y)\n", " return self\n", " \n", " def predict(self, X):\n", " self.pred_ = self.reg_.predict(self.pca_.transform(X))\n", " return self.pred_" ] }, { "cell_type": "code", "execution_count": 97, "id": "proprietary-morrison", "metadata": {}, "outputs": [], "source": [ "model = PCARegressor()" ] }, { "cell_type": "code", "execution_count": 98, "id": "passing-jackson", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "PCARegressor()" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(X=X_fulltrain, y=y_fulltrain)" ] }, { "cell_type": "code", "execution_count": 99, "id": "communist-burton", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 1.03429184, 0.50775265, 0.41399241],\n", " [ 1.25904914, -0.35426324, 0.85397871],\n", " [ 1.13891321, 0.24847025, 0.21082479],\n", " ...,\n", " [-1.01192038, 0.71256279, -0.06717374],\n", " [-0.74623422, -0.52959361, 0.40740754],\n", " [-1.76547939, 0.46317872, 0.27584235]])" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.X_" ] }, { "cell_type": "code", "execution_count": 100, "id": "passing-ecuador", "metadata": {}, "outputs": [], "source": [ "hyperparam_grid = [\n", " {'n_components': range(1, len(cols)+1)}\n", "]" ] }, { "cell_type": "code", "execution_count": 101, "id": "duplicate-cambridge", "metadata": {}, "outputs": [], "source": [ "grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], \n", " scoring=r2_oos_scorer,\n", " return_train_score=True)" ] }, { "cell_type": "code", "execution_count": 102, "id": "norman-backup", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]),\n", " array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))],\n", " estimator=PCARegressor(),\n", " param_grid=[{'n_components': range(1, 10)}],\n", " return_train_score=True, scoring=make_scorer(r2_oos))" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_search.fit(X=X_fulltrain, y=y_fulltrain)" ] }, { "cell_type": "code", "execution_count": 103, "id": "egyptian-bathroom", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'n_components': 6}" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_search.best_params_" ] }, { "cell_type": "code", "execution_count": 104, "id": "vocal-enforcement", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "nan {'n_components': 1}\n", "0.03874473370176844 {'n_components': 2}\n", "0.037587746458808476 {'n_components': 3}\n", "0.04997889197284674 {'n_components': 4}\n", "0.04997497095219836 {'n_components': 5}\n", "0.05161556457239314 {'n_components': 6}\n", "0.041827129620731866 {'n_components': 7}\n", "0.050300834943468646 {'n_components': 8}\n", "0.0487322179924835 {'n_components': 9}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt\n", " print(np.sqrt(mean_score), params)\n" ] } ], "source": [ "cv_results = grid_search.cv_results_\n", "for mean_score, params in zip(cv_results['mean_test_score'],\n", " cv_results['params']):\n", " print(np.sqrt(mean_score), params)" ] }, { "cell_type": "code", "execution_count": 105, "id": "fixed-washington", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-0.01057358710785472" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred = grid_search.predict(X_test)\n", "r2_oos(y_true=y_test,y_pred=y_pred)" ] }, { "cell_type": "code", "execution_count": 106, "id": "dietary-static", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test year 2016 : -0.01057358710785472\n", "Test year 2017 : -0.08914596959361498\n", "Test year 2018 : -0.04897289891375012\n", "Test year 2019 : 0.006451234792761107\n", "Test year 2020 : -0.00048657810249452815\n", "Test year 2021 : 0.0095202141605224\n", "Test year 2022 : -0.0061009049213132105\n", "Test year 2023 : 0.008028420859995777\n", "Test year 2024 : -0.022204856045419552\n", "CPU times: user 1min 41s, sys: 9.21 s, total: 1min 50s\n", "Wall time: 29.1 s\n" ] } ], "source": [ "%%time\n", "for i in range(len(fulltrain_idx)):\n", " X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]\n", " y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']\n", " X_test = df_rank.loc[test_idx[i], cols]\n", " y_test = df_rank.loc[test_idx[i], 'exret']\n", " \n", " grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], \n", " scoring=r2_oos_scorer,\n", " return_train_score=True)\n", " grid_search.fit(X_fulltrain, y_fulltrain)\n", " y_pred = grid_search.predict(X=X_test)\n", " y_pred = y_pred.reshape(-1)\n", " print(\"Test year\", test_years[i],\":\",r2_oos(y_true=y_test, y_pred=y_pred))" ] }, { "cell_type": "markdown", "id": "rocky-portfolio", "metadata": { "tags": [] }, "source": [ "## Pipeline" ] }, { "cell_type": "code", "execution_count": 107, "id": "provincial-bridal", "metadata": {}, "outputs": [], "source": [ "pca = PCA()\n", "linear_reg = LinearRegression()\n", "pipeline = Pipeline(steps=[('pca',pca),\n", " ('linear_regression', linear_reg)])\n", "hyperparam_grid = {'pca__n_components': range(1,len(cols)+1)}\n", "grid_search = GridSearchCV(pipeline, hyperparam_grid, cv=[cv_idx[0]],\n", " scoring=r2_oos_scorer,\n", " return_train_score=True)" ] }, { "cell_type": "code", "execution_count": 108, "id": "likely-moscow", "metadata": {}, "outputs": [], "source": [ "X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]\n", "y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']\n", "X_test = df_rank.loc[test_idx[0],cols]\n", "y_test = df_rank.loc[test_idx[0],'exret']" ] }, { "cell_type": "code", "execution_count": 109, "id": "union-jamaica", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 4.1 s, sys: 430 ms, total: 4.53 s\n", "Wall time: 1.54 s\n" ] }, { "data": { "text/plain": [ "GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]),\n", " array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))],\n", " estimator=Pipeline(steps=[('pca', PCA()),\n", " ('linear_regression',\n", " LinearRegression())]),\n", " param_grid={'pca__n_components': range(1, 10)},\n", " return_train_score=True, scoring=make_scorer(r2_oos))" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "grid_search.fit(X=X_fulltrain,y=y_fulltrain)" ] }, { "cell_type": "code", "execution_count": 110, "id": "beginning-compromise", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'pca__n_components': 6}" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_search.best_params_" ] }, { "cell_type": "code", "execution_count": 111, "id": "written-treaty", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "nan {'pca__n_components': 1}\n", "0.03874473370176844 {'pca__n_components': 2}\n", "0.037587746458808476 {'pca__n_components': 3}\n", "0.04997889197284674 {'pca__n_components': 4}\n", "0.04997497095219836 {'pca__n_components': 5}\n", "0.05161556457239422 {'pca__n_components': 6}\n", "0.041827129620731866 {'pca__n_components': 7}\n", "0.050300834943468646 {'pca__n_components': 8}\n", "0.0487322179924835 {'pca__n_components': 9}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/115342728.py:4: RuntimeWarning: invalid value encountered in sqrt\n", " print(np.sqrt(mean_score), params)\n" ] } ], "source": [ "cv_results = grid_search.cv_results_\n", "for mean_score, params in zip(cv_results['mean_test_score'],\n", " cv_results['params']):\n", " print(np.sqrt(mean_score), params)" ] }, { "cell_type": "code", "execution_count": 112, "id": "impressed-guide", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-0.01057358710785472" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred = grid_search.predict(X_test)\n", "r2_oos(y_true=y_test, y_pred=y_pred)" ] }, { "cell_type": "markdown", "id": "adverse-settlement", "metadata": {}, "source": [ "## Elastic Net" ] }, { "cell_type": "code", "execution_count": 113, "id": "rapid-matrix", "metadata": {}, "outputs": [], "source": [ "X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]\n", "y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']\n", "X_test = df_rank.loc[test_idx[0],cols]\n", "y_test = df_rank.loc[test_idx[0],'exret']" ] }, { "cell_type": "code", "execution_count": 114, "id": "conservative-synthesis", "metadata": {}, "outputs": [], "source": [ "model = SGDRegressor(penalty='elasticnet')" ] }, { "cell_type": "code", "execution_count": 115, "id": "retained-values", "metadata": {}, "outputs": [], "source": [ "hyperparam_grid = [{'alpha':[0.001, 0.01, 0.1],\n", " 'l1_ratio':[0.15, 0.30, 0.5, 0.7]}]" ] }, { "cell_type": "code", "execution_count": 116, "id": "hairy-water", "metadata": {}, "outputs": [], "source": [ "grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], \n", " scoring=r2_oos_scorer,\n", " return_train_score=True)" ] }, { "cell_type": "code", "execution_count": 117, "id": "brutal-alarm", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]),\n", " array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))],\n", " estimator=SGDRegressor(penalty='elasticnet'),\n", " param_grid=[{'alpha': [0.001, 0.01, 0.1],\n", " 'l1_ratio': [0.15, 0.3, 0.5, 0.7]}],\n", " return_train_score=True, scoring=make_scorer(r2_oos))" ] }, "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_search.fit(X=X_fulltrain, y=y_fulltrain)" ] }, { "cell_type": "code", "execution_count": 118, "id": "integral-stability", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'alpha': 0.001, 'l1_ratio': 0.7}" ] }, "execution_count": 118, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_search.best_params_" ] }, { "cell_type": "code", "execution_count": 119, "id": "chubby-replica", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-0.0110463976016415" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred = grid_search.predict(X_test)\n", "r2_oos(y_true=y_test, y_pred=y_pred)" ] }, { "cell_type": "code", "execution_count": 120, "id": "impaired-meditation", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Test year 2016 : -0.012516057005940606\n", "Test year 2017 : -0.10558980314809396\n", "Test year 2018 : -0.038706726553651816\n", "Test year 2019 : 0.008269590365803325\n", "Test year 2020 : 0.006087524412165979\n", "Test year 2021 : 0.009994658939758372\n", "Test year 2022 : -0.005719992513779193\n", "Test year 2023 : 0.010637533125030907\n", "Test year 2024 : -0.013783028576732859\n", "CPU times: user 1min 41s, sys: 3.04 s, total: 1min 44s\n", "Wall time: 43.1 s\n" ] } ], "source": [ "%%time\n", "for i in range(len(fulltrain_idx)):\n", " X_fulltrain = df_rank.loc[fulltrain_idx[i], cols]\n", " y_fulltrain = df_rank.loc[fulltrain_idx[i], 'exret']\n", " X_test = df_rank.loc[test_idx[i], cols]\n", " y_test = df_rank.loc[test_idx[i], 'exret']\n", " \n", " grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[i]], \n", " scoring=r2_oos_scorer,\n", " return_train_score=True)\n", " grid_search.fit(X_fulltrain, y_fulltrain)\n", " y_pred = grid_search.predict(X=X_test)\n", " y_pred = y_pred.reshape(-1)\n", " print(\"Test year\", test_years[i],\":\",r2_oos(y_true=y_test, y_pred=y_pred))" ] }, { "cell_type": "markdown", "id": "friendly-plymouth", "metadata": {}, "source": [ "## Gradient Boosted Regression Trees" ] }, { "cell_type": "code", "execution_count": 121, "id": "thousand-object", "metadata": {}, "outputs": [], "source": [ "X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]\n", "y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']\n", "X_test = df_rank.loc[test_idx[0],cols]\n", "y_test = df_rank.loc[test_idx[0],'exret']" ] }, { "cell_type": "code", "execution_count": 122, "id": "phantom-dance", "metadata": {}, "outputs": [], "source": [ "hyperparam_grid = [\n", " {'max_depth': [1,2,3,4,5,6], \n", " 'learning_rate': [0.1, 0.05, 0.01]}\n", "]" ] }, { "cell_type": "code", "execution_count": 123, "id": "aboriginal-healthcare", "metadata": {}, "outputs": [], "source": [ "model = GradientBoostingRegressor()" ] }, { "cell_type": "code", "execution_count": 124, "id": "unusual-division", "metadata": {}, "outputs": [], "source": [ "grid_search = GridSearchCV(model, hyperparam_grid, cv=[cv_idx[0]], \n", " scoring=r2_oos_scorer,\n", " return_train_score=True)" ] }, { "cell_type": "code", "execution_count": 125, "id": "according-elizabeth", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 9min 37s, sys: 11 s, total: 9min 48s\n", "Wall time: 10min 36s\n" ] }, { "data": { "text/plain": [ "GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]),\n", " array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))],\n", " estimator=GradientBoostingRegressor(),\n", " param_grid=[{'learning_rate': [0.1, 0.05, 0.01],\n", " 'max_depth': [1, 2, 3, 4, 5, 6]}],\n", " return_train_score=True, scoring=make_scorer(r2_oos))" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "grid_search.fit(X=X_fulltrain, y=y_fulltrain)" ] }, { "cell_type": "code", "execution_count": 126, "id": "cutting-description", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'learning_rate': 0.1, 'max_depth': 3}" ] }, "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_search.best_params_" ] }, { "cell_type": "code", "execution_count": 127, "id": "established-night", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0010920514714541918 {'learning_rate': 0.1, 'max_depth': 1}\n", "0.0024649624915880075 {'learning_rate': 0.1, 'max_depth': 2}\n", "0.005179742214071581 {'learning_rate': 0.1, 'max_depth': 3}\n", "0.0027306548772843 {'learning_rate': 0.1, 'max_depth': 4}\n", "0.0002629712278338081 {'learning_rate': 0.1, 'max_depth': 5}\n", "-0.0016882062465084502 {'learning_rate': 0.1, 'max_depth': 6}\n", "0.0005808239346339894 {'learning_rate': 0.05, 'max_depth': 1}\n", "0.0018532340083319276 {'learning_rate': 0.05, 'max_depth': 2}\n", "0.003954208330685383 {'learning_rate': 0.05, 'max_depth': 3}\n", "0.004945961987173897 {'learning_rate': 0.05, 'max_depth': 4}\n", "0.003509540482510065 {'learning_rate': 0.05, 'max_depth': 5}\n", "0.0011919994497078257 {'learning_rate': 0.05, 'max_depth': 6}\n", "-0.0012377392747824345 {'learning_rate': 0.01, 'max_depth': 1}\n", "-0.00038711061142104874 {'learning_rate': 0.01, 'max_depth': 2}\n", "0.0010593345400554677 {'learning_rate': 0.01, 'max_depth': 3}\n", "0.003114775494288402 {'learning_rate': 0.01, 'max_depth': 4}\n", "0.003665138899025089 {'learning_rate': 0.01, 'max_depth': 5}\n", "0.003195682240093589 {'learning_rate': 0.01, 'max_depth': 6}\n" ] } ], "source": [ "cv_results = grid_search.cv_results_\n", "for mean_score, params in zip(cv_results['mean_test_score'],\n", " cv_results['params']):\n", " print(mean_score, params)" ] }, { "cell_type": "code", "execution_count": 128, "id": "senior-apparatus", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-0.04906362476677084" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred = grid_search.predict(X_test)\n", "r2_oos(y_true=y_test, y_pred=y_pred)" ] }, { "cell_type": "markdown", "id": "solar-helena", "metadata": {}, "source": [ "## Neural Nets" ] }, { "cell_type": "code", "execution_count": 129, "id": "alive-newton", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'2.8.0'" ] }, "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tf.__version__" ] }, { "cell_type": "code", "execution_count": 130, "id": "reverse-trick", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'2.8.0'" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keras.__version__" ] }, { "cell_type": "code", "execution_count": 131, "id": "adult-classroom", "metadata": {}, "outputs": [], "source": [ "X_fulltrain = df_rank.loc[fulltrain_idx[0], cols]\n", "y_fulltrain = df_rank.loc[fulltrain_idx[0], 'exret']\n", "X_train = X_fulltrain.values[cv_idx[0][0]]\n", "y_train = y_fulltrain.values[cv_idx[0][0]]\n", "X_val = X_fulltrain.values[cv_idx[0][1]]\n", "y_val = y_fulltrain.values[cv_idx[0][1]]\n", "X_test = df_rank.loc[test_idx[0],cols]\n", "y_test = df_rank.loc[test_idx[0],'exret']" ] }, { "cell_type": "code", "execution_count": 132, "id": "developing-mexican", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(77643, 9)" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.shape" ] }, { "cell_type": "code", "execution_count": 133, "id": "wrong-bishop", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(116770, 9)" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_val.shape" ] }, { "cell_type": "code", "execution_count": 134, "id": "ignored-clinton", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-05-13 10:01:17.440490: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] } ], "source": [ "nn_model = keras.models.Sequential()\n", "nn_model.add(keras.layers.InputLayer(input_shape=[X_fulltrain.shape[1]]))\n", "nn_model.add(keras.layers.Dense(8, activation='relu'))\n", "nn_model.add(keras.layers.Dense(4, activation='relu'))\n", "nn_model.add(keras.layers.Dense(1))" ] }, { "cell_type": "code", "execution_count": 135, "id": "coupled-colon", "metadata": {}, "outputs": [], "source": [ "nn_model.compile(loss='mse',optimizer='sgd')" ] }, { "cell_type": "code", "execution_count": 137, "id": "after-intranet", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0275\n", "Epoch 2/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0277\n", "Epoch 3/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0278\n", "Epoch 4/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0277\n", "Epoch 5/10\n", "2427/2427 [==============================] - 4s 2ms/step - loss: 0.0223 - val_loss: 0.0274\n", "Epoch 6/10\n", "2427/2427 [==============================] - 7s 3ms/step - loss: 0.0223 - val_loss: 0.0273\n", "Epoch 7/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276\n", "Epoch 8/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276\n", "Epoch 9/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276\n", "Epoch 10/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0223 - val_loss: 0.0276\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 137, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nn_model.fit(X_train, y_train, epochs=10,\n", " validation_data=(X_val,y_val))" ] }, { "cell_type": "code", "execution_count": 138, "id": "exciting-cisco", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.003361474358112737" ] }, "execution_count": 138, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred = nn_model.predict(X_test).reshape(-1)\n", "r2_oos(y_true=y_test, y_pred=y_pred)" ] }, { "cell_type": "markdown", "id": "dominant-cache", "metadata": {}, "source": [ "### GridSeachCV Neural Nets" ] }, { "cell_type": "code", "execution_count": 139, "id": "tribal-reunion", "metadata": {}, "outputs": [], "source": [ "def build_model(learning_rate=0.003):\n", " nn_model = keras.models.Sequential()\n", " nn_model.add(keras.layers.InputLayer(input_shape=[9]))\n", " nn_model.add(keras.layers.Dense(8, activation='relu'))\n", " nn_model.add(keras.layers.Dense(4, activation='relu'))\n", " nn_model.add(keras.layers.Dense(1))\n", " optimizer = keras.optimizers.SGD(learning_rate=learning_rate) \n", " nn_model.compile(loss=\"mse\", optimizer=optimizer)\n", " return nn_model" ] }, { "cell_type": "code", "execution_count": 140, "id": "italic-blind", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/gh/26025ywx7w128zfds279s_9r0000gn/T/ipykernel_26559/3997882518.py:3: DeprecationWarning: KerasRegressor is deprecated, use Sci-Keras (https://github.com/adriangb/scikeras) instead. See https://www.adriangb.com/scikeras/stable/migration.html for help migrating.\n", " keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)\n" ] } ], "source": [ "# from scikeras.wrappers import KerasRegressor\n", "# keras_reg = KerasRegressor(build_model)\n", "keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)" ] }, { "cell_type": "code", "execution_count": 141, "id": "uniform-estonia", "metadata": {}, "outputs": [], "source": [ "hyperparams_grid = {\n", " 'learning_rate':[0.003,0.001]\n", "}" ] }, { "cell_type": "code", "execution_count": 142, "id": "mysterious-carter", "metadata": {}, "outputs": [], "source": [ "nn_search_cv = GridSearchCV(keras_reg, hyperparams_grid, cv=[cv_idx[0]])" ] }, { "cell_type": "code", "execution_count": 143, "id": "differential-sector", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "2427/2427 [==============================] - 6s 2ms/step - loss: 0.0346 - val_loss: 0.0313\n", "Epoch 2/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0249 - val_loss: 0.0295\n", "Epoch 3/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0235 - val_loss: 0.0282\n", "Epoch 4/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - val_loss: 0.0279\n", "Epoch 5/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0227 - val_loss: 0.0279\n", "Epoch 6/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0226 - val_loss: 0.0277\n", "Epoch 7/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0225 - val_loss: 0.0277\n", "Epoch 8/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0224 - val_loss: 0.0275\n", "Epoch 9/10\n", "2427/2427 [==============================] - 7s 3ms/step - loss: 0.0224 - val_loss: 0.0278\n", "Epoch 10/10\n", "2427/2427 [==============================] - 6s 3ms/step - loss: 0.0224 - val_loss: 0.0275\n", "3650/3650 [==============================] - 5s 1ms/step - loss: 0.0275\n", "Epoch 1/10\n", "2427/2427 [==============================] - 6s 2ms/step - loss: 0.0432 - val_loss: 0.0342\n", "Epoch 2/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0278 - val_loss: 0.0303\n", "Epoch 3/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0251 - val_loss: 0.0291\n", "Epoch 4/10\n", "2427/2427 [==============================] - 6s 2ms/step - loss: 0.0241 - val_loss: 0.0286\n", "Epoch 5/10\n", "2427/2427 [==============================] - 6s 3ms/step - loss: 0.0236 - val_loss: 0.0283\n", "Epoch 6/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0233 - val_loss: 0.0281\n", "Epoch 7/10\n", "2427/2427 [==============================] - 6s 2ms/step - loss: 0.0232 - val_loss: 0.0280\n", "Epoch 8/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0230 - val_loss: 0.0280\n", "Epoch 9/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - val_loss: 0.0279\n", "Epoch 10/10\n", "2427/2427 [==============================] - 5s 2ms/step - loss: 0.0229 - val_loss: 0.0279\n", "3650/3650 [==============================] - 5s 1ms/step - loss: 0.0279\n", "Epoch 1/10\n", "6076/6076 [==============================] - 10s 2ms/step - loss: 0.0297 - val_loss: 0.0273\n", "Epoch 2/10\n", "6076/6076 [==============================] - 10s 2ms/step - loss: 0.0255 - val_loss: 0.0270\n", "Epoch 3/10\n", "6076/6076 [==============================] - 9s 1ms/step - loss: 0.0253 - val_loss: 0.0269\n", "Epoch 4/10\n", "6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0268\n", "Epoch 5/10\n", "6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0268\n", "Epoch 6/10\n", "6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0268\n", "Epoch 7/10\n", "6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0268\n", "Epoch 8/10\n", "6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0269\n", "Epoch 9/10\n", "6076/6076 [==============================] - 10s 2ms/step - loss: 0.0252 - val_loss: 0.0269\n", "Epoch 10/10\n", "6076/6076 [==============================] - 9s 1ms/step - loss: 0.0252 - val_loss: 0.0267\n" ] }, { "data": { "text/plain": [ "GridSearchCV(cv=[(array([ 0, 1, 2, ..., 77640, 77641, 77642]),\n", " array([ 77643, 77644, 77645, ..., 194410, 194411, 194412]))],\n", " estimator=,\n", " param_grid={'learning_rate': [0.003, 0.001]})" ] }, "execution_count": 143, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nn_search_cv.fit(X_fulltrain, y_fulltrain, epochs=10,\n", " validation_data=(X_val,y_val))" ] }, { "cell_type": "code", "execution_count": 144, "id": "novel-edinburgh", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-0.02629758156221018" ] }, "execution_count": 144, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred = nn_search_cv.predict(X_test).reshape(-1)\n", "r2_oos(y_true=y_test, y_pred=y_pred)" ] }, { "cell_type": "markdown", "id": "7b9e7889-b6eb-4aed-a5f7-3953193b0cd8", "metadata": { "tags": [] }, "source": [ "## Transformation pipeline example" ] }, { "cell_type": "code", "execution_count": 145, "id": "conditional-frank", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
secIDret_dateexretymsizerevmombetabmilliqilliq_12mvolivolyear
0000001.XSHE2008-01-0.1402552007-120.969559-0.8584470.0000000.086758-0.672755-0.9786910.0000000.2754950.0000002008
1000001.XSHE2008-02-0.0074502008-010.971536-0.4516850.000000-0.170037-0.613483-0.9595510.000000-0.635955-0.7917602008
2000001.XSHE2008-03-0.1520682008-020.967335-0.7000740.0000000.345212-0.557535-0.9539720.0000000.437268-0.6258352008
3000001.XSHE2008-040.0474932008-030.9690270.4439530.0000000.048673-0.112094-0.9749260.0000000.2418880.3067852008
4000001.XSHE2008-05-0.1511642008-040.9645490.5450520.000000-0.264402-0.258493-0.970458-0.976366-0.704579-0.4977842008
.............................................
580477689009.XSHG2023-12-0.1059962023-110.719298-0.1862800.0455350.381234-0.645969-0.519022-0.5304550.1145280.4133652023
580478689009.XSHG2024-01-0.2149832023-120.691854-0.8414010.1688310.515545-0.574577-0.515152-0.5533250.5155450.6229832024
580479689009.XSHG2024-020.2964512024-010.674975-0.066928-0.2698720.369774-0.590579-0.492836-0.555643-0.3413150.2855742024
580480689009.XSHG2024-03-0.0133342024-020.7127740.947884-0.2535270.546042-0.709444-0.564655-0.563480-0.3032920.7194362024
580481689009.XSHG2024-04-0.0734742024-030.705445-0.3634940.0564040.515864-0.662554-0.621622-0.5742260.3615350.5781432024
\n", "

575818 rows × 14 columns

\n", "
" ], "text/plain": [ " secID ret_date exret ym size rev mom \\\n", "0 000001.XSHE 2008-01 -0.140255 2007-12 0.969559 -0.858447 0.000000 \n", "1 000001.XSHE 2008-02 -0.007450 2008-01 0.971536 -0.451685 0.000000 \n", "2 000001.XSHE 2008-03 -0.152068 2008-02 0.967335 -0.700074 0.000000 \n", "3 000001.XSHE 2008-04 0.047493 2008-03 0.969027 0.443953 0.000000 \n", "4 000001.XSHE 2008-05 -0.151164 2008-04 0.964549 0.545052 0.000000 \n", "... ... ... ... ... ... ... ... \n", "580477 689009.XSHG 2023-12 -0.105996 2023-11 0.719298 -0.186280 0.045535 \n", "580478 689009.XSHG 2024-01 -0.214983 2023-12 0.691854 -0.841401 0.168831 \n", "580479 689009.XSHG 2024-02 0.296451 2024-01 0.674975 -0.066928 -0.269872 \n", "580480 689009.XSHG 2024-03 -0.013334 2024-02 0.712774 0.947884 -0.253527 \n", "580481 689009.XSHG 2024-04 -0.073474 2024-03 0.705445 -0.363494 0.056404 \n", "\n", " beta bm illiq illiq_12m vol ivol year \n", "0 0.086758 -0.672755 -0.978691 0.000000 0.275495 0.000000 2008 \n", "1 -0.170037 -0.613483 -0.959551 0.000000 -0.635955 -0.791760 2008 \n", "2 0.345212 -0.557535 -0.953972 0.000000 0.437268 -0.625835 2008 \n", "3 0.048673 -0.112094 -0.974926 0.000000 0.241888 0.306785 2008 \n", "4 -0.264402 -0.258493 -0.970458 -0.976366 -0.704579 -0.497784 2008 \n", "... ... ... ... ... ... ... ... \n", "580477 0.381234 -0.645969 -0.519022 -0.530455 0.114528 0.413365 2023 \n", "580478 0.515545 -0.574577 -0.515152 -0.553325 0.515545 0.622983 2024 \n", "580479 0.369774 -0.590579 -0.492836 -0.555643 -0.341315 0.285574 2024 \n", "580480 0.546042 -0.709444 -0.564655 -0.563480 -0.303292 0.719436 2024 \n", "580481 0.515864 -0.662554 -0.621622 -0.574226 0.361535 0.578143 2024 \n", "\n", "[575818 rows x 14 columns]" ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rank" ] }, { "cell_type": "code", "execution_count": 146, "id": "unauthorized-person", "metadata": {}, "outputs": [], "source": [ "illiq_idx = 5\n", "illiq_12m_idx = 6" ] }, { "cell_type": "code", "execution_count": 147, "id": "8c8adb6f-4248-4a09-b794-8523c915c252", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sizerevmombetabmilliqilliq_12mvolivol
00.969559-0.8584470.0000000.086758-0.672755-0.9786910.0000000.2754950.000000
10.971536-0.4516850.000000-0.170037-0.613483-0.9595510.000000-0.635955-0.791760
20.967335-0.7000740.0000000.345212-0.557535-0.9539720.0000000.437268-0.625835
30.9690270.4439530.0000000.048673-0.112094-0.9749260.0000000.2418880.306785
40.9645490.5450520.000000-0.264402-0.258493-0.970458-0.976366-0.704579-0.497784
..............................
556879-0.911797-0.9757440.000000-0.550165-0.2238150.9536930.0000000.351709-0.339214
556880-0.890682-0.0836390.000000-0.468452-0.2641230.2714600.000000-0.356566-0.264123
556881-0.9024930.5212610.000000-0.303519-0.3130500.3460410.0000000.3328450.835777
556882-0.9090580.0209020.000000-0.273194-0.3252660.5738910.000000-0.416208-0.318665
556883-0.8811450.8716070.211299-0.427733-0.5517240.7505500.9955980.0902420.579604
\n", "

194413 rows × 9 columns

\n", "
" ], "text/plain": [ " size rev mom beta bm illiq illiq_12m \\\n", "0 0.969559 -0.858447 0.000000 0.086758 -0.672755 -0.978691 0.000000 \n", "1 0.971536 -0.451685 0.000000 -0.170037 -0.613483 -0.959551 0.000000 \n", "2 0.967335 -0.700074 0.000000 0.345212 -0.557535 -0.953972 0.000000 \n", "3 0.969027 0.443953 0.000000 0.048673 -0.112094 -0.974926 0.000000 \n", "4 0.964549 0.545052 0.000000 -0.264402 -0.258493 -0.970458 -0.976366 \n", "... ... ... ... ... ... ... ... \n", "556879 -0.911797 -0.975744 0.000000 -0.550165 -0.223815 0.953693 0.000000 \n", "556880 -0.890682 -0.083639 0.000000 -0.468452 -0.264123 0.271460 0.000000 \n", "556881 -0.902493 0.521261 0.000000 -0.303519 -0.313050 0.346041 0.000000 \n", "556882 -0.909058 0.020902 0.000000 -0.273194 -0.325266 0.573891 0.000000 \n", "556883 -0.881145 0.871607 0.211299 -0.427733 -0.551724 0.750550 0.995598 \n", "\n", " vol ivol \n", "0 0.275495 0.000000 \n", "1 -0.635955 -0.791760 \n", "2 0.437268 -0.625835 \n", "3 0.241888 0.306785 \n", "4 -0.704579 -0.497784 \n", "... ... ... \n", "556879 0.351709 -0.339214 \n", "556880 -0.356566 -0.264123 \n", "556881 0.332845 0.835777 \n", "556882 -0.416208 -0.318665 \n", "556883 0.090242 0.579604 \n", "\n", "[194413 rows x 9 columns]" ] }, "execution_count": 147, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_fulltrain" ] }, { "cell_type": "code", "execution_count": 148, "id": "indian-spokesman", "metadata": {}, "outputs": [], "source": [ "class FeatureAdder(BaseEstimator, TransformerMixin):\n", " def __init__(self, add_avg_illiq=True):\n", " self.add_avg_illiq = add_avg_illiq\n", " def fit(self, X, y=None):\n", " return self\n", " def transform(self, X, y=None):\n", " avg_illiq = (X[:,illiq_idx] + X[:, illiq_12m_idx]) / 2\n", " return np.c_[X, avg_illiq]\n", "\n", "feature_adder = FeatureAdder()" ] }, { "cell_type": "code", "execution_count": 149, "id": "certified-fiction", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(194413, 9)" ] }, "execution_count": 149, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_fulltrain.values.shape" ] }, { "cell_type": "code", "execution_count": 150, "id": "continuing-desperate", "metadata": {}, "outputs": [], "source": [ "X_fulltrain_new = feature_adder.transform(X_fulltrain.values)" ] }, { "cell_type": "code", "execution_count": 151, "id": "completed-creator", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0.9695586 , -0.85844749, 0. , ..., 0.27549467,\n", " 0. , -0.48934551],\n", " [ 0.97153558, -0.45168539, 0. , ..., -0.63595506,\n", " -0.7917603 , -0.47977528],\n", " [ 0.96733482, -0.70007424, 0. , ..., 0.437268 ,\n", " -0.62583519, -0.47698589],\n", " ...,\n", " [-0.90249267, 0.521261 , 0. , ..., 0.33284457,\n", " 0.83577713, 0.17302053],\n", " [-0.90905757, 0.02090209, 0. , ..., -0.41620829,\n", " -0.3186652 , 0.28694536],\n", " [-0.88114453, 0.87160675, 0.21129861, ..., 0.09024211,\n", " 0.57960382, 0.8730741 ]])" ] }, "execution_count": 151, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_fulltrain_new" ] }, { "cell_type": "code", "execution_count": 152, "id": "wooden-driver", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(194413, 10)" ] }, "execution_count": 152, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_fulltrain_new.shape" ] }, { "cell_type": "code", "execution_count": 153, "id": "sporting-celebrity", "metadata": {}, "outputs": [], "source": [ "# This can be added to a pipeline\n", "pipeline = Pipeline([\n", " ('feature_adder', FeatureAdder()),\n", " ('std_scaler', StandardScaler())\n", "])" ] }, { "cell_type": "code", "execution_count": 154, "id": "magnetic-strip", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 1.68015418e+00, -1.48771512e+00, -7.60380513e-19, ...,\n", " 4.77437532e-01, 4.06742109e-18, -9.01081929e-01],\n", " [ 1.68358010e+00, -7.82784266e-01, -7.60380513e-19, ...,\n", " -1.10212226e+00, -1.37679088e+00, -8.83459288e-01],\n", " [ 1.67630057e+00, -1.21324955e+00, -7.60380513e-19, ...,\n", " 7.57793805e-01, -1.08826394e+00, -8.78322906e-01],\n", " ...,\n", " [-1.56393521e+00, 9.03360864e-01, -7.60380513e-19, ...,\n", " 5.76826008e-01, 1.45333168e+00, 3.18600392e-01],\n", " [-1.57531157e+00, 3.62239461e-02, -7.60380513e-19, ...,\n", " -7.21296915e-01, -5.54126473e-01, 5.28381837e-01],\n", " [-1.52694089e+00, 1.51052051e+00, 3.66337572e-01, ...,\n", " 1.56391306e-01, 1.00787227e+00, 1.60768063e+00]])" ] }, "execution_count": 154, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline.fit_transform(X_fulltrain.values)" ] }, { "cell_type": "code", "execution_count": null, "id": "68d36806-83f6-44e6-86bc-b2aa96616c7e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c1a444fd-7fd7-4083-9148-057535ad06c1", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" }, "toc-autonumbering": true }, "nbformat": 4, "nbformat_minor": 5 }