import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')
housing = pd.read_csv('./datasets/housing/housing.csv')
housing.head()
housing.info()
housing.describe()
housing.hist(bins=50, figsize=(20,15))
plt.show()
numeric_features = ["housing_median_age","total_rooms","total_bedrooms",
"population","households","median_income","median_house_value"]
sns.pairplot(housing[numeric_features])
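Raw counts such as total_rooms scale with the size of a district, so per-household and per-person ratios are usually more informative. Engineer those features next: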
housing['rooms_per_household'] = housing['total_rooms'] / housing['households']
housing['bedrooms_per_household'] = housing['total_bedrooms'] / housing['households']
housing['rooms_per_population'] = housing['total_rooms'] / housing['population']
housing['bedrooms_per_population'] = housing['total_bedrooms'] / housing['population']
housing['population_per_household'] = housing['population'] / housing['households']
numeric_features = numeric_features + ['rooms_per_household','bedrooms_per_household',
'rooms_per_population','bedrooms_per_population',
'population_per_household']
numeric_features
Check each feature's correlation with median_house_value (numeric_only avoids an error on the string column ocean_proximity in recent pandas):
housing.corr(numeric_only=True)['median_house_value'].sort_values(ascending=False)
housing.groupby('ocean_proximity').median_house_value.describe()
Clearly, districts closer to the ocean have more expensive houses.
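A quick boxplot makes this pattern easy to see (a small addition, not in the original analysis):
sns.boxplot(x='ocean_proximity', y='median_house_value', data=housing)
plt.show()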
Split the data into training and test sets with sklearn's train_test_split:
train, test = train_test_split(housing, test_size=0.2, random_state=42)
train_X = train.drop('median_house_value', axis=1)
train_y = train['median_house_value'].copy()
test_X = test.drop('median_house_value', axis=1)
test_y = test['median_house_value'].copy()
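As an aside: median_income is the strongest single driver of house value, so a stratified split on binned income can give a more representative test set. A minimal sketch, with illustrative bin edges (not used in the rest of this walkthrough):
income_cat = pd.cut(housing['median_income'],
                    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                    labels=[1, 2, 3, 4, 5])
strat_train, strat_test = train_test_split(housing, test_size=0.2,
                                           stratify=income_cat, random_state=42)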
Based on the exploration above, a few preprocessing steps are needed before modeling.
For missing values (total_bedrooms contains NaNs), fill with the column median:
imputer = SimpleImputer(strategy='median')
train_X_num = train_X.drop('ocean_proximity', axis=1)
train_X_num = imputer.fit_transform(train_X_num)
test_X_num = test_X.drop('ocean_proximity', axis=1)
test_X_num = imputer.transform(test_X_num)
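A quick sanity check (not in the original flow): the medians learned on the training set should leave no NaNs in either split, and imputer.statistics_ holds the values that were filled in.
assert not np.isnan(train_X_num).any()
assert not np.isnan(test_X_num).any()
imputer.statistics_  # per-column medians used for imputation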
There are generally two ways to encode a categorical variable: ordinal encoding, which maps each category to an integer, and one-hot (dummy) encoding.
Since the categories of ocean_proximity have no natural order, we one-hot encode it here (the ordinal alternative is sketched after the code below):
train_X_cat = train_X[['ocean_proximity']]
one_hot = OneHotEncoder()
train_X_cat = one_hot.fit_transform(train_X_cat)
train_X_cat = train_X_cat.toarray()
test_X_cat = test_X[['ocean_proximity']]
test_X_cat = one_hot.transform(test_X_cat)
test_X_cat = test_X_cat.toarray()
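For reference, ordinal encoding would look like this; a minimal sketch, not used in this pipeline because the categories are unordered:
from sklearn.preprocessing import OrdinalEncoder
ordinal = OrdinalEncoder()
train_X_cat_ord = ordinal.fit_transform(train_X[['ocean_proximity']])
ordinal.categories_  # the learned category-to-integer mapping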
For the differing scales, standardize with StandardScaler: each variable has its mean subtracted and is divided by its standard deviation, i.e. $z = (x - \mu) / \sigma$.
std_scaler = StandardScaler()
train_X_num = std_scaler.fit_transform(train_X_num)
test_X_num = std_scaler.transform(test_X_num)
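After scaling, each training column should have mean roughly 0 and standard deviation roughly 1; a quick check (not in the original notebook):
print(train_X_num.mean(axis=0).round(2))
print(train_X_num.std(axis=0).round(2))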
train_X_num.shape
train_X_cat.shape
train_X_prepared = np.hstack((train_X_num, train_X_cat))
train_X_prepared.shape
test_X_prepared = np.hstack((test_X_num, test_X_cat))
test_X_num.shape
test_X_cat.shape
test_X_prepared.shape
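np.hstack discards the column names, which are handy later for interpreting models. They are easy to rebuild (a sketch; OneHotEncoder.get_feature_names_out assumes scikit-learn >= 1.0):
feature_names = (list(train_X.drop('ocean_proximity', axis=1).columns)
                 + list(one_hot.get_feature_names_out(['ocean_proximity'])))
len(feature_names) == train_X_prepared.shape[1]  # should be True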
lin_reg = LinearRegression()
lin_reg.fit(train_X_prepared, train_y)
Evaluate the model's performance on the training set:
lin_reg_predict = lin_reg.predict(train_X_prepared)
lin_reg_rmse = np.sqrt(mean_squared_error(train_y, lin_reg_predict))
lin_reg_rmse
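Training-set RMSE tends to flatter a model. A 5-fold cross-validated estimate is more honest; a sketch using sklearn's cross_val_score, not part of the original flow:
from sklearn.model_selection import cross_val_score
lin_scores = cross_val_score(lin_reg, train_X_prepared, train_y,
                             scoring='neg_mean_squared_error', cv=5)
lin_rmse_cv = np.sqrt(-lin_scores)
print(lin_rmse_cv.mean(), lin_rmse_cv.std())
Next, fit a decision tree regressor: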
tree_reg = DecisionTreeRegressor()
tree_reg.fit(train_X_prepared, train_y)
tree_predict = tree_reg.predict(train_X_prepared)
tree_rmse = np.sqrt(mean_squared_error(train_y, tree_predict))
tree_rmse
Clearly the decision tree has overfit severely, with a training RMSE of 0. The reason is that sklearn's default DecisionTreeRegressor places no limits on depth or leaf size, so the tree can memorize the training data outright.
If we cap the tree's depth, the picture changes:
tree_reg = DecisionTreeRegressor(max_depth=5)
tree_reg.fit(train_X_prepared, train_y)
tree_predict = tree_reg.predict(train_X_prepared)
tree_rmse = np.sqrt(mean_squared_error(train_y, tree_predict))
print(tree_rmse)
How should these hyperparameters be chosen? We can split the training data further and use cross-validation.
We consider $3 \times 3 = 9$ parameter combinations. For each combination, the training data is divided into 5 folds; the model is trained on 4 folds and evaluated on the remaining one, and the 5 scores are averaged. The combination with the best average score is then used for the final tree.
param_grid = [
    {'max_depth': [5, 10, 20], 'max_features': [2, 5, 8]}
]
tree_reg = DecisionTreeRegressor()
grid_search = GridSearchCV(tree_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(train_X_prepared, train_y)
grid_search.best_params_
Among the 9 candidate combinations, the tree performs best with max_depth set to 10 and max_features set to 8. Enlarging the grid could surface an even better model, but the search would take correspondingly longer.
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results['mean_test_score'],
                              cv_results['params']):
    print(np.sqrt(-mean_score), params)
tree_reg = DecisionTreeRegressor(max_depth=10, max_features=8)
tree_reg.fit(train_X_prepared, train_y)
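Note that GridSearchCV refits the best parameter combination on the whole training set by default (refit=True), so the retrained tree above is effectively the same model as:
tree_reg = grid_search.best_estimator_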
Finally, we try the simplest ensemble model, the random forest, which averages many decision trees trained on bootstrapped samples of the data:
forest_reg = RandomForestRegressor()
forest_reg.fit(train_X_prepared, train_y)
forest_predict = forest_reg.predict(train_X_prepared)
forest_rmse = np.sqrt(mean_squared_error(train_y, forest_predict))
print(forest_rmse)
The random forest achieves the smallest training RMSE, and unlike the first, unconstrained decision tree, its error is not an implausible 0. Intuitively, then, the random forest is the best of the three models.
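To confirm this is not just training-set optimism, the same cross-validated check used for the linear model can be applied to the forest (a sketch; retraining the forest five times takes a while):
forest_scores = cross_val_score(forest_reg, train_X_prepared, train_y,
                                scoring='neg_mean_squared_error', cv=5)
print(np.sqrt(-forest_scores).mean())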
As a last step, compare the three models on the test set:
test_lin_predict = lin_reg.predict(test_X_prepared)
test_tree_predict = tree_reg.predict(test_X_prepared)
test_forest_predict = forest_reg.predict(test_X_prepared)
test_lin_rmse = np.sqrt(mean_squared_error(test_y, test_lin_predict))
test_tree_rmse = np.sqrt(mean_squared_error(test_y, test_tree_predict))
test_forest_rmse = np.sqrt(mean_squared_error(test_y, test_forest_predict))
print('Linear Regression rmse on test set:', test_lin_rmse)
print('Decision Tree regression rmse on test set:', test_tree_rmse)
print('Random Forest regression rmse on test set:', test_forest_rmse)
As expected, the random forest performs best on the test set as well.
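As a final diagnostic, the fitted forest exposes per-feature importances; pairing them with the feature_names list built earlier shows which inputs drive the predictions (a sketch):
for score, name in sorted(zip(forest_reg.feature_importances_,
                              feature_names), reverse=True)[:10]:
    print(round(score, 4), name)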