import pandas as pd import numpy as np import warnings warnings.filterwarnings('ignore')
# reduce_mem_usage shrinks a DataFrame's memory footprint by downcasting column dtypes.
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place and report the memory saved.

    Signed-integer and float NumPy columns are cast to the narrowest dtype of
    the same kind whose range contains the column's min and max. Object
    columns become ``category``. Columns of any other dtype (bool, unsigned
    integer, datetime, category, pandas extension dtypes) are left unchanged.

    NOTE: float16/float32 downcasts are lossy; only the value range is
    checked, not the precision.

    Args:
        df: pandas DataFrame. It is modified in place.

    Returns:
        The same ``df`` object, after conversion.
    """
    bytes_per_mb = 1024 ** 2
    # memory_usage() reports bytes; convert so the printed unit is really MB.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed-int / float columns have a meaningful
        # "narrower dtype of the same kind"; skip everything else.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            # Bounds are inclusive: a value equal to iinfo.min/max still fits.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when nothing narrower fits (this includes
            # all-NaN or empty columns, where the comparisons are False).
            target = np.float64
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
# Load 'data_for_tree.csv' and downcast its columns with reduce_mem_usage.
sample_feature = reduce_mem_usage(pd.read_csv('data_for_tree.csv'))
Memory usage of dataframe is 62099672.00 MB
Memory usage after optimization is: 16520303.00 MB
Decreased by 73.4%
# Feature columns: every column except the target ('price') and the excluded
# 'brand' / 'model' columns. ('brand' used to be listed twice.)
continuous_feature_names = [x for x in sample_feature.columns if x not in ['price', 'brand', 'model']]

# Drop rows with missing values, replace the '-' placeholder with 0 and
# renumber the rows from 0 so later integer-label indexing is contiguous.
sample_feature = sample_feature.dropna().replace('-', 0).reset_index(drop=True)
sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)

# Training matrix (features) and raw target.
train = sample_feature[continuous_feature_names + ['price']]
train_X = train[continuous_feature_names]
train_y = train['price']
from sklearn.linear_model import LinearRegression
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2. For ordinary least squares it does not change the fitted solution
# beyond numerical noise, so the estimator is created without it.
model = LinearRegression()

# Fit on the raw (untransformed) price.
model = model.fit(train_X, train_y)

# Print the intercept (a bare expression mid-cell is silently discarded),
# then list the coefficients from largest to smallest.
print('intercept:' + str(model.intercept_))
sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x: x[1], reverse=True)
[('v_6', 3342612.384537345), ('v_8', 684205.534533214), ('v_9', 178967.94192530424), ('v_7', 35223.07319016895), ('v_5', 21917.550249749802), ('v_3', 12782.03250792227), ('v_12', 11654.925634146672), ('v_13', 9884.194615297649), ('v_11', 5519.182176035517), ('v_10', 3765.6101415594258), ('gearbox', 900.3205339198406), ('fuelType', 353.5206495542567), ('bodyType', 186.51797317460046), ('city', 45.17354204168846), ('power', 31.163045441455335), ('brand_price_median', 0.535967111869784), ('brand_price_std', 0.4346788365040235), ('brand_amount', 0.15308295553300566), ('brand_price_max', 0.003891831020467389), ('seller', -1.2684613466262817e-06), ('offerType', -4.759058356285095e-06), ('brand_price_sum', -2.2430642281682917e-05), ('name', -0.00042591632723759166), ('used_time', -0.012574429533889028), ('brand_price_average', -0.414105722833381), ('brand_price_min', -2.3163823428971835), ('train', -5.392535065078232), ('power_bin', -59.24591853031839), ('v_14', -233.1604256172217), ('kilometer', -372.96600915402496), ('notRepairedDamage', -449.29703564695365), ('v_0', -1490.6790578168238), ('v_4', -14219.648899108111), ('v_2', -16528.55239086934), ('v_1', -42869.43976200439)]
from matplotlib import pyplot as plt
# Draw 50 random row labels (sampling with replacement) so the scatter plot
# stays readable.
subsample_index = np.random.randint(low=0, high=len(train_y), size=50)

# True prices in black, predictions of the fitted model in blue, both vs v_9.
plt.scatter(train_X.loc[subsample_index, 'v_9'], train_y.loc[subsample_index], color='black')
plt.scatter(
    train_X.loc[subsample_index, 'v_9'],
    model.predict(train_X.loc[subsample_index]),
    color='blue',
)
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price', 'Predicted Price'], loc='upper right')
print('The predicted price is obvious different from true price')
The predicted price is obvious different from true price
import seaborn as sns

print('It is clear to see the price shows a typical exponential distribution')

plt.figure(figsize=(15, 5))

# Left panel: distribution of all prices.
plt.subplot(1, 2, 1)
sns.distplot(train_y)

# Right panel: only the prices below the 90% quantile, to zoom in on the bulk.
plt.subplot(1, 2, 2)
sns.distplot(train_y[train_y < np.quantile(train_y, 0.9)])
It is clear to see the price shows a typical exponential distribution <matplotlib.axes._subplots.AxesSubplot at 0x1b33efb2f98>
对标签进行 $\log(x+1)$ 变换,使标签贴近于正态分布
# log(price + 1) label used for the remaining experiments.
train_y_ln = np.log(train_y + 1)

import seaborn as sns

print('The transformed price seems like normal distribution')

plt.figure(figsize=(15, 5))

# Left panel: all transformed labels.
plt.subplot(1, 2, 1)
sns.distplot(train_y_ln)

# Right panel: transformed labels below their 90% quantile.
plt.subplot(1, 2, 2)
sns.distplot(train_y_ln[train_y_ln < np.quantile(train_y_ln, 0.9)])
The transformed price seems like normal distribution <matplotlib.axes._subplots.AxesSubplot at 0x1b33f077160>
# Refit the same estimator on the log(price + 1) labels. (The fit call was
# garbled to `model =, train_y_ln)` in the exported source.)
model = model.fit(train_X, train_y_ln)

print('intercept:' + str(model.intercept_))
# Coefficients from largest to smallest.
sorted(dict(zip(continuous_feature_names, model.coef_)).items(), key=lambda x: x[1], reverse=True)
intercept:23.515920686637713 [('v_9', 6.043993029165403), ('v_12', 2.0357439855551394), ('v_11', 1.3607608712255672), ('v_1', 1.3079816298861897), ('v_13', 1.0788833838535354), ('v_3', 0.9895814429387444), ('gearbox', 0.009170812023421397), ('fuelType', 0.006447089787635784), ('bodyType', 0.004815242907679581), ('power_bin', 0.003151801949447194), ('power', 0.0012550361843629999), ('train', 0.0001429273782925814), ('brand_price_min', 2.0721302299502698e-05), ('brand_price_average', 5.308179717783439e-06), ('brand_amount', 2.8308531339942507e-06), ('brand_price_max', 6.764442596115763e-07), ('offerType', 1.6765966392995324e-10), ('seller', 9.308109838457312e-12), ('brand_price_sum', -1.3473184925468486e-10), ('name', -7.11403461065247e-08), ('brand_price_median', -1.7608143661053008e-06), ('brand_price_std', -2.7899058266986454e-06), ('used_time', -5.6142735899344175e-06), ('city', -0.0024992974087053223), ('v_14', -0.012754139659375262), ('kilometer', -0.013999175312751872), ('v_0', -0.04553774829634237), ('notRepairedDamage', -0.273686961116076), ('v_7', -0.7455902679730504), ('v_4', -0.9281349233755761), ('v_2', -1.2781892166433606), ('v_5', -1.5458846136756323), ('v_10', -1.8059217242413748), ('v_8', -42.611729973490604), ('v_6', -241.30992120503035)]
# True prices (black) for the sampled rows.
plt.scatter(train_X['v_9'][subsample_index], train_y[subsample_index], color='black')

# The model was fitted on log(price + 1), so the inverse transform is
# exp(pred) - 1 (np.expm1), not exp(pred).
plt.scatter(
    train_X['v_9'][subsample_index],
    np.expm1(model.predict(train_X.loc[subsample_index])),
    color='blue',
)
plt.xlabel('v_9')
plt.ylabel('price')
plt.legend(['True Price', 'Predicted Price'], loc='upper right')
print('The predicted price seems normal after np.log transforming')
The predicted price seems normal after np.log transforming
from sklearn.model_selection import cross_val_score from sklearn.metrics import mean_absolute_error, make_scorer
def log_transfer(func):
    """Wrap a metric so that it is evaluated on natural-log-transformed inputs.

    Args:
        func: Metric callable whose first two arguments are ``(y, yhat)``.

    Returns:
        A callable ``wrapper(y, yhat, *args, **kwargs)`` that returns
        ``func(np.log(y), np.nan_to_num(np.log(yhat)), *args, **kwargs)``.
        It keeps ``func``'s name and docstring.

    Note:
        This uses ``log(x)``, not ``log(x + 1)``.
    """
    import functools

    @functools.wraps(func)
    def wrapper(y, yhat, *args, **kwargs):
        # log of a negative prediction is NaN; nan_to_num turns NaN into 0 and
        # +/-inf into the largest finite floats so the metric stays defined.
        # Extra arguments (e.g. a metric's keyword options) are forwarded.
        return func(np.log(y), np.nan_to_num(np.log(yhat)), *args, **kwargs)

    return wrapper
# 5-fold cross-validation with the raw price as target; the scorer applies
# log_transfer, so the MAE is computed between log(y) and log(prediction).
scores = cross_val_score(model, X=train_X, y=train_y, verbose=1, cv = 5, scoring=make_scorer(log_transfer(mean_absolute_error)))
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.1s finished
使用线性回归模型,对未处理标签的特征数据进行五折交叉验证(Error 1.36)
# Mean of the per-fold scores computed above.
print('AVG:', np.mean(scores))
AVG: 1.3658024042408414
使用线性回归模型,对处理过标签的特征数据进行五折交叉验证(Error 0.19)
# 5-fold cross-validation with the log(price + 1) labels, scored with plain MAE.
scores = cross_val_score(model, X=train_X, y=train_y_ln, verbose=1, cv = 5, scoring=make_scorer(mean_absolute_error))
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 1.1s finished
# Mean of the per-fold MAE values computed above.
print('AVG:', np.mean(scores))
AVG: 0.19325301535176911
# Present the five fold scores as a one-row table labelled 'MAE'.
scores = pd.DataFrame(
    scores.reshape(1, -1),
    index=['MAE'],
    columns=['cv' + str(fold) for fold in range(1, 6)],
)
scores
cv1 | cv2 | cv3 | cv4 | cv5 | |
MAE | 0.191642 | 0.194986 | 0.192737 | 0.195329 | 0.19445 |
# Split the data set in order: the first 4/5 of the rows form the training set
# and the last 1/5 form the validation set. The final result differs little
# from 5-fold cross-validation.
# NOTE(review): the rows are not sorted here; this is a time-ordered split only
# if the file is already in chronological order. Confirm against the data.
import datetime

sample_feature = sample_feature.reset_index(drop=True)

split_point = len(sample_feature) // 5 * 4

# Positional slicing keeps the two parts disjoint. Label-based
# .loc[:split_point] is end-inclusive and would put row `split_point` in both
# the training and the validation set.
train = sample_feature.iloc[:split_point].dropna()
val = sample_feature.iloc[split_point:].dropna()

train_X = train[continuous_feature_names]
train_y_ln = np.log(train['price'] + 1)
val_X = val[continuous_feature_names]
val_y_ln = np.log(val['price'] + 1)

# (The fit call was garbled to `model =, train_y_ln)` in the exported source.)
model = model.fit(train_X, train_y_ln)

# MAE on the held-out rows, in log(price + 1) space.
mean_absolute_error(val_y_ln, model.predict(val_X))
from sklearn.model_selection import learning_curve, validation_curve
# IPython-only help lookup for learning_curve; this line is not valid syntax in a plain Python script.
? learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_size=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation curves for ``estimator``.

    The plotted values come from ``make_scorer(mean_absolute_error)``, so the
    'score' axis is a mean absolute error.

    Args:
        estimator: Estimator passed to ``learning_curve``.
        title: Figure title.
        X: Feature matrix.
        y: Target values.
        ylim: Optional ``(low, high)`` pair for the y-axis limits.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallelism setting forwarded to ``learning_curve``.
        train_size: Training-set sizes forwarded as ``train_sizes``.

    Returns:
        The ``matplotlib.pyplot`` module, with the figure drawn on it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training example')
    plt.ylabel('score')

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_size,
        scoring=make_scorer(mean_absolute_error),
    )

    plt.grid()

    # (per-fold scores, colour, legend label) for each curve.
    curves = (
        (fit_scores, 'r', "Training score"),
        (cv_scores, 'g', "Cross-validation score"),
    )

    # Shaded band: mean +/- one standard deviation across folds.
    for fold_scores, colour, _ in curves:
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)

    # Mean curves drawn on top of the bands.
    for fold_scores, colour, label in curves:
        plt.plot(sizes, np.mean(fold_scores, axis=1), 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt
# Learning curve of a plain LinearRegression on the first 1000 training rows,
# with 5-fold CV and the y-axis limited to [0.0, 0.5].
plot_learning_curve(LinearRegression(), 'Liner_model', train_X[:1000], train_y_ln[:1000], ylim=(0.0, 0.5), cv=5, n_jobs=1)
<module 'matplotlib.pyplot' from 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\matplotlib\\'>
# Rebuild the training set from the whole table: features plus price, with
# incomplete rows removed.
model_columns = continuous_feature_names + ['price']
train = sample_feature[model_columns].dropna()

train_X, train_y = train[continuous_feature_names], train['price']

# log(price + 1) label.
train_y_ln = np.log(train_y + 1)
from sklearn.linear_model import LinearRegression from sklearn.linear_model import Ridge from sklearn.linear_model import Lasso
# Linear models to compare, all with default settings.
models = [LinearRegression(), Ridge(), Lasso()]

# Per-fold MAE (5-fold CV on the log-transformed label), keyed by model name.
result = {}
for model in models:
    # The text before the first '(' of the estimator's string form is its name.
    model_name, _, _ = str(model).partition('(')
    scores = cross_val_score(
        model,
        X=train_X,
        y=train_y_ln,
        verbose=0,
        cv=5,
        scoring=make_scorer(mean_absolute_error),
    )
    result[model_name] = scores
    print(model_name + ' is finished')
LinearRegression is finished Ridge is finished Lasso is finished
# One column per model, one row per CV fold.
fold_labels = ['cv' + str(fold) for fold in range(1, 6)]
result = pd.DataFrame(result)
result.index = fold_labels
result
LinearRegression | Ridge | Lasso | |
cv1 | 0.191642 | 0.195665 | 0.382708 |
cv2 | 0.194986 | 0.198841 | 0.383916 |
cv3 | 0.192737 | 0.196629 | 0.380754 |
cv4 | 0.195329 | 0.199255 | 0.385683 |
cv5 | 0.194450 | 0.198173 | 0.383555 |
# Fit ordinary least squares and plot the absolute coefficient of each feature.
model = LinearRegression().fit(train_X, train_y_ln)
print('intercept:' + str(model.intercept_))
# x/y are passed by keyword: current seaborn releases no longer accept them
# positionally.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)
intercept:23.515984499017883 <matplotlib.axes._subplots.AxesSubplot at 0x1feb933ca58>
# Fit Ridge with default settings and plot the absolute coefficient of each feature.
model = Ridge().fit(train_X, train_y_ln)
print('intercept:' + str(model.intercept_))
# x/y are passed by keyword: current seaborn releases no longer accept them
# positionally.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)
intercept:5.901527844424091 <matplotlib.axes._subplots.AxesSubplot at 0x1fea9056860>
# Fit Lasso with default settings and plot the absolute coefficient of each feature.
model = Lasso().fit(train_X, train_y_ln)
print('intercept:' + str(model.intercept_))
# x/y are passed by keyword: current seaborn releases no longer accept them
# positionally.
sns.barplot(x=abs(model.coef_), y=continuous_feature_names)
intercept:8.674427764003347 <matplotlib.axes._subplots.AxesSubplot at 0x1fea90b69b0>
from sklearn.linear_model import LinearRegression from sklearn.svm import SVC from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import GradientBoostingRegressor from sklearn.neural_network import MLPRegressor from xgboost.sklearn import XGBRegressor from lightgbm.sklearn import LGBMRegressor
# Candidate regressors: linear baseline, tree ensembles, an MLP and two
# gradient-boosting libraries.
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    MLPRegressor(solver='lbfgs', max_iter=100),
    XGBRegressor(n_estimators=100, objective='reg:squarederror'),
    LGBMRegressor(n_estimators=100),
]

# Per-fold MAE (5-fold CV on the log-transformed label), keyed by model name.
result = {}
for model in models:
    # The text before the first '(' of the estimator's string form is its name.
    model_name, _, _ = str(model).partition('(')
    scores = cross_val_score(
        model,
        X=train_X,
        y=train_y_ln,
        verbose=0,
        cv=5,
        scoring=make_scorer(mean_absolute_error),
    )
    result[model_name] = scores
    print(model_name + ' is finished')
LinearRegression is finished DecisionTreeRegressor is finished RandomForestRegressor is finished GradientBoostingRegressor is finished MLPRegressor is finished XGBRegressor is finished LGBMRegressor is finished
# One column per model, one row per CV fold.
fold_labels = ['cv' + str(fold) for fold in range(1, 6)]
result = pd.DataFrame(result)
result.index = fold_labels
result
LinearRegression | DecisionTreeRegressor | RandomForestRegressor | GradientBoostingRegressor | MLPRegressor | XGBRegressor | LGBMRegressor | |
cv1 | 0.191642 | 0.184566 | 0.136266 | 0.168626 | 124.299426 | 0.168698 | 0.141159 |
cv2 | 0.194986 | 0.187029 | 0.139693 | 0.171905 | 257.886236 | 0.172258 | 0.143363 |
cv3 | 0.192737 | 0.184839 | 0.136871 | 0.169553 | 236.829589 | 0.168604 | 0.142137 |
cv4 | 0.195329 | 0.182605 | 0.138689 | 0.172299 | 130.197264 | 0.172474 | 0.143461 |
cv5 | 0.194450 | 0.186626 | 0.137420 | 0.171206 | 268.090236 | 0.170898 | 0.141921 |
## LightGBM parameter candidates:
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']
num_leaves = [3, 5, 10, 15, 20, 40, 55]
max_depth = [3, 5, 10, 15, 20, 40, 55]
bagging_fraction = []
feature_fraction = []
drop_rate = []


def _cv_mae(estimator):
    """Return the mean 5-fold cross-validated MAE of ``estimator`` on (train_X, train_y_ln)."""
    return np.mean(cross_val_score(estimator, X=train_X, y=train_y_ln, verbose=0, cv=5,
                                   scoring=make_scorer(mean_absolute_error)))


# Greedy search: tune one parameter at a time and keep the best value found so
# far fixed for the following steps. Each winner is computed once instead of
# being re-derived with min(...) on every loop iteration.

# Step 1: objective.
best_obj = {obj: _cv_mae(LGBMRegressor(objective=obj)) for obj in objective}
chosen_objective = min(best_obj, key=best_obj.get)

# Step 2: num_leaves, with the best objective.
best_leaves = {
    leaves: _cv_mae(LGBMRegressor(objective=chosen_objective, num_leaves=leaves))
    for leaves in num_leaves
}
chosen_leaves = min(best_leaves, key=best_leaves.get)

# Step 3: max_depth, with the best objective and num_leaves.
best_depth = {
    depth: _cv_mae(LGBMRegressor(objective=chosen_objective, num_leaves=chosen_leaves, max_depth=depth))
    for depth in max_depth
}

# Best MAE after each tuning step. 0.143 is a hard-coded starting value
# (presumably the untuned LGBMRegressor result from the earlier comparison;
# confirm before reuse).
sns.lineplot(x=['0_initial', '1_turning_obj', '2_turning_leaves', '3_turning_depth'],
             y=[0.143, min(best_obj.values()), min(best_leaves.values()), min(best_depth.values())])
<matplotlib.axes._subplots.AxesSubplot at 0x1fea93f6080>
from sklearn.model_selection import GridSearchCV
# Exhaustive grid over the three candidate lists, evaluated with 5-fold CV.
parameters = {'objective': objective, 'num_leaves': num_leaves, 'max_depth': max_depth}
model = LGBMRegressor()
clf = GridSearchCV(model, parameters, cv=5)

# (The fit call was garbled to `clf =, train_y)` in the exported source.)
# NOTE(review): this fits on the raw `train_y`, while the surrounding cells
# evaluate on `train_y_ln`. No `scoring` is given either, so the estimator's
# default score is used rather than MAE. Confirm this is intended.
clf = clf.fit(train_X, train_y)

# Best parameter combination found by the search.
clf.best_params_
{'max_depth': 15, 'num_leaves': 55, 'objective': 'regression'}
# LightGBM regressor with fixed objective, num_leaves and max_depth values.
model = LGBMRegressor(objective='regression', num_leaves=55, max_depth=15)
# Mean 5-fold cross-validated MAE on the log-transformed labels.
np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv = 5, scoring=make_scorer(mean_absolute_error)))
from bayes_opt import BayesianOptimization
def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    """Objective for Bayesian optimisation of an LGBMRegressor.

    The optimiser proposes real-valued parameters; the integer-valued ones are
    truncated with ``int()`` before being passed to LightGBM.

    Args:
        num_leaves: Proposed ``num_leaves`` (truncated to int).
        max_depth: Proposed ``max_depth`` (truncated to int).
        subsample: Proposed row-subsampling fraction.
        min_child_samples: Proposed ``min_child_samples`` (truncated to int).

    Returns:
        ``1 - mean MAE`` over 5-fold CV on ``(train_X, train_y_ln)``, so that
        maximising the return value minimises the MAE.
    """
    estimator = LGBMRegressor(
        objective='regression_l1',
        num_leaves=int(num_leaves),
        max_depth=int(max_depth),
        subsample=subsample,
        # LightGBM ignores `subsample` unless bagging is enabled with a
        # positive `subsample_freq`; without it the tuned value is a no-op.
        subsample_freq=1,
        min_child_samples=int(min_child_samples),
    )
    mean_mae = cross_val_score(
        estimator,
        X=train_X,
        y=train_y_ln,
        verbose=0,
        cv=5,
        scoring=make_scorer(mean_absolute_error),
    ).mean()
    return 1 - mean_mae
# Search space: (lower, upper) bounds for each argument of rf_cv.
rf_bo = BayesianOptimization(
    rf_cv,
    {
        'num_leaves': (2, 100),
        'max_depth': (2, 100),
        'subsample': (0.1, 1),
        'min_child_samples': (2, 100),
    }
)

# Run the optimisation. The recorded iteration log and the later
# `rf_bo.max` lookup both require this call, which was missing from the
# exported source.
rf_bo.maximize()
| iter | target | max_depth | min_ch... | num_le... | subsample | ------------------------------------------------------------------------- | [0m 1 [0m | [0m 0.8649 [0m | [0m 89.57 [0m | [0m 47.3 [0m | [0m 55.13 [0m | [0m 0.1792 [0m | | [0m 2 [0m | [0m 0.8477 [0m | [0m 99.86 [0m | [0m 60.91 [0m | [0m 15.35 [0m | [0m 0.4716 [0m | | [95m 3 [0m | [95m 0.8698 [0m | [95m 81.74 [0m | [95m 83.32 [0m | [95m 92.59 [0m | [95m 0.9559 [0m | | [0m 4 [0m | [0m 0.8627 [0m | [0m 90.2 [0m | [0m 8.754 [0m | [0m 43.34 [0m | [0m 0.7772 [0m | | [0m 5 [0m | [0m 0.8115 [0m | [0m 10.07 [0m | [0m 86.15 [0m | [0m 4.109 [0m | [0m 0.3416 [0m | | [95m 6 [0m | [95m 0.8701 [0m | [95m 99.15 [0m | [95m 9.158 [0m | [95m 99.47 [0m | [95m 0.494 [0m | | [0m 7 [0m | [0m 0.806 [0m | [0m 2.166 [0m | [0m 2.416 [0m | [0m 97.7 [0m | [0m 0.224 [0m | | [0m 8 [0m | [0m 0.8701 [0m | [0m 98.57 [0m | [0m 97.67 [0m | [0m 99.87 [0m | [0m 0.3703 [0m | | [95m 9 [0m | [95m 0.8703 [0m | [95m 99.87 [0m | [95m 43.03 [0m | [95m 99.72 [0m | [95m 0.9749 [0m | | [0m 10 [0m | [0m 0.869 [0m | [0m 10.31 [0m | [0m 99.63 [0m | [0m 99.34 [0m | [0m 0.2517 [0m | | [95m 11 [0m | [95m 0.8703 [0m | [95m 52.27 [0m | [95m 99.56 [0m | [95m 98.97 [0m | [95m 0.9641 [0m | | [0m 12 [0m | [0m 0.8669 [0m | [0m 99.89 [0m | [0m 8.846 [0m | [0m 66.49 [0m | [0m 0.1437 [0m | | [0m 13 [0m | [0m 0.8702 [0m | [0m 68.13 [0m | [0m 75.28 [0m | [0m 98.71 [0m | [0m 0.153 [0m | | [0m 14 [0m | [0m 0.8695 [0m | [0m 84.13 [0m | [0m 86.48 [0m | [0m 91.9 [0m | [0m 0.7949 [0m | | [0m 15 [0m | [0m 0.8702 [0m | [0m 98.09 [0m | [0m 59.2 [0m | [0m 99.65 [0m | [0m 0.3275 [0m | | [0m 16 [0m | [0m 0.87 [0m | [0m 68.97 [0m | [0m 98.62 [0m | [0m 98.93 [0m | [0m 0.2221 [0m | | [0m 17 [0m | [0m 0.8702 [0m | [0m 99.85 [0m | [0m 63.74 [0m | [0m 99.63 [0m | [0m 0.4137 [0m | | [0m 18 [0m | [0m 0.8703 [0m | [0m 45.87 [0m | [0m 99.05 [0m | [0m 99.89 [0m | [0m 0.3238 [0m | | [0m 19 [0m | [0m 0.8702 [0m | [0m 79.65 [0m | [0m 46.91 [0m | [0m 98.61 [0m | [0m 0.8999 [0m | | [0m 20 [0m 
| [0m 0.8702 [0m | [0m 99.25 [0m | [0m 36.73 [0m | [0m 99.05 [0m | [0m 0.1262 [0m | | [0m 21 [0m | [0m 0.8702 [0m | [0m 85.51 [0m | [0m 85.34 [0m | [0m 99.77 [0m | [0m 0.8917 [0m | | [0m 22 [0m | [0m 0.8696 [0m | [0m 99.99 [0m | [0m 38.51 [0m | [0m 89.13 [0m | [0m 0.9884 [0m | | [0m 23 [0m | [0m 0.8701 [0m | [0m 63.29 [0m | [0m 97.93 [0m | [0m 99.94 [0m | [0m 0.9585 [0m | | [0m 24 [0m | [0m 0.8702 [0m | [0m 93.04 [0m | [0m 71.42 [0m | [0m 99.94 [0m | [0m 0.9646 [0m | | [0m 25 [0m | [0m 0.8701 [0m | [0m 99.73 [0m | [0m 16.21 [0m | [0m 99.38 [0m | [0m 0.9778 [0m | | [0m 26 [0m | [0m 0.87 [0m | [0m 86.28 [0m | [0m 58.1 [0m | [0m 99.47 [0m | [0m 0.107 [0m | | [0m 27 [0m | [0m 0.8703 [0m | [0m 47.28 [0m | [0m 99.83 [0m | [0m 99.65 [0m | [0m 0.4674 [0m | | [0m 28 [0m | [0m 0.8703 [0m | [0m 68.29 [0m | [0m 99.51 [0m | [0m 99.4 [0m | [0m 0.2757 [0m | | [0m 29 [0m | [0m 0.8701 [0m | [0m 76.49 [0m | [0m 73.41 [0m | [0m 99.86 [0m | [0m 0.9394 [0m | | [0m 30 [0m | [0m 0.8695 [0m | [0m 37.27 [0m | [0m 99.87 [0m | [0m 89.87 [0m | [0m 0.7588 [0m | =========================================================================
# rf_cv returns 1 - MAE, so this converts the best target value back to an MAE.
1 - rf_bo.max['target']