Machine Learning Regression Algorithms
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import preprocessing  # preprocessing utilities
from sklearn.model_selection import train_test_split  # data splitting
from sklearn.model_selection import GridSearchCV  # grid search
# feature selection
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor  # DT
from sklearn.ensemble import AdaBoostRegressor  # Ada
from sklearn.ensemble import RandomForestRegressor  # RF
from sklearn.ensemble import GradientBoostingRegressor  # GBDT
from lightgbm import LGBMRegressor  # LGBM
from xgboost.sklearn import XGBRegressor  # XGB
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')
def obtainFormatData():
    # note: load_boston requires scikit-learn < 1.2 (removed in 1.2)
    dataset = datasets.load_boston()
    featuresMatrix, y = dataset.data, dataset.target
    print('======================================================')
    print('nums = {} features = {}'.format(featuresMatrix.shape[0],
                                           featuresMatrix.shape[1]))
    print('======================================================')
    # min-max scaling: X = (X - min(X)) / (max(X) - min(X))
    x = preprocessing.MinMaxScaler().fit_transform(featuresMatrix)
    return x, y
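# load_boston was removed in scikit-learn 1.2. As an alternative, a minimal
# loader sketch using the California housing data as a stand-in (an assumed
# substitute, not part of the original article):
def obtainCaliforniaData():
    from sklearn.datasets import fetch_california_housing
    featuresMatrix, y = fetch_california_housing(return_X_y=True)
    x = preprocessing.MinMaxScaler().fit_transform(featuresMatrix)  # same min-max scaling
    return x, y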
# feature selection
def lassoFS(x, y):
    print('FS based on L1:')
    for a in [0.1, 0.2, 0.3]:
        clf = Lasso(alpha=a, random_state=1)
        clf.fit(x, y)
        # indices of features whose L1-penalised coefficients are non-zero
        index = list(np.where(clf.coef_ != 0)[0])
        print('A = {} FSI = {}'.format(a, index))
def ridgeFS(x, y):
    print('FS based on L2:')
    for a in [0.1, 0.5, 0.8]:
        clf = Ridge(alpha=a, random_state=1)
        clf.fit(x, y)
        # ridge shrinks coefficients but rarely zeroes them out,
        # so a 0.1 threshold is used instead of an exact-zero test
        index = list(np.where(clf.coef_ >= 0.1)[0])
        print('A = {} FSI = {}'.format(a, index))
def treeModelFS(x, y):
    # criterion='squared_error' replaces the 'mse' alias deprecated in scikit-learn 1.0
    clf = ExtraTreesRegressor(n_estimators=10, criterion='squared_error',
                              max_depth=None, random_state=1)
    clf.fit(x, y)
    scores = pd.Series(clf.feature_importances_).sort_values(ascending=False)
    scores.plot.bar(rot=0, figsize=(8, 3), title='importance of features based on tree')
    plt.show()
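# The three selectors above are defined but never called in the original script.
# A small driver, added here as a usage sketch; invoke it after obtainFormatData():
def runFeatureSelection(x, y):
    lassoFS(x, y)      # L1: keep features with non-zero coefficients
    ridgeFS(x, y)      # L2: keep features with coefficients >= 0.1
    treeModelFS(x, y)  # tree-based importances, plotted as a bar chart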
# hyperparameter tuning
def SVRRegressionALG(x_train, x_test, y_train, y_test):
    params = {'C': [0.01, 0.1, 1.0, 10],
              'kernel': ['linear', 'rbf', 'sigmoid'],
              'epsilon': [0.01, 0.05, 0.1]}
    clf = GridSearchCV(estimator=SVR(), param_grid=params)
    clf.fit(x_train, y_train)
    best = clf.best_params_
    print('SVM:', best)
    m = SVR(C=best['C'], kernel=best['kernel'], epsilon=best['epsilon'])
    m.fit(x_train, y_train)
    predict_train = m.predict(x_train)
    predict_test = m.predict(x_test)
    train_r2 = round(r2_score(y_train, predict_train), 4)
    test_r2 = round(r2_score(y_test, predict_test), 4)
    print('R2: Train = {} Test = {}'.format(train_r2, test_r2))
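# Note: GridSearchCV defaults to refit=True, which refits the best parameter set
# on the full training data, so the manual re-fit above can be shortened via
# best_estimator_. An equivalent sketch:
def SVRViaBestEstimator(x_train, x_test, y_train, y_test):
    params = {'C': [0.01, 0.1, 1.0, 10],
              'kernel': ['linear', 'rbf', 'sigmoid'],
              'epsilon': [0.01, 0.05, 0.1]}
    clf = GridSearchCV(estimator=SVR(), param_grid=params)
    clf.fit(x_train, y_train)
    m = clf.best_estimator_  # already refit on the whole training split
    print('R2: Train = {} Test = {}'.format(
        round(r2_score(y_train, m.predict(x_train)), 4),
        round(r2_score(y_test, m.predict(x_test)), 4)))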
# tree-based models
def TreeModelRegressionALG(x_train, x_test, y_train, y_test):
    # 'mse'/'ls' were renamed 'squared_error' in scikit-learn 1.0, XGBoost's
    # 'reg:linear' objective is now 'reg:squarederror', and max_features=1.0
    # replaces the deprecated 'auto' (all features) for RandomForestRegressor.
    models = {'DT': DecisionTreeRegressor(criterion='squared_error', splitter='best',
                                          max_depth=None, min_samples_split=2,
                                          max_features=None, max_leaf_nodes=None,
                                          random_state=1),
              'Ada': AdaBoostRegressor(DecisionTreeRegressor(random_state=1),
                                       n_estimators=50, learning_rate=0.1),
              'RF': RandomForestRegressor(n_estimators=50, criterion='squared_error',
                                          max_depth=None, max_features=1.0,
                                          min_samples_split=2, min_samples_leaf=1,
                                          random_state=1),
              'GBDT': GradientBoostingRegressor(loss='squared_error', n_estimators=100,
                                                learning_rate=0.1, subsample=0.9,
                                                max_features=None, min_samples_split=2,
                                                min_samples_leaf=1, random_state=1),
              'LGBM': LGBMRegressor(boosting_type='gbdt', num_leaves=30,
                                    n_estimators=100, learning_rate=0.1,
                                    objective='regression', min_child_samples=20,
                                    max_depth=-1),
              'XGB': XGBRegressor(max_depth=3, learning_rate=0.1,
                                  n_estimators=100, booster='gbtree',
                                  subsample=1, objective='reg:squarederror',
                                  reg_alpha=0, reg_lambda=1)
              }
    print('======================================================')
    for model in models:
        clf = models[model]
        clf.fit(x_train, y_train)
        predict_train = clf.predict(x_train)
        predict_test = clf.predict(x_test)
        train_r2 = round(r2_score(y_train, predict_train), 4)
        test_r2 = round(r2_score(y_test, predict_test), 4)
        print('model: {}\nR2: Train = {} Test = {}'.format(model, train_r2, test_r2))
        print('======================================================')
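# A single train/test split gives one noisy R2 estimate per model; k-fold
# cross-validation averages over k splits for a steadier comparison. A sketch
# (a helper not in the original), e.g.
# crossValidateModel(RandomForestRegressor(n_estimators=50, random_state=1), x, y):
def crossValidateModel(model, x, y, k=5):
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(model, x, y, cv=k, scoring='r2')
    print('CV R2: mean = {} std = {}'.format(round(scores.mean(), 4),
                                             round(scores.std(), 4)))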
x, y = obtainFormatData()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=1)
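# Caveat: obtainFormatData() fits the MinMaxScaler on the full dataset before
# splitting, so test-set statistics leak into the scaling. A leak-free variant,
# sketched here but not wired into the flow above:
def obtainLeakFreeSplit(test_size=0.2, seed=1):
    raw_x, raw_y = datasets.load_boston(return_X_y=True)  # scikit-learn < 1.2
    xtr, xte, ytr, yte = train_test_split(raw_x, raw_y, test_size=test_size,
                                          random_state=seed)
    scaler = preprocessing.MinMaxScaler().fit(xtr)  # fit on training rows only
    return scaler.transform(xtr), scaler.transform(xte), ytr, yte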
SVRRegressionALG(x_train, x_test, y_train, y_test)
TreeModelRegressionALG(x_train, x_test, y_train, y_test)