GBRT/GBDT Algorithm Examples




GBM Example 1:

#1. Construct the dataset
# A least squares regression example: create some data
N<-1000
X1<-runif(N)
X2<-2*runif(N)
X3<-ordered(sample(letters[1:4],N,replace=TRUE),levels=letters[4:1])
X4<-factor(sample(letters[1:6],N,replace=TRUE))
X5<-factor(sample(letters[1:3],N,replace=TRUE))
X6<-3*runif(N)
mu<-c(-1,0,1,2)[as.numeric(X3)]
SNR<-10                         # signal-to-noise ratio
Y<-X1^1.5+2*(X2^.5)+mu
sigma<-sqrt(var(Y)/SNR)
Y<-Y+rnorm(N,0,sigma)

# introduce some missing values
X1[sample(1:N,size=500)]<-NA
X4[sample(1:N,size=300)]<-NA
data<-data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
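Since exactly 500 values of X1 and 300 of X4 are now missing, a quick structural check of the assembled frame is worth doing before modeling (a small sketch, not part of the original example):

str(data)         # 1000 obs. of 7 variables: X3 ordered, X4/X5 unordered factors
summary(data$X1)  # should report 500 NA's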

#2. Fit a model with the gbm function
library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
gbm1 <- gbm(Y~X1+X2+X3+X4+X5+X6,     # formula
            data = data,             # dataset
            var.monotone = c(0, 0, 0, 0, 0, 0), # -1: monotone decrease, +1: monotone increase, 0: no restriction
            distribution = "gaussian", # see the help for other choices
            n.trees = 1000,          # number of trees
            shrinkage = 0.05,        # shrinkage or learning rate; 0.001 to 0.1 usually works
            interaction.depth = 3,   # 1: additive model, 2: two-way interactions, etc.
            bag.fraction = 0.5,      # subsampling fraction; 0.5 is probably best
            train.fraction = 0.5,    # fraction of data used for training; the first train.fraction*N rows
            n.minobsinnode = 10,     # minimum total weight needed in each node
            cv.folds = 3,            # do 3-fold cross-validation
            keep.data = TRUE,        # keep a copy of the dataset with the object
            verbose = FALSE,         # don't print out progress
            n.cores = 1)             # use only a single core (detecting #cores is error-prone, so avoided here)

#3. Use cross-validation to determine the best number of iterations
best.iter <- gbm.perf(gbm1, method = "cv")


print(best.iter)
## [1] 109
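Once the iteration count is chosen, predictions should pass n.trees = best.iter explicitly. A minimal sketch along the lines of the gbm package demo (the error computation is an addition, not output from the original post):

f.predict <- predict(gbm1, data, n.trees = best.iter)  # gbm handles the NAs in X1/X4
print(mean((data$Y - f.predict)^2))                    # mean squared error on the full dataset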
#4. Relative influence of each explanatory variable
summary(gbm1, n.trees=best.iter)


##    var    rel.inf
## X3  X3 67.2320878
## X2  X2 28.7445034
## X1  X1  2.7022822
## X6  X6  0.6273469
## X4  X4  0.5817726
## X5  X5  0.1120072
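Beyond this ranked table, plot.gbm draws partial dependence plots for single variables or pairs, showing how each predictor moves the fitted response (assuming the fit above):

plot(gbm1, i.var = 3, n.trees = best.iter)        # marginal effect of X3, the dominant variable
plot(gbm1, i.var = c(2, 3), n.trees = best.iter)  # joint effect of X2 and X3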

Example 2: see the GBM section of "logit, GBM, knn, xgboost applied to a credit-card dataset":

http://chiffon.gitcafe.io/2015/05/20/newtry.html#topofpage

## GBM algorithm

#1. Build the model with the ensemble algorithm

library(caret)
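The linked post constructs train_data and test_data from the credit-card dataset. To keep this snippet self-contained, here is a hypothetical split; the data frame name credit and the 70/30 proportion are assumptions, not from the original:

# Hypothetical: `credit` is the credit-card data frame from the linked post,
# with the class label ("+"/"-") in column V16
idx <- createDataPartition(credit$V16, p = 0.7, list = FALSE)
train_data <- credit[idx, ]
test_data  <- credit[-idx, ]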

ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 5)

set.seed(300)

model_gbm <- train(V16 ~ ., data = train_data, method = "gbm", metric = "Kappa", trControl = ctrl)
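By default caret tries a small built-in grid over gbm's four tuning parameters; a custom grid can be supplied via tuneGrid instead. A sketch with illustrative values (the grid itself is an assumption, not from the original):

# The four tunable gbm parameters under caret's method = "gbm"
grid <- expand.grid(n.trees = c(100, 200, 300),
                    interaction.depth = c(1, 2, 3),
                    shrinkage = 0.05,
                    n.minobsinnode = 10)
model_gbm2 <- train(V16 ~ ., data = train_data, method = "gbm",
                    metric = "Kappa", trControl = ctrl,
                    tuneGrid = grid, verbose = FALSE)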

 

#2. Predict on the test set

pred_gbm <- predict(model_gbm, test_data)

 

#3. Evaluate the model

> table(pred_gbm, test_data$V16)
pred_gbm  -  +
       - 91 11
       + 16 77
> mean(pred_gbm == test_data$V16)
[1] 0.8615385
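caret's confusionMatrix wraps the same cross-tabulation and adds accuracy, Kappa, sensitivity, and related statistics in one call (assuming "+" is treated as the positive class):

confusionMatrix(pred_gbm, test_data$V16, positive = "+")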


