项目1:logit,GBM,knn,xgboost准确率测试
2016-02-16 17:38
501 查看
logit,GBM,knn,xgboost准确率测试
junjun
2016年2月16日
参考:http://chiffon.gitcafe.io/2015/05/20/newtry.html#topofpage 。数据来自UCI:http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening ,是一个信用卡的数据,具体各项变量名以及变量名代表的含义不明(应该是出于保护隐私的目的)。本文会用logit,GBM(Gradient Boosting Machines),knn,xgboost来对数据进行分类预测,对比准确率。预计的准确率应该是: xgboost > GBM > logit > knn
1、加载数据
# Read the credit-screening data; "?" marks missing values in this file.
dataset <- read.table("F:\\R\\Rworkspace\\信用卡数据/crx.data",
                      header = FALSE, sep = ",", na.strings = "?")
head(dataset)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 ## 1 b 30.83 0.000 u g w v 1.25 t t 1 f g 202 0 + ## 2 a 58.67 4.460 u g q h 3.04 t t 6 f g 43 560 + ## 3 a 24.50 0.500 u g q h 1.50 t f 0 f g 280 824 + ## 4 b 27.83 1.540 u g w v 3.75 t t 5 t g 100 3 + ## 5 b 20.17 5.625 u g w v 1.71 t f 0 f s 120 0 + ## 6 b 32.08 4.000 u g m v 2.50 t f 0 t g 360 0 +
# The data looks as above; next, count the missing values per column and
# check each column's type. vapply pins the return type to integer, unlike
# sapply whose result type depends on its input.
vapply(dataset, function(col) sum(is.na(col)), integer(1))
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 ## 12 12 0 6 6 9 9 0 0 0 0 0 0 13 0 0
# Report the class of every column (factor / numeric / integer).
sapply(dataset, class)
## V1 V2 V3 V4 V5 V6 V7 ## "factor" "numeric" "numeric" "factor" "factor" "factor" "factor" ## V8 V9 V10 V11 V12 V13 V14 ## "numeric" "factor" "factor" "integer" "factor" "factor" "integer" ## V15 V16 ## "integer" "factor"
2、创建训练集和测试集数据
# Split the data into a 70% training set and a 30% test set
# (the text above quotes set.seed(123) for reproducibility).
dim(dataset)
## [1] 690 16
# na.omit returns the object with every NA-containing row removed;
# check how many rows survive.
dataset <- na.omit(dataset)
dim(dataset)
## [1] 653 16
# Draw a 70% random sample of row indices for the training set. The text
# above says set.seed(123) is used, but the original code omitted it, so
# the reported split was not reproducible — fix the seed here.
set.seed(123)
index <- sample(nrow(dataset), round(0.7 * nrow(dataset)))
train <- dataset[index, ]
test <- dataset[-index, ]
dim(train)
## [1] 457 16
# Dimensions of the held-out test set.
dim(test)
## [1] 196 16
3、将因子类型转化为哑元变量
有时候,因子型数据需要转化变量为哑变量,因为在一些挖掘场合,数据不能直接使用因子型的数据,如:knn • glmnet • svm • xgboost。有些挖掘方法是可以使用因子变量的,比如:• logistic regression • rpart • GBM • randomforest
dataset2 <- dataset
library(plyr)
# into_factor: expand a factor column into dummy (one-hot) indicator
# columns via model.matrix; non-factor columns pass through unchanged.
# Uses inherits() instead of class(x) == "factor" and seq_len() instead
# of 1:n (safe when n == 0).
into_factor <- function(x) {
  if (inherits(x, "factor")) {
    n <- length(x)
    data.fac <- data.frame(x = x, y = seq_len(n))
    # Drop the intercept column so only the level indicators remain.
    output <- model.matrix(y ~ x, data.fac)[, -1]
  } else {
    output <- x
  }
  output
}
into_factor(dataset$V4)[1:5, ]
## xu xy ## 1 1 0 ## 2 1 0 ## 3 1 0 ## 4 1 0 ## 5 1 0
# Apply into_factor column-wise. NOTE: all NA rows must be removed first,
# otherwise plyr fails with "not all length(rows) == 1 are TRUE".
dataset2 <- colwise(into_factor)(dataset2)
# The result is a list of vectors/matrices; bind them back into a single
# data frame.
dataset2 <- as.data.frame(do.call(cbind, dataset2))
head(dataset2)
## V1 V2 V3 xu xy xgg xp xc xcc xd xe xff xi xj xk xm xq xr xw xx xdd ## 1 1 30.83 0.000 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 ## 2 0 58.67 4.460 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 ## 3 0 24.50 0.500 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 ## 4 1 27.83 1.540 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 ## 5 1 20.17 5.625 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 ## 6 1 32.08 4.000 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 ## xff xh xj xn xo xv xz V8 V9 V10 V11 V12 xp xs V14 V15 V16 ## 1 0 0 0 0 0 1 0 1.25 1 1 1 0 0 0 202 0 1 ## 2 0 1 0 0 0 0 0 3.04 1 1 6 0 0 0 43 560 1 ## 3 0 1 0 0 0 0 0 1.50 1 0 0 0 0 0 280 824 1 ## 4 0 0 0 0 0 1 0 3.75 1 1 5 1 0 0 100 3 1 ## 5 0 0 0 0 0 1 0 1.71 1 0 0 0 0 1 120 0 1 ## 6 0 0 0 0 0 1 0 2.50 1 0 0 1 0 0 360 0 1
# Dummy encoding expanded the 16 original columns to 38.
dim(dataset2)
## [1] 653 38
4、logistic 回归
使用logistic回归来进行测试建模和预测,使用的函数是glm
# 1) Fit a logistic regression on all predictors.
logit.model <- glm(V16 ~ ., data = train, family = "binomial")
# 3) Simplify the model by stepwise AIC selection.
logit.model <- step(logit.model)
## Start: AIC=307.61 ## V16 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + V11 + ## V12 + V13 + V14 + V15 ## ## ## Step: AIC=307.57 ## V16 ~ V1 + V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V12 + ## V13 + V14 + V15 ## ## Df Deviance AIC ## - V13 2 235.53 303.53 ## - V1 2 235.78 303.78 ## - V8 1 236.03 306.03 ## - V7 8 250.45 306.45 ## - V12 1 237.02 307.02 ## - V4 2 239.02 307.02 ## - V2 1 237.29 307.29 ## <none> 235.57 307.57 ## - V10 1 237.69 307.69 ## - V3 1 237.91 307.91 ## - V11 1 239.35 309.35 ## - V14 1 239.88 309.88 ## - V6 14 270.37 314.37 ## - V15 2 249.63 317.63 ## - V9 2 371.49 439.49 ## ## Step: AIC=303.53 ## V16 ~ V1 + V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V12 + ## V14 + V15 ## ## Df Deviance AIC ## - V12 1 235.59 301.59 ## - V1 1 235.79 301.79 ## - V8 1 235.94 301.94 ## - V7 8 250.28 302.28 ## - V4 2 239.11 303.11 ## - V2 1 237.29 303.29 ## - V10 1 237.43 303.43 ## <none> 235.53 303.53 ## - V3 1 237.93 303.93 ## - V11 1 239.02 305.02 ## - V14 1 239.99 305.99 ## - V6 13 270.50 312.50 ## - V15 1 249.74 315.74 ## - V9 1 371.58 437.58 ## ## Step: AIC=301.59 ## V16 ~ V1 + V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V14 + ## V15 ## ## Df Deviance AIC ## - V1 1 235.87 299.87 ## - V8 1 236.06 300.06 ## - V7 8 250.51 300.51 ## - V4 2 239.13 301.13 ## - V2 1 237.40 301.40 ## - V10 1 237.54 301.54 ## <none> 235.59 301.59 ## - V3 1 237.96 301.96 ## - V11 1 239.08 303.08 ## - V14 1 240.05 304.05 ## - V6 13 270.53 310.53 ## - V15 1 249.75 313.75 ## - V9 1 371.58 435.58 ## ## Step: AIC=299.87 ## V16 ~ V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + V11 + V14 + V15 ## ## Df Deviance AIC ## - V8 1 236.33 298.33 ## - V7 8 250.56 298.56 ## - V4 2 239.56 299.56 ## - V2 1 237.56 299.56 ## - V10 1 237.81 299.81 ## <none> 235.87 299.87 ## - V3 1 238.27 300.27 ## - V11 1 239.36 301.36 ## - V14 1 240.38 302.38 ## - V6 13 270.53 308.53 ## - V15 1 250.02 312.02 ## - V9 1 371.87 433.87 ## ## Step: AIC=298.33 ## V16 ~ V2 + V3 + V4 + V6 + V7 + V9 + V10 + V11 + V14 + 
V15 ## ## Df Deviance AIC ## - V7 8 250.84 296.84 ## - V4 2 239.79 297.79 ## - V10 1 238.29 298.29 ## <none> 236.33 298.33 ## - V3 1 238.61 298.61 ## - V2 1 238.70 298.70 ## - V11 1 240.12 300.12 ## - V14 1 240.72 300.72 ## - V6 13 271.43 307.43 ## - V15 1 250.16 310.16 ## - V9 1 380.26 440.26 ## ## Step: AIC=296.84 ## V16 ~ V2 + V3 + V4 + V6 + V9 + V10 + V11 + V14 + V15 ## ## Df Deviance AIC ## - V2 1 252.14 296.14 ## - V4 2 254.33 296.33 ## - V10 1 252.49 296.49 ## <none> 250.84 296.84 ## - V3 1 254.44 298.44 ## - V14 1 254.54 298.54 ## - V11 1 254.57 298.57 ## - V15 1 261.39 305.39 ## - V6 13 289.21 309.21 ## - V9 1 394.85 438.85 ## ## Step: AIC=296.14 ## V16 ~ V3 + V4 + V6 + V9 + V10 + V11 + V14 + V15 ## ## Df Deviance AIC ## - V10 1 253.58 295.58 ## - V4 2 256.00 296.00 ## <none> 252.14 296.14 ## - V3 1 255.26 297.26 ## - V14 1 255.43 297.43 ## - V11 1 256.66 298.66 ## - V15 1 263.54 305.54 ## - V6 13 289.22 307.22 ## - V9 1 402.78 444.78 ## ## Step: AIC=295.58 ## V16 ~ V3 + V4 + V6 + V9 + V11 + V14 + V15 ## ## Df Deviance AIC ## <none> 253.58 295.58 ## - V4 2 257.68 295.68 ## - V3 1 256.82 296.82 ## - V14 1 257.00 297.00 ## - V15 1 266.01 306.01 ## - V11 1 267.64 307.64 ## - V6 13 292.03 308.03 ## - V9 1 409.96 449.96
# 2) Predict class probabilities on the test set and threshold at 0.5.
logit.predict <- predict(logit.model, test, type = "response")
logit.pred <- ifelse(logit.predict > 0.5, "+", "-")
# Confusion matrix: predictions vs. true labels.
table(logit.pred, test$V16)
## ## logit.pred - + ## - 91 13 ## + 19 73
# Overall accuracy of the logistic model on the test set.
mean(logit.pred==test$V16)
## [1] 0.8367347
5、GBM:Gradient Boosting Machines
使用GBM方法来进行预测,这里用的是caret,repeat-cv来选择最优树
# 1) Fit the ensemble model with caret.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Repeated 5-fold cross-validation (5 repeats), optimising the Kappa metric.
ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 5)
set.seed(300)
model_gbm <- train(V16 ~ ., data = train, method = "gbm",
                   metric = "Kappa", trControl = ctrl)
## Loading required package: gbm
## Loading required package: survival
## ## Attaching package: 'survival'
## The following object is masked from 'package:caret': ## ## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Iter TrainDeviance ValidDeviance StepSize Improve ## 1 1.2753 nan 0.1000 0.0480 ## 2 1.1983 nan 0.1000 0.0392 。。。 ## 100 0.3115 nan 0.1000 -0.0018
# 2) Predict on the test set.
pred_gbm <- predict(model_gbm, test)
# 3) Evaluate with a confusion matrix.
table(pred_gbm, test$V16)
## ## pred_gbm - + ## - 90 8 ## + 20 78
# Overall accuracy of the GBM model on the test set.
mean(pred_gbm==test$V16)
## [1] 0.8571429
6、knn method for classification
首先测试一个knn模型,不做CV,不做标准化,不做数据类型转换得到的结果,这里,不转换数据类型会把因子类型的变量舍弃,仅保留数值变量
## knn without standardisation
# 1) Fit a 5-nearest-neighbour model.
library(caret)
model_knn <- knn3(V16 ~ ., data = train, k = 5)
# 2) Predict class probabilities. The original passed class="response",
# which predict.knn3 does not recognise (its argument is `type`, values
# "prob"/"class"); it silently fell back to the default probabilities, so
# make type = "prob" explicit.
pred_knn <- predict(model_knn, test, type = "prob")
# str(pred_knn)
# Column 1 holds P(class "-"); a value below 0.5 means predict "+".
pred_knn1 <- ifelse(pred_knn[, 1] < 0.5, "+", "-")
table(pred_knn1, test$V16)
## ## pred_knn1 - + ## - 78 31 ## + 32 55
# Accuracy of knn without standardisation.
mean(pred_knn1==test$V16)
## [1] 0.6785714
## knn with standardised predictors
# 1) Scale every predictor column; keep the response V16 as a factor.
dataset2 <- cbind(colwise(scale)(dataset2[, -38]),
                  V16 = as.factor(dataset2$V16))
# 2) Rebuild stratified 70/30 train/test sets.
set.seed(123)
library(caret)
index <- createDataPartition(dataset2$V16, p = 0.7, list = FALSE)
train_scale <- dataset2[index, ]
test_scale <- dataset2[-index, ]
# 3) Fit knn (k = 5), then 4) predict hard class labels.
model_scale <- knn3(V16 ~ ., data = train_scale, k = 5)
pred_scale <- predict(model_scale, test_scale, type = "class")
table(pred_scale, test_scale$V16)
## ## pred_scale 0 1 ## 0 85 34 ## 1 22 54
# Accuracy of knn on standardised data.
mean(pred_scale==test_scale$V16)
## [1] 0.7128205
## K-fold cross-validation for knn
# 1) knn_cv: estimate the accuracy of knn3 with k neighbours by n-fold CV.
#    Fixes two defects in the original:
#    - `train <- data[-ind, ]` negated a LOGICAL fold mask; -TRUE/-FALSE is
#      -1/0, so it only ever dropped row 1 instead of the test fold. Use
#      `!mask` / `mask` to split the folds correctly.
#    - the fold-count argument `n` was ignored (folds were hard-coded to 5),
#      so knn_cv(train_cv, 3, k) silently ran 5 folds. Honour `n`.
#    Also preallocates `acc` instead of growing it.
library(class)
library(caret)
knn_cv <- function(data, n = 5, k) {
  fold <- sample(seq_len(n), nrow(data), replace = TRUE)
  acc <- numeric(n)   # one accuracy per fold
  for (i in seq_len(n)) {
    in_test <- fold == i
    train_fold <- data[!in_test, ]
    test_fold <- data[in_test, ]
    model_knn <- knn3(V16 ~ ., data = train_fold, k = k)
    pred <- predict(model_knn, test_fold, type = "class")
    acc[i] <- mean(pred == test_fold$V16)
  }
  mean(acc)
}
# 2) Rebuild the stratified train/test split.
set.seed(123)
library(caret)
index <- createDataPartition(dataset2$V16, p = 0.7, list = FALSE)
train_cv <- dataset2[index, ]
test_cv <- dataset2[-index, ]
# 3) Sanity-check: 3-fold CV accuracy with k = 5 neighbours.
knn_cv(train_cv, 3, 5)
## [1] 0.8730277
# Sweep k from 2 to 20. The original called knn_cv twice per iteration
# (once for acc[i], once inside print), so the printed accuracy came from
# a DIFFERENT random fold split than the stored value. Evaluate once and
# reuse it; also preallocate acc (slot 1 stays 0, as before).
acc <- numeric(20)
for (i in 2:20) {
  acc[i] <- knn_cv(train_cv, 3, i)
  print(paste(i, acc[i], sep = "——》"))
}
## [1] "2——》0.890464698699993" ## [1] "3——》0.909093242809613" ## [1] "4——》0.883418367346939" ## [1] "5——》0.872213395351693" ## [1] "6——》0.867066796941293" ## [1] "7——》0.854495087053878" ## [1] "8——》0.848111602148951" ## [1] "9——》0.842036790410966" ## [1] "10——》0.840109115714773" ## [1] "11——》0.831505296371422" ## [1] "12——》0.833180115999597" ## [1] "13——》0.817368273676231" ## [1] "14——》0.834854658514452" ## [1] "15——》0.827973456326398" ## [1] "16——》0.827506897124673" ## [1] "17——》0.833968115218115" ## [1] "18——》0.832558368162111" ## [1] "19——》0.835863665139981" ## [1] "20——》0.832136554676483"
# The loop above is equivalent to:
# k <- 1:20
# set.seed(123)
# acc <- sapply(k, function(x) knn_cv(train_cv, 3, x))
# 4) Plot accuracy against k to locate the best neighbourhood size.
plot(1:20, acc, type="b")
# Pick the k with the highest cross-validated accuracy.
k.final <- which.max(acc) k.final
## [1] 3
# Refit on the full training set with the best k, then 5) predict.
model <- knn3(V16 ~ ., data = train_cv, k = k.final)
pred <- predict(model, test_cv, type = "class")
table(pred, test_cv$V16)
## ## pred 0 1 ## 0 89 37 ## 1 18 51
# Accuracy of the tuned knn model.
mean(pred==test_cv$V16)
## [1] 0.7179487
7、xgboost
# library() errors immediately when xgboost is missing; require() only
# returns FALSE, which would let later code fail with a confusing message.
library(xgboost)
## Loading required package: xgboost
require(methods)
require(plyr)
set.seed(123)
# 70/30 random split of the dummy-encoded data.
n <- nrow(dataset2)
index <- sample(n, round(0.7 * n))
train.xg <- dataset2[index, ]
test.xg <- dataset2[-index, ]
# xgboost wants plain numeric matrices: column 38 is the label, the rest
# are features. drop = FALSE keeps single columns as matrices.
label <- as.matrix(train.xg[, 38, drop = FALSE])
data <- as.matrix(train.xg[, -38, drop = FALSE])
data2 <- as.matrix(test.xg[, -38, drop = FALSE])
label2 <- as.matrix(test.xg[, 38, drop = FALSE])
xgmat <- xgb.DMatrix(data, label = label, missing = -10000)
# Shallow trees (depth 2), learning rate 1, logloss evaluation.
param <- list("objective" = "binary:logistic",
              "bst:eta" = 1,
              "bst:max_depth" = 2,
              "eval_metric" = "logloss",
              "silent" = 1,
              "nthread" = 16,
              "min_child_weight" = 1.45)
nround <- 275
bst <- xgb.train(param, xgmat, nround)
# Threshold predicted probabilities at 0.5 and tabulate against the truth.
res1 <- predict(bst, data2)
pre1 <- ifelse(res1 > 0.5, 1, 0)
table(pre1, label2)
## label2 ## pre1 0 1 ## 0 91 15 ## 1 12 78
# NOTE(review): identical to the confusion matrix printed just above —
# looks like an accidental duplicate in the write-up.
table(pre1,label2)
## label2 ## pre1 0 1 ## 0 91 15 ## 1 12 78
# Final accuracy of the xgboost model.
mean(pre1 ==label2)
## [1] 0.8622449
相关文章推荐
- 【WinForm】“System.Data.SqlClient.SqlConnection”的类型初始值设定项引发异常,无法识别的配置节 system.serviceModel
- jquery.js和jquery.min.js的区别介绍
- C语言:IF-ELSE的配对问题
- 听说在新的一年里你的证书过期了
- 大型网站架构演化历程
- androidStudio常见错误
- 华为机试题: 整型字符串排序
- 在Mongoose中使用嵌套的populate处理数据
- laravel的elixir和gulp用来对前端施工
- 冒泡排序
- 今天说一下where 中 exists 和 in 里面的一些区别
- Mac os X下使用Nginx跟PHP的连接问题
- android Button按钮实现点击效果代码附带demo下载
- 在线安装rar和unrar命令
- Simon Tatham:如何有效地报告Bug
- 禁止root用户远程登录
- Eclipse: Android Device Chooser - Unknown Target
- [转] 单元测试详解
- 129. Sum Root to Leaf Numbers LeetCode
- java的date类型无法读取 出现错误net.sf.json.JSONException: java.lang.reflect.InvocationTargetException