Kaggle | Titanic: Machine Learning from Disaster
2015-08-16 15:54
465 查看
Titanic题目链接
下面解释一下训练集之中的数据:
PassengerId 旅客ID
Survived 是否活下来了,1:yes 0:no
Pclass 旅客等级
Name 名字
Sex 性别
Age 年龄
SibSp 有多少兄弟姐妹/配偶同船 Number of Siblings/Spouses Aboard
Parch 有多少父母/子女同船 Number of Parents/Children Aboard
Ticket 船票号码?
Fare 船票收费
Cabin 所在小屋
Embarked 登船港口 Port of Embarkation:C = Cherbourg,Q = Queenstown,S = Southampton
R语言代码
有几点要注意的:
1)为了提高预测准确率,创建了一些新的特征(feature)
2)train 和 test 最好合并在一起处理,这样就不会出现 train 与 test 的 factor level 不一致、导致 random forest 无法运行的问题
3)cforest感觉比randomForest好使,变量中有NA也不会报错,而且支持的factor level更多
下面解释一下训练集之中的数据:
PassengerId 旅客ID
Survived 是否活下来了,1:yes 0:no
Pclass 旅客等级
Name 名字
Sex 性别
Age 年龄
SibSp 有多少兄弟姐妹/配偶同船 Number of Siblings/Spouses Aboard
Parch 有多少父母/子女同船 Number of Parents/Children Aboard
Ticket 船票号码?
Fare 船票收费
Cabin 所在小屋
Embarked 登船港口 Port of Embarkation:C = Cherbourg,Q = Queenstown,S = Southampton
R语言代码
有几点要注意的:
1)为了提高预测准确率,创建了一些新的特征(feature)
2)train 和 test 最好合并在一起处理,这样就不会出现 train 与 test 的 factor level 不一致、导致 random forest 无法运行的问题
3)cforest感觉比randomForest好使,变量中有NA也不会报错,而且支持的factor level更多
library(ggplot2)
library(party)
library(rpart)

# Build the modelling feature table from raw Titanic passenger records.
#
# Args:
#   data: data frame holding the Kaggle Titanic columns Pclass, Name, Sex,
#         Age, SibSp, Parch, Fare, Cabin and Embarked, plus a Cat column
#         ("train"/"test") added by the caller.
#
# Returns:
#   A data frame with one row per input row containing Pclass, Age, SibSp,
#   Parch, Fare, Cat and the engineered columns Title, Gender, Mother, Child,
#   Port_C, Port_Q, Port_S, Surname, FamilySize, Family_id, Family_id2, Deck
#   and CabinPos. Missing Age values and missing/zero Fare values are imputed
#   with rpart regression trees.
#
# Side effects: prints the Title summary, the class of CabinPos and the
# resulting column names. Uses kmeans(), so call set.seed() beforehand for
# reproducible CabinPos values.
extractFeature <- function(data) {
  features <- c("Pclass", "Age", "SibSp", "Parch", "Fare", "Cat")
  fea <- data[, features]

  # Names are split on "," and "." : piece 1 is the surname, piece 2 the title.
  name_parts <- strsplit(as.character(data$Name), "[.,]")

  fea$Title <- vapply(name_parts, function(p) p[2], character(1))
  fea$Title <- gsub(" ", "", fea$Title)
  # Collapse rare titles into broader groups (spaces are already stripped,
  # so "the Countess" appears here as "theCountess").
  fea$Title[fea$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
  fea$Title[fea$Title %in% c("Dona", "Lady", "theCountess", "Jonkheer")] <-
    "Lady"
  fea$Title[fea$Title %in% c("Mme", "Ms")] <- "Mrs"
  fea$Title[fea$Title %in% c("Mlle")] <- "Miss"
  fea$Title <- as.factor(fea$Title)
  print(summary(fea$Title))

  # Impute missing ages from the title.
  age_missing <- is.na(fea$Age)
  if (any(age_missing)) {
    predict_age <- rpart(Age ~ Title, data = fea[!age_missing, ],
                         method = "anova")
    fea$Age[age_missing] <- predict(predict_age, fea[age_missing, ])
  }

  # Impute missing or zero fares from the passenger class.
  fare_missing <- is.na(fea$Fare) | fea$Fare == 0
  if (any(fare_missing)) {
    predict_fare <- rpart(Fare ~ Pclass, data = fea[!fare_missing, ],
                          method = "anova")
    fea$Fare[fare_missing] <- predict(predict_fare, fea[fare_missing, ])
  }

  # Gender: 1 for "male", 0 otherwise.
  fea$Gender <- as.numeric(as.character(data$Sex) == "male")

  fea$Mother <- 0
  is_mother <- fea$Gender == 0 & fea$Parch > 0 & fea$Age > 18 &
    fea$Title != "Miss"
  fea$Mother[which(is_mother)] <- 1

  # Fix: the original compared Parch (not Age) against 18, which flagged
  # every passenger travelling with parents/children as a child.
  fea$Child <- 0
  fea$Child[which(fea$Parch > 0 & fea$Age <= 18)] <- 1

  # Fix: fill unknown ports with the most frequent *port name*. The original
  # assigned the integer index returned by which.max() and counted the blank
  # value in the frequency table.
  embarked <- as.character(data$Embarked)
  port_known <- embarked %in% c("S", "C", "Q")
  if (any(port_known) && !all(port_known)) {
    embarked[!port_known] <- names(which.max(table(embarked[port_known])))
  }
  fea$Port_C <- as.numeric(embarked == "C")
  fea$Port_Q <- as.numeric(embarked == "Q")
  fea$Port_S <- as.numeric(embarked == "S")

  fea$Surname <- vapply(name_parts, function(p) p[1], character(1))

  # Fix: FamilySize was referenced but never created, so the family id
  # silently degenerated to the bare surname.
  fea$FamilySize <- fea$SibSp + fea$Parch + 1
  family_id <- paste0(fea$FamilySize, fea$Surname)
  fea$Family_id <- as.factor(family_id)

  # Family ids that occur at most twice are pooled into one "small" level.
  id_counts <- table(family_id)
  small_ids <- names(id_counts)[id_counts <= 2]
  family_id[family_id %in% small_ids] <- "small"
  fea$Family_id2 <- as.factor(family_id)

  # Deck is the first character of the cabin string; blank/NA -> "EMPTY".
  cabin <- as.character(data$Cabin)
  fea$Deck <- substr(cabin, 1, 1)
  fea$Deck[is.na(fea$Deck) | fea$Deck == ""] <- "EMPTY"
  fea$Deck <- as.factor(fea$Deck)

  # Fix: take the first full run of digits as the cabin number. The original
  # used only the second character of the string ("C123" -> 1).
  has_number <- grepl("[0-9]", cabin)
  cabin_num <- rep(NA_real_, length(cabin))
  cabin_num[has_number] <- as.numeric(
    sub("^[^0-9]*([0-9]+).*$", "\\1", cabin[has_number])
  )

  # Cluster the known cabin numbers into three groups. kmeans() numbers its
  # clusters arbitrarily, so relabel them by increasing cluster centre to
  # make "Front"/"Middle"/"End" follow the cabin-number order.
  num_known <- !is.na(cabin_num)
  cabin_clusters <- kmeans(cabin_num[num_known], 3)
  centre_rank <- rank(cabin_clusters$centers[, 1], ties.method = "first")
  fea$CabinPos <- NA_integer_
  fea$CabinPos[num_known] <-
    as.integer(unname(centre_rank[cabin_clusters$cluster]))
  print(class(fea$CabinPos))
  fea$CabinPos <- factor(fea$CabinPos, levels = 1:3,
                         labels = c("Front", "Middle", "End"))

  print(names(fea))
  fea
}

# ---- Driver script ----

set.seed(1)

train <- read.csv("input/train.csv", header = TRUE)
test <- read.csv("input/test.csv", header = TRUE)
train$Cat <- "train"
test$Cat <- "test"
test$Survived <- NA

# Train and test are processed together so factor levels agree between them.
full <- rbind(train, test)
full <- extractFeature(full)
feaTrain <- full[full$Cat == "train", ]
feaTest <- full[full$Cat == "test", ]

# rf <- randomForest(feaTrain, as.factor(train$Survived), ntree = 100,
#                    importance = TRUE)
feaTrain <- data.frame(Survived = as.factor(train$Survived), feaTrain)

# The response is written as feaTrain$Survived, exactly as in the original
# formula; feaTest carries no Survived column.
rf <- cforest(
  feaTrain$Survived ~ Pclass + Age + SibSp + Parch + Fare + Gender + Title +
    Mother + Child + Port_S + Port_Q + Port_C + Family_id2 + CabinPos + Deck,
  data = feaTrain,
  controls = cforest_unbiased(ntree = 200, mtry = 3)
)

# The original ran an extra predict(..., OOB = TRUE) whose result was
# overwritten immediately by this call; only this one is kept.
pre <- predict(rf, newdata = feaTest)
out <- data.frame(test$PassengerId, pre)
names(out) <- c("PassengerId", "Survived")
write.csv(out, file = "Survived.csv", row.names = FALSE)

# Accuracy on the training rows themselves (in-sample, so optimistic).
pre_train <- predict(rf, newdata = feaTrain)
out_train <- data.frame(train$Survived, pre_train)
names(out_train) <- c("Survived", "pre")
print(sum(out_train$Survived == out_train$pre) / length(out_train$Survived))
相关文章推荐
- 8,16小感
- 排序算法(八)——基数排序
- 结构体对齐详解
- [Java][activiti]Activiti这个过程委托功能[getAssignee()与getOwner()差异]
- 软件开发人员角色定位和分工
- this指针
- ubuntu下Pycharm安装及配置
- HDU 5380 Travel with candy 单调队列
- 正确的加载xib文件的View或者ViewController
- Pow(x, n)
- 2015/8/16
- 制作翻转效果动画
- 计算几何基础(模板)
- 固定/etc/resolve.conf的方法
- Hbase 协处理器(一)之RowCount代码
- HTML5表单
- 图论基础概念
- 【LeetCode】102 - Binary Tree Level Order Traversal
- 计算机网络---三次握手,四次释放
- Cocos2dx创建碰撞刚体