您的位置:首页 > 其它

R语言实战分析预测海藻数量

2016-12-06 20:54 471 查看
********************预测海藻数量R语言脚本************************

---加载数据包

library(DMwR)

head(algae)

 

 

---对于数据给出行名称

algae=read.table("Analysis.txt",

header=F,

dec='.',

col.names=c('season','size','speed','mxPH','mno2','cl','no3','nh4','opo4','po4','chla','a1','a2','a3','a4','a5','a6','a7'),

na.strings=c('XXXXXXX'))

 

---绘制PH直方图

hist(algae$mxPH,prob=T)

 

 

---绘制PH直方图加密度图,用QQ图查看数据是否符合正态分布

library(car)

par(mfrow=c(1,2))

hist(algae$mxPH,prob=T,xlab='',main='Histogram of maximum ph value',ylim=0:1)

lines(density(algae$mxPH,na.rm=T))

rug(jitter(algae$mxPH))

qq.plot(algae$mxPH,main='Normal QQ Plot of maximum PH')

par(mfrow=c(1,1))

 

 

---绘制opo4箱线图

boxplot(algae$opo4,ylab="orthophosphate (opo4)")

rug(jitter(algae$opo4),side=2)

abline(h=mean(algae$opo4,na.rm=T),lty=2)

 

 

---离群值的检测,三条线分别表示均值,均值加标准差,中位数

plot(algae$nh4,xlab="")

abline(h=mean(algae$nh4,na.rm=T),lty=1)

abline(h=mean(algae$nh4,na.rm=T)+sd(algae$nh4,na.rm=T),lty=2)

abline(h=median(algae$nh4,na.rm=T),lty=3)

identify(algae$nh4)

 

---离群值的检测

plot(algae$nh4,xlab="")

clicked.lines=identify(algae$nh4)

 

algae[clicked.lines, ]

 

 

algae[algae$nh4.line>19000, ]

 

---因子变量绘制lattice箱线图(在规模较小的河流中,a1的频率较高)

library(lattice)

bwplot(size~a1,data=algae,ylab='Rive Size',xlab='Algal A1')

 

 

---分位箱线图

library(Hmisc)

bwplot(size~a1,data=algae,panel=panel.bpplot,

probs=seq(.01,.49,by=.01),datadensity=TRUE,

yalb='River Size',xlab='Algal A1'

)

 

 

---两个条件的影响绘图

mino2=equal.count(na.omit(algae$mno2),number=5,overlap=1/5)

stripplot(season ~ a3/mino2,data=algae[!is.na(algae$mno2), ])

 

******数据缺失处理*******

---剔除

library(DMwR)

data(algae)

algae[!complete.cases(algae),]

 

nrow(algae[!complete.cases(algae),])

 

algae=na.omit(algae)

algae=algae[-c(62,199),]

 

---找出每行数据缺失数据

apply(algae,1,function(x) sum(is.na(x)))

 

---设定值

data(algae)

manyNAs(algae,0.2)

 

algae=algae[-manyNAs(algae),]

 

---用最高频值填补缺失值

--均值

algae[48,"mxPH"]=mean(algae$mxPH,na.rm=T)

--中位数

algae[is.na(algae$Chla),"Chla"]=median(algae$Chla,na.rm=T)

--众数(函数包)

data(algae)

algae=algae[-manyNAs(algae),]

algae=centralImputation(algae)

 

--通过变量的相关关系来填写缺失值

cor(algae[,4:18],use="complete.obs")

symnum(cor(algae[,4:18],use="complete.obs"))

 

---通过线性相关po4和opo4值的关系填写缺失值

data(algae)

algae=algae[-manyNAs(algae),]

lm(PO4~oPO4,data=algae)

 

algae[28,"PO4"]=42.897+1.293*algae[28,"oPO4"]

---构造函数计算opo4和po4的值

data(algae)

algae=algae[manyNAs(algae),]

fillPO4=function(op){

if(is.na(op))

return(NA)

else return(42.897+1.239*op)

}

algae[is.na(algae$PO4),"PO4"]=sapply(algae[is.na(algae$PO4),

"oPO4"],fillPO4)

 

---不同季节直方图

histogram(~mxPH | season ,data=algae)

 

 

algae$season=factor(algae$season, levels=c("spring","summer","autum","winter"))

 

---扩展

histogram(~mxPH | size*speed,data=algae)

 

 

stripplot(size~mxPH | speed,data=algae,jitter= T)

 

 

*****通过探索案例之间的相似性来填补缺失值

data(algae)

algae=algae[-manyNAs(algae),]

---一般采用欧式距离

algae=knnImputation(algae,k=10)

--中位数

algae=knnImputation(algae,k=10,meth="median")

 

*********获取预测模型***********

 

---多元线性回归函数模型

 

--获取不含缺失值的数据框

library(rpart)

data(algae)

algae=algae[-manyNAs(algae),]

clean.algae=knnImputation(algae,k=10)

 

---建立一个预测海藻的线性回归模型

lm.a1=lm(a1~.,data=clean.algae[,1:12])

 

---获取更多线性模型

summary(lm.a1)

 

---利用anova做精简的线性模型,进行比较

anova(lm.a1)

---做出微小调整

lm2.a1=update(lm.a1,.~.-season)

summary(lm2.a1)

 

 

---对于两个模型进行正式比较

anova(lm.a1,lm2.a1)

 

---采用向后消元法得到一个新的模型

final.lm=step(lm.a1)

 

---获得最后模型

summary(final.lm)

--这个模型解释的方差比例仍然不是很乐观,这样的假设对于海藻案例应用不是很合适

 

 

**********决策树预测************

决策树模型可以处理缺失值

library(rpart)

data(algae)

algae=algae[-manyNAs(algae),]

---获取回归树

rt.a1=rpart(a1~.,data=algae[,1:12])

rt.a1

 

---绘制树形图

prettyTree(rt.a1)

 

---复杂度损失修剪方法

printcp(rt.a1)

 

--通过使用不同的CP值来建立这棵树

rt2.a1=prune(rt.a1,cp=0.08)

rt2.a1

 

---rpartXse函数自动完成

(rt.a1=rpartXse(a1~.,data=algae[,1:12]))

 

--使用函数snip.rpart()交互对树进行剪枝

first.tree=rpart(a1~.,data=algae[,1:12])

snip.rpart(first.tree,c(4,7))

 

---画出回归树

prettyTree(first.tree)

snip.rpart(first.tree)

 

********模型的评估**********

--回归模型的评估方法是将目标变量的预测值与实际值进行比较得到,并从这些比较中计算耨写平均误差的度量。

一种度量方法是平均绝对误差(MAE)

lm.predictions.a1=predict(final.lm,clean.algae)

rt.predictions.a1=predict(rt.a1,algae)

 

(mae.a1.lm=mean(abs(lm.predictions.a1-algae[,"a1"])))

(mae.a1.rt=mean(abs(rt.predictions.a1-algae[,"a1"])))

 

 

---另外一种度量误差的方法是均方误差(MSE)

(mse.a1.lm=mean(lm.predictions.a1-algae[,"a1"])^2)

(mse.a1.rt=mean(rt.predictions.a1-algae[,"a1"])^2)

 

---还有一种度量方法是标准化后的平均绝对误差(NMSE)

(nmse.a1.lm=mean((lm.predictions.a1-algae[,'a1'])^2)/

mean((mean(algae[,'a1'])-algae[,'a1'])^2))

(nmse.a1.rt=mean((rt.predictions.a1-algae[,'a1'])^2)/

mean((mean(algae[,'a1'])-algae[,'a1'])^2))

 

---NMSE的值越小越好,模型的性能越好

 

 

---利用函数来计算回归模型的性能指标

regr.eval(algae[,"a1"],rt.predictions.a1,train.y=algae[,"a1"])

 

---可视化展现预测模型的值

old.par=par(mfrow=c(1,2))

plot(lm.predictions.a1,algae[,"a1"],main="Liner model",xlab="predictions",ylab="True values")

abline(0,1,lty=2)

 

plot(rt.predictions.a1,algae[,"a1"],main="Regrsson Tree",xlab="Predictions",ylab="True value")

 

abline(0,1,lty=2)

par=(old.par)

 

--用函数identify()来检查哪些预测特别差的检查,该函数可以让用户通过互动方式点击图形的点

plot(lm.predictions.a1,algae[,'a1'],main="Liner Model",xlab="Predictions ",ylab="True Values")

abline(0,1,lty=2)

algae[identify(lm.predictions.a1,algae[,'a1']),]

--去除小于零的数

sensible.lm.predictions.a1=ifelse(lm.predictions.a1<0,0,lm.predictions.a1)

 

regr.eval(algae[,"a1"],lm.predictions.a1,stats=c("mae","mse"))

 

regr.eval(algae[,"a1"],sensible.lm.predictions.a1,stats=c("mae","mse"))

 

 

******预测任务的决策

1.为预测任务选择模型

2.比较模型性能评估指标

3.选择获取评估指标的可靠估计的实验方法

 

---给出目标模型函数

cv.rpart=function(form.train,test,...){

m=rpartXse(form,train,...)

p=predict(m,test)

mse=mean((p-resp(form,test))^2)

c(nmse=mse/mean(mean(resp(form,train))-resp(form,test))^2)

}

 

cv.lm=function(form,train,test,...){

m=lm(form,train,...)

p=predict(m,test)

p=ifelse(p<0,0,p)

mse=mean((p-resp(form,test))^2)

c(nmse=mse/mean((mean(resp(form,train))-resp(form,test))^2))

}

 

---对于模型交叉验证

 

res=experimentalComparison(

c(dataset(a1~.,clean.algae[,1:12],'a1')),

c(variants('cv.lm'),

variants('cv.rpart',se=c(0,0.5,1))),

cvSettings(3,10,1234))

 

cv.lm variant=cv.lm.defaults

 

cv.rpart variant=cv.rpart.v1

 

cv.rpart variant=cv.rpart.v2

 

summary(res)

 

---绘制图形

plot(res)

 

--由图可以看出模型NMSE最好,但是不明显,通过下面给出

getVariant("cv.rpart.v1",res)

 

对所有的7个预测模型任务进行与上面相似的比较实验

 

DSs=sapply(names(clean.algae)[12:18],

function(x,names.attrs){

f=as.formula(paste(x,"~ ."))

dataset(f,clean.algae[,c(names.attrs,x)],x)

},

names(clean.algae)[1:11])

 

res.all=experimentalComparison(

DSs,

c(variants('cv.lm'),

variants('cv.rpart',se=c(0,0.5,1))

),

cvSettings(5,10,1234))

 

plot(res.all)

 

bestScores(res.all)

 

**********组合模型-随机森林***********

library(randomForest)

 

cv.rf=function(form,train,test,...){

 

m=randomForest(form,train,...)

 

p=predict(m,test)

mse=mean((p-resp(form,test))^2)

 

c(nmse=mse/mean((mean(resp(form,train))-resp(form,test))^2))

}

res.all=experimentalComparison(

DSs,

c(variants('cv.lm'),

variants('cv.rpart',se=c(0,0.5,1)),

variants('cv.rf',ntree=c(200,500,700))

),

cvSettings(5,10,1234))

 

--应用函数bestScores()能正是组合方法的优势

bestScores(res.all)

 

---论述统计的显著性

compAnalysis(res.all,against='cv.rf.v3',

datasets=c('a1','a2','a4','a6'))

 

 

****预测7类海藻的频率***

--获得7个模型

bestModelNames=sapply(bestScores(res.all),

function(x) x['nmse','system'])

learners=c(rf='randomForest',rpart='rpartXse')

funs=learners[sapply(strsplit(bestModelsNames,'\\.'),

function(x) x[2])]

 

parSetts=lapply(bestModelsNames,function(x)getVariant(x,res.all)@pars)

 

bestModels=list()

for(a in 1:7){

form=as.formula(paste(names(clean.algae)[11+a],'~.'))

bestModels[[a]]=do.call(funcs[a],c(list(form,clean.algae[,c(1:11,11+a)]),parSetts[[a]]))}

 

--用训练数据集来填补测试数据集的缺失数据集

clean.test.algae=knnImputation(test.algae,k=10,distData=algae[,1:11])

 

--获取整个测试数据集的预测值矩阵

preds=matrus(ncol=7,nrow=140)

for(i in 1:nrow(clean.test.algae))

preds[i,]=sapply(1:7,function(x)

predict(bestModel[[x]],clean.test.algae[i,]))

 

 

--计算模型NMSE值

avg.preds=apply(algae[,12:18],2,mean)

apply(((algae.sols-preds)^2), 2,mean)/apply((scale(algae.sols,avg.preds,F)^2),2,mean)

 

 

 

 

 

 

 

 

 

 

 

 

 
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息