您的位置:首页 > 其它

[置顶] 【R语言爬虫】R语言提交get请求抓取城城理财数据

2017-05-26 17:43 441 查看
一、需求分析

抓取城城理财网站理财项目列表页中的数据，包括项目名称、收益率和项目期限。

抓取 URL（第一页；后续分页在该地址后追加 _2、_3、_4）：

https://www.cclc.co/debts/lctz_all_all



二、实现源代码

rm(list=ls())
library(XML)
library(RCurl)

# First listing page of the site; later pages append "_<page number>".
url <- "https://www.cclc.co/debts/lctz_all_all"

# Request headers imitating a desktop Chrome browser.
# "Accept-Language" previously carried "gzip, deflate, sdch", which are
# content codings (an Accept-Encoding value), not language tags. It now holds
# a real language range. The codings are deliberately not moved to an
# "Accept-Encoding" header: setting that header by hand does not make libcurl
# decompress the response, so the page could come back as raw gzip bytes.
myheader <- c(
  "User-Agent" = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
  "Accept-Language" = "zh-CN,zh;q=0.8",
  "Connection" = "keep-alive",
  "Accept-Charset" = "GB2312,utf-8;q=0.7,*;q=0.7"
)

# Gatherer passed to every request as its debug callback.
d <- debugGatherer()

# Download one listing page and return its body as a single string.
# All requests share the same headers, debug gatherer and curl options.
fetch_page <- function(page_url) {
  getURL(
    page_url,
    httpheader = myheader,
    debugfunction = d$update,
    verbose = TRUE,
    # NOTE(review): host and peer certificate verification are disabled, so
    # the HTTPS connection is not authenticated. Kept as in the original
    # script; re-enable if the site's certificate chain validates.
    ssl.verifyhost = FALSE,
    ssl.verifypeer = FALSE
  )
}

# Follow-up listing pages 2 to 4.
url_list <- paste0("https://www.cclc.co/debts/lctz_all_all_", 2:4)

# One HTML string per page, first page first (character vector of length 4).
web <- vapply(c(url, url_list), fetch_page, character(1), USE.NAMES = FALSE)

# Parse the downloaded pages into a single HTML document.
doc <- htmlParse(web, encoding = "UTF-8")

# Text content of every node matched by `xpath`, always a character vector
# (character(0) when nothing matches).
node_text <- function(xpath) {
  vapply(getNodeSet(doc, xpath), xmlValue, character(1))
}

project_title <- node_text("//h2[@class='title']//a")
project_rate <- node_text("//span[@class='main_top_num']")
project_day <- node_text("//div[@class='datar fl']//p")

# The number of term entries decides how many rows the table gets.
kk <- length(project_day)

# Extracted for reference; not used by the rest of this script.
project_money_start <- node_text("//div[@class='progress-top']//span")

target_time <- Sys.Date()

# seq_len() gives an empty index when nothing was scraped (1:0 would be
# c(1, 0)); a shorter title/rate vector is padded with NA up to kk rows.
rows <- seq_len(kk)
project <- data.frame(
  project_title = project_title[rows],
  project_rate = project_rate[rows],
  project_day = project_day[rows],
  target_time = rep(target_time, kk),
  stringsAsFactors = FALSE
)
project$platform <- rep("城城理财", kk)

# View() needs an interactive session; skip it when run with Rscript.
if (interactive()) {
  View(project)
}

# ---- Normalise the project term to a number of days ----

# Convert term strings to days: "<n>个月" (months) counts as n * 30 days and
# "<n>天" (days) as n. Returns a numeric vector of the same length as `term`;
# entries with neither unit, or whose remainder is not a number, become NA.
term_to_days <- function(term) {
  term <- as.character(term)
  is_month <- grepl("个月", term)
  # Months take precedence so that no entry is converted twice.
  is_day <- !is_month & grepl("天", term)

  days <- rep(NA_real_, length(term))
  days[is_month] <- as.numeric(gsub("个月", "", term[is_month])) * 30
  days[is_day] <- as.numeric(gsub("天", "", term[is_day]))
  days
}

# Same table as `project`, with the term column converted to numeric days.
# Every row is kept, in the original page order; unrecognised terms are NA
# instead of being silently dropped.
a301 <- project
a301[["project_day"]] <- term_to_days(project[["project_day"]])

# View() needs an interactive session; skip it when run with Rscript.
if (interactive()) {
  View(a301)
}


内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: