
Java crawler: scraping site content and saving it to a database

2018-01-31 10:06
Requirements

1. Crawl the Huxiu front page and collect the article URLs on it: https://www.huxiu.com/

2. Crawl the Huxiu pagination endpoint and collect the article URLs on each subsequent page.

3. Crawl each article detail page and extract the article data (title, body, author, publish time, comment count, like count, favorite count).

4. Save the scraped article data to the database.

Approach

1. Crawl the front page

Request URL: https://www.huxiu.com/

Request method: GET

Request parameters: none

Request headers: User-Agent

Response: HTML document

Parse the HTML document with jsoup to extract the article ids.
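As an illustration, a minimal jsoup sketch of this step; the .mod-info-flow div[data-aid] selector is the same one used in the full implementation further down, and the site's markup may of course change at any time:

import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class IndexParseSketch {
    // Extract the article ids from the front-page HTML via the data-aid attribute.
    static List<String> parseIds(String indexHtml) {
        List<String> ids = new ArrayList<String>();
        Document doc = Jsoup.parse(indexHtml);
        for (Element el : doc.select(".mod-info-flow div[data-aid]")) {
            ids.add(el.attr("data-aid")); // e.g. "231552"
        }
        return ids;
    }
}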

2. Crawl the article detail page

Request URL: https://www.huxiu.com/article/231552.html

Request method: GET

Request parameters: none

Request headers: User-Agent

Response: HTML document

Parse the HTML document with jsoup to extract the article data (title, body, author, publish time, comment count, like count, favorite count), as shown in the sketch after this step.

Save the article data to the database.
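A condensed sketch of the extraction in this step, using the same selectors as the full implementation below; the selectors reflect the page markup at the time of writing, and the full code additionally handles a second markup variant for the publish time:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class DetailParseSketch {
    // Pull the main fields out of an article detail page.
    static Article parse(int id, String html) {
        Document doc = Jsoup.parse(html);
        Article article = new Article();
        article.setId(id);
        article.setTitle(doc.select(".t-h1").get(0).ownText());                // title
        article.setAuthor(doc.select(".author-name").get(0).text());           // author
        article.setCreateTime(doc.select(".article-time").get(0).ownText());   // publish time
        article.setContent(doc.select(".article-content-wrap").get(0).text()); // body
        article.setZan(doc.select(".num").get(0).ownText());                   // like count
        article.setPl(doc.select(".article-pl").get(0).ownText());             // comment count
        return article;
    }
}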

3. Crawl the next page

Request URL: https://www.huxiu.com/v2_action/article_list

Request method: POST

Request parameters:

huxiu_hash_code: bc1acc4ae8cc354069293a255b8140fc  // fixed value, never changes

page: 2  // variable: which page to crawl, starting from 2

last_dateline: 1516942440  // timestamp of the previous page: when crawling page 2, take it from the front page; when crawling page 3, take it from the response for page 2

Request headers: User-Agent

Response:



{
    "result": 1,
    "msg": "获取成功",
    "data": "<div class=\"mod-b mod-art\" dat…v> </div> </div>",
    "total_page": 1703,
    "last_dateline": 1516788540
}

Parsing: use Gson to parse the JSON string, then use jsoup to parse the HTML fragment in the data field and extract the article ids on the current page. A sketch of this step follows.
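A minimal sketch of this pagination request and of the Gson parsing, assuming the field names shown in the response above. PageResult is a helper class made up for this sketch, and the div[data-aid] selector on the returned HTML fragment is an assumption based on the front-page markup:

import java.util.ArrayList;
import java.util.List;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.google.gson.Gson;

public class PageCrawlSketch {

    // Maps the JSON response shown above; field names must match the JSON keys.
    static class PageResult {
        int result;
        String msg;
        String data;          // HTML fragment containing the article entries
        int total_page;
        long last_dateline;   // pass this as last_dateline of the next request
    }

    // POST the pagination request and parse the JSON body with Gson.
    static PageResult crawlPage(int page, long lastDateline) throws Exception {
        HttpPost httpPost = new HttpPost("https://www.huxiu.com/v2_action/article_list");
        httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
        // Form parameters as described above
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("huxiu_hash_code", "bc1acc4ae8cc354069293a255b8140fc"));
        params.add(new BasicNameValuePair("page", String.valueOf(page)));
        params.add(new BasicNameValuePair("last_dateline", String.valueOf(lastDateline)));
        httpPost.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));

        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = httpClient.execute(httpPost);
        String json = EntityUtils.toString(response.getEntity(), "UTF-8");
        response.close();
        httpClient.close();

        // Gson turns the JSON string into the helper object above
        return new Gson().fromJson(json, PageResult.class);
    }

    // The article ids on this page live in the data-aid attributes of the HTML fragment.
    static List<String> parseIds(PageResult result) {
        List<String> ids = new ArrayList<String>();
        Document doc = Jsoup.parse(result.data);
        for (Element el : doc.select("div[data-aid]")) {
            ids.add(el.attr("data-aid"));
        }
        return ids;
    }
}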

4. Then handle each article id as in step 2.

Summary: the approach matters more than the code. Where does the approach come from?

1. Analysis

2. Guesswork

3. Debugging with F12 (browser dev tools)

Setup

1. Create a plain Maven project

2. Add the pom dependencies

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <!-- jsoup HTML parser library @ https://jsoup.org/ -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>4.2.6.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.41</version>
        </dependency>
        <dependency>
            <groupId>c3p0</groupId>
            <artifactId>c3p0</artifactId>
            <version>0.9.1.2</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.1</version>
        </dependency>
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.9.0</version>
        </dependency>

Java implementation

package cn.itcast.huxiu;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class HuXiuTest {

    public static void main(String[] args) throws Exception {
        for (int i = 0; i < 100000; i++) {
            // Crawl the front page
            String indexHtml = getIndex();
            // Parse the front page and collect all article ids (each id identifies one detail page)
            ArrayList<String> ids = parseIndexHtml(indexHtml);
            // For every id, fetch the detail page, extract the article data and save it to the database
            parseXianQingYeMian(ids);
        }
    }

    // Fetch and parse each article detail page
    private static void parseXianQingYeMian(ArrayList<String> ids) throws IOException, ClientProtocolException {
        if (ids.size() != 0) {
            for (String pid : ids) {
                // Each entry is the id of one detail page
                ArticleDao articleDao = new ArticleDao();
                int id = Integer.parseInt(pid);
                // Build the GET request for the detail page
                HttpGet httpGet = new HttpGet("https://www.huxiu.com/article/" + id + ".html");
                // Request header
                httpGet.addHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
                CloseableHttpClient httpClient = HttpClients.createDefault();
                // Send the request
                CloseableHttpResponse execute = httpClient.execute(httpGet);
                // 200 means the detail page was loaded successfully
                if (execute.getStatusLine().getStatusCode() == 200) {
                    HttpEntity entity = execute.getEntity();
                    String html = EntityUtils.toString(entity);
                    Article article = new Article();
                    article.setId(id);
                    // Turn the detail page HTML into a jsoup document
                    Document document = Jsoup.parse(html);
                    // Title
                    String ownText = document.select(".t-h1").get(0).ownText();
                    article.setTitle(ownText);
                    // Author
                    String author = document.select(".author-name").get(0).text();
                    article.setAuthor(author);
                    // Publish time: the page marks the time up in two different ways
                    Elements elements = document.select("span[class=article-time pull-left]");
                    if (elements.size() == 0) {
                        String createTime = document.select(".article-time").get(0).ownText();
                        article.setCreateTime(createTime);
                    } else {
                        String createTime = elements.get(0).ownText();
                        article.setCreateTime(createTime);
                    }
                    // Body
                    String content = document.select(".article-content-wrap").get(0).text();
                    article.setContent(content);
                    // Like count
                    article.setZan(document.select(".num").get(0).ownText());
                    // Comment count
                    article.setPl(document.select(".article-pl").get(0).ownText());
                    System.out.println(article);

                    // articleDao.save(article);
                }
                // Close the response and client so connections are not leaked
                execute.close();
                httpClient.close();
            }
        }
    }

    // Parse the front page HTML and collect the article ids
    private static ArrayList<String> parseIndexHtml(String indexHtml) {
        if (indexHtml != null) {
            ArrayList<String> urls = new ArrayList<String>();
            // Turn the page HTML into a jsoup document
            Document document = Jsoup.parse(indexHtml);
            // Every article entry carries its id in a data-aid attribute
            Elements elements = document.select(".mod-info-flow div[data-aid]");
            for (Element element : elements) {
                String url = element.attr("data-aid");
                urls.add(url);
                System.out.println(url);
            }
            return urls;
        }
        return null;
    }

    // Fetch the front page
    private static String getIndex() throws Exception {
        String url = "https://www.huxiu.com";
        // Build a GET request
        HttpGet httpGet = new HttpGet(url);
        // Set the request header
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
        // Return the page HTML
        return getHtml(httpGet);
    }

    // Execute a request and return the response body
    private static String getHtml(HttpGet httpGet) throws Exception {
        String html = null;
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse execute = httpClient.execute(httpGet);
        // Only accept a 200 response
        if (execute.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = execute.getEntity();
            html = EntityUtils.toString(entity);
            System.out.println(html); // full page HTML
        }
        execute.close();
        httpClient.close();
        return html;
    }

}

Entity class

package cn.itcast.huxiu;

public class Article {

    private int id;
    private String title;
    private String author;
    private String createTime;
    private String sc;      // favorite count (收藏)
    private String zan;     // like count (点赞)
    private String pl;      // comment count (评论)
    private String content;
    private String url;

    public int getId() {
        return id;
    }
    public void setId(int id) {
        this.id = id;
    }
    public String getTitle() {
        return title;
    }
    public void setTitle(String title) {
        this.title = title;
    }
    public String getAuthor() {
        return author;
    }
    public void setAuthor(String author) {
        this.author = author;
    }
    public String getCreateTime() {
        return createTime;
    }
    public void setCreateTime(String createTime) {
        this.createTime = createTime;
    }
    public String getSc() {
        return sc;
    }
    public void setSc(String sc) {
        this.sc = sc;
    }
    public String getZan() {
        return zan;
    }
    public void setZan(String zan) {
        this.zan = zan;
    }
    public String getPl() {
        return pl;
    }
    public void setPl(String pl) {
        this.pl = pl;
    }
    public String getContent() {
        return content;
    }
    public void setContent(String content) {
        this.content = content;
    }
    public String getUrl() {
        return url;
    }
    public void setUrl(String url) {
        this.url = url;
    }

    @Override
    public String toString() {
        return "Article [id=" + id + ", title=" + title + ", author=" + author + ", createTime=" + createTime + ", sc="
                + sc + ", zan=" + zan + ", pl=" + pl + ", content=" + content + ", url=" + url + "]";
    }

}

Database access (DAO)

package cn.itcast.huxiu;

import org.springframework.jdbc.core.JdbcTemplate;

import com.mchange.v2.c3p0.ComboPooledDataSource;

public class ArticleDao extends JdbcTemplate {

    public ArticleDao() {
        // Create the C3P0 data source in code (it could also be set up via a config file)
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        // URL, driver, username & password
        dataSource.setUser("root");
        dataSource.setPassword("123");
        dataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8");
        setDataSource(dataSource);
    }

    public void save(Article article) {
        String sql = "INSERT INTO huxiu_article (id, title, author, createTime, zan, pl, sc, content, url ) VALUES( ?,?,?,?,?,?,?,?,?)";
        update(sql, article.getId(), article.getTitle(), article.getAuthor(), article.getCreateTime(), article.getZan(), article.getPl(), article.getSc(), article.getContent(), article.getUrl());
    }

}
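The save() method assumes a huxiu_article table already exists in the spider database. Below is a sketch of creating it through the same JdbcTemplate; only the column names come from the INSERT statement above, the column types are assumptions:

package cn.itcast.huxiu;

public class CreateTableSketch {
    public static void main(String[] args) {
        // Reuses the DAO's data source; the column types below are assumptions.
        new ArticleDao().execute(
            "CREATE TABLE IF NOT EXISTS huxiu_article (" +
            "  id INT PRIMARY KEY," +
            "  title VARCHAR(255)," +
            "  author VARCHAR(100)," +
            "  createTime VARCHAR(50)," +
            "  zan VARCHAR(20)," +      // like count
            "  pl VARCHAR(20)," +       // comment count
            "  sc VARCHAR(20)," +       // favorite count
            "  content LONGTEXT," +
            "  url VARCHAR(255)" +
            ") DEFAULT CHARSET=utf8"
        );
    }
}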