
A Comprehensive Crawler Example

2017-10-19 22:15

A complete demo that crawls articles from Huxiu (虎嗅网)

Create a Maven project

First, add the dependencies:

<dependencies>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <dependency>
        <!-- jsoup HTML parser library @ https://jsoup.org/ -->
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-jdbc</artifactId>
        <version>4.2.6.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.41</version>
    </dependency>
    <dependency>
        <groupId>c3p0</groupId>
        <artifactId>c3p0</artifactId>
        <version>0.9.1.2</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.31</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.1</version>
    </dependency>
</dependencies>


The demo stores its data in a MySQL database.

Create a database named spider:

create database spider;


Create the table with the following SQL:

CREATE TABLE `huxiu_article` (
    `id` varchar(250) DEFAULT NULL,
    `title` varchar(250) DEFAULT NULL,
    `author` varchar(250) DEFAULT NULL,
    `createTime` varchar(250) DEFAULT NULL,
    `zan` varchar(250) DEFAULT NULL,
    `pl` varchar(250) DEFAULT NULL,
    `sc` varchar(250) DEFAULT NULL,
    `content` blob,
    `url` varchar(250) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;


The code is shown below.

The entity class Article:

public class Article {
    private String id;
    private String url;
    private String title;
    private String author;
    private String createTime;
    private String pl;
    private String zan;
    private String sc;
    private String content;

    public String getId() {
        return id;
    }
    public void setId(String id) {
        this.id = id;
    }
    // ... getters and setters for the remaining fields are omitted

}


The ArticleDao class that writes articles to the database:

import org.springframework.jdbc.core.JdbcTemplate;

import com.mchange.v2.c3p0.ComboPooledDataSource;

public class ArticleDao extends JdbcTemplate {

    public ArticleDao() {
        // Create the C3P0 DataSource; it could also be set up via configuration,
        // here it is done in code.
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        // set user, password and JDBC URL
        dataSource.setUser("root");
        dataSource.setPassword("root");
        dataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8");
        setDataSource(dataSource);
    }

    public void save(Article article) {
        String sql = "INSERT INTO `spider`.`huxiu_article` (`id`, `title`, `author`, `createTime`, `zan`, `pl`, `sc`, `content`, `url`) VALUES (?,?,?,?,?,?,?,?,?)";
        update(sql, article.getId(), article.getTitle(), article.getAuthor(), article.getCreateTime(),
                article.getZan(), article.getPl(), article.getSc(), article.getContent(), article.getUrl());
    }
}
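
To sanity-check the DAO and the table definition, a minimal sketch like the one below can be used. The class name ArticleDaoTest and all the field values are made up for illustration, and the usual setters are assumed to be among the omitted accessors of Article.

public class ArticleDaoTest {

    public static void main(String[] args) {
        ArticleDao articleDao = new ArticleDao();

        // build a dummy article; every value here is illustrative only
        Article article = new Article();
        article.setId("100001");
        article.setTitle("test title");
        article.setAuthor("test author");
        article.setCreateTime("2017-10-19");
        article.setZan("0");
        article.setPl("0");
        article.setSc("0");
        article.setContent("test content");
        article.setUrl("http://www.huxiu.com/article/100001.html");

        // inserts a single row into spider.huxiu_article
        articleDao.save(article);
    }
}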


The response entity class HuxiuPagingResponse, which maps the JSON returned by the paging API:

public class HuxiuPagingResponse {

    private String data;
    private String last_dateline;
    private String msg;
    private String result;
    private String total_page;

    public String getData() {
        return data;
    }
    public void setData(String data) {
        this.data = data;
    }
    // ... getters and setters for the remaining fields are omitted

}
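
The paging endpoint answers with JSON whose fields map onto this class. The sketch below shows how Gson deserializes such a response; the JSON values are invented for illustration, and getters beyond getData() (for example getTotal_page()) are assumed to be among the omitted accessors.

import com.google.gson.Gson;

public class PagingResponseDemo {

    public static void main(String[] args) {
        // illustrative JSON only; the field names match HuxiuPagingResponse
        String jsonText = "{\"result\":\"1\",\"msg\":\"\",\"total_page\":\"1615\","
                + "\"last_dateline\":\"1508421600\","
                + "\"data\":\"<div data-aid=\\\"123456\\\">...</div>\"}";

        Gson gson = new Gson();
        HuxiuPagingResponse response = gson.fromJson(jsonText, HuxiuPagingResponse.class);

        // data carries an HTML fragment that is later parsed with Jsoup
        System.out.println(response.getTotal_page());    // 1615
        System.out.println(response.getLast_dateline()); // 1508421600
    }
}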


The entry point of the program, HuXiuSpider:

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.google.gson.Gson;

public class HuXiuSpider {
    // DAO used to save parsed articles
    public static ArticleDao articleDao = new ArticleDao();
    // last_dateline carried along between paging requests
    private static String dateLine = null;
    // fixed-size thread pool shared by the download, parse and store workers
    private static ExecutorService threadPool = Executors.newFixedThreadPool(30);
    // queue of article ids (aid) parsed from the home page and the paging responses
    public static ArrayBlockingQueue<String> urlQueue = new ArrayBlockingQueue<String>(1000);
    // queue of raw HTML documents, one per article page
    public static ArrayBlockingQueue<String> articleHtmlQueue = new ArrayBlockingQueue<String>(1000);
    // queue of parsed Article objects waiting to be written to the database
    public static ArrayBlockingQueue<Article> articleContentQueue = new ArrayBlockingQueue<Article>(1000);

    public static void main(String[] args) throws Exception {
        // workers that fetch the HTML of each article page
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ProcessSinglePageRunnable());
        }
        // workers that parse the fetched pages
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ParseHtmlRunnable());
        }
        // worker that saves the parsed articles to the database
        threadPool.execute(new SaveDBRunnable());
        // collect the article ids from the home page
        getIndexArticleUrlList();
        // walk through the paging API
        processPaging();
    }
    /**
     * Fetch the article list from the home page.
     *
     * @throws IOException
     * @throws ClientProtocolException
     */
    private static void getIndexArticleUrlList() throws IOException, ClientProtocolException {
        // 1. The home page URL
        String indexUrl = "http://www.huxiu.com";
        // 2. Build an HttpGet request
        HttpGet indexHttpGet = new HttpGet(indexUrl);
        // set the User-Agent
        indexHttpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");

        String html = getHtmlByRequest(indexHttpGet);

        // 5. Parse the HTML with Jsoup to get the article list and each article's aid
        Document indexDocument = Jsoup.parse(html);

        // read the last_dateline needed by the paging requests
        Elements dateLines = indexDocument.select("[data-last_dateline]");
        dateLine = dateLines.get(0).attr("data-last_dateline");

        // 5.1 select the divs that carry a data-aid attribute
        Elements aidElements = indexDocument.select("div[data-aid]");
        // 5.2 queue the aid of each article
        for (Element element : aidElements) {
            String aid = element.attr("data-aid");
            try {
                urlQueue.put(aid);
            } catch (InterruptedException e) {
                System.out.println("Failed to put aid into urlQueue: " + e);
            }
        }
    }

    private static void processPaging() {
        for (int page = 2; page <= 1615; page++) {
            try {
                // the paging endpoint
                String pagingUrl = "https://www.huxiu.com/v2_action/article_list";
                HttpPost httpPost = new HttpPost(pagingUrl);
                // set the form parameters
                ArrayList<NameValuePair> arrayList = new ArrayList<NameValuePair>();
                arrayList.add(new BasicNameValuePair("huxiu_hash_code", "fb7f7403c58c3e8cb45aa47afc204c10"));
                arrayList.add(new BasicNameValuePair("page", page + ""));
                arrayList.add(new BasicNameValuePair("last_dateline", dateLine));
                httpPost.setEntity(new UrlEncodedFormEntity(arrayList));
                // execute the request
                String jsonText = getHtmlByRequest(httpPost);
                // turn the JSON string into an object
                Gson gson = new Gson();
                HuxiuPagingResponse huxiuPagingResponse = gson.fromJson(jsonText, HuxiuPagingResponse.class);
                // every response carries a new last_dateline for the next request
                dateLine = huxiuPagingResponse.getLast_dateline();
                // the data field contains an HTML fragment
                String htmlData = huxiuPagingResponse.getData();

                Document doc = Jsoup.parse(htmlData);
                // select the divs that carry a data-aid attribute
                Elements aidElements = doc.select("div[data-aid]");
                // queue the aid of each article
                for (Element element : aidElements) {
                    String aid = element.attr("data-aid");
                    urlQueue.put(aid);
                }
            } catch (Exception e) {
                // in a real project this should go to a proper logger
                System.out.println(page);
                System.out.println(e);
            }
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Fetch the HTML document at the given URL.
     *
     * @throws IOException
     * @throws ClientProtocolException
     */
    public static String getHtml(String aidUrl) throws IOException, ClientProtocolException {
        // build an HttpGet request for the article URL
        HttpGet httpGet = new HttpGet(aidUrl);
        return getHtmlByRequest(httpGet);
    }

    private static String getHtmlByRequest(HttpRequestBase request) throws IOException, ClientProtocolException {
        // set the User-Agent request header
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");

        // 3. Execute the request with HttpClient; try-with-resources closes the client and the response
        String html = null;
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(request)) {
            if (200 == response.getStatusLine().getStatusCode()) {
                HttpEntity entity = response.getEntity();
                // 4. Convert the entity to a string (the HTML)
                html = EntityUtils.toString(entity, Charset.forName("UTF-8"));
            }
        }
        return html;
    }
}
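
getHtmlByRequest creates and closes a new HttpClient for every request. That works, but with 30 worker threads a single shared client backed by a connection pool is cheaper. A rough sketch follows; the class HttpClientHolder, the pool sizes and the timeouts are my own choices, not part of the original demo.

import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

public class HttpClientHolder {

    private static final CloseableHttpClient CLIENT;

    static {
        // pool sized roughly to the 30 crawler threads
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(50);
        cm.setDefaultMaxPerRoute(30);

        // fail fast instead of hanging on a slow page
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(5000)
                .setSocketTimeout(10000)
                .build();

        CLIENT = HttpClients.custom()
                .setConnectionManager(cm)
                .setDefaultRequestConfig(config)
                .build();
    }

    public static CloseableHttpClient getClient() {
        return CLIENT;
    }
}

With such a holder, getHtmlByRequest would call HttpClientHolder.getClient().execute(request) and close only the response, not the client.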


ProcessSinglePageRunnable, which performs the HTTP request for each article:

public class ProcessSinglePageRunnable implements Runnable {

    public void run() {
        while (true) {
            try {
                processSingleUrl();
                Thread.sleep(3000);
            } catch (InterruptedException e) {
                // ignored; the worker simply keeps going
            }
        }
    }

    private void processSingleUrl() throws InterruptedException {
        // take the next article id and build its URL
        String aid = HuXiuSpider.urlQueue.take();
        String aidUrl = "http://www.huxiu.com/article/" + aid + ".html";
        try {
            // fetch the HTML of the article page and hand it to the parse workers
            String aidHtml = HuXiuSpider.getHtml(aidUrl);
            HuXiuSpider.articleHtmlQueue.put(aidHtml);
        } catch (Exception e) {
            System.out.println(aidUrl);
            System.out.println(e);
        }
    }
}


ParseHtmlRunnable, which parses each article page into an Article object:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class ParseHtmlRunnable implements Runnable {

    public void run() {
        while (true) {
            String html = null;
            try {
                html = HuXiuSpider.articleHtmlQueue.take();
            } catch (InterruptedException e1) {
                e1.printStackTrace();
            }
            if (html == null) {
                continue;
            }
            Article article = new Article();
            Document detailDocument = Jsoup.parse(html);

            // the data-aid attribute holds the article id
            Elements aidElements = detailDocument.select("div[data-aid]");
            String aid = aidElements.get(0).attr("data-aid");
            article.setId(aid);
            System.out.println(aid + ".........");

            // title
            Elements titles = detailDocument.select(".t-h1");
            String title = titles.get(0).text();
            article.setTitle(title);

            // author (.author-name)
            Elements names = detailDocument.select(".author-name");
            String name = names.get(0).text();
            article.setAuthor(name);
            // publish time
            Elements dates = detailDocument.select("[class^=article-time]");
            String date = dates.get(0).text();
            article.setCreateTime(date);
            // favourite count
            Elements shares = detailDocument.select("[class^=article-share]");
            String share = shares.get(0).text();
            article.setSc(share);
            // comment count
            Elements pls = detailDocument.select("[class^=article-pl]");
            String pl = pls.get(0).text();
            article.setPl(pl);
            // like count (.num)
            Elements nums = detailDocument.select(".num");
            String num = nums.get(0).text();
            article.setZan(num);
            // article body (.article-content-wrap)
            Elements content = detailDocument.select(".article-content-wrap p");
            String contentText = content.text();
            article.setContent(contentText);
            // rebuild the article URL from its aid
            article.setUrl("http://www.huxiu.com/article/" + aid + ".html");
            try {
                HuXiuSpider.articleContentQueue.put(article);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

}
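
Every selector above calls .get(0) and therefore assumes the element exists; an article page with a different layout throws IndexOutOfBoundsException and kills that iteration. A small helper along the following lines keeps the parser tolerant; the class ParseHelper and the method selectText are my own names, not part of the original code.

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ParseHelper {

    // Returns the text of the first element matching cssQuery, or "" when nothing
    // matches, instead of letting .get(0) throw on an unexpected page layout.
    public static String selectText(Document doc, String cssQuery) {
        Element el = doc.select(cssQuery).first();
        return el == null ? "" : el.text();
    }
}

ParseHtmlRunnable could then use article.setTitle(ParseHelper.selectText(detailDocument, ".t-h1")) and the same pattern for the other fields.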


SaveDBRunnable, which writes the parsed articles to the database:

public class SaveDBRunnable implements Runnable {

    public void run() {
        while (true) {
            try {
                // take the next parsed article and insert it
                Article article = HuXiuSpider.articleContentQueue.take();
                HuXiuSpider.articleDao.save(article);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}
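
All three worker types spin in while (true) loops, so the process keeps running even after processPaging has walked through every page. A common way to stop such queue consumers is a poison-pill sentinel: the producer puts one special value per worker, and each worker exits when it takes one. The self-contained sketch below only illustrates the pattern on a small queue; none of it is part of the original demo.

import java.util.concurrent.ArrayBlockingQueue;

public class PoisonPillDemo {

    // sentinel value that tells a worker to stop
    private static final String POISON = "__STOP__";
    private static final ArrayBlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);

    public static void main(String[] args) throws InterruptedException {
        int workers = 3;
        for (int i = 0; i < workers; i++) {
            new Thread(new Runnable() {
                public void run() {
                    consume();
                }
            }).start();
        }
        // normal work items
        for (int i = 0; i < 10; i++) {
            queue.put("aid-" + i);
        }
        // one pill per worker so every worker gets to stop
        for (int i = 0; i < workers; i++) {
            queue.put(POISON);
        }
    }

    private static void consume() {
        try {
            while (true) {
                String item = queue.take();
                if (POISON.equals(item)) {
                    return; // stop this worker cleanly
                }
                System.out.println(Thread.currentThread().getName() + " handled " + item);
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}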