A Comprehensive Web Crawler Case Study
2017-10-19 22:15
A complete demo that crawls articles from Huxiu.com (虎嗅网).
Create a Maven project and start by adding the dependencies:
<dependencies>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <dependency>
        <!-- jsoup HTML parser library @ https://jsoup.org/ -->
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-jdbc</artifactId>
        <version>4.2.6.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.41</version>
    </dependency>
    <dependency>
        <groupId>c3p0</groupId>
        <artifactId>c3p0</artifactId>
        <version>0.9.1.2</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.31</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.1</version>
    </dependency>
</dependencies>
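Before wiring up the crawler itself, it is worth confirming that the HttpClient and Jsoup dependencies resolve and cooperate. A minimal smoke test, assuming the index page is reachable (the class name is mine, not part of the demo):

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class SmokeTest {
    public static void main(String[] args) throws Exception {
        CloseableHttpClient client = HttpClients.createDefault();
        try {
            HttpGet get = new HttpGet("http://www.huxiu.com");
            try (CloseableHttpResponse response = client.execute(get)) {
                // Download the raw HTML, then hand it to Jsoup
                String html = EntityUtils.toString(response.getEntity(), "utf-8");
                Document doc = Jsoup.parse(html);
                System.out.println(doc.title()); // prints the page <title> if everything is wired up
            }
        } finally {
            client.close();
        }
    }
}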
The project uses a MySQL database. Create a database named spider:

create database spider;
Create the table with the following DDL (note that id carries no PRIMARY KEY constraint, so re-running the crawler can insert duplicate rows):
CREATE TABLE `huxiu_article` (
  `id` varchar(250) DEFAULT NULL,
  `title` varchar(250) DEFAULT NULL,
  `author` varchar(250) DEFAULT NULL,
  `createTime` varchar(250) DEFAULT NULL,
  `zan` varchar(250) DEFAULT NULL,
  `pl` varchar(250) DEFAULT NULL,
  `sc` varchar(250) DEFAULT NULL,
  `content` blob,
  `url` varchar(250) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8
The full code follows.
The Article entity class:
public class Article {
    private String id;
    private String url;
    private String title;
    private String author;
    private String createTime;
    private String pl;      // comment count (评论)
    private String zan;     // like count (点赞)
    private String sc;      // favorite count (收藏)
    private String content;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    // ... remaining getters and setters omitted
}
ArticleDao, which handles database access:
public class ArticleDao extends JdbcTemplate {

    public ArticleDao() {
        // Create the C3P0 data source (it could be driven by a config file; here it is set up in code)
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        // Set the url, username and password. The MySQL driver registers itself via
        // JDBC 4 service discovery, so setDriverClass is not strictly required here.
        dataSource.setUser("root");
        dataSource.setPassword("root");
        dataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8");
        setDataSource(dataSource);
    }

    public void save(Article article) {
        String sql = "INSERT INTO `spider`.`huxiu_article` (`id`, `title`, `author`, `createTime`, `zan`, `pl`, `sc`, `content`, `url`) VALUES (?,?,?,?,?,?,?,?,?)";
        update(sql, article.getId(), article.getTitle(), article.getAuthor(),
                article.getCreateTime(), article.getZan(), article.getPl(),
                article.getSc(), article.getContent(), article.getUrl());
    }
}
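A quick way to exercise the DAO in isolation is a throwaway main, assuming the spider database exists and MySQL accepts the root/root credentials configured above (the class name and the aid value are invented for illustration):

public class ArticleDaoSmokeTest {
    public static void main(String[] args) {
        Article article = new Article();
        article.setId("217869");         // made-up aid, for illustration only
        article.setTitle("hello spider");
        new ArticleDao().save(article);  // inserts one row into huxiu_article
    }
}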
HuxiuPagingResponse, the entity class that models the paging response:
public class HuxiuPagingResponse {
    private String data;          // HTML fragment containing the article list
    private String last_dateline; // cursor for the next paging request
    private String msg;
    private String result;
    private String total_page;

    public String getData() {
        return data;
    }

    public void setData(String data) {
        this.data = data;
    }

    // ... remaining getters and setters omitted
}
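To make the Gson mapping concrete, here is a minimal sketch; the JSON values are invented for illustration, and only the field names are taken from the class above:

import com.google.gson.Gson;

public class PagingResponseDemo {
    public static void main(String[] args) {
        // Hypothetical payload: field names match HuxiuPagingResponse, values are made up
        String json = "{\"result\":\"1\",\"msg\":\"\",\"data\":\"<div data-aid='217869'></div>\","
                + "\"last_dateline\":\"1508400000\",\"total_page\":\"1615\"}";
        HuxiuPagingResponse resp = new Gson().fromJson(json, HuxiuPagingResponse.class);
        System.out.println(resp.getData());          // the HTML fragment handed to Jsoup
        System.out.println(resp.getLast_dateline()); // cursor for the next paging request
    }
}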
HuXiuSpider, the program entry point:
public class HuXiuSpider {

    // DAO used to persist articles
    public static ArticleDao articleDao = new ArticleDao();

    // dateLine is the cursor carried between paging requests
    private static String dateLine = null;

    // Fixed-size thread pool shared by the download, parse, and store workers
    private static ExecutorService threadPool = Executors.newFixedThreadPool(30);

    // Queue of article ids (aid) parsed from the index page and the paging responses
    public static ArrayBlockingQueue<String> urlQueue = new ArrayBlockingQueue<String>(1000);

    // Queue of raw HTML documents, one per article page
    public static ArrayBlockingQueue<String> articleHtmlQueue = new ArrayBlockingQueue<String>(1000);

    // Queue of parsed Article objects waiting to be saved
    public static ArrayBlockingQueue<Article> articleContentQueue = new ArrayBlockingQueue<Article>(1000);

    public static void main(String[] args) throws Exception {
        // Submit workers that fetch each article url over the network
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ProcessSinglePageRunnable());
        }
        // Submit workers that parse the downloaded pages
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ParseHtmlRunnable());
        }
        // Submit the worker that saves parsed articles
        threadPool.execute(new SaveDBRunnable());
        // Collect the article urls from the index page
        getIndexArticleUrlList();
        // Walk through the paginated article list
        processPaging();
    }

    /**
     * Collect the article list from the index page.
     */
    private static void getIndexArticleUrlList() throws IOException, ClientProtocolException {
        // 1. The index url: http://www.huxiu.com
        String indexUrl = "http://www.huxiu.com";
        // 2. Issue an HttpGet request (getHtmlByRequest sets the User-Agent header)
        HttpGet indexHttpGet = new HttpGet(indexUrl);
        String html = getHtmlByRequest(indexHttpGet);
        // 3. Parse with Jsoup to obtain the article list and each article's aid
        Document indexDocument = Jsoup.parse(html);
        // Extract last_dateline, the cursor for the first paging request
        Elements dateLines = indexDocument.select("[data-last_dateline]");
        dateLine = dateLines.get(0).attr("data-last_dateline");
        // Select every div carrying a data-aid attribute
        Elements aidElements = indexDocument.select("div[data-aid]");
        // Queue each article's aid
        for (Element element : aidElements) {
            String aid = element.attr("data-aid");
            try {
                urlQueue.put(aid);
            } catch (InterruptedException e) {
                System.out.println("Failed to put aid into urlQueue: " + e);
            }
        }
    }

    private static void processPaging() {
        for (int page = 2; page <= 1615; page++) {
            try {
                // Build the paging request
                String pagingUrl = "https://www.huxiu.com/v2_action/article_list";
                HttpPost httpPost = new HttpPost(pagingUrl);
                // Form parameters
                ArrayList<NameValuePair> arrayList = new ArrayList<NameValuePair>();
                arrayList.add(new BasicNameValuePair("huxiu_hash_code", "fb7f7403c58c3e8cb45aa47afc204c10"));
                arrayList.add(new BasicNameValuePair("page", page + ""));
                arrayList.add(new BasicNameValuePair("last_dateline", dateLine));
                httpPost.setEntity(new UrlEncodedFormEntity(arrayList));
                // Execute the request
                String jsonText = getHtmlByRequest(httpPost);
                // Map the JSON response onto HuxiuPagingResponse
                Gson gson = new Gson();
                HuxiuPagingResponse huxiuPagingResponse = gson.fromJson(jsonText, HuxiuPagingResponse.class);
                // Each response carries the dateline cursor for the next request
                dateLine = huxiuPagingResponse.getLast_dateline();
                // data holds an HTML fragment with the article list
                String htmlData = huxiuPagingResponse.getData();
                Document doc = Jsoup.parse(htmlData);
                // Select every div carrying a data-aid attribute
                Elements aidElements = doc.select("div[data-aid]");
                // Queue each article's aid
                for (Element element : aidElements) {
                    String aid = element.attr("data-aid");
                    urlQueue.put(aid);
                }
            } catch (Exception e) {
                System.out.println(page);
                System.out.println(e);
            }
            try {
                // Throttle: half a second between paging requests
                Thread.sleep(500);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Fetch the HTML document at the given article url.
     */
    public static String getHtml(String aidUrl) throws IOException, ClientProtocolException {
        HttpGet aidHttpGet = new HttpGet(aidUrl);
        return getHtmlByRequest(aidHttpGet);
    }

    private static String getHtmlByRequest(HttpRequestBase request) throws IOException, ClientProtocolException {
        // Set the User-Agent header so the request looks like a regular browser
        request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
        // Execute with HttpClient and read the entity
        CloseableHttpClient httpClient = HttpClients.createDefault();
        String html = null;
        try {
            CloseableHttpResponse response = httpClient.execute(request);
            try {
                if (200 == response.getStatusLine().getStatusCode()) {
                    HttpEntity entity = response.getEntity();
                    // Convert the entity into an HTML string
                    html = EntityUtils.toString(entity, Charset.forName("utf-8"));
                }
            } finally {
                response.close();
            }
        } finally {
            // The original left the client and response open; closing them avoids leaking connections
            httpClient.close();
        }
        return html;
    }
}
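One design note: getHtmlByRequest builds a brand-new HttpClient for every request, which is wasteful with 30 worker threads. A sketch of a shared, pooled client the spider could use instead (the class name and pool sizes are my own choices, not from the original demo):

import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

public class SharedHttpClient {
    private static final PoolingHttpClientConnectionManager CM = new PoolingHttpClientConnectionManager();
    static {
        CM.setMaxTotal(50);           // total connections across all routes
        CM.setDefaultMaxPerRoute(30); // matches the 30-thread pool above
    }
    public static final CloseableHttpClient INSTANCE =
            HttpClients.custom().setConnectionManager(CM).build();
}

getHtmlByRequest would then call SharedHttpClient.INSTANCE.execute(request) and close only the response, returning the connection to the pool instead of tearing it down.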
ProcessSinglePageRunnable, which issues the network request for each article url:
public class ProcessSinglePageRunnable implements Runnable {

    public void run() {
        while (true) {
            try {
                processSingleUrl();
                Thread.sleep(3000);
            } catch (InterruptedException e) {
                // Restore the interrupt flag and stop this worker
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    private void processSingleUrl() throws InterruptedException {
        String aid = HuXiuSpider.urlQueue.take();
        String aidUrl = "http://www.huxiu.com/article/" + aid + ".html";
        try {
            // Fetch the HTML of this article page and queue it for parsing
            String aidHtml = HuXiuSpider.getHtml(aidUrl);
            HuXiuSpider.articleHtmlQueue.put(aidHtml);
        } catch (Exception e) {
            System.out.println(aidUrl);
            System.out.println(e);
        }
    }
}
ParseHtmlRunnable, which parses each downloaded page:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class ParseHtmlRunnable implements Runnable {

    public void run() {
        while (true) {
            String html = null;
            try {
                html = HuXiuSpider.articleHtmlQueue.take();
            } catch (InterruptedException e1) {
                Thread.currentThread().interrupt();
                return;
            }
            Article article = new Article();
            Document detailDocument = Jsoup.parse(html);
            // Extract the aid from the div's data-aid attribute
            Elements aidElements = detailDocument.select("div[data-aid]");
            String aid = aidElements.get(0).attr("data-aid");
            article.setId(aid);
            System.out.println(aid + ".........");
            // Article title
            Elements titles = detailDocument.select(".t-h1");
            String title = titles.get(0).text();
            article.setTitle(title);
            // Author (author-name)
            Elements names = detailDocument.select(".author-name");
            String name = names.get(0).text();
            article.setAuthor(name);
            // Publication time
            Elements dates = detailDocument.select("[class^=article-time]");
            String date = dates.get(0).text();
            article.setCreateTime(date);
            // Favorite count
            Elements shares = detailDocument.select("[class^=article-share]");
            String share = shares.get(0).text();
            article.setSc(share);
            // Comment count
            Elements pls = detailDocument.select("[class^=article-pl]");
            String pl = pls.get(0).text();
            article.setPl(pl);
            // Like count (num)
            Elements nums = detailDocument.select(".num");
            String num = nums.get(0).text();
            article.setZan(num);
            // Article body (article-content-wrap)
            Elements content = detailDocument.select(".article-content-wrap p");
            String contentText = content.text();
            article.setContent(contentText);
            // The original left the url unset; rebuild it from the aid so the column is populated
            article.setUrl("http://www.huxiu.com/article/" + aid + ".html");
            try {
                HuXiuSpider.articleContentQueue.put(article);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }
    }
}
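Every get(0) in this parser throws IndexOutOfBoundsException as soon as a selector matches nothing (deleted articles, layout changes), which kills the worker thread. A sketch of a null-safe helper; the class and method names are mine:

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorUtil {
    // Returns the text of the first match, or "" when the selector matches nothing
    public static String textOf(Document doc, String cssQuery) {
        Element first = doc.select(cssQuery).first();
        return first == null ? "" : first.text();
    }
}

With this in place the parser would read, for example, article.setTitle(SelectorUtil.textOf(detailDocument, ".t-h1"));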
SaveDBRunnable, which saves the parsed data:
public class SaveDBRunnable implements Runnable {

    public void run() {
        while (true) {
            try {
                Article article = HuXiuSpider.articleContentQueue.take();
                HuXiuSpider.articleDao.save(article);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}
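With a single saver thread issuing one INSERT per article, the database write path can become the bottleneck. A sketch of draining the queue in batches instead; the class name and batch size are my own, only the queue and DAO come from the classes above:

import java.util.ArrayList;
import java.util.List;

public class BatchSaveDBRunnable implements Runnable {
    public void run() {
        List<Article> batch = new ArrayList<Article>();
        while (true) {
            try {
                // Block for the first article, then drain up to 99 more already queued
                batch.add(HuXiuSpider.articleContentQueue.take());
                HuXiuSpider.articleContentQueue.drainTo(batch, 99);
                for (Article a : batch) {
                    HuXiuSpider.articleDao.save(a); // JdbcTemplate.batchUpdate would batch for real
                }
                batch.clear();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }
    }
}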