优酷电视剧爬虫代码实现一:下载解析视频网站页面(1)
2017-01-12 10:51
986 查看
1.新建maven项目,导入基本jar包:
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.4</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.10</version>
</dependency>
</dependencies>
2.新建页面下载工具类PageDownLoadUtil
package com.dajiangtai.djt_spider.util;
import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.impl.HttpClientDownLoadService;
/**
* 页面下载工具:将整个页面的html源码全部下载下来
* @author lch
* created by 2017/1/9
*
*/
public class PageDownLoadUtil {
    // Charset used to decode the response body. NOTE(review): "gb2312" suits
    // sina.com.cn pages; Youku pages are typically UTF-8 — confirm before
    // pointing this utility at Youku URLs.
    private static String sinaCharset = "gb2312";

    /**
     * Downloads the full HTML source of the page at the given URL.
     *
     * @param url the page URL to fetch
     * @return the decoded HTML content, or {@code null} if the request failed
     */
    public static String getPageContent(String url) {
        String content = null;
        // try-with-resources closes both the client and the response even on
        // failure; the original leaked them (neither was ever closed).
        try (CloseableHttpClient client = HttpClients.custom().build()) {
            HttpGet request = new HttpGet(url);
            try (CloseableHttpResponse response = client.execute(request)) {
                HttpEntity entity = response.getEntity();
                content = EntityUtils.toString(entity, sinaCharset);
            }
        } catch (IOException e) {
            // ClientProtocolException is a subclass of IOException, so a
            // single catch covers both of the original branches.
            e.printStackTrace();
        }
        return content;
    }

    public static void main(String[] args) {
        String url = "http://list.youku.com/show/id_z9cd2277647d311e5b692.html?spm=a2h0j.8191423.sMain.5~5~A!2.iCUyO9";
        //String url = "http://news.sina.com.cn";
        //String content = PageDownLoadUtil.getPageContent(url);
        HttpClientDownLoadService down = new HttpClientDownLoadService();
        Page page = down.download(url);
        System.out.println(page.getContent());
    }
}
3.测试。输出:
4.新建一个实体类,用来存储需要爬取的数据
package com.dajiangtai.djt_spider.entity;
/**
* 存储页面信息实体类
* @author dajiangtai
* created by 2017-01-09
*
*/
/**
 * Value object holding one crawled page: the raw HTML plus the show
 * statistics extracted from it (play counts, comments, votes, etc.).
 */
public class Page {
    private String content;        // raw HTML of the page
    private String allnumber;      // total play count
    private String daynumber;      // daily play-count increment
    private String commentnumber;  // number of comments
    private String collectnumber;  // number of favorites
    private String supportnumber;  // up-votes
    private String againstnumber;  // down-votes
    private String tvname;         // TV-series title
    private String url;            // page URL
    private String episodenumber;  // episode count

    public void setContent(String content) {
        this.content = content;
    }

    public String getContent() {
        return content;
    }

    public void setAllnumber(String allnumber) {
        this.allnumber = allnumber;
    }

    public String getAllnumber() {
        return allnumber;
    }

    public void setDaynumber(String daynumber) {
        this.daynumber = daynumber;
    }

    public String getDaynumber() {
        return daynumber;
    }

    public void setCommentnumber(String commentnumber) {
        this.commentnumber = commentnumber;
    }

    public String getCommentnumber() {
        return commentnumber;
    }

    public void setCollectnumber(String collectnumber) {
        this.collectnumber = collectnumber;
    }

    public String getCollectnumber() {
        return collectnumber;
    }

    public void setSupportnumber(String supportnumber) {
        this.supportnumber = supportnumber;
    }

    public String getSupportnumber() {
        return supportnumber;
    }

    public void setAgainstnumber(String againstnumber) {
        this.againstnumber = againstnumber;
    }

    public String getAgainstnumber() {
        return againstnumber;
    }

    public void setTvname(String tvname) {
        this.tvname = tvname;
    }

    public String getTvname() {
        return tvname;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getUrl() {
        return url;
    }

    public void setEpisodenumber(String episodenumber) {
        this.episodenumber = episodenumber;
    }

    public String getEpisodenumber() {
        return episodenumber;
    }
}
5.新建一个页面下载接口,给定url,返回Page,url返回的页面内容保存在page的content属性中。接口的目的,降低代码耦合度。
package com.dajiangtai.djt_spider.service;
import com.dajiangtai.djt_spider.entity.Page;
/**
* 页面下载接口
* @author dajiangtai
* created by 2017-01-09
*
*/
public interface IDownLoadService {
// Downloads the page at the given URL and returns it wrapped in a Page,
// with the raw HTML stored in Page#content. Implementations decide the
// transport (HttpClient, headless browser, ...), keeping callers decoupled.
public Page download(String url);
}
6.页面接口实现类HttpClientDownLoadService
package com.dajiangtai.djt_spider.service.impl;
import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.IDownLoadService;
import com.dajiangtai.djt_spider.util.PageDownLoadUtil;
public class HttpClientDownLoadService implements IDownLoadService {
public Page download(String url) {
Page page = new Page();
page.setContent(PageDownLoadUtil.getPageContent(url));
return page;
}
}
7.在页面下载工具类的main方法中进行测试:
package com.dajiangtai.djt_spider.util;
import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.impl.HttpClientDownLoadService;
/**
* 页面下载工具:将整个页面的html源码全部下载下来
* @author lch
* created by 2017/1/9
*
*/
public class PageDownLoadUtil {
    // Charset used to decode the response body. NOTE(review): "gb2312" suits
    // sina.com.cn pages; Youku pages are typically UTF-8 — confirm before
    // pointing this utility at Youku URLs.
    private static String sinaCharset = "gb2312";

    /**
     * Downloads the full HTML source of the page at the given URL.
     *
     * @param url the page URL to fetch
     * @return the decoded HTML content, or {@code null} if the request failed
     */
    public static String getPageContent(String url) {
        String content = null;
        // try-with-resources closes both the client and the response even on
        // failure; the original leaked them (neither was ever closed).
        try (CloseableHttpClient client = HttpClients.custom().build()) {
            HttpGet request = new HttpGet(url);
            try (CloseableHttpResponse response = client.execute(request)) {
                HttpEntity entity = response.getEntity();
                content = EntityUtils.toString(entity, sinaCharset);
            }
        } catch (IOException e) {
            // ClientProtocolException is a subclass of IOException, so a
            // single catch covers both of the original branches.
            e.printStackTrace();
        }
        return content;
    }

    public static void main(String[] args) {
        String url = "http://list.youku.com/show/id_z9cd2277647d311e5b692.html?spm=a2h0j.8191423.sMain.5~5~A!2.iCUyO9";
        // Test the raw download utility:
        //String content = PageDownLoadUtil.getPageContent(url);
        //System.out.println(content);
        // Test that HttpClientDownLoadService#download works:
        HttpClientDownLoadService down = new HttpClientDownLoadService();
        Page page = down.download(url);
        System.out.println(page.getContent());
    }
}
8.运行main方法,输出:
总结:由于不同的页面有不同的下载方式,因此这里采用接口的思想,降低代码的耦合度。
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.4</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.10</version>
</dependency>
</dependencies>
2.新建页面下载工具类PageDownLoadUtil
package com.dajiangtai.djt_spider.util;
import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.impl.HttpClientDownLoadService;
/**
* 页面下载工具:将整个页面的html源码全部下载下来
* @author lch
* created by 2017/1/9
*
*/
public class PageDownLoadUtil {
    // Charset used to decode the response body. NOTE(review): "gb2312" suits
    // sina.com.cn pages; Youku pages are typically UTF-8 — confirm before
    // pointing this utility at Youku URLs.
    private static String sinaCharset = "gb2312";

    /**
     * Downloads the full HTML source of the page at the given URL.
     *
     * @param url the page URL to fetch
     * @return the decoded HTML content, or {@code null} if the request failed
     */
    public static String getPageContent(String url) {
        String content = null;
        // try-with-resources closes both the client and the response even on
        // failure; the original leaked them (neither was ever closed).
        try (CloseableHttpClient client = HttpClients.custom().build()) {
            HttpGet request = new HttpGet(url);
            try (CloseableHttpResponse response = client.execute(request)) {
                HttpEntity entity = response.getEntity();
                content = EntityUtils.toString(entity, sinaCharset);
            }
        } catch (IOException e) {
            // ClientProtocolException is a subclass of IOException, so a
            // single catch covers both of the original branches.
            e.printStackTrace();
        }
        return content;
    }

    public static void main(String[] args) {
        String url = "http://list.youku.com/show/id_z9cd2277647d311e5b692.html?spm=a2h0j.8191423.sMain.5~5~A!2.iCUyO9";
        //String url = "http://news.sina.com.cn";
        //String content = PageDownLoadUtil.getPageContent(url);
        HttpClientDownLoadService down = new HttpClientDownLoadService();
        Page page = down.download(url);
        System.out.println(page.getContent());
    }
}
3.测试。输出:
4.新建一个实体类,用来存储需要爬取的数据
package com.dajiangtai.djt_spider.entity;
/**
* 存储页面信息实体类
* @author dajiangtai
* created by 2017-01-09
*
*/
/**
 * Value object holding one crawled page: the raw HTML plus the show
 * statistics extracted from it (play counts, comments, votes, etc.).
 */
public class Page {
    private String content;        // raw HTML of the page
    private String allnumber;      // total play count
    private String daynumber;      // daily play-count increment
    private String commentnumber;  // number of comments
    private String collectnumber;  // number of favorites
    private String supportnumber;  // up-votes
    private String againstnumber;  // down-votes
    private String tvname;         // TV-series title
    private String url;            // page URL
    private String episodenumber;  // episode count

    public void setContent(String content) {
        this.content = content;
    }

    public String getContent() {
        return content;
    }

    public void setAllnumber(String allnumber) {
        this.allnumber = allnumber;
    }

    public String getAllnumber() {
        return allnumber;
    }

    public void setDaynumber(String daynumber) {
        this.daynumber = daynumber;
    }

    public String getDaynumber() {
        return daynumber;
    }

    public void setCommentnumber(String commentnumber) {
        this.commentnumber = commentnumber;
    }

    public String getCommentnumber() {
        return commentnumber;
    }

    public void setCollectnumber(String collectnumber) {
        this.collectnumber = collectnumber;
    }

    public String getCollectnumber() {
        return collectnumber;
    }

    public void setSupportnumber(String supportnumber) {
        this.supportnumber = supportnumber;
    }

    public String getSupportnumber() {
        return supportnumber;
    }

    public void setAgainstnumber(String againstnumber) {
        this.againstnumber = againstnumber;
    }

    public String getAgainstnumber() {
        return againstnumber;
    }

    public void setTvname(String tvname) {
        this.tvname = tvname;
    }

    public String getTvname() {
        return tvname;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getUrl() {
        return url;
    }

    public void setEpisodenumber(String episodenumber) {
        this.episodenumber = episodenumber;
    }

    public String getEpisodenumber() {
        return episodenumber;
    }
}
5.新建一个页面下载接口,给定url,返回Page,url返回的页面内容保存在page的content属性中。接口的目的,降低代码耦合度。
package com.dajiangtai.djt_spider.service;
import com.dajiangtai.djt_spider.entity.Page;
/**
* 页面下载接口
* @author dajiangtai
* created by 2017-01-09
*
*/
public interface IDownLoadService {
// Downloads the page at the given URL and returns it wrapped in a Page,
// with the raw HTML stored in Page#content. Implementations decide the
// transport (HttpClient, headless browser, ...), keeping callers decoupled.
public Page download(String url);
}
6.页面接口实现类HttpClientDownLoadService
package com.dajiangtai.djt_spider.service.impl;
import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.IDownLoadService;
import com.dajiangtai.djt_spider.util.PageDownLoadUtil;
public class HttpClientDownLoadService implements IDownLoadService {
public Page download(String url) {
Page page = new Page();
page.setContent(PageDownLoadUtil.getPageContent(url));
return page;
}
}
7.在页面下载工具类的main方法中进行测试:
package com.dajiangtai.djt_spider.util;
import java.io.IOException;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.impl.HttpClientDownLoadService;
/**
* 页面下载工具:将整个页面的html源码全部下载下来
* @author lch
* created by 2017/1/9
*
*/
public class PageDownLoadUtil {
    // Charset used to decode the response body. NOTE(review): "gb2312" suits
    // sina.com.cn pages; Youku pages are typically UTF-8 — confirm before
    // pointing this utility at Youku URLs.
    private static String sinaCharset = "gb2312";

    /**
     * Downloads the full HTML source of the page at the given URL.
     *
     * @param url the page URL to fetch
     * @return the decoded HTML content, or {@code null} if the request failed
     */
    public static String getPageContent(String url) {
        String content = null;
        // try-with-resources closes both the client and the response even on
        // failure; the original leaked them (neither was ever closed).
        try (CloseableHttpClient client = HttpClients.custom().build()) {
            HttpGet request = new HttpGet(url);
            try (CloseableHttpResponse response = client.execute(request)) {
                HttpEntity entity = response.getEntity();
                content = EntityUtils.toString(entity, sinaCharset);
            }
        } catch (IOException e) {
            // ClientProtocolException is a subclass of IOException, so a
            // single catch covers both of the original branches.
            e.printStackTrace();
        }
        return content;
    }

    public static void main(String[] args) {
        String url = "http://list.youku.com/show/id_z9cd2277647d311e5b692.html?spm=a2h0j.8191423.sMain.5~5~A!2.iCUyO9";
        // Test the raw download utility:
        //String content = PageDownLoadUtil.getPageContent(url);
        //System.out.println(content);
        // Test that HttpClientDownLoadService#download works:
        HttpClientDownLoadService down = new HttpClientDownLoadService();
        Page page = down.download(url);
        System.out.println(page.getContent());
    }
}
8.运行main方法,输出:
总结:由于不同的页面有不同的下载方式,因此这里采用接口的思想,降低代码的耦合度。
相关文章推荐
- 优酷电视剧爬虫代码实现一:下载解析视频网站页面(4)补充正则:java Pattern和Matcher详解
- 优酷电视剧爬虫代码实现一:下载解析视频网站页面(3)
- 优酷电视剧爬虫代码实现一:下载解析视频网站页面(2)
- 优酷电视剧爬虫代码实现一:下载解析视频网站页面(3)补充知识点:XPath无效怎么办?
- 优酷电视剧爬虫代码实现一:下载解析视频网站页面(3)补充知识点:怎样获取XPath
- 优酷电视剧爬虫代码实现一:下载解析视频网站页面(3)补充知识点:htmlcleaner使用案例
- 优酷电视剧爬虫代码实现一:下载解析视频网站页面(4)补充: Java正则表达式Matcher.group(int group)相关类解析
- 优酷电视剧爬虫代码实现一:下载解析视频网站页面(4)
- 爬虫代码实现三:打通爬虫项目的下载、解析、存储流程
- 爬虫代码实现二:抽取网站规则模板,优化解析代码(2)
- java实现优酷视频地址解析示例代码分享
- .NET 实现解析全国各大视频网站真实视频下载地址
- 爬虫代码实现二:抽取网站规则模板,优化解析代码(1)
- java 实现视频网站视频地址解析
- (转)用.net实现远程获取其他网站页面内容!(核心代码分析)
- 视频播放网站CDN内容分发网络简单代码实现
- asp.net网站下载功能页面代码
- 用.net实现远程获取其他网站页面内容!(核心代码分析)
- 视频播放网站CDN内容分发网络简单代码实现
- 分享一个在线解析提取网页视频的网站 不用另外下载工具 绝非广告