您的位置:首页 > 数据库 > MySQL

爬取电影天堂的所有资源到mysql

2017-04-23 18:07 204 查看
最近在学习爬虫,想爬取电影天堂的所有电影

开始构思了半天,有几个类、几个接口、应该实现哪些方法,到写的时候才发现想的不全面,加加减减的好歹把程序完成了

自定义一个用于保存将要下载的页面的队列

package Spider1;

import java.util.HashMap;
import java.util.LinkedList;

/*次类用于实现一个保存要访问的url队列

 *先进先出,后进后出

 */

public class Queue {
//用于保存要访问的url
private LinkedList<String> queue = new LinkedList<String>();
//添加元素的方法
public void push(String url){
queue.add(url);
}
//弹出一个元素的方法
public String pop(){
return queue.poll();
}
//判断是否为空的方法
public Boolean isempty(){
return queue.isEmpty();
}
//判断是否存在的方法
public Boolean contains(String url){
return queue.contains(url);
}

}

一个用于根据url下载页面的类

package Spider1;

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;

import java.io.UnsupportedEncodingException;

import org.apache.http.HttpEntity;

import org.apache.http.HttpResponse;

import org.apache.http.client.ClientProtocolException;

import org.apache.http.client.config.RequestConfig;

import org.apache.http.client.methods.HttpGet;

import org.apache.http.client.methods.HttpPost;

import org.apache.http.impl.client.CloseableHttpClient;

import org.apache.http.impl.client.HttpClients;

/*

 * 通过url下载html页面的类

 */

public class HtmlDownloader {
public String getHtml(String url){
String html = null;
//创建一个请求客户端
CloseableHttpClient httpclient = HttpClients.createDefault();
//声明一个相应
HttpResponse response = null;
//声明一个缓冲读取流
BufferedReader reader = null;
try {
//获取响应
response = httpclient.execute(new MethodSet().getGetMethod(url));
//获取实体
HttpEntity entity = response.getEntity();
//获取流
reader = new BufferedReader(new InputStreamReader(entity.getContent()));
//读取诗句
String buff = null;
StringBuilder sb = new StringBuilder();
while((buff = reader.readLine()) != null){
sb.append(buff);
}
html = new String(sb.toString().getBytes(),"GBK");
} catch (IOException e) {
// TODO Auto-generated catch block
System.out.println(url+"的连接失败");
}
return html;
}
//实现获得Get方法的接口,在里面实现代理ip,请求头等设置
private class MethodSet implements GetMethodSet{
@Override
public HttpGet getGetMethod(String url) {
// TODO Auto-generated method stub
//创建一个get请求方法
HttpGet getmethod = new HttpGet(url);
//HttpHost proxy = new HttpHost("124.88.67.81",80);这里不设置代理IP
//设置请求超时时间等
RequestConfig responseconfig = RequestConfig.custom().setConnectionRequestTimeout(10000).setConnectTimeout(10000).setSocketTimeout(10000).build();
//设置请求头,主要是user-agent
getmethod.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
//设置请求参数
getmethod.setConfig(responseconfig);

return getmethod;
}
}

}

实现了一个接口,用于设置get方法

package Spider1;

import org.apache.http.client.methods.HttpGet;

/*

 * 设置Get方法的接口

 */

public interface GetMethodSet {
public HttpGet getGetMethod(String url);

}

一个根据html数据下载网页上所有url的类,实现了一个选择url的接口

package Spider1;

import java.util.LinkedList;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

/*

 * 一个用于解析HTML数据的类,筛选出我们需要的链接

 */

public class UrlSpider {
//实现UrlFilter接口
private class Filter implements UrlFilter{
public Boolean isNeed(String url) {
// TODO Auto-generated method stub
if((url.startsWith("/") || url.indexOf("ygdy") > 0)&& !url.startsWith("ftp://")){
return true;
}else{
return false;
}
}
public String overUrl(String url) {
if(url.startsWith("/")){
url = "http://www.ygdy8.com"+url;
}
return url;
}
}
public LinkedList<String> getUrls(String html){
//用于储存爬取的url的链表
LinkedList<String> urls = new LinkedList<String>();
//解析html数据
Document doc = Jsoup.parse(html);
//获取所有a标签
Elements elements = doc.getElementsByTag("a");
Filter urlfilter = null;
try{
urlfilter = new Filter();
}catch(Exception e){

}
for(Element ele:elements){
//获取a标签中的所有链接
String links = ele.attr("href");
if(urlfilter.isNeed(links)){
links = urlfilter.overUrl(links);
urls.add(links);
}
}
System.out.println("已经下载一个网页的所有url");
return urls;
}

}

实现的接口

package Spider1;

public interface UrlFilter {
public Boolean isNeed(String url);
public String overUrl(String url);

}

一个下载页面上的电影资源并且将信息保存到一个map中的类,用的jsoup

package Spider1;

import java.util.HashMap;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

/*

 * 获得一个资源页面上的以下资源

 * 1分类 

 * 2名字

 * 3主演

 * 4介绍

 * 5封面图片

 * 6介绍图片

 * 7地址

 * 8豆瓣评分

 */

public class ResourseDownloader {
public HashMap<String,String> getResourse(String html){
HashMap<String,String> resourse = new HashMap<String,String>();
Document doc = Jsoup.parse(html);
//获取分类,介绍,豆瓣主演
//获取分类
Pattern f = Pattern.compile("类  别\\W+\\<?+");
Matcher fm = f.matcher(html);
if(fm.find()){
String Classification = fm.group();
if(Classification != null &&Classification.length() > 6){
Classification = Classification.substring(5,Classification.length() - 1);
}else{
Classification = "未知类型";
}
resourse.put("Classification",Classification);
}
//获取主演
Pattern z = Pattern.compile("主  演\\W+\\<?+");
Matcher zm = z.matcher(html);
if(zm.find()){
String role = zm.group();
if(role != null  && role.length() > 6){
role = role.substring(5, role.length() - 1);
}else{
role = "未知主角";
}
resourse.put("role",role);
}
//获取介绍
Pattern j = Pattern.compile("简  介.+<img");
Matcher jm = j.matcher(html);
if(jm.find()){
String introduce = jm.group();
if(introduce != null && introduce.length() > 10&&introduce.length()<800){
introduce = introduce.replace("<br />","");
introduce = introduce.substring(5,introduce.length()-4);
}else{
introduce = "无介绍";
}
resourse.put("introduce", introduce);
}
//获取豆瓣
Pattern d = Pattern.compile("豆瓣评分.+/10");
Matcher dm = d.matcher(html);
if(dm.find()){
String douban = dm.group();
douban = douban.substring(5, douban.length()-3);
resourse.put("douban", douban);
}
//获取标题
Elements title = doc.getElementsByTag("title");
resourse.put("movename", title.text());
//获取图片
Elements imgs = doc.getElementsByTag("img");
String[] srcs = new String[2];
int i = 0;
for(Element img:imgs){
if(img.attr("src").endsWith(".jpg")){
if(i>=2){
break;
}
srcs[i] = img.attr("src");
i++;
}
}
resourse.put("titilepicture", srcs[0]);
resourse.put("introducepicture", srcs[1]);
//获取资源
Elements hrefs = doc.getElementsByTag("a");
StringBuilder sb1 = new StringBuilder();
for(Element href:hrefs){
if(href.attr("href").startsWith("ftp://"))
sb1.append(href.attr("href")+":~~~~:");
}
resourse.put("address", sb1.toString());
return resourse;
}

}

一个将数据写到数据库类

package Spider1;

import java.sql.Connection;

import java.sql.PreparedStatement;

import java.sql.SQLException;

import java.util.HashMap;

import org.apache.commons.dbcp.BasicDataSource;

public class MysqlSave {
//public static void main(String[] args) {
//new MysqlSave().savetoSql(new ResourseDownloader().getResourse(new HtmlDownloader().getHtml("http://www.ygdy8.com/html/gndy/dyzz/20170215/53264.html")));
//}
private Connection getConnection(){
Connection con = null;
//创建DataSource接口的实现类对象
BasicDataSource datasource = new BasicDataSource();
//设置连接设置的四个基本信息,用set方法设置
datasource.setDriverClassName("com.mysql.jdbc.Driver");
datasource.setUrl("jdbc:mysql://localhost:3306/mybase");
datasource.setUsername("root");
datasource.setPassword("123");
try{
con = datasource.getConnection();
}catch(SQLException e){
System.out.println(e);
throw new RuntimeException("连接失败");
}
return con;
}
public void savetoSql(HashMap<String,String> resourse){
Connection con = getConnection();
String Classification = null;
String movename = null;
String role = null;
String introduce = null;
String titilepicture = null;
String introducepicture = null;
String address = null;
float douban = 0;
if(resourse.get("Classification") != null && resourse.get("Classification").length() < 20){
Classification = resourse.get("Classification");
}else{
Classification = new String("null");
}
if(resourse.get("movename") != null && resourse.get("movename").length() < 50){
movename = resourse.get("movename");
}else{
movename = new String("unknown name");
}
if(resourse.get("role") != null && resourse.get("role").length() < 20){
role = resourse.get("role");
}else{
role = "unknown";
}
if(resourse.get("introduce") != null && resourse.get("introduce").length()<2000){
introduce = resourse.get("introduce");
}else{
introduce = "null";
}
if(resourse.get("titilepicture") != null && resourse.get("titilepicture").length()<100){
titilepicture = resourse.get("titilepicture");
}else{
titilepicture = "null";
}
if(resourse.get("introducepicture") != null && resourse.get("introducepicture").length() <100){
introducepicture = resourse.get("introducepicture");
}else{
introducepicture = "null";
}
if(resourse.get("address") != null && resourse.get("address").length() < 10000){
address = resourse.get("address");
}else{
address = "null";
}
if((resourse.get("douban")) != null){
try{
Float doubanobj = new Float(resourse.get("douban"));
douban = doubanobj.floatValue();
}catch(Exception e){
System.out.println("数字转换错误");
}
}else{

}
String sql = "INSERT INTO moves (Classification,movename,role,introduce,titilepicture,introducepicture,address,douban)"+

"VALUES(?,?,?,?,?,?,?,?)";
try {
//预编译语句
PreparedStatement pst = con.prepareStatement(sql);
pst.setObject(1,Classification);
pst.setObject(2, movename);
pst.setObject(3, role);
pst.setObject(4, introduce);
pst.setObject(5, titilepicture);
pst.setObject(6, introducepicture);
pst.setObject(7, address);
System.out.println(address);
pst.setObject(8, douban);
int num = pst.executeUpdate();
System.out.println("修改了"+num+"行数据");
System.out.println(movename+":"+role+":"+introduce+":"+titilepicture);
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}

}

爬虫类,开始爬取数据

package Spider1;

import java.util.HashMap;

import java.util.HashSet;

import java.util.Iterator;

import java.util.LinkedList;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/*

 * 爬取电影天堂的页面

 */

public class Spider{
public static void main(String[] args) {
new Spider().URLSpider();
}
public void URLSpider(){
//储存未被遍历的url数据
Queue unVisit = new Queue();
//已经遍历过的url
HashSet<String> Visited = new HashSet<String>();
//初始化Queue,加入初始url
unVisit.push("http://www.ygdy8.com");
//页面下载器
HtmlDownloader htmldownloader = new HtmlDownloader();
//链接下载器
UrlSpider urlspider = new UrlSpider();
//资源下载器
ResourseDownloader resoursedownloader = new ResourseDownloader();
//url种类判断器
urlparse urlchooser = new urlparse();
//资源保存器
MysqlSave save = new MysqlSave();
while(!unVisit.isempty()){
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
String url = unVisit.pop();
if(!Visited.contains(url)){
int num = urlchooser.geturlclass(url);

//获取一个页面的html
String html = null;
try{
html = htmldownloader.getHtml(url);
}catch(IllegalArgumentException e){
System.out.println(url+"非法的url");
continue;
}
//如果是普通网页,下载网页的所有链接
if(num == 0){
LinkedList<String> urls = new LinkedList<String>();
try{
//获得一个页面上的所有linkedList
urls = urlspider.getUrls(html);
}catch(Exception e){
System.out.println(url+"下载失败");
continue;
}
//将这个url加入到已遍历的队列中
Visited.add(url);
//将urls加入的队列中
Iterator<String> it = urls.iterator();
while(it.hasNext()){
String newurl = it.next();
if(!unVisit.contains(newurl)){
unVisit.push(newurl);
}
}
System.out.println("以遍历一个网站的所有url");
}
//如果是资源网站,下载并保存链接
if(num == 1){
try{
System.out.println("开始下载:"+url);
HashMap<String,String> resourse = resoursedownloader.getResourse(html);
save.savetoSql(resourse);
System.out.println("下载了一个电影");
LinkedList<String> urls = new LinkedList<String>();
try{
//获得一个页面上的所有linkedList
urls = urlspider.getUrls(html);
}catch(Exception e){
System.out.println(url+"下载失败");
continue;
}
//将这个url加入到已遍历的队列中
Visited.add(url);
//将urls加入的队列中
Iterator<String> it = urls.iterator();
while(it.hasNext()){
String newurl = it.next();
if(!unVisit.contains(newurl)){
unVisit.push(newurl);
}
}
System.out.println("以遍历一个网站的所有url");
}catch(Exception e){
System.out.println("资源下载错误");
continue;
}
}
}
}
System.out.println("下载完毕");
}
private class urlparse implements UrlChoose{
public int geturlclass(String url) {
Pattern p = Pattern.compile(".+\\d+\\.html");
Matcher m = p.matcher(url);
if(m.matches()){
return 1;
}
return 0;
}
}

}

实现一个接口用于判断url是否为电影页面,如果是,特殊处理

package Spider1;

/*

 * 判断是资源url或者是普通url

 */

public interface UrlChoose {
public int geturlclass(String url);

}

目前爬了三个小时,爬取了大约六千多条电影和电视信息
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: