crawler4j 爬虫网页数据并保存到数据库中
2016-09-09 10:20
591 查看
package hotkidclub.controller;
import hotkidclub.model.Politic;
import java.io.IOException;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
public class MyCrawler extends WebCrawler{

	/**
	 * Skips static/binary resources (css, js, images, audio, archives).
	 * Fix: the original alternation listed "mp3" twice; deduplicated.
	 */
	private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
			+ "|png|mp3|zip|gz))$");

	/** Site root; only links under {URL}/n1 are followed. */
	private final static String URL = "http://politics.people.com.cn";

	/** Maximum number of article links harvested from the list box. */
	private final static int MAX_LIST_LINKS = 9;

	/**
	 * Decides which URLs the crawler should follow: anything under the
	 * politics "/n1" section that is not a static resource.
	 */
	@Override
	public boolean shouldVisit(Page referringPage, WebURL url) {
		String href = url.getURL().toLowerCase();
		return !FILTERS.matcher(href).matches()
				&& href.contains(URL+"/n1");
	}

	/**
	 * Called once a page has been downloaded. Extracts the headline link and
	 * up to {@link #MAX_LIST_LINKS} list links, fetches each article, parses
	 * title / content / publication time / source, and stores articles whose
	 * title is not yet in the database via {@code ConnectionUtil}.
	 */
	@Override
	public void visit(Page page) {
		String url = page.getWebURL().getURL();
		System.out.println("URL: " + url);
		if (!(page.getParseData() instanceof HtmlParseData)) {
			return;
		}
		HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
		Document doc = Jsoup.parse(htmlParseData.getHtml());

		List<String> links = new ArrayList<String>();
		// Headline link. NOTE(review): "h1.clear,red" is a jsoup OR selector
		// (h1.clear elements plus <red> tags); "h1.clear.red" may have been
		// intended -- verify against the live page markup.
		Element headline = doc.select("h1.clear,red").first();
		if (headline != null) { // fix: original NPE'd when the selector matched nothing
			links.add(headline.getElementsByAttribute("href").attr("href"));
		}
		Element listBox = doc.select("div.center,fr").first();
		if (listBox != null) {
			Elements anchors = listBox.getElementsByAttribute("href");
			// Fix: the original indexed 0..8 blindly and threw
			// IndexOutOfBoundsException when fewer than 9 links existed.
			int limit = Math.min(MAX_LIST_LINKS, anchors.size());
			for (int i = 0; i < limit; i++) {
				links.add(anchors.get(i).attr("href"));
			}
		}

		// Publication-time format used by the site; hoisted out of the loop.
		SimpleDateFormat sf = new SimpleDateFormat("yyyy年MM月dd日HH:mm");
		for (String link : links) {
			try {
				Politic politic = new Politic();
				Document document = Jsoup.connect(URL + link).get(); // fetch the article page
				politic.setContent(document.select("#rwb_zw").select("p").text());
				Elements header = document.select("div.clearfix,w1000_320,text_title");
				politic.setTitle(header.select("h1").text());
				// Layout assumption: first 17 chars are the timestamp, the
				// source text starts at offset 21 -- guard instead of letting
				// substring() throw on unexpected markup.
				String str = header.select("div.box01 .fl").text();
				if (str.length() < 21) {
					System.out.println("无法解析时间/来源: " + str);
					continue;
				}
				Date date = sf.parse(str.substring(0, 17));
				politic.setPublishedAt(new Timestamp(date.getTime()));
				politic.setSource(str.substring(21));
				if (ConnectionUtil.select(politic.getTitle()) == 0) { // 0 -> title not stored yet
					// Fix: the original threw Exception here, which aborted the
					// whole loop; one failed insert should not skip the rest.
					if (ConnectionUtil.insert(politic) == 0) {
						System.out.println("新增时政要闻资料失败!");
					}
				} else {
					System.out.println("资料已经存在!");
				}
			} catch (IOException e) {
				e.printStackTrace();
			} catch (ParseException e) {
				e.printStackTrace();
			}
		}
	}
}
*************************************************************实体类***************************************************************************
package hotkidclub.model;
import java.sql.Timestamp;
/**
* 政治 实体pojo
* @author ZhenhuaYuan
*
*/
/**
 * Immutable-in-spirit value holder for one politics news article.
 * Plain JavaBean: one no-arg constructor plus getter/setter pairs.
 */
public class Politic {
	private Integer key;           // primary key
	private Timestamp publishedAt; // publication time
	private String source;         // originating outlet
	private String title;          // headline
	private String content;        // article body text

	public Integer getKey() { return key; }

	public void setKey(Integer key) { this.key = key; }

	public Timestamp getPublishedAt() { return publishedAt; }

	public void setPublishedAt(Timestamp publishedAt) { this.publishedAt = publishedAt; }

	public String getSource() { return source; }

	public void setSource(String source) { this.source = source; }

	public String getTitle() { return title; }

	public void setTitle(String title) { this.title = title; }

	public String getContent() { return content; }

	public void setContent(String content) { this.content = content; }
}
******************************************************************MySQL jdbc数据库连接****************************************************************************************
package hotkidclub.controller;
import hotkidclub.model.Politic;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;
public class ConnectionUtil {
public static Connection getConn() {
String driver = "com.mysql.jdbc.Driver";
String url = "jdbc:mysql://localhost:8080/news";
String username = "root";
String password = "123456";
Connection conn = null;
try {
Class.forName(driver); //classLoader,加载对应驱动
conn = (Connection) DriverManager.getConnection(url, username, password);
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
return conn;
}
public static int insert(Politic politic) {
Connection conn = getConn();
int i = 0;
String sql = "insert into politic(publishedAt,source,title,content) values(?,?,?,?)";
PreparedStatement pstmt;
try {
pstmt = (PreparedStatement) conn.prepareStatement(sql);
pstmt.setObject(1, politic.getPublishedAt());//动态传参数
pstmt.setObject(2, politic.getSource());
pstmt.setObject(3, politic.getTitle());
pstmt.setObject(4, politic.getContent());
i = pstmt.executeUpdate();
pstmt.close();
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
return i;
}
public static int select(String title) {
Connection conn = getConn();
int i = 0;
String sql = "SELECT count(*) record FROM politic WHERE title=?";
PreparedStatement pstmt;
ResultSet rs;
try {
pstmt = (PreparedStatement) conn.prepareStatement(sql);
pstmt.setObject(1, title);
rs = pstmt.executeQuery();
while(rs.next()){
i = rs.getInt("record");
}
rs.close();
pstmt.close();
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
return i;
}
}
**************************************************************************测试类TestCrawler ****************************************************************************************
package hotkidclub.controller;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
public class TestCrawler {

	/**
	 * Configures and launches the crawl: a single seed page, link depth 1,
	 * at most one fetched page, five concurrent crawler threads.
	 */
	public static void main(String[] args) throws Exception {
		final String storageDir = "/data/crawl/root"; // where crawl state is persisted
		final int crawlerCount = 5;                   // concurrent crawler threads

		CrawlConfig cfg = new CrawlConfig();
		cfg.setCrawlStorageFolder(storageDir);
		cfg.setMaxDepthOfCrawling(1); // follow links one hop from the seed
		cfg.setMaxPagesToFetch(1);    // upper bound on pages fetched

		PageFetcher fetcher = new PageFetcher(cfg); // downloads page content
		// Honours robots.txt when deciding what may be crawled.
		RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);

		// The controller tracks crawler state and feeds the URL frontier.
		CrawlController ctl = new CrawlController(cfg, fetcher, robots);
		ctl.addSeed("http://politics.people.com.cn/");
		ctl.start(MyCrawler.class, crawlerCount);
	}
}
import hotkidclub.model.Politic;
import java.io.IOException;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
public class MyCrawler extends WebCrawler{

	/**
	 * Skips static/binary resources (css, js, images, audio, archives).
	 * Fix: the original alternation listed "mp3" twice; deduplicated.
	 */
	private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
			+ "|png|mp3|zip|gz))$");

	/** Site root; only links under {URL}/n1 are followed. */
	private final static String URL = "http://politics.people.com.cn";

	/** Maximum number of article links harvested from the list box. */
	private final static int MAX_LIST_LINKS = 9;

	/**
	 * Decides which URLs the crawler should follow: anything under the
	 * politics "/n1" section that is not a static resource.
	 */
	@Override
	public boolean shouldVisit(Page referringPage, WebURL url) {
		String href = url.getURL().toLowerCase();
		return !FILTERS.matcher(href).matches()
				&& href.contains(URL+"/n1");
	}

	/**
	 * Called once a page has been downloaded. Extracts the headline link and
	 * up to {@link #MAX_LIST_LINKS} list links, fetches each article, parses
	 * title / content / publication time / source, and stores articles whose
	 * title is not yet in the database via {@code ConnectionUtil}.
	 */
	@Override
	public void visit(Page page) {
		String url = page.getWebURL().getURL();
		System.out.println("URL: " + url);
		if (!(page.getParseData() instanceof HtmlParseData)) {
			return;
		}
		HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
		Document doc = Jsoup.parse(htmlParseData.getHtml());

		List<String> links = new ArrayList<String>();
		// Headline link. NOTE(review): "h1.clear,red" is a jsoup OR selector
		// (h1.clear elements plus <red> tags); "h1.clear.red" may have been
		// intended -- verify against the live page markup.
		Element headline = doc.select("h1.clear,red").first();
		if (headline != null) { // fix: original NPE'd when the selector matched nothing
			links.add(headline.getElementsByAttribute("href").attr("href"));
		}
		Element listBox = doc.select("div.center,fr").first();
		if (listBox != null) {
			Elements anchors = listBox.getElementsByAttribute("href");
			// Fix: the original indexed 0..8 blindly and threw
			// IndexOutOfBoundsException when fewer than 9 links existed.
			int limit = Math.min(MAX_LIST_LINKS, anchors.size());
			for (int i = 0; i < limit; i++) {
				links.add(anchors.get(i).attr("href"));
			}
		}

		// Publication-time format used by the site; hoisted out of the loop.
		SimpleDateFormat sf = new SimpleDateFormat("yyyy年MM月dd日HH:mm");
		for (String link : links) {
			try {
				Politic politic = new Politic();
				Document document = Jsoup.connect(URL + link).get(); // fetch the article page
				politic.setContent(document.select("#rwb_zw").select("p").text());
				Elements header = document.select("div.clearfix,w1000_320,text_title");
				politic.setTitle(header.select("h1").text());
				// Layout assumption: first 17 chars are the timestamp, the
				// source text starts at offset 21 -- guard instead of letting
				// substring() throw on unexpected markup.
				String str = header.select("div.box01 .fl").text();
				if (str.length() < 21) {
					System.out.println("无法解析时间/来源: " + str);
					continue;
				}
				Date date = sf.parse(str.substring(0, 17));
				politic.setPublishedAt(new Timestamp(date.getTime()));
				politic.setSource(str.substring(21));
				if (ConnectionUtil.select(politic.getTitle()) == 0) { // 0 -> title not stored yet
					// Fix: the original threw Exception here, which aborted the
					// whole loop; one failed insert should not skip the rest.
					if (ConnectionUtil.insert(politic) == 0) {
						System.out.println("新增时政要闻资料失败!");
					}
				} else {
					System.out.println("资料已经存在!");
				}
			} catch (IOException e) {
				e.printStackTrace();
			} catch (ParseException e) {
				e.printStackTrace();
			}
		}
	}
}
*************************************************************实体类***************************************************************************
package hotkidclub.model;
import java.sql.Timestamp;
/**
* 政治 实体pojo
* @author ZhenhuaYuan
*
*/
/**
 * Immutable-in-spirit value holder for one politics news article.
 * Plain JavaBean: one no-arg constructor plus getter/setter pairs.
 */
public class Politic {
	private Integer key;           // primary key
	private Timestamp publishedAt; // publication time
	private String source;         // originating outlet
	private String title;          // headline
	private String content;        // article body text

	public Integer getKey() { return key; }

	public void setKey(Integer key) { this.key = key; }

	public Timestamp getPublishedAt() { return publishedAt; }

	public void setPublishedAt(Timestamp publishedAt) { this.publishedAt = publishedAt; }

	public String getSource() { return source; }

	public void setSource(String source) { this.source = source; }

	public String getTitle() { return title; }

	public void setTitle(String title) { this.title = title; }

	public String getContent() { return content; }

	public void setContent(String content) { this.content = content; }
}
******************************************************************MySQL jdbc数据库连接****************************************************************************************
package hotkidclub.controller;
import hotkidclub.model.Politic;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;
public class ConnectionUtil {
public static Connection getConn() {
String driver = "com.mysql.jdbc.Driver";
String url = "jdbc:mysql://localhost:8080/news";
String username = "root";
String password = "123456";
Connection conn = null;
try {
Class.forName(driver); //classLoader,加载对应驱动
conn = (Connection) DriverManager.getConnection(url, username, password);
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
return conn;
}
public static int insert(Politic politic) {
Connection conn = getConn();
int i = 0;
String sql = "insert into politic(publishedAt,source,title,content) values(?,?,?,?)";
PreparedStatement pstmt;
try {
pstmt = (PreparedStatement) conn.prepareStatement(sql);
pstmt.setObject(1, politic.getPublishedAt());//动态传参数
pstmt.setObject(2, politic.getSource());
pstmt.setObject(3, politic.getTitle());
pstmt.setObject(4, politic.getContent());
i = pstmt.executeUpdate();
pstmt.close();
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
return i;
}
public static int select(String title) {
Connection conn = getConn();
int i = 0;
String sql = "SELECT count(*) record FROM politic WHERE title=?";
PreparedStatement pstmt;
ResultSet rs;
try {
pstmt = (PreparedStatement) conn.prepareStatement(sql);
pstmt.setObject(1, title);
rs = pstmt.executeQuery();
while(rs.next()){
i = rs.getInt("record");
}
rs.close();
pstmt.close();
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
return i;
}
}
**************************************************************************测试类TestCrawler ****************************************************************************************
package hotkidclub.controller;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
public class TestCrawler {

	/**
	 * Configures and launches the crawl: a single seed page, link depth 1,
	 * at most one fetched page, five concurrent crawler threads.
	 */
	public static void main(String[] args) throws Exception {
		final String storageDir = "/data/crawl/root"; // where crawl state is persisted
		final int crawlerCount = 5;                   // concurrent crawler threads

		CrawlConfig cfg = new CrawlConfig();
		cfg.setCrawlStorageFolder(storageDir);
		cfg.setMaxDepthOfCrawling(1); // follow links one hop from the seed
		cfg.setMaxPagesToFetch(1);    // upper bound on pages fetched

		PageFetcher fetcher = new PageFetcher(cfg); // downloads page content
		// Honours robots.txt when deciding what may be crawled.
		RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);

		// The controller tracks crawler state and feeds the URL frontier.
		CrawlController ctl = new CrawlController(cfg, fetcher, robots);
		ctl.addSeed("http://politics.people.com.cn/");
		ctl.start(MyCrawler.class, crawlerCount);
	}
}
相关文章推荐
- SQL操作 - 插入时更新数据
- 解决MYSQL启动后自动停止问题
- MySQL - 用户变量
- MySQL - 正则表达式
- ELK(ElasticSearch+Logstash+Kibana)+redis日志收集分析系统
- mongodb 创建副本 replSet
- Oracle 存储过程实例2
- 数据库锁表情况
- Oracle 常用函数
- oracle 自定义函数
- Oracle 异常处理
- Oracle 包(package)
- Oracle 存储过程实例
- Oracle 游标
- oracle 函数
- oracle中操作数据
- ORACLE 变量定义
- SqlServer数据库还原与备份所遇到的问题
- oracle 视图
- oracle 存储过程