crawler4j 爬虫网页数据并保存到数据库中
2016-09-09 10:20
591 查看
package hotkidclub.controller;
import hotkidclub.model.Politic;
import java.io.IOException;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
public class MyCrawler extends WebCrawler{

	/**
	 * Skips static/binary resources (css, js, images, audio, archives).
	 * Fix: the original alternation listed "mp3" twice; deduplicated.
	 */
	private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
			+ "|png|mp3|zip|gz))$");

	/** Site root; only links under {URL}/n1 are followed. */
	private final static String URL = "http://politics.people.com.cn";

	/** Maximum number of article links harvested from the list box. */
	private final static int MAX_LIST_LINKS = 9;

	/**
	 * Decides which URLs the crawler should follow: anything under the
	 * politics "/n1" section that is not a static resource.
	 */
	@Override
	public boolean shouldVisit(Page referringPage, WebURL url) {
		String href = url.getURL().toLowerCase();
		return !FILTERS.matcher(href).matches()
				&& href.contains(URL+"/n1");
	}

	/**
	 * Called once a page has been downloaded. Extracts the headline link and
	 * up to {@link #MAX_LIST_LINKS} list links, fetches each article, parses
	 * title / content / publication time / source, and stores articles whose
	 * title is not yet in the database via {@code ConnectionUtil}.
	 */
	@Override
	public void visit(Page page) {
		String url = page.getWebURL().getURL();
		System.out.println("URL: " + url);
		if (!(page.getParseData() instanceof HtmlParseData)) {
			return;
		}
		HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
		Document doc = Jsoup.parse(htmlParseData.getHtml());

		List<String> links = new ArrayList<String>();
		// Headline link. NOTE(review): "h1.clear,red" is a jsoup OR selector
		// (h1.clear elements plus <red> tags); "h1.clear.red" may have been
		// intended -- verify against the live page markup.
		Element headline = doc.select("h1.clear,red").first();
		if (headline != null) { // fix: original NPE'd when the selector matched nothing
			links.add(headline.getElementsByAttribute("href").attr("href"));
		}
		Element listBox = doc.select("div.center,fr").first();
		if (listBox != null) {
			Elements anchors = listBox.getElementsByAttribute("href");
			// Fix: the original indexed 0..8 blindly and threw
			// IndexOutOfBoundsException when fewer than 9 links existed.
			int limit = Math.min(MAX_LIST_LINKS, anchors.size());
			for (int i = 0; i < limit; i++) {
				links.add(anchors.get(i).attr("href"));
			}
		}

		// Publication-time format used by the site; hoisted out of the loop.
		SimpleDateFormat sf = new SimpleDateFormat("yyyy年MM月dd日HH:mm");
		for (String link : links) {
			try {
				Politic politic = new Politic();
				Document document = Jsoup.connect(URL + link).get(); // fetch the article page
				politic.setContent(document.select("#rwb_zw").select("p").text());
				Elements header = document.select("div.clearfix,w1000_320,text_title");
				politic.setTitle(header.select("h1").text());
				// Layout assumption: first 17 chars are the timestamp, the
				// source text starts at offset 21 -- guard instead of letting
				// substring() throw on unexpected markup.
				String str = header.select("div.box01 .fl").text();
				if (str.length() < 21) {
					System.out.println("无法解析时间/来源: " + str);
					continue;
				}
				Date date = sf.parse(str.substring(0, 17));
				politic.setPublishedAt(new Timestamp(date.getTime()));
				politic.setSource(str.substring(21));
				if (ConnectionUtil.select(politic.getTitle()) == 0) { // 0 -> title not stored yet
					// Fix: the original threw Exception here, which aborted the
					// whole loop; one failed insert should not skip the rest.
					if (ConnectionUtil.insert(politic) == 0) {
						System.out.println("新增时政要闻资料失败!");
					}
				} else {
					System.out.println("资料已经存在!");
				}
			} catch (IOException e) {
				e.printStackTrace();
			} catch (ParseException e) {
				e.printStackTrace();
			}
		}
	}
}
*************************************************************实体类***************************************************************************
package hotkidclub.model;
import java.sql.Timestamp;
/**
* 政治 实体pojo
* @author ZhenhuaYuan
*
*/
/**
 * Immutable-in-spirit value holder for one politics news article.
 * Plain JavaBean: one no-arg constructor plus getter/setter pairs.
 */
public class Politic {
	private Integer key;           // primary key
	private Timestamp publishedAt; // publication time
	private String source;         // originating outlet
	private String title;          // headline
	private String content;        // article body text

	public Integer getKey() { return key; }

	public void setKey(Integer key) { this.key = key; }

	public Timestamp getPublishedAt() { return publishedAt; }

	public void setPublishedAt(Timestamp publishedAt) { this.publishedAt = publishedAt; }

	public String getSource() { return source; }

	public void setSource(String source) { this.source = source; }

	public String getTitle() { return title; }

	public void setTitle(String title) { this.title = title; }

	public String getContent() { return content; }

	public void setContent(String content) { this.content = content; }
}
******************************************************************MySQL jdbc数据库连接****************************************************************************************
package hotkidclub.controller;
import hotkidclub.model.Politic;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;
public class ConnectionUtil {
public static Connection getConn() {
String driver = "com.mysql.jdbc.Driver";
String url = "jdbc:mysql://localhost:8080/news";
String username = "root";
String password = "123456";
Connection conn = null;
try {
Class.forName(driver); //classLoader,加载对应驱动
conn = (Connection) DriverManager.getConnection(url, username, password);
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
return conn;
}
public static int insert(Politic politic) {
Connection conn = getConn();
int i = 0;
String sql = "insert into politic(publishedAt,source,title,content) values(?,?,?,?)";
PreparedStatement pstmt;
try {
pstmt = (PreparedStatement) conn.prepareStatement(sql);
pstmt.setObject(1, politic.getPublishedAt());//动态传参数
pstmt.setObject(2, politic.getSource());
pstmt.setObject(3, politic.getTitle());
pstmt.setObject(4, politic.getContent());
i = pstmt.executeUpdate();
pstmt.close();
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
return i;
}
public static int select(String title) {
Connection conn = getConn();
int i = 0;
String sql = "SELECT count(*) record FROM politic WHERE title=?";
PreparedStatement pstmt;
ResultSet rs;
try {
pstmt = (PreparedStatement) conn.prepareStatement(sql);
pstmt.setObject(1, title);
rs = pstmt.executeQuery();
while(rs.next()){
i = rs.getInt("record");
}
rs.close();
pstmt.close();
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
return i;
}
}
**************************************************************************测试类TestCrawler ****************************************************************************************
package hotkidclub.controller;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
public class TestCrawler {

	/**
	 * Configures and launches the crawl: a single seed page, link depth 1,
	 * at most one fetched page, five concurrent crawler threads.
	 */
	public static void main(String[] args) throws Exception {
		final String storageDir = "/data/crawl/root"; // where crawl state is persisted
		final int crawlerCount = 5;                   // concurrent crawler threads

		CrawlConfig cfg = new CrawlConfig();
		cfg.setCrawlStorageFolder(storageDir);
		cfg.setMaxDepthOfCrawling(1); // follow links one hop from the seed
		cfg.setMaxPagesToFetch(1);    // upper bound on pages fetched

		PageFetcher fetcher = new PageFetcher(cfg); // downloads page content
		// Honours robots.txt when deciding what may be crawled.
		RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);

		// The controller tracks crawler state and feeds the URL frontier.
		CrawlController ctl = new CrawlController(cfg, fetcher, robots);
		ctl.addSeed("http://politics.people.com.cn/");
		ctl.start(MyCrawler.class, crawlerCount);
	}
}
import hotkidclub.model.Politic;
import java.io.IOException;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
public class MyCrawler extends WebCrawler{

	/**
	 * Skips static/binary resources (css, js, images, audio, archives).
	 * Fix: the original alternation listed "mp3" twice; deduplicated.
	 */
	private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
			+ "|png|mp3|zip|gz))$");

	/** Site root; only links under {URL}/n1 are followed. */
	private final static String URL = "http://politics.people.com.cn";

	/** Maximum number of article links harvested from the list box. */
	private final static int MAX_LIST_LINKS = 9;

	/**
	 * Decides which URLs the crawler should follow: anything under the
	 * politics "/n1" section that is not a static resource.
	 */
	@Override
	public boolean shouldVisit(Page referringPage, WebURL url) {
		String href = url.getURL().toLowerCase();
		return !FILTERS.matcher(href).matches()
				&& href.contains(URL+"/n1");
	}

	/**
	 * Called once a page has been downloaded. Extracts the headline link and
	 * up to {@link #MAX_LIST_LINKS} list links, fetches each article, parses
	 * title / content / publication time / source, and stores articles whose
	 * title is not yet in the database via {@code ConnectionUtil}.
	 */
	@Override
	public void visit(Page page) {
		String url = page.getWebURL().getURL();
		System.out.println("URL: " + url);
		if (!(page.getParseData() instanceof HtmlParseData)) {
			return;
		}
		HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
		Document doc = Jsoup.parse(htmlParseData.getHtml());

		List<String> links = new ArrayList<String>();
		// Headline link. NOTE(review): "h1.clear,red" is a jsoup OR selector
		// (h1.clear elements plus <red> tags); "h1.clear.red" may have been
		// intended -- verify against the live page markup.
		Element headline = doc.select("h1.clear,red").first();
		if (headline != null) { // fix: original NPE'd when the selector matched nothing
			links.add(headline.getElementsByAttribute("href").attr("href"));
		}
		Element listBox = doc.select("div.center,fr").first();
		if (listBox != null) {
			Elements anchors = listBox.getElementsByAttribute("href");
			// Fix: the original indexed 0..8 blindly and threw
			// IndexOutOfBoundsException when fewer than 9 links existed.
			int limit = Math.min(MAX_LIST_LINKS, anchors.size());
			for (int i = 0; i < limit; i++) {
				links.add(anchors.get(i).attr("href"));
			}
		}

		// Publication-time format used by the site; hoisted out of the loop.
		SimpleDateFormat sf = new SimpleDateFormat("yyyy年MM月dd日HH:mm");
		for (String link : links) {
			try {
				Politic politic = new Politic();
				Document document = Jsoup.connect(URL + link).get(); // fetch the article page
				politic.setContent(document.select("#rwb_zw").select("p").text());
				Elements header = document.select("div.clearfix,w1000_320,text_title");
				politic.setTitle(header.select("h1").text());
				// Layout assumption: first 17 chars are the timestamp, the
				// source text starts at offset 21 -- guard instead of letting
				// substring() throw on unexpected markup.
				String str = header.select("div.box01 .fl").text();
				if (str.length() < 21) {
					System.out.println("无法解析时间/来源: " + str);
					continue;
				}
				Date date = sf.parse(str.substring(0, 17));
				politic.setPublishedAt(new Timestamp(date.getTime()));
				politic.setSource(str.substring(21));
				if (ConnectionUtil.select(politic.getTitle()) == 0) { // 0 -> title not stored yet
					// Fix: the original threw Exception here, which aborted the
					// whole loop; one failed insert should not skip the rest.
					if (ConnectionUtil.insert(politic) == 0) {
						System.out.println("新增时政要闻资料失败!");
					}
				} else {
					System.out.println("资料已经存在!");
				}
			} catch (IOException e) {
				e.printStackTrace();
			} catch (ParseException e) {
				e.printStackTrace();
			}
		}
	}
}
*************************************************************实体类***************************************************************************
package hotkidclub.model;
import java.sql.Timestamp;
/**
* 政治 实体pojo
* @author ZhenhuaYuan
*
*/
/**
 * Immutable-in-spirit value holder for one politics news article.
 * Plain JavaBean: one no-arg constructor plus getter/setter pairs.
 */
public class Politic {
	private Integer key;           // primary key
	private Timestamp publishedAt; // publication time
	private String source;         // originating outlet
	private String title;          // headline
	private String content;        // article body text

	public Integer getKey() { return key; }

	public void setKey(Integer key) { this.key = key; }

	public Timestamp getPublishedAt() { return publishedAt; }

	public void setPublishedAt(Timestamp publishedAt) { this.publishedAt = publishedAt; }

	public String getSource() { return source; }

	public void setSource(String source) { this.source = source; }

	public String getTitle() { return title; }

	public void setTitle(String title) { this.title = title; }

	public String getContent() { return content; }

	public void setContent(String content) { this.content = content; }
}
******************************************************************MySQL jdbc数据库连接****************************************************************************************
package hotkidclub.controller;
import hotkidclub.model.Politic;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;
public class ConnectionUtil {
public static Connection getConn() {
String driver = "com.mysql.jdbc.Driver";
String url = "jdbc:mysql://localhost:8080/news";
String username = "root";
String password = "123456";
Connection conn = null;
try {
Class.forName(driver); //classLoader,加载对应驱动
conn = (Connection) DriverManager.getConnection(url, username, password);
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
return conn;
}
public static int insert(Politic politic) {
Connection conn = getConn();
int i = 0;
String sql = "insert into politic(publishedAt,source,title,content) values(?,?,?,?)";
PreparedStatement pstmt;
try {
pstmt = (PreparedStatement) conn.prepareStatement(sql);
pstmt.setObject(1, politic.getPublishedAt());//动态传参数
pstmt.setObject(2, politic.getSource());
pstmt.setObject(3, politic.getTitle());
pstmt.setObject(4, politic.getContent());
i = pstmt.executeUpdate();
pstmt.close();
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
return i;
}
public static int select(String title) {
Connection conn = getConn();
int i = 0;
String sql = "SELECT count(*) record FROM politic WHERE title=?";
PreparedStatement pstmt;
ResultSet rs;
try {
pstmt = (PreparedStatement) conn.prepareStatement(sql);
pstmt.setObject(1, title);
rs = pstmt.executeQuery();
while(rs.next()){
i = rs.getInt("record");
}
rs.close();
pstmt.close();
conn.close();
} catch (SQLException e) {
e.printStackTrace();
}
return i;
}
}
**************************************************************************测试类TestCrawler ****************************************************************************************
package hotkidclub.controller;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
public class TestCrawler {

	/**
	 * Configures and launches the crawl: a single seed page, link depth 1,
	 * at most one fetched page, five concurrent crawler threads.
	 */
	public static void main(String[] args) throws Exception {
		final String storageDir = "/data/crawl/root"; // where crawl state is persisted
		final int crawlerCount = 5;                   // concurrent crawler threads

		CrawlConfig cfg = new CrawlConfig();
		cfg.setCrawlStorageFolder(storageDir);
		cfg.setMaxDepthOfCrawling(1); // follow links one hop from the seed
		cfg.setMaxPagesToFetch(1);    // upper bound on pages fetched

		PageFetcher fetcher = new PageFetcher(cfg); // downloads page content
		// Honours robots.txt when deciding what may be crawled.
		RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);

		// The controller tracks crawler state and feeds the URL frontier.
		CrawlController ctl = new CrawlController(cfg, fetcher, robots);
		ctl.addSeed("http://politics.people.com.cn/");
		ctl.start(MyCrawler.class, crawlerCount);
	}
}
相关文章推荐
- SQL操作 - 插入时更新数据
- 解决MYSQL启动后自动停止问题
- MySQL - 用户变量
- MySQL - 正则表达式
- ELK(ElasticSearch+Logstash+Kibana)+redis日志收集分析系统
- mongodb 创建副本 replSet
- Oracle 存储过程实例2
- 数据库锁表情况
- Oracle 常用函数
- oracle 自定义函数
- Oracle 异常处理
- Oracle 包(package)
- Oracle 存储过程实例
- Oracle 游标
- oracle 函数
- oracle中操作数据
- ORACLE 变量定义
- SqlServer数据库还原与备份所遇到的问题
- oracle 视图
- oracle 存储过程