Crawling web pages with crawler4j and saving the data to a database

package hotkidclub.controller;

import hotkidclub.model.Politic;

import java.io.IOException;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
            + "|png|mp3|mp4|zip|gz))$");

    private final static String URL = "http://politics.people.com.cn";

    /**
     * Decides which URLs get crawled: skip static resources and only follow
     * links under the site's /n1 path.
     */
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches()
                && href.contains(URL + "/n1");
    }

    /**
     * Called after a URL has been downloaded; gives access to the page's URL,
     * text, outgoing links, HTML, and unique ID.
     */
    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        System.out.println("URL: " + url);
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String html = htmlParseData.getHtml();
            Document doc = Jsoup.parse(html);
            List<String> list = new ArrayList<String>();
            Element element1 = doc.select("h1.clear,red").first();
            Elements elements1 = element1.getElementsByAttribute("href");
            list.add(elements1.attr("href")); // value of the href attribute
            Element element2 = doc.select("div.center,fr").first();
            Elements elements2 = element2.getElementsByAttribute("href");
            for (int i = 0; i < 9; i++) { // assumes this block holds at least 9 links
                list.add(elements2.get(i).attr("href"));
            }
            try {
                for (int j = 0; j < list.size(); j++) {
                    Politic politic = new Politic();
                    Document document = Jsoup.connect(URL + list.get(j)).get(); // fetch the article page
                    Elements elements3 = document.select("#rwb_zw");
                    politic.setContent(elements3.select("p").text()); // body text
                    Elements elements4 = document.select("div.clearfix,w1000_320,text_title");
                    politic.setTitle(elements4.select("h1").text()); // headline
                    String str = elements4.select("div.box01 .fl").text();
                    String publishedAt = str.substring(0, 17);
                    SimpleDateFormat sf = new SimpleDateFormat("yyyy年MM月dd日HH:mm"); // byline date format
                    Date date = sf.parse(publishedAt); // parse the byline into a Date
                    politic.setPublishedAt(new Timestamp(date.getTime())); // Date -> Timestamp for the DB column
                    politic.setSource(str.substring(21)); // source (the publishing outlet)
                    if (ConnectionUtil.select(politic.getTitle()) == 0) { // insert only if this title is not stored yet
                        int i = ConnectionUtil.insert(politic);
                        if (i == 0) { // 0 rows affected means the insert failed
                            throw new Exception("Failed to insert the politics news record!");
                        }
                    } else {
                        System.out.println("Record already exists!");
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            } catch (ParseException e) {
                e.printStackTrace();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
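The fixed offsets in visit() (substring(0, 17) for the date, substring(21) for the source) assume the byline is always shaped exactly like "2016年09月09日10:20  来源:人民网"; extra whitespace or a longer source label shifts the cut points and corrupts both fields. A more defensive sketch pulls the two fields out with a regex instead. The BylineParser helper below is hypothetical, not part of the original post:

package hotkidclub.controller;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class BylineParser {

    // Matches bylines such as "2016年09月09日10:20  来源:人民网-人民日报"
    private static final Pattern BYLINE =
            Pattern.compile("(\\d{4}年\\d{2}月\\d{2}日\\d{2}:\\d{2}).*?来源[::]\\s*(.+)");

    public static Date parseDate(String byline) throws ParseException {
        Matcher m = BYLINE.matcher(byline);
        if (!m.find()) {
            throw new ParseException("Unexpected byline format: " + byline, 0);
        }
        return new SimpleDateFormat("yyyy年MM月dd日HH:mm").parse(m.group(1));
    }

    public static String parseSource(String byline) {
        Matcher m = BYLINE.matcher(byline);
        return m.find() ? m.group(2).trim() : "";
    }

    public static void main(String[] args) throws ParseException {
        String byline = "2016年09月09日10:20  来源:人民网-人民日报";
        System.out.println(parseDate(byline) + " / " + parseSource(byline));
    }
}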

************************************************************* Entity class *************************************************************

package hotkidclub.model;

import java.sql.Timestamp;

/**
 * Entity POJO for a politics news article.
 *
 * @author ZhenhuaYuan
 */
public class Politic {

    private Integer key;           // primary key
    private Timestamp publishedAt; // publish time
    private String source;         // source (publishing outlet)
    private String title;          // headline
    private String content;        // body text

    public Integer getKey() {
        return key;
    }

    public void setKey(Integer key) {
        this.key = key;
    }

    public Timestamp getPublishedAt() {
        return publishedAt;
    }

    public void setPublishedAt(Timestamp publishedAt) {
        this.publishedAt = publishedAt;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}

************************************************************* MySQL JDBC connection *************************************************************

package hotkidclub.controller;

import hotkidclub.model.Politic;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

public class ConnectionUtil {

    public static Connection getConn() {
        String driver = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://localhost:8080/news"; // note: 3306 is the usual MySQL port; adjust to your installation
        String username = "root";
        String password = "123456";
        Connection conn = null;
        try {
            Class.forName(driver); // load the JDBC driver class
            conn = DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    public static int insert(Politic politic) {
        Connection conn = getConn();
        int i = 0;
        String sql = "insert into politic(publishedAt,source,title,content) values(?,?,?,?)";
        PreparedStatement pstmt;
        try {
            pstmt = conn.prepareStatement(sql);
            pstmt.setObject(1, politic.getPublishedAt()); // bind parameters dynamically
            pstmt.setObject(2, politic.getSource());
            pstmt.setObject(3, politic.getTitle());
            pstmt.setObject(4, politic.getContent());
            i = pstmt.executeUpdate();
            pstmt.close();
            conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return i;
    }

    public static int select(String title) {
        Connection conn = getConn();
        int i = 0;
        String sql = "SELECT count(*) record FROM politic WHERE title=?";
        PreparedStatement pstmt;
        ResultSet rs;
        try {
            pstmt = conn.prepareStatement(sql);
            pstmt.setObject(1, title);
            rs = pstmt.executeQuery();
            while (rs.next()) {
                i = rs.getInt("record");
            }
            rs.close();
            pstmt.close();
            conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return i;
    }
}
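One caveat with ConnectionUtil: insert() and select() close the statement and connection only on the success path, so any SQLException leaks the connection. A minimal try-with-resources variant of insert() is sketched below (same table, columns, and credentials assumed; the class name SafeInsert is illustrative, not from the original post). Recent versions of MySQL Connector/J also register the driver automatically, so the explicit Class.forName call can be dropped:

package hotkidclub.controller;

import hotkidclub.model.Politic;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

public class SafeInsert {

    private static final String URL = "jdbc:mysql://localhost:8080/news"; // same URL as ConnectionUtil above
    private static final String USER = "root";
    private static final String PASSWORD = "123456";

    public static int insert(Politic politic) {
        String sql = "insert into politic(publishedAt,source,title,content) values(?,?,?,?)";
        // try-with-resources closes the statement and connection on both success and failure paths
        try (Connection conn = DriverManager.getConnection(URL, USER, PASSWORD);
             PreparedStatement pstmt = conn.prepareStatement(sql)) {
            pstmt.setTimestamp(1, politic.getPublishedAt());
            pstmt.setString(2, politic.getSource());
            pstmt.setString(3, politic.getTitle());
            pstmt.setString(4, politic.getContent());
            return pstmt.executeUpdate(); // rows affected; 0 signals failure to the caller
        } catch (SQLException e) {
            e.printStackTrace();
            return 0;
        }
    }
}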

************************************************************* Test class: TestCrawler *************************************************************

package hotkidclub.controller;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class TestCrawler {

    public static void main(String[] args) throws Exception {
        String crawlStorageFolder = "/data/crawl/root"; // folder for intermediate crawl data
        int numberOfCrawlers = 5; // number of concurrent crawler threads

        CrawlConfig config = new CrawlConfig(); // configure before start(); these settings control crawl scope and speed
        config.setCrawlStorageFolder(crawlStorageFolder);
        config.setMaxDepthOfCrawling(1); // maximum crawl depth from the seed
        config.setMaxPagesToFetch(1); // maximum number of pages to fetch in total

        PageFetcher pageFetcher = new PageFetcher(config); // fetches page content over HTTP
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); // robots.txt settings
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher); // consults robots.txt to decide whether a URL may be fetched
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer); // monitors crawler threads and feeds new URLs into the queue

        controller.addSeed("http://politics.people.com.cn/"); // seed URL
        controller.start(MyCrawler.class, numberOfCrawlers); // start crawling (blocks until the crawl finishes)
    }
}
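Note that with setMaxPagesToFetch(1) the crawl budget is a single page, so crawler4j only ever fetches the seed, and the article pages are actually downloaded by Jsoup.connect() inside visit(). If you wanted crawler4j itself to follow the article links, you would loosen those limits and add a politeness delay. The values below are illustrative, not from the original post:

// Inside TestCrawler.main, in place of the config block above (illustrative values).
CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder(crawlStorageFolder);
config.setMaxDepthOfCrawling(2);  // follow links up to two hops from the seed
config.setMaxPagesToFetch(100);   // total page budget; -1 means unlimited
config.setPolitenessDelay(1000);  // wait 1000 ms between requests to the same host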