您的位置:首页 > 理论基础 > 计算机网络

网络爬虫爬取全国省市区(动态ip代理的获取,实现对ip限制的突破)

2018-01-26 17:43 816 查看
项目中用到的包结构

项目使用Jsoup进行网络的连接与网页的解析,使用dbutils进行dao操作,使用c3p0进行数据库连接的管理

源代码下载地址:http://download.csdn.net/detail/chen1chen2chen3/9598202 (点击打开链接)

爬虫程序的入口:

[java] view
plain copy

package com.crawlercity.main;  

  

  

import org.jsoup.nodes.Document;  

  

import com.crawlercity.util.HttpUtils;  

import com.crawlercity.util.JsoupUtils;  

  

public class Main {  

    public static void main(String[] args) {  

        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";// 初始解析网页地址  

        // 设置代理ip  

        HttpUtils.setProxyIp();  

        Document document = JsoupUtils.getDocument(url);// 得到的document一定是正常 的document  

        JsoupUtils.analysisDocument(document);  

          

    }  

}  

用于动态ip代理的工具类HttpUtils

[java] view
plain copy

package com.crawlercity.util;  

  

  

import java.io.BufferedReader;  

import java.io.IOException;  

import java.io.InputStreamReader;  

import java.util.ArrayList;  

import java.util.List;  

import java.util.Random;  

  

/**
 * Helper that switches the JVM-wide HTTP proxy to a randomly chosen entry of the
 * proxy list shipped on the classpath.
 */
public class HttpUtils {

    /** Classpath location of the proxy list; each usable line has the form {@code host:port}. */
    private static final String PROXY_LIST_RESOURCE = "/proxyip.txt";

    /**
     * Picks a random {@code host:port} entry from the proxy list and installs it through the
     * {@code http.proxyHost} / {@code http.proxyPort} system properties.
     *
     * <p>Blank lines and lines that do not contain both a host and a port are ignored.
     *
     * @throws IllegalStateException if the proxy list is missing, unreadable, or contains no
     *         usable entry. These failures are deterministic, so retrying (as the previous
     *         implementation did through unbounded recursion) could never succeed.
     */
    public static void setProxyIp() {
        List<String> candidates = loadProxyCandidates();
        if (candidates.isEmpty()) {
            throw new IllegalStateException(
                    "no usable host:port entry found in " + PROXY_LIST_RESOURCE);
        }

        Random random = new Random();
        String ipport = candidates.get(random.nextInt(candidates.size()));

        // Split on the last ':'; loadProxyCandidates guarantees it has text on both sides.
        int separator = ipport.lastIndexOf(":");
        String proxyIp = ipport.substring(0, separator);
        String proxyPort = ipport.substring(separator + 1);

        System.setProperty("http.maxRedirects", "50");
        System.setProperty("proxySet", "true");
        System.setProperty("http.proxyHost", proxyIp);
        System.setProperty("http.proxyPort", proxyPort);

        System.out.println("设置代理ip为:" + proxyIp + "端口号为:" + proxyPort);
    }

    /**
     * Reads the proxy list and returns the trimmed lines that look like {@code host:port}.
     * The reader is always closed, also when reading fails.
     */
    private static List<String> loadProxyCandidates() {
        java.io.InputStream in = HttpUtils.class.getResourceAsStream(PROXY_LIST_RESOURCE);
        if (in == null) {
            throw new IllegalStateException(
                    "proxy list not found on the classpath: " + PROXY_LIST_RESOURCE);
        }

        List<String> candidates = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String entry = line.trim();
                int separator = entry.lastIndexOf(":");
                // Keep the entry only if something stands before and after the last ':'.
                if (separator > 0 && separator < entry.length() - 1) {
                    candidates.add(entry);
                }
            }
        } catch (IOException e) {
            throw new IllegalStateException(
                    "failed to read proxy list " + PROXY_LIST_RESOURCE, e);
        }
        return candidates;
    }
}

用于获取document对象的工具类JsoupUtils

[java] view
plain copy

public static Document getDocument(String url) {  

            try {  

                Document document = Jsoup.connect(url).timeout(70).get();  

                  

                if(document == null || document.toString().trim().equals("")) {// 表示ip被拦截或者其他情况  

                    System.out.println("出现ip被拦截或者其他情况");  

                    HttpUtils.setProxyIp();  

                    getDocument(url);  

                }  

                  

                return document;  

            } catch (Exception e) { // 链接超时等其他情况  

                System.out.println("出现链接超时等其他情况");  

                HttpUtils.setProxyIp();// 换代理ip  

                getDocument(url);// 继续爬取网页  

            }  

            return getDocument(url);  

        }  

用于解析html文档的工具类JsoupUtils

[java] view
plain copy

public static void analysisDocument(Document document) {  

        try {  

            String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/";  

            CityInfo cityInfo1 = new CityInfo();  

            CityInfo cityInfo2 = new CityInfo();  

            CityInfo cityInfo3 = new CityInfo();  

            CityInfo cityInfo4 = new CityInfo();  

            CityInfo cityInfo5 = new CityInfo();  

            // 将类型为1(省)的数据分析并且插入  

            Elements elements1 = document.getElementsByAttributeValue("class", "provincetr");  

            for(Element element1 : elements1) {  

                Elements elements2 = element1.getElementsByTag("a");  

                for(Element element2 : elements2) {  

                    cityInfo1.setName(element2.text());  

                    cityInfo1.setParentId(0);  

                    cityInfo1.setType(1);  

                    cityInfo1.setUrl(baseUrl + element2.attr("href"));  

//                  System.out.println("cityInfo1" + cityInfo1.toString());  

                    int key1 = DBUtils.insertCityInfo(cityInfo1);  

                    Document document2 = getDocument(cityInfo1.getUrl());  

                    Elements elements3 = document2.getElementsByAttributeValue("class", "citytr");  

                    for(Element element3 : elements3) {  

                        Elements elements4 = element3.getElementsByTag("a");  

                        if(elements4.toString().trim().equals("")) {  

                            Elements diffElements = element3.getElementsByTag("td");  

                            cityInfo2.setCode(diffElements.get(0).text());  

                            cityInfo2.setName(diffElements.get(1).text());  

                            cityInfo2.setParentId(key1);  

                            cityInfo2.setType(2);  

                            continue;  

                        }  

                        cityInfo2.setCode(elements4.get(0).text());  

                        cityInfo2.setName(elements4.get(1).text());  

                        cityInfo2.setUrl(baseUrl + elements4.get(1).attr("href"));  

                        cityInfo2.setParentId(key1);  

                        cityInfo2.setType(2);  

                        /*System.out.println("cityInfo2" + cityInfo2.toString());*/  

                        int key2 = DBUtils.insertCityInfo(cityInfo2);  

                        Document document3 = getDocument(cityInfo2.getUrl());  

                        Elements elements5 = document3.getElementsByAttributeValue("class", "countytr");  

                        for(Element element5 : elements5) {  

                            Elements elements6 = element5.getElementsByTag("a");  

                            if(elements6.toString().trim().equals("")) {  

                                Elements diffElements = element5.getElementsByTag("td");  

                                cityInfo3.setCode(diffElements.get(0).text());  

                                cityInfo3.setName(diffElements.get(1).text());  

                                cityInfo3.setParentId(key2);  

                                cityInfo3.setType(3);  

                                continue;  

                            }  

                            cityInfo3.setCode(elements6.get(0).text());  

                            cityInfo3.setName(elements6.get(1).text());  

                            String cityInfo2Url = cityInfo2.getUrl();  

                            cityInfo3.setUrl(cityInfo2Url.substring(0, cityInfo2Url.lastIndexOf("/") + 1) + elements6.get(1).attr("href"));  

                            cityInfo3.setParentId(key2);  

                            cityInfo3.setType(3);  

                    /*      System.out.println("cityInfo3" + cityInfo3.toString());*/  

                            int key3 = DBUtils.insertCityInfo(cityInfo3);  

                            Document document4 = getDocument(cityInfo3.getUrl());  

                            Elements elements7 = document4.getElementsByAttributeValue("class", "towntr");  

                            for(Element element7 : elements7) {  

                                Elements elements8 = element7.getElementsByTag("a");  

                                System.out.println(elements8.toString());  

                                if(elements8.toString().trim().equals("")) {// 表示没有a标签  

                                    Elements diffElements = element7.getElementsByTag("td");  

                                    cityInfo4.setCode(diffElements.get(0).text());  

                                    cityInfo4.setName(diffElements.get(1).text());  

                                    cityInfo4.setParentId(key3);  

                                    cityInfo4.setType(4);  

                                    continue;  

                                }  

                                cityInfo4.setCode(elements8.get(0).text());  

                                cityInfo4.setName(elements8.get(1).text());  

                                String cityInfo3Url = cityInfo3.getUrl();  

                                cityInfo4.setUrl(cityInfo3Url.substring(0, cityInfo3Url.lastIndexOf("/") + 1) + elements8.get(1).attr("href"));  

                                cityInfo4.setParentId(key3);  

                                cityInfo4.setType(4);  

//                              System.out.println("cityInfo4" + cityInfo4.toString());  

                                int key4 = DBUtils.insertCityInfo(cityInfo4);  

                                Document document5 = getDocument(cityInfo4.getUrl());  

                                Elements elements9 = document5.getElementsByAttributeValue("class", "villagetr");  

                                for(Element element8 : elements9) {  

                                    Elements elements10 = element8.getElementsByTag("td");  

                                    cityInfo5.setCode(elements10.get(0).text());  

                                    cityInfo5.setName(elements10.get(2).text());  

                                    cityInfo5.setParentId(key4);  

                                    cityInfo5.setType(5);  

                                    /*System.out.println("cityInfo5" + cityInfo5.toString());*/  

                                    DBUtils.insertCityInfo(cityInfo5);  

                                }  

                            }  

                        }  

                    }  

                }  

            }  

              

        } catch (Exception e) {  

            e.printStackTrace();  

        }  

用于dao操作的工具类DBUtils

[java] view
plain copy

package com.crawlercity.util;  

  

import java.sql.Connection;  

import java.sql.ResultSet;  

import java.sql.SQLException;  

import java.sql.Statement;  

  

import javax.sql.DataSource;  

  

import org.apache.commons.dbutils.QueryRunner;  

import org.apache.commons.dbutils.handlers.ScalarHandler;  

  

import com.crawlercity.model.CityInfo;  

import com.mchange.v2.c3p0.ComboPooledDataSource;  

  

public class DBUtils {  

  

    private static DataSource ds = null;  

  

    public static Connection getConnection() {  

        if(ds == null) {  

            ds = new ComboPooledDataSource();  

        }  

        try {  

            return ds.getConnection();  

        } catch (SQLException e) {  

            e.printStackTrace();  

        }  

        return null;  

    }  

  

    public static DataSource getDataSource() {  

        return ds == null ? new ComboPooledDataSource() : ds;  

    }  

      

  

    public static void releaseSource(Connection conn, Statement st, ResultSet rs) {  

        try {  

            if(rs != null && !rs.isClosed()) {  

                rs.close();  

            }  

            if(st != null && !st.isClosed()) {  

                st.close();  

            }  

            if(conn != null && !conn.isClosed()) {  

                conn.close();  

            }  

        } catch (Exception e) {  

            e.printStackTrace();  

        }  

    }  

      

    public static int insertCityInfo(CityInfo cityInfo) {  

          

        Connection connection = DBUtils.getConnection();  

        QueryRunner qr = new QueryRunner();  

        String sql1 = "insert into cityinfo values (?,?,?,?,?,?)";  

        // 返回主键  

        String sql2 = "SELECT LAST_INSERT_ID()";  

          

        try {  

            int result = qr.update(connection, sql1, null, cityInfo.getParentId(), cityInfo.getType(), cityInfo.getName(), cityInfo.getCode(), cityInfo.getUrl());  

            int key = Integer.parseInt(qr.query(connection, sql2, new ScalarHandler<>()).toString());  

            releaseSource(connection, null, null);  

            return key;  

        } catch (SQLException e) {  

            e.printStackTrace();  

        }  

        return 0;  

    }  

}  

写代码的过程中出现了一些问题如: Jsoup如何在设置编码的同时设置连接超时,如何在超时或者动态ip代理无效的时候重新获取动态ip代理,如何在解析html失败后继续解析等。

通过这次编程发现自己在java网络方面的知识还是有待提高,以后继续努力!
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: