您的位置:首页 > 理论基础 > 计算机网络

HttpClient连接网页,Jsoup解析网页

2015-08-11 21:13 411 查看
  这两天在爬取某个医疗网站的信息,一开始没有用HttpClient,用的全是Jsoup,即用Jsoup连接并解析网页。上网搜了一下,发现HttpClient和Jsoup结合起来也可以获取内容,所以把我GitHub上的一篇代码又翻修了一下。

  

[code]package GetMedicineName_001;

import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sun.net.www.http.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

import java.io.*;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.lang.String;
/**
 * Created by panlu on 15-8-10.
 */

/**
 * Entry point of the crawler: starts one {@link GetThread} per index page of
 * jib.xywy.com, all sharing a single pooled HTTP client, and waits for them
 * to finish.
 */
public class medicineName  {
    // The fields below are initialised by the constructor but are not read
    // anywhere in this class; they are kept so the class layout is unchanged.
    private String HTML = null;
    private String URL = null;
    private String masterURL = null;
    private List<String> medicineNames;
    private List<String> medicineLinks;

    medicineName() {
        URL = "http://jib.xywy.com/html/";
        medicineNames = new LinkedList<String>();
        medicineLinks = new LinkedList<String>();
    }

    /**
     * Fetches every index page listed below on its own thread and blocks
     * until all worker threads have terminated.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Create a connection pool shared by all worker threads.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        CloseableHttpClient httpclient1 = HttpClients.custom().setConnectionManager(cm).build();
        try {
            String[] urlToGet = {"http://jib.xywy.com/html/a.html",
                    "http://jib.xywy.com/html/b.html",
                    "http://jib.xywy.com/html/c.html",
                    "http://jib.xywy.com/html/d.html",
                    "http://jib.xywy.com/html/e.html",
                    "http://jib.xywy.com/html/f.html",
                    "http://jib.xywy.com/html/g.html",
                    "http://jib.xywy.com/html/h.html",
                    "http://jib.xywy.com/html/i.html",
                    "http://jib.xywy.com/html/j.html",
                    "http://jib.xywy.com/html/k.html",
                    "http://jib.xywy.com/html/l.html",
                    "http://jib.xywy.com/html/m.html",
                    "http://jib.xywy.com/html/n.html",
                    "http://jib.xywy.com/html/p.html"};

            GetThread[] threads = new GetThread[urlToGet.length];
            for (int i = 0; i < threads.length; i++) {
                threads[i] = new GetThread(httpclient1, new HttpGet(urlToGet[i]));
            }

            for (int j = 0; j < threads.length; j++) {
                threads[j].start();
            }

            // Wait for every worker before the client is closed below. An
            // interrupt is remembered and re-asserted afterwards instead of
            // being dropped, so no worker is abandoned mid-request.
            boolean interrupted = false;
            for (int j = 0; j < threads.length; j++) {
                boolean joined = false;
                while (!joined) {
                    try {
                        threads[j].join();
                        joined = true;
                    } catch (InterruptedException e) {
                        interrupted = true;
                    }
                }
            }
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        } finally {
            // Release the pooled connections once all work is done.
            try {
                httpclient1.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}


线程方法:

[code]package GetMedicineName_001;

import org.apache.commons.httpclient.HttpStatus;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javax.xml.ws.spi.http.HttpContext;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

/**
 * Created by panlu on 15-8-11.
 */
/**
 * Worker thread that downloads one index page with the shared HTTP client,
 * collects the links found inside elements of class {@code ks-ill-txt}, then
 * fetches each linked page with jsoup and prints the part of its title that
 * precedes the first comma.
 */
public class GetThread extends Thread {
    private final CloseableHttpClient client;
    private final HttpClientContext context;
    private final HttpGet httpget;
    /** Body of the index page, or {@code null} if it could not be fetched. */
    public String html;
    private List<String> medicineLinks;
    private List<String> medicineNames;
    /** URL of the detail page most recently requested by {@link #run()}. */
    public String url;

    /**
     * @param chc HTTP client used to execute the index-page request
     * @param hg  GET request for the index page this thread processes
     */
    public GetThread(CloseableHttpClient chc,HttpGet hg){
        this.client = chc;
        this.context = HttpClientContext.create();
        this.httpget = hg;
        medicineLinks = new LinkedList<String>();
        medicineNames = new LinkedList<String>();
    }

    /**
     * Downloads the index page, extracts the detail links and prints one name
     * per detail page. Returns without doing anything further when the index
     * page is unavailable.
     */
    @Override
    public void run(){
        html = fetchIndexPage();
        if (html == null) {
            // Nothing to parse; passing null to Jsoup.parse would throw.
            System.err.println("Skipping " + httpget.getURI() + ": index page could not be fetched");
            return;
        }

        Document doc = Jsoup.parse(html);
        Elements linksElements = doc.getElementsByClass("ks-ill-txt");
        for (Element anchor : linksElements.select("a")) {
            medicineLinks.add(anchor.attr("href"));
        }

        for (String link : medicineLinks) {
            url = "http://jib.xywy.com" + link;
            try {
                Document detail = Jsoup.connect(url).get();
                String title = firstTitleSegment(detail.title());
                medicineNames.add(title);
                System.out.println(title);
            } catch (IOException e) {
                // One unreachable detail page must not stop the remaining ones.
                e.printStackTrace();
            }
        }
    }

    /**
     * Executes the index-page request and decodes the body as gb2312.
     *
     * @return the page body, or {@code null} when the request fails, the
     *         status is not 200, or the response carries no entity
     */
    private String fetchIndexPage() {
        CloseableHttpResponse response = null;
        try {
            response = client.execute(httpget, context);
            int status = response.getStatusLine().getStatusCode();
            if (status != HttpStatus.SC_OK) {
                System.err.println("Unexpected HTTP status " + status + " for " + httpget.getURI());
                return null;
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            return EntityUtils.toString(entity, "gb2312");
        } catch (IOException e) {
            // Also covers ClientProtocolException, which is an IOException.
            e.printStackTrace();
            return null;
        } finally {
            // Always close the response, including on non-200 statuses.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Returns the text of {@code title} before its first comma, or the whole
     * title when it contains none. Unlike {@code split(",")[0]} this cannot
     * fail on a title made up only of commas.
     */
    private static String firstTitleSegment(String title) {
        int comma = title.indexOf(',');
        return comma < 0 ? title : title.substring(0, comma);
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: