HttpClient连接网页,Jsoup解析网页
2015-08-11 21:13
411 查看
这两天在爬取某个医疗网站的信息,一开始没有用HttpClient,用的全是Jsoup:用Jsoup连接并解析网页。上网搜了一下,发现HttpClient和Jsoup结合起来也可以获取内容,所以把我GitHub上的一篇代码又翻修了一下。
主类(程序入口):
[code]
package GetMedicineName_001;

import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sun.net.www.http.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import java.io.*;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.lang.String;

/**
 * Created by panlu on 15-8-10.
 *
 * <p>Program entry point: builds one pooled HTTP client, starts one
 * {@link GetThread} per alphabetical index page of jib.xywy.com, and waits
 * for all of them to finish before releasing the client.
 */
public class medicineName {
    private String HTML = null;
    private String URL = null;
    private String masterURL = null;
    private List<String> medicineNames;
    private List<String> medicineLinks;

    /** Initialises the base URL and the (currently unused) result lists. */
    medicineName() {
        URL = "http://jib.xywy.com/html/";
        medicineNames = new LinkedList<String>();
        medicineLinks = new LinkedList<String>();
    }

    /**
     * Fetches every index page listed below concurrently, one thread per page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // One connection pool shared by all worker threads.
        // NOTE(review): the pool is used with its default limits; the HttpClient
        // docs describe a small default per-route cap, so the 15 threads may not
        // all download at once. Raise it with setDefaultMaxPerRoute if desired.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        CloseableHttpClient httpclient1 = HttpClients.custom().setConnectionManager(cm).build();

        try {
            String[] urlToGet = {
                "http://jib.xywy.com/html/a.html",
                "http://jib.xywy.com/html/b.html",
                "http://jib.xywy.com/html/c.html",
                "http://jib.xywy.com/html/d.html",
                "http://jib.xywy.com/html/e.html",
                "http://jib.xywy.com/html/f.html",
                "http://jib.xywy.com/html/g.html",
                "http://jib.xywy.com/html/h.html",
                "http://jib.xywy.com/html/i.html",
                "http://jib.xywy.com/html/j.html",
                "http://jib.xywy.com/html/k.html",
                "http://jib.xywy.com/html/l.html",
                "http://jib.xywy.com/html/m.html",
                "http://jib.xywy.com/html/n.html",
                "http://jib.xywy.com/html/p.html"
            };

            GetThread[] threads = new GetThread[urlToGet.length];
            for (int i = 0; i < threads.length; i++) {
                HttpGet get = new HttpGet(urlToGet[i]);
                threads[i] = new GetThread(httpclient1, get);
            }

            for (GetThread worker : threads) {
                worker.start();
            }

            for (GetThread worker : threads) {
                try {
                    worker.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    // Restore the flag for callers and stop waiting: with the flag
                    // set, every further join() would throw immediately anyway.
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        } finally {
            // Closing the client also shuts down the connection manager it owns,
            // releasing all pooled sockets.
            try {
                httpclient1.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
线程方法:
[code]
package GetMedicineName_001;

import org.apache.commons.httpclient.HttpStatus;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.xml.ws.spi.http.HttpContext;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

/**
 * Created by panlu on 15-8-11.
 *
 * <p>Worker thread that downloads one index page through the shared
 * HttpClient, extracts the links inside elements of class
 * {@code ks-ill-txt}, then loads each linked page with jsoup and prints the
 * part of its title that precedes the first comma.
 */
public class GetThread extends Thread {
    /** Prefix prepended to every href found on the index page. */
    private static final String SITE_ROOT = "http://jib.xywy.com";

    private final CloseableHttpClient client;
    private final HttpClientContext context;
    private final HttpGet httpget;
    /** Body of the index page; stays {@code null} if the download failed. */
    public String html;
    private List<String> medicineLinks;
    private List<String> medicineNames;
    /** URL of the detail page most recently requested by {@link #run()}. */
    public String url;

    /**
     * @param chc shared HTTP client used for the index-page request
     * @param hg  GET request for the index page this thread handles
     */
    public GetThread(CloseableHttpClient chc, HttpGet hg) {
        this.client = chc;
        this.context = HttpClientContext.create();
        this.httpget = hg;
        medicineLinks = new LinkedList<String>();
        medicineNames = new LinkedList<String>();
    }

    /**
     * Downloads the index page, collects the detail links and prints the name
     * taken from each detail page's title. Does nothing further when the index
     * page could not be downloaded.
     */
    @Override
    public void run() {
        html = fetchIndexPage();
        if (html == null) {
            // Nothing to parse; handing null to Jsoup.parse would fail.
            return;
        }

        Document doc = Jsoup.parse(html);
        Elements linksElements = doc.getElementsByClass("ks-ill-txt");
        Elements elemA = linksElements.select("a");
        for (Element e : elemA) {
            medicineLinks.add(e.attr("href"));
        }

        // Iterate directly: indexed get(i) on a LinkedList is linear per call.
        for (String link : medicineLinks) {
            url = SITE_ROOT + link;
            try {
                Document doc2 = Jsoup.connect(url).get();
                // Take the title and keep only the part before the first comma.
                String title = firstSegment(doc2.title());
                medicineNames.add(title);
                System.out.println(title);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Executes the index-page request and decodes the body as gb2312.
     *
     * @return the page body, or {@code null} when the request failed, the
     *         status was not 200, or the response carried no entity
     */
    private String fetchIndexPage() {
        try {
            CloseableHttpResponse response1 = client.execute(httpget, context);
            try {
                int status = response1.getStatusLine().getStatusCode();
                if (status != HttpStatus.SC_OK) {
                    System.err.println("Unexpected HTTP status " + status + " for " + httpget.getURI());
                    return null;
                }
                HttpEntity entity1 = response1.getEntity();
                if (entity1 == null) {
                    return null;
                }
                return EntityUtils.toString(entity1, "gb2312");
            } finally {
                // Always close, including on non-200 responses, so the pooled
                // connection is released.
                response1.close();
            }
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so it is covered here.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns the text before the first comma, or the whole string when it
     * contains no comma. Unlike {@code split(",")[0]}, this cannot fail on a
     * string made only of commas.
     */
    private static String firstSegment(String title) {
        int cut = title.indexOf(',');
        return cut >= 0 ? title.substring(0, cut) : title;
    }
}
相关文章推荐
- linux/unix网络编程之 select
- Android异步加载学习笔记之四:利用缓存优化网络加载图片及ListView加载优化
- 网络编程第二篇
- Linux服务器网络开发模型
- 用Java的HttpClient写一个简单的http请求和返回
- HTTP请求详解
- HDU 4309 Seikimatsu Occult Tonneru 网络流量+像缩进
- 利用【深度网络】高效提取feature
- iOS项目开发实战——iOS网络编程获取网页Html源代码
- 网络流
- 游戏服务器逻辑分析TCP+P2P
- 黑马程序员————网络编程
- 面试复习重点——基础篇:操作系统、计算机网络、设计模式【山科大牛陈磊整理】
- tcp3次握手和4次挥手全过程
- 完美配置Tomcat的HTTPS
- 网络安全
- BP神经网络对音频分类
- 接口传递网络请求结果
- 网络篇----解析json
- 使用WinPcap抓包分析网络协议