【网络爬虫】java 使用Socket, HttpUrlConnection方式抓取数据
2017-04-29 22:23
826 查看
结构:
公共方法
url任务的pojo类
import com.tsj.simple.enumeration.TaskLevel;

// Fixed: the java.net imports were missing from the snippet.
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

/**
 * POJO describing a single crawl task: the target URL plus its priority.
 *
 * @author tsj-pc
 */
public class UrlPojo {

    private String url;
    // Default priority when the single-argument constructor is used.
    private TaskLevel taskLevel = TaskLevel.MIDDLE;

    public UrlPojo(String url) {
        this.url = url;
    }

    public UrlPojo(String url, TaskLevel taskLevel) {
        this.url = url;
        this.taskLevel = taskLevel;
    }

    @Override
    public String toString() {
        return "UrlPojo [taskLevel=" + taskLevel + ", url=" + url + "]";
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public TaskLevel getTaskLevel() {
        return taskLevel;
    }

    public void setTaskLevel(TaskLevel taskLevel) {
        this.taskLevel = taskLevel;
    }

    /**
     * Opens an HTTP connection to this task's URL.
     *
     * @return the connection, or {@code null} if the URL is malformed
     *         or does not use an HTTP scheme
     */
    public HttpURLConnection getConnection() {
        try {
            URLConnection connection = new URL(this.url).openConnection();
            if (connection instanceof HttpURLConnection) {
                return (HttpURLConnection) connection;
            }
            // Fixed: the original threw a raw Exception (with the typo "errr")
            // only to catch it in the very next block; use a specific type and
            // a message that names the offending URL.
            throw new IllegalStateException("connection is not an HTTP connection: " + this.url);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * @return the host part of this task's URL, or {@code null} if the URL is malformed
     */
    public String getHost() {
        try {
            return new URL(this.url).getHost();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
抓取结果的封装类
package com.tsj.simple.pojos; /** * 抓取结果的封装 * @author tsj-pc */ public class CrawlResultPojo { private boolean isSuccess; private String pageContent; private int httpStatuCode; public boolean isSuccess() { return isSuccess; } public void setSuccess(boolean isSuccess) { this.isSuccess = isSuccess; } public String getPageContent() { return pageContent; } public void setPageContent(String pageContent) { this.pageContent = pageContent; } public int getHttpStatuCode() { return httpStatuCode; } public void setHttpStatuCode(int httpStatuCode) { this.httpStatuCode = httpStatuCode; } }
枚举类
/** Priority levels for a crawl task. */
public enum TaskLevel {
    HIGH,
    MIDDLE,
    LOW
}
接口类
/**
 * Contract for a crawler implementation: fetch the page described by a
 * {@link UrlPojo} task and report the outcome.
 */
public interface ICrawler {

    /**
     * Crawls the given URL task.
     *
     * @param urlPojo the task describing what to fetch
     * @return the crawl result (success flag, page content, status code)
     */
    CrawlResultPojo crawl(UrlPojo urlPojo);
}
实现类
(1)Socket方式抓取数据
/**
 * Crawler that fetches a page by writing a raw HTTP request over a TCP
 * socket (port 80) and reading the full response (headers + body).
 */
public class SocketCrawlerImpl implements ICrawler {

    /**
     * Fetches the page for the given task over a plain socket.
     *
     * @param urlPojo the task to fetch; may be null
     * @return a result with success=false for null/invalid input, the full raw
     *         response on success, or {@code null} if an I/O error occurred
     */
    @Override
    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        if (urlPojo == null || urlPojo.getUrl() == null) {
            crawlResultPojo.setSuccess(false);
            crawlResultPojo.setPageContent(null);
            return crawlResultPojo;
        }
        // Target host for the TCP connection and the Host header.
        String host = urlPojo.getHost();
        if (host == null) {
            crawlResultPojo.setSuccess(false);
            crawlResultPojo.setPageContent(null);
            return crawlResultPojo;
        }
        BufferedWriter bw = null;
        BufferedReader br = null;
        Socket socket = null; // fixed: kept in scope so finally can close it
        try {
            socket = new Socket(host, 80);
            socket.setKeepAlive(true);
            bw = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream()));
            bw.write("GET " + urlPojo.getUrl() + " HTTP/1.1\r\n");
            // Fixed: header name was "HOST:" with no space; standard form is "Host: ".
            bw.write("Host: " + host + "\r\n");
            // Fixed: without this header an HTTP/1.1 server keeps the connection
            // alive and the readLine() loop below blocks until the server's
            // idle timeout instead of terminating at end of response.
            bw.write("Connection: close\r\n");
            bw.write("\r\n"); // blank line: end of request headers
            bw.flush();
            br = new BufferedReader(new InputStreamReader(socket.getInputStream()));
            StringBuilder stringBuilder = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                stringBuilder.append(line).append("\n");
            }
            crawlResultPojo.setSuccess(true);
            crawlResultPojo.setPageContent(stringBuilder.toString());
            return crawlResultPojo;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (bw != null) {
                    bw.close();
                }
                if (br != null) {
                    br.close();
                }
                // Fixed: the socket itself was never closed (resource leak).
                if (socket != null) {
                    socket.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("流最终未关闭,请检查!");
            }
        }
        return null;
    }

    public static void main(String[] args) {
        SocketCrawlerImpl socketCrawlerImpl = new SocketCrawlerImpl();
        UrlPojo urlPojo = new UrlPojo("http://www.baidu.com");
        CrawlResultPojo crawlResultPojo = socketCrawlerImpl.crawl(urlPojo);
        // Fixed: crawl() returns null on I/O failure; guard before dereferencing.
        if (crawlResultPojo != null) {
            System.out.println(crawlResultPojo.getPageContent());
        }
        System.out.println("done!");
    }
}
(2) HttpUrlConnection方式抓取数据
public class HttpUrlConnectionCrawlerImpl implements ICrawler { @Override public CrawlResultPojo crawl(UrlPojo urlPojo) { CrawlResultPojo crawlResultPojo=new CrawlResultPojo(); StringBuilder stringBulider=new StringBuilder(); if (urlPojo ==null || urlPojo.getUrl()==null) { crawlResultPojo.setSuccess(false); crawlResultPojo.setPageContent(null); } HttpURLConnection httpUrlConnection=urlPojo.getConnection(); BufferedReader br=null; String line=null; try { br=new BufferedReader(new InputStreamReader(httpUrlConnection.getInputStream(),"utf-8")); //br=new BufferedReader(new InputStreamReader(httpUrlConnection.getInputStream(),"gb2312 ")); while ((line = br.readLine())!=null) { //System.out.print(line); stringBulider.append(line + "\n"); } crawlResultPojo.setSuccess(true); crawlResultPojo.setPageContent(stringBulider.toString()); } catch (Exception e) { e.printStackTrace(); }finally{ try { if (br!=null) { br.close(); } } catch (Exception e) { e.printStackTrace(); System.out.println("流最终为关闭"); } } return crawlResultPojo; } public static void main(String[] args) { HttpUrlConnectionCrawlerImpl httpUrlConnectionCrawlerImpl = new HttpUrlConnectionCrawlerImpl(); UrlPojo urlPojo = new UrlPojo("http://www.baidu.com"); // UrlPojo urlPojo = new UrlPojo("http://www.qq.com"); // UrlPojo urlPojo = new UrlPojo( // "http://www.hao123.com/?tn=97961594_hao_pg"); CrawlResultPojo crawlResultPojo=httpUrlConnectionCrawlerImpl.crawl(urlPojo); System.out.println(crawlResultPojo.getPageContent()); System.out.println("done!"); } }
包含业务逻辑的抓取管理器
整合两种方法
/**
 * Facade that hides the choice between the two crawler implementations:
 * raw-socket or {@code HttpURLConnection}-based.
 */
public class CrawlerManager {

    // The strategy selected at construction time.
    private final ICrawler crawler;

    /**
     * @param isSocket true to crawl via raw sockets, false to use
     *                 the HttpURLConnection implementation
     */
    public CrawlerManager(boolean isSocket) {
        this.crawler = isSocket
                ? new SocketCrawlerImpl()
                : new HttpUrlConnectionCrawlerImpl();
    }

    /** Delegates the crawl to the selected implementation. */
    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        return this.crawler.crawl(urlPojo);
    }

    public static void main(String[] args) {
        CrawlerManager crawlerManager = new CrawlerManager(false);
        UrlPojo urlPojo = new UrlPojo("http://www.qq.com");
        CrawlResultPojo crawlResultPojo = crawlerManager.crawl(urlPojo);
        System.out.println("CrawlResultPojo---" + crawlResultPojo.getPageContent());
    }
}
结果:
相关文章推荐
- 使用HttpURLConnection和使用HttpClient方式请求网络采用get方式和post方式请求数据
- 【黑马Android】(05)短信/查询和添加/内容观察者使用/子线程网络图片查看器和Handler消息处理器/html查看器/使用HttpURLConnection采用Post方式请求数据/开源项目
- HttpURLConnection模拟浏览器+网络数据抓取
- Android HttpURLConnection(Get,Post方式)进行网络通信 获取数据和网络图片
- java32.HTTP通信------使用Http的Get方式读取网络数据
- 关于java response的两种页面输出方式,以及HttpUrlconnection 代理使用注意点
- 【网络爬虫】【java】微博爬虫(二):如何抓取HTML页面及HttpClient使用
- Java使用HttpURLConnection检索网站时403错误处理方式
- HttpURLConnection模拟浏览器+网络数据抓取
- 使用 HttpURLConnection 获取不到网络数据
- HttpURLConnection模拟浏览器+网络数据抓取
- iOS—网络实用技术OC篇&网络爬虫-使用java语言抓取网络数据
- java使用HttpURLConnection检索网站时403错误处理方式
- Java基础知识强化之网络编程笔记16:Android网络通信之 使用Http的Get方式读取网络数据(基于HTTP通信技术)
- Java基础知识强化之网络编程笔记18:Android网络通信之 使用HttpClient的Post / Get 方式读取网络数据(基于HTTP通信技术)
- Java基础知识强化之网络编程笔记17:Android网络通信之 使用Http的Post方式读取网络数据(基于HTTP通信技术)
- java使用HttpURLConnection检索网站时403错误处理方式
- Java 通过HttpURLConnection Post方式提交json,并从服务端返回json数据
- Java HttpUrlConnection form-data方式提交数据
- HttpURLConnection模拟浏览器+网络数据抓取