
[Web Crawler] Fetching Data in Java with Socket and HttpURLConnection

2017-04-29 22:23

Structure: (project-structure screenshot omitted)



Common classes

POJO class for a URL task

package com.tsj.simple.pojos;

import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

import com.tsj.simple.enumeration.TaskLevel;

/**
 * POJO class for a URL task
 * @author tsj-pc
 */
public class UrlPojo {
    private String url;
    private TaskLevel taskLevel = TaskLevel.MIDDLE;

    public UrlPojo(String url) {
        this.url = url;
    }

    public UrlPojo(String url, TaskLevel taskLevel) {
        this.url = url;
        this.taskLevel = taskLevel;
    }

    @Override
    public String toString() {
        return "UrlPojo [taskLevel=" + taskLevel + ", url=" + url + "]";
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public TaskLevel getTaskLevel() {
        return taskLevel;
    }

    public void setTaskLevel(TaskLevel taskLevel) {
        this.taskLevel = taskLevel;
    }

    public HttpURLConnection getConnection() {
        try {
            URL url = new URL(this.url);
            URLConnection connection = url.openConnection();
            if (connection instanceof HttpURLConnection) {
                return (HttpURLConnection) connection;
            } else {
                throw new Exception("connection is not an HTTP connection!");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    public String getHost() {
        try {
            URL url = new URL(this.url);
            return url.getHost();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}


POJO class wrapping the crawl result

package com.tsj.simple.pojos;

/**
 * Wrapper for a crawl result
 * @author tsj-pc
 */
public class CrawlResultPojo {
    private boolean isSuccess;
    private String pageContent;
    private int httpStatusCode;

    public boolean isSuccess() {
        return isSuccess;
    }

    public void setSuccess(boolean isSuccess) {
        this.isSuccess = isSuccess;
    }

    public String getPageContent() {
        return pageContent;
    }

    public void setPageContent(String pageContent) {
        this.pageContent = pageContent;
    }

    public int getHttpStatusCode() {
        return httpStatusCode;
    }

    public void setHttpStatusCode(int httpStatusCode) {
        this.httpStatusCode = httpStatusCode;
    }
}


Enum class

package com.tsj.simple.enumeration;

public enum TaskLevel {
    HIGH, MIDDLE, LOW
}
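
TaskLevel is declared here but never consumed by the code below. One possible use, shown as a sketch only (TaskQueueDemo is a hypothetical class, not part of the original project): because the declaration order is HIGH, MIDDLE, LOW, comparing ordinal() values makes a PriorityQueue hand out high-priority tasks first.

import java.util.Comparator;
import java.util.PriorityQueue;

import com.tsj.simple.enumeration.TaskLevel;
import com.tsj.simple.pojos.UrlPojo;

public class TaskQueueDemo {
    public static void main(String[] args) {
        // HIGH has the smallest ordinal, so it is dequeued first
        PriorityQueue<UrlPojo> queue = new PriorityQueue<>(
                Comparator.comparingInt((UrlPojo p) -> p.getTaskLevel().ordinal()));
        queue.offer(new UrlPojo("http://www.qq.com", TaskLevel.LOW));
        queue.offer(new UrlPojo("http://www.baidu.com", TaskLevel.HIGH));
        while (!queue.isEmpty()) {
            System.out.println(queue.poll()); // the HIGH task prints first
        }
    }
}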


Interface

public interface ICrawler {
    CrawlResultPojo crawl(UrlPojo urlPojo);
}


Implementation classes

(1) Fetching data with a Socket

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.Socket;

import com.tsj.simple.pojos.CrawlResultPojo;
import com.tsj.simple.pojos.UrlPojo;

public class SocketCrawlerImpl implements ICrawler {
    @Override
    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        if (urlPojo == null || urlPojo.getUrl() == null) {
            crawlResultPojo.setSuccess(false);
            crawlResultPojo.setPageContent(null);
            return crawlResultPojo;
        }
        // host to connect to (port 80, plain HTTP)
        String host = urlPojo.getHost();
        if (host == null) {
            crawlResultPojo.setSuccess(false);
            crawlResultPojo.setPageContent(null);
            return crawlResultPojo;
        }
        // Buffered character streams for efficiency:
        // BufferedWriter for the request, BufferedReader for line-by-line reading.
        Socket socket = null;
        BufferedWriter bw = null;
        BufferedReader br = null;

        try {
            socket = new Socket(host, 80);
            // enable TCP keep-alive probes (note: this is not a read timeout;
            // uncomment setSoTimeout below for that)
            socket.setKeepAlive(true);
            // socket.setSoTimeout(1000);

            bw = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream()));
            // write the HTTP request line and headers by hand
            // bw.write("GET " + urlPojo.getUrl() + " HTTP/1.0\r\n");
            bw.write("GET " + urlPojo.getUrl() + " HTTP/1.1\r\n");
            bw.write("Host: " + host + "\r\n");
            // HTTP/1.1 keeps the connection alive by default, which would make
            // the read loop below block forever; ask the server to close it
            bw.write("Connection: close\r\n");
            bw.write("\r\n"); // empty line: the request headers are complete
            bw.flush(); // flush the buffered request out to the socket

            br = new BufferedReader(new InputStreamReader(socket.getInputStream()));
            StringBuilder stringBuilder = new StringBuilder();

            String line = null;
            while ((line = br.readLine()) != null) {
                stringBuilder.append(line + "\n");
            }
            crawlResultPojo.setSuccess(true);
            crawlResultPojo.setPageContent(stringBuilder.toString());

            return crawlResultPojo;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (bw != null) {
                    bw.close();
                }
                if (br != null) {
                    br.close();
                }
                if (socket != null) {
                    socket.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("Streams were not closed in the end, please check!");
            }
        }
        return null;
    }

    public static void main(String[] args) {
        SocketCrawlerImpl socketCrawlerImpl = new SocketCrawlerImpl();
        UrlPojo urlPojo = new UrlPojo("http://www.baidu.com");
        // UrlPojo urlPojo = new UrlPojo("http://www.qq.com");
        // UrlPojo urlPojo = new UrlPojo("http://www.hao123.com/?tn=97961594_hao_pg");
        CrawlResultPojo crawlResultPojo = socketCrawlerImpl.crawl(urlPojo);
        System.out.println(crawlResultPojo.getPageContent());
        System.out.println("done!");
    }
}
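
Note that the Socket version returns the raw HTTP response, so getPageContent() still contains the status line and headers in front of the HTML. The header block ends at the first blank line, which the readLine() loop above turns into "\n\n"; a minimal sketch of splitting it off (ResponseSplitDemo is a hypothetical class, not in the original):

import com.tsj.simple.pojos.CrawlResultPojo;
import com.tsj.simple.pojos.UrlPojo;

public class ResponseSplitDemo {
    public static void main(String[] args) {
        CrawlResultPojo result = new SocketCrawlerImpl()
                .crawl(new UrlPojo("http://www.baidu.com"));
        if (result == null) { // the implementation above returns null on failure
            System.out.println("crawl failed!");
            return;
        }
        String raw = result.getPageContent();
        int headerEnd = raw.indexOf("\n\n"); // first blank line = end of headers
        String body = headerEnd >= 0 ? raw.substring(headerEnd + 2) : raw;
        System.out.println(body);
    }
}

Also be aware that an HTTP/1.1 server may reply with chunked transfer encoding, in which case chunk-size markers remain embedded in the body; a real crawler would need to decode those as well.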


(2) Fetching data with HttpURLConnection

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;

import com.tsj.simple.pojos.CrawlResultPojo;
import com.tsj.simple.pojos.UrlPojo;

public class HttpUrlConnectionCrawlerImpl implements ICrawler {

    @Override
    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        StringBuilder stringBuilder = new StringBuilder();
        if (urlPojo == null || urlPojo.getUrl() == null) {
            crawlResultPojo.setSuccess(false);
            crawlResultPojo.setPageContent(null);
            return crawlResultPojo; // bail out early, otherwise the code below would NPE
        }
        HttpURLConnection httpUrlConnection = urlPojo.getConnection();
        if (httpUrlConnection == null) { // getConnection() returns null on failure
            crawlResultPojo.setSuccess(false);
            return crawlResultPojo;
        }
        BufferedReader br = null;
        String line = null;
        try {
            br = new BufferedReader(
                    new InputStreamReader(httpUrlConnection.getInputStream(), "utf-8"));
            // use "gb2312" instead for pages served in that encoding

            while ((line = br.readLine()) != null) {
                stringBuilder.append(line + "\n");
            }
            crawlResultPojo.setSuccess(true);
            crawlResultPojo.setPageContent(stringBuilder.toString());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (br != null) {
                    br.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("Streams were not closed in the end!");
            }
        }

        return crawlResultPojo;
    }

    public static void main(String[] args) {
        HttpUrlConnectionCrawlerImpl httpUrlConnectionCrawlerImpl = new HttpUrlConnectionCrawlerImpl();
        UrlPojo urlPojo = new UrlPojo("http://www.baidu.com");
        // UrlPojo urlPojo = new UrlPojo("http://www.qq.com");
        // UrlPojo urlPojo = new UrlPojo("http://www.hao123.com/?tn=97961594_hao_pg");
        CrawlResultPojo crawlResultPojo = httpUrlConnectionCrawlerImpl.crawl(urlPojo);
        System.out.println(crawlResultPojo.getPageContent());
        System.out.println("done!");
    }
}
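
The CrawlResultPojo above has an httpStatusCode field that neither implementation ever fills in. With HttpURLConnection that only takes getResponseCode(), and connect/read timeouts are worth setting at the same time. A sketch under those assumptions (StatusCodeDemo is hypothetical and the 5-second values are arbitrary):

import java.net.HttpURLConnection;

import com.tsj.simple.pojos.CrawlResultPojo;
import com.tsj.simple.pojos.UrlPojo;

public class StatusCodeDemo {
    public static void main(String[] args) {
        UrlPojo urlPojo = new UrlPojo("http://www.baidu.com");
        CrawlResultPojo result = new CrawlResultPojo();
        HttpURLConnection conn = urlPojo.getConnection();
        if (conn != null) {
            try {
                conn.setConnectTimeout(5000); // give up if no connection within 5 s
                conn.setReadTimeout(5000);    // give up if the server stalls mid-read
                result.setHttpStatusCode(conn.getResponseCode());
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                conn.disconnect();
            }
        }
        System.out.println("status code: " + result.getHttpStatusCode());
    }
}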


Crawl manager containing the business logic

It ties the two implementations together behind a single entry point.

public class CrawlerManager {
    private ICrawler crawler;

    public CrawlerManager(boolean isSocket) {
        if (isSocket) {
            this.crawler = new SocketCrawlerImpl();
        } else {
            this.crawler = new HttpUrlConnectionCrawlerImpl();
        }
    }

    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        return this.crawler.crawl(urlPojo);
    }

    public static void main(String[] args) {
        CrawlerManager crawlerManager = new CrawlerManager(false);
        UrlPojo urlPojo = new UrlPojo("http://www.qq.com");
        CrawlResultPojo crawlResultPojo = crawlerManager.crawl(urlPojo);
        System.out.println("CrawlResultPojo---" + crawlResultPojo.getPageContent());
    }
}
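
Since the manager hides the choice of implementation, batch crawling becomes a simple loop: construct it once and feed it a list of tasks. A small usage sketch (BatchCrawlDemo is a hypothetical class):

import java.util.Arrays;
import java.util.List;

import com.tsj.simple.pojos.CrawlResultPojo;
import com.tsj.simple.pojos.UrlPojo;

public class BatchCrawlDemo {
    public static void main(String[] args) {
        // false selects the HttpURLConnection-backed implementation
        CrawlerManager manager = new CrawlerManager(false);
        List<UrlPojo> tasks = Arrays.asList(
                new UrlPojo("http://www.baidu.com"),
                new UrlPojo("http://www.qq.com"));
        for (UrlPojo task : tasks) {
            CrawlResultPojo result = manager.crawl(task);
            System.out.println(task.getUrl() + " -> success=" + result.isSuccess());
        }
    }
}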


Result: (console-output screenshot omitted)
