
A simple web crawler implementation in Java

2013-01-29 16:30
Someone in our group chat was discussing how web crawlers work today, so I wrote a simple one. It was put together in a hurry and is not very polished, so please bear with me, but the basic principle is all in the code: fetch a page, extract the href links from its HTML, then recursively fetch those links until a page limit is reached. Happy learning.

import java.io.BufferedInputStream;
import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

public class SocketScan {

    // Stop crawling once this many pages have been collected.
    private static final int MAX_SIZE = 5;

    public static List<String> httpContextList = new ArrayList<String>();

    public static void main(String[] args) {
        // Fetch the start URL and crawl the links it contains.
        searchHttpContexts("http://10.125.2.36:8080/FileUpload/test.html");

        System.out.println("httpContext size: " + httpContextList.size());
        for (String string : httpContextList) {
            System.out.println(string);
            System.out.println();
            System.out.println("separator ==============================================================================");
            System.out.println();
        }
    }

    // Extract every href="..." target from the given HTML.
    private static List<String> getUrlsByHttpContext(String httpContext) {
        List<String> urlList = new ArrayList<String>();
        String mark = "href=\"";
        int len = mark.length();
        int start = 0;
        int end = 0;
        while ((start = httpContext.indexOf(mark, start)) != -1) {
            start = start + len;
            end = httpContext.indexOf("\"", start);
            if (end == -1) { // unterminated attribute; stop scanning
                break;
            }
            urlList.add(httpContext.substring(start, end));
        }
        return urlList;
    }

    // Fetch the page at urlPath, record its content, then recursively
    // crawl the links found in it until MAX_SIZE pages are collected.
    private synchronized static String searchHttpContexts(String urlPath) {
        try {
            if (httpContextList.size() >= MAX_SIZE) {
                return null;
            }
            String httpContext = getHttpContext(urlPath);
            httpContextList.add(httpContext);
            List<String> urlList = getUrlsByHttpContext(httpContext);
            for (String subUrl : urlList) {
                if (httpContextList.size() >= MAX_SIZE) {
                    return null;
                }
                // The recursive call records the sub-page itself, so its
                // result must not be added to the list a second time here.
                searchHttpContexts(subUrl);
            }
            return httpContext;
        } catch (IOException e) {
            // Covers MalformedURLException and UnknownHostException as well.
            e.printStackTrace();
        }
        return null;
    }

    // Read the raw response body of urlPath into a String.
    private static String getHttpContext(String urlPath) throws IOException {
        URL url = new URL(urlPath);
        URLConnection conn = url.openConnection();
        BufferedInputStream input = new BufferedInputStream(conn.getInputStream());
        byte[] b = new byte[1024];
        int temp;
        StringBuilder sb = new StringBuilder();
        while ((temp = input.read(b)) != -1) {
            // Only convert the bytes actually read on this pass;
            // new String(b) would also append stale buffer contents.
            sb.append(new String(b, 0, temp));
        }
        input.close();
        return sb.toString();
    }
}
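To see the href-extraction step in isolation, here is a minimal standalone sketch of the same indexOf-based scan run against an inline HTML sample. The LinkDemo class name and the sample markup are made up for illustration; the scanning logic is the same as in getUrlsByHttpContext above.

import java.util.ArrayList;
import java.util.List;

public class LinkDemo {
    public static void main(String[] args) {
        // Hypothetical HTML sample, invented for this demo.
        String html = "<a href=\"a.html\">A</a> <a href=\"b.html\">B</a>";
        List<String> links = new ArrayList<String>();
        String mark = "href=\"";
        int start = 0;
        while ((start = html.indexOf(mark, start)) != -1) {
            start += mark.length();
            int end = html.indexOf("\"", start);
            if (end == -1) { // unterminated attribute; stop scanning
                break;
            }
            links.add(html.substring(start, end));
        }
        System.out.println(links); // prints [a.html, b.html]
    }
}

Note that this scan only matches double-quoted href attributes and returns the values verbatim, so relative links such as a.html would have to be resolved against the page's URL (for example with new URL(baseUrl, link)) before they can be fetched; a production crawler would normally use a real HTML parser instead of string scanning.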
Tags: java, web crawler