您的位置:首页 > 运维架构 > Linux

Linux下Java语言实现简陋Web爬虫

2009-09-09 20:28 260 查看
import java.io.BufferedReader;

import java.io.File;

import java.io.FileWriter;

import java.io.IOException;

import java.io.InputStreamReader;

import java.io.PrintWriter;

import java.net.Socket;

import java.net.UnknownHostException;

/**
 * Minimal single-page "crawler": opens a raw TCP connection to a fixed web
 * server, sends a hand-written HTTP GET request for "/", and stores everything
 * from the first '<' character of the response onwards (i.e. the markup,
 * without the status line and headers) both on the console and in a file.
 *
 * NOTE(review): the response body is copied verbatim. If the server answers
 * with chunked transfer encoding, the chunk-size lines will end up in the
 * output; decoding them is out of scope here.
 */
public class WebCrawler {

    /** Host to connect to; the same name is sent in the HTTP Host header. */
    private static final String HOST = "www.woodpecker.org.cn";

    /** TCP port of the web server. */
    private static final int PORT = 80;

    /**
     * File that receives the downloaded page content. It is best to create
     * this directory before running, since FileWriter does not create missing
     * parent directories.
     */
    private static final String TEXT_FILE_PATH = "/home/zms/htmldoc/htmldoc1.html";

    /** HTTP line terminator; the protocol requires CRLF regardless of platform. */
    private static final String CRLF = "\r\n";

    /**
     * Downloads the front page of {@link #HOST}, prints it to standard output
     * and writes it to {@link #TEXT_FILE_PATH}. On failure an error message and
     * stack trace are printed and the JVM exits with status 1.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File file = new File(TEXT_FILE_PATH);

        // Resources are closed in reverse order (socket first, then the file),
        // on both the success and the failure path.
        try (FileWriter fpWriter = new FileWriter(file);
             Socket webclient = new Socket(HOST, PORT)) {

            PrintWriter sender = new PrintWriter(webclient.getOutputStream());
            // No charset is given, so the default charset is used for decoding.
            BufferedReader receiver =
                    new BufferedReader(new InputStreamReader(webclient.getInputStream()));

            // Send the HTTP request.
            sender.print(buildRequest(HOST));
            // PrintWriter never throws on write errors; checkError() flushes
            // and reports whether anything went wrong.
            if (sender.checkError()) {
                throw new IOException("failed to send HTTP request to " + HOST);
            }

            // Receive the HTTP response, dropping everything before the markup.
            String page = readFromFirstTag(receiver);

            // Show the page content on the console and store it in the file.
            System.out.println(page);
            fpWriter.write(page);

        } catch (UnknownHostException e) {
            System.err.println("无法访问您指定的主机。");
            e.printStackTrace();
            System.exit(1);
        } catch (IOException e) {
            System.err.println("下载失败,请检查输入地址是否正确。");
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Builds the text of a GET request for the root document of a host.
     * Package-private so it can be checked without network access.
     *
     * @param host value for the Host header
     * @return the complete request, CRLF-terminated and ending with an empty line
     */
    static String buildRequest(String host) {
        return "GET / HTTP/1.1" + CRLF
                + "Host: " + host + CRLF
                // Ask the server to close the connection after the response so
                // that end-of-stream marks the end of the page.
                + "Connection: Close" + CRLF
                + CRLF;
    }

    /**
     * Reads the stream to its end and returns the part starting at the first
     * '<' character. Uses blocking reads, so it waits for data without
     * spinning and terminates when the peer closes the connection.
     * Package-private so it can be checked without network access.
     *
     * @param receiver source of the response text
     * @return text from the first '<' to end of stream, or an empty string if
     *         the stream contains no '<'
     * @throws IOException if reading fails
     */
    static String readFromFirstTag(BufferedReader receiver) throws IOException {
        StringBuilder page = new StringBuilder(8096);

        // Skip status line and headers: advance to the first '<' or end of stream.
        int ch = receiver.read();
        while (ch != -1 && ch != '<') {
            ch = receiver.read();
        }

        // Copy the rest of the stream, including the '<' just found.
        while (ch != -1) {
            page.append((char) ch);
            ch = receiver.read();
        }
        return page.toString();
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: