Linux下Java语言实现简陋Web爬虫
2009-09-09 20:28
260 查看
import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.Socket;
import java.net.UnknownHostException;
/**
 * Minimal single-page downloader: opens a raw TCP socket to a fixed web
 * server, issues a plain HTTP GET for "/", drops everything that precedes the
 * first '&lt;' character of the response (in practice the status line and
 * headers), prints the remainder to the console and saves it to a file.
 *
 * <p>NOTE(review): the response is decoded and the file is written with the
 * platform default charset, and a chunked transfer-encoded body is not
 * decoded; both are kept as in the original behaviour.
 */
public class WebCrawler {
    /**
     * File that receives the downloaded page. The parent directory should
     * exist before the program is run.
     */
    private static final String TEXT_FILE_PATH = "/home/zms/htmldoc/htmldoc1.html";

    /** Host that is contacted and also announced in the Host header. */
    private static final String HOST = "www.woodpecker.org.cn";

    /** TCP port of the web server. */
    private static final int PORT = 80;

    /**
     * Downloads the page, echoes it to standard output and stores it in
     * {@link #TEXT_FILE_PATH}. Exits with status 1 on any failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            // Download first so that a failed request does not truncate an
            // existing output file.
            String page = fetchPage(HOST, PORT);
            // Show the page text on the console.
            System.out.println(page);
            savePage(page, TEXT_FILE_PATH);
        } catch (UnknownHostException e) {
            System.err.println("无法访问您指定的主机。");
            e.printStackTrace();
            System.exit(1);
        } catch (IOException e) {
            System.err.println("下载失败,请检查输入地址是否正确。");
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Sends "GET /" to the given server and returns the response starting at
     * its first '&lt;' character.
     *
     * @param host server name, used for the connection and the Host header
     * @param port server TCP port
     * @return response text from the first '&lt;' to end of stream; empty if
     *         the response contains no '&lt;'
     * @throws IOException if connecting, sending or receiving fails
     */
    private static String fetchPage(String host, int port) throws IOException {
        try (Socket socket = new Socket(host, port);
             PrintWriter request = new PrintWriter(socket.getOutputStream(), false);
             BufferedReader response =
                     new BufferedReader(new InputStreamReader(socket.getInputStream()))) {
            // HTTP requires CRLF line endings; println() would emit the
            // platform separator (a bare LF on Linux).
            request.print("GET / HTTP/1.1\r\n");
            // The Host header must name the server actually being contacted.
            request.print("Host: " + host + "\r\n");
            request.print("Connection: close\r\n");
            request.print("\r\n");
            // PrintWriter swallows IOExceptions; checkError() flushes and
            // reports whether any write failed.
            if (request.checkError()) {
                throw new IOException("failed to send HTTP request to " + host + ":" + port);
            }
            return readFromFirstTag(response);
        }
    }

    /**
     * Reads the stream to its end and returns everything from the first
     * '&lt;' character onwards. read() blocks until data is available, so no
     * polling of ready() is needed.
     *
     * @param reader source of the HTTP response
     * @return text from the first '&lt;' to end of stream, or an empty string
     * @throws IOException if reading fails
     */
    private static String readFromFirstTag(BufferedReader reader) throws IOException {
        int ch = reader.read();
        // Skip everything before the first '<'.
        while (ch != -1 && ch != '<') {
            ch = reader.read();
        }
        StringBuilder text = new StringBuilder(8096);
        while (ch != -1) {
            text.append((char) ch);
            ch = reader.read();
        }
        return text.toString();
    }

    /**
     * Writes the page text to the given file, replacing previous content.
     *
     * @param page text to store
     * @param path destination file path
     * @throws IOException if the file cannot be opened or written
     */
    private static void savePage(String page, String path) throws IOException {
        try (FileWriter writer = new FileWriter(new File(path))) {
            writer.write(page);
        }
    }
}
import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.Socket;
import java.net.UnknownHostException;
/**
 * Minimal single-page downloader: opens a raw TCP socket to a fixed web
 * server, issues a plain HTTP GET for "/", drops everything that precedes the
 * first '&lt;' character of the response (in practice the status line and
 * headers), prints the remainder to the console and saves it to a file.
 *
 * <p>NOTE(review): the response is decoded and the file is written with the
 * platform default charset, and a chunked transfer-encoded body is not
 * decoded; both are kept as in the original behaviour.
 */
public class WebCrawler {
    /**
     * File that receives the downloaded page. The parent directory should
     * exist before the program is run.
     */
    private static final String TEXT_FILE_PATH = "/home/zms/htmldoc/htmldoc1.html";

    /** Host that is contacted and also announced in the Host header. */
    private static final String HOST = "www.woodpecker.org.cn";

    /** TCP port of the web server. */
    private static final int PORT = 80;

    /**
     * Downloads the page, echoes it to standard output and stores it in
     * {@link #TEXT_FILE_PATH}. Exits with status 1 on any failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            // Download first so that a failed request does not truncate an
            // existing output file.
            String page = fetchPage(HOST, PORT);
            // Show the page text on the console.
            System.out.println(page);
            savePage(page, TEXT_FILE_PATH);
        } catch (UnknownHostException e) {
            System.err.println("无法访问您指定的主机。");
            e.printStackTrace();
            System.exit(1);
        } catch (IOException e) {
            System.err.println("下载失败,请检查输入地址是否正确。");
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Sends "GET /" to the given server and returns the response starting at
     * its first '&lt;' character.
     *
     * @param host server name, used for the connection and the Host header
     * @param port server TCP port
     * @return response text from the first '&lt;' to end of stream; empty if
     *         the response contains no '&lt;'
     * @throws IOException if connecting, sending or receiving fails
     */
    private static String fetchPage(String host, int port) throws IOException {
        try (Socket socket = new Socket(host, port);
             PrintWriter request = new PrintWriter(socket.getOutputStream(), false);
             BufferedReader response =
                     new BufferedReader(new InputStreamReader(socket.getInputStream()))) {
            // HTTP requires CRLF line endings; println() would emit the
            // platform separator (a bare LF on Linux).
            request.print("GET / HTTP/1.1\r\n");
            // The Host header must name the server actually being contacted.
            request.print("Host: " + host + "\r\n");
            request.print("Connection: close\r\n");
            request.print("\r\n");
            // PrintWriter swallows IOExceptions; checkError() flushes and
            // reports whether any write failed.
            if (request.checkError()) {
                throw new IOException("failed to send HTTP request to " + host + ":" + port);
            }
            return readFromFirstTag(response);
        }
    }

    /**
     * Reads the stream to its end and returns everything from the first
     * '&lt;' character onwards. read() blocks until data is available, so no
     * polling of ready() is needed.
     *
     * @param reader source of the HTTP response
     * @return text from the first '&lt;' to end of stream, or an empty string
     * @throws IOException if reading fails
     */
    private static String readFromFirstTag(BufferedReader reader) throws IOException {
        int ch = reader.read();
        // Skip everything before the first '<'.
        while (ch != -1 && ch != '<') {
            ch = reader.read();
        }
        StringBuilder text = new StringBuilder(8096);
        while (ch != -1) {
            text.append((char) ch);
            ch = reader.read();
        }
        return text.toString();
    }

    /**
     * Writes the page text to the given file, replacing previous content.
     *
     * @param page text to store
     * @param path destination file path
     * @throws IOException if the file cannot be opened or written
     */
    private static void savePage(String page, String path) throws IOException {
        try (FileWriter writer = new FileWriter(new File(path))) {
            writer.write(page);
        }
    }
}
相关文章推荐
- 学习用java基于webMagic+selenium+phantomjs实现爬虫Demo爬取淘宝搜索页面
- springboot+webmagic实现java爬虫jdbc及mysql的方法
- Linux平台,使用JavaComm3 API及SMSLib项目实现在Web Application中发送手机短信的功能
- 学习用java基于webMagic+selenium+phantomjs实现爬虫Demo爬取淘宝搜索页面
- Go语言实现的web爬虫实例
- Java实现查看Web部署在Linux所在机的CPU、内存、I/O运行情况
- Linux下apache-tomcat + mysql +jdk实现java-web项目搭建
- java语言实现网络爬虫
- Java语言实现的简单网络爬虫复习
- linux下使用jni实现c++调用java程序(1)准备工作
- Java语言计算器界面实现
- Java 语言使用 Observer/Observable 实现简单的观察者模式
- linux下搭建javaweb
- Java简单的网络爬虫实现
- Appium :Windows 平台上的使用 Java 语言实现 appium 自动化程序 for Android
- Java用WebSocket + tail命令实现Web实时日志
- Java语言实现简单FTP软件------>FTP软件效果图预览之下载功能(二)
- [未读] 概率语言模型及其变形系列(5)-LDA Gibbs Sampling 的JAVA实现
- Java通过SMS短信平台实现发短信功能 含多语言
- jsp struts实现的Java web信息供求系统项目源码