您的位置:首页 > 其它

一个简易的网页爬虫,可用于下载在线API文档

2012-12-13 09:52 513 查看
package wkx;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * A minimal recursive web crawler that mirrors a tree of {@code .html} pages
 * (for example an online API reference) into a local directory.
 *
 * <p>Starting from one page, every double-quoted string in the page source that
 * looks like a relative link to an {@code .html} file is resolved against the
 * current URL and downloaded in turn. Each URL is fetched at most once.
 */
public class Main {

    /** URLs that have already been requested; stops {@link #create} from revisiting a page. */
    private static final Set<String> filenames = new HashSet<String>();

    /**
     * Requests {@code url} and returns the response body.
     *
     * @param url absolute URL of the page to fetch
     * @return the response body as a string, or {@code null} when the status is
     *         not 200 (OK) or an I/O error occurs
     */
    public static String getSource(String url) {
        String response = null;
        HttpClient client = new HttpClient();
        HttpMethod method = null;
        try {
            // NOTE(review): pages are requested with POST rather than GET; kept as-is
            // because switching methods may change server/redirect behaviour - confirm.
            method = new PostMethod(url);
            client.executeMethod(method);
            if (method.getStatusCode() == HttpStatus.SC_OK) {
                response = method.getResponseBodyAsString();
            }
        } catch (IOException e) {
            // Report which URL failed and why instead of a bare message.
            System.out.println("Get Source Error! " + url + " (" + e + ")");
        } finally {
            if (method != null) {
                method.releaseConnection();
            }
        }
        return response;
    }

    /**
     * Downloads {@code url + "/" + cur}, stores it as {@code cur} inside
     * {@code froot}, then recursively does the same for every relative
     * {@code .html} link found in the page.
     *
     * @param url   URL of the directory that contains the page (no trailing slash)
     * @param froot local directory that corresponds to {@code url}
     * @param cur   file name of the page, relative to {@code url}
     */
    public static void create(String url, String froot, String cur) {
        String curUrl = url + "/" + cur;
        // Set.add returns false when the URL was already recorded.
        if (!filenames.add(curUrl)) {
            return;
        }
        String cont = getSource(curUrl);
        if (cont == null) {
            return;
        }
        save(froot, cur, cont);

        // Attribute values are quoted, so splitting on '"' yields the link candidates.
        String[] candidates = cont.split("\"");
        for (String candidate : candidates) {
            if (!isLocalHtmlLink(candidate)) {
                continue;
            }
            // Strings are immutable: the collapsed path must be assigned back.
            String file = candidate.replaceAll("/+", "/");
            int queryStart = file.lastIndexOf("?");
            if (queryStart != -1) {
                file = file.substring(0, queryStart);
            }

            String turl = url;
            String tfroot = froot;
            int index;
            // Consume the path one directory segment at a time, moving the URL and
            // the local directory in step; what remains at the end is the file name.
            while ((index = file.indexOf("/")) != -1) {
                String segment = file.substring(0, index);
                file = file.substring(index + 1);
                if (segment.equals("..")) {
                    int scheme = turl.indexOf("://");
                    int firstPathChar = scheme == -1 ? 0 : scheme + 3;
                    int urlCut = turl.lastIndexOf("/");
                    // Never cut into "scheme://host"; at the top level ".." is ignored.
                    if (urlCut >= firstPathChar) {
                        turl = turl.substring(0, urlCut);
                        int dirCut = tfroot.lastIndexOf(File.separator);
                        if (dirCut > 0) {
                            tfroot = tfroot.substring(0, dirCut);
                        }
                    }
                } else if (segment.length() != 0 && !segment.equals(".")) {
                    turl = turl + "/" + segment;
                    tfroot = tfroot + File.separator + segment;
                }
                // "" (leading slash) and "." leave the current directory unchanged.
            }
            if (file.length() == 0) {
                // Nothing left to name a file with (e.g. the path ended in '/').
                continue;
            }
            create(turl, tfroot, file);
        }
    }

    /** Tells whether a quoted string from the page looks like a relative link to an .html file. */
    private static boolean isLocalHtmlLink(String candidate) {
        return candidate.matches(".*\\.html$")
                && !candidate.startsWith("http://")
                && !candidate.startsWith("https://")
                && !candidate.contains(" ");
    }

    /** Writes {@code cont} to the file {@code cur} inside {@code froot}, creating the directory if needed. */
    private static void save(String froot, String cur, String cont) {
        File dir = new File(froot);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        FileOutputStream fos = null;
        try {
            // FileOutputStream creates the file itself when it does not exist yet.
            fos = new FileOutputStream(new File(froot + File.separator + cur));
            // NOTE(review): bytes are produced with the platform default charset.
            fos.write(cont.getBytes());
            fos.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Mirrors the site at {@code http://localhost}, starting from {@code index.html}. */
    public static void main(String[] args) {
        String url = "http://localhost";
        String froot = "C:\\Users\\Jack_Wong\\Desktop\\api";
        create(url, froot, "index.html");
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐