
Web Crawler ---- Goodies for the Guys!

2015-03-09 11:23
I. Configure the Maven dependencies
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.1.2</version>
</dependency>
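The RegContent helper further down also parses markup with dom4j, so the POM needs a dom4j dependency as well. A minimal sketch, assuming the classic 1.6.1 release (any 1.6.x should work); note that dom4j's XPath calls (selectSingleNode) additionally require jaxen on the classpath:
<dependency>
    <groupId>dom4j</groupId>
    <artifactId>dom4j</artifactId>
    <version>1.6.1</version>
</dependency>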
II. Code
1. Fetching the page content
package com.chenanyi.fuli.Helper;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;

public class GetHHH {

    /**
     * Fetches the page content for a URL. This class relies on the HttpClient component.
     *
     * @author 陈安一
     * @param url the page URL
     * @return the page body as a String, or null on failure
     */
    public static String getContentFormUrl(String url) {
        /* Create an HttpClient instance */
        HttpClient client = new DefaultHttpClient();
        HttpGet getHttp = new HttpGet(url);

        String content = null;
        HttpResponse response;
        try {
            /* Execute the request and obtain the response entity */
            response = client.execute(getHttp);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                /* Convert the entity to text */
                content = EntityUtils.toString(entity);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            client.getConnectionManager().shutdown();
        }
        return content;
    }
}
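A minimal usage sketch (the exact URL is only a placeholder). getContentFormUrl returns null when the request fails, so callers should check for that before handing the result to the regex helpers:

    String html = GetHHH.getContentFormUrl("http://www.laossee.com/");
    if (html != null) {
        System.out.println(html.length() + " characters fetched");
    }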
2. Getting the URLs of every article title on a list page

package com.chenanyi.fuli.Helper;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

public class RegContent {

    /**
     * Returns every match of the regular expression in the page text.
     *
     * @author 陈安一
     * @param reg  the regular expression
     * @param info the page text to search
     * @return List<String> of matches
     */
    public static List<String> GetCon(String reg, String info) {
        List<String> result = new ArrayList<String>();
        Matcher m = Pattern.compile(reg).matcher(info);
        while (m.find()) {
            result.add(m.group());
        }
        return result;
    }

    /**
     * Parses the text as XML with dom4j and prints the text of the first
     * //body/div/div/div node (only succeeds on well-formed markup).
     */
    public static String GetDiv(String info) {
        SAXReader reader = new SAXReader();
        try {
            Document doc = reader.read(new StringReader(info));
            Node node = doc.selectSingleNode("//body/div/div/div");
            System.out.println(node.getText());
        } catch (DocumentException e) {
            e.printStackTrace();
        }
        return info;
    }

    /**
     * Returns the last match of the regular expression, or the input itself
     * when nothing matches.
     */
    public static String GetOneCon(String reg, String info) {
        String result = info;
        Matcher m = Pattern.compile(reg).matcher(info);
        while (m.find()) {
            result = m.group();
        }
        return result;
    }

    /**
     * Rebuilds the list returned by GetCon into full article URLs.
     *
     * @author 陈安一
     * @param result the list of matched article ids
     * @return the same list, each entry turned into a full URL
     */
    public static List<String> GetallURL(List<String> result) {
        for (int i = 0; i < result.size(); i++) {
            result.set(i, "http://www.laossee.com/" + result.get(i) + ".html");
        }
        return result;
    }
}
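A quick sketch of how GetCon and GetallURL combine; the HTML snippet below is made up purely for illustration:

    String info = "<a href=\"article-show-id-123456.html\">demo</a>";
    List<String> ids = RegContent.GetCon("article-show-id-\\d{6}", info);
    List<String> urls = RegContent.GetallURL(ids);
    // urls now holds: http://www.laossee.com/article-show-id-123456.html

Note that GetDiv uses dom4j's SAXReader, which only succeeds on well-formed XML/XHTML; on typical tag-soup HTML it will throw a DocumentException.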
3. Saving the content to disk
package com.chenanyi.fuli.Helper;

import java.io.FileWriter;
import java.io.IOException;

public class SaveTxt {

    /**
     * Appends the text to a local file.
     *
     * @author 陈安一
     * @param title the file name, e.g. "noexists.txt"
     * @param cont  the text to write
     */
    public static void Sava(String title, String cont) {
        FileWriter fileWriter = null;
        try {
            // true = append mode, so repeated calls keep adding to the same file
            fileWriter = new FileWriter(title, true);
            fileWriter.write(cont);
            fileWriter.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (fileWriter != null) {
                try {
                    fileWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
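A usage sketch (the file name is just an example). FileWriter does not create missing directories, so a relative folder such as txt/ used later by GetTxT must already exist:

    SaveTxt.Sava("txt/demo.txt", "first chunk\n");
    SaveTxt.Sava("txt/demo.txt", "second chunk\n"); // appends, because the FileWriter is opened with append = true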
4. Image handling -- download images and save them locally
package com.chenanyi.fuli.Helper;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

public class Download {

    /**
     * Downloads the image at the given URL into the given directory,
     * naming the file with the running count (1.jpg, 2.jpg, ...).
     *
     * @author 陈安一
     * @param url   the image URL
     * @param path  the directory to save into
     * @param count the running number used as the file name
     */
    public static void down(String url, String path, int count) {
        try {
            // Build the URL and open the connection
            URL img_url = new URL(url);
            URLConnection con = img_url.openConnection();
            // 5 second connect timeout
            con.setConnectTimeout(5 * 1000);
            // Input stream from the server
            InputStream is = con.getInputStream();
            // 1 KB read buffer
            byte[] bs = new byte[1024];
            // Number of bytes read in each pass
            int len;
            // Make sure the target directory exists
            File sf = new File(path);
            if (!sf.exists()) {
                sf.mkdirs();
            }
            String filename = count + ".jpg";
            try {
                OutputStream os = new FileOutputStream(new File(sf, filename));
                // Copy the stream to the file
                while ((len = is.read(bs)) != -1) {
                    os.write(bs, 0, len);
                }
                // Done, close both streams
                os.close();
                is.close();
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        } catch (MalformedURLException e1) {
            e1.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
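A usage sketch; the image URL below is only a placeholder:

    // Saves the image as 1.jpg inside f:\image4\ (the directory is created if missing)
    Download.down("http://www.laossee.com/upload/sample.jpg", "f:\\image4\\", 1);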


5. Processing the HTML returned for each URL: extract the novels or images and save them to files
(1) Extracting the novels (txt)

package com.chenanyi.fuli.NBHelp;

import java.util.List;

import com.chenanyi.fuli.Helper.GetHHH;
import com.chenanyi.fuli.Helper.RegContent;
import com.chenanyi.fuli.Helper.SaveTxt;

public class GetTxT {

    /**
     * @author 陈安一
     * @param cate      category id, 16 is the novel category
     * @param pagecount total number of list pages to crawl
     */
    public static void Gettxt(int cate, int pagecount) {
        for (int m = 1; m <= pagecount; m++) {
            int count = 0;
            // article-list-id-16-page- : 16 is the novel category,
            // 6 is the picture category
            String url = "http://www.laossee.com/article-list-id-" + cate + "-page-" + m + ".html";
            String info = GetHHH.getContentFormUrl(url);
            String reg = "article-show-id-\\d{6}";
            List<String> result = RegContent.GetallURL(RegContent.GetCon(reg, info));
            for (int i = 0; i < result.size(); i++) {
                String cont = GetHHH.getContentFormUrl(result.get(i));
                List<String> titles = RegContent.GetCon("<title>.*?</title>", cont);
                // The article body sits between <br /> tags
                String reggg = "<br />.*?<br />";
                List<String> Content = RegContent.GetCon(reggg, cont);
                String conts = "";
                for (int f = 0; f < Content.size(); f++) {
                    conts += Content.get(f);
                }
                conts = conts.replace("<br />", "");
                for (int j = 0; j < titles.size(); j++) {
                    count++;
                    // Strip the <title> tags and characters that are awkward in file names
                    String title = RegContent.GetOneCon(">.*?<", titles.get(j));
                    title = title.replace("/", "").replace(" ", "");
                    title = "txt/"
                            + title.substring(1, title.length() - 1)
                                    .replace('(', ' ').trim().replace(')', ' ')
                                    .trim().replace('(', ' ').trim()
                                    .replace(')', ' ').trim() + ".txt";
                    SaveTxt.Sava(title, conts);
                    System.out.println("Page " + m + ", item " + count + ": " + title);
                }
            }
        }
    }
}
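A usage sketch, assuming category 16 is the novel list as the comments above state. Remember to create the txt/ folder first, since SaveTxt will not create it:

    GetTxT.Gettxt(16, 2); // crawl the first two list pages of category 16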
(2) Downloading the images

package com.chenanyi.fuli.NBHelp;

import java.util.ArrayList;
import java.util.List;

import com.chenanyi.fuli.Helper.Download;
import com.chenanyi.fuli.Helper.GetHHH;
import com.chenanyi.fuli.Helper.RegContent;

public class Getimg {

    /**
     * Downloads every image found on the given list pages.
     *
     * @author 陈安一!
     * @param cate      category id, 6 is the picture category
     * @param pagecount number of list pages to crawl
     * @param path      directory to save the images into
     */
    public static void Getimg(int cate, int pagecount, String path) {
        int count = 0;
        for (int m = 1; m <= pagecount; m++) {
            // article-list-id-16-page- : 16 is the novel category,
            // 6 is the picture category
            String url = "http://www.laossee.com/article-list-id-" + cate
                    + "-page-" + m + ".html";
            String info = GetHHH.getContentFormUrl(url);
            String reg = "article-show-id-\\d{6}";
            List<String> result = RegContent.GetallURL(RegContent.GetCon(reg, info));
            for (int i = 0; i < result.size(); i++) {
                String cont = GetHHH.getContentFormUrl(result.get(i));
                List<String> img_urls = RegContent.GetCon("<img src=\"(.*?)/>", cont);
                for (int j = 0; j < img_urls.size(); j++) {
                    count++;
                    // Strip the leading <img src=" and everything after the closing quote
                    String temp = img_urls.get(j).substring(10);
                    int index = temp.indexOf("\"");
                    temp = temp.substring(0, index);
                    Download.down(temp, path, count);
                    System.out.println(count + "\tOK");
                }
            }
        }
    }

    /**
     * Collects the image URLs on a single list page without downloading them.
     *
     * @author 陈安一!
     * @param cate category id, 6 is the picture category
     * @param page the list page number
     * @return List<String> of image URLs
     */
    public static List<String> GetOnePageimg(int cate, int page) {
        List<String> img_url = new ArrayList<String>();
        String url = "http://www.laossee.com/article-list-id-" + cate
                + "-page-" + page + ".html";
        String info = GetHHH.getContentFormUrl(url);
        String reg = "article-show-id-\\d{6}";
        List<String> result = RegContent.GetallURL(RegContent.GetCon(reg, info));
        for (int i = 0; i < result.size(); i++) {
            String cont = GetHHH.getContentFormUrl(result.get(i));
            List<String> img_urls = RegContent.GetCon("<img src=\"(.*?)/>", cont);
            for (int j = 0; j < img_urls.size(); j++) {
                String temp = img_urls.get(j).substring(10);
                int index = temp.indexOf("\"");
                temp = temp.substring(0, index);
                System.out.println(temp);
                img_url.add(temp);
            }
        }
        return img_url;
    }
}
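A usage sketch for the two entry points (category id 6 is the picture list, per the comments above):

    // Download every image from the first page of category 6 into f:\image4\
    Getimg.Getimg(6, 1, "f:\\image4\\");
    // Or just collect the image URLs of page 1 without downloading
    List<String> urls = Getimg.GetOnePageimg(6, 1);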
6. Run it!

package com.chenanyi.fuli.start;

import com.chenanyi.fuli.NBHelp.Getimg;

public class Start {

    /**
     * Grabs the ***** goodies, hahaha.
     *
     * @author 陈安一
     * @see Web crawler
     * @param args
     */
    public static void main(String[] args) {
        try {
            java.util.Scanner scanner = new java.util.Scanner(System.in);
            System.out.println("**************************************************************");
            System.out.println("First argument: category id - pictures (6, 7, 13), novels (14, 15, 16)");
            System.out.println("**************************************************************");
            System.out.println("Second argument: total number of pages to fetch, >= 1");
            System.out.println("**************************************************************");
            System.out.println("Third argument: save path, in the form f:\\\\image4\\\\");
            System.out.println("**************************************************************");
            System.out.println("Enter the first argument");
            int value = scanner.nextInt();
            System.out.println("Enter the second argument");
            int value1 = scanner.nextInt();
            System.out.println("Enter the third argument");
            String line = scanner.next();
            System.out.println("Running ...");
            Getimg.Getimg(value, value1, line);
            System.out.println("Done");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
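For example, entering 6 for the category, 2 for the page count, and the path in the format shown by the prompt downloads the first two picture-list pages into that folder.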
7. Leave me a comment!
This post comes from the "大包子" blog; please contact the author before reposting!