您的位置:首页 > 其它

爬虫抓取网页图片

2009-10-15 10:06 281 查看
使用爬虫抓取艺术猫网页上的图片及其信息。

package com.yishu.site.cnrenti;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.yuxiong.utils.commonshc.MyGetMethod;

import com.yishu.commons.Domian;
import com.yishu.commons.Globals;

/**
* 艺术猫(http://www.ysmao.com)
* @author fuyun
* 2009-8-15
*/
public class PictureParser extends GetMethod{


public PictureParser(String uri) {
super(uri);
}

// <div id="content">
// <div class="onetd noBorder">
private static final NodeFilter CONTENT_DIV_ID = new AndFilter(
new TagNameFilter("div"), new HasAttributeFilter("id","content"));
private static final NodeFilter ONETD_DIV_CLASS = new AndFilter(
new TagNameFilter("div"), new HasAttributeFilter("class","onetd noBorder"));

private static final NodeFilter FILTER_A = new TagNameFilter("a");
private static final NodeFilter FILTER_IMG = new TagNameFilter("img");


public static String one(String url,int i,String type) throws HttpException, IOException{

MyGetMethod method = new MyGetMethod(url);
Globals.getClient(Domian.CNRENTI).executeMethod(method);
String body = method.getResponseBodyAsString();
Parser mobileParser = Parser.createParser(body, Globals.CHARSET_UTF8);
try {
NodeList nodeList = mobileParser.parse(CONTENT_DIV_ID);
Parser nodeParser1 = Parser.createParser(nodeList.toHtml(), Globals.CHARSET_UTF8);

NodeList aNlist1 = nodeParser1.parse(FILTER_A);
//System.out.println(aNlist1.toHtml());
if(aNlist1 != null && aNlist1.size()>0){
Node aNode2 = aNlist1.elementAt(aNlist1.size()-1);
LinkTag link2 = (LinkTag)aNode2;
int yeshu = Integer.parseInt(link2.getLink().split(".htm")[0].split("_")[1]);
System.out.println("页数:"+yeshu);
if(yeshu > 0){
for(int a=1;a<yeshu+1;a++){
url = "http://www.ysmao.com/ystype/"+i+"_"+a+".htm";
two(type,url);
}
}
}
} catch (ParserException e) {e.printStackTrace();
}
return new String(body.getBytes(Globals.CHARSET_UTF8));

}


public static String two(String type,String url) throws HttpException, IOException{

MyGetMethod method = new MyGetMethod(url);
Globals.getClient(Domian.CNRENTI).executeMethod(method);
String body = method.getResponseBodyAsString();
Parser mobileParser = Parser.createParser(body, Globals.CHARSET_UTF8);
try {
NodeList nodeList = mobileParser.parse(CONTENT_DIV_ID);
Parser nodeParser1 = Parser.createParser(nodeList.toHtml(), Globals.CHARSET_UTF8);

NodeList aNlist1 = nodeParser1.parse(ONETD_DIV_CLASS);
//System.out.println("aNlist1 = "+aNlist1.toHtml());
if(aNlist1 != null && aNlist1.size()>0){
for(int aa=0;aa<(aNlist1.size()-1);aa++){
Node aNode2 = aNlist1.elementAt(aa);
//NodeList aNlist1 = aNode2.parse(FILTER_SPAN);
Long click = Long.parseLong(aNode2.getLastChild().getChildren().elementAt(0).getChildren().elementAt(1).getChildren().toHtml());
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-mm-dd");
Date submittime = sdf.parse(aNode2.getLastChild().getChildren().elementAt(2).getChildren().elementAt(1).getChildren().toHtml());
String subuser = aNode2.getLastChild().getChildren().elementAt(1).getChildren().elementAt(1).getChildren().toHtml();
String titlelist = aNode2.getChildren().elementAt(1).getChildren().elementAt(0).toPlainTextString();
String surl = aNode2.getChildren().elementAt(1).getChildren().elementAt(0).getText().split("/htm")[1];
String strurl = "http://www.ysmao.com/htm"+surl.substring(0, surl.length()-1);
System.out.println("图片分类 = "+type);
System.out.println("[组]标题 = "+titlelist);
System.out.println("[组]地址 = "+strurl);
System.out.println("点击次数 = "+click);
System.out.println("上传者 = "+subuser);
System.out.println("上传时间 = "+submittime);

picture(type,titlelist,strurl,click,subuser,submittime);
}
}
} catch (ParserException e) {
e.printStackTrace();
} catch (ParseException e) {
e.printStackTrace();
}
return new String(body.getBytes(Globals.CHARSET_UTF8));
}


public static String picture(String type,String titlelist,String strurl,Long click,String subuser,Date submittime) throws HttpException, IOException{

MyGetMethod method = new MyGetMethod(strurl);
Globals.getClient(Domian.CNRENTI).executeMethod(method);
String body = method.getResponseBodyAsString();
Parser mobileParser = Parser.createParser(body, Globals.CHARSET_UTF8);
try {
NodeList nodeList = mobileParser.parse(CONTENT_DIV_ID);
Parser nodeParser1 = Parser.createParser(nodeList.toHtml(), Globals.CHARSET_UTF8);

NodeList aNlist1 = nodeParser1.parse(ONETD_DIV_CLASS);
//System.out.println(aNlist1.toHtml());
if(aNlist1 != null && aNlist1.size()>0){
for(int aa=0;aa<(aNlist1.size());aa++){
Node aNode2 = aNlist1.elementAt(aa);
//System.out.println(aNode2.toHtml());
String name = aNode2.getChildren().elementAt(0).getChildren().elementAt(0).getChildren().toHtml().split("名称:")[1];

System.out.println("图片名 = "+name.substring(0, name.length()-2));

Parser nodeParser11 = Parser.createParser(aNlist1.toHtml(), Globals.CHARSET_UTF8);
NodeList aNlist11 = nodeParser11.parse(FILTER_IMG);
Node aNode11 = aNlist11.elementAt(aa);
ImageTag img = (ImageTag)aNode11;
//String bigPicUrl = img.getImageURL().substring(0, img.getImageURL().length()-10)+"big/"+img.getImageURL().substring(img.getImageURL().length()-10, img.getImageURL().length());
System.out.println("小图片 = "+img.getImageURL());
if(img.getImageURL().split("/").length == 5){
String pic = img.getImageURL().split("/")[0]+"/"+img.getImageURL().split("/")[1]+"/"+img.getImageURL().split("/")[2]+"/"+img.getImageURL().split("/")[3];
String bigPicUrl = pic+"/big/"+img.getImageURL().split("/")[4];
System.out.println("大图片 = "+bigPicUrl);
}

//System.out.println("大图片 = "+bigPicUrl+"\n");
}
}
} catch (ParserException e) {
e.printStackTrace();
}
return new String(body.getBytes(Globals.CHARSET_UTF8));
}


public static void main(String[] arge) throws HttpException, IOException{

String type = "";
for(int i=1;i<9;i++){
if(i == 1) type = "东方人体";
else if(i == 2) type = "西方人体";
else if(i == 3) type = "人体彩绘";
else if(i == 4) type = "油画艺术";
else if(i == 5) type = "素描艺术";
else if(i == 6) type = "雕塑艺术";
else if(i == 7) type = "水墨艺术";
else if(i == 8) type = "其它艺术";
one("http://www.ysmao.com/ystype/"+i+"_1.htm",i,type);

}
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: