爬虫抓取网页图片
2009-10-15 10:06
281 查看
使用爬虫抓取艺术猫网页上的图片及其相关信息。
package com.yishu.site.cnrenti;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.yuxiong.utils.commonshc.MyGetMethod;
import com.yishu.commons.Domian;
import com.yishu.commons.Globals;
/**
 * Crawler for the ysmao art site (http://www.ysmao.com).
 *
 * <p>Walks the numbered category listings, then every listing page of a
 * category, then every gallery linked from a listing page, and prints the
 * extracted metadata and image URLs to standard output. Nothing is stored.
 *
 * @author fuyun
 * 2009-8-15
 */
public class PictureParser extends GetMethod {

    public PictureParser(String uri) {
        super(uri);
    }

    // <div id="content">
    private static final NodeFilter CONTENT_DIV_ID = new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("id", "content"));
    // <div class="onetd noBorder">
    private static final NodeFilter ONETD_DIV_CLASS = new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("class", "onetd noBorder"));
    private static final NodeFilter FILTER_A = new TagNameFilter("a");
    private static final NodeFilter FILTER_IMG = new TagNameFilter("img");

    /** Category labels; index {@code k} belongs to the site's category number {@code k + 1}. */
    private static final String[] CATEGORY_NAMES = {
        "东方人体", "西方人体", "人体彩绘", "油画艺术",
        "素描艺术", "雕塑艺术", "水墨艺术", "其它艺术"
    };

    /**
     * Downloads {@code url} and returns the response body as a string.
     * The connection is always handed back to the client, even on failure;
     * the body is fully read into the returned string before that happens.
     */
    private static String fetchBody(String url) throws HttpException, IOException {
        MyGetMethod method = new MyGetMethod(url);
        try {
            Globals.getClient(Domian.CNRENTI).executeMethod(method);
            return method.getResponseBodyAsString();
        } finally {
            method.releaseConnection();
        }
    }

    /**
     * Isolates the {@code <div id="content">} part of {@code body} and returns
     * the nodes inside it that match {@code filter}.
     */
    private static NodeList selectInContent(String body, NodeFilter filter) throws ParserException {
        NodeList content = Parser.createParser(body, Globals.CHARSET_UTF8).parse(CONTENT_DIV_ID);
        return Parser.createParser(content.toHtml(), Globals.CHARSET_UTF8).parse(filter);
    }

    /**
     * Returns a UTF-8 round-tripped copy of {@code body}. The bytes are decoded
     * with the same charset they were encoded with, so the result does not
     * depend on the JVM's default charset.
     */
    private static String utf8Copy(String body) throws IOException {
        return new String(body.getBytes(Globals.CHARSET_UTF8), Globals.CHARSET_UTF8);
    }

    /**
     * Processes one category: reads the page count from the last link of the
     * category's first listing page and crawls every listing page via
     * {@link #two(String, String)}.
     *
     * @param url  address of the category's first listing page
     * @param i    numeric category id used to build the listing page addresses
     * @param type category label passed through to the printed output
     * @return the body of the page at {@code url}
     * @throws HttpException if the HTTP exchange fails
     * @throws IOException   if the response cannot be read
     */
    public static String one(String url, int i, String type) throws HttpException, IOException {
        String body = fetchBody(url);
        try {
            NodeList links = selectInContent(body, FILTER_A);
            if (links != null && links.size() > 0) {
                // The number after '_' in the last link (".../<id>_<n>.htm") is used as the page count.
                LinkTag lastLink = (LinkTag) links.elementAt(links.size() - 1);
                // Escape the dot: String.split takes a regex and a bare '.' matches any character.
                String beforeExtension = lastLink.getLink().split("\\.htm")[0];
                int yeshu = Integer.parseInt(beforeExtension.split("_")[1]);
                System.out.println("页数:" + yeshu);
                for (int page = 1; page <= yeshu; page++) {
                    two(type, "http://www.ysmao.com/ystype/" + i + "_" + page + ".htm");
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return utf8Copy(body);
    }

    /**
     * Processes one listing page: prints title, address, click count, uploader
     * and upload date of each gallery entry and crawls the gallery via
     * {@link #picture(String, String, String, Long, String, Date)}.
     *
     * <p>A malformed upload date stops processing of the rest of this page
     * (the error is printed); a malformed click count propagates as a
     * {@link NumberFormatException}.
     *
     * @param type category label passed through to the printed output
     * @param url  address of the listing page
     * @return the body of the page at {@code url}
     * @throws HttpException if the HTTP exchange fails
     * @throws IOException   if the response cannot be read
     */
    public static String two(String type, String url) throws HttpException, IOException {
        String body = fetchBody(url);
        try {
            NodeList entries = selectInContent(body, ONETD_DIV_CLASS);
            if (entries != null && entries.size() > 0) {
                // "MM" is month-of-year; the former "mm" is minute-of-hour and
                // silently produced dates in January.
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
                // NOTE(review): the last matching div is left out on purpose by the
                // original loop bound; presumably it is not a gallery entry - confirm.
                int lastEntry = entries.size() - 1;
                for (int idx = 0; idx < lastEntry; idx++) {
                    Node entry = entries.elementAt(idx);
                    NodeList meta = entry.getLastChild().getChildren();
                    Long click = Long.valueOf(
                            meta.elementAt(0).getChildren().elementAt(1).getChildren().toHtml());
                    Date submittime = dateFormat.parse(
                            meta.elementAt(2).getChildren().elementAt(1).getChildren().toHtml());
                    String subuser =
                            meta.elementAt(1).getChildren().elementAt(1).getChildren().toHtml();

                    Node titleLink = entry.getChildren().elementAt(1).getChildren().elementAt(0);
                    String titlelist = titleLink.toPlainTextString();
                    // Keep what follows "/htm" in the tag text and drop its final character.
                    String surl = titleLink.getText().split("/htm")[1];
                    String strurl = "http://www.ysmao.com/htm" + surl.substring(0, surl.length() - 1);

                    System.out.println("图片分类 = " + type);
                    System.out.println("[组]标题 = " + titlelist);
                    System.out.println("[组]地址 = " + strurl);
                    System.out.println("点击次数 = " + click);
                    System.out.println("上传者 = " + subuser);
                    System.out.println("上传时间 = " + submittime);
                    picture(type, titlelist, strurl, click, subuser, submittime);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return utf8Copy(body);
    }

    /**
     * Processes one gallery page: prints the name and thumbnail URL of each
     * picture and, when the thumbnail URL has exactly five '/'-separated
     * parts, the derived full-size URL (a "big" segment inserted before the
     * file name).
     *
     * <p>Only {@code strurl} is used; the remaining parameters are accepted
     * but not read by this method.
     *
     * @param strurl address of the gallery page
     * @return the body of the page at {@code strurl}
     * @throws HttpException if the HTTP exchange fails
     * @throws IOException   if the response cannot be read
     */
    public static String picture(String type, String titlelist, String strurl, Long click,
            String subuser, Date submittime) throws HttpException, IOException {
        String body = fetchBody(strurl);
        try {
            NodeList items = selectInContent(body, ONETD_DIV_CLASS);
            if (items != null && items.size() > 0) {
                // The <img> list does not change between iterations, so parse it once.
                // Images are paired with items by position.
                NodeList images =
                        Parser.createParser(items.toHtml(), Globals.CHARSET_UTF8).parse(FILTER_IMG);
                for (int idx = 0; idx < items.size(); idx++) {
                    Node item = items.elementAt(idx);
                    String name = item.getChildren().elementAt(0).getChildren().elementAt(0)
                            .getChildren().toHtml().split("名称:")[1];
                    System.out.println("图片名 = " + name.substring(0, name.length() - 2));

                    ImageTag img = (ImageTag) images.elementAt(idx);
                    String smallUrl = img.getImageURL();
                    System.out.println("小图片 = " + smallUrl);

                    String[] parts = smallUrl.split("/");
                    if (parts.length == 5) {
                        String bigPicUrl = parts[0] + "/" + parts[1] + "/" + parts[2] + "/"
                                + parts[3] + "/big/" + parts[4];
                        System.out.println("大图片 = " + bigPicUrl);
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return utf8Copy(body);
    }

    /**
     * Crawls every category in {@link #CATEGORY_NAMES}, starting at each
     * category's first listing page.
     */
    public static void main(String[] args) throws HttpException, IOException {
        for (int i = 1; i <= CATEGORY_NAMES.length; i++) {
            one("http://www.ysmao.com/ystype/" + i + "_1.htm", i, CATEGORY_NAMES[i - 1]);
        }
    }
}
package com.yishu.site.cnrenti;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.yuxiong.utils.commonshc.MyGetMethod;
import com.yishu.commons.Domian;
import com.yishu.commons.Globals;
/**
 * Crawler for the ysmao art site (http://www.ysmao.com).
 *
 * <p>Walks the numbered category listings, then every listing page of a
 * category, then every gallery linked from a listing page, and prints the
 * extracted metadata and image URLs to standard output. Nothing is stored.
 *
 * @author fuyun
 * 2009-8-15
 */
public class PictureParser extends GetMethod {

    public PictureParser(String uri) {
        super(uri);
    }

    // <div id="content">
    private static final NodeFilter CONTENT_DIV_ID = new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("id", "content"));
    // <div class="onetd noBorder">
    private static final NodeFilter ONETD_DIV_CLASS = new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("class", "onetd noBorder"));
    private static final NodeFilter FILTER_A = new TagNameFilter("a");
    private static final NodeFilter FILTER_IMG = new TagNameFilter("img");

    /** Category labels; index {@code k} belongs to the site's category number {@code k + 1}. */
    private static final String[] CATEGORY_NAMES = {
        "东方人体", "西方人体", "人体彩绘", "油画艺术",
        "素描艺术", "雕塑艺术", "水墨艺术", "其它艺术"
    };

    /**
     * Downloads {@code url} and returns the response body as a string.
     * The connection is always handed back to the client, even on failure;
     * the body is fully read into the returned string before that happens.
     */
    private static String fetchBody(String url) throws HttpException, IOException {
        MyGetMethod method = new MyGetMethod(url);
        try {
            Globals.getClient(Domian.CNRENTI).executeMethod(method);
            return method.getResponseBodyAsString();
        } finally {
            method.releaseConnection();
        }
    }

    /**
     * Isolates the {@code <div id="content">} part of {@code body} and returns
     * the nodes inside it that match {@code filter}.
     */
    private static NodeList selectInContent(String body, NodeFilter filter) throws ParserException {
        NodeList content = Parser.createParser(body, Globals.CHARSET_UTF8).parse(CONTENT_DIV_ID);
        return Parser.createParser(content.toHtml(), Globals.CHARSET_UTF8).parse(filter);
    }

    /**
     * Returns a UTF-8 round-tripped copy of {@code body}. The bytes are decoded
     * with the same charset they were encoded with, so the result does not
     * depend on the JVM's default charset.
     */
    private static String utf8Copy(String body) throws IOException {
        return new String(body.getBytes(Globals.CHARSET_UTF8), Globals.CHARSET_UTF8);
    }

    /**
     * Processes one category: reads the page count from the last link of the
     * category's first listing page and crawls every listing page via
     * {@link #two(String, String)}.
     *
     * @param url  address of the category's first listing page
     * @param i    numeric category id used to build the listing page addresses
     * @param type category label passed through to the printed output
     * @return the body of the page at {@code url}
     * @throws HttpException if the HTTP exchange fails
     * @throws IOException   if the response cannot be read
     */
    public static String one(String url, int i, String type) throws HttpException, IOException {
        String body = fetchBody(url);
        try {
            NodeList links = selectInContent(body, FILTER_A);
            if (links != null && links.size() > 0) {
                // The number after '_' in the last link (".../<id>_<n>.htm") is used as the page count.
                LinkTag lastLink = (LinkTag) links.elementAt(links.size() - 1);
                // Escape the dot: String.split takes a regex and a bare '.' matches any character.
                String beforeExtension = lastLink.getLink().split("\\.htm")[0];
                int yeshu = Integer.parseInt(beforeExtension.split("_")[1]);
                System.out.println("页数:" + yeshu);
                for (int page = 1; page <= yeshu; page++) {
                    two(type, "http://www.ysmao.com/ystype/" + i + "_" + page + ".htm");
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return utf8Copy(body);
    }

    /**
     * Processes one listing page: prints title, address, click count, uploader
     * and upload date of each gallery entry and crawls the gallery via
     * {@link #picture(String, String, String, Long, String, Date)}.
     *
     * <p>A malformed upload date stops processing of the rest of this page
     * (the error is printed); a malformed click count propagates as a
     * {@link NumberFormatException}.
     *
     * @param type category label passed through to the printed output
     * @param url  address of the listing page
     * @return the body of the page at {@code url}
     * @throws HttpException if the HTTP exchange fails
     * @throws IOException   if the response cannot be read
     */
    public static String two(String type, String url) throws HttpException, IOException {
        String body = fetchBody(url);
        try {
            NodeList entries = selectInContent(body, ONETD_DIV_CLASS);
            if (entries != null && entries.size() > 0) {
                // "MM" is month-of-year; the former "mm" is minute-of-hour and
                // silently produced dates in January.
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
                // NOTE(review): the last matching div is left out on purpose by the
                // original loop bound; presumably it is not a gallery entry - confirm.
                int lastEntry = entries.size() - 1;
                for (int idx = 0; idx < lastEntry; idx++) {
                    Node entry = entries.elementAt(idx);
                    NodeList meta = entry.getLastChild().getChildren();
                    Long click = Long.valueOf(
                            meta.elementAt(0).getChildren().elementAt(1).getChildren().toHtml());
                    Date submittime = dateFormat.parse(
                            meta.elementAt(2).getChildren().elementAt(1).getChildren().toHtml());
                    String subuser =
                            meta.elementAt(1).getChildren().elementAt(1).getChildren().toHtml();

                    Node titleLink = entry.getChildren().elementAt(1).getChildren().elementAt(0);
                    String titlelist = titleLink.toPlainTextString();
                    // Keep what follows "/htm" in the tag text and drop its final character.
                    String surl = titleLink.getText().split("/htm")[1];
                    String strurl = "http://www.ysmao.com/htm" + surl.substring(0, surl.length() - 1);

                    System.out.println("图片分类 = " + type);
                    System.out.println("[组]标题 = " + titlelist);
                    System.out.println("[组]地址 = " + strurl);
                    System.out.println("点击次数 = " + click);
                    System.out.println("上传者 = " + subuser);
                    System.out.println("上传时间 = " + submittime);
                    picture(type, titlelist, strurl, click, subuser, submittime);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return utf8Copy(body);
    }

    /**
     * Processes one gallery page: prints the name and thumbnail URL of each
     * picture and, when the thumbnail URL has exactly five '/'-separated
     * parts, the derived full-size URL (a "big" segment inserted before the
     * file name).
     *
     * <p>Only {@code strurl} is used; the remaining parameters are accepted
     * but not read by this method.
     *
     * @param strurl address of the gallery page
     * @return the body of the page at {@code strurl}
     * @throws HttpException if the HTTP exchange fails
     * @throws IOException   if the response cannot be read
     */
    public static String picture(String type, String titlelist, String strurl, Long click,
            String subuser, Date submittime) throws HttpException, IOException {
        String body = fetchBody(strurl);
        try {
            NodeList items = selectInContent(body, ONETD_DIV_CLASS);
            if (items != null && items.size() > 0) {
                // The <img> list does not change between iterations, so parse it once.
                // Images are paired with items by position.
                NodeList images =
                        Parser.createParser(items.toHtml(), Globals.CHARSET_UTF8).parse(FILTER_IMG);
                for (int idx = 0; idx < items.size(); idx++) {
                    Node item = items.elementAt(idx);
                    String name = item.getChildren().elementAt(0).getChildren().elementAt(0)
                            .getChildren().toHtml().split("名称:")[1];
                    System.out.println("图片名 = " + name.substring(0, name.length() - 2));

                    ImageTag img = (ImageTag) images.elementAt(idx);
                    String smallUrl = img.getImageURL();
                    System.out.println("小图片 = " + smallUrl);

                    String[] parts = smallUrl.split("/");
                    if (parts.length == 5) {
                        String bigPicUrl = parts[0] + "/" + parts[1] + "/" + parts[2] + "/"
                                + parts[3] + "/big/" + parts[4];
                        System.out.println("大图片 = " + bigPicUrl);
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return utf8Copy(body);
    }

    /**
     * Crawls every category in {@link #CATEGORY_NAMES}, starting at each
     * category's first listing page.
     */
    public static void main(String[] args) throws HttpException, IOException {
        for (int i = 1; i <= CATEGORY_NAMES.length; i++) {
            one("http://www.ysmao.com/ystype/" + i + "_1.htm", i, CATEGORY_NAMES[i - 1]);
        }
    }
}
相关文章推荐
- python抓取网页图片示例(python爬虫)
- python抓取网页图片示例(python爬虫)
- Python爬虫之网页图片抓取的方法
- 爬虫抓取网页图片
- 第一个python程序,小爬虫--抓取网页图片
- Python之多线程爬虫抓取网页图片的示例代码
- 爬虫-简单抓取网页图片
- python爬虫实战(1)抓取网页图片自动保存
- Python3简单爬虫抓取网页图片
- python抓取网页图片 python爬虫实例
- Python爬虫实现抓取网页图片
- Python3简单爬虫抓取网页图片
- java爬虫实战简单用Jsoup框架进行网页爬虫(如抓取网页图片)
- Java爬虫网页抓取图片
- Python3简单爬虫抓取网页图片
- Python爬虫抓取网页图片
- python爬虫之抓取网页中的图片到本地
- Python爬虫学习笔记一:简单网页图片抓取
- Python爬虫学习笔记二:百度贴吧网页图片抓取
- 爬虫抓取网页图片