您的位置：首页 > 其它

网页内容抓取：图片的抓取方法

2015-11-06 15:52 260 查看

DOWNLOADPIC下载图片方法：

package com.bwjf.webapp.util;

import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads brand-logo images referenced by a web page and stores them on the local disk.
 *
 * <p>The page is parsed with jsoup. Every anchor matching {@code a[class$=title-logo]} is
 * inspected and the {@code src} attribute of its first child node is used as the image URL.
 *
 * <p>NOTE(review): the counters are plain, unsynchronized static fields, so this class is
 * only suitable for single-threaded use.
 */
public class picHtmlDown {

    /** CSS query selecting anchors whose class attribute ends with "title-logo". */
    private static final String LOGO_LINK_QUERY = "a[class$=title-logo]";

    /** Timeout, in milliseconds, for fetching the HTML page through jsoup. */
    private static final int PAGE_TIMEOUT_MS = 1000000;

    /** Timeout, in milliseconds, for establishing an image connection. */
    private static final int CONNECT_TIMEOUT_MS = 10 * 1000;

    /** Timeout, in milliseconds, for reading image data. */
    private static final int READ_TIMEOUT_MS = 6 * 10000;

    /** {@link #getImages} only stores images strictly larger than this many bytes. */
    private static final int MIN_IMAGE_BYTES = 1024 * 10;

    /** Number of image URLs visited so far. */
    private static int COUNT = 0;

    /** Number of images written to disk so far. */
    private static int DOWN_COUNT = 0;

    /**
     * Fetches {@code urlPath} and downloads every logo image found on it into
     * {@code d://images//} under a numbered file name ({@code 0000<n>.png}).
     *
     * <p>Links without a usable image URL are skipped instead of aborting the whole run.
     *
     * @param urlPath address of the HTML page to scan
     * @throws Exception if the page cannot be fetched or an image download fails
     */
    public static void jsoupHTML(String urlPath) throws Exception {
        Document doc = Jsoup.connect(urlPath).timeout(PAGE_TIMEOUT_MS).get();
        Elements srcLinks = doc.select(LOGO_LINK_QUERY);

        for (Element link : srcLinks) {
            String imagesPath = firstChildSrc(link);
            if (imagesPath.length() == 0) {
                System.out.println("跳过：未找到图片地址，链接:" + link.attr("href"));
                continue;
            }

            System.out.println("当前访问路径:" + imagesPath);
            getImages(imagesPath, "d://images//0000" + COUNT++ + ".png");
        }
    }

    /**
     * Downloads the image at {@code urlPath} over HTTP and writes it to {@code fileName}.
     *
     * <p>Nothing is written when the server does not answer with HTTP 200 or when the
     * payload is not larger than {@link #MIN_IMAGE_BYTES} bytes.
     *
     * @param urlPath  URL of the image
     * @param fileName path of the file to create
     * @throws Exception if the URL is malformed or an I/O error occurs
     */
    public static void getImages(String urlPath, String fileName) throws Exception {
        URL url = new URL(urlPath);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);

            // The previous check (< 10000) was always true; only a 200 response carries the image.
            int status = conn.getResponseCode();
            if (status != HttpURLConnection.HTTP_OK) {
                System.out.println("下载失败，HTTP状态码:" + status + " 路径:" + urlPath);
                return;
            }

            byte[] data = readStream(conn.getInputStream());
            if (data.length > MIN_IMAGE_BYTES) {
                File target = new File(fileName);
                ensureParentDirectory(target);
                FileOutputStream outputStream = new FileOutputStream(target);
                try {
                    outputStream.write(data);
                } finally {
                    outputStream.close();
                }
                System.out.println("第" + ++DOWN_COUNT + "图片下载成功");
            }
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Reads {@code inputStream} to its end and returns the bytes read.
     *
     * <p>The stream is always closed before this method returns, even when reading fails.
     *
     * @param inputStream stream to drain; closed by this method
     * @return all bytes read from the stream
     * @throws Exception if reading or closing the stream fails
     */
    public static byte[] readStream(InputStream inputStream) throws Exception {
        try {
            ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int len;
            while ((len = inputStream.read(buffer)) != -1) {
                outputStream.write(buffer, 0, len);
            }
            return outputStream.toByteArray();
        } finally {
            inputStream.close();
        }
    }

    /*
     * The two methods below (daownPic and the three-argument jsoupHTML) are enough to scrape
     * the data of one specific tag; adapt the details to the page being scraped.
     */

    /**
     * Method 1: downloads the resource at {@code url} into {@code d://imagess//<path>}.
     *
     * <p>NOTE(review): {@code path} is appended to the target directory unchanged; callers
     * that build it from scraped page content should make sure it is a safe file name.
     *
     * @param url  address of the image to download
     * @param path file name (relative to {@code d://imagess//}) to store the image under
     * @return the full path of the file that was written
     * @throws IOException if the URL is malformed, the download fails or the file cannot be written
     */
    public static String daownPic(String url, String path) throws IOException {
        URL ul = new URL(url);
        // Path of the new local file for the downloaded picture.
        String newPicName = "d://imagess//" + path;
        File target = new File(newPicName);

        // Explicit timeouts so that a stalled server cannot block the caller forever.
        java.net.URLConnection conn = ul.openConnection();
        conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
        conn.setReadTimeout(READ_TIMEOUT_MS);

        InputStream in = conn.getInputStream();
        try {
            ensureParentDirectory(target);
            FileOutputStream out = new FileOutputStream(target);
            try {
                byte[] buffer = new byte[2048];
                int length;
                while ((length = in.read(buffer)) != -1) {
                    out.write(buffer, 0, length);
                }
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
        return newPicName;
    }

    /**
     * Test entry point: scrapes one sample page and prints the visit/download counters.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            jsoupHTML("A", "奥迪", "http://car.bitauto.com/tree_chexing/mb_9/");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            System.out.println("共访问" + COUNT + "张图片，其中下载" + DOWN_COUNT + "张图片");
        }
    }

    /**
     * Method 2: fetches {@code urlPath}, downloads each logo image found on it and reports
     * where the last one was stored.
     *
     * <p>The file name is the plain concatenation {@code firstChar + engName + name + ".png"},
     * where {@code engName} is the anchor's {@code href} with every "/" removed; for example
     * "A", "/Audi/" and "奥迪" give {@code AAudi奥迪.png}.
     *
     * @param firstChar leading part of the generated file name
     * @param name      trailing part of the generated file name
     * @param urlPath   address of the HTML page to scan
     * @return {@code "<saved file path>,<engName>"} for the last downloaded image, or an
     *         empty string when no image was downloaded
     * @throws Exception if the page cannot be fetched or a download fails
     */
    public static String jsoupHTML(String firstChar, String name, String urlPath)
            throws Exception {
        String pathh = "";

        // Connect to the given address with jsoup and parse the response into a Document.
        Document doc = Jsoup.connect(urlPath).timeout(PAGE_TIMEOUT_MS).get();

        // Anchors whose class ends with "title-logo" hold the logo images of the page.
        Elements srcLinks = doc.select(LOGO_LINK_QUERY);

        for (Element link : srcLinks) {
            // The image URL is the src attribute of the anchor's first child node.
            String imagesPath = firstChildSrc(link);
            if (imagesPath.length() == 0) {
                System.out.println("跳过：未找到图片地址，链接:" + link.attr("href"));
                continue;
            }
            // Keep the counters reported by main() in step with the work actually done.
            COUNT++;

            // The href looks like "/Audi/"; dropping the slashes leaves the English name.
            String engName = link.attr("href").replace("/", "");
            System.out.println("当前访问图片的英文名称:" + engName);
            System.out.println("当前访问图片路径:" + imagesPath);

            // File name under which the picture is stored, built from the supplied data.
            String path = firstChar + engName + name + ".png";
            // TODO: update the database row matching name and firstChar: set ename=engName, img=path
            System.out.println("当前生成的文件名称:" + path);

            // Download the picture; the returned full local path is what callers persist.
            String picPath = daownPic(imagesPath, path);
            DOWN_COUNT++;

            // Return both the saved path and the English name for the later database update.
            pathh = picPath + "," + engName;
        }

        return pathh;
    }

    /** Returns the src attribute of the link's first child node, or "" when there is none. */
    private static String firstChildSrc(Element link) {
        if (link.childNodes().isEmpty()) {
            return "";
        }
        return link.childNode(0).attr("src");
    }

    /** Creates the directory that will contain {@code target} when it does not exist yet. */
    private static void ensureParentDirectory(File target) {
        File parent = target.getParentFile();
        if (parent != null && !parent.exists()) {
            // A failure here is reported by the FileOutputStream constructor that follows.
            parent.mkdirs();
        }
    }
}

MysqlUTIL数据操作保存方法：

package com.bwjf.webapp.util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
/**
 * Database helper that reads the car rows from {@code t_cars}, downloads each row's logo
 * through {@link picHtmlDown} and stores the resulting image path and English name back
 * into the same row.
 *
 * @author Administrator
 */
public class MysqlUtil {

    /**
     * JDBC URL format: jdbc:mysql://host:port/database?param=value.
     * useUnicode and characterEncoding are set to avoid garbled Chinese text.
     *
     * NOTE(review): the user name and password are embedded in this constant; consider
     * loading them from external configuration instead of source code.
     */
    private static final String URL = "jdbc:mysql://192.168.0.222:3306/cars?"
            + "user=root&password=soft123&useUnicode=true&characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull";

    /** Fully qualified name of the MySQL JDBC driver loaded before connecting. */
    private static final String DRIVER_CLASS = "com.mysql.jdbc.Driver";

    /**
     * Opens a MySQL connection for the given JDBC URL. Not used by {@link #main}.
     *
     * @param url JDBC URL to connect to
     * @return the open connection, or {@code null} when the driver cannot be loaded or the
     *         connection cannot be established (the stack trace is printed in that case)
     */
    public static Connection getConnection(String url) {
        Connection conn = null;
        try {
            Class.forName(DRIVER_CLASS);
            conn = DriverManager.getConnection(url);
        } catch (Exception e1) {
            e1.printStackTrace();
        }
        return conn;
    }

    /**
     * Runs the whole job in one place: for every row of {@code t_cars} the page stored in
     * its {@code url} column is scraped, and {@code img} / {@code ename} are updated with
     * the saved image path and the English name. Rows for which no image was found are
     * left untouched.
     *
     * <p>The database named in the JDBC URL must already exist.
     *
     * @param args unused
     * @throws Exception if closing the connection fails
     */
    public static void main(String[] args) throws Exception {
        Connection conn = null;
        PreparedStatement select = null;
        PreparedStatement update = null;
        ResultSet rs = null;
        try {
            // Load the MySQL driver dynamically so that DriverManager can find it.
            Class.forName(DRIVER_CLASS);
            System.out.println("成功加载MySQL驱动程序");

            // One Connection represents one database session.
            conn = DriverManager.getConnection(URL);
            select = conn.prepareStatement("select * from t_cars");
            // Prepared once and reused for every row instead of once per iteration.
            update = conn.prepareStatement("update t_cars set img=? , ename=? where url=?;");

            rs = select.executeQuery();
            while (rs.next()) {
                // Pick the columns needed from the current row.
                String name = rs.getString("name");
                String chars = rs.getString("firstchar");
                String urls = rs.getString("url");

                // Scrape the row's page; the result has the form "<image path>,<english name>".
                String picurl = picHtmlDown.jsoupHTML(chars, name, urls);

                // An empty result means no logo was found; indexing a split array would fail here.
                int comma = picurl.lastIndexOf(',');
                if (comma < 0) {
                    System.out.println("未抓取到图片，跳过:" + urls);
                    continue;
                }

                // Store the generated image path and the English name for this row.
                update.setString(1, picurl.substring(0, comma));
                update.setString(2, picurl.substring(comma + 1));
                update.setString(3, urls);
                update.executeUpdate();
            }
        } catch (SQLException e) {
            System.out.println("MySQL操作错误");
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeResultSet(rs);
            closeStatement(select);
            closeStatement(update);
            // conn stays null when the driver load or the connect call failed.
            if (conn != null) {
                conn.close();
            }
        }
    }

    /** Closes the result set if it was opened, printing (not propagating) any failure. */
    private static void closeResultSet(ResultSet resultSet) {
        if (resultSet != null) {
            try {
                resultSet.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /** Closes the statement if it was created, printing (not propagating) any failure. */
    private static void closeStatement(Statement statement) {
        if (statement != null) {
            try {
                statement.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： 网页数据抓取 图片抓取 数据保存

相关文章推荐

新的分享

章节导航

网页内容抓取 图片的抓取方法

DOWNLOADPIC下载图片方法：

MysqlUTIL数据操作保存方法：

网页内容抓取图片的抓取方法