您的位置:首页 > 数据库

使用htmlparser简单抓取京东图书信息存入数据库的小例子

2011-11-07 09:03 417 查看
在学习lucene的时候需要大量的数据。大量的数据从哪里来?不可能手工一条条输入,于是从网上找了个方法——抓取数据。

我把目标锁定在了京东商城的图书信息。下面是我抓取图书信息的一个简单例子,写的不是特别严谨,但对我学习来说足够了。如果每个页面都有效的话完全能达到80万的数据。我把抓到的信息存入我设计的数据库中。在处理过程中,把处理好的网址信息以及出错信息存入e:/360book/book1.txt,使用前在E盘创建一个360book的文件夹。

数据库表结构为

-- Book catalogue populated by the Buy scraper below.
-- All descriptive fields are stored as free-form varchar because the scraped
-- page text is taken verbatim (dates, page counts and prices included).
CREATE TABLE `books` (
`id` bigint(20) NOT NULL AUTO_INCREMENT, -- surrogate key
`bookName` varchar(1024) DEFAULT NULL, -- title from the <h1> heading
`auther` varchar(1024) DEFAULT NULL, -- NOTE: misspelling of "author" kept on purpose; the Java code binds to this exact column name
`booksName` varchar(1024) DEFAULT NULL, -- series name ("丛书名")
`publish` varchar(1024) DEFAULT NULL, -- publisher, comma-joined if several
`isbn` varchar(1024) DEFAULT NULL,
`publishDate` varchar(1024) DEFAULT NULL,
`edition` varchar(1024) DEFAULT NULL,
`pages` varchar(1024) DEFAULT NULL,
`frame` varchar(1024) DEFAULT NULL, -- binding ("装帧")
`format` varchar(1024) DEFAULT NULL, -- physical format ("开本")
`type` varchar(1024) DEFAULT NULL, -- category links, comma-joined
`price` varchar(1024) DEFAULT NULL, -- list price scraped from the <del> tag
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=15779 DEFAULT CHARSET=utf8


例子中使用的开源解析库是htmlparser,版本为2.0。

这是处理页面信息的一个Class.

package com;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.util.HashMap;
import java.util.Map;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;

/**
 * Scrapes book detail pages from book.360buy.com (JD) with htmlparser 2.0 and
 * stores the extracted fields through {@link DBCon} into the {@code books} table.
 * Progress and errors are appended to e:/360book/book1.txt (the e:/360book
 * directory must exist beforehand).
 */
public class Buy {

    /** Keys of the summary map, in the order the fields appear as &lt;li&gt; items on the page. */
    private static final String[] SUMMARY_KEYS = {
        "auther", "booksName", "publish", "isbn", "publishDate",
        "edition", "pages", "frame", "format", "type"
    };

    /**
     * Iterates over product ids 10000002..10861852, scrapes each page and logs
     * either "&lt;url&gt; is ok!" or the caught exception to the progress file.
     * Failures on individual pages are logged and skipped so the crawl continues.
     */
    public static void main(String[] args) throws Exception {
        File logFile = new File("e:/360book/book1.txt");
        for (int i = 2; i <= 861852; i++) {
            int id = 10000000 + i;
            String url = "http://book.360buy.com/" + id + ".html";
            try {
                Buy.getBook(url);
                log(logFile, url + "\t is ok!");
            } catch (Exception e) {
                e.printStackTrace();
                try {
                    log(logFile, String.valueOf(e));
                } catch (IOException e1) {
                    e1.printStackTrace();
                }
            }
        }
    }

    /**
     * Appends one line to the progress file.
     * Fix: the original opened FileWriter/PrintWriter inline in two places and
     * leaked them if println threw; try-with-resources guarantees closure.
     */
    private static void log(File file, String line) throws IOException {
        PrintWriter printWriter = new PrintWriter(new FileWriter(file, true));
        try {
            printWriter.println(line);
        } finally {
            printWriter.close(); // also closes the underlying FileWriter
        }
    }

    /**
     * Downloads one product page, extracts title, summary fields and price,
     * and persists the record.
     *
     * @param url absolute product page URL
     * @throws Exception on network, parse or database failure (caller logs it)
     */
    public static void getBook(String url) throws Exception {
        Parser parser = new Parser((HttpURLConnection) new URL(url).openConnection());
        parser.setEncoding("gb2312"); // 360buy pages of that era were GB2312-encoded

        NodeList nameList = parser.extractAllNodesThatMatch(new HasAttributeFilter("id", "name"));
        String bookName = getBookName(nameList);

        parser.reset(); // the parser is one-shot; rewind before each extraction pass
        NodeList summaryList = parser.extractAllNodesThatMatch(new HasAttributeFilter("id", "summary"));
        Map<String, String> map = getSummary(summaryList);

        parser.reset();
        NodeList priceList = parser.extractAllNodesThatMatch(new HasAttributeFilter("id", "book-price"));
        String price = getPrice(priceList);

        Connection con = DBCon.getConnection();
        try {
            save(con, bookName, map, price);
        } finally {
            // Fix: the original leaked the connection when save() threw.
            DBCon.closeConnection(con);
        }
    }

    /**
     * Inserts one book row. All values are bound through PreparedStatement
     * parameters, so scraped text cannot inject SQL.
     */
    public static void save(Connection con, String bookName, Map<String, String> map, String price) throws Exception {
        String sql = "insert into books (bookName,auther,booksName,publish,isbn,publishDate,edition,pages,frame,format,type,price) values(?,?,?,?,?,?,?,?,?,?,?,?)";
        PreparedStatement pstmt = con.prepareStatement(sql);
        try {
            pstmt.setString(1, bookName);
            pstmt.setString(2, map.get("auther"));
            pstmt.setString(3, map.get("booksName"));
            pstmt.setString(4, map.get("publish"));
            pstmt.setString(5, map.get("isbn"));
            pstmt.setString(6, map.get("publishDate"));
            pstmt.setString(7, map.get("edition"));
            pstmt.setString(8, map.get("pages"));
            pstmt.setString(9, map.get("frame"));
            pstmt.setString(10, map.get("format"));
            pstmt.setString(11, map.get("type"));
            pstmt.setString(12, price);
            pstmt.executeUpdate();
        } finally {
            // Fix: the original leaked the statement when executeUpdate() threw.
            DBCon.closePreparedStatement(pstmt);
        }
    }

    /**
     * Maps the page's summary &lt;li&gt; items to field names by position.
     * Items beyond the known ten are ignored, matching the original switch.
     *
     * @param nodeList result of the id="summary" filter; element 0 must exist
     * @return field-name -> text map (may have fewer than ten entries)
     */
    public static Map<String, String> getSummary(NodeList nodeList) {
        Map<String, String> map = new HashMap<String, String>();
        NodeList items = nodeList.elementAt(0).getChildren()
                .extractAllNodesThatMatch(new TagNameFilter("li"));
        int n = Math.min(items.size(), SUMMARY_KEYS.length);
        for (int i = 0; i < n; i++) {
            Node item = items.elementAt(i);
            // Positions 0 (author), 2 (publisher) and 9 (category) are rendered
            // as <a> links; every other field is plain text after a <span> label.
            String value = (i == 0 || i == 2 || i == 9) ? joinAnchorTexts(item) : getValue(item);
            map.put(SUMMARY_KEYS[i], value);
        }
        return map;
    }

    /**
     * Extracts the plain text of a summary item: everything between the closing
     * &lt;/span&gt; of the label and the closing &lt;/li&gt;.
     * Throws StringIndexOutOfBoundsException on unexpected markup (caught upstream).
     */
    public static String getValue(Node node) {
        String html = node.toHtml();
        return html.substring(html.indexOf("</span>") + 7, html.indexOf("</li>"));
    }

    /** Publisher / category field: comma-joined text of the item's anchor tags. */
    public static String getPublish(Node node) {
        return joinAnchorTexts(node);
    }

    /** Author field: comma-joined text of the item's anchor tags. */
    public static String getAuther(Node node) {
        return joinAnchorTexts(node);
    }

    /**
     * Joins the text content of every &lt;a&gt; child of {@code node} with commas.
     * Fix: getAuther and getPublish were byte-for-byte duplicates; both now
     * delegate here.
     */
    private static String joinAnchorTexts(Node node) {
        StringBuilder joined = new StringBuilder();
        NodeList anchors = node.getChildren().extractAllNodesThatMatch(new TagNameFilter("a"));
        if (anchors != null) {
            for (int i = 0; i < anchors.size(); i++) {
                if (i > 0) {
                    joined.append(",");
                }
                joined.append(anchors.elementAt(i).getChildren().elementAt(0).getText());
            }
        }
        return joined.toString();
    }

    /**
     * Extracts the title: the text between &lt;h1&gt; and the trailing &lt;span&gt;
     * inside the id="name" element.
     */
    public static String getBookName(NodeList nodeList) {
        String html = nodeList.elementAt(0).getChildren().toHtml();
        return html.substring(html.indexOf("<h1>") + 4, html.indexOf("<span>"));
    }

    /**
     * Extracts the list price: the content of the &lt;del&gt; tag inside the
     * id="book-price" element.
     */
    public static String getPrice(NodeList nodeList) {
        NodeFilter withDelChild = new HasChildFilter(new TagNameFilter("del"));
        NodeList matches = nodeList.elementAt(0).getChildren().extractAllNodesThatMatch(withDelChild);
        String html = matches.toHtml();
        return html.substring(html.indexOf("<del>") + 5, html.indexOf("</del>"));
    }
}


DBCon.java
package com;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

//import javax.naming.Context;
//import javax.naming.InitialContext;
//import javax.sql.DataSource;

/**
 * Minimal JDBC helper for the scraper: hands out connections to the local
 * MySQL "test" schema and provides null-safe close methods.
 *
 * Error-handling contract (kept from the original, callers rely on it):
 * every failure is printed and swallowed; {@link #getConnection()} returns
 * {@code null} when the driver or the server is unavailable.
 */
public class DBCon {

    /** MySQL Connector/J driver class (pre-4.0 JDBC, needs explicit loading). */
    private static final String DRIVER_CLASS = "com.mysql.jdbc.Driver";
    /** Connection URL with inline credentials, unchanged from the original setup. */
    private static final String DB_URL =
            "jdbc:mysql://localhost:3306/test?user=root&password=root";

    /**
     * Opens a new connection to the local database.
     *
     * @return an open connection, or {@code null} if the driver cannot be
     *         loaded or the connection fails (the exception is printed)
     */
    public static Connection getConnection() {
        Connection con = null;
        try {
            // Fix: Class.forName alone registers a JDBC driver via its static
            // initializer; the original's redundant newInstance() is dropped.
            Class.forName(DRIVER_CLASS);
            con = DriverManager.getConnection(DB_URL);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return con;
    }

    /** Closes the connection if non-null; any SQLException is printed and swallowed. */
    public static void closeConnection(Connection con) {
        if (con != null) {
            try {
                con.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /** Closes the result set if non-null; any SQLException is printed and swallowed. */
    public static void closeResultSet(ResultSet rs) {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /** Closes the prepared statement if non-null (delegates: PreparedStatement is a Statement). */
    public static void closePreparedStatement(PreparedStatement pstmt) {
        closeStatement(pstmt);
    }

    /** Closes the statement if non-null; any SQLException is printed and swallowed. */
    public static void closeStatement(Statement stmt) {
        if (stmt != null) {
            try {
                stmt.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: