Java获取网络文件并插入数据库
2010-08-20 10:36
211 查看
抓取各大网站的数据插入数据库,这样就不用为没有数据而烦恼了
获取百度的歌曲名,歌手和链接!!
DBTools数据库连接类:
Servlet调用:
获取金书网的图书名:
调用Servlet:
每种功能的实现方法有很多,希望各位可以交流不同的思想和方法。可以加QQ412546724。呵呵
获取百度的歌曲名,歌手和链接!!
package webTools; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import dbTools.DBTools; public class IOTOWeb { public String getHtmlContent(String htmlURL) { URL url = null; String rowContent = ""; StringBuffer htmlContent = new StringBuffer(); try { url = new URL(htmlURL); BufferedReader in = new BufferedReader(new InputStreamReader(url .openStream(), "gb2312")); while ((rowContent = in.readLine()) != null) { htmlContent.append(rowContent); } in.close(); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return htmlContent.toString(); } public List getLink(String htmlContent) { ArrayList listLink = new ArrayList(); String regex = "<td[^>]*>[//(]*<a[^>]*href=(/"([^/"]*)/"|/'([^/']*)/'|([^//s>]*))[^>]*>(.*?)[//)]*[//s]*</td>"; Pattern pattern = Pattern.compile(regex, Pattern.DOTALL); Matcher matcher = pattern.matcher(htmlContent); while (matcher.find()) { listLink.add(matcher.group()); } return listLink; } public List<String> getHref(String htmlContent) { String regex; List listtHref = new ArrayList(); regex = "href=(/"([^/"]*)/"|/'([^/']*)/'|([^//s>]*))/""; Pattern pa = Pattern.compile(regex, Pattern.DOTALL); Matcher ma = pa.matcher(htmlContent); while (ma.find()) { listtHref.add(ma.group().replaceFirst("href=/"", "").replace("/"", "")); } return listtHref; } public List<String> getPerson(String htmlContent) { String regex; List list = new ArrayList(); regex = "//(<a[^>]*href=(/"([^/"]*)/"|/'([^/']*)/'|([^//s>]*))[^>]*>(.*?)//)"; Pattern pa = 
Pattern.compile(regex, Pattern.DOTALL); Matcher ma = pa.matcher(htmlContent); while (ma.find()) { list.add(ma.group().replaceFirst("href=/"", "").replace("/"", "")); } return list; } public List<String> getSongName(String htmlContent) { String regex; List listPerson = new ArrayList(); regex = "<a[^>]*href=(/"([^/"]*)/"|/'([^/']*)/'|([^//s>]*))[^>]*>(.*?)</a>//s"; Pattern pa = Pattern.compile(regex, Pattern.DOTALL); Matcher ma = pa.matcher(htmlContent); while (ma.find()) { listPerson.add(ma.group()); } return listPerson; } public String getMainContent(String htmlContent) { String regex = "<table width=/"100%/" align=/"center/" cellpadding=/"0/" cellspacing=/"0/" class=/"list/">(.*?)</table>"; StringBuffer mainContent = new StringBuffer(); Pattern pattern = Pattern.compile(regex, Pattern.DOTALL); Matcher matcher = pattern.matcher(htmlContent); while (matcher.find()) { mainContent.append(matcher.group()); } return mainContent.toString(); } public String outTag(final String s) { return s.replaceAll("<.*?>", ""); } DBTools dbTools = new DBTools(); public void getFromBaiduMap3(String htmlURL) throws Throwable { HashMap htmlContentMap = new HashMap(); String htmlContent = getHtmlContent(htmlURL); String mainContent = getMainContent(htmlContent); List listLink = getLink(mainContent); for (int j = 0; j < listLink.size(); j++) { String tdTag = listLink.get(j).toString(); List songNameList = getSongName(tdTag); String songName = outTag(songNameList.get(0).toString()); List personList = getPerson(tdTag); String songPerson = ""; if (personList.size() != 0) { for (int n = 0; n < personList.size(); n++) { // System.out.println(personList.get(n).toString()); songPerson = outTag(personList.get(n).toString()); } } else { songPerson = "无"; } // System.out.print(songNameList.get(0).toString()); List hrefList = getHref(songNameList.get(0).toString()); String songHref = hrefList.get(0).toString(); System.out.println(); String sql = "insert into song(songName,songPerson,songHref) 
values(?,?,?)"; ArrayList list_values = new ArrayList(); list_values.add(songName); list_values.add(songPerson); list_values.add(songHref); dbTools.update(sql, list_values); } } }
DBTools数据库连接类:
package dbTools;

import java.util.ArrayList;
import java.sql.*;

/**
 * Small JDBC helper that opens one MySQL connection when constructed and runs
 * parameterised queries and updates on it.
 */
public class DBTools {

    private static final String DRIVER_CLASS = "com.mysql.jdbc.Driver";
    private static final String DEFAULT_URL = "jdbc:mysql://localhost:3306/TestURL";
    private static final String DEFAULT_USER = "root";
    // NOTE(review): the password is hard-coded in source; prefer passing
    // credentials through the three-argument constructor.
    private static final String DEFAULT_PASSWORD = "zhuyi";

    /** The connection opened by the constructor, or null if opening failed. */
    private final Connection connection;

    /** Connects to the default local database with the built-in credentials. */
    public DBTools() {
        this(DEFAULT_URL, DEFAULT_USER, DEFAULT_PASSWORD);
    }

    /**
     * Loads the MySQL driver class and opens a connection.
     *
     * <p>Failures are printed rather than thrown, so construction always
     * succeeds. A later {@link #query} or {@link #update} call then fails
     * with an {@link SQLException}.
     *
     * @param url      JDBC url of the database
     * @param user     database user
     * @param password database password
     */
    public DBTools(String url, String user, String password) {
        try {
            Class.forName(DRIVER_CLASS);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
        Connection opened = null;
        try {
            opened = DriverManager.getConnection(url, user, password);
        } catch (SQLException e) {
            e.printStackTrace();
        }
        connection = opened;
    }

    /**
     * Runs a parameterised SELECT.
     *
     * @param sql         statement text with {@code ?} placeholders
     * @param list_values values bound to the placeholders in order; null means none
     * @return one {@code String[]} per result row, holding every column read with {@code getString}
     * @throws Throwable if no connection is available or the database reports an error
     */
    public ArrayList<String[]> query(String sql, ArrayList<?> list_values) throws Throwable {
        ArrayList<String[]> listRows = new ArrayList<String[]>();
        PreparedStatement statement = requireConnection().prepareStatement(sql);
        try {
            bind(statement, list_values);
            ResultSet rows = statement.executeQuery();
            try {
                int columnCount = rows.getMetaData().getColumnCount();
                while (rows.next()) {
                    String[] rowinfo = new String[columnCount];
                    for (int i = 0; i < columnCount; i++) {
                        rowinfo[i] = rows.getString(i + 1);
                    }
                    listRows.add(rowinfo);
                }
            } finally {
                rows.close();
            }
        } finally {
            statement.close();
        }
        return listRows;
    }

    /**
     * Runs a parameterised INSERT, UPDATE or DELETE.
     *
     * @param sql         statement text with {@code ?} placeholders
     * @param list_values values bound to the placeholders in order; null means none
     * @throws Throwable if no connection is available or the database reports an error
     */
    public void update(String sql, ArrayList<?> list_values) throws Throwable {
        PreparedStatement statement = requireConnection().prepareStatement(sql);
        try {
            bind(statement, list_values);
            statement.executeUpdate();
        } finally {
            // Close even when binding or execution fails.
            statement.close();
        }
    }

    /** Returns the open connection or fails with a descriptive error. */
    private Connection requireConnection() throws SQLException {
        if (connection == null) {
            throw new SQLException("No database connection: opening it failed when DBTools was constructed");
        }
        return connection;
    }

    /** Binds each value to the placeholder at the same (1-based) position. */
    private static void bind(PreparedStatement statement, ArrayList<?> values) throws SQLException {
        if (values == null) {
            return;
        }
        for (int i = 0; i < values.size(); i++) {
            statement.setObject(i + 1, values.get(i));
        }
    }
}
Servlet调用:
package controller; import java.io.IOException; import java.io.PrintWriter; import java.util.List; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import webTools.IOTOWeb; public class TestURL extends HttpServlet { /** * Constructor of the object. */ public TestURL() { super(); } /** * Destruction of the servlet. <br> */ public void destroy() { super.destroy(); // Just puts "destroy" string in log // Put your code here } /** * The doGet method of the servlet. <br> * * This method is called when a form has its tag value method equals to get. * * @param request * the request send by the client to the server * @param response * the response send by the server to the client * @throws ServletException * if an error occurred * @throws IOException * if an error occurred */ public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { try { IOTOWeb iotoWeb = new IOTOWeb(); iotoWeb.getFromBaiduMap3("http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2"); } catch (Throwable e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * The doPost method of the servlet. <br> * * This method is called when a form has its tag value method equals to * post. 
* * @param request * the request send by the client to the server * @param response * the response send by the server to the client * @throws ServletException * if an error occurred * @throws IOException * if an error occurred */ public void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { response.setContentType("text/html"); PrintWriter out = response.getWriter(); out .println("<!DOCTYPE HTML PUBLIC /"-//W3C//DTD HTML 4.01 Transitional//EN/">"); out.println("<HTML>"); out.println(" <HEAD><TITLE>A Servlet</TITLE></HEAD>"); out.println(" <BODY>"); out.print(" This is "); out.print(this.getClass()); out.println(", using the POST method"); out.println(" </BODY>"); out.println("</HTML>"); out.flush(); out.close(); } /** * Initialization of the servlet. <br> * * @throws ServletException * if an error occurs */ public void init() throws ServletException { // Put your code here } }
获取金书网的图书名:
package webTools; import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import dbTools.DBTools; public class GetBook { public String getHtmlContent(String htmlURL) throws Throwable { URL url = null; String rowContent = ""; StringBuffer htmlContent = new StringBuffer(); url = new URL(htmlURL); BufferedReader in = new BufferedReader(new InputStreamReader(url .openStream(), "gb2312")); while ((rowContent = in.readLine()) != null) { htmlContent.append(rowContent); } in.close(); return htmlContent.toString(); } public String getBookName(String htmlContent) { String bookName = ""; String regex = "<span class=/"style15/">[^>]*</span>"; Pattern pattern = Pattern.compile(regex, Pattern.DOTALL); Matcher matcher = pattern.matcher(htmlContent); if (matcher.find()) { bookName = matcher.group(); } return bookName; } public String outTag(final String s) { return s.replaceAll("<.*?>", ""); } DBTools dbtools = new DBTools(); public void getFromJINSHU(String htmlURL) throws Throwable { String htmlContent = getHtmlContent(htmlURL); String bookName = outTag(getBookName(htmlContent)); if (bookName != null && !"".equals(bookName)) { System.out.println(bookName); String sql = "insert into bookinfo(bookName) values(?)"; ArrayList list_values = new ArrayList(); list_values.add(bookName); dbtools.update(sql, list_values); } } }
调用Servlet:
package controller; import java.io.IOException; import java.io.PrintWriter; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import webTools.GetBook; public class TestBook extends HttpServlet { /** * Constructor of the object. */ public TestBook() { super(); } /** * Destruction of the servlet. <br> */ public void destroy() { super.destroy(); // Just puts "destroy" string in log // Put your code here } /** * The doGet method of the servlet. <br> * * This method is called when a form has its tag value method equals to get. * * @param request * the request send by the client to the server * @param response * the response send by the server to the client * @throws ServletException * if an error occurred * @throws IOException * if an error occurred */ int i = 1; public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { GetBook bookinfo = new GetBook(); for (; i < 10000; i++) { String bookURL = "http://www.golden-book.com/booksinfo/12/" + i + ".html"; try { bookinfo.getFromJINSHU(bookURL); } catch (Throwable e) { i++; doPost(request, response); } } } /** * The doPost method of the servlet. <br> * * This method is called when a form has its tag value method equals to * post. * * @param request * the request send by the client to the server * @param response * the response send by the server to the client * @throws ServletException * if an error occurred * @throws IOException * if an error occurred */ public void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { GetBook bookinfo = new GetBook(); for (; i < 10000; i++) { String bookURL = "http://www.golden-book.com/booksinfo/12/" + i + ".html"; try { bookinfo.getFromJINSHU(bookURL); } catch (Throwable e) { i++; doGet(request, response); } } } /** * Initialization of the servlet. 
<br> * * @throws ServletException * if an error occurs */ public void init() throws ServletException { // Put your code here } }
每种功能的实现方法有很多,希望各位可以交流不同的思想和方法。可以加QQ412546724。呵呵
相关文章推荐
- Java获取网络文件并插入数据库
- Java获取网络文件并插入数据库的代码
- Java获取数据库自增主键表中插入数据的ID
- Java通过apache poi 读取excel(.xlsx)文件,并通过MyBatis插入数据库中
- JAVA获取网络文件的大小
- JavaWeb之JDBC(二)采用读取配置文件方式编写JDBC的工具类,获取数据库的连接
- Java实现把excel xls中数据转为可直接插入数据库的sql文件
- java实现读取XML文件数据插入到数据库中
- Java通过URL下载网络文件,获取文件流并修改文件名
- java 获取网络文件的编码问题
- Java 获取网络上的文件
- java获取网络文件大小
- 解决java网络下载获取不到文件长度
- 遇到问题---java获取网络文件大小失败getContentLength()为-1 完整解决方法
- java 获取当前时间插入数据库
- Java读取csv文件并将内容插入到数据库
- 遇到问题---java获取网络文件大小失败getContentLength()为-1 完整解决方法
- 转载:基于java的网络爬虫框架(实现京东数据的爬取,并将插入数据库)
- Java实现向数据库插入图片和获取数据库图片
- 在java中获取当前系统时间 插入数据库中的时间值没有时间只有日期的原因