您的位置:首页 > 运维架构 > 网站架构

java模拟浏览器抓取网站信息和下载附件

2017-06-22 00:00 411 查看
package com.teamdev.jxbrowser.chromium.demoTest.Huhehaote;

import java.awt.BorderLayout;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

import java.net.MalformedURLException;

import java.net.URL;

import java.net.URLConnection;

import java.util.concurrent.CountDownLatch;

import java.util.concurrent.TimeUnit;

import java.util.logging.Level;

import javax.swing.JFrame;

import javax.swing.WindowConstants;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

import com.teamdev.jxbrowser.chromium.Browser;

import com.teamdev.jxbrowser.chromium.BrowserPreferences;

import com.teamdev.jxbrowser.chromium.LoggerProvider;

import com.teamdev.jxbrowser.chromium.events.FinishLoadingEvent;

import com.teamdev.jxbrowser.chromium.events.LoadAdapter;

import com.teamdev.jxbrowser.chromium.swing.BrowserView;

/**
 * Crawls food-and-drug notice listing pages and downloads the attachments
 * linked from each notice.
 *
 * <p>Pages are rendered through an embedded JxBrowser {@link Browser} (so that
 * the HTML is what a real browser would see), then parsed with jsoup. Each
 * listing page yields notice links; each notice page is scanned for attachment
 * links, which are downloaded below {@link #DOWNLOAD_ROOT} into one directory
 * per notice title.
 */
public class Test_Jsopu {

    /** Prefix prepended to the attachment paths found on a notice page. */
    private static final String SITE_ROOT = "http://www.sda.gov.cn/";

    /** Prefix prepended to the (relative) notice links found on a listing page. */
    private static final String SECTION_ROOT = "http://www.sda.gov.cn/WS01";

    /** Listing URL without its {@code .html} / {@code _N.html} suffix. */
    private static final String LISTING_BASE = "http://www.sda.gov.cn/WS01/CL1698/index";

    /** Number of listing pages to visit (index.html, index_1.html, ...). */
    private static final int LISTING_PAGE_COUNT = 7;

    /** Pause inserted before every page load and every download. */
    private static final long REQUEST_DELAY_MILLIS = 3000L;

    /** Maximum time to wait for the main frame of a page to finish loading. */
    private static final long PAGE_LOAD_TIMEOUT_SECONDS = 60L;

    /** Timeouts applied to attachment downloads so a stalled server cannot hang the crawl. */
    private static final int CONNECT_TIMEOUT_MILLIS = 30000;
    private static final int READ_TIMEOUT_MILLIS = 60000;

    /** Directory under which one sub-directory per notice title is created. */
    private static final String DOWNLOAD_ROOT = "c:/aa/";

    /** CSS class carried by each notice entry on a listing page. */
    private static final String LISTING_ITEM_CLASS = "ListColumnClass15";

    /** Inline-style fragment that precedes every attachment link on a notice page. */
    private static final String ATTACHMENT_MARKER = "line-height:16px";

    /**
     * Opens a browser window and crawls every listing page in turn.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Keep the embedded browser quiet: only severe problems are logged.
        LoggerProvider.getBrowserLogger().setLevel(Level.SEVERE);
        LoggerProvider.getIPCLogger().setLevel(Level.SEVERE);
        LoggerProvider.getChromiumProcessLogger().setLevel(Level.SEVERE);

        final Browser browser = new Browser();
        BrowserView browserView = new BrowserView(browser);

        BrowserPreferences preferences = browser.getPreferences();
        preferences.setImagesEnabled(false);

        JFrame frame = new JFrame();
        frame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);
        frame.add(browserView, BorderLayout.CENTER);
        frame.setSize(700, 500);
        frame.setLocationRelativeTo(null);
        frame.setVisible(true);

        for (int i = 0; i < LISTING_PAGE_COUNT; i++) {
            String url = (i == 0) ? LISTING_BASE + ".html" : LISTING_BASE + "_" + i + ".html";
            try {
                search(browser, frame, url);
            } catch (InterruptedException e) {
                // Someone asked this thread to stop: honour it instead of crawling on.
                Thread.currentThread().interrupt();
                break;
            } catch (Exception e) {
                // A broken listing page must not prevent the remaining pages from being crawled.
                e.printStackTrace();
            }
        }
    }

    /**
     * Loads one listing page and follows every notice linked from it.
     *
     * @param br  browser used to render the page
     * @param frm window hosting the browser (not used by this method)
     * @param url address of the listing page
     * @throws Exception if the thread is interrupted or the page cannot be parsed
     */
    public static void search(final Browser br, JFrame frm, final String url) throws Exception {
        Thread.sleep(REQUEST_DELAY_MILLIS);
        System.out.println("attrUrl" + url);

        Document doc = loadDocument(br, url);
        System.out.println(doc.text());

        Elements els = doc.getElementsByAttributeValue("class", LISTING_ITEM_CLASS);
        System.out.println(els.size());

        for (Element el : els) {
            Elements anchors = el.getElementsByTag("a");
            if (anchors.isEmpty()) {
                // Entry without a link: nothing to follow, and get(0) would throw.
                continue;
            }
            String title = anchors.get(0).text();
            // Links on the listing are relative ("../CLxxxx/nnn.html"); drop the ".." and anchor them.
            String href = SECTION_ROOT + anchors.attr("href").replace("..", "");
            System.out.println("href=" + href);
            try {
                search2(br, frm, href, title);
            } catch (InterruptedException e) {
                throw e;
            } catch (Exception e) {
                // One bad notice must not stop the rest of the listing.
                e.printStackTrace();
            }
        }
    }

    /**
     * Loads one notice page and downloads every attachment found on it.
     *
     * <p>Attachments are located by splitting the page HTML on
     * {@link #ATTACHMENT_MARKER}; every chunk after the first is expected to
     * contain an {@code href="...">name</} link. Chunks that do not have this
     * shape are skipped.
     *
     * @param br    browser used to render the page
     * @param frm   window hosting the browser (not used by this method)
     * @param url   address of the notice page
     * @param title notice title, used as the download directory name
     * @throws Exception if the thread is interrupted or the page cannot be parsed
     */
    public static void search2(final Browser br, JFrame frm, final String url, String title)
            throws Exception {
        Thread.sleep(REQUEST_DELAY_MILLIS);
        System.out.println("attrUrl" + url);

        Document doc = loadDocument(br, url);
        String[] strs = doc.html().split(ATTACHMENT_MARKER);

        // Chunk 0 is everything before the first marker and never holds an attachment.
        for (int i = 1; i < strs.length; i++) {
            String[] attachment = parseAttachment(strs[i]);
            if (attachment == null) {
                System.err.println("No attachment link found after marker #" + i + " on " + url);
                continue;
            }
            Thread.sleep(REQUEST_DELAY_MILLIS);

            String str = attachment[0];
            String firlname = attachment[1];
            System.out.println("title:" + title);
            System.out.println("str:" + str);
            System.out.println("firlname:" + firlname);
            runDownLoad(title, str, firlname);
            System.out.println("---------------------------------------");
        }
    }

    /**
     * Runs {@code runnable} (expected to start a navigation) and blocks until
     * the browser reports that the main frame finished loading, or until
     * {@link #PAGE_LOAD_TIMEOUT_SECONDS} elapsed.
     *
     * <p>A timeout is reported on standard error but does not raise an
     * exception, so callers continue with whatever HTML is available.
     *
     * @param browser  browser whose load events are observed
     * @param runnable action that triggers the page load
     */
    public static void invokeAndWaitReady(Browser browser, Runnable runnable) {
        final CountDownLatch latch = new CountDownLatch(1);
        LoadAdapter listener = new LoadAdapter() {
            @Override
            public void onFinishLoadingFrame(FinishLoadingEvent event) {
                if (event.isMainFrame()) {
                    latch.countDown();
                }
            }
        };
        browser.addLoadListener(listener);
        try {
            runnable.run();
            if (!latch.await(PAGE_LOAD_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
                System.err.println("Page did not finish loading within "
                        + PAGE_LOAD_TIMEOUT_SECONDS + " seconds; continuing with partial content");
            }
        } catch (InterruptedException e) {
            // Restore the flag so the caller's next blocking call sees the interruption.
            Thread.currentThread().interrupt();
        } finally {
            browser.removeLoadListener(listener);
        }
    }

    /**
     * Downloads {@code fileurl} to {@code c:/aa/<title>/<filename>}.
     *
     * <p>Both {@code title} and {@code filename} come from scraped HTML, so
     * characters that are illegal in file names (or that would escape the
     * download directory) are replaced before they are used as path segments.
     * Failures are printed and otherwise ignored so that the crawl continues.
     *
     * @param title    notice title; becomes the directory name
     * @param fileurl  absolute URL of the attachment
     * @param filename link text of the attachment; becomes the file name
     */
    public static void runDownLoad(String title, String fileurl, String filename) {
        File dir = new File(DOWNLOAD_ROOT + sanitizeFileName(title));
        File target = new File(dir, sanitizeFileName(filename));
        try {
            URLConnection con = new URL(fileurl).openConnection();
            con.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            con.setReadTimeout(READ_TIMEOUT_MILLIS);

            // Open the remote stream first so a failed request leaves no empty directory behind.
            try (InputStream input = con.getInputStream()) {
                if (!dir.isDirectory() && !dir.mkdirs()) {
                    throw new IOException("Cannot create directory " + dir);
                }
                try (OutputStream os = new FileOutputStream(target)) {
                    byte[] bs = new byte[1024 * 2];
                    int len;
                    while ((len = input.read(bs)) != -1) {
                        os.write(bs, 0, len);
                    }
                }
            }
        } catch (MalformedURLException e) {
            System.err.println("Invalid attachment URL: " + fileurl);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Download failed: " + fileurl + " -> " + target);
            e.printStackTrace();
        }
    }

    /** Navigates {@code br} to {@code url}, waits for the load to finish and parses the resulting HTML. */
    private static Document loadDocument(final Browser br, final String url) {
        try {
            invokeAndWaitReady(br, new Runnable() {
                @Override
                public void run() {
                    br.loadURL(url);
                }
            });
        } catch (Exception e) {
            // Best effort: fall through and parse whatever the browser currently holds.
            e.printStackTrace();
        }
        return Jsoup.parse(br.getHTML());
    }

    /**
     * Extracts the first {@code href="PATH">NAME</} link from an HTML chunk.
     *
     * @return {@code {absoluteUrl, name}}, or {@code null} when the chunk has no such link
     */
    private static String[] parseAttachment(String chunk) {
        String[] afterHref = chunk.split("href=\"");
        if (afterHref.length < 2) {
            return null;
        }
        String[] linkParts = afterHref[1].split("\">");
        if (linkParts.length < 2) {
            return null;
        }
        String rest = linkParts[1];
        int end = rest.indexOf("</");
        String name = (end >= 0) ? rest.substring(0, end) : rest;
        return new String[] {SITE_ROOT + linkParts[0], name};
    }

    /** Replaces characters that are not allowed in a single path segment and rejects empty or dot-only names. */
    private static String sanitizeFileName(String name) {
        String cleaned = (name == null)
                ? ""
                : name.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
        if (cleaned.isEmpty() || ".".equals(cleaned) || "..".equals(cleaned)) {
            return "unnamed";
        }
        return cleaned;
    }

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: