Java微博搜索关键字采集
2016-04-09 02:32
435 查看
import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.text.SimpleDateFormat; import java.util.List; import java.util.Random; import java.util.concurrent.Callable; import org.apache.http.client.CookieStore; import org.apache.log4j.Logger; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.util.Cookie; public class SinaSearchCrawlerCommand implements Callable<Object> { private static Logger logger = Logger.getLogger(SinaSearchCrawlerCommand.class); private static String word="如家"; private static String cookiePath="E:\\学习\\微博爬虫\\cookie\\cookie.file"; private static String outputpath="E:\\学习\\微博爬虫\\"; //public Object call(){ public static void main(String[] args){ try { word= java.net.URLEncoder.encode(word, "utf-8"); } catch (UnsupportedEncodingException e2) { // TODO Auto-generated catch block e2.printStackTrace(); } WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17); webClient.getCookieManager().setCookiesEnabled(true); for(int i=1;i<=100;i++){ System.out.println(cookiePathAppendRandom()); File file = new File(cookiePathAppendRandom()); if (file.exists()) { FileInputStream fin = null; try { fin = new FileInputStream(file); } catch (FileNotFoundException e1) { e1.printStackTrace(); } CookieStore cookieStore = null; ObjectInputStream in; try { in = new ObjectInputStream(fin); cookieStore = (CookieStore) in.readObject(); in.close(); } catch (IOException e) { logger.error(e); } catch (ClassNotFoundException e) { logger.error(e); } List<org.apache.http.cookie.Cookie> l = cookieStore.getCookies(); for (org.apache.http.cookie.Cookie temp : l) { Cookie cookie = new Cookie(temp.getDomain(), temp.getName(), temp.getValue(), temp.getPath(), temp.getExpiryDate(), false); webClient.getCookieManager().addCookie(cookie); } /*HtmlPage page = null; try { page = webClient.getPage("http://weibo.cn/search/?tf=5_012"); } catch (FailingHttpStatusCodeException e) { logger.error(e); } catch (MalformedURLException e) { logger.error(e); } catch (IOException e) { logger.error(e); } HtmlForm form = page.getForms().get(0); HtmlSubmitInput button = form.getInputByName("smblog"); form.getInputByName("keyword").setValueAttribute(word); logger.info("search:" + word); try { page = button.click(); } catch (IOException e1) { logger.error(e1); }*/ HtmlPage page = null; try { //logger.info("execution:"+this); page = webClient.getPage("http://weibo.cn/search/mblog?hideSearchFrame=&keyword="+word+"&page="+i); } catch (FailingHttpStatusCodeException e) { logger.error(e); } catch (MalformedURLException e) { logger.error(e); } catch (IOException e) { logger.error(e); } SimpleDateFormat dayformat = new SimpleDateFormat("yyyyMMdd"); long start = System.currentTimeMillis(); start = System.currentTimeMillis(); String path = null; File file2 = null; path = new String(outputpath + "/" + dayformat.format(start) + "/" + System.currentTimeMillis() + file.getName()+".html" ); file2 = new File(outputpath + "/" + dayformat.format(start)); if (!file2.exists()) file2.mkdirs(); file2 = new File(path); System.out.println("当前页"+i+",采集至"+path); if (file2.exists()) logger.warn("outfile exit!"); else { FileOutputStream outputStream; try { outputStream = new FileOutputStream(file2); outputStream.write(page.getWebResponse().getContentAsString().getBytes()); outputStream.close(); } catch (FileNotFoundException e) { logger.error(e); } catch (IOException e) { logger.error(e); } } webClient.closeAllWindows(); } else { logger.warn("CookiePath doesn`t exit !!!"); } logger.info("execution:"); try { Thread.sleep(10000); } catch (InterruptedException e) { logger.error(e); return; } } return; } private static String cookiePathAppendRandom() { Random random = new Random(); return cookiePath+random.nextInt(7); } public SinaSearchCrawlerCommand(String word, String cookiePath, String outputpath) { if(word.contains("&")) { word = word.replace("&", " "); } this.word = word; this.cookiePath = cookiePath; this.outputpath = outputpath; } @Override public String toString() { return "SinaSearchCrawlerCommand [word=" + word + ", outputpath=" + outputpath + "]"; } @Override public Object call() throws Exception { // TODO Auto-generated method stub return null; } }
相关文章推荐
- Java获取新浪微博cookies
- java程序控制台编译成功但无法运行,报“找不到或无法加载主类”错误!
- 开发问题记录之Cookie
- 【java基础】String中的equal与==的区别
- #6JAVA异常处理方案#
- MyEclipse搭建struts2环境
- Java 中 hashCode 和 equals 方法 小结
- Java自定义标签
- java基础-----2016.4.9(1)
- java 哪个函数都可以操作类变量 / 类函数只能操作类变量
- MyBatis(8)——Mybatis3.x与Spring4.x整合
- spring 容器中TypeConverter对象的使用
- eclipse集成Resin服务器图文教程
- 深入理解Java的接口和抽象类
- Java开发环境中使用CKEditor集成
- spring packagesToScan无法扫描到jar中类的解决办法
- Java基础语言(4)
- Java——面向对象(一)
- Maven修改全局和局部JDK版本
- eclipse maven plugin 插件 安装 和 配置