Java Learning: Web Crawler -- Scraping Email Addresses from 中国人才热线 (cjol.com)
2014-04-05 02:10
A very crude email scraper with low crawling efficiency, written purely as coding practice.
1. Entry point
2. Regular expressions
3. Page content fetching
4. Multithreading
1. Entry point
public class Test01 {

    /**
     * Entry point: crawl the first 20 result pages of the CJOL search
     * for the keyword 外贸业务员 (depth 1).
     */
    public static void main(String[] args) {
        for (int i = 1; i <= 20; i++) {
            HtmlPage h1 = new HtmlPage(
                    "http://www.cjol.com/search/l2008/" + i
                            + "/?Keyword=%E5%A4%96%E8%B4%B8%E4%B8%9A%E5%8A%A1%E5%91%98&KeywordType=3&RecentSelected=43",
                    1);
            h1.pageCode();
            // Alternatively, hand each page to its own thread (see section 4):
            // new Thread(new mRunable(h1), "" + i).start();
        }
    }
}
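The Keyword parameter in the search URL is just the percent-encoded UTF-8 form of 外贸业务员 (foreign-trade salesperson). Rather than hard-coding the escapes, the URL could be built with URLEncoder; a minimal sketch, where SearchUrlDemo and buildSearchUrl are hypothetical helpers, not part of the original code:

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

public class SearchUrlDemo {

    // Hypothetical helper: builds a CJOL search URL for an arbitrary keyword and page number.
    static String buildSearchUrl(String keyword, int page) throws UnsupportedEncodingException {
        String encoded = URLEncoder.encode(keyword, "UTF-8");
        return "http://www.cjol.com/search/l2008/" + page
                + "/?Keyword=" + encoded + "&KeywordType=3&RecentSelected=43";
    }

    public static void main(String[] args) throws UnsupportedEncodingException {
        // Prints the same URL that Test01 hard-codes for page 1.
        System.out.println(buildSearchUrl("外贸业务员", 1));
    }
}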
2. Regular expressions
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Regx {

    /**
     * On a search-result page, find links to company pages.
     * pat = "href=\"http://[\\w-\\./]+\">[\u4e00-\u9fa5]*有限公司</a>"
     *
     * @param buf one line of page source
     */
    public synchronized static void findCompany(String buf) {
        List<String> companyList = new ArrayList<String>();
        Pattern pattern = Pattern
                .compile("href=\"http://[\\w-\\./]+\">[\u4e00-\u9fa5]*有限公司</a>");
        Matcher matcher = pattern.matcher(buf);
        Pattern innerPattern = Pattern.compile("http:\\S+\"");
        while (matcher.find()) {
            String string = matcher.group();
            Matcher innerMatcher = innerPattern.matcher(string);
            if (innerMatcher.find()) {
                String tmp = innerMatcher.group().replaceAll("\"", "");
                // Depth 2: fetch the company's recruiting page
                new HtmlPage(tmp, 2);
                String ttString = "公司招聘页面地址:" + tmp;
                System.out.println(ttString);
                HtmlPage.writLog(ttString);
            }
            companyList.add(string);
        }
    }

    /**
     * On a recruiting page, find the company's official website.
     *
     * @param buf one line of page source
     */
    public synchronized static void findWebSite(String buf) {
        List<String> webSiteList = new ArrayList<String>();
        Pattern pattern = Pattern.compile("网址:<a href=\"http://[\\w-\\./]+\"");
        Matcher matcher = pattern.matcher(buf);
        Pattern innerPattern = Pattern.compile("http:\\S+\"");
        while (matcher.find()) {
            String string = matcher.group();
            Matcher innerMatcher = innerPattern.matcher(string);
            if (innerMatcher.find()) {
                String tmp = innerMatcher.group().replaceAll("\"", "");
                String ttsString = "公司主页地址:" + tmp;
                System.out.println(ttsString);
                HtmlPage.writLog(ttsString);
                // Depth 3: fetch the company home page
                new HtmlPage(tmp, 3);
            }
            webSiteList.add(string);
        }
    }

    /**
     * On the company home page, look for a "contact us" link, e.g.
     * <a href="contactus.asp">联系方式</a> or href="contact.php">CONTACT US</a>.
     *
     * @param url base URL of the page
     * @param str one line of page source
     */
    public synchronized static void findContanct(String url, String str) {
        List<String> webSiteList = new ArrayList<String>();
        Pattern pattern = Pattern.compile("href=\"[\\w-\\./\\?=]+\">contact",
                Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(str);
        Pattern innerPattern = Pattern.compile("\".+\"");
        while (matcher.find()) {
            String string = matcher.group();
            String ttsString = "联系方式地址:" + string;
            System.out.println(ttsString);
            HtmlPage.writLog(ttsString);
            Matcher innerMatcher = innerPattern.matcher(string);
            if (innerMatcher.find()) {
                String tmp = innerMatcher.group().replaceAll("\"", "");
                // Depth 4: fetch the contact page
                new HtmlPage(url + "//" + tmp, 4);
            }
            webSiteList.add(string);
        }
    }

    /**
     * Extract e-mail addresses from one line of text, de-duplicate them,
     * log them and append them to 1.txt.
     *
     * @param str one line of page source
     * @return the addresses found (possibly empty)
     */
    public synchronized static List<String> email(String str) {
        Pattern pattern = Pattern
                .compile("[a-zA-Z0-9_.-]+@[a-zA-Z0-9-]+\\.[a-zA-Z]{2,4}");
        Matcher matcher = pattern.matcher(str);
        List<String> list = new ArrayList<String>();
        File file = new File("1.txt");
        RandomAccessFile rd = null;
        try {
            rd = new RandomAccessFile(file, "rw");
            rd.seek(file.length());
            while (matcher.find()) {
                String reString = matcher.group();
                if (HtmlPage.putEmail(reString)) {
                    System.out.println("邮箱: " + reString);
                    HtmlPage.writLog("邮箱: " + reString);
                    rd.write(reString.getBytes());
                    rd.write("\r\n".getBytes());
                    list.add(reString);
                }
            }
        } catch (IOException e) {
            HtmlPage.writLog("写邮箱失败:" + e.getMessage());
            System.out.println("邮箱写入失败:" + e.getMessage());
            e.printStackTrace();
        } finally {
            try {
                if (rd != null)
                    rd.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return list;
    }
}
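The e-mail pattern used by Regx.email() can be exercised on its own, without running the crawler; a minimal sketch, where the HTML fragment is a made-up example:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class EmailRegexDemo {
    public static void main(String[] args) {
        // Same pattern as Regx.email(): local part, '@', one domain label, 2-4 letter TLD.
        Pattern pattern = Pattern.compile("[a-zA-Z0-9_.-]+@[a-zA-Z0-9-]+\\.[a-zA-Z]{2,4}");
        // Hypothetical page fragment, only for illustration.
        String line = "<p>联系方式: hr@example.com 或 sales-01@example.com.cn</p>";
        Matcher matcher = pattern.matcher(line);
        while (matcher.find()) {
            System.out.println(matcher.group());
        }
        // Prints hr@example.com and sales-01@example.com (the pattern allows only one dot
        // in the domain, so the trailing ".cn" of the second address is not captured).
    }
}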
3. Page content fetching
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class HtmlPage {

    private String spec; // page URL
    private int depth;   // 1 = search page, 2 = recruiting page, 3 = home page, 4 = contact page
    private static List<String> emailList = new ArrayList<String>();

    public HtmlPage(String urlString, int depth) {
        this.spec = urlString;
        this.depth = depth;
        System.out.println("---------" + urlString + "----" + depth);
        HtmlPage.writLog("---------" + urlString + "----" + depth);
        // Depth-1 pages are fetched explicitly from main(); deeper pages fetch themselves.
        if (depth != 1)
            pageCode();
    }

    public void pageCode() {
        URL url = null;
        try {
            url = new URL(spec);
        } catch (MalformedURLException e) {
            HtmlPage.writLog(spec + " 初始化失败:" + e.getMessage());
            System.out.println("url初始化失败");
            e.printStackTrace();
            return;
        }
        HttpURLConnection connection;
        try {
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        } catch (IOException e) {
            HtmlPage.writLog(spec + " 打开网址失败:" + e.getMessage());
            System.out.println("打开网址失败");
            e.printStackTrace();
            return;
        }
        BufferedReader br = null;
        try {
            // Detect the page encoding from the Content-Type header (default gbk)
            String charset = getCharset(connection.getContentType());
            br = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), charset));
            String str = null;
            while ((str = br.readLine()) != null) {
                switch (depth) {
                case 1:
                    Regx.findCompany(str);
                    break;
                case 2:
                    Regx.findWebSite(str);
                    break;
                case 3:
                    Regx.findContanct(spec, str);
                    Regx.email(str);
                    break;
                case 4:
                    Regx.email(str);
                    break;
                default:
                    break;
                }
            }
        } catch (IOException e) {
            HtmlPage.writLog(spec + " 读取输入流:" + e.getMessage());
            System.out.println(e.getMessage());
            return;
        } finally {
            try {
                if (br != null)
                    br.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Parse the charset out of a Content-Type header; fall back to gbk.
     */
    private String getCharset(String contentType) {
        if (contentType == null)
            return "gbk";
        Pattern pattern = Pattern.compile("charset=.*");
        Matcher matcher = pattern.matcher(contentType);
        if (matcher.find())
            return matcher.group(0).split("charset=")[1];
        return "gbk";
    }

    /**
     * Record an e-mail address; returns false if it has been seen before.
     */
    public synchronized static boolean putEmail(String str) {
        if (!emailList.contains(str)) {
            emailList.add(str);
            return true;
        }
        return false;
    }

    /**
     * Append one line to log.txt.
     */
    public synchronized static void writLog(String str) {
        File file = new File("log.txt");
        RandomAccessFile rd = null;
        try {
            rd = new RandomAccessFile(file, "rw");
            rd.seek(file.length());
            rd.write(str.getBytes());
            rd.write("\r\n".getBytes());
        } catch (IOException e) {
            System.out.println("日志写入失败!");
            e.printStackTrace();
        } finally {
            try {
                if (rd != null)
                    rd.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Print every e-mail address collected so far.
     */
    public void email() {
        for (String email : emailList) {
            System.out.println(email);
        }
    }
}
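The encoding detection in getCharset() only inspects the Content-Type response header and falls back to gbk when nothing is declared. A minimal standalone sketch of the same parsing logic, where CharsetDemo and the header strings are made-up examples, not part of the original code:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CharsetDemo {

    // Same logic as HtmlPage.getCharset(): take whatever follows "charset=", default to gbk.
    static String getCharset(String contentType) {
        if (contentType == null)
            return "gbk";
        Matcher matcher = Pattern.compile("charset=.*").matcher(contentType);
        if (matcher.find())
            return matcher.group(0).split("charset=")[1];
        return "gbk";
    }

    public static void main(String[] args) {
        System.out.println(getCharset("text/html; charset=UTF-8")); // UTF-8
        System.out.println(getCharset("text/html"));                // gbk
        System.out.println(getCharset(null));                       // gbk
    }
}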
4. Multithreading
public class mRunable implements Runnable {

    private HtmlPage htmlPage;

    public mRunable() {
    }

    public mRunable(HtmlPage htmlPage) {
        this.htmlPage = htmlPage;
    }

    @Override
    public void run() {
        System.out.println("\n\n线程----" + Thread.currentThread().getName() + "开始了----\n\n");
        HtmlPage.writLog("线程" + Thread.currentThread().getName() + "开始运行");
        htmlPage.pageCode();
        System.out.println("\n\n线程----" + Thread.currentThread().getName() + "完成工作----\n\n");
        HtmlPage.writLog("线程" + Thread.currentThread().getName() + "运行结束");
    }
}
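The commented-out line in Test01 hands each HtmlPage to a raw Thread through mRunable. A bounded pool would keep the 20 search pages from all being fetched at once; a minimal sketch using ExecutorService, where PooledTest and the pool size of 5 are illustrative choices, not part of the original code:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class PooledTest {
    public static void main(String[] args) throws InterruptedException {
        // Arbitrary pool size: at most 5 search pages are fetched concurrently.
        ExecutorService pool = Executors.newFixedThreadPool(5);
        for (int i = 1; i <= 20; i++) {
            HtmlPage h1 = new HtmlPage(
                    "http://www.cjol.com/search/l2008/" + i
                            + "/?Keyword=%E5%A4%96%E8%B4%B8%E4%B8%9A%E5%8A%A1%E5%91%98&KeywordType=3&RecentSelected=43",
                    1);
            // mRunable.run() calls pageCode(), so depth-1 pages are crawled in the pool threads.
            pool.submit(new mRunable(h1));
        }
        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.HOURS);
    }
}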