您的位置:首页 > 其它

使用正则表达式获取部分文本信息

2017-01-12 12:23 501 查看
通过xpath可以在控制台得到如下信息:

总播放数:16,961,138,348
评论:1,254,731
顶:13,754,080

但是,我并不需要前面的文字,只需要后面的数字,因此还需要使用正则表达式对获取到的信息做进一步的处理。

分析:每条信息前面是文字标签,后面是由数字和逗号组成的数值。因此,正则表达式要表达的意思是:跳过前面的文字部分,只截取后面的数值部分,匹配规则为"数字或逗号"出现一次或多次,即 [\d,]+

所以,它们正则表达式为:

// The text parsed from the page looks like: 总播放数:16,931,628,832, so a regular
// expression is used to pull out just the number.
// Each expression is a lookbehind on the label followed by a run of digits and
// commas; the label must precede the match but is not part of the matched text.
private String allnumberRegex = "(?<=总播放数:)[\\d,]+";
private String commentnumberRegex = "(?<=评论:)[\\d,]+";
private String supportnumberRegex = "(?<=顶:)[\\d,]+";

其中,(?<=顶:) 是零宽正向后行断言(lookbehind):它要求匹配位置的前面必须紧跟着"顶:",但"顶:"本身不会出现在匹配结果中,因此 group(0) 得到的只有后面的数字部分。

之后,利用java正则表达式Pattern和Matcher知识,对它进行正则匹配,最后获取到想要的信息。代码修改如下:

1. 页面解析接口:

package com.dajiangtai.djt_spider.service;

import com.dajiangtai.djt_spider.entity.Page;

/**
 * Contract for page parsers.
 *
 * @author Administrator
 */
public interface IProcessService {

    /**
     * Parses the given page.
     *
     * @param page the page to parse
     */
    void process(Page page);
}

2.优酷网站解析实现类:

package com.dajiangtai.djt_spider.service.impl;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.IProcessService;
import com.dajiangtai.djt_spider.util.HtmlUtil;
import com.dajiangtai.djt_spider.util.LoadPropertyUtil;
import com.dajiangtai.djt_spider.util.RegexUtil;

/**
* 优酷页面解析实现类
* @author Administrator
*
*/
public class YOUKUProcessService implements IProcessService{

//这里解析得到的是 : 总播放数:16,931,628,832,因此采用正则表达式获取数字
private String allnumberRegex = "(?<=总播放数:)[\\d,]+";
private String commentnumberRegex = "(?<=评论:)[\\d,]+";
private String supportnumberRegex = "(?<=顶:)[\\d,]+";

//获取到的总播放数:16,960,789,989 其xpath为:
// /html/body/div[4]/div/div[1]/div[2]/div[2]/ul/li[11]
//这里ul最近的div[2]其class为"p-base",因此,仿写其他xpath,改成如下相对路径
private String parseAllNumber = "/body/div/div/div/div/div/ul/li[11]";
//评论数
private String parseCommentNumber = "//div[@class=\"p-base\"]/ul/li[12]";
//赞数
private String parseSupportNumber = "//div[@class=\"p-base\"]/ul/li[13]";

public void process(Page page) {

String content = page.getContent();
HtmlCleaner htmlCleaner = new HtmlCleaner();
//利用htmlCleaner对网页进行解析,得到根节点
TagNode rootNode = htmlCleaner.clean(content);
try {
Object[] evaluateXPath = rootNode.evaluateXPath(parseAllNumber);
if(evaluateXPath.length>0){
TagNode node = (TagNode)evaluateXPath[0];
Pattern numberPattern = Pattern.compile(allnumberRegex,Pattern.DOTALL);
Matcher matcher = numberPattern.matcher(node.getText().toString());
if(matcher.find()){
System.out.println("总播放数量为:"+matcher.group(0));
}
}

evaluateXPath = rootNode.evaluateXPath(parseCommentNumber);
if(evaluateXPath.length>0){
TagNode node = (TagNode)evaluateXPath[0];
Pattern numberPattern = Pattern.compile(commentnumberRegex,Pattern.DOTALL);
Matcher matcher = numberPattern.matcher(node.getText().toString());
if(matcher.find()){
System.out.println("总评论数量为:"+matcher.group(0));
}
}

evaluateXPath = rootNode.evaluateXPath(parseSupportNumber);
if(evaluateXPath.length>0){
TagNode node = (TagNode)evaluateXPath[0];
Pattern numberPattern = Pattern.compile(supportnumberRegex,Pattern.DOTALL);
Matcher matcher = numberPattern.matcher(node.getText().toString());
if(matcher.find()){
System.out.println("总赞为:"+matcher.group(0));
}
}
} catch (XPatherException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}

}

3.同样,调用爬虫入口类,测试:

package com.dajiangtai.djt_spider.start;

import com.dajiangtai.djt_spider.entity.Page;
import com.dajiangtai.djt_spider.service.IDownLoadService;
import com.dajiangtai.djt_spider.service.IProcessService;
import com.dajiangtai.djt_spider.service.IStoreService;
import com.dajiangtai.djt_spider.service.impl.ConsoleStoreService;
import com.dajiangtai.djt_spider.service.impl.HttpClientDownLoadService;
import com.dajiangtai.djt_spider.service.impl.YOUKUProcessService;

/**
 * Entry point of the TV-series crawler: downloads one show page and hands it
 * to the page parser.
 *
 * @author Administrator
 */
public class StartDSJCount {

    // Show page crawled when no URL is given on the command line.
    private static final String DEFAULT_URL =
            "http://list.youku.com/show/id_z9cd2277647d311e5b692.html?spm=a2h0j.8191423.sMain.5~5~A!2.iCUyO9";

    // Page download service
    private IDownLoadService downLoadService;

    // Page parsing service
    private IProcessService processService;

    /**
     * Wires the download and parsing services, downloads the page and parses it.
     *
     * @param args optional; {@code args[0]} overrides the default show-page URL
     */
    public static void main(String[] args) {
        StartDSJCount dsj = new StartDSJCount();
        dsj.setDownLoadService(new HttpClientDownLoadService());
        dsj.setProcessService(new YOUKUProcessService());

        String url = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;

        // Download the page
        Page page = dsj.downloadPage(url);
        if (page == null) {
            // Nothing to parse; report it instead of failing inside the parser.
            System.err.println("Download returned no page for: " + url);
            return;
        }

        // Parse the page
        dsj.processPage(page);
    }

    /**
     * Downloads a page by delegating to the configured download service.
     *
     * @param url address of the page to download
     * @return whatever the download service returns for {@code url}
     */
    public Page downloadPage(String url) {
        return this.downLoadService.download(url);
    }

    /**
     * Parses a page by delegating to the configured parsing service.
     *
     * @param page the page to parse
     */
    public void processPage(Page page) {
        this.processService.process(page);
    }

    public IDownLoadService getDownLoadService() {
        return downLoadService;
    }

    public void setDownLoadService(IDownLoadService downLoadService) {
        this.downLoadService = downLoadService;
    }

    public IProcessService getProcessService() {
        return processService;
    }

    public void setProcessService(IProcessService processService) {
        this.processService = processService;
    }

}

测试结果为:

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐