
Java Web Crawler: Implementing a Web Crawler with webmagic in Spring Boot

2019-01-03

I. Requirements

The business required a news-feed feature. The initial plan was to build it on top of a third-party service API, but after those negotiations failed we decided to develop our own crawler interface. After researching the available documentation, we chose the open-source webmagic framework to implement the crawling functionality ourselves.

II. Implementation

1. Add the dependencies

Add the following dependencies to the pom.xml file:

<!-- crawler -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

2. Create the crawler implementation

Create the implementation class; the code below is for reference only:

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import team.biteeny.admin.db.write.cache.ConfigMapper;
import team.biteeny.admin.db.write.mapper.CrawlMapper;
import team.biteeny.admin.db.write.model.CrawlModel;
import team.biteeny.push.getui.PushApp;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// PageProcessor implementation that crawls Huobi Info newsflashes and articles
@Component
public class HuobiInfoProcessor implements PageProcessor {

    @Autowired
    private CrawlMapper crawlMapper;

    @Autowired
    private ConfigMapper configMapper;

    private Site site;

    // Caches each article's list picture and title, keyed by article id, for use in insertNews()
    private static Map<String, String> map = new ConcurrentHashMap<>();

    // Dispatches each crawled page by URL: the flash list page, the article-list JSON API, and article detail pages
    @Override
    public void process(Page page) {
        if (page.getUrl().toString().contains("flash")) {
            insertFlash(page);
        }
        if (page.getUrl().toString().contains("article")) {
            // Parse the article-list JSON, cache each article's title and list picture,
            // and queue the detail pages as new target requests
            List<String> urlList = new ArrayList<>();
            Json json = page.getJson();
            JSONObject jsonObject = JSONObject.parseObject(json.toString());
            JSONArray jsonArray = jsonObject.getJSONObject("data").getJSONArray("data");
            for (Object o : jsonArray) {
                JSONObject object = JSONObject.parseObject(JSONObject.toJSONString(o));
                String key = "baseDetail_" + object.getString("id");
                urlList.add("https://www.huobiinfo.com/news/" + key);
                map.put(key + "_listPicturePath", object.getString("listPicturePath"));
                map.put(key + "_title", object.getString("title"));
            }
            page.addTargetRequests(urlList);
        }
        if (page.getUrl().toString().contains("news/baseDetail_")) {
            insertNews(page);
        }
    }

    @Override
    public Site getSite() {
        if (site == null) {
            site = Site.me().setDomain("www.huobiinfo.com")
                    .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36")
                    .setCharset("UTF-8")
                    .setSleepTime(500);
        }
        return site;
    }

    public static void main(String[] args) {
        // Commented-out snippets kept from local testing of the spider and the push filter:
//        Spider.create(new HuobiInfoProcessor()).addUrl("https://www.huobiinfo.com/flash/").runAsync();
//        Request request = new Request("https://huobi-news-app-gateway-outer.huobi.cn:8005/article/listPagedArticleListByParam");
//        request.setMethod(HttpConstant.Method.POST);
//        request.setRequestBody(HttpRequestBody.json("{\"pageSize\":10,\"pageNum\":1,\"categoryPcId\":15}","utf-8"));
//        Spider.create(new HuobiInfoProcessor()).addRequest(request).runAsync();
//        String title = "BTC链上基础指标略有回暖,链上场内场外交易均较活跃";
//        String c = "根据Searchain.io数据分析:昨日BTC从4100下降到3900点。从链上指标来看,昨日反映BTC内部价值的基础指标整体有所上升,新增地址上升14.89%,活跃地址上升12.20%。从链上交易指标来看,交易用户的活跃度也在上升,交易所流入增加49.16%,流出增加40.78%;链上大额转账的活跃程度集中在100-600 BTC区间,600+ BTC的转账有所下降,大额流入交易所占比有所上升,场内场外均比较活跃。综合链上基础和交易指标来看,近期BTC内部价值略有回暖,链上场内场外交易均活跃。独立分析师Edward对近期BTC市场呈较为悲观状态。\n" +
//                "只有币名和百分比,没有价格波动词,所以不符合推送条件";
//        boolean b = checkPush(title+c);
//        System.out.println(b);
    }

    // Parses the newsflash list page, skips items that already exist in the database,
    // applies the push filter, and persists each new flash item
    private void insertFlash(Page page) {
        Elements elements = page.getHtml().getDocument().getElementsByClass("item-flash");
        for (Element element : elements) {
            Html html = new Html(element.toString());
            String s = html.xpath("//div[@class='item-flash']//h3[@class='med']//nuxt-link/@to").toString();
            String key = s.substring(1, s.lastIndexOf("/")).replace("/", "_");
            if (crawlMapper.checkExist(key) <= 0) {
                String title = html.xpath("//div[@class='item-flash']//h3[@class='med']//nuxt-link/text()").toString();
                String content = html.xpath("//div[@class='item-flash']//div[@class='content']/text()").toString();
                CrawlModel model = new CrawlModel();
                boolean b = checkPush(title + content);
                model.setId(key);
                model.setBody(content);
                model.setTitle(title);
                model.setSource("HuobiInfo");
                model.setType("flash");
                if (b) {
                    model.setIs_push(true);
                    push(title, content);
                } else {
                    model.setIs_push(false);
                }
                model.setCreate_time(new Date());
                crawlMapper.crawlInsert(model);
            }
        }
    }

    // Parses an article detail page, prepends the source and notice fragments to the article body,
    // skips articles flagged by checkDomain(), and persists the rest
    private void insertNews(Page page) {
        String path = page.getUrl().toString();
        String key = path.substring(path.lastIndexOf("/") + 1);
        if (crawlMapper.checkExist(key) <= 0) {
            String source = "<div><p>来源:" + page.getHtml().xpath("//div[@class='detail-platform-msg']//p[@class='detail-platform']/text()").toString() + "</p></div>";
            String notice = "<div><em><span style='font-size: 12px;'>" +
                    page.getHtml().xpath("//div[@class='detail-source']/text()") +
                    "</span></em></div>";
            String article = page.getHtml().xpath("//div[@class='detail-content article-content hb-article']").toString();
            String content = source + notice + article;
            if (!checkDomain(article)) {
                CrawlModel model = new CrawlModel();
                model.setId(key);
                model.setTitle(map.get(key + "_title"));
                model.setBody(content);
                model.setList_picture(map.get(key + "_listPicturePath"));
                model.setSource("HuobiInfo");
                model.setType("news");
                model.setCreate_time(new Date());
                crawlMapper.crawlInsert(model);
            }
        }
    }

    // Decides whether a flash item should be pushed: a tracked coin plus a large-transfer/flow keyword,
    // any tracked exchange/vendor, or a tracked coin plus a price-movement word with a change above 5%
    private static boolean checkPush(String str) {
        if (str == null) {
            return false;
        }
        String regex = "btc|eth|bch|ltc|etc|eos|xrp|dash|trx";   // tracked coin names
        String regex1 = "涨|跌|涨幅|跌幅|上涨|下跌";                 // price-movement words (rise/fall)
        String regexF = "大额转账|净流入|净流出";                    // large transfer / net inflow / net outflow
        String regexH = "okex|火币|币安|比特大陆";                   // exchanges and vendors (OKEx, Huobi, Binance, Bitmain)
        String regex2 = "\\d+(\\.?\\d*?)(?=%)";                  // number immediately followed by a percent sign
        Pattern p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        Pattern p1 = Pattern.compile(regex1);
        Pattern pf = Pattern.compile(regexF);
        Pattern ph = Pattern.compile(regexH);
        Pattern p2 = Pattern.compile(regex2);
        Matcher matcher1 = p1.matcher(str);
        Matcher matcherF = pf.matcher(str);
        Matcher matcherH = ph.matcher(str);
        Matcher matcher2 = p2.matcher(str);
        // Evaluate the coin-name match once; calling find() twice on the same matcher would
        // look for a second occurrence and miss texts that mention a coin only once
        boolean hasCoin = p.matcher(str).find();
        if (hasCoin && matcherF.find()) {
            return true;
        }
        if (matcherH.find()) {
            return true;
        }
        if (hasCoin && matcher1.find()) {
            while (matcher2.find()) {
                Double d = Double.valueOf(matcher2.group());
                if (d > 5) {
                    return true;
                }
            }
            return false;
        }
        return false;
    }

    // Sends the push notification; the commented-out check limited pushes to certain hours of the day
    private void push(String title, String text) {
        // push-related logic
//        int hour = Calendar.getInstance().get(Calendar.HOUR);
//        if (hour >= 8 && hour <= 22){
//        }
    }

    // Returns true when the article body references images hosted on mmbiz.qpic.cn (the WeChat image CDN);
    // such articles are skipped in insertNews()
    private boolean checkDomain(String content) {
        if (content == null) {
            return false;
        }
        String pattern = "mmbiz\\.qpic\\.cn";
        Pattern p = Pattern.compile(pattern);
        Matcher m = p.matcher(content);
        return m.find();
    }
}

The above is a simple example that includes some filtering logic and data-persistence logic.
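Note that nothing in the listing actually starts the crawl: the main method contains only commented-out experiments, and because the processor relies on @Autowired mappers it must be used as the Spring-managed bean rather than created with new. Below is a minimal sketch of how the spider could be kicked off from Spring Boot on a schedule. The CrawlTask class name, the interval, and the thread count are assumptions; the seed URL and POST body mirror the commented-out main method above.

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.utils.HttpConstant;

// Hypothetical component that triggers the crawl periodically; names and interval are assumptions
@Component
public class CrawlTask {

    // Inject the Spring-managed processor so its CrawlMapper/ConfigMapper fields are populated
    @Autowired
    private HuobiInfoProcessor huobiInfoProcessor;

    // Run a crawl round every 10 minutes (interval is an assumption, adjust as needed)
    @Scheduled(fixedDelay = 10 * 60 * 1000)
    public void crawl() {
        Spider spider = Spider.create(huobiInfoProcessor)
                // Seed the newsflash list page handled by insertFlash()
                .addUrl("https://www.huobiinfo.com/flash/")
                .thread(2);

        // Seed the article-list API with a POST request, as in the commented-out main method
        Request request = new Request(
                "https://huobi-news-app-gateway-outer.huobi.cn:8005/article/listPagedArticleListByParam");
        request.setMethod(HttpConstant.Method.POST);
        request.setRequestBody(
                HttpRequestBody.json("{\"pageSize\":10,\"pageNum\":1,\"categoryPcId\":15}", "utf-8"));
        spider.addRequest(request);

        // Block until this crawl round finishes; use runAsync() for fire-and-forget
        spider.run();
    }
}

For @Scheduled to take effect, scheduling also has to be enabled on the application class with @EnableScheduling.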

III. Summary

The above is a simple example of implementing a web crawler in Java with webmagic. It does not use webmagic to its full potential and I have not studied the framework in depth; it simply meets the current business requirement. This post also does not describe webmagic itself in detail, so interested readers can consult the official documentation, and everyone is welcome to join the discussion.

My abilities are limited, so there are bound to be shortcomings; criticism and corrections are welcome. Keep grinding at the technology!
