Java爬虫初体验:简单抓取IT之家热评(整合Spring Boot+Elasticsearch+Redis+Mybatis)
2017-08-27 17:44
1006 查看
爬取主程序
使用Jsoup解析网页源代码
@Component public class WebCrawler { private static final String encoding = "utf-8"; @Autowired private HotCommentMapper hotCommentMapper; @Autowired private RedisService redisService; @Autowired private EsService esService; private static boolean done = false; private static final int THREAD_NUM = 15; private static AtomicInteger page = new AtomicInteger(0); private static List<String> breakpoints; /** * 定时爬取更新 */ //@Scheduled(initialDelay = 1000, fixedRate = 1000*60*60*24*3) public void start(){ done = false; System.out.println("开始爬取:"+System.currentTimeMillis()); for (int i = 0;i<THREAD_NUM;++i){ new Thread(new Runnable() { @Override public void run() { while (!done) { int p = page.incrementAndGet(); crawl(p); } System.out.println(Thread.currentThread().getName()+":结束:"+System.currentTimeMillis()); } },"Thread--"+i).start(); } } public synchronized void stop(){ done = true; redisService.listRemove("ithome:breakpoints"); redisService.listAdd("ithome:breakpoints",breakpoints); } /** * @param page :页码 */ public void crawl(int page){ String url = "https://www.ithome.com/ithome/getajaxdata.aspx?" + "page="+page+"&type=indexpage&randnum="+Math.random(); String src = getHtmlSrc(url,encoding); List<String> links = getArticleLinks(src); if (links.size()<=0){ stop(); return ; } //不知还有没更好的方法判断最近一次抓取的位置? 
if(redisService.containsValue("ithome:breakpoints",links)){ stop(); return ; } //保存第一页链接做结束点 if (page == 1){ breakpoints = links; } for (String link:links){ parseAndSaveHotComments(link); } } /** * * @param url * @param encoding 编码 * @return 网页源代码 */ public String getHtmlSrc(String url,String encoding){ StringBuilder src = new StringBuilder(); InputStreamReader isr = null; try { URL urlObj = new URL(url);//建立网络链接 URLConnection urlConn = urlObj.openConnection();//打开链接 isr = new InputStreamReader(urlConn.getInputStream(),encoding);//建立文件输入流 BufferedReader reader = new BufferedReader(isr);//建立缓冲 String line = null; while ((line = reader.readLine())!=null){ src.append(line); } }catch (Exception e){ e.printStackTrace(); }finally { try { if (isr != null){ isr.close(); } }catch (Exception e){ e.printStackTrace(); } } return src.toString(); } /** * @param srcCode * @return 解析源代码,获取文章链接 */ public List<String> getArticleLinks(String srcCode){ List<String> links = new ArrayList<String>(); Document document = Jsoup.parse(srcCode); Elements articleEls = document.select("h2>a"); for (Element el:articleEls){ String href = el.attr("href"); links.add(href); } return links; } /** * * @param articleHref 文章链接 * @description 使用Jsoup解析热评内容并保存 */ public void parseAndSaveHotComments(String articleHref){ String articlePage = getHtmlSrc(articleHref,encoding); Document document = Jsoup.parse(articlePage); Element iframeEl = document.getElementById("ifcomment"); if(iframeEl == null) { return ; } String commentHref = iframeEl.attr("src");//评论页面URL //获取文章ID document = Jsoup.parse(getHtmlSrc("http:"+commentHref,encoding)); Element articleIdInput = document.getElementById("newsid"); String articleId = articleIdInput.attr("value"); //获取热评数据并解析 String link = "http://dyn.ithome.com/ithome/getajaxdata.aspx?newsID="+articleId+"&type=hotcomment"; String hotCommentPage = getHtmlSrc(link,encoding); document = Jsoup.parse(hotCommentPage); Elements hotCommentEls = document.select("li.entry"); HotComment 
hotComment = null; for (Element el:hotCommentEls){ hotComment = new HotComment(); String commontId = el.attr("cid"); String user = el.select("strong.nick a").text(); String comment = el.getElementsByTag("P").text(); int up = getNumber(el.select("a.s").text()); int down = getNumber(el.select("a.a").text()); String posandtime = el.select("span.posandtime").text(); String mobile = el.select("span.mobile a").text(); hotComment = new HotComment(); hotComment.setCommentId(commontId); hotComment.setArticleUrl(articleHref); hotComment.setUser(user); hotComment.setComment(comment); hotComment.setUp(up); hotComment.setDown(down); hotComment.setPosandtime(posandtime); hotComment.setMobile(mobile); hotCommentMapper.addHotComment(hotComment);//保存数据至数据库,这里保不保存其实都可以 esService.addHotComment(hotComment);//添加索引 if(hotComment.getUp()>=2500){ redisService.rankAdd("ithome:hotrank",hotComment);//缓存大于2500个赞的热评 } //System.out.println(hotComment.toString()); } } /** * * @param str * @return 解析"()"中的数字 */ public int getNumber(String str){ Pattern pattern = Pattern.compile("(?<=\\()(.+?)(?=\\))"); Matcher matcher = pattern.matcher(str); if(matcher.find()){ return Integer.parseInt(matcher.group()); } return 0; } // public static void main(String [] args){ // new WebCrawler().start(); // } }
整合Elasticsearch
相关依赖:
<!--es全文搜索--> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-data-elasticsearch</artifactId> </dependency>
application.yml配置:
spring:
  data:
    ##elasticsearch配置
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300
代码部分:
Repository
/**
 * Spring Data Elasticsearch repository for {@link HotComment} documents keyed by {@code Long}.
 */
public interface EsRepository extends ElasticsearchRepository<HotComment, Long> {

    /**
     * Derived query on the {@code user} property.
     *
     * @param user user name to search for
     * @return the matching hot comments
     */
    List<HotComment> findByUser(String user);
}
实体
@Document(indexName="hotcomments",type="hotcomment",indexStoreType="fs",shards=5,replicas=1,refreshInterval="-1") public class HotComment implements Serializable{ private static final long serialVersionUID = -4249699545233058684L; @Id private Long id;//热评编号 private String commentId; private String user;//用户 private String comment;//内容 private int up;//支持数 private int down;//反对数 private String posandtime;//位置和时间 private String mobile;//设备 private String articleUrl;//源文章地址 public Long getId() { return id; } public void setId(Long id) { this.id = id; } public String getCommentId() { return commentId; } public void setCommentId(String commentId) { this.commentId = commentId; } public String getUser() { return user; } public void setUser(String user) { this.user = user; } public String getComment() { return comment; } public void setComment(String comment) { this.comment = comment; } public int getUp() { return up; } public void setUp(int up) { this.up = up; } public int getDown() { return down; } public void setDown(int down) { this.down = down; } public String getPosandtime() { return posandtime; } public void setPosandtime(String posandtime) { this.posandtime = posandtime; } public String getMobile() { return mobile; } public void setMobile(String mobile) { this.mobile = mobile; } public String getArticleUrl() { return articleUrl; } public void setArticleUrl(String articleUrl) { this.articleUrl = articleUrl; } @Override public String toString() { return "HotComment{" + "id='" + id + '\'' + "commentId='" + commentId + '\'' + ", user='" + user + '\'' + ", comment='" + comment + '\'' + ", up=" + up + ", down=" + down + ", posandtime='" + posandtime + '\'' + ", mobile='" + mobile + '\'' + ", articleUrl='" + articleUrl + '\'' + '}'; } }
Service
/**
 * Service facade over {@link EsRepository} for indexing and searching hot comments.
 */
@Service
public class EsService {

    @Autowired
    private EsRepository esRepository;

    /**
     * Saves the given hot comment through the repository.
     *
     * @param hotComment comment to index
     */
    public void addHotComment(HotComment hotComment) {
        esRepository.save(hotComment);
    }

    /**
     * Looks up hot comments by user; the result is cached in the {@code ithome:hotcomments}
     * cache under the key {@code ithome:user:<user>}.
     *
     * @param user user name to search for
     * @return the hot comments returned by the repository
     */
    @Cacheable(value = "ithome:hotcomments", key = "'ithome:user:'+#user")
    public List<HotComment> findByUser(String user) {
        List<HotComment> found = esRepository.findByUser(user);
        return found;
    }
}
整合Redis
相关依赖
<!-- redis --> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-data-redis</artifactId> </dependency>
相关配置
application.yml配置
spring:
  ##redis配置
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0
缓存相关配置
@Configuration @EnableCaching public class RedisConfig { @Bean public KeyGenerator keyGenerator(){ return new KeyGenerator(){ @Override public Object generate(Object o, Method method, Object... objects) { StringBuilder sb = new StringBuilder(); sb.append(o.getClass().getName()); sb.append(method.getName()); for(Object obj : objects){ sb.append(obj.toString()); } return sb.toString(); } }; } @Bean public CacheManager cacheManager(RedisTemplate redisTemplate){ RedisCacheManager redisCacheManager = new RedisCacheManager(redisTemplate); // redisCacheManager.setDefaultExpiration(60*60*24);//缓存失效时间,单位:s Map<String,Long> map = new HashMap<>(); map.put("ithome:hotcomments",60*60*24L); return redisCacheManager; } @Bean public RedisTemplate<String,String> redisTemplate(RedisConnectionFactory factory){ StringRedisTemplate template = new StringRedisTemplate(factory); Jackson2JsonRedisSerializer jackson2JsonRedisSerializer = new Jackson2JsonRedisSerializer(Object.class); ObjectMapper om = new ObjectMapper(); om.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY); om.enableDefaultTyping(ObjectMapper.DefaultTyping.NON_FINAL); jackson2JsonRedisSerializer.setObjectMapper(om); template.setValueSerializer(jackson2JsonRedisSerializer); template.afterPropertiesSet(); return template; } }
代码部分:
Service
/**
 * Redis helper for the hot comment ranking (sorted set) and the crawl breakpoints (list).
 */
@Service
public class RedisService {

    @Autowired
    @SuppressWarnings("rawtypes")
    private RedisTemplate redisTemplate;

    /**
     * Adds a hot comment to the sorted set, scored by its up-vote count.
     *
     * @param key        sorted set key
     * @param hotComment comment to add
     */
    @SuppressWarnings("unchecked")
    public void rankAdd(String key, HotComment hotComment) {
        ZSetOperations<String, HotComment> zSetOperations = redisTemplate.opsForZSet();
        zSetOperations.add(key, hotComment, hotComment.getUp());
    }

    /**
     * Returns the {@code top} highest-scored entries of the sorted set.
     *
     * @param key sorted set key
     * @param top maximum number of entries to return
     * @return at most {@code top} entries, highest score first; empty if {@code top <= 0}
     */
    @SuppressWarnings("unchecked")
    public Set<HotComment> rankGet(String key, int top) {
        if (top <= 0) {
            return java.util.Collections.emptySet();
        }
        ZSetOperations<String, HotComment> zSetOperations = redisTemplate.opsForZSet();
        // range() is ascending and its end index is inclusive, so range(0, top) returned the
        // top+1 LOWEST scores; reverseRange(0, top - 1) yields the top entries.
        return zSetOperations.reverseRange(key, 0, top - 1);
    }

    /**
     * Appends values to the list; used to store the position of the latest crawl.
     *
     * @param key    list key
     * @param values values to append; nothing is written when {@code null} or empty
     */
    @SuppressWarnings("unchecked")
    public void listAdd(String key, List<String> values) {
        if (values == null || values.isEmpty()) {
            return;
        }
        ListOperations<String, String> listOperations = redisTemplate.opsForList();
        listOperations.rightPushAll(key, values);
    }

    /**
     * Deletes the given key.
     *
     * @param key key to delete
     */
    @SuppressWarnings("unchecked")
    public void listRemove(String key) {
        redisTemplate.delete(key);
    }

    /**
     * Checks whether the list contains at least one of the given values; used to detect the
     * position where the previous crawl ended.
     *
     * @param key    list key
     * @param values candidate values
     * @return {@code true} if any value is present in the list
     */
    @SuppressWarnings("unchecked")
    public boolean containsValue(String key, List<String> values) {
        if (values == null || values.isEmpty()) {
            return false;
        }
        ListOperations<String, String> listOperations = redisTemplate.opsForList();
        List<String> stored = listOperations.range(key, 0, -1);
        if (stored == null || stored.isEmpty()) {
            return false;
        }
        for (String val : values) {
            if (stored.contains(val)) {
                return true;
            }
        }
        return false;
    }
}
整合Mybatis
依赖配置
<!-- mybatis --> <dependency> <groupId>org.mybatis.spring.boot</groupId> <artifactId>mybatis-spring-boot-starter</artifactId> <version>1.3.1</version> </dependency> <!-- mysql驱动 --> <dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <scope>runtime</scope> </dependency> <!--数据源依赖--> <dependency> <groupId>com.alibaba</groupId> <artifactId>druid</artifactId> <version>1.0.24</version> </dependency>
数据源配置
/**
 * Database configuration: registers the MyBatis mappers found in
 * {@code com.crazy.ithomecrawler.mybatis.mapper} and provides the data source.
 */
@Configuration
@MapperScan(basePackages = "com.crazy.ithomecrawler.mybatis.mapper")
public class DatabaseConfig {

    /**
     * Creates the Druid data source for the local {@code ithome} MySQL database.
     *
     * <p>NOTE(review): URL and credentials are hard-coded here; consider moving them to
     * external configuration.
     *
     * @return the configured data source
     */
    @Bean
    public DataSource druidDataSource() {
        DruidDataSource druid = new DruidDataSource();
        druid.setDriverClassName("com.mysql.jdbc.Driver");
        druid.setUrl("jdbc:mysql://localhost:3306/ithome");
        druid.setUsername("root");
        druid.setPassword("mysql");
        return druid;
    }
}
代码部分:
Mapper
/**
 * MyBatis mapper for the {@code hot_comment} table.
 */
public interface HotCommentMapper {

    /**
     * Inserts one hot comment; the generated key of column {@code id} is written back to the
     * {@code id} property of the argument.
     *
     * @param hotComment comment to insert
     */
    @Insert("INSERT INTO hot_comment(vCommentId,vUser,vComment,iUp,iDown,vPosandTime,vMobile,vArticleUrl) VALUES(#{commentId},#{user},#{comment},#{up},#{down},#{posandtime},#{mobile},#{articleUrl})")
    @Options(useGeneratedKeys = true, keyProperty = "id", keyColumn = "id")
    void addHotComment(HotComment hotComment);
}
Controller
/**
 * Web endpoints under {@code /ithome}; both render the {@code search} view with a
 * {@code comments} model attribute.
 */
@Controller
@RequestMapping("/ithome")
public class HotCommentController {

    @Autowired
    private RedisService redisService;

    @Autowired
    private EsService esService;

    /**
     * Home page: shows the entries obtained from the {@code ithome:hotrank} ranking via
     * {@code rankGet} with {@code top = 50}.
     *
     * @return the {@code search} view populated with the ranking entries
     */
    @GetMapping("/index")
    public ModelAndView index() {
        Set<HotComment> ranked = redisService.rankGet("ithome:hotrank", 50);
        return new ModelAndView("search").addObject("comments", ranked);
    }

    /**
     * Search: looks up hot comments whose user matches the keyword.
     *
     * @param keyword user name taken from the request path
     * @return the {@code search} view populated with the matching comments
     */
    @GetMapping("/search/{keyword}")
    public ModelAndView search(@PathVariable("keyword") String keyword) {
        List<HotComment> matches = esService.findByUser(keyword);
        return new ModelAndView("search").addObject("comments", matches);
    }
}
主程序
/**
 * Spring Boot entry point; enables Elasticsearch repositories and scheduling.
 */
@SpringBootApplication
@EnableElasticsearchRepositories
@EnableScheduling
public class IthomecrawlerApplication {

    /**
     * Boots the application.
     *
     * @param args command-line arguments forwarded to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(IthomecrawlerApplication.class, args);
    }
}
完整application.yml文件
#端口号
server:
  port: 8081
spring:
  data:
    ##elasticsearch配置
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300
  ##redis配置
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0
  ##freemarker配置
  freemarker:
    ##是否允许属性覆盖
    allow-request-override: false
    allow-session-override: false
    cache: true
    check-template-location: true
    content-type: text/html
    ##暴露request属性
    expose-request-attributes: false
    expose-session-attributes: false
    expose-spring-macro-helpers: false
    suffix: .ftl
    template-loader-path: classpath:/templates/
    request-context-attribute: request
    settings:
      classic_compatible: true
      locale: zh_CN
      date_format: yyyy-MM-dd
      time_format: HH:mm:ss
      datetime_format: yyyy-MM-dd HH:mm:ss
完整pom.xml文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion>
<groupId>comcrazy</groupId>
<artifactId>ithomecrawler</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>ithomecrawler</name>
<description>ITHome Crawler.</description>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>1.5.6.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!--es全文搜索--> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-data-elasticsearch</artifactId> </dependency>
<!-- redis --> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-data-redis</artifactId> </dependency><!-- mybatis --> <dependency> <groupId>org.mybatis.spring.boot</groupId> <artifactId>mybatis-spring-boot-starter</artifactId> <version>1.3.1</version> </dependency> <!-- mysql驱动 --> <dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <scope>runtime</scope> </dependency> <!--数据源依赖--> <dependency> <groupId>com.alibaba</groupId> <artifactId>druid</artifactId> <version>1.0.24</version> </dependency>
<!-- jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.3</version>
</dependency>
<!-- freemarker -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-freemarker</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<finalName>ithomecrawler</finalName>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
完整代码
相关文章推荐
- java springboot与redis整合
- java鬼混笔记:springboot 9、springboot整合mybatis加上分页功能
- 简单的SpringBoot+Mybatis框架整合
- SpringBoot+Redis+Mybatis+AngularJS整合开发
- SpringBoot整合mybatis、shiro、redis实现基于数据库的细粒度动态权限管理系统实例
- 【spring boot】SpringBoot初学(8)– 简单整合redis
- SpringCloud SpringBoot mybatis 分布式微服务(二十四)整合Redis
- spring boot 整合mybatis、redis、 spring mvc基本配置
- java鬼混笔记:springboot 7、springboot整合mybatis后再加上druid数据库连接池
- Spring Boot、Mybatis框架整合开发Java RESTful Web Service
- SpringBoot整合mybatis、shiro、redis实现基于数据库的细粒度动态权限管理系统实例(转)
- spring boot mybatis 简单整合使用
- SpringBoot整合mybatis、shiro、redis实现基于数据库的细粒度动态权限管理系统实例
- springboot整合mybatis一个简单的demo
- java spring+mybatis整合实现爬虫之《今日头条》搞笑动态图片爬取
- SpringBoot整合mybatis、shiro、redis实现基于数据库的细粒度动态权限管理系统实例
- spring boot mybatis 整合shiro简单实现登陆权限管理
- java spring+mybatis整合实现爬虫之《今日头条》搞笑动态图片爬取
- java鬼混笔记:springboot 6、springboot整合mybatis(支持多数源)
- springBoot整合myBatis错误:java.lang.NoSuchMethodException