您的位置:首页 > 数据库 > Redis

Java爬虫初体验:简单抓取IT之家热评(整合Spring Boot+Elasticsearch+Redis+Mybatis)

2017-08-27 17:44 1006 查看

爬取主程序

使用Jsoup解析网页源代码

/**
 * Multi-threaded crawler for ITHome hot comments.
 *
 * <p>Worker threads take listing pages from a shared page counter, follow every article link on
 * the page and pass each hot comment to MyBatis, Elasticsearch and, for highly up-voted comments,
 * a Redis ranking. A run ends when a listing page yields no links or when a page contains a link
 * stored under the Redis "breakpoints" key (the first-page links of an earlier run).
 */
@Component
public class WebCrawler {

    /** Charset used to decode every page this crawler fetches. */
    private static final String DEFAULT_ENCODING = "utf-8";
    /** Redis list key that stores the first-page links of the latest run. */
    private static final String BREAKPOINTS_KEY = "ithome:breakpoints";
    /** Number of worker threads started by {@link #start()}. */
    private static final int THREAD_NUM = 15;
    /** Matches the text between the first pair of parentheses; compiled once and reused. */
    private static final Pattern PARENTHESIZED = Pattern.compile("(?<=\\()(.+?)(?=\\))");

    @Autowired
    private HotCommentMapper hotCommentMapper;
    @Autowired
    private RedisService redisService;
    @Autowired
    private EsService esService;

    // volatile: set by whichever thread calls stop() and polled by every worker loop.
    private static volatile boolean done = false;
    private static final AtomicInteger page = new AtomicInteger(0);
    // volatile: assigned by the worker that handles page 1, read by the worker that calls stop().
    private static volatile List<String> breakpoints;

    /**
     * Starts {@code THREAD_NUM} worker threads that crawl listing pages until {@link #stop()} is
     * called. Intended to be triggered periodically (see the disabled annotation below).
     */
    //@Scheduled(initialDelay = 1000, fixedRate = 1000*60*60*24*3)
    public void start() {
        done = false;
        // Begin at page 1 again; without the reset a later run would continue after the last
        // page of the previous run and never record a fresh breakpoint.
        page.set(0);
        System.out.println("开始爬取:" + System.currentTimeMillis());
        for (int i = 0; i < THREAD_NUM; ++i) {
            new Thread(new Runnable() {
                @Override
                public void run() {
                    while (!done) {
                        crawl(page.incrementAndGet());
                    }
                    System.out.println(
                            Thread.currentThread().getName() + ":结束:" + System.currentTimeMillis());
                }
            }, "Thread--" + i).start();
        }
    }

    /**
     * Signals all workers to finish and stores the first-page links of this run in Redis as the
     * breakpoint for the next run.
     */
    public synchronized void stop() {
        done = true;
        List<String> latest = breakpoints;
        if (latest == null || latest.isEmpty()) {
            // Page 1 has not been recorded; keep whatever marker is already stored instead of
            // deleting it and pushing a null list.
            return;
        }
        redisService.listRemove(BREAKPOINTS_KEY);
        redisService.listAdd(BREAKPOINTS_KEY, latest);
    }

    /**
     * Crawls one listing page and every article linked from it.
     *
     * @param page 1-based listing page number
     */
    public void crawl(int page) {
        String url = "https://www.ithome.com/ithome/getajaxdata.aspx?" +
                "page=" + page + "&type=indexpage&randnum=" + Math.random();
        List<String> links = getArticleLinks(getHtmlSrc(url, DEFAULT_ENCODING));
        if (links.isEmpty()) {
            // No links (or the fetch failed and returned an empty document): end the run.
            stop();
            return;
        }
        // A link that is already stored as a breakpoint means the previous run got this far.
        if (redisService.containsValue(BREAKPOINTS_KEY, links)) {
            stop();
            return;
        }
        // The links of the first page become the end marker for the next run.
        if (page == 1) {
            breakpoints = links;
        }
        for (String link : links) {
            parseAndSaveHotComments(link);
        }
    }

    /**
     * Downloads a URL and returns its body as text.
     *
     * @param url      address to fetch
     * @param encoding charset name used to decode the response
     * @return the text read so far; empty when the request fails before any data is read
     */
    public String getHtmlSrc(String url, String encoding) {
        StringBuilder src = new StringBuilder();
        try {
            URLConnection urlConn = new URL(url).openConnection();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(urlConn.getInputStream(), encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // Keep the line break so tokens that straddle two lines are not fused.
                    src.append(line).append('\n');
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return src.toString();
    }

    /**
     * Extracts article links from a listing page.
     *
     * @param srcCode HTML of the listing page
     * @return the {@code href} of every anchor matched by {@code h2>a}, in document order
     */
    public List<String> getArticleLinks(String srcCode) {
        List<String> links = new ArrayList<String>();
        Document document = Jsoup.parse(srcCode);
        for (Element anchor : document.select("h2>a")) {
            links.add(anchor.attr("href"));
        }
        return links;
    }

    /**
     * Loads an article, locates its hot comments and stores each of them.
     *
     * @param articleHref article URL
     */
    public void parseAndSaveHotComments(String articleHref) {
        Document document = Jsoup.parse(getHtmlSrc(articleHref, DEFAULT_ENCODING));
        Element iframeEl = document.getElementById("ifcomment");
        if (iframeEl == null) {
            return;
        }
        String commentHref = iframeEl.attr("src"); // URL of the comment page
        // Only a protocol-relative URL needs a scheme; an absolute one is used unchanged.
        String commentUrl = commentHref.startsWith("//") ? "http:" + commentHref : commentHref;

        // The comment page carries the article id in a hidden input.
        document = Jsoup.parse(getHtmlSrc(commentUrl, DEFAULT_ENCODING));
        Element articleIdInput = document.getElementById("newsid");
        if (articleIdInput == null) {
            // Comment page missing or failed to load; skip this article instead of throwing.
            return;
        }
        String articleId = articleIdInput.attr("value");

        // Fetch and parse the hot-comment fragment for this article.
        String link = "http://dyn.ithome.com/ithome/getajaxdata.aspx?newsID=" + articleId + "&type=hotcomment";
        document = Jsoup.parse(getHtmlSrc(link, DEFAULT_ENCODING));
        Elements hotCommentEls = document.select("li.entry");

        for (Element el : hotCommentEls) {
            HotComment hotComment = new HotComment();
            hotComment.setCommentId(el.attr("cid"));
            hotComment.setArticleUrl(articleHref);
            hotComment.setUser(el.select("strong.nick a").text());
            hotComment.setComment(el.getElementsByTag("P").text());
            hotComment.setUp(getNumber(el.select("a.s").text()));
            hotComment.setDown(getNumber(el.select("a.a").text()));
            hotComment.setPosandtime(el.select("span.posandtime").text());
            hotComment.setMobile(el.select("span.mobile a").text());

            hotCommentMapper.addHotComment(hotComment); // persist to the database
            esService.addHotComment(hotComment);        // index for search
            if (hotComment.getUp() >= 2500) {
                // Comments with at least 2500 up-votes also go into the Redis ranking.
                redisService.rankAdd("ithome:hotrank", hotComment);
            }
        }
    }

    /**
     * Parses the number written inside the first pair of parentheses.
     *
     * @param str text such as {@code "label(123)"}
     * @return the parsed number, or 0 when there are no parentheses or their content is not an
     *         integer
     */
    public int getNumber(String str) {
        if (str == null) {
            return 0;
        }
        Matcher matcher = PARENTHESIZED.matcher(str);
        if (!matcher.find()) {
            return 0;
        }
        try {
            return Integer.parseInt(matcher.group().trim());
        } catch (NumberFormatException e) {
            // Non-numeric content must not kill the worker thread.
            return 0;
        }
    }
}


整合Elasticsearch

相关依赖:

<!-- Elasticsearch full-text search -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>


application.yml配置:

spring:
  data:
    ## Elasticsearch settings
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300


代码部分:

Repository

/**
 * Elasticsearch repository for {@link HotComment} documents keyed by {@code Long} id.
 */
public interface EsRepository extends ElasticsearchRepository<HotComment, Long> {

    /**
     * Looks up hot comments by user name.
     * NOTE(review): no query is declared here; presumably Spring Data derives it from the
     * method name — confirm.
     *
     * @param user user name to search for
     * @return matching hot comments
     */
    List<HotComment> findByUser(String user);
}


实体

/**
 * A single hot comment scraped from an article; stored in the database, indexed in
 * Elasticsearch and cached in Redis.
 */
@Document(indexName="hotcomments",type="hotcomment",indexStoreType="fs",shards=5,replicas=1,refreshInterval="-1")
public class HotComment implements Serializable {

    private static final long serialVersionUID = -4249699545233058684L;

    /** Identifier of this record. */
    @Id
    private Long id;
    /** Comment id as published by the source site. */
    private String commentId;
    /** Name of the commenting user. */
    private String user;
    /** Comment text. */
    private String comment;
    /** Number of up-votes. */
    private int up;
    /** Number of down-votes. */
    private int down;
    /** Location and time text shown with the comment. */
    private String posandtime;
    /** Device the comment was posted from. */
    private String mobile;
    /** URL of the article the comment belongs to. */
    private String articleUrl;

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getCommentId() {
        return commentId;
    }

    public void setCommentId(String commentId) {
        this.commentId = commentId;
    }

    public String getUser() {
        return user;
    }

    public void setUser(String user) {
        this.user = user;
    }

    public String getComment() {
        return comment;
    }

    public void setComment(String comment) {
        this.comment = comment;
    }

    public int getUp() {
        return up;
    }

    public void setUp(int up) {
        this.up = up;
    }

    public int getDown() {
        return down;
    }

    public void setDown(int down) {
        this.down = down;
    }

    public String getPosandtime() {
        return posandtime;
    }

    public void setPosandtime(String posandtime) {
        this.posandtime = posandtime;
    }

    public String getMobile() {
        return mobile;
    }

    public void setMobile(String mobile) {
        this.mobile = mobile;
    }

    public String getArticleUrl() {
        return articleUrl;
    }

    public void setArticleUrl(String articleUrl) {
        this.articleUrl = articleUrl;
    }

    /**
     * Renders every field as {@code name='value'} pairs separated by {@code ", "}.
     */
    @Override
    public String toString() {
        return "HotComment{" +
                "id='" + id + '\'' +
                // The leading ", " was missing here, which fused the id and commentId fields.
                ", commentId='" + commentId + '\'' +
                ", user='" + user + '\'' +
                ", comment='" + comment + '\'' +
                ", up=" + up +
                ", down=" + down +
                ", posandtime='" + posandtime + '\'' +
                ", mobile='" + mobile + '\'' +
                ", articleUrl='" + articleUrl + '\'' +
                '}';
    }
}


Service

/**
 * Service facade over {@link EsRepository} for indexing and searching hot comments.
 */
@Service
public class EsService {

    @Autowired
    private EsRepository esRepository;

    /**
     * Saves a hot comment through the repository.
     *
     * @param hotComment comment to index
     */
    public void addHotComment(HotComment hotComment) {
        esRepository.save(hotComment);
    }

    /**
     * Finds hot comments by user name. The method is annotated for caching in the
     * {@code ithome:hotcomments} cache under the key {@code ithome:user:<user>}.
     *
     * @param user user name to search for
     * @return the comments returned by the repository
     */
    @Cacheable(value = "ithome:hotcomments", key = "'ithome:user:'+#user")
    public List<HotComment> findByUser(String user) {
        List<HotComment> matches = esRepository.findByUser(user);
        return matches;
    }
}


整合Redis

相关依赖

<!-- Redis -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>


相关配置

application.yml配置

spring:
  ## Redis settings
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0


缓存相关配置

/**
 * Redis-backed cache configuration: key generator, cache manager and the shared template.
 */
@Configuration
@EnableCaching
public class RedisConfig {

    /**
     * Builds cache keys by concatenating the target class name, the method name and the string
     * form of every argument.
     */
    @Bean
    public KeyGenerator keyGenerator() {
        return new KeyGenerator() {

            @Override
            public Object generate(Object o, Method method, Object... objects) {
                StringBuilder sb = new StringBuilder();
                sb.append(o.getClass().getName());
                sb.append(method.getName());
                for (Object obj : objects) {
                    // String.valueOf tolerates null arguments, unlike obj.toString().
                    sb.append(String.valueOf(obj));
                }
                return sb.toString();
            }
        };
    }

    /**
     * Creates the cache manager and gives the {@code ithome:hotcomments} cache a one-day TTL.
     *
     * @param redisTemplate template the cache manager stores entries through
     */
    @Bean
    public CacheManager cacheManager(RedisTemplate redisTemplate) {
        RedisCacheManager redisCacheManager = new RedisCacheManager(redisTemplate);
        // Per-cache expiry in seconds.
        Map<String, Long> expires = new HashMap<>();
        expires.put("ithome:hotcomments", 60 * 60 * 24L);
        // The map was previously built but never handed over, so the TTL never took effect.
        redisCacheManager.setExpires(expires);
        return redisCacheManager;
    }

    /**
     * Creates a string-keyed template whose values are serialized as JSON with embedded type
     * information.
     *
     * @param factory connection factory for the template
     */
    @Bean
    public RedisTemplate<String, String> redisTemplate(RedisConnectionFactory factory) {
        StringRedisTemplate template = new StringRedisTemplate(factory);
        Jackson2JsonRedisSerializer jackson2JsonRedisSerializer = new Jackson2JsonRedisSerializer(Object.class);
        ObjectMapper om = new ObjectMapper();
        om.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);
        // NOTE(review): default typing lets stored JSON name the class to instantiate, which is
        // unsafe if untrusted parties can write to this Redis instance — confirm the deployment.
        om.enableDefaultTyping(ObjectMapper.DefaultTyping.NON_FINAL);
        jackson2JsonRedisSerializer.setObjectMapper(om);
        template.setValueSerializer(jackson2JsonRedisSerializer);
        template.afterPropertiesSet();

        return template;
    }
}


代码部分:

Service

/**
 * Redis helpers for the hot-comment ranking (sorted set) and the crawl breakpoints (list).
 */
@Service
public class RedisService {

    @Autowired
    private RedisTemplate redisTemplate;

    /**
     * Adds a hot comment to a sorted set, scored by its up-vote count.
     *
     * @param key        sorted-set key
     * @param hotComment comment to add
     */
    public void rankAdd(String key, HotComment hotComment) {
        ZSetOperations<String, HotComment> zSetOperations = redisTemplate.opsForZSet();
        zSetOperations.add(key, hotComment, hotComment.getUp());
    }

    /**
     * Returns the {@code top} highest-scored comments of a sorted set, best first.
     *
     * @param key sorted-set key
     * @param top number of entries wanted
     * @return at most {@code top} entries; empty when {@code top} is not positive
     */
    public Set<HotComment> rankGet(String key, int top) {
        if (top <= 0) {
            return java.util.Collections.<HotComment>emptySet();
        }
        ZSetOperations<String, HotComment> zSetOperations = redisTemplate.opsForZSet();
        // Both indices are inclusive, so the last one is top - 1. reverseRange walks from the
        // highest score down; range(0, top) returned the top + 1 lowest-scored entries instead.
        return zSetOperations.reverseRange(key, 0, top - 1);
    }

    /**
     * Appends values to a list; used to store the position reached by the latest crawl.
     *
     * @param key    list key
     * @param values values to append; a null or empty list is ignored
     */
    public void listAdd(String key, List<String> values) {
        if (values == null || values.isEmpty()) {
            // Pushing nothing is rejected by the client/server, so treat it as a no-op.
            return;
        }
        ListOperations<String, String> listOperations = redisTemplate.opsForList();
        listOperations.rightPushAll(key, values);
    }

    /**
     * Deletes a key.
     *
     * @param key key to delete
     */
    public void listRemove(String key) {
        redisTemplate.delete(key);
    }

    /**
     * Tells whether the list stored under {@code key} contains any of the given values; used to
     * detect that a crawl has reached the stored end position.
     *
     * @param key    list key
     * @param values candidate values
     * @return {@code true} when at least one value is present in the stored list
     */
    public boolean containsValue(String key, List<String> values) {
        if (values == null || values.isEmpty()) {
            return false;
        }
        ListOperations<String, String> listOperations = redisTemplate.opsForList();
        List<String> stored = listOperations.range(key, 0, -1);
        if (stored == null || stored.isEmpty()) {
            return false;
        }
        for (String val : values) {
            if (stored.contains(val)) {
                return true;
            }
        }
        return false;
    }
}


整合Mybatis

依赖配置

<!-- MyBatis -->
<dependency>
    <groupId>org.mybatis.spring.boot</groupId>
    <artifactId>mybatis-spring-boot-starter</artifactId>
    <version>1.3.1</version>
</dependency>
<!-- MySQL driver -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <scope>runtime</scope>
</dependency>
<!-- data source dependency -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
    <version>1.0.24</version>
</dependency>


数据源配置

/**
 * Declares the application's data source and enables scanning of the MyBatis mapper package.
 */
@Configuration
@MapperScan(basePackages = "com.crazy.ithomecrawler.mybatis.mapper")
public class DatabaseConfig {

    /**
     * Creates the Druid data source for the local {@code ithome} MySQL database.
     * NOTE(review): the URL and credentials are hard-coded; consider moving them to
     * configuration.
     *
     * @return the configured data source
     */
    @Bean
    public DataSource druidDataSource() {
        DruidDataSource druid = new DruidDataSource();
        druid.setDriverClassName("com.mysql.jdbc.Driver");
        druid.setUrl("jdbc:mysql://localhost:3306/ithome");
        druid.setUsername("root");
        druid.setPassword("mysql");
        return druid;
    }
}


代码部分:

Mapper

/**
 * MyBatis mapper for the {@code hot_comment} table.
 */
public interface HotCommentMapper {

    /**
     * Inserts one hot comment; the generated key is configured to be written to the
     * {@code id} property of the argument.
     *
     * @param hotComment comment to insert
     */
    @Insert("INSERT INTO hot_comment(vCommentId,vUser,vComment,iUp,iDown,vPosandTime,vMobile,vArticleUrl) VALUES(#{commentId},#{user},#{comment},#{up},#{down},#{posandtime},#{mobile},#{articleUrl})")
    @Options(useGeneratedKeys = true, keyProperty = "id", keyColumn = "id")
    void addHotComment(HotComment hotComment);
}


Controller

/**
 * Web endpoints under {@code /ithome}; both render the {@code search} view with a
 * {@code comments} model attribute.
 */
@Controller
@RequestMapping("/ithome")
public class HotCommentController {

    @Autowired
    private RedisService redisService;
    @Autowired
    private EsService esService;

    /**
     * Home page: shows the entries read from the {@code ithome:hotrank} ranking.
     *
     * @return the {@code search} view populated with the ranking entries
     */
    @GetMapping("/index")
    public ModelAndView index() {
        Set<HotComment> ranked = redisService.rankGet("ithome:hotrank", 50);
        return new ModelAndView("search").addObject("comments", ranked);
    }

    /**
     * Search page: looks up hot comments whose user matches the path keyword.
     *
     * @param keyword user name taken from the URL path
     * @return the {@code search} view populated with the matches
     */
    @GetMapping("/search/{keyword}")
    public ModelAndView search(@PathVariable("keyword") String keyword) {
        List<HotComment> found = esService.findByUser(keyword);
        return new ModelAndView("search").addObject("comments", found);
    }
}


主程序

/**
 * Application entry point; enables Elasticsearch repositories and scheduling.
 */
@SpringBootApplication
@EnableElasticsearchRepositories
@EnableScheduling
public class IthomecrawlerApplication {

    /**
     * Boots the Spring application.
     *
     * @param args command-line arguments passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(IthomecrawlerApplication.class, args);
    }
}


完整application.yml文件

# Server port
server:
  port: 8081

spring:
  data:
    ## Elasticsearch settings
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300
  ## Redis settings
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0
  ## FreeMarker settings
  freemarker:
    ## whether attribute overriding is allowed
    allow-request-override: false
    allow-session-override: false
    cache: true
    check-template-location: true
    content-type: text/html
    ## expose request attributes
    expose-request-attributes: false
    expose-session-attributes: false
    expose-spring-macro-helpers: false
    suffix: .ftl
    template-loader-path: classpath:/templates/
    request-context-attribute: request
    settings:
      classic_compatible: true
      locale: zh_CN
      date_format: yyyy-MM-dd
      time_format: HH:mm:ss
      datetime_format: yyyy-MM-dd HH:mm:ss


完整pom.xml文件

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>comcrazy</groupId>
    <artifactId>ithomecrawler</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>ithomecrawler</name>
    <description>ITHome Crawler.</description>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.6.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <!-- Elasticsearch full-text search -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>
        <!-- Redis -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>
        <!-- MyBatis -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.1</version>
        </dependency>
        <!-- MySQL driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- data source dependency -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.0.24</version>
        </dependency>
        <!-- jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>

        <!-- freemarker -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-freemarker</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <finalName>ithomecrawler</finalName>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>

</project>


完整代码
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息