hive数据直接写入到es索引中
2017-11-23 11:41
726 查看
1、创建索引
put: http://es.dm.csdn.net/item_for_related
post: http://es.dm.csdn.net/item_for_related/item/_mapping
{
"blog": {
"_all": {
"enabled": false
},
"properties": {
"id": {
"type": "long"
},
"source_type": {
"type": "keyword"
},
"title": {
"type": "text"
},
"body": {
"type": "text"
},
"user_name": {
"type": "keyword"
},
"created_at": {
"type": "keyword"
},
"quality_score": {
"type": "float"
},
"tags": {
"type": "text"
},
"system_tag": {
"type": "text"
}
}
}
}
2、创建hive表结构和es的对应
-- External Hive table mapped onto the Elasticsearch resource item_for_related/item
-- via the es-hadoop storage handler: an INSERT into this table upserts documents
-- into ES (es.write.operation = 'upsert'), keyed by the id column (es.mapping.id).
-- es.index.auto.create = 'false' means the index/mapping must already exist
-- (created in step 1 above) before any write.
-- es.batch.* settings: flush every 1000 docs, refresh the index after each bulk
-- write, and wait 30s between retries of failed bulk requests.
-- NOTE(review): every column is declared string although the ES mapping declares
-- id as long and quality_score as float; es-hadoop appears to be relied on to
-- coerce the values on write -- confirm this is intentional.
CREATE EXTERNAL TABLE `item_for_related_txt` (
id string,
title string,
body string,
source_type string,
user_name string,
created_at string,
tags string,
quality_score string,
system_tag string
)STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES ('es.nodes' = '192.168.100.212,192.168.100.213,192.168.100.214,192.168.100.215,192.168.100.216',
'es.index.auto.create' = 'false',
'es.resource' = 'item_for_related/item',
'es.write.operation' = 'upsert',
'es.mapping.id' = 'id',
'es.batch.size.entries'='1000',
'es.batch.write.refresh'='true',
'es.batch.write.retry.wait'='30s');
3、写数据到hive表中
-- Step 3: load Hive data into the ES-backed table. Requires the es-hadoop
-- connector jar on the Hive classpath (added below).
set mapred.job.name=import_item_for_related_txt;
set mapred.job.queue.name=hadoop;
add jar /data/1/usr/local/hive/lib/elasticsearch-hadoop-5.1.1.jar;
-- Fix: the original wrote LEFT JOIN blog_extend_attr_txt but then filtered on
-- d.quality_score in WHERE, which drops every row with no match (NULL fails the
-- comparison) -- i.e. it already behaved as an inner join. The INNER JOIN below
-- makes that explicit; the result set is identical.
insert overwrite table item_for_related_txt
select
    a.itemid        as id,
    a.title         as title,
    b.content       as body,
    'blog'          as source_type,
    a.username      as user_name,
    a.posttime      as created_at,
    c.tags          as tags,
    d.quality_score as quality_score,
    ''              as system_tag
from item_txt a
left join itemcontent_txt b on a.articleid = b.articleid
left join itemtags_txt c on a.itemid = c.itemid
inner join blog_extend_attr_txt d on a.itemid = d.id
where a.posttime > '2011-01-01 00:00:00'
  and d.quality_score > 1.0;
4、去es中查数据
get: http://es.dm.csdn.net/item_for_related/item/_search
post:http://es.dm.csdn.net/item_for_related/item/_search
{
"query": {
"bool": {
"must": [
{
"range": {
"created_at": {
"gte": "2017-11-11"
}
}
}
]
}
},
"from": 0,
"size": 10,
"_source": [
"id",
"title",
"created_at",
"user_name",
"quality_score"
]
}
{
"query": {
"bool": {
"must": [
{
"multi_match": {
"query": "python",
"type": "best_fields",
"fields": [
"title",
"tags"
]
}
},
{
"range": {
"created_at": {
"gte": "2017-10-21"
}
}
}
]
}
},
"from": 0,
"size": 10,
"_source": [
"id",
"title",
"created_at",
"user_name",
"quality_score"
]
}
put: http://es.dm.csdn.net/item_for_related
post: http://es.dm.csdn.net/item_for_related/item/_mapping
{
"blog": {
"_all": {
"enabled": false
},
"properties": {
"id": {
"type": "long"
},
"source_type": {
"type": "keyword"
},
"title": {
"type": "text"
},
"body": {
"type": "text"
},
"user_name": {
"type": "keyword"
},
"created_at": {
"type": "keyword"
},
"quality_score": {
"type": "float"
},
"tags": {
"type": "text"
},
"system_tag": {
"type": "text"
}
}
}
}
2、创建hive表结构和es的对应
-- External Hive table mapped onto the Elasticsearch resource item_for_related/item
-- via the es-hadoop storage handler: an INSERT into this table upserts documents
-- into ES (es.write.operation = 'upsert'), keyed by the id column (es.mapping.id).
-- es.index.auto.create = 'false' means the index/mapping must already exist
-- (created in step 1 above) before any write.
-- es.batch.* settings: flush every 1000 docs, refresh the index after each bulk
-- write, and wait 30s between retries of failed bulk requests.
-- NOTE(review): every column is declared string although the ES mapping declares
-- id as long and quality_score as float; es-hadoop appears to be relied on to
-- coerce the values on write -- confirm this is intentional.
CREATE EXTERNAL TABLE `item_for_related_txt` (
id string,
title string,
body string,
source_type string,
user_name string,
created_at string,
tags string,
quality_score string,
system_tag string
)STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES ('es.nodes' = '192.168.100.212,192.168.100.213,192.168.100.214,192.168.100.215,192.168.100.216',
'es.index.auto.create' = 'false',
'es.resource' = 'item_for_related/item',
'es.write.operation' = 'upsert',
'es.mapping.id' = 'id',
'es.batch.size.entries'='1000',
'es.batch.write.refresh'='true',
'es.batch.write.retry.wait'='30s');
3、写数据到hive表中
-- Step 3: load Hive data into the ES-backed table. Requires the es-hadoop
-- connector jar on the Hive classpath (added below).
set mapred.job.name=import_item_for_related_txt;
set mapred.job.queue.name=hadoop;
add jar /data/1/usr/local/hive/lib/elasticsearch-hadoop-5.1.1.jar;
-- Fix: the original wrote LEFT JOIN blog_extend_attr_txt but then filtered on
-- d.quality_score in WHERE, which drops every row with no match (NULL fails the
-- comparison) -- i.e. it already behaved as an inner join. The INNER JOIN below
-- makes that explicit; the result set is identical.
insert overwrite table item_for_related_txt
select
    a.itemid        as id,
    a.title         as title,
    b.content       as body,
    'blog'          as source_type,
    a.username      as user_name,
    a.posttime      as created_at,
    c.tags          as tags,
    d.quality_score as quality_score,
    ''              as system_tag
from item_txt a
left join itemcontent_txt b on a.articleid = b.articleid
left join itemtags_txt c on a.itemid = c.itemid
inner join blog_extend_attr_txt d on a.itemid = d.id
where a.posttime > '2011-01-01 00:00:00'
  and d.quality_score > 1.0;
4、去es中查数据
get: http://es.dm.csdn.net/item_for_related/item/_search
post:http://es.dm.csdn.net/item_for_related/item/_search
{
"query": {
"bool": {
"must": [
{
"range": {
"created_at": {
"gte": "2017-11-11"
}
}
}
]
}
},
"from": 0,
"size": 10,
"_source": [
"id",
"title",
"created_at",
"user_name",
"quality_score"
]
}
{
"query": {
"bool": {
"must": [
{
"multi_match": {
"query": "python",
"type": "best_fields",
"fields": [
"title",
"tags"
]
}
},
{
"range": {
"created_at": {
"gte": "2017-10-21"
}
}
}
]
}
},
"from": 0,
"size": 10,
"_source": [
"id",
"title",
"created_at",
"user_name",
"quality_score"
]
}
相关文章推荐
- 利用hive将数据写入es
- 将hive数据查询直接写入文件
- hive 数据写入es
- hive读取es数据
- 测试使用-批量往es索引中添加数据,es的使用小结。
- 测试使用-批量往es索引中添加数据,es的使用小结。
- ES Java API - 获取索引历史更新数据
- ES索引写入性能优化
- kettle 从hive中读取数据并写入MongoDB
- python下用find命令行直接遍历某文件夹下的数据并按顺序写入txt文档中
- IOS开发—数据存储(直接写入、NSUserDefaults、NSkeyedArchiver)
- 扩展:hive插入数据到ES
- 测试使用-批量往es索引中添加数据,es的使用小结。
- 测试使用-批量往es索引中添加数据,es的使用小结。
- Hive数据仓库--HiveQL视图和索引
- heritrix3抓取的数据直接写入到mysql中
- Elasticsearch生成json,创建索引(把数据写入索引中)
- 直接循环写入数据
- 测试使用-批量往es索引中添加数据,es的使用小结。
- ES Java API - 获取索引下数据量