您的位置:首页 > 运维架构

结合Ansible技术监控Storm集群

2017-04-28 00:00 344 查看
结合Ansible技术监控Storm集群 2014-11-20 01:13:46

标签:storm ansible

原创作品,允许转载,转载时请务必以超链接形式标明文章原始出处、作者信息和本声明。否则将追究法律责任。http://sofar.blog.51cto.com/353572/1579897

1、我的hosts配置

# vim /etc/hosts

1
2
3
4
5
6
7
8
9
10
11
12
192.168.1.100 storm_zk1
192.168.1.101 storm_zk2
192.168.1.102 storm_zk3

192.168.1.103 storm_nimbus

192.168.1.104 storm_supervisor1
192.168.1.105 storm_supervisor2
192.168.1.106 storm_supervisor3
192.168.1.107 storm_supervisor4
192.168.1.108 storm_supervisor5
192.168.1.109 storm_supervisor6
2、我的storm配置

# vim /usr/local/storm/conf/storm.yaml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Storm cluster configuration (storm.yaml).
# Every "key: value" pair needs a space after the colon and every list item a
# space after the dash; without them YAML does not read the entries as
# mappings and sequences.

# DRPC servers, given by host name.
drpc.servers:
  - "storm_supervisor1"
  - "storm_supervisor2"
  - "storm_supervisor3"

# ZooKeeper servers, given by host name.
storm.zookeeper.servers:
  - "storm_zk1"
  - "storm_zk2"
  - "storm_zk3"

storm.local.dir: "/data/storm/workdir"

# Nimbus settings.
nimbus.host: "storm_nimbus"
nimbus.thrift.port: 6627
nimbus.thrift.max_buffer_size: 1048576
nimbus.childopts: "-Xmx1024m"
nimbus.task.timeout.secs: 30
nimbus.supervisor.timeout.secs: 60
nimbus.monitor.freq.secs: 10
nimbus.cleanup.inbox.freq.secs: 600
nimbus.inbox.jar.expiration.secs: 3600
nimbus.task.launch.secs: 240
nimbus.reassign: true
nimbus.file.copy.expiration.secs: 600
nimbus.topology.validator: "backtype.storm.nimbus.DefaultTopologyValidator"

storm.zookeeper.port: 2181
storm.zookeeper.root: "/data/storm/zkinfo"
storm.cluster.mode: "distributed"
storm.local.mode.zmq: false

# Storm UI settings.
ui.port: 8080
ui.childopts: "-Xmx768m"

# Ports offered as worker slots on each supervisor.
supervisor.slots.ports:
  - 6700
  - 6701
  - 6702
  - 6703
  - 6704
  - 6705
  - 6706
  - 6707
  - 6708
  - 6709

supervisor.childopts: "-Xmx2048m"
supervisor.worker.start.timeout.secs: 240
supervisor.worker.timeout.secs: 30
supervisor.monitor.frequency.secs: 3
supervisor.heartbeat.frequency.secs: 5
supervisor.enable: true

worker.childopts: "-Xmx4096m"
topology.max.spout.pending: 5000

# ZooKeeper client timeouts and retry policy.
storm.zookeeper.session.timeout: 5000
storm.zookeeper.connection.timeout: 3000
storm.zookeeper.retry.times: 6
storm.zookeeper.retry.interval: 2000
storm.zookeeper.retry.intervalceiling.millis: 30000

# Thrift and Netty transport settings.
storm.thrift.transport: "backtype.storm.security.auth.SimpleTransportPlugin"
storm.messaging.transport: "backtype.storm.messaging.netty.Context"
storm.messaging.netty.server_worker_threads: 50
storm.messaging.netty.client_worker_threads: 50
storm.messaging.netty.buffer_size: 20971520
storm.messaging.netty.max_retries: 100
storm.messaging.netty.max_wait_ms: 1000
storm.messaging.netty.min_wait_ms: 100
3、nimbus节点部署

# vim /data/scripts/monitor_status_for_storm.sh

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/bin/sh
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin

./etc/profile

# Base directory for the script's work files.
BASE_PATH='/data/scripts'

# Address and port of the monitoring page.
MON_SRV_IPADDR='192.168.1.103'
MON_SRV_PORT='8080'

# Whether the port scan succeeded (0 = not yet).
SCAN_FLAG=0

# File listing the addresses of hosts whose storm supervisor is abnormal.
FAIL_SUPERVISOR_LIST="$BASE_PATH/fail_supervisor.txt"

#---------------------------------------------------------------------------------------------------
# Restart the storm nimbus service (and the storm UI) on this host.
#
# Force-kills every process whose `ps aux` line contains both "java" and
# "storm", starts `storm nimbus` and `storm ui` in the background, then
# sleeps 30 seconds before returning.
# Globals:   none
# Arguments: none
restart_storm_nimbus_server() {
  local pids

  # Take a single snapshot so the emptiness check and the kill act on the
  # same set of PIDs.
  pids=$(ps aux | grep java | grep storm | awk '{print $2}')
  if [[ -n "${pids}" ]]; then
    # Intentionally unquoted: every PID has to become its own argument.
    # shellcheck disable=SC2086
    kill -9 ${pids}
  fi

  nohup /usr/local/storm/bin/storm nimbus >/dev/null 2>&1 &
  nohup /usr/local/storm/bin/storm ui >/dev/null 2>&1 &

  sleep 30
}

#---------------------------------------------------------------------------------------------------
# Step 1: check that the monitoring page is reachable (covers the case where
# port 8080 does not answer).
#
# Probe the port up to three times, 10 seconds apart, and stop at the first
# probe that reports it open. An explicit if is required here: written as
# "test && SCAN_FLAG=1; break || sleep 10", the break runs unconditionally on
# the first pass and the retries never happen.
for ((i = 0; i < 3; i++)); do
  if /usr/bin/nmap -n -sS -p "${MON_SRV_PORT}" "${MON_SRV_IPADDR}" | grep -q open; then
    SCAN_FLAG=1
    break
  fi
  sleep 10
done

# No probe saw the port open: restart nimbus and the UI.
if [[ "${SCAN_FLAG}" -ne 1 ]]; then
  restart_storm_nimbus_server
fi

#---------------------------------------------------------------------------------------------------
# Step 2: compare what the monitoring page shows with the expected supervisor
# list to find out whether any storm supervisor service is abnormal.
#
# Put each <td> cell on its own line, keep the text of the lines that start
# with "storm_", drop the nimbus entry and store the sorted result.
curl -s "http://${MON_SRV_IPADDR}:${MON_SRV_PORT}/" \
  | sed 's/<td>/<td>\n/g' \
  | awk -F '<' '/^storm_/{print $1}' \
  | awk '!/nimbus/{print}' \
  | sort > "${BASE_PATH}/supervisor_list_from_page.txt"

# An empty result from the monitoring page means the storm nimbus service is
# abnormal.
# NOTE(review): after this restart the stale, empty list is still used below,
# so every expected supervisor is reported as failed - confirm that this is
# intended.
if [[ -z "$(sed '/^$/d' "${BASE_PATH}/supervisor_list_from_page.txt")" ]]; then
  restart_storm_nimbus_server
fi

# Names present in only one of the two lists are the suspects.
sort -nr "${BASE_PATH}/supervisor_list_from_page.txt" "${BASE_PATH}/supervisor_list.txt" \
  | uniq -u > "${BASE_PATH}/supervisor_list_for_failed.txt"

# Nothing differs: clean up and stop here.
if [[ -z "$(sed '/^$/d' "${BASE_PATH}/supervisor_list_for_failed.txt")" ]]; then
  rm -f "${BASE_PATH}/supervisor_list_for_failed.txt" && exit 0
fi

#---------------------------------------------------------------------------------------------------
# Step 3: collect the IP addresses of the abnormal storm supervisor hosts
# under a [fail_supervisor] group header.
#
# The header truncates the file instead of appending to it, so hosts left
# over from an interrupted earlier run cannot end up in this run's list.
echo "[fail_supervisor]" > "${FAIL_SUPERVISOR_LIST}"

while IFS= read -r SUPERVISOR_NAMEADDR || [[ -n "${SUPERVISOR_NAMEADDR}" ]]; do
  [[ -n "${SUPERVISOR_NAMEADDR}" ]] || continue

  # Last non-comment /etc/hosts entry that contains the name as a whole word.
  TEMP_IPADDR=$(grep -w -- "${SUPERVISOR_NAMEADDR}" /etc/hosts | grep -v '#' | awk '{print $1}' | tail -1)

  # Skip names that do not resolve rather than writing a blank entry.
  [[ -n "${TEMP_IPADDR}" ]] || continue

  echo "${TEMP_IPADDR}" >> "${FAIL_SUPERVISOR_LIST}"
  # NOTE(review): IPLIST is not read anywhere in the visible script; the
  # space separator is an assumption - confirm before relying on it.
  IPLIST="${IPLIST} ${TEMP_IPADDR}"
done < "${BASE_PATH}/supervisor_list_for_failed.txt"

#---------------------------------------------------------------------------------------------------
# Step 4: restart the storm supervisor service remotely.
#
# Runs the restart script through Ansible's shell module on every host of the
# fail_supervisor group in the generated inventory, then removes the inventory.
/usr/local/bin/ansible -i "${FAIL_SUPERVISOR_LIST}" fail_supervisor -m shell -a "/data/scripts/restart_storm_service.sh"
rm -f "${FAIL_SUPERVISOR_LIST}"
# vim /data/scripts/supervisor_list.txt

1
2
3
4
5
6
storm_supervisor1
storm_supervisor2
storm_supervisor3
storm_supervisor4
storm_supervisor5
storm_supervisor6
# touch /var/run/check_storm.lock

# crontab -e

*/2 * * * * (flock --timeout=0 /var/run/check_storm.lock /data/scripts/monitor_status_for_storm.sh >/dev/null 2>&1)

4、supervisor节点部署

# vim /data/scripts/restart_storm_service.sh

1
2
3
4
5
6
#!/bin/bash
# restart_storm_service.sh - restart the storm supervisor on this node.
#
# Force-kills every process whose `ps aux` line contains both "java" and
# "storm", then starts `storm supervisor` in the background.
# bash is requested explicitly because [[ ]] is used below.
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin

# Load the system-wide profile. The dot (source) command and the file name
# must be separate words.
if [ -r /etc/profile ]; then
  . /etc/profile
fi

# Take a single snapshot so the emptiness check and the kill act on the same
# set of PIDs.
pids=$(ps aux | grep java | grep storm | awk '{print $2}')
if [[ -n "${pids}" ]]; then
  # Intentionally unquoted: every PID has to become its own argument.
  # shellcheck disable=SC2086
  kill -9 ${pids}
fi

nohup /usr/local/storm/bin/storm supervisor >/dev/null 2>&1 &
本文出自“人生理想在于坚持不懈”博客,请务必保留此出处http://sofar.blog.51cto.com/353572/1579897
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: