您的位置:首页 > 数据库 > Oracle

<< Oracle高可用>>部分书面作业 - 第五课 RAC-故障定位和处理

2013-10-13 16:58 661 查看
1.模拟两个节点内联网不通,观察RAC会出现什么现象?给出故障定位的整个过程。
现象:节点号(id)较大的节点会被驱逐出集群(本例中rac3为节点1、rac4为节点2,rac4被驱逐,rac3存活)。

故障定位过程如下:
正常情况下:
[root@rac3 ~]# crs_stat -t
Name Type Target State Host
------------------------------------------------------------
ora....SM1.asm application ONLINE ONLINE rac3
ora....C3.lsnr application ONLINE ONLINE rac3
ora.rac3.gsd application ONLINE ONLINE rac3
ora.rac3.ons application ONLINE ONLINE rac3
ora.rac3.vip application ONLINE ONLINE rac3
ora....SM2.asm application ONLINE ONLINE rac4
ora....C4.lsnr application ONLINE ONLINE rac4
ora.rac4.gsd application ONLINE ONLINE rac4
ora.rac4.ons application ONLINE ONLINE rac4
ora.rac4.vip application ONLINE ONLINE rac4
ora.racdb.db application ONLINE ONLINE rac3
ora....b1.inst application ONLINE ONLINE rac3
ora....b2.inst application ONLINE ONLINE rac4
[root@rac3 ~]# ifconfig
eth0 Link encap:Ethernet HWaddr 00:0C:29:ED:F0:E5
inet addr:192.168.1.103 Bcast:192.168.1.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:feed:f0e5/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:2827 errors:0 dropped:0 overruns:0 frame:0
TX packets:1898 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:260984 (254.8 KiB) TX bytes:346646 (338.5 KiB)

eth0:2 Link encap:Ethernet HWaddr 00:0C:29:ED:F0:E5
inet addr:192.168.1.203 Bcast:192.168.1.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1

eth1 Link encap:Ethernet HWaddr 00:0C:29:ED:F0:EF
inet addr:192.168.2.103 Bcast:192.168.2.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:feed:f0ef/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:187730 errors:0 dropped:0 overruns:0 frame:0
TX packets:207907 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:84843845 (80.9 MiB) TX bytes:158607242 (151.2 MiB)

lo Link encap:Local Loopback
inet addr:127.0.0.1 Mask:255.0.0.0
inet6 addr: ::1/128 Scope:Host
UP LOOPBACK RUNNING MTU:16436 Metric:1
RX packets:166598 errors:0 dropped:0 overruns:0 frame:0
TX packets:166598 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:0
RX bytes:15978738 (15.2 MiB) TX bytes:15978738 (15.2 MiB)

模拟内联网中断:
[root@rac3 ~]# ifdown eth1
[root@rac3 ~]# ifconfig
eth0 Link encap:Ethernet HWaddr 00:0C:29:ED:F0:E5
inet addr:192.168.1.103 Bcast:192.168.1.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:feed:f0e5/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:2896 errors:0 dropped:0 overruns:0 frame:0
TX packets:1950 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:266354 (260.1 KiB) TX bytes:354046 (345.7 KiB)

eth0:2 Link encap:Ethernet HWaddr 00:0C:29:ED:F0:E5
inet addr:192.168.1.203 Bcast:192.168.1.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1

lo Link encap:Local Loopback
inet addr:127.0.0.1 Mask:255.0.0.0
inet6 addr: ::1/128 Scope:Host
UP LOOPBACK RUNNING MTU:16436 Metric:1
RX packets:166610 errors:0 dropped:0 overruns:0 frame:0
TX packets:166610 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:0
RX bytes:15979086 (15.2 MiB) TX bytes:15979086 (15.2 MiB)

[root@rac3 ~]#

等待大约两分钟后,rac4离线:
[root@rac3 ~]# crs_stat -t
Name Type Target State Host
------------------------------------------------------------
ora....SM1.asm application ONLINE ONLINE rac3
ora....C3.lsnr application ONLINE ONLINE rac3
ora.rac3.gsd application ONLINE ONLINE rac3
ora.rac3.ons application ONLINE ONLINE rac3
ora.rac3.vip application ONLINE ONLINE rac3
ora....SM2.asm application ONLINE OFFLINE
ora....C4.lsnr application ONLINE OFFLINE
ora.rac4.gsd application ONLINE OFFLINE
ora.rac4.ons application ONLINE OFFLINE
ora.rac4.vip application ONLINE ONLINE rac3
ora.racdb.db application OFFLINE OFFLINE
ora....b1.inst application ONLINE OFFLINE
ora....b2.inst application ONLINE OFFLINE
[root@rac3 ~]#

rac3上crs正常:
[root@rac3 rac3]# crsctl check crs
CSS appears healthy
CRS appears healthy
EVM appears healthy
[root@rac3 rac3]#

rac4上crs停止:
[root@rac4 ~]# crsctl check crs
Failure 1 contacting CSS daemon
Cannot communicate with CRS
Cannot communicate with EVM
[root@rac4 ~]#

检查rac4上os log,发现css启动失败信息:
[root@rac4 ~]# tail -n 10 /var/log/messages
Jul 15 23:15:12 rac4 logger: Cluster Ready Services completed waiting on dependencies.
Jul 15 23:15:17 rac4 udevd[1552]: udev done!
Jul 15 23:16:13 rac4 su(pam_unix)[29582]: session opened for user oracle by (uid=0)
Jul 15 23:16:13 rac4 logger: Running CRSD with TZ =
Jul 15 23:16:14 rac4 su(pam_unix)[30721]: session opened for user oracle by (uid=0)
Jul 15 23:17:20 rac4 su(pam_unix)[30721]: session closed for user oracle
Jul 15 23:17:20 rac4 logger: Oracle CSS daemon failed to start up. Check CRS logs for diagnostics.
Jul 15 23:18:30 rac4 sshd(pam_unix)[32629]: session opened for user root by (uid=0)
Jul 15 23:18:40 rac4 su(pam_unix)[349]: session opened for user oracle by (uid=0)
Jul 15 23:18:40 rac4 su(pam_unix)[349]: session closed for user oracle
[root@rac4 ~]#

检查crs log,再次指向css启动失败:
[root@rac4 crsd]# tail -n 30 crsd.log

2013-07-15 23:18:33.809: [ CRSRTI][2541121088]0CSS is not ready. Received status 3 from CSS. Waiting for good status ..

2013-07-15 23:18:35.227: [ COMMCRS][1084229984]clsc_connect: (0xb43e80) no listener at (ADDRESS=(PROTOCOL=ipc)(KEY=OCSSD_LL_rac4_crs))

2013-07-15 23:18:35.227: [ CSSCLNT][2541121088]clsssInitNative: connect failed, rc 9

2013-07-15 23:18:35.229: [ CRSRTI][2541121088]0CSS is not ready. Received status 3 from CSS. Waiting for good status ..

2013-07-15 23:18:36.643: [ COMMCRS][1084229984]clsc_connect: (0xb43e80) no listener at (ADDRESS=(PROTOCOL=ipc)(KEY=OCSSD_LL_rac4_crs))

2013-07-15 23:18:36.644: [ CSSCLNT][2541121088]clsssInitNative: connect failed, rc 9

2013-07-15 23:18:36.645: [ CRSRTI][2541121088]0CSS is not ready. Received status 3 from CSS. Waiting for good status ..

2013-07-15 23:18:38.107: [ COMMCRS][1084229984]clsc_connect: (0xb47d20) no listener at (ADDRESS=(PROTOCOL=ipc)(KEY=OCSSD_LL_rac4_crs))

2013-07-15 23:18:38.107: [ CSSCLNT][2541121088]clsssInitNative: connect failed, rc 9

2013-07-15 23:18:38.109: [ CRSRTI][2541121088]0CSS is not ready. Received status 3 from CSS. Waiting for good status ..

2013-07-15 23:18:39.527: [ COMMCRS][1084229984]clsc_connect: (0xb47d20) no listener at (ADDRESS=(PROTOCOL=ipc)(KEY=OCSSD_LL_rac4_crs))

2013-07-15 23:18:39.527: [ CSSCLNT][2541121088]clsssInitNative: connect failed, rc 9

2013-07-15 23:18:39.528: [ CRSRTI][2541121088]0CSS is not ready. Received status 3 from CSS. Waiting for good status ..

2013-07-15 23:18:40.530: [ CRSMAIN][2541121088][PANIC]0CRSD exiting: Could not init the CSS context

2013-07-15 23:18:40.531: [ default][2541121088]Terminating clsd session
[root@rac4 crsd]# pwd
/opt/ora10g/product/10.2.0/crs_1/log/rac4/crsd
[root@rac4 crsd]#

检查css log,发现是由于node1不响应DskHeartbeat:
[root@rac4 cssd]# tail ocssd.log
[ CSSD]2013-07-15 23:17:20.149 [1210108256] >TRACE: 0x0x776150 28 41 44 44 52 45 53 53 - 3d 28 50 52 4f 54 4f 43 (ADDRESS=(PROTOC
[ CSSD]2013-07-15 23:17:20.149 [1210108256] >TRACE: 0x0x776160 4f 4c 3d 74 63 70 29 28 - 44 45 56 3d 31 39 29 28 OL=tcp)(DEV=19)(
[ CSSD]2013-07-15 23:17:20.149 [1210108256] >TRACE: 0x0x776170 48 4f 53 54 3d 31 39 32 - 2e 31 36 38 2e 32 2e 31 HOST=192.168.2.1
[ CSSD]2013-07-15 23:17:20.149 [1210108256] >TRACE: 0x0x776180 30 34 29 28 50 4f 52 54 - 3d 33 32 37 37 33 29 29 04)(PORT=32773))
[ CSSD]2013-07-15 23:17:20.149 [1210108256] >TRACE: clssscctx->nmctx->nmnode[002]->con: dump of 0x(nil), len 168
[ CSSD]--- DUMP GROCK STATE DB ---
[ CSSD]--- END OF GROCK STATE DUMP ---
[ CSSD]------- End Dump -------

[ CSSD]2013-07-15 23:17:20.154 [1115699552] >TRACE: clssnmReadDskHeartbeat: node(1) is down. rcfg(4) wrtcnt(664) LATS(4294222000) Disk lastSeqNo(664)
[root@rac4 cssd]# pwd
/opt/ora10g/product/10.2.0/crs_1/log/rac4/cssd
[root@rac4 cssd]#

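检查rac4到rac3内联网的连通性,无法连通(注:按前面的ifconfig输出,rac3的内联网地址为192.168.2.103;下面ping的192.168.2.203并不是rac3 eth1上配置的地址,但由于rac3的eth1已被停用,内联网不通的结论不变):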
[root@rac4 cssd]# ping 192.168.2.203
PING 192.168.2.203 (192.168.2.203) 56(84) bytes of data.
From 192.168.2.104 icmp_seq=1 Destination Host Unreachable
From 192.168.2.104 icmp_seq=2 Destination Host Unreachable
From 192.168.2.104 icmp_seq=3 Destination Host Unreachable

--- 192.168.2.203 ping statistics ---
4 packets transmitted, 0 received, +3 errors, 100% packet loss, time 2999ms
, pipe 4

检查rac4的网卡ip,正常
[root@rac4 cssd]# ifconfig eth1
eth1 Link encap:Ethernet HWaddr 00:0C:29:56:FC:24
inet addr:192.168.2.104 Bcast:192.168.2.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:fe56:fc24/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:4 errors:0 dropped:0 overruns:0 frame:0
TX packets:21 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:316 (316.0 b) TX bytes:1162 (1.1 KiB)

[root@rac4 cssd]#

检查rac3的网卡ip,找到原因后启动内联网网卡
[root@rac3 rac3]# ifconfig eth1
eth1 Link encap:Ethernet HWaddr 00:0C:29:ED:F0:EF
BROADCAST MULTICAST MTU:1500 Metric:1
RX packets:187746 errors:0 dropped:0 overruns:0 frame:0
TX packets:207923 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:84846273 (80.9 MiB) TX bytes:158609942 (151.2 MiB)

[root@rac3 rac3]# ifup eth1
[root@rac3 rac3]#

等待两分钟后,发现rac4的crs没有自动起来,于是手动重启:
[root@rac4 cssd]# crsctl check crs
Failure 1 contacting CSS daemon
Cannot communicate with CRS
Cannot communicate with EVM
[root@rac4 cssd]# crsctl start crs
Attempting to start CRS stack
The CRS stack will be started shortly
[root@rac4 cssd]#

等待两分钟后,rac4上的crs成功启动:
[root@rac4 cssd]# ps -ef | grep crs
root 339 1 0 23:18 ? 00:00:00 /bin/sh /etc/init.d/init.crsd run
root 4368 32730 0 23:36 pts/1 00:00:00 grep crs
[root@rac4 cssd]# ps -ef | grep crs
root 339 1 0 23:18 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/crsd.bin restart
root 1682 1 0 23:24 ? 00:00:00 /bin/su -l oracle -c sh -c 'ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac4/evmd; exec /opt/ora10g/product/10.2.0/crs_1/bin/evmd '
oracle 4399 1682 0 23:37 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/evmd.bin
root 4516 4447 0 23:37 ? 00:00:00 /bin/su -l oracle -c /bin/sh -c 'ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac4/cssd; /opt/ora10g/product/10.2.0/crs_1/bin/ocssd || exit $?'
oracle 4517 4516 0 23:37 ? 00:00:00 /bin/sh -c ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac4/cssd; /opt/ora10g/product/10.2.0/crs_1/bin/ocssd || exit $?
oracle 4551 4517 0 23:37 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/ocssd.bin
oracle 4838 4399 0 23:37 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/evmlogger.bin -o /opt/ora10g/product/10.2.0/crs_1/evm/log/evmlogger.info -l /opt/ora10g/product/10.2.0/crs_1/evm/log/evmlogger.log
root 5660 32730 0 23:38 pts/1 00:00:00 grep crs
[root@rac4 cssd]# crsctl check crs
CSS appears healthy
CRS appears healthy
EVM appears healthy
[root@rac4 cssd]#
检查rac3上的crs,正常:
[root@rac3 rac3]# crsctl check crs
CSS appears healthy
CRS appears healthy
EVM appears healthy
[root@rac3 rac3]#
启动其他未恢复的服务:
[root@rac3 rac3]# crs_stat -t
Name Type Target State Host
------------------------------------------------------------
ora....SM1.asm application ONLINE ONLINE rac3
ora....C3.lsnr application ONLINE ONLINE rac3
ora.rac3.gsd application ONLINE ONLINE rac3
ora.rac3.ons application ONLINE ONLINE rac3
ora.rac3.vip application ONLINE ONLINE rac3
ora....SM2.asm application ONLINE OFFLINE
ora....C4.lsnr application ONLINE OFFLINE
ora.rac4.gsd application ONLINE OFFLINE
ora.rac4.ons application ONLINE OFFLINE
ora.rac4.vip application ONLINE ONLINE rac3
ora.racdb.db application OFFLINE OFFLINE
ora....b1.inst application ONLINE OFFLINE
ora....b2.inst application ONLINE OFFLINE
[root@rac3 rac3]# srvctl start asm -n rac4
[root@rac3 rac3]# srvctl start listener -n rac4
[root@rac3 rac3]# srvctl start nodeapps -n rac4 #启动ora.rac4.gsd与ora.rac4.ons
[root@rac3 rac3]# srvctl start database -d racdb
[root@rac3 rac3]# crs_stat -t
Name Type Target State Host
------------------------------------------------------------
ora....SM1.asm application ONLINE ONLINE rac3
ora....C3.lsnr application ONLINE ONLINE rac3
ora.rac3.gsd application ONLINE ONLINE rac3
ora.rac3.ons application ONLINE ONLINE rac3
ora.rac3.vip application ONLINE ONLINE rac3
ora....SM2.asm application ONLINE ONLINE rac4
ora....C4.lsnr application ONLINE ONLINE rac4
ora.rac4.gsd application ONLINE ONLINE rac4
ora.rac4.ons application ONLINE ONLINE rac4
ora.rac4.vip application ONLINE ONLINE rac4
ora.racdb.db application ONLINE ONLINE rac4
ora....b1.inst application ONLINE ONLINE rac3
ora....b2.inst application ONLINE ONLINE rac4
[root@rac3 rac3]#

--EOF--
2.模拟OCR磁盘不可用时,RAC会出现什么现象?给出故障定位的整个过程。

准备,在各节点停止crs,备份ocr:
[root@rac3 ~]# crsctl stop crs
Stopping resources.
Successfully stopped CRS resources
Stopping CSSD.
Shutting down CSS daemon.
Shutdown request successfully issued.
[root@rac3 ~]# ocrconfig -h

[root@rac4 cssd]# crsctl stop crs
Stopping resources.
Successfully stopped CRS resources
Stopping CSSD.
Shutting down CSS daemon.
Shutdown request successfully issued.
[root@rac4 cssd]#

[root@rac3 ~]# ocrcheck
Status of Oracle Cluster Registry is as follows :
Version : 2
Total space (kbytes) : 104344
Used space (kbytes) : 3832
Available space (kbytes) : 100512
ID : 1742158402
Device/File Name : /dev/raw/raw1
Device/File integrity check succeeded

Device/File not configured

Cluster registry integrity check succeeded

[root@rac3 ~]# dd if=/dev/raw/raw1 of=ocr20130715.dd
208864+0 records in
208864+0 records out
[root@rac3 ~]# ocrconfig -export ocr20130715.exp
[root@rac3 ~]# ls -lh ocr20130715.dd ocr20130715.exp
-rw-r--r-- 1 root root 102M Jul 15 23:59 ocr20130715.dd
-rw-r--r-- 1 root root 84K Jul 15 23:59 ocr20130715.exp
[root@rac3 ~]#

模拟破坏ocr后启动crs
[root@rac3 ~]# dd if=/dev/zero of=/dev/raw/raw1 bs=1M count=10
10+0 records in
10+0 records out
[root@rac3 ~]# crsctl start crs
Attempting to start CRS stack
The CRS stack will be started shortly
[root@rac3 ~]#

等待2分钟后,发现crs进程没有启动
[root@rac3 ~]# ps -ef | grep crs
root 15616 1 0 Jul15 ? 00:00:00 /bin/sh /etc/init.d/init.crsd run
root 19031 15050 0 00:04 pts/1 00:00:00 grep crs
[root@rac3 ~]#

查看os log:
[root@rac3 ~]# tail /var/log/messages
Jul 15 23:48:11 rac3 logger: Oracle CSSD graceful shutdown
Jul 16 00:02:12 rac3 su(pam_unix)[18766]: session opened for user oracle by (uid=0)
Jul 16 00:02:12 rac3 su(pam_unix)[18785]: session opened for user oracle by (uid=0)
Jul 16 00:02:12 rac3 su(pam_unix)[18850]: session opened for user oracle by (uid=0)
Jul 16 00:02:12 rac3 su(pam_unix)[18766]: session closed for user oracle
Jul 16 00:02:12 rac3 logger: Cluster Ready Services waiting on dependencies. Diagnostics in /tmp/crsctl.18622.
Jul 16 00:02:12 rac3 su(pam_unix)[18850]: session closed for user oracle
Jul 16 00:02:12 rac3 logger: Cluster Ready Services waiting on dependencies. Diagnostics in /tmp/crsctl.18640.
Jul 16 00:02:12 rac3 su(pam_unix)[18785]: session closed for user oracle
Jul 16 00:02:12 rac3 logger: Cluster Ready Services waiting on dependencies. Diagnostics in /tmp/crsctl.18612.
[root@rac3 ~]#

查看/tmp下的log,发现是ocr损坏:
[root@rac3 ~]# cat /tmp/crsctl.18612
OCR initialization failed with invalid format: PROC-22: The OCR backend has an invalid format
[root@rac3 ~]#

通过ocrcheck验证确定ocr损坏:
[root@rac3 ~]# ocrcheck
PROT-601: Failed to initialize ocrcheck
[root@rac3 ~]#

恢复ocr
[root@rac3 ~]# ocrconfig -import ocr20130715.exp
[root@rac3 ~]# ocrcheck
Status of Oracle Cluster Registry is as follows :
Version : 2
Total space (kbytes) : 104344
Used space (kbytes) : 3832
Available space (kbytes) : 100512
ID : 88272179
Device/File Name : /dev/raw/raw1
Device/File integrity check succeeded

Device/File not configured

Cluster registry integrity check succeeded

[root@rac3 ~]#
恢复ocr后检查os log,发现crs完成等待依赖
[root@rac3 ~]# tail /var/log/messages
Jul 16 00:04:12 rac3 logger: Cluster Ready Services waiting on dependencies. Diagnostics in /tmp/crsctl.18612.
Jul 16 00:05:12 rac3 su(pam_unix)[19157]: session opened for user oracle by (uid=0)
Jul 16 00:05:12 rac3 su(pam_unix)[19194]: session opened for user oracle by (uid=0)
Jul 16 00:05:12 rac3 su(pam_unix)[19195]: session opened for user oracle by (uid=0)
Jul 16 00:05:14 rac3 su(pam_unix)[19157]: session closed for user oracle
Jul 16 00:05:14 rac3 logger: Cluster Ready Services completed waiting on dependencies.
Jul 16 00:05:14 rac3 su(pam_unix)[19194]: session closed for user oracle
Jul 16 00:05:14 rac3 logger: Cluster Ready Services completed waiting on dependencies.
Jul 16 00:05:14 rac3 su(pam_unix)[19195]: session closed for user oracle
Jul 16 00:05:14 rac3 logger: Cluster Ready Services completed waiting on dependencies.
[root@rac3 ~]#

等待2分钟后,发现crs进程启动成功:
[root@rac3 ~]# ps -ef | grep crs
root 15616 1 0 Jul15 ? 00:00:00 /bin/sh /etc/init.d/init.crsd run
root 19283 15050 0 00:06 pts/1 00:00:00 grep crs
[root@rac3 ~]# ps -ef | grep crs
root 15615 1 0 Jul15 ? 00:00:00 /bin/su -l oracle -c sh -c 'ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac3/evmd; exec /opt/ora10g/product/10.2.0/crs_1/bin/evmd '
root 15616 1 0 Jul15 ? 00:00:01 /opt/ora10g/product/10.2.0/crs_1/bin/crsd.bin reboot
oracle 19295 15615 0 00:06 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/evmd.bin
root 19416 19291 0 00:06 ? 00:00:00 /bin/su -l oracle -c /bin/sh -c 'ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac3/cssd; /opt/ora10g/product/10.2.0/crs_1/bin/ocssd || exit $?'
oracle 19417 19416 0 00:06 ? 00:00:00 /bin/sh -c ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac3/cssd; /opt/ora10g/product/10.2.0/crs_1/bin/ocssd || exit $?
oracle 19451 19417 0 00:06 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/ocssd.bin
oracle 19716 19295 0 00:06 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/evmlogger.bin -o /opt/ora10g/product/10.2.0/crs_1/evm/log/evmlogger.info -l /opt/ora10g/product/10.2.0/crs_1/evm/log/evmlogger.log
oracle 20683 1 0 00:06 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/opmn/bin/ons -d
oracle 20684 20683 0 00:06 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/opmn/bin/ons -d
[root@rac3 ~]# crsctl check crs
CSS appears healthy
CRS appears healthy
EVM appears healthy
[root@rac3 ~]#

--EOF--

3.模拟votedisk不可用时,RAC会出现什么现象?给出故障定位的整个过程。

准备,各节点停止crs,备份votedisk:
[root@rac3 ~]# crsctl query css votedisk
0. 0 /dev/raw/raw2

located 1 votedisk(s).
[root@rac3 ~]#

[root@rac3 ~]# crsctl stop crs
Stopping resources.
Successfully stopped CRS resources
Stopping CSSD.
Shutting down CSS daemon.
Shutdown request successfully issued.
[root@rac3 ~]#

[root@rac4 ~]# crsctl stop crs
Stopping resources.
Successfully stopped CRS resources
Stopping CSSD.
Shutting down CSS daemon.
Shutdown request successfully issued.
[root@rac4 ~]#

[root@rac3 ~]# ps -ef | grep crs
root 429 1 0 21:06 ? 00:00:00 /bin/sh /etc/init.d/init.crsd run
root 732 22288 0 21:07 pts/1 00:00:00 grep crs
[root@rac3 ~]# dd if=/dev/raw/raw2 of=votedisk.dd
208864+0 records in
208864+0 records out
[root@rac3 ~]# ls -lh votedisk.dd
-rw-r--r-- 1 root root 102M Jul 16 21:08 votedisk.dd
[root@rac3 ~]#

重启crs,模拟votedisk被破坏:
[root@rac3 ~]# crsctl start crs
Attempting to start CRS stack
The CRS stack will be started shortly
[root@rac3 ~]#

[root@rac4 ~]# crsctl start crs
Attempting to start CRS stack
The CRS stack will be started shortly
[root@rac4 ~]#

[root@rac3 ~]# crsctl check crs
CSS appears healthy
CRS appears healthy
EVM appears healthy
[root@rac3 ~]# crs_stat -t
Name Type Target State Host
------------------------------------------------------------
ora....SM1.asm application ONLINE ONLINE rac3
ora....C3.lsnr application ONLINE ONLINE rac3
ora.rac3.gsd application ONLINE ONLINE rac3
ora.rac3.ons application ONLINE ONLINE rac3
ora.rac3.vip application ONLINE ONLINE rac3
ora....SM2.asm application ONLINE ONLINE rac4
ora....C4.lsnr application ONLINE ONLINE rac4
ora.rac4.gsd application ONLINE ONLINE rac4
ora.rac4.ons application ONLINE ONLINE rac4
ora.rac4.vip application ONLINE ONLINE rac4
ora.racdb.db application ONLINE ONLINE rac4
ora....b1.inst application ONLINE ONLINE rac3
ora....b2.inst application ONLINE ONLINE rac4
[root@rac3 ~]#

[root@rac3 ~]# dd if=/dev/zero of=/dev/raw/raw2 bs=1M count=10
10+0 records in
10+0 records out
[root@rac3 ~]#

执行完以上dd命令模拟votedisk损坏后,两节点都立即重启。

重启后,发现crs起不来了:
[root@rac3 ~]# ps -ef | grep crs
root 29705 1 0 21:16 ? 00:00:00 /bin/su -l oracle -c sh -c 'ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac3/evmd; exec /opt/ora10g/product/10.2.0/crs_1/bin/evmd '
oracle 30529 29705 0 21:17 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/evmd.bin
root 31158 1 0 21:19 ? 00:00:00 /bin/sh /etc/init.d/init.crsd run
root 31470 31411 0 21:20 pts/1 00:00:00 grep crs
[root@rac3 ~]# crsctl check crs
Failure 1 contacting CSS daemon
Cannot communicate with CRS
Cannot communicate with EVM
[root@rac3 ~]#

依次检查os log, crs log, css log后发现votedisk损坏:
os log
[root@rac3 ~]# tail /var/log/messages
Jul 16 21:16:43 rac3 logger: Cluster Ready Services completed waiting on dependencies.
Jul 16 21:16:51 rac3 udevd[1486]: udev done!
Jul 16 21:17:42 rac3 su(pam_unix)[29705]: session opened for user oracle by (uid=0)
Jul 16 21:17:43 rac3 logger: Running CRSD with TZ =
Jul 16 21:17:45 rac3 su(pam_unix)[30661]: session opened for user oracle by (uid=0)
Jul 16 21:17:48 rac3 su(pam_unix)[30661]: session closed for user oracle
Jul 16 21:17:48 rac3 logger: Oracle CSS daemon failed to start up. Check CRS logs for diagnostics.
Jul 16 21:19:07 rac3 su(pam_unix)[31168]: session opened for user oracle by (uid=0)
Jul 16 21:19:07 rac3 su(pam_unix)[31168]: session closed for user oracle
Jul 16 21:19:53 rac3 sshd(pam_unix)[31247]: session opened for user root by (uid=0)
[root@rac3 ~]#

crs log
[root@rac3 rac3]# tail crsd/crsd.log

2013-07-16 21:19:06.650: [ COMMCRS][1084229984]clsc_connect: (0xb47d20) no listener at (ADDRESS=(PROTOCOL=ipc)(KEY=OCSSD_LL_rac3_crs))

2013-07-16 21:19:06.650: [ CSSCLNT][2541121088]clsssInitNative: connect failed, rc 9

2013-07-16 21:19:06.650: [ CRSRTI][2541121088]0CSS is not ready. Received status 3 from CSS. Waiting for good status ..

2013-07-16 21:19:07.652: [ CRSMAIN][2541121088][PANIC]0CRSD exiting: Could not init the CSS context

2013-07-16 21:19:07.654: [ default][2541121088]Terminating clsd session
[root@rac3 rac3]#

css log
[root@rac3 rac3]# tail cssd/ocssd.log
[ CSSD]2013-07-16 21:17:46.409 [2538118848] >TRACE: clssnmReadNodeInfo: added node 2 (rac4) to cluster
[ CSSD]2013-07-16 21:17:46.413 [1115699552] >TRACE: clssnm_skgxnmon: skgxn init failed, rc 1
[ CSSD]2013-07-16 21:17:46.413 [2538118848] >TRACE: clssnm_skgxnonline: Using vacuous skgxn monitor
[ CSSD]2013-07-16 21:17:46.414 [2538118848] >TRACE: clssnmInitNMInfo: misscount set to 60
[ CSSD]2013-07-16 21:17:46.418 [2538118848] >TRACE: clssnmDiskStateChange: state from 1 to 2 disk (0//dev/raw/raw2)
[ CSSD]2013-07-16 21:17:48.427 [1115699552] >TRACE: clssnmvDiskOpen: corrupt kill block on disk (0x00!=0x636c73536b696c4c)
[ CSSD]2013-07-16 21:17:48.427 [1115699552] >TRACE: clssnmDiskStateChange: state from 2 to 3 disk (0//dev/raw/raw2)
[ CSSD]2013-07-16 21:17:48.471 [2538118848] >ERROR: clssnmvInit: Insufficient number of voting devices available (0 of 1)
[ CSSD]2013-07-16 21:17:48.471 [2538118848] >ERROR: clssnmInitNMInfo: Failed to initialize voting device
[ CSSD]2013-07-16 21:17:48.471 [2538118848] >ERROR: clssscmain: clssnmNMInitialize failed
[root@rac3 rac3]#

各节点停止crs,恢复votedisk:
[root@rac3 rac3]# crsctl stop crs
Stopping resources.
Error while stopping resources. Possible cause: CRSD is down.
Stopping CSSD.
Unable to communicate with the CSS daemon.
[root@rac3 rac3]# ps -ef | grep crs
root 669 31411 0 21:27 pts/1 00:00:00 grep crs
root 31158 1 0 21:19 ? 00:00:00 /bin/sh /etc/init.d/init.crsd run
[root@rac3 rac3]#

[root@rac4 ~]# crsctl stop crs
Stopping resources.
Error while stopping resources. Possible cause: CRSD is down.
Stopping CSSD.
Unable to communicate with the CSS daemon.
[root@rac4 ~]# ps -ef| grep crs
root 766 31491 0 21:26 pts/1 00:00:00 grep crs
root 31236 1 0 21:18 ? 00:00:00 /bin/sh /etc/init.d/init.crsd run
[root@rac4 ~]#

[root@rac3 ~]# dd if=votedisk.dd of=/dev/raw/raw2
208864+0 records in
208864+0 records out
[root@rac3 ~]#

各节点启动crs:
[root@rac3 ~]# crsctl start crs
Attempting to start CRS stack
The CRS stack will be started shortly
[root@rac3 ~]#

[root@rac4 ~]# crsctl start crs
Attempting to start CRS stack
The CRS stack will be started shortly
[root@rac4 ~]#

等待2分钟后,发现各节点crs成功启动:
[root@rac3 ~]# ps -ef | grep crs
root 1472 1407 0 21:30 ? 00:00:00 /bin/su -l oracle -c /bin/sh -c 'ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac3/cssd; /opt/ora10g/product/10.2.0/crs_1/bin/ocssd || exit $?'
oracle 1473 1472 0 21:30 ? 00:00:00 /bin/sh -c ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac3/cssd; /opt/ora10g/product/10.2.0/crs_1/bin/ocssd || exit $?
oracle 1508 1473 0 21:30 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/ocssd.bin
root 2087 31411 0 21:31 pts/1 00:00:00 grep crs
root 31158 1 0 21:19 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/crsd.bin reboot
[root@rac3 ~]# crsctl check crs
CSS appears healthy
CRS appears healthy
EVM appears healthy
[root@rac3 ~]#

[root@rac4 ~]# ps -ef | grep crs
root 1529 1463 0 21:30 ? 00:00:00 /bin/su -l oracle -c /bin/sh -c 'ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac4/cssd; /opt/ora10g/product/10.2.0/crs_1/bin/ocssd || exit $?'
oracle 1530 1529 0 21:30 ? 00:00:00 /bin/sh -c ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac4/cssd; /opt/ora10g/product/10.2.0/crs_1/bin/ocssd || exit $?
oracle 1564 1530 0 21:30 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/ocssd.bin
oracle 2244 32593 5 21:30 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/evmd.bin
oracle 2331 2244 2 21:30 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/evmlogger.bin -o /opt/ora10g/product/10.2.0/crs_1/evm/log/evmlogger.info -l /opt/ora10g/product/10.2.0/crs_1/evm/log/evmlogger.log
root 2370 31491 0 21:30 pts/1 00:00:00 grep crs
root 31236 1 0 21:18 ? 00:00:00 /opt/ora10g/product/10.2.0/crs_1/bin/crsd.bin reboot
root 32593 1 0 21:23 ? 00:00:00 /bin/su -l oracle -c sh -c 'ulimit -c unlimited; cd /opt/ora10g/product/10.2.0/crs_1/log/rac4/evmd; exec /opt/ora10g/product/10.2.0/crs_1/bin/evmd '
[root@rac4 ~]# crsctl check crs
CSS appears healthy
CRS appears healthy
EVM appears healthy
[root@rac4 ~]#

--EOF--
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: