ext3/ext4 superblock recovery

Create the ext4 filesystem

[root@localhost ~]# mkfs.ext4 /dev/sdb1
mke2fs 1.42.9 (28-Dec-2013)
Filesystem label=
OS type: Linux
Block size=4096 (log=2)
Fragment size=4096 (log=2)
Stride=0 blocks, Stripe width=0 blocks
1310720 inodes, 5242624 blocks
262131 blocks (5.00%) reserved for the super user
First data block=0
Maximum filesystem blocks=2153775104
160 block groups
32768 blocks per group, 32768 fragments per group
8192 inodes per group
Superblock backups stored on blocks: 
        32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632, 2654208, 
        4096000

Allocating group tables: done                            
Writing inode tables: done                            
Creating journal (32768 blocks): done
Writing superblocks and filesystem accounting information: done   

[root@localhost ~]# mkdir /sdb
[root@localhost ~]# mount /dev/sdb1 /sdb
[root@localhost ~]# df -h
Filesystem           Size  Used Avail Use% Mounted on
/dev/mapper/ol-root   36G  4.0G   32G  12% /
devtmpfs             1.8G     0  1.8G   0% /dev
tmpfs                1.8G     0  1.8G   0% /dev/shm
tmpfs                1.8G  8.9M  1.8G   1% /run
tmpfs                1.8G     0  1.8G   0% /sys/fs/cgroup
/dev/sda1            497M  195M  303M  40% /boot
tmpfs                369M     0  369M   0% /run/user/0
/dev/sdb1             20G   45M   19G   1% /sdb

Prepare test data

[root@localhost sdb]# cd /etc/sysctl.d/
[root@localhost sysctl.d]# ls
99-sysctl.conf
[root@localhost sysctl.d]# cp 99-sysctl.conf /sdb
[root@localhost sysctl.d]# more 99-sysctl.conf 
# System default settings live in /usr/lib/sysctl.d/00-system.conf.
# To override those settings, enter new settings here, or in an /etc/sysctl.d/<name>.conf file
#
# For more information, see sysctl.conf(5) and sysctl.d(5).
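
To make the post-repair verification exact rather than visual, it may help to record a checksum of the test file up front; a minimal sketch (the checksum file path is illustrative):

# Record a checksum so the file can be verified byte-for-byte
# after the filesystem is repaired.
md5sum /sdb/99-sysctl.conf | tee /root/sdb-test.md5

# Later, after the repair and remount:
md5sum -c /root/sdb-test.md5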

Corrupt the ext4 filesystem

[root@localhost ~]#  dd if=/dev/zero of=/dev/sdb1 bs=1024 count=5
5+0 records in
5+0 records out
5120 bytes (5.1 kB) copied, 0.00270838 s, 1.9 MB/s
[root@localhost ~]# mount /dev/sdb1 /sdb
mount: unknown filesystem type '(null)'
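
If the mkfs output above is no longer at hand, the backup superblock locations can be rediscovered without writing to the disk; a sketch, assuming e2fsprogs is installed and the same mkfs options as the original run:

# Dry run: prints the layout (including backup superblock locations)
# that mkfs would use, but writes nothing to the device.
mke2fs -n /dev/sdb1

# dumpe2fs can also list them if enough metadata survives.
dumpe2fs /dev/sdb1 | grep -i superblock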

Kernel log errors

[ 8868.362628] sd 32:0:1:0: [sdb] Cache data unavailable
[ 8868.362632] sd 32:0:1:0: [sdb] Assuming drive cache: write through
[ 8868.363714]  sdb: sdb1
[ 8868.390297] sd 32:0:1:0: [sdb] Cache data unavailable
[ 8868.390301] sd 32:0:1:0: [sdb] Assuming drive cache: write through
[ 8868.391462]  sdb: sdb1
[ 8900.130143] EXT4-fs (sdb1): mounted filesystem with ordered data mode. Opts: (null)
[ 8900.130163] SELinux: initialized (dev sdb1, type ext4), uses xattr
[ 8902.803966] sdb1: WRITE SAME failed. Manually zeroing.
Repair with fsck
[root@localhost ~]# fsck -t ext4 /dev/sdb1
fsck from util-linux 2.23.2
e2fsck 1.42.9 (28-Dec-2013)
ext2fs_open2: Bad magic number in super-block
fsck.ext4: Superblock invalid, trying backup blocks...
/dev/sdb1 was not cleanly unmounted, check forced.
Pass 1: Checking inodes, blocks, and sizes
Pass 2: Checking directory structure
Pass 3: Checking directory connectivity
Pass 4: Checking reference counts
Pass 5: Checking group summary information
Free blocks count wrong for group #1 (31740, counted=31739).
Fix<y>? yes
Free blocks count wrong (5116302, counted=5116301).
Fix<y>? yes
Free inodes count wrong for group #0 (8181, counted=8180).
Fix<y>? yes
Free inodes count wrong (1310709, counted=1310708).
Fix<y>? yes

/dev/sdb1: ***** FILE SYSTEM WAS MODIFIED *****
/dev/sdb1: 12/1310720 files (0.0% non-contiguous), 126323/5242624 blocks
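
Here fsck found a backup superblock on its own; e2fsck can also be pointed at a specific backup copy. A sketch, using the first backup listed by mkfs above (block 32768, with the 4096-byte block size):

# Repair using an explicit backup superblock; -B gives the block size
# so the backup block number is interpreted correctly.
e2fsck -b 32768 -B 4096 /dev/sdb1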

Verify the repair

[root@localhost ~]# 
[root@localhost ~]# 
[root@localhost ~]# mount /dev/sdb1 /sdb
[root@localhost ~]# df -h
Filesystem           Size  Used Avail Use% Mounted on
/dev/mapper/ol-root   36G  4.0G   32G  12% /
devtmpfs             1.8G     0  1.8G   0% /dev
tmpfs                1.8G     0  1.8G   0% /dev/shm
tmpfs                1.8G  8.9M  1.8G   1% /run
tmpfs                1.8G     0  1.8G   0% /sys/fs/cgroup
/dev/sda1            497M  195M  303M  40% /boot
tmpfs                369M     0  369M   0% /run/user/0
/dev/sdb1             20G   45M   19G   1% /sdb
[root@localhost ~]# cd /sdb
[root@localhost sdb]# ls
99-sysctl.conf  lost+found
[root@localhost sdb]# more 99-sysctl.conf 
# System default settings live in /usr/lib/sysctl.d/00-system.conf.
# To override those settings, enter new settings here, or in an /etc/sysctl.d/<name>.conf file
#
# For more information, see sysctl.conf(5) and sysctl.d(5).

Repeat the test on ext3

[root@localhost ~]# mkfs.ext3 /dev/sdb1
mke2fs 1.42.9 (28-Dec-2013)
Filesystem label=
OS type: Linux
Block size=4096 (log=2)
Fragment size=4096 (log=2)
Stride=0 blocks, Stripe width=0 blocks
1310720 inodes, 5242624 blocks
262131 blocks (5.00%) reserved for the super user
First data block=0
Maximum filesystem blocks=4294967296
160 block groups
32768 blocks per group, 32768 fragments per group
8192 inodes per group
Superblock backups stored on blocks: 
        32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632, 2654208, 
        4096000

Allocating group tables: done                            
Writing inode tables: done                            
Creating journal (32768 blocks): done
Writing superblocks and filesystem accounting information: done   

[root@localhost ~]# mount /dev/sdb1 /sdb
[root@localhost ~]# df -h
Filesystem           Size  Used Avail Use% Mounted on
/dev/mapper/ol-root   36G  4.0G   32G  12% /
devtmpfs             1.8G     0  1.8G   0% /dev
tmpfs                1.8G     0  1.8G   0% /dev/shm
tmpfs                1.8G  8.9M  1.8G   1% /run
tmpfs                1.8G     0  1.8G   0% /sys/fs/cgroup
/dev/sda1            497M  195M  303M  40% /boot
tmpfs                369M     0  369M   0% /run/user/0
/dev/sdb1             20G   45M   19G   1% /sdb
[root@localhost ~]# dd if=/dev/zero of=/dev/sdb1 bs=1024 count=5
5+0 records in
5+0 records out
5120 bytes (5.1 kB) copied, 0.0138915 s, 369 kB/s
[root@localhost ~]# fsck -t ext3 /dev/sdb1
fsck from util-linux 2.23.2
e2fsck 1.42.9 (28-Dec-2013)
ext2fs_open2: Bad magic number in super-block
fsck.ext3: Superblock invalid, trying backup blocks...
/dev/sdb1 was not cleanly unmounted, check forced.
Pass 1: Checking inodes, blocks, and sizes
Pass 2: Checking directory structure
Pass 3: Checking directory connectivity
Pass 4: Checking reference counts
Pass 5: Checking group summary information

/dev/sdb1: ***** FILE SYSTEM WAS MODIFIED *****
/dev/sdb1: 11/1310720 files (0.0% non-contiguous), 126322/5242624 blocks
[root@localhost ~]# mount /dev/sdb1 /sdb
[root@localhost ~]# df -h
Filesystem           Size  Used Avail Use% Mounted on
/dev/mapper/ol-root   36G  4.0G   32G  12% /
devtmpfs             1.8G     0  1.8G   0% /dev
tmpfs                1.8G     0  1.8G   0% /dev/shm
tmpfs                1.8G  8.9M  1.8G   1% /run
tmpfs                1.8G     0  1.8G   0% /sys/fs/cgroup
/dev/sda1            497M  195M  303M  40% /boot
tmpfs                369M     0  369M   0% /run/user/0
/dev/sdb1             20G   45M   19G   1% /sdb

An fsck repair carries real risk: it can destroy part or all of the data on the partition. Back up the partition first (for example with dd, as sketched below) before attempting a repair.
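
A minimal sketch of such a pre-repair backup (the image path is illustrative):

# Image the whole partition before letting fsck modify it;
# noerror keeps going past read errors, sync pads short reads.
dd if=/dev/sdb1 of=/backup/sdb1.img bs=1M conv=sync,noerror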

File #N is offline, but is part of an online tablespace

While reading a customer's database recovery log I noticed a warning of the form "File #N is offline, but is part of an online tablespace." Having never paid attention to it before, I reproduce it here with an experiment.
Create the tablespace

SQL> create tablespace readonly datafile '/home/oracle/.oradata/test/readonly01.dbf' size 128M;

Tablespace created.

SQL> alter tablespace  readonly add datafile '/home/oracle/.oradata/test/readonly02.dbf' size 128M;

Tablespace altered.

Load test data

SQL> create table t_readonly tablespace readonly as select * from dba_objects;

Table created.

Make the tablespace read only

SQL> select count(*) from t_readonly;

  COUNT(*)
----------
     72226

SQL> alter system switch logfile;

System altered.

SQL> /

System altered.

SQL> /

System altered.

SQL>  alter tablespace readonly  read only;

Tablespace altered.

SQL> alter system switch logfile;

System altered.

SQL> /

System altered.

SQL> /

System altered.

SQL> /

System altered.

Back up the database

[oracle@localhost ~]$ rman target /

Recovery Manager: Release 11.2.0.1.0 - Production on Tue Nov 1 21:15:51 2016

Copyright (c) 1982, 2009, Oracle and/or its affiliates.  All rights reserved.

connected to target database: TEST (DBID=2210907828)

RMAN> backup database format '/home/oracle/full_%U.rman';

Starting backup at 01-NOV-16
using target database control file instead of recovery catalog
allocated channel: ORA_DISK_1
channel ORA_DISK_1: SID=197 device type=DISK
channel ORA_DISK_1: starting full datafile backup set
channel ORA_DISK_1: specifying datafile(s) in backup set
input datafile file number=00001 name=/home/oracle/.oradata/test/system01.dbf
input datafile file number=00002 name=/home/oracle/.oradata/test/sysaux01.dbf
input datafile file number=00005 name=/home/oracle/.oradata/test/test01.dbf
input datafile file number=00006 name=/home/oracle/.oradata/test/test02.dbf
input datafile file number=00007 name=/home/oracle/.oradata/test/readonly01.dbf
input datafile file number=00008 name=/home/oracle/.oradata/test/readonly02.dbf
input datafile file number=00003 name=/home/oracle/.oradata/test/undotbs01.dbf
input datafile file number=00004 name=/home/oracle/.oradata/test/users01.dbf
channel ORA_DISK_1: starting piece 1 at 01-NOV-16


channel ORA_DISK_1: finished piece 1 at 01-NOV-16
piece handle=/home/oracle/full_03rjrp0t_1_1.rman tag=TAG20161101T211613 comment=NONE
channel ORA_DISK_1: backup set complete, elapsed time: 00:00:25
channel ORA_DISK_1: starting full datafile backup set
channel ORA_DISK_1: specifying datafile(s) in backup set
including current control file in backup set
including current SPFILE in backup set
channel ORA_DISK_1: starting piece 1 at 01-NOV-16
channel ORA_DISK_1: finished piece 1 at 01-NOV-16
piece handle=/home/oracle/full_04rjrp1m_1_1.rman tag=TAG20161101T211613 comment=NONE
channel ORA_DISK_1: backup set complete, elapsed time: 00:00:01
Finished backup at 01-NOV-16

RMAN> sql 'alter system archive log current';
backup  as compressed backupset archivelog  all format '/home/oracle/arch_%T_%U.rman'  delete input;

sql statement: alter system archive log current

RMAN> 
Starting backup at 01-NOV-16
current log archived
using channel ORA_DISK_1
channel ORA_DISK_1: starting compressed archived log backup set
channel ORA_DISK_1: specifying archived log(s) in backup set
input archived log thread=1 sequence=494 RECID=1 STAMP=926802386
input archived log thread=1 sequence=495 RECID=2 STAMP=926802386
input archived log thread=1 sequence=496 RECID=3 STAMP=926802389
input archived log thread=1 sequence=497 RECID=4 STAMP=926802693
input archived log thread=1 sequence=498 RECID=5 STAMP=926802693
input archived log thread=1 sequence=499 RECID=6 STAMP=926802696
input archived log thread=1 sequence=500 RECID=7 STAMP=926802787
input archived log thread=1 sequence=501 RECID=8 STAMP=926802789
input archived log thread=1 sequence=502 RECID=9 STAMP=926802792
input archived log thread=1 sequence=503 RECID=10 STAMP=926802793
input archived log thread=1 sequence=504 RECID=11 STAMP=926802812
input archived log thread=1 sequence=505 RECID=12 STAMP=926802813
input archived log thread=1 sequence=506 RECID=13 STAMP=926802816
input archived log thread=1 sequence=507 RECID=14 STAMP=926803076
input archived log thread=1 sequence=508 RECID=15 STAMP=926803077
channel ORA_DISK_1: starting piece 1 at 01-NOV-16
channel ORA_DISK_1: finished piece 1 at 01-NOV-16
piece handle=/home/oracle/arch_20161101_05rjrp45_1_1.rman tag=TAG20161101T211757 comment=NONE
channel ORA_DISK_1: backup set complete, elapsed time: 00:00:03
channel ORA_DISK_1: deleting archived log(s)
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_494_d1k4tkot_.arc RECID=1 STAMP=926802386
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_495_d1k4tln7_.arc RECID=2 STAMP=926802386
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_496_d1k4tot5_.arc RECID=3 STAMP=926802389
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_497_d1k544w3_.arc RECID=4 STAMP=926802693
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_498_d1k545wc_.arc RECID=5 STAMP=926802693
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_499_d1k548bm_.arc RECID=6 STAMP=926802696
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_500_d1k5734v_.arc RECID=7 STAMP=926802787
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_501_d1k5752s_.arc RECID=8 STAMP=926802789
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_502_d1k578c2_.arc RECID=9 STAMP=926802792
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_503_d1k579hy_.arc RECID=10 STAMP=926802793
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_504_d1k57w6s_.arc RECID=11 STAMP=926802812
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_505_d1k57xj1_.arc RECID=12 STAMP=926802813
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_506_d1k580hj_.arc RECID=13 STAMP=926802816
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_507_d1k5j4q0_.arc RECID=14 STAMP=926803076
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_508_d1k5j4yq_.arc RECID=15 STAMP=926803077
Finished backup at 01-NOV-16

RMAN> backup  format '/home/oracle/ctl_%T_%U.rman' current controlfile;

Starting backup at 01-NOV-16
using channel ORA_DISK_1
channel ORA_DISK_1: starting full datafile backup set
channel ORA_DISK_1: specifying datafile(s) in backup set
including current control file in backup set
channel ORA_DISK_1: starting piece 1 at 01-NOV-16
channel ORA_DISK_1: finished piece 1 at 01-NOV-16
piece handle=/home/oracle/ctl_20161101_06rjrp75_1_1.rman tag=TAG20161101T211933 comment=NONE
channel ORA_DISK_1: backup set complete, elapsed time: 00:00:01
Finished backup at 01-NOV-16

Clean up the environment

RMAN> shutdown immediate;

database closed
database dismounted
Oracle instance shut down

RMAN> exit


Recovery Manager complete.

[oracle@localhost .oradata]$ mv test  test_20161101
[oracle@localhost .oradata]$ mkdir test

Restore the database

[oracle@localhost ~]$ sqlplus / as sysdba

SQL*Plus: Release 11.2.0.1.0 Production on Tue Nov 1 21:21:09 2016

Copyright (c) 1982, 2009, Oracle.  All rights reserved.

Connected to an idle instance.

SQL> startup nomount;
ORACLE instance started.

Total System Global Area 2421825536 bytes
Fixed Size                  2215744 bytes
Variable Size            1795162304 bytes
Database Buffers          603979776 bytes
Redo Buffers               20467712 bytes
SQL> exit
Disconnected from Oracle Database 11g Enterprise Edition Release 11.2.0.1.0 - 64bit Production
With the Partitioning, OLAP, Data Mining and Real Application Testing options
[oracle@localhost ~]$ rman target /

Recovery Manager: Release 11.2.0.1.0 - Production on Tue Nov 1 21:21:22 2016

Copyright (c) 1982, 2009, Oracle and/or its affiliates.  All rights reserved.

connected to target database: TEST (not mounted)

RMAN> restore controlfile from '/home/oracle/ctl_20161101_06rjrp75_1_1.rman';

Starting restore at 01-NOV-16
using target database control file instead of recovery catalog
allocated channel: ORA_DISK_1
channel ORA_DISK_1: SID=63 device type=DISK

channel ORA_DISK_1: restoring control file
channel ORA_DISK_1: restore complete, elapsed time: 00:00:01
output file name=/home/oracle/.oradata/test/control01.ctl
Finished restore at 01-NOV-16

RMAN> alter database mount;

database mounted
released channel: ORA_DISK_1

RMAN> restore database;

Starting restore at 01-NOV-16
Starting implicit crosscheck backup at 01-NOV-16
allocated channel: ORA_DISK_1
channel ORA_DISK_1: SID=63 device type=DISK
Crosschecked 3 objects
Finished implicit crosscheck backup at 01-NOV-16

Starting implicit crosscheck copy at 01-NOV-16
using channel ORA_DISK_1
Finished implicit crosscheck copy at 01-NOV-16

searching for all files in the recovery area
cataloging files...
no files cataloged

using channel ORA_DISK_1

datafile 5 not processed because file is offline
datafile 6 not processed because file is offline
channel ORA_DISK_1: starting datafile backup set restore
channel ORA_DISK_1: specifying datafile(s) to restore from backup set
channel ORA_DISK_1: restoring datafile 00001 to /home/oracle/.oradata/test/system01.dbf
channel ORA_DISK_1: restoring datafile 00002 to /home/oracle/.oradata/test/sysaux01.dbf
channel ORA_DISK_1: restoring datafile 00003 to /home/oracle/.oradata/test/undotbs01.dbf
channel ORA_DISK_1: restoring datafile 00004 to /home/oracle/.oradata/test/users01.dbf
channel ORA_DISK_1: restoring datafile 00007 to /home/oracle/.oradata/test/readonly01.dbf
channel ORA_DISK_1: restoring datafile 00008 to /home/oracle/.oradata/test/readonly02.dbf
channel ORA_DISK_1: reading from backup piece /home/oracle/full_03rjrp0t_1_1.rman
channel ORA_DISK_1: piece handle=/home/oracle/full_03rjrp0t_1_1.rman tag=TAG20161101T211613
channel ORA_DISK_1: restored backup piece 1
channel ORA_DISK_1: restore complete, elapsed time: 00:00:15
Finished restore at 01-NOV-16

RMAN> exit


Recovery Manager complete.

Check the database files

[screenshot read_only1: datafile status query showing files 7 and 8 as read only]

The Oracle Database Recovery Check output makes it obvious that these files are in read-only status.
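
The same status can be confirmed from SQL*Plus; a sketch (the file numbers come from the restore output above):

sqlplus -S / as sysdba <<'EOF'
-- For a read-only tablespace, v$datafile shows ENABLED = 'READ ONLY'
-- even though the files belong to an online tablespace.
select file#, status, enabled, name from v$datafile where file# in (7, 8);
select file#, status, fuzzy from v$datafile_header where file# in (7, 8);
EOF
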
recover database

[oracle@localhost ~]$ rman target /

Recovery Manager: Release 11.2.0.1.0 - Production on Tue Nov 1 21:28:14 2016

Copyright (c) 1982, 2009, Oracle and/or its affiliates.  All rights reserved.

connected to target database: TEST (DBID=2210907828, not open)

RMAN> recover database;

Starting recover at 01-NOV-16
using target database control file instead of recovery catalog
allocated channel: ORA_DISK_1
channel ORA_DISK_1: SID=63 device type=DISK
datafile 7 not processed because file is read-only    <<<<===== note
datafile 8 not processed because file is read-only    <<<<===== note

starting media recovery

channel ORA_DISK_1: starting archived log restore to default destination
channel ORA_DISK_1: restoring archived log
archived log thread=1 sequence=507
channel ORA_DISK_1: restoring archived log
archived log thread=1 sequence=508
channel ORA_DISK_1: reading from backup piece /home/oracle/arch_20161101_05rjrp45_1_1.rman
channel ORA_DISK_1: piece handle=/home/oracle/arch_20161101_05rjrp45_1_1.rman tag=TAG20161101T211757
channel ORA_DISK_1: restored backup piece 1
channel ORA_DISK_1: restore complete, elapsed time: 00:00:01
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_507_d1k63pj8_.arc thread=1 sequence=507
channel default: deleting archived log(s)
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_507_d1k63pj8_.arc RECID=16 STAMP=926803702
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_508_d1k63pww_.arc thread=1 sequence=508
channel default: deleting archived log(s)
archived log file name=/opt/oracle/flash_recovery_area/TEST/archivelog/2016_11_01/o1_mf_1_508_d1k63pww_.arc RECID=17 STAMP=926803702
unable to find archived log
archived log thread=1 sequence=509
RMAN-00571: ===========================================================
RMAN-00569: =============== ERROR MESSAGE STACK FOLLOWS ===============
RMAN-00571: ===========================================================
RMAN-03002: failure of recover command at 11/01/2016 21:28:24
RMAN-06054: media recovery requesting unknown archived log for thread 1 with sequence 509 and starting SCN of 8933048

RMAN> exit
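
The missing sequence 509 was the current online log, which was removed together with the datafiles, so recovery can only be incomplete. This can be confirmed from the mounted instance; a sketch:

sqlplus -S / as sysdba <<'EOF'
-- v$log is readable in MOUNT state; the CURRENT group carries the
-- sequence (509 here) that media recovery is asking for.
select group#, sequence#, status, archived from v$log;
EOF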

Open the database with resetlogs

SQL> set numw 16
SQL> SELECT status,
  2  checkpoint_change#,
  3  checkpoint_time, last_change#,
  4  count(*) ROW_NUM
  5  FROM v$datafile
  6  GROUP BY status, checkpoint_change#, checkpoint_time, last_change#
  7  ORDER BY status, checkpoint_change#, checkpoint_time;

STATUS  CHECKPOINT_CHANGE# CHECKPOIN     LAST_CHANGE#          ROW_NUM
------- ------------------ --------- ---------------- ----------------
ONLINE             8932792 01-NOV-16                                 2
ONLINE             8933048 01-NOV-16                                 3
SYSTEM             8933048 01-NOV-16                                 1

SQL> SELECT status,
  2  checkpoint_change#,
  3  checkpoint_time, FUZZY,
  4  count(*) ROW_NUM
  5  FROM v$datafile_header
  6  GROUP BY status, checkpoint_change#, checkpoint_time, fuzzy
  7  ORDER BY status, checkpoint_change#, checkpoint_time;

STATUS  CHECKPOINT_CHANGE# CHECKPOIN FUZ          ROW_NUM
------- ------------------ --------- --- ----------------
ONLINE             8932792 01-NOV-16 NO                 2
ONLINE             8933048 01-NOV-16 NO                 4

SQL> alter database open resetlogs;

Database altered.

Alert log output

Tue Nov 01 21:29:56 2016
alter database open resetlogs
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 1 of thread 1
ORA-00312: online log 1 thread 1: '/home/oracle/.oradata/test/redo01.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 1 of thread 1
ORA-00312: online log 1 thread 1: '/home/oracle/.oradata/test/redo01.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 2 of thread 1
ORA-00312: online log 2 thread 1: '/home/oracle/.oradata/test/redo02.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 2 of thread 1
ORA-00312: online log 2 thread 1: '/home/oracle/.oradata/test/redo02.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 3 of thread 1
ORA-00312: online log 3 thread 1: '/home/oracle/.oradata/test/redo03.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 3 of thread 1
ORA-00312: online log 3 thread 1: '/home/oracle/.oradata/test/redo03.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
RESETLOGS after incomplete recovery UNTIL CHANGE 8933048
Resetting resetlogs activation ID 2210869684 (0x83c731b4)
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 1 of thread 1
ORA-00312: online log 1 thread 1: '/home/oracle/.oradata/test/redo01.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 1 of thread 1
ORA-00312: online log 1 thread 1: '/home/oracle/.oradata/test/redo01.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 2 of thread 1
ORA-00312: online log 2 thread 1: '/home/oracle/.oradata/test/redo02.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 2 of thread 1
ORA-00312: online log 2 thread 1: '/home/oracle/.oradata/test/redo02.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 3 of thread 1
ORA-00312: online log 3 thread 1: '/home/oracle/.oradata/test/redo03.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_ora_14479.trc:
ORA-00313: open failed for members of log group 3 of thread 1
ORA-00312: online log 3 thread 1: '/home/oracle/.oradata/test/redo03.log'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Tue Nov 01 21:29:58 2016
Setting recovery target incarnation to 3
Tue Nov 01 21:29:58 2016
Assigning activation ID 2224900353 (0x849d4901)
LGWR: STARTING ARCH PROCESSES
Tue Nov 01 21:29:58 2016
ARC0 started with pid=20, OS id=14486
ARC0: Archival started
LGWR: STARTING ARCH PROCESSES COMPLETE
ARC0: STARTING ARCH PROCESSES
Tue Nov 01 21:29:59 2016
ARC1 started with pid=21, OS id=14488
Tue Nov 01 21:29:59 2016
ARC2 started with pid=22, OS id=14490
ARC1: Archival started
Tue Nov 01 21:29:59 2016
ARC3 started with pid=23, OS id=14492
ARC2: Archival started
ARC1: Becoming the 'no FAL' ARCH
ARC1: Becoming the 'no SRL' ARCH
ARC2: Becoming the heartbeat ARCH
Thread 1 opened at log sequence 1
  Current log# 1 seq# 1 mem# 0: /home/oracle/.oradata/test/redo01.log
Successful open of redo thread 1
MTTR advisory is disabled because FAST_START_MTTR_TARGET is not set
Tue Nov 01 21:29:59 2016
SMON: enabling cache recovery
Successfully onlined Undo Tablespace 2.
Dictionary check beginning
File #7 is offline, but is part of an online tablespace.
data file 7: '/home/oracle/.oradata/test/readonly01.dbf'
Successfuly brought file #7 online.
File #8 is offline, but is part of an online tablespace.
data file 8: '/home/oracle/.oradata/test/readonly02.dbf'
Successfuly brought file #8 online.
Tue Nov 01 21:29:59 2016
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_dbw0_14226.trc:
ORA-01157: cannot identify/lock data file 201 - see DBWR trace file
ORA-01110: data file 201: '/home/oracle/.oradata/test/temp01.dbf'
ORA-27037: unable to obtain file status
Linux-x86_64 Error: 2: No such file or directory
Additional information: 3
Errors in file /opt/oracle/diag/rdbms/test/test/trace/test_dbw0_14226.trc:
ORA-01186: file 201 failed verification tests
ORA-01157: cannot identify/lock data file 201 - see DBWR trace file
ORA-01110: data file 201: '/home/oracle/.oradata/test/temp01.dbf'
File 201 not verified due to error ORA-01157
Dictionary check complete
Verifying file header compatibility for 11g tablespace encryption..
Verifying 11g file header compatibility for tablespace encryption completed
SMON: enabling tx recovery
Re-creating tempfile /home/oracle/.oradata/test/temp01.dbf
Database Characterset is AL32UTF8
No Resource Manager plan active
replication_dependency_tracking turned off (no async multimaster replication found)
Starting background process QMNC
Tue Nov 01 21:30:00 2016
QMNC started with pid=24, OS id=14494
LOGSTDBY: Validating controlfile with logical metadata
LOGSTDBY: Validation complete
ARC3: Archival started
ARC0: STARTING ARCH PROCESSES COMPLETE
Completed: alter database open resetlogs

Here we see exactly the warning we were looking for:

File #7 is offline, but is part of an online tablespace.
data file 7: '/home/oracle/.oradata/test/readonly01.dbf'
Successfuly brought file #7 online.
File #8 is offline, but is part of an online tablespace.
data file 8: '/home/oracle/.oradata/test/readonly02.dbf'
Successfuly brought file #8 online.

Conclusion: if a tablespace was read only before the backup, an open resetlogs after restore produces messages of the form "File #N is offline, but is part of an online tablespace". The recovery as a whole does not affect the data in the read-only tablespace.
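
To watch for these messages while the database opens, the alert log can be followed directly; a sketch (the alert log path is an assumption following the diag destination seen in the traces above):

tail -f /opt/oracle/diag/rdbms/test/test/trace/alert_test.log | \
  grep --line-buffered -E 'File #[0-9]+ is offline|brought file'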

ORA-600 kcrfr_update_nab_2

A storage controller going offline left the database failing at startup with ORA-600 [kcrfr_update_nab_2], so it could not be opened normally.
Database version

ORACLE V10.2.0.4.0 - 64bit Production vsnsta=0
vsnsql=14 vsnxtr=3
Oracle Database 10g Enterprise Edition Release 10.2.0.4.0 - 64bit Production
With the Partitioning, OLAP, Data Mining and Real Application Testing options
Windows Server 2003 Version V5.2 Service Pack 2
CPU                 : 12 - type 8664, 2 Physical Cores
Process Affinity    : 0x0000000000000000
Memory (Avail/Total): Ph:22579M/32754M, Ph+PgF:24594M/33845M

The ORA-600 [kcrfr_update_nab_2] error

Mon Oct 24 17:42:57 2016
Database mounted in Exclusive Mode
Completed: ALTER DATABASE   MOUNT
Mon Oct 24 17:42:58 2016
ALTER DATABASE OPEN
Mon Oct 24 17:43:14 2016
Beginning crash recovery of 1 threads
 parallel recovery started with 11 processes
Mon Oct 24 17:43:14 2016
Started redo scan
Mon Oct 24 17:43:16 2016
Errors in file d:\oracle\product\10.2.0\admin\spcsjkdb\udump\spcsjkdb_ora_10108.trc:
ORA-00600: internal error code, arguments: [kcrfr_update_nab_2], [0x7FFC22A2150], [2], [], [], [], [], []

Mon Oct 24 17:43:18 2016
Aborting crash recovery due to error 600
Mon Oct 24 17:43:18 2016
Errors in file d:\oracle\product\10.2.0\admin\spcsjkdb\udump\spcsjkdb_ora_10108.trc:
ORA-00600: internal error code, arguments: [kcrfr_update_nab_2], [0x7FFC22A2150], [2], [], [], [], [], []

ORA-600 signalled during: ALTER DATABASE OPEN...

Trace file details

*** 2016-10-24 17:43:14.515
*** ACTION NAME:() 2016-10-24 17:43:14.515
*** MODULE NAME:(sqlplus.exe) 2016-10-24 17:43:14.515
*** SERVICE NAME:() 2016-10-24 17:43:14.515
*** SESSION ID:(356.3) 2016-10-24 17:43:14.515
Successfully allocated 11 recovery slaves
Using 101 overflow buffers per recovery slave
Thread 1 checkpoint: logseq 33251, block 2, scn 14624215134369
  cache-low rba: logseq 33251, block 2463324
    on-disk rba: logseq 33251, block 2803965, scn 14624216078841
  start recovery at logseq 33251, block 2463324, scn 0
*** 2016-10-24 17:43:16.406
ksedmp: internal or fatal error
ORA-00600: internal error code, arguments: [kcrfr_update_nab_2], [0x7FFC22A2150], [2], [], [], [], [], []
Current SQL statement for this session:
ALTER DATABASE OPEN
----- Call Stack Trace -----
calling              call     entry                argument values in hex      
location             type     point                (? means dubious value)     
-------------------- -------- -------------------- ----------------------------
ksedmp+663           CALL???  ksedst+55            003C878B8 000000000 012B863E8
                                                   000000000
ksfdmp+19            CALL???  ksedmp+663           000000003 015572A70 007222698
                                                   003CACC80
kgerinv+158          CALL???  ksfdmp+19            015572430 000000000 0FFFFFFFF
                                                   000000000
kgeasnmierr+62       CALL???  kgerinv+158          000000000 000000000 000000000
                                                   004FD788F
kcrfr_update_nab+18  CALL???  kgeasnmierr+62       00BDA1170 000000000 000000000
6                                                  000000002
kcrfr_read+1078      CALL???  kcrfr_update_nab+18  007222698 00001E650 015572430
                              6                    0072229B8
kcrfrgv+8134         CALL???  kcrfr_read+1078      000000000 0051525D7 000000000
                                                   0051525D7
kcratr1+488          CALL???  kcrfrgv+8134         007222698 000000000 000000000
                                                   000000000
kcratr+412           CALL???  kcratr1+488          012B891C8 012B890A4 00727FFB8
                                                   00BEA7FF0
kctrec+1910          CALL???  kcratr+412           012B891C8 012B91E18 000000000
                                                   012B91E48
kcvcrv+3585          CALL???  kctrec+1910          012B92C58 000000000 00726DF00
                                                   00726BDB0
kcfopd+1007          CALL???  kcvcrv+3585          012B93350 000000000 000000000
                                                   000000000
adbdrv+55820         CALL???  kcfopd+1007          000000000 000000000 000000000
                                                   000000000
opiexe+13897         CALL???  adbdrv+55820         000000023 000000003 000000102
                                                   000000000
opiosq0+3558         CALL???  opiexe+13897         000000004 000000000 012B9B238
                                                   4155474E414C5F45
kpooprx+339          CALL???  opiosq0+3558         000000003 00000000E 012B9B3C8
                                                   0000000A4
kpoal8+894           CALL???  kpooprx+339          015587550 000000018 0041AE700
                                                   000000001
opiodr+1136          CALL???  kpoal8+894           00000005E 000000017 012B9E868
                                                   0072F5100
ttcpip+5146          CALL???  opiodr+1136          00000005E 000000017 012B9E868
                                                   2D8C00000000
opitsk+1818          CALL???  ttcpip+5146          015587550 000000000 000000000
                                                   000000000
opiino+1129          CALL???  opitsk+1818          00000001E 000000000 000000000
                                                   000000000
opiodr+1136          CALL???  opiino+1129          00000003C 000000004 012B9FB20
                                                   000000000
opidrv+815           CALL???  opiodr+1136          00000003C 000000004 012B9FB20
                                                   000000000
sou2o+52             CALL???  opidrv+815           00000003C 000000004 012B9FB20
                                                   7FF7FC48580
opimai_real+131      CALL???  sou2o+52             000000000 012B9FC40
                                                   7FFFFF7F258 077EF4D1C
opimai+96            CALL???  opimai_real+131      7FF7FC48580 7FFFFF7E000
                                                   0001F0003 000000000
OracleThreadStart+6  CALL???  opimai+96            012B9FEF0 01289FF3C 012B9FCC0
40                                                 7FF7FC48580
0000000077D6B6DA     CALL???  OracleThreadStart+6  01289FF3C 000000000 000000000
                              40                   012B9FFA8

Oracle's official description
The assert ORA-600: [kcrfr_update_nab_2] is a direct result of a lost write in the current online log that we are attempting to resolve. So this confirms the theory that this is an OS/hardware lost-write issue, not an internal Oracle bug. In fact, the assert ORA-600: [kcrfr_update_nab_2] is how we detect a lost log write.
Bug 5692594
Hdr: 5692594 10.2.0.1 RDBMS 10.2.0.1 RECOVERY PRODID-5 PORTID-226 ORA-600
Abstract: AFTER DATABASE CRASHED DOESN'T OPEN ORA-600 [KCRFR_UPDATE_NAB_2]
Status: 95,Closed, Vendor OS Problem

Bug 6655116
Hdr: 6655116 10.2.0.3 RDBMS 10.2.0.3 RECOVERY PRODID-5 PORTID-23
Abstract: INSTANCES CRASH WITH ORA-600 [KCRFR_UPDATE_NAB_2] AFTER DISK FAILURE
Given Oracle's description and the circumstances of the failure, we can be fairly confident that a hardware fault caused a lost write, which in turn triggered this Oracle assertion and left the database unable to open normally.

ORA-600 [kcrfr_update_nab_2] [a] [b]


VERSIONS:
versions 10.2 to 11.1

DESCRIPTION:

Failure of upgrade of recovery node (RN) enqueue to SSX mode

ARGUMENTS:
Arg [a] State Object for redo nab enqueue for resilvering
Arg [b] Redo nab enqueue mode

FUNCTIONALITY:
Kernel Cache Redo File Read

IMPACT:

INSTANCE FAILURE

Workarounds
1. If a backup exists, use it for incomplete recovery, skipping the damaged redo at the end, and open the database with resetlogs (see the sketch after this list).
2. If there is no backup, try incomplete recovery with a historical controlfile, or force the database open bypassing consistency checks.
3. One report on the internet solved it by deleting the second redo group's member, after which the database opened (http://blog.itpub.net/16976507/viewspace-1266952/).
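
A sketch of option 1 with RMAN; the until-sequence value is illustrative and must be the damaged sequence (recovery stops just before it):

rman target / <<'EOF'
startup force mount;
run {
  # Stop recovery before the corrupt online log (sequence 33251 in
  # the trace above); everything in that log is lost.
  set until sequence 33251 thread 1;
  restore database;
  recover database;
}
alter database open resetlogs;
EOF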

Thin provisioning causing an ASM diskgroup dismount

A friend was running ASM on one storage array and ran short of space, so he added a LUN from another array as a new ASM disk in the DATA diskgroup. After roughly a day the diskgroup abruptly dismounted and the database crashed; the diskgroup could no longer be mounted, and neither could several other diskgroups on that second array.
Database alert log

Sun Oct 23 08:43:59 2016
SUCCESS: diskgroup DATA was dismounted
SUCCESS: diskgroup DATA was dismounted
Sun Oct 23 08:44:00 2016
Errors in file /oracle/app/oracle/diag/rdbms/orcl/orcl1/trace/orcl1_lmon_79128.trc:
ORA-00202: control file: '+DATA/orcl/controlfile/current.278.892363163'
ORA-15078: ASM diskgroup was forcibly dismounted
Sun Oct 23 08:44:00 2016
Errors in file /oracle/app/oracle/diag/rdbms/orcl/orcl1/trace/orcl1_lgwr_79174.trc:
ORA-00345: redo log write error block 15924 count 2
ORA-00312: online log 2 thread 1: '+DATA/orcl/onlinelog/group_2.274.892363167'
ORA-15078: ASM diskgroup was forcibly dismounted
ORA-15078: ASM diskgroup was forcibly dismounted
Errors in file /oracle/app/oracle/diag/rdbms/orcl/orcl1/trace/orcl1_lgwr_79174.trc:
ORA-00202: control file: '+DATA/orcl/controlfile/current.278.892363163'
ORA-15078: ASM diskgroup was forcibly dismounted
Errors in file /oracle/app/oracle/diag/rdbms/orcl/orcl1/trace/orcl1_lgwr_79174.trc:
ORA-00204: error in reading (block 1, # blocks 1) of control file
ORA-00202: control file: '+DATA/orcl/controlfile/current.278.892363163'
ORA-15078: ASM diskgroup was forcibly dismounted
Sun Oct 23 08:44:00 2016
LGWR (ospid: 79174): terminating the instance due to error 204
Sun Oct 23 08:44:00 2016
opiodr aborting process unknown ospid (79742) as a result of ORA-1092
Sun Oct 23 08:44:01 2016
ORA-1092 : opitsk aborting process
Sun Oct 23 08:44:01 2016
ORA-1092 : opitsk aborting process
System state dump requested by (instance=1, osid=79174 (LGWR)), summary=[abnormal instance termination].
System State dumped to trace file /oracle/app/oracle/diag/rdbms/orcl/orcl1/trace/orcl1_diag_79118.trc
Instance terminated by LGWR, pid = 79174

Clearly the database crash was caused by the ASM diskgroup dismount, so the next step is to examine the ASM log.

ASM alert log

Sun Oct 23 07:00:31 2016
Time drift detected. Please check VKTM trace file for more details.
Sun Oct 23 08:43:55 2016
Errors in file /oracle/app/oracle/diag/asm/+asm/+ASM1/trace/+ASM1_arb0_8755.trc:
ORA-27061: waiting for async I/Os failed
Linux-x86_64 Error: 5: Input/output error
Additional information: -1
Additional information: 1048576
WARNING: Write Failed. group:1 disk:2 AU:1222738 offset:0 size:1048576
ERROR: failed to copy file +DATA.524, extent 15030
ERROR: ORA-15080 thrown in ARB0 for group number 1
Errors in file /oracle/app/oracle/diag/asm/+asm/+ASM1/trace/+ASM1_arb0_8755.trc:
ORA-15080: synchronous I/O operation to a disk failed
Sun Oct 23 08:43:55 2016
NOTE: stopping process ARB0
NOTE: rebalance interrupted for group 1/0xec689cdd (DATA)
NOTE: ASM did background COD recovery for group 1/0xec689cdd (DATA)
NOTE: starting rebalance of group 1/0xec689cdd (DATA) at power 1
Starting background process ARB0
Sun Oct 23 08:43:56 2016
ARB0 started with pid=24, OS id=103554 
NOTE: assigning ARB0 to group 1/0xec689cdd (DATA) with 1 parallel I/O
Errors in file /oracle/app/oracle/diag/asm/+asm/+ASM1/trace/+ASM1_arb0_103554.trc:
ORA-27061: waiting for async I/Os failed
Linux-x86_64 Error: 5: Input/output error
Additional information: -1
Additional information: 1048576
WARNING: Write Failed. group:1 disk:2 AU:1222738 offset:0 size:1048576
ERROR: failed to copy file +DATA.256, extent 6570
ERROR: ORA-15080 thrown in ARB0 for group number 1
Errors in file /oracle/app/oracle/diag/asm/+asm/+ASM1/trace/+ASM1_arb0_103554.trc:
ORA-15080: synchronous I/O operation to a disk failed
NOTE: stopping process ARB0
Sun Oct 23 08:43:58 2016
Errors in file /oracle/app/oracle/diag/asm/+asm/+ASM1/trace/+ASM1_dbw0_8521.trc:
ORA-27061: waiting for async I/Os failed
Linux-x86_64 Error: 5: Input/output error
Additional information: -1
Additional information: 4096
WARNING: Write Failed. group:1 disk:3 AU:6789 offset:24576 size:4096
NOTE: cache initiating offline of disk 3 group DATA
NOTE: process _dbw0_+asm1 (8521) initiating offline of disk 3.3915934787 (DATA_0003) with mask 0x7e in group 1
Sun Oct 23 08:43:58 2016
WARNING: Disk 3 (DATA_0003) in group 1 mode 0x7f is now being offlined
WARNING: Disk 3 (DATA_0003) in group 1 in mode 0x7f is now being taken offline on ASM inst 1
NOTE: initiating PST update: grp = 1, dsk = 3/0xe9686c43, mask = 0x6a, op = clear
GMON updating disk modes for group 1 at 14 for pid 14, osid 8521
ERROR: Disk 3 cannot be offlined, since diskgroup has external redundancy.
ERROR: too many offline disks in PST (grp 1)
Sun Oct 23 08:43:58 2016
NOTE: cache dismounting (not clean) group 1/0xEC689CDD (DATA) 
NOTE: messaging CKPT to quiesce pins Unix process pid: 103577, image: oracle@node1 (B000)
WARNING: Disk 3 (DATA_0003) in group 1 mode 0x7f offline is being aborted
WARNING: Offline of disk 3 (DATA_0003) in group 1 and mode 0x7f failed on ASM inst 1
NOTE: halting all I/Os to diskgroup 1 (DATA)
Sun Oct 23 08:43:59 2016
NOTE: LGWR doing non-clean dismount of group 1 (DATA)
NOTE: LGWR sync ABA=160.10145 last written ABA 160.10145

The error is unambiguous: the failed writes ("Write Failed") caused the ASM diskgroup to dismount.

System log

Oct 23 08:43:55 node1 kernel: sd 6:0:12:1: [sdd]  Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
Oct 23 08:43:55 node1 kernel: sd 6:0:12:1: [sdd]  Sense Key : Data Protect [current] 
Oct 23 08:43:55 node1 kernel: sd 6:0:12:1: [sdd]  Add. Sense: Space allocation failed write protect
Oct 23 08:43:55 node1 kernel: sd 6:0:12:1: [sdd] CDB: Write(16): 8a 00 00 00 00 02 e7 18 37 f9 00 00 00 07 00 00
Oct 23 08:43:55 node1 kernel: end_request: critical space allocation error, dev sdd, sector 12467058681
Oct 23 08:43:55 node1 kernel: end_request: critical space allocation error, dev dm-3, sector 12467058681
Oct 23 08:43:55 node1 kernel: sd 8:0:6:1: [sdh]  Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
Oct 23 08:43:55 node1 kernel: sd 8:0:6:1: [sdh]  Sense Key : Data Protect [current] 
Oct 23 08:43:55 node1 kernel: sd 8:0:6:1: [sdh]  Add. Sense: Space allocation failed write protect
Oct 23 08:43:55 node1 kernel: sd 8:0:6:1: [sdh] CDB: Write(16): 8a 00 00 00 00 02 e7 18
Oct 23 08:43:55 node1 kernel: sd 6:0:4:1: [sdb]  Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
Oct 23 08:43:55 node1 kernel: sd 6:0:4:1: [sdb]  Sense Key : Data Protect [current] 
Oct 23 08:43:55 node1 kernel: sd 6:0:4:1: [sdb]   33Add. Sense: Space allocation failed write protect
Oct 23 08:43:55 node1 kernel: sd 6:0:4:1: [sdb] CDB: Write(16): 8a 00 00 00 00 02 e7 18 30 00 00 00 03 f9 00 00
Oct 23 08:43:55 node1 kernel: end_request: critical space allocation error, dev sdb, sector 12467056640
Oct 23 08:43:55 node1 kernel: f9 00 00 04 00
Oct 23 08:43:55 node1 kernel: end_request: critical space allocation error, dev dm-3, sector 12467056640
Oct 23 08:43:55 node1 kernel: 00 00
Oct 23 08:43:55 node1 kernel: end_request: critical space allocation error, dev sdh, sector 12467057657
Oct 23 08:43:55 node1 kernel: end_request: critical space allocation error, dev dm-3, sector 12467057657
Oct 23 08:43:57 node1 kernel: sd 8:0:6:1: [sdh]  Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
Oct 23 08:43:57 node1 kernel: sd 8:0:6:1: [sdh]  Sense Key : Data Protect [current] 
Oct 23 08:43:57 node1 kernel: sd 8:0:6:1: [sdh]  Add. Sense: Space allocation failed write protect
Oct 23 08:43:57 node1 kernel: sd 8:0:6:1: [sdh] CDB: Write(16): 8a 00 00 00 00 02 e7 18 37 f9 00 00 00 07 00 00
Oct 23 08:43:57 node1 kernel: end_request: critical space allocation error, dev sdh, sector 12467058681
Oct 23 08:43:57 node1 kernel: end_request: critical space allocation error, dev dm-3, sector 12467058681
Oct 23 08:43:57 node1 kernel: sd 8:0:12:1: [sdj]  Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
Oct 23 08:43:57 node1 kernel: sd 8:0:12:1: [sdj]  Sense Key : Data Protect [current] 
Oct 23 08:43:57 node1 kernel: sd 8:0:12:1: [sdj]  Add. Sense: Space allocation failed write protect
Oct 23 08:43:57 node1 kernel: sd 8:0:12:1: [sdj] CDB: Write(16): 8a 00 00 00 00 02 e7 18 30 00 00 00 03 f9 00 00
Oct 23 08:43:57 node1 kernel: end_request: critical space allocation error, dev sdj, sector 12467056640
Oct 23 08:43:57 node1 kernel: end_request: critical space allocation error, dev dm-3, sector 12467056640
Oct 23 08:43:57 node1 kernel: sd 6:0:4:1: [sdb]  Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
Oct 23 08:43:57 node1 kernel: sd 6:0:4:1: [sdb]  Sense Key : Data Protect [current] 
Oct 23 08:43:57 node1 kernel: sd 6:0:4:1: [sdb]  Add. Sense: Space allocation failed write protect
Oct 23 08:43:57 node1 kernel: sd 6:0:4:1: [sdb] CDB: Write(16): 8a 00 00 00 00 02 e7 18 33 f9 00 00 04 00 00 00
Oct 23 08:43:58 node1 kernel: sd 6:0:4:1: [sdb]  Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
Oct 23 08:43:58 node1 kernel: sd 6:0:4:1: [sdb]  Sense Key : Data Protect [current] 
Oct 23 08:43:58 node1 kernel: sd 6:0:4:1: [sdb]  Add. Sense: Space allocation failed write protect
Oct 23 08:43:58 node1 kernel: sd 6:0:4:1: [sdb] CDB: Write(16): 8a 00 00 00 00 03 3b 7e 78 30 00 00 00 08 00 00
Oct 23 10:50:59 node1 init: oracle-ohasd main process (6150) killed by TERM signal

The key error is "critical space allocation error": Linux hit an error while allocating space on the device. In other words, the failed space allocation is what forced the ASM diskgroup to dismount.

Check the multipath configuration

[root@node1 ~]# multipath -ll
36000d31003190c000000000000000003 dm-3 COMPELNT,Compellent Vol
size=80T features='1 queue_if_no_path' hwhandler='0' wp=rw
`-+- policy='round-robin 0' prio=1 status=active
  |- 6:0:9:1  sdd 8:48  active ready running
  `- 8:0:9:1  sdi 8:128 active ready running
delldisk2 (36000d310031908000000000000000003) dm-4 COMPELNT,Compellent Vol
size=8.0T features='1 queue_if_no_path' hwhandler='0' wp=rw
`-+- policy='round-robin 0' prio=1 status=active
  |- 6:0:12:1 sde 8:64  active ready running
  |- 8:0:6:1  sdh 8:112 active ready running
  |- 6:0:4:1  sdb 8:16  active ready running
  `- 8:0:12:1 sdj 8:144 active ready running
delldisk1 (36000d31003190a000000000000000007) dm-2 COMPELNT,Compellent Vol
size=12T features='1 queue_if_no_path' hwhandler='0' wp=rw
`-+- policy='round-robin 0' prio=1 status=active
  |- 6:0:1:1  sda 8:0   active ready running
  |- 8:0:2:1  sdf 8:80  active ready running
  |- 6:0:7:1  sdc 8:32  active ready running
  `- 8:0:3:1  sdg 8:96  active ready running

All the failing devices belong to the same LUN (delldisk2), the one on the array that ran out of space. Because the physical space behind delldisk2 was exhausted, the OS hit space allocation errors, ASM writes failed, and the database crashed. The essence of the problem is that the array presented an 8 TB volume to the host while the space actually available on the array was less than 8 TB; the OS tried to use the full 8 TB and hit the wall. This is what thin provisioning ("storage thin volumes") means, so watch out for it when configuring storage. Since only write I/O fails in this situation while reads continue to work, data is generally not lost.
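
Whether a LUN is thin provisioned can be checked from the host; a sketch (requires sg3_utils; the device name is illustrative):

# READ CAPACITY(16) reports lbpme=1 for a thin-provisioned LUN.
sg_readcap -l /dev/sdb

# Non-zero discard values also hint at thin provisioning / UNMAP support.
lsblk -D /dev/sdb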

ORA-20011 KUP-11024

The database alert log reports ORA-20011 and KUP-11024 errors.

Thu Sep 22 18:00:31 2016
DBMS_STATS: GATHER_STATS_JOB encountered errors.  Check the trace file.
Errors in file /u1/oracle/diag/rdbms/xifenfei/xifenfei/trace/xifenfei_j002_2686.trc:
ORA-20011: Approximate NDV failed: ORA-29913: error in executing ODCIEXTTABLEOPEN callout
KUP-11024: This external table can only be accessed from within a Data Pump job.

The errors indicate that statistics gathering (GATHER_STATS_JOB) failed, and that the failure was raised while accessing an external table, most likely one related to Data Pump.

Check the trace file

[oracle@xifenfei]$ more /u1/oracle/diag/rdbms/xifenfei/xifenfei/trace/xifenfei_j002_2686.trc
Trace file /u1/oracle/diag/rdbms/xifenfei/xifenfei/trace/xifenfei_j002_2686.trc
Oracle Database 11g Enterprise Edition Release 11.2.0.3.0 - 64bit Production
With the Partitioning, OLAP, Data Mining and Real Application Testing options
ORACLE_HOME = /u1/oracle/pruduct/11.2.0.3
System name:    Linux
Node name:      xifenfei
Release:        2.6.32-220.el6.x86_64
Version:        #1 SMP Wed Nov 9 08:03:13 EST 2011
Machine:        x86_64
Instance name: xifenfei
Redo thread mounted by this instance: 1
Oracle process number: 356
Unix process pid: 2686, image: oracle@xifenfei (J002)


*** 2016-09-22 18:00:31.939
*** SESSION ID:(835.16363) 2016-09-22 18:00:31.939
*** CLIENT ID:() 2016-09-22 18:00:31.939
*** SERVICE NAME:(SYS$USERS) 2016-09-22 18:00:31.939
*** MODULE NAME:(DBMS_SCHEDULER) 2016-09-22 18:00:31.939
*** ACTION NAME:(ORA$AT_OS_OPT_SY_10669) 2016-09-22 18:00:31.939
 
ORA-20011: Approximate NDV failed: ORA-29913: error in executing ODCIEXTTABLEOPEN callout
KUP-11024: This external table can only be accessed from within a Data Pump job.

*** 2016-09-22 18:00:31.939
DBMS_STATS: GATHER_STATS_JOB: GATHER_TABLE_STATS('"DWDBA"','"ET$012D00070001"','""', ...)
DBMS_STATS: ORA-20011: Approximate NDV failed: ORA-29913: error in executing ODCIEXTTABLEOPEN callout
KUP-11024: This external table can only be accessed from within a Data Pump job.

*** 2016-09-22 18:00:31.960
DBMS_STATS: GATHER_STATS_JOB: GATHER_TABLE_STATS('"DWDBA"','"ET$01D10D4F0001"','""', ...)
DBMS_STATS: ORA-20011: Approximate NDV failed: ORA-29913: error in executing ODCIEXTTABLEOPEN callout
KUP-11024: This external table can only be accessed from within a Data Pump job.

The trace file makes it clear that the errors seen in the alert log were raised while gathering statistics on the tables DWDBA.ET$012D00070001 and DWDBA.ET$01D10D4F0001.

Query the data dictionary

SYS@xifenfei>select OWNER,OBJECT_NAME,OBJECT_TYPE, status,
  2  to_char(CREATED,'dd-mon-yyyy hh24:mi:ss') created
  3  ,to_char(LAST_DDL_TIME , 'dd-mon-yyyy hh24:mi:ss') last_ddl_time
  4  from dba_objects
  5  where object_name like 'ET$%'
  6  /

OWNER     OBJECT_NAME      OBJECT_TYPE  STATUS  CREATED               LAST_DDL_TIME
--------- ---------------- ------------ ------- ------------------------- ----------------
DWDBA     ET$012D00070001  TABLE        VALID   10-mar-2016 16:32:25  10-mar-2016 16:32:25
DWDBA     ET$01D10D4F0001  TABLE        VALID   10-mar-2016 17:29:29  10-mar-2016 17:29:29

SYS@xifenfei> select owner, TABLE_NAME, DEFAULT_DIRECTORY_NAME, ACCESS_TYPE
  2  from dba_external_tables
  3  order by 1,2
  4  /

OWNER       TABLE_NAME                     DEFAULT_DIRECTORY_NAME         ACCESS_
----------- ------------------------------ ------------------------------ -------
DWDBA       ET$012D00070001                EXP_FILE_DIR                   CLOB
DWDBA       ET$01D10D4F0001                EXP_FILE_DIR                   CLOB

At this point the picture is complete: ET$012D00070001 and ET$01D10D4F0001 are external tables, and their presence is what breaks statistics gathering.

Examine the ET$012D00070001 table

SYS@xifenfei>desc DWDBA.ET$012D00070001
 Name                                                  Null?    Type
 ----------------------------------------------------- -------- ------------------------------------
 STORE_NO                                                       NUMBER(3)
 ITEM_NO                                                        NUMBER(6)
 WORK_DATE                                                      DATE
 DIVISION_NO                                                    NUMBER(2)
 SECTION_NO                                                     NUMBER(3)
 SUP_NO                                                         NUMBER(6)
 GRP_NO                                                         NUMBER(3)
 SUBGRP_NO                                                      NUMBER(3)
 USR                                                            VARCHAR2(30)
 TYPE                                                           NUMBER(1)
 ACTIVE_SELL_PRICE                                              NUMBER(8,2)
 SELL_PRICE                                                     NUMBER(8,2)
 SELL_VAT                                                       NUMBER(1)
 BUY_PRICE                                                      NUMBER(10,4)
 BUY_VAT                                                        NUMBER(1)
 PROMOTION_NO                                                   NUMBER(10)
 PROM_CLASS                                                     VARCHAR2(1)
 PROM_LEVEL                                                     NUMBER(1)
 STOCK                                                          NUMBER(10,3)
 STOCK_ADJ                                                      NUMBER(10,3)
 RECPT                                                          NUMBER(10,3)
 SALES                                                          NUMBER(10,3)
 STOCK_ADJ_AMNT                                                 NUMBER(10,2)
 RECPT_AMNT                                                     NUMBER(10,2)
 SALES_AMNT                                                     NUMBER(10,2)
 PROF_AMNT                                                      NUMBER(10,2)
 COST_CHANGE                                                    NUMBER(10,2)
 DISC                                                           NUMBER(10,3)
 RTN_QTY                                                        NUMBER(9,3)
 DISC_AMNT                                                      NUMBER(10,2)
 RTN_AMNT                                                       NUMBER(10,2)
 LOSS_AMNT                                                      NUMBER(10,2)
 CREATED_DATE                                                   DATE
 COST                                                           NUMBER(10,4)
 NBR_PK                                                         NUMBER(5)
 NBR_VISIT                                                      NUMBER(5)
 NBR_PK_LINE                                                    NUMBER(5)
 N_N_PROF_AMNT                                                  NUMBER(9,2)
 CON_FORE                                                       NUMBER(10,2)
 CON_FORE_OTH                                                   NUMBER(10,2)
 SALES_B                                                        NUMBER(10,3)
 SALES_AMNT_B                                                   NUMBER(10,2)

SYS@xifenfei>select count(*) from DWDBA.ET$012D00070001;
select count(*) from DWDBA.ET$012D00070001
*
ERROR at line 1:
ORA-29913: error in executing ODCIEXTTABLEOPEN callout
KUP-11024: This external table can only be accessed from within a Data Pump job.

Querying ET$012D00070001 reproduces exactly the error seen in the alert log, pinning the problem on this orphaned external table. A MOS search turns up Bug 10327346 "DBMS_WORKLOAD_CAPTURE does not drop external tables (causing ORA-20011 from DBMS_STATS)", under which Data Pump external tables are not cleaned up and are left orphaned. The fix is simply to drop the affected external tables, here ET$012D00070001 and ET$01D10D4F0001 (see the sketch below).
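
A sketch of the cleanup; before dropping anything, confirm that no running Data Pump job still owns the ET$ tables:

sqlplus / as sysdba <<'EOF'
-- Any table attached to a live job will show up here; leave those alone.
select owner_name, job_name, state from dba_datapump_jobs;

-- The orphaned external tables hold no data inside the database,
-- so dropping them only removes the stale metadata.
drop table DWDBA.ET$012D00070001;
drop table DWDBA.ET$01D10D4F0001;
EOF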