daemons have recently crashed
[root@controller ~]# ceph health detail
HEALTH_WARN 1 daemons have recently crashed
RECENT_CRASH 1 daemons have recently crashed
mgr.controller crashed on host controller at 2023-06-19 03:42:02.966898Z
[root@controller ~]# ceph -s
cluster:
id: e546a991-577b-4c94-967b-6aa379be9a64
health: HEALTH_WARN
1 daemons have recently crashed
services:
mon: 3 daemons, quorum controller,compute01,compute02 (age 37m)
mgr: compute01(active, since 35m), standbys: controller, compute02
mds: cephfs:1 {0=controller=up:active} 2 up:standby
osd: 9 osds: 9 up (since 22m), 9 in (since 22m)
rgw: 3 daemons active (compute01.rgw0, compute02.rgw0, controller.rgw0)
task status:
data:
pools: 6 pools, 144 pgs
objects: 210 objects, 3.8 KiB
usage: 9.1 GiB used, 441 GiB / 450 GiB avail
pgs: 144 active+clean
解决方法:
最近有一个或多个Ceph守护进程崩溃,管理员尚未对该崩溃进行存档(确认)。这可能表示软件错误、硬件问题(例如,故障磁盘)或某些其它问题。
系统中所有的崩溃可以通过以下方式列出
[root@controller ~]# ceph crash ls
新的崩溃可以通过以下方式列出
[root@controller ~]# ceph crash ls-new
ID ENTITY NEW
2023-06-19_03:42:02.966898Z_c055cf4a-5f65-4a27-bc1e-abe1b51cd6ef mgr.controller *
有关特定崩溃的信息可以通过以下方式检查
[root@controller ~]# ceph crash info 2023-06-19_03:42:02.966898Z_c055cf4a-5f65-4a27-bc1e-abe1b51cd6ef
{
"os_version_id": "7",
"assert_condition": "ret == 0",
"utsname_release": "3.10.0-957.el7.x86_64",
"os_name": "CentOS Linux",
"entity_name": "mgr.controller",
"assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/gigantic/release/14.2.22/rpm/el7/BUILD/ceph-14.2.22/src/common/Thread.cc",
"timestamp": "2023-06-19 03:42:02.966898Z",
"process_name": "ceph-mgr",
"utsname_machine": "x86_64",
"assert_line": 157,
"utsname_sysname": "Linux",
"os_version": "7 (Core)",
"os_id": "centos",
"assert_thread_name": "mgrsb-fin",
"utsname_version": "#1 SMP Thu Nov 8 23:39:32 UTC 2018",
"backtrace": [
"(()+0xf5d0) [0x7f63435d15d0]",
"(gsignal()+0x37) [0x7f63421b0207]",
"(abort()+0x148) [0x7f63421b18f8]",
"(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x199) [0x7f6345cd6436]",
"(()+0x25d5af) [0x7f6345cd65af]",
"(()+0x327be2) [0x7f6345da0be2]",
"(()+0x20fa41) [0x55ce0241ea41]",
"(FunctionContext::finish(int)+0x2c) [0x55ce02378b3c]",
"(Context::complete(int)+0x9) [0x55ce02374259]",
"(Finisher::finisher_thread_entry()+0x16f) [0x7f6345d614af]",
"(()+0x7dd5) [0x7f63435c9dd5]",
"(clone()+0x6d) [0x7f6342277ead]"
],
"utsname_hostname": "controller",
"assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/gigantic/release/14.2.22/rpm/el7/BUILD/ceph-14.2.22/src/common/Thread.cc: In function 'void Thread::create(const char*, size_t)' thread 7f633b283700 time 2023-06-19 11:42:02.962514\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos7/DIST/centos7/MACHINE_SIZE/gigantic/release/14.2.22/rpm/el7/BUILD/ceph-14.2.22/src/common/Thread.cc: 157: FAILED ceph_assert(ret == 0)\n",
"crash_id": "2023-06-19_03:42:02.966898Z_c055cf4a-5f65-4a27-bc1e-abe1b51cd6ef",
"assert_func": "void Thread::create(const char*, size_t)",
"ceph_version": "14.2.22"
}
在管理员检查过崩溃信息之后,可以通过"存档"(archive)该崩溃来消除此警告:
ceph crash archive <crash-id>
[root@controller ~]# ceph crash archive 2023-06-19_03:42:02.966898Z_c055cf4a-5f65-4a27-bc1e-abe1b51cd6ef
同样,所有新的崩溃都可以通过以下方式存档
# ceph crash archive-all
已存档的崩溃仍然可以通过 ceph crash ls 看到,但不会再出现在 ceph crash ls-new 的输出中。
"recent"所指的时间段由选项mgr/crash/warn_recent_interval控制(默认值:两周)。
可以通过以下方式完全禁用这些警告:
# ceph config set mgr mgr/crash/warn_recent_interval 0