应用 2026-06-11

Kubernetes 集群故障排查全记录

阅读次数 15 评论数 0

问题背景

在 Debian 12 上使用 kubeadm 搭建 Kubernetes v1.28.15 集群时,工作节点(k8s-node1)一直处于 NotReady 状态,而控制平面节点(k8s-master)正常工作。

故障排查思路

第一层:检查节点和 Pod 基础状态

kubectl get nodes

# k8s-node1 NotReady

kubectl get pods -n kube-system

# kube-proxy-hf9m5 ContainerCreating

# coredns 都处于 Pending 状态

排查方向:

  1. 工作节点是否成功加入集群?

  2. 系统 Pod 为什么没有运行?

结论: 节点已加入集群,但网络插件未就绪,导致系统 Pod 无法正常调度或启动(coredns 处于 Pending,kube-proxy 卡在 ContainerCreating)。

第二层:分析节点 NotReady 原因

执行命令:

kubectl describe node k8s-node1 | grep -A 5 "Conditions"

kubectl get pods --all-namespaces | grep flannel

发现:

节点 Conditions 显示正常(内存、磁盘、PID 均无压力)

Flannel Pod 在 node1 上状态为 Init:0/2 或 Init:1/2(卡在初始化)

Master 节点上的 Flannel Pod 正常 Running

结论: 问题出在工作节点的网络插件初始化上。

第三层:检查容器运行时状态

执行命令(在 node1 上):

systemctl status containerd

ls -la /var/run/containerd/containerd.sock

发现: containerd 运行正常,socket 文件存在。

继续排查:

crictl info

# NetworkReady: false

# lastCNILoadStatus: "no network config found in /etc/cni/net.d"

结论: CNI 配置目录为空,Flannel 无法写入配置文件。

第四层:检查 CNI 配置目录

执行命令:

ls -la /etc/cni/net.d/

# 只有 .kubernetes-cni-keep 空文件,没有 .conf 或 .conflist 文件

核心问题定位:

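Flannel Pod 的 init 容器应该负责安装 CNI:install-cni-plugin 把 flannel 插件二进制复制到 /opt/cni/bin,随后 install-cni 把 10-flannel.conflist 写入 /etc/cni/net.d

但 install-cni-plugin 容器异常退出,后续的 install-cni 无法执行,导致配置未能写入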

验证:

crictl ps -a | grep flannel

# install-cni-plugin 容器状态为 Exited

结论: Flannel 的 init 容器无法完成 CNI 配置安装。

解决方案汇总

方案一:手动创建 CNI 配置(最终生效方案)

在 k8s-node1 上执行:

# 1. 手动创建 Flannel CNI 配置文件

cat > /etc/cni/net.d/10-flannel.conflist << 'EOF'

{

"name": "cbr0",

"cniVersion": "0.3.1",

"plugins": [

{

"type": "flannel",

"delegate": {

"hairpinMode": true,

"isDefaultGateway": true

}

},

{

"type": "portmap",

"capabilities": {

"portMappings": true

}

}

]

}

EOF

# 2. 重启 kubelet

systemctl restart kubelet

方案二:配置镜像加速(解决拉取问题)

由于工作节点无法直接访问 registry.k8s.io,需要配置 containerd 使用国内镜像。注意:下面的命令会整体覆盖现有的 /etc/containerd/config.toml,执行前请先备份原文件。

cat > /etc/containerd/config.toml << 'EOF'

disabled_plugins = []

imports = []

oom_score = 0

plugin_dir = ""

required_plugins = []

root = "/var/lib/containerd"

state = "/run/containerd"

temp = ""

version = 2

[cgroup]

path = ""

[debug]

address = ""

format = ""

gid = 0

level = ""

uid = 0

[grpc]

address = "/run/containerd/containerd.sock"

gid = 0

max_recv_message_size = 16777216

max_send_message_size = 16777216

tcp_address = ""

tcp_tls_ca = ""

tcp_tls_cert = ""

tcp_tls_key = ""

uid = 0

[metrics]

address = ""

grpc_histogram = false

[plugins]

[plugins."io.containerd.gc.v1.scheduler"]

deletion_threshold = 0

mutation_threshold = 100

pause_threshold = 0.02

schedule_delay = "0s"

startup_delay = "100ms"

[plugins."io.containerd.grpc.v1.cri"]

device_ownership_from_security_context = false

disable_apparmor = false

disable_cgroup = false

disable_hugetlb_controller = true

disable_proc_mount = false

disable_tcp_service = true

enable_selinux = false

enable_tls_streaming = false

enable_unprivileged_icmp = false

enable_unprivileged_ports = false

ignore_image_defined_volumes = false

max_concurrent_downloads = 3

max_container_log_line_size = 16384

netns_mounts_under_state_dir = false

restrict_oom_score_adj = false

sandbox_image = "registry.aliyuncs.com/google_containers/pause:3.9"

selinux_category_range = 1024

stats_collect_period = 10

stream_idle_timeout = "4h0m0s"

stream_server_address = "127.0.0.1"

stream_server_port = "0"

systemd_cgroup = false

tolerate_missing_hugetlb_controller = true

unset_seccomp_profile = ""

[plugins."io.containerd.grpc.v1.cri".cni]

bin_dir = "/opt/cni/bin"

conf_dir = "/etc/cni/net.d"

conf_template = ""

ip_pref = ""

max_conf_num = 1

[plugins."io.containerd.grpc.v1.cri".containerd]

default_runtime_name = "runc"

disable_snapshot_annotations = true

discard_unpacked_layers = false

ignore_rdt_not_enabled_errors = false

no_pivot = false

snapshotter = "overlayfs"

[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]

base_runtime_spec = ""

cni_conf_dir = ""

cni_max_conf_num = 0

container_annotations = []

pod_annotations = []

privileged_without_host_devices = false

runtime_engine = ""

runtime_path = ""

runtime_root = ""

runtime_type = ""

[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options]

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]

base_runtime_spec = ""

cni_conf_dir = ""

cni_max_conf_num = 0

container_annotations = []

pod_annotations = []

privileged_without_host_devices = false

runtime_engine = ""

runtime_path = ""

runtime_root = ""

runtime_type = "io.containerd.runc.v2"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]

BinaryName = ""

CriuImagePath = ""

CriuPath = ""

CriuWorkPath = ""

IoGid = 0

IoUid = 0

NoNewKeyring = false

NoPivotRoot = false

Root = ""

ShimCgroup = ""

SystemdCgroup = true

[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]

base_runtime_spec = ""

cni_conf_dir = ""

cni_max_conf_num = 0

container_annotations = []

pod_annotations = []

privileged_without_host_devices = false

runtime_engine = ""

runtime_path = ""

runtime_root = ""

runtime_type = ""

[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options]

[plugins."io.containerd.grpc.v1.cri".image_decryption]

key_model = "node"

[plugins."io.containerd.grpc.v1.cri".registry]

config_path = ""

[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]

endpoint = ["https://docker.m.daocloud.io", "https://hub-mirror.c.163.com"]

[plugins."io.containerd.grpc.v1.cri".registry.mirrors."registry.k8s.io"]

endpoint = ["https://registry.aliyuncs.com/google_containers"]

[plugins."io.containerd.grpc.v1.cri".registry.auths]

[plugins."io.containerd.grpc.v1.cri".registry.configs]

[plugins."io.containerd.grpc.v1.cri".registry.headers]

[plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]

tls_cert_file = ""

tls_key_file = ""

[plugins."io.containerd.internal.v1.opt"]

path = "/opt/containerd"

[plugins."io.containerd.internal.v1.restart"]

interval = "10s"

[plugins."io.containerd.metadata.v1.bolt"]

content_sharing_policy = "shared"

[plugins."io.containerd.monitor.v1.cgroups"]

no_prometheus = false

[plugins."io.containerd.runtime.v1.linux"]

no_shim = false

runtime = "runc"

runtime_root = ""

shim = "containerd-shim"

shim_debug = false

[plugins."io.containerd.runtime.v2.task"]

platforms = ["linux/amd64"]

sched_core = false

[plugins."io.containerd.service.v1.diff-service"]

default = ["walking"]

[plugins."io.containerd.service.v1.tasks-service"]

rdt_config_file = ""

[plugins."io.containerd.snapshotter.v1.aufs"]

root_path = ""

[plugins."io.containerd.snapshotter.v1.btrfs"]

root_path = ""

[plugins."io.containerd.snapshotter.v1.devmapper"]

async_remove = false

base_image_size = ""

discard_blocks = false

fs_options = ""

fs_type = ""

pool_name = ""

root_path = ""

[plugins."io.containerd.snapshotter.v1.native"]

root_path = ""

[plugins."io.containerd.snapshotter.v1.overlayfs"]

root_path = ""

upperdir_label = false

[plugins."io.containerd.snapshotter.v1.zfs"]

root_path = ""

[proxy_plugins]

[stream_processors]

[stream_processors."io.containerd.ocicrypt.decoder.v1.tar"]

accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"]

args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]

env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]

path = "ctd-decoder"

returns = "application/vnd.oci.image.layer.v1.tar"

[stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"]

accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"]

args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]

env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]

path = "ctd-decoder"

returns = "application/vnd.oci.image.layer.v1.tar+gzip"

[timeouts]

"io.containerd.timeout.bolt.open" = "0s"

"io.containerd.timeout.shim.cleanup" = "5s"

"io.containerd.timeout.shim.load" = "5s"

"io.containerd.timeout.shim.shutdown" = "3s"

"io.containerd.timeout.task.state" = "2s"

[ttrpc]

address = ""

gid = 0

uid = 0

EOF

systemctl restart containerd

方案三:重置节点并重新加入

当配置混乱时,完全重置是最干净的方案:

在 master 上:

kubectl delete node k8s-node1

kubeadm token create --print-join-command

在 node1 上:

kubeadm reset -f #完全清除

rm -rf /etc/cni/net.d/*

rm -rf /var/lib/kubelet/*

systemctl restart containerd

# 执行 master 输出的 join 命令

kubeadm join <master-ip>:6443 --token <token> --discovery-token-ca-cert-hash sha256:<hash>

验证步骤

1. 检查节点状态

kubectl get nodes

# 预期:两个节点都是 Ready

2. 检查 Flannel Pod

kubectl get pods -n kube-flannel

# 预期:两个 Pod 都是 Running

3. 检查系统 Pod

kubectl get pods -n kube-system

# 预期:所有 Pod 都是 Running

4. 测试网络连通性

kubectl run test-nginx --image=nginx --restart=Never

kubectl get pods -o wide

# 预期:Pod 能获得 IP 并正常运行

kubectl delete pod test-nginx

经验总结

| 问题现象 | 根本原因 | 解决方案 |
| --- | --- | --- |
| 节点 NotReady | 网络插件未就绪 | 安装/修复 CNI 配置 |
| Flannel Init 卡住 | CNI 配置目录为空 | 手动创建配置文件 |
| 镜像拉取失败 | 无法访问 Google 仓库 | 配置阿里云镜像加速 |
| kube-proxy ContainerCreating | 网络插件未初始化 | 先解决 CNI 问题 |
| CoreDNS Pending | 网络插件未就绪 | 等待网络插件运行 |

关键排查命令速查

# 节点状态

kubectl get nodes -w

# Pod 状态(含所有命名空间)

kubectl get pods --all-namespaces

# 节点详细信息

kubectl describe node <node-name>

# Pod 日志

kubectl logs -n <namespace> <pod-name>

# 容器运行时状态

crictl info

crictl ps -a

# CNI 配置

ls -la /etc/cni/net.d/

# containerd 日志

journalctl -u containerd -n 50

0%