2026/5/21 12:36:12
网站建设
项目流程
全国免费发布信息网站大全,手机端网站模板,邮件格式模板,网站备案接入商名称k8s_inspect.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Kubernetes cluster health inspection script.

Checks performed:
- every node is in the Ready state
- pods are running normally (pods of completed Jobs are skipped)
- every Deployment has reached its desired replica count
- every Service with a selector has available backend Endpoints

Usage:
    python3 k8s_inspect.py

Dependency (installing into a virtual environment is recommended):
    pip install kubernetes
"""
import sys

from kubernetes import client, config
from kubernetes.client.rest import ApiException


def load_kube_config():
    """Load the Kubernetes client configuration.

    The in-cluster configuration is tried first; if it is unavailable the
    local kubeconfig file is used instead. The process exits with status 1
    when neither configuration can be loaded.
    """
    try:
        config.load_incluster_config()
        print("[成功] 使用集群内配置in-cluster config")
        return
    except config.ConfigException:
        pass  # not running inside a cluster; fall back to kubeconfig below

    try:
        config.load_kube_config()
        print("[成功] 使用本地 kubeconfig 文件")
    except config.ConfigException as e:
        print(f"[失败] 无法加载 Kubernetes 配置: {e}")
        sys.exit(1)


def check_nodes():
    """Check that every node reports a Ready condition with status "True".

    Returns:
        True when at least one node exists and all nodes are ready; False
        when no node is found, any node is not ready, or the node list
        cannot be fetched.
    """
    print("\n[信息] 正在检查节点状态...")
    v1 = client.CoreV1Api()
    try:
        nodes = v1.list_node().items
    except ApiException as e:
        print(f"[错误] 获取节点列表失败: {e}")
        return False

    if not nodes:
        print("[警告] 未发现任何节点")
        return False

    all_ready = True
    for node in nodes:
        name = node.metadata.name
        ready = any(
            cond.type == "Ready" and cond.status == "True"
            for cond in node.status.conditions or []
        )
        if not ready:
            print(f"[错误] 节点 {name} 未就绪NotReady")
            all_ready = False
        else:
            print(f"[正常] 节点 {name} 已就绪")
    return all_ready


def _is_crash_looping(pod):
    """Return True if any container of *pod* is waiting in CrashLoopBackOff."""
    for status in pod.status.container_statuses or []:
        # ``state`` may be unset on a freshly created container status.
        waiting = status.state.waiting if status.state else None
        if waiting and waiting.reason == "CrashLoopBackOff":
            return True
    return False


def check_pods():
    """Check the status of the pods in all namespaces.

    Pods owned by a Job that have finished are skipped. Pods whose phase is
    Running or Pending are accepted unless one of their containers is in
    CrashLoopBackOff. Any other phase is reported as abnormal.

    Returns:
        True when no problem was found (or no pod exists); False when any
        pod is unhealthy or the pod list cannot be fetched.
    """
    print("\n[信息] 正在检查 Pod 状态...")
    v1 = client.CoreV1Api()
    try:
        pods = v1.list_pod_for_all_namespaces().items
    except ApiException as e:
        print(f"[错误] 获取 Pod 列表失败: {e}")
        return False

    if not pods:
        print("[信息] 未发现任何 Pod")
        return True

    all_ok = True
    for pod in pods:
        namespace = pod.metadata.namespace
        name = pod.metadata.name
        phase = pod.status.phase

        owners = pod.metadata.owner_references or []
        is_job_pod = any(owner.kind == "Job" for owner in owners)
        if is_job_pod and phase in ("Succeeded", "Completed"):
            continue

        # A crash-looping pod normally still reports phase "Running", so this
        # test has to run before the phase-based shortcut or it never fires.
        if _is_crash_looping(pod):
            print(f"[错误] Pod {namespace}/{name} 处于 CrashLoopBackOff 状态")
            all_ok = False
        elif phase in ("Running", "Pending"):
            continue
        else:
            print(f"[警告] Pod {namespace}/{name} 状态异常: {phase}")
            all_ok = False
    return all_ok


def check_deployments():
    """Check that every Deployment has enough available replicas.

    A Deployment is reported when its available replica count is lower than
    the replica count requested in its spec; a missing value on either side
    is treated as 0.

    Returns:
        True when every Deployment is satisfied (or none exists); False when
        any Deployment is short of replicas or the list cannot be fetched.
    """
    print("\n[信息] 正在检查 Deployment 状态...")
    apps_v1 = client.AppsV1Api()
    try:
        deployments = apps_v1.list_deployment_for_all_namespaces().items
    except ApiException as e:
        print(f"[错误] 获取 Deployment 列表失败: {e}")
        return False

    if not deployments:
        print("[信息] 未发现任何 Deployment")
        return True

    all_ok = True
    for dep in deployments:
        namespace = dep.metadata.namespace
        name = dep.metadata.name
        desired = dep.spec.replicas or 0
        available = dep.status.available_replicas or 0
        if available < desired:
            print(
                f"[错误] Deployment {namespace}/{name} 可用副本不足: "
                f"{available}/{desired}"
            )
            all_ok = False
    return all_ok


def check_services():
    """Check that every Service with a selector has available Endpoints.

    Services without a selector are ignored. For each remaining Service the
    Endpoints object of the same name is read; the Service is reported when
    no subset contains a ready address, or when the Endpoints object cannot
    be read.

    Returns:
        True when every checked Service has a backend (or there is nothing
        to check); False otherwise, or when the Service list cannot be
        fetched.
    """
    print("\n[信息] 正在检查 Service 的 Endpoints...")
    v1 = client.CoreV1Api()
    try:
        services = v1.list_service_for_all_namespaces().items
    except ApiException as e:
        print(f"[错误] 获取 Service 列表失败: {e}")
        return False

    services = [s for s in services if s.spec.selector]
    if not services:
        print("[信息] 未发现带 selector 的 Service")
        return True

    all_ok = True
    for svc in services:
        namespace = svc.metadata.namespace
        name = svc.metadata.name
        try:
            endpoints = v1.read_namespaced_endpoints(name, namespace)
        except ApiException as e:
            print(f"[警告] 读取 Service {namespace}/{name} 的 Endpoints 失败: {e}")
            all_ok = False
            continue

        # A subset that only lists not-ready addresses cannot serve traffic,
        # so require at least one ready address rather than a non-empty list.
        has_backend = any(
            subset.addresses for subset in endpoints.subsets or []
        )
        if not has_backend:
            print(f"[错误] Service {namespace}/{name} 没有可用的后端 Endpoints")
            all_ok = False
    return all_ok


def main():
    """Run every check, print a summary, and exit 0 if healthy, else 1."""
    print("开始 Kubernetes 集群健康巡检...")
    load_kube_config()

    node_ok = check_nodes()
    pod_ok = check_pods()
    deploy_ok = check_deployments()
    svc_ok = check_services()

    print("\n 巡检结果汇总 ")
    print(f"节点状态: {'正常' if node_ok else '异常'}")
    print(f"Pod 状态: {'正常' if pod_ok else '异常'}")
    print(f"Deployment: {'正常' if deploy_ok else '异常'}")
    print(f"Service 后端: {'正常' if svc_ok else '异常'}")

    if all([node_ok, pod_ok, deploy_ok, svc_ok]):
        print("\n集群整体健康无异常")
        sys.exit(0)
    else:
        print("\n集群存在异常请根据上述信息排查")
        sys.exit(1)


if __name__ == "__main__":
    main()