Introduction
Prometheus monitoring of a Kubernetes cluster currently covers these main areas:

| Monitoring target | Implementation | Description |
| --- | --- | --- |
| Pod / container performance | cAdvisor | Container CPU, memory, network, etc. |
| Node performance | node_exporter | Per-node CPU, memory, disk, network, etc. |
| K8S resource objects | kube-state-metrics | Pods, Deployments, Services, and other resources |
Deploying Prometheus
Apply prometheus-rbac.yaml
```yaml
# Create a ServiceAccount and grant it permissions
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
---
# rbac/v1 (v1beta1 is deprecated and was removed in Kubernetes 1.22)
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
rules:
- apiGroups:
  - ""
  # Granted permissions
  resources:
  - nodes
  - nodes/metrics
  - services
  - endpoints
  - pods
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - get
- nonResourceURLs:
  - "/metrics"
  verbs:
  - get
---
# Role binding
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: kube-system
```
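After applying it, the grants can be sanity-checked with kubectl auth can-i (a minimal check; the ServiceAccount name and namespace come from the manifest above):

```sh
kubectl apply -f prometheus-rbac.yaml

# Verify the prometheus ServiceAccount can do what the scrape jobs below require
kubectl auth can-i list pods  --as=system:serviceaccount:kube-system:prometheus
kubectl auth can-i list nodes --as=system:serviceaccount:kube-system:prometheus
kubectl auth can-i get configmaps --as=system:serviceaccount:kube-system:prometheus
```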
Apply prometheus-configmap.yaml
```yaml
# Configuration format: https://prometheus.io/docs/prometheus/latest/configuration/configuration/
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  # The Prometheus configuration file
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 10s
      evaluation_interval: 15s

    # Scrape targets
    scrape_configs:
    - job_name: prometheus
      static_configs:
      - targets:
        - localhost:9090

    # Scrape: Apiserver health metrics
    # The job name is kubernetes-apiservers
    - job_name: kubernetes-apiservers
      # Kubernetes-based service discovery
      kubernetes_sd_configs:
      - role: endpoints
      # Relabeling rules
      relabel_configs:
      # Keep only targets whose labels match the regex,
      # i.e. the default/kubernetes service's https endpoint
      - action: keep
        regex: default;kubernetes;https
        source_labels:
        - __meta_kubernetes_namespace
        - __meta_kubernetes_service_name
        - __meta_kubernetes_endpoint_port_name
      # Scrape over https (the default is http)
      scheme: https
      tls_config:
        # Prometheus authenticates to the Apiserver with the in-pod credentials
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        # Skip certificate verification
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

    # Scrape: Kubelet health metrics
    - job_name: kubernetes-nodes-kubelet
      kubernetes_sd_configs:
      # Discover every Node in the cluster
      - role: node
      relabel_configs:
      # Map node labels onto the target via the regex
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

    # Scrape: nodes-cadvisor metrics
    - job_name: kubernetes-nodes-cadvisor
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      # Rewrite the metrics path
      - target_label: __metrics_path__
        replacement: /metrics/cadvisor
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

    # Scrape: service-endpoints metrics
    - job_name: kubernetes-service-endpoints
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      # Keep only services annotated with prometheus.io/scrape: "true"
      - action: keep
        regex: true
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_scrape
      # Rewrite the scheme from the prometheus.io/scheme annotation
      - action: replace
        regex: (https?)
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_scheme
        target_label: __scheme__
      - action: replace
        regex: (.+)
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_path
        target_label: __metrics_path__
      - action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        source_labels:
        - __address__
        - __meta_kubernetes_service_annotation_prometheus_io_port
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_service_name
        target_label: kubernetes_name

    # Scrape: kubernetes-services metrics (blackbox probing)
    - job_name: kubernetes-services
      kubernetes_sd_configs:
      - role: service
      # Blackbox probe: check whether the IP and port are reachable
      metrics_path: /probe
      params:
        module:
        - http_2xx
      relabel_configs:
      - action: keep
        regex: true
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_probe
      - source_labels:
        - __address__
        target_label: __param_target
      # Route the probe through the blackbox exporter
      - replacement: blackbox
        target_label: __address__
      - source_labels:
        - __param_target
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      - source_labels:
        - __meta_kubernetes_service_name
        target_label: kubernetes_name

    # Scrape: kubernetes-pods metrics
    - job_name: kubernetes-pods
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      # Keep only pods annotated with prometheus.io/scrape: "true"
      - action: keep
        regex: true
        source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_scrape
      - action: replace
        regex: (.+)
        source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_path
        target_label: __metrics_path__
      - action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        source_labels:
        # Scrape address
        - __address__
        # Scrape port
        - __meta_kubernetes_pod_annotation_prometheus_io_port
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_name
        target_label: kubernetes_pod_name

    # Alerting configuration
    alerting:
      alertmanagers:
      # Discover Alertmanager pods dynamically
      - kubernetes_sd_configs:
        - role: pod
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
        - source_labels: [__meta_kubernetes_namespace]
          regex: kube-system
          action: keep
        - source_labels: [__meta_kubernetes_pod_label_k8s_app]
          regex: alertmanager
          action: keep
        - source_labels: [__meta_kubernetes_pod_container_port_number]
          regex:
          action: drop
```
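Before applying, it is worth validating the embedded configuration with promtool, which ships with every Prometheus release (a sketch; it assumes you have saved the data.prometheus.yml section above to a local file named prometheus.yml):

```sh
# Semantic check of the Prometheus scrape configuration
promtool check config prometheus.yml

kubectl apply -f prometheus-configmap.yaml
```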
Apply prometheus-statefulset.yaml
The volumeClaimTemplates field binds a StorageClass for dynamic PV provisioning; see my earlier article for how to set that up.
```yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: prometheus
  # Deployment namespace
  namespace: kube-system
  labels:
    k8s-app: prometheus
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
    version: v2.2.1
spec:
  serviceName: "prometheus"
  replicas: 1
  podManagementPolicy: "Parallel"
  updateStrategy:
    type: "RollingUpdate"
  selector:
    matchLabels:
      k8s-app: prometheus
  template:
    metadata:
      labels:
        k8s-app: prometheus
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ''
    spec:
      priorityClassName: system-cluster-critical
      serviceAccountName: prometheus
      # Init container: make the data directory writable by the nobody user
      initContainers:
      - name: "init-chown-data"
        image: "busybox:latest"
        imagePullPolicy: "IfNotPresent"
        command: ["chown", "-R", "65534:65534", "/data"]
        volumeMounts:
        - name: prometheus-data
          mountPath: /data
          subPath: ""
      containers:
      # Sidecar: watches the config volume and triggers a hot reload on change
      - name: prometheus-server-configmap-reload
        image: "jimmidyson/configmap-reload:v0.1"
        imagePullPolicy: "IfNotPresent"
        args:
        - --volume-dir=/etc/config
        - --webhook-url=http://localhost:9090/-/reload
        volumeMounts:
        - name: config-volume
          mountPath: /etc/config
          readOnly: true
        resources:
          limits:
            cpu: 10m
            memory: 10Mi
          requests:
            cpu: 10m
            memory: 10Mi
      # Main Prometheus container
      - name: prometheus-server
        image: "prom/prometheus:v2.2.1"
        imagePullPolicy: "IfNotPresent"
        args:
        - --config.file=/etc/config/prometheus.yml
        - --storage.tsdb.path=/data
        - --web.console.libraries=/etc/prometheus/console_libraries
        - --web.console.templates=/etc/prometheus/consoles
        - --web.enable-lifecycle
        ports:
        - containerPort: 9090
        # Health checks
        readinessProbe:
          httpGet:
            path: /-/ready
            port: 9090
          initialDelaySeconds: 30
          timeoutSeconds: 30
        livenessProbe:
          httpGet:
            path: /-/healthy
            port: 9090
          initialDelaySeconds: 30
          timeoutSeconds: 30
        # based on 10 running nodes with 30 pods each
        resources:
          limits:
            cpu: 200m
            memory: 1000Mi
          requests:
            cpu: 200m
            memory: 1000Mi
        # Volumes
        volumeMounts:
        - name: config-volume
          mountPath: /etc/config
        - name: prometheus-data
          mountPath: /data
          subPath: ""
      terminationGracePeriodSeconds: 300
      volumes:
      - name: config-volume
        configMap:
          name: prometheus-config
  volumeClaimTemplates:
  - metadata:
      name: prometheus-data
    spec:
      # Dynamic PV: change to the StorageClass you created
      storageClassName: managed-nfs-storage
      accessModes:
      - ReadWriteOnce
      resources:
        requests:
          storage: "5Gi"
```
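The --web.enable-lifecycle flag is what lets the configmap-reload sidecar hot-reload Prometheus through the /-/reload endpoint. You can also trigger it by hand (a sketch; with replicas: 1 the StatefulSet pod is named prometheus-0):

```sh
kubectl apply -f prometheus-statefulset.yaml
kubectl get pods -n kube-system -l k8s-app=prometheus

# Manually trigger a config reload without restarting the pod
kubectl port-forward -n kube-system prometheus-0 9090:9090 &
curl -X POST http://localhost:9090/-/reload
```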
Apply prometheus-service.yaml
```yaml
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  # Namespace
  namespace: kube-system
  labels:
    kubernetes.io/name: "Prometheus"
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
spec:
  # Expose the service externally
  type: NodePort
  ports:
  - name: http
    port: 9090
    protocol: TCP
    targetPort: 9090
  selector:
    k8s-app: prometheus
```
After applying these manifests, look up the NodePort of the prometheus Service in the kube-system namespace to access the web UI, as shown below.
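For example (the jsonpath query prints the randomly assigned port; then browse to http://<node-ip>:<nodePort>):

```sh
kubectl get svc prometheus -n kube-system \
  -o jsonpath='{.spec.ports[0].nodePort}'
```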

Deploying Grafana
Grafana can run inside the K8S cluster or outside it.
Apply grafana-statefulset.yaml
```yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: grafana
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  serviceName: grafana
  volumeClaimTemplates:
  - metadata:
      name: grafana-data
    spec:
      storageClassName: managed-nfs-storage
      accessModes:
      - ReadWriteOnce
      resources:
        requests:
          storage: 1Gi
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
      - name: grafana
        image: grafana/grafana
        ports:
        - containerPort: 3000
        resources:
          limits:
            cpu: 100m
            memory: 256Mi
          requests:
            cpu: 100m
            memory: 256Mi
        volumeMounts:
        - name: grafana-data
          mountPath: /var/lib/grafana
          subPath: grafana
```
Apply grafana-service.yaml
```yaml
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: kube-system
spec:
  type: NodePort
  selector:
    app: grafana
  ports:
  - port: 3000
    targetPort: 3000
```
After applying these manifests, look up the NodePort of the grafana Service in the kube-system namespace to access the UI.
The default username and password are both admin.

Adding a data source

Fill in the data source name and the Prometheus address, then save. The address http://prometheus:9090 works because Grafana also runs inside the K8S cluster, so the Service name resolves via CoreDNS; from another namespace you would need the fully qualified name prometheus.kube-system.svc.cluster.local. You can verify the resolution as shown below.
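A quick way to confirm the name resolves the same way Grafana sees it (a sketch using a throwaway busybox pod; note that nslookup behaves inconsistently in some busybox builds):

```sh
# Resolve the prometheus Service from inside the cluster
kubectl run dns-test -n kube-system --rm -it --restart=Never \
  --image=busybox -- nslookup prometheus.kube-system.svc.cluster.local
```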

Collecting and displaying monitoring data with node_exporter
Deploying node_exporter inside the K8S cluster is not recommended.
See this article for installing node_exporter: https://1335402049.github.io/2020/12/07/node-exporter%E5%AE%89%E8%A3%85%E5%92%8C%E9%85%8D%E7%BD%AE/
With node_exporter running on all nodes, modify prometheus-configmap.yaml:
```yaml
  # The Prometheus configuration file
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 10s
      evaluation_interval: 15s
    # Scrape targets
    scrape_configs:
    - job_name: node
      static_configs:
      - targets:
        - 192.168.1.110:9100
        - 192.168.1.111:9100
        - 192.168.1.112:9100
```
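Once the node targets are up, you can confirm data is flowing by querying the HTTP API directly (a sketch; it assumes port 9090 is reachable, e.g. via the NodePort or a port-forward, and node_exporter 0.16+, which renamed node_cpu to node_cpu_seconds_total):

```sh
# Per-node CPU utilisation (%) over the last 5 minutes
curl -s 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=100 - avg by (instance) (rate(node_cpu_seconds_total{job="node",mode="idle"}[5m])) * 100'
```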
Re-apply the ConfigMap for the change to take effect; the configmap-reload sidecar then triggers a hot reload of Prometheus:

```sh
kubectl apply -f prometheus-configmap.yaml
```
On the Prometheus Targets page you can see that the monitored nodes are now active.

Add a dashboard template in Grafana; you can also find other templates on the official site: https://grafana.com/grafana/dashboards

Some panels in an imported template may not render correctly; adjust them to your environment. This mostly means editing the PromQL queries and the Grafana dashboard variables, for example:
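A typical fix is replacing a hard-coded instance with a dashboard variable. For example, with a variable named node (a hypothetical name) defined from label_values(node_uname_info, instance), a memory panel query becomes:

```promql
node_memory_MemAvailable_bytes{instance=~"$node"}
```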


The finished Grafana dashboard:


Visualizing cAdvisor data
cAdvisor collects container performance data for us, and Kubernetes builds it into the kubelet, so there is no need to deploy it separately.
The cAdvisor service discovery is already configured in prometheus-configmap.yaml:
```yaml
# Scrape: nodes-cadvisor metrics
- job_name: kubernetes-nodes-cadvisor
  kubernetes_sd_configs:
  - role: node
  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
  # Rewrite the metrics path
  - target_label: __metrics_path__
    replacement: /metrics/cadvisor
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    insecure_skip_verify: true
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
```
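To confirm cAdvisor data is arriving, query one of its container series (a sketch; same assumption that the API is reachable on localhost:9090):

```sh
# Top 10 CPU-consuming containers over the last 5 minutes
curl -s 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=topk(10, rate(container_cpu_usage_seconds_total{image!=""}[5m]))'
```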
The status of the nodes-cadvisor targets can be seen on the Prometheus Targets page.

Import dashboard ID 3119 in Grafana.

Collecting and displaying data with kube-state-metrics
Apply kube-state-metrics-rbac.yaml
```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
rules:
- apiGroups: [""]
  resources:
  - configmaps
  - secrets
  - nodes
  - pods
  - services
  - resourcequotas
  - replicationcontrollers
  - limitranges
  - persistentvolumeclaims
  - persistentvolumes
  - namespaces
  - endpoints
  verbs: ["list", "watch"]
- apiGroups: ["extensions"]
  resources:
  - daemonsets
  - deployments
  - replicasets
  verbs: ["list", "watch"]
- apiGroups: ["apps"]
  resources:
  - statefulsets
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources:
  - cronjobs
  - jobs
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources:
  - horizontalpodautoscalers
  verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: kube-state-metrics-resizer
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
rules:
- apiGroups: [""]
  resources:
  - pods
  verbs: ["get"]
- apiGroups: ["extensions"]
  resources:
  - deployments
  resourceNames: ["kube-state-metrics"]
  verbs: ["get", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: kube-state-metrics
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: kube-state-metrics-resizer
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: kube-system
```
Apply kube-state-metrics-deployment.yaml
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: kube-system
  labels:
    k8s-app: kube-state-metrics
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
    version: v1.3.0
spec:
  selector:
    matchLabels:
      k8s-app: kube-state-metrics
      version: v1.3.0
  replicas: 1
  template:
    metadata:
      labels:
        k8s-app: kube-state-metrics
        version: v1.3.0
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ''
    spec:
      priorityClassName: system-cluster-critical
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        image: lizhenliang/kube-state-metrics:v1.3.0
        ports:
        - name: http-metrics
          containerPort: 8080
        - name: telemetry
          containerPort: 8081
        readinessProbe:
          httpGet:
            path: /healthz
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
      - name: addon-resizer
        image: lizhenliang/addon-resizer:1.8.3
        resources:
          limits:
            cpu: 100m
            memory: 30Mi
          requests:
            cpu: 100m
            memory: 30Mi
        env:
        - name: MY_POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: MY_POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        volumeMounts:
        - name: config-volume
          mountPath: /etc/config
        command:
        - /pod_nanny
        - --config-dir=/etc/config
        - --container=kube-state-metrics
        - --cpu=100m
        - --extra-cpu=1m
        - --memory=100Mi
        - --extra-memory=2Mi
        - --threshold=5
        - --deployment=kube-state-metrics
      volumes:
      - name: config-volume
        configMap:
          name: kube-state-metrics-config
---
# Config map for resource configuration.
apiVersion: v1
kind: ConfigMap
metadata:
  name: kube-state-metrics-config
  namespace: kube-system
  labels:
    k8s-app: kube-state-metrics
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
data:
  NannyConfiguration: |-
    apiVersion: nannyconfig/v1alpha1
    kind: NannyConfiguration
```
Apply kube-state-metrics-service.yaml
```yaml
apiVersion: v1
kind: Service
metadata:
  name: kube-state-metrics
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/name: "kube-state-metrics"
  annotations:
    # Picked up by the kubernetes-service-endpoints job configured earlier
    prometheus.io/scrape: 'true'
spec:
  ports:
  - name: http-metrics
    port: 8080
    targetPort: http-metrics
    protocol: TCP
  - name: telemetry
    port: 8081
    targetPort: telemetry
    protocol: TCP
  selector:
    k8s-app: kube-state-metrics
```
After applying these manifests, the Kubernetes service discovery already configured in prometheus-configmap.yaml picks them up, and the Targets page now shows the apiserver, pods, endpoints, and other targets.
Import dashboard ID 6417 in Grafana.
Here you can see pod resource usage, rolling updates, pod counts, pod status, and more; the queries below can cross-check the same data.
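To verify the dashboard against the raw kube-state-metrics series (a sketch; assumes the API is reachable on localhost:9090):

```sh
# Pods per phase
curl -s 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=sum by (phase) (kube_pod_status_phase)'

# Deployments whose available replicas lag the spec
curl -s 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=kube_deployment_spec_replicas - kube_deployment_status_replicas_available'
```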
