GKE 系统 Pod gke-metrics-agent OOMKilled

Anv*_*ith 5 google-kubernetes-engine

我注意到我们的 gke 集群系统 Pod (gke-metrics-agent) 内存不足。我尝试编辑 daemonset yaml 文件,将内存请求增加到 200Mi,并将内存限制增加到 200Mi。但是,它不允许我应用它。它使用默认值重新创建,与之前一样,即 50Mi。 Pod 状态图像

请帮助我增加 gke-metrics-agent 的内存资源

小智 4

一般CrashLoopBackOff表示容器重启后反复崩溃。您可以按照文档解决CrashLoopBackOff问题。

避免 gke-metrics-agent 被 OOM 终止的一种可能的解决方法是增加 gke-metrics-agent pod 的内存限制。这可以通过禁用 GKE 监控并使用自定义 Metrics Agent 清单将 gke-metrics-agent 部署到集群来完成。这将允许您调整 gke-metrics-agent 的内存资源以防止其被终止。

为此,您可以按照以下步骤操作:

1.禁用GKE监控

CLUSTER=<cluster_name>
PROJECT=<project>
LOCATION=<location>
Run Code Online (Sandbox Code Playgroud)
gcloud container clusters update $CLUSTER --zone=$LOCATION --project=$PROJECT --monitoring-service=none --logging-service=logging.googleapis.com/kubernetes
Run Code Online (Sandbox Code Playgroud)

2.将以下配置保存为 metrics-agent.yaml,然后运行下面的命令来应用它:

# Substitute the {{.ClusterName}} / {{.Location}} placeholders in the manifest and apply the result.
# The expansions are kept inside double quotes so the values are never word-split or globbed.
sed -u -e "s/{{\.ClusterName}}/${CLUSTER}/g" -e "s/{{\.Location}}/${LOCATION}/g" metrics-agent.yaml | kubectl apply -f -
Run Code Online (Sandbox Code Playgroud)
---
# ConfigMap carrying the gke-metrics-agent configuration file. The linux
# DaemonSet in this manifest projects the `gke-metrics-agent-config` key to
# /conf/gke-metrics-agent-config.yaml and passes that path via --config.
apiVersion: v1
kind: ConfigMap
metadata:
  name: gke-metrics-agent-conf
  namespace: default
data:
  # The ClusterName and Location template placeholders inside the config are
  # meant to be substituted (the accompanying sed command does this) before
  # the manifest is applied.
  # $KUBELET_HOST, $NODE_NAME, $POD_NAME, $POD_NAMESPACE, ${ARG1} and ${ARG2}
  # share their names with env vars set on the DaemonSet container; presumably
  # the agent expands them when it loads the config (TODO confirm).
  # Everything below the `|` is literal string content, so explanatory notes
  # are kept out here rather than inside the value.
  gke-metrics-agent-config: |
    receivers:
      prometheus:
        use_start_time_metric: true
        config:
          scrape_configs:
          - job_name: "kubelet"
            scrape_interval: 60s
            static_configs:
            - targets: ["$KUBELET_HOST:10255"]
            metric_relabel_configs:
            - source_labels: [ __name__ ]
              target_label: gke_component_name
              replacement: "nodes/kubelet"
          - job_name: "kubelet-prober"
            scrape_interval: 60s
            static_configs:
            - targets: ["$KUBELET_HOST:10255"]
            metrics_path: /metrics/probes
            metric_relabel_configs:
            - source_labels: [__name__]
              regex: "prober_probe_total|process_start_time_seconds"
              action: keep
            - source_labels: [ __name__ ]
              target_label: gke_component_name
              replacement: "nodes/kubelet"
          - job_name: "addons"
            scrape_interval: 60s
            kubernetes_sd_configs:
            - role: pod
              namespaces:
                names:
                - kube-system
              selectors:
              - role: pod
                field: "spec.nodeName=$NODE_NAME"
            relabel_configs:
            - source_labels: [ __meta_kubernetes_pod_container_port_name ]
              regex: ".*metrics"
              action: keep
            - source_labels: [ __meta_kubernetes_pod_annotationpresent_components_gke_io_component_name ]
              regex: true
              action: keep
            - source_labels: [ __meta_kubernetes_pod_annotationpresent_monitoring_gke_io_path, __meta_kubernetes_pod_annotation_monitoring_gke_io_path ]
              regex: "true;(.*)"
              target_label: __metrics_path__
            - source_labels: [ __meta_kubernetes_pod_name ]
              target_label: pod
            - source_labels: [ __meta_kubernetes_pod_container_name ]
              target_label: container
            - source_labels: [ __meta_kubernetes_namespace ]
              target_label: namespace
            - source_labels: [ __meta_kubernetes_pod_annotation_components_gke_io_component_name ]
              target_label: gke_component_name
              replacement: "addons/${ARG1}"
            - source_labels: [ gke_component_name ]
              target_label: gke_component_name
              regex: "(.*)-(.*)"
              replacement: "${ARG1}_${ARG2}"
            - source_labels: [ gke_component_name ]
              target_label: gke_component_name
              regex: "(.*)-(.*)"
              replacement: "${ARG1}_${ARG2}"
          - job_name: "coredns"
            scrape_interval: 60s
            static_configs:
            - targets: ["$KUBELET_HOST:9253"]
            metric_relabel_configs:
            - source_labels: [ __name__ ]
              target_label: gke_component_name
              replacement: "nodes/coredns"
          - job_name: "coredns-nodecache"
            scrape_interval: 60s
            static_configs:
            - targets: ["$KUBELET_HOST:9353"]
            metric_relabel_configs:
            - source_labels: [ __name__ ]
              target_label: gke_component_name
              replacement: "nodes/coredns"
          - job_name: "node"
            scrape_interval: 60s
            static_configs:
            - targets: ["$KUBELET_HOST:10231"]
            metric_relabel_configs:
            - source_labels: [ __name__ ]
              target_label: gke_component_name
              replacement: "net/cluster/node"
      kubenode:
        endpoint: "http://$KUBELET_HOST:10255"
        scrape_interval: 60s
        cluster_name: {{.ClusterName}}
        location: {{.Location}}
        node_name: "$NODE_NAME"
        kubernetes_service_host: "$KUBERNETES_SERVICE_HOST"
    exporters:
      stackdriver:
        endpoint: monitoring.googleapis.com:443
        skip_create_metric_descriptor: true
    processors:
      resource:
        type: "host"
        labels:
          cloud.zone: {{.Location}}
          host.name: "$NODE_NAME"
          k8s.cluster.name: {{.ClusterName}}
      metrics_export:
        common_prefix: "kubernetes.io/internal"
        detect_container_metrics: true

        allowed_labels:
        - "project"
        - "location"
        - "cluster_name"
        - "node_name"
        - "namespace"
        - "pod"
        - "container"
        export_map:
        - "kubernetes.io/internal/nodes/kubelet/process_start_time_seconds":
            drop: true
        - "kubernetes.io/internal/nodes/kubelet/kubelet_docker_operations_total":
            allowed_labels:
            - "operation_type"
            export_name: "kubernetes.io/internal/nodes/kubelet/docker_operations_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/kubelet/kubelet_docker_operations_errors_total":
            allowed_labels:
            - "operation_type"
            export_name: "kubernetes.io/internal/nodes/kubelet/docker_operations_errors_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/kubelet/kubelet_runtime_operations_total":
            allowed_labels:
            - "operation_type"
            export_name: "kubernetes.io/internal/nodes/kubelet/runtime_operations_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/kubelet/kubelet_runtime_operations_errors_total":
            allowed_labels:
            - "operation_type"
            export_name: "kubernetes.io/internal/nodes/kubelet/runtime_operations_errors_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/kubelet/rest_client_requests_total":
            allowed_labels:
            - "code"
            - "method"
            - "host"
            export_as_int: true
        - "kubernetes.io/internal/nodes/kubelet/storage_operation_duration_seconds":
            allowed_labels:
            - "volume_plugin"
            - "operation_name"
        - "kubernetes.io/internal/nodes/kubelet/kubelet_network_plugin_operations_duration_seconds":
            allowed_labels:
            - "operation_type"
            export_name: "kubernetes.io/internal/nodes/kubelet/network_plugin_operations_duration_seconds"
        - "kubernetes.io/internal/nodes/kubelet/storage_operation_errors_total":
            allowed_labels:
            - "volume_plugin"
            - "operation_name"
            export_as_int: true
        - "kubernetes.io/internal/nodes/kubelet/storage_operation_status_count":
            allowed_labels:
            - "volume_plugin"
            - "operation_name"
            - "status"
            export_as_int: true
        - "kubernetes.io/internal/nodes/kubelet/prober_probe_total":
            allowed_labels:
            - "container"
            - "namespace"
            - "pod"
            - "pod_uid"
            - "result"
            - "probe_type"
            export_as_int: true
            is_container_metric: true


        - "kubernetes.io/internal/nodes/coredns/process_start_time_seconds":
            drop: true
        - "kubernetes.io/internal/nodes/coredns/coredns_cache_drops_total":
            allowed_labels:
            - "server"
            export_name: "kubernetes.io/internal/nodes/coredns/cache_drops_total"
        - "kubernetes.io/internal/nodes/coredns/coredns_cache_hits_total":
            allowed_labels:
            - "server"
            - "type"
            export_name: "kubernetes.io/internal/nodes/coredns/cache_hits_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/coredns_cache_misses_total":
            allowed_labels:
            - "server"
            export_name: "kubernetes.io/internal/nodes/coredns/cache_misses_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/coredns_cache_prefetch_total":
            allowed_labels:
            - "server"
            export_name: "kubernetes.io/internal/nodes/coredns/cache_prefetch_total"
        - "kubernetes.io/internal/nodes/coredns/coredns_cache_size":
            allowed_labels:
            - "server"
            - "type"
            export_name: "kubernetes.io/internal/nodes/coredns/cache_size"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/coredns_dns_request_count_total":
            allowed_labels:
            - "family"
            - "proto"
            - "server"
            - "zone"
            export_name: "kubernetes.io/internal/nodes/coredns/dns_request_count_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/coredns_dns_request_duration_seconds":
            allowed_labels:
            - "server"
            - "zone"
            export_name: "kubernetes.io/internal/nodes/coredns/dns_request_duration_seconds"
        - "kubernetes.io/internal/nodes/coredns/coredns_dns_request_type_count_total":
            allowed_labels:
            - "server"
            - "type"
            - "zone"
            export_name: "kubernetes.io/internal/nodes/coredns/dns_request_type_count_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/coredns_dns_response_rcode_count_total":
            allowed_labels:
            - "rcode"
            - "server"
            - "zone"
            export_name: "kubernetes.io/internal/nodes/coredns/dns_response_rcode_count_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/coredns_forward_healthcheck_failure_count_total":
            allowed_labels:
            - "to"
            export_name: "kubernetes.io/internal/nodes/coredns/forward_healthcheck_failure_count_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/coredns_forward_request_count_total":
            allowed_labels:
            - "to"
            export_name: "kubernetes.io/internal/nodes/coredns/forward_request_count_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/coredns_forward_request_duration_seconds":
            allowed_labels:
            - "to"
            export_name: "kubernetes.io/internal/nodes/coredns/forward_request_duration_seconds"
        - "kubernetes.io/internal/nodes/coredns/coredns_forward_response_rcode_count_total":
            allowed_labels:
            - "rcode"
            - "to"
            export_name: "kubernetes.io/internal/nodes/coredns/forward_response_rcode_count_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/coredns_forward_sockets_open":
            allowed_labels:
            - "to"
            export_name: "kubernetes.io/internal/nodes/coredns/forward_sockets_open"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/coredns_health_request_duration_seconds":
            allowed_labels: []
            export_name: "kubernetes.io/internal/nodes/coredns/health_request_duration_seconds"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/coredns_panic_count_total":
            allowed_labels: []
            export_name: "kubernetes.io/internal/nodes/coredns/dns_panic_count_total"
            export_as_int: true
        - "kubernetes.io/internal/nodes/coredns/nodecache_setup_errors_total":
            allowed_labels:
            - "errortype"
            export_name: "kubernetes.io/internal/nodes/coredns/nodecache_setup_errors_total"
        - "kubernetes.io/internal/net/cluster/node/process_start_time_seconds":
            drop: true
        - "kubernetes.io/internal/net/cluster/node/conntrack_entries":
            allowed_labels: []
            export_as_int: true
        - "kubernetes.io/internal/net/cluster/node/conntrack_error_count":
            allowed_labels:
            - "type"
            export_as_int: true
        - "kubernetes.io/internal/net/cluster/node/num_inuse_sockets":
            allowed_labels:
            - "protocol"
            export_as_int: true
        - "kubernetes.io/internal/net/cluster/node/num_tw_sockets":
            allowed_labels: []
            export_as_int: true
        - "kubernetes.io/internal/net/cluster/node/socket_memory":
            allowed_labels: []
            export_as_int: true
        - "kubernetes.io/internal/addons/kubedns/process_start_time_seconds":
            drop: true
        - "kubernetes.io/internal/addons/kubedns/skydns_skydns_dns_request_count_total":
            allowed_labels:
            - "system"
            export_name: "kubernetes.io/internal/addons/kubedns/skydns_dns_request_count_total"
            export_as_int: true
        - "kubernetes.io/internal/addons/kubedns/skydns_skydns_dns_request_duration_seconds":
            allowed_labels:
            - "system"
            export_name: "kubernetes.io/internal/addons/kubedns/skydns_dns_request_duration_seconds"
        - "kubernetes.io/internal/addons/kubedns/skydns_skydns_dns_response_size_bytes":
            allowed_labels:
            - "system"
            export_name: "kubernetes.io/internal/addons/kubedns/skydns_dns_response_size_bytes"
        - "kubernetes.io/internal/addons/kubedns/skydns_skydns_dns_error_count_total":
            allowed_labels:
            - "system"
            - "cause"
            export_name: "kubernetes.io/internal/addons/kubedns/skydns_dns_error_count_total"
            export_as_int: true
        - "kubernetes.io/internal/addons/kubedns/skydns_skydns_dns_cachemiss_count_total":
            allowed_labels:
            - "cache"
            export_name: "kubernetes.io/internal/addons/kubedns/skydns_dns_cachemiss_count_total"
            export_as_int: true
    extensions:
      observability:
        endpoint: monitoring.googleapis.com:443
        prefix: "kubernetes.io/internal/addons/gke_otelsvc"
        resource:
          type: "k8s_container"
          labels:
            location: {{.Location}}
            cluster_name: {{.ClusterName}}
            pod_name: "$POD_NAME"
            namespace_name: "$POD_NAMESPACE"
            container_name: "gke-metrics-agent"
    service:
      extensions:
      - observability
      pipelines:
        metrics/kube:
          receivers:
            - kubenode
          exporters:
            - stackdriver
        metrics/prom:
          receivers:
            - prometheus
          processors:
            - resource
            - metrics_export
          exporters:
            - stackdriver
---
# Service account used by the gke-metrics-agent DaemonSet pods; it is the
# subject of the gke-metrics-agent ClusterRoleBinding in this manifest.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: default
  name: gke-metrics-agent
---
# Pod security policy for the gke-metrics-agent pods: unprivileged, host
# networking permitted, volumes limited to hostPath / secret / configMap.
# NOTE(review): policy/v1beta1 PodSecurityPolicy is not served by Kubernetes
# v1.25 and later -- confirm the target cluster version before applying.
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
  name: gce.gke-metrics-agent
  labels:
    # Must stay a string, hence the quotes.
    kubernetes.io/cluster-service: "true"
  annotations:
    kubernetes.io/description: Policy used by the gke-metrics-agent addon.
    apparmor.security.beta.kubernetes.io/allowedProfileNames: runtime/default
    apparmor.security.beta.kubernetes.io/defaultProfileName: runtime/default
    seccomp.security.alpha.kubernetes.io/allowedProfileNames: runtime/default,docker/default
    seccomp.security.alpha.kubernetes.io/defaultProfileName: docker/default
spec:
  # Privilege settings.
  privileged: false
  allowPrivilegeEscalation: false
  readOnlyRootFilesystem: false
  # Host namespaces: only the host network is allowed (the DaemonSet sets
  # hostNetwork: true).
  hostNetwork: true
  hostIPC: false
  hostPID: false
  # Permitted volume types; hostPath is restricted to the CA certificate dir.
  volumes:
  - hostPath
  - secret
  - configMap
  allowedHostPaths:
  - pathPrefix: /etc/ssl/certs
  # No restrictions on user / group / SELinux context.
  runAsUser:
    rule: RunAsAny
  seLinux:
    rule: RunAsAny
  supplementalGroups:
    rule: RunAsAny
  fsGroup:
    rule: RunAsAny
---
# Cluster-wide permissions for the gke-metrics-agent service account.
# Uses rbac.authorization.k8s.io/v1: the v1beta1 version of this API is no
# longer served as of Kubernetes v1.22, and the v1 schema is identical.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: gke-metrics-agent
rules:
# Read access to node objects.
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
# List/watch pods (the "addons" scrape job in the agent config uses pod
# service discovery).
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - list
  - watch
# Permission to run under the gce.gke-metrics-agent PodSecurityPolicy.
- apiGroups:
  - policy
  resourceNames:
  - gce.gke-metrics-agent
  resources:
  - podsecuritypolicies
  verbs:
  - use
---
# Grants the gke-metrics-agent ClusterRole to the gke-metrics-agent service
# account in the default namespace.
# Uses rbac.authorization.k8s.io/v1: the v1beta1 version of this API is no
# longer served as of Kubernetes v1.22, and the v1 schema is identical.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: gke-metrics-agent
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gke-metrics-agent
subjects:
- kind: ServiceAccount
  name: gke-metrics-agent
  namespace: default
---
# linux deployment
# Runs one gke-metrics-agent pod per Linux node, in the default namespace,
# using the agent config from the gke-metrics-agent-conf ConfigMap.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gke-metrics-agent
  namespace: default
  labels:
    k8s-app: gke-metrics-agent
    component: gke-metrics-agent
spec:
  selector:
    matchLabels:
      k8s-app: gke-metrics-agent
      component: gke-metrics-agent
  template:
    metadata:
      labels:
        k8s-app: gke-metrics-agent
        component: gke-metrics-agent
    spec:
      nodeSelector:
        kubernetes.io/os: linux
      # Tolerate every NoExecute / NoSchedule taint so the agent is scheduled
      # on all Linux nodes.
      tolerations:
      - effect: NoExecute
        operator: Exists
      - effect: NoSchedule
        operator: Exists
      # Host networking: the agent config targets node-local endpoints via
      # KUBELET_HOST (127.0.0.1).
      hostNetwork: true
      # `serviceAccountName` is the current field; `serviceAccount` is its
      # deprecated alias.
      serviceAccountName: gke-metrics-agent
      containers:
      - name: gke-metrics-agent
        image: "gcr.io/gke-release/gke-metrics-agent:0.1.3-gke.0"
        # Memory request/limit for the agent. These are the values to raise
        # if the container keeps getting OOMKilled.
        resources:
          requests:
            memory: 50Mi
            cpu: 3m
          limits:
            memory: 70Mi
        env:
        # Downward-API values referenced by the agent config.
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: KUBELET_HOST
          value: "127.0.0.1"
        # ARG1/ARG2 hold the literal strings "${1}" / "${2}"; presumably this
        # lets ${ARG1}/${ARG2} in the agent config resolve to regex
        # capture-group references in the relabel rules (TODO confirm).
        - name: ARG1
          value: "${1}"
        - name: ARG2
          value: "${2}"
        - name: WINDOWS_JOB_ACTION
          value: "drop"
        command:
        - "/otelsvc"
        - "--config=/conf/gke-metrics-agent-config.yaml"
        - "--metrics-level=NONE"
        volumeMounts:
        - name: gke-metrics-agent-config-vol
          mountPath: /conf
        - name: ssl-certs
          mountPath: /etc/ssl/certs
          readOnly: true
      volumes:
      # Projects the ConfigMap key as /conf/gke-metrics-agent-config.yaml.
      - configMap:
          name: gke-metrics-agent-conf
          items:
          - key: gke-metrics-agent-config
            path: gke-metrics-agent-config.yaml
        name: gke-metrics-agent-config-vol
      # Host CA certificates, mounted read-only into the container.
      - name: ssl-certs
        hostPath:
          path: /etc/ssl/certs
---
# windows deployment
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gke-metrics-agent-windows
  namespace: default
  labels:
    k8s-app: gke-metrics-agent
    component: gke-metrics-agent
spec:
  selector:
    matchLabels:
      k8s-app: gke-metrics-agent
      component: gke-metrics-agent
  template:
    metadata:
      labels:
        k8s-app: gke-metrics-agent
        component: gke-metrics-agent
    spec:
      nodeSelector:
        kubernetes.io/os: windows
      tolerations:
      - effect: NoExecute
        key: node.kubernetes.io/not-ready
        operator: Exists
        tolerationSeconds: 300
      - effect: NoExecute
        key: node.kubernetes.io/unreachable
        operator: Exists