我们正在使用 AWS CloudWatch 来监控 CPU 使用率、API 调用的 p99 延迟等指标。问题是,在高峰流量期间,Amazon CloudWatch Agent 本身的 CPU 使用率就达到 25%-35%,因此在很大程度上导致了高 CPU 使用率告警被触发。我观察到 p99 延迟指标和 CPU 使用率指标之间存在直接关联。
我将 Amazon CloudWatch Agent 的配置文件粘贴在下面:
# Agent-wide settings, as pasted from the Amazon CloudWatch Agent configuration file.
# NOTE(review): the key names look like Telegraf-style agent options; the meanings
# noted below are assumptions to confirm against the agent's documentation.
[agent]
collection_jitter = "0s"
debug = false
# NOTE(review): presumably how often buffered metrics are flushed to the output.
# At "1s" this is far more frequent than the "60s" interval below, so it is worth
# checking when investigating the agent's own CPU usage.
flush_interval = "1s"
flush_jitter = "0s"
# Left empty; presumably the agent then determines the host name itself (TODO confirm).
hostname = ""
# NOTE(review): presumably the default collection interval applied to inputs (confirm).
interval = "60s"
logfile = "/opt/aws/amazon-cloudwatch-agent/logs/amazon-cloudwatch-agent.log"
logtarget = "lumberjack"
# NOTE(review): presumably the number of metrics sent per write and the cap on
# metrics held in memory while waiting to be flushed (confirm).
metric_batch_size = 1000
metric_buffer_limit = 10000
omit_hostname = false
precision = ""
quiet = false
round_interval = false
[inputs]
[[inputs.cpu]]
fieldpass = ["usage_active"]
interval = …