【Prometheus】cAdvisor Container Monitoring Rules


For further notes, see the host monitoring rules: https://www.cnblogs.com/sanduzxcvbnm/p/13589848.html
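
These rules assume Prometheus is already scraping cAdvisor. A minimal sketch of a scrape job, assuming cAdvisor is exposed on port 8080 on each Docker host (the job name and target address below are placeholders):

scrape_configs:
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['192.168.1.10:8080']  # replace with your cAdvisor endpoints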

Create a new docker.yml file in the rules directory under the Prometheus installation directory, add the following content, and then restart Prometheus.

groups:
- name: Docker containers monitoring
  rules:
  - alert: ContainerKilled
    expr: time() - container_last_seen > 60
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container killed (instance {{ $labels.instance }})"
      description: "A container has disappeared\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ContainerCpuUsage
    expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container CPU usage (instance {{ $labels.instance }})"
      description: "Container CPU usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ContainerMemoryUsage
    expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container Memory usage (instance {{ $labels.instance }})"
      description: "Container Memory usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ContainerVolumeUsage
    expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container Volume usage (instance {{ $labels.instance }})"
      description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ContainerVolumeIoUsage
    expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container Volume IO usage (instance {{ $labels.instance }})"
      description: "Container Volume IO usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ContainerHighThrottleRate
    expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container high throttle rate (instance {{ $labels.instance }})"
      description: "Container is being throttled\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PgbouncerActiveConnections
    expr: pgbouncer_pools_server_active_connections > 200
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "PGBouncer active connectinos (instance {{ $labels.instance }})"
      description: "PGBouncer pools are filling up\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PgbouncerErrors
    expr: increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[5m]) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "PGBouncer errors (instance {{ $labels.instance }})"
      description: "PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: PgbouncerMaxConnections
    expr: rate(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "PGBouncer max connections (instance {{ $labels.instance }})"
      description: "The number of PGBouncer client connections has reached max_client_conn.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: SidekiqQueueSize
    expr: sidekiq_queue_size{} > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Sidekiq queue size (instance {{ $labels.instance }})"
      description: "Sidekiq queue {{ $labels.name }} is growing\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: SidekiqSchedulingLatencyTooHigh
    expr: max(sidekiq_queue_latency) > 120
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Sidekiq scheduling latency too high (instance {{ $labels.instance }})"
      description: "Sidekiq jobs are taking more than 2 minutes to be picked up. Users may be seeing delays in background processing.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ConsulServiceHealthcheckFailed
    expr: consul_catalog_service_node_healthy == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
      description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ConsulMissingMasterNode
    expr: consul_raft_peers < 3
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Consul missing master node (instance {{ $labels.instance }})"
      description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
  - alert: ConsulAgentUnhealthy
    expr: consul_health_node_status{status="critical"} == 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Consul agent unhealthy (instance {{ $labels.instance }})"
      description: "A Consul agent is down\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"