From 9d54ffaeefe4e13b6e9ec98dbaf6b119070f8773 Mon Sep 17 00:00:00 2001
From: Miroslav Hadzhiev
Date: Tue, 6 Nov 2018 10:58:39 +0200
Subject: [PATCH 1/2] Configure Prometheus to Properly Send Alerts to Slack Channels

As an ops engineer, I want an alert when a pod restarts more than 5 times,
so that I can verify the health of the deployment

As an ops engineer, I want an alert when a node's CPU usage reaches a certain
threshold, so that I can preemptively handle a possible outage

Monitor Istio's Metrics with NYNJA's External Prometheus Instance
---
 prometheus/releases/dev/prometheus.yaml       | 1198 +++++++++++------
 .../templates/alertmanager-deployment.yaml    |   38 +-
 .../templates/alertmanager-rpl-configmap.yaml |   32 +
 ...alertmanager-slackapiurl-sealedsecret.yaml |   15 +
 .../pagerduty-servicekey-sealedsecrets.yaml   |   15 +
 prometheus/templates/server-deployment.yaml   |    8 +-
 prometheus/values.yaml                        |  401 +++++-
 7 files changed, 1228 insertions(+), 479 deletions(-)
 create mode 100644 prometheus/templates/alertmanager-rpl-configmap.yaml
 create mode 100644 prometheus/templates/alertmanager-slackapiurl-sealedsecret.yaml
 create mode 100644 prometheus/templates/pagerduty-servicekey-sealedsecrets.yaml

diff --git a/prometheus/releases/dev/prometheus.yaml b/prometheus/releases/dev/prometheus.yaml
index 278a655..af4d1dd 100644
--- a/prometheus/releases/dev/prometheus.yaml
+++ b/prometheus/releases/dev/prometheus.yaml
@@ -59,6 +59,21 @@ spec:
       ## to NOT generate a ConfigMap resource
       ##
       configMapOverrideName: ""
+      pagerduty:
+        ## If true, PagerDuty will be enabled for Alertmanager
+        ##
+        enabled: true
+        ## The sealed PagerDuty Service Key (sealed with Bitnami's SealedSecret)
+        servicekey: "AgB9OBhMUBSsH40xfk75iG86hQpPRrlHMrt+qF0zgIJa6ELlWWDz5ZyR0bNIFpZ05rcRT0CHgaIpgsDii8JZU0GcMor7jO96f1GR8ryqOtijKJgxMvKgCN0ZIJaSLBKNKEEwWcN2WKIe8HDaSp39fg91lp4nde7tHGDnRv76nuukFzILFhJ8aTt8a2+5nQhrNGsbCUZSoOI0nyHuw79dpamPJc3GfVYHwmZRxWxp9Pxi7JN7699fYQUS/RCdxzOw9uKCS1iwEFFXDZRWq27ylsjp0QLx+6tdWF1fWuYh1kmAaHB8ud5ZXnOde1IshgkZvhh50Du94EY4ywdyOtDFVe5aufkI9Mj1WM9Vb9fu6iDl7WDFH8QOefsYzi1HSyDoFz7IlD6h+PCuzsU6xGCmHOpad/73apSjW5vRRqTPRJNRvisr/OnSn5+wVKTsNBqe700pBKtjuXJ1lpIMQssXz1CJClwZ1ssS7EhMTnodU+J2CdsABskVW1Nj2cDHUGvEp32N39TTpidMG0Qhj/0qdpLcpUZZRGPzRw3Z3tsEtyMv2CxkgQgSCMrH5VvVkPltQ2lPAolMg0zu/3fjLgeKPf+ba24GORckRIrW4Y+Vwm5gn8OUI1eypBTrOdvFO7yIdtzDOzV4N2pPD/thZRFRk48MDxXXtlnZQzcdq6MEXj1s6nhSHQ0fKHYIqpCzA1zpqpkyoLGe9FfbWDMipk6YeFXsmnTLPepHdi5SUlQDAt7Eeg=="
+
+      slackapi:
+        ## If true, the specified Slack Channel will be used with Alertmanager and PagerDuty
+        ##
+        enabled: true
+        ## The sealed Slack Channel API URL (sealed with Bitnami's SealedSecret)
+        url: "AgARq4InE0N92opyhRiR7dXnHrtovhFnR1BaKc5eI0tKNhLxZNErNoLuICOePtcI3z8f229NBMHaCBQwjqTQxajATSliHv3Ti2kDEqN0ZQFIVgdBnQef7jgamS5q3fZM8Jd6hSnHE3vBCIqh2REMlCCxhM7OtSqNtzed8uCHqUtjo4fHmL6aG/P+QXtd9kybmIFhbWb9l3BQLEVujvZsLdrFLv1wGHuoSgqiWLmM6H+AfwGdZzylRIrzoRICeYHIGZBQgKMfHIF946R90PLbmydaA0JHej3mVtdtXNkRUXuLR/pupvZHkMIBBcvBwW/vDR+VEwheO+KMmo9vNvQ5aJSD7qRHGC0ZdySa4s+UjeKe4Qdf0lwjhqyr74scOs9MflLn2XY6UlnHQU8RZOSrlC+Pw/NqQDO3jPk2RfpNaf6Mm9FnkO8ThpMoZlM42u7Aj/BG0mGMHX0S5OtkjvEQk6zQgp1HvOgyfAlABH/fX6TlU9z0du7hI8JZWELS1RvJ7EnXhu0oEmOqpl23YjDtoihG7OFMyg4kvenS8rEXYsRWfbbW8Q2iEQBqDMck4PSkBjSKMp76UgmuFY7fwL724eO9kpO9Oigb01BtItPZv61eMkvvYN0EUzeoFz9omjg5euvlK0X08hWchG8/moaa7HUlysZNVzIYGd0FTjs5cip85Sq8r7QPPoLsWfUb7A80e4WVv44G4LuxMTB4uNw6n8ewAqf90pHSUMRQ+N8jycZYkiz9oGOOAwO4sWVEKu8Qlpf3QAkJQHIMi8Y4AFmxr+EITKB7sQFmM9zT4Uozog=="
+
+
       ingress:
         ## If true, alertmanager Ingress will be created
         ##
@@ -126,7 +141,6 @@ spec:
         mountPath:
/data ## alertmanager data Persistent Volume size ## - # size: 2Gi size: 10Gi ## alertmanager data Persistent Volume Storage Class ## If defined, storageClassName: @@ -510,7 +524,6 @@ spec: mountPath: /data ## Prometheus server data Persistent Volume size ## - # size: 8Gi size: 50Gi ## Prometheus server data Persistent Volume Storage Class ## If defined, storageClassName: @@ -539,16 +552,19 @@ spec: requests: cpu: 500m memory: 512Mi - + ## Security context to be added to server pods ## securityContext: {} + externalLabels: + cluster: "DEV" + service: annotations: nynja.biz/scrape: "true" nynja.biz/scrape_port: "80" - nynja.biz/env: "dev" - nynja.biz/probe: "prometheus" + nynja.biz/env: "dev" + nynja.biz/probe: "prometheus" labels: {} clusterIP: "" ## List of IP addresses at which the Prometheus server service is available @@ -565,13 +581,17 @@ spec: selector: - internal-gateway.default.svc.cluster.local hosts: - - prometheus.dev-eu.nynja.net + - prometheus.dev-eu.nynja.net ## Prometheus server pod termination grace period ## terminationGracePeriodSeconds: 300 ## Prometheus data retention period (i.e 360h) ## retention: "" + ## Set the namespace where Istio is installed - default: "istio-system" + namespace: + istio: "istio-system" + pushgateway: ## If false, pushgateway will not be installed ## @@ -579,7 +599,7 @@ spec: ## pushgateway container name ## name: pushgateway - + ## pushgateway container image ## image: @@ -659,419 +679,759 @@ spec: ## alertmanager ConfigMap entries ## alertmanagerFiles: - alertmanager.yml: - global: - # slack_api_url: '' - slack_api_url: https://hooks.slack.com/services/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 - receivers: - - name: default-receiver - slack_configs: - - channel: '#ops-alerts' - send_resolved: true - username: 'alerts{{ if eq .Status "firing" }}_firing{{ else }}_resolved{{ end }}' - title: '{{ template "slack.default.title" . }}' - text: >- - {{ range .Alerts }} - *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity | toUpper }}` <{{ .GeneratorURL }}|:chart_with_upwards_trend:> - *Description:* {{ .Annotations.description }} - *Details:* - {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` - {{ end }} - {{ end }} - icon_emoji: '{{ if eq .Status "firing" }}:fire:{{ else }}:sun_with_face:{{ end }}' - route: - group_wait: 30s - group_interval: 5m - receiver: default-receiver - repeat_interval: 3h - #group_by: ['alertname', 'cluster', 'env'] - group_by: ['alertname', 'cluster'] - routes: - - match: - env: dev - group_wait: 5m - repeat_interval: 24h - ## Prometheus server ConfigMap entries - ## - serverFiles: - alerts: {} + notifications.tpl: |- + {{ define "__alertmanager" }}Environment: ___PROMETHEUS_CLUSTER_NAME___{{ end }} + {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} + + {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} + {{ define "__description" }}{{ end }} + + {{ define "__text_alert_list" }}{{ range . }}Labels: + {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Annotations: + {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Source: {{ .GeneratorURL }} + {{ end }}{{ end }} + + + {{ define "slack.default.title" }}{{ template "__subject" . 
}}{{ end }} + {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} + {{ define "slack.default.pretext" }}{{ end }} + {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }} + {{ define "slack.default.iconemoji" }}{{ end }} + {{ define "slack.default.iconurl" }}{{ end }} + {{ define "slack.default.text" }}{{ end }} + {{ define "slack.default.footer" }}{{ end }} + + alertmanager.yml: + global: + # slack_api_url: '' + slack_api_url: ___ALERTMANAGER_SLACK_API_URL___ + + receivers: + - name: default-receiver + + slack_configs: + - channel: '#ops-alerts' + send_resolved: false + username: '{{ template "slack.default.username" . }}' + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "slack.default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ .CommonAnnotations.summary }}' + text: |- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} + fallback: '{{ template "slack.default.fallback" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + + pagerduty_configs: + - service_key: ___PAGERDUTY_SERVICEKEY___ + + templates: + - /automations/notifications.tpl + + route: + group_wait: 30s + group_interval: 5m + receiver: default-receiver + repeat_interval: 3h + #group_by: ['alertname', 'cluster', 'env'] + group_by: ['alertname', 'cluster'] + routes: + - match: + env: dev + group_wait: 5m + repeat_interval: 24h + + +## Prometheus server ConfigMap entries +## +serverFiles: + alerts: {} + rules: + groups: + - name: general.rules rules: - groups: - - name: Cassandra - rules: - - alert: CassandraProbeDown - expr: up{job="cassandra"} != 3 - for: 30s - labels: - severity: major - annotations: - summary: "Cassandra probe down" - description: "The Cassandra probe of {{ $labels.instance }} ({{$labels.env}}) is down" - - alert: CassandraDashboardNotReachable - expr: probe_success{job="cassandra"} == 0 - for: 30s - labels: - severity: major - annotations: - summary: "Cassandra dashboard not reachable" - description: "The Cassandra dashboard on {{ $labels.instance }} ({{ $labels.env }}) is not reachable" - - alert: CassandraDeadNodes - expr: kube_statefulset_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} - kube_statefulset_status_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} > 0 - for: 30s - labels: - severity: major - annotations: - summary: Cassandra dead nodes count - description: "The Cassandra node of {{ $labels.instance }} ({{$labels.env}}) is down" - - alert: CassandraConnectionTimeouts - expr: sum(cassandra_stats{datacenter="europe-west3", cluster="cassandra",name="org:apache:cassandra:metrics:connection:totaltimeouts:oneminuterate"}) by (name) > 1 - for: 30s - labels: - severity: major - annotations: - summary: Cassandra - number of requests timeouts over 1 min - description: "Cassandra cluster: {{$labels.release}} ({{$labels.env}}) - number of requests timeouts over 1 min" - - name: Monitoring - rules: - - alert: MonitoringStackProbeDown - expr: up{job="monitoring-stack"} != 2 - for: 30s - labels: - severity: major - annotations: - 
summary: "Monitoring Stack probe down" - description: "The Monitoring Stack probe of {{ $labels.instance }} ({{$labels.env}}) is down" - prometheus.yml: - rule_files: - - /etc/config/rules - - /etc/config/alerts - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - labels: - env: dev - #- job_name: grafana - # scrape_interval: 15s - # scrape_timeout: 10s - # metrics_path: /metrics - # scheme: http - # static_configs: - # - targets: - # - "grafana.dev-eu.nynja.net:80" - # labels: - # env: dev - - job_name: 'grafana' - honor_labels: true - kubernetes_sd_configs: - - role: service - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: grafana - - job_name: 'prometheus-pushgateway' - honor_labels: true - kubernetes_sd_configs: - - role: service - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: pushgateway - - job_name: 'prometheus-alertmanager' - honor_labels: true - kubernetes_sd_configs: - - role: service - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: alertmanager - #- job_name: prometheus-alertmanager - # scrape_interval: 15s - # scrape_timeout: 10s - # metrics_path: /metrics - # scheme: http - # static_configs: - # - targets: - # - "10.43.242.113:80" - # labels: - # env: dev - # A scrape configuration for running Prometheus on a Kubernetes cluster. - # This uses separate scrape configs for cluster components (i.e. API server, node) - # and services to allow each to use different authentication configs. - # - # Kubernetes labels will be added as Prometheus labels on metrics via the - # `labelmap` relabeling action. - # Scrape config for API servers. - # - # Kubernetes exposes API servers as endpoints to the default/kubernetes - # service so this uses `endpoints` role and uses relabelling to only keep - # the endpoints associated with the default/kubernetes service using the - # default named port `https`. This works for single API server deployments as - # well as HA API server deployments. - - job_name: 'kubernetes-apiservers' - kubernetes_sd_configs: - - role: endpoints - # Default to scraping over https. 
If required, just disable this or change to - # `http`. - scheme: https - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - # Keep only the default/kubernetes service endpoints for the https port. This - # will add targets for each API server which Kubernetes adds an endpoint to - # the default/kubernetes service. - relabel_configs: - - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: default;kubernetes;https - - job_name: 'kubernetes-nodes' - # Default to scraping over https. If required, just disable this or change to - # `http`. - scheme: https - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics - - job_name: 'kubernetes-nodes-cadvisor' - # Default to scraping over https. If required, just disable this or change to - # `http`. - scheme: https - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . 
- tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - - role: node - # This configuration will work only on kubelet 1.7.3+ - # As the scrape endpoints for cAdvisor have changed - # if you are using older version you need to change the replacement to - # replacement: /api/v1/nodes/${1}:4194/proxy/metrics - # more info here https://github.com/coreos/prometheus-operator/issues/633 - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - # Scrape config for service endpoints. - # - # The relabeling allows the actual service scrape endpoint to be configured - # via the following annotations: - # - # * `prometheus.io/scrape`: Only scrape services that have a value of `true` - # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: If the metrics are exposed on a different port to the - # service then set this appropriately. - - job_name: 'kubernetes-service-endpoints' - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name - # Example scrape config for probing services via the Blackbox Exporter. - # - # The relabeling allows the actual service scrape endpoint to be configured - # via the following annotations: + - alert: TargetDown + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' 
+ summary: Targets are down + - alert: DaemonsetFailedPods + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready > 0 + for: 5m + labels: + severity: critical + annotations: + description: 'DaemonSet {{ $labels.daemonset }} is down on at least one node for more than 5 minutes' + summary: DaemonSet {{ $labels.daemonset }} is down on at least one node for more than 5 minutes + + - name: Cassandra + rules: + - alert: CassandraProbeDown + expr: up{job="cassandra"} != 3 + for: 30s + labels: + severity: major + annotations: + summary: "Cassandra probe down" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Cassandra probe of {{ $labels.instance }} ({{- $labels.env -}}) is down" + - alert: CassandraDashboardNotReachable + expr: probe_success{job="cassandra"} == 0 + for: 30s + labels: + severity: major + annotations: + summary: "Cassandra dashboard not reachable" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Cassandra dashboard on {{ $labels.instance }} ({{- $labels.env -}}) is not reachable" + - alert: CassandraDeadNodes + expr: kube_statefulset_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} - kube_statefulset_status_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} > 0 + for: 30s + labels: + severity: major + annotations: + summary: Cassandra dead nodes count + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Cassandra node of {{ $labels.instance }} ({{- $labels.env -}}) is down" + - alert: CassandraConnectionTimeouts + expr: sum(cassandra_stats{datacenter="europe-west3", cluster="cassandra",name="org:apache:cassandra:metrics:connection:totaltimeouts:oneminuterate"}) by (name) > 1 + for: 30s + labels: + severity: major + annotations: + summary: Cassandra - number of requests timeouts over 1 min + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "Cassandra cluster: {{ $labels.release }} ({{- $labels.env -}}) - number of requests timeouts over 1 min" + - alert: MonitoringStackProbeDown + expr: up{job="monitoring-stack"} != 2 + for: 30s + labels: + severity: major + annotations: + summary: "Monitoring Stack probe down" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Monitoring Stack probe of {{ $labels.instance }} ({{- $labels.env -}}) is down" + + - name: kube-state-metrics.rules + rules: + - alert: K8SDaemonSetsNotScheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled + > 0 + for: 10m + labels: + severity: warning + annotations: + summary: Daemonsets are not scheduled correctly + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: A number of daemonsets are not scheduled. 
+      - alert: DaemonSetRolloutStuck
+        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
+          * 100 < 100
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: DaemonSet is missing pods
+          identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}"
+          description: Only {{ $value }}% of desired pods scheduled and ready for daemon
+            set {{ $labels.namespaces }}/{{ $labels.daemonset }}
+      - alert: PodFrequentlyRestarting
+        expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: Pod is restarting frequently
+          identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}"
+          description: Pod {{ $labels.pod }} on {{ $labels.instance }} ({{- $labels.env -}}) was restarted {{ $value }}
+            times within the last hour
+
+    - name: kubernetes.rules
+      rules:
+      - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
+        expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
+          "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
+          controller, pod_name, container_name)
+      - record: cluster_namespace_controller_pod_container:memory_usage:bytes
+        expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
+          "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
+          container_name)
+      - record: cluster:memory_used:percent
+        expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
+          BY (cluster)
+
+    - name: node.rules
+      rules:
+      - alert: NodeCPUUsage
+        expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[5m])) * 100)) > 80
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          summary: "{{ $labels.instance }}: High Node CPU usage detected"
+          identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}"
+          description: "{{ $labels.instance }}: Node CPU usage is above 80% (current value is: {{ $value }})"
+      - alert: NodeMemoryUsage
+        expr: ( avg by (instance) ((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Cached_bytes)/(node_memory_MemTotal_bytes)*100)) > 80
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          summary: "{{ $labels.instance }}: High node memory usage detected"
+          identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}"
+          description: "{{ $labels.instance }}: Node Memory usage is above 80% (current value is: {{ $value }})"
+
+    - name: istio.rules
+      rules:
+      - alert: IstioRequestsToCallingService
+        expr: sum(rate(istio_requests_total{reporter="source",destination_service=~"calling-service.callconf.svc.cluster.local"}[30s])) > 20
+        for: 1m
+        labels:
+          severity: major
+        annotations:
+          summary: "{{ $labels.instance }}: High Number of Istio Requests to Calling-Service"
+          identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}"
+          description: "{{ $labels.instance }}: The rate of Istio requests to Calling-Service is above 20 per second (current value is: {{ $value }})"
+
+  prometheus.yml:
+    rule_files:
+      - /data/etc/config/rules
+      - /data/etc/config/alerts
+
+    scrape_configs:
+      - job_name: prometheus
+        static_configs:
+          - targets:
+            - localhost:9090
+            labels:
+              env: dev
+
+      #- job_name: grafana
+      #  scrape_interval: 15s
+      #  scrape_timeout: 10s
+      #  metrics_path: /metrics
+      #  scheme: http
+      #  static_configs:
+      #    - targets:
+      #      -
"grafana.dev-eu.nynja.net:80" + # labels: + # env: dev + + - job_name: 'grafana' + honor_labels: true + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] + action: keep + regex: true + - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] + action: replace + target_label: env + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] + action: keep + regex: grafana + + + - job_name: 'prometheus-pushgateway' + honor_labels: true + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] + action: keep + regex: true + - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] + action: replace + target_label: env + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] + action: keep + regex: pushgateway + + - job_name: 'prometheus-alertmanager' + honor_labels: true + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] + action: keep + regex: true + - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] + action: replace + target_label: env + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] + action: keep + regex: alertmanager + + + #- job_name: prometheus-alertmanager + # scrape_interval: 15s + # scrape_timeout: 10s + # metrics_path: /metrics + # scheme: http + # static_configs: + # - targets: + # - "10.43.242.113:80" + # labels: + # env: dev + + # A scrape configuration for running Prometheus on a Kubernetes cluster. + # This uses separate scrape configs for cluster components (i.e. API server, node) + # and services to allow each to use different authentication configs. + # + # Kubernetes labels will be added as Prometheus labels on metrics via the + # `labelmap` relabeling action. + + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'kubernetes-apiservers' + + kubernetes_sd_configs: + - role: endpoints + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . 
+ tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. # - # * `prometheus.io/probe`: Only probe services that have a value of `true` - - job_name: 'kubernetes-services' - metrics_path: /probe - params: - module: [http_2xx] - kubernetes_sd_configs: - - role: service - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] - action: keep - regex: true - - source_labels: [__address__] - target_label: __param_target - - target_label: __address__ - replacement: blackbox - - source_labels: [__param_target] - target_label: instance - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - target_label: kubernetes_name - # Example scrape config for pods + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + + + - job_name: 'kubernetes-nodes' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. # - # The relabeling allows the actual pod scrape endpoint to be configured via the - # following annotations: + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + + + - job_name: 'kubernetes-nodes-cadvisor' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. 
This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. # - # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. - - job_name: 'kubernetes-pods' - kubernetes_sd_configs: - - role: pod - relabel_configs: - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: kubernetes_pod_name - - job_name: 'cassandra' - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scheme] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: cassandra-exporter - networkPolicy: - ## Enable creation of NetworkPolicy resources. 
- ## - enabled: false \ No newline at end of file + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + # This configuration will work only on kubelet 1.7.3+ + # As the scrape endpoints for cAdvisor have changed + # if you are using older version you need to change the replacement to + # replacement: /api/v1/nodes/${1}:4194/proxy/metrics + # more info here https://github.com/coreos/prometheus-operator/issues/633 + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + + + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. + - job_name: 'kubernetes-service-endpoints' + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + + + + # Example scrape config for probing services via the Blackbox Exporter. 
+ # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/probe`: Only probe services that have a value of `true` + - job_name: 'kubernetes-services' + + metrics_path: /probe + params: + module: [http_2xx] + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: true + - source_labels: [__address__] + target_label: __param_target + - target_label: __address__ + replacement: blackbox + - source_labels: [__param_target] + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + target_label: kubernetes_name + + + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. + - job_name: 'kubernetes-pods' + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + + + - job_name: 'cassandra' + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] + action: replace + target_label: env + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] + action: keep + regex: cassandra-exporter + + + - job_name: 'istio-mesh' + # Override the global default and scrape targets from this job every 5 seconds. 
+ scrape_interval: 5s + + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-telemetry;prometheus + +# {{- if ne .Values.global.proxy.stats.prometheusPort 0. -}} + # Scrape config for envoy stats + - job_name: 'envoy-stats' + metrics_path: /stats/prometheus + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + action: keep + regex: '.*-envoy-prom' + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:{{ .Values.global.proxy.stats.prometheusPort }} + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod_name + + metric_relabel_configs: + # Exclude some of the envoy metrics that have massive cardinality + # This list may need to be pruned further moving forward, as informed + # by performance and scalability testing. + - source_labels: [ cluster_name ] + regex: '(outbound|inbound|prometheus_stats).*' + action: drop + - source_labels: [ tcp_prefix ] + regex: '(outbound|inbound|prometheus_stats).*' + action: drop + - source_labels: [ listener_address ] + regex: '(.+)' + action: drop + - source_labels: [ http_conn_manager_listener_prefix ] + regex: '(.+)' + action: drop + - source_labels: [ http_conn_manager_prefix ] + regex: '(.+)' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_tls.*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_tcp_downstream.*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_http_(stats|admin).*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_cluster_(lb|retry|bind|internal|max|original).*' + action: drop +# {{- end -}} + + - job_name: 'istio-policy' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-policy;http-monitoring + + - job_name: 'istio-telemetry' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-telemetry;http-monitoring + + - job_name: 'pilot' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. 
+
+        kubernetes_sd_configs:
+        - role: endpoints
+          namespaces:
+            names:
+            - "istio-system"
+
+        relabel_configs:
+        - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+          action: keep
+          regex: istio-pilot;http-monitoring
+
+      - job_name: 'galley'
+        # Override the global default and scrape targets from this job every 5 seconds.
+        scrape_interval: 5s
+        # metrics_path defaults to '/metrics'
+        # scheme defaults to 'http'.
+
+        kubernetes_sd_configs:
+        - role: endpoints
+          namespaces:
+            names:
+            - "istio-system"
+
+        relabel_configs:
+        - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+          action: keep
+          regex: istio-galley;http-monitoring
+
+
+networkPolicy:
+  ## Enable creation of NetworkPolicy resources.
+  ##
+  enabled: false
diff --git a/prometheus/templates/alertmanager-deployment.yaml b/prometheus/templates/alertmanager-deployment.yaml
index 34c27e6..6682122 100644
--- a/prometheus/templates/alertmanager-deployment.yaml
+++ b/prometheus/templates/alertmanager-deployment.yaml
@@ -34,20 +34,30 @@ spec:
         - name: {{ template "prometheus.name" . }}-{{ .Values.alertmanager.name }}
           image: "{{ .Values.alertmanager.image.repository }}:{{ .Values.alertmanager.image.tag }}"
           imagePullPolicy: "{{ .Values.alertmanager.image.pullPolicy }}"
+          command:
+            - "/bin/sh"
+            - "-c"
+            - "/automations/alertmanager_rpl.sh"
+          env:
+          {{- if and .Values.alertmanager.pagerduty.enabled .Values.alertmanager.slackapi.enabled }}
+            - name: PAGERDUTY_SERVICEKEY
+              valueFrom:
+                secretKeyRef:
+                  name: {{ template "prometheus.alertmanager.fullname" . }}-pagerduty-servicekey
+                  key: servicekey
+
+            - name: ALERTMANAGER_SLACK_API_URL
+              valueFrom:
+                secretKeyRef:
+                  name: {{ template "prometheus.alertmanager.fullname" . }}-slackapi-url
+                  key: slackapi
+          {{- end }}
+
          {{- range $key, $value := .Values.alertmanager.extraEnv }}
            - name: {{ $key }}
              value: {{ $value }}
          {{- end }}
-          args:
-            - --config.file=/etc/config/alertmanager.yml
-            - --storage.path={{ .Values.alertmanager.persistentVolume.mountPath }}
-          {{- range $key, $value := .Values.alertmanager.extraArgs }}
-            - --{{ $key }}={{ $value }}
-          {{- end }}
-          {{- if .Values.alertmanager.baseURL }}
-            - --web.external-url={{ .Values.alertmanager.baseURL }}
-          {{- end }}
          ports:
            - containerPort: 9093
@@ -65,6 +75,10 @@
            - name: storage-volume
              mountPath: "{{ .Values.alertmanager.persistentVolume.mountPath }}"
              subPath: "{{ .Values.alertmanager.persistentVolume.subPath }}"
+            - name: config-volume-alertmanager-rpl
+              mountPath: /automations/alertmanager_rpl.sh
+              readOnly: true
+              subPath: alertmanager_rpl.sh
        - name: {{ template "prometheus.name" . }}-{{ .Values.alertmanager.name }}-{{ .Values.configmapReload.name }}
          image: "{{ .Values.configmapReload.image.repository }}:{{ .Values.configmapReload.image.tag }}"
@@ -105,4 +119,10 @@
      {{- else }}
          emptyDir: {}
      {{- end -}}
+      {{- if and .Values.alertmanager.pagerduty.enabled .Values.alertmanager.slackapi.enabled }}
+        - name: config-volume-alertmanager-rpl
+          configMap:
+            name: {{ template "prometheus.alertmanager.fullname" . }}-rpl-configmap
+            defaultMode: 0700
+      {{- end }}
 {{- end }}
diff --git a/prometheus/templates/alertmanager-rpl-configmap.yaml b/prometheus/templates/alertmanager-rpl-configmap.yaml
new file mode 100644
index 0000000..510e078
--- /dev/null
+++ b/prometheus/templates/alertmanager-rpl-configmap.yaml
@@ -0,0 +1,32 @@
+{{- if and .Values.alertmanager.pagerduty.enabled .Values.alertmanager.slackapi.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  labels:
+    app: {{ template "prometheus.name" . }}
+    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
+    component: "{{ .Values.alertmanager.name }}-rpl"
+    heritage: {{ .Release.Service }}
+    release: {{ .Release.Name }}
+  name: {{ template "prometheus.alertmanager.fullname" . }}-rpl-configmap
+data:
+  alertmanager_rpl.sh: |-
+    #!/bin/sh
+
+    # Set the Prometheus Cluster Name in the Alertmanager's notifications
+    /bin/cp -p /etc/config/notifications.tpl /automations/notifications.tpl;
+    /bin/sed -i s:___PROMETHEUS_CLUSTER_NAME___:"{{ .Values.server.externalLabels.cluster }}": /automations/notifications.tpl;
+    cat /automations/notifications.tpl;
+
+    # Set the PagerDuty Service Key value in the Alertmanager's config
+    /bin/cp -p /etc/config/alertmanager.yml /etc/alertmanager/alertmanager_conf.yml;
+    /bin/sed -i s:___PAGERDUTY_SERVICEKEY___:${PAGERDUTY_SERVICEKEY}: /etc/alertmanager/alertmanager_conf.yml;
+
+    # Set the Alertmanager Slack API URL value in the Alertmanager's config
+    /bin/sed -i s~___ALERTMANAGER_SLACK_API_URL___~${ALERTMANAGER_SLACK_API_URL}~ /etc/alertmanager/alertmanager_conf.yml;
+    cat /etc/alertmanager/alertmanager_conf.yml;
+
+    # Start Alertmanager
+    /bin/alertmanager --config.file=/etc/alertmanager/alertmanager_conf.yml --storage.path={{ .Values.alertmanager.persistentVolume.mountPath }} {{- range $key, $value := .Values.alertmanager.extraArgs }} --{{ $key }}={{ $value }}{{- end }} {{- if .Values.alertmanager.baseURL }} --web.external-url={{ .Values.alertmanager.baseURL }}{{- end }}
+
+{{- end -}}
diff --git a/prometheus/templates/alertmanager-slackapiurl-sealedsecret.yaml b/prometheus/templates/alertmanager-slackapiurl-sealedsecret.yaml
new file mode 100644
index 0000000..79dc925
--- /dev/null
+++ b/prometheus/templates/alertmanager-slackapiurl-sealedsecret.yaml
@@ -0,0 +1,15 @@
+{{- if .Values.alertmanager.slackapi.enabled -}}
+apiVersion: bitnami.com/v1alpha1
+kind: SealedSecret
+metadata:
+  name: {{ template "prometheus.alertmanager.fullname" . }}-slackapi-url
+  labels:
+    app: {{ template "prometheus.name" . }}
+    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
+    component: "{{ .Values.alertmanager.name }}-slackapi-url"
+    heritage: {{ .Release.Service }}
+    release: {{ .Release.Name }}
+spec:
+  encryptedData:
+    slackapi: {{ .Values.alertmanager.slackapi.url }}
+{{- end -}}
diff --git a/prometheus/templates/pagerduty-servicekey-sealedsecrets.yaml b/prometheus/templates/pagerduty-servicekey-sealedsecrets.yaml
new file mode 100644
index 0000000..4068cfb
--- /dev/null
+++ b/prometheus/templates/pagerduty-servicekey-sealedsecrets.yaml
@@ -0,0 +1,15 @@
+{{- if .Values.alertmanager.pagerduty.enabled -}}
+apiVersion: bitnami.com/v1alpha1
+kind: SealedSecret
+metadata:
+  name: {{ template "prometheus.alertmanager.fullname" . }}-pagerduty-servicekey
+  labels:
+    app: {{ template "prometheus.name" .
}} + chart: {{ .Chart.Name }}-{{ .Chart.Version }} + component: "{{ .Values.alertmanager.name }}-pagerduty" + heritage: {{ .Release.Service }} + release: {{ .Release.Name }} +spec: + encryptedData: + servicekey: {{ .Values.alertmanager.pagerduty.servicekey }} +{{- end -}} diff --git a/prometheus/templates/server-deployment.yaml b/prometheus/templates/server-deployment.yaml index 2f1cb12..b952cbe 100644 --- a/prometheus/templates/server-deployment.yaml +++ b/prometheus/templates/server-deployment.yaml @@ -48,7 +48,7 @@ spec: image: "{{ .Values.configmapReload.image.repository }}:{{ .Values.configmapReload.image.tag }}" imagePullPolicy: "{{ .Values.configmapReload.image.pullPolicy }}" args: - - --volume-dir=/etc/config + - --volume-dir=/data/etc/config - --webhook-url=http://localhost:9090{{ .Values.server.prefixURL }}/-/reload {{- range $key, $value := .Values.configmapReload.extraArgs }} - --{{ $key }}={{ $value }} @@ -57,7 +57,7 @@ spec: {{ toYaml .Values.configmapReload.resources | indent 12 }} volumeMounts: - name: config-volume - mountPath: /etc/config + mountPath: /data/etc/config readOnly: true {{- range .Values.configmapReload.extraConfigmapMounts }} - name: {{ $.Values.configmapReload.name }}-{{ .name }} @@ -72,7 +72,7 @@ spec: {{- if .Values.server.retention }} - --storage.tsdb.retention={{ .Values.server.retention }} {{- end }} - - --config.file=/etc/config/prometheus.yml + - --config.file=/data/etc/config/prometheus.yml - --storage.tsdb.path={{ .Values.server.persistentVolume.mountPath }} - --web.console.libraries=/etc/prometheus/console_libraries - --web.console.templates=/etc/prometheus/consoles @@ -104,7 +104,7 @@ spec: {{ toYaml .Values.server.resources | indent 12 }} volumeMounts: - name: config-volume - mountPath: /etc/config + mountPath: /data/etc/config - name: storage-volume mountPath: {{ .Values.server.persistentVolume.mountPath }} subPath: "{{ .Values.server.persistentVolume.subPath }}" diff --git a/prometheus/values.yaml b/prometheus/values.yaml index 7e2dbe0..b625426 100644 --- a/prometheus/values.yaml +++ b/prometheus/values.yaml @@ -60,6 +60,20 @@ alertmanager: ## configMapOverrideName: "" + pagerduty: + ## If true, PagerDuty will be enabled for Alertmanager + ## + enabled: true + ## The sealed PagerDuty Service Key (sealed with Bitnami's SealedSecret) + servicekey: "AgB9OBhMUBSsH40xfk75iG86hQpPRrlHMrt+qF0zgIJa6ELlWWDz5ZyR0bNIFpZ05rcRT0CHgaIpgsDii8JZU0GcMor7jO96f1GR8ryqOtijKJgxMvKgCN0ZIJaSLBKNKEEwWcN2WKIe8HDaSp39fg91lp4nde7tHGDnRv76nuukFzILFhJ8aTt8a2+5nQhrNGsbCUZSoOI0nyHuw79dpamPJc3GfVYHwmZRxWxp9Pxi7JN7699fYQUS/RCdxzOw9uKCS1iwEFFXDZRWq27ylsjp0QLx+6tdWF1fWuYh1kmAaHB8ud5ZXnOde1IshgkZvhh50Du94EY4ywdyOtDFVe5aufkI9Mj1WM9Vb9fu6iDl7WDFH8QOefsYzi1HSyDoFz7IlD6h+PCuzsU6xGCmHOpad/73apSjW5vRRqTPRJNRvisr/OnSn5+wVKTsNBqe700pBKtjuXJ1lpIMQssXz1CJClwZ1ssS7EhMTnodU+J2CdsABskVW1Nj2cDHUGvEp32N39TTpidMG0Qhj/0qdpLcpUZZRGPzRw3Z3tsEtyMv2CxkgQgSCMrH5VvVkPltQ2lPAolMg0zu/3fjLgeKPf+ba24GORckRIrW4Y+Vwm5gn8OUI1eypBTrOdvFO7yIdtzDOzV4N2pPD/thZRFRk48MDxXXtlnZQzcdq6MEXj1s6nhSHQ0fKHYIqpCzA1zpqpkyoLGe9FfbWDMipk6YeFXsmnTLPepHdi5SUlQDAt7Eeg==" + + slackapi: + ## If true, the specified Slack Channel will be used with Alertmanager and PagerDuty + ## + enabled: true + ## The sealed Slack Channel API URL (sealed with Bitnami's SealedSecret) + url: 
"AgARq4InE0N92opyhRiR7dXnHrtovhFnR1BaKc5eI0tKNhLxZNErNoLuICOePtcI3z8f229NBMHaCBQwjqTQxajATSliHv3Ti2kDEqN0ZQFIVgdBnQef7jgamS5q3fZM8Jd6hSnHE3vBCIqh2REMlCCxhM7OtSqNtzed8uCHqUtjo4fHmL6aG/P+QXtd9kybmIFhbWb9l3BQLEVujvZsLdrFLv1wGHuoSgqiWLmM6H+AfwGdZzylRIrzoRICeYHIGZBQgKMfHIF946R90PLbmydaA0JHej3mVtdtXNkRUXuLR/pupvZHkMIBBcvBwW/vDR+VEwheO+KMmo9vNvQ5aJSD7qRHGC0ZdySa4s+UjeKe4Qdf0lwjhqyr74scOs9MflLn2XY6UlnHQU8RZOSrlC+Pw/NqQDO3jPk2RfpNaf6Mm9FnkO8ThpMoZlM42u7Aj/BG0mGMHX0S5OtkjvEQk6zQgp1HvOgyfAlABH/fX6TlU9z0du7hI8JZWELS1RvJ7EnXhu0oEmOqpl23YjDtoihG7OFMyg4kvenS8rEXYsRWfbbW8Q2iEQBqDMck4PSkBjSKMp76UgmuFY7fwL724eO9kpO9Oigb01BtItPZv61eMkvvYN0EUzeoFz9omjg5euvlK0X08hWchG8/moaa7HUlysZNVzIYGd0FTjs5cip85Sq8r7QPPoLsWfUb7A80e4WVv44G4LuxMTB4uNw6n8ewAqf90pHSUMRQ+N8jycZYkiz9oGOOAwO4sWVEKu8Qlpf3QAkJQHIMi8Y4AFmxr+EITKB7sQFmM9zT4Uozog==" + ingress: ## If true, alertmanager Ingress will be created ## @@ -141,7 +155,7 @@ alertmanager: ## alertmanager data Persistent Volume size ## - size: 2Gi + size: 4Gi ## alertmanager data Persistent Volume Storage Class ## If defined, storageClassName: @@ -167,12 +181,12 @@ alertmanager: ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## resources: {} - # limits: - # cpu: 10m - # memory: 32Mi - # requests: - # cpu: 10m - # memory: 32Mi + # limits: + # cpu: 50m + # memory: 160Mi + # requests: + # cpu: 10m + # memory: 32Mi ## Security context to be added to alertmanager pods ## @@ -586,7 +600,7 @@ server: ## Prometheus server data Persistent Volume size ## - size: 8Gi + size: 15Gi ## Prometheus server data Persistent Volume Storage Class ## If defined, storageClassName: @@ -624,12 +638,15 @@ server: ## securityContext: {} + externalLabels: {} + #cluster: "DEV" + service: annotations: nynja.biz/scrape: "true" nynja.biz/scrape_port: "80" - nynja.biz/env: "dev" - nynja.biz/probe: "prometheus" + #nynja.biz/env: "dev" + nynja.biz/probe: "prometheus" labels: {} clusterIP: "" @@ -658,6 +675,10 @@ server: ## retention: "" + ## Set the namespace where Istio is installed - default: "istio-system" + namespace: + istio: "istio-system" + pushgateway: ## If false, pushgateway will not be installed ## @@ -761,27 +782,64 @@ pushgateway: ## alertmanager ConfigMap entries ## alertmanagerFiles: + notifications.tpl: |- + {{ define "__alertmanager" }}Environment: ___PROMETHEUS_CLUSTER_NAME___{{ end }} + {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} + + {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} + {{ define "__description" }}{{ end }} + + {{ define "__text_alert_list" }}{{ range . }}Labels: + {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Annotations: + {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Source: {{ .GeneratorURL }} + {{ end }}{{ end }} + + + {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} + {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} + {{ define "slack.default.pretext" }}{{ end }} + {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . 
}}{{ end }} + {{ define "slack.default.iconemoji" }}{{ end }} + {{ define "slack.default.iconurl" }}{{ end }} + {{ define "slack.default.text" }}{{ end }} + {{ define "slack.default.footer" }}{{ end }} + alertmanager.yml: global: # slack_api_url: '' - slack_api_url: https://hooks.slack.com/services/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 + slack_api_url: ___ALERTMANAGER_SLACK_API_URL___ receivers: - name: default-receiver + slack_configs: - channel: '#ops-alerts' - send_resolved: true - username: 'alerts{{ if eq .Status "firing" }}_firing{{ else }}_resolved{{ end }}' + send_resolved: false + username: '{{ template "slack.default.username" . }}' + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' title: '{{ template "slack.default.title" . }}' - text: >- - {{ range .Alerts }} - *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity | toUpper }}` <{{ .GeneratorURL }}|:chart_with_upwards_trend:> - *Description:* {{ .Annotations.description }} - *Details:* - {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` - {{ end }} - {{ end }} - icon_emoji: '{{ if eq .Status "firing" }}:fire:{{ else }}:sun_with_face:{{ end }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ .CommonAnnotations.summary }}' + text: |- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} + fallback: '{{ template "slack.default.fallback" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + + pagerduty_configs: + - service_key: ___PAGERDUTY_SERVICEKEY___ + + templates: + - /automations/notifications.tpl route: group_wait: 30s @@ -796,12 +854,32 @@ alertmanagerFiles: group_wait: 5m repeat_interval: 24h + ## Prometheus server ConfigMap entries ## serverFiles: alerts: {} rules: groups: + - name: general.rules + rules: + - alert: TargetDown + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' 
+ summary: Targets are down + - alert: DaemonsetFailedPods + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready > 0 + for: 5m + labels: + severity: critical + annotations: + description: 'DaemonSet {{ $labels.daemonset }} is down on at least one node for more than 5 minutes' + summary: DaemonSet {{ $labels.daemonset }} is down on at least one node for more than 5 minutes + - name: Cassandra rules: - alert: CassandraProbeDown @@ -811,7 +889,8 @@ serverFiles: severity: major annotations: summary: "Cassandra probe down" - description: "The Cassandra probe of {{ $labels.instance }} ({{$labels.env}}) is down" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Cassandra probe of {{ $labels.instance }} ({{- $labels.env -}}) is down" - alert: CassandraDashboardNotReachable expr: probe_success{job="cassandra"} == 0 for: 30s @@ -819,7 +898,8 @@ serverFiles: severity: major annotations: summary: "Cassandra dashboard not reachable" - description: "The Cassandra dashboard on {{ $labels.instance }} ({{ $labels.env }}) is not reachable" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Cassandra dashboard on {{ $labels.instance }} ({{- $labels.env -}}) is not reachable" - alert: CassandraDeadNodes expr: kube_statefulset_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} - kube_statefulset_status_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} > 0 for: 30s @@ -827,7 +907,8 @@ serverFiles: severity: major annotations: summary: Cassandra dead nodes count - description: "The Cassandra node of {{ $labels.instance }} ({{$labels.env}}) is down" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Cassandra node of {{ $labels.instance }} ({{- $labels.env -}}) is down" - alert: CassandraConnectionTimeouts expr: sum(cassandra_stats{datacenter="europe-west3", cluster="cassandra",name="org:apache:cassandra:metrics:connection:totaltimeouts:oneminuterate"}) by (name) > 1 for: 30s @@ -835,7 +916,8 @@ serverFiles: severity: major annotations: summary: Cassandra - number of requests timeouts over 1 min - description: "Cassandra cluster: {{$labels.release}} ({{$labels.env}}) - number of requests timeouts over 1 min" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "Cassandra cluster: {{ $labels.release }} ({{- $labels.env -}}) - number of requests timeouts over 1 min" - alert: MonitoringStackProbeDown expr: up{job="monitoring-stack"} != 2 for: 30s @@ -843,12 +925,94 @@ serverFiles: severity: major annotations: summary: "Monitoring Stack probe down" - description: "The Monitoring Stack probe of {{ $labels.instance }} ({{$labels.env}}) is down" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Monitoring Stack probe of {{ $labels.instance }} ({{- $labels.env -}}) is down" + + - name: kube-state-metrics.rules + rules: + - alert: K8SDaemonSetsNotScheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled + > 0 + for: 10m + labels: + severity: warning + annotations: + summary: Daemonsets are not scheduled correctly + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: A number of daemonsets are not scheduled. 
+ - alert: DaemonSetRolloutStuck + expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled + * 100 < 100 + for: 15m + labels: + severity: warning + annotations: + summary: DaemonSet is missing pods + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: Only {{ $value }}% of desired pods scheduled and ready for daemon + set {{ $labels.namespaces }}/{{ $labels.daemonset }} + - alert: PodFrequentlyRestarting + expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 + for: 10m + labels: + severity: warning + annotations: + summary: Pod is restarting frequently + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: Pod {{ $labels.pod}} on {{ $labels.instance }}, ({{- $labels.env -}}) was restarted {{ $value }} + times within the last hour + + - name: kubernetes.rules + rules: + - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) + BY (cluster) + + - name: node.rules + rules: + - alert: NodeCPUUsage + expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[5m])) * 100)) > 80 + for: 30m + labels: + severity: warning + annotations: + summary: "{{ $labels.instance }}: High Node CPU usage detected" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "{{ $labels.instance }}: Node CPU usage is above 80% (current value is: {{ $value }})" + - alert: NodeMemoryUsage + expr: ( avg by (instance) ((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Cached_bytes)/(node_memory_MemTotal_bytes)*100)) > 80 + for: 30m + labels: + severity: warning + annotations: + summary: "{{ $labels.instance }}: High node memory usage detected" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "{{ $labels.instance }}: Node Memory usage is above 80% (current value is: {{ $value }})" + + - name: istio.rules + rules: + - alert: IstioRequests-to-CallingService + expr: sum(rate(istio_requests_total{reporter="source",destination_service=~"calling-service.callconf.svc.cluster.local"}[30s])) > 20 + for: 1m + labels: + severity: major + annotations: + summary: "{{ $labels.instance }}: High Number of Istio Requests to Calling-Service" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "{{ $labels.instance }}: The number of Istio requests to Calling-Service is above 20 per minute (current value is: {{ $value }})" prometheus.yml: rule_files: - - /etc/config/rules - - /etc/config/alerts + - /data/etc/config/rules + - /data/etc/config/alerts scrape_configs: - job_name: prometheus @@ -856,8 +1020,8 @@ serverFiles: - targets: - localhost:9090 labels: - env: dev - + # env: dev + # #- job_name: grafana # scrape_interval: 15s # scrape_timeout: 10s @@ -867,7 +1031,7 @@ serverFiles: # - targets: # 
- "grafana.dev-eu.nynja.net:80" # labels: - # env: dev + # env: dev - job_name: 'grafana' honor_labels: true @@ -891,7 +1055,7 @@ serverFiles: action: keep regex: grafana - + - job_name: 'prometheus-pushgateway' honor_labels: true @@ -913,7 +1077,7 @@ serverFiles: - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] action: keep regex: pushgateway - + - job_name: 'prometheus-alertmanager' honor_labels: true @@ -936,7 +1100,7 @@ serverFiles: action: keep regex: alertmanager - + #- job_name: prometheus-alertmanager # scrape_interval: 15s # scrape_timeout: 10s @@ -946,8 +1110,8 @@ serverFiles: # - targets: # - "10.43.242.113:80" # labels: - # env: dev - + # env: dev + # A scrape configuration for running Prometheus on a Kubernetes cluster. # This uses separate scrape configs for cluster components (i.e. API server, node) # and services to allow each to use different authentication configs. @@ -996,7 +1160,7 @@ serverFiles: action: keep regex: default;kubernetes;https - + - job_name: 'kubernetes-nodes' # Default to scraping over https. If required, just disable this or change to @@ -1075,7 +1239,7 @@ serverFiles: target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - + # Scrape config for service endpoints. # # The relabeling allows the actual service scrape endpoint to be configured @@ -1117,9 +1281,9 @@ serverFiles: - source_labels: [__meta_kubernetes_service_name] action: replace target_label: kubernetes_name - - + + # Example scrape config for probing services via the Blackbox Exporter. # # The relabeling allows the actual service scrape endpoint to be configured @@ -1152,7 +1316,7 @@ serverFiles: - source_labels: [__meta_kubernetes_service_name] target_label: kubernetes_name - + # Example scrape config for pods # # The relabeling allows the actual pod scrape endpoint to be configured via the @@ -1188,7 +1352,7 @@ serverFiles: action: replace target_label: kubernetes_pod_name - + - job_name: 'cassandra' kubernetes_sd_configs: @@ -1218,15 +1382,158 @@ serverFiles: target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_service_name] action: replace - target_label: kubernetes_name + target_label: kubernetes_name - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] action: replace - target_label: env + target_label: env - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] action: keep regex: cassandra-exporter - - + + + - job_name: 'istio-mesh' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-telemetry;prometheus + +# {{- if ne .Values.global.proxy.stats.prometheusPort 0. 
-}} + # Scrape config for envoy stats + - job_name: 'envoy-stats' + metrics_path: /stats/prometheus + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + action: keep + regex: '.*-envoy-prom' + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:{{ .Values.global.proxy.stats.prometheusPort }} + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod_name + + metric_relabel_configs: + # Exclude some of the envoy metrics that have massive cardinality + # This list may need to be pruned further moving forward, as informed + # by performance and scalability testing. + - source_labels: [ cluster_name ] + regex: '(outbound|inbound|prometheus_stats).*' + action: drop + - source_labels: [ tcp_prefix ] + regex: '(outbound|inbound|prometheus_stats).*' + action: drop + - source_labels: [ listener_address ] + regex: '(.+)' + action: drop + - source_labels: [ http_conn_manager_listener_prefix ] + regex: '(.+)' + action: drop + - source_labels: [ http_conn_manager_prefix ] + regex: '(.+)' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_tls.*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_tcp_downstream.*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_http_(stats|admin).*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_cluster_(lb|retry|bind|internal|max|original).*' + action: drop +# {{- end -}} + + - job_name: 'istio-policy' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-policy;http-monitoring + + - job_name: 'istio-telemetry' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-telemetry;http-monitoring + + - job_name: 'pilot' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-pilot;http-monitoring + + - job_name: 'galley' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. 
+ + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-galley;http-monitoring + + networkPolicy: ## Enable creation of NetworkPolicy resources. ## -- GitLab From 0652bf41b35b88bb23ea4bf34d6968b5b5ac83d5 Mon Sep 17 00:00:00 2001 From: Miroslav Hadzhiev Date: Tue, 20 Nov 2018 18:32:55 +0200 Subject: [PATCH 2/2] PR 3; Pull 2 - (1) storage space tweaks related to logs' retention policy; (2) setting relevant pod resources limits; (3) no secret data in 'values.yaml'. --- prometheus/releases/dev/prometheus.yaml | 100 ++++++----- .../templates/alertmanager-deployment.yaml | 1 + prometheus/values.yaml | 170 ++++++++++-------- 3 files changed, 152 insertions(+), 119 deletions(-) diff --git a/prometheus/releases/dev/prometheus.yaml b/prometheus/releases/dev/prometheus.yaml index af4d1dd..35a3a25 100644 --- a/prometheus/releases/dev/prometheus.yaml +++ b/prometheus/releases/dev/prometheus.yaml @@ -37,7 +37,6 @@ spec: ## image: repository: prom/alertmanager - #tag: v0.15.0 tag: v0.15.2 pullPolicy: IfNotPresent ## Additional alertmanager container arguments @@ -64,14 +63,14 @@ spec: ## enabled: true ## The sealed PagerDuty Service Key (sealed with Bitnami's SealedSecret) - servicekey: "AgB9OBhMUBSsH40xfk75iG86hQpPRrlHMrt+qF0zgIJa6ELlWWDz5ZyR0bNIFpZ05rcRT0CHgaIpgsDii8JZU0GcMor7jO96f1GR8ryqOtijKJgxMvKgCN0ZIJaSLBKNKEEwWcN2WKIe8HDaSp39fg91lp4nde7tHGDnRv76nuukFzILFhJ8aTt8a2+5nQhrNGsbCUZSoOI0nyHuw79dpamPJc3GfVYHwmZRxWxp9Pxi7JN7699fYQUS/RCdxzOw9uKCS1iwEFFXDZRWq27ylsjp0QLx+6tdWF1fWuYh1kmAaHB8ud5ZXnOde1IshgkZvhh50Du94EY4ywdyOtDFVe5aufkI9Mj1WM9Vb9fu6iDl7WDFH8QOefsYzi1HSyDoFz7IlD6h+PCuzsU6xGCmHOpad/73apSjW5vRRqTPRJNRvisr/OnSn5+wVKTsNBqe700pBKtjuXJ1lpIMQssXz1CJClwZ1ssS7EhMTnodU+J2CdsABskVW1Nj2cDHUGvEp32N39TTpidMG0Qhj/0qdpLcpUZZRGPzRw3Z3tsEtyMv2CxkgQgSCMrH5VvVkPltQ2lPAolMg0zu/3fjLgeKPf+ba24GORckRIrW4Y+Vwm5gn8OUI1eypBTrOdvFO7yIdtzDOzV4N2pPD/thZRFRk48MDxXXtlnZQzcdq6MEXj1s6nhSHQ0fKHYIqpCzA1zpqpkyoLGe9FfbWDMipk6YeFXsmnTLPepHdi5SUlQDAt7Eeg==" + servicekey: "AgA2hDVLrHgSL8dVZCg1FVRijEd8UuzFyGFllVFAFpeUSBmfELoxKR2kddkNb89Zh1W/CB3wiY6itGB9EuNgu/i8K+JGSd8aZJBnF9yZRe3Ug7mI0r4CHZfVb/q3f9AZHoGbYJRlOFxjWO+Gk62hjetTCBFRQ6aTuYc49kYGPqwresz0EeQ8AYtezqag3+3S3hafCQk2VPg/7p9KSry6vJ60LWjQaxqUkUNgT/4785briyXqsKoSSxuu7PBQ52Gklax5YO2Ik1JikUxxn91MqUvNzd3TUCTWm6ssUZaDcG50/ABEoKHrT47BzCyhebYZHfF+pfKdEbPy58WwvBt2UhZprytkZPs4FuCmj9LVVzwv80Sct5ro+ZxwjCkzUwtzSgy1ZH9oNano/lHBbhkZJ3rx03sDZcCP26myEqabGob9sD4iutDG2MAq6ytgh8FLmGZ6nAKg+kG9mSstnAxi0PzClJTRix60YAWTzMYQWbi1fngo2JoK7opGoIgUzeBiqDrIfWjhbUXhLxSdqub6G4N2iFozCr5jWpwdJCPVbWLSj+eQyhb9/WmFJixED1QIr/hLJYtIs1zYAEu23vyn8Mctw2F/vO/SUp7QxP32pSuMHM/hrOa7B6jJEXgSvxShMgKE0IfkeVsnQPeWBZvVz/iLFTRoDF3SSV7kdZlHiPS2UQa1hyBG0SGq9thUGn5DRZC3Oi8rBm9r2mUzmpVN9KgMGmv5q/x6/SGV3UxoxJ2ZJg==" slackapi: ## If true, the specified Slack Channel will be used with Alertmanager and PagerDuty ## enabled: true ## The sealed Slack Channel API URL (sealed with Bitnami's SealedSecret) - url: 
"AgARq4InE0N92opyhRiR7dXnHrtovhFnR1BaKc5eI0tKNhLxZNErNoLuICOePtcI3z8f229NBMHaCBQwjqTQxajATSliHv3Ti2kDEqN0ZQFIVgdBnQef7jgamS5q3fZM8Jd6hSnHE3vBCIqh2REMlCCxhM7OtSqNtzed8uCHqUtjo4fHmL6aG/P+QXtd9kybmIFhbWb9l3BQLEVujvZsLdrFLv1wGHuoSgqiWLmM6H+AfwGdZzylRIrzoRICeYHIGZBQgKMfHIF946R90PLbmydaA0JHej3mVtdtXNkRUXuLR/pupvZHkMIBBcvBwW/vDR+VEwheO+KMmo9vNvQ5aJSD7qRHGC0ZdySa4s+UjeKe4Qdf0lwjhqyr74scOs9MflLn2XY6UlnHQU8RZOSrlC+Pw/NqQDO3jPk2RfpNaf6Mm9FnkO8ThpMoZlM42u7Aj/BG0mGMHX0S5OtkjvEQk6zQgp1HvOgyfAlABH/fX6TlU9z0du7hI8JZWELS1RvJ7EnXhu0oEmOqpl23YjDtoihG7OFMyg4kvenS8rEXYsRWfbbW8Q2iEQBqDMck4PSkBjSKMp76UgmuFY7fwL724eO9kpO9Oigb01BtItPZv61eMkvvYN0EUzeoFz9omjg5euvlK0X08hWchG8/moaa7HUlysZNVzIYGd0FTjs5cip85Sq8r7QPPoLsWfUb7A80e4WVv44G4LuxMTB4uNw6n8ewAqf90pHSUMRQ+N8jycZYkiz9oGOOAwO4sWVEKu8Qlpf3QAkJQHIMi8Y4AFmxr+EITKB7sQFmM9zT4Uozog==" + url: "AgAjEehRPdWEzfjd74xhn0Zuh6SuSbZcRZe77Dcoj9wgXpfiTJEP+SEXUi9ft7UHutYSwvdi+c3JBRPkUAQFfgC99NglQMqT57Kyy6OGklW1RiA9PeAD2+sRYLuovNAhTpQyfE4b/5LV7wky/MRS+r2YOHjlsDXzKxyke2sTe+003/a8Ieo4lb9YuqzAfrH6eqgNlV1GFMya6hwI5AAJwXfWyV8TzZMgmcZ4QjIfSO+sqXaA+4asw0PXI+oGwtjvB7kuwUR9E/Y6LKnpsaCFXbddq7xfNH8cHU44eF0NPPSxTpBOqYguY0o/eVhSMullGUbW95m73rpyrSnV0YCJyOZjidmelANifintOT81cNVDXhkAOMAo+GcjStU1QC7LggS0+T6ThyPh62j4ZUXklJd/NJk6ltTmh38JK9EFzvDr2IBAaWbEvpPz05PZ5TDSfRrm3VSD5623LBqCZFCsf3QGyPfF3mu05Ya6wnaizdPaTp2EJMyNeeYOzP5rqezUY4IubVlclm6PGsfXdguJ1uSAyULpB87NG8+CzgMZ18TRiNwyQclZTEQ3npsLG53ZjByYg3Iu7DI/Kf0f/SK2c3yRpqM3qEtw6SMj4sgJWwNg0hgikxQUdMICbTPdhFiQOJXTohp+d+eNqUvm0mqdW1MvEf9jeiiWUg0DFgYUORdo7YxsL7PDjCgxPVNqdDRTMy2ZJMxNCmFaEfrHos9jYPi5PBN5TbTbVUu+aQmbEK0rG1xENir777SjhP8g5XZ6USe1QzJ1syXdGUaqhhTsS5eqrj7WfpyJs1MhZX9Png==" ingress: @@ -141,7 +140,7 @@ spec: mountPath: /data ## alertmanager data Persistent Volume size ## - size: 10Gi + size: 75Gi ## alertmanager data Persistent Volume Storage Class ## If defined, storageClassName: ## If set to "-", storageClassName: "", which disables dynamic provisioning @@ -225,11 +224,11 @@ spec: #resources: {} resources: limits: - cpu: 50m - memory: 160Mi + cpu: 100m + memory: 320Mi requests: - cpu: 10m - memory: 32Mi + cpu: 20m + memory: 64Mi initChownData: ## If false, data ownership will not be reset at startup ## This allows the prometheus-server to be run with an arbitrary user @@ -250,11 +249,11 @@ spec: #resources: {} resources: limits: - cpu: 50m - memory: 160Mi + cpu: 75m + memory: 320Mi requests: - cpu: 10m - memory: 32Mi + cpu: 20m + memory: 64Mi kubeStateMetrics: ## If false, kube-state-metrics will not be installed ## @@ -266,7 +265,6 @@ spec: ## image: repository: quay.io/coreos/kube-state-metrics - #tag: v1.3.1 tag: v1.4.0 pullPolicy: IfNotPresent ## kube-state-metrics container arguments @@ -295,11 +293,11 @@ spec: ## resources: limits: - cpu: 50m - memory: 160Mi + cpu: 150m + memory: 480Mi requests: - cpu: 10m - memory: 16Mi + cpu: 30m + memory: 48Mi ## Security context to be added to kube-state-metrics pods ## securityContext: {} @@ -374,11 +372,11 @@ spec: ## resources: limits: - cpu: 100m - memory: 150Mi + cpu: 200m + memory: 300Mi requests: - cpu: 20m - memory: 32Mi + cpu: 40m + memory: 64Mi ## Security context to be added to node-exporter pods ## securityContext: {} @@ -407,7 +405,6 @@ spec: ## image: repository: prom/prometheus - #tag: v2.3.1 tag: v2.4.3 pullPolicy: IfNotPresent ## The URL prefix at which the container can be accessed. 
Useful in the case the '-web.external-url' includes a slug @@ -430,6 +427,11 @@ spec: ## How frequently to evaluate rules ## evaluation_interval: 1m + ## Attach these labels to any time series or alerts when communicating with + ## external systems (federation, remote storage, Alertmanager). + ## + external_labels: + monitor: 'dev' ## Additional Prometheus server container arguments ## extraArgs: {} @@ -524,7 +526,7 @@ spec: mountPath: /data ## Prometheus server data Persistent Volume size ## - size: 50Gi + size: 250Gi ## Prometheus server data Persistent Volume Storage Class ## If defined, storageClassName: ## If set to "-", storageClassName: "", which disables dynamic provisioning @@ -547,17 +549,20 @@ spec: ## resources: limits: - cpu: 750m - memory: 768Mi + cpu: 1 + memory: 4Gi requests: - cpu: 500m - memory: 512Mi + cpu: 750m + memory: 2Gi ## Security context to be added to server pods ## securityContext: {} + + ## The environment name - shown as a headline in the Prometheus Alerts + # externalLabels: - cluster: "DEV" + cluster: "dev" service: annotations: @@ -823,7 +828,7 @@ serverFiles: annotations: summary: "Monitoring Stack probe down" identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "The Monitoring Stack probe of {{ $labels.instance }} ({{- $labels.env -}}) is down" + description: "The Monitoring Stack probe of {{ $labels.instance }} on {{ $labels.monitor }} ({{- $labels.monitor -}}) is down" - name: kube-state-metrics.rules rules: @@ -856,7 +861,7 @@ serverFiles: annotations: summary: Pod is restarting frequently identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: Pod {{ $labels.pod}} on {{ $labels.instance }}, ({{- $labels.env -}}) was restarted {{ $value }} + description: Pod {{ $labels.pod}} of {{ $labels.instance }} on {{ $labels.monitor }}, ({{- $labels.monitor -}}) was restarted {{ $value }} times within the last hour - name: kubernetes.rules @@ -881,18 +886,18 @@ serverFiles: labels: severity: warning annotations: - summary: "{{ $labels.instance }}: High Node CPU usage detected" + summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High Node CPU usage detected" identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "{{ $labels.instance }}: Node CPU usage is above 80% (current value is: {{ $value }})" + description: "{{ $labels.instance }} on {{ $labels.monitor }}: Node CPU usage is above 80% (current value is: {{ $value }})" - alert: NodeMemoryUsage expr: ( avg by (instance) ((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Cached_bytes)/(node_memory_MemTotal_bytes)*100)) > 80 for: 30m labels: severity: warning annotations: - summary: "{{ $labels.instance }}: High node memory usage detected" + summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High node memory usage detected" identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "{{ $labels.instance }}: Node Memory usage is above 80% (current value is: {{ $value }})" + description: "{{ $labels.instance }} on {{ $labels.monitor }}: Node Memory usage is above 80% (current value is: {{ $value }})" - name: istio.rules rules: @@ -902,9 +907,9 @@ serverFiles: labels: severity: major annotations: - summary: "{{ $labels.instance }}: High Number of Istio Requests to Calling-Service" + summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High Number of Istio Requests to Calling-Service" identifier: "Instance: {{ 
$labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "{{ $labels.instance }}: The number of Istio requests to Calling-Service is above 20 per minute (current value is: {{ $value }})" + description: "{{ $labels.instance }} on {{ $labels.monitor }}: The number of Istio requests to Calling-Service is above 20 per minute (current value is: {{ $value }})" prometheus.yml: rule_files: @@ -997,17 +1002,18 @@ serverFiles: action: keep regex: alertmanager - - #- job_name: prometheus-alertmanager - # scrape_interval: 15s - # scrape_timeout: 10s - # metrics_path: /metrics - # scheme: http - # static_configs: - # - targets: - # - "10.43.242.113:80" - # labels: - # env: dev + ## Scrape the metrics of the Messaging Service on DEV + # + - job_name: Messaging-Service + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - "dev.ci.nynja.net:8888" + labels: + env: dev # A scrape configuration for running Prometheus on a Kubernetes cluster. # This uses separate scrape configs for cluster components (i.e. API server, node) diff --git a/prometheus/templates/alertmanager-deployment.yaml b/prometheus/templates/alertmanager-deployment.yaml index 6682122..e4a9903 100644 --- a/prometheus/templates/alertmanager-deployment.yaml +++ b/prometheus/templates/alertmanager-deployment.yaml @@ -34,6 +34,7 @@ spec: - name: {{ template "prometheus.name" . }}-{{ .Values.alertmanager.name }} image: "{{ .Values.alertmanager.image.repository }}:{{ .Values.alertmanager.image.tag }}" imagePullPolicy: "{{ .Values.alertmanager.image.pullPolicy }}" + #command: [ "/bin/sh", "-c", "ls -ltrahF '/automations/'; /bin/sh -c '/automations/alertmanager_rpl.sh';" ] command: - "/bin/sh" - "-c" diff --git a/prometheus/values.yaml b/prometheus/values.yaml index b625426..3352a4c 100644 --- a/prometheus/values.yaml +++ b/prometheus/values.yaml @@ -33,7 +33,7 @@ alertmanager: ## image: repository: prom/alertmanager - tag: v0.15.0 + tag: v0.15.2 pullPolicy: IfNotPresent ## Additional alertmanager container arguments @@ -65,14 +65,14 @@ alertmanager: ## enabled: true ## The sealed PagerDuty Service Key (sealed with Bitnami's SealedSecret) - servicekey: "AgB9OBhMUBSsH40xfk75iG86hQpPRrlHMrt+qF0zgIJa6ELlWWDz5ZyR0bNIFpZ05rcRT0CHgaIpgsDii8JZU0GcMor7jO96f1GR8ryqOtijKJgxMvKgCN0ZIJaSLBKNKEEwWcN2WKIe8HDaSp39fg91lp4nde7tHGDnRv76nuukFzILFhJ8aTt8a2+5nQhrNGsbCUZSoOI0nyHuw79dpamPJc3GfVYHwmZRxWxp9Pxi7JN7699fYQUS/RCdxzOw9uKCS1iwEFFXDZRWq27ylsjp0QLx+6tdWF1fWuYh1kmAaHB8ud5ZXnOde1IshgkZvhh50Du94EY4ywdyOtDFVe5aufkI9Mj1WM9Vb9fu6iDl7WDFH8QOefsYzi1HSyDoFz7IlD6h+PCuzsU6xGCmHOpad/73apSjW5vRRqTPRJNRvisr/OnSn5+wVKTsNBqe700pBKtjuXJ1lpIMQssXz1CJClwZ1ssS7EhMTnodU+J2CdsABskVW1Nj2cDHUGvEp32N39TTpidMG0Qhj/0qdpLcpUZZRGPzRw3Z3tsEtyMv2CxkgQgSCMrH5VvVkPltQ2lPAolMg0zu/3fjLgeKPf+ba24GORckRIrW4Y+Vwm5gn8OUI1eypBTrOdvFO7yIdtzDOzV4N2pPD/thZRFRk48MDxXXtlnZQzcdq6MEXj1s6nhSHQ0fKHYIqpCzA1zpqpkyoLGe9FfbWDMipk6YeFXsmnTLPepHdi5SUlQDAt7Eeg==" + servicekey: "" slackapi: ## If true, the specified Slack Channel will be used with Alertmanager and PagerDuty ## enabled: true ## The sealed Slack Channel API URL (sealed with Bitnami's SealedSecret) - url: 
"AgARq4InE0N92opyhRiR7dXnHrtovhFnR1BaKc5eI0tKNhLxZNErNoLuICOePtcI3z8f229NBMHaCBQwjqTQxajATSliHv3Ti2kDEqN0ZQFIVgdBnQef7jgamS5q3fZM8Jd6hSnHE3vBCIqh2REMlCCxhM7OtSqNtzed8uCHqUtjo4fHmL6aG/P+QXtd9kybmIFhbWb9l3BQLEVujvZsLdrFLv1wGHuoSgqiWLmM6H+AfwGdZzylRIrzoRICeYHIGZBQgKMfHIF946R90PLbmydaA0JHej3mVtdtXNkRUXuLR/pupvZHkMIBBcvBwW/vDR+VEwheO+KMmo9vNvQ5aJSD7qRHGC0ZdySa4s+UjeKe4Qdf0lwjhqyr74scOs9MflLn2XY6UlnHQU8RZOSrlC+Pw/NqQDO3jPk2RfpNaf6Mm9FnkO8ThpMoZlM42u7Aj/BG0mGMHX0S5OtkjvEQk6zQgp1HvOgyfAlABH/fX6TlU9z0du7hI8JZWELS1RvJ7EnXhu0oEmOqpl23YjDtoihG7OFMyg4kvenS8rEXYsRWfbbW8Q2iEQBqDMck4PSkBjSKMp76UgmuFY7fwL724eO9kpO9Oigb01BtItPZv61eMkvvYN0EUzeoFz9omjg5euvlK0X08hWchG8/moaa7HUlysZNVzIYGd0FTjs5cip85Sq8r7QPPoLsWfUb7A80e4WVv44G4LuxMTB4uNw6n8ewAqf90pHSUMRQ+N8jycZYkiz9oGOOAwO4sWVEKu8Qlpf3QAkJQHIMi8Y4AFmxr+EITKB7sQFmM9zT4Uozog==" + url: "" ingress: ## If true, alertmanager Ingress will be created @@ -155,7 +155,7 @@ alertmanager: ## alertmanager data Persistent Volume size ## - size: 4Gi + size: 75Gi ## alertmanager data Persistent Volume Storage Class ## If defined, storageClassName: @@ -180,13 +180,13 @@ alertmanager: ## alertmanager resource requests and limits ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## - resources: {} - # limits: - # cpu: 50m - # memory: 160Mi - # requests: - # cpu: 10m - # memory: 32Mi + resources: + limits: + cpu: 50m + memory: 160Mi + requests: + cpu: 10m + memory: 32Mi ## Security context to be added to alertmanager pods ## @@ -202,9 +202,9 @@ alertmanager: clusterIP: "" gateway: selector: - #- internal-gateway.default.svc.cluster.local + - internal-gateway.default.svc.cluster.local hosts: - #- alertmanager.dev-eu.nynja.net + #- alertmanager-miro.dev-eu.nynja.net ## Enabling peer mesh service end points for enabling the HA alert manager ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md @@ -252,7 +252,14 @@ configmapReload: ## configmap-reload resource requests and limits ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## - resources: {} + #resources: {} + resources: + limits: + cpu: 100m + memory: 320Mi + requests: + cpu: 20m + memory: 64Mi initChownData: ## If false, data ownership will not be reset at startup @@ -274,7 +281,14 @@ initChownData: ## initChownData resource requests and limits ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## - resources: {} + #resources: {} + resources: + limits: + cpu: 75m + memory: 320Mi + requests: + cpu: 20m + memory: 64Mi kubeStateMetrics: ## If false, kube-state-metrics will not be installed @@ -289,7 +303,7 @@ kubeStateMetrics: ## image: repository: quay.io/coreos/kube-state-metrics - tag: v1.3.1 + tag: v1.4.0 pullPolicy: IfNotPresent ## kube-state-metrics container arguments @@ -322,13 +336,14 @@ kubeStateMetrics: ## kube-state-metrics resource requests and limits ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## - resources: {} - # limits: - # cpu: 10m - # memory: 16Mi - # requests: - # cpu: 10m - # memory: 16Mi + #resources: {} + resources: + limits: + cpu: 150m + memory: 480Mi + requests: + cpu: 30m + memory: 48Mi ## Security context to be added to kube-state-metrics pods ## @@ -418,13 +433,14 @@ nodeExporter: ## node-exporter resource limits & requests ## Ref: https://kubernetes.io/docs/user-guide/compute-resources/ ## - resources: {} - # limits: - # cpu: 200m - # memory: 50Mi - # requests: - # cpu: 100m - # memory: 30Mi + #resources: {} + resources: + limits: + cpu: 200m + memory: 300Mi + requests: + cpu: 40m + memory: 64Mi ## Security context to be 
added to node-exporter pods ## @@ -460,7 +476,7 @@ server: ## image: repository: prom/prometheus - tag: v2.3.1 + tag: v2.4.3 pullPolicy: IfNotPresent ## The URL prefix at which the container can be accessed. Useful in the case the '-web.external-url' includes a slug @@ -486,6 +502,11 @@ server: ## How frequently to evaluate rules ## evaluation_interval: 1m + ## Attach these labels to any time series or alerts when communicating with + ## external systems (federation, remote storage, Alertmanager). + ## + external_labels: + monitor: 'dev' ## Additional Prometheus server container arguments ## @@ -600,7 +621,7 @@ server: ## Prometheus server data Persistent Volume size ## - size: 15Gi + size: 250Gi ## Prometheus server data Persistent Volume Storage Class ## If defined, storageClassName: @@ -626,26 +647,29 @@ server: ## Prometheus server resource requests and limits ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## - resources: {} - # limits: - # cpu: 500m - # memory: 512Mi - # requests: - # cpu: 500m - # memory: 512Mi + #resources: {} + resources: + limits: + cpu: 1 + memory: 4Gi + requests: + cpu: 750m + memory: 2Gi ## Security context to be added to server pods ## securityContext: {} - externalLabels: {} - #cluster: "DEV" + ## The environment name - shown as a headline in the Prometheus Alerts + # + externalLabels: + cluster: "dev" service: annotations: nynja.biz/scrape: "true" nynja.biz/scrape_port: "80" - #nynja.biz/env: "dev" + nynja.biz/env: "dev" nynja.biz/probe: "prometheus" labels: {} clusterIP: "" @@ -663,9 +687,9 @@ server: #type: LoadBalancer gateway: selector: - #- internal-gateway.default.svc.cluster.local + - internal-gateway.default.svc.cluster.local hosts: - #- prometheus.dev-eu.nynja.net + #- prometheus-miro.dev-eu.nynja.net ## Prometheus server pod termination grace period ## @@ -748,13 +772,14 @@ pushgateway: ## pushgateway resource requests and limits ## Ref: http://kubernetes.io/docs/user-guide/compute-resources/ ## - resources: {} - # limits: - # cpu: 10m - # memory: 32Mi - # requests: - # cpu: 10m - # memory: 32Mi + #resources: {} + resources: + limits: + cpu: 100m + memory: 320Mi + requests: + cpu: 20m + memory: 64Mi ## Security context to be added to push-gateway pods ## @@ -926,7 +951,7 @@ serverFiles: annotations: summary: "Monitoring Stack probe down" identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "The Monitoring Stack probe of {{ $labels.instance }} ({{- $labels.env -}}) is down" + description: "The Monitoring Stack probe of {{ $labels.instance }} on {{ $labels.monitor }} ({{- $labels.monitor -}}) is down" - name: kube-state-metrics.rules rules: @@ -959,7 +984,7 @@ serverFiles: annotations: summary: Pod is restarting frequently identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: Pod {{ $labels.pod}} on {{ $labels.instance }}, ({{- $labels.env -}}) was restarted {{ $value }} + description: Pod {{ $labels.pod}} of {{ $labels.instance }} on {{ $labels.monitor }}, ({{- $labels.monitor -}}) was restarted {{ $value }} times within the last hour - name: kubernetes.rules @@ -984,18 +1009,18 @@ serverFiles: labels: severity: warning annotations: - summary: "{{ $labels.instance }}: High Node CPU usage detected" + summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High Node CPU usage detected" identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "{{ $labels.instance }}: Node CPU usage is above 80% (current 
value is: {{ $value }})" + description: "{{ $labels.instance }} on {{ $labels.monitor }}: Node CPU usage is above 80% (current value is: {{ $value }})" - alert: NodeMemoryUsage expr: ( avg by (instance) ((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Cached_bytes)/(node_memory_MemTotal_bytes)*100)) > 80 for: 30m labels: severity: warning annotations: - summary: "{{ $labels.instance }}: High node memory usage detected" + summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High node memory usage detected" identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "{{ $labels.instance }}: Node Memory usage is above 80% (current value is: {{ $value }})" + description: "{{ $labels.instance }} on {{ $labels.monitor }}: Node Memory usage is above 80% (current value is: {{ $value }})" - name: istio.rules rules: @@ -1005,9 +1030,9 @@ serverFiles: labels: severity: major annotations: - summary: "{{ $labels.instance }}: High Number of Istio Requests to Calling-Service" + summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High Number of Istio Requests to Calling-Service" identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "{{ $labels.instance }}: The number of Istio requests to Calling-Service is above 20 per minute (current value is: {{ $value }})" + description: "{{ $labels.instance }} on {{ $labels.monitor }}: The number of Istio requests to Calling-Service is above 20 per minute (current value is: {{ $value }})" prometheus.yml: rule_files: @@ -1020,8 +1045,8 @@ serverFiles: - targets: - localhost:9090 labels: - # env: dev - # + env: dev + #- job_name: grafana # scrape_interval: 15s # scrape_timeout: 10s @@ -1100,17 +1125,18 @@ serverFiles: action: keep regex: alertmanager - - #- job_name: prometheus-alertmanager - # scrape_interval: 15s - # scrape_timeout: 10s - # metrics_path: /metrics - # scheme: http - # static_configs: - # - targets: - # - "10.43.242.113:80" - # labels: - # env: dev + ## Scrape the metrics of the Messaging Service on DEV + # + - job_name: Messaging-Service + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - "dev.ci.nynja.net:8888" + labels: + env: dev # A scrape configuration for running Prometheus on a Kubernetes cluster. # This uses separate scrape configs for cluster components (i.e. API server, node) -- GitLab