From 23dd876f3c19e4cc2af8d2518946f5479915791b Mon Sep 17 00:00:00 2001 From: Miroslav Hadzhiev Date: Tue, 20 Nov 2018 18:50:27 +0200 Subject: [PATCH 1/2] PR 3; Pull 2 - (1) storage space tweaks related to logs' retention policy; (2) setting relevant pod resources limits; (3) no secret data in 'values.yaml'. --- monitoring/prometheus.yaml | 1254 +++++++++++++++++++++++------------- 1 file changed, 810 insertions(+), 444 deletions(-) diff --git a/monitoring/prometheus.yaml b/monitoring/prometheus.yaml index 278a655..35a3a25 100644 --- a/monitoring/prometheus.yaml +++ b/monitoring/prometheus.yaml @@ -37,7 +37,6 @@ spec: ## image: repository: prom/alertmanager - #tag: v0.15.0 tag: v0.15.2 pullPolicy: IfNotPresent ## Additional alertmanager container arguments @@ -59,6 +58,21 @@ spec: ## to NOT generate a ConfigMap resource ## configMapOverrideName: "" + pagerduty: + ## If true, PagerDuty will be enabled for Alertmanager + ## + enabled: true + ## The sealed PagerDuty Service Key (sealed with Bitnami's SealedSecret) + servicekey: "AgA2hDVLrHgSL8dVZCg1FVRijEd8UuzFyGFllVFAFpeUSBmfELoxKR2kddkNb89Zh1W/CB3wiY6itGB9EuNgu/i8K+JGSd8aZJBnF9yZRe3Ug7mI0r4CHZfVb/q3f9AZHoGbYJRlOFxjWO+Gk62hjetTCBFRQ6aTuYc49kYGPqwresz0EeQ8AYtezqag3+3S3hafCQk2VPg/7p9KSry6vJ60LWjQaxqUkUNgT/4785briyXqsKoSSxuu7PBQ52Gklax5YO2Ik1JikUxxn91MqUvNzd3TUCTWm6ssUZaDcG50/ABEoKHrT47BzCyhebYZHfF+pfKdEbPy58WwvBt2UhZprytkZPs4FuCmj9LVVzwv80Sct5ro+ZxwjCkzUwtzSgy1ZH9oNano/lHBbhkZJ3rx03sDZcCP26myEqabGob9sD4iutDG2MAq6ytgh8FLmGZ6nAKg+kG9mSstnAxi0PzClJTRix60YAWTzMYQWbi1fngo2JoK7opGoIgUzeBiqDrIfWjhbUXhLxSdqub6G4N2iFozCr5jWpwdJCPVbWLSj+eQyhb9/WmFJixED1QIr/hLJYtIs1zYAEu23vyn8Mctw2F/vO/SUp7QxP32pSuMHM/hrOa7B6jJEXgSvxShMgKE0IfkeVsnQPeWBZvVz/iLFTRoDF3SSV7kdZlHiPS2UQa1hyBG0SGq9thUGn5DRZC3Oi8rBm9r2mUzmpVN9KgMGmv5q/x6/SGV3UxoxJ2ZJg==" + + slackapi: + ## If true, the specified Slack Channel will be used with Alertmanager and PagerDuty + ## + enabled: true + ## The sealed Slack Channel API URL (sealed with Bitnami's SealedSecret) + url: "AgAjEehRPdWEzfjd74xhn0Zuh6SuSbZcRZe77Dcoj9wgXpfiTJEP+SEXUi9ft7UHutYSwvdi+c3JBRPkUAQFfgC99NglQMqT57Kyy6OGklW1RiA9PeAD2+sRYLuovNAhTpQyfE4b/5LV7wky/MRS+r2YOHjlsDXzKxyke2sTe+003/a8Ieo4lb9YuqzAfrH6eqgNlV1GFMya6hwI5AAJwXfWyV8TzZMgmcZ4QjIfSO+sqXaA+4asw0PXI+oGwtjvB7kuwUR9E/Y6LKnpsaCFXbddq7xfNH8cHU44eF0NPPSxTpBOqYguY0o/eVhSMullGUbW95m73rpyrSnV0YCJyOZjidmelANifintOT81cNVDXhkAOMAo+GcjStU1QC7LggS0+T6ThyPh62j4ZUXklJd/NJk6ltTmh38JK9EFzvDr2IBAaWbEvpPz05PZ5TDSfRrm3VSD5623LBqCZFCsf3QGyPfF3mu05Ya6wnaizdPaTp2EJMyNeeYOzP5rqezUY4IubVlclm6PGsfXdguJ1uSAyULpB87NG8+CzgMZ18TRiNwyQclZTEQ3npsLG53ZjByYg3Iu7DI/Kf0f/SK2c3yRpqM3qEtw6SMj4sgJWwNg0hgikxQUdMICbTPdhFiQOJXTohp+d+eNqUvm0mqdW1MvEf9jeiiWUg0DFgYUORdo7YxsL7PDjCgxPVNqdDRTMy2ZJMxNCmFaEfrHos9jYPi5PBN5TbTbVUu+aQmbEK0rG1xENir777SjhP8g5XZ6USe1QzJ1syXdGUaqhhTsS5eqrj7WfpyJs1MhZX9Png==" + + ingress: ## If true, alertmanager Ingress will be created ## @@ -126,8 +140,7 @@ spec: mountPath: /data ## alertmanager data Persistent Volume size ## - # size: 2Gi - size: 10Gi + size: 75Gi ## alertmanager data Persistent Volume Storage Class ## If defined, storageClassName: ## If set to "-", storageClassName: "", which disables dynamic provisioning @@ -211,11 +224,11 @@ spec: #resources: {} resources: limits: - cpu: 50m - memory: 160Mi + cpu: 100m + memory: 320Mi requests: - cpu: 10m - memory: 32Mi + cpu: 20m + memory: 64Mi initChownData: ## If false, data ownership will not be reset at startup ## This allows the prometheus-server to be run with an arbitrary user @@ -236,11 +249,11 @@ spec: #resources: {} resources: limits: 
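## For context on the resource tweaks in this patch: Kubernetes "requests" are
## reserved at scheduling time, while "limits" are hard caps; CPU above its
## limit is throttled, and memory above its limit gets the container OOM-killed.
## A minimal sketch of the pattern (values purely illustrative, not the chart
## defaults):
##
##   resources:
##     limits:
##       cpu: 100m       # hard cap; CPU beyond this is throttled
##       memory: 320Mi   # hard cap; exceeding it OOM-kills the container
##     requests:
##       cpu: 20m        # the scheduler reserves this much CPU
##       memory: 64Mi    # the scheduler reserves this much memory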
- cpu: 50m - memory: 160Mi + cpu: 75m + memory: 320Mi requests: - cpu: 10m - memory: 32Mi + cpu: 20m + memory: 64Mi kubeStateMetrics: ## If false, kube-state-metrics will not be installed ## @@ -252,7 +265,6 @@ spec: ## image: repository: quay.io/coreos/kube-state-metrics - #tag: v1.3.1 tag: v1.4.0 pullPolicy: IfNotPresent ## kube-state-metrics container arguments @@ -281,11 +293,11 @@ spec: ## resources: limits: - cpu: 50m - memory: 160Mi + cpu: 150m + memory: 480Mi requests: - cpu: 10m - memory: 16Mi + cpu: 30m + memory: 48Mi ## Security context to be added to kube-state-metrics pods ## securityContext: {} @@ -360,11 +372,11 @@ spec: ## resources: limits: - cpu: 100m - memory: 150Mi + cpu: 200m + memory: 300Mi requests: - cpu: 20m - memory: 32Mi + cpu: 40m + memory: 64Mi ## Security context to be added to node-exporter pods ## securityContext: {} @@ -393,7 +405,6 @@ spec: ## image: repository: prom/prometheus - #tag: v2.3.1 tag: v2.4.3 pullPolicy: IfNotPresent ## The URL prefix at which the container can be accessed. Useful in the case the '-web.external-url' includes a slug @@ -416,6 +427,11 @@ spec: ## How frequently to evaluate rules ## evaluation_interval: 1m + ## Attach these labels to any time series or alerts when communicating with + ## external systems (federation, remote storage, Alertmanager). + ## + external_labels: + monitor: 'dev' ## Additional Prometheus server container arguments ## extraArgs: {} @@ -510,8 +526,7 @@ spec: mountPath: /data ## Prometheus server data Persistent Volume size ## - # size: 8Gi - size: 50Gi + size: 250Gi ## Prometheus server data Persistent Volume Storage Class ## If defined, storageClassName: ## If set to "-", storageClassName: "", which disables dynamic provisioning @@ -534,21 +549,27 @@ spec: ## resources: limits: - cpu: 750m - memory: 768Mi + cpu: 1 + memory: 4Gi requests: - cpu: 500m - memory: 512Mi - + cpu: 750m + memory: 2Gi + ## Security context to be added to server pods ## securityContext: {} + + ## The environment name - shown as a headline in the Prometheus Alerts + # + externalLabels: + cluster: "dev" + service: annotations: nynja.biz/scrape: "true" nynja.biz/scrape_port: "80" - nynja.biz/env: "dev" - nynja.biz/probe: "prometheus" + nynja.biz/env: "dev" + nynja.biz/probe: "prometheus" labels: {} clusterIP: "" ## List of IP addresses at which the Prometheus server service is available @@ -565,13 +586,17 @@ spec: selector: - internal-gateway.default.svc.cluster.local hosts: - - prometheus.dev-eu.nynja.net + - prometheus.dev-eu.nynja.net ## Prometheus server pod termination grace period ## terminationGracePeriodSeconds: 300 ## Prometheus data retention period (i.e 360h) ## retention: "" + ## Set the namespace where Istio is installed - default: "istio-system" + namespace: + istio: "istio-system" + pushgateway: ## If false, pushgateway will not be installed ## @@ -579,7 +604,7 @@ spec: ## pushgateway container name ## name: pushgateway - + ## pushgateway container image ## image: @@ -659,419 +684,760 @@ spec: ## alertmanager ConfigMap entries ## alertmanagerFiles: - alertmanager.yml: - global: - # slack_api_url: '' - slack_api_url: https://hooks.slack.com/services/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 - receivers: - - name: default-receiver - slack_configs: - - channel: '#ops-alerts' - send_resolved: true - username: 'alerts{{ if eq .Status "firing" }}_firing{{ else }}_resolved{{ end }}' - title: '{{ template "slack.default.title" . 
}}' - text: >- - {{ range .Alerts }} - *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity | toUpper }}` <{{ .GeneratorURL }}|:chart_with_upwards_trend:> - *Description:* {{ .Annotations.description }} - *Details:* - {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` - {{ end }} - {{ end }} - icon_emoji: '{{ if eq .Status "firing" }}:fire:{{ else }}:sun_with_face:{{ end }}' - route: - group_wait: 30s - group_interval: 5m - receiver: default-receiver - repeat_interval: 3h - #group_by: ['alertname', 'cluster', 'env'] - group_by: ['alertname', 'cluster'] - routes: - - match: - env: dev - group_wait: 5m - repeat_interval: 24h - ## Prometheus server ConfigMap entries - ## - serverFiles: - alerts: {} + notifications.tpl: |- + {{ define "__alertmanager" }}Environment: ___PROMETHEUS_CLUSTER_NAME___{{ end }} + {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} + + {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} + {{ define "__description" }}{{ end }} + + {{ define "__text_alert_list" }}{{ range . }}Labels: + {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Annotations: + {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Source: {{ .GeneratorURL }} + {{ end }}{{ end }} + + + {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} + {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} + {{ define "slack.default.pretext" }}{{ end }} + {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }} + {{ define "slack.default.iconemoji" }}{{ end }} + {{ define "slack.default.iconurl" }}{{ end }} + {{ define "slack.default.text" }}{{ end }} + {{ define "slack.default.footer" }}{{ end }} + + alertmanager.yml: + global: + # slack_api_url: '' + slack_api_url: ___ALERTMANAGER_SLACK_API_URL___ + + receivers: + - name: default-receiver + + slack_configs: + - channel: '#ops-alerts' + send_resolved: false + username: '{{ template "slack.default.username" . }}' + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "slack.default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ .CommonAnnotations.summary }}' + text: |- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} + fallback: '{{ template "slack.default.fallback" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . 
}}' + + pagerduty_configs: + - service_key: ___PAGERDUTY_SERVICEKEY___ + + templates: + - /automations/notifications.tpl + + route: + group_wait: 30s + group_interval: 5m + receiver: default-receiver + repeat_interval: 3h + #group_by: ['alertname', 'cluster', 'env'] + group_by: ['alertname', 'cluster'] + routes: + - match: + env: dev + group_wait: 5m + repeat_interval: 24h + + +## Prometheus server ConfigMap entries +## +serverFiles: + alerts: {} + rules: + groups: + - name: general.rules rules: - groups: - - name: Cassandra - rules: - - alert: CassandraProbeDown - expr: up{job="cassandra"} != 3 - for: 30s - labels: - severity: major - annotations: - summary: "Cassandra probe down" - description: "The Cassandra probe of {{ $labels.instance }} ({{$labels.env}}) is down" - - alert: CassandraDashboardNotReachable - expr: probe_success{job="cassandra"} == 0 - for: 30s - labels: - severity: major - annotations: - summary: "Cassandra dashboard not reachable" - description: "The Cassandra dashboard on {{ $labels.instance }} ({{ $labels.env }}) is not reachable" - - alert: CassandraDeadNodes - expr: kube_statefulset_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} - kube_statefulset_status_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} > 0 - for: 30s - labels: - severity: major - annotations: - summary: Cassandra dead nodes count - description: "The Cassandra node of {{ $labels.instance }} ({{$labels.env}}) is down" - - alert: CassandraConnectionTimeouts - expr: sum(cassandra_stats{datacenter="europe-west3", cluster="cassandra",name="org:apache:cassandra:metrics:connection:totaltimeouts:oneminuterate"}) by (name) > 1 - for: 30s - labels: - severity: major - annotations: - summary: Cassandra - number of requests timeouts over 1 min - description: "Cassandra cluster: {{$labels.release}} ({{$labels.env}}) - number of requests timeouts over 1 min" - - name: Monitoring - rules: - - alert: MonitoringStackProbeDown - expr: up{job="monitoring-stack"} != 2 - for: 30s - labels: - severity: major - annotations: - summary: "Monitoring Stack probe down" - description: "The Monitoring Stack probe of {{ $labels.instance }} ({{$labels.env}}) is down" - prometheus.yml: - rule_files: - - /etc/config/rules - - /etc/config/alerts - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - labels: - env: dev - #- job_name: grafana - # scrape_interval: 15s - # scrape_timeout: 10s - # metrics_path: /metrics - # scheme: http - # static_configs: - # - targets: - # - "grafana.dev-eu.nynja.net:80" - # labels: - # env: dev - - job_name: 'grafana' - honor_labels: true - kubernetes_sd_configs: - - role: service - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: grafana - - job_name: 'prometheus-pushgateway' - honor_labels: true - kubernetes_sd_configs: - - role: service - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__address__, 
__meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: pushgateway - - job_name: 'prometheus-alertmanager' - honor_labels: true - kubernetes_sd_configs: - - role: service - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: alertmanager - #- job_name: prometheus-alertmanager - # scrape_interval: 15s - # scrape_timeout: 10s - # metrics_path: /metrics - # scheme: http - # static_configs: - # - targets: - # - "10.43.242.113:80" - # labels: - # env: dev - # A scrape configuration for running Prometheus on a Kubernetes cluster. - # This uses separate scrape configs for cluster components (i.e. API server, node) - # and services to allow each to use different authentication configs. - # - # Kubernetes labels will be added as Prometheus labels on metrics via the - # `labelmap` relabeling action. - # Scrape config for API servers. - # - # Kubernetes exposes API servers as endpoints to the default/kubernetes - # service so this uses `endpoints` role and uses relabelling to only keep - # the endpoints associated with the default/kubernetes service using the - # default named port `https`. This works for single API server deployments as - # well as HA API server deployments. - - job_name: 'kubernetes-apiservers' - kubernetes_sd_configs: - - role: endpoints - # Default to scraping over https. If required, just disable this or change to - # `http`. - scheme: https - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - # Keep only the default/kubernetes service endpoints for the https port. This - # will add targets for each API server which Kubernetes adds an endpoint to - # the default/kubernetes service. - relabel_configs: - - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: default;kubernetes;https - - job_name: 'kubernetes-nodes' - # Default to scraping over https. 
If required, just disable this or change to - # `http`. - scheme: https - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics - - job_name: 'kubernetes-nodes-cadvisor' - # Default to scraping over https. If required, just disable this or change to - # `http`. - scheme: https - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - kubernetes_sd_configs: - - role: node - # This configuration will work only on kubelet 1.7.3+ - # As the scrape endpoints for cAdvisor have changed - # if you are using older version you need to change the replacement to - # replacement: /api/v1/nodes/${1}:4194/proxy/metrics - # more info here https://github.com/coreos/prometheus-operator/issues/633 - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - # Scrape config for service endpoints. - # - # The relabeling allows the actual service scrape endpoint to be configured - # via the following annotations: - # - # * `prometheus.io/scrape`: Only scrape services that have a value of `true` - # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. 
- # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: If the metrics are exposed on a different port to the - # service then set this appropriately. - - job_name: 'kubernetes-service-endpoints' - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name - # Example scrape config for probing services via the Blackbox Exporter. - # - # The relabeling allows the actual service scrape endpoint to be configured - # via the following annotations: + - alert: TargetDown + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' + summary: Targets are down + - alert: DaemonsetFailedPods + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready > 0 + for: 5m + labels: + severity: critical + annotations: + description: 'DaemonSet {{ $labels.daemonset }} is down on at least one node for more than 5 minutes' + summary: DaemonSet {{ $labels.daemonset }} is down on at least one node for more than 5 minutes + + - name: Cassandra + rules: + - alert: CassandraProbeDown + expr: up{job="cassandra"} != 3 + for: 30s + labels: + severity: major + annotations: + summary: "Cassandra probe down" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Cassandra probe of {{ $labels.instance }} ({{- $labels.env -}}) is down" + - alert: CassandraDashboardNotReachable + expr: probe_success{job="cassandra"} == 0 + for: 30s + labels: + severity: major + annotations: + summary: "Cassandra dashboard not reachable" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Cassandra dashboard on {{ $labels.instance }} ({{- $labels.env -}}) is not reachable" + - alert: CassandraDeadNodes + expr: kube_statefulset_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} - kube_statefulset_status_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} > 0 + for: 30s + labels: + severity: major + annotations: + summary: Cassandra dead nodes count + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Cassandra node of {{ $labels.instance }} ({{- $labels.env -}}) is down" + - alert: CassandraConnectionTimeouts + expr: sum(cassandra_stats{datacenter="europe-west3", cluster="cassandra",name="org:apache:cassandra:metrics:connection:totaltimeouts:oneminuterate"}) by (name) > 1 + for: 30s + labels: + severity: major + annotations: + summary: Cassandra - number of requests timeouts over 1 
min + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "Cassandra cluster: {{ $labels.release }} ({{- $labels.env -}}) - number of requests timeouts over 1 min" + - alert: MonitoringStackProbeDown + expr: up{job="monitoring-stack"} != 2 + for: 30s + labels: + severity: major + annotations: + summary: "Monitoring Stack probe down" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "The Monitoring Stack probe of {{ $labels.instance }} on {{ $labels.monitor }} ({{- $labels.monitor -}}) is down" + + - name: kube-state-metrics.rules + rules: + - alert: K8SDaemonSetsNotScheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled + > 0 + for: 10m + labels: + severity: warning + annotations: + summary: Daemonsets are not scheduled correctly + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: A number of daemonsets are not scheduled. + - alert: DaemonSetRolloutStuck + expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled + * 100 < 100 + for: 15m + labels: + severity: warning + annotations: + summary: DaemonSet is missing pods + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: Only {{ $value }}% of desired pods scheduled and ready for daemon + set {{ $labels.namespaces }}/{{ $labels.daemonset }} + - alert: PodFrequentlyRestarting + expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 + for: 10m + labels: + severity: warning + annotations: + summary: Pod is restarting frequently + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: Pod {{ $labels.pod}} of {{ $labels.instance }} on {{ $labels.monitor }}, ({{- $labels.monitor -}}) was restarted {{ $value }} + times within the last hour + + - name: kubernetes.rules + rules: + - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, + "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, + controller, pod_name, container_name) + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", + "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, + container_name) + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) + BY (cluster) + + - name: node.rules + rules: + - alert: NodeCPUUsage + expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[5m])) * 100)) > 80 + for: 30m + labels: + severity: warning + annotations: + summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High Node CPU usage detected" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "{{ $labels.instance }} on {{ $labels.monitor }}: Node CPU usage is above 80% (current value is: {{ $value }})" + - alert: NodeMemoryUsage + expr: ( avg by (instance) ((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Cached_bytes)/(node_memory_MemTotal_bytes)*100)) > 80 + for: 30m + labels: + severity: warning + annotations: + summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High node memory usage detected" + identifier: 
"Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "{{ $labels.instance }} on {{ $labels.monitor }}: Node Memory usage is above 80% (current value is: {{ $value }})" + + - name: istio.rules + rules: + - alert: IstioRequests-to-CallingService + expr: sum(rate(istio_requests_total{reporter="source",destination_service=~"calling-service.callconf.svc.cluster.local"}[30s])) > 20 + for: 1m + labels: + severity: major + annotations: + summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High Number of Istio Requests to Calling-Service" + identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" + description: "{{ $labels.instance }} on {{ $labels.monitor }}: The number of Istio requests to Calling-Service is above 20 per minute (current value is: {{ $value }})" + + prometheus.yml: + rule_files: + - /data/etc/config/rules + - /data/etc/config/alerts + + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + labels: + env: dev + + #- job_name: grafana + # scrape_interval: 15s + # scrape_timeout: 10s + # metrics_path: /metrics + # scheme: http + # static_configs: + # - targets: + # - "grafana.dev-eu.nynja.net:80" + # labels: + # env: dev + + - job_name: 'grafana' + honor_labels: true + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] + action: keep + regex: true + - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] + action: replace + target_label: env + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] + action: keep + regex: grafana + + + - job_name: 'prometheus-pushgateway' + honor_labels: true + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] + action: keep + regex: true + - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] + action: replace + target_label: env + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] + action: keep + regex: pushgateway + + - job_name: 'prometheus-alertmanager' + honor_labels: true + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] + action: keep + regex: true + - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] + action: replace + target_label: env + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] + action: keep + regex: alertmanager + + ## Scrape the metrics of the Messaging Service on DEV + # + - job_name: Messaging-Service + scrape_interval: 30s + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + static_configs: + - targets: + - "dev.ci.nynja.net:8888" + labels: + env: dev + + # A scrape configuration for running Prometheus on a Kubernetes cluster. 
+ # This uses separate scrape configs for cluster components (i.e. API server, node) + # and services to allow each to use different authentication configs. + # + # Kubernetes labels will be added as Prometheus labels on metrics via the + # `labelmap` relabeling action. + + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'kubernetes-apiservers' + + kubernetes_sd_configs: + - role: endpoints + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. # - # * `prometheus.io/probe`: Only probe services that have a value of `true` - - job_name: 'kubernetes-services' - metrics_path: /probe - params: - module: [http_2xx] - kubernetes_sd_configs: - - role: service - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] - action: keep - regex: true - - source_labels: [__address__] - target_label: __param_target - - target_label: __address__ - replacement: blackbox - - source_labels: [__param_target] - target_label: instance - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - target_label: kubernetes_name - # Example scrape config for pods + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + + + - job_name: 'kubernetes-nodes' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . 
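+      # The credential paths referenced just below are not chart-specific
+      # values: Kubernetes mounts them into every pod that runs under a
+      # service account, which is what lets an in-cluster Prometheus
+      # authenticate to the API server and kubelets without extra secrets.
+      # For reference, the standard mount (Kubernetes defaults, not something
+      # set in this file):
+      #
+      #   /var/run/secrets/kubernetes.io/serviceaccount/ca.crt   # cluster CA bundle
+      #   /var/run/secrets/kubernetes.io/serviceaccount/token    # bearer token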
+ tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. # - # The relabeling allows the actual pod scrape endpoint to be configured via the - # following annotations: + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics + + + - job_name: 'kubernetes-nodes-cadvisor' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # . + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. # - # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. - - job_name: 'kubernetes-pods' - kubernetes_sd_configs: - - role: pod - relabel_configs: - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: kubernetes_pod_name - - job_name: 'cassandra' - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scheme] - action: replace - target_label: __scheme__ - regex: (https?) 
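# A note on the address rewrite used by this job and the other annotation-driven
# jobs: the joined source labels arrive as "<address>;<annotation port>" (";" is
# Prometheus' default separator), the regex ([^:]+)(?::\d+)?;(\d+) captures the
# host while dropping any existing port, and the replacement $1:$2 reassembles
# the scrape target. A worked example, assuming a hypothetical service annotated
# with scrape port "9103":
#
#   __address__ = "10.0.0.5:80"   ->  joined input is "10.0.0.5:80;9103"
#   $1 = "10.0.0.5", $2 = "9103"  ->  __address__ becomes "10.0.0.5:9103"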
- - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: cassandra-exporter - networkPolicy: - ## Enable creation of NetworkPolicy resources. - ## - enabled: false \ No newline at end of file + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + # This configuration will work only on kubelet 1.7.3+ + # As the scrape endpoints for cAdvisor have changed + # if you are using older version you need to change the replacement to + # replacement: /api/v1/nodes/${1}:4194/proxy/metrics + # more info here https://github.com/coreos/prometheus-operator/issues/633 + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + + + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. + - job_name: 'kubernetes-service-endpoints' + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + + + + # Example scrape config for probing services via the Blackbox Exporter. 
+ # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/probe`: Only probe services that have a value of `true` + - job_name: 'kubernetes-services' + + metrics_path: /probe + params: + module: [http_2xx] + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: true + - source_labels: [__address__] + target_label: __param_target + - target_label: __address__ + replacement: blackbox + - source_labels: [__param_target] + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + target_label: kubernetes_name + + + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. + - job_name: 'kubernetes-pods' + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + + + - job_name: 'cassandra' + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] + action: replace + target_label: env + - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] + action: keep + regex: cassandra-exporter + + + - job_name: 'istio-mesh' + # Override the global default and scrape targets from this job every 5 seconds. 
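+    # Per-job settings shadow the corresponding `global:` defaults, so only the
+    # Istio jobs below poll every 5 seconds while all other jobs keep the
+    # global cadence. A sketch of the relationship (interval values are
+    # illustrative only):
+    #
+    #   global:
+    #     scrape_interval: 1m       # used by any job that sets nothing itself
+    #   scrape_configs:
+    #     - job_name: 'istio-mesh'
+    #       scrape_interval: 5s     # overrides the global value for this job only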
+ scrape_interval: 5s + + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-telemetry;prometheus + +# {{- if ne .Values.global.proxy.stats.prometheusPort 0. -}} + # Scrape config for envoy stats + - job_name: 'envoy-stats' + metrics_path: /stats/prometheus + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_container_port_name] + action: keep + regex: '.*-envoy-prom' + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:{{ .Values.global.proxy.stats.prometheusPort }} + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod_name + + metric_relabel_configs: + # Exclude some of the envoy metrics that have massive cardinality + # This list may need to be pruned further moving forward, as informed + # by performance and scalability testing. + - source_labels: [ cluster_name ] + regex: '(outbound|inbound|prometheus_stats).*' + action: drop + - source_labels: [ tcp_prefix ] + regex: '(outbound|inbound|prometheus_stats).*' + action: drop + - source_labels: [ listener_address ] + regex: '(.+)' + action: drop + - source_labels: [ http_conn_manager_listener_prefix ] + regex: '(.+)' + action: drop + - source_labels: [ http_conn_manager_prefix ] + regex: '(.+)' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_tls.*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_tcp_downstream.*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_http_(stats|admin).*' + action: drop + - source_labels: [ __name__ ] + regex: 'envoy_cluster_(lb|retry|bind|internal|max|original).*' + action: drop +# {{- end -}} + + - job_name: 'istio-policy' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-policy;http-monitoring + + - job_name: 'istio-telemetry' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-telemetry;http-monitoring + + - job_name: 'pilot' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. 
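+    # The Istio control-plane jobs below all follow the same shape: discover
+    # every Endpoints object, but only inside the namespace listed under
+    # `namespaces.names`, then keep the single service;port pair that matches
+    # the `keep` regex. A sketch with a hypothetical component name:
+    #
+    #   kubernetes_sd_configs:
+    #     - role: endpoints
+    #       namespaces:
+    #         names:
+    #           - "istio-system"
+    #   relabel_configs:
+    #     - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+    #       action: keep
+    #       regex: istio-example;http-monitoring   # hypothetical service name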
+ + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-pilot;http-monitoring + + - job_name: 'galley' + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - "istio-system" + + relabel_configs: + - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: istio-galley;http-monitoring + + +networkPolicy: + ## Enable creation of NetworkPolicy resources. + ## + enabled: false -- GitLab From 7035b4ac76ff3a61251bea449d10e514d42e185d Mon Sep 17 00:00:00 2001 From: Miroslav Hadzhiev Date: Fri, 30 Nov 2018 14:47:54 +0200 Subject: [PATCH 2/2] (1) Modulizing the scrape configs and the Prometheus rules; (2) upgrading Prometheus to v2.5.0; Alertmanager - to v0.15.3; (3) further optimizing the resource usage and limits; (4) starting using the Slack channel "#ops-alerts-dev" (until the next change re PagerDuty). --- monitoring/prometheus.yaml | 696 +------------------------------------ 1 file changed, 15 insertions(+), 681 deletions(-) diff --git a/monitoring/prometheus.yaml b/monitoring/prometheus.yaml index 35a3a25..d2a5122 100644 --- a/monitoring/prometheus.yaml +++ b/monitoring/prometheus.yaml @@ -37,7 +37,7 @@ spec: ## image: repository: prom/alertmanager - tag: v0.15.2 + tag: v0.15.3 pullPolicy: IfNotPresent ## Additional alertmanager container arguments ## @@ -63,14 +63,14 @@ spec: ## enabled: true ## The sealed PagerDuty Service Key (sealed with Bitnami's SealedSecret) - servicekey: "AgA2hDVLrHgSL8dVZCg1FVRijEd8UuzFyGFllVFAFpeUSBmfELoxKR2kddkNb89Zh1W/CB3wiY6itGB9EuNgu/i8K+JGSd8aZJBnF9yZRe3Ug7mI0r4CHZfVb/q3f9AZHoGbYJRlOFxjWO+Gk62hjetTCBFRQ6aTuYc49kYGPqwresz0EeQ8AYtezqag3+3S3hafCQk2VPg/7p9KSry6vJ60LWjQaxqUkUNgT/4785briyXqsKoSSxuu7PBQ52Gklax5YO2Ik1JikUxxn91MqUvNzd3TUCTWm6ssUZaDcG50/ABEoKHrT47BzCyhebYZHfF+pfKdEbPy58WwvBt2UhZprytkZPs4FuCmj9LVVzwv80Sct5ro+ZxwjCkzUwtzSgy1ZH9oNano/lHBbhkZJ3rx03sDZcCP26myEqabGob9sD4iutDG2MAq6ytgh8FLmGZ6nAKg+kG9mSstnAxi0PzClJTRix60YAWTzMYQWbi1fngo2JoK7opGoIgUzeBiqDrIfWjhbUXhLxSdqub6G4N2iFozCr5jWpwdJCPVbWLSj+eQyhb9/WmFJixED1QIr/hLJYtIs1zYAEu23vyn8Mctw2F/vO/SUp7QxP32pSuMHM/hrOa7B6jJEXgSvxShMgKE0IfkeVsnQPeWBZvVz/iLFTRoDF3SSV7kdZlHiPS2UQa1hyBG0SGq9thUGn5DRZC3Oi8rBm9r2mUzmpVN9KgMGmv5q/x6/SGV3UxoxJ2ZJg==" + servicekey: "AgCbKg+3Yr8pSJ6HZEpiInUt+A8QeNcQJgpKOfNoTVYKKRkm84fIN2ejKTAJP2jbLawJW54MFoR2Z+DGhzSy3lKXTMHyt7pCBVrJN0kqeyJBO/LmkEOFGhW2Cs8mCjm4XKEihlnAxhGdmWVjnfJ5GAsyC6cTB53KvBm+k6PWtDkdXc2wZRHW3IU/l697moGMy8SuEVXL3l40UaruPbgtZbJ0NwRaZylB+jKqleYFjeY2Vpl4UsK0MQ3CktiHVi4sB/4b85Ggz9TkiOwsQCuAaeQ9/Dm0ihgxxi9cOHO0Ng+Ji1nFmV4sXL8xSpMulOldr754vbHz+k1kiwXRROJlXHkSS6rvlm1bMsttQ2MIiuO8c8Dc2mVR4q2lWv/uZiDkcrU4YiXBLX8nYfEhCfZ3eGVO4npZay0nuGhtGadCRHaj3AESLqjGfnZ2CAD21QSA/IdOn1EjCj6ddeSrae0YvSnpUwyC99RkS7GBRmZeuUkBRiv5NKk0jt3rSROpcRIucSFWdcgBDbe1WBkV0rsX6zJz2TRAwn9/3xm0lkFcZv7/wNI51zxeOAG6FJDGPy92Jpi3azbsO78nVnUarXz+s1QJqIqpLAdsRvYY+OCWzV4vANi4v/Dq48QBYTGH1rtQkcuQ7nESy3Qq2uVvBEmOdxtvCIy1hnBuKSXzuE7OpRMYhE0XTDb0A0vfSg/IjBKFZaBd4ilRmYeraqRg6HUJjl6ebeWN8KcZyzIk6q7sOrAasA==" slackapi: ## If true, the specified Slack Channel will be used with Alertmanager and PagerDuty ## enabled: true ## The sealed Slack Channel API URL (sealed with Bitnami's SealedSecret) - url: 
"AgAjEehRPdWEzfjd74xhn0Zuh6SuSbZcRZe77Dcoj9wgXpfiTJEP+SEXUi9ft7UHutYSwvdi+c3JBRPkUAQFfgC99NglQMqT57Kyy6OGklW1RiA9PeAD2+sRYLuovNAhTpQyfE4b/5LV7wky/MRS+r2YOHjlsDXzKxyke2sTe+003/a8Ieo4lb9YuqzAfrH6eqgNlV1GFMya6hwI5AAJwXfWyV8TzZMgmcZ4QjIfSO+sqXaA+4asw0PXI+oGwtjvB7kuwUR9E/Y6LKnpsaCFXbddq7xfNH8cHU44eF0NPPSxTpBOqYguY0o/eVhSMullGUbW95m73rpyrSnV0YCJyOZjidmelANifintOT81cNVDXhkAOMAo+GcjStU1QC7LggS0+T6ThyPh62j4ZUXklJd/NJk6ltTmh38JK9EFzvDr2IBAaWbEvpPz05PZ5TDSfRrm3VSD5623LBqCZFCsf3QGyPfF3mu05Ya6wnaizdPaTp2EJMyNeeYOzP5rqezUY4IubVlclm6PGsfXdguJ1uSAyULpB87NG8+CzgMZ18TRiNwyQclZTEQ3npsLG53ZjByYg3Iu7DI/Kf0f/SK2c3yRpqM3qEtw6SMj4sgJWwNg0hgikxQUdMICbTPdhFiQOJXTohp+d+eNqUvm0mqdW1MvEf9jeiiWUg0DFgYUORdo7YxsL7PDjCgxPVNqdDRTMy2ZJMxNCmFaEfrHos9jYPi5PBN5TbTbVUu+aQmbEK0rG1xENir777SjhP8g5XZ6USe1QzJ1syXdGUaqhhTsS5eqrj7WfpyJs1MhZX9Png==" + url: "AgAyiVw+tXMqpdNTb0MuEE8ladTu3L2rlMPvLzJmzeHoA0sJaz/ywmXMYXAlG3Xx+C8tFiCybbzOyLeb8pZjv5t7G2YpsOgDTGIb06OBeDNHr3a6LOwWujSR3YrgNq9z1Dh2BpFrG3n3XSevxuI4GctzRHY8BC4g3oLi5yFTrUqZE5cmLS0G0gPZpXe1573HGTpm2vee1t0Z6imdhJkj4QUzRiENyD19QLrN0j+KALWtejW1UyvVjpPZufcUPNzoM9WRRdbS0XkKUwFlPWjSk4kJUJ+AszO1VsitSRFhg4GbBk/9fiqDL5DycNwgNdWOFKNBZ2gbLVqMqzXRyiO/1jK70c0bGkwnyk9hwJfeG5m715bL1CgC1hUrXH7FwN7k1qasZEKoQ+iuj0wRPxyAR2DCsrvUgVZQacTp0lOKoer9WE7V8NyD/8FsTpLQ5BTAoW+NhMtItD6l1AG48XQBWZxirK7ZTutesP6d8RmbYyfo0P4LcbwupBkc0Nmk+sAqd0MwrURnb4fMU/cFPkM2pxYvDSHFDYm/eWUDb7JVzanGNChfo6Md3lGjoQ+aM3QDA+vizXVqs2BZCuvCADtrOdl6YxNsDEQz8ED1k+BF9EOL1E8MlDRG1px8DnGtXBoOrcY+KZTlgnHJ6V2xbS75Plit1l8jvj1RvW51m2lPtXOLZqysRcHYYUUitZPx8c4/P4uLPps2/VNjErxoASWvT0c+UAn/uoVE0hG1ain3hDW8unQ0B1FzuP9EnS5xF8HUseO1BOspQP18/5xB2wqVQ2G09uvndha3y/F+0S5x+g==" ingress: @@ -140,7 +140,7 @@ spec: mountPath: /data ## alertmanager data Persistent Volume size ## - size: 75Gi + size: 30Gi ## alertmanager data Persistent Volume Storage Class ## If defined, storageClassName: ## If set to "-", storageClassName: "", which disables dynamic provisioning @@ -405,7 +405,7 @@ spec: ## image: repository: prom/prometheus - tag: v2.4.3 + tag: v2.5.0 pullPolicy: IfNotPresent ## The URL prefix at which the container can be accessed. Useful in the case the '-web.external-url' includes a slug ## so that the various internal URLs are still able to access as they are in the default case. @@ -549,7 +549,7 @@ spec: ## resources: limits: - cpu: 1 + cpu: 1250m memory: 4Gi requests: cpu: 750m @@ -596,6 +596,10 @@ spec: ## Set the namespace where Istio is installed - default: "istio-system" namespace: istio: "istio-system" + externalFiles: + rules: + enabled: true + confFile: "rules" pushgateway: ## If false, pushgateway will not be installed @@ -657,11 +661,11 @@ spec: ## resources: limits: - cpu: 50m - memory: 160Mi + cpu: 100m + memory: 320Mi requests: - cpu: 10m - memory: 32Mi + cpu: 20m + memory: 64Mi ## Security context to be added to push-gateway pods ## securityContext: {} @@ -711,14 +715,13 @@ spec: alertmanager.yml: global: - # slack_api_url: '' slack_api_url: ___ALERTMANAGER_SLACK_API_URL___ receivers: - name: default-receiver slack_configs: - - channel: '#ops-alerts' + - channel: '#ops-alerts-dev' send_resolved: false username: '{{ template "slack.default.username" . 
}}' color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' @@ -761,681 +764,12 @@ spec: ## serverFiles: alerts: {} - rules: - groups: - - name: general.rules - rules: - - alert: TargetDown - expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 - for: 10m - labels: - severity: warning - annotations: - description: '{{ $value }}% or more of {{ $labels.job }} targets are down.' - summary: Targets are down - - alert: DaemonsetFailedPods - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready > 0 - for: 5m - labels: - severity: critical - annotations: - description: 'DaemonSet {{ $labels.daemonset }} is down on at least one node for more than 5 minutes' - summary: DaemonSet {{ $labels.daemonset }} is down on at least one node for more than 5 minutes - - - name: Cassandra - rules: - - alert: CassandraProbeDown - expr: up{job="cassandra"} != 3 - for: 30s - labels: - severity: major - annotations: - summary: "Cassandra probe down" - identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "The Cassandra probe of {{ $labels.instance }} ({{- $labels.env -}}) is down" - - alert: CassandraDashboardNotReachable - expr: probe_success{job="cassandra"} == 0 - for: 30s - labels: - severity: major - annotations: - summary: "Cassandra dashboard not reachable" - identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "The Cassandra dashboard on {{ $labels.instance }} ({{- $labels.env -}}) is not reachable" - - alert: CassandraDeadNodes - expr: kube_statefulset_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} - kube_statefulset_status_replicas{namespace="cassandra",release="prometheus",statefulset="cassandra"} > 0 - for: 30s - labels: - severity: major - annotations: - summary: Cassandra dead nodes count - identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "The Cassandra node of {{ $labels.instance }} ({{- $labels.env -}}) is down" - - alert: CassandraConnectionTimeouts - expr: sum(cassandra_stats{datacenter="europe-west3", cluster="cassandra",name="org:apache:cassandra:metrics:connection:totaltimeouts:oneminuterate"}) by (name) > 1 - for: 30s - labels: - severity: major - annotations: - summary: Cassandra - number of requests timeouts over 1 min - identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "Cassandra cluster: {{ $labels.release }} ({{- $labels.env -}}) - number of requests timeouts over 1 min" - - alert: MonitoringStackProbeDown - expr: up{job="monitoring-stack"} != 2 - for: 30s - labels: - severity: major - annotations: - summary: "Monitoring Stack probe down" - identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "The Monitoring Stack probe of {{ $labels.instance }} on {{ $labels.monitor }} ({{- $labels.monitor -}}) is down" - - - name: kube-state-metrics.rules - rules: - - alert: K8SDaemonSetsNotScheduled - expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled - > 0 - for: 10m - labels: - severity: warning - annotations: - summary: Daemonsets are not scheduled correctly - identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: A number of daemonsets are not scheduled. 
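# Context for the rule expressions being moved out of values.yaml: these
# daemonset alerts compare kube-state-metrics gauges pairwise. The expression
# above,
#
#   kube_daemonset_status_desired_number_scheduled
#     - kube_daemonset_status_current_number_scheduled > 0
#
# holds while at least one desired pod has not been scheduled; the
# ready/desired ratio in the rule that follows expresses the same kind of gap
# as a percentage instead.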
- - alert: DaemonSetRolloutStuck - expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled - * 100 < 100 - for: 15m - labels: - severity: warning - annotations: - summary: DaemonSet is missing pods - identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: Only {{ $value }}% of desired pods scheduled and ready for daemon - set {{ $labels.namespaces }}/{{ $labels.daemonset }} - - alert: PodFrequentlyRestarting - expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 - for: 10m - labels: - severity: warning - annotations: - summary: Pod is restarting frequently - identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: Pod {{ $labels.pod}} of {{ $labels.instance }} on {{ $labels.monitor }}, ({{- $labels.monitor -}}) was restarted {{ $value }} - times within the last hour - - - name: kubernetes.rules - rules: - - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes - expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, - "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, - controller, pod_name, container_name) - - record: cluster_namespace_controller_pod_container:memory_usage:bytes - expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", - "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name, - container_name) - - record: cluster:memory_used:percent - expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes) - BY (cluster) - - - name: node.rules - rules: - - alert: NodeCPUUsage - expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{component="node-exporter",mode="idle"}[5m])) * 100)) > 80 - for: 30m - labels: - severity: warning - annotations: - summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High Node CPU usage detected" - identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "{{ $labels.instance }} on {{ $labels.monitor }}: Node CPU usage is above 80% (current value is: {{ $value }})" - - alert: NodeMemoryUsage - expr: ( avg by (instance) ((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Cached_bytes)/(node_memory_MemTotal_bytes)*100)) > 80 - for: 30m - labels: - severity: warning - annotations: - summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High node memory usage detected" - identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "{{ $labels.instance }} on {{ $labels.monitor }}: Node Memory usage is above 80% (current value is: {{ $value }})" - - - name: istio.rules - rules: - - alert: IstioRequests-to-CallingService - expr: sum(rate(istio_requests_total{reporter="source",destination_service=~"calling-service.callconf.svc.cluster.local"}[30s])) > 20 - for: 1m - labels: - severity: major - annotations: - summary: "{{ $labels.instance }} on {{ $labels.monitor }}: High Number of Istio Requests to Calling-Service" - identifier: "Instance: {{ $labels.instance }}; Prometheus Job: {{ $labels.job }}" - description: "{{ $labels.instance }} on {{ $labels.monitor }}: The number of Istio requests to Calling-Service is above 20 per minute (current value is: {{ $value }})" prometheus.yml: rule_files: - /data/etc/config/rules - /data/etc/config/alerts - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - labels: - env: dev - - #- 
job_name: grafana - # scrape_interval: 15s - # scrape_timeout: 10s - # metrics_path: /metrics - # scheme: http - # static_configs: - # - targets: - # - "grafana.dev-eu.nynja.net:80" - # labels: - # env: dev - - - job_name: 'grafana' - honor_labels: true - - kubernetes_sd_configs: - - role: service - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: grafana - - - - job_name: 'prometheus-pushgateway' - honor_labels: true - - kubernetes_sd_configs: - - role: service - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: pushgateway - - - job_name: 'prometheus-alertmanager' - honor_labels: true - - kubernetes_sd_configs: - - role: service - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: alertmanager - - ## Scrape the metrics of the Messaging Service on DEV - # - - job_name: Messaging-Service - scrape_interval: 30s - scrape_timeout: 10s - metrics_path: /metrics - scheme: http - static_configs: - - targets: - - "dev.ci.nynja.net:8888" - labels: - env: dev - - # A scrape configuration for running Prometheus on a Kubernetes cluster. - # This uses separate scrape configs for cluster components (i.e. API server, node) - # and services to allow each to use different authentication configs. - # - # Kubernetes labels will be added as Prometheus labels on metrics via the - # `labelmap` relabeling action. - - # Scrape config for API servers. - # - # Kubernetes exposes API servers as endpoints to the default/kubernetes - # service so this uses `endpoints` role and uses relabelling to only keep - # the endpoints associated with the default/kubernetes service using the - # default named port `https`. This works for single API server deployments as - # well as HA API server deployments. - - job_name: 'kubernetes-apiservers' - - kubernetes_sd_configs: - - role: endpoints - - # Default to scraping over https. If required, just disable this or change to - # `http`. - scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. 
This is separate to discovery auth
-      # configuration because discovery & scraping are two separate concerns in
-      # Prometheus. The discovery auth config is automatic if Prometheus runs inside
-      # the cluster. Otherwise, more config options have to be provided within the
-      # <kubernetes_sd_config>.
-      tls_config:
-        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-        # If your node certificates are self-signed or use a different CA to the
-        # master CA, then disable certificate verification below. Note that
-        # certificate verification is an integral part of a secure infrastructure
-        # so this should only be disabled in a controlled environment. You can
-        # disable certificate verification by uncommenting the line below.
-        #
-        insecure_skip_verify: true
-      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-      # Keep only the default/kubernetes service endpoints for the https port. This
-      # will add targets for each API server which Kubernetes adds an endpoint to
-      # the default/kubernetes service.
-      relabel_configs:
-        - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
-          action: keep
-          regex: default;kubernetes;https
-
-
-    - job_name: 'kubernetes-nodes'
-
-      # Default to scraping over https. If required, just disable this or change to
-      # `http`.
-      scheme: https
-
-      # This TLS & bearer token file config is used to connect to the actual scrape
-      # endpoints for cluster components. This is separate to discovery auth
-      # configuration because discovery & scraping are two separate concerns in
-      # Prometheus. The discovery auth config is automatic if Prometheus runs inside
-      # the cluster. Otherwise, more config options have to be provided within the
-      # <kubernetes_sd_config>.
-      tls_config:
-        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-        # If your node certificates are self-signed or use a different CA to the
-        # master CA, then disable certificate verification below. Note that
-        # certificate verification is an integral part of a secure infrastructure
-        # so this should only be disabled in a controlled environment. You can
-        # disable certificate verification by uncommenting the line below.
-        #
-        insecure_skip_verify: true
-      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-      kubernetes_sd_configs:
-        - role: node
-
-      relabel_configs:
-        - action: labelmap
-          regex: __meta_kubernetes_node_label_(.+)
-        - target_label: __address__
-          replacement: kubernetes.default.svc:443
-        - source_labels: [__meta_kubernetes_node_name]
-          regex: (.+)
-          target_label: __metrics_path__
-          replacement: /api/v1/nodes/${1}/proxy/metrics
-
-
-    - job_name: 'kubernetes-nodes-cadvisor'
-
-      # Default to scraping over https. If required, just disable this or change to
-      # `http`.
-      scheme: https
-
-      # This TLS & bearer token file config is used to connect to the actual scrape
-      # endpoints for cluster components. This is separate to discovery auth
-      # configuration because discovery & scraping are two separate concerns in
-      # Prometheus. The discovery auth config is automatic if Prometheus runs inside
-      # the cluster. Otherwise, more config options have to be provided within the
-      # <kubernetes_sd_config>.
-      tls_config:
-        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-        # If your node certificates are self-signed or use a different CA to the
-        # master CA, then disable certificate verification below. 
Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - # This configuration will work only on kubelet 1.7.3+ - # As the scrape endpoints for cAdvisor have changed - # if you are using older version you need to change the replacement to - # replacement: /api/v1/nodes/${1}:4194/proxy/metrics - # more info here https://github.com/coreos/prometheus-operator/issues/633 - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - - - # Scrape config for service endpoints. - # - # The relabeling allows the actual service scrape endpoint to be configured - # via the following annotations: - # - # * `prometheus.io/scrape`: Only scrape services that have a value of `true` - # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: If the metrics are exposed on a different port to the - # service then set this appropriately. - - job_name: 'kubernetes-service-endpoints' - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name - - - - # Example scrape config for probing services via the Blackbox Exporter. 
- # - # The relabeling allows the actual service scrape endpoint to be configured - # via the following annotations: - # - # * `prometheus.io/probe`: Only probe services that have a value of `true` - - job_name: 'kubernetes-services' - - metrics_path: /probe - params: - module: [http_2xx] - - kubernetes_sd_configs: - - role: service - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] - action: keep - regex: true - - source_labels: [__address__] - target_label: __param_target - - target_label: __address__ - replacement: blackbox - - source_labels: [__param_target] - target_label: instance - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - target_label: kubernetes_name - - - # Example scrape config for pods - # - # The relabeling allows the actual pod scrape endpoint to be configured via the - # following annotations: - # - # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. - - job_name: 'kubernetes-pods' - - kubernetes_sd_configs: - - role: pod - - relabel_configs: - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: kubernetes_pod_name - - - - job_name: 'cassandra' - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_scheme] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_env] - action: replace - target_label: env - - source_labels: [__meta_kubernetes_service_annotation_nynja_biz_probe] - action: keep - regex: cassandra-exporter - - - - job_name: 'istio-mesh' - # Override the global default and scrape targets from this job every 5 seconds. 
- scrape_interval: 5s - - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - "istio-system" - - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-telemetry;prometheus - -# {{- if ne .Values.global.proxy.stats.prometheusPort 0. -}} - # Scrape config for envoy stats - - job_name: 'envoy-stats' - metrics_path: /stats/prometheus - kubernetes_sd_configs: - - role: pod - - relabel_configs: - - source_labels: [__meta_kubernetes_pod_container_port_name] - action: keep - regex: '.*-envoy-prom' - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:{{ .Values.global.proxy.stats.prometheusPort }} - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod_name - - metric_relabel_configs: - # Exclude some of the envoy metrics that have massive cardinality - # This list may need to be pruned further moving forward, as informed - # by performance and scalability testing. - - source_labels: [ cluster_name ] - regex: '(outbound|inbound|prometheus_stats).*' - action: drop - - source_labels: [ tcp_prefix ] - regex: '(outbound|inbound|prometheus_stats).*' - action: drop - - source_labels: [ listener_address ] - regex: '(.+)' - action: drop - - source_labels: [ http_conn_manager_listener_prefix ] - regex: '(.+)' - action: drop - - source_labels: [ http_conn_manager_prefix ] - regex: '(.+)' - action: drop - - source_labels: [ __name__ ] - regex: 'envoy_tls.*' - action: drop - - source_labels: [ __name__ ] - regex: 'envoy_tcp_downstream.*' - action: drop - - source_labels: [ __name__ ] - regex: 'envoy_http_(stats|admin).*' - action: drop - - source_labels: [ __name__ ] - regex: 'envoy_cluster_(lb|retry|bind|internal|max|original).*' - action: drop -# {{- end -}} - - - job_name: 'istio-policy' - # Override the global default and scrape targets from this job every 5 seconds. - scrape_interval: 5s - # metrics_path defaults to '/metrics' - # scheme defaults to 'http'. - - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - "istio-system" - - - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-policy;http-monitoring - - - job_name: 'istio-telemetry' - # Override the global default and scrape targets from this job every 5 seconds. - scrape_interval: 5s - # metrics_path defaults to '/metrics' - # scheme defaults to 'http'. - - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - "istio-system" - - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-telemetry;http-monitoring - - - job_name: 'pilot' - # Override the global default and scrape targets from this job every 5 seconds. - scrape_interval: 5s - # metrics_path defaults to '/metrics' - # scheme defaults to 'http'. 
- - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - "istio-system" - - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-pilot;http-monitoring - - - job_name: 'galley' - # Override the global default and scrape targets from this job every 5 seconds. - scrape_interval: 5s - # metrics_path defaults to '/metrics' - # scheme defaults to 'http'. - - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - "istio-system" - - relabel_configs: - - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: istio-galley;http-monitoring - networkPolicy: ## Enable creation of NetworkPolicy resources. -- GitLab
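
Note on the relocated rules: the inline 'serverFiles.rules' groups deleted above are now meant to be shipped as an external file, wired in through the new 'externalFiles' block and loaded by Prometheus via 'rule_files' ('/data/etc/config/rules'). A minimal sketch of what that external file could look like, reusing one of the groups this patch removes from 'values.yaml' (the file name and mount path follow the values above; treat the rest as illustrative, not the team's actual rules file):

groups:
  - name: general.rules
    rules:
      - alert: TargetDown
        expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% or more of {{ $labels.job }} targets are down.'
          summary: Targets are down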
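
Note on the recurring relabel pattern: several jobs above rebuild '__address__' from the discovered address plus a scrape-port annotation. Prometheus joins the source labels with ';' before matching, so for '__address__' = '10.0.0.5:9090' and an annotation value of '9100', the regex input is '10.0.0.5:9090;9100' and the rewritten target address is '10.0.0.5:9100'. The fragment below restates the pattern with the captures annotated (label names copied from the 'grafana' job above; the sample values are illustrative):

relabel_configs:
  - source_labels: [__address__, __meta_kubernetes_service_annotation_nynja_biz_scrape_port]
    action: replace
    target_label: __address__
    # ([^:]+)   captures the host part, e.g. '10.0.0.5'
    # (?::\d+)? swallows any existing port, e.g. ':9090'
    # ;(\d+)    captures the annotation value, e.g. '9100'
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2   # -> '10.0.0.5:9100'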
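
Note on the sealed credentials: 'pagerduty.servicekey' and 'slackapi.url' carry ciphertext produced with Bitnami's sealed-secrets, so no plaintext secret data remains in 'values.yaml'; the '___ALERTMANAGER_SLACK_API_URL___' placeholder in 'alertmanager.yml' is presumably substituted with the unsealed value at deploy time. For orientation, a blob like the 'slackapi.url' value above is the kind of payload that normally sits under 'spec.encryptedData' in a SealedSecret resource, roughly as sketched below (resource name, namespace, and key are illustrative, not taken from this chart):

apiVersion: bitnami.com/v1alpha1
kind: SealedSecret
metadata:
  name: alertmanager-slack   # illustrative name
  namespace: monitoring      # illustrative namespace
spec:
  encryptedData:
    url: AgAyiVw+...         # sealed blob (truncated); only the cluster's sealed-secrets controller can decrypt it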