From 513e35e594f781bd42b6614e24e671d1eaf0d1be Mon Sep 17 00:00:00 2001
From: Dimitar Zafirov
Date: Fri, 15 Mar 2019 17:19:27 +0200
Subject: [PATCH 1/5] new EFK alert rules created

---
 prometheus/values.yaml | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 87f3ebd..bd2fd1e 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -3,7 +3,7 @@ rbac:
 
 ## Define the NYNJA Group's current environment
 #
-nynja_env: ''
+nynja_env: 'dev'
 
 ## Define serviceAccount names for components. Defaults to component's fully qualified name.
 #
@@ -60,7 +60,7 @@ alertmanager:
     ##
     enabled: true
     ## The sealed PagerDuty Service Key (sealed with Bitnami's SealedSecret)
-    servicekey: ""
+    servicekey: "AgBh3etLxrPAvoydN8b6mC5gO1ABF3OgraQCH3nFbZ4Y/HgYuVt/VHJBaxYluQQW6aY/oPOqS4KekjU2n8dyQ9YCFFx8/RFWcgxjCHQky3DY3MeC8k4zpa5B2+h6nkIK0cI6a3hC4aNxVNc0K7G5Ll3tlXkCE1TuME5w81q5HP87jnTlEW5NC4nV84TLfSiPMV6ER3ppLt0HgbzJMTZ64yHVnAc3RYBcY8TJM3I578HwW4Ot/wi6VExipFjPIhH3bUfxX1C7bTNkQ/vcdRx6lEi/5LFD04ymKvIc/IvTqNC3mwDR8+DMDPSUnfntgzL4kgzdnn2q7/YWcshSH3nOvd4bKUQqi4HZ0cmGSm8JKvb+djeZPkHGmP9Ttmoz8GQ8xOfCSRHOAOoufmlf6JgQalCLpR7oejvVL7k/CLekBktQLBewEyec020sgM/13J8VlZGXzxCOUpkL8PlsqOfquJVJVKpRvEA7Nilot9W6AL1xgOCB4ZFo8VJ4gxnm2pfKFqqOXZcRUs5aMrt+DwAvX/rWSyl6viEcoWDHKai7e/nska+/eTqncUYFhUCnM9nAT5VfPBMnujukX0axlTSCcJMJ4vJnxgao5crHb3HsE9VeM/JeNKSQilVQwULVQ60J7oXHUENVtIexpM4QmoG4O14kg97aGbyvkGzn/HNC7JedsoBcXNaCjG94PcEhtgfYScvdk7s1aQq7PmEsG8JhTprBXFTPS7U8pazPbzM2Gt6yFA=="
 
   slackapi:
     ## If true, the specified Slack Channel will be used with Alertmanager and PagerDuty
@@ -68,7 +68,7 @@ alertmanager:
     #
     enabled: true
     ## The sealed Slack Channel API URL (sealed with Bitnami's SealedSecret)
-    url: ""
+    url: "AgAmHixjjBzVW3Q6G9KiCkbipGE+bL3n07nShM6sydNw75/pCrF8bzCnIHpdAvf1evfr05K+7COwLkN0Qx+VMyUuUYl+9wn/BRHkoD+jK7iYL4Hs5z4F2rMB/nznK+o6XhO/LLxrBLI6f61ZwLuoWlTLTMLuxbpdnkcIsMVzbIvfo2YxESut2DJMzysQhoJG0o+Zmr+l2yZf/5s5qC0rz6ePfH9rfvkla4qBSSj8Us7vUtPsi7pN4Cs1p2s0TOTBj2AaF3iU0tJCncMUwWzSFEGrED2gBmDUJicVjGxh0Pn4CRIBmCG5ddnUADvFWwFkUP50g+DLIw3I7yd12vrNWA651ClJf1QE+ap4+Lll4kSISgE5CqRQ/oL/cy9re0+uJ3Ta2dd1yTKWrJBOqA8FHf1Wa97l8ThKAFfpB3tPL5c1HJkhbtv73+pW5O72HcqcUw4gBzp4JQZflbDR6YgsnhA79c/1hdKcm+2Otm4dLehkUqQUkW6k+kor4C3ZT37ThQVl3srENTgkVVWZKQpaKC/JWBdQIoEDyL52Tsyp9yOBQ06ZD1NA9sqK+w2skXqklEOU7OsjQsPadfsvoyITaAA8dPv+QE1mcTAXCZuR88mmcuyexL43QIbKu/rnZGDWPUVrkPQOSrzndJq97W8OxyGczoqWJOCE8G1uhlen6fbTdutOGrTcUj11s6G19QnCg0+1CtvrUEutr++CwSos8NG2scZdyhUnHcGWc7xZYGOObko46l5Oiz7AdjFnADbDFjDBP/lP8ZDh+wtcKpmu8AFXMhWgN3uD53PpTib9Fg=="
 
   ingress:
     ## If true, alertmanager Ingress will be created
@@ -171,7 +171,7 @@ alertmanager:
     annotations:
       nynja.biz/scrape: "true"
       nynja.biz/scrape_port: "80"
-      nynja.biz/env: ""
+      nynja.biz/env: "dev"
       nynja.biz/probe: "alertmanager"
     labels: {}
     clusterIP: ""
@@ -179,7 +179,7 @@ alertmanager:
     selector:
     - internal-gateway.default.svc.cluster.local
     hosts:
-      #- alertmanager-miro.dev-eu.nynja.net
+      #- alertmanager.dev-eu.nynja.net
 
   ## Enabling peer mesh service end points for enabling the HA alert manager
   ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
@@ -429,7 +429,7 @@ server:
     ## external systems (federation, remote storage, Alertmanager).
     ##
     external_labels:
-      monitor: ''
+      monitor: 'dev'
 
   ## Additional Prometheus server container arguments
   ##
@@ -561,13 +561,13 @@ server:
   ## The environment name - shown as a headline in the Prometheus Alerts
   #
   externalLabels:
-    cluster: ""
+    cluster: "dev"
 
   service:
     annotations:
       nynja.biz/scrape: "true"
       nynja.biz/scrape_port: "80"
-      nynja.biz/env: ""
+      nynja.biz/env: "dev"
       nynja.biz/probe: "prometheus"
     labels: {}
     clusterIP: ""
@@ -676,7 +676,7 @@ pushgateway:
     annotations:
       nynja.biz/scrape: "true"
       nynja.biz/scrape_port: "9091"
-      nynja.biz/env: ""
+      nynja.biz/env: "dev"
       nynja.biz/probe: "pushgateway"
     labels: {}
     clusterIP: ""
@@ -743,7 +743,7 @@ alertmanagerFiles:
     - name: default-receiver
       slack_configs:
-      - channel: ''
+      - channel: '#ops-alerts-dev'
        send_resolved: true
        username: '{{ template "slack.default.username" . }}'
        color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}'
@@ -770,7 +770,7 @@
      group_by: ['alertname', 'cluster']
      routes:
      - match:
-          env: ''
+          env: 'dev'
        group_wait: 5m
        repeat_interval: 24h
-- 
GitLab
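A note on the sealed values introduced above: the long servicekey and url strings are ciphertext produced by Bitnami's SealedSecrets controller, so committing them does not expose the underlying PagerDuty key or Slack webhook. A minimal sketch of how such a raw value can be generated with the kubeseal CLI follows; the secret name and namespace are illustrative assumptions, not values taken from this chart:

    # Sketch: seal a raw value for embedding in values.yaml.
    # The --name/--namespace must match whatever SealedSecret the
    # chart's templates actually render (hypothetical names here).
    #
    #   echo -n "$PAGERDUTY_SERVICE_KEY" | kubeseal --raw \
    #     --name alertmanager-pagerduty \
    #     --namespace monitoring \
    #     --from-file=/dev/stdin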
From 82003bc7656505c1388facda7dd5708b73fc5752 Mon Sep 17 00:00:00 2001
From: Dimitar Zafirov
Date: Fri, 15 Mar 2019 17:20:48 +0200
Subject: [PATCH 2/5] new EFK alert rules created

---
 prometheus/rules/elasticsearch_rules.yaml | 41 +++++++++++++++++++++++
 prometheus/rules/fluent-bit_rules.yaml    | 10 ++++++
 2 files changed, 51 insertions(+)
 create mode 100644 prometheus/rules/elasticsearch_rules.yaml
 create mode 100644 prometheus/rules/fluent-bit_rules.yaml

diff --git a/prometheus/rules/elasticsearch_rules.yaml b/prometheus/rules/elasticsearch_rules.yaml
new file mode 100644
index 0000000..5d46b00
--- /dev/null
+++ b/prometheus/rules/elasticsearch_rules.yaml
@@ -0,0 +1,41 @@
+- name: elasticsearch
+  rules:
+  - record: elasticsearch_filesystem_data_used_percent
+    expr: 100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes)
+      / elasticsearch_filesystem_data_size_bytes
+  - record: elasticsearch_filesystem_data_free_percent
+    expr: 100 - elasticsearch_filesystem_data_used_percent
+  - alert: ElasticsearchClusterHealthNumberOfNodes
+    expr: elasticsearch_cluster_health_number_of_nodes < 8
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: There are only {{$value}} ElasticSearch nodes running (expected at least 8)
+      summary: ElasticSearch running on less than 8 nodes
+  - alert: ElasticsearchHeapTooHigh
+    expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}
+      > 0.9
+    for: 15m
+    labels:
+      severity: critical
+    annotations:
+      description: The heap usage is over 90% for 15m
+      summary: ElasticSearch node {{$labels.node}} heap usage is high
+  - alert: ElasticSearchCpuCritical
+    expr: elasticsearch_process_cpu_percent > 95
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      description: '{{$labels.instance}} reports critical cpu usage. Please verify
+        workload, or add another node to the cluster '
+      summary: Critical CPU usage on {{$labels.instance}}
+  - alert: ElasticSearchClusterHealthStatus
+    expr: elasticsearch_cluster_health_status{color="green"} < 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      description: 'ElasticSearch status is "Unhealthy"!'
+      summary: Critical Elastic Search health status
diff --git a/prometheus/rules/fluent-bit_rules.yaml b/prometheus/rules/fluent-bit_rules.yaml
new file mode 100644
index 0000000..1f6ad05
--- /dev/null
+++ b/prometheus/rules/fluent-bit_rules.yaml
@@ -0,0 +1,10 @@
+- name: fluent-bit
+  rules:
+  - alert: FluentBitNewOutputErrors
+    expr: fluentbit_output_errors_total offset 5m < fluentbit_output_errors_total
+    labels:
+      severity: critical
+    annotations:
+      identifier: "*Job:* `{{ $labels.job }}`, *Instance:* `{{ $labels.instance }}`"
+      description: 'Fluent Bit output errors detected in the last 5 minutes!'
+      summary: '*`CRITICAL` - Fluent_Bit output errors*'
-- 
GitLab
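A side note on the FluentBitNewOutputErrors expression above: comparing the counter against itself five minutes back fires on any growth, but it goes silent across counter resets, e.g. when a Fluent Bit pod restarts and the counter starts again from zero. If that matters, a reset-aware variant using increase() would be one alternative — a sketch only, not part of this series:

    - alert: FluentBitNewOutputErrors
      # increase() handles counter resets inside the 5m window,
      # which the "offset 5m <" comparison does not.
      expr: increase(fluentbit_output_errors_total[5m]) > 0
      labels:
        severity: critical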
From 968f41f71c8b2c197a2a8cdd4effcd1d23c4c344 Mon Sep 17 00:00:00 2001
From: Dimitar Zafirov
Date: Fri, 15 Mar 2019 17:47:00 +0200
Subject: [PATCH 3/5] New EFK Rules Created

---
 prometheus/rules/elasticsearch_rules.yaml | 18 +++++++++++++-----
 prometheus/rules/fluent-bit_rules.yaml    |  8 ++++++++
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/prometheus/rules/elasticsearch_rules.yaml b/prometheus/rules/elasticsearch_rules.yaml
index 5d46b00..ccf3beb 100644
--- a/prometheus/rules/elasticsearch_rules.yaml
+++ b/prometheus/rules/elasticsearch_rules.yaml
@@ -22,15 +22,15 @@
     annotations:
       description: The heap usage is over 90% for 15m
       summary: ElasticSearch node {{$labels.node}} heap usage is high
-  - alert: ElasticSearchCpuCritical
-    expr: elasticsearch_process_cpu_percent > 95
-    for: 3m
+  - alert: ElasticSearchProcessCpuCritical
+    expr: elasticsearch_process_cpu_percent > 90
+    for: 1m
     labels:
       severity: critical
     annotations:
-      description: '{{$labels.instance}} reports critical cpu usage. Please verify
+      description: '{{$labels.instance}} reports critical Elastic Search process cpu usage. Please verify
         workload, or add another node to the cluster '
-      summary: Critical CPU usage on {{$labels.instance}}
+      summary: Critical ES process CPU usage on {{$labels.instance}}
   - alert: ElasticSearchClusterHealthStatus
     expr: elasticsearch_cluster_health_status{color="green"} < 1
     for: 3m
@@ -39,3 +39,11 @@
     annotations:
       description: 'ElasticSearch status is "Unhealthy"!'
       summary: Critical Elastic Search health status
+  - alert: ElasticSearchFilesystemDataUsage
+    expr: elasticsearch_filesystem_data_used_percent > 80
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      description: 'ElasticSearch filesystem data usage is more than 80 percent!'
+      summary: ElasticSearch - high filesystem data usage
diff --git a/prometheus/rules/fluent-bit_rules.yaml b/prometheus/rules/fluent-bit_rules.yaml
index 1f6ad05..3f52b59 100644
--- a/prometheus/rules/fluent-bit_rules.yaml
+++ b/prometheus/rules/fluent-bit_rules.yaml
@@ -8,3 +8,11 @@
       identifier: "*Job:* `{{ $labels.job }}`, *Instance:* `{{ $labels.instance }}`"
       description: 'Fluent Bit output errors detected in the last 5 minutes!'
       summary: '*`CRITICAL` - Fluent_Bit output errors*'
+  - alert: FluentBitRetries
+    expr: ( fluentbit_output_retries_total - fluentbit_output_retries_total offset 5m ) > 50
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: 'Fluent Bit output retry errors are more than 100 in the last 5 minutes!'
+      summary: 'Fluent Bit output retry errors!'
-- 
GitLab
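These rule files carry no top-level groups: key, so presumably the chart's templates wrap them into the standard Prometheus rule-file format at render time (an assumption about this chart, not something the series shows). Under that assumption they can still be linted locally before a deploy by adding the wrapper by hand:

    # lint-wrap.yaml -- hypothetical wrapper file, used only for linting:
    #   promtool check rules lint-wrap.yaml
    groups:
    - name: elasticsearch
      rules:
      - record: elasticsearch_filesystem_data_used_percent
        expr: 100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes)
          / elasticsearch_filesystem_data_size_bytes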
From 3751a1026ce37d3b070ae585dd7a21c541c73e57 Mon Sep 17 00:00:00 2001
From: Dimitar Zafirov
Date: Tue, 16 Apr 2019 11:20:19 +0300
Subject: [PATCH 4/5] new Prometheus rules for fluent bit and elastic search

---
 prometheus/rules/fluent-bit_rules.yaml | 2 +-
 prometheus/values.yaml                 | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/prometheus/rules/fluent-bit_rules.yaml b/prometheus/rules/fluent-bit_rules.yaml
index 3f52b59..d2310ae 100644
--- a/prometheus/rules/fluent-bit_rules.yaml
+++ b/prometheus/rules/fluent-bit_rules.yaml
@@ -9,7 +9,7 @@
       description: 'Fluent Bit output errors detected in the last 5 minutes!'
       summary: '*`CRITICAL` - Fluent_Bit output errors*'
   - alert: FluentBitRetries
-    expr: ( fluentbit_output_retries_total - fluentbit_output_retries_total offset 5m ) > 50
+    expr: ( fluentbit_output_retries_total - fluentbit_output_retries_total offset 5m ) > 100
     for: 5m
     labels:
       severity: critical
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index bd2fd1e..55fc671 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -60,7 +60,7 @@ alertmanager:
     ##
     enabled: true
     ## The sealed PagerDuty Service Key (sealed with Bitnami's SealedSecret)
-    servicekey: "AgBh3etLxrPAvoydN8b6mC5gO1ABF3OgraQCH3nFbZ4Y/HgYuVt/VHJBaxYluQQW6aY/oPOqS4KekjU2n8dyQ9YCFFx8/RFWcgxjCHQky3DY3MeC8k4zpa5B2+h6nkIK0cI6a3hC4aNxVNc0K7G5Ll3tlXkCE1TuME5w81q5HP87jnTlEW5NC4nV84TLfSiPMV6ER3ppLt0HgbzJMTZ64yHVnAc3RYBcY8TJM3I578HwW4Ot/wi6VExipFjPIhH3bUfxX1C7bTNkQ/vcdRx6lEi/5LFD04ymKvIc/IvTqNC3mwDR8+DMDPSUnfntgzL4kgzdnn2q7/YWcshSH3nOvd4bKUQqi4HZ0cmGSm8JKvb+djeZPkHGmP9Ttmoz8GQ8xOfCSRHOAOoufmlf6JgQalCLpR7oejvVL7k/CLekBktQLBewEyec020sgM/13J8VlZGXzxCOUpkL8PlsqOfquJVJVKpRvEA7Nilot9W6AL1xgOCB4ZFo8VJ4gxnm2pfKFqqOXZcRUs5aMrt+DwAvX/rWSyl6viEcoWDHKai7e/nska+/eTqncUYFhUCnM9nAT5VfPBMnujukX0axlTSCcJMJ4vJnxgao5crHb3HsE9VeM/JeNKSQilVQwULVQ60J7oXHUENVtIexpM4QmoG4O14kg97aGbyvkGzn/HNC7JedsoBcXNaCjG94PcEhtgfYScvdk7s1aQq7PmEsG8JhTprBXFTPS7U8pazPbzM2Gt6yFA=="
+    servicekey: ""
 
   slackapi:
     ## If true, the specified Slack Channel will be used with Alertmanager and PagerDuty
@@ -68,7 +68,7 @@ alertmanager:
     #
     enabled: true
     ## The sealed Slack Channel API URL (sealed with Bitnami's SealedSecret)
-    url: "AgAmHixjjBzVW3Q6G9KiCkbipGE+bL3n07nShM6sydNw75/pCrF8bzCnIHpdAvf1evfr05K+7COwLkN0Qx+VMyUuUYl+9wn/BRHkoD+jK7iYL4Hs5z4F2rMB/nznK+o6XhO/LLxrBLI6f61ZwLuoWlTLTMLuxbpdnkcIsMVzbIvfo2YxESut2DJMzysQhoJG0o+Zmr+l2yZf/5s5qC0rz6ePfH9rfvkla4qBSSj8Us7vUtPsi7pN4Cs1p2s0TOTBj2AaF3iU0tJCncMUwWzSFEGrED2gBmDUJicVjGxh0Pn4CRIBmCG5ddnUADvFWwFkUP50g+DLIw3I7yd12vrNWA651ClJf1QE+ap4+Lll4kSISgE5CqRQ/oL/cy9re0+uJ3Ta2dd1yTKWrJBOqA8FHf1Wa97l8ThKAFfpB3tPL5c1HJkhbtv73+pW5O72HcqcUw4gBzp4JQZflbDR6YgsnhA79c/1hdKcm+2Otm4dLehkUqQUkW6k+kor4C3ZT37ThQVl3srENTgkVVWZKQpaKC/JWBdQIoEDyL52Tsyp9yOBQ06ZD1NA9sqK+w2skXqklEOU7OsjQsPadfsvoyITaAA8dPv+QE1mcTAXCZuR88mmcuyexL43QIbKu/rnZGDWPUVrkPQOSrzndJq97W8OxyGczoqWJOCE8G1uhlen6fbTdutOGrTcUj11s6G19QnCg0+1CtvrUEutr++CwSos8NG2scZdyhUnHcGWc7xZYGOObko46l5Oiz7AdjFnADbDFjDBP/lP8ZDh+wtcKpmu8AFXMhWgN3uD53PpTib9Fg=="
+    url: ""
 
   ingress:
     ## If true, alertmanager Ingress will be created
@@ -179,7 +179,8 @@ alertmanager:
     selector:
     - internal-gateway.default.svc.cluster.local
     hosts:
-      #- alertmanager.dev-eu.nynja.net
+      - alertmanager.dev-eu.nynja.net
+      #- alertmanager.staging.nynja.net
 
   ## Enabling peer mesh service end points for enabling the HA alert manager
   ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md
@@ -585,7 +586,7 @@ server:
     selector:
     - internal-gateway.default.svc.cluster.local
     hosts:
-      #- prometheus-miro.dev-eu.nynja.net
+      - prometheus.dev.nynja.net
 
   ## Prometheus server pod termination grace period
   ##
-- 
GitLab
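Patch 4 brings the FluentBitRetries threshold in line with its description (both now say 100). A change like that can be pinned down with Prometheus's rule unit tests; the sketch below assumes the same hand-wrapped rule file as the linting note above, and the file name and series values are made up for illustration:

    # fluent-bit_rules_test.yaml -- hypothetical test file, run with:
    #   promtool test rules fluent-bit_rules_test.yaml
    rule_files:
      - lint-wrap-fluent-bit.yaml   # hypothetical wrapped copy of fluent-bit_rules.yaml
    evaluation_interval: 1m
    tests:
      - interval: 1m
        input_series:
          - series: 'fluentbit_output_retries_total{job="fluent-bit"}'
            values: '0+30x10'   # +30 retries/min => 150 per 5m window, above the 100 threshold
        alert_rule_test:
          # active from 5m, "for: 5m" elapses at 10m, so it is firing by 11m
          - eval_time: 11m
            alertname: FluentBitRetries
            exp_alerts:
              - exp_labels:
                  severity: critical
                  job: fluent-bit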
From b0952a6eeb7f3c8217d88cda742d63313d81b368 Mon Sep 17 00:00:00 2001
From: Dimitar Zafirov
Date: Tue, 16 Apr 2019 11:23:48 +0300
Subject: [PATCH 5/5] new Prometheus rules for fluent bit and elastic search

---
 prometheus/values.yaml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 55fc671..ef2a827 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -3,7 +3,7 @@ rbac:
 
 ## Define the NYNJA Group's current environment
 #
-nynja_env: 'dev'
+nynja_env: ''
 
 ## Define serviceAccount names for components. Defaults to component's fully qualified name.
 #
@@ -171,7 +171,7 @@ alertmanager:
     annotations:
       nynja.biz/scrape: "true"
       nynja.biz/scrape_port: "80"
-      nynja.biz/env: "dev"
+      nynja.biz/env: ""
       nynja.biz/probe: "alertmanager"
     labels: {}
     clusterIP: ""
@@ -179,7 +179,7 @@ alertmanager:
     selector:
     - internal-gateway.default.svc.cluster.local
     hosts:
-      - alertmanager.dev-eu.nynja.net
+      #- alertmanager.dev-eu.nynja.net
       #- alertmanager.staging.nynja.net
 
   ## Enabling peer mesh service end points for enabling the HA alert manager
@@ -430,7 +430,7 @@ server:
     ## external systems (federation, remote storage, Alertmanager).
     ##
     external_labels:
-      monitor: 'dev'
+      monitor: ''
 
   ## Additional Prometheus server container arguments
   ##
@@ -562,13 +562,13 @@ server:
   ## The environment name - shown as a headline in the Prometheus Alerts
   #
   externalLabels:
-    cluster: "dev"
+    cluster: ""
 
   service:
     annotations:
      nynja.biz/scrape: "true"
      nynja.biz/scrape_port: "80"
-      nynja.biz/env: "dev"
+      nynja.biz/env: ""
      nynja.biz/probe: "prometheus"
    labels: {}
    clusterIP: ""
@@ -586,7 +586,7 @@ server:
    selector:
    - internal-gateway.default.svc.cluster.local
    hosts:
-      - prometheus.dev.nynja.net
+      #- prometheus.dev.nynja.net
 
  ## Prometheus server pod termination grace period
  ##
@@ -677,7 +677,7 @@ pushgateway:
    annotations:
      nynja.biz/scrape: "true"
      nynja.biz/scrape_port: "9091"
-      nynja.biz/env: "dev"
+      nynja.biz/env: ""
      nynja.biz/probe: "pushgateway"
    labels: {}
    clusterIP: ""
@@ -744,7 +744,7 @@ alertmanagerFiles:
    - name: default-receiver
      slack_configs:
-      - channel: '#ops-alerts-dev'
+      - channel: ''
        send_resolved: true
        username: '{{ template "slack.default.username" . }}'
        color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}'
@@ -771,7 +771,7 @@
      group_by: ['alertname', 'cluster']
      routes:
      - match:
-          env: 'dev'
+          env: ''
        group_wait: 5m
        repeat_interval: 24h
-- 
GitLab
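Patch 5 walks the environment-specific values back to empty placeholders, which suggests the 'dev' settings are meant to be supplied per environment at deploy time rather than hardcoded in the chart. One common way to do that is a per-environment values file layered on at install time; the file below is an illustrative sketch that mirrors the values this series briefly hardcoded, not a file that exists in the repo:

    # values-dev.yaml -- hypothetical per-environment override, applied with:
    #   helm upgrade --install prometheus ./prometheus -f values-dev.yaml
    nynja_env: 'dev'
    server:
      externalLabels:
        cluster: 'dev'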