diff --git a/prometheus/rules/elasticsearch_rules.yaml b/prometheus/rules/elasticsearch_rules.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ccf3beb85585af81b423984fa35c758740d5b38f
--- /dev/null
+++ b/prometheus/rules/elasticsearch_rules.yaml
@@ -0,0 +1,61 @@
+# Recording and alerting rules over the elasticsearch_* metrics.
+- name: elasticsearch
+  rules:
+  # Share of the data filesystem in use, as a percentage of its total size.
+  - record: elasticsearch_filesystem_data_used_percent
+    expr: >-
+      100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes)
+      / elasticsearch_filesystem_data_size_bytes
+  # Complement of the used percentage recorded above.
+  - record: elasticsearch_filesystem_data_free_percent
+    expr: 100 - elasticsearch_filesystem_data_used_percent
+  # Fewer than 8 nodes reported for 5 minutes; the threshold 8 is hard-coded.
+  - alert: ElasticsearchClusterHealthNumberOfNodes
+    expr: elasticsearch_cluster_health_number_of_nodes < 8
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: There are only {{$value}} < 8 ElasticSearch nodes running
+      summary: ElasticSearch running on less than 8 nodes
+  # Heap used/max ratio above 0.9, sustained for 15 minutes.
+  - alert: ElasticsearchHeapTooHigh
+    expr: >-
+      elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}
+      > 0.9
+    for: 15m
+    labels:
+      severity: critical
+    annotations:
+      description: The heap usage is over 90% for 15m
+      summary: ElasticSearch node {{$labels.node}} heap usage is high
+  # Process CPU metric above 90 for 1 minute.
+  - alert: ElasticSearchProcessCpuCritical
+    expr: elasticsearch_process_cpu_percent > 90
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      # Folded block scalar: renders as a single line with no trailing space.
+      description: >-
+        {{$labels.instance}} reports critical Elastic Search process cpu usage.
+        Please verify workload, or add another node to the cluster
+      summary: Critical ES process CPU usage on {{$labels.instance}}
+  # The color="green" series stays below 1 for 3 minutes.
+  # NOTE(review): presumably 1 means the cluster is green - confirm against the exporter.
+  - alert: ElasticSearchClusterHealthStatus
+    expr: elasticsearch_cluster_health_status{color="green"} < 1
+    for: 3m
+    labels:
+      severity: critical
+    annotations:
+      description: 'ElasticSearch status is "Unhealthy"!'
+      summary: Critical Elastic Search health status
+  # Recorded used percentage above 80 for 5 minutes.
+  - alert: ElasticSearchFilesystemDataUsage
+    expr: elasticsearch_filesystem_data_used_percent > 80
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      description: 'ElasticSearch filesystem data usage is more than 80 percent!'
+      summary: ElasticSearch - high filesystem data usage
diff --git a/prometheus/rules/fluent-bit_rules.yaml b/prometheus/rules/fluent-bit_rules.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d2310ae21e8a26205ddefd365d2424aab942f4b4
--- /dev/null
+++ b/prometheus/rules/fluent-bit_rules.yaml
@@ -0,0 +1,21 @@
+# Alerting rules over the fluentbit_output_* counters.
+- name: fluent-bit
+  rules:
+  # increase() accounts for counter resets, unlike comparing raw samples 5m apart.
+  - alert: FluentBitNewOutputErrors
+    expr: increase(fluentbit_output_errors_total[5m]) > 0
+    labels:
+      severity: critical
+    annotations:
+      identifier: "*Job:* `{{ $labels.job }}`, *Instance:* `{{ $labels.instance }}`"
+      description: 'Fluent Bit output errors detected in the last 5 minutes!'
+      summary: '*`CRITICAL` - Fluent_Bit output errors*'
+  # More than 100 retries within a 5m window, sustained for 5 minutes.
+  - alert: FluentBitRetries
+    expr: increase(fluentbit_output_retries_total[5m]) > 100
+    for: 5m
+    labels:
+      severity: critical
+    annotations:
+      description: 'Fluent Bit output retry errors is more than 100 in the last 5 minutes!'
+      summary: 'Fluent Bit output retry errors!'
diff --git a/prometheus/values.yaml b/prometheus/values.yaml index 87f3ebdbec83c6604bc0969f620929885a8485a6..ef2a82743ba42bc61fe80aab69075edf7861ba87 100644 --- a/prometheus/values.yaml +++ b/prometheus/values.yaml @@ -179,7 +179,8 @@ alertmanager: selector: - internal-gateway.default.svc.cluster.local hosts: - #- alertmanager-miro.dev-eu.nynja.net + #- alertmanager.dev-eu.nynja.net + #- alertmanager.staging.nynja.net ## Enabling peer mesh service end points for enabling the HA alert manager ## Ref: https://github.com/prometheus/alertmanager/blob/master/README.md @@ -585,7 +586,7 @@ server: selector: - internal-gateway.default.svc.cluster.local hosts: - #- prometheus-miro.dev-eu.nynja.net + #- prometheus.dev.nynja.net ## Prometheus server pod termination grace period ##