diff --git a/modules/monitoring-creating-scrape-sample-alerts.adoc b/modules/monitoring-creating-scrape-sample-alerts.adoc index f8343d22ba..2b96ad1dc1 100644 --- a/modules/monitoring-creating-scrape-sample-alerts.adoc +++ b/modules/monitoring-creating-scrape-sample-alerts.adoc @@ -30,38 +30,38 @@ metadata: labels: prometheus: k8s role: alert-rules - name: monitoring-stack-alerts <1> - namespace: ns1 <2> + name: monitoring-stack-alerts #<1> + namespace: ns1 #<2> spec: groups: - name: general.rules rules: - - alert: TargetDown <3> + - alert: TargetDown #<3> annotations: message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service - }} targets in {{ $labels.namespace }} namespace are down.' <4> + }} targets in {{ $labels.namespace }} namespace are down.' #<4> expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 - for: 10m <5> + for: 10m #<5> labels: - severity: warning <6> - - alert: ApproachingEnforcedSamplesLimit <7> + severity: warning #<6> + - alert: ApproachingEnforcedSamplesLimit #<7> annotations: - message: '{{ $labels.container }} container of the {{ $labels.pod }} pod in the {{ $labels.namespace }} namespace consumes {{ $value | humanizePercentage }} of the samples limit budget.' <8> - expr: scrape_samples_scraped/50000 > 0.8 <9> - for: 10m <10> + message: '{{ $labels.container }} container of the {{ $labels.pod }} pod in the {{ $labels.namespace }} namespace consumes {{ $value | humanizePercentage }} of the samples limit budget.' #<8> + expr: (scrape_samples_post_metric_relabeling / (scrape_sample_limit > 0)) > 0.9 #<9> + for: 10m #<10> labels: - severity: warning <11> + severity: warning #<11> ---- <1> Defines the name of the alerting rule. -<2> Specifies the user-defined project where the alerting rule will be deployed. -<3> The `TargetDown` alert will fire if the target cannot be scraped or is not available for the `for` duration. 
-<4> The message that will be output when the `TargetDown` alert fires. +<2> Specifies the user-defined project where the alerting rule is deployed. +<3> The `TargetDown` alert fires if the target cannot be scraped and is not available for the `for` duration. +<4> The message that is displayed when the `TargetDown` alert fires. <5> The conditions for the `TargetDown` alert must be true for this duration before the alert is fired. <6> Defines the severity for the `TargetDown` alert. -<7> The `ApproachingEnforcedSamplesLimit` alert will fire when the defined scrape sample threshold is reached or exceeded for the specified `for` duration. -<8> The message that will be output when the `ApproachingEnforcedSamplesLimit` alert fires. -<9> The threshold for the `ApproachingEnforcedSamplesLimit` alert. In this example the alert will fire when the number of samples per target scrape has exceeded 80% of the enforced sample limit of `50000`. The `for` duration must also have passed before the alert will fire. The `<number>` in the expression `scrape_samples_scraped/<number> > <threshold>` must match the `enforcedSampleLimit` value defined in the `user-workload-monitoring-config` `ConfigMap` object. +<7> The `ApproachingEnforcedSamplesLimit` alert fires when the defined scrape sample threshold remains exceeded for the specified `for` duration. +<8> The message that is displayed when the `ApproachingEnforcedSamplesLimit` alert fires. +<9> The threshold for the `ApproachingEnforcedSamplesLimit` alert. In this example, the alert fires when the number of ingested samples exceeds 90% of the configured limit. <10> The conditions for the `ApproachingEnforcedSamplesLimit` alert must be true for this duration before the alert is fired. <11> Defines the severity for the `ApproachingEnforcedSamplesLimit` alert. @@ -71,3 +71,9 @@ spec: ---- $ oc apply -f monitoring-stack-alerts.yaml ---- + +. Additionally, you can check if a target has hit the configured limit: + +.. 
In the *Administrator* perspective of the web console, go to *Observe* -> *Targets* and select an endpoint with a `Down` status that you want to check. ++ +The *Scrape failed: sample limit exceeded* message is displayed if the endpoint failed because the sample limit was exceeded.