summaryrefslogtreecommitdiffstats
path: root/tools/grafana
diff options
context:
space:
mode:
authorElijah DeLee <kdelee@redhat.com>2022-09-07 19:01:10 +0200
committerElijah DeLee <kdelee@redhat.com>2022-09-14 22:23:53 +0200
commit10d06f219d0b4076798b350bd94d05e4d602b2bf (patch)
treeecc4eec3df236e85bd62e2f5707b26de73a560df /tools/grafana
parentMerge pull request #12868 from keithjgrant/12853-ws-event-duplication (diff)
downloadawx-10d06f219d0b4076798b350bd94d05e4d602b2bf.tar.xz
awx-10d06f219d0b4076798b350bd94d05e4d602b2bf.zip
add alerting rule to grafana
This rule alerts if the redis queue is larger than the rolling average event insertion rate per second multiplied by 120. In other words, it fires when the redis queue holds more events than we appear able to process in two minutes. The condition apparently has to hold for 60 seconds before the alert starts firing. Future commits will address how to configure contact points such as Slack. Shout-out to @jainnikhil30 and @rebeccahhh, who figured this out in a jam session this morning.
Diffstat (limited to 'tools/grafana')
-rw-r--r--tools/grafana/alerting/alerts.yml145
1 files changed, 145 insertions, 0 deletions
diff --git a/tools/grafana/alerting/alerts.yml b/tools/grafana/alerting/alerts.yml
new file mode 100644
index 0000000000..155bcf9733
--- /dev/null
+++ b/tools/grafana/alerting/alerts.yml
@@ -0,0 +1,145 @@
+---
+# Grafana alert-rule provisioning for the AWX development environment.
+#
+# Defines one rule, redis_queue_too_large_to_clear_in_2_min, which fires when
+# the callback receiver's redis queue holds more events than the recent mean
+# insertion rate (events/second) could drain in 120 seconds.
+apiVersion: 1
+groups:
+  - folder: awx
+    interval: 60s
+    name: awx_rules
+    orgId: 1
+    rules:
+      # `condition` must name the refId of the entry in `data` whose result
+      # decides the alert state; here that is the final math expression.
+      - condition: redis_queue_growing_faster_than_insertion_rate
+        dashboardUid: awx
+        data:
+          # Query 1: per-second rate of events inserted into the database on
+          # node awx_1, sampled over the relative range 300..0.
+          - datasourceUid: PBFA97CFB590B2093
+            model:
+              editorMode: code
+              expr: irate(callback_receiver_events_insert_db{node='awx_1'}[1m])
+              hide: false
+              intervalMs: 1000
+              legendFormat: __auto
+              maxDataPoints: 43200
+              range: true
+              refId: events_insertion_rate_per_second
+            queryType: ""
+            refId: events_insertion_rate_per_second
+            relativeTimeRange:
+              from: 300
+              to: 0
+          # Expression 1: reduce query 1 to a single value with `mean`.
+          # The UID '-100' is quoted so that it is read as a string.
+          - datasourceUid: '-100'
+            model:
+              # NOTE(review): this `conditions` list refers to a refId
+              # (event_insertion_rate) that is not defined in this rule; it
+              # looks like residue from a UI export -- confirm it is unused
+              # for `type: reduce` before removing it.
+              conditions:
+                - evaluator:
+                    params:
+                      - 3
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - event_insertion_rate
+                  reducer:
+                    params: []
+                    type: last
+                  type: query
+              datasource:
+                type: __expr__
+                uid: '-100'
+              expression: events_insertion_rate_per_second
+              hide: false
+              intervalMs: 1000
+              maxDataPoints: 43200
+              reducer: mean
+              refId: mean_event_insertion_rate
+              type: reduce
+            queryType: ""
+            refId: mean_event_insertion_rate
+            relativeTimeRange:
+              from: 0
+              to: 0
+          # Query 2: current size of the callback receiver's redis queue on
+          # node awx_1, sampled over the relative range 300..0.
+          - datasourceUid: PBFA97CFB590B2093
+            model:
+              datasource:
+                type: prometheus
+                uid: PBFA97CFB590B2093
+              editorMode: code
+              expr: callback_receiver_events_queue_size_redis{node='awx_1'}
+              hide: false
+              intervalMs: 1000
+              legendFormat: __auto
+              maxDataPoints: 43200
+              range: true
+              refId: redis_queue_size
+            queryType: ""
+            refId: redis_queue_size
+            relativeTimeRange:
+              from: 300
+              to: 0
+          # Expression 2: reduce query 2 to a single value. Despite the
+          # "mean_" prefix in the refId, the reducer is `last`, i.e. the most
+          # recent queue size is used. The refId is kept unchanged because the
+          # math expression below refers to it.
+          - datasourceUid: '-100'
+            model:
+              # NOTE(review): same apparent export residue as in expression 1.
+              conditions:
+                - evaluator:
+                    params:
+                      - 3
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - event_insertion_rate
+                  reducer:
+                    params: []
+                    type: last
+                  type: query
+              datasource:
+                type: __expr__
+                uid: '-100'
+              expression: redis_queue_size
+              hide: false
+              intervalMs: 1000
+              maxDataPoints: 43200
+              reducer: last
+              refId: mean_redis_queue_size
+              type: reduce
+            queryType: ""
+            refId: mean_redis_queue_size
+            relativeTimeRange:
+              from: 0
+              to: 0
+          # Expression 3 (the alert condition): true when the queue size
+          # exceeds 120 seconds' worth of the mean insertion rate.
+          - datasourceUid: '-100'
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - mean_redis_queue_size
+                  reducer:
+                    params: []
+                    type: avg
+                  type: query
+              datasource:
+                name: Expression
+                type: __expr__
+                uid: __expr__
+              # Kept on one line, with no stray backslash after the variable
+              # name, so the math parser sees only variables and operators.
+              expression: '${mean_redis_queue_size} > (${mean_event_insertion_rate} * 120)'
+              hide: false
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: redis_queue_growing_faster_than_insertion_rate
+              type: math
+            queryType: ""
+            refId: redis_queue_growing_faster_than_insertion_rate
+            relativeTimeRange:
+              from: 0
+              to: 0
+        # The condition must hold for 60s before the alert starts firing.
+        for: 60s
+        # Missing data is treated as healthy rather than as an alert.
+        noDataState: OK
+        panelId: 1
+        title: redis_queue_too_large_to_clear_in_2_min
+        uid: redis_queue_too_large_to_clear_in_2_min