From ef99770383adf547317dcf2b0ea4de32b1c83af4 Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Wed, 17 May 2023 14:29:31 -0400 Subject: Add subsystem metrics for the dispatcher (#13989) This adds a handful of metrics to /api/v2/metrics/ recorded from the dispatcher main process Adds logic in the dispatcher period tasks to calculate these for the last collection interval Reports worker count, task count, scale up events, and availability Add data to demo grafana dashboard --- awx/main/analytics/subsystem_metrics.py | 5 + awx/main/dispatch/pool.py | 18 +- awx/main/dispatch/worker/base.py | 25 +- tools/grafana/dashboards/demo_dashboard.json | 729 +++++++++++++++++++-------- 4 files changed, 561 insertions(+), 216 deletions(-) diff --git a/awx/main/analytics/subsystem_metrics.py b/awx/main/analytics/subsystem_metrics.py index 4e10ff98b8..9b93b98bda 100644 --- a/awx/main/analytics/subsystem_metrics.py +++ b/awx/main/analytics/subsystem_metrics.py @@ -209,6 +209,11 @@ class Metrics: SetFloatM('workflow_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'), SetFloatM('workflow_manager_spawn_workflow_graph_jobs_seconds', 'Time spent spawning workflow tasks'), SetFloatM('workflow_manager_get_tasks_seconds', 'Time spent loading workflow tasks from db'), + # dispatcher subsystem metrics + SetIntM('dispatcher_pool_scale_up_events', 'Number of times local dispatcher scaled up a worker since startup'), + SetIntM('dispatcher_pool_active_task_count', 'Number of active tasks in the worker pool when last task was submitted'), + SetIntM('dispatcher_pool_max_worker_count', 'Highest number of workers in worker pool in last collection interval, about 20s'), + SetFloatM('dispatcher_availability', 'Fraction of time (in last collection interval) dispatcher was able to receive messages'), ] # turn metric list into dictionary with the metric name as a key self.METRICS = {} diff --git a/awx/main/dispatch/pool.py b/awx/main/dispatch/pool.py index dd2fdffa2a..b8208012b6 100644 --- a/awx/main/dispatch/pool.py +++ b/awx/main/dispatch/pool.py @@ -339,6 +339,17 @@ class AutoscalePool(WorkerPool): # but if the task takes longer than the time defined here, we will force it to stop here self.task_manager_timeout = settings.TASK_MANAGER_TIMEOUT + settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD + # initialize some things for subsystem metrics periodic gathering + # the AutoscalePool class does not save these to redis directly, but reports via produce_subsystem_metrics + self.scale_up_ct = 0 + self.worker_count_max = 0 + + def produce_subsystem_metrics(self, metrics_object): + metrics_object.set('dispatcher_pool_scale_up_events', self.scale_up_ct) + metrics_object.set('dispatcher_pool_active_task_count', sum(len(w.managed_tasks) for w in self.workers)) + metrics_object.set('dispatcher_pool_max_worker_count', self.worker_count_max) + self.worker_count_max = len(self.workers) + @property def should_grow(self): if len(self.workers) < self.min_workers: @@ -443,7 +454,12 @@ class AutoscalePool(WorkerPool): idx = random.choice(range(len(self.workers))) return idx, self.workers[idx] else: - return super(AutoscalePool, self).up() + self.scale_up_ct += 1 + ret = super(AutoscalePool, self).up() + new_worker_ct = len(self.workers) + if new_worker_ct > self.worker_count_max: + self.worker_count_max = new_worker_ct + return ret def write(self, preferred_queue, body): if 'guid' in body: diff --git a/awx/main/dispatch/worker/base.py b/awx/main/dispatch/worker/base.py index 9a9d4c803c..c10564f6dd 100644 --- a/awx/main/dispatch/worker/base.py +++ b/awx/main/dispatch/worker/base.py @@ -19,6 +19,7 @@ from awx.main.dispatch.pool import WorkerPool from awx.main.dispatch import pg_bus_conn from awx.main.utils.common import log_excess_runtime from awx.main.utils.db import set_connection_name +import awx.main.analytics.subsystem_metrics as s_metrics if 'run_callback_receiver' in sys.argv: logger = logging.getLogger('awx.main.commands.run_callback_receiver') @@ -154,17 +155,30 @@ class AWXConsumerPG(AWXConsumerBase): self.pg_max_wait = settings.DISPATCHER_DB_DOWNTOWN_TOLLERANCE # if no successful loops have ran since startup, then we should fail right away self.pg_is_down = True # set so that we fail if we get database errors on startup - self.pg_down_time = time.time() - self.pg_max_wait # allow no grace period - self.last_cleanup = time.time() + init_time = time.time() + self.pg_down_time = init_time - self.pg_max_wait # allow no grace period + self.last_cleanup = init_time + self.subsystem_metrics = s_metrics.Metrics(auto_pipe_execute=False) + self.last_metrics_gather = init_time + self.listen_cumulative_time = 0.0 def run_periodic_tasks(self): self.record_statistics() # maintains time buffer in method - if time.time() - self.last_cleanup > 60: # same as cluster_node_heartbeat + current_time = time.time() + if current_time - self.last_cleanup > 60: # same as cluster_node_heartbeat # NOTE: if we run out of database connections, it is important to still run cleanup # so that we scale down workers and free up connections self.pool.cleanup() - self.last_cleanup = time.time() + self.last_cleanup = current_time + + # record subsystem metrics for the dispatcher + if current_time - self.last_metrics_gather > 20: + self.pool.produce_subsystem_metrics(self.subsystem_metrics) + self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather)) + self.subsystem_metrics.pipe_execute() + self.listen_cumulative_time = 0.0 + self.last_metrics_gather = current_time def run(self, *args, **kwargs): super(AWXConsumerPG, self).run(*args, **kwargs) @@ -180,11 +194,14 @@ class AWXConsumerPG(AWXConsumerBase): if init is False: self.worker.on_start() init = True + self.listen_start = time.time() for e in conn.events(yield_timeouts=True): + self.listen_cumulative_time += time.time() - self.listen_start if e is not None: self.process_task(json.loads(e.payload)) self.run_periodic_tasks() self.pg_is_down = False + self.listen_start = time.time() if self.should_stop: return except psycopg2.InterfaceError: diff --git a/tools/grafana/dashboards/demo_dashboard.json b/tools/grafana/dashboards/demo_dashboard.json index f654bc8e6f..7c0eacc42b 100644 --- a/tools/grafana/dashboards/demo_dashboard.json +++ b/tools/grafana/dashboards/demo_dashboard.json @@ -29,244 +29,549 @@ "liveNow": false, "panels": [ { - "collapsed": false, + "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, - "id": 37, - "panels": [], - "title": "System", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "awx_prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "id": 38, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "description": "Fraction of time dispatcher is listening for new messages", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 39, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 1 - }, - "id": 14, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "editorMode": "builder", + "expr": "dispatcher_availability", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Dispatcher Availability", + "type": "timeseries" }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ { "datasource": { "type": "prometheus", "uid": "awx_prometheus" }, - "editorMode": "builder", - "expr": "awx_database_connections_total", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "Database", - "type": "timeseries" - }, - { - "datasource": {}, - "fieldConfig": { - "defaults": { - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "light-blue", - "value": null + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, - { - "color": "red", - "value": 80 + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } - ] + }, + "overrides": [] }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 5, - "x": 12, - "y": 1 - }, - "id": 25, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "editorMode": "builder", + "expr": "dispatcher_pool_max_worker_count", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "editorMode": "builder", + "expr": "dispatcher_pool_active_task_count", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } ], - "fields": "/^tower_version$/", - "values": false + "title": "Dispatcher Workers", + "type": "timeseries" }, - "textMode": "auto" - }, - "pluginVersion": "9.1.6", - "targets": [ { "datasource": { "type": "prometheus", - "uid": "000000021" + "uid": "awx_prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 }, - "editorMode": "code", - "exemplar": false, - "expr": "awx_system_info", - "format": "table", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" + "id": 41, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "editorMode": "builder", + "expr": "dispatcher_pool_scale_up_events", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Dispatcher Pool Scale-Up Events", + "type": "timeseries" } ], - "title": "Controller Version", - "type": "stat" + "title": "Dispatcher", + "type": "row" }, { - "datasource": { - "type": "prometheus", - "uid": "awx_prometheus" + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 1 }, - "fieldConfig": { - "defaults": { - "displayName": "Instances", - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "light-blue", - "value": null + "id": 37, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } - ] - } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "editorMode": "builder", + "expr": "awx_database_connections_total", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Database", + "type": "timeseries" }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 5, - "x": 12, - "y": 5 - }, - "id": 13, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "light-blue", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 12, + "y": 26 + }, + "id": 25, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^tower_version$/", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "000000021" + }, + "editorMode": "code", + "exemplar": false, + "expr": "awx_system_info", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } ], - "fields": "", - "values": false + "title": "Controller Version", + "type": "stat" }, - "textMode": "auto" - }, - "pluginVersion": "9.1.6", - "targets": [ { "datasource": { "type": "prometheus", "uid": "awx_prometheus" }, - "editorMode": "code", - "expr": "count(awx_instance_info)", - "interval": "", - "legendFormat": " ", - "range": true, - "refId": "A" + "fieldConfig": { + "defaults": { + "displayName": "Instances", + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "light-blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 5, + "x": 12, + "y": 30 + }, + "id": 13, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "awx_prometheus" + }, + "editorMode": "code", + "expr": "count(awx_instance_info)", + "interval": "", + "legendFormat": " ", + "range": true, + "refId": "A" + } + ], + "title": "Controller Node Count", + "type": "stat" } ], - "title": "Controller Node Count", - "type": "stat" + "title": "System", + "type": "row" }, { "collapsed": true, @@ -274,7 +579,7 @@ "h": 1, "w": 24, "x": 0, - "y": 9 + "y": 2 }, "id": 35, "panels": [ @@ -385,7 +690,7 @@ "h": 8, "w": 12, "x": 0, - "y": 10 + "y": 26 }, "id": 8, "options": { @@ -523,7 +828,7 @@ "h": 8, "w": 12, "x": 12, - "y": 10 + "y": 26 }, "id": 29, "options": { @@ -616,7 +921,7 @@ "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 34 }, "id": 16, "options": { @@ -740,7 +1045,7 @@ "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 34 }, "id": 18, "options": { @@ -840,7 +1145,7 @@ "h": 6, "w": 12, "x": 0, - "y": 26 + "y": 42 }, "id": 27, "options": { @@ -932,7 +1237,7 @@ "h": 8, "w": 12, "x": 12, - "y": 26 + "y": 42 }, "id": 20, "options": { @@ -973,7 +1278,7 @@ "h": 1, "w": 24, "x": 0, - "y": 10 + "y": 3 }, "id": 33, "panels": [ @@ -1022,7 +1327,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1064,7 +1370,7 @@ "h": 8, "w": 12, "x": 0, - "y": 2 + "y": 27 }, "id": 12, "options": { @@ -1164,7 +1470,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1179,7 +1486,7 @@ "h": 8, "w": 12, "x": 12, - "y": 2 + "y": 27 }, "id": 10, "options": { @@ -1268,7 +1575,7 @@ "h": 1, "w": 24, "x": 0, - "y": 11 + "y": 4 }, "id": 31, "panels": [ @@ -1336,7 +1643,7 @@ "h": 8, "w": 12, "x": 0, - "y": 36 + "y": 28 }, "id": 26, "options": { @@ -1455,7 +1762,7 @@ "h": 8, "w": 12, "x": 12, - "y": 36 + "y": 28 }, "id": 24, "options": { @@ -1504,7 +1811,7 @@ } ], "refresh": "5s", - "schemaVersion": 37, + "schemaVersion": 38, "style": "dark", "tags": [], "templating": { @@ -1518,6 +1825,6 @@ "timezone": "", "title": "awx-demo", "uid": "GISWZOXnk", - "version": 12, + "version": 13, "weekStart": "" } -- cgit v1.2.3