summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.yamllint2
-rw-r--r--Makefile6
-rw-r--r--awx/main/utils/handlers.py47
-rw-r--r--awx/settings/defaults.py1
-rw-r--r--requirements/requirements_dev.txt6
-rw-r--r--tools/docker-compose/README.md10
-rw-r--r--tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j240
-rw-r--r--tools/docker-compose/ansible/roles/sources/templates/local_settings.py.j212
-rw-r--r--tools/grafana/datasources/loki_source.yml11
-rw-r--r--tools/loki/local-config.yaml96
-rw-r--r--tools/otel/otel-collector-config.yaml39
11 files changed, 270 insertions, 0 deletions
diff --git a/.yamllint b/.yamllint
index a937588cdc..87a0d311a6 100644
--- a/.yamllint
+++ b/.yamllint
@@ -11,6 +11,8 @@ ignore: |
# django template files
awx/api/templates/instance_install_bundle/**
.readthedocs.yaml
+ tools/loki
+ tools/otel
extends: default
diff --git a/Makefile b/Makefile
index 2f5223621f..5df99c544a 100644
--- a/Makefile
+++ b/Makefile
@@ -47,6 +47,10 @@ VAULT ?= false
VAULT_TLS ?= false
# If set to true docker-compose will also start a tacacs+ instance
TACACS ?= false
+# If set to true docker-compose will also start an OpenTelemetry Collector instance
+OTEL ?= false
+# If set to true docker-compose will also start a Loki instance
+LOKI ?= false
# If set to true docker-compose will install editable dependencies
EDITABLE_DEPENDENCIES ?= false
@@ -535,6 +539,8 @@ docker-compose-sources: .git/hooks/pre-commit
-e enable_vault=$(VAULT) \
-e vault_tls=$(VAULT_TLS) \
-e enable_tacacs=$(TACACS) \
+ -e enable_otel=$(OTEL) \
+ -e enable_loki=$(LOKI) \
-e install_editable_dependencies=$(EDITABLE_DEPENDENCIES) \
$(EXTRA_SOURCES_ANSIBLE_OPTS)
diff --git a/awx/main/utils/handlers.py b/awx/main/utils/handlers.py
index 15343463e8..4def0b6ba0 100644
--- a/awx/main/utils/handlers.py
+++ b/awx/main/utils/handlers.py
@@ -2,9 +2,11 @@
# All Rights Reserved.
# Python
+import base64
import logging
import sys
import traceback
+import os
from datetime import datetime
# Django
@@ -15,6 +17,15 @@ from django.utils.encoding import force_str
# AWX
from awx.main.exceptions import PostRunError
+# OTEL
+from opentelemetry._logs import set_logger_provider
+from opentelemetry.exporter.otlp.proto.grpc._log_exporter import OTLPLogExporter as OTLPGrpcLogExporter
+from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter as OTLPHttpLogExporter
+
+from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
+from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
+from opentelemetry.sdk.resources import Resource
+
class RSysLogHandler(logging.handlers.SysLogHandler):
append_nul = False
@@ -133,3 +144,39 @@ if settings.COLOR_LOGS is True:
pass
else:
ColorHandler = logging.StreamHandler
+
+
+class OTLPHandler(LoggingHandler):
+    """Logging handler that exports log records to an OTLP endpoint.
+
+    Builds a LoggerProvider tagged with ``service.name`` and
+    ``service.instance.id`` resource attributes, passes it to
+    ``set_logger_provider``, and attaches a BatchLogRecordProcessor wrapping
+    either the gRPC or the HTTP OTLP log exporter.
+
+    NOTE(review): the opentelemetry packages imported at module level appear to
+    be declared only in the dev requirements -- confirm they are installed
+    everywhere this module is imported.
+    """
+
+    def __init__(self, endpoint=None, protocol='grpc', service_name=None, instance_id=None, auth=None, username=None, password=None):
+        """Validate the configuration and wire up the exporter pipeline.
+
+        :param endpoint: OTLP endpoint URL; required.
+        :param protocol: ``'grpc'`` (default) or ``'http'``; selects the exporter.
+        :param service_name: value for ``service.name``. Defaults to
+            ``sys.argv[1]`` when present, else ``sys.argv[0]``, else
+            ``'unknown_service'``.
+        :param instance_id: value for ``service.instance.id``. Defaults to
+            ``os.uname().nodename``.
+        :param auth: ``'basic'`` adds an HTTP Basic ``Authorization`` header
+            built from ``username`` and ``password``; any other value adds none.
+        :param username: user name for basic auth.
+        :param password: password for basic auth.
+        :raises ValueError: if ``endpoint`` is empty, if ``auth == 'basic'``
+            without both credentials, or if ``protocol`` is not supported.
+        """
+        if not endpoint:
+            raise ValueError("endpoint required")
+
+        if auth == 'basic' and (username is None or password is None):
+            raise ValueError("auth type basic requires username and password parameters")
+
+        # Reject unknown protocols up front. Previously an unsupported value left
+        # the exporter variable unbound and failed with UnboundLocalError only
+        # after the global logger provider had already been replaced.
+        if protocol not in ('grpc', 'http'):
+            raise ValueError(f"unsupported protocol {protocol!r}; expected 'grpc' or 'http'")
+
+        self.endpoint = endpoint
+        self.service_name = service_name or (sys.argv[1] if len(sys.argv) > 1 else (sys.argv[0] or 'unknown_service'))
+        self.instance_id = instance_id or os.uname().nodename
+
+        logger_provider = LoggerProvider(
+            resource=Resource.create(
+                {
+                    "service.name": self.service_name,
+                    "service.instance.id": self.instance_id,
+                }
+            ),
+        )
+        set_logger_provider(logger_provider)
+
+        headers = {}
+        if auth == 'basic':
+            secret = f'{username}:{password}'
+            headers['Authorization'] = "Basic " + base64.b64encode(secret.encode()).decode()
+
+        if protocol == 'grpc':
+            # The gRPC exporter is always created with insecure=True.
+            otlp_exporter = OTLPGrpcLogExporter(endpoint=self.endpoint, insecure=True, headers=headers)
+        else:
+            # protocol == 'http' (validated above)
+            otlp_exporter = OTLPHttpLogExporter(endpoint=self.endpoint, headers=headers)
+        logger_provider.add_log_record_processor(BatchLogRecordProcessor(otlp_exporter))
+
+        # NOTSET: this handler does no level filtering of its own.
+        super().__init__(level=logging.NOTSET, logger_provider=logger_provider)
diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py
index 9a144777bb..12a880a634 100644
--- a/awx/settings/defaults.py
+++ b/awx/settings/defaults.py
@@ -880,6 +880,7 @@ LOGGING = {
'address': '/var/run/awx-rsyslog/rsyslog.sock',
'filters': ['external_log_enabled', 'dynamic_level_filter', 'guid'],
},
+ 'otel': {'class': 'logging.NullHandler'},
},
'loggers': {
'django': {'handlers': ['console']},
diff --git a/requirements/requirements_dev.txt b/requirements/requirements_dev.txt
index 15f662fa8e..48437e53f2 100644
--- a/requirements/requirements_dev.txt
+++ b/requirements/requirements_dev.txt
@@ -30,3 +30,9 @@ pip>=21.3 # PEP 660 – Editable installs for pyproject.toml based builds (wheel
debugpy
remote-pdb
sdb
+
+# OTEL
+opentelemetry-api==1.24.0
+opentelemetry-sdk==1.24.0
+opentelemetry-instrumentation-logging
+opentelemetry-exporter-otlp
diff --git a/tools/docker-compose/README.md b/tools/docker-compose/README.md
index 7139281d7b..22a3c7b390 100644
--- a/tools/docker-compose/README.md
+++ b/tools/docker-compose/README.md
@@ -613,3 +613,13 @@ docker exec -it -e VAULT_TOKEN=<token> tools_vault_1 vault kv get --address=http
### Prometheus and Grafana integration
See docs at https://github.com/ansible/awx/blob/devel/tools/grafana/README.md
+
+### OpenTelemetry Integration
+
+```bash
+OTEL=true GRAFANA=true LOKI=true PROMETHEUS=true make docker-compose
+```
+
+This starts the sidecar container `tools_otel_1` and configures AWX logging to send its logs to it. The OpenTelemetry Collector is configured to export those logs to Loki, and Grafana is configured with Loki as a datasource, so AWX logs can be viewed in Grafana.
+
+Grafana is available at `http://localhost:3001`.
diff --git a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2 b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2
index e6cb929482..c6a0b4ed90 100644
--- a/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2
+++ b/tools/docker-compose/ansible/roles/sources/templates/docker-compose.yml.j2
@@ -269,6 +269,42 @@ services:
# pg_notify will NOT work in transaction mode.
PGBOUNCER_POOL_MODE: session
{% endif %}
+{% if enable_otel|bool %}
+  # OpenTelemetry Collector sidecar; its configuration is mounted from
+  # tools/otel/otel-collector-config.yaml.
+  otel:
+    image: otel/opentelemetry-collector-contrib:0.88.0
+    container_name: tools_otel_1
+    hostname: otel
+    command: ["--config=/etc/otel-collector-config.yaml", ""]
+    networks:
+      - awx
+    ports:
+      - "4317:4317" # OTLP gRPC receiver
+      - "4318:4318" # OTLP http receiver
+      - "55679:55679" # zpages http://localhost:55679/debug/servicez /tracez
+    volumes:
+      - "../../otel/otel-collector-config.yaml:/etc/otel-collector-config.yaml"
+{% if enable_loki|bool %}
+    # Declare the dependency only when the loki service is actually rendered;
+    # otherwise OTEL=true with LOKI=false references an undefined service and
+    # docker-compose rejects the file.
+    depends_on:
+      - loki
+{% endif %}
+{% endif %}
+{% if enable_loki|bool %}
+ # Loki log store for the development environment. The OpenTelemetry Collector's
+ # loki exporter pushes to http://loki:3100 and the Grafana Loki datasource
+ # queries the same address.
+ loki:
+ image: grafana/loki:2.9.5
+ container_name: tools_loki_1
+ hostname: loki
+ ports:
+ - "3100:3100"
+ command: -config.file=/etc/loki/local-config.yaml
+ networks:
+ - awx
+ volumes:
+ # Named volume (declared under the top-level `volumes:` key) mounted at /loki.
+ - "loki_storage:/loki:rw"
+ #- "../../docker-compose/loki/volumes/index:/loki/index"
+ #- "../../docker-compose/loki/volumes/boltdb-cache:/loki/boltdb-cache"
+ - "../../loki/local-config.yaml:/etc/loki/local-config.yaml"
+ # NOTE(review): this requires a `grafana` service to be rendered. If that
+ # service is itself behind a Jinja condition, LOKI=true on its own would
+ # produce an invalid compose file -- confirm.
+ depends_on:
+ - grafana
+{% endif %}
+
{% if execution_node_count|int > 0 %}
receptor-hop:
image: {{ receptor_image }}
@@ -360,6 +396,10 @@ volumes:
grafana_storage:
name: tools_grafana_storage
{% endif %}
+{% if enable_loki|bool %}
+ loki_storage:
+ name: tools_loki_storage
+{% endif %}
networks:
awx:
diff --git a/tools/docker-compose/ansible/roles/sources/templates/local_settings.py.j2 b/tools/docker-compose/ansible/roles/sources/templates/local_settings.py.j2
index fe9596a7b0..fa93ccecc5 100644
--- a/tools/docker-compose/ansible/roles/sources/templates/local_settings.py.j2
+++ b/tools/docker-compose/ansible/roles/sources/templates/local_settings.py.j2
@@ -46,6 +46,18 @@ OPTIONAL_API_URLPATTERN_PREFIX = '{{ api_urlpattern_prefix }}'
# LOGGING['loggers']['django_auth_ldap']['handlers'] = ['console']
# LOGGING['loggers']['django_auth_ldap']['level'] = 'DEBUG'
+{% if enable_otel|bool %}
+# Replace the placeholder 'otel' handler settings with the real OTLP handler.
+LOGGING['handlers']['otel'].update(
+    {
+        'class': 'awx.main.utils.handlers.OTLPHandler',
+        'endpoint': 'http://otel:4317',
+    }
+)
+# Attach the otel handler to every logger that already declares a 'handlers' list.
+# Loggers without that key are left untouched: .get() returns a throwaway list
+# for them, so the append has no lasting effect.
+for logger_config in LOGGING['loggers'].values():
+    attached = logger_config.get('handlers', [])
+    if 'otel' not in attached:
+        attached.append('otel')
+{% endif %}
+
BROADCAST_WEBSOCKET_PORT = 8013
BROADCAST_WEBSOCKET_VERIFY_CERT = False
BROADCAST_WEBSOCKET_PROTOCOL = 'http'
diff --git a/tools/grafana/datasources/loki_source.yml b/tools/grafana/datasources/loki_source.yml
new file mode 100644
index 0000000000..4a6c740f34
--- /dev/null
+++ b/tools/grafana/datasources/loki_source.yml
@@ -0,0 +1,11 @@
+---
+apiVersion: 1
+
+datasources:
+ - name: Loki
+ type: loki
+ access: proxy
+ url: http://loki:3100
+ jsonData:
+ timeout: 60
+ maxLines: 100000
diff --git a/tools/loki/local-config.yaml b/tools/loki/local-config.yaml
new file mode 100644
index 0000000000..dde03673aa
--- /dev/null
+++ b/tools/loki/local-config.yaml
@@ -0,0 +1,96 @@
+auth_enabled: false
+
+server:
+ http_listen_port: 3100
+ grpc_server_max_recv_msg_size: 524288000 # 500 MB
+ grpc_server_max_send_msg_size: 524288000 # 500 MB, might be too much, be careful
+
+frontend_worker:
+ match_max_concurrent: true
+ grpc_client_config:
+ max_send_msg_size: 524288000 # 500 MB
+
+
+ingester:
+ max_chunk_age: 8766h
+
+common:
+ path_prefix: /loki
+ storage:
+ filesystem:
+ chunks_directory: /loki/chunks
+ rules_directory: /loki/rules
+ replication_factor: 1
+ ring:
+ kvstore:
+ store: inmemory
+
+# compactor:
+# retention_enabled: true
+# # cmeyers: A 1s interval looks wrong, but it works in practice.
+# compaction_interval: 1s # default 10m
+
+schema_config:
+ configs:
+ - from: 2020-10-24
+ store: boltdb-shipper
+ object_store: filesystem
+ schema: v11
+ index:
+ prefix: index_
+ period: 24h
+
+storage_config:
+ boltdb_shipper:
+ active_index_directory: /loki/index
+ cache_location: /loki/boltdb-cache
+
+ruler:
+ alertmanager_url: http://localhost:9093
+
+limits_config:
+ retention_period: 3y
+ # cmeyers: The default of 30m triggers a loop of queries that take a long time
+ # to complete and the UI times out
+ split_queries_by_interval: 1d
+ # cmeyers: Default of 30d1h limits grafana time queries. Can't, for example,
+ # query last 90 days
+ max_query_length: 3y
+ # cmeyers: Accepting old samples made the batch POST request succeed.
+ reject_old_samples: false
+ reject_old_samples_max_age: 365d
+
+ ingestion_rate_mb: 32
+ ingestion_burst_size_mb: 32
+ per_stream_rate_limit: 32M
+ per_stream_rate_limit_burst: 32M
+ ingestion_rate_strategy: local # Default: global
+ max_global_streams_per_user: 100000000
+ max_entries_limit_per_query: 100000000
+ max_query_series: 1000000
+ max_query_parallelism: 32 # Old Default: 14
+ max_streams_per_user: 100000000 # Old Default: 10000
+
+# Taken from aap-log-visualizer
+frontend:
+ max_outstanding_per_tenant: 2048
+
+query_scheduler:
+ max_outstanding_requests_per_tenant: 2048
+
+query_range:
+ parallelise_shardable_queries: false
+ split_queries_by_interval: 0
+
+# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration
+# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/
+#
+# Statistics help us better understand how Loki is used, and they show us performance
+# levels for most users. This helps us prioritize features and documentation.
+# For more information on what's sent, look at
+# https://github.com/grafana/loki/blob/main/pkg/usagestats/stats.go
+# Refer to the buildReport method to see what goes into a report.
+#
+# If you would like to disable reporting, uncomment the following lines:
+#analytics:
+# reporting_enabled: false
diff --git a/tools/otel/otel-collector-config.yaml b/tools/otel/otel-collector-config.yaml
new file mode 100644
index 0000000000..ebbf0606cb
--- /dev/null
+++ b/tools/otel/otel-collector-config.yaml
@@ -0,0 +1,39 @@
+receivers:
+  otlp:
+    protocols:
+      # AWX's OTLPHandler defaults to gRPC (port 4317).
+      grpc:
+      # Also enable OTLP/HTTP: docker-compose publishes 4318 as the
+      # "OTLP http receiver" and OTLPHandler accepts protocol='http', but
+      # without this entry nothing listens on that port.
+      http:
+
+exporters:
+ debug:
+ verbosity: detailed
+
+ loki:
+ endpoint: http://loki:3100/loki/api/v1/push
+ tls:
+ insecure: true
+ headers:
+ "X-Scope-OrgID": "1"
+ default_labels_enabled:
+ exporter: true
+ job: true
+ instance: true
+ level: true
+
+processors:
+ batch:
+
+extensions:
+ health_check:
+ zpages:
+ endpoint: ":55679"
+
+service:
+ pipelines:
+ logs:
+ receivers: [otlp]
+ processors: [batch]
+ exporters: [loki]
+
+ extensions:
+ - health_check
+ - zpages