feat: add monitoring and instrumentation
Some checks are pending
/ test (push) Waiting to run

This commit is contained in:
wilson 2026-05-22 22:40:17 +01:00
parent 84c5c29ee1
commit 1a026e5056
12 changed files with 399 additions and 10 deletions

View file

@ -19,6 +19,9 @@ run-prod-locally:
down: down:
docker compose down docker compose down
down-dev:
docker compose -f docker-compose-dev.yml down
logs-dev: logs-dev:
docker compose -f docker-compose-dev.yml logs -f docker compose -f docker-compose-dev.yml logs -f

View file

@ -1,6 +1,6 @@
import asyncio import asyncio
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from prometheus_fastapi_instrumentator import Instrumentator
from .routers.api import generation, pos from .routers.api import generation, pos
from fastapi import FastAPI from fastapi import FastAPI
@ -9,12 +9,14 @@ from .routers import media as media_router
from .routers.api.main import api_router from .routers.api.main import api_router
from .routers.bff.main import bff_router from .routers.bff.main import bff_router
from .outbound.storage_factory import init_storage from .outbound.storage_factory import init_storage
from .observability import setup_observability
from . import worker from . import worker
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
init_storage() init_storage()
setup_observability(app)
worker_task = asyncio.create_task(worker.worker_loop()) worker_task = asyncio.create_task(worker.worker_loop())
yield yield
worker_task.cancel() worker_task.cancel()
@ -29,7 +31,7 @@ app = FastAPI(title="Language Learning API", lifespan=lifespan)
app.include_router(api_router) app.include_router(api_router)
app.include_router(bff_router) app.include_router(bff_router)
app.include_router(media_router.router) app.include_router(media_router.router)
Instrumentator().instrument(app).expose(app, should_gzip=True)
@app.get("/health") @app.get("/health")
async def health() -> dict: async def health() -> dict:

49
api/app/observability.py Normal file
View file

@ -0,0 +1,49 @@
import os
from fastapi import FastAPI
from opentelemetry import metrics, trace
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.logging import LoggingInstrumentor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (
BatchSpanProcessor,
ConsoleSpanExporter,
)
from prometheus_client import start_http_server
# Process-wide latch; flipped to True only after setup_observability() has
# run to completion, so repeated calls do not configure things twice.
_observability_initialized = False


def setup_observability(app: FastAPI) -> None:
    """Wire up OpenTelemetry tracing, metrics and logging for this process.

    The first call that runs to completion:

    * reads ``OTEL_SERVICE_NAME``, ``OTEL_EXPORTER_PROMETHEUS_HOST`` and
      ``OTEL_EXPORTER_PROMETHEUS_PORT`` (defaults: ``"language-learning-api"``,
      ``"0.0.0.0"`` and ``"9464"``);
    * installs a global ``TracerProvider`` whose spans pass through a
      ``BatchSpanProcessor`` into a ``ConsoleSpanExporter``;
    * installs a global ``MeterProvider`` fed by a ``PrometheusMetricReader``;
    * enables ``LoggingInstrumentor`` with ``set_logging_format=True``;
    * starts the ``prometheus_client`` HTTP server on the configured address.

    Once a call has completed, later calls are no-ops.

    Args:
        app: The FastAPI application.
            NOTE(review): ``app`` is not referenced anywhere in this body, and
            ``FastAPIInstrumentor`` is imported by this module but never
            invoked here -- confirm whether per-request instrumentation was
            meant to be attached (and where, given this runs during lifespan
            startup).

    Raises:
        ValueError: If ``OTEL_EXPORTER_PROMETHEUS_PORT`` is not an integer.
    """
    global _observability_initialized

    if not _observability_initialized:
        # Resolve all configuration up front so a malformed port fails before
        # any global provider has been installed.
        env = os.environ
        otel_service = env.get("OTEL_SERVICE_NAME", "language-learning-api")
        listen_addr = env.get("OTEL_EXPORTER_PROMETHEUS_HOST", "0.0.0.0")
        listen_port = int(env.get("OTEL_EXPORTER_PROMETHEUS_PORT", "9464"))

        # One Resource shared by traces and metrics so both carry the same
        # service name.
        shared_resource = Resource.create({SERVICE_NAME: otel_service})

        # Tracing: spans are batched and written to the console exporter.
        tracing = TracerProvider(resource=shared_resource)
        trace.set_tracer_provider(tracing)
        tracing.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))

        # Metrics: readings are collected through the Prometheus reader.
        metrics.set_meter_provider(
            MeterProvider(
                resource=shared_resource,
                metric_readers=[PrometheusMetricReader()],
            )
        )

        LoggingInstrumentor().instrument(set_logging_format=True)

        # Expose OTel metrics for Prometheus scraping on the standard endpoint.
        start_http_server(port=listen_port, addr=listen_addr)

        _observability_initialized = True

View file

@ -17,7 +17,14 @@ dependencies = [
"google-genai>=1.0.0", "google-genai>=1.0.0",
"boto3>=1.35.0", "boto3>=1.35.0",
"httpx>=0.28.1", "httpx>=0.28.1",
"deepgram-sdk>=6.1.0" "deepgram-sdk>=6.1.0",
"opentelemetry-instrumentation-logging>=0.63b1",
"opentelemetry-instrumentation-fastapi>=0.63b1",
"opentelemetry-api>=1.42.1",
"opentelemetry-sdk>=1.42.1",
"opentelemetry-exporter-prometheus>=0.63b1",
"prometheus-client>=0.25.0",
"prometheus-fastapi-instrumentator>=7.1.0",
] ]
[build-system] [build-system]

View file

@ -59,6 +59,9 @@ services:
STORAGE_SECRET_KEY: ${STORAGE_SECRET_KEY} STORAGE_SECRET_KEY: ${STORAGE_SECRET_KEY}
STORAGE_BUCKET: ${STORAGE_BUCKET:-langlearn} STORAGE_BUCKET: ${STORAGE_BUCKET:-langlearn}
TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER:-stub} TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER:-stub}
OTEL_SERVICE_NAME: ${OTEL_SERVICE_NAME:-language-learning-api}
OTEL_EXPORTER_PROMETHEUS_HOST: 0.0.0.0
OTEL_EXPORTER_PROMETHEUS_PORT: ${OTEL_EXPORTER_PROMETHEUS_PORT:-9464}
depends_on: depends_on:
db: db:
condition: service_healthy condition: service_healthy
@ -72,9 +75,9 @@ services:
args: args:
PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://api:8000} PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://api:8000}
ports: ports:
- "${FRONTEND_PORT:-3000}:3000" - "${FRONTEND_PORT:-3001}:3001"
environment: environment:
ORIGIN: ${ORIGIN:-http://localhost:3000} ORIGIN: ${ORIGIN:-http://localhost:3001}
PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://api:8000} PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://api:8000}
PRIVATE_JWT_SECRET: ${JWT_SECRET} PRIVATE_JWT_SECRET: ${JWT_SECRET}
PRIVATE_DEEPL_API_KEY: ${DEEPL_API_KEY} PRIVATE_DEEPL_API_KEY: ${DEEPL_API_KEY}
@ -83,6 +86,71 @@ services:
- api - api
restart: unless-stopped restart: unless-stopped
prometheus:
image: prom/prometheus:v2.54.1
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.path=/prometheus
volumes:
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro,z
- prometheusdata:/prometheus
ports:
- "9090:9090"
depends_on:
- api
restart: unless-stopped
loki:
image: grafana/loki:3.1.1
command: -config.file=/etc/loki/loki-config.yml
volumes:
- ./monitoring/loki/loki-config.yml:/etc/loki/loki-config.yml:ro,z
- lokidata:/loki
ports:
- "3100:3100"
restart: unless-stopped
alloy:
image: grafana/alloy:v1.7.1
user: "0:0"
security_opt:
- label=disable
command:
- run
- --server.http.listen-addr=0.0.0.0:12345
- --storage.path=/var/lib/alloy/data
- /etc/alloy/config.alloy
volumes:
- ./monitoring/alloy/config.alloy:/etc/alloy/config.alloy:ro,z
- /var/run/docker.sock:/var/run/docker.sock:ro,z
- alloydata:/var/lib/alloy/data
ports:
- "12345:12345"
depends_on:
- loki
restart: unless-stopped
grafana:
image: grafana/grafana:11.2.0
volumes:
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro,z
- grafanadata:/var/lib/grafana
ports:
- "3000:3000"
environment:
GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
GF_USERS_ALLOW_SIGN_UP: "false"
GF_AUTH_ANONYMOUS_ENABLED: "false"
depends_on:
- prometheus
- loki
restart: unless-stopped
volumes: volumes:
pgdata: pgdata:
storagedata: storagedata:
prometheusdata:
grafanadata:
lokidata:
alloydata:

View file

@ -41,6 +41,9 @@ services:
BUNNY_TOKEN_AUTH_KEY: ${BUNNY_TOKEN_AUTH_KEY} BUNNY_TOKEN_AUTH_KEY: ${BUNNY_TOKEN_AUTH_KEY}
BUNNY_STORAGE_ENDPOINT: ${BUNNY_STORAGE_ENDPOINT} BUNNY_STORAGE_ENDPOINT: ${BUNNY_STORAGE_ENDPOINT}
TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER} TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER}
OTEL_SERVICE_NAME: ${OTEL_SERVICE_NAME:-language-learning-api}
OTEL_EXPORTER_PROMETHEUS_HOST: 0.0.0.0
OTEL_EXPORTER_PROMETHEUS_PORT: ${OTEL_EXPORTER_PROMETHEUS_PORT:-9464}
healthcheck: healthcheck:
test: test:
[ [
@ -68,7 +71,7 @@ services:
args: args:
PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL} PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL}
ports: ports:
- "${FRONTEND_PORT:-3000}:3000" - "${FRONTEND_PORT:-3001}:3000"
environment: environment:
ORIGIN: ${ORIGIN} ORIGIN: ${ORIGIN}
PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL} PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL}
@ -85,8 +88,94 @@ services:
cpus: "0.5" cpus: "0.5"
memory: 256M memory: 256M
prometheus:
image: prom/prometheus:v2.54.1
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.path=/prometheus
volumes:
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheusdata:/prometheus
ports:
- "127.0.0.1:${PROMETHEUS_PORT:-9090}:9090"
depends_on:
api:
condition: service_healthy
restart: unless-stopped
deploy:
resources:
limits:
cpus: "0.5"
memory: 512M
loki:
image: grafana/loki:3.1.1
command: -config.file=/etc/loki/loki-config.yml
volumes:
- ./monitoring/loki/loki-config.yml:/etc/loki/loki-config.yml:ro
- lokidata:/loki
ports:
- "127.0.0.1:${LOKI_PORT:-3100}:3100"
restart: unless-stopped
deploy:
resources:
limits:
cpus: "0.5"
memory: 512M
alloy:
image: grafana/alloy:v1.7.1
user: "0:0"
security_opt:
- label=disable
command:
- run
- --server.http.listen-addr=0.0.0.0:12345
- --storage.path=/var/lib/alloy/data
- /etc/alloy/config.alloy
volumes:
- ./monitoring/alloy/config.alloy:/etc/alloy/config.alloy:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
- alloydata:/var/lib/alloy/data
ports:
- "127.0.0.1:${ALLOY_PORT:-12345}:12345"
depends_on:
- loki
restart: unless-stopped
deploy:
resources:
limits:
cpus: "0.5"
memory: 256M
grafana:
image: grafana/grafana:11.2.0
volumes:
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
- grafanadata:/var/lib/grafana
ports:
- "127.0.0.1:${GRAFANA_PORT:-3000}:3000"
environment:
GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD}
GF_USERS_ALLOW_SIGN_UP: "false"
GF_AUTH_ANONYMOUS_ENABLED: "false"
depends_on:
- prometheus
- loki
restart: unless-stopped
deploy:
resources:
limits:
cpus: "0.5"
memory: 512M
volumes: volumes:
pgdata: pgdata:
prometheusdata:
grafanadata:
lokidata:
alloydata:
networks: networks:
default: default:

View file

@ -54,6 +54,9 @@ services:
STORAGE_SECRET_KEY: ${STORAGE_SECRET_KEY} STORAGE_SECRET_KEY: ${STORAGE_SECRET_KEY}
STORAGE_BUCKET: ${STORAGE_BUCKET:-langlearn} STORAGE_BUCKET: ${STORAGE_BUCKET:-langlearn}
TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER:-stub} TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER:-stub}
OTEL_SERVICE_NAME: ${OTEL_SERVICE_NAME:-language-learning-api}
OTEL_EXPORTER_PROMETHEUS_HOST: 0.0.0.0
OTEL_EXPORTER_PROMETHEUS_PORT: ${OTEL_EXPORTER_PROMETHEUS_PORT:-9464}
depends_on: depends_on:
db: db:
condition: service_healthy condition: service_healthy
@ -67,13 +70,78 @@ services:
args: args:
PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://localhost:8000} PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://localhost:8000}
ports: ports:
- "${FRONTEND_PORT:-3000}:3000" - "${FRONTEND_PORT:-3001}:3000"
environment: environment:
ORIGIN: ${ORIGIN:-http://localhost:3000} ORIGIN: ${ORIGIN:-http://localhost:3001}
depends_on: depends_on:
- api - api
restart: unless-stopped restart: unless-stopped
prometheus:
image: prom/prometheus:v2.54.1
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.path=/prometheus
volumes:
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheusdata:/prometheus
ports:
- "9090:9090"
depends_on:
- api
restart: unless-stopped
loki:
image: grafana/loki:3.1.1
command: -config.file=/etc/loki/loki-config.yml
volumes:
- ./monitoring/loki/loki-config.yml:/etc/loki/loki-config.yml:ro
- lokidata:/loki
ports:
- "3100:3100"
restart: unless-stopped
alloy:
image: grafana/alloy:v1.7.1
user: "0:0"
security_opt:
- label=disable
command:
- run
- --server.http.listen-addr=0.0.0.0:12345
- --storage.path=/var/lib/alloy/data
- /etc/alloy/config.alloy
volumes:
- ./monitoring/alloy/config.alloy:/etc/alloy/config.alloy:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
- alloydata:/var/lib/alloy/data
ports:
- "12345:12345"
depends_on:
- loki
restart: unless-stopped
grafana:
image: grafana/grafana:11.2.0
volumes:
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
- grafanadata:/var/lib/grafana
ports:
- "3000:3000"
environment:
GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
GF_USERS_ALLOW_SIGN_UP: "false"
GF_AUTH_ANONYMOUS_ENABLED: "false"
depends_on:
- prometheus
- loki
restart: unless-stopped
volumes: volumes:
pgdata: pgdata:
storagedata: storagedata:
prometheusdata:
grafanadata:
lokidata:
alloydata:

View file

@ -25,9 +25,9 @@ COPY --from=builder /app/build ./build
COPY --from=builder /app/node_modules ./node_modules COPY --from=builder /app/node_modules ./node_modules
COPY --from=builder /app/package.json ./package.json COPY --from=builder /app/package.json ./package.json
EXPOSE 3000 EXPOSE 3001
ENV PORT=3000 ENV PORT=3001
ENV HOST=0.0.0.0 ENV HOST=0.0.0.0
CMD ["node", "build/index.js"] CMD ["node", "build/index.js"]

View file

@ -0,0 +1,35 @@
// Grafana Alloy pipeline: discover Docker containers, attach labels, tail
// their logs and push them to Loki.

// Enumerate containers through the Docker socket (mounted read-only into the
// alloy service by the compose files).
discovery.docker "containers" {
  host = "unix:///var/run/docker.sock"
}

// Copy selected Docker metadata onto the labels that end up on each log stream.
discovery.relabel "containers" {
  targets = discovery.docker.containers.targets

  // The regex captures everything after the leading "/" of the container name.
  // NOTE(review): relies on the default relabel replacement to write the
  // captured group into "container" -- confirm against the Alloy docs.
  rule {
    source_labels = ["__meta_docker_container_name"]
    regex = "/(.*)"
    target_label = "container"
  }

  // Compose project label -> "compose_project".
  rule {
    source_labels = ["__meta_docker_container_label_com_docker_compose_project"]
    target_label = "compose_project"
  }

  // Compose service label -> "compose_service".
  rule {
    source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
    target_label = "compose_service"
  }
}

// Read logs for the relabelled targets and hand them to the Loki writer below.
loki.source.docker "containers" {
  host = "unix:///var/run/docker.sock"
  targets = discovery.relabel.containers.output
  forward_to = [loki.write.default.receiver]
}

// Push endpoint of the "loki" compose service (HTTP port 3100).
loki.write "default" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

View file

@ -0,0 +1,15 @@
# Grafana datasource provisioning: registers the Prometheus and Loki services
# from the compose stack as read-only datasources.
apiVersion: 1

datasources:
  # Metrics; marked as the default datasource.
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090  # compose service name + port 9090
    isDefault: true
    editable: false

  # Logs.
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100  # compose service name + Loki HTTP port
    editable: false

View file

@ -0,0 +1,40 @@
# Loki configuration: single replica, filesystem-backed storage under /loki
# (the compose files mount the "lokidata" volume at that path).
auth_enabled: false

server:
  http_listen_port: 3100  # port used by Alloy's push URL and the Grafana datasource
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  # One replica with an in-memory ring key-value store.
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    # TSDB index with filesystem object store, schema v13, daily index period.
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

storage_config:
  filesystem:
    directory: /loki/chunks

limits_config:
  volume_enabled: true

compactor:
  working_directory: /loki/compactor

ruler:
  # NOTE(review): no Alertmanager service listening on 9093 appears in the
  # compose snippets of this change -- confirm this URL is intentional.
  alertmanager_url: http://localhost:9093

View file

@ -0,0 +1,13 @@
# Prometheus configuration: scrape Prometheus itself and the API's metrics
# endpoint every 15 seconds.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # Self-scrape of the Prometheus server.
  - job_name: prometheus
    static_configs:
      - targets: ["prometheus:9090"]

  # API metrics served by the standalone HTTP server started in
  # setup_observability(); 9464 is the default of OTEL_EXPORTER_PROMETHEUS_PORT.
  # NOTE(review): the target port is hard-coded here, so overriding
  # OTEL_EXPORTER_PROMETHEUS_PORT in the compose environment requires updating
  # this target as well.
  - job_name: api
    metrics_path: /metrics
    static_configs:
      - targets: ["api:9464"]