diff --git a/Makefile b/Makefile
index dafacf8..35c94b5 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,9 @@ run-prod-locally:
 down:
 	docker compose down
 
+down-dev:
+	docker compose -f docker-compose-dev.yml down
+
 logs-dev:
 	docker compose -f docker-compose-dev.yml logs -f
 
diff --git a/api/app/main.py b/api/app/main.py
index 1028c4e..4db2caa 100644
--- a/api/app/main.py
+++ b/api/app/main.py
@@ -1,6 +1,6 @@
 import asyncio
 from contextlib import asynccontextmanager
-
+from prometheus_fastapi_instrumentator import Instrumentator
 from .routers.api import generation, pos
 from fastapi import FastAPI
 
@@ -9,12 +9,14 @@ from .routers import media as media_router
 from .routers.api.main import api_router
 from .routers.bff.main import bff_router
 from .outbound.storage_factory import init_storage
+from .observability import setup_observability
 from . import worker
 
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     init_storage()
+    setup_observability(app)
     worker_task = asyncio.create_task(worker.worker_loop())
     yield
     worker_task.cancel()
@@ -29,7 +31,7 @@ app = FastAPI(title="Language Learning API", lifespan=lifespan)
 app.include_router(api_router)
 app.include_router(bff_router)
 app.include_router(media_router.router)
-
+Instrumentator().instrument(app).expose(app, should_gzip=True)
 
 @app.get("/health")
 async def health() -> dict:
diff --git a/api/app/observability.py b/api/app/observability.py
new file mode 100644
index 0000000..38cb8bb
--- /dev/null
+++ b/api/app/observability.py
@@ -0,0 +1,73 @@
+"""OpenTelemetry and Prometheus wiring for the API process."""
+
+import logging
+import os
+
+from fastapi import FastAPI
+from opentelemetry import metrics, trace
+from opentelemetry.exporter.prometheus import PrometheusMetricReader
+from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
+from opentelemetry.instrumentation.logging import LoggingInstrumentor
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.resources import SERVICE_NAME, Resource
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import (
+    BatchSpanProcessor,
+    ConsoleSpanExporter,
+)
+from prometheus_client import start_http_server
+
+logger = logging.getLogger(__name__)
+
+# Guards against installing the global providers more than once per process.
+_observability_initialized = False
+
+
+def setup_observability(app: FastAPI) -> None:
+    """Install the process-wide tracer and meter providers and serve metrics.
+
+    Reads OTEL_SERVICE_NAME, OTEL_EXPORTER_PROMETHEUS_HOST and
+    OTEL_EXPORTER_PROMETHEUS_PORT from the environment. Calls made after one
+    call has completed are no-ops.
+
+    Args:
+        app: The FastAPI application. Not used yet; request tracing through
+            FastAPIInstrumentor is not wired up here.
+
+    Raises:
+        ValueError: If OTEL_EXPORTER_PROMETHEUS_PORT is not an integer.
+    """
+    global _observability_initialized
+    if _observability_initialized:
+        return
+
+    service_name = os.getenv("OTEL_SERVICE_NAME", "language-learning-api")
+    metrics_host = os.getenv("OTEL_EXPORTER_PROMETHEUS_HOST", "0.0.0.0")
+    metrics_port = int(os.getenv("OTEL_EXPORTER_PROMETHEUS_PORT", "9464"))
+
+    resource = Resource.create({SERVICE_NAME: service_name})
+
+    tracer_provider = TracerProvider(resource=resource)
+    trace.set_tracer_provider(tracer_provider)
+    tracer_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+
+    metric_reader = PrometheusMetricReader()
+    meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
+    metrics.set_meter_provider(meter_provider)
+
+    LoggingInstrumentor().instrument(set_logging_format=True)
+
+    # Expose OTel metrics for Prometheus scraping on the standard endpoint.
+    try:
+        start_http_server(port=metrics_port, addr=metrics_host)
+    except OSError:
+        # The port is typically taken when a second worker process starts;
+        # keep serving API traffic instead of failing application startup.
+        logger.warning(
+            "Metrics server could not bind %s:%s",
+            metrics_host,
+            metrics_port,
+            exc_info=True,
+        )
+
+    _observability_initialized = True
diff --git a/api/pyproject.toml b/api/pyproject.toml
index 4de1e30..61c056a 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -17,7 +17,14 @@ dependencies = [
     "google-genai>=1.0.0",
     "boto3>=1.35.0",
     "httpx>=0.28.1",
-    "deepgram-sdk>=6.1.0"
+    "deepgram-sdk>=6.1.0",
+    "opentelemetry-instrumentation-logging>=0.63b1",
+    "opentelemetry-instrumentation-fastapi>=0.63b1",
+    "opentelemetry-api>=1.42.1",
+    "opentelemetry-sdk>=1.42.1",
+    "opentelemetry-exporter-prometheus>=0.63b1",
+    "prometheus-client>=0.25.0",
+    "prometheus-fastapi-instrumentator>=7.1.0",
 ]
 
 [build-system]
diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index ba219b8..cb4df06 100644
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -59,6 +59,9 @@ services:
       STORAGE_SECRET_KEY: ${STORAGE_SECRET_KEY}
       STORAGE_BUCKET: ${STORAGE_BUCKET:-langlearn}
       TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER:-stub}
+      OTEL_SERVICE_NAME: ${OTEL_SERVICE_NAME:-language-learning-api}
+      OTEL_EXPORTER_PROMETHEUS_HOST: 0.0.0.0
+      OTEL_EXPORTER_PROMETHEUS_PORT: ${OTEL_EXPORTER_PROMETHEUS_PORT:-9464}
     depends_on:
       db:
         condition: service_healthy
@@ -72,9 +75,9 @@ services:
       args:
         PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://api:8000}
     ports:
-      - "${FRONTEND_PORT:-3000}:3000"
+      - "${FRONTEND_PORT:-3001}:3001"
     environment:
-      ORIGIN: ${ORIGIN:-http://localhost:3000}
+      ORIGIN: ${ORIGIN:-http://localhost:3001}
       PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://api:8000}
       PRIVATE_JWT_SECRET: ${JWT_SECRET}
       PRIVATE_DEEPL_API_KEY: ${DEEPL_API_KEY}
@@ -83,6 +86,71 @@ services:
       - api
     restart: unless-stopped
 
+  prometheus:
+    image: prom/prometheus:v2.54.1
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.path=/prometheus
+    volumes:
+      - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro,z
+      - prometheusdata:/prometheus
+    ports:
+      - "9090:9090"
+    depends_on:
+      - api
+    restart: unless-stopped
+
+  loki:
+    image: grafana/loki:3.1.1
+    command: -config.file=/etc/loki/loki-config.yml
+    volumes:
+      - ./monitoring/loki/loki-config.yml:/etc/loki/loki-config.yml:ro,z
+      - lokidata:/loki
+    ports:
+      - "3100:3100"
+    restart: unless-stopped
+
+  alloy:
+    image: grafana/alloy:v1.7.1
+    user: "0:0"
+    security_opt:
+      - label=disable
+    command:
+      - run
+      - --server.http.listen-addr=0.0.0.0:12345
+      - --storage.path=/var/lib/alloy/data
+      - /etc/alloy/config.alloy
+    volumes:
+      - ./monitoring/alloy/config.alloy:/etc/alloy/config.alloy:ro,z
+      - /var/run/docker.sock:/var/run/docker.sock:ro,z
+      - alloydata:/var/lib/alloy/data
+    ports:
+      - "12345:12345"
+    depends_on:
+      - loki
+    restart: unless-stopped
+
+  grafana:
+    image: grafana/grafana:11.2.0
+    volumes:
+      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro,z
+      - grafanadata:/var/lib/grafana
+    ports:
+      - "3000:3000"
+    environment:
+      GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
+      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
+      GF_USERS_ALLOW_SIGN_UP: "false"
+      GF_AUTH_ANONYMOUS_ENABLED: "false"
+    depends_on:
+      - prometheus
+      - loki
+    restart: unless-stopped
+
 volumes:
   pgdata:
   storagedata:
+  prometheusdata:
+  grafanadata:
+  lokidata:
+  alloydata:
diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml
index 115ffa6..c2b3157 100644
--- a/docker-compose-prod.yml
+++ b/docker-compose-prod.yml
@@ -41,6 +41,9 @@ services:
       BUNNY_TOKEN_AUTH_KEY: ${BUNNY_TOKEN_AUTH_KEY}
       BUNNY_STORAGE_ENDPOINT: ${BUNNY_STORAGE_ENDPOINT}
       TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER}
+      OTEL_SERVICE_NAME: ${OTEL_SERVICE_NAME:-language-learning-api}
+      OTEL_EXPORTER_PROMETHEUS_HOST: 0.0.0.0
+      OTEL_EXPORTER_PROMETHEUS_PORT: ${OTEL_EXPORTER_PROMETHEUS_PORT:-9464}
     healthcheck:
       test:
         [
@@ -68,7 +71,7 @@ services:
       args:
         PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL}
     ports:
-      - "${FRONTEND_PORT:-3000}:3000"
+      - "${FRONTEND_PORT:-3001}:3001"
     environment:
       ORIGIN: ${ORIGIN}
       PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL}
@@ -85,8 +88,94 @@ services:
           cpus: "0.5"
           memory: 256M
 
+  prometheus:
+    image: prom/prometheus:v2.54.1
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.path=/prometheus
+    volumes:
+      - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - prometheusdata:/prometheus
+    ports:
+      - "127.0.0.1:${PROMETHEUS_PORT:-9090}:9090"
+    depends_on:
+      api:
+        condition: service_healthy
+    restart: unless-stopped
+    deploy:
+      resources:
+        limits:
+          cpus: "0.5"
+          memory: 512M
+
+  loki:
+    image: grafana/loki:3.1.1
+    command: -config.file=/etc/loki/loki-config.yml
+    volumes:
+      - ./monitoring/loki/loki-config.yml:/etc/loki/loki-config.yml:ro
+      - lokidata:/loki
+    ports:
+      - "127.0.0.1:${LOKI_PORT:-3100}:3100"
+    restart: unless-stopped
+    deploy:
+      resources:
+        limits:
+          cpus: "0.5"
+          memory: 512M
+
+  alloy:
+    image: grafana/alloy:v1.7.1
+    user: "0:0"
+    security_opt:
+      - label=disable
+    command:
+      - run
+      - --server.http.listen-addr=0.0.0.0:12345
+      - --storage.path=/var/lib/alloy/data
+      - /etc/alloy/config.alloy
+    volumes:
+      - ./monitoring/alloy/config.alloy:/etc/alloy/config.alloy:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      - alloydata:/var/lib/alloy/data
+    ports:
+      - "127.0.0.1:${ALLOY_PORT:-12345}:12345"
+    depends_on:
+      - loki
+    restart: unless-stopped
+    deploy:
+      resources:
+        limits:
+          cpus: "0.5"
+          memory: 256M
+
+  grafana:
+    image: grafana/grafana:11.2.0
+    volumes:
+      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
+      - grafanadata:/var/lib/grafana
+    ports:
+      - "127.0.0.1:${GRAFANA_PORT:-3000}:3000"
+    environment:
+      GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
+      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD}
+      GF_USERS_ALLOW_SIGN_UP: "false"
+      GF_AUTH_ANONYMOUS_ENABLED: "false"
+    depends_on:
+      - prometheus
+      - loki
+    restart: unless-stopped
+    deploy:
+      resources:
+        limits:
+          cpus: "0.5"
+          memory: 512M
+
 volumes:
   pgdata:
+  prometheusdata:
+  grafanadata:
+  lokidata:
+  alloydata:
 
 networks:
   default:
diff --git a/docker-compose.yml b/docker-compose.yml
index 547e490..4f530a7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -54,6 +54,9 @@ services:
       STORAGE_SECRET_KEY: ${STORAGE_SECRET_KEY}
       STORAGE_BUCKET: ${STORAGE_BUCKET:-langlearn}
       TRANSACTIONAL_EMAIL_PROVIDER: ${TRANSACTIONAL_EMAIL_PROVIDER:-stub}
+      OTEL_SERVICE_NAME: ${OTEL_SERVICE_NAME:-language-learning-api}
+      OTEL_EXPORTER_PROMETHEUS_HOST: 0.0.0.0
+      OTEL_EXPORTER_PROMETHEUS_PORT: ${OTEL_EXPORTER_PROMETHEUS_PORT:-9464}
     depends_on:
       db:
         condition: service_healthy
@@ -67,13 +70,78 @@ services:
       args:
         PUBLIC_API_BASE_URL: ${PUBLIC_API_BASE_URL:-http://localhost:8000}
     ports:
-      - "${FRONTEND_PORT:-3000}:3000"
+      - "${FRONTEND_PORT:-3001}:3001"
     environment:
-      ORIGIN: ${ORIGIN:-http://localhost:3000}
+      ORIGIN: ${ORIGIN:-http://localhost:3001}
     depends_on:
       - api
     restart: unless-stopped
 
+  prometheus:
+    image: prom/prometheus:v2.54.1
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.path=/prometheus
+    volumes:
+      - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - prometheusdata:/prometheus
+    ports:
+      - "9090:9090"
+    depends_on:
+      - api
+    restart: unless-stopped
+
+  loki:
+    image: grafana/loki:3.1.1
+    command: -config.file=/etc/loki/loki-config.yml
+    volumes:
+      - ./monitoring/loki/loki-config.yml:/etc/loki/loki-config.yml:ro
+      - lokidata:/loki
+    ports:
+      - "3100:3100"
+    restart: unless-stopped
+
+  alloy:
+    image: grafana/alloy:v1.7.1
+    user: "0:0"
+    security_opt:
+      - label=disable
+    command:
+      - run
+      - --server.http.listen-addr=0.0.0.0:12345
+      - --storage.path=/var/lib/alloy/data
+      - /etc/alloy/config.alloy
+    volumes:
+      - ./monitoring/alloy/config.alloy:/etc/alloy/config.alloy:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      - alloydata:/var/lib/alloy/data
+    ports:
+      - "12345:12345"
+    depends_on:
+      - loki
+    restart: unless-stopped
+
+  grafana:
+    image: grafana/grafana:11.2.0
+    volumes:
+      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
+      - grafanadata:/var/lib/grafana
+    ports:
+      - "3000:3000"
+    environment:
+      GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
+      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
+      GF_USERS_ALLOW_SIGN_UP: "false"
+      GF_AUTH_ANONYMOUS_ENABLED: "false"
+    depends_on:
+      - prometheus
+      - loki
+    restart: unless-stopped
+
 volumes:
   pgdata:
   storagedata:
+  prometheusdata:
+  grafanadata:
+  lokidata:
+  alloydata:
diff --git a/frontend/Dockerfile b/frontend/Dockerfile
index f5c8e11..0f92501 100644
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -25,9 +25,9 @@ COPY --from=builder /app/build ./build
 COPY --from=builder /app/node_modules ./node_modules
 COPY --from=builder /app/package.json ./package.json
 
-EXPOSE 3000
+EXPOSE 3001
 
-ENV PORT=3000
+ENV PORT=3001
 ENV HOST=0.0.0.0
 
 CMD ["node", "build/index.js"]
diff --git a/monitoring/alloy/config.alloy b/monitoring/alloy/config.alloy
new file mode 100644
index 0000000..122a863
--- /dev/null
+++ b/monitoring/alloy/config.alloy
@@ -0,0 +1,35 @@
+discovery.docker "containers" {
+  host = "unix:///var/run/docker.sock"
+}
+
+discovery.relabel "containers" {
+  targets = discovery.docker.containers.targets
+
+  rule {
+    source_labels = ["__meta_docker_container_name"]
+    regex = "/(.*)"
+    target_label = "container"
+  }
+
+  rule {
+    source_labels = ["__meta_docker_container_label_com_docker_compose_project"]
+    target_label = "compose_project"
+  }
+
+  rule {
+    source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
+    target_label = "compose_service"
+  }
+}
+
+loki.source.docker "containers" {
+  host = "unix:///var/run/docker.sock"
+  targets = discovery.relabel.containers.output
+  forward_to = [loki.write.default.receiver]
+}
+
+loki.write "default" {
+  endpoint {
+    url = "http://loki:3100/loki/api/v1/push"
+  }
+}
diff --git a/monitoring/grafana/provisioning/datasources/datasource.yml b/monitoring/grafana/provisioning/datasources/datasource.yml
new file mode 100644
index 0000000..2c0808d
--- /dev/null
+++ b/monitoring/grafana/provisioning/datasources/datasource.yml
@@ -0,0 +1,15 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: false
+
+  - name: Loki
+    type: loki
+    access: proxy
+    url: http://loki:3100
+    editable: false
diff --git a/monitoring/loki/loki-config.yml b/monitoring/loki/loki-config.yml
new file mode 100644
index 0000000..4b70193
--- /dev/null
+++ b/monitoring/loki/loki-config.yml
@@ -0,0 +1,40 @@
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /loki
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+schema_config:
+  configs:
+    - from: 2024-01-01
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+storage_config:
+  filesystem:
+    directory: /loki/chunks
+
+limits_config:
+  volume_enabled: true
+
+compactor:
+  working_directory: /loki/compactor
+
+ruler:
+  alertmanager_url: http://localhost:9093
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
new file mode 100644
index 0000000..8e63737
--- /dev/null
+++ b/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,13 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+scrape_configs:
+  - job_name: prometheus
+    static_configs:
+      - targets: ["prometheus:9090"]
+
+  - job_name: api
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["api:9464"]