version: "3.8"

# Production Docker Swarm stack for stratoflights-predictor.
#
# Deploy:
#   TAG=v1.0.0 docker stack deploy -c docker-compose.swarm.yml --with-registry-auth predictor
#   (or import via Swarmpit; the CI pipeline deploys it through the Swarmpit API)
#
# Storage & placement (see DEPLOYMENT.md):
#   * The wind dataset (~8.9 GiB) lives on NODE-LOCAL disk — never NFS. To keep
#     the number of copies bounded, the service is pinned to nodes labelled
#     `predictor.data=true`; label at most two such nodes. Each carries one copy.
#   * Replicas are spread one-per-node by default (redundancy + load balancing);
#     scaling to multiple replicas per node is safe because they share the
#     node-local volume and coordinate downloads via an flock (no duplicate fetch).
#
# The predictor is an internal backend: it has no public Traefik router. The
# Django API gateway and Prometheus reach it over the shared `stratoflights-net`
# overlay by the alias `predictor`.

services:
  predictor:
    image: git.intra.yksa.space/web/predictor:${TAG:-latest}
    networks:
      stratoflights-net:
        aliases:
          - predictor
    environment:
      PREDICTOR_DATA_DIR: /data
      PREDICTOR_ELEVATION_DATASET: /srv/ruaumoko-dataset
      # Interpolated values are quoted so they always reach the container as
      # strings, whatever the variable expands to.
      PREDICTOR_SOURCE: "${PREDICTOR_SOURCE:-gfs-0p50-3h}"
      PREDICTOR_DOWNLOAD_PARALLEL: "${PREDICTOR_DOWNLOAD_PARALLEL:-16}"
      PREDICTOR_UPDATE_INTERVAL: 6h
      PREDICTOR_DATASET_TTL: 48h
      PREDICTOR_METRICS_ENABLED: "true"
      PREDICTOR_METRICS_PATH: /metrics
      PREDICTOR_LOG_LEVEL: info
    volumes:
      # Node-local storage. Provision these directories on each labelled node
      # (chown to 65532:65532 — see DEPLOYMENT.md). NOT a shared/NFS volume.
      - type: bind
        source: /srv/predictor/data
        target: /data
      # Elevation dataset is mounted read-only.
      - type: bind
        source: /srv/predictor/elevation
        target: /srv/ruaumoko-dataset
        read_only: true
    healthcheck:
      test: ["CMD", "/predictor", "-healthcheck"]
      interval: 30s
      timeout: 5s
      retries: 3
      # Probe failures during the first 120s do not count towards `retries`.
      start_period: 120s
    logging:
      driver: json-file
      options:
        max-size: "10m"
        max-file: "3"
    deploy:
      mode: replicated
      replicas: 2
      placement:
        # Keep this cap above 1: with `order: start-first` the replacement task
        # is started while the old one is still running, so during an update a
        # node briefly hosts two tasks of this service.
        max_replicas_per_node: 2
        constraints:
          - node.labels.predictor.data == true
        # No placement preference is set on purpose. A `spread` over
        # `node.labels.predictor.data` cannot separate replicas, because the
        # constraint above leaves every eligible node with the same label value
        # (`true`). Swarm's default scheduling already favours the eligible node
        # running the fewest tasks of this service, which is what lands the two
        # default replicas on different hosts (one per dataset copy).
      update_config:
        parallelism: 1
        delay: 15s
        order: start-first
        failure_action: rollback
      rollback_config:
        parallelism: 1
        order: stop-first
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        # Without a window the attempt budget never resets, so a replica would
        # stay down for good after its third failure since the last deploy.
        # 15m is long enough to cover three consecutive failed health-check
        # cycles (120s start_period + 3 x 30s probes each), so a replica that
        # never becomes healthy is still given up on.
        window: 15m
      resources:
        limits:
          memory: 3072M
        reservations:
          memory: 512M
      # Service-level labels (deploy.labels), not container labels.
      labels:
        # Prometheus Swarm service-discovery hints (adjust to your SD relabel rules).
        - "prometheus.scrape=true"
        - "prometheus.port=8080"
        - "prometheus.path=/metrics"
        # Let Swarmpit auto-redeploy when a new :latest (or pinned TAG) is pushed.
        - "swarmpit.service.deployment.autoredeploy=true"

networks:
  # Shared overlay also joined by the API gateway and Prometheus.
  # Create once: docker network create -d overlay --attachable stratoflights-net
  stratoflights-net:
    external: true