# predictor/docker-compose.swarm.yml (98 lines, 3.4 KiB, YAML)

# Compose file format 3.8: the first v3 revision that accepts
# `deploy.placement.max_replicas_per_node`, which this stack sets.
version: '3.8'
# Production Docker Swarm stack for stratoflights-predictor.
#
# Deploy: TAG=v1.0.0 docker stack deploy -c docker-compose.swarm.yml --with-registry-auth predictor
# (or import via Swarmpit; the CI pipeline deploys it through the Swarmpit API)
#
# Storage & placement (see DEPLOYMENT.md):
#   * The wind dataset (~8.9 GiB) lives on NODE-LOCAL disk — never NFS. To keep
#     the number of copies bounded, the service is pinned to nodes labelled
#     `predictor.data=true`; label at most two such nodes. Each carries one copy.
#   * Replicas are spread one-per-node by default (redundancy + load balancing);
#     scaling to multiple replicas per node is safe because they share the
#     node-local volume and coordinate downloads via a file lock (flock), so the
#     dataset is never fetched twice.
#
# The predictor is an internal backend: it has no public Traefik router. The
# Django API gateway and Prometheus reach it over the shared `stratoflights-net`
# overlay by the alias `predictor`.
services:
  # stratoflights-predictor backend. No `ports:` are published here; peers on
  # the `stratoflights-net` overlay reach the service via the `predictor` alias.
  predictor:
    # TAG is interpolated at deploy time and falls back to `latest`.
    image: git.intra.yksa.space/web/predictor:${TAG:-latest}
    networks:
      stratoflights-net:
        aliases:
          - predictor
    environment:
      # Both paths must match the bind-mount targets under `volumes` below.
      PREDICTOR_DATA_DIR: /data
      PREDICTOR_ELEVATION_DATASET: /srv/ruaumoko-dataset
      PREDICTOR_SOURCE: ${PREDICTOR_SOURCE:-gfs-0p50-3h}
      PREDICTOR_DOWNLOAD_PARALLEL: ${PREDICTOR_DOWNLOAD_PARALLEL:-16}
      PREDICTOR_UPDATE_INTERVAL: 6h
      PREDICTOR_DATASET_TTL: 48h
      # Quoted so the value stays the string "true" rather than a YAML boolean.
      PREDICTOR_METRICS_ENABLED: "true"
      # Keep in sync with the `prometheus.path` deploy label below.
      PREDICTOR_METRICS_PATH: /metrics
      PREDICTOR_LOG_LEVEL: info
    volumes:
      # Node-local storage. Provision these directories on each labelled node
      # (chown to 65532:65532 — see DEPLOYMENT.md). NOT a shared/NFS volume.
      - type: bind
        source: /srv/predictor/data
        target: /data
      # Elevation dataset is mounted read-only; the container cannot modify it.
      - type: bind
        source: /srv/predictor/elevation
        target: /srv/ruaumoko-dataset
        read_only: true
    healthcheck:
      # Exec form: runs the image's own binary directly, without a shell.
      test: ["CMD", "/predictor", "-healthcheck"]
      interval: 30s
      timeout: 5s
      retries: 3
      # Probe failures during the first 120s do not count towards `retries`.
      # NOTE(review): presumably sized for initial dataset loading — confirm.
      start_period: 120s
    logging:
      # Rotate container logs: at most 3 files of 10 MB each per task.
      driver: json-file
      options:
        max-size: "10m"
        max-file: "3"
    deploy:
      mode: replicated
      replicas: 2
      placement:
        # Hard cap on tasks of this service per node.
        # NOTE(review): a cap of 1 would guarantee one replica per node, but
        # the `start-first` update order below needs room for a second task
        # next to the old one — presumably why this is 2; confirm.
        max_replicas_per_node: 2
        # Only schedule on nodes labelled `predictor.data=true`, i.e. the nodes
        # where the bind-mount source directories above have been provisioned.
        constraints:
          - node.labels.predictor.data == true
        # No `spread` preference on purpose: every node that passes the
        # constraint carries the same `predictor.data` value, so spreading over
        # that label puts them all in one group and cannot separate replicas.
        # Swarm's scheduler already prefers the node running the fewest tasks
        # of this service, which places the two default replicas on different
        # hosts whenever both labelled nodes are available.
      update_config:
        # Roll one task at a time; start the replacement before stopping the
        # old task, and roll back automatically if the update fails.
        parallelism: 1
        delay: 15s
        order: start-first
        failure_action: rollback
      rollback_config:
        parallelism: 1
        order: stop-first
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        # Evaluate `max_attempts` over a 120s window. With no window the
        # attempt budget is assessed over an unbounded period, so three
        # unrelated failures — however far apart — would leave the replica
        # down until the next service update.
        window: 120s
      resources:
        limits:
          memory: 3072M
        reservations:
          memory: 512M
      labels:
        # Prometheus Swarm service-discovery hints (adjust to your SD relabel rules).
        # NOTE(review): assumes the predictor listens on 8080 — confirm.
        - "prometheus.scrape=true"
        - "prometheus.port=8080"
        - "prometheus.path=/metrics"
        # Let Swarmpit auto-redeploy when a new :latest (or pinned TAG) is pushed.
        - "swarmpit.service.deployment.autoredeploy=true"
networks:
  # Shared overlay also joined by the API gateway and Prometheus.
  # Create once: docker network create -d overlay --attachable stratoflights-net
  # `external: true` means this stack neither creates nor removes the network;
  # deployment fails if it does not already exist.
  stratoflights-net:
    external: true