---
# Compose file format 3.8 is required by this stack: it is the first format
# version that accepts `deploy.placement.max_replicas_per_node` (used below).
version: "3.8"
|
|
|
|
# Production Docker Swarm stack for stratoflights-predictor.
#
# Deploy: TAG=v1.0.0 docker stack deploy -c docker-compose.swarm.yml --with-registry-auth predictor
# (or import via Swarmpit; the CI pipeline deploys it through the Swarmpit API)
#
# Storage & placement (see DEPLOYMENT.md):
#   * The wind dataset (~8.9 GiB) lives on NODE-LOCAL disk — never NFS. To keep
#     the number of copies bounded, the service is pinned to nodes labelled
#     `predictor.data=true`; label at most two such nodes. Each carries one copy.
#   * Replicas are spread one-per-node by default (redundancy + load balancing);
#     scaling to multiple replicas per node is safe because they share the
#     node-local volume and coordinate downloads via a flock (no duplicate fetch).
#
# The predictor is an internal backend: it has no public Traefik router. The
# Django API gateway and Prometheus reach it over the shared `stratoflights-net`
# overlay by the alias `predictor`.
|
|
|
|
services:
  # The predictor service. It publishes no ports and declares no Traefik
  # labels; peers on the overlay reach it through the `predictor` alias.
  predictor:
    # TAG is supplied at deploy time; falls back to `latest` when unset.
    image: git.intra.yksa.space/web/predictor:${TAG:-latest}
    networks:
      stratoflights-net:
        aliases:
          - predictor
    environment:
      # Both paths below must match the bind-mount targets under `volumes`.
      PREDICTOR_DATA_DIR: /data
      PREDICTOR_ELEVATION_DATASET: /srv/ruaumoko-dataset
      PREDICTOR_SOURCE: ${PREDICTOR_SOURCE:-gfs-0p50-3h}
      PREDICTOR_DOWNLOAD_PARALLEL: ${PREDICTOR_DOWNLOAD_PARALLEL:-16}
      PREDICTOR_UPDATE_INTERVAL: 6h
      PREDICTOR_DATASET_TTL: 48h
      # Quoted so the value stays the string "true" rather than a YAML boolean.
      PREDICTOR_METRICS_ENABLED: "true"
      PREDICTOR_METRICS_PATH: /metrics
      PREDICTOR_LOG_LEVEL: info
    volumes:
      # Node-local storage. Provision these directories on each labelled node
      # (chown to 65532:65532 — see DEPLOYMENT.md). NOT a shared/NFS volume.
      - type: bind
        source: /srv/predictor/data
        target: /data
      - type: bind
        source: /srv/predictor/elevation
        target: /srv/ruaumoko-dataset
        read_only: true
    healthcheck:
      # Exec form: runs the binary's own health probe, no shell required.
      test: ["CMD", "/predictor", "-healthcheck"]
      interval: 30s
      timeout: 5s
      retries: 3
      # Probe failures during the first 120s do not count towards `retries`.
      start_period: 120s
    logging:
      driver: json-file
      options:
        # Rotation cap: at most 3 files of 10 MB per container.
        max-size: "10m"
        max-file: "3"
    deploy:
      mode: replicated
      replicas: 2
      placement:
        # Upper bound per node. Leaves headroom above one-per-node so that
        # `order: start-first` updates can start the replacement task next to
        # the one it replaces.
        max_replicas_per_node: 2
        constraints:
          # Only nodes that hold a local copy of the dataset are eligible.
          - node.labels.predictor.data == true
        # No `preferences:` entry on purpose. A spread preference balances
        # tasks across the distinct *values* of a label, and every node that
        # passes the constraint above carries the same value (`true`), so
        # `spread: node.labels.predictor.data` could never separate hosts.
        # Swarm's default scheduling already favours the eligible node running
        # the fewest tasks of this service, which is what places the two
        # default replicas on different hosts when two labelled nodes are up.
      update_config:
        parallelism: 1
        delay: 15s
        # Bring the new task up before stopping the old one.
        order: start-first
        failure_action: rollback
      rollback_config:
        parallelism: 1
        order: stop-first
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        # Without `window` the restart budget is evaluated over an unbounded
        # period: after three failures in total Swarm would stop restarting
        # the task for good. With a window, only failures inside the last
        # 120s count, so a tight crash loop still gives up while occasional
        # crashes spread over days keep being recovered.
        window: 120s
      resources:
        limits:
          memory: 3072M
        reservations:
          memory: 512M
      labels:
        # Prometheus Swarm service-discovery hints (adjust to your SD relabel rules).
        - "prometheus.scrape=true"
        - "prometheus.port=8080"
        - "prometheus.path=/metrics"
        # Let Swarmpit auto-redeploy when a new :latest (or pinned TAG) is pushed.
        - "swarmpit.service.deployment.autoredeploy=true"
|
|
networks:
  # Pre-existing overlay that this stack attaches to but does not create or
  # remove; the API gateway and Prometheus join the same network.
  # One-time setup:
  #   docker network create -d overlay --attachable stratoflights-net
  stratoflights-net:
    external: true
    name: stratoflights-net
|