feat: polish & windviz & deploy
This commit is contained in:
parent
81b8e763bd
commit
465ad00f7b
78 changed files with 20622 additions and 2154 deletions
98
docker-compose.swarm.yml
Normal file
98
docker-compose.swarm.yml
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
# Compose file format 3.8: the earliest v3 format that accepts
# `deploy.placement.max_replicas_per_node`, which this stack relies on.
version: "3.8"
|
||||
|
||||
# Production Docker Swarm stack for stratoflights-predictor.
|
||||
#
|
||||
# Deploy: TAG=v1.0.0 docker stack deploy -c docker-compose.swarm.yml --with-registry-auth predictor
|
||||
# (or import via Swarmpit; the CI pipeline deploys it through the Swarmpit API)
|
||||
#
|
||||
# Storage & placement (see DEPLOYMENT.md):
|
||||
# * The wind dataset (~8.9 GiB) lives on NODE-LOCAL disk — never NFS. To keep
|
||||
# the number of copies bounded, the service is pinned to nodes labelled
|
||||
# `predictor.data=true`; label at most two such nodes. Each carries one copy.
|
||||
# * Replicas are spread one-per-node by default (redundancy + load balancing);
|
||||
# scaling to multiple replicas per node is safe because they share the
|
||||
# node-local volume and coordinate downloads via a file lock (flock), so nothing is fetched twice.
|
||||
#
|
||||
# The predictor is an internal backend: it has no public Traefik router. The
|
||||
# Django API gateway and Prometheus reach it over the shared `stratoflights-net`
|
||||
# overlay by the alias `predictor`.
|
||||
|
||||
services:
  # Internal backend: attached only to the shared overlay network, with no
  # published ports. Other services reach it by the alias `predictor`.
  predictor:
    # TAG is supplied at deploy time; falls back to `latest` when unset.
    image: git.intra.yksa.space/web/predictor:${TAG:-latest}
    networks:
      stratoflights-net:
        aliases:
          - predictor
    environment:
      # In-container paths; they must match the bind-mount targets below.
      PREDICTOR_DATA_DIR: /data
      PREDICTOR_ELEVATION_DATASET: /srv/ruaumoko-dataset
      # Overridable from the deploying shell / CI environment.
      PREDICTOR_SOURCE: ${PREDICTOR_SOURCE:-gfs-0p50-3h}
      PREDICTOR_DOWNLOAD_PARALLEL: ${PREDICTOR_DOWNLOAD_PARALLEL:-16}
      PREDICTOR_UPDATE_INTERVAL: 6h
      PREDICTOR_DATASET_TTL: 48h
      # Quoted so the value stays the string "true" rather than a YAML boolean.
      PREDICTOR_METRICS_ENABLED: "true"
      PREDICTOR_METRICS_PATH: /metrics
      PREDICTOR_LOG_LEVEL: info
    volumes:
      # Node-local storage. Provision these directories on each labelled node
      # (chown to 65532:65532 — see DEPLOYMENT.md). NOT a shared/NFS volume.
      - type: bind
        source: /srv/predictor/data
        target: /data
      # Elevation dataset is mounted read-only.
      - type: bind
        source: /srv/predictor/elevation
        target: /srv/ruaumoko-dataset
        read_only: true
    healthcheck:
      # The service binary doubles as its own health probe.
      test: ["CMD", "/predictor", "-healthcheck"]
      interval: 30s
      timeout: 5s
      retries: 3
      # Probe failures during the first 120s do not count towards `retries`.
      start_period: 120s
    logging:
      # Rotate container logs: at most 3 files of 10 MB each.
      driver: json-file
      options:
        max-size: "10m"
        max-file: "3"
    deploy:
      mode: replicated
      replicas: 2
      placement:
        # Upper bound only: up to two tasks may share one labelled node.
        max_replicas_per_node: 2
        constraints:
          # Schedule only on nodes that hold a local copy of the datasets.
          - node.labels.predictor.data == true
        # No `preferences:` entry on purpose. A `spread` preference groups
        # nodes by the VALUE of a label, and the constraint above already
        # limits scheduling to nodes where `predictor.data` is `true`, so a
        # spread over that label sees a single group and separates nothing.
        # Swarm's default scheduler already favours the eligible node running
        # the fewest tasks of this service, so with two labelled nodes the two
        # default replicas land on different hosts (best effort).
        # NOTE(review): to hard-enforce one replica per host, lower
        # max_replicas_per_node to 1 — confirm first that this still works
        # with `order: start-first` updates below.
      update_config:
        # Roll one task at a time; start the replacement before stopping the
        # old task, and roll back automatically if the update fails.
        parallelism: 1
        delay: 15s
        order: start-first
        failure_action: rollback
      rollback_config:
        parallelism: 1
        order: stop-first
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
      resources:
        limits:
          memory: 3072M
        reservations:
          memory: 512M
      labels:
        # Prometheus Swarm service-discovery hints (adjust to your SD relabel rules).
        - "prometheus.scrape=true"
        - "prometheus.port=8080"
        - "prometheus.path=/metrics"
        # Let Swarmpit auto-redeploy when a new :latest (or pinned TAG) is pushed.
        - "swarmpit.service.deployment.autoredeploy=true"
|
||||
|
||||
networks:
  # Pre-existing overlay that the API gateway and Prometheus also join.
  # Declared external, so it must exist before the stack is deployed.
  # One-time setup: docker network create -d overlay --attachable stratoflights-net
  stratoflights-net:
    external: true
|
||||
Loading…
Add table
Add a link
Reference in a new issue