feat: polish & windviz & deploy

This commit is contained in:
Anatoly Antonov 2026-05-30 06:29:39 +09:00
parent 81b8e763bd
commit 465ad00f7b
78 changed files with 20622 additions and 2154 deletions

View file

@ -0,0 +1,11 @@
//go:build !unix
package datasets
import "context"
// flockExclusive is the fallback for platforms without flock. Both arguments
// are ignored: it never fails and never blocks, and the release function it
// returns does nothing. The service targets Linux containers; this stub only
// keeps non-Unix builds compiling.
func flockExclusive(_ context.Context, _ string) (func(), error) {
	noop := func() {}
	return noop, nil
}

View file

@ -0,0 +1,50 @@
//go:build unix
package datasets
import (
"context"
"errors"
"fmt"
"os"
"syscall"
"time"
)
// lockPollInterval is how often a contended lock is retried. The lock is held
// for the duration of a dataset download (minutes), so sub-second acquisition
// latency is irrelevant.
const lockPollInterval = 150 * time.Millisecond

// flockExclusive acquires an exclusive flock on path, creating the lock file
// if needed, and blocks until it is held or ctx is cancelled.
//
// It uses non-blocking LOCK_NB attempts in a poll loop rather than a blocking
// flock in a goroutine: the file descriptor is only ever touched by this
// goroutine, so there is no race between a pending syscall and Close on
// cancellation.
//
// On success the returned function unlocks and closes the lock file. On
// failure the lock file is closed and the error is one of: the wrapped open
// error, the wrapped unexpected flock error, or ctx.Err(). A lock attempt is
// always made before ctx is consulted, so an uncontended lock is acquired even
// when ctx is already cancelled.
func flockExclusive(ctx context.Context, path string) (func(), error) {
	f, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, 0o644)
	if err != nil {
		return nil, fmt.Errorf("open lock file: %w", err)
	}

	// A single timer is reused for every retry instead of allocating one per
	// poll. It is only Reset after its channel has been received from, which
	// is safe on every Go version.
	var retry *time.Timer
	defer func() {
		if retry != nil {
			retry.Stop()
		}
	}()

	for {
		err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
		if err == nil {
			return func() {
				// Best-effort: Close drops the lock anyway, and there is
				// nothing a caller could do with either error.
				_ = syscall.Flock(int(f.Fd()), syscall.LOCK_UN)
				_ = f.Close()
			}, nil
		}

		// A signal interrupting the syscall is not a locking failure; try
		// again straight away unless the caller has already given up.
		if errors.Is(err, syscall.EINTR) {
			if ctxErr := ctx.Err(); ctxErr != nil {
				_ = f.Close()
				return nil, ctxErr
			}
			continue
		}

		if !errors.Is(err, syscall.EWOULDBLOCK) {
			_ = f.Close() // the flock error is the one worth reporting
			return nil, fmt.Errorf("flock: %w", err)
		}

		// Lock is held elsewhere: wait one poll interval or until cancelled.
		if retry == nil {
			retry = time.NewTimer(lockPollInterval)
		} else {
			retry.Reset(lockPollInterval)
		}
		select {
		case <-ctx.Done():
			_ = f.Close() // the cancellation error is the one worth reporting
			return nil, ctx.Err()
		case <-retry.C:
		}
	}
}

View file

@ -27,16 +27,16 @@ const (
// JobInfo is the externally-visible snapshot of a download job.
type JobInfo struct {
ID string
Source string
Dataset DatasetID
Status JobStatus
StartedAt time.Time
EndedAt *time.Time
Err string
Total int
Done int
Bytes int64
ID string
Source string
Dataset DatasetID
Status JobStatus
StartedAt time.Time
EndedAt *time.Time
Err string
Total int
Done int
Bytes int64
}
type jobEntry struct {
@ -75,9 +75,9 @@ func (e *jobEntry) snapshot() JobInfo {
type jobProgress struct{ e *jobEntry }
func (p jobProgress) SetTotal(n int) { p.e.total.Store(int64(n)) }
func (p jobProgress) StepComplete() { p.e.done.Add(1) }
func (p jobProgress) Bytes(n int64) { p.e.bytes.Add(n) }
func (p jobProgress) SetTotal(n int) { p.e.total.Store(int64(n)) }
func (p jobProgress) StepComplete() { p.e.done.Add(1) }
func (p jobProgress) Bytes(n int64) { p.e.bytes.Add(n) }
// loadedDataset bundles a loaded WindField with its identity and coverage.
type loadedDataset struct {
@ -387,7 +387,7 @@ func (m *Manager) runDownload(ctx context.Context, e *jobEntry) {
zap.String("job", e.id),
zap.String("dataset", e.dataset.Filename()))
err := m.src.Download(ctx, e.dataset, m.store, jobProgress{e: e}, m.throttle)
err := m.downloadLocked(ctx, e)
now := time.Now().UTC()
e.mu.Lock()
@ -410,6 +410,26 @@ func (m *Manager) runDownload(ctx context.Context, e *jobEntry) {
zap.NamedError("err", err))
}
// downloadLocked performs the source download under the storage's
// cross-process lock, so replicas sharing a node-local dataset volume take
// turns rather than each fetching ~9 GB. Once the lock is held it checks
// again whether the dataset exists: if another replica committed it while
// this one was waiting, nothing is downloaded and nil is returned so the
// caller can load the committed file. The lock is released on return.
func (m *Manager) downloadLocked(ctx context.Context, e *jobEntry) error {
	unlock, lockErr := m.store.Lock(ctx)
	if lockErr != nil {
		return fmt.Errorf("acquire download lock: %w", lockErr)
	}
	defer unlock()

	if !m.store.Exists(e.dataset) {
		progress := jobProgress{e: e}
		return m.src.Download(ctx, e.dataset, m.store, progress, m.throttle)
	}

	m.log.Info("dataset committed by another instance while waiting; skipping download",
		zap.String("dataset", e.dataset.Filename()))
	return nil
}
func (m *Manager) completeShortCircuit(ctx context.Context, e *jobEntry) {
_ = ctx
defer m.inFlight.Delete(e.dataset.Filename())

View file

@ -1,6 +1,7 @@
package datasets
import (
"context"
"errors"
"fmt"
"os"
@ -132,6 +133,13 @@ func (s *LocalStore) Remove(id DatasetID) error {
return nil
}
// Lock takes the storage-wide download lock: an exclusive flock on the
// ".download.lock" sentinel file directly under the store root. Processes
// sharing this directory therefore download one at a time. The returned
// function releases the lock.
func (s *LocalStore) Lock(ctx context.Context) (func(), error) {
	sentinel := filepath.Join(s.Root, ".download.lock")
	return flockExclusive(ctx, sentinel)
}
// BeginWrite opens or resumes a TempHandle for id.
func (s *LocalStore) BeginWrite(id DatasetID) (TempHandle, error) {
man, err := LoadManifest(s.manifestPath(id))
@ -148,8 +156,8 @@ type localHandle struct {
closed bool
}
func (h *localHandle) Path() string { return h.store.tempPath(h.id) }
func (h *localHandle) Manifest() *Manifest { return h.manifest }
func (h *localHandle) Path() string { return h.store.tempPath(h.id) }
func (h *localHandle) Manifest() *Manifest { return h.manifest }
func (h *localHandle) Commit() error {
if h.closed {

View file

@ -1,11 +1,64 @@
package datasets
import (
"context"
"os"
"testing"
"time"
)
// TestLocalStoreLockSerializes verifies that a second Lock on the same store
// does not return while the first is held, and that it succeeds once the
// first is released.
func TestLocalStoreLockSerializes(t *testing.T) {
	dir := t.TempDir()
	store, err := NewLocalStore(dir, "gfs-test")
	if err != nil {
		t.Fatalf("NewLocalStore: %v", err)
	}

	// Cancelled on exit so the contending goroutine stops waiting if the test
	// fails before the first lock is released.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	release, err := store.Lock(ctx)
	if err != nil {
		t.Fatalf("first Lock: %v", err)
	}

	// A second acquisition must block until the first releases. Its outcome
	// is reported through the channel so that a failed Lock cannot be mistaken
	// for a successful acquisition; the buffer lets the goroutine finish even
	// if nobody is receiving any more.
	result := make(chan error, 1)
	go func() {
		r2, err := store.Lock(ctx)
		if err == nil {
			r2()
		}
		result <- err
	}()

	select {
	case err := <-result:
		release()
		t.Fatalf("second Lock returned while first was held (err = %v)", err)
	case <-time.After(100 * time.Millisecond):
		// expected: still blocked
	}

	release()
	select {
	case err := <-result:
		if err != nil {
			t.Fatalf("second Lock after release: %v", err)
		}
		// expected: acquired after release
	case <-time.After(2 * time.Second):
		t.Fatal("second Lock did not acquire after release")
	}
}
// TestLocalStoreLockContextCancel verifies that Lock gives up with an error
// when its context is already cancelled and the lock is held elsewhere.
func TestLocalStoreLockContextCancel(t *testing.T) {
	dir := t.TempDir()
	store, err := NewLocalStore(dir, "gfs-test")
	if err != nil {
		t.Fatalf("NewLocalStore: %v", err)
	}

	release, err := store.Lock(context.Background())
	if err != nil {
		t.Fatalf("Lock: %v", err)
	}
	defer release()

	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	second, err := store.Lock(ctx)
	if err == nil {
		// Do not leak the lock that should never have been granted.
		second()
		t.Error("expected Lock to fail on cancelled context while held elsewhere")
	}
}
func TestLocalStoreBeginWriteResume(t *testing.T) {
dir := t.TempDir()
store, err := NewLocalStore(dir, "gfs-test")

View file

@ -61,6 +61,12 @@ type Storage interface {
// BeginWrite opens (or resumes) a transactional handle for downloading
// id's dataset.
BeginWrite(id DatasetID) (TempHandle, error)
// Lock acquires an exclusive, storage-wide lock that serialises downloads
// across every process sharing this storage (e.g. multiple replicas on a
// node that share a dataset volume). It blocks until the lock is held or
// ctx is cancelled. The returned function releases the lock.
Lock(ctx context.Context) (release func(), err error)
}
// TempHandle is the storage state for one in-progress download.