feat: polish & windviz & deploy
This commit is contained in:
parent
81b8e763bd
commit
465ad00f7b
78 changed files with 20622 additions and 2154 deletions
11
internal/datasets/lock_other.go
Normal file
11
internal/datasets/lock_other.go
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
//go:build !unix
|
||||
|
||||
package datasets
|
||||
|
||||
import "context"
|
||||
|
||||
// flockExclusive is the fallback for platforms without flock: it takes no
// lock at all and always succeeds. The service targets Linux containers; this
// stub only keeps non-Unix builds compiling.
//
// Both arguments are ignored. The returned release function does nothing and
// the returned error is always nil.
func flockExclusive(_ context.Context, _ string) (func(), error) {
	noop := func() {}
	return noop, nil
}
|
||||
50
internal/datasets/lock_unix.go
Normal file
50
internal/datasets/lock_unix.go
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
//go:build unix
|
||||
|
||||
package datasets
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// lockPollInterval is how often a contended lock is retried. The lock is held
// for the duration of a dataset download (minutes), so sub-second acquisition
// latency is irrelevant.
const lockPollInterval = 150 * time.Millisecond

// flockExclusive acquires an exclusive flock on path, creating the lock file
// if needed, and blocks until it is held or ctx is cancelled.
//
// On success it returns a release function that unlocks and closes the lock
// file, together with a nil error. On failure the release function is nil,
// the lock file has already been closed, and the error is either a wrapped
// open/flock error or ctx.Err().
//
// ctx is only consulted while the lock is contended (or after an interrupted
// attempt): an uncontended lock is granted even if ctx is already done.
//
// It uses non-blocking LOCK_NB attempts in a poll loop rather than a blocking
// flock in a goroutine: the file descriptor is only ever touched by this
// goroutine, so there is no race between a pending syscall and Close on
// cancellation.
func flockExclusive(ctx context.Context, path string) (func(), error) {
	f, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, 0o644)
	if err != nil {
		return nil, fmt.Errorf("open lock file: %w", err)
	}

	for {
		err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
		switch {
		case err == nil:
			return func() {
				// Best-effort release: nothing useful can be done with an
				// unlock/close failure here, and closing the descriptor
				// gives up the lock regardless.
				_ = syscall.Flock(int(f.Fd()), syscall.LOCK_UN)
				_ = f.Close()
			}, nil

		case errors.Is(err, syscall.EINTR):
			// An interrupted attempt says nothing about whether the lock is
			// available, so retry right away instead of failing the caller.
			// The ctx check keeps the retry loop cancellable.
			if ctxErr := ctx.Err(); ctxErr != nil {
				_ = f.Close() // the ctx error is the one worth reporting
				return nil, ctxErr
			}
			continue

		case !errors.Is(err, syscall.EWOULDBLOCK):
			_ = f.Close() // the flock error is the one worth reporting
			return nil, fmt.Errorf("flock: %w", err)
		}

		// Lock is held elsewhere: wait one poll interval or give up on cancel.
		select {
		case <-ctx.Done():
			_ = f.Close() // the ctx error is the one worth reporting
			return nil, ctx.Err()
		case <-time.After(lockPollInterval):
		}
	}
}
|
||||
|
|
@ -27,16 +27,16 @@ const (
|
|||
|
||||
// JobInfo is the externally-visible snapshot of a download job.
|
||||
type JobInfo struct {
|
||||
ID string
|
||||
Source string
|
||||
Dataset DatasetID
|
||||
Status JobStatus
|
||||
StartedAt time.Time
|
||||
EndedAt *time.Time
|
||||
Err string
|
||||
Total int
|
||||
Done int
|
||||
Bytes int64
|
||||
ID string
|
||||
Source string
|
||||
Dataset DatasetID
|
||||
Status JobStatus
|
||||
StartedAt time.Time
|
||||
EndedAt *time.Time
|
||||
Err string
|
||||
Total int
|
||||
Done int
|
||||
Bytes int64
|
||||
}
|
||||
|
||||
type jobEntry struct {
|
||||
|
|
@ -75,9 +75,9 @@ func (e *jobEntry) snapshot() JobInfo {
|
|||
|
||||
type jobProgress struct{ e *jobEntry }
|
||||
|
||||
func (p jobProgress) SetTotal(n int) { p.e.total.Store(int64(n)) }
|
||||
func (p jobProgress) StepComplete() { p.e.done.Add(1) }
|
||||
func (p jobProgress) Bytes(n int64) { p.e.bytes.Add(n) }
|
||||
func (p jobProgress) SetTotal(n int) { p.e.total.Store(int64(n)) }
|
||||
func (p jobProgress) StepComplete() { p.e.done.Add(1) }
|
||||
func (p jobProgress) Bytes(n int64) { p.e.bytes.Add(n) }
|
||||
|
||||
// loadedDataset bundles a loaded WindField with its identity and coverage.
|
||||
type loadedDataset struct {
|
||||
|
|
@ -387,7 +387,7 @@ func (m *Manager) runDownload(ctx context.Context, e *jobEntry) {
|
|||
zap.String("job", e.id),
|
||||
zap.String("dataset", e.dataset.Filename()))
|
||||
|
||||
err := m.src.Download(ctx, e.dataset, m.store, jobProgress{e: e}, m.throttle)
|
||||
err := m.downloadLocked(ctx, e)
|
||||
now := time.Now().UTC()
|
||||
|
||||
e.mu.Lock()
|
||||
|
|
@ -410,6 +410,26 @@ func (m *Manager) runDownload(ctx context.Context, e *jobEntry) {
|
|||
zap.NamedError("err", err))
|
||||
}
|
||||
|
||||
// downloadLocked runs the source download while holding the storage's
|
||||
// cross-process lock, so multiple replicas sharing a node-local dataset
|
||||
// volume coordinate instead of each fetching ~9 GB. After acquiring the lock
|
||||
// it re-checks existence: if another replica committed the dataset while this
|
||||
// one waited, it skips the download and lets the caller load the committed file.
|
||||
func (m *Manager) downloadLocked(ctx context.Context, e *jobEntry) error {
|
||||
release, err := m.store.Lock(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("acquire download lock: %w", err)
|
||||
}
|
||||
defer release()
|
||||
|
||||
if m.store.Exists(e.dataset) {
|
||||
m.log.Info("dataset committed by another instance while waiting; skipping download",
|
||||
zap.String("dataset", e.dataset.Filename()))
|
||||
return nil
|
||||
}
|
||||
return m.src.Download(ctx, e.dataset, m.store, jobProgress{e: e}, m.throttle)
|
||||
}
|
||||
|
||||
func (m *Manager) completeShortCircuit(ctx context.Context, e *jobEntry) {
|
||||
_ = ctx
|
||||
defer m.inFlight.Delete(e.dataset.Filename())
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package datasets
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
|
|
@ -132,6 +133,13 @@ func (s *LocalStore) Remove(id DatasetID) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// Lock acquires the storage-wide download lock (an exclusive flock on a
|
||||
// sentinel file in the root), serialising downloads across processes that
|
||||
// share this directory.
|
||||
func (s *LocalStore) Lock(ctx context.Context) (func(), error) {
|
||||
return flockExclusive(ctx, filepath.Join(s.Root, ".download.lock"))
|
||||
}
|
||||
|
||||
// BeginWrite opens or resumes a TempHandle for id.
|
||||
func (s *LocalStore) BeginWrite(id DatasetID) (TempHandle, error) {
|
||||
man, err := LoadManifest(s.manifestPath(id))
|
||||
|
|
@ -148,8 +156,8 @@ type localHandle struct {
|
|||
closed bool
|
||||
}
|
||||
|
||||
func (h *localHandle) Path() string { return h.store.tempPath(h.id) }
|
||||
func (h *localHandle) Manifest() *Manifest { return h.manifest }
|
||||
func (h *localHandle) Path() string { return h.store.tempPath(h.id) }
|
||||
func (h *localHandle) Manifest() *Manifest { return h.manifest }
|
||||
|
||||
func (h *localHandle) Commit() error {
|
||||
if h.closed {
|
||||
|
|
|
|||
|
|
@ -1,11 +1,64 @@
|
|||
package datasets
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestLocalStoreLockSerializes(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, _ := NewLocalStore(dir, "gfs-test")
|
||||
ctx := context.Background()
|
||||
|
||||
release, err := store.Lock(ctx)
|
||||
if err != nil {
|
||||
t.Fatalf("first Lock: %v", err)
|
||||
}
|
||||
|
||||
// A second acquisition must block until the first releases.
|
||||
got := make(chan struct{})
|
||||
go func() {
|
||||
r2, err := store.Lock(ctx)
|
||||
if err == nil {
|
||||
r2()
|
||||
}
|
||||
close(got)
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-got:
|
||||
t.Fatal("second Lock acquired while first was held")
|
||||
case <-time.After(100 * time.Millisecond):
|
||||
// expected: still blocked
|
||||
}
|
||||
release()
|
||||
select {
|
||||
case <-got:
|
||||
// expected: acquired after release
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("second Lock did not acquire after release")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalStoreLockContextCancel(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, _ := NewLocalStore(dir, "gfs-test")
|
||||
|
||||
release, err := store.Lock(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("Lock: %v", err)
|
||||
}
|
||||
defer release()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
if _, err := store.Lock(ctx); err == nil {
|
||||
t.Error("expected Lock to fail on cancelled context while held elsewhere")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLocalStoreBeginWriteResume(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, err := NewLocalStore(dir, "gfs-test")
|
||||
|
|
|
|||
|
|
@ -61,6 +61,12 @@ type Storage interface {
|
|||
// BeginWrite opens (or resumes) a transactional handle for downloading
|
||||
// id's dataset.
|
||||
BeginWrite(id DatasetID) (TempHandle, error)
|
||||
|
||||
// Lock acquires an exclusive, storage-wide lock that serialises downloads
|
||||
// across every process sharing this storage (e.g. multiple replicas on a
|
||||
// node that share a dataset volume). It blocks until the lock is held or
|
||||
// ctx is cancelled. The returned function releases the lock.
|
||||
Lock(ctx context.Context) (release func(), err error)
|
||||
}
|
||||
|
||||
// TempHandle is the storage state for one in-progress download.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue