feat: refactor

2026-03-28 03:07:13 +09:00 · 2026-03-28 03:07:13 +09:00 · 51bbf3c579
commit 51bbf3c579
parent 82ef1cb3b8
44 changed files with 8589 additions and 0 deletions
--- a/internal/downloader/config.go
+++ b/internal/downloader/config.go
@ -0,0 +1,58 @@
+package downloader
+
+import (
+	"os"
+	"strconv"
+	"time"
+)
+
+// Config holds downloader configuration, loaded from environment variables.
+//
+// LoadConfig fills each field from a PREDICTOR_* environment variable and keeps
+// the value from DefaultConfig when the variable is unset or unusable.
+type Config struct {
+	// DataDir is the directory for storing dataset files and temporary GRIB data.
+	// Set via PREDICTOR_DATA_DIR; defaults to "/tmp/predictor-data".
+	DataDir string
+
+	// Parallel is the maximum number of concurrent GRIB downloads.
+	// Set via PREDICTOR_DOWNLOAD_PARALLEL (positive integer); defaults to 8.
+	Parallel int
+
+	// UpdateInterval is how often the scheduler checks for new forecast data.
+	// Set via PREDICTOR_UPDATE_INTERVAL (time.ParseDuration syntax); defaults to 6h.
+	UpdateInterval time.Duration
+
+	// DatasetTTL is how long a dataset is considered fresh before a new one is needed.
+	// Set via PREDICTOR_DATASET_TTL (time.ParseDuration syntax); defaults to 48h.
+	DatasetTTL time.Duration
+}
+
+// DefaultConfig builds a fresh Config carrying the built-in settings that
+// apply when no environment override is present. Each call returns a new
+// value, so callers may modify the result freely.
+func DefaultConfig() *Config {
+	cfg := new(Config)
+	cfg.DataDir = "/tmp/predictor-data"
+	cfg.Parallel = 8
+	cfg.UpdateInterval = 6 * time.Hour
+	cfg.DatasetTTL = 48 * time.Hour
+	return cfg
+}
+
+// LoadConfig loads configuration from environment variables, falling back to defaults.
+//
+// Recognized variables:
+//   - PREDICTOR_DATA_DIR: dataset directory (any non-empty string).
+//   - PREDICTOR_DOWNLOAD_PARALLEL: positive integer.
+//   - PREDICTOR_UPDATE_INTERVAL: positive duration in time.ParseDuration syntax.
+//   - PREDICTOR_DATASET_TTL: positive duration in time.ParseDuration syntax.
+//
+// A variable that is unset, empty, malformed, or not positive is ignored and
+// the corresponding value from DefaultConfig is kept.
+func LoadConfig() *Config {
+	cfg := DefaultConfig()
+
+	if v := os.Getenv("PREDICTOR_DATA_DIR"); v != "" {
+		cfg.DataDir = v
+	}
+	if v := os.Getenv("PREDICTOR_DOWNLOAD_PARALLEL"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			cfg.Parallel = n
+		}
+	}
+	// Durations get the same "must be positive" guard as Parallel: a zero or
+	// negative interval/TTL is treated as a configuration mistake, not a setting.
+	if v := os.Getenv("PREDICTOR_UPDATE_INTERVAL"); v != "" {
+		if d, err := time.ParseDuration(v); err == nil && d > 0 {
+			cfg.UpdateInterval = d
+		}
+	}
+	if v := os.Getenv("PREDICTOR_DATASET_TTL"); v != "" {
+		if d, err := time.ParseDuration(v); err == nil && d > 0 {
+			cfg.DatasetTTL = d
+		}
+	}
+
+	return cfg
+}
--- a/internal/downloader/downloader.go
+++ b/internal/downloader/downloader.go
@ -0,0 +1,380 @@
+package downloader
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"math"
+	"net/http"
+	"os"
+	"path/filepath"
+	"sync/atomic"
+	"time"
+
+	"predictor-refactored/internal/dataset"
+
+	"github.com/nilsmagnus/grib/griblib"
+	"go.uber.org/zap"
+	"golang.org/x/sync/errgroup"
+)
+
+// Downloader handles fetching GFS forecast data from S3 and assembling dataset files.
+// Construct one with NewDownloader.
+type Downloader struct {
+	cfg    *Config      // download settings (data directory, concurrency limit)
+	client *http.Client // HTTP client shared by every request this downloader makes
+	log    *zap.Logger  // structured logger for progress and warnings
+}
+
+// NewDownloader wires a Downloader to the given configuration and logger.
+// The downloader gets its own HTTP client whose overall per-request timeout
+// is two minutes.
+func NewDownloader(cfg *Config, log *zap.Logger) *Downloader {
+	httpClient := &http.Client{Timeout: 2 * time.Minute}
+
+	dl := new(Downloader)
+	dl.cfg = cfg
+	dl.client = httpClient
+	dl.log = log
+	return dl
+}
+
+// neededVariables is the set of GRIB variable names we need.
+// Its keys are compared against the variable field of parsed .idx entries
+// (see FilterIdx); any variable not listed here is never downloaded.
+var neededVariables = map[string]bool{
+	"HGT":  true,
+	"UGRD": true,
+	"VGRD": true,
+}
+
+// FindLatestRun finds the most recent available GFS model run on S3.
+//
+// Starting from the current 6-hourly cycle (UTC) it walks back through up to
+// eight cycles. For each one it issues a HEAD request for the .idx file of the
+// run's last forecast step (dataset.MaxHour); the first cycle answering
+// 200 OK is returned.
+//
+// If ctx is cancelled or its deadline passes, the search stops and ctx's error
+// is returned instead of probing the remaining cycles and reporting that no
+// forecast exists. If none of the eight cycles is available, a descriptive
+// error is returned.
+func (d *Downloader) FindLatestRun(ctx context.Context) (time.Time, error) {
+	now := time.Now().UTC()
+	hour := now.Hour() - (now.Hour() % 6)
+	current := time.Date(now.Year(), now.Month(), now.Day(), hour, 0, 0, 0, time.UTC)
+
+	for i := 0; i < 8; i, current = i+1, current.Add(-6*time.Hour) {
+		// Every further probe would fail immediately once ctx is done.
+		if err := ctx.Err(); err != nil {
+			return time.Time{}, err
+		}
+
+		date := current.Format("20060102")
+		url := dataset.GribURL(date, current.Hour(), dataset.MaxHour) + ".idx"
+
+		req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil)
+		if err != nil {
+			continue
+		}
+
+		resp, err := d.client.Do(req)
+		if err != nil {
+			continue
+		}
+		resp.Body.Close()
+
+		if resp.StatusCode == http.StatusOK {
+			d.log.Info("found latest model run",
+				zap.Time("run", current),
+				zap.String("verified_url", url))
+			return current, nil
+		}
+	}
+
+	// The final probe may have failed because ctx ended; report that cause
+	// rather than claiming the data is missing.
+	if err := ctx.Err(); err != nil {
+		return time.Time{}, err
+	}
+	return time.Time{}, fmt.Errorf("no recent GFS forecast found (checked 8 runs)")
+}
+
+// Download downloads a complete forecast and assembles a dataset file.
+// Returns the path to the completed dataset file.
+func (d *Downloader) Download(ctx context.Context, run time.Time) (string, error) {
+	date := run.Format("20060102")
+	runHour := run.Hour()
+
+	finalPath := filepath.Join(d.cfg.DataDir, run.Format("2006010215"))
+	tempPath := finalPath + ".downloading"
+
+	// Check if final dataset already exists
+	if info, err := os.Stat(finalPath); err == nil && info.Size() == dataset.DatasetSize {
+		d.log.Info("dataset already exists", zap.String("path", finalPath))
+		return finalPath, nil
+	}
+
+	d.log.Info("starting dataset download",
+		zap.Time("run", run),
+		zap.String("temp_path", tempPath))
+
+	// Create the dataset file
+	ds, err := dataset.Create(tempPath)
+	if err != nil {
+		return "", fmt.Errorf("create dataset: %w", err)
+	}
+	defer ds.Close()
+
+	steps := dataset.Hours()
+	totalSteps := len(steps) * 2 // pgrb2 + pgrb2b per step
+	completed := 0
+
+	// Process each forecast step with bounded concurrency
+	g, ctx := errgroup.WithContext(ctx)
+	sem := make(chan struct{}, d.cfg.Parallel)
+
+	for _, step := range steps {
+		step := step
+		hourIdx := dataset.HourIndex(step)
+		if hourIdx < 0 {
+			continue
+		}
+
+		// Download pgrb2 (level set A)
+		sem <- struct{}{}
+		g.Go(func() error {
+			defer func() { <-sem }()
+			url := dataset.GribURL(date, runHour, step)
+			err := d.DownloadAndBlit(ctx, ds, url, hourIdx, dataset.LevelSetA)
+			if err != nil {
+				return fmt.Errorf("step %d pgrb2: %w", step, err)
+			}
+			completed++
+			d.log.Debug("step complete",
+				zap.Int("step", step),
+				zap.String("set", "pgrb2"),
+				zap.Int("progress", completed),
+				zap.Int("total", totalSteps))
+			return nil
+		})
+
+		// Download pgrb2b (level set B)
+		sem <- struct{}{}
+		g.Go(func() error {
+			defer func() { <-sem }()
+			url := dataset.GribURLB(date, runHour, step)
+			err := d.DownloadAndBlit(ctx, ds, url, hourIdx, dataset.LevelSetB)
+			if err != nil {
+				return fmt.Errorf("step %d pgrb2b: %w", step, err)
+			}
+			completed++
+			d.log.Debug("step complete",
+				zap.Int("step", step),
+				zap.String("set", "pgrb2b"),
+				zap.Int("progress", completed),
+				zap.Int("total", totalSteps))
+			return nil
+		})
+	}
+
+	if err := g.Wait(); err != nil {
+		os.Remove(tempPath)
+		return "", err
+	}
+
+	// Flush to disk
+	if err := ds.Flush(); err != nil {
+		os.Remove(tempPath)
+		return "", fmt.Errorf("flush dataset: %w", err)
+	}
+
+	// Close before rename
+	ds.Close()
+
+	// Atomic rename
+	if err := os.Rename(tempPath, finalPath); err != nil {
+		os.Remove(tempPath)
+		return "", fmt.Errorf("rename dataset: %w", err)
+	}
+
+	d.log.Info("dataset download complete", zap.String("path", finalPath))
+	return finalPath, nil
+}
+
+// DownloadAndBlit fetches the needed fields of one GRIB file and copies them
+// into ds at forecast-hour slot hourIdx.
+//
+// baseURL is the GRIB file; its inventory is expected at baseURL + ".idx".
+// Only messages for neededVariables on pressure levels belonging to levelSet
+// are downloaded. When the inventory lists none, a warning is logged and nil
+// is returned. A message that cannot be written into the dataset is logged
+// and skipped; it does not fail the call.
+func (d *Downloader) DownloadAndBlit(ctx context.Context, ds *dataset.File, baseURL string, hourIdx int, levelSet dataset.LevelSet) error {
+	// Fetch the inventory describing every message in the GRIB file.
+	indexURL := baseURL + ".idx"
+	rawIndex, err := d.httpGet(ctx, indexURL)
+	if err != nil {
+		return fmt.Errorf("download idx: %w", err)
+	}
+
+	// Narrow the inventory: wanted variables first, then this file's levels.
+	allEntries := ParseIdx(rawIndex)
+	wanted := FilterIdx(allEntries, neededVariables)
+
+	var selected []IdxEntry
+	for i := range wanted {
+		set, known := dataset.PressureLevelSet(wanted[i].LevelMB)
+		if known && set == levelSet {
+			selected = append(selected, wanted[i])
+		}
+	}
+
+	if len(selected) == 0 {
+		d.log.Warn("no relevant entries found in idx",
+			zap.String("url", indexURL),
+			zap.Int("total_entries", len(allEntries)),
+			zap.Int("filtered", len(wanted)))
+		return nil
+	}
+
+	// Pull just the selected messages into a scratch file.
+	gribPath, err := d.downloadRangesToTempFile(ctx, baseURL, EntriesToRanges(selected))
+	if err != nil {
+		return fmt.Errorf("download ranges: %w", err)
+	}
+	defer os.Remove(gribPath)
+
+	// Decode the scratch file; the handle is released before the error check
+	// so it is closed on both outcomes.
+	gribFile, err := os.Open(gribPath)
+	if err != nil {
+		return fmt.Errorf("open temp grib: %w", err)
+	}
+	messages, readErr := griblib.ReadMessages(gribFile)
+	gribFile.Close()
+	if readErr != nil {
+		return fmt.Errorf("read grib messages: %w", readErr)
+	}
+
+	// Copy every usable message into its (hour, level, variable) slot.
+	for _, m := range messages {
+		if m.Section4.ProductDefinitionTemplateNumber != 0 {
+			continue
+		}
+		tmpl := m.Section4.ProductDefinitionTemplate
+
+		varIdx := dataset.VariableIndex(int(tmpl.ParameterCategory), int(tmpl.ParameterNumber))
+		if varIdx < 0 {
+			continue
+		}
+
+		// Surface type 100 is an isobaric surface.
+		if tmpl.FirstSurface.Type != 100 {
+			continue
+		}
+
+		// The surface value is divided by 100 to obtain millibars.
+		pressureMB := int(math.Round(float64(tmpl.FirstSurface.Value) / 100.0))
+		levelIdx := dataset.PressureIndex(pressureMB)
+		if levelIdx < 0 {
+			continue
+		}
+
+		if blitErr := ds.BlitGribData(hourIdx, levelIdx, varIdx, m.Data()); blitErr != nil {
+			d.log.Warn("blit failed",
+				zap.Int("var", varIdx),
+				zap.Int("level_mb", pressureMB),
+				zap.Error(blitErr))
+		}
+	}
+
+	return nil
+}
+
+// downloadRangesToTempFile fetches each byte range of baseURL in order and
+// appends the bytes to a new "grib-*.tmp" file inside cfg.DataDir, producing
+// a file of back-to-back GRIB messages.
+//
+// It returns the path of the closed temp file; the caller is responsible for
+// deleting it. On any failure the temp file is removed and an error returned.
+func (d *Downloader) downloadRangesToTempFile(ctx context.Context, baseURL string, ranges []ByteRange) (string, error) {
+	out, err := os.CreateTemp(d.cfg.DataDir, "grib-*.tmp")
+	if err != nil {
+		return "", fmt.Errorf("create temp file: %w", err)
+	}
+	path := out.Name()
+
+	// abandon closes and deletes the partial file, then hands back the error.
+	abandon := func(cause error) (string, error) {
+		out.Close()
+		os.Remove(path)
+		return "", cause
+	}
+
+	for i := range ranges {
+		first, last := ranges[i].Start, ranges[i].End
+
+		chunk, err := d.httpGetRange(ctx, baseURL, first, last)
+		if err != nil {
+			return abandon(fmt.Errorf("download range %d-%d: %w", first, last, err))
+		}
+		if _, err := out.Write(chunk); err != nil {
+			return abandon(fmt.Errorf("write temp: %w", err))
+		}
+	}
+
+	if closeErr := out.Close(); closeErr != nil {
+		os.Remove(path)
+		return "", closeErr
+	}
+	return path, nil
+}
+
+// httpGet downloads a URL and returns the body bytes.
+//
+// Only a 200 OK reply is accepted. Transient failures are retried as
+// described on fetchWithRetry; a cancelled ctx ends the call with ctx's error.
+func (d *Downloader) httpGet(ctx context.Context, url string) ([]byte, error) {
+	_, body, err := d.fetchWithRetry(ctx, url, "", url, func(status int) bool {
+		return status == http.StatusOK
+	})
+	return body, err
+}
+
+// httpGetRange downloads a byte range from a URL.
+//
+// start and end are inclusive byte offsets. A 206 Partial Content reply is
+// returned as is. A 200 OK reply means the server ignored the Range header and
+// sent the whole object, so the requested slice is cut out of it; if the
+// object is too short to contain the range an error is returned. Transient
+// failures are retried as described on fetchWithRetry.
+func (d *Downloader) httpGetRange(ctx context.Context, url string, start, end int64) ([]byte, error) {
+	what := fmt.Sprintf("range %d-%d of %s", start, end, url)
+	header := fmt.Sprintf("bytes=%d-%d", start, end)
+
+	status, body, err := d.fetchWithRetry(ctx, url, header, what, func(status int) bool {
+		return status == http.StatusPartialContent || status == http.StatusOK
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	if status == http.StatusOK {
+		// Full-object reply: returning it unchanged would hand the caller
+		// far more than the single message it asked for.
+		if start < 0 || end < start || int64(len(body)) <= end {
+			return nil, fmt.Errorf("HTTP 200 with %d bytes does not cover %s", len(body), what)
+		}
+		return body[start : end+1], nil
+	}
+	return body, nil
+}
+
+// fetchWithRetry performs a GET on url, optionally with a Range header, and
+// returns the reply's status code and body once accept approves the status.
+//
+// Up to three attempts are made, pausing 2s and then 4s between them.
+// Transport errors, body read errors, and replies with status 408, 429 or 5xx
+// are retried. Any other unaccepted status is treated as permanent and
+// reported immediately, since repeating the request cannot change the answer.
+// what names the resource in error messages. If ctx ends while waiting
+// between attempts, ctx's error is returned.
+func (d *Downloader) fetchWithRetry(ctx context.Context, url, rangeHeader, what string, accept func(status int) bool) (int, []byte, error) {
+	const maxAttempts = 3
+
+	var lastErr error
+	for attempt := 0; attempt < maxAttempts; attempt++ {
+		if attempt > 0 {
+			select {
+			case <-time.After(time.Duration(attempt*2) * time.Second):
+			case <-ctx.Done():
+				return 0, nil, ctx.Err()
+			}
+		}
+
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+		if err != nil {
+			return 0, nil, err
+		}
+		if rangeHeader != "" {
+			req.Header.Set("Range", rangeHeader)
+		}
+
+		resp, err := d.client.Do(req)
+		if err != nil {
+			lastErr = err
+			continue
+		}
+
+		// Read before inspecting the status so the body is always consumed
+		// and closed, whatever the outcome.
+		body, readErr := io.ReadAll(resp.Body)
+		resp.Body.Close()
+
+		status := resp.StatusCode
+		if !accept(status) {
+			lastErr = fmt.Errorf("HTTP %d for %s", status, what)
+			transient := status >= http.StatusInternalServerError ||
+				status == http.StatusTooManyRequests ||
+				status == http.StatusRequestTimeout
+			if !transient {
+				return 0, nil, lastErr
+			}
+			continue
+		}
+		if readErr != nil {
+			lastErr = readErr
+			continue
+		}
+
+		return status, body, nil
+	}
+
+	return 0, nil, fmt.Errorf("after %d attempts: %w", maxAttempts, lastErr)
+}
--- a/internal/downloader/idx.go
+++ b/internal/downloader/idx.go
@ -0,0 +1,157 @@
+package downloader
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+)
+
+// IdxEntry represents a single parsed line from a GRIB .idx file.
+// Example line: "15:1207405:d=2024010100:HGT:1000 mb:0 hour fcst:"
+//
+// The colon-separated fields map to Index (1st), Offset (2nd), Variable (4th),
+// LevelMB (5th) and Hour (6th); the 3rd field is not stored.
+type IdxEntry struct {
+	Index     int    // message number taken from the first field
+	Offset    int64  // byte offset taken from the second field
+	Variable  string // "HGT", "UGRD", "VGRD", etc.
+	LevelMB   int    // pressure level in mb (0 if not a pressure level)
+	Hour      int    // forecast hour
+	EndOffset int64  // byte after this message (from next entry's offset, or -1 if last)
+}
+
+// Length reports how many bytes this GRIB message occupies, computed as
+// EndOffset minus Offset. It returns -1 when EndOffset is not positive,
+// i.e. when the end of the message is unknown.
+func (e *IdxEntry) Length() int64 {
+	if end := e.EndOffset; end > 0 {
+		return end - e.Offset
+	}
+	return -1
+}
+
+// ParseIdx parses a .idx file body and returns all entries.
+// Lines that can't be parsed are silently skipped.
+//
+// Each line is split on ':' and must have at least seven fields, a numeric
+// byte offset (2nd field) and an integer message number (1st field).
+//
+// Every entry's EndOffset is the offset of the next line whose offset parses,
+// even when that next line is itself skipped because its message number is
+// not a plain integer (for example "15.1"). This keeps an entry's byte range
+// from silently growing to cover a message that was dropped from the result.
+// The final entry keeps EndOffset == -1.
+func ParseIdx(body []byte) []IdxEntry {
+	var entries []IdxEntry
+
+	// awaitingEnd is true while the most recently appended entry has not yet
+	// seen the offset of the message that follows it.
+	awaitingEnd := false
+
+	for _, line := range strings.Split(string(body), "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+
+		parts := strings.Split(line, ":")
+		if len(parts) < 7 {
+			continue
+		}
+
+		offset, err := strconv.ParseInt(parts[1], 10, 64)
+		if err != nil {
+			continue
+		}
+
+		// This line starts a new message, so it ends the previous entry,
+		// whether or not the line itself turns into an entry below.
+		if awaitingEnd {
+			entries[len(entries)-1].EndOffset = offset
+			awaitingEnd = false
+		}
+
+		idx, err := strconv.Atoi(parts[0])
+		if err != nil {
+			continue
+		}
+
+		entries = append(entries, IdxEntry{
+			Index:     idx,
+			Offset:    offset,
+			Variable:  parts[3],
+			LevelMB:   parseLevelMB(parts[4]),
+			Hour:      parseHour(parts[5]),
+			EndOffset: -1, // replaced when the following message line is seen
+		})
+		awaitingEnd = true
+	}
+
+	return entries
+}
+
+// FilterIdx selects the entries worth downloading. An entry is kept only when
+// all of the following hold:
+//   - its Variable is a key of variables mapped to true,
+//   - it sits on a recognized pressure level (LevelMB > 0),
+//   - its byte length is known and positive, which excludes the final entry
+//     of an index because nothing follows it to mark its end.
+//
+// The input order is preserved.
+func FilterIdx(entries []IdxEntry, variables map[string]bool) []IdxEntry {
+	var kept []IdxEntry
+	for i := range entries {
+		entry := entries[i]
+		wanted := variables[entry.Variable] && entry.LevelMB > 0 && entry.Length() > 0
+		if wanted {
+			kept = append(kept, entry)
+		}
+	}
+	return kept
+}
+
+// parseLevelMB parses a level string like "1000 mb" and returns the pressure in mb.
+// Returns 0 if not a pressure level.
+func parseLevelMB(s string) int {
+	s = strings.TrimSpace(s)
+	if !strings.HasSuffix(s, " mb") {
+		return 0
+	}
+	numStr := strings.TrimSuffix(s, " mb")
+	n, err := strconv.Atoi(numStr)
+	if err != nil {
+		return 0
+	}
+	return n
+}
+
+// parseHour converts the forecast-time field of an index line into an hour
+// count. "anl" maps to 0 and "<n> hour fcst" maps to n; surrounding whitespace
+// is ignored. Any other text yields -1.
+func parseHour(s string) int {
+	switch text := strings.TrimSpace(s); text {
+	case "anl":
+		return 0
+	default:
+		hours, err := strconv.Atoi(strings.TrimSuffix(text, " hour fcst"))
+		if err != nil {
+			return -1
+		}
+		return hours
+	}
+}
+
+// ByteRange is a byte span of a GRIB file suitable for an HTTP Range download.
+// Each range covers one contiguous GRIB message.
+type ByteRange struct {
+	Start int64    // offset of the first byte of the message
+	End   int64    // inclusive
+	Entry IdxEntry // the index entry this range was built from
+}
+
+// EntriesToRanges builds one ByteRange per entry whose length is known and
+// positive; other entries are dropped. Start is the entry's Offset and End is
+// the last byte of the message (EndOffset - 1). Input order is preserved.
+func EntriesToRanges(entries []IdxEntry) []ByteRange {
+	out := make([]ByteRange, 0, len(entries))
+	for i := range entries {
+		entry := entries[i]
+		if entry.Length() > 0 {
+			out = append(out, ByteRange{
+				Start: entry.Offset,
+				End:   entry.EndOffset - 1, // EndOffset is exclusive, End is inclusive
+				Entry: entry,
+			})
+		}
+	}
+	return out
+}
+
+// FormatRange renders the range as the value of an HTTP Range header,
+// "bytes=<Start>-<End>", with both bounds inclusive.
+func (r ByteRange) FormatRange() string {
+	first, last := r.Start, r.End
+	header := fmt.Sprintf("bytes=%d-%d", first, last)
+	return header
+}
--- a/internal/downloader/idx_test.go
+++ b/internal/downloader/idx_test.go
@ -0,0 +1,110 @@
+package downloader
+
+import (
+	"testing"
+)
+
+// sampleIdx is a nine-line inventory shared by the parsing tests. It mixes
+// wanted variables (HGT, UGRD, VGRD) with an unwanted one (TMP), contains one
+// non-pressure level ("2 m above ground"), and ends with an entry that has no
+// following line to mark where it stops.
+const sampleIdx = `1:0:d=2024010100:HGT:1000 mb:0 hour fcst:
+2:289012:d=2024010100:HGT:975 mb:0 hour fcst:
+3:541876:d=2024010100:TMP:1000 mb:0 hour fcst:
+4:789012:d=2024010100:UGRD:1000 mb:0 hour fcst:
+5:1045678:d=2024010100:VGRD:1000 mb:0 hour fcst:
+6:1298765:d=2024010100:UGRD:975 mb:0 hour fcst:
+7:1567890:d=2024010100:UGRD:2 m above ground:0 hour fcst:
+8:1812345:d=2024010100:VGRD:975 mb:0 hour fcst:
+9:2098765:d=2024010100:HGT:500 mb:3 hour fcst:
+`
+
+// TestParseIdx checks that every line of sampleIdx becomes an entry, that the
+// fields of the first entry are decoded, that a non-pressure level is reported
+// as 0 mb, and that only the final entry is left without an end offset.
+func TestParseIdx(t *testing.T) {
+	parsed := ParseIdx([]byte(sampleIdx))
+	if got := len(parsed); got != 9 {
+		t.Fatalf("expected 9 entries, got %d", got)
+	}
+
+	// First line: all scalar fields plus the end offset borrowed from line two.
+	first := parsed[0]
+	fieldsOK := first.Index == 1 &&
+		first.Offset == 0 &&
+		first.Variable == "HGT" &&
+		first.LevelMB == 1000 &&
+		first.Hour == 0
+	if !fieldsOK {
+		t.Errorf("entry 0: got %+v", first)
+	}
+	if first.EndOffset != 289012 {
+		t.Errorf("entry 0 EndOffset: got %d, want 289012", first.EndOffset)
+	}
+
+	// Seventh line is UGRD at "2 m above ground", which is not a pressure level.
+	if aboveGround := parsed[6]; aboveGround.LevelMB != 0 {
+		t.Errorf("non-pressure level should have LevelMB=0, got %d", aboveGround.LevelMB)
+	}
+
+	// Nothing follows the final line, so its end stays unknown.
+	if tail := parsed[len(parsed)-1]; tail.EndOffset != -1 {
+		t.Errorf("last entry EndOffset: got %d, want -1", tail.EndOffset)
+	}
+}
+
+// TestFilterIdx checks that filtering sampleIdx keeps only wanted variables on
+// pressure levels with a known length, and that exactly six entries survive.
+func TestFilterIdx(t *testing.T) {
+	kept := FilterIdx(ParseIdx([]byte(sampleIdx)), neededVariables)
+
+	// Every survivor must satisfy all three filter conditions.
+	for _, entry := range kept {
+		if !neededVariables[entry.Variable] {
+			t.Errorf("unexpected variable %s", entry.Variable)
+		}
+		if entry.LevelMB <= 0 {
+			t.Errorf("non-pressure level included: %+v", entry)
+		}
+		if entry.Length() <= 0 {
+			t.Errorf("entry with unknown length included: %+v", entry)
+		}
+	}
+
+	// Survivors: HGT@1000, HGT@975, UGRD@1000, VGRD@1000, UGRD@975, VGRD@975.
+	// TMP, the "above ground" UGRD, and the final HGT@500 line (no end offset)
+	// are all dropped.
+	if len(kept) == 6 {
+		return
+	}
+	t.Errorf("expected 6 filtered entries, got %d", len(kept))
+	for _, entry := range kept {
+		t.Logf("  %s %d mb (offset %d, len %d)", entry.Variable, entry.LevelMB, entry.Offset, entry.Length())
+	}
+}
+
+// TestParseLevelMB covers well-formed pressure levels, non-pressure level
+// descriptions, and malformed input, all of which must map to 0 unless the
+// text is an integer followed by " mb".
+func TestParseLevelMB(t *testing.T) {
+	tests := []struct {
+		input string
+		want  int
+	}{
+		{"1000 mb", 1000},
+		{"975 mb", 975},
+		{"1 mb", 1},
+		{" 500 mb ", 500}, // surrounding whitespace is ignored
+		{"2 m above ground", 0},
+		{"surface", 0},
+		{"tropopause", 0},
+		{"", 0},
+		{"mb", 0},     // unit without a value
+		{"abc mb", 0}, // non-numeric value
+		{"0.4 mb", 0}, // fractional levels are not integers
+	}
+	for _, tt := range tests {
+		got := parseLevelMB(tt.input)
+		if got != tt.want {
+			t.Errorf("parseLevelMB(%q) = %d, want %d", tt.input, got, tt.want)
+		}
+	}
+}
+
+// TestParseHour covers the two recognized forms ("<n> hour fcst" and "anl")
+// as well as the failure path, where unparseable text must yield -1.
+func TestParseHour(t *testing.T) {
+	tests := []struct {
+		input string
+		want  int
+	}{
+		{"0 hour fcst", 0},
+		{"3 hour fcst", 3},
+		{"192 hour fcst", 192},
+		{"anl", 0},
+		{" 6 hour fcst ", 6}, // surrounding whitespace is ignored
+		{"", -1},
+		{"0-6 hour ave fcst", -1}, // averaged ranges are not a single hour
+		{"soon", -1},
+	}
+	for _, tt := range tests {
+		got := parseHour(tt.input)
+		if got != tt.want {
+			t.Errorf("parseHour(%q) = %d, want %d", tt.input, got, tt.want)
+		}
+	}
+}