feat: s3 download

Anatoly Antonov 2025-10-20 19:10:07 +09:00
parent a850615e1f
commit c4f355a32e
15 changed files with 590 additions and 109 deletions

@@ -11,9 +11,13 @@ type Config struct {
Dir string `env:"DIR" envDefault:"/tmp/grib"`
TTL time.Duration `env:"TTL" envDefault:"24h"`
CacheTTL time.Duration `env:"CACHE_TTL" envDefault:"1h"`
Parallel int `env:"PARALLEL" envDefault:"4"`
Timeout time.Duration `env:"TIMEOUT" envDefault:"30s"`
Parallel int `env:"PARALLEL" envDefault:"8"`
DatasetURL string `env:"DATASET_URL" envDefault:"https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod"`
// S3 configuration
UseS3 bool `env:"USE_S3" envDefault:"true"`
S3Bucket string `env:"S3_BUCKET" envDefault:"noaa-gfs-bdp-pds"`
S3Region string `env:"S3_REGION" envDefault:"us-east-1"`
S3Timeout time.Duration `env:"S3_TIMEOUT" envDefault:"300s"`
}
func NewConfig() (*Config, error) {
@@ -23,5 +27,6 @@ func NewConfig() (*Config, error) {
}); err != nil {
return nil, errcodes.Wrap(err, "failed to parse GRIB config")
}
return cfg, nil
}
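For orientation, a minimal sketch of driving this config from the environment. The import path and the bare variable names are assumptions here (the deleted service-level config further down used a GSN_PREDICTOR_GRIB_ prefix), so adjust to however env parsing is actually wired:

package main

import (
	"log"
	"os"

	"git.intra.yksa.space/gsn/predictor/internal/grib" // assumed import path
)

func main() {
	// Assumed env names; a service prefix may apply in practice.
	os.Setenv("USE_S3", "true")
	os.Setenv("S3_BUCKET", "noaa-gfs-bdp-pds")
	os.Setenv("S3_REGION", "us-east-1")

	cfg, err := grib.NewConfig()
	if err != nil {
		log.Fatal(err)
	}
	svc, err := grib.New(cfg)
	if err != nil {
		log.Fatal(err)
	}
	_ = svc // call svc.Update(ctx) to fetch the latest run
}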

@@ -28,8 +28,8 @@ func openCube(path string) (*cube, error) {
}
const (
nT = 17
nP = 34
nT = 97 // 0-96 hours with step 1 hour
nP = 47 // 47 pressure levels matching tawhiri
nLat = 361
nLon = 720
)
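The new dimensions change the cube's footprint substantially. A back-of-the-envelope check (editor's arithmetic, not from the commit):

package main

import "fmt"

func main() {
	// new: 3 vars (gh, u, v) x 97 steps x 47 levels x 361 x 720 x 4 bytes
	newCube := 3 * 97 * 47 * 361 * 720 * 4 // 14,219,703,360 B, about 14.2 GB
	// old: 2 vars (u, v) x 17 steps x 34 levels x 361 x 720 x 4 bytes
	oldCube := 2 * 17 * 34 * 361 * 720 * 4 // 1,201,870,080 B, about 1.2 GB
	fmt.Println(newCube, oldCube)
}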

@@ -24,28 +24,51 @@ func (d *Downloader) fileURL(run string, hour int, step int) string {
return fmt.Sprintf("%s/gfs.%s/%02d/atmos/gfs.t%02dz.pgrb2.0p50.f%03d", d.DatasetURL, run, hour, hour, step)
}
func (d *Downloader) fetch(ctx context.Context, url, dst string) error {
func (d *Downloader) fetch(ctx context.Context, url, dst string) (err error) {
// Check if final file already exists
if _, err := os.Stat(dst); err == nil {
return nil
}
tmp := dst + ".part"
// Remove any stale .part file left by an earlier interrupted download
os.Remove(tmp)
f, err := os.Create(tmp)
if err != nil {
return err
}
defer f.Close()
// Cleanup .part file on any error (using named return value)
defer func() {
f.Close()
if err != nil {
os.Remove(tmp)
}
}()
req, _ := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
resp, err := d.Client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return errcodes.Wrap(errcodes.ErrDownload, "bad status: "+resp.Status)
}
if _, err := io.Copy(f, resp.Body); err != nil {
return err
}
// Close file before rename
if err := f.Close(); err != nil {
return err
}
// If rename fails, err will be set and defer will cleanup .part file
return os.Rename(tmp, dst)
}

@@ -17,8 +17,9 @@ func (d *dataset) uv(lat, lon, alt float64, tHours float64) (float64, float64) {
x0 := int(math.Floor(ix)) % d.cube.lon
x1 := (x0 + 1) % d.cube.lon
wx := ix - float64(x0)
it0 := int(math.Floor(tHours / 3.0))
wt := (tHours - float64(it0*3)) / 3.0
// For hourly data (step = 1 hour)
it0 := int(math.Floor(tHours))
wt := tHours - float64(it0)
p := pressureFromAlt(alt)
ip0 := 0
for ip0+1 < len(pressureLevels) && pressureLevels[ip0+1] > p {
@@ -27,14 +28,14 @@ func (d *dataset) uv(lat, lon, alt float64, tHours float64) (float64, float64) {
ip1 := ip0 + 1
wp := (pressureLevels[ip0] - p) / (pressureLevels[ip0] - pressureLevels[ip1])
fetch := func(ti, pi int) (float64, float64) {
u00 := d.cube.val(0, ti, pi, y0, x0)
u10 := d.cube.val(0, ti, pi, y0, x1)
u01 := d.cube.val(0, ti, pi, y1, x0)
u11 := d.cube.val(0, ti, pi, y1, x1)
v00 := d.cube.val(1, ti, pi, y0, x0)
v10 := d.cube.val(1, ti, pi, y0, x1)
v01 := d.cube.val(1, ti, pi, y1, x0)
v11 := d.cube.val(1, ti, pi, y1, x1)
u00 := d.cube.val(1, ti, pi, y0, x0)
u10 := d.cube.val(1, ti, pi, y0, x1)
u01 := d.cube.val(1, ti, pi, y1, x0)
u11 := d.cube.val(1, ti, pi, y1, x1)
v00 := d.cube.val(2, ti, pi, y0, x0)
v10 := d.cube.val(2, ti, pi, y0, x1)
v01 := d.cube.val(2, ti, pi, y1, x0)
v11 := d.cube.val(2, ti, pi, y1, x1)
uxy := (1-wy)*((1-wx)*float64(u00)+wx*float64(u10)) + wy*((1-wx)*float64(u01)+wx*float64(u11))
vxy := (1-wy)*((1-wx)*float64(v00)+wx*float64(v10)) + wy*((1-wx)*float64(v01)+wx*float64(v11))
return uxy, vxy
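Only the time axis changes here. A worked example of the new indexing (editor's example; assumes tHours lies inside the forecast window):

package main

import (
	"fmt"
	"math"
)

func main() {
	tHours := 5.5
	it0 := int(math.Floor(tHours)) // 5; with 3-hourly data this was floor(5.5/3) = 1
	wt := tHours - float64(it0)    // 0.5; previously (5.5 - 3*1)/3 ≈ 0.83
	fmt.Println(it0, wt)
}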

@@ -23,22 +23,13 @@ type Service interface {
GetStatus() (ready bool, lastUpdate time.Time, isFresh bool, errMsg string)
}
type ServiceConfig struct {
Dir string
TTL time.Duration
CacheTTL time.Duration
Parallel int
Client *http.Client
DatasetURL string
}
type service struct {
cfg ServiceConfig
cfg *Config
cache memCache
data atomic.Pointer[dataset]
}
func New(cfg ServiceConfig) (Service, error) {
func New(cfg *Config) (Service, error) {
if cfg.TTL == 0 {
cfg.TTL = 24 * time.Hour
}
@@ -135,8 +126,7 @@ func (s *service) Update(ctx context.Context) error {
}
}
dl := Downloader{Dir: s.cfg.Dir, Parallel: s.cfg.Parallel, Client: s.cfg.Client, DatasetURL: s.cfg.DatasetURL}
run := nearestRun(time.Now().UTC().Add(-4 * time.Hour))
run := nearestRun(time.Now().UTC().Add(-24 * time.Hour))
// Check if we already have this run
cubePath := filepath.Join(s.cfg.Dir, run.Format("20060102_15")) + ".cube"
@@ -156,9 +146,26 @@ func (s *service) Update(ctx context.Context) error {
}
}
// Download new data
if err := dl.Run(ctx, run); err != nil {
return err
// Download new data using S3 or HTTP
var downloadErr error
if s.cfg.UseS3 {
s3dl, err := NewS3Downloader(s.cfg.Dir, s.cfg.Parallel, s.cfg.S3Bucket, s.cfg.S3Region)
if err != nil {
return errcodes.Wrap(err, "failed to create S3 downloader")
}
downloadErr = s3dl.Run(ctx, run)
} else {
dl := Downloader{
Dir: s.cfg.Dir,
Parallel: s.cfg.Parallel,
Client: http.DefaultClient,
DatasetURL: s.cfg.DatasetURL,
}
downloadErr = dl.Run(ctx, run)
}
if downloadErr != nil {
return downloadErr
}
// Assemble cube if it doesn't exist
@@ -179,8 +186,8 @@ func (s *service) Update(ctx context.Context) error {
}
func assembleCube(dir string, run time.Time, cubePath string) error {
const sizePerVar = 17 * 34 * 361 * 720 * 4
total := int64(sizePerVar * 2)
const sizePerVar = 97 * 47 * 361 * 720 * 4 // 97 time steps (0-96 hours), 47 pressure levels
total := int64(sizePerVar * 3) // 3 variables: gh, u, v
f, err := os.Create(cubePath)
if err != nil {
return err
@@ -214,24 +221,30 @@ func assembleCube(dir string, run time.Time, cubePath string) error {
}
for _, m := range messages {
// Check if this is a wind component (u or v)
// Check if this is a wind component (u or v) or geopotential height
// ParameterCategory 2 = momentum, ParameterNumber 2 = u-wind, 3 = v-wind
// ParameterCategory 3 = mass, ParameterNumber 5 = geopotential height
if m.Section4.ProductDefinitionTemplateNumber != 0 {
continue
}
product := m.Section4.ProductDefinitionTemplate
if product.ParameterCategory != 2 {
continue
}
var varIdx int
switch product.ParameterNumber {
case 2: // u-wind
varIdx = 0
case 3: // v-wind
varIdx = 1
default:
continue
}
// Match tawhiri variable order: ['gh', 'u', 'v'] (indices 0, 1, 2)
if product.ParameterCategory == 2 {
switch product.ParameterNumber {
case 2: // u-wind
varIdx = 1
case 3: // v-wind
varIdx = 2
default:
continue
}
} else if product.ParameterCategory == 3 && product.ParameterNumber == 5 {
// geopotential height
varIdx = 0
} else {
continue
}
@@ -253,7 +266,7 @@ func assembleCube(dir string, run time.Time, cubePath string) error {
for i, v := range vals {
binary.LittleEndian.PutUint32(raw[i*4:], math.Float32bits(float32(v)))
}
base := int64(varIdx*sizePerVar + (ti*34+pIdx)*361*720*4)
base := int64(varIdx*sizePerVar + (ti*47+pIdx)*361*720*4)
copy(mm[base:base+int64(len(raw))], raw)
}
}
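The base expression implies a flat [var][time][level][lat][lon] float32 layout. A sketch of the offset arithmetic, assuming cube.val reads the same layout:

package main

import "fmt"

const (
	nT, nP, nLat, nLon = 97, 47, 361, 720
	sizePerVar         = nT * nP * nLat * nLon * 4 // bytes per variable
)

// offset mirrors the base calculation in assembleCube: one nLat*nLon
// float32 grid per (time, level) pair, variables laid out back to back.
func offset(varIdx, ti, pIdx int) int64 {
	return int64(varIdx*sizePerVar + (ti*nP+pIdx)*nLat*nLon*4)
}

func main() {
	// u-wind (index 1), forecast hour 12, level index 20 (500 hPa)
	fmt.Println(offset(1, 12, 20))
}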
@@ -266,7 +279,7 @@ func (s *service) Extract(ctx context.Context, lat, lon, alt float64, ts time.Ti
if d == nil {
return zero, errcodes.ErrNoDataset
}
if ts.Before(time.Unix(d.runUTC, 0)) || ts.After(time.Unix(d.runUTC, 0).Add(48*time.Hour)) {
if ts.Before(time.Unix(d.runUTC, 0)) || ts.After(time.Unix(d.runUTC, 0).Add(96*time.Hour)) {
return zero, errcodes.ErrOutOfBounds
}

@@ -2,7 +2,14 @@ package grib
import "math"
var pressureLevels = []float64{1000, 975, 950, 925, 900, 875, 850, 825, 800, 775, 750, 725, 700, 650, 600, 550, 500, 450, 400, 350, 300, 250, 200, 150, 100, 70, 50, 30, 20, 10, 7, 5, 3, 2}
// 47 pressure levels matching tawhiri configuration
var pressureLevels = []float64{
1000, 975, 950, 925, 900, 875, 850, 825, 800, 775,
750, 725, 700, 675, 650, 625, 600, 575, 550, 525,
500, 475, 450, 425, 400, 375, 350, 325, 300, 275,
250, 225, 200, 175, 150, 125, 100, 70, 50, 30,
20, 10, 7, 5, 3, 2, 1,
}
func pressureFromAlt(alt float64) float64 { // ICAO ISA
return 1013.25 * math.Pow(1-alt/44307.69396, 5.255877)

@@ -0,0 +1,265 @@
package grib
import (
"context"
"fmt"
"io"
"os"
"path/filepath"
"time"
"git.intra.yksa.space/gsn/predictor/internal/pkg/errcodes"
"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/s3"
"golang.org/x/sync/errgroup"
)
// S3Downloader downloads GRIB files from AWS S3
type S3Downloader struct {
Dir string
Parallel int
Bucket string
Region string
Client *s3.Client
}
// NewS3Downloader creates a new S3 downloader with anonymous access
func NewS3Downloader(dir string, parallel int, bucket, region string) (*S3Downloader, error) {
// Create AWS config with anonymous credentials for public bucket
cfg, err := config.LoadDefaultConfig(context.Background(),
config.WithRegion(region),
config.WithCredentialsProvider(aws.AnonymousCredentials{}),
)
if err != nil {
return nil, errcodes.Wrap(err, "failed to load AWS config")
}
client := s3.NewFromConfig(cfg)
return &S3Downloader{
Dir: dir,
Parallel: parallel,
Bucket: bucket,
Region: region,
Client: client,
}, nil
}
// s3Key generates the S3 key for a GRIB file
// Path format: gfs.YYYYMMDD/HH/atmos/gfs.tHHz.pgrb2.0p50.fFFF
func (d *S3Downloader) s3Key(run string, hour int, step int) string {
return fmt.Sprintf("gfs.%s/%02d/atmos/gfs.t%02dz.pgrb2.0p50.f%03d", run, hour, hour, step)
}
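For illustration, with hypothetical values (run date 20251020, 06z cycle, forecast hour 42) the key comes out as:

package main

import "fmt"

func main() {
	// Same format string as s3Key above, with example arguments.
	fmt.Printf("gfs.%s/%02d/atmos/gfs.t%02dz.pgrb2.0p50.f%03d\n",
		"20251020", 6, 6, 42)
	// Output: gfs.20251020/06/atmos/gfs.t06z.pgrb2.0p50.f042
}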
// CheckFileExists checks if a file exists in S3 using HeadObject
func (d *S3Downloader) CheckFileExists(ctx context.Context, key string) (bool, int64, error) {
input := &s3.HeadObjectInput{
Bucket: aws.String(d.Bucket),
Key: aws.String(key),
}
result, err := d.Client.HeadObject(ctx, input)
if err != nil {
// A NotFound response means the object is absent, not a failure
if isNotFoundError(err) {
return false, 0, nil
}
return false, 0, errcodes.Wrap(err, "failed to check file existence")
}
size := int64(0)
if result.ContentLength != nil {
size = *result.ContentLength
}
return true, size, nil
}
// isNotFoundError reports whether err indicates a missing S3 object.
func isNotFoundError(err error) bool {
if err == nil {
return false
}
// Matching on the error text covers both HeadObject's NotFound and
// GetObject's NoSuchKey without pulling in the service error types.
errStr := err.Error()
return strings.Contains(errStr, "NotFound") || strings.Contains(errStr, "404") || strings.Contains(errStr, "NoSuchKey")
}
// ListAvailableFiles lists all available files for a given run
func (d *S3Downloader) ListAvailableFiles(ctx context.Context, run string, hour int) ([]string, error) {
prefix := fmt.Sprintf("gfs.%s/%02d/atmos/", run, hour)
input := &s3.ListObjectsV2Input{
Bucket: aws.String(d.Bucket),
Prefix: aws.String(prefix),
}
var files []string
paginator := s3.NewListObjectsV2Paginator(d.Client, input)
for paginator.HasMorePages() {
page, err := paginator.NextPage(ctx)
if err != nil {
return nil, errcodes.Wrap(err, "failed to list S3 objects")
}
for _, obj := range page.Contents {
if obj.Key != nil {
files = append(files, *obj.Key)
}
}
}
return files, nil
}
// fetchFromS3 downloads a file from S3 to local disk with retry logic
func (d *S3Downloader) fetchFromS3(ctx context.Context, key, dst string) (err error) {
// Check if final file already exists
if _, err := os.Stat(dst); err == nil {
return nil
}
const maxRetries = 3
var lastErr error
for attempt := 0; attempt < maxRetries; attempt++ {
if attempt > 0 {
// Exponential backoff before retries: 2s, then 4s
waitTime := time.Duration(1<<uint(attempt)) * time.Second
time.Sleep(waitTime)
}
lastErr = d.fetchFromS3Once(ctx, key, dst)
if lastErr == nil {
return nil
}
}
return errcodes.Wrap(lastErr, fmt.Sprintf("failed after %d attempts", maxRetries))
}
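The resulting schedule: attempt 1 runs immediately, attempt 2 after 2s, attempt 3 after a further 4s, so a persistently missing object is given up on after about 6s of waiting plus transfer time. Checking the wait arithmetic:

package main

import (
	"fmt"
	"time"
)

func main() {
	// attempt 0 sleeps not at all; attempts 1 and 2 wait 1<<attempt seconds
	for attempt := 1; attempt < 3; attempt++ {
		fmt.Println(time.Duration(1<<uint(attempt)) * time.Second) // 2s, then 4s
	}
}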
// fetchFromS3Once performs a single download attempt
func (d *S3Downloader) fetchFromS3Once(ctx context.Context, key, dst string) (err error) {
tmp := dst + ".part"
// Remove old .part file if it exists
os.Remove(tmp)
f, err := os.Create(tmp)
if err != nil {
return err
}
fileClosed := false
// Cleanup .part file on any error (using named return value)
defer func() {
if !fileClosed {
f.Close()
}
if err != nil {
os.Remove(tmp)
}
}()
// Check if file exists in S3
exists, size, checkErr := d.CheckFileExists(ctx, key)
if checkErr != nil {
return errcodes.Wrap(checkErr, "failed to check S3 file existence")
}
if !exists {
return errcodes.Wrap(errcodes.ErrDownload, fmt.Sprintf("file not found in S3: %s", key))
}
// Download from S3
input := &s3.GetObjectInput{
Bucket: aws.String(d.Bucket),
Key: aws.String(key),
}
result, err := d.Client.GetObject(ctx, input)
if err != nil {
return errcodes.Wrap(err, "failed to get S3 object")
}
defer result.Body.Close()
// Copy to local file
written, err := io.Copy(f, result.Body)
if err != nil {
return errcodes.Wrap(err, fmt.Sprintf("failed to write S3 object to file %s", dst))
}
// Verify size if available
if size > 0 && written != size {
return errcodes.Wrap(errcodes.ErrDownload, fmt.Sprintf("size mismatch: got %d bytes, expected %d", written, size))
}
// Close file before rename
if err := f.Close(); err != nil {
return err
}
fileClosed = true
// If rename fails, err will be set and defer will cleanup .part file
return os.Rename(tmp, dst)
}
// Run downloads all required GRIB files for a forecast run
func (d *S3Downloader) Run(ctx context.Context, run time.Time) error {
runStr := run.Format("20060102")
hour := run.Hour()
// First, list available files to verify they exist
availableFiles, err := d.ListAvailableFiles(ctx, runStr, hour)
if err != nil {
return errcodes.Wrap(err, "failed to list available files")
}
if len(availableFiles) == 0 {
return errcodes.Wrap(errcodes.ErrDownload, fmt.Sprintf("no files found for run %s/%02d", runStr, hour))
}
// Build a map of available files for quick lookup
availableMap := make(map[string]bool)
for _, file := range availableFiles {
availableMap[file] = true
}
g, ctx := errgroup.WithContext(ctx)
sem := make(chan struct{}, d.Parallel)
for _, step := range steps {
step := step
key := d.s3Key(runStr, hour, step)
// Check if file is available in S3
if !availableMap[key] {
// Skip it instead of failing - late forecast hours may not be uploaded yet
continue
}
sem <- struct{}{}
g.Go(func() error {
defer func() { <-sem }()
dst := filepath.Join(d.Dir, fileName(run, step))
return d.fetchFromS3(ctx, key, dst)
})
}
return g.Wait()
}
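A minimal end-to-end usage sketch, assuming the package import path; the bucket and region match the Config defaults above:

package main

import (
	"context"
	"log"
	"time"

	"git.intra.yksa.space/gsn/predictor/internal/grib" // assumed import path
)

func main() {
	dl, err := grib.NewS3Downloader("/tmp/grib", 8, "noaa-gfs-bdp-pds", "us-east-1")
	if err != nil {
		log.Fatal(err)
	}
	// Fetch every available hourly file for the 2025-10-19 18z run.
	run := time.Date(2025, 10, 19, 18, 0, 0, 0, time.UTC)
	if err := dl.Run(context.Background(), run); err != nil {
		log.Fatal(err)
	}
}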

@@ -6,7 +6,15 @@ import (
"time"
)
var steps = []int{0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48}
// Generate hourly steps from 0 to 96 (97 steps total).
// GFS provides hourly data for 0-120 hours; we use the first 96 hours.
var steps = func() []int {
result := make([]int, 0, 97)
for i := 0; i <= 96; i++ {
result = append(result, i)
}
return result
}()
func nearestRun(t time.Time) time.Time {
h := t.UTC().Hour() - t.UTC().Hour()%6
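Together with the -24h offset in Update, run selection now trails by a full day, presumably so the complete 0-96h file set has landed on S3. A worked example; nearestRun's body is only partially shown above, so the completion here is an assumption:

package main

import (
	"fmt"
	"time"
)

// Assumed completion of nearestRun: floor to the previous 00/06/12/18 UTC cycle.
func nearestRun(t time.Time) time.Time {
	h := t.UTC().Hour() - t.UTC().Hour()%6
	return time.Date(t.UTC().Year(), t.UTC().Month(), t.UTC().Day(), h, 0, 0, 0, time.UTC)
}

func main() {
	now := time.Date(2025, 10, 20, 10, 10, 0, 0, time.UTC)
	fmt.Println(nearestRun(now.Add(-24 * time.Hour))) // 2025-10-19 06:00:00 +0000 UTC
}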

@@ -1,22 +0,0 @@
package service
import (
"net/http"
"time"
)
type Config struct {
// --- GRIB Configuration ---
GribDir string `env:"GSN_PREDICTOR_GRIB_DIR" envDefault:"/tmp/grib"`
GribTTL time.Duration `env:"GSN_PREDICTOR_GRIB_TTL" envDefault:"24h"`
GribCacheTTL time.Duration `env:"GSN_PREDICTOR_GRIB_CACHE_TTL" envDefault:"1h"`
GribParallel int `env:"GSN_PREDICTOR_GRIB_PARALLEL" envDefault:"4"`
GribTimeout time.Duration `env:"GSN_PREDICTOR_GRIB_TIMEOUT" envDefault:"30s"`
GribDatasetURL string `env:"GSN_PREDICTOR_GRIB_DATASET_URL" envDefault:"https://nomads.ncep.noaa.gov/pub/data/nccf/com/gfs/prod"`
}
func (c *Config) CreateHTTPClient() *http.Client {
return &http.Client{
Timeout: c.GribTimeout,
}
}

@@ -8,13 +8,11 @@ import (
)
type Service struct {
cfg *Config
grib Grib
}
func New(cfg *Config, gribService Grib) (*Service, error) {
func New(gribService Grib) (*Service, error) {
svc := &Service{
cfg: cfg,
grib: gribService,
}