diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 333f3655..97497329 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,6 +27,11 @@ jobs: # ~/go/pkg/mod and ~/.cache/go-build stay on disk between runs automatically. cache: false go-version: '1.25.4' + + - name: Check UFFD pager version + run: | + git fetch origin main --depth=1 + bash scripts/check-uffd-version.sh origin/main - name: Install dependencies run: | diff --git a/.goreleaser.yaml b/.goreleaser.yaml index 3fd34b9e..2d9aff6f 100644 --- a/.goreleaser.yaml +++ b/.goreleaser.yaml @@ -37,6 +37,19 @@ builds: ldflags: - -s -w + - id: hypeman-uffd-pager + main: ./cmd/uffd-pager + binary: hypeman-uffd-pager + env: + - CGO_ENABLED=0 + goos: + - linux + goarch: + - amd64 + - arm64 + ldflags: + - -s -w + archives: - id: default formats: diff --git a/Makefile b/Makefile index 08c14fa8..a0b71091 100644 --- a/Makefile +++ b/Makefile @@ -254,6 +254,7 @@ endif build-linux: ensure-ch-binaries ensure-firecracker-binaries ensure-caddy-binaries build-embedded | $(BIN_DIR) go build -tags containers_image_openpgp -o $(BIN_DIR)/hypeman ./cmd/api + go build -o $(BIN_DIR)/hypeman-uffd-pager ./cmd/uffd-pager # Build all binaries build-all: build diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index a9ae8311..e315c7bb 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -193,10 +193,12 @@ type CapacityConfig struct { // HypervisorConfig holds hypervisor settings. type HypervisorConfig struct { - Default string `koanf:"default"` - CloudHypervisorDefaultVersion string `koanf:"cloud_hypervisor_default_version"` - FirecrackerBinaryPath string `koanf:"firecracker_binary_path"` - Memory HypervisorMemoryConfig `koanf:"memory"` + Default string `koanf:"default"` + CloudHypervisorDefaultVersion string `koanf:"cloud_hypervisor_default_version"` + FirecrackerBinaryPath string `koanf:"firecracker_binary_path"` + FirecrackerSnapshotMemoryBackend string `koanf:"firecracker_snapshot_memory_backend"` + FirecrackerUFFDCacheMaxBytes string `koanf:"firecracker_uffd_cache_max_bytes"` + Memory HypervisorMemoryConfig `koanf:"memory"` } // HypervisorMemoryConfig holds guest memory management settings. @@ -404,9 +406,11 @@ func defaultConfig() *Config { }, Hypervisor: HypervisorConfig{ - Default: "cloud-hypervisor", - CloudHypervisorDefaultVersion: "", - FirecrackerBinaryPath: "", + Default: "cloud-hypervisor", + CloudHypervisorDefaultVersion: "", + FirecrackerBinaryPath: "", + FirecrackerSnapshotMemoryBackend: "file", + FirecrackerUFFDCacheMaxBytes: "4294967296", Memory: HypervisorMemoryConfig{ Enabled: false, KernelPageInitMode: "hardened", @@ -618,6 +622,19 @@ func (c *Config) Validate() error { if c.Hypervisor.Memory.KernelPageInitMode != "performance" && c.Hypervisor.Memory.KernelPageInitMode != "hardened" { return fmt.Errorf("hypervisor.memory.kernel_page_init_mode must be one of {performance,hardened}, got %q", c.Hypervisor.Memory.KernelPageInitMode) } + backend := strings.ToLower(strings.TrimSpace(c.Hypervisor.FirecrackerSnapshotMemoryBackend)) + if backend == "" { + backend = "file" + } + switch backend { + case "file", "uffd": + c.Hypervisor.FirecrackerSnapshotMemoryBackend = backend + default: + return fmt.Errorf("hypervisor.firecracker_snapshot_memory_backend must be one of {file,uffd}, got %q", c.Hypervisor.FirecrackerSnapshotMemoryBackend) + } + if err := validateByteSize("hypervisor.firecracker_uffd_cache_max_bytes", c.Hypervisor.FirecrackerUFFDCacheMaxBytes); err != nil { + return err + } if err := validateDuration("hypervisor.memory.active_ballooning.poll_interval", c.Hypervisor.Memory.ActiveBallooning.PollInterval); err != nil { return err } diff --git a/cmd/api/config/config_test.go b/cmd/api/config/config_test.go index e7183504..f627c628 100644 --- a/cmd/api/config/config_test.go +++ b/cmd/api/config/config_test.go @@ -55,6 +55,35 @@ func TestDefaultConfigIncludesMetricsSettings(t *testing.T) { if cfg.Instances.LifecycleEventBufferSize != 256 { t.Fatalf("expected default instances.lifecycle_event_buffer_size to be 256, got %d", cfg.Instances.LifecycleEventBufferSize) } + if cfg.Hypervisor.FirecrackerSnapshotMemoryBackend != "file" { + t.Fatalf("expected default firecracker snapshot backend to be file, got %q", cfg.Hypervisor.FirecrackerSnapshotMemoryBackend) + } + if cfg.Hypervisor.FirecrackerUFFDCacheMaxBytes != "4294967296" { + t.Fatalf("expected default firecracker uffd cache size to be 4294967296, got %q", cfg.Hypervisor.FirecrackerUFFDCacheMaxBytes) + } +} + +func TestValidateFirecrackerSnapshotMemoryBackend(t *testing.T) { + cfg := defaultConfig() + cfg.Hypervisor.FirecrackerSnapshotMemoryBackend = "UFFD" + if err := cfg.Validate(); err != nil { + t.Fatalf("expected UFFD backend to validate, got %v", err) + } + if cfg.Hypervisor.FirecrackerSnapshotMemoryBackend != "uffd" { + t.Fatalf("expected backend to normalize to uffd, got %q", cfg.Hypervisor.FirecrackerSnapshotMemoryBackend) + } + + cfg = defaultConfig() + cfg.Hypervisor.FirecrackerSnapshotMemoryBackend = "bad" + if err := cfg.Validate(); err == nil { + t.Fatalf("expected invalid firecracker snapshot backend validation error") + } + + cfg = defaultConfig() + cfg.Hypervisor.FirecrackerUFFDCacheMaxBytes = "not-a-size" + if err := cfg.Validate(); err == nil { + t.Fatalf("expected invalid firecracker uffd cache size validation error") + } } func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) { diff --git a/cmd/uffd-pager/main.go b/cmd/uffd-pager/main.go new file mode 100644 index 00000000..0063205a --- /dev/null +++ b/cmd/uffd-pager/main.go @@ -0,0 +1,15 @@ +package main + +import ( + "log/slog" + "os" + + "github.com/kernel/hypeman/lib/uffdpager" +) + +func main() { + if err := uffdpager.Main(os.Args[1:]); err != nil { + slog.Error("uffd pager terminated", "error", err) + os.Exit(1) + } +} diff --git a/lib/hypervisor/cloudhypervisor/process.go b/lib/hypervisor/cloudhypervisor/process.go index 866866f1..45b53da1 100644 --- a/lib/hypervisor/cloudhypervisor/process.go +++ b/lib/hypervisor/cloudhypervisor/process.go @@ -167,7 +167,7 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s // RestoreVM starts Cloud Hypervisor and restores VM state from a snapshot. // The VM is in paused state after restore; caller should call Resume() to continue execution. -func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) { +func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string, _ hypervisor.RestoreOptions) (int, hypervisor.Hypervisor, error) { log := logger.FromContext(ctx) startTime := time.Now() diff --git a/lib/hypervisor/firecracker/config.go b/lib/hypervisor/firecracker/config.go index e4ccefd8..fb11afdb 100644 --- a/lib/hypervisor/firecracker/config.go +++ b/lib/hypervisor/firecracker/config.go @@ -74,11 +74,15 @@ type snapshotCreateParams struct { } type snapshotLoadParams struct { - MemFilePath string `json:"mem_file_path,omitempty"` - SnapshotPath string `json:"snapshot_path"` - EnableDiffSnapshots bool `json:"enable_diff_snapshots,omitempty"` - ResumeVM bool `json:"resume_vm,omitempty"` - NetworkOverrides []networkOverride `json:"network_overrides,omitempty"` + MemBackend *snapshotMemBackend `json:"mem_backend,omitempty"` + SnapshotPath string `json:"snapshot_path"` + EnableDiffSnapshots bool `json:"enable_diff_snapshots,omitempty"` + NetworkOverrides []networkOverride `json:"network_overrides,omitempty"` +} + +type snapshotMemBackend struct { + BackendType string `json:"backend_type"` + BackendPath string `json:"backend_path"` } type networkOverride struct { @@ -213,16 +217,22 @@ func toSnapshotCreateParams(snapshotDir string) snapshotCreateParams { } } -func toSnapshotLoadParams(snapshotDir string, networkOverrides []networkOverride) snapshotLoadParams { +func toSnapshotLoadParams(snapshotDir string, networkOverrides []networkOverride, backend snapshotMemBackend) snapshotLoadParams { return snapshotLoadParams{ - MemFilePath: snapshotMemoryPath(snapshotDir), + MemBackend: &backend, SnapshotPath: snapshotStatePath(snapshotDir), EnableDiffSnapshots: true, - ResumeVM: false, NetworkOverrides: networkOverrides, } } +func fileSnapshotMemBackend(snapshotDir string) snapshotMemBackend { + return snapshotMemBackend{ + BackendType: "File", + BackendPath: snapshotMemoryPath(snapshotDir), + } +} + func snapshotStatePath(snapshotDir string) string { return filepath.Join(snapshotDir, snapshotStateFile) } diff --git a/lib/hypervisor/firecracker/config_test.go b/lib/hypervisor/firecracker/config_test.go index 6e912ee7..c5e8dc6c 100644 --- a/lib/hypervisor/firecracker/config_test.go +++ b/lib/hypervisor/firecracker/config_test.go @@ -82,14 +82,25 @@ func TestSnapshotParamPaths(t *testing.T) { load := toSnapshotLoadParams("/tmp/snapshot-latest", []networkOverride{ {IfaceID: "eth0", HostDevName: "hype-abc123"}, - }) + }, fileSnapshotMemBackend("/tmp/snapshot-latest")) assert.Equal(t, "/tmp/snapshot-latest/state", load.SnapshotPath) - assert.Equal(t, "/tmp/snapshot-latest/memory", load.MemFilePath) + require.NotNil(t, load.MemBackend) + assert.Equal(t, "File", load.MemBackend.BackendType) + assert.Equal(t, "/tmp/snapshot-latest/memory", load.MemBackend.BackendPath) assert.True(t, load.EnableDiffSnapshots) - assert.False(t, load.ResumeVM) require.Len(t, load.NetworkOverrides, 1) } +func TestSnapshotLoadParamsSupportsUFFDBackend(t *testing.T) { + load := toSnapshotLoadParams("/tmp/snapshot-latest", nil, snapshotMemBackend{ + BackendType: "Uffd", + BackendPath: "/tmp/pager.sock", + }) + require.NotNil(t, load.MemBackend) + assert.Equal(t, "Uffd", load.MemBackend.BackendType) + assert.Equal(t, "/tmp/pager.sock", load.MemBackend.BackendPath) +} + func TestToBalloonConfig(t *testing.T) { cfg := hypervisor.VMConfig{ GuestMemory: hypervisor.GuestMemoryConfig{ diff --git a/lib/hypervisor/firecracker/firecracker.go b/lib/hypervisor/firecracker/firecracker.go index e22e42f3..e3012aad 100644 --- a/lib/hypervisor/firecracker/firecracker.go +++ b/lib/hypervisor/firecracker/firecracker.go @@ -223,8 +223,8 @@ func (f *Firecracker) instanceStart(ctx context.Context) error { return f.postAction(ctx, "InstanceStart") } -func (f *Firecracker) loadSnapshot(ctx context.Context, snapshotDir string, networkOverrides []networkOverride) error { - params := toSnapshotLoadParams(snapshotDir, networkOverrides) +func (f *Firecracker) loadSnapshot(ctx context.Context, snapshotDir string, networkOverrides []networkOverride, backend snapshotMemBackend) error { + params := toSnapshotLoadParams(snapshotDir, networkOverrides, backend) if _, err := f.do(ctx, http.MethodPut, "/snapshot/load", params, http.StatusNoContent); err != nil { return err } diff --git a/lib/hypervisor/firecracker/process.go b/lib/hypervisor/firecracker/process.go index b29539e9..43e52913 100644 --- a/lib/hypervisor/firecracker/process.go +++ b/lib/hypervisor/firecracker/process.go @@ -14,6 +14,7 @@ import ( "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/paths" + "github.com/kernel/hypeman/lib/uffdpager" "gvisor.dev/gvisor/pkg/cleanup" ) @@ -31,11 +32,30 @@ func init() { }) } +type UFFDClient interface { + CreateSession(ctx context.Context, req uffdpager.CreateSessionRequest) (*uffdpager.CreateSessionResponse, error) + CloseSession(ctx context.Context, sessionID string) error +} + +type StarterOption func(*Starter) + // Starter implements hypervisor.VMStarter for Firecracker. -type Starter struct{} +type Starter struct { + uffd UFFDClient +} -func NewStarter() *Starter { - return &Starter{} +func NewStarter(opts ...StarterOption) *Starter { + s := &Starter{} + for _, opt := range opts { + opt(s) + } + return s +} + +func WithUFFDClient(client UFFDClient) StarterOption { + return func(s *Starter) { + s.uffd = client + } } var _ hypervisor.VMStarter = (*Starter)(nil) @@ -93,7 +113,7 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s return pid, hv, nil } -func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) { +func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string, opts hypervisor.RestoreOptions) (int, hypervisor.Hypervisor, error) { processCtx, processSpan := hypervisor.StartProcessSpan(ctx, hypervisor.TypeFirecracker) pid, err := s.startProcess(processCtx, p, version, socketPath) hypervisor.FinishTraceSpan(processSpan, err) @@ -115,14 +135,39 @@ func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, if err != nil { return 0, nil, fmt.Errorf("load firecracker restore metadata: %w", err) } + backend := fileSnapshotMemBackend(snapshotPath) + createdUFFDSession := "" + if opts.SnapshotMemoryBackend == hypervisor.SnapshotMemoryBackendUFFD { + if s.uffd == nil { + return 0, nil, fmt.Errorf("uffd snapshot restore requested but no uffd pager is configured") + } + sessionID := strings.TrimSpace(opts.SnapshotMemorySessionID) + if sessionID == "" { + sessionID = filepath.Base(filepath.Dir(socketPath)) + } + resp, err := s.uffd.CreateSession(ctx, uffdpager.CreateSessionRequest{ + SessionID: sessionID, + InstanceID: sessionID, + BackingMemoryPath: snapshotMemoryPath(snapshotPath), + CacheKey: opts.SnapshotMemoryCacheKey, + }) + if err != nil { + return 0, nil, fmt.Errorf("create uffd pager session: %w", err) + } + createdUFFDSession = resp.SessionID + backend = snapshotMemBackend{BackendType: "Uffd", BackendPath: resp.UFFDSocketPath} + } err = func() error { snapshotSourceAliasMu.Lock() defer snapshotSourceAliasMu.Unlock() return withSnapshotSourceDirAlias(meta, filepath.Dir(socketPath), func() error { - return hv.loadSnapshot(ctx, snapshotPath, meta.NetworkOverrides) + return hv.loadSnapshot(ctx, snapshotPath, meta.NetworkOverrides, backend) }) }() if err != nil { + if createdUFFDSession != "" { + _ = s.uffd.CloseSession(context.Background(), createdUFFDSession) + } return 0, nil, fmt.Errorf("load firecracker snapshot: %w", err) } if meta.SnapshotSourceDataDir != "" && !meta.RetainSnapshotSourceDataDirAlias { diff --git a/lib/hypervisor/hypervisor.go b/lib/hypervisor/hypervisor.go index 7331f471..de1cb698 100644 --- a/lib/hypervisor/hypervisor.go +++ b/lib/hypervisor/hypervisor.go @@ -115,7 +115,7 @@ type VMStarter interface { // - Cloud Hypervisor: starts process, calls Restore API // - QEMU: would start with -incoming or -loadvm flags (not yet implemented) // Returns the process ID and a Hypervisor client. The VM is in paused state after restore. - RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (pid int, hv Hypervisor, err error) + RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string, opts RestoreOptions) (pid int, hv Hypervisor, err error) // PrepareFork allows hypervisors to prepare forked instance state. // For snapshot-based forks, implementations can rewrite snapshot config with @@ -124,6 +124,19 @@ type VMStarter interface { PrepareFork(ctx context.Context, req ForkPrepareRequest) (ForkPrepareResult, error) } +type SnapshotMemoryBackend string + +const ( + SnapshotMemoryBackendFile SnapshotMemoryBackend = "file" + SnapshotMemoryBackendUFFD SnapshotMemoryBackend = "uffd" +) + +type RestoreOptions struct { + SnapshotMemoryBackend SnapshotMemoryBackend + SnapshotMemoryCacheKey string + SnapshotMemorySessionID string +} + // ForkNetworkConfig contains network identity fields for fork preparation. type ForkNetworkConfig struct { TAPDevice string diff --git a/lib/hypervisor/qemu/process.go b/lib/hypervisor/qemu/process.go index f13fb222..39780ec4 100644 --- a/lib/hypervisor/qemu/process.go +++ b/lib/hypervisor/qemu/process.go @@ -429,7 +429,7 @@ func shouldRetryWithReducedBalloon(err error) bool { // RestoreVM starts QEMU and restores VM state from a snapshot. // The VM is in paused state after restore; caller should call Resume() to continue execution. -func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) { +func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string, _ hypervisor.RestoreOptions) (int, hypervisor.Hypervisor, error) { log := logger.FromContext(ctx) startTime := time.Now() diff --git a/lib/hypervisor/tracing.go b/lib/hypervisor/tracing.go index 79888302..0ee93ae0 100644 --- a/lib/hypervisor/tracing.go +++ b/lib/hypervisor/tracing.go @@ -296,7 +296,7 @@ func (s *tracingVMStarter) StartVM(ctx context.Context, p *paths.Paths, version return pid, hv, err } -func (s *tracingVMStarter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (pid int, hv Hypervisor, err error) { +func (s *tracingVMStarter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string, opts RestoreOptions) (pid int, hv Hypervisor, err error) { ctx, span := startTraceSpan(ctx, s.tracer, "hypervisor.restore_vm", attribute.String("hypervisor", string(s.hvType)), attribute.String("operation", "restore_vm"), @@ -307,7 +307,7 @@ func (s *tracingVMStarter) RestoreVM(ctx context.Context, p *paths.Paths, versio } finishTraceSpan(span, err) }() - pid, hv, err = s.next.RestoreVM(ctx, p, version, socketPath, snapshotPath) + pid, hv, err = s.next.RestoreVM(ctx, p, version, socketPath, snapshotPath, opts) return pid, hv, err } diff --git a/lib/hypervisor/tracing_test.go b/lib/hypervisor/tracing_test.go index 8d5147c6..409e1bd0 100644 --- a/lib/hypervisor/tracing_test.go +++ b/lib/hypervisor/tracing_test.go @@ -70,7 +70,7 @@ func (s fakeStarter) GetVersion(*paths.Paths) (string, error) { return "test", n func (s fakeStarter) StartVM(context.Context, *paths.Paths, string, string, VMConfig) (int, Hypervisor, error) { return 42, s.returned, nil } -func (s fakeStarter) RestoreVM(context.Context, *paths.Paths, string, string, string) (int, Hypervisor, error) { +func (s fakeStarter) RestoreVM(context.Context, *paths.Paths, string, string, string, RestoreOptions) (int, Hypervisor, error) { return 43, s.returned, nil } func (s fakeStarter) PrepareFork(context.Context, ForkPrepareRequest) (ForkPrepareResult, error) { diff --git a/lib/hypervisor/vz/starter.go b/lib/hypervisor/vz/starter.go index 72e61da1..2cd42a80 100644 --- a/lib/hypervisor/vz/starter.go +++ b/lib/hypervisor/vz/starter.go @@ -122,7 +122,7 @@ func (s *Starter) StartVM(ctx context.Context, p *paths.Paths, version string, s // RestoreVM starts a vz-shim process and restores VM state from a snapshot. // The VM is in paused state after restore; caller should call Resume(). -func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string) (int, hypervisor.Hypervisor, error) { +func (s *Starter) RestoreVM(ctx context.Context, p *paths.Paths, version string, socketPath string, snapshotPath string, _ hypervisor.RestoreOptions) (int, hypervisor.Hypervisor, error) { manifestPath := filepath.Join(snapshotPath, shimconfig.SnapshotManifestFile) manifestData, err := os.ReadFile(manifestPath) if err != nil { diff --git a/lib/instances/delete.go b/lib/instances/delete.go index 283e0b19..90236809 100644 --- a/lib/instances/delete.go +++ b/lib/instances/delete.go @@ -81,6 +81,7 @@ func (m *manager) deleteInstance( log.WarnContext(ctx, "failed to kill hypervisor, continuing with cleanup", "instance_id", id, "error", err) } } + m.closeFirecrackerUFFDSession(ctx, stored) // 6. Release network allocation if inst.NetworkEnabled { diff --git a/lib/instances/firecracker_uffd.go b/lib/instances/firecracker_uffd.go new file mode 100644 index 00000000..4a4dc67a --- /dev/null +++ b/lib/instances/firecracker_uffd.go @@ -0,0 +1,147 @@ +package instances + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "fmt" + "os" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/kernel/hypeman/lib/hypervisor" + "github.com/kernel/hypeman/lib/logger" + "github.com/kernel/hypeman/lib/uffdpager" +) + +func (m *manager) useFirecrackerUFFD(stored *StoredMetadata) bool { + return stored != nil && + stored.HypervisorType == hypervisor.TypeFirecracker && + m.firecrackerSnapshotMemoryBackend == uffdpager.BackendUFFD +} + +func (m *manager) firecrackerSnapshotRestoreOptions(stored *StoredMetadata, snapshotDir string) (hypervisor.RestoreOptions, error) { + opts := hypervisor.RestoreOptions{SnapshotMemoryBackend: hypervisor.SnapshotMemoryBackendFile} + if stored == nil || stored.HypervisorType != hypervisor.TypeFirecracker { + return opts, nil + } + if !m.useFirecrackerUFFD(stored) { + stored.FirecrackerUFFDSessionID = "" + stored.FirecrackerUFFDPagerVersion = "" + return opts, nil + } + if m.firecrackerUFFDPager == nil { + return opts, fmt.Errorf("firecracker uffd snapshot restore is enabled but the pager is not configured") + } + + cacheKey := strings.TrimSpace(stored.FirecrackerSnapshotCacheKey) + if cacheKey == "" { + var err error + cacheKey, err = firecrackerSnapshotCacheKey(stored, snapshotDir) + if err != nil { + return opts, err + } + stored.FirecrackerSnapshotCacheKey = cacheKey + } + stored.FirecrackerUFFDSessionID = stored.Id + stored.FirecrackerUFFDPagerVersion = m.firecrackerUFFDPager.VersionKey() + opts.SnapshotMemoryBackend = hypervisor.SnapshotMemoryBackendUFFD + opts.SnapshotMemoryCacheKey = cacheKey + opts.SnapshotMemorySessionID = stored.FirecrackerUFFDSessionID + return opts, nil +} + +func (m *manager) refreshFirecrackerSnapshotCacheKey(stored *StoredMetadata, snapshotDir string) error { + if stored == nil || stored.HypervisorType != hypervisor.TypeFirecracker { + return nil + } + key, err := firecrackerSnapshotCacheKey(stored, snapshotDir) + if err != nil { + return err + } + stored.FirecrackerSnapshotCacheKey = key + return nil +} + +func (m *manager) closeFirecrackerUFFDSession(ctx context.Context, stored *StoredMetadata) { + if stored == nil || stored.HypervisorType != hypervisor.TypeFirecracker || stored.FirecrackerUFFDSessionID == "" { + return + } + log := logger.FromContext(ctx) + if m.firecrackerUFFDPager == nil { + log.WarnContext(ctx, "cannot close firecracker uffd session; pager is not configured", + "instance_id", stored.Id, + "session_id", stored.FirecrackerUFFDSessionID, + "pager_version", stored.FirecrackerUFFDPagerVersion) + stored.FirecrackerUFFDSessionID = "" + stored.FirecrackerUFFDPagerVersion = "" + return + } + version := stored.FirecrackerUFFDPagerVersion + if version == "" { + version = m.firecrackerUFFDPager.VersionKey() + } + closeCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + if err := m.firecrackerUFFDPager.CloseSessionVersion(closeCtx, version, stored.FirecrackerUFFDSessionID); err != nil { + log.WarnContext(ctx, "failed to close firecracker uffd session", + "instance_id", stored.Id, + "session_id", stored.FirecrackerUFFDSessionID, + "pager_version", version, + "error", err) + } + stored.FirecrackerUFFDSessionID = "" + stored.FirecrackerUFFDPagerVersion = "" +} + +func (m *manager) checkFirecrackerUFFDSessionHealth(ctx context.Context, stored *StoredMetadata) error { + if stored == nil || stored.HypervisorType != hypervisor.TypeFirecracker || stored.FirecrackerUFFDSessionID == "" { + return nil + } + if m.firecrackerUFFDPager == nil { + return fmt.Errorf("firecracker uffd session %s has no configured pager", stored.FirecrackerUFFDSessionID) + } + version := stored.FirecrackerUFFDPagerVersion + if version == "" { + version = m.firecrackerUFFDPager.VersionKey() + } + healthCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) + defer cancel() + if _, err := m.firecrackerUFFDPager.HealthVersion(healthCtx, version); err != nil { + return fmt.Errorf("firecracker uffd pager %s for session %s is unhealthy: %w", version, stored.FirecrackerUFFDSessionID, err) + } + return nil +} + +func firecrackerSnapshotCacheKey(stored *StoredMetadata, snapshotDir string) (string, error) { + memoryInfo, err := os.Stat(filepath.Join(snapshotDir, "memory")) + if err != nil { + return "", fmt.Errorf("stat firecracker snapshot memory for uffd cache key: %w", err) + } + stateInfo, err := os.Stat(filepath.Join(snapshotDir, "state")) + if err != nil { + return "", fmt.Errorf("stat firecracker snapshot state for uffd cache key: %w", err) + } + sum := sha256.Sum256([]byte(fmt.Sprintf( + "%s:%s:%d:%d:%d:%d", + stored.Id, + fileStatFingerprint(memoryInfo), + memoryInfo.Size(), + memoryInfo.ModTime().UnixNano(), + stateInfo.Size(), + stateInfo.ModTime().UnixNano(), + ))) + return hex.EncodeToString(sum[:])[:24], nil +} + +func fileStatFingerprint(info os.FileInfo) string { + if info == nil { + return "" + } + if stat, ok := info.Sys().(*syscall.Stat_t); ok { + return fmt.Sprintf("%s:%d:%d:%d", info.Name(), info.Mode(), stat.Dev, stat.Ino) + } + return fmt.Sprintf("%s:%d", info.Name(), info.Mode()) +} diff --git a/lib/instances/fork.go b/lib/instances/fork.go index 1d4f72a8..ccea12db 100644 --- a/lib/instances/fork.go +++ b/lib/instances/fork.go @@ -291,6 +291,11 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin forkMeta.ExitCode = nil forkMeta.ExitMessage = "" forkMeta.RestartStatus = restartpolicy.Status{} + forkMeta.FirecrackerUFFDSessionID = "" + forkMeta.FirecrackerUFFDPagerVersion = "" + if source.State != StateStandby { + forkMeta.FirecrackerSnapshotCacheKey = "" + } // Forks are new instances; phase accounting must not inherit the source's // cumulative durations. The first transition into the fork's runtime // phase (Standby for snapshot forks, Stopped for stopped forks) will be diff --git a/lib/instances/hypervisor_darwin.go b/lib/instances/hypervisor_darwin.go index 183a928e..1cd97abe 100644 --- a/lib/instances/hypervisor_darwin.go +++ b/lib/instances/hypervisor_darwin.go @@ -3,10 +3,15 @@ package instances import ( + "context" + "fmt" + "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/hypervisor/cloudhypervisor" "github.com/kernel/hypeman/lib/hypervisor/qemu" "github.com/kernel/hypeman/lib/hypervisor/vz" + "github.com/kernel/hypeman/lib/paths" + "github.com/kernel/hypeman/lib/uffdpager" ) func init() { @@ -14,3 +19,10 @@ func init() { platformStarters[hypervisor.TypeQEMU] = qemu.NewStarter() platformStarters[hypervisor.TypeVZ] = vz.NewStarter() } + +func configurePlatformStarters(_ context.Context, _ *paths.Paths, _ map[hypervisor.Type]hypervisor.VMStarter, cfg ManagerConfig) (*uffdpager.Supervisor, error) { + if cfg.FirecrackerSnapshotMemoryBackend == uffdpager.BackendUFFD { + return nil, fmt.Errorf("firecracker uffd snapshot restore is only supported on linux") + } + return nil, nil +} diff --git a/lib/instances/hypervisor_linux.go b/lib/instances/hypervisor_linux.go index 3e2269f0..149f1718 100644 --- a/lib/instances/hypervisor_linux.go +++ b/lib/instances/hypervisor_linux.go @@ -3,10 +3,15 @@ package instances import ( + "context" + "fmt" + "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/hypervisor/cloudhypervisor" "github.com/kernel/hypeman/lib/hypervisor/firecracker" "github.com/kernel/hypeman/lib/hypervisor/qemu" + "github.com/kernel/hypeman/lib/paths" + "github.com/kernel/hypeman/lib/uffdpager" ) func init() { @@ -14,3 +19,15 @@ func init() { platformStarters[hypervisor.TypeFirecracker] = firecracker.NewStarter() platformStarters[hypervisor.TypeQEMU] = qemu.NewStarter() } + +func configurePlatformStarters(ctx context.Context, p *paths.Paths, starters map[hypervisor.Type]hypervisor.VMStarter, cfg ManagerConfig) (*uffdpager.Supervisor, error) { + if cfg.FirecrackerSnapshotMemoryBackend != uffdpager.BackendUFFD { + return nil, nil + } + pager, err := uffdpager.NewSupervisor(ctx, p.DataDir(), cfg.FirecrackerUFFDCacheMaxBytes) + if err != nil { + return nil, fmt.Errorf("start firecracker uffd pager: %w", err) + } + starters[hypervisor.TypeFirecracker] = firecracker.NewStarter(firecracker.WithUFFDClient(pager)) + return pager, nil +} diff --git a/lib/instances/manager.go b/lib/instances/manager.go index 0ded26ea..fb203965 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "os" + "strings" "sync" "time" @@ -17,6 +18,7 @@ import ( "github.com/kernel/hypeman/lib/paths" "github.com/kernel/hypeman/lib/resources" "github.com/kernel/hypeman/lib/system" + "github.com/kernel/hypeman/lib/uffdpager" "github.com/kernel/hypeman/lib/volumes" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/metric" @@ -81,7 +83,9 @@ type ResourceLimits struct { // ManagerConfig holds non-resource manager behavior settings. type ManagerConfig struct { - LifecycleEventBufferSize int + LifecycleEventBufferSize int + FirecrackerSnapshotMemoryBackend string + FirecrackerUFFDCacheMaxBytes int64 } // Normalize applies defaults to manager config values. @@ -89,6 +93,13 @@ func (c ManagerConfig) Normalize() ManagerConfig { if c.LifecycleEventBufferSize <= 0 { c.LifecycleEventBufferSize = defaultLifecycleEventBufferSize } + c.FirecrackerSnapshotMemoryBackend = strings.ToLower(strings.TrimSpace(c.FirecrackerSnapshotMemoryBackend)) + if c.FirecrackerSnapshotMemoryBackend == "" { + c.FirecrackerSnapshotMemoryBackend = uffdpager.BackendFile + } + if c.FirecrackerUFFDCacheMaxBytes <= 0 { + c.FirecrackerUFFDCacheMaxBytes = 4 << 30 + } return c } @@ -149,9 +160,11 @@ type manager struct { tapGCOnce sync.Once // Hypervisor support - vmStarters map[hypervisor.Type]hypervisor.VMStarter - defaultHypervisor hypervisor.Type // Default hypervisor type when not specified in request - guestMemoryPolicy guestmemory.Policy + vmStarters map[hypervisor.Type]hypervisor.VMStarter + defaultHypervisor hypervisor.Type // Default hypervisor type when not specified in request + guestMemoryPolicy guestmemory.Policy + firecrackerSnapshotMemoryBackend string + firecrackerUFFDPager *uffdpager.Supervisor } // platformStarters is populated by platform-specific init functions. @@ -166,6 +179,16 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste // NewManagerWithConfig creates a new instances manager with additional manager settings. func NewManagerWithConfig(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, deviceManager devices.Manager, volumeManager volumes.Manager, limits ResourceLimits, defaultHypervisor hypervisor.Type, snapshotDefaults SnapshotPolicy, managerConfig ManagerConfig, meter metric.Meter, tracer trace.Tracer, memoryPolicy ...guestmemory.Policy) Manager { + m, err := NewManagerWithConfigE(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, defaultHypervisor, snapshotDefaults, managerConfig, meter, tracer, memoryPolicy...) + if err != nil { + panic(err) + } + return m +} + +// NewManagerWithConfigE creates a new instances manager and returns startup +// errors for optional host services such as the Firecracker UFFD pager. +func NewManagerWithConfigE(p *paths.Paths, imageManager images.Manager, systemManager system.Manager, networkManager network.Manager, deviceManager devices.Manager, volumeManager volumes.Manager, limits ResourceLimits, defaultHypervisor hypervisor.Type, snapshotDefaults SnapshotPolicy, managerConfig ManagerConfig, meter metric.Meter, tracer trace.Tracer, memoryPolicy ...guestmemory.Policy) (Manager, error) { // Validate and default the hypervisor type if defaultHypervisor == "" { defaultHypervisor = hypervisor.TypeCloudHypervisor @@ -179,34 +202,44 @@ func NewManagerWithConfig(p *paths.Paths, imageManager images.Manager, systemMan managerConfig = managerConfig.Normalize() // Initialize VM starters from platform-specific init functions - vmStarters := make(map[hypervisor.Type]hypervisor.VMStarter, len(platformStarters)) + rawStarters := make(map[hypervisor.Type]hypervisor.VMStarter, len(platformStarters)) for hvType, starter := range platformStarters { + rawStarters[hvType] = starter + } + firecrackerUFFDPager, err := configurePlatformStarters(context.Background(), p, rawStarters, managerConfig) + if err != nil { + return nil, err + } + vmStarters := make(map[hypervisor.Type]hypervisor.VMStarter, len(rawStarters)) + for hvType, starter := range rawStarters { vmStarters[hvType] = hypervisor.WrapVMStarter(hvType, starter) } m := &manager{ - paths: p, - imageManager: imageManager, - systemManager: systemManager, - networkManager: networkManager, - deviceManager: deviceManager, - volumeManager: volumeManager, - limits: limits, - instanceLocks: sync.Map{}, - bootMarkerScans: sync.Map{}, - hostTopology: detectHostTopology(), // Detect and cache host topology - vmStarters: vmStarters, - defaultHypervisor: defaultHypervisor, - now: time.Now, - writeFile: os.WriteFile, - meter: meter, - tracer: tracer, - guestMemoryPolicy: policy, - snapshotDefaults: snapshotDefaults, - compressionJobs: make(map[string]*compressionJob), - nativeCodecPaths: make(map[string]string), - lifecycleEvents: newLifecycleSubscribersWithBufferSize(managerConfig.LifecycleEventBufferSize), - guestAgentReadyProbe: probeGuestAgentReady, + paths: p, + imageManager: imageManager, + systemManager: systemManager, + networkManager: networkManager, + deviceManager: deviceManager, + volumeManager: volumeManager, + limits: limits, + instanceLocks: sync.Map{}, + bootMarkerScans: sync.Map{}, + hostTopology: detectHostTopology(), // Detect and cache host topology + vmStarters: vmStarters, + defaultHypervisor: defaultHypervisor, + now: time.Now, + writeFile: os.WriteFile, + meter: meter, + tracer: tracer, + guestMemoryPolicy: policy, + firecrackerSnapshotMemoryBackend: managerConfig.FirecrackerSnapshotMemoryBackend, + firecrackerUFFDPager: firecrackerUFFDPager, + snapshotDefaults: snapshotDefaults, + compressionJobs: make(map[string]*compressionJob), + nativeCodecPaths: make(map[string]string), + lifecycleEvents: newLifecycleSubscribersWithBufferSize(managerConfig.LifecycleEventBufferSize), + guestAgentReadyProbe: probeGuestAgentReady, } m.deleteSnapshotFn = m.deleteSnapshot @@ -224,7 +257,7 @@ func NewManagerWithConfig(p *paths.Paths, imageManager images.Manager, systemMan logger.FromContext(context.Background()).WarnContext(context.Background(), "failed to recover pending standby compression jobs", "error", err) } - return m + return m, nil } // SetResourceValidator sets the resource validator for aggregate limit checking. diff --git a/lib/instances/query.go b/lib/instances/query.go index 99d67d48..45dc13fc 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -83,6 +83,14 @@ func (m *manager) deriveStateWithOptions(ctx context.Context, stored *StoredMeta } return stateResult{State: StateStopped} } + if err := m.checkFirecrackerUFFDSessionHealth(ctx, stored); err != nil { + errMsg := err.Error() + log.WarnContext(ctx, "firecracker uffd session is unhealthy", + "instance_id", stored.Id, + "error", err, + ) + return stateResult{State: StateUnknown, Error: &errMsg} + } // 2. Socket exists - resolve hypervisor state, preferring the in-memory // cache (populated by lifecycle events and prior queries) and falling diff --git a/lib/instances/restore.go b/lib/instances/restore.go index d2db078c..e7afb35b 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -248,6 +248,12 @@ func (m *manager) restoreInstance( proxyRegistered = true } + restoreOptions, err := m.firecrackerSnapshotRestoreOptions(stored, snapshotDir) + if err != nil { + releaseNetwork() + return nil, fmt.Errorf("configure snapshot memory backend: %w", err) + } + // 5. Transition: Standby → Paused (start hypervisor + restore) restoreCtx, restoreSpanEnd := m.startLifecycleStep(ctx, "restore_from_snapshot", attribute.String("instance_id", id), @@ -255,10 +261,11 @@ func (m *manager) restoreInstance( attribute.String("operation", "restore_from_snapshot"), ) log.InfoContext(ctx, "restoring from snapshot", "instance_id", id, "snapshot_dir", snapshotDir, "hypervisor", stored.HypervisorType) - pid, hv, err := m.restoreFromSnapshot(restoreCtx, stored, snapshotDir) + pid, hv, err := m.restoreFromSnapshot(restoreCtx, stored, snapshotDir, restoreOptions) restoreSpanEnd(err) if err != nil { log.ErrorContext(ctx, "failed to restore from snapshot", "instance_id", id, "error", err) + m.closeFirecrackerUFFDSession(ctx, stored) // Cleanup network on failure releaseNetwork() return nil, err @@ -279,6 +286,7 @@ func (m *manager) restoreInstance( log.ErrorContext(ctx, "failed to resume VM", "instance_id", id, "error", err) // Cleanup on failure hv.Shutdown(ctx) + m.closeFirecrackerUFFDSession(ctx, stored) releaseNetwork() return nil, fmt.Errorf("resume vm failed: %w", err) } @@ -378,6 +386,7 @@ func (m *manager) restoreFromSnapshot( ctx context.Context, stored *StoredMetadata, snapshotDir string, + opts hypervisor.RestoreOptions, ) (int, hypervisor.Hypervisor, error) { log := logger.FromContext(ctx) @@ -389,7 +398,7 @@ func (m *manager) restoreFromSnapshot( // Restore VM from snapshot (handles process start + restore) log.DebugContext(ctx, "restoring VM from snapshot", "instance_id", stored.Id, "hypervisor", stored.HypervisorType, "version", stored.HypervisorVersion, "snapshot_dir", snapshotDir) - pid, hv, err := starter.RestoreVM(ctx, m.paths, stored.HypervisorVersion, stored.SocketPath, snapshotDir) + pid, hv, err := starter.RestoreVM(ctx, m.paths, stored.HypervisorVersion, stored.SocketPath, snapshotDir, opts) if err != nil { return 0, nil, fmt.Errorf("restore vm: %w", err) } diff --git a/lib/instances/snapshot.go b/lib/instances/snapshot.go index 05d8084e..eac2d008 100644 --- a/lib/instances/snapshot.go +++ b/lib/instances/snapshot.go @@ -450,6 +450,11 @@ func (m *manager) forkSnapshot(ctx context.Context, snapshotID string, req ForkS forkMeta.ExitCode = nil forkMeta.ExitMessage = "" forkMeta.RestartStatus = restartpolicy.Status{} + forkMeta.FirecrackerUFFDSessionID = "" + forkMeta.FirecrackerUFFDPagerVersion = "" + if rec.Snapshot.Kind != SnapshotKindStandby { + forkMeta.FirecrackerSnapshotCacheKey = "" + } if rec.Snapshot.Kind == SnapshotKindStandby { forkMeta.VsockCID = rec.StoredMetadata.VsockCID } else { diff --git a/lib/instances/standby.go b/lib/instances/standby.go index 03b27410..33328600 100644 --- a/lib/instances/standby.go +++ b/lib/instances/standby.go @@ -190,6 +190,7 @@ func (m *manager) standbyInstance( if dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID); err == nil { guest.CloseConn(dialer.Key()) } + m.closeFirecrackerUFFDSession(ctx, stored) // 9. Release network allocation (delete TAP device) // TAP devices with explicit Owner/Group fields do NOT auto-delete when VMM exits @@ -216,6 +217,9 @@ func (m *manager) standbyInstance( stored.StoppedAt = &now stored.HypervisorPID = nil stored.PendingStandbyCompression = nil + if err := m.refreshFirecrackerSnapshotCacheKey(stored, snapshotDir); err != nil { + log.WarnContext(ctx, "failed to refresh firecracker snapshot cache key", "instance_id", id, "error", err) + } if compressionPolicy != nil { stored.PendingStandbyCompression = &PendingStandbyCompression{ Policy: *cloneCompressionConfig(compressionPolicy), diff --git a/lib/instances/stop.go b/lib/instances/stop.go index 3613323b..8c128351 100644 --- a/lib/instances/stop.go +++ b/lib/instances/stop.go @@ -282,6 +282,7 @@ func (m *manager) stopInstance( _ = os.Remove(match) } } + m.closeFirecrackerUFFDSession(ctx, stored) // 9. Ensure terminal stop semantics: no snapshot should remain in Stopped state. // This prevents stale snapshot directories from deriving state as Standby and @@ -305,6 +306,7 @@ func (m *manager) stopInstance( // Boot markers are per-boot-run and must not carry across stop/restore/start. stored.ProgramStartedAt = nil stored.GuestAgentReadyAt = nil + stored.FirecrackerSnapshotCacheKey = "" stored.Phases.Record(phasetracking.PhaseStopped, now) meta = &metadata{StoredMetadata: *stored} diff --git a/lib/instances/types.go b/lib/instances/types.go index 8c031fc9..0603beba 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -117,6 +117,11 @@ type StoredMetadata struct { HypervisorVersion string // Hypervisor version (e.g., "v51.1") HypervisorPID *int // Hypervisor process ID (may be stale after host restart) + // Firecracker UFFD snapshot restore metadata. + FirecrackerSnapshotCacheKey string + FirecrackerUFFDSessionID string + FirecrackerUFFDPagerVersion string + // Paths SocketPath string // Path to API socket DataDir string // Instance data directory diff --git a/lib/paths/paths.go b/lib/paths/paths.go index adc070c4..7f22bf6b 100644 --- a/lib/paths/paths.go +++ b/lib/paths/paths.go @@ -103,6 +103,31 @@ func (p *Paths) FirecrackerBinary(version, arch string) string { return filepath.Join(p.dataDir, "system", "binaries", "firecracker", version, arch, "firecracker") } +// UFFDDir returns the root directory for Firecracker UFFD pager state. +func (p *Paths) UFFDDir() string { + return filepath.Join(p.dataDir, "uffd") +} + +func (p *Paths) UFFDPagerDir(versionKey string) string { + return filepath.Join(p.UFFDDir(), versionKey) +} + +func (p *Paths) UFFDControlSocket(versionKey string) string { + return filepath.Join(p.UFFDPagerDir(versionKey), "control.sock") +} + +func (p *Paths) UFFDPagerPID(versionKey string) string { + return filepath.Join(p.UFFDPagerDir(versionKey), "pager.pid") +} + +func (p *Paths) UFFDPagerLog(versionKey string) string { + return filepath.Join(p.UFFDPagerDir(versionKey), "pager.log") +} + +func (p *Paths) UFFDSessionsDir(versionKey string) string { + return filepath.Join(p.UFFDPagerDir(versionKey), "sessions") +} + // Image path methods // ImageDigestDir returns the directory for a specific image digest. diff --git a/lib/providers/providers.go b/lib/providers/providers.go index 72b8a426..33a909d5 100644 --- a/lib/providers/providers.go +++ b/lib/providers/providers.go @@ -138,10 +138,16 @@ func ProvideInstanceManager(p *paths.Paths, cfg *config.Config, imageManager ima ReclaimEnabled: cfg.Hypervisor.Memory.ReclaimEnabled, VZBalloonRequired: cfg.Hypervisor.Memory.VZBalloonRequired, } + var firecrackerUFFDCacheMaxBytes datasize.ByteSize + if err := firecrackerUFFDCacheMaxBytes.UnmarshalText([]byte(cfg.Hypervisor.FirecrackerUFFDCacheMaxBytes)); err != nil { + return nil, fmt.Errorf("failed to parse hypervisor.firecracker_uffd_cache_max_bytes %q: %w", cfg.Hypervisor.FirecrackerUFFDCacheMaxBytes, err) + } managerConfig := instances.ManagerConfig{ - LifecycleEventBufferSize: cfg.Instances.LifecycleEventBufferSize, + LifecycleEventBufferSize: cfg.Instances.LifecycleEventBufferSize, + FirecrackerSnapshotMemoryBackend: cfg.Hypervisor.FirecrackerSnapshotMemoryBackend, + FirecrackerUFFDCacheMaxBytes: int64(firecrackerUFFDCacheMaxBytes), } - return instances.NewManagerWithConfig(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, defaultHypervisor, snapshotDefaults, managerConfig, meter, tracer, memoryPolicy), nil + return instances.NewManagerWithConfigE(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, defaultHypervisor, snapshotDefaults, managerConfig, meter, tracer, memoryPolicy) } func snapshotDefaultsFromConfig(cfg *config.Config) instances.SnapshotPolicy { diff --git a/lib/uffdpager/README.md b/lib/uffdpager/README.md new file mode 100644 index 00000000..88809cf3 --- /dev/null +++ b/lib/uffdpager/README.md @@ -0,0 +1,26 @@ +# UFFD Snapshot Pager + +The UFFD pager lets Firecracker restore snapshot memory lazily. A restored VM +gets a per-session UFFD socket, and page faults are served from the snapshot +memory file by a local pager process. + +The pager is local to the host. It is not backed by Redis or another external +cache because page faults are latency-sensitive and the kernel-facing UFFD +socket is local. The process keeps one shared in-memory page cache, bounded by +`hypervisor.firecracker_uffd_cache_max_bytes`. Cache entries are keyed by a +snapshot cache key plus page offset, so multiple restore sessions from the same +snapshot can reuse hot pages without starting one pager per snapshot. + +UFFD is opt-in through `hypervisor.firecracker_snapshot_memory_backend=uffd`. +The default backend remains `file`. Enabling UFFD does not change already +running VMs; it only changes future Firecracker snapshot restores. If a restore +uses UFFD, the VM is pinned to the pager session created for that restore until +the instance stops, is deleted, or otherwise closes the session. + +The pager version is declared in `lib/uffdpager/VERSION`. Installed Linux hosts +run versioned systemd units named `hypeman-uffd@.service`. Hypeman +connects to the pager version from `VERSION`, so regular Hypeman releases do not +start a new pager unless the UFFD pager version changes. Older pager versions +are drained: they reject new sessions but continue serving existing sessions +until those sessions close. Systemd runs the pager through the dedicated +`hypeman-uffd-pager` binary rather than an alternate API-server mode. diff --git a/lib/uffdpager/VERSION b/lib/uffdpager/VERSION new file mode 100644 index 00000000..17e51c38 --- /dev/null +++ b/lib/uffdpager/VERSION @@ -0,0 +1 @@ +0.1.1 diff --git a/lib/uffdpager/cache.go b/lib/uffdpager/cache.go new file mode 100644 index 00000000..91b0345d --- /dev/null +++ b/lib/uffdpager/cache.go @@ -0,0 +1,247 @@ +package uffdpager + +import ( + "container/list" + "sync" + "sync/atomic" + "time" +) + +type pageKey struct { + cacheKey string + offset int64 + size int +} + +type cacheEntry struct { + key pageKey + data []byte +} + +type PageCache struct { + shards []*pageCacheShard +} + +type pageCacheShard struct { + mu sync.RWMutex + maxBytes int64 + bytes int64 + items map[pageKey]*list.Element + lru *list.List + + hits atomic.Int64 + misses atomic.Int64 + + lookupNanos atomic.Int64 + lookupMaxNanos atomic.Int64 + addNanos atomic.Int64 + addMaxNanos atomic.Int64 +} + +func NewPageCache(maxBytes int64) *PageCache { + maxBytes = normalizeCacheMaxBytes(maxBytes) + shardCount := pageCacheShardCount(maxBytes) + shards := make([]*pageCacheShard, shardCount) + shardMax := maxBytes / int64(shardCount) + if shardMax <= 0 { + shardMax = maxBytes + } + for i := range shards { + shards[i] = &pageCacheShard{ + maxBytes: shardMax, + items: make(map[pageKey]*list.Element), + lru: list.New(), + } + } + return &PageCache{ + shards: shards, + } +} + +func (c *PageCache) Get(cacheKey string, offset int64, size int) ([]byte, bool) { + key := pageKey{cacheKey: cacheKey, offset: offset, size: size} + data, ok := c.lookup(key, true) + if !ok { + return nil, false + } + return append([]byte(nil), data...), true +} + +// Borrow returns the immutable cached page without copying or touching LRU state. +func (c *PageCache) Borrow(cacheKey string, offset int64, size int) ([]byte, bool) { + return c.lookup(pageKey{cacheKey: cacheKey, offset: offset, size: size}, false) +} + +func (c *PageCache) lookup(key pageKey, touch bool) ([]byte, bool) { + start := time.Now() + shard := c.shardFor(key) + + if !touch { + shard.mu.RLock() + elem, ok := shard.items[key] + if !ok { + shard.mu.RUnlock() + shard.misses.Add(1) + shard.recordLookupDuration(time.Since(start)) + return nil, false + } + data := elem.Value.(*cacheEntry).data + shard.mu.RUnlock() + shard.hits.Add(1) + shard.recordLookupDuration(time.Since(start)) + return data, true + } + + shard.mu.Lock() + elem, ok := shard.items[key] + if !ok { + shard.mu.Unlock() + shard.misses.Add(1) + shard.recordLookupDuration(time.Since(start)) + return nil, false + } + shard.lru.MoveToFront(elem) + entry := elem.Value.(*cacheEntry) + data := entry.data + shard.mu.Unlock() + shard.hits.Add(1) + shard.recordLookupDuration(time.Since(start)) + return data, true +} + +func (c *PageCache) Add(cacheKey string, offset int64, data []byte) { + if len(data) == 0 { + return + } + start := time.Now() + key := pageKey{cacheKey: cacheKey, offset: offset, size: len(data)} + value := append([]byte(nil), data...) + shard := c.shardFor(key) + + shard.mu.Lock() + defer shard.mu.Unlock() + defer func() { + shard.recordAddDuration(time.Since(start)) + }() + + if elem, ok := shard.items[key]; ok { + entry := elem.Value.(*cacheEntry) + shard.bytes -= int64(len(entry.data)) + entry.data = value + shard.bytes += int64(len(entry.data)) + shard.lru.MoveToFront(elem) + shard.evictLocked() + return + } + + elem := shard.lru.PushFront(&cacheEntry{key: key, data: value}) + shard.items[key] = elem + shard.bytes += int64(len(value)) + shard.evictLocked() +} + +func (c *PageCache) SnapshotStats() (bytes, maxBytes int64, items int, hits, misses int64) { + for _, shard := range c.shards { + shard.mu.Lock() + bytes += shard.bytes + maxBytes += shard.maxBytes + items += len(shard.items) + shard.mu.Unlock() + hits += shard.hits.Load() + misses += shard.misses.Load() + } + return bytes, maxBytes, items, hits, misses +} + +func (c *PageCache) SnapshotTimingStats() (shards int, lookupNanos, lookupMaxNanos, addNanos, addMaxNanos int64) { + for _, shard := range c.shards { + shards++ + lookupNanos += shard.lookupNanos.Load() + if max := shard.lookupMaxNanos.Load(); max > lookupMaxNanos { + lookupMaxNanos = max + } + addNanos += shard.addNanos.Load() + if max := shard.addMaxNanos.Load(); max > addMaxNanos { + addMaxNanos = max + } + } + return shards, lookupNanos, lookupMaxNanos, addNanos, addMaxNanos +} + +func (c *PageCache) shardFor(key pageKey) *pageCacheShard { + if len(c.shards) == 1 { + return c.shards[0] + } + return c.shards[int(hashPageKey(key)%uint64(len(c.shards)))] +} + +func (c *pageCacheShard) evictLocked() { + for c.bytes > c.maxBytes && c.lru.Len() > 0 { + elem := c.lru.Back() + entry := elem.Value.(*cacheEntry) + delete(c.items, entry.key) + c.bytes -= int64(len(entry.data)) + c.lru.Remove(elem) + } +} + +func (c *pageCacheShard) recordLookupDuration(duration time.Duration) { + nanos := duration.Nanoseconds() + c.lookupNanos.Add(nanos) + atomicMaxCacheInt64(&c.lookupMaxNanos, nanos) +} + +func (c *pageCacheShard) recordAddDuration(duration time.Duration) { + nanos := duration.Nanoseconds() + c.addNanos.Add(nanos) + atomicMaxCacheInt64(&c.addMaxNanos, nanos) +} + +func pageCacheShardCount(maxBytes int64) int { + const targetShardBytes = int64(64 << 20) + count := int(maxBytes / targetShardBytes) + if count < 1 { + return 1 + } + if count > 64 { + return 64 + } + return count +} + +func hashPageKey(key pageKey) uint64 { + const ( + offset64 = 14695981039346656037 + prime64 = 1099511628211 + ) + hash := uint64(offset64) + for i := 0; i < len(key.cacheKey); i++ { + hash ^= uint64(key.cacheKey[i]) + hash *= prime64 + } + hash ^= uint64(key.offset) + hash *= prime64 + hash ^= uint64(key.offset >> 32) + hash *= prime64 + hash ^= uint64(key.size) + hash *= prime64 + return mixPageHash(hash) +} + +func mixPageHash(hash uint64) uint64 { + hash ^= hash >> 33 + hash *= 0xff51afd7ed558ccd + hash ^= hash >> 33 + hash *= 0xc4ceb9fe1a85ec53 + hash ^= hash >> 33 + return hash +} + +func atomicMaxCacheInt64(target *atomic.Int64, candidate int64) { + for { + current := target.Load() + if candidate <= current || target.CompareAndSwap(current, candidate) { + return + } + } +} diff --git a/lib/uffdpager/cache_test.go b/lib/uffdpager/cache_test.go new file mode 100644 index 00000000..80fdc4ed --- /dev/null +++ b/lib/uffdpager/cache_test.go @@ -0,0 +1,67 @@ +package uffdpager + +import ( + "bytes" + "testing" +) + +func TestPageCacheSharesPagesByCacheKeyAndOffset(t *testing.T) { + cache := NewPageCache(8192) + page := bytes.Repeat([]byte{7}, 4096) + + cache.Add("snapshot-a", 0, page) + got, ok := cache.Get("snapshot-a", 0, 4096) + if !ok { + t.Fatalf("expected cache hit") + } + if !bytes.Equal(got, page) { + t.Fatalf("cached page mismatch") + } + + got[0] = 1 + again, ok := cache.Get("snapshot-a", 0, 4096) + if !ok { + t.Fatalf("expected second cache hit") + } + if again[0] != 7 { + t.Fatalf("cache returned mutable backing slice") + } +} + +func TestPageCacheEvictsLRUWhenBounded(t *testing.T) { + cache := NewPageCache(8192) + cache.Add("snapshot-a", 0, bytes.Repeat([]byte{1}, 4096)) + cache.Add("snapshot-a", 4096, bytes.Repeat([]byte{2}, 4096)) + if _, ok := cache.Get("snapshot-a", 0, 4096); !ok { + t.Fatalf("expected first page before eviction") + } + + cache.Add("snapshot-a", 8192, bytes.Repeat([]byte{3}, 4096)) + if _, ok := cache.Get("snapshot-a", 4096, 4096); ok { + t.Fatalf("expected least recently used page to be evicted") + } + if _, ok := cache.Get("snapshot-a", 0, 4096); !ok { + t.Fatalf("expected recently used page to remain") + } +} + +func TestPageCacheDistributesAlignedPagesAcrossShards(t *testing.T) { + cache := NewPageCache(4 << 30) + page := bytes.Repeat([]byte{1}, 4096) + + for i := range 4096 { + cache.Add("snapshot-a", int64(i*4096), page) + } + + usedShards := 0 + for _, shard := range cache.shards { + shard.mu.Lock() + if len(shard.items) > 0 { + usedShards++ + } + shard.mu.Unlock() + } + if usedShards < len(cache.shards)/2 { + t.Fatalf("expected aligned pages to spread across shards, used %d of %d", usedShards, len(cache.shards)) + } +} diff --git a/lib/uffdpager/server_faults_linux.go b/lib/uffdpager/server_faults_linux.go new file mode 100644 index 00000000..f0b47c7c --- /dev/null +++ b/lib/uffdpager/server_faults_linux.go @@ -0,0 +1,341 @@ +//go:build linux + +package uffdpager + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "log" + "net" + "os" + "sync/atomic" + "syscall" + "time" + "unsafe" + + "golang.org/x/sys/unix" +) + +const ( + uffdEventPagefault = 0x12 + uffdEventRemove = 0x15 + uffdEventUnmap = 0x16 + uffdioCopy = 0xc028aa03 + uffdMsgSize = 32 +) + +type guestRegionUffdMapping struct { + BaseHostVirtAddr uint64 `json:"base_host_virt_addr"` + Size uint64 `json:"size"` + Offset uint64 `json:"offset"` + PageSize uint64 `json:"page_size"` + PageSizeKiB uint64 `json:"page_size_kib,omitempty"` +} + +type uffdioCopyArgs struct { + dst uint64 + src uint64 + len uint64 + mode uint64 + copy int64 +} + +type uffdEvent struct { + kind byte + addr int64 +} + +func (s *session) handleFaults(mappings []guestRegionUffdMapping) { + fd := s.uffdFD + _ = unix.SetNonblock(fd, true) + pollFDs := []unix.PollFd{{Fd: int32(fd), Events: unix.POLLIN}} + buf := make([]byte, uffdMsgSize) + var deferred []uffdEvent + for { + if len(deferred) == 0 { + n, err := unix.Poll(pollFDs, -1) + if err != nil { + if err == unix.EINTR { + continue + } + return + } + if n == 0 || pollFDs[0].Revents&unix.POLLIN == 0 { + if pollFDs[0].Revents&(unix.POLLHUP|unix.POLLERR|unix.POLLNVAL) != 0 { + return + } + continue + } + } + + events := deferred + deferred = nil + for { + event, ok, err := readUFFDEvent(fd, buf) + if err != nil { + log.Printf("uffd session %s read uffd event: %v", s.id, err) + return + } + if !ok { + break + } + events = append(events, event) + } + + for _, event := range events { + switch event.kind { + case uffdEventPagefault: + if err := s.servePageFault(mappings, event.addr); err != nil { + if errors.Is(err, unix.EAGAIN) { + deferred = append(deferred, event) + continue + } + s.server.copyErrors.Add(1) + log.Printf("uffd session %s page fault at %#x: %v", s.id, event.addr, err) + } + case uffdEventRemove, uffdEventUnmap: + // Reading remove/unmap events clears the pending state that can + // make UFFDIO_COPY return EAGAIN. + default: + log.Printf("uffd session %s ignoring unexpected uffd event %#x", s.id, event.kind) + } + } + + if len(deferred) > 0 { + time.Sleep(time.Millisecond) + } + } +} + +func readUFFDEvent(fd int, buf []byte) (uffdEvent, bool, error) { + read, err := unix.Read(fd, buf) + if err != nil { + if err == unix.EINTR { + return uffdEvent{}, false, nil + } + if err == unix.EAGAIN || err == unix.EWOULDBLOCK { + return uffdEvent{}, false, nil + } + return uffdEvent{}, false, err + } + if read == 0 { + return uffdEvent{}, false, io.EOF + } + if read < 32 { + return uffdEvent{}, false, fmt.Errorf("short uffd event read: %d bytes", read) + } + event := uffdEvent{kind: buf[0]} + if event.kind == uffdEventPagefault { + event.addr = int64(nativeEndian.Uint64(buf[16:24])) + } + return event, true, nil +} + +func (s *session) servePageFault(mappings []guestRegionUffdMapping, faultAddr int64) error { + start := time.Now() + active := s.server.activeFaults.Add(1) + atomicMaxInt64(&s.server.maxConcurrentFaults, active) + defer func() { + s.server.activeFaults.Add(-1) + nanos := time.Since(start).Nanoseconds() + s.server.faultNanos.Add(nanos) + atomicMaxInt64(&s.server.faultMaxNanos, nanos) + }() + + mapping, pageAddr, pageOffset, pageSize, ok := findMapping(mappings, faultAddr) + if !ok { + return fmt.Errorf("fault address %#x outside guest mappings", faultAddr) + } + _ = mapping + + s.server.faults.Add(1) + page, err := s.readPage(pageOffset, pageSize) + if err != nil { + return err + } + copyStart := time.Now() + if err := uffdCopy(s.uffdFD, uint64(pageAddr), page); err != nil { + nanos := time.Since(copyStart).Nanoseconds() + s.server.copyNanos.Add(nanos) + atomicMaxInt64(&s.server.copyMaxNanos, nanos) + return err + } + nanos := time.Since(copyStart).Nanoseconds() + s.server.copyNanos.Add(nanos) + atomicMaxInt64(&s.server.copyMaxNanos, nanos) + s.server.copies.Add(1) + return nil +} + +func (s *session) readPage(offset int64, size int) ([]byte, error) { + start := time.Now() + defer func() { + nanos := time.Since(start).Nanoseconds() + s.server.readPageNanos.Add(nanos) + atomicMaxInt64(&s.server.readPageMaxNanos, nanos) + }() + + if page, ok := s.server.cache.Borrow(s.cacheKey, offset, size); ok { + return page, nil + } + + page := make([]byte, size) + readStart := time.Now() + n, err := s.backingFile.ReadAt(page, offset) + readNanos := time.Since(readStart).Nanoseconds() + s.server.backingReadNanos.Add(readNanos) + atomicMaxInt64(&s.server.backingReadMaxNanos, readNanos) + if err != nil && !errors.Is(err, io.EOF) { + return nil, fmt.Errorf("read backing page at %d: %w", offset, err) + } + if n > 0 { + s.server.backingBytesRead.Add(int64(n)) + } + s.server.cache.Add(s.cacheKey, offset, page) + return page, nil +} + +func recvMappingsAndFD(conn *net.UnixConn) ([]guestRegionUffdMapping, int, error) { + buf := make([]byte, 128<<10) + oob := make([]byte, unix.CmsgSpace(4)) + n, oobn, _, _, err := conn.ReadMsgUnix(buf, oob) + if err != nil { + return nil, -1, err + } + if n == 0 { + return nil, -1, fmt.Errorf("empty mapping payload") + } + + msgs, err := unix.ParseSocketControlMessage(oob[:oobn]) + if err != nil { + return nil, -1, err + } + var fds []int + for _, msg := range msgs { + rights, err := unix.ParseUnixRights(&msg) + if err != nil { + continue + } + fds = append(fds, rights...) + } + if len(fds) == 0 { + return nil, -1, fmt.Errorf("no uffd file descriptor received") + } + for _, extra := range fds[1:] { + _ = unix.Close(extra) + } + + mappings, err := decodeMappings(buf[:n]) + if err != nil { + _ = unix.Close(fds[0]) + return nil, -1, err + } + return mappings, fds[0], nil +} + +func decodeMappings(data []byte) ([]guestRegionUffdMapping, error) { + var mappings []guestRegionUffdMapping + if err := json.Unmarshal(data, &mappings); err == nil { + return normalizeMappings(mappings) + } + + var wrapped struct { + Mappings []guestRegionUffdMapping `json:"mappings"` + } + if err := json.Unmarshal(data, &wrapped); err != nil { + return nil, fmt.Errorf("decode uffd mappings: %w", err) + } + return normalizeMappings(wrapped.Mappings) +} + +func normalizeMappings(mappings []guestRegionUffdMapping) ([]guestRegionUffdMapping, error) { + if len(mappings) == 0 { + return nil, fmt.Errorf("no uffd mappings received") + } + for i := range mappings { + if mappings[i].PageSize == 0 { + mappings[i].PageSize = mappings[i].PageSizeKiB + } + if mappings[i].PageSize == 0 { + mappings[i].PageSize = uint64(os.Getpagesize()) + } + if mappings[i].Size == 0 { + return nil, fmt.Errorf("mapping %d has zero size", i) + } + if mappings[i].PageSize&(mappings[i].PageSize-1) != 0 { + return nil, fmt.Errorf("mapping %d page size %d is not a power of two", i, mappings[i].PageSize) + } + } + return mappings, nil +} + +func findMapping(mappings []guestRegionUffdMapping, faultAddr int64) (guestRegionUffdMapping, int64, int64, int, bool) { + for _, mapping := range mappings { + start := int64(mapping.BaseHostVirtAddr) + end := start + int64(mapping.Size) + if faultAddr < start || faultAddr >= end { + continue + } + pageSize := int64(mapping.PageSize) + pageAddr := faultAddr &^ (pageSize - 1) + pageOffset := int64(mapping.Offset) + (pageAddr - start) + return mapping, pageAddr, pageOffset, int(pageSize), true + } + return guestRegionUffdMapping{}, 0, 0, 0, false +} + +func uffdCopy(fd int, dst uint64, page []byte) error { + if len(page) == 0 { + return fmt.Errorf("empty page") + } + args := uffdioCopyArgs{ + dst: dst, + src: uint64(uintptr(unsafe.Pointer(&page[0]))), + len: uint64(len(page)), + } + _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(fd), uintptr(uffdioCopy), uintptr(unsafe.Pointer(&args))) + if errno == 0 || errno == syscall.EEXIST { + return nil + } + return errno +} + +func atomicMaxInt64(target *atomic.Int64, candidate int64) { + for { + current := target.Load() + if candidate <= current || target.CompareAndSwap(current, candidate) { + return + } + } +} + +var nativeEndian = nativeByteOrder() + +type byteOrder interface { + Uint64([]byte) uint64 +} + +func nativeByteOrder() byteOrder { + var x uint16 = 0x1 + b := (*[2]byte)(unsafe.Pointer(&x)) + if b[0] == 0x1 { + return littleEndian{} + } + return bigEndian{} +} + +type littleEndian struct{} + +func (littleEndian) Uint64(b []byte) uint64 { + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +type bigEndian struct{} + +func (bigEndian) Uint64(b []byte) uint64 { + return uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 | + uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56 +} diff --git a/lib/uffdpager/server_handlers_linux.go b/lib/uffdpager/server_handlers_linux.go new file mode 100644 index 00000000..f2d604dd --- /dev/null +++ b/lib/uffdpager/server_handlers_linux.go @@ -0,0 +1,121 @@ +//go:build linux + +package uffdpager + +import ( + "encoding/json" + "fmt" + "log" + "net/http" + "strings" + + "github.com/go-chi/chi/v5" +) + +func (s *server) handleHealth(w http.ResponseWriter, r *http.Request) { + s.writeJSON(w, http.StatusOK, HealthResponse{ + Version: s.versionKey, + Draining: s.isDraining(), + ActiveSessions: s.activeSessions(), + }) +} + +func (s *server) handleStats(w http.ResponseWriter, r *http.Request) { + cacheBytes, cacheMax, cacheItems, hits, misses := s.cache.SnapshotStats() + cacheShards, cacheLookupNanos, cacheLookupMaxNanos, cacheAddNanos, cacheAddMaxNanos := s.cache.SnapshotTimingStats() + s.writeJSON(w, http.StatusOK, Stats{ + Version: s.versionKey, + Draining: s.isDraining(), + ActiveSessions: s.activeSessions(), + CacheBytes: cacheBytes, + CacheMax: cacheMax, + CacheItems: cacheItems, + CacheHits: hits, + CacheMisses: misses, + CacheShards: cacheShards, + CacheLookupNanos: cacheLookupNanos, + CacheLookupMaxNanos: cacheLookupMaxNanos, + CacheAddNanos: cacheAddNanos, + CacheAddMaxNanos: cacheAddMaxNanos, + Faults: s.faults.Load(), + BackingBytesRead: s.backingBytesRead.Load(), + Copies: s.copies.Load(), + CopyErrors: s.copyErrors.Load(), + ActiveFaults: s.activeFaults.Load(), + MaxConcurrentFaults: s.maxConcurrentFaults.Load(), + FaultNanos: s.faultNanos.Load(), + FaultMaxNanos: s.faultMaxNanos.Load(), + ReadPageNanos: s.readPageNanos.Load(), + ReadPageMaxNanos: s.readPageMaxNanos.Load(), + BackingReadNanos: s.backingReadNanos.Load(), + BackingReadMaxNanos: s.backingReadMaxNanos.Load(), + CopyNanos: s.copyNanos.Load(), + CopyMaxNanos: s.copyMaxNanos.Load(), + }) +} + +func (s *server) handleCreateSession(w http.ResponseWriter, r *http.Request) { + if s.isDraining() { + http.Error(w, "pager is draining", http.StatusServiceUnavailable) + return + } + + var req CreateSessionRequest + if err := json.NewDecoder(http.MaxBytesReader(w, r.Body, 1<<20)).Decode(&req); err != nil { + http.Error(w, fmt.Sprintf("decode request: %v", err), http.StatusBadRequest) + return + } + if strings.TrimSpace(req.SessionID) == "" { + req.SessionID = req.InstanceID + } + if strings.TrimSpace(req.SessionID) == "" { + http.Error(w, "session_id or instance_id is required", http.StatusBadRequest) + return + } + if strings.TrimSpace(req.BackingMemoryPath) == "" { + http.Error(w, "backing_memory_path is required", http.StatusBadRequest) + return + } + if strings.TrimSpace(req.CacheKey) == "" { + http.Error(w, "cache_key is required", http.StatusBadRequest) + return + } + + created, err := s.createSession(req) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + s.writeJSON(w, http.StatusOK, CreateSessionResponse{ + SessionID: created.id, + UFFDSocketPath: created.socketPath, + PagerVersion: s.versionKey, + }) +} + +func (s *server) handleCloseSession(w http.ResponseWriter, r *http.Request) { + id := chi.URLParam(r, "id") + s.closeSession(id) + w.WriteHeader(http.StatusNoContent) +} + +func (s *server) handleDrain(w http.ResponseWriter, r *http.Request) { + s.mu.Lock() + s.draining = true + shouldExit := len(s.sessions) == 0 + s.mu.Unlock() + + if shouldExit { + s.shutdownSoon() + } + w.WriteHeader(http.StatusNoContent) +} + +func (s *server) writeJSON(w http.ResponseWriter, status int, value any) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + if err := json.NewEncoder(w).Encode(value); err != nil { + log.Printf("write json response: %v", err) + } +} diff --git a/lib/uffdpager/server_linux.go b/lib/uffdpager/server_linux.go new file mode 100644 index 00000000..d007ee6a --- /dev/null +++ b/lib/uffdpager/server_linux.go @@ -0,0 +1,121 @@ +//go:build linux + +package uffdpager + +import ( + "errors" + "flag" + "fmt" + "net" + "net/http" + "os" + "path/filepath" + "strings" + "sync" + "sync/atomic" + + "github.com/go-chi/chi/v5" +) + +type server struct { + dataDir string + versionKey string + cache *PageCache + controlSocket string + sessionRoot string + httpServer *http.Server + + mu sync.Mutex + sessions map[string]*session + draining bool + + faults atomic.Int64 + backingBytesRead atomic.Int64 + copies atomic.Int64 + copyErrors atomic.Int64 + + activeFaults atomic.Int64 + maxConcurrentFaults atomic.Int64 + faultNanos atomic.Int64 + faultMaxNanos atomic.Int64 + readPageNanos atomic.Int64 + readPageMaxNanos atomic.Int64 + backingReadNanos atomic.Int64 + backingReadMaxNanos atomic.Int64 + copyNanos atomic.Int64 + copyMaxNanos atomic.Int64 +} + +type session struct { + id string + instanceID string + backingMemoryPath string + cacheKey string + socketPath string + listener *net.UnixListener + backingFile *os.File + server *server + done chan struct{} + closeOnce sync.Once + uffdFD int + conn *net.UnixConn +} + +func Main(args []string) error { + fs := flag.NewFlagSet("hypeman-uffd-pager", flag.ContinueOnError) + dataDir := fs.String("data-dir", "", "hypeman data directory") + versionKey := fs.String("version-key", "", "pager version key") + cacheMaxBytes := fs.Int64("cache-max-bytes", defaultCacheMaxBytes, "maximum shared page cache bytes") + if err := fs.Parse(args); err != nil { + return err + } + if strings.TrimSpace(*dataDir) == "" { + return fmt.Errorf("--data-dir is required") + } + if strings.TrimSpace(*versionKey) == "" { + return fmt.Errorf("--version-key is required") + } + + s := newServer(*dataDir, *versionKey, *cacheMaxBytes) + return s.run() +} + +func newServer(dataDir, versionKey string, cacheMaxBytes int64) *server { + dir := pagerVersionDir(dataDir, versionKey) + return &server{ + dataDir: dataDir, + versionKey: versionKey, + cache: NewPageCache(cacheMaxBytes), + controlSocket: filepath.Join(dir, controlSocketFile), + sessionRoot: filepath.Join(dir, sessionsDir), + sessions: make(map[string]*session), + } +} + +func (s *server) run() error { + if err := os.MkdirAll(s.sessionRoot, 0755); err != nil { + return fmt.Errorf("create uffd session directory: %w", err) + } + _ = os.Remove(s.controlSocket) + listener, err := net.Listen("unix", s.controlSocket) + if err != nil { + return fmt.Errorf("listen on uffd control socket: %w", err) + } + defer listener.Close() + + _ = os.WriteFile(filepath.Join(filepath.Dir(s.controlSocket), pagerPIDFile), []byte(fmt.Sprintf("%d\n", os.Getpid())), 0644) + + router := chi.NewRouter() + router.Get("/health", s.handleHealth) + router.Get("/stats", s.handleStats) + router.Post("/sessions", s.handleCreateSession) + router.Post("/sessions/{id}/close", s.handleCloseSession) + router.Post("/drain", s.handleDrain) + + s.httpServer = &http.Server{Handler: router} + err = s.httpServer.Serve(listener) + if errors.Is(err, http.ErrServerClosed) { + return nil + } + return err +} diff --git a/lib/uffdpager/server_sessions_linux.go b/lib/uffdpager/server_sessions_linux.go new file mode 100644 index 00000000..72989766 --- /dev/null +++ b/lib/uffdpager/server_sessions_linux.go @@ -0,0 +1,170 @@ +//go:build linux + +package uffdpager + +import ( + "context" + "fmt" + "log" + "net" + "os" + "path/filepath" + "sort" + "strings" + "time" + + "golang.org/x/sys/unix" +) + +func (s *server) createSession(req CreateSessionRequest) (*session, error) { + id := sanitizeSessionID(req.SessionID) + socketPath := filepath.Join(s.sessionRoot, id+".sock") + _ = os.Remove(socketPath) + addr := net.UnixAddr{Name: socketPath, Net: "unix"} + listener, err := net.ListenUnix("unix", &addr) + if err != nil { + return nil, fmt.Errorf("listen for uffd session %s: %w", id, err) + } + + sess := &session{ + id: id, + instanceID: req.InstanceID, + backingMemoryPath: req.BackingMemoryPath, + cacheKey: req.CacheKey, + socketPath: socketPath, + listener: listener, + server: s, + done: make(chan struct{}), + uffdFD: -1, + } + + s.mu.Lock() + if existing := s.sessions[id]; existing != nil { + existing.close() + } + s.sessions[id] = sess + s.mu.Unlock() + + go sess.run() + return sess, nil +} + +func (s *server) closeSession(id string) { + id = sanitizeSessionID(id) + s.mu.Lock() + sess := s.sessions[id] + delete(s.sessions, id) + shouldExit := s.draining && len(s.sessions) == 0 + s.mu.Unlock() + if sess != nil { + sess.close() + } + if shouldExit { + s.shutdownSoon() + } +} + +func (s *server) removeSession(sess *session) { + s.mu.Lock() + if current := s.sessions[sess.id]; current == sess { + delete(s.sessions, sess.id) + } + shouldExit := s.draining && len(s.sessions) == 0 + s.mu.Unlock() + if shouldExit { + s.shutdownSoon() + } +} + +func (s *server) shutdownSoon() { + go func() { + time.Sleep(50 * time.Millisecond) + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + _ = s.httpServer.Shutdown(ctx) + }() +} + +func (s *server) activeSessions() int { + s.mu.Lock() + defer s.mu.Unlock() + return len(s.sessions) +} + +func (s *server) isDraining() bool { + s.mu.Lock() + defer s.mu.Unlock() + return s.draining +} + +func (s *session) run() { + defer func() { + s.close() + s.server.removeSession(s) + }() + + conn, err := s.listener.AcceptUnix() + if err != nil { + return + } + s.conn = conn + + file, err := os.Open(s.backingMemoryPath) + if err != nil { + log.Printf("uffd session %s open backing memory: %v", s.id, err) + return + } + s.backingFile = file + + mappings, uffdFD, err := recvMappingsAndFD(conn) + if err != nil { + log.Printf("uffd session %s receive mappings and fd: %v", s.id, err) + return + } + s.uffdFD = uffdFD + + sort.Slice(mappings, func(i, j int) bool { + return mappings[i].BaseHostVirtAddr < mappings[j].BaseHostVirtAddr + }) + s.handleFaults(mappings) +} + +func (s *session) close() { + s.closeOnce.Do(func() { + if s.listener != nil { + _ = s.listener.Close() + } + if s.conn != nil { + _ = s.conn.Close() + } + if s.uffdFD >= 0 { + _ = unix.Close(s.uffdFD) + } + if s.backingFile != nil { + _ = s.backingFile.Close() + } + _ = os.Remove(s.socketPath) + close(s.done) + }) +} + +func sanitizeSessionID(id string) string { + id = strings.TrimSpace(id) + if id == "" { + return fmt.Sprintf("session-%d", time.Now().UnixNano()) + } + return strings.Map(func(r rune) rune { + switch { + case r >= 'a' && r <= 'z': + return r + case r >= 'A' && r <= 'Z': + return r + case r >= '0' && r <= '9': + return r + case r == '-', r == '_', r == '.': + return r + default: + return '-' + } + }, id) +} diff --git a/lib/uffdpager/supervisor_linux.go b/lib/uffdpager/supervisor_linux.go new file mode 100644 index 00000000..f5318def --- /dev/null +++ b/lib/uffdpager/supervisor_linux.go @@ -0,0 +1,313 @@ +//go:build linux + +package uffdpager + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net" + "net/http" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" + "time" +) + +const ( + controlSocketFile = "control.sock" + pagerPIDFile = "pager.pid" + pagerLogFile = "pager.log" + pagerEnvFile = "pager.env" + sessionsDir = "sessions" + + systemdUnitTemplate = "hypeman-uffd@.service" +) + +type Supervisor struct { + dataDir string + versionKey string + cacheMaxBytes int64 + controlSocket string + pagerBinary string + + mu sync.Mutex + clients map[string]*http.Client +} + +func NewSupervisor(ctx context.Context, dataDir string, cacheMaxBytes int64) (*Supervisor, error) { + exe, err := os.Executable() + if err != nil { + return nil, fmt.Errorf("resolve executable path: %w", err) + } + versionKey := Version() + if versionKey == "" { + return nil, fmt.Errorf("uffd pager version is empty") + } + s := &Supervisor{ + dataDir: dataDir, + versionKey: versionKey, + cacheMaxBytes: normalizeCacheMaxBytes(cacheMaxBytes), + controlSocket: pagerControlSocket(dataDir, versionKey), + pagerBinary: pagerBinaryPath(exe), + clients: make(map[string]*http.Client), + } + if err := s.ensureRunning(ctx); err != nil { + return nil, err + } + s.drainOlderPagers(ctx) + return s, nil +} + +func (s *Supervisor) VersionKey() string { + if s == nil { + return "" + } + return s.versionKey +} + +func (s *Supervisor) CreateSession(ctx context.Context, req CreateSessionRequest) (*CreateSessionResponse, error) { + if s == nil { + return nil, fmt.Errorf("uffd pager supervisor is nil") + } + if strings.TrimSpace(req.SessionID) == "" { + req.SessionID = req.InstanceID + } + var resp CreateSessionResponse + if err := s.doJSON(ctx, s.versionKey, http.MethodPost, "/sessions", req, &resp); err != nil { + return nil, err + } + return &resp, nil +} + +func (s *Supervisor) CloseSession(ctx context.Context, sessionID string) error { + if s == nil || strings.TrimSpace(sessionID) == "" { + return nil + } + return s.CloseSessionVersion(ctx, s.versionKey, sessionID) +} + +func (s *Supervisor) CloseSessionVersion(ctx context.Context, versionKey, sessionID string) error { + if s == nil || strings.TrimSpace(versionKey) == "" || strings.TrimSpace(sessionID) == "" { + return nil + } + path := "/sessions/" + urlPathEscape(sessionID) + "/close" + return s.doJSON(ctx, versionKey, http.MethodPost, path, nil, nil) +} + +func (s *Supervisor) Stats(ctx context.Context) (*Stats, error) { + if s == nil { + return nil, fmt.Errorf("uffd pager supervisor is nil") + } + var stats Stats + if err := s.doJSON(ctx, s.versionKey, http.MethodGet, "/stats", nil, &stats); err != nil { + return nil, err + } + return &stats, nil +} + +func (s *Supervisor) HealthVersion(ctx context.Context, versionKey string) (*HealthResponse, error) { + if s == nil { + return nil, fmt.Errorf("uffd pager supervisor is nil") + } + var health HealthResponse + if err := s.doJSON(ctx, versionKey, http.MethodGet, "/health", nil, &health); err != nil { + return nil, err + } + return &health, nil +} + +func (s *Supervisor) ensureRunning(ctx context.Context) error { + if s.isHealthy(ctx, s.versionKey) { + return nil + } + if s.systemdTemplateInstalled(ctx) { + if err := s.ensureRunningSystemd(ctx); err != nil { + return err + } + return s.waitHealthy(ctx) + } + if err := s.ensureRunningSubprocess(ctx); err != nil { + return err + } + return s.waitHealthy(ctx) +} + +func (s *Supervisor) ensureRunningSubprocess(ctx context.Context) error { + dir := pagerVersionDir(s.dataDir, s.versionKey) + if err := os.MkdirAll(filepath.Join(dir, sessionsDir), 0755); err != nil { + return fmt.Errorf("create uffd pager directory: %w", err) + } + _ = os.Remove(s.controlSocket) + + logFile, err := os.OpenFile(filepath.Join(dir, pagerLogFile), os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) + if err != nil { + return fmt.Errorf("open uffd pager log: %w", err) + } + defer logFile.Close() + + cmd := exec.Command( + s.pagerBinary, + "--data-dir", s.dataDir, + "--version-key", s.versionKey, + "--cache-max-bytes", strconv.FormatInt(s.cacheMaxBytes, 10), + ) + cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true} + cmd.Stdout = logFile + cmd.Stderr = logFile + if err := cmd.Start(); err != nil { + return fmt.Errorf("start uffd pager: %w", err) + } + _ = os.WriteFile(filepath.Join(dir, pagerPIDFile), []byte(strconv.Itoa(cmd.Process.Pid)+"\n"), 0644) + _ = cmd.Process.Release() + + return nil +} + +func pagerBinaryPath(executable string) string { + if override := strings.TrimSpace(os.Getenv("HYPEMAN_UFFD_PAGER_BINARY")); override != "" { + return override + } + return filepath.Join(filepath.Dir(executable), "hypeman-uffd-pager") +} + +func (s *Supervisor) ensureRunningSystemd(ctx context.Context) error { + dir := pagerVersionDir(s.dataDir, s.versionKey) + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("create uffd pager directory: %w", err) + } + env := fmt.Sprintf("HYPEMAN_UFFD_CACHE_MAX_BYTES=%d\n", s.cacheMaxBytes) + if err := os.WriteFile(filepath.Join(dir, pagerEnvFile), []byte(env), 0644); err != nil { + return fmt.Errorf("write uffd pager systemd environment: %w", err) + } + unit := systemdUnitName(s.versionKey) + cmd := exec.CommandContext(ctx, "systemctl", "start", unit) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("start uffd pager systemd unit %s: %w: %s", unit, err, strings.TrimSpace(string(out))) + } + return nil +} + +func (s *Supervisor) waitHealthy(ctx context.Context) error { + deadline := time.Now().Add(5 * time.Second) + var lastErr error + for time.Now().Before(deadline) { + if s.isHealthy(ctx, s.versionKey) { + return nil + } + lastErr = ctx.Err() + if lastErr != nil { + break + } + time.Sleep(20 * time.Millisecond) + } + if lastErr != nil { + return fmt.Errorf("wait for uffd pager health: %w", lastErr) + } + return fmt.Errorf("uffd pager did not become healthy at %s", s.controlSocket) +} + +func (s *Supervisor) systemdTemplateInstalled(ctx context.Context) bool { + if _, err := exec.LookPath("systemctl"); err != nil { + return false + } + cmd := exec.CommandContext(ctx, "systemctl", "cat", systemdUnitTemplate) + return cmd.Run() == nil +} + +func (s *Supervisor) isHealthy(ctx context.Context, versionKey string) bool { + var health HealthResponse + return s.doJSON(ctx, versionKey, http.MethodGet, "/health", nil, &health) == nil +} + +func (s *Supervisor) drainOlderPagers(ctx context.Context) { + root := filepath.Join(s.dataDir, "uffd") + entries, err := os.ReadDir(root) + if err != nil { + return + } + for _, entry := range entries { + if !entry.IsDir() || entry.Name() == s.versionKey { + continue + } + _ = s.doJSON(ctx, entry.Name(), http.MethodPost, "/drain", nil, nil) + } +} + +func (s *Supervisor) doJSON(ctx context.Context, versionKey, method, path string, body any, out any) error { + client := s.clientForVersion(versionKey) + var reader io.Reader + if body != nil { + data, err := json.Marshal(body) + if err != nil { + return fmt.Errorf("marshal uffd pager request: %w", err) + } + reader = bytes.NewReader(data) + } + req, err := http.NewRequestWithContext(ctx, method, "http://unix"+path, reader) + if err != nil { + return err + } + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + resp, err := client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + data, _ := io.ReadAll(io.LimitReader(resp.Body, 64<<10)) + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("uffd pager %s %s returned %s: %s", method, path, resp.Status, strings.TrimSpace(string(data))) + } + if out == nil || len(data) == 0 { + return nil + } + if err := json.Unmarshal(data, out); err != nil { + return fmt.Errorf("decode uffd pager response: %w", err) + } + return nil +} + +func (s *Supervisor) clientForVersion(versionKey string) *http.Client { + s.mu.Lock() + defer s.mu.Unlock() + if client := s.clients[versionKey]; client != nil { + return client + } + socketPath := pagerControlSocket(s.dataDir, versionKey) + client := &http.Client{ + Transport: &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + var d net.Dialer + return d.DialContext(ctx, "unix", socketPath) + }, + }, + Timeout: 10 * time.Second, + } + s.clients[versionKey] = client + return client +} + +func pagerVersionDir(dataDir, versionKey string) string { + return filepath.Join(dataDir, "uffd", versionKey) +} + +func pagerControlSocket(dataDir, versionKey string) string { + return filepath.Join(pagerVersionDir(dataDir, versionKey), controlSocketFile) +} + +func urlPathEscape(value string) string { + replacer := strings.NewReplacer("%", "%25", "/", "%2F", "?", "%3F", "#", "%23") + return replacer.Replace(value) +} + +func systemdUnitName(versionKey string) string { + return "hypeman-uffd@" + versionKey + ".service" +} diff --git a/lib/uffdpager/supervisor_unsupported.go b/lib/uffdpager/supervisor_unsupported.go new file mode 100644 index 00000000..ece0704c --- /dev/null +++ b/lib/uffdpager/supervisor_unsupported.go @@ -0,0 +1,42 @@ +//go:build !linux + +package uffdpager + +import ( + "context" + "fmt" +) + +type Supervisor struct{} + +func NewSupervisor(context.Context, string, int64) (*Supervisor, error) { + return nil, fmt.Errorf("uffd pager is only supported on linux") +} + +func (s *Supervisor) VersionKey() string { + return "" +} + +func (s *Supervisor) CreateSession(context.Context, CreateSessionRequest) (*CreateSessionResponse, error) { + return nil, fmt.Errorf("uffd pager is only supported on linux") +} + +func (s *Supervisor) CloseSession(context.Context, string) error { + return nil +} + +func (s *Supervisor) CloseSessionVersion(context.Context, string, string) error { + return nil +} + +func (s *Supervisor) Stats(context.Context) (*Stats, error) { + return nil, fmt.Errorf("uffd pager is only supported on linux") +} + +func (s *Supervisor) HealthVersion(context.Context, string) (*HealthResponse, error) { + return nil, fmt.Errorf("uffd pager is only supported on linux") +} + +func Main([]string) error { + return fmt.Errorf("uffd pager is only supported on linux") +} diff --git a/lib/uffdpager/types.go b/lib/uffdpager/types.go new file mode 100644 index 00000000..99736983 --- /dev/null +++ b/lib/uffdpager/types.go @@ -0,0 +1,70 @@ +package uffdpager + +const ( + BackendFile = "file" + BackendUFFD = "uffd" + + defaultCacheMaxBytes = int64(4 << 30) +) + +// CreateSessionRequest describes one Firecracker UFFD restore session. +type CreateSessionRequest struct { + SessionID string `json:"session_id,omitempty"` + InstanceID string `json:"instance_id"` + BackingMemoryPath string `json:"backing_memory_path"` + CacheKey string `json:"cache_key"` +} + +// CreateSessionResponse returns the per-session socket Firecracker should use +// as mem_backend.backend_path. +type CreateSessionResponse struct { + SessionID string `json:"session_id"` + UFFDSocketPath string `json:"uffd_socket_path"` + PagerVersion string `json:"pager_version"` +} + +type HealthResponse struct { + Version string `json:"version"` + Draining bool `json:"draining"` + ActiveSessions int `json:"active_sessions"` +} + +type Stats struct { + Version string `json:"version"` + Draining bool `json:"draining"` + ActiveSessions int `json:"active_sessions"` + + CacheBytes int64 `json:"cache_bytes"` + CacheMax int64 `json:"cache_max"` + CacheItems int `json:"cache_items"` + CacheHits int64 `json:"cache_hits"` + CacheMisses int64 `json:"cache_misses"` + CacheShards int `json:"cache_shards"` + CacheLookupNanos int64 `json:"cache_lookup_nanos"` + CacheLookupMaxNanos int64 `json:"cache_lookup_max_nanos"` + CacheAddNanos int64 `json:"cache_add_nanos"` + CacheAddMaxNanos int64 `json:"cache_add_max_nanos"` + + Faults int64 `json:"faults"` + BackingBytesRead int64 `json:"backing_bytes_read"` + Copies int64 `json:"copies"` + CopyErrors int64 `json:"copy_errors"` + + ActiveFaults int64 `json:"active_faults"` + MaxConcurrentFaults int64 `json:"max_concurrent_faults"` + FaultNanos int64 `json:"fault_nanos"` + FaultMaxNanos int64 `json:"fault_max_nanos"` + ReadPageNanos int64 `json:"read_page_nanos"` + ReadPageMaxNanos int64 `json:"read_page_max_nanos"` + BackingReadNanos int64 `json:"backing_read_nanos"` + BackingReadMaxNanos int64 `json:"backing_read_max_nanos"` + CopyNanos int64 `json:"copy_nanos"` + CopyMaxNanos int64 `json:"copy_max_nanos"` +} + +func normalizeCacheMaxBytes(v int64) int64 { + if v <= 0 { + return defaultCacheMaxBytes + } + return v +} diff --git a/lib/uffdpager/version.go b/lib/uffdpager/version.go new file mode 100644 index 00000000..8a396e5b --- /dev/null +++ b/lib/uffdpager/version.go @@ -0,0 +1,13 @@ +package uffdpager + +import ( + _ "embed" + "strings" +) + +//go:embed VERSION +var pagerVersion string + +func Version() string { + return strings.TrimSpace(pagerVersion) +} diff --git a/scripts/build-from-source.sh b/scripts/build-from-source.sh index 17489b36..33e573eb 100755 --- a/scripts/build-from-source.sh +++ b/scripts/build-from-source.sh @@ -13,6 +13,7 @@ set -euo pipefail # Default values BINARY_NAME="hypeman-api" +UFFD_PAGER_BINARY_NAME="hypeman-uffd-pager" # Colors for output (true color) RED='\033[38;2;255;110;110m' @@ -72,6 +73,11 @@ if ! make build >> "$BUILD_LOG" 2>&1; then fi cp "bin/hypeman" "${OUTPUT_DIR}/${BINARY_NAME}" +OS=$(uname -s | tr '[:upper:]' '[:lower:]') +if [ "$OS" = "linux" ]; then + cp "bin/${UFFD_PAGER_BINARY_NAME}" "${OUTPUT_DIR}/${UFFD_PAGER_BINARY_NAME}" +fi + # Build hypeman-token (not included in make build) if ! go build -o "${OUTPUT_DIR}/hypeman-token" ./cmd/gen-jwt >> "$BUILD_LOG" 2>&1; then echo "" @@ -81,7 +87,6 @@ if ! go build -o "${OUTPUT_DIR}/hypeman-token" ./cmd/gen-jwt >> "$BUILD_LOG" 2>& fi # Copy config example files for config template -OS=$(uname -s | tr '[:upper:]' '[:lower:]') if [ "$OS" = "darwin" ]; then cp "config.example.darwin.yaml" "${OUTPUT_DIR}/config.example.darwin.yaml" else diff --git a/scripts/check-uffd-version.sh b/scripts/check-uffd-version.sh new file mode 100755 index 00000000..da3c8eea --- /dev/null +++ b/scripts/check-uffd-version.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +set -euo pipefail + +base_ref="${1:-origin/main}" + +if ! git rev-parse --verify "${base_ref}^{commit}" >/dev/null 2>&1; then + echo "base ref not found: ${base_ref}" >&2 + exit 1 +fi + +diff_range="${base_ref}...HEAD" +if ! git merge-base "${base_ref}" HEAD >/dev/null 2>&1; then + diff_range="${base_ref}..HEAD" +fi + +changed_runtime_files="$( + git diff --name-only "${diff_range}" -- \ + lib/uffdpager \ + lib/hypervisor/firecracker/process.go \ + lib/hypervisor/firecracker/config.go \ + lib/instances/firecracker_uffd.go \ + lib/instances/guest_resume_network.go \ + lib/instances/resume_network_handoff.go \ + cmd/uffd-pager | + grep -Ev '(^lib/uffdpager/VERSION$|(^|/)README\.md$|_test\.go$)' || true +)" + +if [ -z "${changed_runtime_files}" ]; then + exit 0 +fi + +if git diff --quiet "${diff_range}" -- lib/uffdpager/VERSION; then + echo "UFFD pager runtime files changed without updating lib/uffdpager/VERSION:" >&2 + echo "${changed_runtime_files}" >&2 + exit 1 +fi + +exit 0 diff --git a/scripts/e2e-install-test.sh b/scripts/e2e-install-test.sh index 1ebd78db..3bca778c 100755 --- a/scripts/e2e-install-test.sh +++ b/scripts/e2e-install-test.sh @@ -92,6 +92,7 @@ if [ "$OS" = "darwin" ]; then fi else [ -x /opt/hypeman/bin/hypeman-api ] || fail "hypeman-api binary not found" + [ -x /opt/hypeman/bin/hypeman-uffd-pager ] || fail "hypeman-uffd-pager binary not found" pass "Binaries installed correctly" # Check systemd service @@ -214,6 +215,7 @@ if [ "$OS" = "darwin" ]; then fi else [ ! -f /opt/hypeman/bin/hypeman-api ] || fail "hypeman-api binary still exists after uninstall" + [ ! -f /opt/hypeman/bin/hypeman-uffd-pager ] || fail "hypeman-uffd-pager binary still exists after uninstall" if systemctl is-active --quiet hypeman 2>/dev/null; then fail "systemd service still running after uninstall" fi diff --git a/scripts/install.sh b/scripts/install.sh index 896fa12a..349b9ca9 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -20,6 +20,7 @@ set -e REPO="kernel/hypeman" BINARY_NAME="hypeman-api" +UFFD_PAGER_BINARY_NAME="hypeman-uffd-pager" SERVICE_NAME="hypeman" # Colors for output (true color) @@ -254,17 +255,23 @@ if [ -n "$BINARY_DIR" ]; then done cp "${BINARY_DIR}/config.example.darwin.yaml" "${TMP_DIR}/config.example.darwin.yaml" else - for f in "${BINARY_NAME}" "hypeman-token" "config.example.yaml"; do + for f in "${BINARY_NAME}" "${UFFD_PAGER_BINARY_NAME}" "hypeman-token" "config.example.yaml"; do [ -f "${BINARY_DIR}/${f}" ] || error "File ${f} not found in ${BINARY_DIR}" done cp "${BINARY_DIR}/config.example.yaml" "${TMP_DIR}/config.example.yaml" fi cp "${BINARY_DIR}/${BINARY_NAME}" "${TMP_DIR}/${BINARY_NAME}" + if [ "$OS" = "linux" ]; then + cp "${BINARY_DIR}/${UFFD_PAGER_BINARY_NAME}" "${TMP_DIR}/${UFFD_PAGER_BINARY_NAME}" + fi cp "${BINARY_DIR}/hypeman-token" "${TMP_DIR}/hypeman-token" # Make binaries executable chmod +x "${TMP_DIR}/${BINARY_NAME}" + if [ "$OS" = "linux" ]; then + chmod +x "${TMP_DIR}/${UFFD_PAGER_BINARY_NAME}" + fi chmod +x "${TMP_DIR}/hypeman-token" VERSION="custom (from binary)" @@ -301,6 +308,9 @@ elif [ -n "$BRANCH" ]; then cp "config.example.yaml" "${TMP_DIR}/config.example.yaml" fi cp "bin/hypeman" "${TMP_DIR}/${BINARY_NAME}" + if [ "$OS" = "linux" ]; then + cp "bin/${UFFD_PAGER_BINARY_NAME}" "${TMP_DIR}/${UFFD_PAGER_BINARY_NAME}" + fi # Build hypeman-token (not included in make build) if ! go build -o "${TMP_DIR}/hypeman-token" ./cmd/gen-jwt >> "$BUILD_LOG" 2>&1; then @@ -388,6 +398,11 @@ info "Installing ${BINARY_NAME} to ${INSTALL_DIR}..." $SUDO mkdir -p "$INSTALL_DIR" $SUDO install -m 755 "${TMP_DIR}/${BINARY_NAME}" "${INSTALL_DIR}/${BINARY_NAME}" +if [ "$OS" = "linux" ]; then + info "Installing ${UFFD_PAGER_BINARY_NAME} to ${INSTALL_DIR}..." + $SUDO install -m 755 "${TMP_DIR}/${UFFD_PAGER_BINARY_NAME}" "${INSTALL_DIR}/${UFFD_PAGER_BINARY_NAME}" +fi + # Install hypeman-token binary info "Installing hypeman-token to ${INSTALL_DIR}..." $SUDO install -m 755 "${TMP_DIR}/hypeman-token" "${INSTALL_DIR}/hypeman-token" @@ -586,6 +601,28 @@ ReadWritePaths=${DATA_DIR} [Install] WantedBy=multi-user.target +EOF + + $SUDO tee "${SYSTEMD_DIR}/${SERVICE_NAME}-uffd@.service" > /dev/null << EOF +[Unit] +Description=Hypeman UFFD Pager (%i) +Documentation=https://github.com/kernel/hypeman +After=network.target + +[Service] +Type=simple +Environment="HOME=${DATA_DIR}" +EnvironmentFile=${DATA_DIR}/uffd/%i/pager.env +ExecStart=${INSTALL_DIR}/${UFFD_PAGER_BINARY_NAME} --data-dir ${DATA_DIR} --version-key %i --cache-max-bytes \${HYPEMAN_UFFD_CACHE_MAX_BYTES} +Restart=on-failure +RestartSec=5 +KillMode=process + +# Security hardening +ProtectSystem=strict +ProtectHome=true +PrivateTmp=true +ReadWritePaths=${DATA_DIR} EOF info "Reloading systemd..." @@ -776,6 +813,7 @@ if [ "$OS" = "darwin" ]; then echo " Logs: ${DATA_DIR}/logs/hypeman.log" else echo " API Binary: ${INSTALL_DIR}/${BINARY_NAME}" + echo " UFFD Pager: ${INSTALL_DIR}/${UFFD_PAGER_BINARY_NAME}" echo " CLI: /usr/local/bin/hypeman" echo " Token tool: /usr/local/bin/hypeman-token" echo " Server config: ${CONFIG_FILE}" diff --git a/scripts/uninstall.sh b/scripts/uninstall.sh index 39dd0eeb..e910155e 100755 --- a/scripts/uninstall.sh +++ b/scripts/uninstall.sh @@ -94,6 +94,13 @@ if [ "$OS" = "darwin" ]; then launchctl unload "$PLIST_PATH" 2>/dev/null || true fi else + $SUDO systemctl list-units "${SERVICE_NAME}-uffd@*.service" --all --no-legend 2>/dev/null | awk '{print $1}' | while read -r unit; do + if [ -n "$unit" ]; then + info "Stopping ${unit}..." + $SUDO systemctl stop "$unit" 2>/dev/null || true + fi + done + if $SUDO systemctl is-active --quiet "$SERVICE_NAME" 2>/dev/null; then info "Stopping ${SERVICE_NAME} service..." $SUDO systemctl stop "$SERVICE_NAME" @@ -118,8 +125,12 @@ else if [ -f "${SYSTEMD_DIR}/${SERVICE_NAME}.service" ]; then info "Removing systemd service..." $SUDO rm -f "${SYSTEMD_DIR}/${SERVICE_NAME}.service" - $SUDO systemctl daemon-reload fi + if [ -f "${SYSTEMD_DIR}/${SERVICE_NAME}-uffd@.service" ]; then + info "Removing UFFD pager systemd service template..." + $SUDO rm -f "${SYSTEMD_DIR}/${SERVICE_NAME}-uffd@.service" + fi + $SUDO systemctl daemon-reload fi # =============================================================================