
Commit ceeb1b9

Revert "[SS] Only cache multiple snapshots on the same branch locally" (#9218)
Reverts #9084 Seeing some unexpected behavior when testing locally - I think I know the fix, but don't want to rush it into the release
1 parent 9998793 commit ceeb1b9

File tree

enterprise/server/remote_execution/containers/firecracker/firecracker.go
enterprise/server/remote_execution/containers/firecracker/firecracker_test.go
enterprise/server/remote_execution/snaploader/snaploader.go

3 files changed: +129 -79 lines changed

enterprise/server/remote_execution/containers/firecracker/firecracker.go

Lines changed: 10 additions & 10 deletions
@@ -958,7 +958,7 @@ func (c *FirecrackerContainer) saveSnapshot(ctx context.Context, snapshotDetails
 	// to all executors. `!c.hasFallbackKeys()` is a good proxy for whether we're
 	// running on the master snapshot.
 	hasRemoteSnapshotForBranchKey := c.hasRemoteSnapshotForBranchKey(ctx)
-	shouldCacheRemotely = !c.hasFallbackKeys() || !hasRemoteSnapshotForBranchKey || !*snaputil.EnableLocalSnapshotSharing
+	shouldCacheRemotely = !c.hasFallbackKeys() || !hasRemoteSnapshotForBranchKey
 	if !shouldCacheRemotely {
 		log.CtxInfof(ctx, "Would not save remote snapshot. Has remote snapshot for branch key: %v. Snapshot keys: %v", hasRemoteSnapshotForBranchKey, c.snapshotKeySet)
 	} else if hasRemoteSnapshotForBranchKey {
@@ -967,15 +967,15 @@ func (c *FirecrackerContainer) saveSnapshot(ctx context.Context, snapshotDetails
 	}
 
 	opts := &snaploader.CacheSnapshotOptions{
-		VMMetadata:           vmd,
-		VMConfiguration:      c.vmConfig,
-		VMStateSnapshotPath:  filepath.Join(c.getChroot(), snapshotDetails.vmStateSnapshotName),
-		KernelImagePath:      c.executorConfig.GuestKernelImagePath,
-		InitrdImagePath:      c.executorConfig.InitrdImagePath,
-		ChunkedFiles:         map[string]*copy_on_write.COWStore{},
-		Recycled:             c.recycled,
-		Remote:               shouldCacheRemotely,
-		SkippedCacheRemotely: c.supportsRemoteSnapshots && !shouldCacheRemotely,
+		VMMetadata:                 vmd,
+		VMConfiguration:            c.vmConfig,
+		VMStateSnapshotPath:        filepath.Join(c.getChroot(), snapshotDetails.vmStateSnapshotName),
+		KernelImagePath:            c.executorConfig.GuestKernelImagePath,
+		InitrdImagePath:            c.executorConfig.InitrdImagePath,
+		ChunkedFiles:               map[string]*copy_on_write.COWStore{},
+		Recycled:                   c.recycled,
+		Remote:                     c.supportsRemoteSnapshots,
+		WouldNotHaveCachedRemotely: !shouldCacheRemotely,
 	}
 	if snapshotSharingEnabled {
 		if c.rootStore != nil {

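To make the restored behavior concrete, here is a minimal, self-contained Go sketch of the caching decision after this revert. The plain booleans stand in for the container's fields and methods (c.hasFallbackKeys(), c.hasRemoteSnapshotForBranchKey(ctx), c.supportsRemoteSnapshots), so this is illustrative only, not the actual BuildBuddy API: the branch-key check now only feeds the WouldNotHaveCachedRemotely metric flag, while Remote goes back to tracking whether the executor supports remote snapshots at all.

package main

import "fmt"

// cacheDecision sketches the post-revert behavior: the snapshot is written
// remotely whenever remote snapshots are supported, and the "would have
// skipped" condition is only recorded for metrics.
func cacheDecision(hasFallbackKeys, hasRemoteSnapshotForBranchKey, supportsRemoteSnapshots bool) (remote, wouldNotHaveCachedRemotely bool) {
	// Same condition as in saveSnapshot above, minus the
	// EnableLocalSnapshotSharing term removed by this revert.
	shouldCacheRemotely := !hasFallbackKeys || !hasRemoteSnapshotForBranchKey
	return supportsRemoteSnapshots, !shouldCacheRemotely
}

func main() {
	// A recycled PR-branch run that already has a remote snapshot for its
	// branch key: still cached remotely, but flagged for the metric.
	remote, wouldSkip := cacheDecision(true, true, true)
	fmt.Printf("Remote=%v WouldNotHaveCachedRemotely=%v\n", remote, wouldSkip)
}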
enterprise/server/remote_execution/containers/firecracker/firecracker_test.go

Lines changed: 105 additions & 55 deletions
@@ -853,6 +853,21 @@ func TestFirecracker_RemoteSnapshotSharing(t *testing.T) {
 		}
 	})
 
+	workDir := testfs.MakeDirAll(t, rootDir, "work")
+	opts := firecracker.ContainerOpts{
+		ContainerImage:         busyboxImage,
+		ActionWorkingDirectory: workDir,
+		VMConfiguration: &fcpb.VMConfiguration{
+			NumCpus:            1,
+			MemSizeMb:          minMemSizeMB, // small to make snapshotting faster.
+			EnableNetworking:   false,
+			ScratchDiskSizeMb:  100,
+			GuestKernelVersion: cfg.GuestKernelVersion,
+			FirecrackerVersion: cfg.FirecrackerVersion,
+			GuestApiVersion:    cfg.GuestAPIVersion,
+		},
+		ExecutorConfig: cfg,
+	}
 	instanceName := "test-instance-name"
 	task := &repb.ExecutionTask{
 		Command: &repb.Command{
@@ -861,62 +876,57 @@ func TestFirecracker_RemoteSnapshotSharing(t *testing.T) {
 				{Name: "recycle-runner", Value: "true"},
 			}},
 			Arguments: []string{"./buildbuddy_ci_runner"},
-			// Simulate that this is for a PR branch, because master snapshots
-			// are always saved to the remote cache.
-			EnvironmentVariables: []*repb.Command_EnvironmentVariable{
-				{Name: "GIT_REPO_DEFAULT_BRANCH", Value: "master"},
-				{Name: "GIT_BRANCH", Value: "my-pr"},
-			},
 		},
 		ExecuteRequest: &repb.ExecuteRequest{
 			InstanceName: instanceName,
 		},
 	}
+	baseVM, err := firecracker.NewContainer(ctx, env, task, opts)
+	require.NoError(t, err)
+	containersToCleanup = append(containersToCleanup, baseVM)
+	err = container.PullImageIfNecessary(ctx, env, baseVM, oci.Credentials{}, opts.ContainerImage)
+	require.NoError(t, err)
+	err = baseVM.Create(ctx, opts.ActionWorkingDirectory)
+	require.NoError(t, err)
+	baseSnapshotId := baseVM.SnapshotID()
 
-	runAndSnapshotVM := func(workDir string, stringToLog string, expectedOutput string, snapshotKeyOverride *fcpb.SnapshotKey) string {
-		opts := firecracker.ContainerOpts{
-			ContainerImage:         busyboxImage,
-			ActionWorkingDirectory: workDir,
-			VMConfiguration: &fcpb.VMConfiguration{
-				NumCpus:           1,
-				MemSizeMb:         minMemSizeMB, // small to make snapshotting faster.
-				EnableNetworking:  false,
-				ScratchDiskSizeMb: 100,
-			},
-			ExecutorConfig:      cfg,
-			OverrideSnapshotKey: snapshotKeyOverride,
-		}
-		vm, err := firecracker.NewContainer(ctx, env, task, opts)
-		require.NoError(t, err)
-		containersToCleanup = append(containersToCleanup, vm)
-		err = vm.Create(ctx, workDir)
-		require.NoError(t, err)
-		cmd := appendToLog(stringToLog)
-		res := vm.Exec(ctx, cmd, nil /*=stdio*/)
-		require.NoError(t, res.Error)
-		require.Equal(t, expectedOutput, string(res.Stdout))
-		require.NotEmpty(t, res.VMMetadata.GetSnapshotId())
-		err = vm.Pause(ctx)
-		require.NoError(t, err)
+	// Create a snapshot. Data written to this snapshot should persist
+	// when other VMs reuse the snapshot
+	cmd := appendToLog("Base")
+	res := baseVM.Exec(ctx, cmd, nil /*=stdio*/)
+	require.NoError(t, res.Error)
+	require.Equal(t, "Base\n", string(res.Stdout))
+	require.NotEmpty(t, res.VMMetadata.GetSnapshotId())
+	err = baseVM.Pause(ctx)
+	require.NoError(t, err)
 
-		return vm.SnapshotID()
+	// Start a VM from the snapshot. Artifacts should be stored locally in the filecache
+	workDirForkLocalFetch := testfs.MakeDirAll(t, rootDir, "work-fork-local-fetch")
+	opts = firecracker.ContainerOpts{
+		ContainerImage:         busyboxImage,
+		ActionWorkingDirectory: workDirForkLocalFetch,
+		VMConfiguration: &fcpb.VMConfiguration{
+			NumCpus:           1,
+			MemSizeMb:         minMemSizeMB, // small to make snapshotting faster.
+			EnableNetworking:  false,
+			ScratchDiskSizeMb: 100,
+		},
+		ExecutorConfig: cfg,
 	}
-
-	// Create a snapshot. Data written to this snapshot should be cached remotely,
-	// and persist when other VMs reuse the snapshot.
-	workDir := testfs.MakeDirAll(t, rootDir, "work")
-	baseSnapshotId := runAndSnapshotVM(workDir, "Cache Remotely", "Cache Remotely\n", nil)
-
-	// Start a VM from the snapshot. Artifacts should be stored locally in the filecache.
+	forkedVM, err := firecracker.NewContainer(ctx, env, task, opts)
+	require.NoError(t, err)
+	containersToCleanup = append(containersToCleanup, forkedVM)
+	err = forkedVM.Unpause(ctx)
+	require.NoError(t, err)
+	cmd = appendToLog("Fork local fetch")
+	res = forkedVM.Exec(ctx, cmd, nil /*=stdio*/)
+	require.NoError(t, res.Error)
 	// The log should contain data written to the original snapshot
-	// and the current VM.
-	workDirForkLocalFetch := testfs.MakeDirAll(t, rootDir, "work-fork-local-fetch")
-	runAndSnapshotVM(workDirForkLocalFetch, "Cache Locally", "Cache Remotely\nCache Locally\n", nil)
-
-	// Start another VM from the locally cached snapshot. The log should contain
-	// data from the most recent run, as well as the current run.
-	workDirForkLocalFetch2 := testfs.MakeDirAll(t, rootDir, "work-fork-local-fetch")
-	runAndSnapshotVM(workDirForkLocalFetch2, "Cache Locally 2", "Cache Remotely\nCache Locally\nCache Locally 2\n", nil)
+	// and the current VM
+	require.Equal(t, "Base\nFork local fetch\n", string(res.Stdout))
+	require.NotEmpty(t, res.VMMetadata.GetSnapshotId())
+	err = forkedVM.Pause(ctx)
+	require.NoError(t, err)
 
 	// Clear the local filecache. Vms should still be able to unpause the snapshot
 	// by pulling artifacts from the remote cache
@@ -928,25 +938,65 @@ func TestFirecracker_RemoteSnapshotSharing(t *testing.T) {
 	fc2.WaitForDirectoryScanToComplete()
 	env.SetFileCache(fc2)
 
-	// Start a VM from the remote snapshot. The log should contain data from the
-	// first run, which was saved remotely, but not the two most recent runs which
-	// were only cached locally.
+	// Start a VM from the snapshot.
 	workDirForkRemoteFetch := testfs.MakeDirAll(t, rootDir, "work-fork-remote-fetch")
-	runAndSnapshotVM(workDirForkRemoteFetch, "Cache Locally 3", "Cache Remotely\nCache Locally 3\n", nil)
+	opts = firecracker.ContainerOpts{
+		ContainerImage:         busyboxImage,
+		ActionWorkingDirectory: workDirForkRemoteFetch,
+		VMConfiguration: &fcpb.VMConfiguration{
+			NumCpus:           1,
+			MemSizeMb:         minMemSizeMB, // small to make snapshotting faster.
+			EnableNetworking:  false,
+			ScratchDiskSizeMb: 100,
+		},
+		ExecutorConfig: cfg,
+	}
+	forkedVM2, err := firecracker.NewContainer(ctx, env, task, opts)
+	require.NoError(t, err)
+	containersToCleanup = append(containersToCleanup, forkedVM2)
+	err = forkedVM2.Unpause(ctx)
+	require.NoError(t, err)
+	cmd = appendToLog("Fork remote fetch")
+	res = forkedVM2.Exec(ctx, cmd, nil /*=stdio*/)
+	require.NoError(t, res.Error)
+	// The log should contain data written to the most recent snapshot
+	require.Equal(t, "Base\nFork local fetch\nFork remote fetch\n", string(res.Stdout))
+	require.NotEmpty(t, res.VMMetadata.GetSnapshotId())
 
 	// Should still be able to start from the original snapshot if we use
 	// a snapshot key containing the original VM's snapshot ID.
 	// Note that when using a snapshot ID as the key, we only include the
 	// snapshot_id and instance_name fields.
-	// The log should contain data written to the original snapshot
-	// and the current VM, but not from any of the other VMs, including the master
-	// snapshot
 	workDirForkOriginalSnapshot := testfs.MakeDirAll(t, rootDir, "work-fork-og-snapshot")
 	originalSnapshotKey := &fcpb.SnapshotKey{
 		InstanceName: instanceName,
 		SnapshotId:   baseSnapshotId,
 	}
-	runAndSnapshotVM(workDirForkOriginalSnapshot, "Fork from original vm", "Cache Remotely\nFork from original vm\n", originalSnapshotKey)
+	opts = firecracker.ContainerOpts{
+		ContainerImage:         busyboxImage,
+		ActionWorkingDirectory: workDirForkOriginalSnapshot,
+		VMConfiguration: &fcpb.VMConfiguration{
+			NumCpus:           1,
+			MemSizeMb:         minMemSizeMB, // small to make snapshotting faster.
+			EnableNetworking:  false,
+			ScratchDiskSizeMb: 100,
+		},
+		ExecutorConfig:      cfg,
+		OverrideSnapshotKey: originalSnapshotKey,
+	}
+	ogFork, err := firecracker.NewContainer(ctx, env, task, opts)
+	require.NoError(t, err)
+	containersToCleanup = append(containersToCleanup, ogFork)
+	err = ogFork.Unpause(ctx)
+	require.NoError(t, err)
+	cmd = appendToLog("Fork from original vm")
+	res = ogFork.Exec(ctx, cmd, nil /*=stdio*/)
+	require.NoError(t, res.Error)
+	// The log should contain data written to the original snapshot
+	// and the current VM, but not from any of the other VMs, including the master
+	// snapshot
+	require.Equal(t, "Base\nFork from original vm\n", string(res.Stdout))
+	require.NotEmpty(t, res.VMMetadata.GetSnapshotId())
 }
 
 func TestFirecracker_RemoteSnapshotSharing_RemoteInstanceName(t *testing.T) {

enterprise/server/remote_execution/snaploader/snaploader.go

Lines changed: 14 additions & 14 deletions
@@ -339,8 +339,9 @@ type CacheSnapshotOptions struct {
 	// Whether to save the snapshot to the remote cache (in addition to locally)
 	Remote bool
 
-	// Whether we skipped caching remotely due to newly applied remote snapshot limits.
-	SkippedCacheRemotely bool
+	// Whether we would've cached this snapshot remotely if our remote snapshot
+	// limits were applied.
+	WouldNotHaveCachedRemotely bool
 }
 
 type UnpackedSnapshot struct {
@@ -417,20 +418,19 @@ func (l *FileCacheLoader) GetSnapshot(ctx context.Context, keys *fcpb.SnapshotKe
 func (l *FileCacheLoader) getSnapshot(ctx context.Context, key *fcpb.SnapshotKey, remoteEnabled bool) (*fcpb.SnapshotManifest, error) {
 	ctx, span := tracing.StartSpan(ctx)
 	defer span.End()
-
-	// Always prioritize the local manifest if it exists. It may point to a more
-	// updated snapshot than the remote manifest, which is updated less frequently.
-	if *snaputil.EnableLocalSnapshotSharing {
-		manifest, err := l.getLocalManifest(ctx, key)
-		if err == nil || !remoteEnabled || !*snaputil.EnableRemoteSnapshotSharing {
-			return manifest, err
+	if *snaputil.EnableRemoteSnapshotSharing && remoteEnabled {
+		manifest, err := l.fetchRemoteManifest(ctx, key)
+		if err != nil {
+			return nil, status.WrapError(err, "fetch remote manifest")
 		}
-	} else if !remoteEnabled {
-		return nil, status.InternalErrorf("invalid state: EnableLocalSnapshotSharing=false and remoteEnabled=false")
+		return manifest, nil
 	}
 
-	// Fall back to fetching remote manifest.
-	return l.fetchRemoteManifest(ctx, key)
+	manifest, err := l.getLocalManifest(ctx, key)
+	if err != nil {
+		return nil, status.WrapError(err, "get local manifest")
+	}
+	return manifest, nil
 }
 
 // fetchRemoteManifest fetches the most recent snapshot manifest from the remote
@@ -1042,7 +1042,7 @@ func (l *FileCacheLoader) cacheCOW(ctx context.Context, name string, remoteInsta
 		metrics.FileName: name,
 	}).Observe(float64(emptyChunkCount) / float64(len(chunks)))
 
-	if cacheOpts.SkippedCacheRemotely {
+	if cacheOpts.Remote && cacheOpts.WouldNotHaveCachedRemotely {
 		metrics.COWSnapshotSkippedRemoteBytes.With(prometheus.Labels{
 			metrics.FileName: name,
 		}).Add(float64(compressedBytesWrittenRemotely))