Skip to content

Commit 70a9844

Browse files
committed
DRA: refactor checkpointing
1 parent 68ad8b2 commit 70a9844

File tree

6 files changed

+452
-108
lines changed

6 files changed

+452
-108
lines changed

pkg/kubelet/checkpointmanager/checkpoint_manager.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,6 @@ func (manager *impl) GetCheckpoint(checkpointKey string, checkpoint Checkpoint)
8585
return err
8686
}
8787
err = checkpoint.UnmarshalCheckpoint(blob)
88-
if err == nil {
89-
err = checkpoint.VerifyChecksum()
90-
}
9188
return err
9289
}
9390

pkg/kubelet/cm/dra/state/checkpoint.go

Lines changed: 0 additions & 68 deletions
This file was deleted.
Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package state
18+
19+
import (
20+
"encoding/json"
21+
"fmt"
22+
23+
"k8s.io/apimachinery/pkg/runtime"
24+
"k8s.io/apimachinery/pkg/types"
25+
"k8s.io/apimachinery/pkg/util/sets"
26+
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
27+
)
28+
29+
// V1 data structures are copies of the corresponding structures from state_checkpoint.go (k/k 1.30)
30+
// This is done to avoid breaking changes
31+
//
32+
// NOTE: These structures must not be changed, new checkpoint version(s) should be added instead
33+
34+
// CheckpointV1 is the version 1 layout of the DRA checkpoint: an embedded
// CheckpointVersion, the list of claim entries and a checksum.
type CheckpointV1 struct {
35+
CheckpointVersion
36+
// Data holds the checkpointed claim entries; it is serialized under the
// JSON key "entries" and omitted when empty.
Data []ClaimInfoStateV1 `json:"entries,omitempty"`
37+
// Checksum is serialized under the JSON key "checksum".
Checksum checksum.Checksum `json:"checksum"`
38+
}
39+
40+
// ClaimInfoStateV1 is used to store claim info state in a V1 checkpoint.
// The fields carry no JSON tags, so encoding/json serializes them under
// their Go field names.
41+
type ClaimInfoStateV1 struct {
42+
// Name of the DRA driver
43+
DriverName string
44+
45+
// ClassName is a resource class of the claim
46+
ClassName string
47+
48+
// ClaimUID is a UID of the resource claim
49+
ClaimUID types.UID
50+
51+
// ClaimName is a name of the resource claim
52+
ClaimName string
53+
54+
// Namespace is a claim namespace
55+
Namespace string
56+
57+
// PodUIDs is a set of pod UIDs that reference a resource
58+
PodUIDs sets.Set[string]
59+
60+
// ResourceHandles is a list of opaque resource data for processing by a specific kubelet plugin
61+
ResourceHandles []ResourceHandleV1
62+
63+
// CDIDevices is a map of DriverName --> CDI devices returned by the
64+
// GRPC API call NodePrepareResource
65+
CDIDevices map[string][]string
66+
}
67+
68+
// ResourceHandleV1 holds opaque resource data for processing by a specific kubelet plugin.
69+
type ResourceHandleV1 struct {
70+
// DriverName specifies the name of the resource driver whose kubelet
71+
// plugin should be invoked to process this ResourceHandle's data once it
72+
// lands on a node. This may differ from the DriverName set in
73+
// ResourceClaimStatus this ResourceHandle is embedded in.
74+
DriverName string `json:"driverName,omitempty" protobuf:"bytes,1,opt,name=driverName"`
75+
76+
// Data contains the opaque data associated with this ResourceHandle. It is
77+
// set by the controller component of the resource driver whose name
78+
// matches the DriverName set in the ResourceClaimStatus this
79+
// ResourceHandle is embedded in. It is set at allocation time and is
80+
// intended for processing by the kubelet plugin whose name matches
81+
// the DriverName set in this ResourceHandle.
82+
//
83+
// The maximum size of this field is 16KiB. This may get increased in the
84+
// future, but not reduced.
85+
// +optional
86+
Data string `json:"data,omitempty" protobuf:"bytes,2,opt,name=data"`
87+
88+
// If StructuredData is set, then it needs to be used instead of Data.
89+
//
90+
// +optional
91+
StructuredData *StructuredResourceHandleV1 `json:"structuredData,omitempty" protobuf:"bytes,5,opt,name=structuredData"`
92+
}
93+
94+
// StructuredResourceHandleV1 is the in-tree representation of the allocation result.
95+
type StructuredResourceHandleV1 struct {
96+
// VendorClassParameters are the per-claim configuration parameters
97+
// from the resource class at the time that the claim was allocated.
98+
//
99+
// +optional
100+
VendorClassParameters runtime.RawExtension `json:"vendorClassParameters,omitempty" protobuf:"bytes,1,opt,name=vendorClassParameters"`
101+
102+
// VendorClaimParameters are the per-claim configuration parameters
103+
// from the resource claim parameters at the time that the claim was
104+
// allocated.
105+
//
106+
// +optional
107+
VendorClaimParameters runtime.RawExtension `json:"vendorClaimParameters,omitempty" protobuf:"bytes,2,opt,name=vendorClaimParameters"`
108+
109+
// NodeName is the name of the node providing the necessary resources
110+
// if the resources are local to a node.
111+
//
112+
// +optional
113+
NodeName string `json:"nodeName,omitempty" protobuf:"bytes,4,name=nodeName"`
114+
115+
// Results lists all allocated driver resources.
116+
//
117+
// +listType=atomic
118+
Results []DriverAllocationResultV1 `json:"results" protobuf:"bytes,5,name=results"`
119+
}
120+
121+
// DriverAllocationResultV1 contains vendor parameters and the allocation result for
122+
// one request.
123+
type DriverAllocationResultV1 struct {
124+
// VendorRequestParameters are the per-request configuration parameters
125+
// from the time that the claim was allocated.
126+
//
127+
// +optional
128+
VendorRequestParameters runtime.RawExtension `json:"vendorRequestParameters,omitempty" protobuf:"bytes,1,opt,name=vendorRequestParameters"`
129+
130+
// AllocationResultModelV1 is embedded.
AllocationResultModelV1 `json:",inline" protobuf:"bytes,2,name=allocationResultModel"`
131+
}
132+
133+
// AllocationResultModelV1 must have one and only one field set.
134+
type AllocationResultModelV1 struct {
135+
// NamedResources describes the allocation result when using the named resources model.
136+
//
137+
// +optional
138+
NamedResources *NamedResourcesAllocationResultV1 `json:"namedResources,omitempty" protobuf:"bytes,1,opt,name=namedResources"`
139+
}
140+
141+
// NamedResourcesAllocationResultV1 is used in AllocationResultModelV1.
142+
type NamedResourcesAllocationResultV1 struct {
143+
// Name is the name of the selected resource instance.
144+
Name string `json:"name" protobuf:"bytes,1,name=name"`
145+
}
146+
147+
func NewCheckpointV1(data ClaimInfoStateList) *CheckpointV1 {
148+
cp := &CheckpointV1{
149+
CheckpointVersion: CheckpointVersion{Version: v1},
150+
}
151+
cp.SetData(data)
152+
return cp
153+
}
154+
155+
// SetData sets checkpoint data from ClaimInfoStateList
156+
func (cp *CheckpointV1) SetData(data ClaimInfoStateList) {
157+
cp.Data = make([]ClaimInfoStateV1, len(data))
158+
for i, entry := range data {
159+
cp.Data[i] = ClaimInfoStateV1{
160+
ClaimUID: entry.ClaimUID,
161+
ClaimName: entry.ClaimName,
162+
Namespace: entry.Namespace,
163+
PodUIDs: entry.PodUIDs,
164+
CDIDevices: map[string][]string{},
165+
}
166+
for driverName, driverState := range entry.DriverState {
167+
cp.Data[i].CDIDevices[driverName] = make([]string, len(driverState.Devices))
168+
for j, device := range driverState.Devices {
169+
cp.Data[i].CDIDevices[driverName][j] = device.DeviceName
170+
}
171+
}
172+
}
173+
cp.Checksum = checksum.New(cp)
174+
}
175+
176+
// GetData returns checkpoint data in a form of ClaimInfoStateList
177+
func (cp *CheckpointV1) GetData() ClaimInfoStateList {
178+
result := make(ClaimInfoStateList, len(cp.Data))
179+
for i, entry := range cp.Data {
180+
result[i] = ClaimInfoState{
181+
ClaimUID: entry.ClaimUID,
182+
ClaimName: entry.ClaimName,
183+
Namespace: entry.Namespace,
184+
PodUIDs: entry.PodUIDs,
185+
DriverState: map[string]DriverState{},
186+
}
187+
for driverName, devices := range entry.CDIDevices {
188+
result[i].DriverState[driverName] = DriverState{
189+
Devices: make([]Device, len(devices)),
190+
}
191+
for j, cdiID := range devices {
192+
result[i].DriverState[driverName].Devices[j] = Device{
193+
CDIDeviceIDs: []string{cdiID},
194+
}
195+
}
196+
}
197+
}
198+
return result
199+
}
200+
201+
// MarshalCheckpoint marshals checkpoint to JSON
202+
func (cp *CheckpointV1) MarshalCheckpoint() ([]byte, error) {
203+
// make sure checksum wasn't set before so it doesn't affect output checksum
204+
cp.Checksum = 0
205+
cp.Checksum = checksum.New(cp)
206+
return json.Marshal(*cp)
207+
}
208+
209+
// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint
210+
func (cp *CheckpointV1) UnmarshalCheckpoint(blob []byte) error {
211+
version := &CheckpointVersion{}
212+
if err := json.Unmarshal(blob, version); err != nil {
213+
return err
214+
}
215+
if version.Version == v1 {
216+
// NOTE: We can't verify V1 checksum due to the changes in structures names
217+
// e.g. DRAManagerCheckpoint -> CheckpointV1
218+
return json.Unmarshal(blob, cp)
219+
}
220+
return fmt.Errorf("unsupported checkpoint version %s", version.Version)
221+
}
222+
223+
// VerifyChecksum verifies that current checksum of checkpoint is valid
224+
func (cp *CheckpointV1) VerifyChecksum() error {
225+
ck := cp.Checksum
226+
cp.Checksum = 0
227+
err := ck.Verify(cp)
228+
cp.Checksum = ck
229+
return err
230+
}

0 commit comments

Comments
 (0)