Skip to content

Commit 18b7cde

Browse files
bart0shklueska
andcommitted
DRA: refactor checkpointing
Co-authored-by: Kevin Klues <klueska@gmail.com>
1 parent d11b58e commit 18b7cde

10 files changed

+386
-269
lines changed
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package v1
18+
19+
import (
20+
"encoding/json"
21+
22+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23+
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
24+
)
25+
26+
func New(data []ClaimInfoState) *Checkpoint {
27+
pc := &Checkpoint{
28+
TypeMeta: metav1.TypeMeta{
29+
Kind: CheckpointKind,
30+
APIVersion: CheckpointAPIVersion,
31+
},
32+
Data: data,
33+
Checksum: 0,
34+
}
35+
return pc
36+
}
37+
38+
// MarshalCheckpoint marshals checkpoint to JSON
39+
func (cp *Checkpoint) MarshalCheckpoint() ([]byte, error) {
40+
cp.Checksum = 0
41+
out, err := json.Marshal(*cp)
42+
if err != nil {
43+
return nil, err
44+
}
45+
cp.Checksum = checksum.New(out)
46+
return json.Marshal(*cp)
47+
}
48+
49+
// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint
50+
func (cp *Checkpoint) UnmarshalCheckpoint(blob []byte) error {
51+
if err := json.Unmarshal(blob, cp); err != nil {
52+
return err
53+
}
54+
return cp.VerifyChecksum()
55+
}
56+
57+
// VerifyChecksum verifies that current checksum of checkpoint is valid
58+
func (cp *Checkpoint) VerifyChecksum() error {
59+
ck := cp.Checksum
60+
cp.Checksum = 0
61+
defer func() {
62+
cp.Checksum = ck
63+
}()
64+
out, err := json.Marshal(*cp)
65+
if err != nil {
66+
return err
67+
}
68+
return ck.Verify(out)
69+
}
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package v1
18+
19+
import (
20+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21+
"k8s.io/apimachinery/pkg/types"
22+
"k8s.io/apimachinery/pkg/util/sets"
23+
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
24+
)
25+
26+
const (
27+
GroupName = "checkpoint.dra.kubelet.k8s.io"
28+
CheckpointKind = "DRACheckpoint"
29+
CheckpointAPIVersion = GroupName + "/v1"
30+
)
31+
32+
type ClaimInfoStateList []ClaimInfoState
33+
34+
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
35+
// +k8s:deepcopy-gen=true
36+
type Checkpoint struct {
37+
metav1.TypeMeta
38+
Data ClaimInfoStateList
39+
Checksum checksum.Checksum
40+
}
41+
42+
// +k8s:deepcopy-gen=true
43+
type ClaimInfoState struct {
44+
// ClaimUID is the UID of a resource claim
45+
ClaimUID types.UID
46+
47+
// ClaimName is the name of a resource claim
48+
ClaimName string
49+
50+
// Namespace is a claim namespace
51+
Namespace string
52+
53+
// PodUIDs is a set of pod UIDs that reference a resource
54+
PodUIDs sets.Set[string]
55+
56+
// DriverState contains information about all drivers which have allocation
57+
// results in the claim, even if they don't provide devices for their results.
58+
DriverState map[string]DriverState
59+
}
60+
61+
// DriverState is used to store per-device claim info state in a checkpoint
62+
// +k8s:deepcopy-gen=true
63+
type DriverState struct {
64+
Devices []Device
65+
}
66+
67+
// Device is how a DRA driver described an allocated device in a claim
68+
// to kubelet. RequestName and CDI device IDs are optional.
69+
// +k8s:deepcopy-gen=true
70+
type Device struct {
71+
PoolName string
72+
DeviceName string
73+
RequestNames []string
74+
CDIDeviceIDs []string
75+
}

pkg/kubelet/cm/dra/state/zz_generated.deepcopy.go renamed to pkg/kubelet/cm/dra/checkpoint/v1/zz_generated.deepcopy.go

Lines changed: 34 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/kubelet/cm/dra/claiminfo.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,16 @@ import (
2525
resourceapi "k8s.io/api/resource/v1alpha3"
2626
"k8s.io/apimachinery/pkg/types"
2727
"k8s.io/apimachinery/pkg/util/sets"
28-
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
28+
checkpointapi "k8s.io/kubernetes/pkg/kubelet/cm/dra/checkpoint/v1"
29+
state "k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
2930
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
3031
)
3132

3233
// ClaimInfo holds information required
3334
// to prepare and unprepare a resource claim.
3435
// +k8s:deepcopy-gen=true
3536
type ClaimInfo struct {
36-
state.ClaimInfoState
37+
checkpointapi.ClaimInfoState
3738
prepared bool
3839
}
3940

@@ -47,18 +48,18 @@ type claimInfoCache struct {
4748
// newClaimInfoFromClaim creates a new claim info from a resource claim.
4849
// It verifies that the kubelet can handle the claim.
4950
func newClaimInfoFromClaim(claim *resourceapi.ResourceClaim) (*ClaimInfo, error) {
50-
claimInfoState := state.ClaimInfoState{
51+
claimInfoState := checkpointapi.ClaimInfoState{
5152
ClaimUID: claim.UID,
5253
ClaimName: claim.Name,
5354
Namespace: claim.Namespace,
5455
PodUIDs: sets.New[string](),
55-
DriverState: make(map[string]state.DriverState),
56+
DriverState: make(map[string]checkpointapi.DriverState),
5657
}
5758
if claim.Status.Allocation == nil {
5859
return nil, errors.New("not allocated")
5960
}
6061
for _, result := range claim.Status.Allocation.Devices.Results {
61-
claimInfoState.DriverState[result.Driver] = state.DriverState{}
62+
claimInfoState.DriverState[result.Driver] = checkpointapi.DriverState{}
6263
}
6364
info := &ClaimInfo{
6465
ClaimInfoState: claimInfoState,
@@ -68,7 +69,7 @@ func newClaimInfoFromClaim(claim *resourceapi.ResourceClaim) (*ClaimInfo, error)
6869
}
6970

7071
// newClaimInfoFromClaim creates a new claim info from a checkpointed claim info state object.
71-
func newClaimInfoFromState(state *state.ClaimInfoState) *ClaimInfo {
72+
func newClaimInfoFromState(state *checkpointapi.ClaimInfoState) *ClaimInfo {
7273
info := &ClaimInfo{
7374
ClaimInfoState: *state.DeepCopy(),
7475
prepared: false,
@@ -77,9 +78,9 @@ func newClaimInfoFromState(state *state.ClaimInfoState) *ClaimInfo {
7778
}
7879

7980
// setCDIDevices adds a set of CDI devices to the claim info.
80-
func (info *ClaimInfo) addDevice(driverName string, device state.Device) {
81+
func (info *ClaimInfo) addDevice(driverName string, device checkpointapi.Device) {
8182
if info.DriverState == nil {
82-
info.DriverState = make(map[string]state.DriverState)
83+
info.DriverState = make(map[string]checkpointapi.DriverState)
8384
}
8485
driverState := info.DriverState[driverName]
8586
driverState.Devices = append(driverState.Devices, device)
@@ -188,7 +189,7 @@ func (cache *claimInfoCache) hasPodReference(UID types.UID) bool {
188189

189190
// syncToCheckpoint syncs the full claim info cache state to a checkpoint.
190191
func (cache *claimInfoCache) syncToCheckpoint() error {
191-
claimInfoStateList := make(state.ClaimInfoStateList, 0, len(cache.claimInfo))
192+
claimInfoStateList := make(checkpointapi.ClaimInfoStateList, 0, len(cache.claimInfo))
192193
for _, infoClaim := range cache.claimInfo {
193194
claimInfoStateList = append(claimInfoStateList, infoClaim.ClaimInfoState)
194195
}

0 commit comments

Comments
 (0)