@@ -3,11 +3,13 @@ package cpuset
3
3
import (
4
4
"fmt"
5
5
"math"
6
+ "runtime"
6
7
"strconv"
7
8
"strings"
8
9
"sync"
9
10
10
11
"github.com/buildbuddy-io/buildbuddy/server/interfaces"
12
+ "github.com/buildbuddy-io/buildbuddy/server/util/alert"
11
13
"github.com/buildbuddy-io/buildbuddy/server/util/flag"
12
14
"github.com/buildbuddy-io/buildbuddy/server/util/log"
13
15
"github.com/buildbuddy-io/buildbuddy/server/util/priority_queue"
@@ -20,14 +22,23 @@ var (
20
22
cpuLeaserOverhead = flag .Float64 ("executor.cpu_leaser.overhead" , .20 , "The amount of extra CPU *above the task size* to include in a lease" )
21
23
cpuLeaserMinOverhead = flag .Int ("executor.cpu_leaser.min_overhead" , 2 , "Always ensure at least this many extra cpus are included in a lease" )
22
24
cpuLeaserCPUSet = flag .String ("executor.cpu_leaser.cpuset" , "" , "Manual override for the set of CPUs that may be leased. Ignored if empty. Ex. '0-1,3'" )
25
+ warnAboutLeaks = flag .Bool ("executor.cpu_leaser.warn_about_leaks" , true , "If set, warn about leaked leases" )
26
+ )
27
+
28
const (
	// MaxNumLeases configures a safety valve to prevent a memory leak if
	// leasers forget to close their leases: once this many leases are
	// outstanding, the oldest lease is force-dropped (and its load
	// released) on the next Acquire.
	MaxNumLeases = 1000
)
24
33
25
34
// Compile-time check that CPULeaser implements the interface.
var _ interfaces.CPULeaser = (*CPULeaser)(nil)
28
// CPULeaser tracks which logical CPUs have been handed out to tasks so
// that new leases can be placed on the least-loaded processors first.
type CPULeaser struct {
	// mu guards cpus, leases, and load.
	mu sync.Mutex
	// cpus is the set of leasable logical CPUs, fixed at construction.
	cpus []cpuInfo
	// leases holds one entry per outstanding Acquire() that has not yet
	// been released; capped at MaxNumLeases.
	leases []lease
	// load maps a processor number to the number of leases currently
	// holding that processor.
	load map[int]int
	// physicalProcessors is the number of distinct physical IDs (NUMA
	// nodes) observed among cpus.
	physicalProcessors int
}
33
44
@@ -36,6 +47,12 @@ type cpuInfo struct {
36
47
physicalID int // numa node
37
48
}
38
49
50
// lease records a single outstanding CPU reservation made via Acquire.
type lease struct {
	// taskID identifies the task holding the lease.
	taskID string
	// cpus lists the processor numbers held by this lease.
	cpus []int
	// location is the caller's file:line captured at Acquire time; only
	// set if *warnAboutLeaks is enabled, and used to report leaks when a
	// lease is force-dropped.
	location string
}
55
+
39
56
func toCPUInfos (processors []int , physicalID int ) []cpuInfo {
40
57
infos := make ([]cpuInfo , len (processors ))
41
58
for i , p := range processors {
@@ -114,9 +131,10 @@ func Parse(s string) ([]int, error) {
114
131
return processors , nil
115
132
}
116
133
117
- func NewLeaser () (interfaces.CPULeaser , error ) {
118
- cl := & cpuLeaser {
119
- leases : make (map [cpuInfo ][]string ),
134
+ func NewLeaser () (* CPULeaser , error ) {
135
+ cl := & CPULeaser {
136
+ leases : make ([]lease , 0 , MaxNumLeases ),
137
+ load : make (map [int ]int , 0 ),
120
138
}
121
139
122
140
var cpus []cpuInfo
@@ -130,13 +148,16 @@ func NewLeaser() (interfaces.CPULeaser, error) {
130
148
cpus = GetCPUs ()
131
149
}
132
150
151
+ cl .cpus = make ([]cpuInfo , len (cpus ))
152
+
133
153
processors := make (map [int ]struct {}, 0 )
134
- for _ , cpu := range cpus {
135
- cl .leases [cpu ] = make ([]string , 0 )
154
+ for i , cpu := range cpus {
155
+ cl .cpus [i ] = cpu
156
+ cl .load [cpu .processor ] = 0
136
157
processors [cpu .physicalID ] = struct {}{}
137
158
}
138
159
cl .physicalProcessors = len (processors )
139
- log .Debugf ("NewLeaser with %d processors and %d cores" , cl .physicalProcessors , len (cl .leases ))
160
+ log .Debugf ("NewLeaser with %d processors and %d cores" , cl .physicalProcessors , len (cl .cpus ))
140
161
return cl , nil
141
162
}
142
163
@@ -167,7 +188,7 @@ func WithNoOverhead() Option {
167
188
168
189
// Acquire leases a set of CPUs (identified by index) for a task. The returned
169
190
// function should be called to free the CPUs when they are no longer used.
170
- func (l * cpuLeaser ) Acquire (milliCPU int64 , taskID string , opts ... any ) (int , []int , func ()) {
191
+ func (l * CPULeaser ) Acquire (milliCPU int64 , taskID string , opts ... any ) (int , []int , func ()) {
171
192
l .mu .Lock ()
172
193
defer l .mu .Unlock ()
173
194
@@ -181,15 +202,16 @@ func (l *cpuLeaser) Acquire(milliCPU int64, taskID string, opts ...any) (int, []
181
202
numCPUs := computeNumCPUs (milliCPU , ! options .disableOverhead )
182
203
// If the CPU leaser is disabled; return all CPUs.
183
204
if ! * cpuLeaserEnable {
184
- numCPUs = len (l .leases )
205
+ numCPUs = len (l .cpus )
185
206
}
186
207
187
208
// Put all CPUs in a priority queue.
188
209
pq := priority_queue .New [cpuInfo ]()
189
- for cpuid , tasks := range l .leases {
210
+ for _ , cpuInfo := range l .cpus {
211
+ numTasks := l .load [cpuInfo .processor ]
190
212
// we want the least loaded cpus first, so give the
191
213
// cpus with more tasks a more negative score.
192
- pq .Push (cpuid , - 1 * len ( tasks ) )
214
+ pq .Push (cpuInfo , - 1 * numTasks )
193
215
}
194
216
195
217
// Get the set of CPUs, in order of load (incr).
@@ -217,30 +239,79 @@ func (l *cpuLeaser) Acquire(milliCPU int64, taskID string, opts ...any) (int, []
217
239
if c .physicalID != selectedNode {
218
240
continue
219
241
}
220
- // If the CPULeaser is enabled, actually track the lease.
221
- if * cpuLeaserEnable {
222
- l .leases [c ] = append (l .leases [c ], taskID )
223
- }
224
242
leaseSet = append (leaseSet , c .processor )
243
+ l .load [c .processor ] += 1
225
244
if len (leaseSet ) == numCPUs {
226
245
break
227
246
}
228
247
}
229
248
249
+ // If the CPULeaser is enabled, actually track the lease.
250
+ if * cpuLeaserEnable {
251
+ if len (l .leases ) >= MaxNumLeases {
252
+ droppedLease := l .leases [0 ]
253
+ l .leases = l .leases [1 :]
254
+ for _ , processor := range droppedLease .cpus {
255
+ l .load [processor ] -= 1
256
+ }
257
+ if * warnAboutLeaks {
258
+ alert .UnexpectedEvent ("cpu_leaser_leak" , "Acquire() handle leak at %s!" , droppedLease .location )
259
+ }
260
+ }
261
+
262
+ lease := lease {
263
+ taskID : taskID ,
264
+ cpus : leaseSet ,
265
+ }
266
+ if * warnAboutLeaks {
267
+ if _ , file , no , ok := runtime .Caller (1 ); ok {
268
+ lease .location = fmt .Sprintf ("%s:%d" , file , no )
269
+ }
270
+ }
271
+
272
+ l .leases = append (l .leases , lease )
273
+ }
274
+
230
275
log .Debugf ("Leased %s to task: %q (%d milliCPU)" , Format (leaseSet ), taskID , milliCPU )
231
276
return selectedNode , leaseSet , func () {
232
277
l .release (taskID )
233
278
}
234
279
}
235
280
236
- func (l * cpuLeaser ) release (taskID string ) {
281
+ func (l * CPULeaser ) release (taskID string ) {
237
282
l .mu .Lock ()
238
283
defer l .mu .Unlock ()
239
284
240
- for cpuid , tasks := range l .leases {
241
- l .leases [cpuid ] = slices .DeleteFunc (tasks , func (s string ) bool {
242
- return s == taskID
243
- })
285
+ var cpus []int
286
+ l .leases = slices .DeleteFunc (l .leases , func (l lease ) bool {
287
+ if l .taskID == taskID {
288
+ cpus = l .cpus
289
+ return true
290
+ }
291
+ return false
292
+ })
293
+
294
+ for _ , cpu := range cpus {
295
+ l .load [cpu ] -= 1
244
296
}
297
+
245
298
log .Debugf ("Task: %q released CPUs" , taskID )
246
299
}
300
+
301
+ func (l * CPULeaser ) TestOnlyGetOpenLeases () map [string ]int {
302
+ l .mu .Lock ()
303
+ defer l .mu .Unlock ()
304
+
305
+ taskCounts := make (map [string ]int )
306
+ for _ , l := range l .leases {
307
+ taskCounts [l .taskID ] = len (l .cpus )
308
+ }
309
+ return taskCounts
310
+ }
311
+
312
+ func (l * CPULeaser ) TestOnlyGetLoads () map [int ]int {
313
+ l .mu .Lock ()
314
+ defer l .mu .Unlock ()
315
+
316
+ return l .load
317
+ }
0 commit comments