@@ -23,11 +23,18 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"math"
 	"strings"
+	"sync"
 	"time"
 
 	"github.com/RichardKnop/machinery/v1"
 	"github.com/go-playground/validator/v10"
+	grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware"
+	grpc_zap "github.com/grpc-ecosystem/go-grpc-middleware/logging/zap"
+	grpc_retry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
+	grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
+	"google.golang.org/grpc"
 	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/credentials/insecure"
 	"google.golang.org/grpc/status"
 
@@ -46,10 +53,14 @@ import (
 const (
 	// preheatTimeout is timeout of preheating.
 	preheatTimeout = 20 * time.Minute
-	// listTasksTimeout is timeout of listing tasks.
-	listTasksTimeout = 10 * time.Minute
 	// deleteTaskTimeout is timeout of deleting task.
 	deleteTaskTimeout = 20 * time.Minute
+	// deleteTaskConcurrency is the maximum number of concurrent delete task RPCs.
+	deleteTaskConcurrency = 10
+	// deleteTaskMaxRetries is the maximum number of retries for delete task RPCs.
+	deleteTaskMaxRetries = 3
+	// deleteTaskBackoffWaitBetween is the fixed wait duration between retry attempts in linear backoff.
+	deleteTaskBackoffWaitBetween = 500 * time.Millisecond
 )
 
 // Job is an interface for job.
@@ -306,12 +317,6 @@ func (j *job) syncPeers() (string, error) {
 
 // listTasks is a job to list tasks.
 func (j *job) listTasks(ctx context.Context, data string) (string, error) {
-	// TODO:
-	// 1. query all peers with task id
-	// 2. delete current task by task id and host id
-	ctx, cancel := context.WithTimeout(ctx, listTasksTimeout)
-	defer cancel()
-
 	req := &internaljob.ListTasksRequest{}
 	if err := internaljob.UnmarshalRequest(data, req); err != nil {
 		logger.Errorf("unmarshal request err: %s, request body: %s", err.Error(), data)
@@ -324,26 +329,21 @@ func (j *job) listTasks(ctx context.Context, data string) (string, error) {
 	}
 
 	// Get all peers by task id
-	peers, err := j.getPeers(req.TaskID)
+	peers, err := j.getValidPeers(req.TaskID)
 	if err != nil {
 		logger.Errorf("get peers by task id %s failed: %s", req.TaskID, err.Error())
 		return "", err
 	}
 
-	// Return peers by page
 	listTaskResponse := &internaljob.ListTasksResponse{
-		Total: len(peers),
-		Page:  req.Page,
-		Peers: peers[req.Page*req.PerPage : (req.Page+1)*req.PerPage],
+		Peers: peers,
 	}
 
 	return internaljob.MarshalResponse(listTaskResponse)
 }
 
 // deleteTask is a job to delete task.
 func (j *job) deleteTask(ctx context.Context, data string) (string, error) {
-	// TODO:
-	// 1. query all peers with task id
 	ctx, cancel := context.WithTimeout(ctx, deleteTaskTimeout)
 	defer cancel()
 
@@ -359,43 +359,82 @@ func (j *job) deleteTask(ctx context.Context, data string) (string, error) {
 	}
 
 	// Get all peers by task id
-	peers, err := j.getPeers(req.TaskID)
+	peers, err := j.getValidPeers(req.TaskID)
 	if err != nil {
 		logger.Errorf("get peers by task id %s failed: %s", req.TaskID, err.Error())
 		return "", err
 	}
 
 	// Delete task by task id and host id
-	successTasks := make([]*internaljob.TaskInfo, 0)
-	failureTasks := make([]*internaljob.TaskInfo, 0)
+	successTasks := make([]*internaljob.Task, 0)
+	failureTasks := make([]*internaljob.Task, 0)
 
+	// Use a wait group and a buffered channel to limit delete RPC concurrency
+	// and avoid sending too many requests to the hosts. The mutex guards the
+	// result slices, which are appended to from multiple goroutines.
+	var mu sync.Mutex
+	wg := sync.WaitGroup{}
+	deleteTaskLimit := make(chan struct{}, deleteTaskConcurrency)
 	for _, peer := range peers {
-		// hostID := peer.Host.ID
-		// get task info by task id
-		task, ok := j.resource.TaskManager().Load(req.TaskID)
-		if !ok {
-			logger.Errorf("task %s not found", req.TaskID)
-			failureTasks = append(failureTasks, &internaljob.TaskInfo{
-				Task: nil,
-				Peer: peer,
-				Desc: "task not found",
+		wg.Add(1)
+		deleteTaskLimit <- struct{}{}
+		go func(peer *resource.Peer) {
+			defer func() {
+				wg.Done()
+				<-deleteTaskLimit
+			}()
+
+			// Get dfdaemon client from host. Plaintext gRPC is assumed here,
+			// since DialContext returns an error when no credentials option is set.
+			target := fmt.Sprintf("%s:%d", peer.Host.IP, peer.Host.Port)
+			conn, err := grpc.DialContext(
+				ctx,
+				target,
+				grpc.WithTransportCredentials(insecure.NewCredentials()),
+				grpc.WithIdleTimeout(0),
+				grpc.WithDefaultCallOptions(
+					grpc.MaxCallRecvMsgSize(math.MaxInt32),
+					grpc.MaxCallSendMsgSize(math.MaxInt32),
+				),
+				grpc.WithUnaryInterceptor(grpc_middleware.ChainUnaryClient(
+					grpc_prometheus.UnaryClientInterceptor,
+					grpc_zap.UnaryClientInterceptor(logger.GrpcLogger.Desugar()),
+					grpc_retry.UnaryClientInterceptor(
+						grpc_retry.WithMax(deleteTaskMaxRetries),
+						grpc_retry.WithBackoff(grpc_retry.BackoffLinear(deleteTaskBackoffWaitBetween)),
+					),
+				)),
+			)
+			if err != nil {
+				logger.Errorf("create grpc client to %s failed: %s", target, err.Error())
+				mu.Lock()
+				failureTasks = append(failureTasks, &internaljob.Task{
+					Task:        peer.Task,
+					Peer:        peer,
+					Description: err.Error(),
+				})
+				mu.Unlock()
+				return
+			}
+			defer conn.Close()
+
+			dfdaemonUploadClient := dfdaemonv2.NewDfdaemonUploadClient(conn)
+			_, err = dfdaemonUploadClient.DeleteCacheTask(ctx, &dfdaemonv2.DeleteCacheTaskRequest{
+				TaskId: req.TaskID,
 			})
-			continue
-		}
+			if err != nil {
+				logger.Errorf("delete task %s from %s failed: %s", req.TaskID, target, err.Error())
+				mu.Lock()
+				failureTasks = append(failureTasks, &internaljob.Task{
+					Task:        peer.Task,
+					Peer:        peer,
+					Description: err.Error(),
+				})
+				mu.Unlock()
+				return
+			}
 
-		// TODO: change to scheduler delete task grpc function
-		// and add batch delete
-		j.resource.SeedPeer().Client().DeleteCacheTask(ctx, &dfdaemonv2.DeleteCacheTaskRequest{
-			TaskId: req.TaskID,
-		})
-
-		successTasks = append(successTasks, &internaljob.TaskInfo{
-			Task: task,
-			Peer: peer,
-			Desc: "success",
-		})
+			mu.Lock()
+			successTasks = append(successTasks, &internaljob.Task{
+				Task:        peer.Task,
+				Peer:        peer,
+				Description: fmt.Sprintf("delete task %s from %s success", req.TaskID, target),
+			})
+			mu.Unlock()
+		}(peer)
 	}
 
+	wg.Wait()
+
 	deleteTaskResponse := &internaljob.DeleteTaskResponse{
 		SuccessTasks: successTasks,
 		FailureTasks: failureTasks,
@@ -404,8 +443,8 @@ func (j *job) deleteTask(ctx context.Context, data string) (string, error) {
 	return internaljob.MarshalResponse(deleteTaskResponse)
 }
 
-// getPeers try to get peers by task id
-func (j *job) getPeers(taskID string) ([]*resource.Peer, error) {
+// getValidPeers tries to get valid peers by task id
+func (j *job) getValidPeers(taskID string) ([]*resource.Peer, error) {
 	// get task info by task id
 	task, ok := j.resource.TaskManager().Load(taskID)
 	if !ok {
@@ -424,5 +463,14 @@ func (j *job) getPeers(taskID string) ([]*resource.Peer, error) {
 		peers = append(peers, peer)
 	}
 
-	return peers, nil
+	// Choose finished peers as the result of listing tasks
+	finishedPeers := make([]*resource.Peer, 0, len(peers))
+	for _, peer := range peers {
+		currentState := peer.FSM.Current()
+		if currentState == resource.PeerStateSucceeded || currentState == resource.PeerStateFailed {
+			finishedPeers = append(finishedPeers, peer)
+		}
+	}
+
+	return finishedPeers, nil
 }