Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions ingestor/adx/retry.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package adx

import (
"context"
"errors"
"fmt"
"net/http"
"time"

kustoerrors "github.com/Azure/azure-kusto-go/kusto/data/errors"
"k8s.io/apimachinery/pkg/util/wait"

"github.com/Azure/adx-mon/pkg/logger"
)

// isThrottled reports whether err wraps a Kusto HttpError whose IsThrottled
// method returns true. Any error that does not unwrap to an HttpError yields false.
func isThrottled(err error) bool {
	var httpErr *kustoerrors.HttpError
	return errors.As(err, &httpErr) && httpErr.IsThrottled()
}

// isHTTP5xx reports whether err wraps a Kusto HttpError whose status code lies
// in the 5xx range (500 through 599 inclusive). Any error that does not unwrap
// to an HttpError yields false.
func isHTTP5xx(err error) bool {
	var httpErr *kustoerrors.HttpError
	if !errors.As(err, &httpErr) {
		return false
	}
	code := httpErr.StatusCode
	return code >= http.StatusInternalServerError && code <= 599
}

// isTransientKusto reports whether the Kusto SDK's errors.Retry predicate
// classifies err as retryable. The decision is delegated entirely to the SDK.
func isTransientKusto(err error) bool {
	retryable := kustoerrors.Retry(err)
	return retryable
}

// retryMgmt runs fn, a Kusto management command, retrying with exponential
// backoff (using the Kubernetes wait utilities) for as long as it fails with an
// error considered transient. An attempt is retried when its error is:
//   - throttling, as reported by isThrottled
//   - an HTTP 5xx response, as reported by isHTTP5xx
//   - transient according to isTransientKusto (the Kusto SDK's errors.Retry predicate)
//
// Backoff parameters: start 500ms, factor 2.0, jitter 0.25, cap 10s, steps 8.
// NOTE(review): wait.Backoff documents that once the grown duration would
// exceed Cap the remaining Steps are set to zero, so fewer than 8 attempts may
// actually be made; confirm this is the intended schedule.
//
// desc names the operation in log lines and in returned errors.
//
// The result is nil on success. Otherwise the returned error is prefixed with
// desc and wraps (via %w) exactly one of:
//   - the non-retryable error returned by fn, when that ended the loop;
//   - the context error, when ctx was cancelled or expired (the last transient
//     error, if any, is appended to the message for diagnostics);
//   - the last transient error, marked "exhausted retries", when the backoff
//     ran out of attempts.
func retryMgmt(ctx context.Context, desc string, fn func() error) error {
	backoff := wait.Backoff{
		Duration: 500 * time.Millisecond,
		Factor:   2.0,
		Jitter:   0.25,
		Steps:    8,
		Cap:      10 * time.Second,
	}

	var (
		// lastErr is the most recent transient error returned by fn.
		lastErr error
		// stopped is set when the condition itself ended the loop with an
		// error (non-retryable failure or an already-done context), as opposed
		// to the backoff giving up or being interrupted while waiting.
		stopped bool
	)
	err := wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) {
		if cerr := ctx.Err(); cerr != nil {
			stopped = true
			return true, cerr
		}
		e := fn()
		if e == nil {
			return true, nil
		}
		if !(isThrottled(e) || isHTTP5xx(e) || isTransientKusto(e)) {
			// Not retryable: surface this error as-is, even if earlier
			// attempts failed with transient errors.
			stopped = true
			return true, e
		}
		lastErr = e
		logger.Warnf("Retrying %s due to transient error: %v", desc, e)
		return false, nil
	})
	if err == nil {
		return nil
	}
	if stopped {
		// err is exactly what the condition returned; do not let an earlier
		// transient error mask it.
		return fmt.Errorf("%s: %w", desc, err)
	}
	if cerr := ctx.Err(); cerr != nil && errors.Is(err, cerr) {
		// Interrupted between attempts: keep the context error in the chain so
		// callers can still match it with errors.Is.
		if lastErr != nil {
			return fmt.Errorf("%s: %w (last error: %v)", desc, err, lastErr)
		}
		return fmt.Errorf("%s: %w", desc, err)
	}
	if lastErr != nil {
		return fmt.Errorf("%s: exhausted retries: %w", desc, lastErr)
	}
	return fmt.Errorf("%s: %w", desc, err)
}
42 changes: 30 additions & 12 deletions ingestor/adx/syncer.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,12 @@ func (s *Syncer) Close() error {
func (s *Syncer) loadIngestionMappings(ctx context.Context) error {
query := fmt.Sprintf(".show database %s ingestion mappings", s.database)
stmt := kusto.NewStmt("", kusto.UnsafeStmt(unsafe.Stmt{Add: true, SuppressWarning: true})).UnsafeAdd(query)
rows, err := s.KustoCli.Mgmt(ctx, s.database, stmt)
if err != nil {
var rows *kusto.RowIterator
if err := retryMgmt(ctx, "loadIngestionMappings", func() error {
var err error
rows, err = s.KustoCli.Mgmt(ctx, s.database, stmt)
return err
}); err != nil {
return err
}

Expand Down Expand Up @@ -176,8 +180,12 @@ func (s *Syncer) EnsureTable(table string, mapping schema.SchemaMapping) error {

showStmt := kusto.NewStmt("", kusto.UnsafeStmt(unsafe.Stmt{Add: true, SuppressWarning: true})).UnsafeAdd(sb.String())

rows, err := s.KustoCli.Mgmt(context.Background(), s.database, showStmt)
if err != nil {
var rows *kusto.RowIterator
if err := retryMgmt(context.Background(), "ensure-table", func() error {
var err error
rows, err = s.KustoCli.Mgmt(context.Background(), s.database, showStmt)
return err
}); err != nil {
return err
}

Expand Down Expand Up @@ -249,8 +257,12 @@ func (s *Syncer) EnsureMapping(table string, mapping schema.SchemaMapping) (stri

showStmt := kusto.NewStmt("", kusto.UnsafeStmt(unsafe.Stmt{Add: true, SuppressWarning: true})).UnsafeAdd(sb.String())

rows, err := s.KustoCli.Mgmt(context.Background(), s.database, showStmt)
if err != nil {
var rows *kusto.RowIterator
if err := retryMgmt(context.Background(), "ensure-mapping", func() error {
var err error
rows, err = s.KustoCli.Mgmt(context.Background(), s.database, showStmt)
return err
}); err != nil {
return "", err
}

Expand Down Expand Up @@ -342,16 +354,20 @@ func (s *Syncer) ensurePromMetricsFunctions(ctx context.Context) error {
// but we can't create the function unless a table exists.
stmt := kusto.NewStmt("", kusto.UnsafeStmt(unsafe.Stmt{Add: true, SuppressWarning: true})).UnsafeAdd(
".create table AdxmonIngestorTableCardinalityCount (Timestamp: datetime, SeriesId: long, Labels: dynamic, Value: real)")
_, err := s.KustoCli.Mgmt(ctx, s.database, stmt)
if err != nil {
if err := retryMgmt(ctx, "ensure-prom-functions-create-cardinality-table", func() error {
_, err := s.KustoCli.Mgmt(ctx, s.database, stmt)
return err
}); err != nil {
return err
}

for _, fn := range functions {
logger.Infof("Creating function %s", fn.name)
stmt := kusto.NewStmt("", kusto.UnsafeStmt(unsafe.Stmt{Add: true, SuppressWarning: true})).UnsafeAdd(fn.body)
_, err := s.KustoCli.Mgmt(ctx, s.database, stmt)
if err != nil {
if err := retryMgmt(ctx, "create-function-"+fn.name, func() error {
_, err := s.KustoCli.Mgmt(ctx, s.database, stmt)
return err
}); err != nil {
return err
}
}
Expand Down Expand Up @@ -394,8 +410,10 @@ func (s *Syncer) ensureIngestionPolicy(ctx context.Context) error {

stmt := kusto.NewStmt("", kusto.UnsafeStmt(unsafe.Stmt{Add: true, SuppressWarning: true})).UnsafeAdd(
fmt.Sprintf(".alter-merge database %s policy ingestionbatching\n```%s\n```", s.database, string(b)))
_, err = s.KustoCli.Mgmt(ctx, s.database, stmt)
if err != nil {
if err := retryMgmt(ctx, "ensure-ingestion-policy", func() error {
_, err = s.KustoCli.Mgmt(ctx, s.database, stmt)
return err
}); err != nil {
return err
}
return nil
Expand Down
8 changes: 6 additions & 2 deletions ingestor/adx/uploader.go
Original file line number Diff line number Diff line change
Expand Up @@ -355,8 +355,12 @@ func (n *uploader) extractSchema(path string) (string, error) {
// https://learn.microsoft.com/en-us/azure/data-explorer/kusto-emulator-overview#limitations
func (n *uploader) clusterRequiresDirectIngest(ctx context.Context) (bool, error) {
stmt := kql.New(".show cluster details")
rows, err := n.KustoCli.Mgmt(ctx, n.database, stmt)
if err != nil {
var rows *kusto.RowIterator
if err := retryMgmt(ctx, "show-cluster-details", func() error {
var err error
rows, err = n.KustoCli.Mgmt(ctx, n.database, stmt)
return err
}); err != nil {
return false, fmt.Errorf("failed to query cluster details: %w", err)
}
defer rows.Stop()
Expand Down