
Commit 4f53ee2

[3.1.9 backport] CBG-4016 Refresh sequence allocator before incr during nextSequenceGreaterThan (#6960)

* CBG-4015 Refresh sequence allocator before incr during nextSequenceGreaterThan. When nextSequenceGreaterThan requires a sequence larger than what's already present in the allocator's batch, fetch the current _sync:seq before computing the required increment size, to account for variable allocation rates by other allocators.
* Logging fixes
* Test enhancements from PR review
1 parent 6d852b1 commit 4f53ee2
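To make the change concrete before reading the diff: the sketch below is a minimal, self-contained illustration of the refresh-before-increment pattern the commit message describes. The type and helper names (syncSeqStore, fetchSyncSeq, incrBy, nextGreaterThan) are invented for this sketch, not the allocator's real API, and the normal-allocation branch is collapsed into the same function for brevity.

```go
package main

import "fmt"

// syncSeqStore stands in for the shared _sync:seq counter document;
// the name and methods are hypothetical, for illustration only.
type syncSeqStore struct{ value uint64 }

func (s *syncSeqStore) fetchSyncSeq() uint64 { return s.value }

func (s *syncSeqStore) incrBy(n uint64) uint64 {
	s.value += n
	return s.value
}

// nextGreaterThan returns a sequence guaranteed to exceed existingSequence.
// The key point of the fix: re-read the shared counter first, so the catch-up
// size is computed from the current _sync:seq rather than a stale local maximum.
func nextGreaterThan(store *syncSeqStore, existingSequence, batchSize uint64) uint64 {
	syncSeq := store.fetchSyncSeq() // refresh before incr
	var numberToRelease uint64
	if existingSequence > syncSeq {
		numberToRelease = existingSequence - syncSeq // catch-up gap only
	}
	allocatedTo := store.incrBy(numberToRelease + batchSize)
	// Sequences (allocatedTo-batchSize-numberToRelease, allocatedTo-batchSize]
	// are surplus catch-up sequences and would be released; the first sequence
	// of the fresh batch is assigned.
	return allocatedTo - batchSize + 1
}

func main() {
	store := &syncSeqStore{value: 20}           // another node already advanced _sync:seq to 20
	fmt.Println(nextGreaterThan(store, 15, 10)) // 21: _sync:seq is already past 15, so no catch-up release
}
```

With _sync:seq already at 20, requiring a sequence greater than 15 needs no catch-up releases; the next batch simply starts at 21, which matches the updated test expectations further down.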

File tree: 2 files changed, +196 -19 lines changed


db/sequence_allocator.go (45 additions, 13 deletions)

```diff
@@ -182,6 +182,18 @@ func (s *sequenceAllocator) nextSequence(ctx context.Context) (sequence uint64,
 	return sequence, nil
 }
 
+// _releaseCurrentBatch releases any unused sequences currently held by the allocator
+func (s *sequenceAllocator) _releaseCurrentBatch(ctx context.Context) (numReleased uint64, err error) {
+	if s.max > s.last {
+		numReleased, err = s.releaseSequenceRange(ctx, s.last+1, s.max)
+		if err != nil {
+			return 0, err
+		}
+		s.last = s.max
+	}
+	return numReleased, nil
+}
+
 // nextSequenceGreaterThan increments _sync:seq such that it's greater than existingSequence + s.sequenceBatchSize
 // In the case where our local s.max < _sync:seq (another node has incremented _sync:seq), we may be releasing
 // sequences greater than existingSequence, but we will only ever release sequences allocated by this node's incr operation
@@ -221,19 +233,45 @@ func (s *sequenceAllocator) nextSequenceGreaterThan(ctx context.Context, existin
 
 	}
 
-	// If the target sequence is greater than the highest in our batch (s.max), we want to:
-	// (a) Reserve n sequences past _sync:seq, where n = existingSequence - s.max. It's ok if the resulting sequence exceeds targetSequence (if other nodes have allocated sequences and
-	// updated _sync:seq since we last updated s.max.), then
+	// At this point we need to allocate a sequence that's larger than what's in our current batch, so we first need to release the current batch while holding the mutex.
+	var numReleasedBatch uint64
+	numReleasedBatch, err = s._releaseCurrentBatch(ctx)
+	if err != nil {
+		base.InfofCtx(ctx, base.KeyCache, "Unable to release current batch during nextSequenceGreaterThan for existing sequence %d. Will be handled by skipped sequence handling. %v", existingSequence, err)
+	}
+	releasedSequenceCount += numReleasedBatch
+
+	syncSeq, err := s.getSequence()
+	if err != nil {
+		base.WarnfCtx(ctx, "Unable to fetch current sequence during nextSequenceGreaterThan for existing sequence %d. Error:%v", existingSequence, err)
+		s.mutex.Unlock()
+		return 0, 0, err
+	}
+
+	// If the target sequence is less than the current _sync:seq, allocate as normal using _nextSequence
+	if syncSeq >= targetSequence {
+		sequence, sequencesReserved, err := s._nextSequence(ctx)
+		s.mutex.Unlock()
+		if err != nil {
+			return 0, 0, err
+		}
+		if sequencesReserved {
+			s.reserveNotify <- struct{}{}
+		}
+		s.dbStats.SequenceAssignedCount.Add(1)
+		return sequence, releasedSequenceCount, nil
+	}
+
+	// If the target sequence is greater than the current _sync:seq, we want to:
+	// (a) Reserve n sequences past _sync:seq, where n = existingSequence - syncSeq. It's ok if the resulting sequence exceeds targetSequence (if other nodes have allocated sequences and
+	// updated _sync:seq since we last updated s.max.)
 	// (b) Allocate a standard batch of sequences, and assign a sequence from that batch in the usual way.
 	// (c) Release any previously allocated sequences (s.last to s.max)
 	// (d) Release the reserved sequences from part (a)
 	// We can perform (a) and (b) as a single increment operation, but (c) and (d) aren't necessarily contiguous blocks and must be released
 	// separately
 
-	prevAllocReleaseFrom := s.last + 1
-	prevAllocReleaseTo := s.max
-
-	numberToRelease := existingSequence - s.max
+	numberToRelease := existingSequence - syncSeq
 	numberToAllocate := s.sequenceBatchSize
 	allocatedToSeq, err := s.incrementSequence(numberToRelease + numberToAllocate)
 	if err != nil {
@@ -253,12 +291,6 @@ func (s *sequenceAllocator) nextSequenceGreaterThan(ctx context.Context, existin
 	s.dbStats.SequenceReservedCount.Add(int64(numberToRelease + numberToAllocate))
 	s.dbStats.SequenceAssignedCount.Add(1)
 
-	// Release previously allocated sequences (c), if any
-	released, err := s.releaseSequenceRange(ctx, prevAllocReleaseFrom, prevAllocReleaseTo)
-	if err != nil {
-		base.WarnfCtx(ctx, "Error returned when releasing sequence range [%d-%d] for previously allocated sequences. Will be handled by skipped sequence handling. Error:%v", prevAllocReleaseFrom, prevAllocReleaseTo, err)
-	}
-	releasedSequenceCount += released
 	// Release the newly allocated sequences that were used to catch up to existingSequence (d)
 	if numberToRelease > 0 {
 		releaseTo := allocatedToSeq - numberToAllocate
```
db/sequence_allocator_test.go (151 additions, 6 deletions)

```diff
@@ -11,7 +11,12 @@ licenses/APL2.txt.
 package db
 
 import (
+	"context"
+	"fmt"
+	"log"
+	"math/rand"
 	"sync"
+	"sync/atomic"
 	"testing"
 	"time"
 
@@ -378,14 +383,154 @@ func TestNextSequenceGreaterThanMultiNode(t *testing.T) {
 	assertNewAllocatorStats(t, dbStatsB, 1, 10, 2, 4)
 
 	// calling nextSequenceGreaterThan(15) on A will increment _sync:seq by 5 on its previously allocated sequence (10).
-	// Since node B has already updated _sync:seq to 20, will result in:
+	// Since node B has already updated _sync:seq to 20, calling nextSequenceGreaterThan(15) on A will result in:
 	// node A releasing sequences 2-10 from its existing buffer
-	// node A allocating and releasing sequences 21-24
-	// node A adding sequences 25-35 to its buffer, and assigning 25 to the current request
+	// node A adding sequences 21-30 to its buffer, and assigning 21 to the current request
 	nextSequence, releasedSequenceCount, err = a.nextSequenceGreaterThan(ctx, 15)
 	assert.NoError(t, err)
-	assert.Equal(t, uint64(26), nextSequence)
-	assert.Equal(t, 14, int(releasedSequenceCount))
-	assertNewAllocatorStats(t, dbStatsA, 2, 25, 2, 14)
+	assert.Equal(t, uint64(21), nextSequence)
+	assert.Equal(t, 9, int(releasedSequenceCount))
+	assertNewAllocatorStats(t, dbStatsA, 2, 20, 2, 9)
 
 }
+
+// TestVariableRateAllocators simulates the following scenario:
+//  - import nodes have high sequence allocation rate
+//  - client-facing nodes have low sequence allocation rate
+//  - documents are imported, then the same documents are immediately updated by clients
+//    (including sequence validation triggering nextSequenceGreaterThan)
+//
+// Ensures we don't release more sequences than would be expected based on allocator batch size
+func TestVariableRateAllocators(t *testing.T) {
+	ctx := base.TestCtx(t)
+	bucket := base.GetTestBucket(t)
+	defer bucket.Close(ctx)
+
+	var expectedAllocations uint64
+
+	dataStore := bucket.GetSingleDataStore()
+	stats, err := base.NewSyncGatewayStats()
+	require.NoError(t, err)
+
+	importStats, err := stats.NewDBStats("import", false, false, false, nil, nil)
+	require.NoError(t, err)
+
+	importFeedAllocator, err := newSequenceAllocator(ctx, dataStore, importStats.DatabaseStats, base.DefaultMetadataKeys)
+	require.NoError(t, err)
+
+	// All test allocators are stopped when allocatorCtx is closed
+	allocatorCtx, cancelFunc := context.WithCancel(ctx)
+
+	// Start import node allocator, performing 10000 allocations/second.
+	var allocatorWg sync.WaitGroup
+	allocatorWg.Add(1)
+	go func() {
+		count := runAllocator(allocatorCtx, importFeedAllocator, 100*time.Microsecond) // 10000 writes/second
+		atomic.AddUint64(&expectedAllocations, count)
+		allocatorWg.Done()
+	}()
+
+	// Start multiple client node allocators, performing 100 allocations/second
+	clientAllocators := make([]*sequenceAllocator, 0)
+	clientAllocatorCount := 10
+	for i := 0; i < clientAllocatorCount; i++ {
+		clientStats, err := stats.NewDBStats(fmt.Sprintf("client%d", i), false, false, false, nil, nil)
+		require.NoError(t, err)
+		clientAllocator, err := newSequenceAllocator(ctx, dataStore, clientStats.DatabaseStats, base.DefaultMetadataKeys)
+		require.NoError(t, err)
+		clientAllocators = append(clientAllocators, clientAllocator)
+		allocatorWg.Add(1)
+		go func() {
+			count := runAllocator(allocatorCtx, clientAllocator, 10*time.Millisecond) // 100 writes/second
+			atomic.AddUint64(&expectedAllocations, count)
+			allocatorWg.Done()
+		}()
+	}
+
+	// Wait for allocators to get up to maximum batch size
+	time.Sleep(500 * time.Millisecond)
+	documentCount := 10
+	var updateWg sync.WaitGroup
+	updateWg.Add(documentCount)
+	for i := 0; i < documentCount; i++ {
+		go func() {
+			_ = multiNodeUpdate(t, ctx, importFeedAllocator, clientAllocators, 5, 10*time.Millisecond)
+			updateWg.Done()
+			atomic.AddUint64(&expectedAllocations, 6)
+		}()
+	}
+
+	updateWg.Wait()
+
+	// Stop background allocation goroutines, wait for them to close
+	cancelFunc()
+	allocatorWg.Wait()
+
+	log.Printf("expectedSequence (num allocations):%v", atomic.LoadUint64(&expectedAllocations))
+
+	importFeedAllocator.Stop(ctx)
+	numAssigned := importFeedAllocator.dbStats.SequenceAssignedCount.Value()
+	numReleased := importFeedAllocator.dbStats.SequenceReleasedCount.Value()
+	for _, allocator := range clientAllocators {
+		allocator.Stop(ctx)
+		numAssigned += allocator.dbStats.SequenceAssignedCount.Value()
+		clientSequencesReleased := allocator.dbStats.SequenceReleasedCount.Value()
+		numReleased += clientSequencesReleased
+	}
+
+	log.Printf("Total sequences released + assigned: %v", numReleased+numAssigned)
+	actualSequence, err := importFeedAllocator.getSequence()
+	log.Printf("actual sequence (getSequence): %v", actualSequence)
+	require.NoError(t, err)
+}
+
+// multiNodeUpdate obtains an initial sequence from an import allocator (import node), then performs repeated updates to the doc using a random allocator from the pool (simulating a random SG node).
+// Performs nextSequenceGreaterThan, then ensures that the allocator doesn't release more than the sequence batch size
+func multiNodeUpdate(t *testing.T, ctx context.Context, importAllocator *sequenceAllocator, clientAllocators []*sequenceAllocator, updateCount int, interval time.Duration) (releasedCount uint64) {
+	currentSequence, _ := importAllocator.nextSequence(ctx)
+
+	for i := 0; i < updateCount; i++ {
+		allocatorIndex := rand.Intn(len(clientAllocators))
+		clientAllocator := clientAllocators[allocatorIndex]
+		nextSequence, err := clientAllocator.nextSequence(ctx)
+		require.NoError(t, err, "nextSequence error: %v", err)
+		if nextSequence < currentSequence {
+			prevNext := nextSequence
+			var numReleased uint64
+			nextSequence, numReleased, err = clientAllocator.nextSequenceGreaterThan(ctx, currentSequence)
+			require.NoError(t, err, "nextSequenceGreaterThan error: %v", err)
+			log.Printf("allocator %d released %d sequences because next < current (%d < %d)", allocatorIndex, numReleased, prevNext, currentSequence)
+			// At most, clientAllocator should only need to release its current batch
+			assert.LessOrEqual(t, numReleased, getClientSequenceBatchSize(clientAllocator))
+			releasedCount += numReleased
+		}
+		currentSequence = nextSequence
+		time.Sleep(interval)
+	}
+
+	return releasedCount
+}
+
+func runAllocator(ctx context.Context, a *sequenceAllocator, frequency time.Duration) (allocationCount uint64) {
+	allocationCount = 0
+	ticker := time.NewTicker(frequency)
+	for {
+		select {
+		case <-ticker.C:
+			_, _ = a.nextSequence(ctx)
+			allocationCount++
+		case <-ctx.Done():
+			ticker.Stop()
+			log.Printf("allocator count: %v", allocationCount)
+			return allocationCount
+		}
+	}
+}
+
+func getClientSequenceBatchSize(allocator *sequenceAllocator) uint64 {
+	allocator.mutex.Lock()
+	defer allocator.mutex.Unlock()
+	return allocator.sequenceBatchSize
+}
```
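The bound asserted in multiNodeUpdate (numReleased never exceeding the batch size) is what the refresh buys. Below is a sketch with hypothetical values contrasting the released counts under the old stale-s.max computation and the new fetched-_sync:seq computation:

```go
package main

import "fmt"

func main() {
	staleMax := uint64(10)            // slow node's local batch maximum (s.max)
	syncSeq := uint64(10_000)         // freshly fetched _sync:seq, advanced by a fast import node
	existingSequence := uint64(9_990) // sequence handed out earlier by the import node

	// Old behavior: catch-up size computed against the stale local maximum.
	oldRelease := existingSequence - staleMax // 9980 sequences reserved only to be released

	// New behavior: _sync:seq is fetched first. Because the counter has already
	// moved past the target, the normal allocation path is taken and only this
	// node's own unused batch (at most sequenceBatchSize) is released.
	var newCatchUp uint64
	if existingSequence > syncSeq {
		newCatchUp = existingSequence - syncSeq
	}
	fmt.Println(oldRelease, newCatchUp) // 9980 0
}
```

Since _sync:seq can only have advanced past any sequence another node already handed out, the refreshed computation keeps each catch-up's releases bounded by the allocator's own batch, regardless of how fast other allocators are running.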
