[FIXED] workqueue reset to 0 when blk file is zero-sized due to unflushed data on crash (#6882)

neilalexander · web-flow · commit 7c06a4f3a098 · 2025-05-12T10:18:42.000+01:00
I encountered an issue where a Workqueue stream's first and last sequence numbers were unexpectedly reset to 0 following an abrupt termination of the NATS server. Interestingly, the consumer remained fully caught up with messages and retained its expected state even after the crash, but the stream itself appeared to have been reset. I was able to retrieve a backup of the data after the crash and debug it locally. During analysis, I found that new msgs had not been flushed to disk, which I believe resulted in a zero-sized blk file. As a result, during recovery, the stream state remained at zero and the index.db could not be used to reconstruct the state. Resolves: #6881. Signed-off-by: souravagrawal <souravagrawal1111@gmail.com>
diff --git a/server/filestore.go b/server/filestore.go
@@ -475,18 +475,20 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim
 		}
 
 		// Check if our prior state remembers a last sequence past where we can see.
-		if fs.ld != nil && prior.LastSeq > fs.state.LastSeq {
+		if prior.LastSeq > fs.state.LastSeq {
 			fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime
 			if fs.state.Msgs == 0 {
 				fs.state.FirstSeq = fs.state.LastSeq + 1
 				fs.state.FirstTime = time.Time{}
 			}
-			if _, err := fs.newMsgBlockForWrite(); err == nil {
-				if err = fs.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano()); err != nil {
+			if fs.ld != nil {
+				if _, err := fs.newMsgBlockForWrite(); err == nil {
+					if err = fs.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano()); err != nil {
+						return nil, err
+					}
+				} else {
 					return nil, err
 				}
-			} else {
-				return nil, err
 			}
 		}
 		// Since we recovered here, make sure to kick ourselves to write out our stream state.
diff --git a/server/filestore_test.go b/server/filestore_test.go
@@ -9555,3 +9555,33 @@ func TestFileStoreAllLastSeqs(t *testing.T) {
 	require_NoError(t, err)
 	require_True(t, reflect.DeepEqual(seqs, expected))
 }
+
+func TestFileStoreRecoverDoesNotResetStreamState(t *testing.T) {
+	cfg := StreamConfig{Name: "zzz", Subjects: []string{"ev.1"}, Storage: FileStorage, MaxAge: 5 * time.Second, Retention: WorkQueuePolicy}
+	fs, err := newFileStore(
+		FileStoreConfig{StoreDir: t.TempDir()},
+		cfg)
+
+	require_NoError(t, err)
+	defer fs.Stop()
+
+	subj, msg := "foo", []byte("Hello World")
+	toStore := 500
+	for i := 0; i < toStore; i++ {
+		_, _, err := fs.StoreMsg(subj, nil, msg, 0)
+		require_NoError(t, err)
+	}
+	time.Sleep(5 * time.Second)
+	fs, err = newFileStoreWithCreated(fs.fcfg, cfg, time.Now(), prf(&fs.fcfg), nil) // Reopen after the MaxAge wait so all messages expire and the stream holds none; this simulates a consumer having consumed every message.
+	require_NoError(t, err)
+	require_NoError(t, fs.Stop())      // Stop the store to ensure a state file is created.
+	require_True(t, len(fs.blks) == 1) // Since all messages have expired, only one blk file should exist.
+	os.Remove(fs.blks[0].mfn)          // A consumer that consumes all messages could be used instead, but removing the blk file simulates the same behavior.
+
+	// At this point the stream has only the index.db file and no blk files, since all were deleted. Previously recovery reset the stream state to 0;
+	// now index.db is used to populate the stream state when it cannot be recovered from blk files.
+	fs, err = newFileStoreWithCreated(fs.fcfg, cfg, time.Now(), prf(&fs.fcfg), nil)
+	require_NoError(t, err)
+	require_True(t, fs.state.FirstSeq|fs.state.LastSeq != 0)
+
+}

Original file line number	Diff line number	Diff line change
`@@ -475,18 +475,20 @@ func newFileStoreWithCreated(fcfg FileStoreConfig, cfg StreamConfig, created tim`
`475`	`475`	`}`
`476`	`476`
`477`	`477`	`// Check if our prior state remembers a last sequence past where we can see.`
`478`		`- if fs.ld != nil && prior.LastSeq > fs.state.LastSeq {`
	`478`	`+ if prior.LastSeq > fs.state.LastSeq {`
`479`	`479`	`fs.state.LastSeq, fs.state.LastTime = prior.LastSeq, prior.LastTime`
`480`	`480`	`if fs.state.Msgs == 0 {`
`481`	`481`	`fs.state.FirstSeq = fs.state.LastSeq + 1`
`482`	`482`	`fs.state.FirstTime = time.Time{}`
`483`	`483`	`}`
`484`		`- if _, err := fs.newMsgBlockForWrite(); err == nil {`
`485`		`- if err = fs.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano()); err != nil {`
	`484`	`+ if fs.ld != nil {`
	`485`	`+ if _, err := fs.newMsgBlockForWrite(); err == nil {`
	`486`	`+ if err = fs.writeTombstone(prior.LastSeq, prior.LastTime.UnixNano()); err != nil {`
	`487`	`+ return nil, err`
	`488`	`+ }`
	`489`	`+ } else {`
`486`	`490`	`return nil, err`
`487`	`491`	`}`
`488`		`- } else {`
`489`		`- return nil, err`
`490`	`492`	`}`
`491`	`493`	`}`
`492`	`494`	`// Since we recovered here, make sure to kick ourselves to write out our stream state.`