Skip to content

Commit 1dcb5d6

Browse files
committed
repair: fix forest and fec chainer publish edge cases
1 parent 090c57a commit 1dcb5d6

File tree

8 files changed

+356
-35
lines changed

8 files changed

+356
-35
lines changed

src/discof/forest/fd_forest.c

Lines changed: 87 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -187,18 +187,6 @@ fd_forest_verify( fd_forest_t const * forest ) {
187187
return 0;
188188
}
189189

190-
/* query queries for a connected ele keyed by slot. does not return
191-
orphaned ele. */
192-
193-
static fd_forest_ele_t *
194-
ancestry_frontier_query( fd_forest_t * forest, ulong slot ) {
195-
fd_forest_ele_t * pool = fd_forest_pool( forest );
196-
fd_forest_ele_t * ele = NULL;
197-
ele = fd_forest_ancestry_ele_query( fd_forest_ancestry( forest ), &slot, NULL, pool );
198-
ele = fd_ptr_if( !ele, fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &slot, NULL, pool ), ele );
199-
return ele;
200-
}
201-
202190
/* remove removes and returns a connected ele from ancestry or frontier
203191
maps. does not remove orphaned ele. does not unlink ele. */
204192

@@ -381,9 +369,6 @@ insert( fd_forest_t * forest, ulong slot, ushort parent_off ) {
381369

382370
fd_forest_ele_t *
383371
fd_forest_query( fd_forest_t * forest, ulong slot ) {
384-
# if FD_FOREST_USE_HANDHOLDING
385-
FD_TEST( slot > fd_forest_root_slot( forest ) ); /* caller error - inval */
386-
# endif
387372
return query( forest, slot );
388373
}
389374

@@ -432,22 +417,39 @@ fd_forest_publish( fd_forest_t * forest, ulong new_root_slot ) {
432417

433418
fd_forest_ancestry_t * ancestry = fd_forest_ancestry( forest );
434419
fd_forest_frontier_t * frontier = fd_forest_frontier( forest );
420+
fd_forest_orphaned_t * orphaned = fd_forest_orphaned( forest );
435421
fd_forest_ele_t * pool = fd_forest_pool( forest );
436422
ulong null = fd_forest_pool_idx_null( pool );
437423

438424
fd_forest_ele_t * old_root_ele = fd_forest_pool_ele( pool, forest->root );
439-
fd_forest_ele_t * new_root_ele = ancestry_frontier_query( forest, new_root_slot );
425+
fd_forest_ele_t * new_root_ele = query( forest, new_root_slot );
440426

441-
# if FD_FOREST_USE_HANDHOLDING
442-
FD_TEST( new_root_ele ); /* caller error - not found */
443-
FD_TEST( new_root_ele->slot > old_root_ele->slot ); /* caller error - inval */
444-
# endif
427+
#if FD_FOREST_USE_HANDHOLDING
428+
if( FD_LIKELY( new_root_ele ) ) {
429+
FD_TEST( new_root_ele->slot > old_root_ele->slot ); /* caller error - inval */
430+
}
431+
#endif
432+
433+
/* Edge case: if we haven't been getting repairs and there is a
434+
gap between the root and orphans, we may publish forward to a slot that
435+
we don't have. The only case where this should happen is when we
436+
load a second incremental and that incremental slot lives in the
437+
gap. In that case this isn't a bug, but we should be treating this
438+
new root like the snapshot slot / init root. Should be happening
439+
very rarely given a well-functioning repair. */
440+
441+
if( FD_UNLIKELY( !new_root_ele ) ) {
442+
new_root_ele = acquire( forest, new_root_slot );
443+
new_root_ele->complete_idx = 0;
444+
new_root_ele->buffered_idx = 0;
445+
fd_forest_frontier_ele_insert( frontier, new_root_ele, pool );
446+
}
445447

446448
/* First, remove the previous root, and add it to a FIFO prune queue.
447449
head points to the queue head (initialized with old_root_ele). */
448450

449451
fd_forest_ele_t * head = ancestry_frontier_remove( forest, old_root_ele->slot );
450-
head->next = null;
452+
head->prev = null;
451453
fd_forest_ele_t * tail = head;
452454

453455
/* Second, BFS down the tree, inserting each ele into the prune queue
@@ -460,22 +462,80 @@ fd_forest_publish( fd_forest_t * forest, ulong new_root_slot ) {
460462
if( FD_LIKELY( child != new_root_ele ) ) { /* do not prune new root or descendants */
461463
ulong idx = fd_forest_ancestry_idx_remove( ancestry, &child->slot, null, pool );
462464
idx = fd_ulong_if( idx != null, idx, fd_forest_frontier_idx_remove( frontier, &child->slot, null, pool ) );
463-
tail->next = idx; /* insert prune queue */
465+
tail->prev = idx; /* insert prune queue */
464466
# if FD_FOREST_USE_HANDHOLDING
465-
FD_TEST( tail->next != null ); /* programming error in BFS */
467+
FD_TEST( tail->prev != null ); /* programming error in BFS */
466468
# endif
467-
tail = fd_forest_pool_ele( pool, tail->next ); /* advance prune queue */
468-
tail->next = null;
469+
tail = fd_forest_pool_ele( pool, tail->prev ); /* advance prune queue */
470+
tail->prev = null;
469471
}
470472
child = fd_forest_pool_ele( pool, child->sibling );
471473
}
472-
fd_forest_ele_t * next = fd_forest_pool_ele( pool, head->next ); /* FIFO pop */
474+
fd_forest_ele_t * next = fd_forest_pool_ele( pool, head->prev ); /* FIFO pop */
473475
fd_forest_pool_ele_release( pool, head ); /* free head */
474476
head = next;
475477
}
476478

479+
/* If there is nothing on the frontier, we have hit an edge case
480+
during catching up where all of our frontiers were < the new root.
481+
In that case we need to continue repairing from the new root, so
482+
add it to the frontier. */
483+
484+
if( FD_UNLIKELY( fd_forest_frontier_iter_done( fd_forest_frontier_iter_init( frontier, pool ), frontier, pool ) ) ) {
485+
fd_forest_ele_t * remove = fd_forest_ancestry_ele_remove( ancestry, &new_root_ele->slot, NULL, pool );
486+
if( FD_UNLIKELY( !remove ) ) {
487+
/* Very rare case where during second incremental load we could publish to an orphaned slot */
488+
remove = fd_forest_orphaned_ele_remove( orphaned, &new_root_ele->slot, NULL, pool );
489+
}
490+
FD_TEST( remove == new_root_ele );
491+
fd_forest_frontier_ele_insert( frontier, new_root_ele, pool );
492+
new_root_ele->complete_idx = 0;
493+
new_root_ele->buffered_idx = 0;
494+
advance_frontier( forest, new_root_ele->slot, 0 );
495+
}
496+
477497
new_root_ele->parent = null; /* unlink new root from parent */
478-
forest->root = fd_forest_ancestry_idx_query( ancestry, &new_root_slot, null, pool );
498+
forest->root = fd_forest_pool_idx( pool, new_root_ele );
499+
500+
/* Lastly, clean up orphans if there are orphan heads < new_root_slot.
501+
First, add any relevant orphans to the prune queue. */
502+
503+
head = NULL;
504+
for( fd_forest_orphaned_iter_t iter = fd_forest_orphaned_iter_init( orphaned, pool );
505+
!fd_forest_orphaned_iter_done( iter, orphaned, pool );
506+
iter = fd_forest_orphaned_iter_next( iter, orphaned, pool ) ) {
507+
fd_forest_ele_t * ele = fd_forest_orphaned_iter_ele( iter, orphaned, pool );
508+
if( FD_UNLIKELY( ele->slot < new_root_slot ) ) {
509+
if( FD_UNLIKELY( !head ) ) {
510+
head = ele;
511+
head->prev = null;
512+
tail = ele;
513+
} else {
514+
tail->prev = iter.ele_idx;
515+
tail = fd_forest_pool_ele( pool, tail->prev );
516+
tail->prev = null;
517+
}
518+
}
519+
}
520+
521+
/* Now BFS and clean up children of these orphan heads */
522+
while( head ) {
523+
fd_forest_ele_t * child = fd_forest_pool_ele( pool, head->child );
524+
while( FD_LIKELY( child ) ) {
525+
if( FD_LIKELY( child != new_root_ele ) ) {
526+
tail->prev = fd_forest_pool_idx( pool, child ); /* insert prune queue */
527+
tail = fd_forest_pool_ele( pool, tail->prev ); /* advance prune queue */
528+
tail->prev = null;
529+
}
530+
child = fd_forest_pool_ele( pool, child->sibling );
531+
}
532+
ulong remove = fd_forest_orphaned_idx_remove( orphaned, &head->slot, null, pool ); /* remove myself */
533+
remove = fd_ulong_if( remove == null, fd_forest_ancestry_idx_remove( ancestry, &head->slot, null, pool ), remove );
534+
535+
fd_forest_ele_t * next = fd_forest_pool_ele( pool, head->prev ); /* FIFO pop */
536+
fd_forest_pool_ele_release( pool, head ); /* free head */
537+
head = next;
538+
}
479539
return new_root_ele;
480540
}
481541

src/discof/forest/fd_forest.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
struct __attribute__((aligned(128UL))) fd_forest_ele {
4747
ulong slot; /* map key */
48-
ulong prev; /* internal use by link_orphans */
48+
ulong prev; /* internal use for BFSing */
4949
ulong next; /* internal use by fd_pool, fd_map_chain */
5050
ulong parent; /* pool idx of the parent in the tree */
5151
ulong child; /* pool idx of the left-child */

src/discof/forest/test_forest.c

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,115 @@ test_publish( fd_wksp_t * wksp ) {
7979
}
8080
}
8181

82+
void
83+
test_publish_incremental( fd_wksp_t * wksp ){
84+
/* As the name suggests, this tests the complications introduced by loading
85+
two incremental snapshots */
86+
87+
ulong ele_max = 8UL;
88+
void * mem = fd_wksp_alloc_laddr( wksp, fd_forest_align(), fd_forest_footprint( ele_max ), 1UL );
89+
FD_TEST( mem );
90+
fd_forest_t * forest = fd_forest_join( fd_forest_new( mem, ele_max, 42UL /* seed */ ) );
91+
92+
/* 1. Try publishing to a slot that doesn't exist
93+
94+
0 10 -> 11
95+
96+
*/
97+
98+
fd_forest_init( forest, 0 );
99+
fd_forest_data_shred_insert( forest, 11, 1, 0, 0, 1, 1 );
100+
101+
ulong new_root = 1;
102+
fd_forest_publish( forest, new_root );
103+
FD_TEST( fd_forest_root_slot( forest ) == new_root );
104+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &new_root, NULL, fd_forest_pool( forest ) ) );
105+
FD_TEST( !fd_forest_query( forest, 0 ) );
106+
107+
/* 2. Try publishing to a slot on the frontier
108+
109+
1 -> 2 -> 3 10 -> 11
110+
111+
*/
112+
113+
fd_forest_data_shred_insert( forest, 2, 1, 0, 0, 1, 1 );
114+
fd_forest_data_shred_insert( forest, 3, 1, 0, 0, 1, 1 );
115+
116+
ulong frontier = 3;
117+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
118+
fd_forest_publish( forest, frontier );
119+
FD_TEST( fd_forest_root_slot( forest ) == frontier );
120+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
121+
FD_TEST( !fd_forest_query( forest, 1 ) );
122+
FD_TEST( !fd_forest_query( forest, 2 ) );
123+
FD_TEST( fd_forest_query( forest, 10 ) );
124+
FD_TEST( fd_forest_query( forest, 11 ) );
125+
126+
/* 3. Try publishing to a slot in ancestry but in front of the frontier
127+
128+
frontier new_root
129+
3 -> 4 -> 5 -> 6 -> 7 10 -> 11
130+
131+
*/
132+
133+
fd_forest_data_shred_insert( forest, 4, 1, 0, 0, 0, 0 );
134+
fd_forest_data_shred_insert( forest, 5, 1, 0, 0, 0, 0 );
135+
fd_forest_data_shred_insert( forest, 6, 1, 0, 0, 0, 0 );
136+
fd_forest_data_shred_insert( forest, 7, 1, 0, 0, 0, 0 );
137+
138+
frontier = 4;
139+
new_root = 6;
140+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
141+
fd_forest_publish( forest, new_root );
142+
FD_TEST( fd_forest_root_slot( forest ) == new_root );
143+
frontier = 7;
144+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
145+
FD_TEST( !fd_forest_query( forest, 3 ) );
146+
FD_TEST( !fd_forest_query( forest, 4 ) );
147+
FD_TEST( !fd_forest_query( forest, 5 ) );
148+
149+
/* 4. Try publishing to an orphan slot
150+
151+
6 -> 7 10 -> 11
152+
8 -> 9 (should get pruned)
153+
*/
154+
155+
fd_forest_data_shred_insert( forest, 9, 1, 0, 0, 0, 0 );
156+
157+
new_root = 10;
158+
frontier = 11;
159+
160+
fd_forest_publish( forest, new_root);
161+
FD_TEST( !fd_forest_verify( forest ) );
162+
FD_TEST( fd_forest_root_slot( forest ) == new_root );
163+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
164+
FD_TEST( !fd_forest_query( forest, 6 ) );
165+
FD_TEST( !fd_forest_query( forest, 7 ) );
166+
FD_TEST( !fd_forest_query( forest, 8 ) );
167+
FD_TEST( !fd_forest_query( forest, 9 ) );
168+
FD_TEST( fd_forest_query( forest, 10 ) );
169+
FD_TEST( fd_forest_query( forest, 11 ) );
170+
171+
/* 5. Try publishing to an orphan slot that is not a "head" of orphans
172+
(publish)
173+
10 -> 11 14 -> 15 -> 16
174+
175+
*/
176+
177+
fd_forest_data_shred_insert( forest, 14, 1, 0, 0, 0, 0 );
178+
fd_forest_data_shred_insert( forest, 15, 1, 0, 0, 0, 0 );
179+
fd_forest_data_shred_insert( forest, 16, 1, 0, 0, 0, 0 );
180+
181+
new_root = 15;
182+
frontier = 16;
183+
fd_forest_publish( forest, new_root );
184+
FD_TEST( !fd_forest_verify( forest ) );
185+
FD_TEST( fd_forest_root_slot( forest ) == new_root );
186+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
187+
FD_TEST( !fd_forest_query( forest, 10 ) );
188+
FD_TEST( !fd_forest_query( forest, 11 ) );
189+
FD_TEST( !fd_forest_query( forest, 14 ) );
190+
}
82191
#define SORT_NAME sort
83192
#define SORT_KEY_T ulong
84193
#include "../../util/tmpl/fd_sort.c"
@@ -520,6 +629,7 @@ main( int argc, char ** argv ) {
520629
FD_TEST( wksp );
521630

522631
test_publish( wksp );
632+
test_publish_incremental( wksp );
523633
test_out_of_order( wksp );
524634
test_forks( wksp );
525635
// test_print_tree( wksp );

0 commit comments

Comments
 (0)