Skip to content

Commit ee70fdb

Browse files
committed
repair: fix forest and fec chainer publish edge cases
1 parent 9d5e755 commit ee70fdb

File tree

7 files changed

+356
-33
lines changed

7 files changed

+356
-33
lines changed

src/discof/forest/fd_forest.c

Lines changed: 87 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -188,18 +188,6 @@ fd_forest_verify( fd_forest_t const * forest ) {
188188
return 0;
189189
}
190190

191-
/* query queries for a connected ele keyed by slot. does not return
192-
orphaned ele. */
193-
194-
static fd_forest_ele_t *
195-
ancestry_frontier_query( fd_forest_t * forest, ulong slot ) {
196-
fd_forest_ele_t * pool = fd_forest_pool( forest );
197-
fd_forest_ele_t * ele = NULL;
198-
ele = fd_forest_ancestry_ele_query( fd_forest_ancestry( forest ), &slot, NULL, pool );
199-
ele = fd_ptr_if( !ele, fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &slot, NULL, pool ), ele );
200-
return ele;
201-
}
202-
203191
/* remove removes and returns a connected ele from ancestry or frontier
204192
maps. does not remove orphaned ele. does not unlink ele. */
205193

@@ -382,9 +370,6 @@ insert( fd_forest_t * forest, ulong slot, ushort parent_off ) {
382370

383371
fd_forest_ele_t *
384372
fd_forest_query( fd_forest_t * forest, ulong slot ) {
385-
# if FD_FOREST_USE_HANDHOLDING
386-
FD_TEST( slot > fd_forest_root_slot( forest ) ); /* caller error - inval */
387-
# endif
388373
return query( forest, slot );
389374
}
390375

@@ -433,22 +418,39 @@ fd_forest_publish( fd_forest_t * forest, ulong new_root_slot ) {
433418

434419
fd_forest_ancestry_t * ancestry = fd_forest_ancestry( forest );
435420
fd_forest_frontier_t * frontier = fd_forest_frontier( forest );
421+
fd_forest_orphaned_t * orphaned = fd_forest_orphaned( forest );
436422
fd_forest_ele_t * pool = fd_forest_pool( forest );
437423
ulong null = fd_forest_pool_idx_null( pool );
438424

439425
fd_forest_ele_t * old_root_ele = fd_forest_pool_ele( pool, forest->root );
440-
fd_forest_ele_t * new_root_ele = ancestry_frontier_query( forest, new_root_slot );
426+
fd_forest_ele_t * new_root_ele = query( forest, new_root_slot );
441427

442-
# if FD_FOREST_USE_HANDHOLDING
443-
FD_TEST( new_root_ele ); /* caller error - not found */
444-
FD_TEST( new_root_ele->slot > old_root_ele->slot ); /* caller error - inval */
445-
# endif
428+
#if FD_FOREST_USE_HANDHOLDING
429+
if( FD_LIKELY( new_root_ele ) ) {
430+
FD_TEST( new_root_ele->slot > old_root_ele->slot ); /* caller error - inval */
431+
}
432+
#endif
433+
434+
/* Edge case: if we haven't been getting repairs and we have a
435+
gap between the root and orphans, we can publish forward to a slot that
436+
we don't have. The only case where this should be happening is when we
437+
load a second incremental and that incremental slot lives in the
438+
gap. In that case this isn't a bug, but we should be treating this
439+
new root like the snapshot slot / init root. Should be happening
440+
very rarely given a well-functioning repair. */
441+
442+
if( FD_UNLIKELY( !new_root_ele ) ) {
443+
new_root_ele = acquire( forest, new_root_slot );
444+
new_root_ele->complete_idx = 0;
445+
new_root_ele->buffered_idx = 0;
446+
fd_forest_frontier_ele_insert( frontier, new_root_ele, pool );
447+
}
446448

447449
/* First, remove the previous root, and add it to a FIFO prune queue.
448450
head points to the queue head (initialized with old_root_ele). */
449451

450452
fd_forest_ele_t * head = ancestry_frontier_remove( forest, old_root_ele->slot );
451-
head->next = null;
453+
head->prev = null;
452454
fd_forest_ele_t * tail = head;
453455

454456
/* Second, BFS down the tree, inserting each ele into the prune queue
@@ -461,22 +463,80 @@ fd_forest_publish( fd_forest_t * forest, ulong new_root_slot ) {
461463
if( FD_LIKELY( child != new_root_ele ) ) { /* do not prune new root or descendants */
462464
ulong idx = fd_forest_ancestry_idx_remove( ancestry, &child->slot, null, pool );
463465
idx = fd_ulong_if( idx != null, idx, fd_forest_frontier_idx_remove( frontier, &child->slot, null, pool ) );
464-
tail->next = idx; /* insert prune queue */
466+
tail->prev = idx; /* insert prune queue */
465467
# if FD_FOREST_USE_HANDHOLDING
466-
FD_TEST( tail->next != null ); /* programming error in BFS */
468+
FD_TEST( tail->prev != null ); /* programming error in BFS */
467469
# endif
468-
tail = fd_forest_pool_ele( pool, tail->next ); /* advance prune queue */
469-
tail->next = null;
470+
tail = fd_forest_pool_ele( pool, tail->prev ); /* advance prune queue */
471+
tail->prev = null;
470472
}
471473
child = fd_forest_pool_ele( pool, child->sibling );
472474
}
473-
fd_forest_ele_t * next = fd_forest_pool_ele( pool, head->next ); /* FIFO pop */
475+
fd_forest_ele_t * next = fd_forest_pool_ele( pool, head->prev ); /* FIFO pop */
474476
fd_forest_pool_ele_release( pool, head ); /* free head */
475477
head = next;
476478
}
477479

480+
/* If there is nothing on the frontier, we have hit an edge case
481+
during catching up where all of our frontiers were < the new root.
482+
In that case we need to continue repairing from the new root, so
483+
add it to the frontier. */
484+
485+
if( FD_UNLIKELY( fd_forest_frontier_iter_done( fd_forest_frontier_iter_init( frontier, pool ), frontier, pool ) ) ) {
486+
fd_forest_ele_t * remove = fd_forest_ancestry_ele_remove( ancestry, &new_root_ele->slot, NULL, pool );
487+
if( FD_UNLIKELY( !remove ) ) {
488+
/* Very rare case where during second incremental load we could publish to an orphaned slot */
489+
remove = fd_forest_orphaned_ele_remove( orphaned, &new_root_ele->slot, NULL, pool );
490+
}
491+
FD_TEST( remove == new_root_ele );
492+
fd_forest_frontier_ele_insert( frontier, new_root_ele, pool );
493+
new_root_ele->complete_idx = 0;
494+
new_root_ele->buffered_idx = 0;
495+
advance_frontier( forest, new_root_ele->slot, 0 );
496+
}
497+
478498
new_root_ele->parent = null; /* unlink new root from parent */
479-
forest->root = fd_forest_ancestry_idx_query( ancestry, &new_root_slot, null, pool );
499+
forest->root = fd_forest_pool_idx( pool, new_root_ele );
500+
501+
/* Lastly, clean up orphans if there are orphan heads < new_root_slot.
502+
First, add any relevant orphans to the prune queue. */
503+
504+
head = NULL;
505+
for( fd_forest_orphaned_iter_t iter = fd_forest_orphaned_iter_init( orphaned, pool );
506+
!fd_forest_orphaned_iter_done( iter, orphaned, pool );
507+
iter = fd_forest_orphaned_iter_next( iter, orphaned, pool ) ) {
508+
fd_forest_ele_t * ele = fd_forest_orphaned_iter_ele( iter, orphaned, pool );
509+
if( FD_UNLIKELY( ele->slot < new_root_slot ) ) {
510+
if( FD_UNLIKELY( !head ) ) {
511+
head = ele;
512+
head->prev = null;
513+
tail = ele;
514+
} else {
515+
tail->prev = iter.ele_idx;
516+
tail = fd_forest_pool_ele( pool, tail->prev );
517+
tail->prev = null;
518+
}
519+
}
520+
}
521+
522+
/* Now BFS and clean up children of these orphan heads */
523+
while( head ) {
524+
fd_forest_ele_t * child = fd_forest_pool_ele( pool, head->child );
525+
while( FD_LIKELY( child ) ) {
526+
if( FD_LIKELY( child != new_root_ele ) ) {
527+
tail->prev = fd_forest_pool_idx( pool, child ); /* insert prune queue */
528+
tail = fd_forest_pool_ele( pool, tail->prev ); /* advance prune queue */
529+
tail->prev = null;
530+
}
531+
child = fd_forest_pool_ele( pool, child->sibling );
532+
}
533+
ulong remove = fd_forest_orphaned_idx_remove( orphaned, &head->slot, null, pool ); /* remove myself */
534+
remove = fd_ulong_if( remove == null, fd_forest_ancestry_idx_remove( ancestry, &head->slot, null, pool ), remove );
535+
536+
fd_forest_ele_t * next = fd_forest_pool_ele( pool, head->prev ); /* FIFO pop */
537+
fd_forest_pool_ele_release( pool, head ); /* free head */
538+
head = next;
539+
}
480540
return new_root_ele;
481541
}
482542

src/discof/forest/fd_forest.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
struct __attribute__((aligned(128UL))) fd_forest_ele {
4747
ulong slot; /* map key */
48-
ulong prev; /* internal use by link_orphans */
48+
ulong prev; /* internal use for BFSing */
4949
ulong next; /* internal use by fd_pool, fd_map_chain */
5050
ulong parent; /* pool idx of the parent in the tree */
5151
ulong child; /* pool idx of the left-child */

src/discof/forest/test_forest.c

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,115 @@ test_publish( fd_wksp_t * wksp ) {
7979
}
8080
}
8181

82+
void
83+
test_publish_incremental( fd_wksp_t * wksp ){
84+
/* As the name suggests, this tests the complications introduced by loading
85+
two incremental snapshots */
86+
87+
ulong ele_max = 8UL;
88+
void * mem = fd_wksp_alloc_laddr( wksp, fd_forest_align(), fd_forest_footprint( ele_max ), 1UL );
89+
FD_TEST( mem );
90+
fd_forest_t * forest = fd_forest_join( fd_forest_new( mem, ele_max, 42UL /* seed */ ) );
91+
92+
/* 1. Try publishing to a slot that doesn't exist
93+
94+
0 10 -> 11
95+
96+
*/
97+
98+
fd_forest_init( forest, 0 );
99+
fd_forest_data_shred_insert( forest, 11, 1, 0, 0, 1, 1 );
100+
101+
ulong new_root = 1;
102+
fd_forest_publish( forest, new_root );
103+
FD_TEST( fd_forest_root_slot( forest ) == new_root );
104+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &new_root, NULL, fd_forest_pool( forest ) ) );
105+
FD_TEST( !fd_forest_query( forest, 0 ) );
106+
107+
/* 2. Try publishing to a slot on the frontier
108+
109+
1 -> 2 -> 3 10 -> 11
110+
111+
*/
112+
113+
fd_forest_data_shred_insert( forest, 2, 1, 0, 0, 1, 1 );
114+
fd_forest_data_shred_insert( forest, 3, 1, 0, 0, 1, 1 );
115+
116+
ulong frontier = 3;
117+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
118+
fd_forest_publish( forest, frontier );
119+
FD_TEST( fd_forest_root_slot( forest ) == frontier );
120+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
121+
FD_TEST( !fd_forest_query( forest, 1 ) );
122+
FD_TEST( !fd_forest_query( forest, 2 ) );
123+
FD_TEST( fd_forest_query( forest, 10 ) );
124+
FD_TEST( fd_forest_query( forest, 11 ) );
125+
126+
/* 3. Try publishing to a slot in ancestry but in front of the frontier
127+
128+
frontier new_root
129+
3 -> 4 -> 5 -> 6 -> 7 10 -> 11
130+
131+
*/
132+
133+
fd_forest_data_shred_insert( forest, 4, 1, 0, 0, 0, 0 );
134+
fd_forest_data_shred_insert( forest, 5, 1, 0, 0, 0, 0 );
135+
fd_forest_data_shred_insert( forest, 6, 1, 0, 0, 0, 0 );
136+
fd_forest_data_shred_insert( forest, 7, 1, 0, 0, 0, 0 );
137+
138+
frontier = 4;
139+
new_root = 6;
140+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
141+
fd_forest_publish( forest, new_root );
142+
FD_TEST( fd_forest_root_slot( forest ) == new_root );
143+
frontier = 7;
144+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
145+
FD_TEST( !fd_forest_query( forest, 3 ) );
146+
FD_TEST( !fd_forest_query( forest, 4 ) );
147+
FD_TEST( !fd_forest_query( forest, 5 ) );
148+
149+
/* 4. Try publishing to an orphan slot
150+
151+
6 -> 7 10 -> 11
152+
8 -> 9 (should get pruned)
153+
*/
154+
155+
fd_forest_data_shred_insert( forest, 9, 1, 0, 0, 0, 0 );
156+
157+
new_root = 10;
158+
frontier = 11;
159+
160+
fd_forest_publish( forest, new_root);
161+
FD_TEST( !fd_forest_verify( forest ) );
162+
FD_TEST( fd_forest_root_slot( forest ) == new_root );
163+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
164+
FD_TEST( !fd_forest_query( forest, 6 ) );
165+
FD_TEST( !fd_forest_query( forest, 7 ) );
166+
FD_TEST( !fd_forest_query( forest, 8 ) );
167+
FD_TEST( !fd_forest_query( forest, 9 ) );
168+
FD_TEST( fd_forest_query( forest, 10 ) );
169+
FD_TEST( fd_forest_query( forest, 11 ) );
170+
171+
/* 5. Try publishing to an orphan slot that is not a "head" of orphans
172+
(publish)
173+
10 -> 11 14 -> 15 -> 16
174+
175+
*/
176+
177+
fd_forest_data_shred_insert( forest, 14, 1, 0, 0, 0, 0 );
178+
fd_forest_data_shred_insert( forest, 15, 1, 0, 0, 0, 0 );
179+
fd_forest_data_shred_insert( forest, 16, 1, 0, 0, 0, 0 );
180+
181+
new_root = 15;
182+
frontier = 16;
183+
fd_forest_publish( forest, new_root );
184+
FD_TEST( !fd_forest_verify( forest ) );
185+
FD_TEST( fd_forest_root_slot( forest ) == new_root );
186+
FD_TEST( fd_forest_frontier_ele_query( fd_forest_frontier( forest ), &frontier, NULL, fd_forest_pool( forest ) ) );
187+
FD_TEST( !fd_forest_query( forest, 10 ) );
188+
FD_TEST( !fd_forest_query( forest, 11 ) );
189+
FD_TEST( !fd_forest_query( forest, 14 ) );
190+
}
82191
#define SORT_NAME sort
83192
#define SORT_KEY_T ulong
84193
#include "../../util/tmpl/fd_sort.c"
@@ -520,6 +629,7 @@ main( int argc, char ** argv ) {
520629
FD_TEST( wksp );
521630

522631
test_publish( wksp );
632+
test_publish_incremental( wksp );
523633
test_out_of_order( wksp );
524634
test_forks( wksp );
525635
// test_print_tree( wksp );

0 commit comments

Comments
 (0)