Skip to content

Commit ee73f49

Browse files
WIP: make Funk workspace use unpinned pages
1 parent 789d03a commit ee73f49

File tree

10 files changed

+185
-13
lines changed

10 files changed

+185
-13
lines changed

src/app/firedancer/topology.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ setup_topo_funk( fd_topo_t * topo,
116116
ulong part_max = fd_wksp_part_max_est( funk_footprint, 1U<<14U );
117117
if( FD_UNLIKELY( !part_max ) ) FD_LOG_ERR(( "fd_wksp_part_max_est(%lu,16KiB) failed", funk_footprint ));
118118
wksp->part_max += part_max;
119+
wksp->is_locked = 0;
119120

120121
return obj;
121122
}

src/app/shared/commands/configure/hugetlbfs.c

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,16 @@ init( config_t const * config ) {
173173
if( FD_UNLIKELY( chmod( mount_path[ i ], S_IRUSR | S_IWUSR | S_IXUSR ) ) )
174174
FD_LOG_ERR(( "chmod of hugetlbfs at `%s` failed (%i-%s)", mount_path[ i ], errno, fd_io_strerror( errno ) ));
175175
}
176+
177+
/* Create the directory for the normal pages */
178+
char unpinned_pages_mnt_path[ PATH_MAX ];
179+
FD_TEST( fd_cstr_printf_check(
180+
unpinned_pages_mnt_path,
181+
sizeof(unpinned_pages_mnt_path), NULL, "%s/.normal", config->hugetlbfs.mount_path ) );
182+
FD_LOG_NOTICE(( "RUN: `mkdir -p %s`", unpinned_pages_mnt_path ));
183+
if( FD_UNLIKELY( -1==fd_file_util_mkdir_all( unpinned_pages_mnt_path, config->uid, config->gid ) ) ) {
184+
FD_LOG_ERR(( "could not create mount directory `%s` (%i-%s)", unpinned_pages_mnt_path, errno, fd_io_strerror( errno ) ));
185+
}
176186
}
177187

178188
static void
@@ -225,6 +235,32 @@ warn_mount_users( char const * mount_path ) {
225235
if( FD_UNLIKELY( -1==closedir( dir ) ) ) FD_LOG_ERR(( "closedir (%i-%s)", errno, fd_io_strerror( errno ) ));
226236
}
227237

238+
static int
239+
empty_dir_top_level( const char *dir_path ) {
240+
DIR *dir = opendir( dir_path );
241+
if( FD_UNLIKELY( !dir ) ) return (errno == ENOENT) ? 0 : -1;
242+
243+
struct dirent *entry;
244+
while( ( entry = readdir(dir) ) ) {
245+
/* Skip . and .. */
246+
if( FD_UNLIKELY( !strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..") ) ) continue;
247+
248+
char path[PATH_MAX];
249+
FD_TEST( fd_cstr_printf_check( path, PATH_MAX, NULL, "%s/%s", dir_path, entry->d_name ) );
250+
251+
/* Only remove regular files, not subdirectories */
252+
struct stat st;
253+
if( stat(path, &st) == 0 && S_ISREG(st.st_mode)) {
254+
if( FD_UNLIKELY( unlink(path) && errno != ENOENT ) ) {
255+
FD_LOG_WARNING(( "failed to remove file `%s` (%i-%s)", path, errno, fd_io_strerror(errno) ));
256+
}
257+
}
258+
}
259+
260+
closedir( dir );
261+
return 0;
262+
}
263+
228264
static void
229265
fini( config_t const * config,
230266
int pre_init ) {
@@ -271,6 +307,13 @@ fini( config_t const * config,
271307
if( FD_LIKELY( fclose( fp ) ) )
272308
FD_LOG_ERR(( "error closing `/proc/self/mounts` (%i-%s)", errno, fd_io_strerror( errno ) ));
273309

310+
/* For normal pages, we need to empty the directory first */
311+
if( i == 2 ) {
312+
FD_LOG_NOTICE(( "RUN: `rmdir %s`", mount_path[ i ] ));
313+
if( FD_UNLIKELY( empty_dir_top_level(mount_path[i]) && errno != ENOENT ) )
314+
FD_LOG_ERR(( "error removing hugetlbfs mount at `%s` (%i-%s)", mount_path[i], errno, fd_io_strerror(errno) ));
315+
}
316+
274317
FD_LOG_NOTICE(( "RUN: `rmdir %s`", mount_path[ i ] ));
275318
if( FD_UNLIKELY( rmdir( mount_path[ i ] ) && errno!=ENOENT ) )
276319
FD_LOG_ERR(( "error removing hugetlbfs mount at `%s` (%i-%s)", mount_path[ i ], errno, fd_io_strerror( errno ) ));
@@ -307,13 +350,18 @@ check( config_t const * config ) {
307350
int result2 = stat( mount_path[ 1 ], &st );
308351
if( FD_UNLIKELY( result2 && errno!=ENOENT ) )
309352
PARTIALLY_CONFIGURED( "failed to stat `%s` (%i-%s)", mount_path[ 1 ], errno, fd_io_strerror( errno ) );
353+
int result3 = stat( config->hugetlbfs.normal_page_mount_path, &st );
354+
if( FD_UNLIKELY( result3 && errno!=ENOENT ) )
355+
PARTIALLY_CONFIGURED( "failed to stat `%s` (%i-%s)", config->hugetlbfs.normal_page_mount_path, errno, fd_io_strerror( errno ) );
310356

311-
if( FD_UNLIKELY( result1 && result2 ) )
312-
NOT_CONFIGURED( "mounts `%s` and `%s` do not exist", mount_path[ 0 ], mount_path[ 1 ] );
357+
if( FD_UNLIKELY( result1 && result2 && result3 ) )
358+
NOT_CONFIGURED( "mounts `%s`, `%s` and `%s` do not exist", mount_path[ 0 ], mount_path[ 1 ], config->hugetlbfs.normal_page_mount_path );
313359
else if( FD_UNLIKELY( result1 ) )
314360
PARTIALLY_CONFIGURED( "mount `%s` does not exist", mount_path[ 0 ] );
315361
else if( FD_UNLIKELY( result2 ) )
316362
PARTIALLY_CONFIGURED( "mount `%s` does not exist", mount_path[ 1 ] );
363+
else if( FD_UNLIKELY( result3 ) )
364+
PARTIALLY_CONFIGURED( "mount `%s` does not exist", config->hugetlbfs.normal_page_mount_path );
317365

318366
CHECK( check_dir( config->hugetlbfs.mount_path, config->uid, config->gid, S_IFDIR | S_IRUSR | S_IWUSR | S_IXUSR ) );
319367
for( ulong i=0UL; i<2UL; i++ ) {

src/app/shared/commands/run/run.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,9 @@ workspace_path( config_t const * config,
472472
case FD_SHMEM_GIGANTIC_PAGE_SZ:
473473
mount_path = config->hugetlbfs.gigantic_page_mount_path;
474474
break;
475+
case FD_SHMEM_NORMAL_PAGE_SZ:
476+
mount_path = config->hugetlbfs.normal_page_mount_path;
477+
break;
475478
default:
476479
FD_LOG_ERR(( "invalid page size %lu", wksp->page_sz ));
477480
}
@@ -511,6 +514,7 @@ warn_unknown_files( config_t const * config,
511514
int known_file = 0;
512515
for( ulong i=0UL; i<config->topo.wksp_cnt; i++ ) {
513516
fd_topo_wksp_t const * wksp = &config->topo.workspaces[ i ];
517+
if( !wksp->is_locked ) continue;
514518

515519
char expected_path[ PATH_MAX ];
516520
workspace_path( config, wksp, expected_path );

src/app/shared/fd_config.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,11 @@ fd_config_fill( fd_config_t * config,
292292
NULL,
293293
"%s/.huge",
294294
config->hugetlbfs.mount_path ) );
295+
FD_TEST( fd_cstr_printf_check( config->hugetlbfs.normal_page_mount_path,
296+
sizeof(config->hugetlbfs.normal_page_mount_path),
297+
NULL,
298+
"%s/.normal",
299+
config->hugetlbfs.mount_path ) );
295300

296301
ulong max_page_sz = fd_cstr_to_shmem_page_sz( config->hugetlbfs.max_page_size );
297302
if( FD_UNLIKELY( max_page_sz!=FD_SHMEM_HUGE_PAGE_SZ && max_page_sz!=FD_SHMEM_GIGANTIC_PAGE_SZ ) ) FD_LOG_ERR(( "[hugetlbfs.max_page_size] must be \"huge\" or \"gigantic\"" ));

src/app/shared/fd_config.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ struct fd_config {
242242
struct {
243243
char gigantic_page_mount_path[ PATH_MAX ];
244244
char huge_page_mount_path[ PATH_MAX ];
245+
char normal_page_mount_path[ PATH_MAX ];
245246
char mount_path[ PATH_MAX ];
246247
char max_page_size[ 16 ];
247248
ulong gigantic_page_threshold_mib;

src/disco/topo/fd_topo.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,9 @@ fd_topo_create_workspace( fd_topo_t * topo,
8282
ulong sub_cpu_idx [ 1 ] = { fd_shmem_cpu_idx( wksp->numa_idx ) };
8383

8484
int err;
85-
if( FD_UNLIKELY( update_existing ) ) {
85+
if( FD_UNLIKELY( !wksp->is_locked ) ) {
86+
err = fd_shmem_create_multi_unlocked( name, wksp->page_sz, wksp->page_cnt, S_IRUSR | S_IWUSR ); /* logs details */
87+
} else if( FD_UNLIKELY( update_existing ) ) {
8688
err = fd_shmem_update_multi( name, wksp->page_sz, 1, sub_page_cnt, sub_cpu_idx, S_IRUSR | S_IWUSR ); /* logs details */
8789
} else {
8890
err = fd_shmem_create_multi( name, wksp->page_sz, 1, sub_page_cnt, sub_cpu_idx, S_IRUSR | S_IWUSR ); /* logs details */
@@ -223,6 +225,8 @@ fd_topo_mlock_max_tile1( fd_topo_t const * topo,
223225
ulong tile_mem = 0UL;
224226

225227
for( ulong i=0UL; i<topo->wksp_cnt; i++ ) {
228+
if( FD_UNLIKELY( !topo->workspaces[ i ].is_locked ) ) continue;
229+
226230
if( FD_UNLIKELY( -1!=tile_needs_wksp( topo, tile, i ) ) )
227231
tile_mem += topo->workspaces[ i ].page_cnt * topo->workspaces[ i ].page_sz;
228232
}
@@ -250,6 +254,7 @@ fd_topo_gigantic_page_cnt( fd_topo_t const * topo,
250254
for( ulong i=0UL; i<topo->wksp_cnt; i++ ) {
251255
fd_topo_wksp_t const * wksp = &topo->workspaces[ i ];
252256
if( FD_LIKELY( wksp->numa_idx!=numa_idx ) ) continue;
257+
if( FD_UNLIKELY( !wksp->is_locked ) ) continue;
253258

254259
if( FD_LIKELY( wksp->page_sz==FD_SHMEM_GIGANTIC_PAGE_SZ ) ) {
255260
result += wksp->page_cnt;
@@ -266,6 +271,7 @@ fd_topo_huge_page_cnt( fd_topo_t const * topo,
266271
for( ulong i=0UL; i<topo->wksp_cnt; i++ ) {
267272
fd_topo_wksp_t const * wksp = &topo->workspaces[ i ];
268273
if( FD_LIKELY( wksp->numa_idx!=numa_idx ) ) continue;
274+
if( FD_UNLIKELY( !wksp->is_locked ) ) continue;
269275

270276
if( FD_LIKELY( wksp->page_sz==FD_SHMEM_HUGE_PAGE_SZ ) ) {
271277
result += wksp->page_cnt;
@@ -296,6 +302,7 @@ FD_FN_PURE ulong
296302
fd_topo_mlock( fd_topo_t const * topo ) {
297303
ulong result = 0UL;
298304
for( ulong i=0UL; i<topo->wksp_cnt; i++ ) {
305+
if( FD_UNLIKELY( !topo->workspaces[ i ].is_locked ) ) continue;
299306
result += topo->workspaces[ i ].page_cnt * topo->workspaces[ i ].page_sz;
300307
}
301308
return result;
@@ -385,7 +392,7 @@ fd_topo_print_log( int stdout,
385392

386393
char size[ 24 ];
387394
fd_topo_mem_sz_string( wksp->page_sz * wksp->page_cnt, size );
388-
PRINT( " %2lu (%7s): %12s page_cnt=%3lu page_sz=%-8s numa_idx=%-2lu footprint=%10lu loose=%lu\n", i, size, wksp->name, wksp->page_cnt, fd_shmem_page_sz_to_cstr( wksp->page_sz ), wksp->numa_idx, wksp->known_footprint, wksp->total_footprint - wksp->known_footprint );
395+
PRINT( " %2lu (%7s): %12s page_cnt=%3lu page_sz=%-8s numa_idx=%-2lu footprint=%10lu loose=%10lu is_locked=%d\n", i, size, wksp->name, wksp->page_cnt, fd_shmem_page_sz_to_cstr( wksp->page_sz ), wksp->numa_idx, wksp->known_footprint, wksp->total_footprint - wksp->known_footprint, wksp->is_locked );
389396
}
390397

391398
PRINT( "\nOBJECTS\n" );

src/disco/topo/fd_topo.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,17 @@
3030
sits on top of 1 or more memory mapped gigantic or huge pages mounted
3131
to the hugetlbfs. */
3232
typedef struct {
33-
ulong id; /* The ID of this workspace. Indexed from [0, wksp_cnt). When placed in a topology, the ID must be the index of the workspace in the workspaces list. */
34-
char name[ 13UL ]; /* The name of this workspace, like "pack". There can be at most one of each workspace name in a topology. */
33+
ulong id; /* The ID of this workspace. Indexed from [0, wksp_cnt). When placed in a topology, the ID must be the index of the workspace in the workspaces list. */
34+
char name[ 13UL ]; /* The name of this workspace, like "pack". There can be at most one of each workspace name in a topology. */
3535

36-
ulong numa_idx; /* The index of the NUMA node on the system that this workspace should be allocated from. */
36+
ulong numa_idx; /* The index of the NUMA node on the system that this workspace should be allocated from. */
3737

3838
/* Computed fields. These are not supplied as configuration but calculated as needed. */
3939
struct {
40-
ulong page_sz; /* The size of the pages that this workspace is backed by. One of FD_PAGE_SIZE_*. */
41-
ulong page_cnt; /* The number of pages that must be mapped to this workspace to store all the data needed by consumers. */
42-
ulong part_max; /* The maximum number of partitions in the underlying workspace. There can only be this many allocations made at any one time. */
40+
ulong page_sz; /* The size of the pages that this workspace is backed by. One of FD_PAGE_SIZE_*. */
41+
ulong page_cnt; /* The number of pages that must be mapped to this workspace to store all the data needed by consumers. */
42+
int is_locked; /* If the workspace should use pages locked and pinned to a specific numa node. */
43+
ulong part_max; /* The maximum number of partitions in the underlying workspace. There can only be this many allocations made at any one time. */
4344

4445
fd_wksp_t * wksp; /* The workspace memory in the local process. */
4546
ulong known_footprint; /* Total size in bytes of all data in Firedancer that will be stored in this workspace at startup. */

src/disco/topo/fd_topob.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ fd_topob_wksp( fd_topo_t * topo,
4141
fd_topo_wksp_t * wksp = &topo->workspaces[ topo->wksp_cnt ];
4242
strncpy( wksp->name, name, sizeof(wksp->name) );
4343
wksp->id = topo->wksp_cnt;
44+
wksp->is_locked = 1;
4445
topo->wksp_cnt++;
4546
}
4647

@@ -644,6 +645,11 @@ fd_topob_finish( fd_topo_t * topo,
644645
if( total_wksp_footprint < topo->gigantic_page_threshold ) page_sz = FD_SHMEM_HUGE_PAGE_SZ;
645646
if( FD_UNLIKELY( page_sz!=FD_SHMEM_HUGE_PAGE_SZ && page_sz!=FD_SHMEM_GIGANTIC_PAGE_SZ ) ) FD_LOG_ERR(( "invalid page_sz" ));
646647

648+
/* If the workspace is not locked, we can't use huge pages. */
649+
if( FD_UNLIKELY( !wksp->is_locked ) ) {
650+
page_sz = FD_SHMEM_NORMAL_PAGE_SZ;
651+
}
652+
647653
ulong wksp_aligned_footprint = fd_ulong_align_up( total_wksp_footprint, page_sz );
648654

649655
/* Give any leftover space in the underlying shared memory to the

src/util/shmem/fd_shmem.h

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -346,13 +346,36 @@ fd_shmem_create_multi( char const * name, /* Should point to cstr with
346346
ulong const * sub_cpu_idx, /* Indexed [0,sub_cnt), each should be in [0,fd_shmem_cpu_cnt()) */
347347
ulong mode ); /* E.g. 0660 for user rw, group rw, world none */
348348

349+
/* fd_shmem_create_multi_unlocked creates a shared memory region whose
350+
name is given by the cstr pointed to by name backed by page_sz pages.
351+
It functions the same as fd_shmem_create_multi, but the pages are not
352+
locked, not pinned to any particular numa node, and have the default numa
353+
mempolicy.
354+
355+
mode specifies the permissions for this region (the usual POSIX open
356+
umask caveats apply).
357+
358+
Returns 0 on success and an strerror friendly error code on failure
359+
(also logs extensive details on error). Reasons for failure include
360+
name is invalid (EINVAL), page_sz is invalid (EINVAL), page_cnt is
361+
zero (EINVAL), page_cnt*page_sz overflows an off_t (EINVAL), open fails
362+
(errno of the open, e.g. region with the same name and page_sz in the
363+
thread domain already exists), ftruncate fails (errno of ftruncate,
364+
e.g. the requested size exceeds the backing filesystem's limits), etc.
365+
*/
366+
int
367+
fd_shmem_create_multi_unlocked( char const * name,
368+
ulong page_sz,
369+
ulong page_cnt,
370+
ulong mode );
371+
349372
/* fd_shmem_update_multi updates a shared memory region created by
350373
fd_shmem_create_multi in place, to be as-if it was created with
351374
the provided parameters instead.
352-
375+
353376
This can be preferable to deleting and recreating the shmem region
354377
because it prevents needing to zero all of the underlying memory.
355-
378+
356379
WARNING: The memory returned will not be zeroed and the user will
357380
be able to read any contents that were in the previous workspace. */
358381

@@ -449,7 +472,7 @@ fd_shmem_acquire( ulong page_sz,
449472
by fd_shmem_acquire. This always succeeds from the caller's POV but
450473
logs details if there is any wonkiness under the hood. It is fine to
451474
release subregions of individual previous acquisitions.
452-
475+
453476
Returns 0 if successful, -1 for any errors. */
454477

455478
int

src/util/shmem/fd_shmem_admin.c

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,82 @@ fd_shmem_update_multi( char const * name,
407407
return fd_shmem_create_multi_flags( name, page_sz, sub_cnt, _sub_page_cnt, _sub_cpu_idx, mode, O_RDWR );
408408
}
409409

410+
int
411+
fd_shmem_create_multi_unlocked( char const * name,
412+
ulong page_sz,
413+
ulong page_cnt,
414+
ulong mode ) {
415+
416+
/* Check input args */
417+
418+
if( FD_UNLIKELY( !fd_shmem_name_len( name ) ) ) { FD_LOG_WARNING(( "bad name (%s)", name ? name : "NULL" )); return EINVAL; }
419+
420+
if( FD_UNLIKELY( !page_cnt ) ) { FD_LOG_WARNING(( "zero page_cnt" )); return EINVAL; }
421+
422+
if( FD_UNLIKELY( !fd_shmem_is_page_sz( page_sz ) ) ) { FD_LOG_WARNING(( "bad page_sz (%lu)", page_sz )); return EINVAL; }
423+
424+
425+
426+
# define ERROR( cleanup ) do { err = errno; goto cleanup; } while(0)
427+
428+
int err = 0;
429+
430+
char path[ FD_SHMEM_PRIVATE_PATH_BUF_MAX ];
431+
void * shmem;
432+
433+
ulong sz = page_cnt*page_sz;
434+
435+
/* Acquire the pages at a random address */
436+
437+
/* Create the region */
438+
int open_flags = O_RDWR | O_CREAT | O_TRUNC;
439+
int fd = open( fd_shmem_private_path( name, page_sz, path ), open_flags, (mode_t)mode );
440+
if( FD_UNLIKELY( fd==-1 ) ) {
441+
FD_LOG_WARNING(( "open(\"%s\",%#x,0%03lo) failed (%i-%s)", path, (uint)open_flags, mode, errno, fd_io_strerror( errno ) ));
442+
ERROR( done );
443+
}
444+
445+
/* Size the region */
446+
447+
if( FD_UNLIKELY( ftruncate( fd, (off_t)sz ) ) ) {
448+
FD_LOG_WARNING(( "ftruncate(\"%s\",%lu KiB) failed (%i-%s)", path, sz>>10, errno, fd_io_strerror( errno ) ));
449+
ERROR( close );
450+
}
451+
452+
/* Map the region into our address space. */
453+
454+
shmem = mmap( NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, (off_t)0);
455+
if( FD_UNLIKELY( shmem==MAP_FAILED ) ) {
456+
FD_LOG_WARNING(( "mmap(NULL,%lu KiB,PROT_READ|PROT_WRITE,MAP_SHARED,\"%s\",0) failed (%i-%s)",
457+
sz>>10, path, errno, fd_io_strerror( errno ) ));
458+
ERROR( close );
459+
}
460+
461+
/* Validate the mapping */
462+
463+
if( FD_UNLIKELY( !fd_ulong_is_aligned( (ulong)shmem, page_sz ) ) ) {
464+
FD_LOG_WARNING(( "misaligned memory mapping for unpinned shmem region \"%s\"", name ));
465+
errno = EFAULT; /* ENOMEM is arguable */
466+
ERROR( unmap );
467+
}
468+
469+
# undef ERROR
470+
471+
unmap:
472+
if( FD_UNLIKELY( err ) && FD_UNLIKELY( munmap( shmem, sz ) ) )
473+
FD_LOG_ERR(( "munmap(\"%s\",%lu KiB) failed (%i-%s)",
474+
path, sz>>10, errno, fd_io_strerror( errno ) ));
475+
476+
close:
477+
if( FD_UNLIKELY( err ) && FD_UNLIKELY( unlink( path ) ) )
478+
FD_LOG_ERR(( "unlink(\"%s\") failed (%i-%s)", path, errno, fd_io_strerror( errno ) ));
479+
if( FD_UNLIKELY( err ) && FD_UNLIKELY( close( fd ) ) )
480+
FD_LOG_ERR(( "close(\"%s\") failed (%i-%s)", path, errno, fd_io_strerror( errno ) ));
481+
482+
done:
483+
return err;
484+
}
485+
410486
int
411487
fd_shmem_unlink( char const * name,
412488
ulong page_sz ) {

0 commit comments

Comments
 (0)