Skip to content

snapshot: non-temporal memcpy #4869

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/flamenco/snapshot/fd_snapshot.c
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ void
fd_snapshot_load_accounts( fd_snapshot_load_ctx_t * ctx ) {

/* Now, that the manifest is done being read in. Read in the rest of the accounts. */
fd_snapshot_enable_nt_copy( ctx->restore );
for(;;) {
int err = fd_snapshot_loader_advance( ctx->loader );
if( err==-1 ) break; /* We have finished loading in the snapshot. */
Expand Down
73 changes: 72 additions & 1 deletion src/flamenco/snapshot/fd_snapshot_restore.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
#include <string.h> /* strncmp */
#include <sys/random.h> /* getrandom */

#if FD_HAS_AVX512 || FD_HAS_AVX
#include <immintrin.h>
#endif

/* Snapshot Restore Buffer Handling ***********************************/

static void
Expand Down Expand Up @@ -582,6 +586,72 @@ fd_snapshot_read_account_hdr_chunk( fd_snapshot_restore_t * restore,
return end;
}

void
fd_snapshot_enable_nt_copy( fd_snapshot_restore_t * restore ) {
restore->use_nt_copy = 1;
}

static void
fd_snapshot_nt_memcpy( uchar * restrict dst,
uchar const * restrict src,
ulong sz ) {
#if FD_HAS_AVX512 || FD_HAS_AVX
if( sz<1024UL ) goto tail;

/* Head copy */
ulong dst_align = (ulong)fd_ulong_align_up( (ulong)dst, 64UL );
ulong head_sz = dst_align - (ulong)dst;
fd_memcpy( dst, src, head_sz );
src += head_sz;
dst += head_sz;
sz -= head_sz;

/* NT copy */
while( sz>=512 ) {
# if FD_HAS_AVX512
_mm512_stream_si512( (void *)( dst+ 0UL ), _mm512_loadu_si512( (void const *)( src+ 0UL ) ) );
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a bit faster to do the stores a few instructions after the loads. The secret is interleaving. This keeps the memory pipeline full.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't the CPU's op reorder buffer already do this optimization for us? I'm almost certain the CPU should be able to schedule additional loads whenever it's waiting for a load to complete.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How deep is the speculative execution buffer?

_mm512_stream_si512( (void *)( dst+ 64UL ), _mm512_loadu_si512( (void const *)( src+ 64UL ) ) );
_mm512_stream_si512( (void *)( dst+128UL ), _mm512_loadu_si512( (void const *)( src+128UL ) ) );
_mm512_stream_si512( (void *)( dst+192UL ), _mm512_loadu_si512( (void const *)( src+192UL ) ) );
_mm512_stream_si512( (void *)( dst+256UL ), _mm512_loadu_si512( (void const *)( src+256UL ) ) );
_mm512_stream_si512( (void *)( dst+320UL ), _mm512_loadu_si512( (void const *)( src+320UL ) ) );
_mm512_stream_si512( (void *)( dst+384UL ), _mm512_loadu_si512( (void const *)( src+384UL ) ) );
_mm512_stream_si512( (void *)( dst+448UL ), _mm512_loadu_si512( (void const *)( src+448UL ) ) );
# elif FD_HAS_AVX
_mm256_stream_si256( (void *)( dst+ 0UL ), _mm256_loadu_si256( (void const *)( src+ 0UL ) ) );
_mm256_stream_si256( (void *)( dst+ 32UL ), _mm256_loadu_si256( (void const *)( src+ 32UL ) ) );
_mm256_stream_si256( (void *)( dst+ 64UL ), _mm256_loadu_si256( (void const *)( src+ 64UL ) ) );
_mm256_stream_si256( (void *)( dst+ 96UL ), _mm256_loadu_si256( (void const *)( src+ 96UL ) ) );
_mm256_stream_si256( (void *)( dst+128UL ), _mm256_loadu_si256( (void const *)( src+128UL ) ) );
_mm256_stream_si256( (void *)( dst+160UL ), _mm256_loadu_si256( (void const *)( src+160UL ) ) );
_mm256_stream_si256( (void *)( dst+192UL ), _mm256_loadu_si256( (void const *)( src+192UL ) ) );
_mm256_stream_si256( (void *)( dst+224UL ), _mm256_loadu_si256( (void const *)( src+224UL ) ) );
_mm256_stream_si256( (void *)( dst+256UL ), _mm256_loadu_si256( (void const *)( src+256UL ) ) );
_mm256_stream_si256( (void *)( dst+288UL ), _mm256_loadu_si256( (void const *)( src+288UL ) ) );
_mm256_stream_si256( (void *)( dst+320UL ), _mm256_loadu_si256( (void const *)( src+320UL ) ) );
_mm256_stream_si256( (void *)( dst+352UL ), _mm256_loadu_si256( (void const *)( src+352UL ) ) );
_mm256_stream_si256( (void *)( dst+384UL ), _mm256_loadu_si256( (void const *)( src+384UL ) ) );
_mm256_stream_si256( (void *)( dst+416UL ), _mm256_loadu_si256( (void const *)( src+416UL ) ) );
_mm256_stream_si256( (void *)( dst+448UL ), _mm256_loadu_si256( (void const *)( src+448UL ) ) );
_mm256_stream_si256( (void *)( dst+480UL ), _mm256_loadu_si256( (void const *)( src+480UL ) ) );
# else
# error "Unsupported AVX/AVX512 configuration"
# endif
src += 512;
dst += 512;
sz -= 512;
}

_mm_sfence();
#endif

/* Tail copy */
tail:
if( FD_LIKELY( sz ) ) {
fd_memcpy( dst, src, sz );
}
}

/* fd_snapshot_read_account_chunk reads partial account content. */

static uchar const *
Expand All @@ -591,7 +661,8 @@ fd_snapshot_read_account_chunk( fd_snapshot_restore_t * restore,

ulong data_sz = fd_ulong_min( restore->acc_sz, bufsz );
if( FD_LIKELY( restore->acc_data ) ) {
fd_memcpy( restore->acc_data, buf, data_sz );
if( restore->use_nt_copy ) fd_snapshot_nt_memcpy( restore->acc_data, buf, data_sz );
else fd_memcpy ( restore->acc_data, buf, data_sz );
restore->acc_data += data_sz;
}
if( FD_UNLIKELY( data_sz > restore->accv_sz ) )
Expand Down
9 changes: 9 additions & 0 deletions src/flamenco/snapshot/fd_snapshot_restore.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,15 @@ fd_snapshot_restore_chunk( void * restore,
ulong
fd_snapshot_restore_get_slot( fd_snapshot_restore_t * restore );

/* fd_snapshot_enable_nt_copy enables non-temporal stores for account
data if the target has AVX2 or AVX512. This should only be enabled
if multiple gigabytes worth of account data are going to be restored
without reads to that same data until after the restore operation
completes. */

void
fd_snapshot_enable_nt_copy( fd_snapshot_restore_t * restore );

extern fd_tar_read_vtable_t const fd_snapshot_restore_tar_vt;

FD_PROTOTYPES_END
Expand Down
3 changes: 2 additions & 1 deletion src/flamenco/snapshot/fd_snapshot_restore_private.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,9 @@ struct fd_snapshot_restore {

uchar state;
uchar manifest_done;
uchar use_nt_copy : 1;
uchar status_cache_done : 1;
uchar failed : 1;
uchar failed : 1;

/* Buffer params. This buffer is used to gather file content into
a contiguous byte array. Currently in use for the manifest and the
Expand Down
Loading