Skip to content

Commit 9d5e755

Browse files
committed
sha256: add optimized fast-path for PoH-style hashing
1 parent e0c09eb commit 9d5e755

File tree

6 files changed

+172
-50
lines changed

6 files changed

+172
-50
lines changed

src/ballet/poh/fd_poh.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
void *
44
fd_poh_append( void * poh,
55
ulong n ) {
6-
while( n-- ) {
7-
fd_sha256_hash_32( poh, poh );
8-
}
6+
fd_sha256_hash_32_repeated( poh, poh, n );
97
return poh;
108
}
119

src/ballet/sha256/fd_sha256.c

Lines changed: 123 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
#include "fd_sha256.h"
22

3+
#if FD_HAS_SHANI
4+
/* For the optimized repeated hash */
5+
#include "../../util/simd/fd_sse.h"
6+
#endif
7+
38
ulong
49
fd_sha256_align( void ) {
510
return FD_SHA256_ALIGN;
@@ -411,56 +416,136 @@ fd_sha256_hash( void const * _data,
411416
return memcpy( _hash, state, 32 );
412417
}
413418

419+
420+
414421
void *
415-
fd_sha256_hash_32( void const * _data,
416-
void * _hash ) {
422+
fd_sha256_hash_32_repeated( void const * _data,
423+
void * _hash,
424+
ulong cnt ) {
417425
uchar const * data = (uchar const *)_data;
426+
uchar * hash = (uchar *)_hash;
427+
#if FD_HAS_SHANI
428+
vu_t w0003 = vu_bswap( vu_ldu( data ) );
429+
vu_t w0407 = vu_bswap( vu_ldu( data+16UL ) );
430+
vb_t const w080b = vb( 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
431+
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 );
432+
vb_t const w0c0f = vb( 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
433+
0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00 ); /* 32 bytes */
434+
static const uint fd_sha256_core_shaext_Kmask[]= { 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298,66051,67438087,134810123,202182159 };
435+
vu_t const initialFEBA = vu( 0x9b05688cU, 0x510e527fU, 0xbb67ae85U, 0x6a09e667U );
436+
vu_t const initialHGDC = vu( 0x5be0cd19U, 0x1f83d9abU, 0xa54ff53aU, 0x3c6ef372U );
437+
438+
for( ulong iter=0UL; iter<cnt; iter++ ) {
439+
vu_t stateFEBA = initialFEBA;
440+
vu_t stateHGDC = initialHGDC;
441+
442+
/* _mm_sha256rnds2_epu32 does two rounds, one from the first uint in
443+
wk and one from the second. Since wk stores four rounds worth of
444+
message schedule values, it makes sense for the macro to do four
445+
rounds at a time. We need to permute wk in between so that the
446+
second call to the intrinsic will use the other values. */
447+
#define FOUR_ROUNDS( wk ) do { \
448+
vu_t __wk = (wk); \
449+
vu_t temp_state = stateFEBA; \
450+
stateFEBA = _mm_sha256rnds2_epu32( stateHGDC, stateFEBA, __wk ); \
451+
stateHGDC = temp_state; \
452+
\
453+
temp_state = stateFEBA; \
454+
stateFEBA = _mm_sha256rnds2_epu32( stateHGDC, stateFEBA, vu_permute( __wk, 2,3,0,1 ) );\
455+
stateHGDC = temp_state; \
456+
} while( 0 )
457+
458+
/* w[i] for i>=16 is w[i-16] + s0(w[i-15]) + w[i-7] + s1(w[i-2])
459+
Since our vector size is 4 uints, it's only s1 that is a little
460+
problematic, because it references items in the same vector.
461+
Thankfully, the msg2 intrinsic takes care of the complexity, but we
462+
need to execute it last.
463+
464+
For w[16..19], we get w[i-16] and s0(w[i-15]) using the msg1
465+
intrinsic on w0003 and w0407. w[i-7] comes from w080b and w0c0f
466+
adjusted with alignr, and s1(w[i-2]) comes from the sum of the
467+
previous values and w0c0f. */
468+
469+
#define NEXT_W( w_minus_16, w_minus_12, w_minus_8, w_minus_4 ) (__extension__({ \
470+
vu_t __w_i_16_s0_i_15 = _mm_sha256msg1_epu32( w_minus_16, w_minus_12 ); \
471+
vu_t __w_i_7 = _mm_alignr_epi8( w_minus_4, w_minus_8, 4 ); \
472+
_mm_sha256msg2_epu32( vu_add( __w_i_7, __w_i_16_s0_i_15 ), w_minus_4 ); \
473+
}))
474+
475+
476+
/* */ FOUR_ROUNDS( vu_add( w0003, vu_ld( fd_sha256_core_shaext_Kmask+ 0UL ) ) );
477+
/* */ FOUR_ROUNDS( vu_add( w0407, vu_ld( fd_sha256_core_shaext_Kmask+ 4UL ) ) );
478+
/* */ FOUR_ROUNDS( vu_add( w080b, vu_ld( fd_sha256_core_shaext_Kmask+ 8UL ) ) );
479+
/* */ FOUR_ROUNDS( vu_add( w0c0f, vu_ld( fd_sha256_core_shaext_Kmask+12UL ) ) );
480+
vu_t w1013 = NEXT_W( w0003, w0407, w080b, w0c0f ); FOUR_ROUNDS( vu_add( w1013, vu_ld( fd_sha256_core_shaext_Kmask+16UL ) ) );
481+
vu_t w1417 = NEXT_W( w0407, w080b, w0c0f, w1013 ); FOUR_ROUNDS( vu_add( w1417, vu_ld( fd_sha256_core_shaext_Kmask+20UL ) ) );
482+
vu_t w181b = NEXT_W( w080b, w0c0f, w1013, w1417 ); FOUR_ROUNDS( vu_add( w181b, vu_ld( fd_sha256_core_shaext_Kmask+24UL ) ) );
483+
vu_t w1c1f = NEXT_W( w0c0f, w1013, w1417, w181b ); FOUR_ROUNDS( vu_add( w1c1f, vu_ld( fd_sha256_core_shaext_Kmask+28UL ) ) );
484+
vu_t w2023 = NEXT_W( w1013, w1417, w181b, w1c1f ); FOUR_ROUNDS( vu_add( w2023, vu_ld( fd_sha256_core_shaext_Kmask+32UL ) ) );
485+
vu_t w2427 = NEXT_W( w1417, w181b, w1c1f, w2023 ); FOUR_ROUNDS( vu_add( w2427, vu_ld( fd_sha256_core_shaext_Kmask+36UL ) ) );
486+
vu_t w282b = NEXT_W( w181b, w1c1f, w2023, w2427 ); FOUR_ROUNDS( vu_add( w282b, vu_ld( fd_sha256_core_shaext_Kmask+40UL ) ) );
487+
vu_t w2c2f = NEXT_W( w1c1f, w2023, w2427, w282b ); FOUR_ROUNDS( vu_add( w2c2f, vu_ld( fd_sha256_core_shaext_Kmask+44UL ) ) );
488+
vu_t w3033 = NEXT_W( w2023, w2427, w282b, w2c2f ); FOUR_ROUNDS( vu_add( w3033, vu_ld( fd_sha256_core_shaext_Kmask+48UL ) ) );
489+
vu_t w3437 = NEXT_W( w2427, w282b, w2c2f, w3033 ); FOUR_ROUNDS( vu_add( w3437, vu_ld( fd_sha256_core_shaext_Kmask+52UL ) ) );
490+
vu_t w383b = NEXT_W( w282b, w2c2f, w3033, w3437 ); FOUR_ROUNDS( vu_add( w383b, vu_ld( fd_sha256_core_shaext_Kmask+56UL ) ) );
491+
vu_t w3c3f = NEXT_W( w2c2f, w3033, w3437, w383b ); FOUR_ROUNDS( vu_add( w3c3f, vu_ld( fd_sha256_core_shaext_Kmask+60UL ) ) );
492+
493+
stateFEBA = vu_add( stateFEBA, initialFEBA );
494+
stateHGDC = vu_add( stateHGDC, initialHGDC );
495+
496+
vu_t stateABCD = vu_permute2( stateFEBA, stateHGDC, 3, 2, 3, 2 );
497+
vu_t stateEFGH = vu_permute2( stateFEBA, stateHGDC, 1, 0, 1, 0 );
498+
499+
w0003 = stateABCD;
500+
w0407 = stateEFGH;
501+
}
502+
vu_stu( hash, vu_bswap( w0003 ) );
503+
vu_stu( hash+16UL, vu_bswap( w0407 ) );
418504

419-
/* This is just the above streamlined to eliminate all the overheads
420-
to support incremental hashing. */
505+
#else
421506

422507
uchar buf[ FD_SHA256_PRIVATE_BUF_MAX ] __attribute__((aligned(128)));
423-
uint state[8] __attribute__((aligned(32)));
424508

425-
state[0] = 0x6a09e667U;
426-
state[1] = 0xbb67ae85U;
427-
state[2] = 0x3c6ef372U;
428-
state[3] = 0xa54ff53aU;
429-
state[4] = 0x510e527fU;
430-
state[5] = 0x9b05688cU;
431-
state[6] = 0x1f83d9abU;
432-
state[7] = 0x5be0cd19U;
433-
434-
ulong sz = 32;
435-
436-
ulong block_cnt = sz >> FD_SHA256_PRIVATE_LG_BUF_MAX;
437-
if( FD_LIKELY( block_cnt ) ) fd_sha256_core( state, data, block_cnt );
438-
439-
ulong buf_used = sz & (FD_SHA256_PRIVATE_BUF_MAX-1UL);
440-
if( FD_UNLIKELY( buf_used ) ) memcpy( buf, data + (block_cnt << FD_SHA256_PRIVATE_LG_BUF_MAX), buf_used );
509+
/* Prepare padding once */
510+
ulong buf_used = 32UL;
511+
memcpy( buf, data, 32UL );
441512
buf[ buf_used ] = (uchar)0x80;
442513
buf_used++;
443514

444-
if( FD_UNLIKELY( buf_used > (FD_SHA256_PRIVATE_BUF_MAX-8UL) ) ) {
445-
memset( buf + buf_used, 0, FD_SHA256_PRIVATE_BUF_MAX-buf_used );
446-
fd_sha256_core( state, buf, 1UL );
447-
buf_used = 0UL;
448-
}
449-
450-
ulong bit_cnt = sz << 3;
515+
ulong bit_cnt = 32UL << 3;
451516
memset( buf + buf_used, 0, FD_SHA256_PRIVATE_BUF_MAX-8UL-buf_used );
452517
FD_STORE( ulong, buf+FD_SHA256_PRIVATE_BUF_MAX-8UL, fd_ulong_bswap( bit_cnt ) );
453-
fd_sha256_core( state, buf, 1UL );
454518

455-
state[0] = fd_uint_bswap( state[0] );
456-
state[1] = fd_uint_bswap( state[1] );
457-
state[2] = fd_uint_bswap( state[2] );
458-
state[3] = fd_uint_bswap( state[3] );
459-
state[4] = fd_uint_bswap( state[4] );
460-
state[5] = fd_uint_bswap( state[5] );
461-
state[6] = fd_uint_bswap( state[6] );
462-
state[7] = fd_uint_bswap( state[7] );
463-
return memcpy( _hash, state, 32 );
519+
/* This is just the above streamlined to eliminate all the overheads
520+
to support incremental hashing. */
521+
for( ulong iter=0UL; iter<cnt; iter++ ) {
522+
523+
uint state[8] __attribute__((aligned(32)));
524+
525+
state[0] = 0x6a09e667U;
526+
state[1] = 0xbb67ae85U;
527+
state[2] = 0x3c6ef372U;
528+
state[3] = 0xa54ff53aU;
529+
state[4] = 0x510e527fU;
530+
state[5] = 0x9b05688cU;
531+
state[6] = 0x1f83d9abU;
532+
state[7] = 0x5be0cd19U;
533+
534+
fd_sha256_core( state, buf, 1UL );
535+
536+
state[0] = fd_uint_bswap( state[0] );
537+
state[1] = fd_uint_bswap( state[1] );
538+
state[2] = fd_uint_bswap( state[2] );
539+
state[3] = fd_uint_bswap( state[3] );
540+
state[4] = fd_uint_bswap( state[4] );
541+
state[5] = fd_uint_bswap( state[5] );
542+
state[6] = fd_uint_bswap( state[6] );
543+
state[7] = fd_uint_bswap( state[7] );
544+
memcpy( buf, state, 32UL );
545+
}
546+
memcpy( hash, buf, 32UL );
547+
#endif
548+
return _hash;
464549
}
465550

466551
#undef fd_sha256_core

src/ballet/sha256/fd_sha256.h

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,24 @@ fd_sha256_hash( void const * data,
148148
ulong sz,
149149
void * hash );
150150

151+
/* fd_sha256_hash_32_repeated hashes the 32 bytes pointed to by data,
152+
then hashes the hash, and repeats, doing a total of cnt hashes. It
153+
is a streamlined version of:
154+
155+
uchar temp[32];
156+
memcpy( temp, data, 32UL );
157+
for( ulong i=0UL; i<cnt; i++ ) fd_sha256_hash( temp, 32UL, temp );
158+
memcpy( hash, temp, 32UL );
159+
return hash;
160+
161+
This eliminates function call overhead and data marshalling. cnt==0
162+
is okay, in which case this just copies data to hash. Always returns
163+
hash. data and hash must be valid, non-NULL pointers, even when
164+
cnt==0. */
151165
void *
152-
fd_sha256_hash_32( void const * data,
153-
void * hash );
166+
fd_sha256_hash_32_repeated( void const * data,
167+
void * hash,
168+
ulong cnt );
154169

155170
FD_PROTOTYPES_END
156171

src/ballet/sha256/test_sha256.c

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,26 @@ main( int argc,
136136
# undef DATA_MAX
137137
# undef BATCH_MAX
138138

139+
uchar in_hash[32];
140+
/* test fd_sha256_hash_32_repeated */
141+
for( ulong k=0UL; k<10000000UL; k = (k<<1)|1UL ) {
142+
for( ulong b=0UL; b<32UL; b++ ) in_hash[b] = fd_rng_uchar( rng );
143+
memcpy( hash, in_hash, 32UL );
144+
for( ulong iter=0UL; iter<k; iter++ ) fd_sha256_hash( hash, 32UL, hash );
145+
fd_sha256_hash_32_repeated( in_hash, in_hash, k );
146+
for( ulong b=0UL; b<32UL; b++ ) FD_TEST( in_hash[b]==hash[b] );
147+
}
148+
139149
/* do a benchmark on PoH-style hashing */
140150
FD_LOG_NOTICE(( "Benchmarking poh" ));
141-
for( ulong b=0UL; b<32UL; b++ ) hash[b] = fd_rng_uchar( rng );
151+
for( ulong b=0UL; b<32UL; b++ ) in_hash[b] = fd_rng_uchar( rng );
152+
142153
{
154+
memcpy( hash, in_hash, 32UL );
143155
/* warmup */
144156
for( ulong rem=10UL; rem; rem-- ) fd_sha256_hash( hash, 32UL, hash );
145157

158+
memcpy( hash, in_hash, 32UL );
146159
/* for real */
147160
ulong iter = 1000000UL;
148161
long dt = -fd_log_wallclock();
@@ -151,6 +164,17 @@ main( int argc,
151164
float hashes_per_sec = ((float)iter * 1e-6f ) / ((float)dt * 1e-9f) ;
152165
FD_LOG_NOTICE(( "~%6.3f M poh hashes / sec / core", (double)hashes_per_sec ));
153166
}
167+
{
168+
fd_sha256_hash_32_repeated( in_hash, hash, 10UL );
169+
ulong iter = 1000000UL;
170+
/* Now do it again with hash_32_repeated */
171+
long dt = -fd_log_wallclock();
172+
fd_sha256_hash_32_repeated( in_hash, hash, iter );
173+
dt += fd_log_wallclock();
174+
float hashes_per_sec = ((float)iter * 1e-6f ) / ((float)dt * 1e-9f) ;
175+
FD_LOG_NOTICE(( "~%6.3f M poh hashes / sec / core with fd_sha256_hash_32_repeated", (double)hashes_per_sec ));
176+
}
177+
154178
/* do a quick benchmark of sha-256 on small and large UDP payload
155179
packets from UDP/IP4/VLAN/Ethernet */
156180

src/discof/poh/fd_poh_tile.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,9 +1688,9 @@ after_credit( fd_poh_ctx_t * ctx,
16881688

16891689
*charge_busy = 1;
16901690

1691-
while( ctx->hashcnt<target_hashcnt ) {
1692-
fd_sha256_hash( ctx->hash, 32UL, ctx->hash );
1693-
ctx->hashcnt++;
1691+
if( FD_LIKELY( ctx->hashcnt<target_hashcnt ) ) {
1692+
fd_sha256_hash_32_repeated( ctx->hash, ctx->hash, target_hashcnt-ctx->hashcnt );
1693+
ctx->hashcnt = target_hashcnt;
16941694
}
16951695

16961696
if( FD_UNLIKELY( ctx->hashcnt==ctx->hashcnt_per_slot ) ) {

src/discoh/poh/fd_poh_tile.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1689,9 +1689,9 @@ after_credit( fd_poh_ctx_t * ctx,
16891689

16901690
*charge_busy = 1;
16911691

1692-
while( ctx->hashcnt<target_hashcnt ) {
1693-
fd_sha256_hash( ctx->hash, 32UL, ctx->hash );
1694-
ctx->hashcnt++;
1692+
if( FD_LIKELY( ctx->hashcnt<target_hashcnt ) ) {
1693+
fd_sha256_hash_32_repeated( ctx->hash, ctx->hash, target_hashcnt-ctx->hashcnt );
1694+
ctx->hashcnt = target_hashcnt;
16951695
}
16961696

16971697
if( FD_UNLIKELY( ctx->hashcnt==ctx->hashcnt_per_slot ) ) {

0 commit comments

Comments
 (0)