sha256: add optimized fast-path for PoH-style hashing

ptaffet-jump · ptaffet-jump · commit 9d5e75546bc1 · 2025-07-01T19:29:32.000Z
diff --git a/src/ballet/poh/fd_poh.c b/src/ballet/poh/fd_poh.c
@@ -3,9 +3,7 @@
 void *
 fd_poh_append( void * poh,
                ulong  n ) {
-  while( n-- ) {
-    fd_sha256_hash_32( poh, poh );
-  }
+  fd_sha256_hash_32_repeated( poh, poh, n );
   return poh;
 }
 
diff --git a/src/ballet/sha256/fd_sha256.c b/src/ballet/sha256/fd_sha256.c
@@ -1,5 +1,10 @@
 #include "fd_sha256.h"
 
+#if FD_HAS_SHANI
+/* For the optimized repeated hash */
+#include "../../util/simd/fd_sse.h"
+#endif
+
 ulong
 fd_sha256_align( void ) {
   return FD_SHA256_ALIGN;
@@ -411,56 +416,136 @@ fd_sha256_hash( void const * _data,
   return memcpy( _hash, state, 32 );
 }
 
+
+
 void *
-fd_sha256_hash_32( void const * _data,
-                   void *       _hash ) {
+fd_sha256_hash_32_repeated( void const * _data,
+                            void *       _hash,
+                            ulong        cnt ) {
   uchar const * data = (uchar const *)_data;
+  uchar       * hash = (uchar       *)_hash;
+#if FD_HAS_SHANI
+  vu_t       w0003 = vu_bswap( vu_ldu( data      ) );
+  vu_t       w0407 = vu_bswap( vu_ldu( data+16UL ) );
+  vb_t const w080b = vb( 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
+                         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 );
+  vb_t const w0c0f = vb( 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                         0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00 ); /* 32 bytes */
+  static const uint fd_sha256_core_shaext_Kmask[]= { 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298,66051,67438087,134810123,202182159 };
+  vu_t const initialFEBA = vu( 0x9b05688cU, 0x510e527fU, 0xbb67ae85U, 0x6a09e667U );
+  vu_t const initialHGDC = vu( 0x5be0cd19U, 0x1f83d9abU, 0xa54ff53aU, 0x3c6ef372U );
+
+  for( ulong iter=0UL; iter<cnt; iter++ ) {
+    vu_t stateFEBA = initialFEBA;
+    vu_t stateHGDC = initialHGDC;
+
+    /* _mm_sha256rnds2_epu32 does two rounds, one from the first uint in
+       wk and one from the second.  Since wk stores four rounds worth of
+       message schedule values, it makes sense for the macro to do four
+       rounds at a time.  We need to permute wk in between so that the
+       second call to the intrinsic will use the other values. */
+#define FOUR_ROUNDS( wk ) do {                                                               \
+      vu_t __wk = (wk);                                                                      \
+      vu_t temp_state = stateFEBA;                                                           \
+      stateFEBA = _mm_sha256rnds2_epu32( stateHGDC, stateFEBA, __wk );                       \
+      stateHGDC = temp_state;                                                                \
+                                                                                             \
+      temp_state = stateFEBA;                                                                \
+      stateFEBA = _mm_sha256rnds2_epu32( stateHGDC, stateFEBA, vu_permute( __wk, 2,3,0,1 ) );\
+      stateHGDC = temp_state;                                                                \
+    } while( 0 )
+
+    /* w[i] for i>= 16 is w[i-16]+ s0(w[i-15]) + w[i-7] + s1(w[i-2])
+       Since our vector size is 4 uints, it's only s1 that is a little
+       problematic, because it references items in the same vector.
+       Thankfully, the msg2 intrinsic takes care of the complexity, but we
+       need to execute it last.
+
+       For w[16..19], we get w[i-16] and s0(s[i-15]) using the msg1
+       intrinsic on w0003 and w0407. w[i-7] comes from w080b and w0c0f
+       adjusted with alignr, and s1(w[i-2]) comes from the sum of the
+       previous values and w0c0f. */
+
+#define NEXT_W( w_minus_16, w_minus_12, w_minus_8, w_minus_4 ) (__extension__({      \
+    vu_t __w_i_16_s0_i_15 = _mm_sha256msg1_epu32( w_minus_16, w_minus_12 );          \
+    vu_t __w_i_7          = _mm_alignr_epi8( w_minus_4, w_minus_8, 4 );              \
+    _mm_sha256msg2_epu32( vu_add( __w_i_7, __w_i_16_s0_i_15 ), w_minus_4 );          \
+      }))
+
+
+    /*                                              */ FOUR_ROUNDS( vu_add( w0003, vu_ld( fd_sha256_core_shaext_Kmask+ 0UL ) ) );
+    /*                                              */ FOUR_ROUNDS( vu_add( w0407, vu_ld( fd_sha256_core_shaext_Kmask+ 4UL ) ) );
+    /*                                              */ FOUR_ROUNDS( vu_add( w080b, vu_ld( fd_sha256_core_shaext_Kmask+ 8UL ) ) );
+    /*                                              */ FOUR_ROUNDS( vu_add( w0c0f, vu_ld( fd_sha256_core_shaext_Kmask+12UL ) ) );
+    vu_t w1013 = NEXT_W( w0003, w0407, w080b, w0c0f ); FOUR_ROUNDS( vu_add( w1013, vu_ld( fd_sha256_core_shaext_Kmask+16UL ) ) );
+    vu_t w1417 = NEXT_W( w0407, w080b, w0c0f, w1013 ); FOUR_ROUNDS( vu_add( w1417, vu_ld( fd_sha256_core_shaext_Kmask+20UL ) ) );
+    vu_t w181b = NEXT_W( w080b, w0c0f, w1013, w1417 ); FOUR_ROUNDS( vu_add( w181b, vu_ld( fd_sha256_core_shaext_Kmask+24UL ) ) );
+    vu_t w1c1f = NEXT_W( w0c0f, w1013, w1417, w181b ); FOUR_ROUNDS( vu_add( w1c1f, vu_ld( fd_sha256_core_shaext_Kmask+28UL ) ) );
+    vu_t w2023 = NEXT_W( w1013, w1417, w181b, w1c1f ); FOUR_ROUNDS( vu_add( w2023, vu_ld( fd_sha256_core_shaext_Kmask+32UL ) ) );
+    vu_t w2427 = NEXT_W( w1417, w181b, w1c1f, w2023 ); FOUR_ROUNDS( vu_add( w2427, vu_ld( fd_sha256_core_shaext_Kmask+36UL ) ) );
+    vu_t w282b = NEXT_W( w181b, w1c1f, w2023, w2427 ); FOUR_ROUNDS( vu_add( w282b, vu_ld( fd_sha256_core_shaext_Kmask+40UL ) ) );
+    vu_t w2c2f = NEXT_W( w1c1f, w2023, w2427, w282b ); FOUR_ROUNDS( vu_add( w2c2f, vu_ld( fd_sha256_core_shaext_Kmask+44UL ) ) );
+    vu_t w3033 = NEXT_W( w2023, w2427, w282b, w2c2f ); FOUR_ROUNDS( vu_add( w3033, vu_ld( fd_sha256_core_shaext_Kmask+48UL ) ) );
+    vu_t w3437 = NEXT_W( w2427, w282b, w2c2f, w3033 ); FOUR_ROUNDS( vu_add( w3437, vu_ld( fd_sha256_core_shaext_Kmask+52UL ) ) );
+    vu_t w383b = NEXT_W( w282b, w2c2f, w3033, w3437 ); FOUR_ROUNDS( vu_add( w383b, vu_ld( fd_sha256_core_shaext_Kmask+56UL ) ) );
+    vu_t w3c3f = NEXT_W( w2c2f, w3033, w3437, w383b ); FOUR_ROUNDS( vu_add( w3c3f, vu_ld( fd_sha256_core_shaext_Kmask+60UL ) ) );
+
+    stateFEBA = vu_add( stateFEBA, initialFEBA );
+    stateHGDC = vu_add( stateHGDC, initialHGDC );
+
+    vu_t stateABCD = vu_permute2( stateFEBA, stateHGDC, 3, 2, 3, 2 );
+    vu_t stateEFGH = vu_permute2( stateFEBA, stateHGDC, 1, 0, 1, 0 );
+
+    w0003 = stateABCD;
+    w0407 = stateEFGH;
+  }
+  vu_stu( hash,      vu_bswap( w0003 ) );
+  vu_stu( hash+16UL, vu_bswap( w0407 ) );
 
-  /* This is just the above streamlined to eliminate all the overheads
-     to support incremental hashing. */
+#else
 
   uchar buf[ FD_SHA256_PRIVATE_BUF_MAX ] __attribute__((aligned(128)));
-  uint  state[8] __attribute__((aligned(32)));
 
-  state[0] = 0x6a09e667U;
-  state[1] = 0xbb67ae85U;
-  state[2] = 0x3c6ef372U;
-  state[3] = 0xa54ff53aU;
-  state[4] = 0x510e527fU;
-  state[5] = 0x9b05688cU;
-  state[6] = 0x1f83d9abU;
-  state[7] = 0x5be0cd19U;
-
-  ulong sz = 32;
-
-  ulong block_cnt = sz >> FD_SHA256_PRIVATE_LG_BUF_MAX;
-  if( FD_LIKELY( block_cnt ) ) fd_sha256_core( state, data, block_cnt );
-
-  ulong buf_used = sz & (FD_SHA256_PRIVATE_BUF_MAX-1UL);
-  if( FD_UNLIKELY( buf_used ) ) memcpy( buf, data + (block_cnt << FD_SHA256_PRIVATE_LG_BUF_MAX), buf_used );
+  /* Prepare padding once */
+  ulong buf_used = 32UL;
+  memcpy( buf, data, 32UL );
   buf[ buf_used ] = (uchar)0x80;
   buf_used++;
 
-  if( FD_UNLIKELY( buf_used > (FD_SHA256_PRIVATE_BUF_MAX-8UL) ) ) {
-    memset( buf + buf_used, 0, FD_SHA256_PRIVATE_BUF_MAX-buf_used );
-    fd_sha256_core( state, buf, 1UL );
-    buf_used = 0UL;
-  }
-
-  ulong bit_cnt = sz << 3;
+  ulong bit_cnt = 32UL << 3;
   memset( buf + buf_used, 0, FD_SHA256_PRIVATE_BUF_MAX-8UL-buf_used );
   FD_STORE( ulong, buf+FD_SHA256_PRIVATE_BUF_MAX-8UL, fd_ulong_bswap( bit_cnt ) );
-  fd_sha256_core( state, buf, 1UL );
 
-  state[0] = fd_uint_bswap( state[0] );
-  state[1] = fd_uint_bswap( state[1] );
-  state[2] = fd_uint_bswap( state[2] );
-  state[3] = fd_uint_bswap( state[3] );
-  state[4] = fd_uint_bswap( state[4] );
-  state[5] = fd_uint_bswap( state[5] );
-  state[6] = fd_uint_bswap( state[6] );
-  state[7] = fd_uint_bswap( state[7] );
-  return memcpy( _hash, state, 32 );
+  /* This is just the above streamlined to eliminate all the overheads
+     to support incremental hashing. */
+  for( ulong iter=0UL; iter<cnt; iter++ ) {
+
+    uint  state[8] __attribute__((aligned(32)));
+
+    state[0] = 0x6a09e667U;
+    state[1] = 0xbb67ae85U;
+    state[2] = 0x3c6ef372U;
+    state[3] = 0xa54ff53aU;
+    state[4] = 0x510e527fU;
+    state[5] = 0x9b05688cU;
+    state[6] = 0x1f83d9abU;
+    state[7] = 0x5be0cd19U;
+
+    fd_sha256_core( state, buf, 1UL );
+
+    state[0] = fd_uint_bswap( state[0] );
+    state[1] = fd_uint_bswap( state[1] );
+    state[2] = fd_uint_bswap( state[2] );
+    state[3] = fd_uint_bswap( state[3] );
+    state[4] = fd_uint_bswap( state[4] );
+    state[5] = fd_uint_bswap( state[5] );
+    state[6] = fd_uint_bswap( state[6] );
+    state[7] = fd_uint_bswap( state[7] );
+    memcpy( buf, state, 32UL );
+  }
+  memcpy( hash, buf, 32UL );
+#endif
+  return _hash;
 }
 
 #undef fd_sha256_core
diff --git a/src/ballet/sha256/fd_sha256.h b/src/ballet/sha256/fd_sha256.h
@@ -148,9 +148,24 @@ fd_sha256_hash( void const * data,
                 ulong        sz,
                 void *       hash );
 
+/* fd_sha256_hash_32_repeated hashes the 32 bytes pointed to by data,
+   then hashes the hash, and repeats, doing a total of cnt hashes.  It
+   is a streamlined version of:
+
+   uchar temp[32];
+   memcpy( temp, data, 32UL );
+   for( ulong i=0UL; i<cnt; i++ ) fd_sha256_hash( temp, 32UL, temp );
+   memcpy( hash, temp, 32UL );
+   return hash;
+
+   This eliminates function call overhead and data marshalling.  cnt==0
+   is okay, in which case this just copies data to hash.  Always returns
+   hash.  data and hash must be valid, non-NULL pointers, even when
+   cnt==0. */
 void *
-fd_sha256_hash_32( void const * data,
-                   void *       hash );
+fd_sha256_hash_32_repeated( void const * data,
+                            void *       hash,
+                            ulong        cnt );
 
 FD_PROTOTYPES_END
 
diff --git a/src/ballet/sha256/test_sha256.c b/src/ballet/sha256/test_sha256.c
@@ -136,13 +136,26 @@ main( int     argc,
 # undef DATA_MAX
 # undef BATCH_MAX
 
+  uchar in_hash[32];
+  /* test fd_sha256_hash_32_repeated */
+  for( ulong k=0UL; k<10000000UL; k = (k<<1)|1UL ) {
+    for( ulong b=0UL; b<32UL; b++ ) in_hash[b] = fd_rng_uchar( rng );
+    memcpy( hash, in_hash, 32UL );
+    for( ulong iter=0UL; iter<k; iter++ ) fd_sha256_hash( hash, 32UL, hash );
+    fd_sha256_hash_32_repeated( in_hash, in_hash, k );
+    for( ulong b=0UL; b<32UL; b++ ) FD_TEST( in_hash[b]==hash[b] );
+  }
+
   /* do a benchmark on PoH-style hashing */
   FD_LOG_NOTICE(( "Benchmarking poh" ));
-  for( ulong b=0UL; b<32UL; b++ ) hash[b] = fd_rng_uchar( rng );
+  for( ulong b=0UL; b<32UL; b++ ) in_hash[b] = fd_rng_uchar( rng );
+
   {
+    memcpy( hash, in_hash, 32UL );
     /* warmup */
     for( ulong rem=10UL; rem; rem-- ) fd_sha256_hash( hash, 32UL, hash );
 
+    memcpy( hash, in_hash, 32UL );
     /* for real */
     ulong iter = 1000000UL;
     long  dt   = -fd_log_wallclock();
@@ -151,6 +164,17 @@ main( int     argc,
     float hashes_per_sec = ((float)iter * 1e-6f ) / ((float)dt * 1e-9f) ;
     FD_LOG_NOTICE(( "~%6.3f M poh hashes / sec / core", (double)hashes_per_sec ));
   }
+  {
+    fd_sha256_hash_32_repeated( in_hash, hash, 10UL );
+    ulong iter = 1000000UL;
+    /* Now do it again with hash_32_repeated */
+    long  dt   = -fd_log_wallclock();
+    fd_sha256_hash_32_repeated( in_hash, hash, iter );
+    dt += fd_log_wallclock();
+    float hashes_per_sec = ((float)iter * 1e-6f ) / ((float)dt * 1e-9f) ;
+    FD_LOG_NOTICE(( "~%6.3f M poh hashes / sec / core with fd_sha256_hash_32_repeated", (double)hashes_per_sec ));
+  }
+
   /* do a quick benchmark of sha-256 on small and large UDP payload
      packets from UDP/IP4/VLAN/Ethernet */
 
diff --git a/src/discof/poh/fd_poh_tile.c b/src/discof/poh/fd_poh_tile.c
@@ -1688,9 +1688,9 @@ after_credit( fd_poh_ctx_t *      ctx,
 
   *charge_busy = 1;
 
-  while( ctx->hashcnt<target_hashcnt ) {
-    fd_sha256_hash( ctx->hash, 32UL, ctx->hash );
-    ctx->hashcnt++;
+  if( FD_LIKELY( ctx->hashcnt<target_hashcnt ) ) {
+    fd_sha256_hash_32_repeated( ctx->hash, ctx->hash, target_hashcnt-ctx->hashcnt );
+    ctx->hashcnt = target_hashcnt;
   }
 
   if( FD_UNLIKELY( ctx->hashcnt==ctx->hashcnt_per_slot ) ) {
diff --git a/src/discoh/poh/fd_poh_tile.c b/src/discoh/poh/fd_poh_tile.c
@@ -1689,9 +1689,9 @@ after_credit( fd_poh_ctx_t *      ctx,
 
   *charge_busy = 1;
 
-  while( ctx->hashcnt<target_hashcnt ) {
-    fd_sha256_hash( ctx->hash, 32UL, ctx->hash );
-    ctx->hashcnt++;
+  if( FD_LIKELY( ctx->hashcnt<target_hashcnt ) ) {
+    fd_sha256_hash_32_repeated( ctx->hash, ctx->hash, target_hashcnt-ctx->hashcnt );
+    ctx->hashcnt = target_hashcnt;
   }
 
   if( FD_UNLIKELY( ctx->hashcnt==ctx->hashcnt_per_slot ) ) {