|
1 | 1 | #include "fd_sha256.h"
|
2 | 2 |
|
| 3 | +#if FD_HAS_SHANI |
| 4 | +/* For the optimized repeated hash */ |
| 5 | +#include "../../util/simd/fd_sse.h" |
| 6 | +#endif |
| 7 | + |
3 | 8 | ulong
|
4 | 9 | fd_sha256_align( void ) {
|
5 | 10 | return FD_SHA256_ALIGN;
|
@@ -411,56 +416,136 @@ fd_sha256_hash( void const * _data,
|
411 | 416 | return memcpy( _hash, state, 32 );
|
412 | 417 | }
|
413 | 418 |
|
| 419 | + |
| 420 | + |
/* fd_sha256_hash_32_repeated computes cnt chained SHA-256 hashes of a
   fixed 32-byte input:

     out = sha256( sha256( ... sha256( data ) ... ) )   (cnt applications)

   _data points to the 32-byte input, _hash points to the 32-byte output
   buffer and cnt is the number of hash applications.  Returns _hash.
   If cnt is zero, both code paths below copy the input bytes to _hash
   unchanged.

   Because each message is exactly 32 bytes, every iteration compresses
   a single 64-byte SHA-256 block whose padding (the 0x80 terminator and
   the 256-bit big-endian length field) is constant, so the padding can
   be prepared once outside the loop. */

void *
fd_sha256_hash_32_repeated( void const * _data,
                            void *       _hash,
                            ulong        cnt ) {
  uchar const * data = (uchar const *)_data;
  uchar *       hash = (uchar *)_hash;
#if FD_HAS_SHANI
  /* Message schedule words w[0..7] come from the 32-byte input, byte
     swapped into the big-endian word order SHA-256 operates on. */
  vu_t w0003 = vu_bswap( vu_ldu( data ) );
  vu_t w0407 = vu_bswap( vu_ldu( data+16UL ) );
  /* w[8..15] are the constant padding for a 32-byte message: the 0x80
     terminator byte right after the message, zeros, and the 64-bit
     message bit length (256 = 0x100) in the final words.  The byte
     layouts below are pre-arranged to match the bswapped message words
     above, so no swap is needed per iteration. */
  vb_t const w080b = vb( 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
                         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 );
  vb_t const w0c0f = vb( 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                         0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00 ); /* 32 bytes */
  /* SHA-256 round constants K[0..63] (FIPS 180-4) written in decimal.
     The trailing four entries encode the byte pattern 0x00010203 ...
     0x0c0d0e0f; only offsets +0 .. +60 are loaded by this function. */
  static const uint fd_sha256_core_shaext_Kmask[]= { 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298,66051,67438087,134810123,202182159 };
  /* The standard SHA-256 initial hash values H0..H7, arranged in the
     (F,E,B,A) / (H,G,D,C) lane orders consumed by
     _mm_sha256rnds2_epu32. */
  vu_t const initialFEBA = vu( 0x9b05688cU, 0x510e527fU, 0xbb67ae85U, 0x6a09e667U );
  vu_t const initialHGDC = vu( 0x5be0cd19U, 0x1f83d9abU, 0xa54ff53aU, 0x3c6ef372U );

  for( ulong iter=0UL; iter<cnt; iter++ ) {
    /* Each iteration hashes a fresh single-block message, so the
       working state always starts from the constant IV. */
    vu_t stateFEBA = initialFEBA;
    vu_t stateHGDC = initialHGDC;

    /* _mm_sha256rnds2_epu32 does two rounds, one from the first uint in
       wk and one from the second.  Since wk stores four rounds worth of
       message schedule values, it makes sense for the macro to do four
       rounds at a time.  We need to permute wk in between so that the
       second call to the intrinsic will use the other values. */
#define FOUR_ROUNDS( wk ) do {                                                            \
      vu_t __wk = (wk);                                                                   \
      vu_t temp_state = stateFEBA;                                                        \
      stateFEBA = _mm_sha256rnds2_epu32( stateHGDC, stateFEBA, __wk );                    \
      stateHGDC = temp_state;                                                             \
                                                                                          \
      temp_state = stateFEBA;                                                             \
      stateFEBA = _mm_sha256rnds2_epu32( stateHGDC, stateFEBA, vu_permute( __wk, 2,3,0,1 ) );\
      stateHGDC = temp_state;                                                             \
    } while( 0 )

    /* w[i] for i>= 16 is w[i-16] + s0(w[i-15]) + w[i-7] + s1(w[i-2]).
       Since our vector size is 4 uints, it's only s1 that is a little
       problematic, because it references items in the same vector.
       Thankfully, the msg2 intrinsic takes care of the complexity, but
       we need to execute it last.

       For w[16..19], we get w[i-16] and s0(w[i-15]) using the msg1
       intrinsic on w0003 and w0407.  w[i-7] comes from w080b and w0c0f
       adjusted with alignr, and s1(w[i-2]) comes from the sum of the
       previous values and w0c0f. */

#define NEXT_W( w_minus_16, w_minus_12, w_minus_8, w_minus_4 ) (__extension__({          \
      vu_t __w_i_16_s0_i_15 = _mm_sha256msg1_epu32( w_minus_16, w_minus_12 );            \
      vu_t __w_i_7          = _mm_alignr_epi8( w_minus_4, w_minus_8, 4 );                \
      _mm_sha256msg2_epu32( vu_add( __w_i_7, __w_i_16_s0_i_15 ), w_minus_4 );            \
    }))

    /* 64 rounds: 16 groups of four, extending the message schedule on
       the fly from round 16 onward. */
    /*                                               */ FOUR_ROUNDS( vu_add( w0003, vu_ld( fd_sha256_core_shaext_Kmask+ 0UL ) ) );
    /*                                               */ FOUR_ROUNDS( vu_add( w0407, vu_ld( fd_sha256_core_shaext_Kmask+ 4UL ) ) );
    /*                                               */ FOUR_ROUNDS( vu_add( w080b, vu_ld( fd_sha256_core_shaext_Kmask+ 8UL ) ) );
    /*                                               */ FOUR_ROUNDS( vu_add( w0c0f, vu_ld( fd_sha256_core_shaext_Kmask+12UL ) ) );
    vu_t w1013 = NEXT_W( w0003, w0407, w080b, w0c0f ); FOUR_ROUNDS( vu_add( w1013, vu_ld( fd_sha256_core_shaext_Kmask+16UL ) ) );
    vu_t w1417 = NEXT_W( w0407, w080b, w0c0f, w1013 ); FOUR_ROUNDS( vu_add( w1417, vu_ld( fd_sha256_core_shaext_Kmask+20UL ) ) );
    vu_t w181b = NEXT_W( w080b, w0c0f, w1013, w1417 ); FOUR_ROUNDS( vu_add( w181b, vu_ld( fd_sha256_core_shaext_Kmask+24UL ) ) );
    vu_t w1c1f = NEXT_W( w0c0f, w1013, w1417, w181b ); FOUR_ROUNDS( vu_add( w1c1f, vu_ld( fd_sha256_core_shaext_Kmask+28UL ) ) );
    vu_t w2023 = NEXT_W( w1013, w1417, w181b, w1c1f ); FOUR_ROUNDS( vu_add( w2023, vu_ld( fd_sha256_core_shaext_Kmask+32UL ) ) );
    vu_t w2427 = NEXT_W( w1417, w181b, w1c1f, w2023 ); FOUR_ROUNDS( vu_add( w2427, vu_ld( fd_sha256_core_shaext_Kmask+36UL ) ) );
    vu_t w282b = NEXT_W( w181b, w1c1f, w2023, w2427 ); FOUR_ROUNDS( vu_add( w282b, vu_ld( fd_sha256_core_shaext_Kmask+40UL ) ) );
    vu_t w2c2f = NEXT_W( w1c1f, w2023, w2427, w282b ); FOUR_ROUNDS( vu_add( w2c2f, vu_ld( fd_sha256_core_shaext_Kmask+44UL ) ) );
    vu_t w3033 = NEXT_W( w2023, w2427, w282b, w2c2f ); FOUR_ROUNDS( vu_add( w3033, vu_ld( fd_sha256_core_shaext_Kmask+48UL ) ) );
    vu_t w3437 = NEXT_W( w2427, w282b, w2c2f, w3033 ); FOUR_ROUNDS( vu_add( w3437, vu_ld( fd_sha256_core_shaext_Kmask+52UL ) ) );
    vu_t w383b = NEXT_W( w282b, w2c2f, w3033, w3437 ); FOUR_ROUNDS( vu_add( w383b, vu_ld( fd_sha256_core_shaext_Kmask+56UL ) ) );
    vu_t w3c3f = NEXT_W( w2c2f, w3033, w3437, w383b ); FOUR_ROUNDS( vu_add( w3c3f, vu_ld( fd_sha256_core_shaext_Kmask+60UL ) ) );

    /* Standard SHA-256 feed-forward: add the initial hash values back
       into the compressed state. */
    stateFEBA = vu_add( stateFEBA, initialFEBA );
    stateHGDC = vu_add( stateHGDC, initialHGDC );

    /* Repack the two (F,E,B,A)/(H,G,D,C) state vectors into A..D /
       E..H word order; this 32-byte digest is the message for the next
       iteration. */
    vu_t stateABCD = vu_permute2( stateFEBA, stateHGDC, 3, 2, 3, 2 );
    vu_t stateEFGH = vu_permute2( stateFEBA, stateHGDC, 1, 0, 1, 0 );

    w0003 = stateABCD;
    w0407 = stateEFGH;
  }
  /* Swap the final digest back to byte order and write it out.  When
     cnt==0 this just stores the (bswapped then re-bswapped) input. */
  vu_stu( hash,      vu_bswap( w0003 ) );
  vu_stu( hash+16UL, vu_bswap( w0407 ) );

#else

  uchar buf[ FD_SHA256_PRIVATE_BUF_MAX ] __attribute__((aligned(128)));

  /* Prepare padding once: the message occupies buf[0..31], followed by
     the 0x80 terminator, zero fill and the 64-bit big-endian bit count
     (256) at the end of the single 64-byte block.  Only buf[0..31] is
     overwritten per iteration below; the padding stays valid. */
  ulong buf_used = 32UL;
  memcpy( buf, data, 32UL );
  buf[ buf_used ] = (uchar)0x80;
  buf_used++;

  ulong bit_cnt = 32UL << 3;
  memset( buf + buf_used, 0, FD_SHA256_PRIVATE_BUF_MAX-8UL-buf_used );
  FD_STORE( ulong, buf+FD_SHA256_PRIVATE_BUF_MAX-8UL, fd_ulong_bswap( bit_cnt ) );

  /* This is just the above streamlined to eliminate all the overheads
     to support incremental hashing. */
  for( ulong iter=0UL; iter<cnt; iter++ ) {

    uint state[8] __attribute__((aligned(32)));

    /* SHA-256 initial hash values (FIPS 180-4) */
    state[0] = 0x6a09e667U;
    state[1] = 0xbb67ae85U;
    state[2] = 0x3c6ef372U;
    state[3] = 0xa54ff53aU;
    state[4] = 0x510e527fU;
    state[5] = 0x9b05688cU;
    state[6] = 0x1f83d9abU;
    state[7] = 0x5be0cd19U;

    fd_sha256_core( state, buf, 1UL );

    /* Convert the digest to big-endian byte order and feed it back in
       as the next iteration's 32-byte message. */
    state[0] = fd_uint_bswap( state[0] );
    state[1] = fd_uint_bswap( state[1] );
    state[2] = fd_uint_bswap( state[2] );
    state[3] = fd_uint_bswap( state[3] );
    state[4] = fd_uint_bswap( state[4] );
    state[5] = fd_uint_bswap( state[5] );
    state[6] = fd_uint_bswap( state[6] );
    state[7] = fd_uint_bswap( state[7] );
    memcpy( buf, state, 32UL );
  }
  memcpy( hash, buf, 32UL );
#endif
  return _hash;
}
|
465 | 550 |
|
466 | 551 | #undef fd_sha256_core
|
0 commit comments