Skip to content

Commit 7753660

Browse files
authored
feat: add IA-32 (i386) SSE2 SIMD dispatch and refactor CRC into dispatch layer (#58)
- Extend SIMD acceleration from x86-64-only to IA-32 (i386) targets and restructure the CRC fold/update logic into the centralized dispatch layer
1 parent 4cbe340 commit 7753660

File tree

93 files changed

+12868
-466
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

93 files changed

+12868
-466
lines changed

HashLib.Benchmark/src/Core/uPerformanceBenchmark.pas

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ class function TPerformanceBenchmark.GetPlatformInfo: String;
173173
{$IF DEFINED(CPUX86_64)}
174174
LCPU := 'x86_64';
175175
{$ELSEIF DEFINED(CPUI386)}
176-
LCPU := 'x86';
176+
LCPU := 'i386';
177177
{$ELSEIF DEFINED(CPUAARCH64)}
178178
LCPU := 'AArch64';
179179
{$ELSEIF DEFINED(CPUARM)}
@@ -199,7 +199,7 @@ class function TPerformanceBenchmark.GetPlatformInfo: String;
199199
{$IF DEFINED(CPUX64)}
200200
LCPU := 'x86_64';
201201
{$ELSEIF DEFINED(CPUX86)}
202-
LCPU := 'x86';
202+
LCPU := 'i386';
203203
{$ELSEIF DEFINED(CPUARM64)}
204204
LCPU := 'AArch64';
205205
{$ELSEIF DEFINED(CPUARM)}

HashLib/src/Checksum/HlpAdler32Dispatch.pas

Lines changed: 68 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -64,33 +64,15 @@ procedure Adler32_Update_Scalar(AData: PByte; ALength: UInt32; ASums: Pointer);
6464
end;
6565

6666
// =============================================================================
67-
// SIMD implementations (x86-64 only)
67+
// SIMD implementations: SSE2 / SSSE3 (IA-32); SSE2 / SSSE3 / AVX2 (x86-64)
6868
// =============================================================================
6969

70-
{$IFDEF HASHLIB_X86_64_ASM}
70+
{$IFDEF HASHLIB_X86_SIMD}
7171

7272
type
7373
TProcessBlocksProc = procedure(AData: PByte; ANumBlocks: UInt32;
7474
ASums, AConstants: Pointer);
7575

76-
procedure Adler32_ProcessBlocks_Sse2(AData: PByte; ANumBlocks: UInt32;
77-
ASums, AConstants: Pointer);
78-
{$I ..\Include\Simd\Common\SimdProc4Begin.inc}
79-
{$I ..\Include\Simd\Adler32\Adler32BlocksSse2.inc}
80-
end;
81-
82-
procedure Adler32_ProcessBlocks_Ssse3(AData: PByte; ANumBlocks: UInt32;
83-
ASums, AConstants: Pointer);
84-
{$I ..\Include\Simd\Common\SimdProc4Begin.inc}
85-
{$I ..\Include\Simd\Adler32\Adler32BlocksSsse3.inc}
86-
end;
87-
88-
procedure Adler32_ProcessBlocks_Avx2(AData: PByte; ANumBlocks: UInt32;
89-
ASums, AConstants: Pointer);
90-
{$I ..\Include\Simd\Common\SimdProc4Begin.inc}
91-
{$I ..\Include\Simd\Adler32\Adler32BlocksAvx2.inc}
92-
end;
93-
9476
procedure Adler32_Update_Simd(AData: PByte; ALength: UInt32; ASums: Pointer;
9577
AProcessBlocks: TProcessBlocksProc);
9678
const
@@ -130,9 +112,42 @@ procedure Adler32_Update_Simd(AData: PByte; ALength: UInt32; ASums: Pointer;
130112
end;
131113
end;
132114

133-
procedure Adler32_Update_Sse2(AData: PByte; ALength: UInt32; ASums: Pointer);
134-
begin
135-
Adler32_Update_Simd(AData, ALength, ASums, @Adler32_ProcessBlocks_Sse2);
115+
{$ENDIF HASHLIB_X86_SIMD}
116+
117+
{$IFDEF HASHLIB_I386_ASM}
118+
119+
procedure Adler32_ProcessBlocks_Sse2(AData: PByte; ANumBlocks: UInt32;
120+
ASums, AConstants: Pointer);
121+
{$I ..\Include\Simd\Common\SimdProc4Begin_i386.inc}
122+
{$I ..\Include\Simd\Adler32\Adler32BlocksSse2_i386.inc}
123+
end;
124+
125+
procedure Adler32_ProcessBlocks_Ssse3(AData: PByte; ANumBlocks: UInt32;
126+
ASums, AConstants: Pointer);
127+
{$I ..\Include\Simd\Common\SimdProc4Begin_i386.inc}
128+
{$I ..\Include\Simd\Adler32\Adler32BlocksSsse3_i386.inc}
129+
end;
130+
131+
{$ENDIF HASHLIB_I386_ASM}
132+
133+
{$IFDEF HASHLIB_X86_64_ASM}
134+
135+
procedure Adler32_ProcessBlocks_Sse2(AData: PByte; ANumBlocks: UInt32;
136+
ASums, AConstants: Pointer);
137+
{$I ..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
138+
{$I ..\Include\Simd\Adler32\Adler32BlocksSse2_x86_64.inc}
139+
end;
140+
141+
procedure Adler32_ProcessBlocks_Ssse3(AData: PByte; ANumBlocks: UInt32;
142+
ASums, AConstants: Pointer);
143+
{$I ..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
144+
{$I ..\Include\Simd\Adler32\Adler32BlocksSsse3_x86_64.inc}
145+
end;
146+
147+
procedure Adler32_ProcessBlocks_Avx2(AData: PByte; ANumBlocks: UInt32;
148+
ASums, AConstants: Pointer);
149+
{$I ..\Include\Simd\Common\SimdProc4Begin_x86_64.inc}
150+
{$I ..\Include\Simd\Adler32\Adler32BlocksAvx2_x86_64.inc}
136151
end;
137152

138153
procedure Adler32_Update_Ssse3(AData: PByte; ALength: UInt32; ASums: Pointer);
@@ -147,13 +162,43 @@ procedure Adler32_Update_Avx2(AData: PByte; ALength: UInt32; ASums: Pointer);
147162

148163
{$ENDIF HASHLIB_X86_64_ASM}
149164

165+
{$IFDEF HASHLIB_X86_SIMD}
166+
167+
procedure Adler32_Update_Sse2(AData: PByte; ALength: UInt32; ASums: Pointer);
168+
begin
169+
Adler32_Update_Simd(AData, ALength, ASums, @Adler32_ProcessBlocks_Sse2);
170+
end;
171+
172+
{$IFDEF HASHLIB_I386_ASM}
173+
174+
procedure Adler32_Update_Ssse3(AData: PByte; ALength: UInt32; ASums: Pointer);
175+
begin
176+
Adler32_Update_Simd(AData, ALength, ASums, @Adler32_ProcessBlocks_Ssse3);
177+
end;
178+
179+
{$ENDIF HASHLIB_I386_ASM}
180+
181+
{$ENDIF HASHLIB_X86_SIMD}
182+
150183
// =============================================================================
151184
// Dispatch initialization
152185
// =============================================================================
153186

154187
procedure InitDispatch();
155188
begin
156189
Adler32_Update := @Adler32_Update_Scalar;
190+
{$IFDEF HASHLIB_I386_ASM}
191+
case TSimd.GetActiveLevel() of
192+
TSimdLevel.SSSE3:
193+
begin
194+
Adler32_Update := @Adler32_Update_Ssse3;
195+
end;
196+
TSimdLevel.SSE2:
197+
begin
198+
Adler32_Update := @Adler32_Update_Sse2;
199+
end;
200+
end;
201+
{$ENDIF}
157202
{$IFDEF HASHLIB_X86_64_ASM}
158203
case TSimd.GetActiveLevel() of
159204
TSimdLevel.AVX2:

HashLib/src/Checksum/HlpCRC.pas

Lines changed: 29 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ interface
1818
HlpHashResult,
1919
HlpIHashResult,
2020
HlpICRC,
21-
HlpGF2;
21+
HlpCRCDispatch;
2222

2323
resourcestring
2424
SUnSupportedCRCType = 'UnSupported CRC Type: "%s"';
@@ -577,7 +577,7 @@ TCRC = class sealed(THash, IChecksum, ICRC, ITransformBlock)
577577
type
578578
TCRCCacheValue = record
579579
Table: THashLibMatrixUInt64Array;
580-
FoldConstants: TCRCFoldConstants;
580+
FoldRuntime: TCRCFoldRuntimeCtx64;
581581
end;
582582

583583
class var
@@ -622,11 +622,10 @@ TCRCCacheValue = record
622622
function GetCheckValue: UInt64; inline;
623623
procedure SetCheckValue(AValue: UInt64); inline;
624624

625-
// tables work only for CRCs with width > 7
626-
procedure CalculateCRCbyTable(AData: PByte; ADataLength, AIndex: Int32);
627-
// fast bit by bit algorithm without augmented zero bytes.
628-
// does not use lookup table, suited for polynomial orders between 1...32.
629-
procedure CalculateCRCdirect(AData: PByte; ADataLength, AIndex: Int32);
625+
// Table-driven byte path: length < MinSimdBytes or tail after fold (no 16-byte block).
626+
procedure UpdateCRCViaByteTable(AData: PByte; ADataLength, AIndex: Int32);
627+
// Bit-serial update without table (width <= MinTableWidth).
628+
procedure UpdateCRCViaBitSerial(AData: PByte; ADataLength, AIndex: Int32);
630629

631630
// reflects the lower 'width' LBits of 'value'
632631
class function Reflect(AValue: UInt64; AWidth: Int32): UInt64; static;
@@ -664,9 +663,6 @@ TCRCCacheValue = record
664663

665664
implementation
666665

667-
uses
668-
HlpCRCDispatch;
669-
670666
{ TCRC }
671667

672668
function TCRC.GetCheckValue: UInt64;
@@ -754,14 +750,12 @@ function TCRC.GetName: String;
754750
Result := Format('T%s', [Names[0]]);
755751
end;
756752

757-
procedure TCRC.CalculateCRCbyTable(AData: PByte; ADataLength, AIndex: Int32);
753+
procedure TCRC.UpdateCRCViaByteTable(AData: PByte; ADataLength, AIndex: Int32);
758754
var
759755
LLength: Int32;
760-
LTemp, LQWord1, LQWord2, LNewTemp, LTempCopy: UInt64;
756+
LTemp: UInt64;
761757
LCRCTable: THashLibMatrixUInt64Array;
762758
LPtrData: PByte;
763-
LBIdx, LCrcBytes: Int32;
764-
LByte: Byte;
765759
begin
766760
LLength := ADataLength;
767761
LPtrData := AData + AIndex;
@@ -770,34 +764,6 @@ procedure TCRC.CalculateCRCbyTable(AData: PByte; ADataLength, AIndex: Int32);
770764

771765
if IsInputReflected then
772766
begin
773-
// Slicing-by-16: process 16 bytes per iteration using UInt64 reads
774-
while LLength >= 16 do
775-
begin
776-
LQWord1 := PUInt64(LPtrData)^ xor LTemp;
777-
LQWord2 := PUInt64(LPtrData + 8)^;
778-
779-
LTemp := LCRCTable[15][Byte(LQWord1)]
780-
xor LCRCTable[14][Byte(LQWord1 shr 8)]
781-
xor LCRCTable[13][Byte(LQWord1 shr 16)]
782-
xor LCRCTable[12][Byte(LQWord1 shr 24)]
783-
xor LCRCTable[11][Byte(LQWord1 shr 32)]
784-
xor LCRCTable[10][Byte(LQWord1 shr 40)]
785-
xor LCRCTable[9][Byte(LQWord1 shr 48)]
786-
xor LCRCTable[8][Byte(LQWord1 shr 56)]
787-
xor LCRCTable[7][Byte(LQWord2)]
788-
xor LCRCTable[6][Byte(LQWord2 shr 8)]
789-
xor LCRCTable[5][Byte(LQWord2 shr 16)]
790-
xor LCRCTable[4][Byte(LQWord2 shr 24)]
791-
xor LCRCTable[3][Byte(LQWord2 shr 32)]
792-
xor LCRCTable[2][Byte(LQWord2 shr 40)]
793-
xor LCRCTable[1][Byte(LQWord2 shr 48)]
794-
xor LCRCTable[0][Byte(LQWord2 shr 56)];
795-
796-
System.Inc(LPtrData, 16);
797-
System.Dec(LLength, 16);
798-
end;
799-
800-
// Remaining 1..15 bytes: byte-at-a-time using row 0
801767
while LLength > 0 do
802768
begin
803769
LTemp := (LTemp shr 8) xor LCRCTable[0][Byte(LTemp xor LPtrData^)];
@@ -807,34 +773,6 @@ procedure TCRC.CalculateCRCbyTable(AData: PByte; ADataLength, AIndex: Int32);
807773
end
808774
else
809775
begin
810-
// Non-reflected: slicing-by-16 with byte reads
811-
LCrcBytes := (Width + 7) shr 3;
812-
813-
while LLength >= 16 do
814-
begin
815-
LNewTemp := UInt64(0);
816-
LTempCopy := LTemp;
817-
818-
LBIdx := 0;
819-
while LBIdx < LCrcBytes do
820-
begin
821-
LByte := LPtrData[LBIdx] xor Byte(LTempCopy shr (Width - 8));
822-
LTempCopy := (LTempCopy shl 8) and FCRCMask;
823-
LNewTemp := LNewTemp xor LCRCTable[15 - LBIdx][LByte];
824-
System.Inc(LBIdx);
825-
end;
826-
while LBIdx < 16 do
827-
begin
828-
LNewTemp := LNewTemp xor LCRCTable[15 - LBIdx][LPtrData[LBIdx]];
829-
System.Inc(LBIdx);
830-
end;
831-
832-
LTemp := LNewTemp;
833-
System.Inc(LPtrData, 16);
834-
System.Dec(LLength, 16);
835-
end;
836-
837-
// Remaining 1..15 bytes: byte-at-a-time using row 0
838776
while LLength > 0 do
839777
begin
840778
LTemp := (LTemp shl 8) xor LCRCTable[0]
@@ -847,39 +785,10 @@ procedure TCRC.CalculateCRCbyTable(AData: PByte; ADataLength, AIndex: Int32);
847785
FHash := LTemp;
848786
end;
849787

850-
procedure TCRC.CalculateCRCdirect(AData: PByte; ADataLength, AIndex: Int32);
851-
var
852-
LLength, LIdx: Int32;
853-
LTemp, LBit, LJdx, LHash: UInt64;
788+
procedure TCRC.UpdateCRCViaBitSerial(AData: PByte; ADataLength, AIndex: Int32);
854789
begin
855-
856-
LLength := ADataLength;
857-
LIdx := AIndex;
858-
while LLength > 0 do
859-
begin
860-
LTemp := UInt64(AData[LIdx]);
861-
if (IsInputReflected) then
862-
begin
863-
LTemp := Reflect(LTemp, 8);
864-
end;
865-
866-
LJdx := $80;
867-
LHash := FHash;
868-
while LJdx > 0 do
869-
begin
870-
LBit := LHash and FCRCHighBitMask;
871-
LHash := LHash shl 1;
872-
if ((LTemp and LJdx) > 0) then
873-
LBit := LBit xor FCRCHighBitMask;
874-
if (LBit > 0) then
875-
LHash := LHash xor Polynomial;
876-
LJdx := LJdx shr 1;
877-
end;
878-
FHash := LHash;
879-
System.Inc(LIdx);
880-
System.Dec(LLength);
881-
end;
882-
790+
CRC_UpdateViaBitSerial(AData, ADataLength, AIndex, FHash, Polynomial, Width,
791+
IsInputReflected, FCRCHighBitMask);
883792
end;
884793

885794
function TCRC.Clone(): IHash;
@@ -1484,8 +1393,8 @@ class function TCRC.GetOrCreateCacheEntry(APoly: UInt64; AWidth: Int32;
14841393
if not FCache.TryGetValue(LKey, Result) then
14851394
begin
14861395
Result.Table := GenerateCRCTable(APoly, AWidth, AReflected);
1487-
TGF2.GenerateFoldConstants(APoly, AWidth, AReflected,
1488-
Result.FoldConstants);
1396+
CRCDispatch_InitRuntimeCtx64(Result.Table, APoly, AWidth, AReflected,
1397+
Result.FoldRuntime);
14891398
FCache.Add(LKey, Result);
14901399
end;
14911400
end;
@@ -1543,30 +1452,30 @@ procedure TCRC.TransformBytes(const AData: THashLibByteArray;
15431452
else
15441453
begin
15451454
LFoldFunc := CRC_Fold_Msb;
1546-
if Width < 64 then
1547-
LState[0] := FHash shl (64 - Width)
1455+
if CRC_Fold_UsesPclmul then
1456+
begin
1457+
if Width < 64 then
1458+
LState[0] := FHash shl (64 - Width)
1459+
else
1460+
LState[0] := FHash;
1461+
end
15481462
else
15491463
LState[0] := FHash;
15501464
end;
15511465

1552-
if Assigned(LFoldFunc) then
1553-
begin
1554-
LState[1] := 0;
1555-
LProcessed := ALength and (not Int32(15));
1556-
FHash := LFoldFunc(LPtrAData + AIndex, UInt32(LProcessed),
1557-
@LState[0], @FCacheEntry.FoldConstants) and FCRCMask;
1558-
LTail := ALength - LProcessed;
1559-
if LTail > 0 then
1560-
CalculateCRCbyTable(LPtrAData, LTail, AIndex + LProcessed);
1561-
end
1562-
else
1563-
CalculateCRCbyTable(LPtrAData, ALength, AIndex);
1466+
LState[1] := 0;
1467+
LProcessed := ALength and (not Int32(15));
1468+
FHash := LFoldFunc(LPtrAData + AIndex, UInt32(LProcessed), @LState[0],
1469+
@FCacheEntry.FoldRuntime) and FCRCMask;
1470+
LTail := ALength - LProcessed;
1471+
if LTail > 0 then
1472+
UpdateCRCViaByteTable(LPtrAData, LTail, AIndex + LProcessed);
15641473
end
15651474
else
1566-
CalculateCRCbyTable(LPtrAData, ALength, AIndex);
1475+
UpdateCRCViaByteTable(LPtrAData, ALength, AIndex);
15671476
end
15681477
else
1569-
CalculateCRCdirect(LPtrAData, ALength, AIndex);
1478+
UpdateCRCViaBitSerial(LPtrAData, ALength, AIndex);
15701479
end;
15711480

15721481
function TCRC.TransformFinal: IHashResult;

0 commit comments

Comments
 (0)