From 109a9591bb30dbabe8354c17f7b0bb974f262489 Mon Sep 17 00:00:00 2001 From: Seth Date: Mon, 11 Aug 2025 12:32:31 -0400 Subject: [PATCH 1/8] Init with compression sketch --- .../main/java/datawave/util/Compression.java | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 warehouse/core/src/main/java/datawave/util/Compression.java diff --git a/warehouse/core/src/main/java/datawave/util/Compression.java b/warehouse/core/src/main/java/datawave/util/Compression.java new file mode 100644 index 00000000000..d7b9280008b --- /dev/null +++ b/warehouse/core/src/main/java/datawave/util/Compression.java @@ -0,0 +1,75 @@ +package datawave.util; + +import org.apache.commons.codec.binary.Base64; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.zip.GZIPOutputStream; + +public class Compression { + + /* + Compression and Decompression order of ops: + + 1. Get your data as a String + 2. Compress it however you want (Lets say GZIP) + 3. Convert that compressed chunk into BASE64, which helps with transferring the data + + 4. Receive the compressed chunk + 5. Revert it from BASE64 to normal (Lets say GZIP) + 6. Un-GZIP it to get your info. + + */ + + public enum CompressionAlgorithm { + GZIP, + } + + public enum Codec { + BASE64, + } + + public static String compressGZIP(final String data, final Charset characterSet) throws IOException { + final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(); + final GZIPOutputStream gzipStream = new GZIPOutputStream(byteStream); + final DataOutputStream dataOut = new DataOutputStream(gzipStream); + + byte[] arr = data.getBytes(characterSet); + final int length = arr.length; + + dataOut.writeInt(length); + dataOut.write(arr); + + dataOut.close(); + byteStream.close(); + + return new String(Base64.encodeBase64(byteStream.toByteArray())); + } + + public static String decompressGZIP(){ + return ""; + } + + public static String encodeBASE64(final String data, final Charset characterSet, ) throws IOException { + final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(); + final GZIPOutputStream gzipStream = new GZIPOutputStream(byteStream); + final DataOutputStream dataOut = new DataOutputStream(gzipStream); + + byte[] arr = data.getBytes(characterSet); + final int length = arr.length; + + dataOut.writeInt(length); + dataOut.write(arr); + + dataOut.close(); + byteStream.close(); + + return new String(Base64.encodeBase64(byteStream.toByteArray())); + } + + public static String decodeBASE64(){ + return ""; + } +} From 3096695f02f95ad98b8fe02a6e383ad41b5bbbba Mon Sep 17 00:00:00 2001 From: Seth Date: Mon, 11 Aug 2025 15:10:11 -0400 Subject: [PATCH 2/8] wip --- .../main/java/datawave/util/Compression.java | 75 ------------------- .../util/compression/CompressionMethod.java | 30 ++++++++ .../util/compression/GZipCompression.java | 50 +++++++++++++ 3 files changed, 80 insertions(+), 75 deletions(-) delete mode 100644 warehouse/core/src/main/java/datawave/util/Compression.java create mode 100644 warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java create mode 100644 warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java diff --git a/warehouse/core/src/main/java/datawave/util/Compression.java b/warehouse/core/src/main/java/datawave/util/Compression.java deleted file mode 100644 index d7b9280008b..00000000000 --- a/warehouse/core/src/main/java/datawave/util/Compression.java +++ /dev/null @@ 
-1,75 +0,0 @@ -package datawave.util; - -import org.apache.commons.codec.binary.Base64; - -import java.io.ByteArrayOutputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.nio.charset.Charset; -import java.util.zip.GZIPOutputStream; - -public class Compression { - - /* - Compression and Decompression order of ops: - - 1. Get your data as a String - 2. Compress it however you want (Lets say GZIP) - 3. Convert that compressed chunk into BASE64, which helps with transferring the data - - 4. Receive the compressed chunk - 5. Revert it from BASE64 to normal (Lets say GZIP) - 6. Un-GZIP it to get your info. - - */ - - public enum CompressionAlgorithm { - GZIP, - } - - public enum Codec { - BASE64, - } - - public static String compressGZIP(final String data, final Charset characterSet) throws IOException { - final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(); - final GZIPOutputStream gzipStream = new GZIPOutputStream(byteStream); - final DataOutputStream dataOut = new DataOutputStream(gzipStream); - - byte[] arr = data.getBytes(characterSet); - final int length = arr.length; - - dataOut.writeInt(length); - dataOut.write(arr); - - dataOut.close(); - byteStream.close(); - - return new String(Base64.encodeBase64(byteStream.toByteArray())); - } - - public static String decompressGZIP(){ - return ""; - } - - public static String encodeBASE64(final String data, final Charset characterSet, ) throws IOException { - final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(); - final GZIPOutputStream gzipStream = new GZIPOutputStream(byteStream); - final DataOutputStream dataOut = new DataOutputStream(gzipStream); - - byte[] arr = data.getBytes(characterSet); - final int length = arr.length; - - dataOut.writeInt(length); - dataOut.write(arr); - - dataOut.close(); - byteStream.close(); - - return new String(Base64.encodeBase64(byteStream.toByteArray())); - } - - public static String decodeBASE64(){ - return ""; - } -} diff --git a/warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java b/warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java new file mode 100644 index 00000000000..8b155bf46c6 --- /dev/null +++ b/warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java @@ -0,0 +1,30 @@ +package datawave.util.compression; + +import org.apache.commons.codec.binary.Base64; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +public abstract class CompressionMethod { + + /* + Compression and Decompression order of ops: + + 1. Get your data as a String + 2. Compress it however you want (Lets say GZIP) + 3. Convert that compressed chunk into BASE64, which helps with transferring the data + + 4. Receive the compressed chunk + 5. Revert it from BASE64 to normal (Lets say GZIP) + 6. Un-GZIP it to get your info. 
+ + */ + + public abstract String compress(final String data, final Charset charset) throws IOException; + public abstract String decompress(final String base64, final Charset charset) throws IOException; +} diff --git a/warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java b/warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java new file mode 100644 index 00000000000..2d9d1b8dc5e --- /dev/null +++ b/warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java @@ -0,0 +1,50 @@ +package datawave.util.compression; + +import org.apache.commons.codec.binary.Base64; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +public class GZipCompression extends CompressionMethod{ + + @Override + public String compress(final String data, final Charset charset) throws IOException { + final byte[] input = data.getBytes(charset); + + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + GZIPOutputStream gzip = new GZIPOutputStream(baos)) { + + gzip.write(input); + // closing gzip finishes the stream and flushes to baos + gzip.close(); + + // todo: add codec classes if modularity is needed + return Base64.encodeBase64String(baos.toByteArray()); + } + } + + @Override + public String decompress(final String base64, final Charset charset) throws IOException { + + // always assumes the codec is base64 + final byte[] compressed = Base64.decodeBase64(base64); + + try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); + GZIPInputStream gzip = new GZIPInputStream(bais); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + + byte[] buf = new byte[4096]; + int n; + while ((n = gzip.read(buf)) != -1) { + baos.write(buf, 0, n); + } + return baos.toString(charset); + } + } +} + + From 74b9172534f79463c82dd8a8fb5ebd8216a333a7 Mon Sep 17 00:00:00 2001 From: Seth Date: Wed, 13 Aug 2025 09:19:52 -0400 Subject: [PATCH 3/8] Starter tests --- .../util/compression/CompressionMethod.java | 61 +++++++--- .../util/compression/GZipCompression.java | 39 ++++-- .../util/compression/GZipCompressionTest.java | 114 ++++++++++++++++++ 3 files changed, 189 insertions(+), 25 deletions(-) create mode 100644 warehouse/core/src/test/java/datawave/util/compression/GZipCompressionTest.java diff --git a/warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java b/warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java index 8b155bf46c6..74e52ea2c5a 100644 --- a/warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java +++ b/warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java @@ -2,29 +2,56 @@ import org.apache.commons.codec.binary.Base64; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataOutputStream; import java.io.IOException; import java.nio.charset.Charset; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; +/** + * Provides a base contract for compression and decompression methods. + * + *
+ * <p>General process:</p>
+ * <ol>
+ * <li>Compression: Original Data → Compress with Algorithm (e.g., GZIP) → Encode in Base64 → Return as String</li>
+ * <li>Decompression: Base64 String → Decode from Base64 → Decompress with Algorithm (e.g., GZIP) → Return Original Data</li>
+ * </ol>
+ *
+ * <p>Base64 encoding ensures that compressed data can be safely stored or transferred as text,
+ * regardless of binary content.</p>
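+ *
+ * <p>Illustrative round trip (a sketch; assumes the {@code GZipCompression} subclass added in this commit):</p>
+ * <pre>{@code
+ * CompressionMethod method = new GZipCompression();
+ * String base64 = method.compress("hello world", StandardCharsets.UTF_8);
+ * String restored = method.decompress(base64, StandardCharsets.UTF_8); // "hello world"
+ * }</pre>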
+ */ public abstract class CompressionMethod { - /* - Compression and Decompression order of ops: - - 1. Get your data as a String - 2. Compress it however you want (Lets say GZIP) - 3. Convert that compressed chunk into BASE64, which helps with transferring the data - - 4. Receive the compressed chunk - 5. Revert it from BASE64 to normal (Lets say GZIP) - 6. Un-GZIP it to get your info. - + /** + * Compresses the given string into a Base64-encoded representation. + * + *
+ * <p>Order of operations:</p>
+ * <ol>
+ * <li>Take the original data string.</li>
+ * <li>Compress the data using the specified compression algorithm (e.g., GZIP).</li>
+ * <li>Encode the compressed byte array into Base64.</li>
+ * <li>Return the Base64 string, encoded using the specified character set.</li>
+ * </ol>
+ * + * @param data The uncompressed string to be processed. + * @param charset The charset used for converting between strings and byte arrays. + * @return A Base64-encoded string containing the compressed data. + * @throws IOException If compression or encoding fails. */ - public abstract String compress(final String data, final Charset charset) throws IOException; + + /** + * Decompresses a Base64-encoded, compressed string back to its original form. + * + *
+ * <p>This is the reverse of {@link #compress(String, Charset)}:</p>
+ * <ol>
+ * <li>Take the Base64-encoded string.</li>
+ * <li>Decode the Base64 string into the original compressed byte array.</li>
+ * <li>Decompress the byte array using the appropriate algorithm (e.g., GZIP).</li>
+ * <li>Convert the decompressed byte array back into a string using the specified charset.</li>
+ * </ol>
+ * + * @param base64 A Base64-encoded string containing compressed data. + * @param charset The charset used for decoding the final string. + * @return The original uncompressed string. + * @throws IOException If decoding or decompression fails. + */ public abstract String decompress(final String base64, final Charset charset) throws IOException; } diff --git a/warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java b/warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java index 2d9d1b8dc5e..c7c8b622e11 100644 --- a/warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java +++ b/warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java @@ -9,8 +9,28 @@ import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; -public class GZipCompression extends CompressionMethod{ +/** + * A concrete implementation of {@link CompressionMethod} that uses GZIP for compression + * and Base64 for safe string encoding/decoding. + * + *
+ * <p>Process overview:</p>
+ * <ul>
+ * <li>Compression: String → Bytes (charset) → GZIP → Base64 String</li>
+ * <li>Decompression: Base64 String → Bytes → GZIP → String (charset)</li>
+ * </ul>
+ *
+ * <p>Base64 ensures that the compressed binary data can be represented as plain text.</p>
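+ *
+ * <p>Example (illustrative; the exact Base64 text may vary, though a GZIP stream always begins with the magic bytes
+ * {@code 1f 8b}, so the encoded form starts with {@code "H4sI"}):</p>
+ * <pre>{@code
+ * GZipCompression gz = new GZipCompression();
+ * String base64 = gz.compress("hello", StandardCharsets.UTF_8);    // "H4sI..."
+ * String restored = gz.decompress(base64, StandardCharsets.UTF_8); // "hello"
+ * }</pre>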
+ */ +public class GZipCompression extends CompressionMethod { + /** + * Compresses a string into a Base64-encoded GZIP stream. + * + * @param data The input string to be compressed. + * @param charset The charset for encoding the string to bytes. + * @return A Base64 string containing the GZIP-compressed data. + * @throws IOException If compression or encoding fails. + */ @Override public String compress(final String data, final Charset charset) throws IOException { final byte[] input = data.getBytes(charset); @@ -19,18 +39,22 @@ public String compress(final String data, final Charset charset) throws IOExcept GZIPOutputStream gzip = new GZIPOutputStream(baos)) { gzip.write(input); - // closing gzip finishes the stream and flushes to baos - gzip.close(); + gzip.close(); // must close to flush all compressed data into baos - // todo: add codec classes if modularity is needed return Base64.encodeBase64String(baos.toByteArray()); } } + /** + * Decompresses a Base64-encoded GZIP string back into its original form. + * + * @param base64 The Base64-encoded string containing GZIP-compressed data. + * @param charset The charset used to convert decompressed bytes into a string. + * @return The original, decompressed string. + * @throws IOException If decoding or decompression fails. + */ @Override public String decompress(final String base64, final Charset charset) throws IOException { - - // always assumes the codec is base64 final byte[] compressed = Base64.decodeBase64(base64); try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); @@ -42,9 +66,8 @@ public String decompress(final String base64, final Charset charset) throws IOEx while ((n = gzip.read(buf)) != -1) { baos.write(buf, 0, n); } + return baos.toString(charset); } } } - - diff --git a/warehouse/core/src/test/java/datawave/util/compression/GZipCompressionTest.java b/warehouse/core/src/test/java/datawave/util/compression/GZipCompressionTest.java new file mode 100644 index 00000000000..d9a4e23a185 --- /dev/null +++ b/warehouse/core/src/test/java/datawave/util/compression/GZipCompressionTest.java @@ -0,0 +1,114 @@ +package datawave.util.compression; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Tests for {@link GZipCompression}. + * Notes: + * - We do NOT assert compressed bytes equality because GZIP headers (e.g., timestamps) + * can make outputs differ across runs/environments. + * - We focus on round-trip integrity and expected failure modes on bad inputs. + */ +class GZipCompressionTest { + + private GZipCompression gzip; + + @BeforeEach + void setUp() { + gzip = new GZipCompression(); + } + + @Test + void roundTrip_utf8_simpleAndUnicode() throws Exception { + String original = "Hello, café — Καλημέρα — こんにちは — 👋🌍"; + String base64 = gzip.compress(original, StandardCharsets.UTF_8); + String restored = gzip.decompress(base64, StandardCharsets.UTF_8); + assertEquals(original, restored, "UTF-8 round trip should restore original string"); + } + + @Test + void compressibleData_resultsInReasonableSize() throws Exception { + // Highly repetitive -> compresses very well even after Base64 inflation. 
+ String original = "A".repeat(10_000); + String base64 = gzip.compress(original, StandardCharsets.UTF_8); + + // Compare Base64 encoded compressed length vs original byte length + int originalBytes = original.getBytes(StandardCharsets.UTF_8).length; + assertTrue(base64.length() < originalBytes, + "Base64(compressed) should be smaller than original bytes for highly compressible data"); + // Still verify round-trip + assertEquals(original, gzip.decompress(base64, StandardCharsets.UTF_8)); + } + + @Test + void emptyString_roundTrip_ok() throws Exception { + String original = ""; + String base64 = gzip.compress(original, StandardCharsets.UTF_8); + String restored = gzip.decompress(base64, StandardCharsets.UTF_8); + assertEquals(original, restored); + } + + @Test + void iso88591_roundTrip_ok() throws Exception { + // é exists in ISO-8859-1. verify round-trip with a non-UTF charset. + Charset latin1 = StandardCharsets.ISO_8859_1; + String original = "café naïve façade résumé"; + String base64 = gzip.compress(original, latin1); + String restored = gzip.decompress(base64, latin1); + assertEquals(original, restored, "ISO-8859-1 round trip should restore original string"); + } + + @Test + void invalidBase64_throwsOnDecompress() { + // Not valid Base64. decode will yield garbage/empty, GZIPInputStream should fail. + String notBase64 = "this is not base64!!!"; + assertThrows(Exception.class, () -> gzip.decompress(notBase64, StandardCharsets.UTF_8), + "Decompression should fail on invalid Base64"); + } + + @Test + void validBase64ButNotGzip_throwsOnDecompress() { + // Base64 of plain text "hello" (not a GZIP stream). + String base64OfPlain = java.util.Base64.getEncoder().encodeToString("hello".getBytes(StandardCharsets.UTF_8)); + assertThrows(Exception.class, () -> gzip.decompress(base64OfPlain, StandardCharsets.UTF_8), + "Decompression should fail when bytes are not GZIP-compressed"); + } + + @Test + void largeData_roundTrip_ok() throws Exception { + // Pseudo-random string to simulate less-compressible data; validates stability and correctness. 
+ byte[] bytes = new byte[1 << 18]; // 256 KiB + new Random(12345).nextBytes(bytes); + String original = new String(bytes, StandardCharsets.ISO_8859_1); // 1:1 mapping for bytes 0-255 + + String base64 = gzip.compress(original, StandardCharsets.ISO_8859_1); + String restored = gzip.decompress(base64, StandardCharsets.ISO_8859_1); + assertEquals(original, restored, "Large data should round-trip correctly"); + } + + @Test + void nullData_throwsNullPointer_inCompress() { + assertThrows(NullPointerException.class, () -> gzip.compress(null, StandardCharsets.UTF_8), + "Compressing null data should throw NPE"); + } + + @Test + void nullCharset_throwsNullPointer_inCompress() { + assertThrows(NullPointerException.class, () -> gzip.compress("data", null), + "Compressing with null charset should throw NPE"); + } + + @Test + void nullBase64_throwsNullPointer_inDecompress() { + assertThrows(NullPointerException.class, () -> gzip.decompress(null, StandardCharsets.UTF_8), + "Decompressing null Base64 should throw NPE"); + } + +} From 98a63f4dd4e55c283435dac1c19efed075ee6ab4 Mon Sep 17 00:00:00 2001 From: Seth Date: Mon, 18 Aug 2025 13:09:34 -0400 Subject: [PATCH 4/8] Refactor + add 7ZIP utils to mvn --- warehouse/core/pom.xml | 6 + .../util/compression/CompressionMethod.java | 57 ---- .../util/compression/GZipCompression.java | 73 ------ .../util/compression/OptionCompressor.java | 245 ++++++++++++++++++ .../util/compression/GZipCompressionTest.java | 114 -------- .../compression/OptionCompressorTest.java | 97 +++++++ 6 files changed, 348 insertions(+), 244 deletions(-) delete mode 100644 warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java delete mode 100644 warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java create mode 100644 warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java delete mode 100644 warehouse/core/src/test/java/datawave/util/compression/GZipCompressionTest.java create mode 100644 warehouse/core/src/test/java/datawave/util/compression/OptionCompressorTest.java diff --git a/warehouse/core/pom.xml b/warehouse/core/pom.xml index f996c03b5f2..40e3aff80d4 100644 --- a/warehouse/core/pom.xml +++ b/warehouse/core/pom.xml @@ -193,6 +193,12 @@ easymock test + + + org.tukaani + xz + 1.10 + diff --git a/warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java b/warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java deleted file mode 100644 index 74e52ea2c5a..00000000000 --- a/warehouse/core/src/main/java/datawave/util/compression/CompressionMethod.java +++ /dev/null @@ -1,57 +0,0 @@ -package datawave.util.compression; - -import org.apache.commons.codec.binary.Base64; - -import java.io.IOException; -import java.nio.charset.Charset; - -/** - * Provides a base contract for compression and decompression methods. - * - *
- * <p>General process:</p>
- * <ol>
- * <li>Compression: Original Data → Compress with Algorithm (e.g., GZIP) → Encode in Base64 → Return as String</li>
- * <li>Decompression: Base64 String → Decode from Base64 → Decompress with Algorithm (e.g., GZIP) → Return Original Data</li>
- * </ol>
- *
- * <p>Base64 encoding ensures that compressed data can be safely stored or transferred as text,
- * regardless of binary content.</p>
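- *
- * <p>Illustrative round trip (a sketch; assumes the {@code GZipCompression} subclass added in this commit):</p>
- * <pre>{@code
- * CompressionMethod method = new GZipCompression();
- * String base64 = method.compress("hello world", StandardCharsets.UTF_8);
- * String restored = method.decompress(base64, StandardCharsets.UTF_8); // "hello world"
- * }</pre>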
- */ -public abstract class CompressionMethod { - - /** - * Compresses the given string into a Base64-encoded representation. - * - *
- * <p>Order of operations:</p>
- * <ol>
- * <li>Take the original data string.</li>
- * <li>Compress the data using the specified compression algorithm (e.g., GZIP).</li>
- * <li>Encode the compressed byte array into Base64.</li>
- * <li>Return the Base64 string, encoded using the specified character set.</li>
- * </ol>
- * - * @param data The uncompressed string to be processed. - * @param charset The charset used for converting between strings and byte arrays. - * @return A Base64-encoded string containing the compressed data. - * @throws IOException If compression or encoding fails. - */ - public abstract String compress(final String data, final Charset charset) throws IOException; - - /** - * Decompresses a Base64-encoded, compressed string back to its original form. - * - *
- * <p>This is the reverse of {@link #compress(String, Charset)}:</p>
- * <ol>
- * <li>Take the Base64-encoded string.</li>
- * <li>Decode the Base64 string into the original compressed byte array.</li>
- * <li>Decompress the byte array using the appropriate algorithm (e.g., GZIP).</li>
- * <li>Convert the decompressed byte array back into a string using the specified charset.</li>
- * </ol>
- * - * @param base64 A Base64-encoded string containing compressed data. - * @param charset The charset used for decoding the final string. - * @return The original uncompressed string. - * @throws IOException If decoding or decompression fails. - */ - public abstract String decompress(final String base64, final Charset charset) throws IOException; -} diff --git a/warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java b/warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java deleted file mode 100644 index c7c8b622e11..00000000000 --- a/warehouse/core/src/main/java/datawave/util/compression/GZipCompression.java +++ /dev/null @@ -1,73 +0,0 @@ -package datawave.util.compression; - -import org.apache.commons.codec.binary.Base64; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.nio.charset.Charset; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - -/** - * A concrete implementation of {@link CompressionMethod} that uses GZIP for compression - * and Base64 for safe string encoding/decoding. - * - *
- * <p>Process overview:</p>
- * <ul>
- * <li>Compression: String → Bytes (charset) → GZIP → Base64 String</li>
- * <li>Decompression: Base64 String → Bytes → GZIP → String (charset)</li>
- * </ul>
- *
- * <p>Base64 ensures that the compressed binary data can be represented as plain text.</p>
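- *
- * <p>Example (illustrative; the exact Base64 text may vary, though a GZIP stream always begins with the magic bytes
- * {@code 1f 8b}, so the encoded form starts with {@code "H4sI"}):</p>
- * <pre>{@code
- * GZipCompression gz = new GZipCompression();
- * String base64 = gz.compress("hello", StandardCharsets.UTF_8);    // "H4sI..."
- * String restored = gz.decompress(base64, StandardCharsets.UTF_8); // "hello"
- * }</pre>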
- */ -public class GZipCompression extends CompressionMethod { - - /** - * Compresses a string into a Base64-encoded GZIP stream. - * - * @param data The input string to be compressed. - * @param charset The charset for encoding the string to bytes. - * @return A Base64 string containing the GZIP-compressed data. - * @throws IOException If compression or encoding fails. - */ - @Override - public String compress(final String data, final Charset charset) throws IOException { - final byte[] input = data.getBytes(charset); - - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - GZIPOutputStream gzip = new GZIPOutputStream(baos)) { - - gzip.write(input); - gzip.close(); // must close to flush all compressed data into baos - - return Base64.encodeBase64String(baos.toByteArray()); - } - } - - /** - * Decompresses a Base64-encoded GZIP string back into its original form. - * - * @param base64 The Base64-encoded string containing GZIP-compressed data. - * @param charset The charset used to convert decompressed bytes into a string. - * @return The original, decompressed string. - * @throws IOException If decoding or decompression fails. - */ - @Override - public String decompress(final String base64, final Charset charset) throws IOException { - final byte[] compressed = Base64.decodeBase64(base64); - - try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); - GZIPInputStream gzip = new GZIPInputStream(bais); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - - byte[] buf = new byte[4096]; - int n; - while ((n = gzip.read(buf)) != -1) { - baos.write(buf, 0, n); - } - - return baos.toString(charset); - } - } -} diff --git a/warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java b/warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java new file mode 100644 index 00000000000..1822c215c75 --- /dev/null +++ b/warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java @@ -0,0 +1,245 @@ +package datawave.util.compression; + +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; +import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream; +import org.apache.commons.compress.compressors.lzma.LZMACompressorOutputStream; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + + +// Bzip2, Gz, Lz4, Lzo, NoCompression, Snappy, ZStandard + +/** + * Provides a base contract for compression and decompression methods. + * + *
+ * <p>General process:</p>
+ * <ol>
+ * <li>Compression: Original Data → Compress with Algorithm (selected via {@link CompressionMethod}) → Encode in Base64 (except {@code NONE}) → Return as String</li>
+ * <li>Decompression: Input String (Base64 if compressed; plain text if {@code NONE}) → Decode Base64 (if applicable) → Decompress with Algorithm → Return Original Data</li>
+ * </ol>
+ *
+ * <p>Base64 encoding ensures that compressed data can be safely stored or transferred as text.
+ * For {@link CompressionMethod#NONE} the data is returned as-is and is not Base64-encoded.</p>
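+ *
+ * <p>Illustrative usage (a sketch; the {@code options} string is hypothetical):</p>
+ * <pre>{@code
+ * OptionCompressor compressor = new OptionCompressor();
+ * String packed = compressor.compress(options, CompressionMethod.GZIP, StandardCharsets.UTF_8);
+ * String unpacked = compressor.decompress(packed, CompressionMethod.GZIP, StandardCharsets.UTF_8);
+ * // unpacked.equals(options) == true
+ * }</pre>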
+ */ +public class OptionCompressor { + + // possible data class for holding metadata for the compression method. Not sure if I'll need to actually include this. + // see Gz.java or Bzip2.java from Accumulo. + // public CompressionAlgorithmConfiguration configuration; + + // additionally, see https://commons.apache.org/proper/commons-compress/ + // it's probably best to source all the compression from the same place if possible. + + public enum CompressionMethod{ + NONE, + GZIP, + BZIP2, + SEVEN_ZIP + } + + /** + * Compresses the given string using the specified {@link CompressionMethod}. + * + *
+ * <p>Order of operations (by method):</p>
+ * <ul>
+ * <li>{@code NONE}: Return {@code data} unchanged; no Base64 applied.</li>
+ * <li>{@code GZIP}: Convert {@code data} to bytes with {@code charset} → GZIP-compress → Base64-encode → return String.</li>
+ * <li>{@code BZIP2}: Convert {@code data} to bytes with {@code charset} → BZIP2-compress → Base64-encode → return String.</li>
+ * <li>{@code SEVEN_ZIP}: Convert {@code data} to bytes with {@code charset} → LZMA-compress (7z algorithm stream) → Base64-encode → return String.</li>
+ * </ul>
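+ *
+ * <p>Illustrative behavior of the two extremes (a sketch; sample output abbreviated):</p>
+ * <pre>{@code
+ * compress("abc", CompressionMethod.NONE, StandardCharsets.UTF_8); // "abc" (identity, no Base64)
+ * compress("abc", CompressionMethod.GZIP, StandardCharsets.UTF_8); // Base64 text, e.g. "H4sI..."
+ * }</pre>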
+ */ + public String compress(final String data, final CompressionMethod method, final Charset charset) throws IOException { + switch(method){ + case NONE: + return data; + case GZIP: + return compressGZIP(data, charset); + case BZIP2: + return compressBZIP2(data, charset); + case SEVEN_ZIP: + return compress7ZIP(data, charset); + default: + throw new IllegalArgumentException("unrecognized compression option: " + method); + } + } + + /** + * GZIP implementation reference: + * 1) String → bytes via charset + * 2) Write to GZIPOutputStream (must close to finalize) + * 3) Base64-encode compressed bytes + */ + private String compressGZIP(final String data, final Charset charset){ + final byte[] input = data.getBytes(charset); + + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + GZIPOutputStream gzip = new GZIPOutputStream(baos)) { + + gzip.write(input); + gzip.close(); // must close to flush all compressed data into baos + + return Base64.encodeBase64String(baos.toByteArray()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * BZIP2 implementation (mirrors GZIP flow): + * 1) String → bytes via charset + * 2) Write to BZip2CompressorOutputStream (close to finalize) + * 3) Base64-encode compressed bytes + */ + private String compressBZIP2(final String data, final Charset charset) { + final byte[] input = data.getBytes(charset); + + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + BZip2CompressorOutputStream bzos = new BZip2CompressorOutputStream(baos)) { + + bzos.write(input); + bzos.close(); // finalize BZIP2 stream + + return Base64.encodeBase64String(baos.toByteArray()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * "7-Zip" via LZMA stream implementation: + * 1) String → bytes via charset + * 2) Write to LZMACompressorOutputStream (close to finalize) + * 3) Base64-encode compressed bytes + * + * Note: This uses the LZMA compressor stream from Apache Commons Compress, + * which is the algorithm used by 7-Zip; it is not a .7z archive container. + */ + private String compress7ZIP(final String data, final Charset charset) { + final byte[] input = data.getBytes(charset); + + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + LZMACompressorOutputStream lzma = new LZMACompressorOutputStream(baos)) { + + lzma.write(input); + lzma.close(); // finalize LZMA stream + + return Base64.encodeBase64String(baos.toByteArray()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * GZIP implementation reference for decompression: + * 1) Base64-decode to compressed bytes + * 2) Read via GZIPInputStream into buffer + * 3) Collect into ByteArrayOutputStream → String via charset + */ + private String decompressGZIP(final String dataBase64, final Charset charset){ + final byte[] compressed = Base64.decodeBase64(dataBase64); + + try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); + GZIPInputStream gzip = new GZIPInputStream(bais); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + + byte[] buf = new byte[4096]; + int n; + while ((n = gzip.read(buf)) != -1) { + baos.write(buf, 0, n); + } + + // NOTE: Keeping the same pattern as provided code. 
+ return baos.toString(charset); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * BZIP2 decompression (mirrors GZIP flow): + * 1) Base64-decode to compressed bytes + * 2) Read via BZip2CompressorInputStream into buffer + * 3) Collect into ByteArrayOutputStream → String via charset + */ + private String decompressBZIP2(final String dataBase64, final Charset charset) { + final byte[] compressed = Base64.decodeBase64(dataBase64); + + try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); + BZip2CompressorInputStream bzis = new BZip2CompressorInputStream(bais); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + + byte[] buf = new byte[4096]; + int n; + while ((n = bzis.read(buf)) != -1) { + baos.write(buf, 0, n); + } + + return baos.toString(charset); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * "7-Zip" via LZMA stream decompression (mirrors GZIP/BZIP2 flow): + * 1) Base64-decode to compressed bytes + * 2) Read via LZMACompressorInputStream into buffer + * 3) Collect into ByteArrayOutputStream → String via charset + * + * Note: This expects LZMA stream data (not a .7z archive container). + */ + private String decompress7ZIP(final String dataBase64, final Charset charset) { + final byte[] compressed = Base64.decodeBase64(dataBase64); + + try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); + LZMACompressorInputStream lzma = new LZMACompressorInputStream(bais); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + + byte[] buf = new byte[4096]; + int n; + while ((n = lzma.read(buf)) != -1) { + baos.write(buf, 0, n); + } + + return baos.toString(charset); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * Decompresses a string using the specified {@link CompressionMethod}. + * + *
+ * <p>Order of operations (by method):</p>
+ * <ul>
+ * <li>{@code NONE}: Return input unchanged (input is expected to be the original plain text, not Base64).</li>
+ * <li>{@code GZIP}: Base64-decode → GZIP-decompress → decode to String using {@code charset}.</li>
+ * <li>{@code BZIP2}: Base64-decode → BZIP2-decompress → decode to String using {@code charset}.</li>
+ * <li>{@code SEVEN_ZIP}: Base64-decode → LZMA-decompress → decode to String using {@code charset}.</li>
+ * </ul>
+ *
+ * <p>This is the reverse of {@link #compress(String, CompressionMethod, Charset)}. For all non-{@code NONE} methods,
+ * the input should be the Base64 text returned by the corresponding {@code compress(...)} call.</p>
+ */ + public String decompress(final String input, final CompressionMethod method, final Charset charset) throws IOException { + switch (method) { + case NONE: + return input; + case GZIP: + return decompressGZIP(input, charset); + case BZIP2: + return decompressBZIP2(input, charset); + case SEVEN_ZIP: + return decompress7ZIP(input, charset); + default: + throw new IllegalArgumentException("unrecognized decompression option: " + method); + } + } +} diff --git a/warehouse/core/src/test/java/datawave/util/compression/GZipCompressionTest.java b/warehouse/core/src/test/java/datawave/util/compression/GZipCompressionTest.java deleted file mode 100644 index d9a4e23a185..00000000000 --- a/warehouse/core/src/test/java/datawave/util/compression/GZipCompressionTest.java +++ /dev/null @@ -1,114 +0,0 @@ -package datawave.util.compression; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.util.Random; - -import static org.junit.jupiter.api.Assertions.*; - -/** - * Tests for {@link GZipCompression}. - * Notes: - * - We do NOT assert compressed bytes equality because GZIP headers (e.g., timestamps) - * can make outputs differ across runs/environments. - * - We focus on round-trip integrity and expected failure modes on bad inputs. - */ -class GZipCompressionTest { - - private GZipCompression gzip; - - @BeforeEach - void setUp() { - gzip = new GZipCompression(); - } - - @Test - void roundTrip_utf8_simpleAndUnicode() throws Exception { - String original = "Hello, café — Καλημέρα — こんにちは — 👋🌍"; - String base64 = gzip.compress(original, StandardCharsets.UTF_8); - String restored = gzip.decompress(base64, StandardCharsets.UTF_8); - assertEquals(original, restored, "UTF-8 round trip should restore original string"); - } - - @Test - void compressibleData_resultsInReasonableSize() throws Exception { - // Highly repetitive -> compresses very well even after Base64 inflation. - String original = "A".repeat(10_000); - String base64 = gzip.compress(original, StandardCharsets.UTF_8); - - // Compare Base64 encoded compressed length vs original byte length - int originalBytes = original.getBytes(StandardCharsets.UTF_8).length; - assertTrue(base64.length() < originalBytes, - "Base64(compressed) should be smaller than original bytes for highly compressible data"); - // Still verify round-trip - assertEquals(original, gzip.decompress(base64, StandardCharsets.UTF_8)); - } - - @Test - void emptyString_roundTrip_ok() throws Exception { - String original = ""; - String base64 = gzip.compress(original, StandardCharsets.UTF_8); - String restored = gzip.decompress(base64, StandardCharsets.UTF_8); - assertEquals(original, restored); - } - - @Test - void iso88591_roundTrip_ok() throws Exception { - // é exists in ISO-8859-1. verify round-trip with a non-UTF charset. - Charset latin1 = StandardCharsets.ISO_8859_1; - String original = "café naïve façade résumé"; - String base64 = gzip.compress(original, latin1); - String restored = gzip.decompress(base64, latin1); - assertEquals(original, restored, "ISO-8859-1 round trip should restore original string"); - } - - @Test - void invalidBase64_throwsOnDecompress() { - // Not valid Base64. decode will yield garbage/empty, GZIPInputStream should fail. 
- String notBase64 = "this is not base64!!!"; - assertThrows(Exception.class, () -> gzip.decompress(notBase64, StandardCharsets.UTF_8), - "Decompression should fail on invalid Base64"); - } - - @Test - void validBase64ButNotGzip_throwsOnDecompress() { - // Base64 of plain text "hello" (not a GZIP stream). - String base64OfPlain = java.util.Base64.getEncoder().encodeToString("hello".getBytes(StandardCharsets.UTF_8)); - assertThrows(Exception.class, () -> gzip.decompress(base64OfPlain, StandardCharsets.UTF_8), - "Decompression should fail when bytes are not GZIP-compressed"); - } - - @Test - void largeData_roundTrip_ok() throws Exception { - // Pseudo-random string to simulate less-compressible data; validates stability and correctness. - byte[] bytes = new byte[1 << 18]; // 256 KiB - new Random(12345).nextBytes(bytes); - String original = new String(bytes, StandardCharsets.ISO_8859_1); // 1:1 mapping for bytes 0-255 - - String base64 = gzip.compress(original, StandardCharsets.ISO_8859_1); - String restored = gzip.decompress(base64, StandardCharsets.ISO_8859_1); - assertEquals(original, restored, "Large data should round-trip correctly"); - } - - @Test - void nullData_throwsNullPointer_inCompress() { - assertThrows(NullPointerException.class, () -> gzip.compress(null, StandardCharsets.UTF_8), - "Compressing null data should throw NPE"); - } - - @Test - void nullCharset_throwsNullPointer_inCompress() { - assertThrows(NullPointerException.class, () -> gzip.compress("data", null), - "Compressing with null charset should throw NPE"); - } - - @Test - void nullBase64_throwsNullPointer_inDecompress() { - assertThrows(NullPointerException.class, () -> gzip.decompress(null, StandardCharsets.UTF_8), - "Decompressing null Base64 should throw NPE"); - } - -} diff --git a/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorTest.java b/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorTest.java new file mode 100644 index 00000000000..6f433b10997 --- /dev/null +++ b/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorTest.java @@ -0,0 +1,97 @@ +package datawave.util.compression; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.*; + +class OptionCompressorTest { + + private OptionCompressor compressor; + + @BeforeEach + void setUp() { + compressor = new OptionCompressor(); + } + + // Helper wrappers for readability + private String compress(String s, OptionCompressor.CompressionMethod m, Charset cs) throws Exception { + return compressor.compress(s, m, cs); + } + private String decompress(String s, OptionCompressor.CompressionMethod m, Charset cs) throws Exception { + return compressor.decompress(s, m, cs); + } + + // Round-trip: UTF-8 content across all real compression methods + @ParameterizedTest + @EnumSource(value = OptionCompressor.CompressionMethod.class, names = {"GZIP", "BZIP2", "SEVEN_ZIP"}) + void roundTrip_utf8_allAlgos(OptionCompressor.CompressionMethod method) throws Exception { + String original = "Hello, café — Καλημέρα — こんにちは — 👋🌍"; + String base64 = compress(original, method, StandardCharsets.UTF_8); + String restored = decompress(base64, method, StandardCharsets.UTF_8); + assertEquals(original, restored, method + " UTF-8 round trip should 
restore original string"); + } + + // Highly compressible data should shrink after Base64 (still smaller than original bytes) + @ParameterizedTest + @EnumSource(value = OptionCompressor.CompressionMethod.class, names = {"GZIP", "BZIP2", "SEVEN_ZIP"}) + void compressibleData_resultsInReasonableSize_allAlgos(OptionCompressor.CompressionMethod method) throws Exception { + String original = "A".repeat(10_000); + String base64 = compress(original, method, StandardCharsets.UTF_8); + + int originalBytes = original.getBytes(StandardCharsets.UTF_8).length; + assertTrue(base64.length() < originalBytes, + () -> method + " Base64(compressed) should be smaller than original bytes for highly compressible data"); + + assertEquals(original, decompress(base64, method, StandardCharsets.UTF_8)); + } + + // Large-ish random data round-trip (ISO-8859-1 for 1:1 byte<->char) + @ParameterizedTest + @EnumSource(value = OptionCompressor.CompressionMethod.class, names = {"GZIP", "BZIP2", "SEVEN_ZIP"}) + void largeData_roundTrip_allAlgos(OptionCompressor.CompressionMethod method) throws Exception { + byte[] bytes = new byte[1 << 18]; // 256 KiB + new Random(12345).nextBytes(bytes); + String original = new String(bytes, StandardCharsets.ISO_8859_1); + + String base64 = compress(original, method, StandardCharsets.ISO_8859_1); + String restored = decompress(base64, method, StandardCharsets.ISO_8859_1); + assertEquals(original, restored, method + " large data should round-trip correctly"); + } + + // Bad inputs: invalid Base64 should fail on real decompressors + @ParameterizedTest + @EnumSource(value = OptionCompressor.CompressionMethod.class, names = {"GZIP", "BZIP2", "SEVEN_ZIP"}) + void invalidBase64_throwsOnDecompress_allAlgos(OptionCompressor.CompressionMethod method) { + String notBase64 = "this is not base64!!!"; + assertThrows(Exception.class, () -> decompress(notBase64, method, StandardCharsets.UTF_8), + method + " decompression should fail on invalid Base64"); + } + + // Bad inputs: valid Base64 but not compressed bytes should fail + @ParameterizedTest + @EnumSource(value = OptionCompressor.CompressionMethod.class, names = {"GZIP", "BZIP2", "SEVEN_ZIP"}) + void validBase64ButNotCompressed_throwsOnDecompress_allAlgos(OptionCompressor.CompressionMethod method) { + String base64OfPlain = java.util.Base64.getEncoder() + .encodeToString("hello".getBytes(StandardCharsets.UTF_8)); + assertThrows(Exception.class, () -> decompress(base64OfPlain, method, StandardCharsets.UTF_8), + method + " decompression should fail when bytes are not " + method + "-compressed"); + } + + // NONE: identity behavior (no Base64, no compression) + @Test + void noneMethod_identityAndNoBase64Assumptions() throws Exception { + String original = "Plain text 👀 stays as-is."; + String compressed = compress(original, OptionCompressor.CompressionMethod.NONE, StandardCharsets.UTF_8); + assertEquals(original, compressed, "NONE compress should be identity"); + + String restored = decompress(original, OptionCompressor.CompressionMethod.NONE, StandardCharsets.UTF_8); + assertEquals(original, restored, "NONE decompress should be identity"); + } +} From 92e36dc59b7993c70dd05ab7f01d2ba5e5f55bc8 Mon Sep 17 00:00:00 2001 From: Seth Date: Mon, 25 Aug 2025 15:01:52 -0400 Subject: [PATCH 5/8] Add Efficiency Test --- warehouse/core/pom.xml | 12 +- .../util/compression/OptionCompressor.java | 127 +++-- .../OptionCompressorEfficiencyTest.java | 444 ++++++++++++++++++ .../compression/OptionCompressorTest.java | 36 +- .../query/index/lookup/RangeStream.java | 19 + 
.../query/index/lookup/ShardRangeStream.java | 6 + 6 files changed, 552 insertions(+), 92 deletions(-) create mode 100644 warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java diff --git a/warehouse/core/pom.xml b/warehouse/core/pom.xml index 40e3aff80d4..e79c79f12fb 100644 --- a/warehouse/core/pom.xml +++ b/warehouse/core/pom.xml @@ -157,6 +157,12 @@ org.springframework spring-context-support + + + org.tukaani + xz + 1.10 + xerces xercesImpl @@ -193,12 +199,6 @@ easymock test - - - org.tukaani - xz - 1.10 - diff --git a/warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java b/warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java index 1822c215c75..4922b8a1f78 100644 --- a/warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java +++ b/warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java @@ -1,11 +1,5 @@ package datawave.util.compression; -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; -import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream; -import org.apache.commons.compress.compressors.lzma.LZMACompressorOutputStream; - import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -13,20 +7,31 @@ import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; +import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream; +import org.apache.commons.compress.compressors.lzma.LZMACompressorOutputStream; // Bzip2, Gz, Lz4, Lzo, NoCompression, Snappy, ZStandard /** * Provides a base contract for compression and decompression methods. * - *
- * <p>General process:</p>
+ * <p>
+ * General process:
+ * </p>
- * <ol>
- * <li>Compression: Original Data → Compress with Algorithm (selected via {@link CompressionMethod}) → Encode in Base64 (except {@code NONE}) → Return as String</li>
- * <li>Decompression: Input String (Base64 if compressed; plain text if {@code NONE}) → Decode Base64 (if applicable) → Decompress with Algorithm → Return Original Data</li>
- * </ol>
+ * <ol>
+ * <li>Compression: Original Data → Compress with Algorithm (selected via {@link CompressionMethod}) → Encode in Base64 (except {@code NONE}) → Return as
+ * String</li>
+ * <li>Decompression: Input String (Base64 if compressed; plain text if {@code NONE}) → Decode Base64 (if applicable) → Decompress with Algorithm →
+ * Return Original Data</li>
+ * </ol>
- * <p>Base64 encoding ensures that compressed data can be safely stored or transferred as text.
- * For {@link CompressionMethod#NONE} the data is returned as-is and is not Base64-encoded.</p>
+ * <p>
+ * Base64 encoding ensures that compressed data can be safely stored or transferred as text. For {@link CompressionMethod#NONE} the data is returned as-is and
+ * is not Base64-encoded.
+ * </p>
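  *
  * <p>Illustrative usage (a sketch; the {@code options} string is hypothetical):</p>
  * <pre>{@code
  * OptionCompressor compressor = new OptionCompressor();
  * String packed = compressor.compress(options, CompressionMethod.GZIP, StandardCharsets.UTF_8);
  * String unpacked = compressor.decompress(packed, CompressionMethod.GZIP, StandardCharsets.UTF_8);
  * // unpacked.equals(options) == true
  * }</pre>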
*/ public class OptionCompressor { @@ -37,26 +42,25 @@ public class OptionCompressor { // additionally, see https://commons.apache.org/proper/commons-compress/ // it's probably best to source all the compression from the same place if possible. - public enum CompressionMethod{ - NONE, - GZIP, - BZIP2, - SEVEN_ZIP + public enum CompressionMethod { + NONE, GZIP, BZIP2, SEVEN_ZIP } /** * Compresses the given string using the specified {@link CompressionMethod}. * - *
- * <p>Order of operations (by method):</p>
+ * <p>
+ * Order of operations (by method):
+ * </p>
- * <ul>
- * <li>{@code NONE}: Return {@code data} unchanged; no Base64 applied.</li>
- * <li>{@code GZIP}: Convert {@code data} to bytes with {@code charset} → GZIP-compress → Base64-encode → return String.</li>
- * <li>{@code BZIP2}: Convert {@code data} to bytes with {@code charset} → BZIP2-compress → Base64-encode → return String.</li>
- * <li>{@code SEVEN_ZIP}: Convert {@code data} to bytes with {@code charset} → LZMA-compress (7z algorithm stream) → Base64-encode → return String.</li>
- * </ul>
+ * <ul>
+ * <li>{@code NONE}: Return {@code data} unchanged; no Base64 applied.</li>
+ * <li>{@code GZIP}: Convert {@code data} to bytes with {@code charset} → GZIP-compress → Base64-encode → return String.</li>
+ * <li>{@code BZIP2}: Convert {@code data} to bytes with {@code charset} → BZIP2-compress → Base64-encode → return String.</li>
+ * <li>{@code SEVEN_ZIP}: Convert {@code data} to bytes with {@code charset} → LZMA-compress (7z algorithm stream) → Base64-encode → return String.</li>
+ * </ul>
*/ public String compress(final String data, final CompressionMethod method, final Charset charset) throws IOException { - switch(method){ + switch (method) { case NONE: return data; case GZIP: @@ -71,16 +75,12 @@ public String compress(final String data, final CompressionMethod method, final } /** - * GZIP implementation reference: - * 1) String → bytes via charset - * 2) Write to GZIPOutputStream (must close to finalize) - * 3) Base64-encode compressed bytes + * GZIP implementation reference: 1) String → bytes via charset 2) Write to GZIPOutputStream (must close to finalize) 3) Base64-encode compressed bytes */ - private String compressGZIP(final String data, final Charset charset){ + private String compressGZIP(final String data, final Charset charset) { final byte[] input = data.getBytes(charset); - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - GZIPOutputStream gzip = new GZIPOutputStream(baos)) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); GZIPOutputStream gzip = new GZIPOutputStream(baos)) { gzip.write(input); gzip.close(); // must close to flush all compressed data into baos @@ -92,16 +92,13 @@ private String compressGZIP(final String data, final Charset charset){ } /** - * BZIP2 implementation (mirrors GZIP flow): - * 1) String → bytes via charset - * 2) Write to BZip2CompressorOutputStream (close to finalize) - * 3) Base64-encode compressed bytes + * BZIP2 implementation (mirrors GZIP flow): 1) String → bytes via charset 2) Write to BZip2CompressorOutputStream (close to finalize) 3) Base64-encode + * compressed bytes */ private String compressBZIP2(final String data, final Charset charset) { final byte[] input = data.getBytes(charset); - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - BZip2CompressorOutputStream bzos = new BZip2CompressorOutputStream(baos)) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); BZip2CompressorOutputStream bzos = new BZip2CompressorOutputStream(baos)) { bzos.write(input); bzos.close(); // finalize BZIP2 stream @@ -113,19 +110,15 @@ private String compressBZIP2(final String data, final Charset charset) { } /** - * "7-Zip" via LZMA stream implementation: - * 1) String → bytes via charset - * 2) Write to LZMACompressorOutputStream (close to finalize) - * 3) Base64-encode compressed bytes + * "7-Zip" via LZMA stream implementation: 1) String → bytes via charset 2) Write to LZMACompressorOutputStream (close to finalize) 3) Base64-encode + * compressed bytes * - * Note: This uses the LZMA compressor stream from Apache Commons Compress, - * which is the algorithm used by 7-Zip; it is not a .7z archive container. + * Note: This uses the LZMA compressor stream from Apache Commons Compress, which is the algorithm used by 7-Zip; it is not a .7z archive container. 
*/ private String compress7ZIP(final String data, final Charset charset) { final byte[] input = data.getBytes(charset); - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - LZMACompressorOutputStream lzma = new LZMACompressorOutputStream(baos)) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); LZMACompressorOutputStream lzma = new LZMACompressorOutputStream(baos)) { lzma.write(input); lzma.close(); // finalize LZMA stream @@ -137,17 +130,15 @@ private String compress7ZIP(final String data, final Charset charset) { } /** - * GZIP implementation reference for decompression: - * 1) Base64-decode to compressed bytes - * 2) Read via GZIPInputStream into buffer - * 3) Collect into ByteArrayOutputStream → String via charset + * GZIP implementation reference for decompression: 1) Base64-decode to compressed bytes 2) Read via GZIPInputStream into buffer 3) Collect into + * ByteArrayOutputStream → String via charset */ - private String decompressGZIP(final String dataBase64, final Charset charset){ + private String decompressGZIP(final String dataBase64, final Charset charset) { final byte[] compressed = Base64.decodeBase64(dataBase64); try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); - GZIPInputStream gzip = new GZIPInputStream(bais); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + GZIPInputStream gzip = new GZIPInputStream(bais); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { byte[] buf = new byte[4096]; int n; @@ -163,17 +154,15 @@ private String decompressGZIP(final String dataBase64, final Charset charset){ } /** - * BZIP2 decompression (mirrors GZIP flow): - * 1) Base64-decode to compressed bytes - * 2) Read via BZip2CompressorInputStream into buffer - * 3) Collect into ByteArrayOutputStream → String via charset + * BZIP2 decompression (mirrors GZIP flow): 1) Base64-decode to compressed bytes 2) Read via BZip2CompressorInputStream into buffer 3) Collect into + * ByteArrayOutputStream → String via charset */ private String decompressBZIP2(final String dataBase64, final Charset charset) { final byte[] compressed = Base64.decodeBase64(dataBase64); try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); - BZip2CompressorInputStream bzis = new BZip2CompressorInputStream(bais); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + BZip2CompressorInputStream bzis = new BZip2CompressorInputStream(bais); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { byte[] buf = new byte[4096]; int n; @@ -188,9 +177,7 @@ private String decompressBZIP2(final String dataBase64, final Charset charset) { } /** - * "7-Zip" via LZMA stream decompression (mirrors GZIP/BZIP2 flow): - * 1) Base64-decode to compressed bytes - * 2) Read via LZMACompressorInputStream into buffer + * "7-Zip" via LZMA stream decompression (mirrors GZIP/BZIP2 flow): 1) Base64-decode to compressed bytes 2) Read via LZMACompressorInputStream into buffer * 3) Collect into ByteArrayOutputStream → String via charset * * Note: This expects LZMA stream data (not a .7z archive container). 
@@ -199,8 +186,8 @@ private String decompress7ZIP(final String dataBase64, final Charset charset) { final byte[] compressed = Base64.decodeBase64(dataBase64); try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); - LZMACompressorInputStream lzma = new LZMACompressorInputStream(bais); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + LZMACompressorInputStream lzma = new LZMACompressorInputStream(bais); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { byte[] buf = new byte[4096]; int n; @@ -217,16 +204,20 @@ private String decompress7ZIP(final String dataBase64, final Charset charset) { /** * Decompresses a string using the specified {@link CompressionMethod}. * - *
- * <p>Order of operations (by method):</p>
+ * <p>
+ * Order of operations (by method):
+ * </p>
- * <ul>
- * <li>{@code NONE}: Return input unchanged (input is expected to be the original plain text, not Base64).</li>
- * <li>{@code GZIP}: Base64-decode → GZIP-decompress → decode to String using {@code charset}.</li>
- * <li>{@code BZIP2}: Base64-decode → BZIP2-decompress → decode to String using {@code charset}.</li>
- * <li>{@code SEVEN_ZIP}: Base64-decode → LZMA-decompress → decode to String using {@code charset}.</li>
- * </ul>
+ * <ul>
+ * <li>{@code NONE}: Return input unchanged (input is expected to be the original plain text, not Base64).</li>
+ * <li>{@code GZIP}: Base64-decode → GZIP-decompress → decode to String using {@code charset}.</li>
+ * <li>{@code BZIP2}: Base64-decode → BZIP2-decompress → decode to String using {@code charset}.</li>
+ * <li>{@code SEVEN_ZIP}: Base64-decode → LZMA-decompress → decode to String using {@code charset}.</li>
+ * </ul>
- * <p>This is the reverse of {@link #compress(String, CompressionMethod, Charset)}. For all non-{@code NONE} methods,
- * the input should be the Base64 text returned by the corresponding {@code compress(...)} call.</p>
+ * <p>
+ * This is the reverse of {@link #compress(String, CompressionMethod, Charset)}. For all non-{@code NONE} methods, the input should be the Base64 text
+ * returned by the corresponding {@code compress(...)} call.
+ * </p>
*/ public String decompress(final String input, final CompressionMethod method, final Charset charset) throws IOException { switch (method) { diff --git a/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java b/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java new file mode 100644 index 00000000000..2fd638352b5 --- /dev/null +++ b/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java @@ -0,0 +1,444 @@ +package datawave.util.compression; + +import datawave.query.util.TypeMetadata; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.lang3.StringUtils; // Apache Commons Lang for capitalization +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.*; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Compression benchmark for OptionCompressor across generated datasets. + * + * - Prints a concise summary table per run (averages over MEASUREMENT_TRIALS) + * - Optional: dumps the full corpus mapping (fields -> types -> ingests) at the END of each run + * - Optional: runs multiple exponentially larger datasets to observe scaling trends + */ +public class OptionCompressorEfficiencyTest { + + private static final Charset UTF8 = StandardCharsets.UTF_8; + + // Fixed type set (as requested) + private static final List TYPES = Arrays.asList( + "LcNoDiacriticsType", "NumberType", "DateType", "BooleanType", "GeoType" + ); + + // ---- Output toggles ---- + private static final boolean PRINT_SUMMARY_TABLE = true; // human-readable summary + private static final boolean PRINT_CORPUS_DETAILS = false; // single toggle: dump full corpus at end of each run + + // ---- Benchmark configuration ---- + private static final int MEASUREMENT_TRIALS = 5; // number of timing iterations (averaged) + private static final long RNG_SEED = 42L; // base seed for deterministic generation + + // ---- Dataset generation (base) ---- + private static final int BASE_FIELD_COUNT = 64; // fields generated with natural names + private static final int BASE_INGEST_COUNT = 16; // ingests generated with natural names + private static final int MAX_TYPES_PER_FIELD = 5; // chosen from TYPES per field + + // ---- Multi-run exponential scaling ---- + // If enabled, we run SCALE_STEPS datasets. Each step multiplies fields & ingests by SCALE_FACTOR^step. 
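To make the scaling comment above concrete with the constants set just below (base 64 fields / 16 ingests, SCALE_FACTOR 4, SCALE_STEPS 3), the three runs cover 64x16, 256x64, and 1024x256 field/ingest grids. A throwaway sketch of that arithmetic, separate from the test itself:

public class ScalePreview {
    public static void main(String[] args) {
        final int baseFields = 64, baseIngests = 16;
        final int scaleFactor = 4, scaleSteps = 3;
        for (int step = 0; step < scaleSteps; step++) {
            int scale = 1;
            for (int i = 0; i < step; i++) {
                scale *= scaleFactor; // SCALE_FACTOR^step: 1, 4, 16
            }
            System.out.printf("step=%d fields=%d ingests=%d%n", step, baseFields * scale, baseIngests * scale);
        }
    }
}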
+ private static final boolean SCALE_CORPUS_SIZES = true; + private static final int SCALE_STEPS = 3; // e.g., 4 runs: base, x2, x4, x8 (with factor 2) + private static final int SCALE_FACTOR = 4; // >=1; 2 = doubles each step + + // Word roots for natural, number-free names + private static final String[] NAME_WORDS = { + "lorem","ipsum","dolor","amet","terra","orbis","nova","astra","luna","sol","nimbus","cirrus","zenith","apex", + "aurora","boreal","ember","aqua","flumen","rivus","silva","folium","radix","vertex","praxis","nexus","vector", + "argon","neon","ion","quanta","quantum","plasma","flux","sigma","omega","delta","theta","lambda","kappa","alpha", + "gamma","zeta","rho","tau","psi","mercury","atlas","phoenix","orion","vega","sirius","altair","pangea","cedrus", + "cortex","lumen","umbra","umbrel","vulcan","cronos","helios","gaia","aether","strata","tundra","sylvan","arbor", + "granum","cumulus","stratus","horizon","axiom","lemma","theorem","matrix","vectora","volt","ampere" + }; + + @Test + void runCompressionBenchmarkSuite() throws IOException { + OptionCompressor compressor = new OptionCompressor(); + List datasets = new ArrayList<>(); + + if (SCALE_CORPUS_SIZES && SCALE_STEPS > 0 && SCALE_FACTOR >= 1) { + for (int step = 0; step < SCALE_STEPS; step++) { + long stepSeed = RNG_SEED + step * 1337L; // distinct, deterministic seed per step + int scale = intPow(SCALE_FACTOR, step); + int fields = Math.max(1, BASE_FIELD_COUNT * scale); + int ingests = Math.max(1, BASE_INGEST_COUNT * scale); + + String runName = String.format(Locale.ROOT, + "generated(step=%d,fields=%d,ingests=%d,maxTypes=%d)", + step, fields, ingests, MAX_TYPES_PER_FIELD); + + datasets.add(buildDataset(runName, fields, ingests, MAX_TYPES_PER_FIELD, TYPES, stepSeed)); + } + } else { + datasets.add(buildDataset( + "generated(base)", + BASE_FIELD_COUNT, BASE_INGEST_COUNT, MAX_TYPES_PER_FIELD, TYPES, RNG_SEED + )); + } + + // Warm up the JVM/compressor once across all datasets & algorithms + warmupAllMethods(compressor, datasets); + + if (PRINT_SUMMARY_TABLE) { + printSummaryLegend(MEASUREMENT_TRIALS); + } + + // ---- Main measurement loop ---- + for (BenchmarkDataset dataset : datasets) { + final byte[] originalBytes = dataset.serialized.getBytes(UTF8); + final int originalLen = originalBytes.length; + + if (PRINT_SUMMARY_TABLE) { + printRunHeader(dataset.name, originalLen); + printSummaryHeader(); + } + + for (OptionCompressor.CompressionMethod method : OptionCompressor.CompressionMethod.values()) { + long compressNanosTotal = 0; + long decompressNanosTotal = 0; + int compressedLenRaw = 0; + int compressedLenB64 = 0; + + String lastEncoded = null; + + for (int i = 0; i < MEASUREMENT_TRIALS; i++) { + long t0 = System.nanoTime(); + lastEncoded = compressor.compress(dataset.serialized, method, UTF8); + long t1 = System.nanoTime(); + compressNanosTotal += (t1 - t0); + + if (method == OptionCompressor.CompressionMethod.NONE) { + compressedLenRaw = originalLen; + compressedLenB64 = originalLen; + } else { + byte[] decoded = Base64.decodeBase64(lastEncoded); + compressedLenRaw = decoded.length; + compressedLenB64 = lastEncoded.getBytes(UTF8).length; + } + + long t2 = System.nanoTime(); + String restored = compressor.decompress(lastEncoded, method, UTF8); + long t3 = System.nanoTime(); + decompressNanosTotal += (t3 - t2); + + assertEquals(dataset.serialized, restored, "Round-trip must match for " + method); + } + + double avgCompressMs = nanosToMillis(compressNanosTotal / (double) MEASUREMENT_TRIALS); + double avgDecompressMs = 
nanosToMillis(decompressNanosTotal / (double) MEASUREMENT_TRIALS); + double ratioRaw = (double) compressedLenRaw / originalLen; + double ratioB64 = (double) compressedLenB64 / originalLen; + + if (PRINT_SUMMARY_TABLE) { + printSummaryRow( + method.name(), + humanBytes(originalLen), + humanBytes(compressedLenRaw), + humanBytes(compressedLenB64), + percentSaved(ratioRaw), + percentSaved(ratioB64), + avgCompressMs, + avgDecompressMs + ); + } + } + + // ---- Dump the full corpus mapping at the END of this run (single boolean) ---- + if (PRINT_CORPUS_DETAILS) { + printCorpusDetails(dataset); + } + + if (PRINT_SUMMARY_TABLE) { + System.out.println(); // spacer between runs + } + } + } + + // ------------------------ Naming & formatting helpers ------------------------ + + private static void printSummaryLegend(int trials) { + System.out.println(); + System.out.println("OptionCompressor Benchmark — averages over " + trials + " trials (lower time is better; higher % saved is better)."); + System.out.println("Columns:"); + System.out.println(" Method Compression algorithm"); + System.out.println(" Original Original size"); + System.out.println(" Compressed Size after compression (no Base64)"); + System.out.println(" Base64 Size after compression + Base64 text"); + System.out.println(" Saved (raw) % space saved vs original (no Base64)"); + System.out.println(" Saved (b64) % space saved vs original (with Base64)"); + System.out.println(" Compress/Decompress: average milliseconds"); + System.out.println(); + } + + private static void printRunHeader(String name, int origLenBytes) { + System.out.printf("Run: %s | Original total: %s (%d bytes)%n", name, humanBytes(origLenBytes), origLenBytes); + } + + private static void printSummaryHeader() { + System.out.printf("%-14s %-11s %-11s %-10s %-11s %-13s%n", + "Method", "Original", "Compressed", "Base64", "Saved (raw)", "Saved (b64)"); + System.out.printf("%-14s %-11s %-11s %-10s %-11s %-13s%n", + repeat('-',14), repeat('-',11), repeat('-',11), repeat('-',10), repeat('-',11), repeat('-',13)); + } + + private static void printSummaryRow(String method, + String original, + String compressed, + String base64, + String savedRawPct, + String savedB64Pct, + double compressMs, + double decompressMs) { + System.out.printf("%-14s %-11s %-11s %-10s %-11s %-13s | Compress: %6.3f ms Decompress: %6.3f ms%n", + method, original, compressed, base64, savedRawPct, savedB64Pct, compressMs, decompressMs); + } + + private static String humanBytes(long bytes) { + final String[] units = {"B","KiB","MiB","GiB","TiB"}; + double v = bytes; + int u = 0; + while (v >= 1024.0 && u < units.length - 1) { v /= 1024.0; u++; } + if (u == 0) return String.format(Locale.ROOT, "%d%s", bytes, units[u]); + return String.format(Locale.ROOT, "%.2f%s", v, units[u]); + } + + private static String percentSaved(double ratioCompressedOverOriginal) { + double saved = (1.0 - ratioCompressedOverOriginal) * 100.0; + return String.format(Locale.ROOT, "%.1f%%", saved); + } + + private static String repeat(char c, int n) { + char[] arr = new char[n]; + Arrays.fill(arr, c); + return new String(arr); + } + + private static double nanosToMillis(double nanos) { + return nanos / 1_000_000.0; + } + + private static int intPow(int base, int exp) { + int r = 1; + for (int i = 0; i < exp; i++) r = Math.multiplyExact(r, base); + return r; + } + + // ------------------------ Corpus dump (single boolean; no preview tuning) ------------------------ + + private static void printCorpusDetails(BenchmarkDataset ds) { + 
System.out.printf("%n=== Corpus Details ===%n"); + System.out.printf("Seed: %d%n", ds.seed); + System.out.printf("Fields: %d Ingests: %d MaxTypesPerField: %d TypesUsed: %d / %d Triples: %,d%n", + ds.generatedFieldCount, ds.generatedIngestCount, ds.maxTypesPerField, + ds.distinctTypes.size(), ds.totalTypesAvailable, ds.triples.size()); + + // Full, untruncated listing (sorted for determinism) + List fields = new ArrayList<>(ds.byField.keySet()); + Collections.sort(fields); + for (String f : fields) { + System.out.println("- " + f + ":"); + Map> types = ds.byField.getOrDefault(f, Collections.emptyMap()); + List typeNames = new ArrayList<>(types.keySet()); + Collections.sort(typeNames); + for (String t : typeNames) { + List ing = new ArrayList<>(types.getOrDefault(t, Collections.emptySet())); + Collections.sort(ing); + System.out.println(" " + t + " -> " + String.join(", ", ing)); + } + } + System.out.println("======================\n"); + } + + // ------------------------ Dataset generation & warmup ------------------------ + + private static class Mapping { + final String field, ingest, type; + Mapping(String f, String i, String t) { this.field = f; this.ingest = i; this.type = t; } + } + + private static class BenchmarkDataset { + final String name; + final String serialized; // TypeMetadata serialization + final long seed; + + final int generatedFieldCount; + final int generatedIngestCount; + final int maxTypesPerField; + final int totalTypesAvailable; + + final List triples; + + final Set distinctFields = new HashSet<>(); + final Set distinctIngests = new HashSet<>(); + final Set distinctTypes = new HashSet<>(); + // field -> type -> ingests + final Map>> byField = new HashMap<>(); + + BenchmarkDataset(String name, + String serialized, + long seed, + int generatedFieldCount, + int generatedIngestCount, + int maxTypesPerField, + int totalTypesAvailable, + List triples) { + this.name = name; + this.serialized = serialized; + this.seed = seed; + this.generatedFieldCount = generatedFieldCount; + this.generatedIngestCount = generatedIngestCount; + this.maxTypesPerField = maxTypesPerField; + this.totalTypesAvailable = totalTypesAvailable; + this.triples = triples; + + for (Mapping m : triples) { + distinctFields.add(m.field); + distinctIngests.add(m.ingest); + distinctTypes.add(m.type); + byField + .computeIfAbsent(m.field, k -> new HashMap<>()) + .computeIfAbsent(m.type, k -> new HashSet<>()) + .add(m.ingest); + } + } + } + + private static void warmupAllMethods(OptionCompressor compressor, List datasets) { + for (BenchmarkDataset ds : datasets) { + for (OptionCompressor.CompressionMethod m : OptionCompressor.CompressionMethod.values()) { + try { + String enc = compressor.compress(ds.serialized, m, UTF8); + String dec = compressor.decompress(enc, m, UTF8); + assertEquals(ds.serialized, dec); + } catch (Exception ignored) {} + } + } + } + + /** + * Build a dataset from generated, natural (number-free) field/ingest names and + * a deterministic random subset (up to maxTypesPerField) of TYPES per field. + * For each (field, chosenType), ALL ingests are included. 
+ */ + private static BenchmarkDataset buildDataset(String name, + int fieldCount, + int ingestCount, + int maxTypesPerField, + List availableTypes, + long seed) { + if (fieldCount < 1) fieldCount = 1; + if (ingestCount < 1) ingestCount = 1; + if (maxTypesPerField < 1) maxTypesPerField = 1; + + Random rnd = new Random(seed); + + // Use different derived seeds so fields/ingests differ + List fields = generateNaturalNames(fieldCount, new Random(seed ^ 0xC0FFEE1234ABCDEFL)); + List ingests = generateNaturalNames(ingestCount, new Random(seed ^ 0xBEEFBABE56789ABCL)); + + TypeMetadata tm = new TypeMetadata(); + List triples = new ArrayList<>(); + + for (String f : fields) { + List shuffled = new ArrayList<>(availableTypes); + Collections.shuffle(shuffled, rnd); + int choose = Math.min(maxTypesPerField, shuffled.size()); + List chosenTypes = shuffled.subList(0, choose); + + for (String t : chosenTypes) { + for (String i : ingests) { + tm.put(f, i, t); + triples.add(new Mapping(f, i, t)); + } + } + } + + return new BenchmarkDataset( + name, + tm.toString(), + seed, + fieldCount, + ingestCount, + maxTypesPerField, + availableTypes.size(), + triples + ); + } + + /** + * Generate 'count' unique, natural-looking, number-free names using pairs (then triples) of roots. + * Deterministic given the Random. Produces lowerCamelCase tokens like "auroraFlux". + * Uses Apache Commons Lang for capitalization. + */ + private static List generateNaturalNames(int count, Random rnd) { + List words = Arrays.asList(NAME_WORDS); + + // All distinct 2-word combinations (order matters; no duplicates like (i,i)) + List pairs = new ArrayList<>(words.size() * (words.size() - 1)); + for (int i = 0; i < words.size(); i++) { + for (int j = 0; j < words.size(); j++) { + if (i == j) continue; + pairs.add(new int[]{i, j}); + } + } + Collections.shuffle(pairs, rnd); + + LinkedHashSet out = new LinkedHashSet<>(count); + int p = 0; + while (out.size() < count && p < pairs.size()) { + int[] ij = pairs.get(p++); + out.add(toLowerCamel(words.get(ij[0]), words.get(ij[1]))); + } + + // Top up with 3-word combinations if needed + if (out.size() < count) { + List triples = new ArrayList<>(words.size() * words.size() * words.size()); + for (int i = 0; i < words.size(); i++) { + for (int j = 0; j < words.size(); j++) { + if (i == j) continue; + for (int k = 0; k < words.size(); k++) { + if (k == i || k == j) continue; + triples.add(new int[]{i, j, k}); + } + } + } + Collections.shuffle(triples, rnd); + int t = 0; + while (out.size() < count && t < triples.size()) { + int[] ijk = triples.get(t++); + out.add(toLowerCamel(words.get(ijk[0]), words.get(ijk[1]), words.get(ijk[2]))); + } + } + + // Safety fallback + if (out.size() < count) { + for (String w : words) { + out.add(toLowerCamel(w)); + if (out.size() >= count) break; + } + } + return new ArrayList<>(out).subList(0, count); + } + + private static String toLowerCamel(String... 
parts) { + if (parts == null || parts.length == 0) return ""; + List clean = new ArrayList<>(parts.length); + for (String p : parts) { + if (p == null) continue; + String s = p.replaceAll("[^A-Za-z]", "").toLowerCase(Locale.ROOT); + if (!s.isEmpty()) clean.add(s); + } + if (clean.isEmpty()) return ""; + StringBuilder sb = new StringBuilder(clean.get(0)); // first stays lower + for (int i = 1; i < clean.size(); i++) { + sb.append(StringUtils.capitalize(clean.get(i))); // Apache Commons Lang + } + return sb.toString(); + } +} diff --git a/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorTest.java b/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorTest.java index 6f433b10997..cdc0abb25f3 100644 --- a/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorTest.java +++ b/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorTest.java @@ -1,15 +1,17 @@ package datawave.util.compression; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.EnumSource; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Random; -import static org.junit.jupiter.api.Assertions.*; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; class OptionCompressorTest { @@ -24,11 +26,12 @@ void setUp() { private String compress(String s, OptionCompressor.CompressionMethod m, Charset cs) throws Exception { return compressor.compress(s, m, cs); } + private String decompress(String s, OptionCompressor.CompressionMethod m, Charset cs) throws Exception { return compressor.decompress(s, m, cs); } - // Round-trip: UTF-8 content across all real compression methods + // Round-trip: UTF-8 content across all real compression methods @ParameterizedTest @EnumSource(value = OptionCompressor.CompressionMethod.class, names = {"GZIP", "BZIP2", "SEVEN_ZIP"}) void roundTrip_utf8_allAlgos(OptionCompressor.CompressionMethod method) throws Exception { @@ -38,7 +41,7 @@ void roundTrip_utf8_allAlgos(OptionCompressor.CompressionMethod method) throws E assertEquals(original, restored, method + " UTF-8 round trip should restore original string"); } - // Highly compressible data should shrink after Base64 (still smaller than original bytes) + // Highly compressible data should shrink after Base64 (still smaller than original bytes) @ParameterizedTest @EnumSource(value = OptionCompressor.CompressionMethod.class, names = {"GZIP", "BZIP2", "SEVEN_ZIP"}) void compressibleData_resultsInReasonableSize_allAlgos(OptionCompressor.CompressionMethod method) throws Exception { @@ -46,13 +49,12 @@ void compressibleData_resultsInReasonableSize_allAlgos(OptionCompressor.Compress String base64 = compress(original, method, StandardCharsets.UTF_8); int originalBytes = original.getBytes(StandardCharsets.UTF_8).length; - assertTrue(base64.length() < originalBytes, - () -> method + " Base64(compressed) should be smaller than original bytes for highly compressible data"); + assertTrue(base64.length() < originalBytes, () -> method + " Base64(compressed) should be smaller than original bytes for highly compressible data"); 
assertEquals(original, decompress(base64, method, StandardCharsets.UTF_8)); } - // Large-ish random data round-trip (ISO-8859-1 for 1:1 byte<->char) + // Large-ish random data round-trip (ISO-8859-1 for 1:1 byte<->char) @ParameterizedTest @EnumSource(value = OptionCompressor.CompressionMethod.class, names = {"GZIP", "BZIP2", "SEVEN_ZIP"}) void largeData_roundTrip_allAlgos(OptionCompressor.CompressionMethod method) throws Exception { @@ -65,26 +67,24 @@ void largeData_roundTrip_allAlgos(OptionCompressor.CompressionMethod method) thr assertEquals(original, restored, method + " large data should round-trip correctly"); } - // Bad inputs: invalid Base64 should fail on real decompressors + // Bad inputs: invalid Base64 should fail on real decompressors @ParameterizedTest @EnumSource(value = OptionCompressor.CompressionMethod.class, names = {"GZIP", "BZIP2", "SEVEN_ZIP"}) void invalidBase64_throwsOnDecompress_allAlgos(OptionCompressor.CompressionMethod method) { String notBase64 = "this is not base64!!!"; - assertThrows(Exception.class, () -> decompress(notBase64, method, StandardCharsets.UTF_8), - method + " decompression should fail on invalid Base64"); + assertThrows(Exception.class, () -> decompress(notBase64, method, StandardCharsets.UTF_8), method + " decompression should fail on invalid Base64"); } - // Bad inputs: valid Base64 but not compressed bytes should fail + // Bad inputs: valid Base64 but not compressed bytes should fail @ParameterizedTest @EnumSource(value = OptionCompressor.CompressionMethod.class, names = {"GZIP", "BZIP2", "SEVEN_ZIP"}) void validBase64ButNotCompressed_throwsOnDecompress_allAlgos(OptionCompressor.CompressionMethod method) { - String base64OfPlain = java.util.Base64.getEncoder() - .encodeToString("hello".getBytes(StandardCharsets.UTF_8)); + String base64OfPlain = java.util.Base64.getEncoder().encodeToString("hello".getBytes(StandardCharsets.UTF_8)); assertThrows(Exception.class, () -> decompress(base64OfPlain, method, StandardCharsets.UTF_8), - method + " decompression should fail when bytes are not " + method + "-compressed"); + method + " decompression should fail when bytes are not " + method + "-compressed"); } - // NONE: identity behavior (no Base64, no compression) + // NONE: identity behavior (no Base64, no compression) @Test void noneMethod_identityAndNoBase64Assumptions() throws Exception { String original = "Plain text 👀 stays as-is."; diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java index c0ef46cc987..c3612528621 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java @@ -69,6 +69,7 @@ import com.google.common.base.Function; import com.google.common.base.Predicate; +import com.google.common.collect.HashMultimap; import com.google.common.collect.Lists; import com.google.common.collect.Multimap; import com.google.common.collect.Sets; @@ -204,6 +205,24 @@ public CloseableIterable streamPlans(JexlNode script) { log.trace(JexlStringBuildingVisitor.buildQuery(node)); } + System.out.println("SETH SMUCKER"); + + Multimap> nonIndexedQueryFieldsDatatypes = HashMultimap.create(config.getQueryFieldsDatatypes()); + nonIndexedQueryFieldsDatatypes.keySet().removeAll(config.getIndexedFields()); + String nonIndexedTypes = QueryOptions.buildFieldNormalizerString(nonIndexedQueryFieldsDatatypes); + 
System.out.println("NON_INDEXED_DATATYPES: " + nonIndexedTypes); + + try { + String serializedTypeMetadata = metadataHelper.getTypeMetadata(config.getDatatypeFilter()).toString(); + System.out.println("TYPE_METADATA: " + serializedTypeMetadata); + + } catch (TableNotFoundException ignored) { + + } + + // String requiredAuthsString = metadataHelper.getUsersMetadataAuthorizationSubset(); + // System.out.println("TYPE_METADATA_AUTHS: " + requiredAuthsString); + BaseIndexStream ranges = (BaseIndexStream) node.jjtAccept(this, null); // Guards against the case of a very oddly formed JEXL query, e.g. ("foo") diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java index 24372e784c7..4cb7caf55f4 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java @@ -136,18 +136,24 @@ public QueryPlan apply(Entry entry) { * Lift and shift from DefaultQueryPlanner to avoid reliance on static methods */ private void configureTypeMappings(ShardQueryConfiguration config, IteratorSetting cfg, MetadataHelper metadataHelper) { + System.out.println("SETH SMUCKER"); DefaultQueryPlanner.addOption(cfg, QueryOptions.QUERY_MAPPING_COMPRESS, Boolean.toString(true), false); Multimap> nonIndexedQueryFieldsDatatypes = HashMultimap.create(config.getQueryFieldsDatatypes()); nonIndexedQueryFieldsDatatypes.keySet().removeAll(config.getIndexedFields()); String nonIndexedTypes = QueryOptions.buildFieldNormalizerString(nonIndexedQueryFieldsDatatypes); + System.out.println("NON_INDEXED_DATATYPES: " + nonIndexedTypes); DefaultQueryPlanner.addOption(cfg, QueryOptions.NON_INDEXED_DATATYPES, nonIndexedTypes, false); try { String serializedTypeMetadata = metadataHelper.getTypeMetadata(config.getDatatypeFilter()).toString(); + System.out.println("TYPE_METADATA: " + serializedTypeMetadata); + DefaultQueryPlanner.addOption(cfg, QueryOptions.TYPE_METADATA, serializedTypeMetadata, false); String requiredAuthsString = metadataHelper.getUsersMetadataAuthorizationSubset(); + System.out.println("TYPE_METADATA_AUTHS: " + requiredAuthsString); + requiredAuthsString = QueryOptions.compressOption(requiredAuthsString, QueryOptions.UTF8); DefaultQueryPlanner.addOption(cfg, QueryOptions.TYPE_METADATA_AUTHS, requiredAuthsString, false); } catch (TableNotFoundException | IOException e) { From d95f1621e04edbf72eb7da69cc256a1374782af0 Mon Sep 17 00:00:00 2001 From: Seth Date: Mon, 25 Aug 2025 16:11:18 -0400 Subject: [PATCH 6/8] Fixes --- .../OptionCompressorEfficiencyTest.java | 241 ++++++++---------- 1 file changed, 112 insertions(+), 129 deletions(-) diff --git a/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java b/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java index 2fd638352b5..44eaa1b214f 100644 --- a/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java +++ b/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java @@ -1,61 +1,69 @@ package datawave.util.compression; -import datawave.query.util.TypeMetadata; -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.lang3.StringUtils; // Apache Commons Lang for capitalization -import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.charset.Charset; import 
java.nio.charset.StandardCharsets; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Random; +import java.util.Set; -import static org.junit.jupiter.api.Assertions.*; + +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.lang3.StringUtils; // Apache Commons Lang for capitalization +import org.junit.jupiter.api.Test; + +import datawave.query.util.TypeMetadata; + +import static org.junit.jupiter.api.Assertions.assertEquals; /** * Compression benchmark for OptionCompressor across generated datasets. * - * - Prints a concise summary table per run (averages over MEASUREMENT_TRIALS) - * - Optional: dumps the full corpus mapping (fields -> types -> ingests) at the END of each run - * - Optional: runs multiple exponentially larger datasets to observe scaling trends + * - Prints a concise summary table per run (averages over MEASUREMENT_TRIALS) - Optional: dumps the full corpus mapping (fields -> types -> ingests) at the END + * of each run - Optional: runs multiple exponentially larger datasets to observe scaling trends */ public class OptionCompressorEfficiencyTest { private static final Charset UTF8 = StandardCharsets.UTF_8; // Fixed type set (as requested) - private static final List TYPES = Arrays.asList( - "LcNoDiacriticsType", "NumberType", "DateType", "BooleanType", "GeoType" - ); + private static final List TYPES = Arrays.asList("LcNoDiacriticsType", "NumberType", "DateType", "BooleanType", "GeoType"); // ---- Output toggles ---- - private static final boolean PRINT_SUMMARY_TABLE = true; // human-readable summary - private static final boolean PRINT_CORPUS_DETAILS = false; // single toggle: dump full corpus at end of each run + private static final boolean PRINT_SUMMARY_TABLE = true; // human-readable summary + private static final boolean PRINT_CORPUS_DETAILS = false; // single toggle: dump full corpus at end of each run // ---- Benchmark configuration ---- - private static final int MEASUREMENT_TRIALS = 5; // number of timing iterations (averaged) - private static final long RNG_SEED = 42L; // base seed for deterministic generation + private static final int MEASUREMENT_TRIALS = 5; // number of timing iterations (averaged) + private static final long RNG_SEED = 42L; // base seed for deterministic generation // ---- Dataset generation (base) ---- - private static final int BASE_FIELD_COUNT = 64; // fields generated with natural names - private static final int BASE_INGEST_COUNT = 16; // ingests generated with natural names - private static final int MAX_TYPES_PER_FIELD = 5; // chosen from TYPES per field + private static final int BASE_FIELD_COUNT = 64; // fields generated with natural names + private static final int BASE_INGEST_COUNT = 16; // ingests generated with natural names + private static final int MAX_TYPES_PER_FIELD = 5; // chosen from TYPES per field // ---- Multi-run exponential scaling ---- // If enabled, we run SCALE_STEPS datasets. Each step multiplies fields & ingests by SCALE_FACTOR^step. 
private static final boolean SCALE_CORPUS_SIZES = true; - private static final int SCALE_STEPS = 3; // e.g., 4 runs: base, x2, x4, x8 (with factor 2) - private static final int SCALE_FACTOR = 4; // >=1; 2 = doubles each step + private static final int SCALE_STEPS = 3; // e.g., 4 runs: base, x2, x4, x8 (with factor 2) + private static final int SCALE_FACTOR = 4; // >=1; 2 = doubles each step // Word roots for natural, number-free names - private static final String[] NAME_WORDS = { - "lorem","ipsum","dolor","amet","terra","orbis","nova","astra","luna","sol","nimbus","cirrus","zenith","apex", - "aurora","boreal","ember","aqua","flumen","rivus","silva","folium","radix","vertex","praxis","nexus","vector", - "argon","neon","ion","quanta","quantum","plasma","flux","sigma","omega","delta","theta","lambda","kappa","alpha", - "gamma","zeta","rho","tau","psi","mercury","atlas","phoenix","orion","vega","sirius","altair","pangea","cedrus", - "cortex","lumen","umbra","umbrel","vulcan","cronos","helios","gaia","aether","strata","tundra","sylvan","arbor", - "granum","cumulus","stratus","horizon","axiom","lemma","theorem","matrix","vectora","volt","ampere" - }; + private static final String[] NAME_WORDS = {"lorem", "ipsum", "dolor", "amet", "terra", "orbis", "nova", "astra", "luna", "sol", "nimbus", "cirrus", + "zenith", "apex", "aurora", "boreal", "ember", "aqua", "flumen", "rivus", "silva", "folium", "radix", "vertex", "praxis", "nexus", "vector", + "argon", "neon", "ion", "quanta", "quantum", "plasma", "flux", "sigma", "omega", "delta", "theta", "lambda", "kappa", "alpha", "gamma", "zeta", + "rho", "tau", "psi", "mercury", "atlas", "phoenix", "orion", "vega", "sirius", "altair", "pangea", "cedrus", "cortex", "lumen", "umbra", "umbrel", + "vulcan", "cronos", "helios", "gaia", "aether", "strata", "tundra", "sylvan", "arbor", "granum", "cumulus", "stratus", "horizon", "axiom", "lemma", + "theorem", "matrix", "vectora", "volt", "ampere"}; @Test void runCompressionBenchmarkSuite() throws IOException { @@ -65,21 +73,16 @@ void runCompressionBenchmarkSuite() throws IOException { if (SCALE_CORPUS_SIZES && SCALE_STEPS > 0 && SCALE_FACTOR >= 1) { for (int step = 0; step < SCALE_STEPS; step++) { long stepSeed = RNG_SEED + step * 1337L; // distinct, deterministic seed per step - int scale = intPow(SCALE_FACTOR, step); - int fields = Math.max(1, BASE_FIELD_COUNT * scale); - int ingests = Math.max(1, BASE_INGEST_COUNT * scale); + int scale = intPow(SCALE_FACTOR, step); + int fields = Math.max(1, BASE_FIELD_COUNT * scale); + int ingests = Math.max(1, BASE_INGEST_COUNT * scale); - String runName = String.format(Locale.ROOT, - "generated(step=%d,fields=%d,ingests=%d,maxTypes=%d)", - step, fields, ingests, MAX_TYPES_PER_FIELD); + String runName = String.format(Locale.ROOT, "generated(step=%d,fields=%d,ingests=%d,maxTypes=%d)", step, fields, ingests, MAX_TYPES_PER_FIELD); datasets.add(buildDataset(runName, fields, ingests, MAX_TYPES_PER_FIELD, TYPES, stepSeed)); } } else { - datasets.add(buildDataset( - "generated(base)", - BASE_FIELD_COUNT, BASE_INGEST_COUNT, MAX_TYPES_PER_FIELD, TYPES, RNG_SEED - )); + datasets.add(buildDataset("generated(base)", BASE_FIELD_COUNT, BASE_INGEST_COUNT, MAX_TYPES_PER_FIELD, TYPES, RNG_SEED)); } // Warm up the JVM/compressor once across all datasets & algorithms @@ -92,7 +95,7 @@ void runCompressionBenchmarkSuite() throws IOException { // ---- Main measurement loop ---- for (BenchmarkDataset dataset : datasets) { final byte[] originalBytes = dataset.serialized.getBytes(UTF8); - final int 
originalLen = originalBytes.length; + final int originalLen = originalBytes.length; if (PRINT_SUMMARY_TABLE) { printRunHeader(dataset.name, originalLen); @@ -100,10 +103,10 @@ void runCompressionBenchmarkSuite() throws IOException { } for (OptionCompressor.CompressionMethod method : OptionCompressor.CompressionMethod.values()) { - long compressNanosTotal = 0; + long compressNanosTotal = 0; long decompressNanosTotal = 0; - int compressedLenRaw = 0; - int compressedLenB64 = 0; + int compressedLenRaw = 0; + int compressedLenB64 = 0; String lastEncoded = null; @@ -130,22 +133,14 @@ void runCompressionBenchmarkSuite() throws IOException { assertEquals(dataset.serialized, restored, "Round-trip must match for " + method); } - double avgCompressMs = nanosToMillis(compressNanosTotal / (double) MEASUREMENT_TRIALS); + double avgCompressMs = nanosToMillis(compressNanosTotal / (double) MEASUREMENT_TRIALS); double avgDecompressMs = nanosToMillis(decompressNanosTotal / (double) MEASUREMENT_TRIALS); - double ratioRaw = (double) compressedLenRaw / originalLen; - double ratioB64 = (double) compressedLenB64 / originalLen; + double ratioRaw = (double) compressedLenRaw / originalLen; + double ratioB64 = (double) compressedLenB64 / originalLen; if (PRINT_SUMMARY_TABLE) { - printSummaryRow( - method.name(), - humanBytes(originalLen), - humanBytes(compressedLenRaw), - humanBytes(compressedLenB64), - percentSaved(ratioRaw), - percentSaved(ratioB64), - avgCompressMs, - avgDecompressMs - ); + printSummaryRow(method.name(), humanBytes(originalLen), humanBytes(compressedLenRaw), humanBytes(compressedLenB64), percentSaved(ratioRaw), + percentSaved(ratioB64), avgCompressMs, avgDecompressMs); } } @@ -181,30 +176,27 @@ private static void printRunHeader(String name, int origLenBytes) { } private static void printSummaryHeader() { - System.out.printf("%-14s %-11s %-11s %-10s %-11s %-13s%n", - "Method", "Original", "Compressed", "Base64", "Saved (raw)", "Saved (b64)"); - System.out.printf("%-14s %-11s %-11s %-10s %-11s %-13s%n", - repeat('-',14), repeat('-',11), repeat('-',11), repeat('-',10), repeat('-',11), repeat('-',13)); + System.out.printf("%-14s %-11s %-11s %-10s %-11s %-13s%n", "Method", "Original", "Compressed", "Base64", "Saved (raw)", "Saved (b64)"); + System.out.printf("%-14s %-11s %-11s %-10s %-11s %-13s%n", repeat('-', 14), repeat('-', 11), repeat('-', 11), repeat('-', 10), repeat('-', 11), + repeat('-', 13)); } - private static void printSummaryRow(String method, - String original, - String compressed, - String base64, - String savedRawPct, - String savedB64Pct, - double compressMs, - double decompressMs) { - System.out.printf("%-14s %-11s %-11s %-10s %-11s %-13s | Compress: %6.3f ms Decompress: %6.3f ms%n", - method, original, compressed, base64, savedRawPct, savedB64Pct, compressMs, decompressMs); + private static void printSummaryRow(String method, String original, String compressed, String base64, String savedRawPct, String savedB64Pct, + double compressMs, double decompressMs) { + System.out.printf("%-14s %-11s %-11s %-10s %-11s %-13s | Compress: %6.3f ms Decompress: %6.3f ms%n", method, original, compressed, base64, + savedRawPct, savedB64Pct, compressMs, decompressMs); } private static String humanBytes(long bytes) { - final String[] units = {"B","KiB","MiB","GiB","TiB"}; + final String[] units = {"B", "KiB", "MiB", "GiB", "TiB"}; double v = bytes; int u = 0; - while (v >= 1024.0 && u < units.length - 1) { v /= 1024.0; u++; } - if (u == 0) return String.format(Locale.ROOT, "%d%s", bytes, units[u]); + 
while (v >= 1024.0 && u < units.length - 1) { + v /= 1024.0; + u++; + } + if (u == 0) + return String.format(Locale.ROOT, "%d%s", bytes, units[u]); return String.format(Locale.ROOT, "%.2f%s", v, units[u]); } @@ -225,7 +217,8 @@ private static double nanosToMillis(double nanos) { private static int intPow(int base, int exp) { int r = 1; - for (int i = 0; i < exp; i++) r = Math.multiplyExact(r, base); + for (int i = 0; i < exp; i++) + r = Math.multiplyExact(r, base); return r; } @@ -234,9 +227,8 @@ private static int intPow(int base, int exp) { private static void printCorpusDetails(BenchmarkDataset ds) { System.out.printf("%n=== Corpus Details ===%n"); System.out.printf("Seed: %d%n", ds.seed); - System.out.printf("Fields: %d Ingests: %d MaxTypesPerField: %d TypesUsed: %d / %d Triples: %,d%n", - ds.generatedFieldCount, ds.generatedIngestCount, ds.maxTypesPerField, - ds.distinctTypes.size(), ds.totalTypesAvailable, ds.triples.size()); + System.out.printf("Fields: %d Ingests: %d MaxTypesPerField: %d TypesUsed: %d / %d Triples: %,d%n", ds.generatedFieldCount, + ds.generatedIngestCount, ds.maxTypesPerField, ds.distinctTypes.size(), ds.totalTypesAvailable, ds.triples.size()); // Full, untruncated listing (sorted for determinism) List fields = new ArrayList<>(ds.byField.keySet()); @@ -259,13 +251,18 @@ private static void printCorpusDetails(BenchmarkDataset ds) { private static class Mapping { final String field, ingest, type; - Mapping(String f, String i, String t) { this.field = f; this.ingest = i; this.type = t; } + + Mapping(String f, String i, String t) { + this.field = f; + this.ingest = i; + this.type = t; + } } private static class BenchmarkDataset { final String name; final String serialized; // TypeMetadata serialization - final long seed; + final long seed; final int generatedFieldCount; final int generatedIngestCount; @@ -276,18 +273,12 @@ private static class BenchmarkDataset { final Set distinctFields = new HashSet<>(); final Set distinctIngests = new HashSet<>(); - final Set distinctTypes = new HashSet<>(); + final Set distinctTypes = new HashSet<>(); // field -> type -> ingests - final Map>> byField = new HashMap<>(); - - BenchmarkDataset(String name, - String serialized, - long seed, - int generatedFieldCount, - int generatedIngestCount, - int maxTypesPerField, - int totalTypesAvailable, - List triples) { + final Map>> byField = new HashMap<>(); + + BenchmarkDataset(String name, String serialized, long seed, int generatedFieldCount, int generatedIngestCount, int maxTypesPerField, + int totalTypesAvailable, List triples) { this.name = name; this.serialized = serialized; this.seed = seed; @@ -301,10 +292,7 @@ private static class BenchmarkDataset { distinctFields.add(m.field); distinctIngests.add(m.ingest); distinctTypes.add(m.type); - byField - .computeIfAbsent(m.field, k -> new HashMap<>()) - .computeIfAbsent(m.type, k -> new HashSet<>()) - .add(m.ingest); + byField.computeIfAbsent(m.field, k -> new HashMap<>()).computeIfAbsent(m.type, k -> new HashSet<>()).add(m.ingest); } } } @@ -322,24 +310,21 @@ private static void warmupAllMethods(OptionCompressor compressor, List availableTypes, - long seed) { - if (fieldCount < 1) fieldCount = 1; - if (ingestCount < 1) ingestCount = 1; - if (maxTypesPerField < 1) maxTypesPerField = 1; + private static BenchmarkDataset buildDataset(String name, int fieldCount, int ingestCount, int maxTypesPerField, List availableTypes, long seed) { + if (fieldCount < 1) + fieldCount = 1; + if (ingestCount < 1) + ingestCount = 1; + if (maxTypesPerField < 1) + 
maxTypesPerField = 1; Random rnd = new Random(seed); // Use different derived seeds so fields/ingests differ - List fields = generateNaturalNames(fieldCount, new Random(seed ^ 0xC0FFEE1234ABCDEFL)); + List fields = generateNaturalNames(fieldCount, new Random(seed ^ 0xC0FFEE1234ABCDEFL)); List ingests = generateNaturalNames(ingestCount, new Random(seed ^ 0xBEEFBABE56789ABCL)); TypeMetadata tm = new TypeMetadata(); @@ -359,22 +344,12 @@ private static BenchmarkDataset buildDataset(String name, } } - return new BenchmarkDataset( - name, - tm.toString(), - seed, - fieldCount, - ingestCount, - maxTypesPerField, - availableTypes.size(), - triples - ); + return new BenchmarkDataset(name, tm.toString(), seed, fieldCount, ingestCount, maxTypesPerField, availableTypes.size(), triples); } /** - * Generate 'count' unique, natural-looking, number-free names using pairs (then triples) of roots. - * Deterministic given the Random. Produces lowerCamelCase tokens like "auroraFlux". - * Uses Apache Commons Lang for capitalization. + * Generate 'count' unique, natural-looking, number-free names using pairs (then triples) of roots. Deterministic given the Random. Produces lowerCamelCase + * tokens like "auroraFlux". Uses Apache Commons Lang for capitalization. */ private static List generateNaturalNames(int count, Random rnd) { List words = Arrays.asList(NAME_WORDS); @@ -383,8 +358,9 @@ private static List generateNaturalNames(int count, Random rnd) { List pairs = new ArrayList<>(words.size() * (words.size() - 1)); for (int i = 0; i < words.size(); i++) { for (int j = 0; j < words.size(); j++) { - if (i == j) continue; - pairs.add(new int[]{i, j}); + if (i == j) + continue; + pairs.add(new int[] {i, j}); } } Collections.shuffle(pairs, rnd); @@ -401,10 +377,12 @@ private static List generateNaturalNames(int count, Random rnd) { List triples = new ArrayList<>(words.size() * words.size() * words.size()); for (int i = 0; i < words.size(); i++) { for (int j = 0; j < words.size(); j++) { - if (i == j) continue; + if (i == j) + continue; for (int k = 0; k < words.size(); k++) { - if (k == i || k == j) continue; - triples.add(new int[]{i, j, k}); + if (k == i || k == j) + continue; + triples.add(new int[] {i, j, k}); } } } @@ -420,21 +398,26 @@ private static List generateNaturalNames(int count, Random rnd) { if (out.size() < count) { for (String w : words) { out.add(toLowerCamel(w)); - if (out.size() >= count) break; + if (out.size() >= count) + break; } } return new ArrayList<>(out).subList(0, count); } private static String toLowerCamel(String... 
parts) { - if (parts == null || parts.length == 0) return ""; + if (parts == null || parts.length == 0) + return ""; List clean = new ArrayList<>(parts.length); for (String p : parts) { - if (p == null) continue; + if (p == null) + continue; String s = p.replaceAll("[^A-Za-z]", "").toLowerCase(Locale.ROOT); - if (!s.isEmpty()) clean.add(s); + if (!s.isEmpty()) + clean.add(s); } - if (clean.isEmpty()) return ""; + if (clean.isEmpty()) + return ""; StringBuilder sb = new StringBuilder(clean.get(0)); // first stays lower for (int i = 1; i < clean.size(); i++) { sb.append(StringUtils.capitalize(clean.get(i))); // Apache Commons Lang From 6d9a11f3d2077c48c60fd222667a1cc5d2b947a2 Mon Sep 17 00:00:00 2001 From: Seth Date: Mon, 25 Aug 2025 16:23:58 -0400 Subject: [PATCH 7/8] formatting --- .../util/compression/OptionCompressorEfficiencyTest.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java b/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java index 44eaa1b214f..5a4a51a435c 100644 --- a/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java +++ b/warehouse/core/src/test/java/datawave/util/compression/OptionCompressorEfficiencyTest.java @@ -1,5 +1,6 @@ package datawave.util.compression; +import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.IOException; import java.nio.charset.Charset; @@ -16,15 +17,12 @@ import java.util.Random; import java.util.Set; - import org.apache.commons.codec.binary.Base64; import org.apache.commons.lang3.StringUtils; // Apache Commons Lang for capitalization import org.junit.jupiter.api.Test; import datawave.query.util.TypeMetadata; -import static org.junit.jupiter.api.Assertions.assertEquals; - /** * Compression benchmark for OptionCompressor across generated datasets. * @@ -235,7 +233,7 @@ private static void printCorpusDetails(BenchmarkDataset ds) { Collections.sort(fields); for (String f : fields) { System.out.println("- " + f + ":"); - Map> types = ds.byField.getOrDefault(f, Collections.emptyMap()); + Map> types = ds.byField.getOrDefault(f, Collections.emptyMap()); List typeNames = new ArrayList<>(types.keySet()); Collections.sort(typeNames); for (String t : typeNames) { From bda8282423b355839b0caae426ed1c9a9ea1beac Mon Sep 17 00:00:00 2001 From: Seth Date: Wed, 27 Aug 2025 11:49:50 -0400 Subject: [PATCH 8/8] Fixes --- .../util/compression/OptionCompressor.java | 96 +++++++++---------- .../query/index/lookup/RangeStream.java | 14 --- .../query/index/lookup/ShardRangeStream.java | 4 - 3 files changed, 43 insertions(+), 71 deletions(-) diff --git a/warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java b/warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java index 4922b8a1f78..0be4b52e9ad 100644 --- a/warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java +++ b/warehouse/core/src/main/java/datawave/util/compression/OptionCompressor.java @@ -35,13 +35,6 @@ */ public class OptionCompressor { - // possible data class for holding metadata for the compression method. Not sure if I'll need to actually include this. - // see Gz.java or Bzip2.java from Accumulo. - // public CompressionAlgorithmConfiguration configuration; - - // additionally, see https://commons.apache.org/proper/commons-compress/ - // it's probably best to source all the compression from the same place if possible. 
- public enum CompressionMethod { NONE, GZIP, BZIP2, SEVEN_ZIP } @@ -80,11 +73,10 @@ public String compress(final String data, final CompressionMethod method, final private String compressGZIP(final String data, final Charset charset) { final byte[] input = data.getBytes(charset); - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); GZIPOutputStream gzip = new GZIPOutputStream(baos)) { - - gzip.write(input); - gzip.close(); // must close to flush all compressed data into baos - + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + try (GZIPOutputStream gzip = new GZIPOutputStream(baos)) { + gzip.write(input); + } return Base64.encodeBase64String(baos.toByteArray()); } catch (IOException e) { throw new RuntimeException(e); @@ -98,11 +90,10 @@ private String compressGZIP(final String data, final Charset charset) { private String compressBZIP2(final String data, final Charset charset) { final byte[] input = data.getBytes(charset); - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); BZip2CompressorOutputStream bzos = new BZip2CompressorOutputStream(baos)) { - - bzos.write(input); - bzos.close(); // finalize BZIP2 stream - + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + try (BZip2CompressorOutputStream bzos = new BZip2CompressorOutputStream(baos)) { + bzos.write(input); + } return Base64.encodeBase64String(baos.toByteArray()); } catch (IOException e) { throw new RuntimeException(e); @@ -118,11 +109,10 @@ private String compressBZIP2(final String data, final Charset charset) { private String compress7ZIP(final String data, final Charset charset) { final byte[] input = data.getBytes(charset); - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); LZMACompressorOutputStream lzma = new LZMACompressorOutputStream(baos)) { - - lzma.write(input); - lzma.close(); // finalize LZMA stream - + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + try (LZMACompressorOutputStream lzma = new LZMACompressorOutputStream(baos)) { + lzma.write(input); + } return Base64.encodeBase64String(baos.toByteArray()); } catch (IOException e) { throw new RuntimeException(e); @@ -136,18 +126,18 @@ private String compress7ZIP(final String data, final Charset charset) { private String decompressGZIP(final String dataBase64, final Charset charset) { final byte[] compressed = Base64.decodeBase64(dataBase64); - try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); - GZIPInputStream gzip = new GZIPInputStream(bais); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - - byte[] buf = new byte[4096]; - int n; - while ((n = gzip.read(buf)) != -1) { - baos.write(buf, 0, n); + try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed)) { + try (GZIPInputStream gzip = new GZIPInputStream(bais)) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + byte[] buf = new byte[4096]; + int n; + while ((n = gzip.read(buf)) != -1) { + baos.write(buf, 0, n); + } + // NOTE: Keeping the same pattern as provided code. + return baos.toString(charset); + } } - - // NOTE: Keeping the same pattern as provided code. 
- return baos.toString(charset); } catch (IOException e) { throw new RuntimeException(e); } @@ -160,17 +150,17 @@ private String decompressGZIP(final String dataBase64, final Charset charset) { private String decompressBZIP2(final String dataBase64, final Charset charset) { final byte[] compressed = Base64.decodeBase64(dataBase64); - try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); - BZip2CompressorInputStream bzis = new BZip2CompressorInputStream(bais); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - - byte[] buf = new byte[4096]; - int n; - while ((n = bzis.read(buf)) != -1) { - baos.write(buf, 0, n); + try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed)) { + try (BZip2CompressorInputStream bzis = new BZip2CompressorInputStream(bais)) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + byte[] buf = new byte[4096]; + int n; + while ((n = bzis.read(buf)) != -1) { + baos.write(buf, 0, n); + } + return baos.toString(charset); + } } - - return baos.toString(charset); } catch (IOException e) { throw new RuntimeException(e); } @@ -185,17 +175,17 @@ private String decompressBZIP2(final String dataBase64, final Charset charset) { private String decompress7ZIP(final String dataBase64, final Charset charset) { final byte[] compressed = Base64.decodeBase64(dataBase64); - try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed); - LZMACompressorInputStream lzma = new LZMACompressorInputStream(bais); - ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - - byte[] buf = new byte[4096]; - int n; - while ((n = lzma.read(buf)) != -1) { - baos.write(buf, 0, n); + try (ByteArrayInputStream bais = new ByteArrayInputStream(compressed)) { + try (LZMACompressorInputStream lzma = new LZMACompressorInputStream(bais)) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + byte[] buf = new byte[4096]; + int n; + while ((n = lzma.read(buf)) != -1) { + baos.write(buf, 0, n); + } + return baos.toString(charset); + } } - - return baos.toString(charset); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java index c3612528621..c767be392e1 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java @@ -205,23 +205,9 @@ public CloseableIterable streamPlans(JexlNode script) { log.trace(JexlStringBuildingVisitor.buildQuery(node)); } - System.out.println("SETH SMUCKER"); - Multimap> nonIndexedQueryFieldsDatatypes = HashMultimap.create(config.getQueryFieldsDatatypes()); nonIndexedQueryFieldsDatatypes.keySet().removeAll(config.getIndexedFields()); String nonIndexedTypes = QueryOptions.buildFieldNormalizerString(nonIndexedQueryFieldsDatatypes); - System.out.println("NON_INDEXED_DATATYPES: " + nonIndexedTypes); - - try { - String serializedTypeMetadata = metadataHelper.getTypeMetadata(config.getDatatypeFilter()).toString(); - System.out.println("TYPE_METADATA: " + serializedTypeMetadata); - - } catch (TableNotFoundException ignored) { - - } - - // String requiredAuthsString = metadataHelper.getUsersMetadataAuthorizationSubset(); - // System.out.println("TYPE_METADATA_AUTHS: " + requiredAuthsString); BaseIndexStream ranges = (BaseIndexStream) node.jjtAccept(this, null); diff --git 
a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java index 4cb7caf55f4..4bcda9c1d22 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java @@ -136,23 +136,19 @@ public QueryPlan apply(Entry entry) { * Lift and shift from DefaultQueryPlanner to avoid reliance on static methods */ private void configureTypeMappings(ShardQueryConfiguration config, IteratorSetting cfg, MetadataHelper metadataHelper) { - System.out.println("SETH SMUCKER"); DefaultQueryPlanner.addOption(cfg, QueryOptions.QUERY_MAPPING_COMPRESS, Boolean.toString(true), false); Multimap> nonIndexedQueryFieldsDatatypes = HashMultimap.create(config.getQueryFieldsDatatypes()); nonIndexedQueryFieldsDatatypes.keySet().removeAll(config.getIndexedFields()); String nonIndexedTypes = QueryOptions.buildFieldNormalizerString(nonIndexedQueryFieldsDatatypes); - System.out.println("NON_INDEXED_DATATYPES: " + nonIndexedTypes); DefaultQueryPlanner.addOption(cfg, QueryOptions.NON_INDEXED_DATATYPES, nonIndexedTypes, false); try { String serializedTypeMetadata = metadataHelper.getTypeMetadata(config.getDatatypeFilter()).toString(); - System.out.println("TYPE_METADATA: " + serializedTypeMetadata); DefaultQueryPlanner.addOption(cfg, QueryOptions.TYPE_METADATA, serializedTypeMetadata, false); String requiredAuthsString = metadataHelper.getUsersMetadataAuthorizationSubset(); - System.out.println("TYPE_METADATA_AUTHS: " + requiredAuthsString); requiredAuthsString = QueryOptions.compressOption(requiredAuthsString, QueryOptions.UTF8); DefaultQueryPlanner.addOption(cfg, QueryOptions.TYPE_METADATA_AUTHS, requiredAuthsString, false);
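With the debug output stripped back out, the end state of this series is the standalone OptionCompressor plus the unchanged option wiring above. A minimal usage sketch of the new class (the signatures match the patch; the payload string is an illustrative stand-in for serialized type metadata, and the Base64 text is what would ultimately travel as an iterator option):

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import datawave.util.compression.OptionCompressor;

public class OptionCompressorDemo {
    public static void main(String[] args) throws IOException {
        OptionCompressor compressor = new OptionCompressor();
        String typeMetadata = "FIELD:[ingest:LcNoDiacriticsType];OTHER:[ingest:NumberType]"; // illustrative payload

        // Compress to Base64 text, then reverse it
        String encoded = compressor.compress(typeMetadata, OptionCompressor.CompressionMethod.GZIP, StandardCharsets.UTF_8);
        String restored = compressor.decompress(encoded, OptionCompressor.CompressionMethod.GZIP, StandardCharsets.UTF_8);

        System.out.println(typeMetadata.equals(restored)); // true: round trip is lossless
        System.out.println("original=" + typeMetadata.length() + " chars, base64(gzip)=" + encoded.length() + " chars");
    }
}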