|
10 | 10 | public class ContentUtils {
|
11 | 11 | private static final byte[] HEX_ARRAY = "0123456789ABCDEF".getBytes(StandardCharsets.US_ASCII);
|
12 | 12 |
|
13 |
| - private static final String UTF8_REGEX = """ |
14 |
| - \\A([\\x09\\x0A\\x0D\\x20-\\x7E] # ASCII |
15 |
| - | [\\xC2-\\xDF][\\x80-\\xBF] # non-overlong 2-byte |
16 |
| - | \\xE0[\\xA0-\\xBF][\\x80-\\xBF] # excluding overlongs |
17 |
| - | [\\xE1-\\xEC\\xEE\\xEF][\\x80-\\xBF]{2} # straight 3-byte |
18 |
| - | \\xED[\\x80-\\x9F][\\x80-\\xBF] # excluding surrogates |
19 |
| - | \\xF0[\\x90-\\xBF][\\x80-\\xBF]{2} # planes 1-3 |
20 |
| - | [\\xF1-\\xF3][\\x80-\\xBF]{3} # planes 4-15 |
21 |
| - | \\xF4[\\x80-\\x8F][\\x80-\\xBF]{2} # plane 16 |
22 |
| - )*\\z |
23 |
| - """.trim(); |
24 |
| - |
25 |
| - private static final Pattern UTF8_PATTERN = Pattern.compile(UTF8_REGEX, Pattern.COMMENTS); |
26 |
| - |
// Private no-arg constructor: keeps code outside this class from instantiating it (the members visible here are all static).
27 | 13 | private ContentUtils() {
|
28 | 14 | }
|
29 | 15 |
|
30 | 16 | /**
|
31 |
| - * Detects if bytes contain a UTF-8 string or something else |
32 |
| - * Source: https://stackoverflow.com/questions/1193200/how-can-i-check-whether-a-byte-array-contains-a-unicode-string-in-java |
| 17 | + * Detects if bytes contain a UTF-8 string or something else. |
33 | 18 | * @param value the bytes to test for a UTF-8 encoded {@code java.lang.String} value
|
34 | 19 | * @return true, if the byte[] contains a UTF-8 encode {@code java.lang.String}
|
35 | 20 | */
|
36 | 21 | public static boolean isValidUtf8(byte[] value) {
|
37 |
| - //If the array is too long, it throws a StackOverflowError due to the regex, so we assume it is a String. |
38 |
| - if (value.length <= 1000) { |
39 |
| - String phonyString = new String(value, StandardCharsets.ISO_8859_1); |
40 |
| - return UTF8_PATTERN.matcher(phonyString).matches(); |
| 22 | + // Any data exceeding 10KB will be treated as a string. |
| 23 | + if (value.length > 10_000) { |
| 24 | + return true; |
| 25 | + } |
| 26 | + int i = 0; |
| 27 | + while (i < value.length) { |
| 28 | + int b = value[i] & 0xFF; |
| 29 | + int numBytes; |
| 30 | + if ((b & 0x80) == 0) { |
| 31 | + // 1-byte (ASCII) |
| 32 | + numBytes = 1; |
| 33 | + } else if ((b & 0xE0) == 0xC0) { |
| 34 | + // 2-byte sequence |
| 35 | + numBytes = 2; |
| 36 | + } else if ((b & 0xF0) == 0xE0) { |
| 37 | + // 3-byte sequence |
| 38 | + numBytes = 3; |
| 39 | + } else if ((b & 0xF8) == 0xF0) { |
| 40 | + // 4-byte sequence |
| 41 | + numBytes = 4; |
| 42 | + } else { |
| 43 | + // Invalid first byte |
| 44 | + return false; |
| 45 | + } |
| 46 | + if (i + numBytes > value.length) { |
| 47 | + return false; |
| 48 | + } |
| 49 | + // Check continuation bytes |
| 50 | + for (int j = 1; j < numBytes; j++) { |
| 51 | + if ((value[i + j] & 0xC0) != 0x80) { |
| 52 | + return false; |
| 53 | + } |
| 54 | + } |
| 55 | + i += numBytes; |
41 | 56 | }
|
42 | 57 | return true;
|
43 | 58 | }
|
@@ -86,11 +101,11 @@ public static String convertToString(byte[] value) {
|
86 | 101 | if (ContentUtils.isValidUtf8(value)) {
|
87 | 102 | valueAsString = new String(value);
|
88 | 103 | } else {
|
89 |
| - if (value.length == 8) { |
| 104 | + if (value.length == Long.BYTES) { |
90 | 105 | valueAsString = String.valueOf(ContentUtils.asLong(value));
|
91 |
| - } else if (value.length == 4) { |
| 106 | + } else if (value.length == Integer.BYTES) { |
92 | 107 | valueAsString = String.valueOf(ContentUtils.asInt(value));
|
93 |
| - } else if (value.length == 2) { |
| 108 | + } else if (value.length == Short.BYTES) { |
94 | 109 | valueAsString = String.valueOf(ContentUtils.asShort(value));
|
95 | 110 | } else {
|
96 | 111 | valueAsString = bytesToHex(value);
|
|
0 commit comments