Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 37 additions & 44 deletions dev/src/order.ts
Original file line number Diff line number Diff line change
Expand Up @@ -254,56 +254,49 @@ function compareVectors(left: ApiMapValue, right: ApiMapValue): number {
* @internal
*/
export function compareUtf8Strings(left: string, right: string): number {
  // Orders two JavaScript (UTF-16) strings the way their UTF-8 encodings would
  // sort byte-by-byte, without actually encoding either string.
  //
  // The scan stops at the first differing character (a.k.a. "UTF-16 code
  // unit") and uses that pair alone to decide the ordering of the strings as a
  // whole. This works because of the way UTF-8 and UTF-16 happen to represent
  // Unicode code points. Once a differing pair is found there are two cases:
  //
  // Case 1: Both characters are non-surrogates (code points less than or equal
  // to 0xFFFF), or both are surrogates (halves of pairs that represent code
  // points greater than 0xFFFF). Their order as UTF-16 code units is then the
  // same as the lexicographical order of the corresponding UTF-8 byte
  // sequences, so a direct comparison is sufficient.
  //
  // Case 2: Exactly one character is a surrogate. The surrogate-containing
  // string always sorts after the other one: surrogates encode code points
  // greater than 0xFFFF, whose 4-byte UTF-8 representations are
  // lexicographically greater than the 1, 2, or 3-byte representations of code
  // points less than or equal to 0xFFFF.
  const length = Math.min(left.length, right.length);
  for (let i = 0; i < length; i++) {
    const leftChar = left.charAt(i);
    const rightChar = right.charAt(i);
    if (leftChar === rightChar) {
      continue;
    }

    const leftIsSurrogate = isSurrogate(leftChar);
    if (leftIsSurrogate === isSurrogate(rightChar)) {
      // Case 1: same class on both sides; code-unit order is UTF-8 order.
      return primitiveComparator(leftChar, rightChar);
    }

    // Case 2: mixed classes; whichever side holds the surrogate is greater.
    return leftIsSurrogate ? 1 : -1;
  }

  // No differing character in the shared prefix: the strings are either equal
  // or one is a prefix of the other, so the lengths decide the result.
  return primitiveComparator(left.length, right.length);
}
/** Smallest UTF-16 code unit value that falls in the surrogate range. */
const MIN_SURROGATE = 0xd800;
/** Largest UTF-16 code unit value that falls in the surrogate range. */
const MAX_SURROGATE = 0xdfff;

/**
 * Reports whether the first UTF-16 code unit of `s` lies in the surrogate
 * range [MIN_SURROGATE, MAX_SURROGATE]. Only index 0 is inspected; any
 * further characters are ignored.
 *
 * @param s - The string whose first code unit is tested.
 * @returns `true` if that code unit is a surrogate; `false` otherwise,
 * including for the empty string (where `charCodeAt(0)` yields `NaN`).
 */
export function isSurrogate(s: string): boolean {
  const codeUnit = s.charCodeAt(0);
  if (codeUnit < MIN_SURROGATE) {
    return false;
  }
  // A NaN code unit fails this comparison too, so '' still reports false.
  return codeUnit <= MAX_SURROGATE;
}

/*!
Expand Down