Skip to content

Commit 3b2aeae

Browse files
authored
Expand Unicode range markers in UnicodeData.txt processing (#57)
fix
1 parent da069fc commit 3b2aeae

File tree

1 file changed

+63
-1
lines changed

1 file changed

+63
-1
lines changed

src/worker/index.ts

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,68 @@ import type { WorkerAPI, WorkerMessageRequest, WorkerMessageResponse } from "$ut
22
import { advancedQuery } from "./advancedQuery";
33
import { deserialize, simpleQuery, type UnicodeMappings } from "@emnudge/unicode-query";
44

5+
/** Matches the name field of a range-start marker and captures the range's base name. */
const FIRST_MARKER_PATTERN = /^<(.+), First>$/;

/** Matches the name field of a range-end marker and captures the range's base name. */
const LAST_MARKER_PATTERN = /^<(.+), Last>$/;

/** A codepoint field must consist solely of hexadecimal digits. */
const HEX_CODEPOINT_PATTERN = /^[0-9A-Fa-f]+$/;

/** Largest valid Unicode codepoint; guards against runaway expansion on corrupt data. */
const MAX_CODEPOINT = 0x10ffff;

/**
 * Parses a UnicodeData.txt codepoint field.
 *
 * @returns the numeric codepoint, or `undefined` when the field is missing or
 *          is not made up entirely of hex digits (`parseInt` alone would accept
 *          a partially valid string such as "34ZZ").
 */
function parseHexCodepoint(field: string | undefined): number | undefined {
  const text = (field ?? "").trim();
  return HEX_CODEPOINT_PATTERN.test(text) ? parseInt(text, 16) : undefined;
}

/**
 * Appends one entry per codepoint to `out` if `firstLine`/`lastLine` form a
 * well-formed First/Last marker pair.
 *
 * @returns `true` when the pair was expanded into `out`; `false` (with `out`
 *          untouched) when the lines are not a valid pair.
 */
function appendExpandedRange(
  firstLine: string,
  lastLine: string | undefined,
  out: string[],
): boolean {
  if (!lastLine) {
    return false;
  }

  const firstFields = firstLine.split(";");
  const lastFields = lastLine.split(";");

  // Both markers must be present and must name the same range.
  const baseName = FIRST_MARKER_PATTERN.exec(firstFields[1] ?? "")?.[1];
  const lastBaseName = LAST_MARKER_PATTERN.exec(lastFields[1] ?? "")?.[1];
  if (baseName === undefined || baseName !== lastBaseName) {
    return false;
  }

  const start = parseHexCodepoint(firstFields[0]);
  const end = parseHexCodepoint(lastFields[0]);
  if (start === undefined || end === undefined || start > end || end > MAX_CODEPOINT) {
    return false;
  }

  for (let cp = start; cp <= end; cp++) {
    const cpHex = cp.toString(16).toUpperCase().padStart(4, "0");
    // Every generated entry inherits the remaining fields of the "First" line.
    const fields = [...firstFields];
    fields[0] = cpHex;
    fields[1] = `<${baseName}-${cpHex}>`;
    out.push(fields.join(";"));
  }
  return true;
}

/**
 * Expands First/Last range markers in UnicodeData.txt format.
 *
 * The Unicode data file abbreviates large ranges with a pair of marker lines:
 *   3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
 *   4DBF;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
 *
 * Each such pair is replaced by one line per codepoint in the inclusive range.
 * The generated lines copy every field of the "First" line except the
 * codepoint and the name, which becomes `<Base Name-XXXX>`.
 *
 * A "First" line that is not immediately followed by a matching, well-formed
 * "Last" line (same base name, valid hex codepoints, start <= end) is left in
 * the output unchanged, as is the line after it, so malformed input is never
 * silently discarded.
 *
 * @param unicodeData - Raw contents of UnicodeData.txt, lines separated by "\n".
 * @returns The same data with every valid range pair expanded; all other
 *          lines (including blank ones) are preserved as-is and in order.
 */
function expandUnicodeRanges(unicodeData: string): string {
  const lines = unicodeData.split("\n");
  const result: string[] = [];

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i] ?? "";

    // Blank lines can never start a range; keep them untouched.
    if (line.trim() && appendExpandedRange(line, lines[i + 1], result)) {
      // The "Last" line was consumed as part of the range.
      i++;
      continue;
    }

    // Not a (valid) range marker, keep the line as-is.
    result.push(line);
  }

  return result.join("\n");
}
66+
567
let unicodeMappingsCache: UnicodeMappings | null = null;
668
async function init() {
769
if (!unicodeMappingsCache) {
@@ -13,7 +75,7 @@ async function init() {
1375

1476
unicodeMappingsCache = deserialize({
1577
blocks,
16-
unicodeData,
78+
unicodeData: expandUnicodeRanges(unicodeData),
1779
symbolHtmlNames,
1880
});
1981
}

0 commit comments

Comments
 (0)