@@ -2,6 +2,68 @@ import type { WorkerAPI, WorkerMessageRequest, WorkerMessageResponse } from "$ut
22import { advancedQuery } from "./advancedQuery" ;
33import { deserialize , simpleQuery , type UnicodeMappings } from "@emnudge/unicode-query" ;
44
/**
 * Expands First/Last range markers in UnicodeData.txt-formatted text.
 *
 * The Unicode data file abbreviates large ranges as two adjacent lines:
 *   3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
 *   4DBF;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
 *
 * Every such pair is replaced by one line per codepoint in the inclusive
 * range. Each generated line copies all fields of the "First" line except
 * the first two: field 0 becomes the uppercase hex codepoint (zero-padded to
 * at least 4 digits) and field 1 becomes `<BaseName-HEX>`.
 *
 * Lines that are not part of a well-formed pair are passed through
 * unchanged. A pair is considered well-formed only when the "Last" marker is
 * on the line immediately after the "First" marker, both codepoints parse as
 * hexadecimal, the range is not inverted, and it does not exceed U+10FFFF.
 *
 * @param unicodeData - Contents of a UnicodeData.txt-style file, with
 *   `\n`-separated lines and `;`-separated fields.
 * @returns The same text with every well-formed range pair expanded.
 */
function expandUnicodeRanges(unicodeData: string): string {
  // Highest valid Unicode codepoint; also bounds the work done on corrupt input.
  const maxCodepoint = 0x10ffff;

  const lines = unicodeData.split("\n");
  const result: string[] = [];

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i] ?? "";
    const fields = line.split(";");
    const [startHex = "", label = ""] = fields;
    const following = lines[i + 1];

    if (label.includes(", First>") && following) {
      const [endHex = "", followingLabel = ""] = following.split(";");

      if (followingLabel.includes(", Last>")) {
        const start = parseInt(startHex, 16);
        const end = parseInt(endHex, 16);

        // Only expand ranges we can trust. Previously a NaN or inverted range
        // produced zero entries while still consuming both marker lines,
        // silently dropping them from the output.
        const isValidRange =
          !Number.isNaN(start) &&
          !Number.isNaN(end) &&
          start <= end &&
          end <= maxCodepoint;

        if (isValidRange) {
          // "<CJK Ideograph Extension A, First>" -> "CJK Ideograph Extension A"
          const baseName = label.replace(/^</, "").replace(/, First>$/, "");
          // Everything after the codepoint and name is shared by the whole range.
          const sharedFields = fields.slice(2);

          for (let cp = start; cp <= end; cp++) {
            const cpHex = cp.toString(16).toUpperCase().padStart(4, "0");
            result.push([cpHex, `<${baseName}-${cpHex}>`, ...sharedFields].join(";"));
          }

          // The "Last" line is represented by the expansion; skip over it.
          i++;
          continue;
        }
      }
    }

    // Blank line, ordinary entry, or malformed/unpaired marker: keep as-is.
    result.push(line);
  }

  return result.join("\n");
}
66+
567let unicodeMappingsCache : UnicodeMappings | null = null ;
668async function init ( ) {
769 if ( ! unicodeMappingsCache ) {
@@ -13,7 +75,7 @@ async function init() {
1375
1476 unicodeMappingsCache = deserialize ( {
1577 blocks,
16- unicodeData,
78+ unicodeData : expandUnicodeRanges ( unicodeData ) ,
1779 symbolHtmlNames,
1880 } ) ;
1981 }
0 commit comments