11package com .networknt .schema .regex ;
22
3+ import java .nio .charset .Charset ;
34import java .nio .charset .StandardCharsets ;
45import java .util .regex .Pattern ;
56
7+ import org .jcodings .ApplyAllCaseFoldFunction ;
8+ import org .jcodings .CaseFoldCodeItem ;
9+ import org .jcodings .CodeRange ;
10+ import org .jcodings .Encoding ;
11+ import org .jcodings .IntHolder ;
12+ import org .jcodings .constants .CharacterType ;
613import org .jcodings .specific .UTF8Encoding ;
14+ import org .jcodings .unicode .UnicodeCodeRange ;
715import org .joni .Option ;
816import org .joni .Regex ;
917import org .joni .Syntax ;
@@ -42,17 +50,8 @@ class JoniRegularExpression implements RegularExpression {
4250
4351 JoniRegularExpression (String regex , Syntax syntax ) {
4452 validate (regex );
45- // Joni is too liberal on some constructs
46- String s = regex
47- .replace ("\\ d" , "[0-9]" )
48- .replace ("\\ D" , "[^0-9]" )
49- .replace ("\\ w" , "[a-zA-Z0-9_]" )
50- .replace ("\\ W" , "[^a-zA-Z0-9_]" )
51- .replace ("\\ s" , "[ \\ f\\ n\\ r\\ t\\ v\\ u00a0\\ u1680\\ u2000-\\ u200a\\ u2028\\ u2029\\ u202f\\ u205f\\ u3000\\ ufeff]" )
52- .replace ("\\ S" , "[^ \\ f\\ n\\ r\\ t\\ v\\ u00a0\\ u1680\\ u2000-\\ u200a\\ u2028\\ u2029\\ u202f\\ u205f\\ u3000\\ ufeff]" );
53-
54- byte [] bytes = s .getBytes (StandardCharsets .UTF_8 );
55- this .pattern = new Regex (bytes , 0 , bytes .length , Option .SINGLELINE , UTF8Encoding .INSTANCE , syntax );
53+ byte [] bytes = regex .getBytes (StandardCharsets .UTF_8 );
54+ this .pattern = new Regex (bytes , 0 , bytes .length , Option .SINGLELINE , ECMAScriptUTF8Encoding .INSTANCE , syntax );
5655 }
5756
5857 protected void validate (String regex ) {
@@ -73,4 +72,192 @@ public boolean matches(String value) {
7372 return this .pattern .matcher (bytes ).search (0 , bytes .length , Option .NONE ) >= 0 ;
7473 }
7574
75+ static class Arrays {
76+ public static boolean equals (byte [] a , byte [] a2 , int p , int end ) {
77+ if (a ==a2 ) {
78+ return true ;
79+ }
80+ if (a ==null || a2 ==null ) {
81+ return false ;
82+ }
83+
84+ int length = a .length ;
85+ if ((end - p ) != length ) {
86+ return false ;
87+ }
88+
89+ for (int i =0 ; i <length ; i ++) {
90+ if (a [i ] != a2 [i +p ]) {
91+ return false ;
92+ }
93+ }
94+ return true ;
95+ }
96+ }
97+
98+ /**
99+ * An {@link Encoding} that returns the appropriate code ranges that correspond
100+ * to the ECMA-262 regular expression implementation instead of matching
101+ * directly to a Unicode General Category.
102+ */
103+ public static class ECMAScriptUTF8Encoding extends DelegatingEncoding {
104+ /*
105+ * [0-9]
106+ */
107+ private static final int [] CR_DIGIT = { 1 , '0' , '9' };
108+ /*
109+ * [a-zA-Z0-9_]
110+ */
111+ private static final int [] CR_WORD = { 4 , '0' , '9' , 'A' , 'Z' , '_' , '_' , 'a' , 'z' };
112+ /*
113+ * [\f\n\r\t\v\u0020\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]
114+ */
115+ private static final int [] CR_SPACE = { 10 , '\t' , '\r' , ' ' , ' ' , '\u00a0' , '\u00a0' , '\u1680' , '\u1680' , '\u2000' ,
116+ '\u200a' , '\u2028' , '\u2029' , '\u202f' , '\u202f' , '\u205f' , '\u205f' , '\u3000' , '\u3000' , '\ufeff' ,
117+ '\ufeff' };
118+ /*
119+ * For \p{digit}
120+ */
121+ private static final byte [] PROPERTY_NAME_DIGIT = { 100 , 105 , 103 , 105 , 116 };
122+
123+ public static final ECMAScriptUTF8Encoding INSTANCE = new ECMAScriptUTF8Encoding ();
124+
125+ protected ECMAScriptUTF8Encoding () {
126+ super (UTF8Encoding .INSTANCE );
127+ }
128+
129+ @ Override
130+ public int [] ctypeCodeRange (int ctype , IntHolder sbOut ) {
131+ switch (ctype ) {
132+ case CharacterType .DIGIT : // \d
133+ sbOut .value = 0x80 ;
134+ return CR_DIGIT ;
135+ case CharacterType .WORD : // \w
136+ sbOut .value = 0x80 ;
137+ return CR_WORD ;
138+ case CharacterType .SPACE : // \s
139+ sbOut .value = 0x80 ;
140+ return CR_SPACE ;
141+ }
142+ return delegate .ctypeCodeRange (ctype , sbOut );
143+ }
144+
145+ @ Override
146+ public boolean isCodeCType (int code , int ctype ) {
147+ switch (ctype ) {
148+ case CharacterType .DIGIT : // \d
149+ return CodeRange .isInCodeRange (CR_DIGIT , code );
150+ case CharacterType .WORD : // \w
151+ return CodeRange .isInCodeRange (CR_WORD , code );
152+ case CharacterType .SPACE : // \s
153+ return CodeRange .isInCodeRange (CR_SPACE , code );
154+ }
155+ return delegate .isCodeCType (code , ctype );
156+ }
157+
158+ @ Override
159+ public int propertyNameToCType (byte []name , int p , int end ) {
160+ if (Arrays .equals (PROPERTY_NAME_DIGIT , name , p , end )) {
161+ return UnicodeCodeRange .ND .ordinal ();// 55 Same as \p{Nd} and not returning CharacterType.DIGIT
162+ }
163+ return delegate .propertyNameToCType (name , p , end );
164+ }
165+ }
166+
167+ /**
168+ * An {@link Encoding} that delegates to another {@link Encoding}.
169+ * <p>
170+ * This can be used to customize the behavior of implementations that are final.
171+ */
172+ public static class DelegatingEncoding extends Encoding {
173+ protected final Encoding delegate ;
174+ protected DelegatingEncoding (Encoding delegate ) {
175+ super (new String (delegate .getName ()), delegate .minLength (), delegate .maxLength ());
176+ this .delegate = delegate ;
177+ }
178+ @ Override
179+ public Charset getCharset () {
180+ return delegate .getCharset ();
181+ }
182+ @ Override
183+ public String getCharsetName () {
184+ return delegate .getCharsetName ();
185+ }
186+ @ Override
187+ public int length (byte c ) {
188+ return delegate .length (c );
189+ }
190+ @ Override
191+ public int length (byte [] bytes , int p , int end ) {
192+ return delegate .length (bytes , p , end );
193+ }
194+ @ Override
195+ public boolean isNewLine (byte [] bytes , int p , int end ) {
196+ return delegate .isNewLine (bytes , p , end );
197+ }
198+ @ Override
199+ public int mbcToCode (byte [] bytes , int p , int end ) {
200+ return delegate .mbcToCode (bytes , p , end );
201+ }
202+ @ Override
203+ public int codeToMbcLength (int code ) {
204+ return delegate .codeToMbcLength (code );
205+ }
206+ @ Override
207+ public int codeToMbc (int code , byte [] bytes , int p ) {
208+ return delegate .codeToMbc (code , bytes , p );
209+ }
210+ @ Override
211+ public int mbcCaseFold (int flag , byte [] bytes , IntHolder pp , int end , byte [] to ) {
212+ return delegate .mbcCaseFold (flag , bytes , pp , end , to );
213+ }
214+ @ Override
215+ public byte [] toLowerCaseTable () {
216+ return delegate .toLowerCaseTable ();
217+ }
218+ @ Override
219+ public void applyAllCaseFold (int flag , ApplyAllCaseFoldFunction fun , Object arg ) {
220+ delegate .applyAllCaseFold (flag , fun , arg );
221+ }
222+ @ Override
223+ public CaseFoldCodeItem [] caseFoldCodesByString (int flag , byte [] bytes , int p , int end ) {
224+ return delegate .caseFoldCodesByString (flag , bytes , p , end );
225+ }
226+ @ Override
227+ public int propertyNameToCType (byte [] bytes , int p , int end ) {
228+ return delegate .propertyNameToCType (bytes , p , end );
229+ }
230+ @ Override
231+ public boolean isCodeCType (int code , int ctype ) {
232+ return delegate .isCodeCType (code , ctype );
233+ }
234+ @ Override
235+ public int [] ctypeCodeRange (int ctype , IntHolder sbOut ) {
236+ return delegate .ctypeCodeRange (ctype , sbOut );
237+ }
238+ @ Override
239+ public int leftAdjustCharHead (byte [] bytes , int p , int s , int end ) {
240+ return delegate .leftAdjustCharHead (bytes , p , s , end );
241+ }
242+ @ Override
243+ public boolean isReverseMatchAllowed (byte [] bytes , int p , int end ) {
244+ return delegate .isReverseMatchAllowed (bytes , p , end );
245+ }
246+ @ Override
247+ public int caseMap (IntHolder flagP , byte [] bytes , IntHolder pp , int end , byte [] to , int toP , int toEnd ) {
248+ return delegate .caseMap (flagP , bytes , pp , end , to , toP , toEnd );
249+ }
250+ @ Override
251+ public int strLength (byte [] bytes , int p , int end ) {
252+ return delegate .strLength (bytes , p , end );
253+ }
254+ @ Override
255+ public int strCodeAt (byte [] bytes , int p , int end , int index ) {
256+ return delegate .strCodeAt (bytes , p , end , index );
257+ }
258+ @ Override
259+ public boolean isMbcCrnl (byte [] bytes , int p , int end ) {
260+ return delegate .isMbcCrnl (bytes , p , end );
261+ }
262+ }
76263}
0 commit comments