Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@

<version.itu>1.14.0</version.itu>
<version.jackson>2.18.3</version.jackson>
<version.joni>2.2.1</version.joni>
<version.joni>2.2.6</version.joni>
<version.logback>1.3.14</version.logback> <!-- 1.4.x and above is not Java 8 compatible -->
<version.slf4j>2.0.17</version.slf4j>
<version.graaljs>21.3.10</version.graaljs> <!-- 22.x and above is not Java 8 compatible -->
Expand Down
209 changes: 198 additions & 11 deletions src/main/java/com/networknt/schema/regex/JoniRegularExpression.java
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
package com.networknt.schema.regex;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;

import org.jcodings.ApplyAllCaseFoldFunction;
import org.jcodings.CaseFoldCodeItem;
import org.jcodings.CodeRange;
import org.jcodings.Encoding;
import org.jcodings.IntHolder;
import org.jcodings.constants.CharacterType;
import org.jcodings.specific.UTF8Encoding;
import org.jcodings.unicode.UnicodeCodeRange;
import org.joni.Option;
import org.joni.Regex;
import org.joni.Syntax;
Expand Down Expand Up @@ -42,17 +50,8 @@ class JoniRegularExpression implements RegularExpression {

/**
 * Compiles {@code regex} into a Joni {@link Regex} using the supplied syntax.
 * <p>
 * The pattern is first passed to {@link #validate(String)}. Its text is then
 * handed to Joni unmodified: the {@code \d}, \w and \s escapes are
 * narrowed by {@link ECMAScriptUTF8Encoding} instead of by textual
 * substitution on the pattern string.
 *
 * @param regex  the regular expression source
 * @param syntax the Joni syntax to compile the expression with
 */
JoniRegularExpression(String regex, Syntax syntax) {
    validate(regex);
    // Only one declaration of bytes / one assignment of pattern: the earlier
    // String.replace(...) rewrite of the pattern text is superseded by the
    // custom encoding passed to the Regex constructor below.
    byte[] bytes = regex.getBytes(StandardCharsets.UTF_8);
    this.pattern = new Regex(bytes, 0, bytes.length, Option.SINGLELINE, ECMAScriptUTF8Encoding.INSTANCE, syntax);
}

protected void validate(String regex) {
Expand All @@ -73,4 +72,192 @@ public boolean matches(String value) {
return this.pattern.matcher(bytes).search(0, bytes.length, Option.NONE) >= 0;
}

static class Arrays {
    /**
     * Tests whether the whole of {@code a} is equal to the slice
     * {@code a2[p, end)}.
     * <p>
     * The comparison always takes the slice bounds into account, including
     * when {@code a} and {@code a2} are the same array instance.
     *
     * @param a   the array to compare in full, may be {@code null}
     * @param a2  the array containing the slice, may be {@code null}
     * @param p   the inclusive start index of the slice in {@code a2}
     * @param end the exclusive end index of the slice in {@code a2}
     * @return {@code true} if both arrays are {@code null}, or if the slice
     *         lies inside {@code a2}, has the same length as {@code a} and
     *         holds the same bytes; {@code false} otherwise
     */
    public static boolean equals(byte[] a, byte[] a2, int p, int end) {
        if (a == null || a2 == null) {
            // Two null references are still considered equal.
            return a == a2;
        }
        // A slice that does not exist inside a2 cannot be equal to anything.
        if (p < 0 || end < p || end > a2.length) {
            return false;
        }
        if ((end - p) != a.length) {
            return false;
        }
        for (int i = 0; i < a.length; i++) {
            if (a[i] != a2[i + p]) {
                return false;
            }
        }
        return true;
    }
}

/**
 * A UTF-8 {@link Encoding} that answers the digit, word and space character
 * types with the fixed code ranges of the ECMA-262 regular expression
 * classes rather than with the Unicode General Category based ranges of the
 * wrapped encoding. Every other character type is forwarded to the delegate.
 */
public static class ECMAScriptUTF8Encoding extends DelegatingEncoding {
    // Each table starts with the number of (from, to) pairs that follow it.

    /* \d : [0-9] */
    private static final int[] CR_DIGIT = { 1, '0', '9' };

    /* \w : [a-zA-Z0-9_] */
    private static final int[] CR_WORD = { 4, '0', '9', 'A', 'Z', '_', '_', 'a', 'z' };

    /*
     * \s : [\f\n\r\t\v\u0020\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]
     */
    private static final int[] CR_SPACE = {
            10,
            '\t', '\r',
            ' ', ' ',
            '\u00a0', '\u00a0',
            '\u1680', '\u1680',
            '\u2000', '\u200a',
            '\u2028', '\u2029',
            '\u202f', '\u202f',
            '\u205f', '\u205f',
            '\u3000', '\u3000',
            '\ufeff', '\ufeff' };

    /* The property name "digit", as used in \p{digit}. */
    private static final byte[] PROPERTY_NAME_DIGIT = { 'd', 'i', 'g', 'i', 't' };

    public static final ECMAScriptUTF8Encoding INSTANCE = new ECMAScriptUTF8Encoding();

    protected ECMAScriptUTF8Encoding() {
        super(UTF8Encoding.INSTANCE);
    }

    /**
     * Looks up the overriding range table for a character type.
     *
     * @param ctype the character type constant
     * @return the table for digit, word or space, or {@code null} when the
     *         character type is not overridden by this encoding
     */
    private static int[] ecmaRangeFor(int ctype) {
        if (ctype == CharacterType.DIGIT) { // \d
            return CR_DIGIT;
        }
        if (ctype == CharacterType.WORD) { // \w
            return CR_WORD;
        }
        if (ctype == CharacterType.SPACE) { // \s
            return CR_SPACE;
        }
        return null;
    }

    @Override
    public int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
        int[] range = ecmaRangeFor(ctype);
        if (range == null) {
            return delegate.ctypeCodeRange(ctype, sbOut);
        }
        // NOTE(review): 0x80 is presumably the single-byte boundary expected by Joni - confirm.
        sbOut.value = 0x80;
        return range;
    }

    @Override
    public boolean isCodeCType(int code, int ctype) {
        int[] range = ecmaRangeFor(ctype);
        return range != null
                ? CodeRange.isInCodeRange(range, code)
                : delegate.isCodeCType(code, ctype);
    }

    @Override
    public int propertyNameToCType(byte[]name, int p, int end) {
        boolean isDigitProperty = Arrays.equals(PROPERTY_NAME_DIGIT, name, p, end);
        if (!isDigitProperty) {
            return delegate.propertyNameToCType(name, p, end);
        }
        // Same as \p{Nd}; deliberately not CharacterType.DIGIT, which this
        // encoding restricts to [0-9].
        return UnicodeCodeRange.ND.ordinal();
    }
}

/**
 * An {@link Encoding} that forwards every operation to a wrapped
 * {@link Encoding}.
 * <p>
 * Subclasses override individual methods to adjust behavior, which makes it
 * possible to customize encodings whose implementations are final.
 */
public static class DelegatingEncoding extends Encoding {
    /** The encoding that receives every forwarded call. */
    protected final Encoding delegate;

    /**
     * Creates an encoding that reports the name, minimum length and maximum
     * length of {@code delegate} and forwards all calls to it.
     *
     * @param delegate the encoding to wrap
     */
    protected DelegatingEncoding(Encoding delegate) {
        super(new String(delegate.getName()), delegate.minLength(), delegate.maxLength());
        this.delegate = delegate;
    }

    // ---- charset ----

    @Override
    public Charset getCharset() {
        return this.delegate.getCharset();
    }

    @Override
    public String getCharsetName() {
        return this.delegate.getCharsetName();
    }

    // ---- character lengths and code points ----

    @Override
    public int length(byte c) {
        return this.delegate.length(c);
    }

    @Override
    public int length(byte[] bytes, int p, int end) {
        return this.delegate.length(bytes, p, end);
    }

    @Override
    public int mbcToCode(byte[] bytes, int p, int end) {
        return this.delegate.mbcToCode(bytes, p, end);
    }

    @Override
    public int codeToMbcLength(int code) {
        return this.delegate.codeToMbcLength(code);
    }

    @Override
    public int codeToMbc(int code, byte[] bytes, int p) {
        return this.delegate.codeToMbc(code, bytes, p);
    }

    @Override
    public int strLength(byte[] bytes, int p, int end) {
        return this.delegate.strLength(bytes, p, end);
    }

    @Override
    public int strCodeAt(byte[] bytes, int p, int end, int index) {
        return this.delegate.strCodeAt(bytes, p, end, index);
    }

    @Override
    public int leftAdjustCharHead(byte[] bytes, int p, int s, int end) {
        return this.delegate.leftAdjustCharHead(bytes, p, s, end);
    }

    @Override
    public boolean isReverseMatchAllowed(byte[] bytes, int p, int end) {
        return this.delegate.isReverseMatchAllowed(bytes, p, end);
    }

    // ---- line terminators ----

    @Override
    public boolean isNewLine(byte[] bytes, int p, int end) {
        return this.delegate.isNewLine(bytes, p, end);
    }

    @Override
    public boolean isMbcCrnl(byte[] bytes, int p, int end) {
        return this.delegate.isMbcCrnl(bytes, p, end);
    }

    // ---- case folding and case mapping ----

    @Override
    public int mbcCaseFold(int flag, byte[] bytes, IntHolder pp, int end, byte[] to) {
        return this.delegate.mbcCaseFold(flag, bytes, pp, end, to);
    }

    @Override
    public byte[] toLowerCaseTable() {
        return this.delegate.toLowerCaseTable();
    }

    @Override
    public void applyAllCaseFold(int flag, ApplyAllCaseFoldFunction fun, Object arg) {
        this.delegate.applyAllCaseFold(flag, fun, arg);
    }

    @Override
    public CaseFoldCodeItem[] caseFoldCodesByString(int flag, byte[] bytes, int p, int end) {
        return this.delegate.caseFoldCodesByString(flag, bytes, p, end);
    }

    @Override
    public int caseMap(IntHolder flagP, byte[] bytes, IntHolder pp, int end, byte[] to, int toP, int toEnd) {
        return this.delegate.caseMap(flagP, bytes, pp, end, to, toP, toEnd);
    }

    // ---- character types and properties ----

    @Override
    public int propertyNameToCType(byte[] bytes, int p, int end) {
        return this.delegate.propertyNameToCType(bytes, p, end);
    }

    @Override
    public boolean isCodeCType(int code, int ctype) {
        return this.delegate.isCodeCType(code, ctype);
    }

    @Override
    public int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
        return this.delegate.ctypeCodeRange(ctype, sbOut);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -172,4 +172,82 @@ void noImplicitAnchors() {
RegularExpression regex = new JoniRegularExpression("[a-z]{1,10}");
assertTrue(regex.matches("1abc1"));
}

/** {@code \d} must not report a match for the non-ASCII digit character. */
@Test
void digitCharacterClassShouldNotMatchUnicodeDigit() {
    assertFalse(new JoniRegularExpression("\\d").matches("߀"));
}

/** \w must not report a match for the non-ASCII digit character. */
@Test
void wordCharacterClassShouldNotMatchUnicodeDigit() {
    assertFalse(new JoniRegularExpression("\\w").matches("߀"));
}

/** The Unicode property {@code \p{N}} must still match the non-ASCII digit character. */
@Test
void unicodeNumberCharacterClassShouldMatchUnicodeDigit() {
    assertTrue(new JoniRegularExpression("\\p{N}").matches("߀"));
}

/** The named property {@code \p{digit}} must still match the non-ASCII digit character. */
@Test
void unicodeNumberDigitCharacterClassShouldMatchUnicodeDigit() {
    assertTrue(new JoniRegularExpression("\\p{digit}").matches("߀"));
}

/** The Unicode property {@code \p{Nd}} must still match the non-ASCII digit character. */
@Test
void unicodeNdCharacterClassShouldMatchUnicodeDigit() {
    assertTrue(new JoniRegularExpression("\\p{Nd}").matches("߀"));
}

/** {@code \d} must report a match for the ASCII digit {@code 0}. */
@Test
void digitCharacterClassShouldMatchAsciiDigit() {
    assertTrue(new JoniRegularExpression("\\d").matches("0"));
}

/** {@code \d} inside a bracketed character set must report a match for the ASCII digit {@code 0}. */
@Test
void digitCharacterClassShouldMatchAsciiDigitInCharacterSet() {
    assertTrue(new JoniRegularExpression("[\\d]").matches("0"));
}

/** \s must report a match for a plain space character. */
@Test
void whitespaceClassShouldMatchWhitespace() {
    assertTrue(new JoniRegularExpression("\\s").matches(" "));
}

/** \s must report a match for the non-breaking space U+00A0. */
@Test
void whitespaceClassShouldMatchLatin1NonBreakingSpace() {
    assertTrue(new JoniRegularExpression("\\s").matches("\u00a0"));
}

/** \s inside a bracketed character set must report a match for a plain space character. */
@Test
void whitespaceClassShouldMatchWhitespaceInCharacterSet() {
    assertTrue(new JoniRegularExpression("[\\s]").matches(" "));
}

/** \s inside a bracketed character set must report a match for the non-breaking space U+00A0. */
@Test
void whitespaceClassShouldMatchLatin1NonBreakingSpaceInCharacterSet() {
    assertTrue(new JoniRegularExpression("[\\s]").matches("\u00a0"));
}

/** \S inside a bracketed character set must not report a match for a plain space character. */
@Test
void nonWhitespaceClassShouldNotMatchWhitespaceInCharacterSet() {
    assertFalse(new JoniRegularExpression("[\\S]").matches(" "));
}

/** \S inside a bracketed character set must not report a match for the non-breaking space U+00A0. */
@Test
void nonWhitespaceClassShouldNotMatchLatin1NonBreakingSpaceInCharacterSet() {
    assertFalse(new JoniRegularExpression("[\\S]").matches("\u00a0"));
}
}