Skip to content

Commit 54fec11

Browse files
committed
Add the ability to search for morphofeatures using a regex in the key. This greatly simplifies certain searches, for example checking whether or not a feature exists.
1 parent 0f1997f commit 54fec11

File tree

5 files changed

+226
-31
lines changed

5 files changed

+226
-31
lines changed

src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java

+32-8
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ public class NodePattern extends SemgrexPattern {
3939
* and only partial matches are necessary
4040
*/
4141
private final List<Pair<String, Attribute>> partialAttributes;
42+
private final List<RegexPartialAttribute> regexPartialAttributes;
4243
private final boolean isRoot;
4344
private final boolean isLink;
4445
private final boolean isEmpty;
@@ -67,6 +68,7 @@ public NodePattern(GraphRelation r, boolean negDesc,
6768
this.attributes = new ArrayList<>();
6869
// same with partial attributes
6970
this.partialAttributes = new ArrayList<>();
71+
this.regexPartialAttributes = new ArrayList<>();
7072

7173
descString = "{";
7274
for (Triple<String, String, Boolean> entry : attrs.attributes()) {
@@ -105,15 +107,21 @@ public NodePattern(GraphRelation r, boolean negDesc,
105107
}
106108

107109
final Attribute attr;
108-
// Add the attributes for this key
109-
if (value.equals("__")) {
110-
attr = new Attribute(key, true, true, negated);
111-
} else if (value.matches("/.*/")) {
112-
attr = buildRegexAttribute(key, value, negated);
113-
} else { // raw description
114-
attr = new Attribute(key, value, value, negated);
110+
if (key.equals("__")) {
111+
regexPartialAttributes.add(new RegexPartialAttribute(annotation, "/.*/", value, negated));
112+
} else if (key.matches("/.*/")) {
113+
regexPartialAttributes.add(new RegexPartialAttribute(annotation, key, value, negated));
114+
} else {
115+
// Add the attributes for this key
116+
if (value.equals("__")) {
117+
attr = new Attribute(key, true, true, negated);
118+
} else if (value.matches("/.*/")) {
119+
attr = buildRegexAttribute(key, value, negated);
120+
} else { // raw description
121+
attr = new Attribute(key, value, value, negated);
122+
}
123+
partialAttributes.add(new Pair<>(annotation, attr));
115124
}
116-
partialAttributes.add(new Pair<>(annotation, attr));
117125

118126
if (!descString.equals("{"))
119127
descString += ";";
@@ -262,6 +270,22 @@ public boolean nodeAttrMatch(IndexedWord node, final SemanticGraph sg, boolean i
262270
return negDesc;
263271
}
264272
}
273+
for (RegexPartialAttribute partialAttribute : regexPartialAttributes) {
274+
Class clazz = Env.lookupAnnotationKey(env, partialAttribute.annotation);
275+
Object rawmap = node.get(clazz);
276+
final Map<?, ?> map;
277+
if (rawmap == null) {
278+
map = null;
279+
} else {
280+
if (!(rawmap instanceof Map))
281+
throw new RuntimeException("Can only use partial attributes with Maps... this should have been checked at creation time!");
282+
map = (Map) rawmap;
283+
}
284+
boolean matches = partialAttribute.checkMatches(map, ignoreCase);
285+
if (!matches) {
286+
return negDesc;
287+
}
288+
}
265289

266290
// System.out.println("matches");
267291
// System.out.println("");
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
package edu.stanford.nlp.semgraph.semgrex;
2+
3+
import java.io.Serializable;
4+
import java.util.Map;
5+
import java.util.regex.Pattern;
6+
7+
/**
 * A partial-attribute matcher for map-valued annotations where the map
 * <em>key</em> is selected with a regular expression instead of a literal name.
 * <p>
 * The attribute passes (before negation) when at least one map entry has a key
 * that fully matches the key pattern and a value that matches the value
 * description.  The value description is one of:
 * <ul>
 *   <li>{@code __} – any value (compiled as the regex {@code .*});</li>
 *   <li>{@code /regex/} – the value must fully match {@code regex};</li>
 *   <li>anything else – the value must equal the description exactly
 *       (or ignoring case, when requested).</li>
 * </ul>
 * Key matching is always case sensitive; only value matching honors the
 * {@code ignoreCase} flag.
 */
public class RegexPartialAttribute implements Serializable {
  /** Name of the annotation whose map value is inspected. */
  final String annotation;
  /** Pattern which a map key must fully match for its entry to be considered. */
  final Pattern key;

  // TODO: separate these into two different classes?
  /** Case-sensitive value pattern, or {@code null} when an exact string is used. */
  final Pattern casedPattern;
  /** Case-insensitive value pattern, or {@code null} when an exact string is used. */
  final Pattern caselessPattern;
  /** Literal value to compare against, or {@code null} when a pattern is used. */
  final String exactMatch;

  /** If true, the result of the match is inverted. */
  final boolean negated;

  /**
   * @param annotation name of the annotation holding the map
   * @param key key regex wrapped in delimiters, e.g. {@code /Num.*\/}; the first
   *        and last characters are stripped before compiling, so the caller is
   *        expected to have verified the {@code /.../} form
   * @param value {@code __}, a {@code /regex/}, or a literal value
   * @param negated whether the attribute should pass only when nothing matches
   * @throws java.util.regex.PatternSyntaxException if the key or value regex is invalid
   */
  RegexPartialAttribute(String annotation, String key, String value, boolean negated) {
    this.annotation = annotation;
    this.key = Pattern.compile(key.substring(1, key.length() - 1));

    if (value.equals("__")) {
      casedPattern = Pattern.compile(".*");
      caselessPattern = Pattern.compile(".*");
      exactMatch = null;
    } else if (value.matches("/.*/")) {
      String patternContent = value.substring(1, value.length() - 1);
      casedPattern = Pattern.compile(patternContent);
      caselessPattern = Pattern.compile(patternContent, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
      exactMatch = null;
    } else {
      casedPattern = null;
      caselessPattern = null;
      exactMatch = value;
    }

    this.negated = negated;
  }

  /**
   * Tests a single map value against the value description (ignoring negation).
   *
   * @param ignoreCase whether to compare case-insensitively
   * @param value the value to test; {@code null} never matches
   * @return true if the value satisfies the value description
   */
  boolean valueMatches(boolean ignoreCase, String value) {
    if (value == null) {
      // a missing value cannot satisfy any description
      return false;
    }
    final Pattern pattern = ignoreCase ? caselessPattern : casedPattern;
    if (pattern != null) {
      return pattern.matcher(value).matches();
    }
    return ignoreCase ? value.equalsIgnoreCase(exactMatch) : value.equals(exactMatch);
  }

  /**
   * Checks whether this attribute passes for the given map, taking negation
   * into account.
   *
   * @param map the annotation's map, or {@code null} if the node has none
   * @param ignoreCase whether values are compared case-insensitively
   * @return {@code !negated} if some entry matches both key and value,
   *         otherwise {@code negated}
   */
  boolean checkMatches(Map<?, ?> map, boolean ignoreCase) {
    if (map == null) {
      // we treat a missing map as failing to match
      // so if the attribute is negated, that means this attribute passes
      return negated;
    }

    for (Map.Entry<?, ?> entry : map.entrySet()) {
      final Object entryKey = entry.getKey();
      final Object entryValue = entry.getValue();
      if (entryKey == null || entryValue == null) {
        // entries with a null key or value cannot match; skip them rather
        // than failing with a NullPointerException
        continue;
      }
      if (key.matcher(entryKey.toString()).matches()
          && valueMatches(ignoreCase, entryValue.toString())) {
        return !negated;
      }
    }

    return negated;
  }

  private static final long serialVersionUID = 378257698196124612L;
}

src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java

+47-21
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,20 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
572572
}
573573
case 23:{
574574
jj_consume_token(23);
575-
key = jj_consume_token(IDENTIFIER);
575+
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
576+
case IDENTIFIER:{
577+
key = jj_consume_token(IDENTIFIER);
578+
break;
579+
}
580+
case REGEX:{
581+
key = jj_consume_token(REGEX);
582+
break;
583+
}
584+
default:
585+
jj_la1[25] = jj_gen;
586+
jj_consume_token(-1);
587+
throw new ParseException();
588+
}
576589
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
577590
case 10:{
578591
attrType = jj_consume_token(10);
@@ -583,7 +596,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
583596
break;
584597
}
585598
default:
586-
jj_la1[25] = jj_gen;
599+
jj_la1[26] = jj_gen;
587600
jj_consume_token(-1);
588601
throw new ParseException();
589602
}
@@ -597,7 +610,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
597610
break;
598611
}
599612
default:
600-
jj_la1[26] = jj_gen;
613+
jj_la1[27] = jj_gen;
601614
jj_consume_token(-1);
602615
throw new ParseException();
603616
}
@@ -615,11 +628,24 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
615628
break;
616629
}
617630
default:
618-
jj_la1[27] = jj_gen;
631+
jj_la1[28] = jj_gen;
619632
break label_6;
620633
}
621634
jj_consume_token(24);
622-
key = jj_consume_token(IDENTIFIER);
635+
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
636+
case IDENTIFIER:{
637+
key = jj_consume_token(IDENTIFIER);
638+
break;
639+
}
640+
case REGEX:{
641+
key = jj_consume_token(REGEX);
642+
break;
643+
}
644+
default:
645+
jj_la1[29] = jj_gen;
646+
jj_consume_token(-1);
647+
throw new ParseException();
648+
}
623649
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
624650
case 10:{
625651
attrType = jj_consume_token(10);
@@ -630,7 +656,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
630656
break;
631657
}
632658
default:
633-
jj_la1[28] = jj_gen;
659+
jj_la1[30] = jj_gen;
634660
jj_consume_token(-1);
635661
throw new ParseException();
636662
}
@@ -644,7 +670,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
644670
break;
645671
}
646672
default:
647-
jj_la1[29] = jj_gen;
673+
jj_la1[31] = jj_gen;
648674
jj_consume_token(-1);
649675
throw new ParseException();
650676
}
@@ -659,7 +685,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
659685
break;
660686
}
661687
default:
662-
jj_la1[30] = jj_gen;
688+
jj_la1[32] = jj_gen;
663689
jj_consume_token(-1);
664690
throw new ParseException();
665691
}
@@ -676,7 +702,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
676702
break;
677703
}
678704
default:
679-
jj_la1[31] = jj_gen;
705+
jj_la1[33] = jj_gen;
680706
jj_consume_token(-1);
681707
throw new ParseException();
682708
}
@@ -700,7 +726,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
700726
break;
701727
}
702728
default:
703-
jj_la1[32] = jj_gen;
729+
jj_la1[34] = jj_gen;
704730
break label_7;
705731
}
706732
jj_consume_token(24);
@@ -709,7 +735,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
709735
break;
710736
}
711737
default:
712-
jj_la1[33] = jj_gen;
738+
jj_la1[35] = jj_gen;
713739
;
714740
}
715741
jj_consume_token(25);
@@ -729,7 +755,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
729755
break;
730756
}
731757
default:
732-
jj_la1[34] = jj_gen;
758+
jj_la1[36] = jj_gen;
733759
;
734760
}
735761
pat = new NodePattern(r, underNodeNegation, attributes, link, name != null ? name.image : null);
@@ -746,13 +772,13 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th
746772
public Token jj_nt;
747773
private int jj_ntk;
748774
private int jj_gen;
749-
final private int[] jj_la1 = new int[35];
775+
final private int[] jj_la1 = new int[37];
750776
static private int[] jj_la1_0;
751777
static {
752778
jj_la1_init_0();
753779
}
754780
private static void jj_la1_init_0() {
755-
jj_la1_0 = new int[] {0x400,0x4028808,0x3801c,0x3801c,0x4028800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x4028800,0x2000,0x402c000,0x4000,0x4028000,0x4020000,0x400400,0x110,0x400400,0x110,0x1000000,0x400400,0x110,0xc00400,0xd0,0x1000000,0xd0,0x200000,};
781+
jj_la1_0 = new int[] {0x400,0x4028808,0x3801c,0x3801c,0x4028800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x4028800,0x2000,0x402c000,0x4000,0x4028000,0x4020000,0x400400,0x110,0x110,0x400400,0x110,0x1000000,0x110,0x400400,0x110,0xc00400,0xd0,0x1000000,0xd0,0x200000,};
756782
}
757783

758784
/** Constructor with InputStream. */
@@ -766,7 +792,7 @@ public SemgrexParser(java.io.InputStream stream, String encoding) {
766792
token = new Token();
767793
jj_ntk = -1;
768794
jj_gen = 0;
769-
for (int i = 0; i < 35; i++) jj_la1[i] = -1;
795+
for (int i = 0; i < 37; i++) jj_la1[i] = -1;
770796
}
771797

772798
/** Reinitialise. */
@@ -780,7 +806,7 @@ public void ReInit(java.io.InputStream stream, String encoding) {
780806
token = new Token();
781807
jj_ntk = -1;
782808
jj_gen = 0;
783-
for (int i = 0; i < 35; i++) jj_la1[i] = -1;
809+
for (int i = 0; i < 37; i++) jj_la1[i] = -1;
784810
}
785811

786812
/** Constructor. */
@@ -790,7 +816,7 @@ public SemgrexParser(java.io.Reader stream) {
790816
token = new Token();
791817
jj_ntk = -1;
792818
jj_gen = 0;
793-
for (int i = 0; i < 35; i++) jj_la1[i] = -1;
819+
for (int i = 0; i < 37; i++) jj_la1[i] = -1;
794820
}
795821

796822
/** Reinitialise. */
@@ -808,7 +834,7 @@ public void ReInit(java.io.Reader stream) {
808834
token = new Token();
809835
jj_ntk = -1;
810836
jj_gen = 0;
811-
for (int i = 0; i < 35; i++) jj_la1[i] = -1;
837+
for (int i = 0; i < 37; i++) jj_la1[i] = -1;
812838
}
813839

814840
/** Constructor with generated Token Manager. */
@@ -817,7 +843,7 @@ public SemgrexParser(SemgrexParserTokenManager tm) {
817843
token = new Token();
818844
jj_ntk = -1;
819845
jj_gen = 0;
820-
for (int i = 0; i < 35; i++) jj_la1[i] = -1;
846+
for (int i = 0; i < 37; i++) jj_la1[i] = -1;
821847
}
822848

823849
/** Reinitialise. */
@@ -826,7 +852,7 @@ public void ReInit(SemgrexParserTokenManager tm) {
826852
token = new Token();
827853
jj_ntk = -1;
828854
jj_gen = 0;
829-
for (int i = 0; i < 35; i++) jj_la1[i] = -1;
855+
for (int i = 0; i < 37; i++) jj_la1[i] = -1;
830856
}
831857

832858
private Token jj_consume_token(int kind) throws ParseException {
@@ -882,7 +908,7 @@ public ParseException generateParseException() {
882908
la1tokens[jj_kind] = true;
883909
jj_kind = -1;
884910
}
885-
for (int i = 0; i < 35; i++) {
911+
for (int i = 0; i < 37; i++) {
886912
if (jj_la1[i] == jj_gen) {
887913
for (int j = 0; j < 32; j++) {
888914
if ((jj_la1_0[i] & (1<<j)) != 0) {

src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.jj

+2-2
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ void AddAttribute(NodeAttributes attributes) : {
285285
})
286286
|
287287
( ":{"
288-
((key = <IDENTIFIER>) (attrType = ":" | attrType = "!:") (value = <IDENTIFIER> | value = <REGEX>)
288+
((key = <IDENTIFIER> | key = <REGEX>) (attrType = ":" | attrType = "!:") (value = <IDENTIFIER> | value = <REGEX>)
289289
{
290290
if (attr == null || key == null || value == null) {
291291
throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr +
@@ -294,7 +294,7 @@ void AddAttribute(NodeAttributes attributes) : {
294294
negated = attrType.image.equals("!:");
295295
attributes.addContains(attr.image, key.image, value.image, negated);
296296
})
297-
( ";" (key = <IDENTIFIER>) (attrType = ":" | attrType = "!:") (value = <IDENTIFIER> | value = <REGEX>)
297+
( ";" (key = <IDENTIFIER> | key = <REGEX>) (attrType = ":" | attrType = "!:") (value = <IDENTIFIER> | value = <REGEX>)
298298
{
299299
if (attr == null || key == null || value == null) {
300300
throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr +

0 commit comments

Comments
 (0)