Skip to content

Commit 5471324

Browse files
committed
Add a SetPhraseHead operation for rearranging words, useful in the short term for updating name phrases in the UD datasets (especially the Sindhi one) so the first word in a name is the head via flat relations
1 parent f2c6d40 commit 5471324

File tree

3 files changed

+224
-0
lines changed

3 files changed

+224
-0
lines changed
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
package edu.stanford.nlp.semgraph.semgrex.ssurgeon;
2+
3+
import java.util.*;
4+
import java.io.*;
5+
6+
import edu.stanford.nlp.ling.IndexedWord;
7+
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
8+
import edu.stanford.nlp.semgraph.SemanticGraph;
9+
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
10+
import edu.stanford.nlp.trees.GrammaticalRelation;
11+
12+
/**
13+
* Build a new phrase out of the matched words.
14+
* <br>
15+
* All of the words must currently be connected to themselves. Eg, there would be one head which points to a different word, and the other words all point to that head.
16+
* <br>
17+
* If that condition is matched, then existing internal edges are replaced with edges to the new head, with the given reln <br>
18+
* If the head is changed, the edge out of the phrase (if it is not root) is changed to come from the new head <br>
19+
* Edges in to the phrase are also changed to point to the new head.
20+
* The purpose of that change is so for a noun phrase, for example, modifiers of that noun phrase such as nmod or nmod:desc now modify the new head
21+
*/
22+
public class SetPhraseHead extends SsurgeonEdit {
23+
public static final String LABEL = "setPhraseHead";
24+
25+
final List<String> phrase;
26+
final int headIndex;
27+
final GrammaticalRelation relation;
28+
final double weight;
29+
30+
public SetPhraseHead(List<String> nodes, Integer headIndex, GrammaticalRelation relation, double weight) {
31+
if (headIndex == null) {
32+
throw new SsurgeonParseException("SetPhraseHead expected a -headIndex, 0-indexed for the node to use as the new head");
33+
}
34+
if (headIndex < 0 || headIndex >= nodes.size()) {
35+
throw new SsurgeonParseException("-headIndex of " + headIndex + " is out of bounds for a phrase with " + nodes.size() + " words");
36+
}
37+
38+
if (relation == null) {
39+
throw new SsurgeonParseException("SetPhraseHead expected a -reln to represent the dependency to use for the new phrase");
40+
}
41+
42+
this.phrase = new ArrayList<>(nodes);
43+
this.headIndex = headIndex;
44+
this.relation = relation;
45+
this.weight = weight;
46+
}
47+
48+
@Override
49+
public String toEditString() {
50+
StringWriter buf = new StringWriter();
51+
buf.write(LABEL);
52+
buf.write("\t");
53+
for (String node : phrase) {
54+
buf.write("-node " + node + "\t");
55+
}
56+
buf.write("-headIndex " + headIndex + "\t");
57+
buf.write("-reln " + relation.toString());
58+
return buf.toString();
59+
}
60+
61+
62+
@Override
63+
public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
64+
List<IndexedWord> matchedNodes = new ArrayList<>();
65+
IndexedWord newHead = null;
66+
int idx = 0;
67+
for (String word : phrase) {
68+
IndexedWord node = sm.getNode(word);
69+
if (node == null) {
70+
return false;
71+
}
72+
matchedNodes.add(node);
73+
74+
if (idx == headIndex) {
75+
newHead = node;
76+
}
77+
++idx;
78+
}
79+
80+
SemanticGraphEdge edgeOut = null;
81+
List<SemanticGraphEdge> deleteEdges = new ArrayList<>();
82+
List<SemanticGraphEdge> relocateEdges = new ArrayList<>();
83+
for (IndexedWord node : matchedNodes) {
84+
for (SemanticGraphEdge edge : sg.incomingEdgeIterable(node)) {
85+
if (matchedNodes.contains(edge.getSource())) {
86+
// TODO: not sure keeping extra edges is correct
87+
if (edge.getSource() != newHead && !edge.isExtra()) {
88+
deleteEdges.add(edge);
89+
}
90+
} else if (edgeOut == null) {
91+
edgeOut = edge;
92+
} else {
93+
// oops, this wasn't a self-contained phrase. guess we don't try to rearrange it after all
94+
// TODO: if the heads are the same, we could make it a phrase
95+
return false;
96+
}
97+
}
98+
for (SemanticGraphEdge edge : sg.outgoingEdgeIterable(node)) {
99+
// edges which point outside the phrase will be set to have the source be the new head
100+
if (!matchedNodes.contains(edge.getTarget())) {
101+
if (edge.getSource() != newHead) {
102+
relocateEdges.add(edge);
103+
}
104+
}
105+
}
106+
}
107+
108+
boolean modified = false;
109+
if (edgeOut == null) {
110+
// the newHead should be the root now
111+
Set<IndexedWord> roots = new HashSet<>(sg.getRoots());
112+
if (!roots.contains(newHead)) {
113+
modified = true;
114+
for (IndexedWord other : matchedNodes) {
115+
roots.remove(other);
116+
}
117+
roots.add(newHead);
118+
}
119+
sg.setRoots(roots);
120+
} else if (edgeOut.getTarget() != newHead) {
121+
SemanticGraphEdge newEdge = new SemanticGraphEdge(edgeOut.getSource(),
122+
newHead,
123+
edgeOut.getRelation(),
124+
edgeOut.getWeight(),
125+
edgeOut.isExtra());
126+
boolean success = sg.removeEdge(edgeOut);
127+
if (!success) {
128+
throw new RuntimeException("Between when the outgoing edge was found and now, the edge was somehow deleted");
129+
}
130+
sg.addEdge(newEdge);
131+
modified = true;
132+
}
133+
134+
for (SemanticGraphEdge edge : relocateEdges) {
135+
SemanticGraphEdge newEdge = new SemanticGraphEdge(newHead,
136+
edge.getTarget(),
137+
edge.getRelation(),
138+
edge.getWeight(),
139+
edge.isExtra());
140+
boolean success = sg.removeEdge(edge);
141+
if (!success) {
142+
throw new RuntimeException("Between when the incoming edge was found and now, the edge was somehow deleted");
143+
}
144+
sg.addEdge(newEdge);
145+
modified = true;
146+
}
147+
148+
for (SemanticGraphEdge edge : deleteEdges) {
149+
boolean success = sg.removeEdge(edge);
150+
if (!success) {
151+
throw new RuntimeException("Between when the internal phrase edge was found and now, the edge was somehow deleted");
152+
}
153+
modified = true;
154+
}
155+
for (IndexedWord other : matchedNodes) {
156+
if (other == newHead)
157+
continue;
158+
159+
found: {
160+
for (SemanticGraphEdge existingEdge : sg.getAllEdges(newHead, other)) {
161+
if (existingEdge.getRelation().equals(relation)) {
162+
break found;
163+
}
164+
}
165+
SemanticGraphEdge newEdge = new SemanticGraphEdge(newHead,
166+
other,
167+
relation,
168+
weight,
169+
false);
170+
sg.addEdge(newEdge);
171+
modified = true;
172+
}
173+
}
174+
175+
return modified;
176+
}
177+
178+
}

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,19 @@
149149
* {@code -node} (repeated) is the nodes to edit. <br>
150150
* {@code -word} is the optional text to use for the new MWT. If not set, the words will be concatenated.
151151
*</p><p>
152+
* {@code setPhraseHead} will set a new head for a sequence of nodes.<br>
153+
* {@code -node} for each node to include in the phrase. <br>
154+
* {@code -headIndex} is the index (counting from 0) of the node to make the head. <br>
155+
* {@code -reln} is the name of the dependency type to use to connect the other words in the phrase to the new head <br>
156+
* {@code -weight} is the weight to give the new edges (probably not particularly important) <br>
157+
* The words must already be in a phrase for this to work. This is
158+
* detected by making sure each node has its parent within the phrase,
159+
* except for the head word, which can either be the root or have the
160+
* one edge that goes out from the phrase. <br>
161+
* This operation reconnects the head of the phrase to the same node that was previously the parent of the phrase. <br>
162+
* All edges that previously went to a different word in the phrase are now pointed to the new head of the phrase. <br>
163+
* Some of these behaviors are optional. If you happen to need a different behavior, please file an issue on github.
164+
*</p><p>
152165
* {@code splitWord} will split a single word into multiple pieces from the text of the current word <br>
153166
* {@code -node} is the node to split. <br>
154167
* {@code -headIndex} is the index (counting from 0) of the word piece to make the head. <br>
@@ -659,6 +672,9 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
659672
return new KillAllIncomingEdges(argsBox.nodes.get(0));
660673
} else if (command.equalsIgnoreCase(CombineMWT.LABEL)) {
661674
return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word"));
675+
} else if (command.equalsIgnoreCase(SetPhraseHead.LABEL)) {
676+
GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
677+
return new SetPhraseHead(argsBox.nodes, argsBox.headIndex, reln, argsBox.weight);
662678
} else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
663679
GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
664680
if (argsBox.regex.size() > 0 && argsBox.exact.size() > 0) {

test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2085,6 +2085,36 @@ public void readXMLDeleteLeaf() {
20852085
assertEquals(newSg, expected);
20862086
}
20872087

2088+
@Test
2089+
public void readXMLSetPhraseHead() {
2090+
String doc = String.join(newline,
2091+
"<ssurgeon-pattern-list>",
2092+
" <ssurgeon-pattern>",
2093+
" <uid>38</uid>",
2094+
" <notes>Test resetting a phrase's internal and external links</notes>",
2095+
" <language>UniversalEnglish</language>",
2096+
" <semgrex>" + XMLUtils.escapeXML("{word:John}=n1 . {word:Bauer}=n2") + "</semgrex>",
2097+
" <edit-list>SetPhraseHead -node n1 -node n2 -headIndex 0 -reln flat</edit-list>",
2098+
" </ssurgeon-pattern>",
2099+
"</ssurgeon-pattern-list>");
2100+
Ssurgeon inst = Ssurgeon.inst();
2101+
List<SsurgeonPattern> patterns = inst.readFromString(doc);
2102+
assertEquals(patterns.size(), 1);
2103+
SsurgeonPattern pattern = patterns.get(0);
2104+
2105+
// test where the new phrase is not the root
2106+
SemanticGraph sg = SemanticGraph.valueOf("[works-4 obl> [Stanford-6 case> at-5] nsubj> [Bauer-3 flat> John-2 nmod> Earl-1]]");
2107+
SemanticGraph newSg = pattern.iterate(sg).first;
2108+
SemanticGraph expected = SemanticGraph.valueOf("[works-4 obl> [Stanford-6 case> at-5] nsubj> [John-2 flat> Bauer-3 nmod> Earl-1]]");
2109+
assertEquals(newSg, expected);
2110+
2111+
// test where the new phrase IS the root
2112+
sg = SemanticGraph.valueOf("[Bauer-5 flat> John-4 cop> is-3 nsubj> [programmer-2 det> The-1]]");
2113+
newSg = pattern.iterate(sg).first;
2114+
expected = SemanticGraph.valueOf("[John-4 flat> Bauer-5 cop> is-3 nsubj> [programmer-2 det> The-1]]");
2115+
assertEquals(newSg, expected);
2116+
}
2117+
20882118
/**
20892119
* Test splitWord, which should split a word into pieces based on regex matches, with the head at position 0
20902120
*/

0 commit comments

Comments
 (0)