Skip to content

Commit 39c0e43

Browse files
committed
Fix extraction of URLs that are not well-formed (supplementary-material generated by pub2tei)
1 parent 54bc62a commit 39c0e43

File tree

3 files changed

+29
-61
lines changed

3 files changed

+29
-61
lines changed

src/main/java/org/grobid/core/engines/DatasetDisambiguator.java

Lines changed: 20 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,73 +1,36 @@
11
package org.grobid.core.engines;
22

3-
import nu.xom.Attribute;
4-
import nu.xom.Element;
3+
import com.fasterxml.jackson.core.io.JsonStringEncoder;
4+
import com.fasterxml.jackson.databind.JsonNode;
5+
import com.fasterxml.jackson.databind.ObjectMapper;
56
import org.apache.commons.io.FileUtils;
67
import org.apache.commons.lang3.StringUtils;
8+
import org.apache.http.HttpEntity;
79
import org.apache.http.client.config.RequestConfig;
8-
import org.grobid.core.GrobidModels;
9-
import org.grobid.core.data.DatasetComponent;
10+
import org.apache.http.client.methods.CloseableHttpResponse;
11+
import org.apache.http.client.methods.HttpGet;
12+
import org.apache.http.client.methods.HttpPost;
13+
import org.apache.http.conn.HttpHostConnectException;
14+
import org.apache.http.entity.ContentType;
15+
import org.apache.http.entity.mime.HttpMultipartMode;
16+
import org.apache.http.entity.mime.MultipartEntityBuilder;
17+
import org.apache.http.entity.mime.content.StringBody;
18+
import org.apache.http.impl.client.CloseableHttpClient;
19+
import org.apache.http.impl.client.HttpClientBuilder;
20+
import org.apache.http.impl.client.HttpClients;
21+
import org.apache.http.util.EntityUtils;
1022
import org.grobid.core.data.Dataset;
11-
import org.grobid.core.data.BiblioItem;
12-
import org.grobid.core.document.Document;
13-
import org.grobid.core.document.DocumentPiece;
14-
import org.grobid.core.document.DocumentSource;
15-
import org.grobid.core.document.xml.XmlBuilderUtils;
16-
import org.grobid.core.engines.config.GrobidAnalysisConfig;
17-
import org.grobid.core.engines.label.DatasetTaggingLabels;
18-
import org.grobid.core.engines.label.SegmentationLabels;
19-
import org.grobid.core.engines.label.TaggingLabel;
20-
import org.grobid.core.engines.label.TaggingLabels;
21-
import org.grobid.core.exceptions.GrobidException;
22-
import org.grobid.core.factory.GrobidFactory;
23-
import org.grobid.core.features.FeaturesVectorDataseer;
24-
import org.grobid.core.layout.BoundingBox;
23+
import org.grobid.core.data.DatasetComponent;
2524
import org.grobid.core.layout.LayoutToken;
26-
import org.grobid.core.layout.LayoutTokenization;
27-
import org.grobid.core.lexicon.DatastetLexicon;
2825
import org.grobid.core.utilities.DatastetConfiguration;
29-
import org.grobid.core.utilities.*;
3026
import org.slf4j.Logger;
3127
import org.slf4j.LoggerFactory;
32-
import org.xml.sax.InputSource;
33-
34-
import com.fasterxml.jackson.core.*;
35-
import com.fasterxml.jackson.databind.*;
36-
import com.fasterxml.jackson.databind.node.*;
37-
import com.fasterxml.jackson.annotation.*;
38-
import com.fasterxml.jackson.core.io.*;
39-
40-
import java.io.*;
41-
import java.text.DateFormat;
42-
import java.text.SimpleDateFormat;
43-
import java.util.*;
4428

45-
import java.net.HttpURLConnection;
29+
import java.io.File;
30+
import java.io.IOException;
4631
import java.net.MalformedURLException;
4732
import java.net.URL;
48-
49-
import org.apache.http.HttpResponse;
50-
import org.apache.http.NameValuePair;
51-
import org.apache.http.client.HttpClient;
52-
import org.apache.http.client.entity.UrlEncodedFormEntity;
53-
import org.apache.http.client.methods.HttpGet;
54-
import org.apache.http.client.methods.HttpPost;
55-
import org.apache.http.impl.client.HttpClientBuilder;
56-
import org.apache.http.message.BasicNameValuePair;
57-
import org.apache.http.impl.client.CloseableHttpClient;
58-
import org.apache.http.impl.client.HttpClients;
59-
import org.apache.http.client.methods.CloseableHttpResponse;
60-
import org.apache.http.HttpEntity;
61-
import org.apache.http.util.EntityUtils;
62-
import org.apache.http.entity.mime.content.StringBody;
63-
import org.apache.http.entity.ContentType;
64-
import org.apache.http.entity.mime.MultipartEntityBuilder;
65-
import org.apache.http.entity.mime.HttpMultipartMode;
66-
import org.apache.http.conn.HttpHostConnectException;
67-
import org.apache.commons.lang3.tuple.Pair;
68-
69-
import static org.apache.commons.lang3.StringUtils.*;
70-
import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
33+
import java.util.*;
7134

7235
/**
7336
* Dataset entity disambiguator. Once dataset mentions are recognized and grouped

src/main/java/org/grobid/core/engines/DatasetParser.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -550,7 +550,11 @@ private List<DatasetComponent> addUrlComponentsAsReferences(DatasetDocumentSeque
550550
String target = urlInfos.getMiddle();
551551
// String type = urlInfos.getRight();
552552

553-
DatasetComponent urlComponent = new DatasetComponent(sequence.getText().substring(pos.start, pos.end));
553+
String sequenceText = sequence.getText();
554+
if (sequenceText.length() <= pos.start || sequenceText.length() <= pos.end) {
555+
continue;
556+
}
557+
DatasetComponent urlComponent = new DatasetComponent(sequenceText.substring(pos.start, pos.end));
554558
urlComponent.setOffsetStart(pos.start);
555559
urlComponent.setOffsetEnd(pos.end);
556560
if (target != null) {
@@ -2005,7 +2009,8 @@ public Pair<List<List<Dataset>>, List<BibDataSet>> processTEIDocument(org.w3c.do
20052009

20062010
biblioRefMap.put(refTextClean, biblioItem);
20072011

2008-
Integer refKey = biblioComponentWrapper.getRefKey(target); BiblioComponent biblioComponent = new BiblioComponent(
2012+
Integer refKey = biblioComponentWrapper.getRefKey(target);
2013+
BiblioComponent biblioComponent = new BiblioComponent(
20092014
biblioItem, refKey
20102015
);
20112016
biblioComponent.setRawForm(refText);

src/main/java/org/grobid/core/utilities/XMLUtilities.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,15 +223,15 @@ public static Pair<String, Map<String,Triple<OffsetPosition, String, String>>> g
223223
for (int j = 0; j < list2.getLength(); j++) {
224224
Node node2 = list2.item(j);
225225
if (node2.getNodeType() == Node.TEXT_NODE) {
226-
String chunk = node2.getNodeValue();
226+
String chunk = normalize(node2.getNodeValue());
227227
buf.append(chunk);
228228
found = true;
229229
indexPos += chunk.length();
230230
}
231231
}
232232
}
233233
} else if (node.getNodeType() == Node.TEXT_NODE) {
234-
String chunk = node.getNodeValue();
234+
String chunk = normalize(node.getNodeValue());
235235
buf.append(chunk);
236236
found = true;
237237
indexPos += chunk.length();

0 commit comments

Comments
 (0)