Skip to content

Commit 0639986

Browse files
committed
Work on data harmonization function
1 parent 692ed43 commit 0639986

File tree

3 files changed

+79
-11
lines changed

3 files changed

+79
-11
lines changed

MassBank-Project/MassBank-lib/pom.xml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@
108108
<groupId>com.zaxxer</groupId>
109109
<artifactId>HikariCP</artifactId>
110110
</dependency>
111+
<dependency>
112+
<groupId>io.github.java-diff-utils</groupId>
113+
<artifactId>java-diff-utils</artifactId>
114+
</dependency>
111115
</dependencies>
112116

113117

MassBank-Project/MassBank-lib/src/main/java/massbank/cli/AddMetaData.java

Lines changed: 70 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import java.net.URL;
88
import java.net.URLConnection;
99
import java.net.URLEncoder;
10-
import java.nio.charset.Charset;
1110
import java.nio.charset.StandardCharsets;
1211
import java.nio.file.Path;
1312
import java.util.*;
@@ -17,6 +16,10 @@
1716
import java.util.regex.Matcher;
1817
import java.util.regex.Pattern;
1918

19+
import com.github.difflib.DiffUtils;
20+
import com.github.difflib.patch.AbstractDelta;
21+
import com.github.difflib.patch.Patch;
22+
import com.google.gson.*;
2023
import massbank.ProjectPropertiesLoader;
2124
import massbank.RecordParser;
2225
import org.apache.commons.cli.CommandLine;
@@ -39,11 +42,6 @@
3942
import org.openscience.cdk.smiles.SmiFlavor;
4043
import org.openscience.cdk.smiles.SmilesGenerator;
4144

42-
import com.google.gson.Gson;
43-
import com.google.gson.GsonBuilder;
44-
import com.google.gson.JsonObject;
45-
import com.google.gson.JsonSyntaxException;
46-
4745
import de.undercouch.citeproc.CSL;
4846
import de.undercouch.citeproc.bibtex.BibTeXConverter;
4947
import de.undercouch.citeproc.bibtex.BibTeXItemDataProvider;
@@ -475,6 +473,56 @@ else if (ret == InchiStatus.ERROR) {
475473
}
476474
return record.toString();
477475
}
476+
477+
public static String fetchCIDFromSID(String sid) {
478+
String apiUrl = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/" + sid + "/cids/JSON";
479+
try {
480+
HttpURLConnection connection = (HttpURLConnection) new URL(apiUrl).openConnection();
481+
connection.setRequestMethod("GET");
482+
483+
Scanner scanner = new Scanner(connection.getInputStream());
484+
StringBuilder response = new StringBuilder();
485+
while (scanner.hasNext()) {
486+
response.append(scanner.nextLine());
487+
}
488+
scanner.close();
489+
490+
JsonObject jsonResponse = JsonParser.parseString(response.toString()).getAsJsonObject();
491+
JsonArray cid = jsonResponse.getAsJsonObject("InformationList")
492+
.getAsJsonArray("Information")
493+
.get(0).getAsJsonObject()
494+
.getAsJsonArray("CID");
495+
496+
if (cid.size() != 1) {
497+
System.out.println("Error: More than one CID found for SID " + sid);
498+
return null;
499+
}
500+
501+
return cid.get(0).getAsString();
502+
} catch (IOException e) {
503+
e.printStackTrace();
504+
System.out.println("Error fetching CID");
505+
return null;
506+
}
507+
}
508+
509+
public static String doNormalizeCompoundIdentifier(Record record) {
510+
Map<String, String> inRecord = new HashMap<>();
511+
inRecord.put("Inchi", record.CH_IUPAC());
512+
inRecord.put("SMILES", record.CH_SMILES());
513+
if (record.CH_LINK().get("PUBCHEM") != null) {
514+
inRecord.put("PUBCHEM", record.CH_LINK().get("PUBCHEM"));
515+
}
516+
if (record.CH_LINK().get("CAS") != null) {
517+
inRecord.put("CAS", record.CH_LINK().get("CAS"));
518+
}
519+
520+
System.out.println(inRecord);
521+
522+
System.out.println(fetchCIDFromSID("5689"));
523+
524+
return record.toString();
525+
}
478526

479527

480528
public static void main(String[] arguments) throws Exception {
@@ -512,12 +560,23 @@ public static void main(String[] arguments) throws Exception {
512560
// recordstring2=doAddInchikey(record);
513561
//}
514562

563+
recordStringAfterMod = doNormalizeCompoundIdentifier(record);
515564

516-
if (doAddPubchemCid.get()) {
517-
recordStringAfterMod=doAddPubchemCID(record);
518-
}
519-
if (doSetSMILESfromInChi.get()) {
520-
recordStringAfterMod=doSetSMILESfromInChi(record);
565+
566+
// if (doAddPubchemCid.get()) {
567+
// recordStringAfterMod=doAddPubchemCID(record);
568+
// }
569+
// if (doSetSMILESfromInChi.get()) {
570+
// recordStringAfterMod=doSetSMILESfromInChi(record);
571+
// }
572+
573+
List<String> originalList = Arrays.asList(recordString.getValue().split("\\n"));
574+
List<String> revisedList = Arrays.asList(recordStringAfterMod.split("\\n"));
575+
576+
Patch<String> patch = DiffUtils.diff(originalList, revisedList);
577+
578+
for (AbstractDelta<String> delta : patch.getDeltas()) {
579+
System.out.println(delta);
521580
}
522581

523582
if (!recordString.getValue().equals(recordStringAfterMod)) {

MassBank-Project/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,11 @@
151151
<artifactId>HikariCP</artifactId>
152152
<version>6.2.1</version>
153153
</dependency>
154+
<dependency>
155+
<groupId>io.github.java-diff-utils</groupId>
156+
<artifactId>java-diff-utils</artifactId>
157+
<version>4.15</version>
158+
</dependency>
154159

155160
<!-- MassBank-web -->
156161
<dependency>

0 commit comments

Comments
 (0)