Skip to content

Commit aed8763

Browse files
authored
Support multiple parsers (#16)
* wip * add ACL anthology URL to allow CORS from chrome extension * add ACL anthology parser
1 parent 26b01d7 commit aed8763

File tree

3 files changed

+156
-89
lines changed

3 files changed

+156
-89
lines changed

manifest.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
"host_permissions": [
2121
"*://api.notion.com/*",
2222
"*://www.notion.so/*",
23-
"*://openreview.net/*"
23+
"*://openreview.net/*",
24+
"*://aclanthology.org/*"
2425
],
2526
"content_security_policy": {
2627
"extension_pages": "script-src 'self'; object-src 'self'"

src/js/parsers.js

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
// MIT License
2+
// Copyright (c) 2024 denkiwakame <denkivvakame@gmail.com>
3+
4+
/**
 * Registry that dispatches a paper URL to the first registered handler whose
 * URL prefix matches.
 */
class URLParser {
  constructor() {
    // Ordered list of { domain, handler }; earlier registrations win.
    this.parsers = [];
  }

  /**
   * Register a handler for URLs starting with the given prefix.
   *
   * @param {string} domain - URL prefix to match (compared with `startsWith`).
   * @param {(url: string) => any} handler - Called with the matching URL; its
   *   return value is what `parse` resolves to.
   */
  addParser(domain, handler) {
    this.parsers.push({ domain, handler });
  }

  /**
   * Run the first handler whose prefix matches `url`.
   *
   * @param {string} url - URL of the paper page.
   * @returns {Promise<any>} Whatever the matching handler returns.
   * @throws {Error} If `url` is not a string or no registered prefix matches.
   */
  async parse(url) {
    // Non-string input (null, undefined, numbers, ...) can never match a
    // prefix, so it falls through to the same "no parser" error.
    if (typeof url === 'string') {
      for (const { domain, handler } of this.parsers) {
        if (url.startsWith(domain)) return handler(url);
      }
    }
    throw new Error('No parser found for the given URL');
  }
}
20+
21+
/**
 * Fetch paper metadata for an arXiv URL through the arXiv export API.
 *
 * @param {string} url - arXiv page URL ending in a paper identifier,
 *   optionally with a version suffix (e.g. `.../abs/2308.04079v2`).
 * @returns {Promise<{id: string, title: string, abst: string,
 *   authors: string[], url: string, published: string, comment: string,
 *   publisher: string}>} Metadata read from the first Atom `<entry>`.
 * @throws {Error} If no identifier can be extracted from `url`, the API does
 *   not answer with HTTP 200, or the response contains no `<entry>`.
 */
const arXivParser = async (url) => {
  const ARXIV_API = 'http://export.arxiv.org/api/query/search_query';
  // ref: https://info.arxiv.org/help/arxiv_identifier.html
  // e.g. (new id format: 2404.16782) | (old id format: hep-th/0702063)
  // A trailing version suffix ("v2") is accepted and stripped, so both page
  // URLs and the API's own <id> values (which always carry a version) yield
  // the bare identifier.
  const parseArXivId = (str) =>
    str?.match(/((?:\d+\.\d+)|(?:(?:\w|-)+\/\d+))(?:v\d+)?$/)?.[1];

  const paperId = parseArXivId(url);
  if (!paperId) {
    throw new Error(`Could not extract an arXiv identifier from URL: ${url}`);
  }

  const res = await fetch(`${ARXIV_API}?id_list=${paperId}`);
  if (res.status !== 200) {
    throw new Error(`arXiv API request failed (HTTP ${res.status})`);
  }
  const data = await res.text();
  const xmlData = new DOMParser().parseFromString(data, 'text/xml');

  const entry = xmlData.querySelector('entry');
  if (!entry) {
    throw new Error(`arXiv API returned no entry for ${paperId}`);
  }

  // Fall back to the id taken from the URL if the entry's <id> is unusable.
  const id = parseArXivId(entry.querySelector('id')?.textContent) ?? paperId;
  const paperTitle = entry.querySelector('title').textContent;
  const abst = entry.querySelector('summary').textContent;
  // Prefer the <name> child so that other children of <author> (such as an
  // affiliation) do not leak into the author string.
  const authors = Array.from(entry.querySelectorAll('author'), (author) =>
    (author.querySelector('name') ?? author).textContent.trim()
  );
  const published = entry.querySelector('published').textContent;
  const comment = entry.querySelector('comment')?.textContent ?? 'none';

  return {
    id: id,
    title: paperTitle,
    abst: abst,
    authors: authors,
    url: url,
    published: published,
    comment: comment,
    publisher: 'arXiv',
  };
};
59+
60+
/**
 * Scrape paper metadata from the `citation_*` meta tags of an OpenReview
 * forum page.
 *
 * @param {string} url - OpenReview page URL; the paper id is read from its
 *   `id` query parameter.
 * @returns {Promise<{id: (string|null), title: string, abst: string,
 *   authors: string[], url: string, published: string, comment: string,
 *   publisher: string}>} `published` is a `YYYY-MM-DD` string; `authors`
 *   is `['Anonymous']` when the page lists none.
 * @throws {Error} If the page request fails or the title / date meta tag is
 *   missing.
 */
const openReviewParser = async (url) => {
  // Convert "YYYY", "YYYY/MM" or "YYYY/MM/DD" (also with "-") to YYYY-MM-DD
  // directly from its components. Going through `new Date(...).toISOString()`
  // parses slash-separated dates as local time and then converts to UTC,
  // which moves the date back one day in time zones ahead of UTC.
  const toISODate = (raw) => {
    const parts = raw
      .trim()
      .match(/^(\d{4})(?:[\/-](\d{1,2}))?(?:[\/-](\d{1,2}))?$/);
    if (parts) {
      const [, year, month = '1', day = '1'] = parts;
      return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
    }
    // Anything else (e.g. a full timestamp) keeps the Date-based conversion.
    return new Date(raw).toISOString().split('T')[0];
  };

  const id = new URL(url).searchParams.get('id');
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`OpenReview request failed (HTTP ${res.status})`);
  }
  const html = await res.text();
  const doc = new DOMParser().parseFromString(html, 'text/html');

  // Content of a <meta name="..."> tag, or null when the tag is absent.
  const meta = (name) =>
    doc.querySelector(`meta[name="${name}"]`)?.getAttribute('content') ?? null;

  const authorsArray = Array.from(
    doc.querySelectorAll('meta[name="citation_author"]'),
    (author) => author.getAttribute('content')
  );
  const authors = authorsArray.length ? authorsArray : ['Anonymous'];

  const paperTitle = meta('citation_title');
  if (paperTitle === null) {
    throw new Error('OpenReview page has no citation_title meta tag');
  }

  const abst = meta('citation_abstract') ?? 'none';

  const date = meta('citation_online_date');
  if (date === null) {
    throw new Error('OpenReview page has no citation_online_date meta tag');
  }
  const published = toISODate(date);
  const comment = 'none';

  return {
    id: id,
    title: paperTitle,
    abst: abst,
    authors: authors,
    url: url,
    published: published,
    comment: comment,
    publisher: 'OpenReview',
  };
};
99+
100+
/**
 * Scrape paper metadata from an ACL Anthology paper page.
 *
 * @param {string} url - ACL Anthology paper page URL.
 * @returns {Promise<{id: string, title: string, abst: string,
 *   authors: string[], url: string, published: string, comment: string,
 *   publisher: string}>} `published` is a `YYYY-MM-DD` string, `comment`
 *   carries the PDF URL (or 'none'), `abst` is always 'none'.
 * @throws {Error} If the page request fails or the title / date meta tag is
 *   missing.
 */
const aclAnthologyParser = async (url) => {
  // Convert "YYYY", "YYYY/MM" or "YYYY/MM/DD" (also with "-") to YYYY-MM-DD
  // directly from its components; a missing month or day becomes "01".
  // Going through `new Date(...).toISOString()` parses slash-separated dates
  // as local time and then converts to UTC, which moves the date back one day
  // in time zones ahead of UTC.
  const toISODate = (raw) => {
    const parts = raw
      .trim()
      .match(/^(\d{4})(?:[\/-](\d{1,2}))?(?:[\/-](\d{1,2}))?$/);
    if (parts) {
      const [, year, month = '1', day = '1'] = parts;
      return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}`;
    }
    // Anything else (e.g. a full timestamp) keeps the Date-based conversion.
    return new Date(raw).toISOString().split('T')[0];
  };

  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`ACL Anthology request failed (HTTP ${res.status})`);
  }
  const html = await res.text();
  const doc = new DOMParser().parseFromString(html, 'text/html');

  // Content of a <meta name="..."> tag, or null when the tag is absent.
  const meta = (name) =>
    doc.querySelector(`meta[name="${name}"]`)?.getAttribute('content') ?? null;

  // Use the DOI when the page has one; otherwise fall back to the URL path
  // with its surrounding slashes removed.
  const id =
    meta('citation_doi') ?? new URL(url).pathname.replace(/^\/+|\/+$/g, '');

  const authors = Array.from(
    doc.querySelectorAll('meta[name="citation_author"]'),
    (author) => author.getAttribute('content')
  );

  const paperTitle = meta('citation_title');
  if (paperTitle === null) {
    throw new Error('ACL Anthology page has no citation_title meta tag');
  }

  const abst = 'none';

  const date = meta('citation_publication_date');
  if (date === null) {
    throw new Error(
      'ACL Anthology page has no citation_publication_date meta tag'
    );
  }
  const published = toISODate(date);

  // NOTE(review): the 7th <dd> of the details list is taken positionally;
  // which field that is depends on the page layout -- confirm it is the
  // intended one. A generic label is used when the list is shorter.
  const publisherNode = doc.querySelectorAll('.acl-paper-details dd')[6];
  const publisher =
    publisherNode?.textContent.replaceAll('\n', '') ?? 'ACL Anthology';

  const comment = meta('citation_pdf_url') ?? 'none';

  return {
    id: id,
    title: paperTitle,
    abst: abst,
    authors: authors,
    url: url,
    published: published,
    comment: comment,
    publisher: publisher,
  };
};
141+
142+
// Shared registry instance exported to the rest of the extension.
// Handlers are matched by plain string prefix, so every prefix ends with "/"
// to stop at the host boundary: without it, 'https://arxiv.org' would also
// match a look-alike host such as 'https://arxiv.org.example.com/'.
const urlParser = new URLParser();
urlParser.addParser('https://openreview.net/', openReviewParser);
urlParser.addParser('https://arxiv.org/', arXivParser);
urlParser.addParser('https://aclanthology.org/', aclAnthologyParser);

export default urlParser;

src/js/popup.js

Lines changed: 7 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@ import Icons from 'uikit/dist/js/uikit-icons';
77
import Mustache from 'mustache';
88
import NotionClient from './notion.js';
99
import thenChrome from 'then-chrome';
10+
import urlParser from './parsers.js';
1011

1112
UIKit.use(Icons);
1213

1314
const TEST_URL = 'https://arxiv.org/abs/2308.04079';
14-
const ARXIV_API = 'http://export.arxiv.org/api/query/search_query';
15+
// const TEST_URL = 'https://aclanthology.org/2023.ijcnlp-main.1/';
16+
1517
class UI {
1618
constructor() {
1719
this.setupProgressBar();
@@ -97,13 +99,11 @@ class UI {
9799
return url && url.split('.').pop() === 'pdf';
98100
}
99101
async getPaperInfo(url) {
100-
if (this.isArxivUrl(url)) return this.getArXivInfo(url);
101-
if (this.isOpenReviewUrl(url)) return this.getOpenReviewInfo(url);
102+
this.showProgressBar();
103+
const data = await urlParser.parse(url);
104+
this.setFormContents(data.title, data.abst, data.comment, data.authors);
105+
return data;
102106
}
103-
// ref: https://info.arxiv.org/help/arxiv_identifier.html
104-
// e.g. (new id format: 2404.16782) | (old id format: hep-th/0702063)
105-
parseArXivId = (str) => str.match(/(\d+\.\d+$)|((\w|-)+\/\d+$)/)?.[0];
106-
107107
setFormContents(paperTitle, abst, comment, authors) {
108108
document.getElementById('js-title').value = paperTitle;
109109
document.getElementById('js-abst').value = abst;
@@ -118,87 +118,6 @@ class UI {
118118
});
119119
}
120120

121-
async getArXivInfo(url) {
122-
this.showProgressBar();
123-
const paperId = this.parseArXivId(url);
124-
125-
const res = await fetch(ARXIV_API + '?id_list=' + paperId.toString());
126-
if (res.status != 200) {
127-
console.error('arXiv API request failed');
128-
return;
129-
}
130-
const data = await res.text(); // TODO: error handling
131-
console.log(res.status);
132-
const xmlData = new window.DOMParser().parseFromString(data, 'text/xml');
133-
console.log(xmlData);
134-
135-
const entry = xmlData.querySelector('entry');
136-
const id = this.parseArXivId(entry.querySelector('id')?.textContent);
137-
const paperTitle = entry.querySelector('title').textContent;
138-
const abst = entry.querySelector('summary').textContent;
139-
const authors = Array.from(entry.querySelectorAll('author')).map(
140-
(author) => {
141-
return author.textContent.trim();
142-
}
143-
);
144-
const published = entry.querySelector('published').textContent;
145-
const comment = entry.querySelector('comment')?.textContent ?? 'none';
146-
this.setFormContents(paperTitle, abst, comment, authors);
147-
return {
148-
id: id,
149-
title: paperTitle,
150-
abst: abst,
151-
authors: authors,
152-
url: url,
153-
published: published,
154-
comment: comment,
155-
publisher: 'arXiv',
156-
};
157-
}
158-
159-
async getOpenReviewInfo(url) {
160-
this.showProgressBar();
161-
const id = new URLSearchParams(new URL(url).search).get('id');
162-
163-
const res = await fetch(url);
164-
const html = await res.text();
165-
const parser = new DOMParser();
166-
const xml = parser.parseFromString(html, 'text/html');
167-
168-
const authorsArray = Array.from(
169-
xml.querySelectorAll('meta[name="citation_author"]'),
170-
(author) => author.getAttribute('content')
171-
);
172-
const authors = authorsArray.length ? authorsArray : ['Anonymous'];
173-
174-
const paperTitle = xml
175-
.querySelector('meta[name="citation_title"]')
176-
.getAttribute('content');
177-
178-
const abst = xml
179-
.querySelector('meta[name="citation_abstract"]')
180-
.getAttribute('content');
181-
182-
const date = xml
183-
.querySelector('meta[name="citation_publication_date"]')
184-
.getAttribute('content');
185-
// -> ISO 8601 date string
186-
const published = new Date(date).toISOString().split('T')[0];
187-
const comment = 'none';
188-
189-
this.setFormContents(paperTitle, abst, comment, authors);
190-
return {
191-
id: id,
192-
title: paperTitle,
193-
abst: abst,
194-
authors: authors,
195-
url: url,
196-
published: published,
197-
comment: comment,
198-
publisher: 'OpenReview',
199-
};
200-
}
201-
202121
renderMessage(type, message, overwrite = false) {
203122
// type: warning, danger, success, primary
204123
const template = `<div class="uk-alert-{{type}}" uk-alert><a class="uk-alert-close" uk-close></a><p>{{message}}</p></div>`;

0 commit comments

Comments
 (0)