From 38a1c9f1b7d8fb859332cfd576ad3a8a5ff2ca78 Mon Sep 17 00:00:00 2001 From: naveen Date: Sun, 9 Jan 2022 19:18:03 +0530 Subject: [PATCH 1/7] Fix for jsonld --- extruct/jsonld.py | 22 ++++++++++++++----- .../JSONLD_valid_and_invalid.html | 16 ++++++++++++++ .../JSONLD_valid_and_invalid.jsonld | 1 + tests/test_jsonld.py | 6 +++++ 4 files changed, 39 insertions(+), 6 deletions(-) create mode 100644 tests/samples/custom.invalid/JSONLD_valid_and_invalid.html create mode 100644 tests/samples/custom.invalid/JSONLD_valid_and_invalid.jsonld diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 75b04a87..178272af 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -28,14 +28,24 @@ def extract_items(self, document, base_url=None): if items for item in items if item ] + def _is_valid_json(self, script): + try: + json.loads(script) + return True + except Exception: + return False + def _extract_items(self, node): script = node.xpath('string()') - try: - # TODO: `strict=False` can be configurable if needed - data = json.loads(script, strict=False) - except ValueError: - # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = jstyleson.loads(HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) + # check if valid json. + if not self._is_valid_json(script): + script = jstyleson.dispose( HTML_OR_JS_COMMENTLINE.sub('', script)) + # After processing check if json is still valid. + if not self._is_valid_json(script): + return False + + # if its valid then process the data. + data = json.loads(script, strict=False) if isinstance(data, list): for item in data: yield item diff --git a/tests/samples/custom.invalid/JSONLD_valid_and_invalid.html b/tests/samples/custom.invalid/JSONLD_valid_and_invalid.html new file mode 100644 index 00000000..28320809 --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_valid_and_invalid.html @@ -0,0 +1,16 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/samples/custom.invalid/JSONLD_valid_and_invalid.jsonld b/tests/samples/custom.invalid/JSONLD_valid_and_invalid.jsonld new file mode 100644 index 00000000..e17b54c6 --- /dev/null +++ b/tests/samples/custom.invalid/JSONLD_valid_and_invalid.jsonld @@ -0,0 +1 @@ +[ {"foo" : "bar"} ] \ No newline at end of file diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index 6edc2877..4d48d416 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -18,6 +18,12 @@ def test_songkick(self): 'Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015' ) + def test_when_page_has_invalid_jsonld_elements_should_skip(self): + self.assertJsonLdCorrect( + folder='custom.invalid', + page='JSONLD_valid_and_invalid' + ) + def test_jsonld_empty_item(self): self.assertJsonLdCorrect( folder='songkick', From 0b449e1a26ff047da39520296a53cff011f57d58 Mon Sep 17 00:00:00 2001 From: naveen Date: Sun, 9 Jan 2022 19:22:35 +0530 Subject: [PATCH 2/7] Ignore invalid jsonld --- extruct/jsonld.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 178272af..8d9d41a1 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -28,20 +28,21 @@ def extract_items(self, document, base_url=None): if items for item in items if item ] - def _is_valid_json(self, script): + def _may_be_get_json(self, script): try: - json.loads(script) - return True + return json.loads(script, strict=False) except Exception: return False def _extract_items(self, node): script = node.xpath('string()') + data = self._may_be_get_json(script) # check if valid json. - if not self._is_valid_json(script): + if not data: script = jstyleson.dispose( HTML_OR_JS_COMMENTLINE.sub('', script)) + data = self._may_be_get_json(script) # After processing check if json is still valid. - if not self._is_valid_json(script): + if not data: return False # if its valid then process the data. From edcaa8b0d615532c3d6705da9b3ebca181ae7d02 Mon Sep 17 00:00:00 2001 From: naveen Date: Sun, 9 Jan 2022 19:26:37 +0530 Subject: [PATCH 3/7] Remove repeated line --- extruct/jsonld.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 8d9d41a1..1fc8b449 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -45,8 +45,6 @@ def _extract_items(self, node): if not data: return False - # if its valid then process the data. - data = json.loads(script, strict=False) if isinstance(data, list): for item in data: yield item From 8bd2b16b9178aef58f45969968159fe63979381a Mon Sep 17 00:00:00 2001 From: naveen Date: Sun, 9 Jan 2022 20:11:54 +0530 Subject: [PATCH 4/7] fix --- extruct/jsonld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 1fc8b449..1f979f08 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -40,7 +40,7 @@ def _extract_items(self, node): # check if valid json. if not data: script = jstyleson.dispose( HTML_OR_JS_COMMENTLINE.sub('', script)) - data = self._may_be_get_json(script) + data = self._may_be_get_json(script) # After processing check if json is still valid. if not data: return False From 346f6b924e5f3c55fcf490015cbb9425f94932e7 Mon Sep 17 00:00:00 2001 From: naveen Date: Tue, 11 Jan 2022 05:36:22 +0530 Subject: [PATCH 5/7] initial fixes --- extruct/jsonld.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 1f979f08..c62263b3 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -32,18 +32,19 @@ def _may_be_get_json(self, script): try: return json.loads(script, strict=False) except Exception: - return False + return None def _extract_items(self, node): script = node.xpath('string()') data = self._may_be_get_json(script) # check if valid json. if not data: - script = jstyleson.dispose( HTML_OR_JS_COMMENTLINE.sub('', script)) + # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments + script = jstyleson.dispose(HTML_OR_JS_COMMENTLINE.sub('', script)) data = self._may_be_get_json(script) # After processing check if json is still valid. if not data: - return False + return if isinstance(data, list): for item in data: From 1625503ce5f14272456544b2d6aa8bbde822eb78 Mon Sep 17 00:00:00 2001 From: naveen Date: Tue, 11 Jan 2022 05:40:02 +0530 Subject: [PATCH 6/7] Alter the test for more clarity --- tests/samples/custom.invalid/JSONLD_valid_and_invalid.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/samples/custom.invalid/JSONLD_valid_and_invalid.html b/tests/samples/custom.invalid/JSONLD_valid_and_invalid.html index 28320809..9efd3470 100644 --- a/tests/samples/custom.invalid/JSONLD_valid_and_invalid.html +++ b/tests/samples/custom.invalid/JSONLD_valid_and_invalid.html @@ -3,7 +3,7 @@