6 files changed: +47 -3 lines changed
Original file line number | Diff line number | Diff line change
6
6
import json
7
7
import re
8
8
9
+ import jstyleson
9
10
import lxml .etree
10
11
11
12
from extruct .utils import parse_html
@@ -34,8 +35,7 @@ def _extract_items(self, node):
34
35
data = json .loads (script , strict = False )
35
36
except ValueError :
36
37
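# Sometimes JSON decoding fails because of leading HTML or JavaScript comment lines; strip those first, then parse with a decoder that also tolerates inline JS comments and trailing commas.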
37
- data = json .loads (
38
- HTML_OR_JS_COMMENTLINE .sub ('' , script ), strict = False )
38
+ data = jstyleson .loads (HTML_OR_JS_COMMENTLINE .sub ('' , script ),strict = False )
39
39
if isinstance (data , list ):
40
40
return data
41
41
elif isinstance (data , dict ):
Original file line number Diff line number Diff line change @@ -8,3 +8,4 @@ mf2py>=1.1.0
8
8
six >= 1.11
9
9
w3lib
10
10
html-text
11
+ jstyleson
Original file line number Diff line number Diff line change @@ -34,7 +34,9 @@ def get_version():
34
34
'mf2py' ,
35
35
'w3lib' ,
36
36
'html-text>=0.5.1' ,
37
- 'six' ],
37
+ 'six' ,
38
+ 'jstyleson'
39
+ ],
38
40
extras_require = {
39
41
'cli' : [
40
42
'requests' ,
Original file line number Diff line number Diff line change
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <script type="application/ld+json">
6
+
7
+ {
8
+ "@context" : "http://schema.org" ,
9
+ "@type" : "NewsArticle" ,
10
+ "thumbnailUrl" : "https://uc.udn.com.tw/photo/2019/11/11/99/7053890.jpg" ,
11
+ "keywords" : "" ,
12
+ "url" : "https://money.udn.com/money/story/5635/4158094" ,
13
+ "mainEntityOfPage" : "https://money.udn.com/money/story/5635/4158094" ,
14
+ "headline" : "讓AI挑出感興趣 SparkAmplify精準行銷當紅" ,
15
+ "articleSection" : "商情" , // category
16
+ //"interactionCount": ""
17
+ }
18
+
19
+ </script>
20
+ </head>
21
+
22
+ <body> </body>
23
+
24
+ </html>
Original file line number Diff line number Diff line change
1
+ [
2
+ {
3
+ "@context" : "http://schema.org" ,
4
+ "@type" : "NewsArticle" ,
5
+ "thumbnailUrl" : "https://uc.udn.com.tw/photo/2019/11/11/99/7053890.jpg" ,
6
+ "keywords" : "" ,
7
+ "url" : "https://money.udn.com/money/story/5635/4158094" ,
8
+ "mainEntityOfPage" : "https://money.udn.com/money/story/5635/4158094" ,
9
+ "headline" : "讓AI挑出感興趣 SparkAmplify精準行銷當紅" ,
10
+ "articleSection" : "商情"
11
+ }
12
+ ]
Original file line number Diff line number Diff line change @@ -40,6 +40,11 @@ def test_jsonld_with_control_characters_comment(self):
40
40
self .assertJsonLdCorrect (
41
41
folder = 'custom.invalid' ,
42
42
page = 'JSONLD_with_control_characters_comment' )
43
+
44
+ def test_jsonld_with_json_including_js_comment (self ):
45
+ self .assertJsonLdCorrect (
46
+ folder = 'custom.invalid' ,
47
+ page = 'JSONLD_with_JS_comment' )
43
48
44
49
def assertJsonLdCorrect (self , folder , page ):
45
50
body , expected = self ._get_body_expected (folder , page )
You can’t perform that action at this time.
0 commit comments