Skip to content

Commit bd49a5f

Browse files
authored
Merge pull request #139 from adityas114/master
Fix for #116
2 parents a64ce58 + 78c76a3 commit bd49a5f

File tree

6 files changed

+1704
-14
lines changed

6 files changed

+1704
-14
lines changed

extruct/rdfa.py

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@
44
55
Based on pyrdfa3 and rdflib
66
"""
7+
from collections import defaultdict
8+
79
import json
810
import logging
11+
import re
912

1013
rdflib_logger = logging.getLogger('rdflib')
1114
rdflib_logger.setLevel(logging.ERROR)
@@ -35,6 +38,105 @@
3538

3639
class RDFaExtractor(object):
3740

41+
def _replaceNS(self, prop, html_element, head_element):
42+
"""Expand namespace to match with returned json (e.g.: og -> 'http://ogp.me/ns#')"""
43+
44+
# context namespaces taken from pyrdfa3
45+
# https://github.com/RDFLib/PyRDFa/blob/master/pyRdfa/initialcontext.py
46+
context = {
47+
'owl' : 'http://www.w3.org/2002/07/owl#',
48+
'gr' : 'http://purl.org/goodrelations/v1#',
49+
'ctag' : 'http://commontag.org/ns#',
50+
'cc' : 'http://creativecommons.org/ns#',
51+
'grddl' : 'http://www.w3.org/2003/g/data-view#',
52+
'rif' : 'http://www.w3.org/2007/rif#',
53+
'sioc' : 'http://rdfs.org/sioc/ns#',
54+
'skos' : 'http://www.w3.org/2004/02/skos/core#',
55+
'xml' : 'http://www.w3.org/XML/1998/namespace',
56+
'rdfs' : 'http://www.w3.org/2000/01/rdf-schema#',
57+
'rev' : 'http://purl.org/stuff/rev#',
58+
'rdfa' : 'http://www.w3.org/ns/rdfa#',
59+
'dc' : 'http://purl.org/dc/terms/',
60+
'foaf' : 'http://xmlns.com/foaf/0.1/',
61+
'void' : 'http://rdfs.org/ns/void#',
62+
'ical' : 'http://www.w3.org/2002/12/cal/icaltzd#',
63+
'vcard' : 'http://www.w3.org/2006/vcard/ns#',
64+
'wdrs' : 'http://www.w3.org/2007/05/powder-s#',
65+
'og' : 'http://ogp.me/ns#',
66+
'wdr' : 'http://www.w3.org/2007/05/powder#',
67+
'rdf' : 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
68+
'xhv' : 'http://www.w3.org/1999/xhtml/vocab#',
69+
'xsd' : 'http://www.w3.org/2001/XMLSchema#',
70+
'v' : 'http://rdf.data-vocabulary.org/#',
71+
'skosxl' : 'http://www.w3.org/2008/05/skos-xl#',
72+
'schema' : 'http://schema.org/',
73+
}
74+
75+
# if bad property
76+
if ':' not in prop:
77+
return prop
78+
79+
# if property has no prefix
80+
if 'http://' in prop:
81+
return prop
82+
83+
prefix = prop.split(':')[0]
84+
85+
match = None
86+
if head_element.get('prefix'):
87+
match = re.search(prefix + ': [^\s]+', head_element.get('prefix'))
88+
89+
# if namespace taken from prefix attribute in head tag
90+
if match:
91+
ns = match.group().split(': ')[1]
92+
return ns + prop.split(':')[1]
93+
94+
# if namespace taken from xmlns attribute in html tag
95+
if ('xmlns:' + prefix) in html_element.keys():
96+
return html_element.get('xmlns:' + prefix) + prop.split(':')[1]
97+
98+
# if namespace present in inital context
99+
if prefix in context:
100+
return context[prefix] + prop.split(':')[1]
101+
102+
return prop
103+
104+
def _sort(self, unordered, ordered):
105+
"""Sort the rdfa tags in jsonld string"""
106+
idx_for_value = dict(reversed([(value, idx) for idx, value in enumerate(ordered)]))
107+
unordered.sort(key=lambda props: idx_for_value.get(props.get('@value'), len(ordered)))
108+
109+
110+
def _fix_order(self, jsonld_string, document):
111+
"""
112+
Fix order of rdfa tags in jsonld string
113+
by checking the appearance order in the HTML
114+
"""
115+
json_objects = json.loads(jsonld_string)
116+
117+
html, head = document.xpath('/html'), document.xpath('//head')
118+
if not html or not head:
119+
return json_objects
120+
html_element, head_element = html[0], head[0]
121+
122+
# Stores the values or each property in appearance order
123+
values_for_property = defaultdict(list)
124+
125+
for meta_tag in head_element.xpath("meta[@property]"):
126+
expanded_property = self._replaceNS(meta_tag.attrib['property'],
127+
html_element,
128+
head_element)
129+
values_for_property[expanded_property].append(meta_tag.get('content'))
130+
131+
for json_object in json_objects:
132+
keys = json_object.keys()
133+
134+
for key in keys:
135+
if type(json_object[key]) is list and len(json_object[key]) > 1:
136+
self._sort(json_object[key], values_for_property[key])
137+
138+
return json_objects
139+
38140
def extract(self, htmlstring, base_url=None, encoding="UTF-8",
39141
expanded=True):
40142
tree = parse_xmldom_html(htmlstring, encoding=encoding)
@@ -51,4 +153,10 @@ def extract_items(self, document, base_url=None, expanded=True):
51153
check_lite=False)
52154
g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
53155
jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8')
54-
return json.loads(jsonld_string)
156+
157+
try:
158+
# hack to fix the ordering of multi-value properties (see issue 116)
159+
# it should be disabled once PyRDFA fixes itself
160+
return self._fix_order(jsonld_string, document)
161+
except:
162+
return json.loads(jsonld_string)

tests/samples/songkick/elysianfields.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,4 +269,4 @@
269269
]
270270
}
271271
]
272-
}
272+
}

0 commit comments

Comments
 (0)