4
4
5
5
Based on pyrdfa3 and rdflib
6
6
"""
7
+ from collections import defaultdict
8
+
7
9
import json
8
10
import logging
11
+ import re
9
12
10
13
rdflib_logger = logging .getLogger ('rdflib' )
11
14
rdflib_logger .setLevel (logging .ERROR )
35
38
36
39
class RDFaExtractor (object ):
37
40
41
+ def _replaceNS (self , prop , html_element , head_element ):
42
+ """Expand namespace to match with returned json (e.g.: og -> 'http://ogp.me/ns#')"""
43
+
44
+ # context namespaces taken from pyrdfa3
45
+ # https://github.com/RDFLib/PyRDFa/blob/master/pyRdfa/initialcontext.py
46
+ context = {
47
+ 'owl' : 'http://www.w3.org/2002/07/owl#' ,
48
+ 'gr' : 'http://purl.org/goodrelations/v1#' ,
49
+ 'ctag' : 'http://commontag.org/ns#' ,
50
+ 'cc' : 'http://creativecommons.org/ns#' ,
51
+ 'grddl' : 'http://www.w3.org/2003/g/data-view#' ,
52
+ 'rif' : 'http://www.w3.org/2007/rif#' ,
53
+ 'sioc' : 'http://rdfs.org/sioc/ns#' ,
54
+ 'skos' : 'http://www.w3.org/2004/02/skos/core#' ,
55
+ 'xml' : 'http://www.w3.org/XML/1998/namespace' ,
56
+ 'rdfs' : 'http://www.w3.org/2000/01/rdf-schema#' ,
57
+ 'rev' : 'http://purl.org/stuff/rev#' ,
58
+ 'rdfa' : 'http://www.w3.org/ns/rdfa#' ,
59
+ 'dc' : 'http://purl.org/dc/terms/' ,
60
+ 'foaf' : 'http://xmlns.com/foaf/0.1/' ,
61
+ 'void' : 'http://rdfs.org/ns/void#' ,
62
+ 'ical' : 'http://www.w3.org/2002/12/cal/icaltzd#' ,
63
+ 'vcard' : 'http://www.w3.org/2006/vcard/ns#' ,
64
+ 'wdrs' : 'http://www.w3.org/2007/05/powder-s#' ,
65
+ 'og' : 'http://ogp.me/ns#' ,
66
+ 'wdr' : 'http://www.w3.org/2007/05/powder#' ,
67
+ 'rdf' : 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' ,
68
+ 'xhv' : 'http://www.w3.org/1999/xhtml/vocab#' ,
69
+ 'xsd' : 'http://www.w3.org/2001/XMLSchema#' ,
70
+ 'v' : 'http://rdf.data-vocabulary.org/#' ,
71
+ 'skosxl' : 'http://www.w3.org/2008/05/skos-xl#' ,
72
+ 'schema' : 'http://schema.org/' ,
73
+ }
74
+
75
+ # if bad property
76
+ if ':' not in prop :
77
+ return prop
78
+
79
+ # if property has no prefix
80
+ if 'http://' in prop :
81
+ return prop
82
+
83
+ prefix = prop .split (':' )[0 ]
84
+
85
+ match = None
86
+ if head_element .get ('prefix' ):
87
+ match = re .search (prefix + ': [^\s]+' , head_element .get ('prefix' ))
88
+
89
+ # if namespace taken from prefix attribute in head tag
90
+ if match :
91
+ ns = match .group ().split (': ' )[1 ]
92
+ return ns + prop .split (':' )[1 ]
93
+
94
+ # if namespace taken from xmlns attribute in html tag
95
+ if ('xmlns:' + prefix ) in html_element .keys ():
96
+ return html_element .get ('xmlns:' + prefix ) + prop .split (':' )[1 ]
97
+
98
+ # if namespace present in inital context
99
+ if prefix in context :
100
+ return context [prefix ] + prop .split (':' )[1 ]
101
+
102
+ return prop
103
+
104
+ def _sort (self , unordered , ordered ):
105
+ """Sort the rdfa tags in jsonld string"""
106
+ idx_for_value = dict (reversed ([(value , idx ) for idx , value in enumerate (ordered )]))
107
+ unordered .sort (key = lambda props : idx_for_value .get (props .get ('@value' ), len (ordered )))
108
+
109
+
110
+ def _fix_order (self , jsonld_string , document ):
111
+ """
112
+ Fix order of rdfa tags in jsonld string
113
+ by checking the appearance order in the HTML
114
+ """
115
+ json_objects = json .loads (jsonld_string )
116
+
117
+ html , head = document .xpath ('/html' ), document .xpath ('//head' )
118
+ if not html or not head :
119
+ return json_objects
120
+ html_element , head_element = html [0 ], head [0 ]
121
+
122
+ # Stores the values or each property in appearance order
123
+ values_for_property = defaultdict (list )
124
+
125
+ for meta_tag in head_element .xpath ("meta[@property]" ):
126
+ expanded_property = self ._replaceNS (meta_tag .attrib ['property' ],
127
+ html_element ,
128
+ head_element )
129
+ values_for_property [expanded_property ].append (meta_tag .get ('content' ))
130
+
131
+ for json_object in json_objects :
132
+ keys = json_object .keys ()
133
+
134
+ for key in keys :
135
+ if type (json_object [key ]) is list and len (json_object [key ]) > 1 :
136
+ self ._sort (json_object [key ], values_for_property [key ])
137
+
138
+ return json_objects
139
+
38
140
def extract (self , htmlstring , base_url = None , encoding = "UTF-8" ,
39
141
expanded = True ):
40
142
tree = parse_xmldom_html (htmlstring , encoding = encoding )
@@ -51,4 +153,10 @@ def extract_items(self, document, base_url=None, expanded=True):
51
153
check_lite = False )
52
154
g = PyRdfa (options , base = base_url ).graph_from_DOM (document , graph = Graph (), pgraph = Graph ())
53
155
jsonld_string = g .serialize (format = 'json-ld' , auto_compact = not expanded ).decode ('utf-8' )
54
- return json .loads (jsonld_string )
156
+
157
+ try :
158
+ # hack to fix the ordering of multi-value properties (see issue 116)
159
+ # it should be disabled once PyRDFA fixes itself
160
+ return self ._fix_order (jsonld_string , document )
161
+ except :
162
+ return json .loads (jsonld_string )
0 commit comments