45
45
46
46
47
47
class LxmlMicrodataExtractor (object ):
48
+ # iterate in document order (used below for fast get_docid)
48
49
_xp_item = lxml .etree .XPath ('descendant-or-self::*[@itemscope]' )
49
50
_xp_prop = lxml .etree .XPath ("""set:difference(.//*[@itemprop],
50
51
.//*[@itemscope]//*[@itemprop])""" ,
51
52
namespaces = {"set" : "http://exslt.org/sets" })
52
53
_xp_clean_text = lxml .etree .XPath ('descendant-or-self::*[not(self::script or self::style)]/text()' )
53
- # ancestor and preceding axes contain all elements before the context node
54
- # so counting them gives the "document order" of the context node
55
- _xp_item_docid = lxml .etree .XPath ("""count(preceding::*[@itemscope])
56
- + count(ancestor::*[@itemscope])
57
- + 1""" )
58
54
59
55
def __init__ (self , nested = True , strict = False , add_text_content = False , add_html_node = False ):
60
56
self .nested = nested
61
57
self .strict = strict
62
58
self .add_text_content = add_text_content
63
59
self .add_html_node = add_html_node
64
60
65
- def get_docid (self , node ):
66
- return int (self ._xp_item_docid (node ))
67
-
68
61
def extract (self , htmlstring , base_url = None , encoding = "UTF-8" ):
69
62
tree = parse_html (htmlstring , encoding = encoding )
70
63
return self .extract_items (tree , base_url )
71
64
72
65
def extract_items (self , document , base_url ):
66
+ itemids = self ._build_itemids (document )
73
67
items_seen = set ()
74
68
return [
75
69
item for item in (
76
- self ._extract_item (it , items_seen = items_seen , base_url = base_url )
70
+ self ._extract_item (
71
+ it , items_seen = items_seen , base_url = base_url , itemids = itemids )
77
72
for it in self ._xp_item (document ))
78
73
if item ]
79
74
80
- def _extract_item (self , node , items_seen , base_url ):
81
- itemid = self .get_docid (node )
75
+ def get_docid (self , node , itemids ):
76
+ return itemids [node ]
77
+
78
+ def _build_itemids (self , document ):
79
+ """ Build itemids for a fast get_docid implementation. Use document order.
80
+ """
81
+ root = document .getroottree ().getroot ()
82
+ return {node : idx + 1 for idx , node in enumerate (self ._xp_item (root ))}
83
+
84
+ def _extract_item (self , node , items_seen , base_url , itemids ):
85
+ itemid = self .get_docid (node , itemids )
82
86
83
87
if self .nested :
84
88
if itemid in items_seen :
@@ -95,21 +99,22 @@ def _extract_item(self, node, items_seen, base_url):
95
99
else :
96
100
item ["type" ] = types
97
101
98
- itemid = node .get ('itemid' )
99
- if itemid :
100
- item ["id" ] = itemid .strip ()
102
+ nodeid = node .get ('itemid' )
103
+ if nodeid :
104
+ item ["id" ] = nodeid .strip ()
101
105
102
106
properties = collections .defaultdict (list )
103
107
for name , value in self ._extract_properties (
104
- node , items_seen = items_seen , base_url = base_url ):
108
+ node , items_seen = items_seen , base_url = base_url , itemids = itemids ):
105
109
properties [name ].append (value )
106
110
107
111
# process item references
108
112
refs = node .get ('itemref' , '' ).split ()
109
113
if refs :
110
114
for refid in refs :
111
115
for name , value in self ._extract_property_refs (
112
- node , refid , items_seen = items_seen , base_url = base_url ):
116
+ node , refid , items_seen = items_seen , base_url = base_url ,
117
+ itemids = itemids ):
113
118
properties [name ].append (value )
114
119
115
120
props = []
@@ -123,7 +128,8 @@ def _extract_item(self, node, items_seen, base_url):
123
128
else :
124
129
# item without properties; let's use the node itself
125
130
item ["value" ] = self ._extract_property_value (
126
- node , force = True , items_seen = items_seen , base_url = base_url )
131
+ node , force = True , items_seen = items_seen , base_url = base_url ,
132
+ itemids = itemids )
127
133
128
134
# below are not in the specs, but can be handy
129
135
if self .add_text_content :
@@ -135,19 +141,19 @@ def _extract_item(self, node, items_seen, base_url):
135
141
136
142
return item
137
143
138
- def _extract_properties (self , node , items_seen , base_url ):
144
+ def _extract_properties (self , node , items_seen , base_url , itemids ):
139
145
for prop in self ._xp_prop (node ):
140
146
for p , v in self ._extract_property (
141
- prop , items_seen = items_seen , base_url = base_url ):
147
+ prop , items_seen = items_seen , base_url = base_url , itemids = itemids ):
142
148
yield p , v
143
149
144
- def _extract_property_refs (self , node , refid , items_seen , base_url ):
150
+ def _extract_property_refs (self , node , refid , items_seen , base_url , itemids ):
145
151
ref_node = node .xpath ("id($refid)[1]" , refid = refid )
146
152
if not ref_node :
147
153
return
148
154
ref_node = ref_node [0 ]
149
155
extract_fn = partial (self ._extract_property , items_seen = items_seen ,
150
- base_url = base_url )
156
+ base_url = base_url , itemids = itemids )
151
157
if 'itemprop' in ref_node .keys () and 'itemscope' in ref_node .keys ():
152
158
# An full item will be extracted from the node, no need to look
153
159
# for individual properties in child nodes
@@ -162,20 +168,20 @@ def _extract_property_refs(self, node, refid, items_seen, base_url):
162
168
for p , v in extract_fn (prop ):
163
169
yield p , v
164
170
165
- def _extract_property (self , node , items_seen , base_url ):
171
+ def _extract_property (self , node , items_seen , base_url , itemids ):
166
172
props = node .get ("itemprop" ).split ()
167
173
value = self ._extract_property_value (
168
- node , items_seen = items_seen , base_url = base_url )
174
+ node , items_seen = items_seen , base_url = base_url , itemids = itemids )
169
175
return [(p , value ) for p in props ]
170
176
171
- def _extract_property_value (self , node , items_seen , base_url , force = False ):
177
+ def _extract_property_value (self , node , items_seen , base_url , itemids , force = False ):
172
178
#http://www.w3.org/TR/microdata/#values
173
179
if not force and node .get ("itemscope" ) is not None :
174
180
if self .nested :
175
181
return self ._extract_item (
176
- node , items_seen = items_seen , base_url = base_url )
182
+ node , items_seen = items_seen , base_url = base_url , itemids = itemids )
177
183
else :
178
- return {"iid_ref" : self .get_docid (node )}
184
+ return {"iid_ref" : self .get_docid (node , itemids )}
179
185
180
186
elif node .tag == "meta" :
181
187
return node .get ("content" , "" )
0 commit comments