Skip to content

Commit bf8219b

Browse files
authored
Merge pull request #148 from scrapinghub/microdata-fast-doc-id
Fast get_docid for microdata parser (fixes GH-147)
2 parents 205ee73 + 6f2e2d2 commit bf8219b

File tree

1 file changed

+32
-26
lines changed

1 file changed

+32
-26
lines changed

extruct/w3cmicrodata.py

Lines changed: 32 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -45,40 +45,44 @@
4545

4646

4747
class LxmlMicrodataExtractor(object):
48+
# iterate in document order (used below for fast get_docid)
4849
_xp_item = lxml.etree.XPath('descendant-or-self::*[@itemscope]')
4950
_xp_prop = lxml.etree.XPath("""set:difference(.//*[@itemprop],
5051
.//*[@itemscope]//*[@itemprop])""",
5152
namespaces = {"set": "http://exslt.org/sets"})
5253
_xp_clean_text = lxml.etree.XPath('descendant-or-self::*[not(self::script or self::style)]/text()')
53-
# ancestor and preceding axes contain all elements before the context node
54-
# so counting them gives the "document order" of the context node
55-
_xp_item_docid = lxml.etree.XPath("""count(preceding::*[@itemscope])
56-
+ count(ancestor::*[@itemscope])
57-
+ 1""")
5854

5955
def __init__(self, nested=True, strict=False, add_text_content=False, add_html_node=False):
6056
self.nested = nested
6157
self.strict = strict
6258
self.add_text_content = add_text_content
6359
self.add_html_node = add_html_node
6460

65-
def get_docid(self, node):
66-
return int(self._xp_item_docid(node))
67-
6861
def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
6962
tree = parse_html(htmlstring, encoding=encoding)
7063
return self.extract_items(tree, base_url)
7164

7265
def extract_items(self, document, base_url):
66+
itemids = self._build_itemids(document)
7367
items_seen = set()
7468
return [
7569
item for item in (
76-
self._extract_item(it, items_seen=items_seen, base_url=base_url)
70+
self._extract_item(
71+
it, items_seen=items_seen, base_url=base_url, itemids=itemids)
7772
for it in self._xp_item(document))
7873
if item]
7974

80-
def _extract_item(self, node, items_seen, base_url):
81-
itemid = self.get_docid(node)
75+
def get_docid(self, node, itemids):
76+
return itemids[node]
77+
78+
def _build_itemids(self, document):
79+
""" Build itemids for a fast get_docid implementation. Use document order.
80+
"""
81+
root = document.getroottree().getroot()
82+
return {node: idx + 1 for idx, node in enumerate(self._xp_item(root))}
83+
84+
def _extract_item(self, node, items_seen, base_url, itemids):
85+
itemid = self.get_docid(node, itemids)
8286

8387
if self.nested:
8488
if itemid in items_seen:
@@ -95,21 +99,22 @@ def _extract_item(self, node, items_seen, base_url):
9599
else:
96100
item["type"] = types
97101

98-
itemid = node.get('itemid')
99-
if itemid:
100-
item["id"] = itemid.strip()
102+
nodeid = node.get('itemid')
103+
if nodeid:
104+
item["id"] = nodeid.strip()
101105

102106
properties = collections.defaultdict(list)
103107
for name, value in self._extract_properties(
104-
node, items_seen=items_seen, base_url=base_url):
108+
node, items_seen=items_seen, base_url=base_url, itemids=itemids):
105109
properties[name].append(value)
106110

107111
# process item references
108112
refs = node.get('itemref', '').split()
109113
if refs:
110114
for refid in refs:
111115
for name, value in self._extract_property_refs(
112-
node, refid, items_seen=items_seen, base_url=base_url):
116+
node, refid, items_seen=items_seen, base_url=base_url,
117+
itemids=itemids):
113118
properties[name].append(value)
114119

115120
props = []
@@ -123,7 +128,8 @@ def _extract_item(self, node, items_seen, base_url):
123128
else:
124129
# item without properties; let's use the node itself
125130
item["value"] = self._extract_property_value(
126-
node, force=True, items_seen=items_seen, base_url=base_url)
131+
node, force=True, items_seen=items_seen, base_url=base_url,
132+
itemids=itemids)
127133

128134
# below are not in the specs, but can be handy
129135
if self.add_text_content:
@@ -135,19 +141,19 @@ def _extract_item(self, node, items_seen, base_url):
135141

136142
return item
137143

138-
def _extract_properties(self, node, items_seen, base_url):
144+
def _extract_properties(self, node, items_seen, base_url, itemids):
139145
for prop in self._xp_prop(node):
140146
for p, v in self._extract_property(
141-
prop, items_seen=items_seen, base_url=base_url):
147+
prop, items_seen=items_seen, base_url=base_url, itemids=itemids):
142148
yield p, v
143149

144-
def _extract_property_refs(self, node, refid, items_seen, base_url):
150+
def _extract_property_refs(self, node, refid, items_seen, base_url, itemids):
145151
ref_node = node.xpath("id($refid)[1]", refid=refid)
146152
if not ref_node:
147153
return
148154
ref_node = ref_node[0]
149155
extract_fn = partial(self._extract_property, items_seen=items_seen,
150-
base_url=base_url)
156+
base_url=base_url, itemids=itemids)
151157
if 'itemprop' in ref_node.keys() and 'itemscope' in ref_node.keys():
152158
# A full item will be extracted from the node, no need to look
153159
# for individual properties in child nodes
@@ -162,20 +168,20 @@ def _extract_property_refs(self, node, refid, items_seen, base_url):
162168
for p, v in extract_fn(prop):
163169
yield p, v
164170

165-
def _extract_property(self, node, items_seen, base_url):
171+
def _extract_property(self, node, items_seen, base_url, itemids):
166172
props = node.get("itemprop").split()
167173
value = self._extract_property_value(
168-
node, items_seen=items_seen, base_url=base_url)
174+
node, items_seen=items_seen, base_url=base_url, itemids=itemids)
169175
return [(p, value) for p in props]
170176

171-
def _extract_property_value(self, node, items_seen, base_url, force=False):
177+
def _extract_property_value(self, node, items_seen, base_url, itemids, force=False):
172178
#http://www.w3.org/TR/microdata/#values
173179
if not force and node.get("itemscope") is not None:
174180
if self.nested:
175181
return self._extract_item(
176-
node, items_seen=items_seen, base_url=base_url)
182+
node, items_seen=items_seen, base_url=base_url, itemids=itemids)
177183
else:
178-
return {"iid_ref": self.get_docid(node)}
184+
return {"iid_ref": self.get_docid(node, itemids)}
179185

180186
elif node.tag == "meta":
181187
return node.get("content", "")

0 commit comments

Comments
 (0)