Skip to content

Commit a64ce58

Browse files
authored
Merge pull request #138 from ragnerok/og-array
Added support for Open Graph arrays
2 parents f66c825 + be85256 commit a64ce58

File tree

3 files changed

+71
-8
lines changed

3 files changed

+71
-8
lines changed

extruct/_extruct.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def extract(htmlstring,
2121
uniform=False,
2222
return_html_node=False,
2323
schema_context='http://schema.org',
24+
with_og_array=False,
2425
**kwargs):
2526
"""htmlstring: string with valid html document;
2627
base_url: base url of the html document
@@ -134,7 +135,7 @@ def extract(htmlstring,
134135
for syntax, uniform, raw, schema_context in uniform_processors:
135136
try:
136137
if syntax == 'opengraph':
137-
output[syntax] = uniform(raw)
138+
output[syntax] = uniform(raw, with_og_array=with_og_array)
138139
else:
139140
output[syntax] = uniform(raw, schema_context)
140141
except Exception as e:

extruct/uniform.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,29 @@
11
from six.moves.urllib.parse import urlparse, urljoin
22

33

4-
def _uopengraph(extracted):
4+
def _uopengraph(extracted, with_og_array=False):
55
out = []
66
for obj in extracted:
77
# In order of appearance in the page
8-
properties = list(reversed(obj['properties']))
9-
# Ensuring that never empty value is returned if there is a duplicated
10-
# property with non empty value
11-
non_empty_props = {k for k, v in properties if v and v.strip()}
12-
flattened = {k: v for k, v in properties
13-
if k not in non_empty_props or (v and v.strip())}
8+
properties = list(obj['properties'])
9+
flattened = {}
10+
11+
for k, v in properties:
12+
if k not in flattened.keys():
13+
flattened[k] = v
14+
elif v and v.strip():
15+
# If with_og_array is not set, keep only the first non-empty value
16+
if not with_og_array:
17+
if not flattened[k] or not flattened[k].strip():
18+
flattened[k] = v
19+
else:
20+
if isinstance(flattened[k], list):
21+
flattened[k].append(v)
22+
elif flattened[k] and flattened[k].strip():
23+
flattened[k] = [flattened[k], v]
24+
else:
25+
flattened[k] = v
26+
1427
t = flattened.pop('og:type', None)
1528
if t:
1629
flattened['@type'] = t

tests/test_uniform.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,25 @@ def test_uopengraph(self):
2727
data = extruct.extract(body, syntaxes=['opengraph'], uniform=True)
2828
self.assertEqual(data['opengraph'], expected)
2929

30+
def test_uopengraph_with_og_array(self):
31+
expected = [{"@context": {
32+
"og": "http://ogp.me/ns#",
33+
"fb": "http://www.facebook.com/2008/fbml",
34+
"concerts": "http://ogp.me/ns/fb/songkick-concerts#"
35+
},
36+
"fb:app_id": "308540029359",
37+
"og:site_name": "Songkick",
38+
"@type": "songkick-concerts:artist",
39+
"og:title": "Elysian Fields",
40+
"og:description": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.",
41+
"og:url": "http://www.songkick.com/artists/236156-elysian-fields",
42+
"og:image": [ "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg",
43+
"http://images.sk-static.com/SECONDARY_IMAGE.jpg"],
44+
}]
45+
body = get_testdata('songkick', 'elysianfields.html')
46+
data = extruct.extract(body, syntaxes=['opengraph'], uniform=True, with_og_array=True)
47+
self.assertEqual(data['opengraph'], expected)
48+
3049
def test_uopengraph_duplicated_priorities(self):
3150
# Ensures that first seen property is kept when flattening
3251
data = _uopengraph([{'properties':
@@ -58,6 +77,36 @@ def test_uopengraph_duplicated_priorities(self):
5877
assert data[0]['prop_non_empty2'] == 'value!'
5978
assert data[0]['prop_non_empty3'] == 'value!'
6079

80+
def test_uopengraph_duplicated_with_og_array(self):
81+
# Ensures that all non-empty values of a duplicated property are collected into a list, in order of appearance
82+
data = _uopengraph([{'properties':
83+
[('prop_{}'.format(k), 'value_{}'.format(v))
84+
for k in range(5)
85+
for v in range(5)],
86+
'namespace': 'namespace'}], with_og_array=True)
87+
for k in range(5):
88+
assert data[0]['prop_{}'.format(k)] == ['value_0', 'value_1', 'value_2', 'value_3', 'value_4']
89+
90+
# Ensures that an empty value is not returned if a property has any
91+
# non-empty value, and that a single non-empty value stays a plain string
92+
data = _uopengraph([{'properties':
93+
[('prop_empty', ' '),
94+
95+
('prop_non_empty', ' '),
96+
('prop_non_empty', 'value!'),
97+
98+
('prop_non_empty2', 'value!'),
99+
('prop_non_empty2', ' '),
100+
101+
('prop_non_empty3', ' '),
102+
('prop_non_empty3', 'value!'),
103+
('prop_non_empty3', 'other value'),
104+
],
105+
'namespace': 'namespace'}], with_og_array=True)
106+
assert data[0]['prop_empty'] == ' '
107+
assert data[0]['prop_non_empty'] == 'value!'
108+
assert data[0]['prop_non_empty2'] == 'value!'
109+
assert data[0]['prop_non_empty3'] == ['value!', 'other value']
61110

62111
def test_umicroformat(self):
63112
expected = [ { '@context': 'http://microformats.org/wiki/',

0 commit comments

Comments
 (0)