Skip to content

Commit f5351d2

Browse files
authored
Merge pull request #111 from opendata-swiss/feat/enable-multilingual-relations
Feat/enable multilingual relations
2 parents fabdad2 + 9ff08c1 commit f5351d2

File tree

4 files changed: 79 additions (+79) and 20 deletions (-20).

ckanext/dcatapchharvest/dcat_helpers.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,13 @@ def get_langs():
7878
return language_priorities
7979

8080

81+
def localize_by_language_priority(multilang_dict):
82+
for lang in get_langs():
83+
if multilang_dict.get(lang, ''):
84+
return multilang_dict[lang]
85+
return ''
86+
87+
8188
def dataset_uri(dataset_dict, dataset_ref=None):
8289
"""
8390
Returns a URI for the dataset

ckanext/dcatapchharvest/profiles.py

Lines changed: 25 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -67,11 +67,11 @@
6767

6868

6969
class MultiLangProfile(RDFProfile):
70-
def _add_multilang_value(self, subject, predicate, dataset_key=None,
71-
dataset_dict=None,
70+
def _add_multilang_value(self, subject, predicate, key=None,
71+
data_dict=None,
7272
multilang_values=None): # noqa
73-
if not multilang_values and dataset_dict and dataset_key:
74-
multilang_values = dataset_dict.get(dataset_key)
73+
if not multilang_values and data_dict and key:
74+
multilang_values = data_dict.get(key)
7575
if multilang_values:
7676
try:
7777
for key, values in multilang_values.iteritems():
@@ -251,14 +251,23 @@ def _publisher(self, subject, identifier):
251251
return json.dumps(publisher)
252252

253253
def _relations(self, subject):
254-
255254
relations = []
256-
257255
for relation_node in self.g.objects(subject, DCT.relation):
258256
relation = {
259-
'label': self._object_value(relation_node, RDFS.label),
260-
'url': relation_node
257+
'label': self._object_value(
258+
relation_node,
259+
RDFS.label,
260+
multilang=True
261+
),
262+
'url': unicode(relation_node)
261263
}
264+
# If a language has no label, use the label of the highest-prio
265+
# language that does have one, or fall back to the url
266+
fallback = (dh.localize_by_language_priority(relation['label']) or
267+
relation.get('url', ''))
268+
for lang in dh.get_langs():
269+
if not relation['label'][lang]:
270+
relation['label'][lang] = fallback
262271
relations.append(relation)
263272

264273
return relations
@@ -616,9 +625,6 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa
616625

617626
# Relations
618627
dataset_dict['relations'] = self._relations(dataset_ref)
619-
for relation in dataset_dict['relations']:
620-
if relation['label'] == {}:
621-
relation['label'] = str(relation.get('url', ''))
622628

623629
# Temporal
624630
dataset_dict['temporals'] = self._temporals(dataset_ref)
@@ -862,16 +868,20 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # noqa
862868
if dataset_dict.get('relations'):
863869
relations = dataset_dict.get('relations')
864870
for relation in relations:
865-
relation_name = relation['label']
866871
try:
867872
relation_url = dh.uri_to_iri(relation['url'])
868873
except ValueError:
869874
# skip this relation if the URL is invalid
870875
continue
871876

872-
relation = URIRef(relation_url)
873-
g.add((relation, RDFS.label, Literal(relation_name)))
874-
g.add((dataset_ref, DCT.relation, relation))
877+
relation_uriref = URIRef(relation_url)
878+
self._add_multilang_value(
879+
relation_uriref,
880+
RDFS.label,
881+
'label',
882+
relation
883+
)
884+
g.add((dataset_ref, DCT.relation, relation_uriref))
875885

876886
# References
877887
if dataset_dict.get('see_alsos'):

ckanext/dcatapchharvest/tests/fixtures/1901.xml

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,20 @@
66
<rdfs:label>legal_basis</rdfs:label>
77
</rdf:Description>
88
</dct:relation>
9+
<dct:relation>
10+
<rdf:Description rdf:about="https://www.example.org/aaa">
11+
<rdfs:label xml:lang="it">Text for label IT</rdfs:label>
12+
<rdfs:label xml:lang="fr">Text for label FR</rdfs:label>
13+
<rdfs:label xml:lang="de">Text for label DE</rdfs:label>
14+
<rdfs:label xml:lang="en">Text for label EN</rdfs:label>
15+
</rdf:Description>
16+
</dct:relation>
17+
<dct:relation rdf:resource="https://www.example.org/bbb"/>
18+
<dct:relation>
19+
<rdf:Description rdf:about="https://www.example.org/ccc">
20+
<rdfs:label xml:lang="it">Text for label IT</rdfs:label>
21+
</rdf:Description>
22+
</dct:relation>
923
<dcat:distribution>
1024
<dcat:Distribution rdf:about="https://opendata.swiss/dataset/7451e012-64b2-4bbc-af20-a0e2bc61b585/resource/c8ec6ca0-6923-4cf3-92f2-95a10e6f8e25">
1125
<dct:title xml:lang="fr">Annuaire statistique de la Suisse 1901</dct:title>

ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py

Lines changed: 33 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
from rdflib.namespace import RDF
99

1010
from ckanext.dcat.processors import RDFParser
11+
from ckanext.dcatapchharvest.dcat_helpers import get_langs
1112
from ckanext.dcatapchharvest.profiles import (DCAT, DCT)
1213
from ckanext.dcatapchharvest.tests.base_test_classes import BaseParseTest
1314

@@ -16,6 +17,7 @@
1617

1718

1819
class TestSwissDCATAPProfileParsing(BaseParseTest):
20+
languages = get_langs()
1921

2022
def test_rights_license(self):
2123

@@ -51,16 +53,16 @@ def test_dataset_all_fields(self):
5153
extras = self._extras(dataset)
5254

5355
# Basic fields
54-
assert all(l in dataset['title'] for l in ['de', 'fr', 'it', 'en']), "title contains all languages"
56+
assert all(l in dataset['title'] for l in self.languages), "title contains all languages"
5557
eq_(dataset['title']['de'], u'Statistisches Jahrbuch der Schweiz 1901')
5658
eq_(dataset['title']['fr'], u'Annuaire statistique de la Suisse 1901')
5759

58-
assert all(l in dataset['description'] for l in ['de', 'fr', 'it', 'en']), "description contains all languages"
60+
assert all(l in dataset['description'] for l in self.languages), "description contains all languages"
5961
eq_(dataset['description']['de'], u'')
6062
eq_(dataset['url'], u'https://www.bfs.admin.ch/bfs/de/home/statistiken.html')
6163

6264
# Keywords
63-
assert all(l in dataset['keywords'] for l in ['de', 'fr', 'it', 'en']), "keywords contains all languages"
65+
assert all(l in dataset['keywords'] for l in self.languages), "keywords contains all languages"
6466
eq_(sorted(dataset['keywords']['de']), ['publikation', 'statistische-grundlagen-und-ubersichten'])
6567
eq_(sorted(dataset['keywords']['fr']), ['bases-statistiques-et-generalites', 'publication'])
6668
eq_(sorted(dataset['keywords']['it']), ['basi-statistiche-e-presentazioni-generali', 'pubblicazione'])
@@ -101,6 +103,32 @@ def test_dataset_all_fields(self):
101103
see_also = dataset['see_alsos'][0]
102104
eq_(see_also['dataset_identifier'], u'4682791@bundesamt-fur-statistik-bfs')
103105

106+
relations = sorted(dataset["relations"], key=lambda relation: relation['url'])
107+
108+
# Relations - only one label given, no language specified
109+
eq_(relations[0]['url'], "https://www.admin.ch/opc/de/classified-compilation/19920252/index.html")
110+
for lang in self.languages:
111+
eq_(relations[0]['label'][lang], 'legal_basis')
112+
113+
# Relations - multilingual labels
114+
eq_(relations[1]['url'], "https://www.example.org/aaa")
115+
for lang in self.languages:
116+
eq_(relations[1]['label'][lang], 'Text for label ' + lang.upper())
117+
118+
# Relations - no label given
119+
eq_(relations[2]['url'], "https://www.example.org/bbb")
120+
for lang in self.languages:
121+
eq_(relations[2]['label'][lang], "https://www.example.org/bbb")
122+
123+
# Relations - label given, language specified but not German.
124+
# If there is no label given in a language, we try to get one from
125+
# another language, in the priority order 'en' -> 'de' -> 'fr' -> 'it'.
126+
# Here we test that we end up with a label text in all languages, even
127+
# though the source only had a label in Italian.
128+
eq_(relations[3]['url'], "https://www.example.org/ccc")
129+
for lang in self.languages:
130+
eq_(relations[3]['label'][lang], 'Text for label IT')
131+
104132
# Qualified relations
105133
qualified_relations = sorted(dataset["qualified_relations"])
106134
eq_(
@@ -138,10 +166,10 @@ def test_dataset_all_fields(self):
138166
resource = dataset['resources'][0]
139167

140168
# Simple values
141-
assert all(l in resource['title'] for l in ['de', 'fr', 'it', 'en']), "resource title contains all languages"
169+
assert all(l in resource['title'] for l in self.languages), "resource title contains all languages"
142170
eq_(resource['title']['fr'], u'Annuaire statistique de la Suisse 1901')
143171
eq_(resource['title']['de'], u'')
144-
assert all(l in resource['description'] for l in ['de', 'fr', 'it', 'en']), "resource description contains all languages"
172+
assert all(l in resource['description'] for l in self.languages), "resource description contains all languages"
145173
eq_(resource['description']['de'], u'')
146174
eq_(resource['format'], u'html')
147175
eq_(resource['media_type'], u'text/html')

0 commit comments

Comments
 (0)