Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions ckanext/dcatapchharvest/dcat_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ def get_langs():
return language_priorities


def localize_by_language_priority(multilang_dict):
    """
    Returns the first non-empty value of multilang_dict, checking the
    languages in the order given by get_langs(). Returns an empty string
    if none of those languages has a truthy value.
    """
    for language in get_langs():
        text = multilang_dict.get(language, '')
        if text:
            return text
    return ''


def dataset_uri(dataset_dict, dataset_ref=None):
"""
Returns a URI for the dataset
Expand Down
40 changes: 25 additions & 15 deletions ckanext/dcatapchharvest/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,11 @@


class MultiLangProfile(RDFProfile):
def _add_multilang_value(self, subject, predicate, dataset_key=None,
dataset_dict=None,
def _add_multilang_value(self, subject, predicate, key=None,
data_dict=None,
multilang_values=None): # noqa
if not multilang_values and dataset_dict and dataset_key:
multilang_values = dataset_dict.get(dataset_key)
if not multilang_values and data_dict and key:
multilang_values = data_dict.get(key)
if multilang_values:
try:
for key, values in multilang_values.iteritems():
Expand Down Expand Up @@ -251,14 +251,23 @@ def _publisher(self, subject, identifier):
return json.dumps(publisher)

def _relations(self, subject):

relations = []

for relation_node in self.g.objects(subject, DCT.relation):
relation = {
'label': self._object_value(relation_node, RDFS.label),
'url': relation_node
'label': self._object_value(
relation_node,
RDFS.label,
multilang=True
),
'url': unicode(relation_node)
}
# If a label is missing for some language, fill it with the label of the
# highest-priority language that has one, or fall back to the url
fallback = (dh.localize_by_language_priority(relation['label']) or
relation.get('url', ''))
for lang in dh.get_langs():
if not relation['label'][lang]:
relation['label'][lang] = fallback
relations.append(relation)

return relations
Expand Down Expand Up @@ -616,9 +625,6 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa

# Relations
dataset_dict['relations'] = self._relations(dataset_ref)
for relation in dataset_dict['relations']:
if relation['label'] == {}:
relation['label'] = str(relation.get('url', ''))

# Temporal
dataset_dict['temporals'] = self._temporals(dataset_ref)
Expand Down Expand Up @@ -862,16 +868,20 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # noqa
if dataset_dict.get('relations'):
relations = dataset_dict.get('relations')
for relation in relations:
relation_name = relation['label']
try:
relation_url = dh.uri_to_iri(relation['url'])
except ValueError:
# skip this relation if the URL is invalid
continue

relation = URIRef(relation_url)
g.add((relation, RDFS.label, Literal(relation_name)))
g.add((dataset_ref, DCT.relation, relation))
relation_uriref = URIRef(relation_url)
self._add_multilang_value(
relation_uriref,
RDFS.label,
'label',
relation
)
g.add((dataset_ref, DCT.relation, relation_uriref))

# References
if dataset_dict.get('see_alsos'):
Expand Down
14 changes: 14 additions & 0 deletions ckanext/dcatapchharvest/tests/fixtures/1901.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,20 @@
<rdfs:label>legal_basis</rdfs:label>
</rdf:Description>
</dct:relation>
<dct:relation>
<rdf:Description rdf:about="https://www.example.org/aaa">
<rdfs:label xml:lang="it">Text for label IT</rdfs:label>
<rdfs:label xml:lang="fr">Text for label FR</rdfs:label>
<rdfs:label xml:lang="de">Text for label DE</rdfs:label>
<rdfs:label xml:lang="en">Text for label EN</rdfs:label>
</rdf:Description>
</dct:relation>
<dct:relation rdf:resource="https://www.example.org/bbb"/>
<dct:relation>
<rdf:Description rdf:about="https://www.example.org/ccc">
<rdfs:label xml:lang="it">Text for label IT</rdfs:label>
</rdf:Description>
</dct:relation>
<dcat:distribution>
<dcat:Distribution rdf:about="https://opendata.swiss/dataset/7451e012-64b2-4bbc-af20-a0e2bc61b585/resource/c8ec6ca0-6923-4cf3-92f2-95a10e6f8e25">
<dct:title xml:lang="fr">Annuaire statistique de la Suisse 1901</dct:title>
Expand Down
38 changes: 33 additions & 5 deletions ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from rdflib.namespace import RDF

from ckanext.dcat.processors import RDFParser
from ckanext.dcatapchharvest.dcat_helpers import get_langs
from ckanext.dcatapchharvest.profiles import (DCAT, DCT)
from ckanext.dcatapchharvest.tests.base_test_classes import BaseParseTest

Expand All @@ -16,6 +17,7 @@


class TestSwissDCATAPProfileParsing(BaseParseTest):
languages = get_langs()

def test_rights_license(self):

Expand Down Expand Up @@ -51,16 +53,16 @@ def test_dataset_all_fields(self):
extras = self._extras(dataset)

# Basic fields
assert all(l in dataset['title'] for l in ['de', 'fr', 'it', 'en']), "title contains all languages"
assert all(l in dataset['title'] for l in self.languages), "title contains all languages"
eq_(dataset['title']['de'], u'Statistisches Jahrbuch der Schweiz 1901')
eq_(dataset['title']['fr'], u'Annuaire statistique de la Suisse 1901')

assert all(l in dataset['description'] for l in ['de', 'fr', 'it', 'en']), "description contains all languages"
assert all(l in dataset['description'] for l in self.languages), "description contains all languages"
eq_(dataset['description']['de'], u'')
eq_(dataset['url'], u'https://www.bfs.admin.ch/bfs/de/home/statistiken.html')

# Keywords
assert all(l in dataset['keywords'] for l in ['de', 'fr', 'it', 'en']), "keywords contains all languages"
assert all(l in dataset['keywords'] for l in self.languages), "keywords contains all languages"
eq_(sorted(dataset['keywords']['de']), ['publikation', 'statistische-grundlagen-und-ubersichten'])
eq_(sorted(dataset['keywords']['fr']), ['bases-statistiques-et-generalites', 'publication'])
eq_(sorted(dataset['keywords']['it']), ['basi-statistiche-e-presentazioni-generali', 'pubblicazione'])
Expand Down Expand Up @@ -101,6 +103,32 @@ def test_dataset_all_fields(self):
see_also = dataset['see_alsos'][0]
eq_(see_also['dataset_identifier'], u'4682791@bundesamt-fur-statistik-bfs')

relations = sorted(dataset["relations"], key=lambda relation: relation['url'])

# Relations - only one label given, no language specified
eq_(relations[0]['url'], "https://www.admin.ch/opc/de/classified-compilation/19920252/index.html")
for lang in self.languages:
eq_(relations[0]['label'][lang], 'legal_basis')

# Relations - multilingual labels
eq_(relations[1]['url'], "https://www.example.org/aaa")
for lang in self.languages:
eq_(relations[1]['label'][lang], 'Text for label ' + lang.upper())

# Relations - no label given
eq_(relations[2]['url'], "https://www.example.org/bbb")
for lang in self.languages:
eq_(relations[2]['label'][lang], "https://www.example.org/bbb")

# Relations - label given, language specified but not German.
# If there is no label given in a language, we try to get one from
# another language, in the priority order 'en' -> 'de' -> 'fr' -> 'it'.
# Here we test that we end up with a label text in all languages, even
# though the source only had a label in Italian.
eq_(relations[3]['url'], "https://www.example.org/ccc")
for lang in self.languages:
eq_(relations[3]['label'][lang], 'Text for label IT')

# Qualified relations
qualified_relations = sorted(dataset["qualified_relations"])
eq_(
Expand Down Expand Up @@ -138,10 +166,10 @@ def test_dataset_all_fields(self):
resource = dataset['resources'][0]

# Simple values
assert all(l in resource['title'] for l in ['de', 'fr', 'it', 'en']), "resource title contains all languages"
assert all(l in resource['title'] for l in self.languages), "resource title contains all languages"
eq_(resource['title']['fr'], u'Annuaire statistique de la Suisse 1901')
eq_(resource['title']['de'], u'')
assert all(l in resource['description'] for l in ['de', 'fr', 'it', 'en']), "resource description contains all languages"
assert all(l in resource['description'] for l in self.languages), "resource description contains all languages"
eq_(resource['description']['de'], u'')
eq_(resource['format'], u'html')
eq_(resource['media_type'], u'text/html')
Expand Down