From 7cd7c4bb309e5016ba71d0c6fe10211edec6c7b5 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 10 Feb 2025 18:27:56 +0100 Subject: [PATCH 1/8] feat: Rename variables to reflect usage We use this method to map multilingual values onto other things than datasets, so calling the variables dataset_dict and dataset_key was confusing. --- ckanext/dcatapchharvest/profiles.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index c4530e88..71ea771e 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -67,11 +67,11 @@ class MultiLangProfile(RDFProfile): - def _add_multilang_value(self, subject, predicate, dataset_key=None, - dataset_dict=None, + def _add_multilang_value(self, subject, predicate, key=None, + data_dict=None, multilang_values=None): # noqa - if not multilang_values and dataset_dict and dataset_key: - multilang_values = dataset_dict.get(dataset_key) + if not multilang_values and data_dict and key: + multilang_values = data_dict.get(key) if multilang_values: try: for key, values in multilang_values.iteritems(): From 193159495a0a286a88e160151377b4eb91e33173 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 10 Feb 2025 18:28:42 +0100 Subject: [PATCH 2/8] feat: Map multilingual labels onto relations --- ckanext/dcatapchharvest/profiles.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 71ea771e..b536b51d 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -256,7 +256,7 @@ def _relations(self, subject): for relation_node in self.g.objects(subject, DCT.relation): relation = { - 'label': self._object_value(relation_node, RDFS.label), + 'label': self._object_value(relation_node, RDFS.label, multilang=True), 'url': relation_node } relations.append(relation) @@ -617,8 +617,9 @@ 
def parse_dataset(self, dataset_dict, dataset_ref): # noqa # Relations dataset_dict['relations'] = self._relations(dataset_ref) for relation in dataset_dict['relations']: - if relation['label'] == {}: - relation['label'] = str(relation.get('url', '')) + for lang in dh.get_langs(): + if not relation['label'][lang]: + relation['label'][lang] = str(relation.get('url', '')) # Temporal dataset_dict['temporals'] = self._temporals(dataset_ref) From ae1ec7528414484063dc22e9af3b31482231561c Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 10 Feb 2025 18:29:06 +0100 Subject: [PATCH 3/8] feat: Output multilingual labels on relations in graph --- ckanext/dcatapchharvest/profiles.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index b536b51d..9c260769 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -863,16 +863,20 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # noqa if dataset_dict.get('relations'): relations = dataset_dict.get('relations') for relation in relations: - relation_name = relation['label'] try: relation_url = dh.uri_to_iri(relation['url']) except ValueError: # skip this relation if the URL is invalid continue - relation = URIRef(relation_url) - g.add((relation, RDFS.label, Literal(relation_name))) - g.add((dataset_ref, DCT.relation, relation)) + relation_uriref = URIRef(relation_url) + self._add_multilang_value( + relation_uriref, + RDFS.label, + 'label', + relation + ) + g.add((dataset_ref, DCT.relation, relation_uriref)) # References if dataset_dict.get('see_alsos'): From f309c160b3b52f662b6317b57bb3a576ddacc1a2 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 10 Feb 2025 18:30:39 +0100 Subject: [PATCH 4/8] style: Fix long line --- ckanext/dcatapchharvest/profiles.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ckanext/dcatapchharvest/profiles.py 
b/ckanext/dcatapchharvest/profiles.py index 9c260769..a2334d0e 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -256,7 +256,11 @@ def _relations(self, subject): for relation_node in self.g.objects(subject, DCT.relation): relation = { - 'label': self._object_value(relation_node, RDFS.label, multilang=True), + 'label': self._object_value( + relation_node, + RDFS.label, + multilang=True + ), 'url': relation_node } relations.append(relation) From 1b75611d11282cbefcb3b95e155e385b5c590b74 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Wed, 12 Feb 2025 14:58:20 +0100 Subject: [PATCH 5/8] feat: Improve mapping multilingual relation labels --- ckanext/dcatapchharvest/dcat_helpers.py | 7 +++++++ ckanext/dcatapchharvest/profiles.py | 15 ++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index 68c1d6eb..9bc86979 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -78,6 +78,13 @@ def get_langs(): return language_priorities +def localize_by_language_priority(multilang_dict): + for lang in get_langs(): + if multilang_dict.get(lang, ''): + return multilang_dict[lang] + return '' + + def dataset_uri(dataset_dict, dataset_ref=None): """ Returns a URI for the dataset diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index a2334d0e..5b4e8bbb 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -251,9 +251,7 @@ def _publisher(self, subject, identifier): return json.dumps(publisher) def _relations(self, subject): - relations = [] - for relation_node in self.g.objects(subject, DCT.relation): relation = { 'label': self._object_value( @@ -261,8 +259,15 @@ def _relations(self, subject): RDFS.label, multilang=True ), - 'url': relation_node + 'url': unicode(relation_node) } + # If we don't have a label in a given
language, use the highest-prio + # language where we do have a label, or fall back to the url + fallback = (dh.localize_by_language_priority(relation['label']) or + relation.get('url', '')) + for lang in dh.get_langs(): + if not relation['label'][lang]: + relation['label'][lang] = fallback relations.append(relation) return relations @@ -620,10 +625,6 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa # Relations dataset_dict['relations'] = self._relations(dataset_ref) - for relation in dataset_dict['relations']: - for lang in dh.get_langs(): - if not relation['label'][lang]: - relation['label'][lang] = str(relation.get('url', '')) # Temporal dataset_dict['temporals'] = self._temporals(dataset_ref) From 2579cae4fe6e057ebdc1a8f88b43a08c7a9709b6 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Wed, 12 Feb 2025 14:59:32 +0100 Subject: [PATCH 6/8] tests: Standardise the language list used in tests --- .../dcatapchharvest/tests/test_dcatap_ch_parse.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py index e796a073..5c183c93 100644 --- a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py +++ b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py @@ -8,6 +8,7 @@ from rdflib.namespace import RDF from ckanext.dcat.processors import RDFParser +from ckanext.dcatapchharvest.dcat_helpers import get_langs from ckanext.dcatapchharvest.profiles import (DCAT, DCT) from ckanext.dcatapchharvest.tests.base_test_classes import BaseParseTest @@ -16,6 +17,7 @@ class TestSwissDCATAPProfileParsing(BaseParseTest): + languages = get_langs() def test_rights_license(self): @@ -51,16 +53,16 @@ def test_dataset_all_fields(self): extras = self._extras(dataset) # Basic fields - assert all(l in dataset['title'] for l in ['de', 'fr', 'it', 'en']), "title contains all languages" + assert all(l in dataset['title'] for l in self.languages), "title 
contains all languages" eq_(dataset['title']['de'], u'Statistisches Jahrbuch der Schweiz 1901') eq_(dataset['title']['fr'], u'Annuaire statistique de la Suisse 1901') - assert all(l in dataset['description'] for l in ['de', 'fr', 'it', 'en']), "description contains all languages" + assert all(l in dataset['description'] for l in self.languages), "description contains all languages" eq_(dataset['description']['de'], u'') eq_(dataset['url'], u'https://www.bfs.admin.ch/bfs/de/home/statistiken.html') # Keywords - assert all(l in dataset['keywords'] for l in ['de', 'fr', 'it', 'en']), "keywords contains all languages" + assert all(l in dataset['keywords'] for l in self.languages), "keywords contains all languages" eq_(sorted(dataset['keywords']['de']), ['publikation', 'statistische-grundlagen-und-ubersichten']) eq_(sorted(dataset['keywords']['fr']), ['bases-statistiques-et-generalites', 'publication']) eq_(sorted(dataset['keywords']['it']), ['basi-statistiche-e-presentazioni-generali', 'pubblicazione']) @@ -138,10 +140,10 @@ def test_dataset_all_fields(self): resource = dataset['resources'][0] # Simple values - assert all(l in resource['title'] for l in ['de', 'fr', 'it', 'en']), "resource title contains all languages" + assert all(l in resource['title'] for l in self.languages), "resource title contains all languages" eq_(resource['title']['fr'], u'Annuaire statistique de la Suisse 1901') eq_(resource['title']['de'], u'') - assert all(l in resource['description'] for l in ['de', 'fr', 'it', 'en']), "resource description contains all languages" + assert all(l in resource['description'] for l in self.languages), "resource description contains all languages" eq_(resource['description']['de'], u'') eq_(resource['format'], u'html') eq_(resource['media_type'], u'text/html') From e4be30b9fa96ea813dfc3f734a7a74eaa345d371 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Wed, 12 Feb 2025 15:00:09 +0100 Subject: [PATCH 7/8] tests: Add tests for multilingual relation labels --- 
.../dcatapchharvest/tests/fixtures/1901.xml | 14 ++++++++++++ .../tests/test_dcatap_ch_parse.py | 22 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/ckanext/dcatapchharvest/tests/fixtures/1901.xml b/ckanext/dcatapchharvest/tests/fixtures/1901.xml index 084c3722..51fd554d 100644 --- a/ckanext/dcatapchharvest/tests/fixtures/1901.xml +++ b/ckanext/dcatapchharvest/tests/fixtures/1901.xml @@ -6,6 +6,20 @@ legal_basis + + + Text for label IT + Text for label FR + Text for label DE + Text for label EN + + + + + + Text for label IT + + Annuaire statistique de la Suisse 1901 diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py index 5c183c93..a80d58cf 100644 --- a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py +++ b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py @@ -103,6 +103,28 @@ def test_dataset_all_fields(self): see_also = dataset['see_alsos'][0] eq_(see_also['dataset_identifier'], u'4682791@bundesamt-fur-statistik-bfs') + relations = sorted(dataset["relations"], key=lambda relation: relation['url']) + + # Relations - only one label given, no language specified + eq_(relations[0]['url'], "https://www.admin.ch/opc/de/classified-compilation/19920252/index.html") + for lang in self.languages: + eq_(relations[0]['label'][lang], 'legal_basis') + + # Relations - multilingual labels + eq_(relations[1]['url'], "https://www.example.org/aaa") + for lang in self.languages: + eq_(relations[1]['label'][lang], 'Text for label ' + lang.upper()) + + # Relations - no label given + eq_(relations[2]['url'], "https://www.example.org/bbb") + for lang in self.languages: + eq_(relations[2]['label'][lang], "https://www.example.org/bbb") + + # Relations - label given, language specified but not German + eq_(relations[3]['url'], "https://www.example.org/ccc") + for lang in self.languages: + eq_(relations[3]['label'][lang], 'Text for label IT') + # Qualified relations qualified_relations = 
sorted(dataset["qualified_relations"]) eq_( From 9ff08c114d386c08ee3575270dea502fe93f8707 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 17 Feb 2025 16:55:53 +0100 Subject: [PATCH 8/8] tests: Add explanatory comment --- ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py index a80d58cf..c8c1494b 100644 --- a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py +++ b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py @@ -120,7 +120,11 @@ def test_dataset_all_fields(self): for lang in self.languages: eq_(relations[2]['label'][lang], "https://www.example.org/bbb") - # Relations - label given, language specified but not German + # Relations - label given, language specified but not German. + # If there is no label given in a language, we try to get one from + # another language, in the priority order 'en' -> 'de' -> 'fr' -> 'it'. + # Here we test that we end up with a label text in all languages, even + # though the source only had a label in Italian. eq_(relations[3]['url'], "https://www.example.org/ccc") for lang in self.languages: eq_(relations[3]['label'][lang], 'Text for label IT')