Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
41eb589
fix: Update parsing of the dataset of the license and rights fields
kovalch Jul 29, 2024
c5263bf
fix: Update the dataset to graph logic
kovalch Jul 29, 2024
0a114ce
fix: Update tests to use uri and not str as a value for license and r…
kovalch Jul 29, 2024
14fd57f
fix: Remove unused Namespace
kovalch Jul 29, 2024
c6b7d56
fix: License and Rights could be just a homepage URI in our dataset
kovalch Jul 30, 2024
d54a75a
style: Remove blank line
kovalch Jul 30, 2024
eec20dd
fix: Rollback to the previous function look, before we migrate all li…
kovalch Jul 30, 2024
b864329
fix: Update license and right values for the tests
kovalch Jul 30, 2024
71e0e7e
feat: Add additional check if cc-license in dct:license
kovalch Aug 5, 2024
67b9ae8
fix: Update the license_literal mapping
kovalch Aug 5, 2024
46f7d84
fix: Restructure _rights_license to graph function
kovalch Aug 5, 2024
df83d15
fix: Set keys and values to unicode when setting up the mappings
kovalch Aug 5, 2024
4c2ac83
fix: Make vocabulary_uri to unicode for the comparison in helper func…
kovalch Aug 5, 2024
ab73216
refactor: Add LicenseHandler class to handle caching of license vocab…
kovalch Aug 5, 2024
8638c58
refactor: Divide license mapping function to smaller stages
kovalch Aug 5, 2024
d7efda1
fix: Remove blank line
kovalch Aug 5, 2024
f723c76
fix: Remove unused function
kovalch Aug 5, 2024
3aa4ddf
fix: Return a value of a dict
kovalch Aug 5, 2024
2e2ed15
fix: Remove language label from the uri
kovalch Aug 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 108 additions & 32 deletions ckanext/dcatapchharvest/dcat_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
DCT = Namespace("http://purl.org/dc/terms/")
EUTHEMES = \
Namespace("http://publications.europa.eu/resource/authority/data-theme/")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")
HYDRA = Namespace('http://www.w3.org/ns/hydra/core#')

SKOSXL = Namespace("http://www.w3.org/2008/05/skos-xl#")
Expand All @@ -38,6 +39,7 @@
"skosxl": SKOSXL,
"rdf": RDF,
"rdfs": RDFS,
"foaf": FOAF,
}

theme_namespaces = {
Expand Down Expand Up @@ -167,38 +169,112 @@ def get_frequency_values():
return frequency_mapping


def get_license_uri_by_name(vocabulary_name):
    """Return the license vocabulary key whose value equals the given name.

    Both sides are compared as unicode strings. The first matching key is
    returned; ``None`` is returned when no value matches.
    """
    return next(
        (uri for uri, name in get_license_values().items()
         if unicode(vocabulary_name) == unicode(name)),
        None,
    )


def get_license_name_by_uri(vocabulary_uri):
    """Return the unicode license name stored under the given URI.

    Keys of the license vocabulary are compared to ``vocabulary_uri`` as
    unicode strings. ``None`` is returned when no key matches.
    """
    return next(
        (unicode(name) for uri, name in get_license_values().items()
         if unicode(vocabulary_uri) == unicode(uri)),
        None,
    )


def get_license_values():
    """Build a mapping of license concept -> literal form from license.ttl.

    Every subject typed as ``skos:Concept`` becomes a key. Its value is the
    ``skosxl:literalForm`` reached through ``skosxl:prefLabel`` (the last
    one seen wins), or ``None`` when the concept has no such label.
    """
    graph = Graph()
    for prefix, namespace in license_namespaces.items():
        graph.bind(prefix, namespace)
    graph.parse(os.path.join(__location__, 'license.ttl'), format='turtle')

    mapping = {}
    concepts = graph.subjects(predicate=RDF.type, object=SKOS.Concept)
    for concept in concepts:
        # Register the concept even if no label is found below.
        mapping[concept] = None
        for label_node in graph.objects(subject=concept,
                                        predicate=SKOSXL.prefLabel):
            for literal in graph.objects(subject=label_node,
                                         predicate=SKOSXL.literalForm):
                mapping[concept] = literal
    return mapping
class LicenseHandler(object):
    """Load the license vocabulary once and answer lookups against it.

    The vocabulary is parsed from ``license.ttl`` on the first lookup and
    the resulting mappings are kept on the instance for later calls. Three
    mappings are built, all with unicode keys and values:

    * license homepage URI -> license name (``skosxl:literalForm``)
    * license reference URI -> license name
    * license homepage URI -> license reference URI

    The "reference URI" is the subject typed as ``skos:Concept``; the
    "homepage URI" is its ``foaf:homepage``.
    """

    def __init__(self):
        # Tuple of the three mappings, filled by _get_license_values().
        self._license_cache = None

    def _bind_namespaces(self, graph):
        """Bind every prefix of ``license_namespaces`` on the graph."""
        for prefix, namespace in license_namespaces.items():
            graph.bind(prefix, namespace)

    def _parse_graph(self, graph):
        """Parse ``license.ttl`` (next to this module) into the graph."""
        license_file = os.path.join(__location__, 'license.ttl')
        graph.parse(license_file, format='turtle')

    def _get_license_homepage(self, graph, license_ref):
        """Return the first ``foaf:homepage`` of the license, or None."""
        homepages = graph.objects(subject=license_ref,
                                  predicate=FOAF.homepage)
        return next(iter(homepages), None)

    def _get_license_literal(self, graph, license_ref):
        """Return the first literal form found via the preferred labels.

        Preferred labels without a ``skosxl:literalForm`` are skipped.
        Returns None when no label carries a literal form.
        """
        for pref_label in graph.objects(subject=license_ref,
                                        predicate=SKOSXL.prefLabel):
            literals = graph.objects(subject=pref_label,
                                     predicate=SKOSXL.literalForm)
            literal = next(iter(literals), None)
            if literal is not None:
                return literal
        return None

    def _build_mappings(self, license_entries):
        """Turn (ref, homepage, literal) triples into the three mappings.

        :param license_entries: iterable of
            ``(license_ref, homepage, literal)``; homepage and literal may
            be None when the vocabulary does not define them.
        :returns: tuple ``(homepage -> name, ref -> name, homepage -> ref)``
        """
        homepage_to_name = {}
        ref_to_name = {}
        homepage_to_ref = {}

        for license_ref, homepage, literal in license_entries:
            ref = unicode(license_ref)
            # Missing values are left out of the mappings: unicode(None)
            # would yield the string u"None" and create a bogus key/name
            # that lookups could match by accident.
            if literal is not None:
                ref_to_name[ref] = unicode(literal)
            if homepage is not None:
                homepage_key = unicode(homepage)
                homepage_to_ref[homepage_key] = ref
                if literal is not None:
                    homepage_to_name[homepage_key] = unicode(literal)

        return homepage_to_name, ref_to_name, homepage_to_ref

    def _process_graph(self, graph):
        """Collect homepage and literal for every ``skos:Concept``."""
        license_refs = graph.subjects(predicate=RDF.type,
                                      object=SKOS.Concept)
        return self._build_mappings(
            (license_ref,
             self._get_license_homepage(graph, license_ref),
             self._get_license_literal(graph, license_ref))
            for license_ref in license_refs
        )

    def _get_license_values(self):
        """Return the three mappings, loading them on first use.

        :raises RuntimeError: if the vocabulary cannot be loaded
        """
        if self._license_cache is None:
            try:
                g = Graph()
                self._bind_namespaces(g)
                self._parse_graph(g)
                self._license_cache = self._process_graph(g)
            except Exception as e:
                raise RuntimeError("Failed to load license values: %s"
                                   % e)
        return self._license_cache

    def get_license_ref_uri_by_name(self, vocabulary_name):
        """Return the reference URI of the license with this name."""
        _, ref_to_name, _ = self._get_license_values()
        wanted = unicode(vocabulary_name)
        return next((ref for ref, name in ref_to_name.items()
                     if name == wanted),
                    None)

    def get_license_ref_uri_by_homepage_uri(self, vocabulary_name):
        """Return the reference URI for a license homepage URI."""
        _, _, homepage_to_ref = self._get_license_values()
        return homepage_to_ref.get(unicode(vocabulary_name))

    def get_license_name_by_ref_uri(self, vocabulary_uri):
        """Return the license name for a license reference URI."""
        _, ref_to_name, _ = self._get_license_values()
        return ref_to_name.get(unicode(vocabulary_uri))

    def get_license_name_by_homepage_uri(self, vocabulary_uri):
        """Return the license name for a license homepage URI."""
        homepage_to_name, _, _ = self._get_license_values()
        return homepage_to_name.get(unicode(vocabulary_uri))

    def get_license_homepage_uri_by_name(self, vocabulary_name):
        """Return the homepage URI of the license with this name."""
        homepage_to_name, _, _ = self._get_license_values()
        wanted = unicode(vocabulary_name)
        return next((homepage for homepage, name
                     in homepage_to_name.items()
                     if name == wanted),
                    None)

    def get_license_homepage_uri_by_uri(self, vocabulary_uri):
        """Return the homepage URI for a homepage or reference URI.

        A known homepage URI is returned unchanged; a known reference URI
        is translated to its homepage URI. Returns None otherwise.
        """
        _, _, homepage_to_ref = self._get_license_values()
        uri = unicode(vocabulary_uri)
        if uri in homepage_to_ref:
            return uri
        return next((homepage for homepage, ref
                     in homepage_to_ref.items()
                     if ref == uri),
                    None)


def get_theme_mapping():
Expand Down
6 changes: 3 additions & 3 deletions ckanext/dcatapchharvest/license.ttl
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
skosxl:literalForm "NonCommercialAllowed-CommercialAllowed-ReferenceRequired"@en ;
rdfs:label "NonCommercialAllowed-CommercialAllowed-ReferenceRequired"@de
] ;
foaf:homepage <https://opendata.swiss/en/terms-of-use/#terms_by> .
foaf:homepage <https://opendata.swiss/terms-of-use/#terms_by> .

<http://dcat-ap.ch/vocabulary/licenses/terms_ask>
a skos:Concept ;
Expand All @@ -61,7 +61,7 @@
skosxl:literalForm "NonCommercialAllowed-CommercialWithPermission-ReferenceNotRequired"@en ;
rdfs:label "NonCommercialAllowed-CommercialWithPermission-ReferenceNotRequired"@de
] ;
foaf:homepage <https://opendata.swiss/en/terms-of-use/#terms_ask> .
foaf:homepage <https://opendata.swiss/terms-of-use/#terms_ask> .

<http://dcat-ap.ch/vocabulary/licenses/terms_by_ask>
a skos:Concept ;
Expand All @@ -76,7 +76,7 @@
skosxl:literalForm "NonCommercialAllowed-CommercialWithPermission-ReferenceRequired"@en ;
rdfs:label "NonCommercialAllowed-CommercialWithPermission-ReferenceRequired"@de
] ;
foaf:homepage <https://opendata.swiss/en/terms-of-use/#terms_by_ask> .
foaf:homepage <https://opendata.swiss/terms-of-use/#terms_by_ask> .

<https://creativecommons.org/publicdomain/zero/1.0/>
a skos:Concept, cc:License ;
Expand Down
123 changes: 55 additions & 68 deletions ckanext/dcatapchharvest/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@
from ckanext.dcat.profiles import CleanedURIRef, RDFProfile, SchemaOrgProfile

log = logging.getLogger(__name__)

license_handler = dh.LicenseHandler()
valid_frequencies = dh.get_frequency_values()
valid_licenses = dh.get_license_values()
eu_theme_mapping = dh.get_theme_mapping()
valid_formats = dh.get_format_values()
valid_media_types = dh.get_iana_media_type_values()
Expand Down Expand Up @@ -277,23 +276,13 @@ def _get_iana_media_type(self, subject):
if media_type_key in valid_media_types:
return media_type_key

def _license_rights_name(self, subject, predicate):
    """Return the license/rights name for the first usable object node.

    Literal nodes are returned as unicode text; URI nodes are translated
    to a name via ``dh.get_license_name_by_uri``. Nodes of any other kind
    are skipped. Returns None when no usable node exists.
    """
    for candidate in self.g.objects(subject, predicate):
        if isinstance(candidate, Literal):
            # DCAT-AP CH v1: the license as a literal (should be
            # the code for one of the DCAT-AP CH licenses)
            name = unicode(candidate)
        elif isinstance(candidate, URIRef):
            name = dh.get_license_name_by_uri(candidate)
        else:
            continue
        return name
    return None

def _license_rights_uri(self, subject, predicate):
def _license_rights_homepage_uri(self, subject, predicate):
for node in self.g.objects(subject, predicate):
# DCAT-AP CH v2 compatible license has to be a URI.
if isinstance(node, Literal):
return dh.get_license_uri_by_name(node)
return license_handler.get_license_homepage_uri_by_name(node)
if isinstance(node, URIRef):
return node
return license_handler.get_license_homepage_uri_by_uri(node)
return None

def _keywords(self, subject):
Expand Down Expand Up @@ -633,21 +622,34 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa
if value:
resource_dict[key] = value

# Rights & License save name
rights = self._license_rights_name(distribution, DCT.rights)
license = self._license_rights_name(distribution, DCT.license)
# Rights & License save homepage uri
rights = self._license_rights_homepage_uri(
distribution, DCT.rights
)
license = self._license_rights_homepage_uri(
distribution, DCT.license
)

if rights is None and license is not None:
resource_dict['license'] = license
resource_dict['rights'] = license
if rights is not None and license is None:
resource_dict['license'] = rights
elif rights is not None and license is None:
resource_dict['rights'] = rights
if license is not None and rights is not None:
if 'cc' not in rights:
resource_dict['license'] = rights
else:
resource_dict['license'] = None
elif license is not None and rights is not None:
resource_dict['license'] = license
resource_dict['rights'] = rights
if 'cc' in rights:
if 'cc' in license and 'cc' not in rights:
resource_dict['license'] = rights
resource_dict['rights'] = license
elif 'cc' in license and 'cc' in rights:
resource_dict['license'] = None
else:
resource_dict['license'] = None
resource_dict['rights'] = None

# Format & Media type
resource_dict['format'] = \
Expand Down Expand Up @@ -1035,55 +1037,40 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # noqa
g.add((distribution, DCAT.byteSize,
Literal(resource_dict['byte_size'])))

def _get_rights_and_license_uri(self, resource_dict, property='license'):
    """Resolve the stored license/rights value to a license reference URI.

    :param resource_dict: resource dict holding the value under
        ``property``
    :param property: ``'license'`` or ``'rights'``
    :returns: ``URIRef`` of the license reference, or None when the value
        is empty or cannot be resolved
    :raises ValueError: if ``property`` is neither ``'license'`` nor
        ``'rights'``
    """
    if property not in ('license', 'rights'):
        raise ValueError("Property must be 'license' or 'rights'")

    homepage_uri = resource_dict.get(property)
    if not homepage_uri:
        return None

    # First try the direct homepage -> reference lookup; fall back to
    # going through the license name.
    ref_uri = license_handler.get_license_ref_uri_by_homepage_uri(
        homepage_uri)
    if ref_uri is None:
        name = license_handler.get_license_name_by_homepage_uri(
            homepage_uri)
        if name is not None:
            ref_uri = license_handler.get_license_ref_uri_by_name(name)

    if ref_uri is None:
        return None
    return URIRef(ref_uri)

def _rights_and_license_to_graph(self, resource_dict, distribution):
g = self.g
if resource_dict.get('rights'):
rights_uri = dh.get_license_uri_by_name(
resource_dict.get('rights')
)
if rights_uri is not None:
rights_ref = URIRef(rights_uri)
g.add((rights_ref, RDF.type, DCT.RightsStatement))
g.add((distribution, DCT.rights, rights_ref))
if rights_uri is None:
rights_name = dh.get_license_name_by_uri(
resource_dict.get('rights')
)
if rights_name is not None:
resource_rights_ref = URIRef(
resource_dict.get('rights')
)
g.add((
resource_rights_ref,
RDF.type,
DCT.RightsStatement)
)
g.add((distribution, DCT.rights, resource_rights_ref))

if resource_dict.get('license'):
license_uri = dh.get_license_uri_by_name(
resource_dict.get('license')
)
if license_uri is not None:
license_ref = URIRef(license_uri)
g.add((license_ref, RDF.type, DCT.LicenseDocument))
g.add((distribution, DCT.license, license_ref))
if license_uri is None:
license_name = dh.get_license_name_by_uri(
resource_dict.get('license')
)
if license_name is not None:
resource_license_ref = URIRef(
resource_dict.get('license')
)
g.add((
resource_license_ref,
RDF.type,
DCT.LicenseDocument)
)
g.add(
(distribution, DCT.license, resource_license_ref)
)
rights_uri_ref = self._get_rights_and_license_uri(resource_dict,
'rights')
if rights_uri_ref is not None:
g.add((rights_uri_ref, RDF.type, DCT.RightsStatement))
g.add((distribution, DCT.rights, rights_uri_ref))

license_uri_ref = self._get_rights_and_license_uri(resource_dict,
'license')
if license_uri_ref is not None:
g.add((license_uri_ref, RDF.type, DCT.LicenseDocument))
g.add((distribution, DCT.license, license_uri_ref))

def _format_and_media_type_to_graph(self, resource_dict, distribution):
g = self.g
Expand Down
6 changes: 3 additions & 3 deletions ckanext/dcatapchharvest/tests/fixtures/dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@
"https://example.com/documentation-resource-1",
"https://example.com/documentation-resource-2"
],
"rights": "Creative Commons Zero 1.0 Universell (CC0 1.0)",
"license": "NonCommercialAllowed-CommercialAllowed-ReferenceNotRequired",
"rights": "http://www.opendefinition.org/licenses/cc-zero",
"license": "https://opendata.swiss/terms-of-use/#terms_open",
"format": "CSV",
"issued": "2015-06-26T15:21:09.034694",
"modified": "2015-06-30T15:21:09.000000"
Expand All @@ -135,7 +135,7 @@
"https://example.com/documentation-resource-2"
],
"rights": "http://dcat-ap.ch/vocabulary/licenses/terms_by",
"license": "NonCommercialAllowed-CommercialAllowed-ReferenceRequired",
"license": "https://opendata.swiss/terms-of-use/#terms_by",
"format": "HTML"
},
{
Expand Down
Loading