Add 'workflow' type mapping and several other fixes #45
@@ -7,19 +7,20 @@ Note that RO-Crate and DataCite each contain features that the other does not ha

  ## Mapping of resource type

  - `resource_type` is a mandatory field in DataCite
- - RO-Crate does not have a field that describes the type of the entire directory
- - Therefore, we assume the type to be `dataset`
+ - RO-Crate does not have a field that describes the type of the entire directory
+ - Therefore, we assume the type to be `dataset` by default
+ - Only if the 'mainEntity' includes the type 'ComputationalWorkflow', the DataCite type is set to 'workflow'
Review discussion on this change:

- A complication: a Workflow Run Crate might also have this. Maybe it's better to check if the crate conforms to Workflow RO-Crate only? But that is harder to do with the existing mapping structure.
- I don't understand the issue, could you maybe provide an example? WRROC's Workflow Run Crate inherits requirements from Workflow RO-Crate, which states: […] So, both must have a 'ComputationalWorkflow' as 'mainEntity'.
- For example, https://doi.org/10.5281/zenodo.12987289 is a WRROC, but the focus (for humans) is on the data outputs. I don't think "workflow" would be an appropriate type for this record.
- To me, if the mainEntity of the RDE is a ComputationalWorkflow, it semantically means that the package is mainly a workflow. So even in the case you show (where it is both a Dataset and a ComputationalWorkflow), I would select the workflow type to create awareness that the option exists. In any case, the mainEntity can have many different types, and in the end it is a subjective choice of the user what to select. But I think it is a good choice to select the workflow type for the record by default, and then let the user manually correct it if that is not the case. Do you have something else in mind? Maybe include more restrictions before selecting the workflow type? I'm afraid that if we impose many restrictions, the workflow type will rarely be selected.
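For illustration, here is a minimal sketch of the rule described above (the function name and the `"./"` root-entity lookup are simplifying assumptions for this example, not the PR's actual code):

```python
def guess_resource_type(rc: dict) -> str:
    """Return 'workflow' if the crate's mainEntity is a ComputationalWorkflow,
    otherwise fall back to the default 'dataset'."""
    entities = {e.get("@id"): e for e in rc.get("@graph", [])}
    # Assumes the root data entity uses the conventional "./" identifier.
    root = entities.get("./", {})
    main_ref = root.get("mainEntity")
    main_id = main_ref.get("@id") if isinstance(main_ref, dict) else main_ref
    types = entities.get(main_id, {}).get("@type", [])
    if isinstance(types, str):  # @type may be a plain string or a list
        types = [types]
    return "workflow" if "ComputationalWorkflow" in types else "dataset"
```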

  ## Mapping of creators

- - an `author` in RO-Crate is mapped to a `creator` in DataCite, alongside with their affiliations
+ - an `author` or a `creator` in RO-Crate is mapped to `creators` in DataCite, along with their affiliations
  - if the `@id` field of an author is an ORCiD, the ORCiD field is parsed and added in DataCite
  - consists of `person or organization` and `affiliation`
  - if no creator exists, the creator is chosen to be the value `:unkn`
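Parsing an ORCID iD out of an author's `@id` typically amounts to matching the orcid.org URL pattern; a hypothetical helper (not the project's code) might look like this:

```python
import re

# Matches identifiers of the form https://orcid.org/0000-0002-1825-0097.
ORCID_PATTERN = re.compile(r"^https?://orcid\.org/(\d{4}-\d{4}-\d{4}-\d{3}[\dX])$")

def extract_orcid(author_id: str) -> str | None:
    """Return the bare ORCID iD from an author's @id, or None if it is not an ORCID URL."""
    match = ORCID_PATTERN.match(author_id or "")
    return match.group(1) if match else None
```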

  ## Mapping of contributors

- - similar to creator mapping
+ - similar to the creator mapping, but `contributor` is only mapped to `contributors` in DataCite if contributors have been defined (since `contributor` is a valid schema.org term but is not mandatory in RO-Crate)

  ## Mapping of title

@@ -50,9 +51,9 @@ Note that RO-Crate and DataCite each contain features that the other does not ha

  ## Mapping of rights/licenses

  - the `identifier` field in DataCite is not mapped; since it defaults to SPDX, this would require knowledge of the mapping of a licence URL to the SPDX id (https://spdx.org/licenses/)
  - in case the RO-Crate does not reference another object, but contains a direct value, the following is applied
-   - if the value is a URL: only set the link value in the DataCite file
+   - if the URL is an SPDX URL, the 'id', 'scheme' and 'title' fields are automatically generated from the URL
    - if the value is freetext: only set the description value in the DataCite file
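The SPDX case can be pictured with a small sketch (field and function names are assumptions for illustration, not necessarily what the converter uses):

```python
def rights_from_url(url: str) -> dict:
    """Build a rights entry from a licence URL; id/scheme/title are only filled
    for SPDX URLs, otherwise only the link is set."""
    rights = {"link": url}
    spdx_prefix = "https://spdx.org/licenses/"
    if url.startswith(spdx_prefix):
        spdx_id = url[len(spdx_prefix):].removesuffix(".html").rstrip("/")
        rights["id"] = spdx_id.lower()      # e.g. "cc-by-4.0" (assumed lowercase id convention)
        rights["scheme"] = "spdx"
        rights["title"] = {"en": spdx_id}   # e.g. {"en": "CC-BY-4.0"}
    return rights

# Example: rights_from_url("https://spdx.org/licenses/CC-BY-4.0")
# -> {'link': ..., 'id': 'cc-by-4.0', 'scheme': 'spdx', 'title': {'en': 'CC-BY-4.0'}}
```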
  ## Mapping of subjects
@@ -51,6 +51,8 @@ def convert(rc: dict, metadata_only: bool = False) -> dict:
      :return: Dictionary containing DataCite metadata
      """

+     rc = merge_authors_and_creators(rc)
+
      m = load_mapping_json()

      dc = setup_dc()
@@ -87,7 +89,7 @@ def convert(rc: dict, metadata_only: bool = False) -> dict:
          print(f"\t|- Applying mapping {mapping_key}")

          mapping = mappings.get(mapping_key)
-         dc, any_present = apply_mapping(mapping, mapping_paths, rc, dc)
+         dc, any_present = apply_mapping(mapping, mapping_paths, rc, dc, mapping_key)
          is_any_present = is_any_present or any_present

      if not is_any_present:
@@ -135,7 +137,7 @@ def get_mapping_paths(rc: dict, mappings: dict) -> dict:
      return mapping_paths


- def apply_mapping(mapping, mapping_paths, rc, dc):  # noqa: C901
+ def apply_mapping(mapping, mapping_paths, rc, dc, mapping_key):  # noqa: C901
      """Convert RO-Crate metadata to DataCite according to the specified mapping and
      paths.
@@ -152,6 +154,7 @@ def apply_mapping(mapping, mapping_paths, rc, dc):  # noqa: C901
      :param mapping_paths: A list of paths, used to disambiguate array values
      :param rc: Dictionary of RO-Crate metadata
      :param dc: Dictionary of DataCite metadata
+     :param mapping_key: The key of the mapping being applied
      :return: tuple containing the updated dictionary of DataCite metadata, and a boolean
          indicating whether the rule was applied
      """
@@ -179,8 +182,15 @@ def apply_mapping(mapping, mapping_paths, rc, dc):  # noqa: C901
          paths = mapping_paths.get(processed_string)
          print(f"\t\t|- Paths: {paths}")

-         for path in paths:
-             print(f"PATH: {path}")
+         for i, path in enumerate(paths):
+             if mapping_key.startswith("publisher_mapping") and i > 0:
+                 # RO-Crate can have a list of publishers, but DataCite only supports one
+                 # publisher. So, we only apply the first one.
+                 print(
+                     f"\t\t|- Skipping path {i} for mapping {mapping_key} to avoid "
+                     "overwriting previous values."
+                 )
+                 continue
              new_path = path.copy()
              from_value = get_value_from_rc(rc.copy(), from_mapping_value, new_path)

Review discussion on the publisher skip:

- Could you add a test for this case?
- Would you prefer me to add a new integration test, or just modify one of the existing ones to have several publishers?
- I mean a unit test, similar to these other ones for the publisher mapping: https://github.com/ResearchObject/ro-crate-inveniordm/blob/main/test/unit/test_publisher.py
- Hmmm, I don't think I can check this with that kind of unit test, since they check that the application of a single mapping is correct. What really happens in this code is that we avoid applying the rule if it has already been applied for a previous 'path'. I'm still open to adding two publishers to an integration test and checking that only one is mapped to the DataCite output.
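Following up on that discussion, an integration-style check with two publishers might look roughly like the sketch below; the import path, the crate contents, and the exact shape of the DataCite output are assumptions for illustration, not code from this PR:

```python
# Hypothetical integration-style test: a crate lists two publishers; only the
# first should survive the conversion, since DataCite holds a single publisher.
from rocrate_inveniordm.mapping.converter import convert  # assumed module path

def test_only_first_publisher_is_mapped():
    rc = {
        "@graph": [
            {"@id": "ro-crate-metadata.json", "about": {"@id": "./"}},
            {
                "@id": "./",
                "@type": "Dataset",
                "name": "Crate with two publishers",
                "publisher": [{"@id": "#org-a"}, {"@id": "#org-b"}],
            },
            {"@id": "#org-a", "@type": "Organization", "name": "Org A"},
            {"@id": "#org-b", "@type": "Organization", "name": "Org B"},
        ]
    }
    dc = convert(rc, metadata_only=True)
    # Only the first publisher should be written; later paths are skipped.
    assert dc["metadata"]["publisher"] == "Org A"
```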
@@ -189,12 +199,12 @@ def apply_mapping(mapping, mapping_paths, rc, dc):  # noqa: C901
                  # must be implemented on how to handle it)
                  print(
                      "\t\t|- Result is a JSON object, so this rule cannot be applied. "
-                     "Skipping to next rule."
+                     "Skipping to next path."
                  )
                  from_value = None

-             # if (from_value is None):
-             #     continue
+             if from_value is None:
+                 continue

              if only_if_value is not None:
                  print(f"\t\t|- Checking condition {only_if_value}")
@@ -213,8 +223,8 @@ def apply_mapping(mapping, mapping_paths, rc, dc):  # noqa: C901
                      f"{path.copy()}"
                  )
                  rule_applied = True
-                 print(dc, to_mapping_value, from_value)
                  dc = set_dc(dc, to_mapping_value, from_value, path.copy())
+                 print(dc)

      return dc, rule_applied
@@ -363,8 +373,13 @@ def set_dc(dictionary, key, value=None, path=[]):
                  path = path[1:]
                  last_val = current_dict[key_part[:-2]]

-                 if len(current_dict[key_part[:-2]]) <= index:
-                     current_dict[key_part[:-2]].append({})
+                 while len(current_dict[key_part[:-2]]) <= index:
+                     current_dict[key_part[:-2]].append(
+                         {}
+                     )  # It expands 1 by 1 anyway, since no empty paths can remain after
+                     # a mapping rule is applied

                  # print(f"INDEX: {index}, len of key: {len(current_dict[key_part[:-2]])}")

                  current_dict = current_dict[key_part[:-2]][index]
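As a toy illustration of the padding behaviour this change introduces (plain Python, unrelated to the project's data): the list is grown with empty dicts until the target index exists, then the value is written at that index.

```python
values = []
index = 2

# Grow the list one element at a time until values[index] is addressable.
while len(values) <= index:
    values.append({})

values[index]["name"] = "third entry"
print(values)  # [{}, {}, {'name': 'third entry'}]
```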
| 
          
            
          
           | 
    @@ -425,5 +440,31 @@ def process(process_rule, value): | |
| return function(value) | ||
| 
     | 
||
| 
     | 
||
| def merge_authors_and_creators(rc: dict): | ||
                
      
                  rsirvent marked this conversation as resolved.
               
          
            Show resolved
            Hide resolved
         | 
+     """
+     Copy creators to authors in the RO-Crate, so they can be processed in a single
+     mapping. Mapping from 'author' to 'creators' and later from 'creator' to
+     'creators' would overwrite previously written values.
+     """
+
+     for rde in rc["@graph"]:
+         if "creator" in rde:
+             for person_or_org in rde["creator"]:
+                 # Creators given as plain strings are appended if not already listed.
+                 if isinstance(person_or_org, str):
+                     added_authors = [item for item in rde["author"]]
+                     if person_or_org not in added_authors:
+                         rde["author"].append(person_or_org)
+                     continue
+                 # Creators given as objects are matched against the @ids already
+                 # present among the authors (typically ORCID URLs).
+                 urls_orcid = [
+                     item["@id"]
+                     for item in rde["author"]
+                     if isinstance(item, dict) and "@id" in item
+                 ]
+                 if person_or_org["@id"] not in urls_orcid:
+                     rde["author"].append(person_or_org)
+
+     return rc
+
+
  if __name__ == "__main__":
      main()
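To make the merge step concrete, here is a small worked example, assuming `merge_authors_and_creators` from the diff above is in scope; the entity values are made up for illustration:

```python
# A root data entity with one author and three creators: one duplicates the
# existing author (skipped), one is a new organisation, one is a plain string.
rc = {
    "@graph": [
        {
            "@id": "./",
            "author": [{"@id": "https://orcid.org/0000-0002-1825-0097"}],
            "creator": [
                {"@id": "https://orcid.org/0000-0002-1825-0097"},  # duplicate, skipped
                {"@id": "https://example.org/org-x"},              # new, appended
                "Jane Doe",                                        # plain string, appended
            ],
        }
    ]
}

merged = merge_authors_and_creators(rc)
print(merged["@graph"][0]["author"])
# [{'@id': 'https://orcid.org/0000-0002-1825-0097'}, {'@id': 'https://example.org/org-x'}, 'Jane Doe']
```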