Skip to content

Commit a3ee788

Browse files
ml-evseimrek
andauthored
Usability tweaks (#75)
* Fail early if JSONL file is present without --overwrite * Add warnings if IDs across property files and entries do not match * Improve error message when failing to parse custom property into the specified type * Add validation for types and names in config that will otherwise fail later * Drop aliases from the property dataframe to allow for easier validation * Improve warning for mismatching property fields between property file and config * don't warn if property id matches immutable_id * fix pre-commit --------- Co-authored-by: Kristjan Eimre <kristjaneimre@gmail.com>
1 parent ce111a7 commit a3ee788

File tree

3 files changed

+50
-16
lines changed

3 files changed

+50
-16
lines changed

src/optimade_maker/config.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
44
"""
55

6+
from optimade.models.optimade_json import DataType
67
from pydantic import ConfigDict, field_validator, model_validator
78

9+
IDENTIFIER_REGEX = r"^[a-z_][a-z_0-9]*$"
810
__version__ = "0.1.0"
911

1012
from pathlib import Path
@@ -24,9 +26,10 @@ class PropertyDefinition(BaseModel):
2426
"""
2527

2628
name: str = Field(
27-
description="""The field name of the property, as provided in the included
28-
the auxiliary property files.
29-
Will be served with a provider-specific prefix in the actual API, so must not start with an underscore."""
29+
description="""The field name of the property to use in the API. Will be searched for in the included
30+
the auxiliary property files, unless `aliases` is also specified.
31+
Will be served with a provider-specific prefix in the actual API, so must not start with an underscore or contain upper case characters.""",
32+
pattern=IDENTIFIER_REGEX,
3033
)
3134

3235
title: Optional[str] = Field(
@@ -38,8 +41,7 @@ class PropertyDefinition(BaseModel):
3841
unit: Optional[str] = Field(
3942
None, description="The unit of the property, e.g. 'eV' or 'Å'."
4043
)
41-
type: Optional[str] = Field(
42-
None,
44+
type: Optional[DataType] = Field(
4345
description="The OPTIMADE type of the property, e.g., `float` or `string`.",
4446
)
4547
maps_to: Optional[str] = Field(

src/optimade_maker/convert.py

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ def convert_archive(
8989

9090
if not jsonl_path:
9191
jsonl_path = archive_path / "optimade.jsonl"
92+
if jsonl_path.exists() and not overwrite:
93+
raise RuntimeError(f"Not overwriting existing file at {jsonl_path}")
9294

9395
# if the config specifies just a JSON-L, then extract any archives
9496
# and return the JSONL path
@@ -397,6 +399,10 @@ def _parse_and_assign_properties(
397399
if not property_matches_by_file:
398400
return
399401

402+
optimade_immutable_ids = {
403+
entry["attributes"].get("immutable_id") for entry in optimade_entries.values()
404+
}
405+
400406
for archive_file in property_matches_by_file:
401407
for _path in tqdm.tqdm(
402408
property_matches_by_file[archive_file],
@@ -409,6 +415,14 @@ def _parse_and_assign_properties(
409415
for id in properties:
410416
parsed_properties[id].update(properties[id])
411417
all_property_fields |= set(properties[id].keys())
418+
if (
419+
id not in optimade_entries
420+
and id not in optimade_immutable_ids
421+
):
422+
warnings.warn(
423+
f"Could not find entry {id!r} in OPTIMADE entries.",
424+
)
425+
continue
412426
break
413427
except Exception as exc:
414428
errors.append(exc)
@@ -430,9 +444,13 @@ def _parse_and_assign_properties(
430444
expected_property_fields = set(property_def_dict.keys())
431445

432446
if expected_property_fields != all_property_fields:
433-
warnings.warn(
434-
f"Found {all_property_fields=} in data but {expected_property_fields} in config"
435-
)
447+
warning_message = "Mismatch between parsed property fields (A) and those defined in config (B)."
448+
if all_property_fields - expected_property_fields:
449+
warning_message += f"\n(A - B) = {all_property_fields - expected_property_fields} (will be omitted from API; if intended this can be ignored)."
450+
if expected_property_fields - all_property_fields:
451+
warning_message += f"\n(B - A) = {expected_property_fields - all_property_fields} (configured, but missing; check for typos or missing aliases)"
452+
453+
warnings.warn(warning_message)
436454

437455
# Look for precisely matching IDs, or 'filename' matches
438456
for id in optimade_entries:
@@ -442,6 +460,14 @@ def _parse_and_assign_properties(
442460
# try to find a matching ID based on the filename
443461
property_entry_id = id.split("/")[-1].split(".")[0]
444462

463+
if (property_entry_id not in parsed_properties) and (
464+
id not in parsed_properties
465+
):
466+
warnings.warn(
467+
f"Could not find entry {id!r} (or fully-qualified {property_entry_id!r}) in parsed properties",
468+
)
469+
continue
470+
445471
# Loop over all defined properties and assign them to the entry, setting to None if missing
446472
# Also cast types if provided
447473
for property in all_property_fields:
@@ -451,10 +477,15 @@ def _parse_and_assign_properties(
451477
property, None
452478
) or parsed_properties.get(id, {}).get(property, None)
453479
if property not in property_def_dict:
454-
warnings.warn(f"Missing property definition for {property=}")
480+
# These are already warned about above: fields that are not configured but are present in the property file
455481
continue
456482
if value is not None and property_def_dict[property].type in TYPE_MAP:
457-
value = TYPE_MAP[property_def_dict[property].type](value)
483+
try:
484+
value = TYPE_MAP[property_def_dict[property].type](value)
485+
except Exception as exc:
486+
raise RuntimeError(
487+
f"Could not cast {value=} for {property=} to type {property_def_dict[property].type!r} for entry {id!r}"
488+
) from exc
458489

459490
optimade_entries[id]["attributes"][f"_{provider_prefix}_{property}"] = value
460491

src/optimade_maker/parsers.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
) from exc
1414

1515
from optimade.adapters import Structure
16-
from optimade.models import EntryResource
16+
from optimade.models import DataType, EntryResource
1717

1818
from optimade_maker.config import PropertyDefinition
1919

@@ -57,6 +57,7 @@ def load_csv_file(
5757
for alias in prop.aliases or []:
5858
if alias in df:
5959
df[prop.name] = df[alias]
60+
df.drop(columns=[alias], inplace=True)
6061
break
6162

6263
return df.to_dict(orient="index")
@@ -68,11 +69,11 @@ def load_csv_file(
6869
".csv": [load_csv_file],
6970
}
7071

71-
TYPE_MAP: dict[str | None, type] = {
72-
"float": float,
73-
"string": str,
74-
"integer": int,
75-
"boolean": bool,
72+
TYPE_MAP: dict[DataType, type] = {
73+
DataType.FLOAT: float,
74+
DataType.STRING: str,
75+
DataType.INTEGER: int,
76+
DataType.BOOLEAN: bool,
7677
}
7778

7879

0 commit comments

Comments
 (0)