From a8b1605106ba7578b875888f1bfbabc51ad3b4bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 5 Feb 2024 17:51:15 +0100 Subject: [PATCH 01/62] TDD: add test case for ignore header case in validating resource with schema_sync option --- .../__spec__/resource/test_schema.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/validator/__spec__/resource/test_schema.py index 616f0a2728..b05fe9f003 100644 --- a/frictionless/validator/__spec__/resource/test_schema.py +++ b/frictionless/validator/__spec__/resource/test_schema.py @@ -4,6 +4,7 @@ import frictionless from frictionless import Checklist, Detector, FrictionlessException, Schema, fields +from frictionless import Dialect from frictionless.resources import TableResource # General @@ -373,3 +374,36 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"]) == tc["expected_flattened_report"] ) + + +def test_validate_resource_ignoring_header_case_issue_1635(): + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [ + { + "name": "AA", + "title": "Field A", + "type": "string", + "constraints": {"required": True}, + }, + {"name": "BB", "title": "Field B", "type": "string"}, + {"name": "CC", "title": "Field C", "type": "string"}, + ], + } + + source = [["aa", "bb", "cc"], ["a", "b", "c"]] + report = frictionless.validate( + source=source, + schema=Schema.from_descriptor(schema_descriptor), + detector=Detector(schema_sync=True), + dialect=Dialect(header_case=False) + ) + assert report.valid + + report = frictionless.validate( + source=source, + schema=Schema.from_descriptor(schema_descriptor), + detector=Detector(schema_sync=True) + ) + assert not report.valid + assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [[None, 4, "AA", "missing-label"]] From 542398259886d46bb912c2fe7b630243ce03b5c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 5 Feb 2024 17:52:10 +0100 Subject: [PATCH 02/62] TDD: solve new test case --- frictionless/detector/detector.py | 20 ++++++++++++++++---- frictionless/resources/table.py | 6 ++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index dd656427d5..59d773a8b0 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -298,6 +298,7 @@ def detect_schema( labels: Optional[List[str]] = None, schema: Optional[Schema] = None, field_candidates: List[Dict[str, Any]] = settings.DEFAULT_FIELD_CANDIDATES, + **options: Any ) -> Schema: """Detect schema from fragment @@ -408,17 +409,28 @@ def detect_schema( if len(labels) != len(set(labels)): note = '"schema_sync" requires unique labels in the header' raise FrictionlessException(note) - mapping = {field.name: field for field in schema.fields} # type: ignore + if options["header_case"]: + mapping = {field.name: field for field in schema.fields} # type: ignore + else: + mapping = {field.name.lower(): field for field in schema.fields} # type: ignore schema.clear_fields() for name in labels: - field = mapping.get(name) + if options["header_case"]: + field = mapping.get(name) + else: + field = mapping.get(name.lower()) if not field: field = Field.from_descriptor({"name": name, "type": "any"}) schema.add_field(field) # For required fields that are missing for _, field in mapping.items(): - if field and field.required and field.name not in labels: - schema.add_field(field) + if options["header_case"]: + if field and field.required and field.name not in labels: + schema.add_field(field) + else: + if field and field.required and field.name.lower() not in [ + label.lower() for label in labels]: + schema.add_field(field) # Patch schema if self.schema_patch: diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 98a76f3473..4ba335b459 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -204,6 +204,7 @@ def __open_schema(self): labels=self.labels, schema=self.schema, field_candidates=system.detect_field_candidates(), + header_case=self.dialect.header_case ) self.stats.fields = len(self.schema.fields) @@ -388,14 +389,15 @@ def row_stream(): # NB: missing required labels are not included in the # field_info parameter used for row creation - if self.detector.schema_sync: + if self.detector.schema_sync and self.dialect.header_case: for field in self.schema.fields: if field.name not in self.labels and field.name in field_info["names"]: field_index = field_info["names"].index(field.name) del field_info["names"][field_index] del field_info["objects"][field_index] del field_info["mapping"][field.name] - # # Create row stream + + # Create row stream self.__row_stream = row_stream() # Read From 36d3f777d33b3d238e08cd5ae54fc6fc3187b757 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 5 Feb 2024 17:55:38 +0100 Subject: [PATCH 03/62] Refacto: refactoring test_schema --- .../validator/__spec__/resource/test_schema.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/validator/__spec__/resource/test_schema.py index b05fe9f003..e3db265894 100644 --- a/frictionless/validator/__spec__/resource/test_schema.py +++ b/frictionless/validator/__spec__/resource/test_schema.py @@ -392,18 +392,23 @@ def test_validate_resource_ignoring_header_case_issue_1635(): } source = [["aa", "bb", "cc"], ["a", "b", "c"]] + + schema = Schema.from_descriptor(schema_descriptor) + + detector = Detector(schema_sync=True) + report = frictionless.validate( - source=source, - schema=Schema.from_descriptor(schema_descriptor), - detector=Detector(schema_sync=True), + source, + schema=schema, + detector=detector, dialect=Dialect(header_case=False) ) assert report.valid report = frictionless.validate( - source=source, - schema=Schema.from_descriptor(schema_descriptor), - detector=Detector(schema_sync=True) + source, + schema=schema, + detector=detector, ) assert not report.valid assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [[None, 4, "AA", "missing-label"]] From 45bce3b557dd222bdd17dfcdbdb63a6fb4b8e125 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 5 Feb 2024 17:57:34 +0100 Subject: [PATCH 04/62] Linting: format files --- frictionless/detector/detector.py | 12 ++++++++---- frictionless/resources/table.py | 2 +- .../validator/__spec__/resource/test_schema.py | 15 +++++++-------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 59d773a8b0..7b53f9d23c 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -298,7 +298,7 @@ def detect_schema( labels: Optional[List[str]] = None, schema: Optional[Schema] = None, field_candidates: List[Dict[str, Any]] = settings.DEFAULT_FIELD_CANDIDATES, - **options: Any + **options: Any, ) -> Schema: """Detect schema from fragment @@ -412,7 +412,7 @@ def detect_schema( if options["header_case"]: mapping = {field.name: field for field in schema.fields} # type: ignore else: - mapping = {field.name.lower(): field for field in schema.fields} # type: ignore + mapping = {field.name.lower(): field for field in schema.fields} # type: ignore schema.clear_fields() for name in labels: if options["header_case"]: @@ -428,8 +428,12 @@ def detect_schema( if field and field.required and field.name not in labels: schema.add_field(field) else: - if field and field.required and field.name.lower() not in [ - label.lower() for label in labels]: + if ( + field + and field.required + and field.name.lower() + not in [label.lower() for label in labels] + ): schema.add_field(field) # Patch schema diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 4ba335b459..b143529b33 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -204,7 +204,7 @@ def __open_schema(self): labels=self.labels, schema=self.schema, field_candidates=system.detect_field_candidates(), - header_case=self.dialect.header_case + header_case=self.dialect.header_case, ) self.stats.fields = len(self.schema.fields) diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/validator/__spec__/resource/test_schema.py index e3db265894..cd954f551d 100644 --- a/frictionless/validator/__spec__/resource/test_schema.py +++ b/frictionless/validator/__spec__/resource/test_schema.py @@ -3,8 +3,8 @@ import pytest import frictionless -from frictionless import Checklist, Detector, FrictionlessException, Schema, fields -from frictionless import Dialect +from frictionless import Checklist, Detector, Dialect, FrictionlessException, Schema +from frictionless import fields from frictionless.resources import TableResource # General @@ -394,14 +394,11 @@ def test_validate_resource_ignoring_header_case_issue_1635(): source = [["aa", "bb", "cc"], ["a", "b", "c"]] schema = Schema.from_descriptor(schema_descriptor) - + detector = Detector(schema_sync=True) report = frictionless.validate( - source, - schema=schema, - detector=detector, - dialect=Dialect(header_case=False) + source, schema=schema, detector=detector, dialect=Dialect(header_case=False) ) assert report.valid @@ -411,4 +408,6 @@ def test_validate_resource_ignoring_header_case_issue_1635(): detector=detector, ) assert not report.valid - assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [[None, 4, "AA", "missing-label"]] + assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [ + [None, 4, "AA", "missing-label"] + ] From dc5d254d22fa1ef61b4b5e8590bd302051992788 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Tue, 6 Feb 2024 09:59:38 +0100 Subject: [PATCH 05/62] TDD: add test case for missing label in tabular data and ignoring case: test fails --- .../__spec__/resource/test_schema.py | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/validator/__spec__/resource/test_schema.py index cd954f551d..380cb6da0c 100644 --- a/frictionless/validator/__spec__/resource/test_schema.py +++ b/frictionless/validator/__spec__/resource/test_schema.py @@ -369,12 +369,34 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 tc["source"], schema=schema, detector=Detector(schema_sync=True) ) report = frictionless.validate(resource) - print(report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) assert ( report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"]) == tc["expected_flattened_report"] ) + # Ignore case + schema_descriptor_3 = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [ + {"name": "Aa", "constraints": {"required": True}}, + {"name": "BB", "constraints": {"required": True}}, + {"name": "cc"} + ] + } + schema = Schema.from_descriptor(schema_descriptor_3) + source = [["bb", "CC"], ["foo", "bar"]] + report = frictionless.validate( + source, + schema=schema, + detector=Detector(schema_sync=True), + dialect=Dialect(header_case=False) + ) + assert not report.valid + # Expect one single error misisng-label related to missing column 'Aa' + assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [ + [None, 3, "Aa", "missing-label"] + ] + def test_validate_resource_ignoring_header_case_issue_1635(): schema_descriptor = { From 7a12203f5940650030e7ce416aa50c38f2497b39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Tue, 6 Feb 2024 10:50:57 +0100 Subject: [PATCH 06/62] TDD: fix missing required field with 'header_case=False' dialect and 'schema_sync=True' options --- frictionless/resources/table.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index b143529b33..a855f494bb 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -389,13 +389,21 @@ def row_stream(): # NB: missing required labels are not included in the # field_info parameter used for row creation - if self.detector.schema_sync and self.dialect.header_case: + if self.detector.schema_sync: for field in self.schema.fields: - if field.name not in self.labels and field.name in field_info["names"]: - field_index = field_info["names"].index(field.name) - del field_info["names"][field_index] - del field_info["objects"][field_index] - del field_info["mapping"][field.name] + if self.dialect.header_case: + if field.name not in self.labels and field.name in field_info["names"]: + field_index = field_info["names"].index(field.name) + del field_info["names"][field_index] + del field_info["objects"][field_index] + del field_info["mapping"][field.name] + else: # Ignore case + if field.name.lower() not in [label.lower() for label in self.labels] \ + and field.name.lower() in [field_info_name.lower() for field_info_name in field_info["names"]]: + field_index = field_info["names"].index(field.name) + del field_info["names"][field_index] + del field_info["objects"][field_index] + del field_info["mapping"][field.name] # Create row stream self.__row_stream = row_stream() From 7c0514fee91277a21c2654a7f011ace4be50cc34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Tue, 6 Feb 2024 10:52:06 +0100 Subject: [PATCH 07/62] Linting: format files and sort imports --- frictionless/resources/table.py | 12 +++++++++--- .../validator/__spec__/resource/test_schema.py | 10 +++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index a855f494bb..03093cb989 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -392,14 +392,20 @@ def row_stream(): if self.detector.schema_sync: for field in self.schema.fields: if self.dialect.header_case: - if field.name not in self.labels and field.name in field_info["names"]: + if ( + field.name not in self.labels + and field.name in field_info["names"] + ): field_index = field_info["names"].index(field.name) del field_info["names"][field_index] del field_info["objects"][field_index] del field_info["mapping"][field.name] else: # Ignore case - if field.name.lower() not in [label.lower() for label in self.labels] \ - and field.name.lower() in [field_info_name.lower() for field_info_name in field_info["names"]]: + if field.name.lower() not in [ + label.lower() for label in self.labels + ] and field.name.lower() in [ + field_info_name.lower() for field_info_name in field_info["names"] + ]: field_index = field_info["names"].index(field.name) del field_info["names"][field_index] del field_info["objects"][field_index] diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/validator/__spec__/resource/test_schema.py index 380cb6da0c..d7aa9ff674 100644 --- a/frictionless/validator/__spec__/resource/test_schema.py +++ b/frictionless/validator/__spec__/resource/test_schema.py @@ -378,10 +378,10 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 schema_descriptor_3 = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [ - {"name": "Aa", "constraints": {"required": True}}, - {"name": "BB", "constraints": {"required": True}}, - {"name": "cc"} - ] + {"name": "Aa", "constraints": {"required": True}}, + {"name": "BB", "constraints": {"required": True}}, + {"name": "cc"}, + ], } schema = Schema.from_descriptor(schema_descriptor_3) source = [["bb", "CC"], ["foo", "bar"]] @@ -389,7 +389,7 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 source, schema=schema, detector=Detector(schema_sync=True), - dialect=Dialect(header_case=False) + dialect=Dialect(header_case=False), ) assert not report.valid # Expect one single error misisng-label related to missing column 'Aa' From 51bfb762b843fd607b2cfe97fa6e5ec0a2c18bfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Tue, 6 Feb 2024 11:04:25 +0100 Subject: [PATCH 08/62] Refacto: refactoring test --- .../__spec__/resource/test_schema.py | 86 +++++++++---------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/validator/__spec__/resource/test_schema.py index d7aa9ff674..69a3acd36b 100644 --- a/frictionless/validator/__spec__/resource/test_schema.py +++ b/frictionless/validator/__spec__/resource/test_schema.py @@ -374,62 +374,60 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 == tc["expected_flattened_report"] ) - # Ignore case - schema_descriptor_3 = { - "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - "fields": [ - {"name": "Aa", "constraints": {"required": True}}, - {"name": "BB", "constraints": {"required": True}}, - {"name": "cc"}, - ], - } - schema = Schema.from_descriptor(schema_descriptor_3) - source = [["bb", "CC"], ["foo", "bar"]] - report = frictionless.validate( - source, - schema=schema, - detector=Detector(schema_sync=True), - dialect=Dialect(header_case=False), - ) - assert not report.valid - # Expect one single error misisng-label related to missing column 'Aa' - assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [ - [None, 3, "Aa", "missing-label"] - ] - def test_validate_resource_ignoring_header_case_issue_1635(): schema_descriptor = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [ { - "name": "AA", + "name": "aa", "title": "Field A", "type": "string", "constraints": {"required": True}, }, - {"name": "BB", "title": "Field B", "type": "string"}, + { + "name": "BB", + "title": "Field B", + "type": "string", + "constraints": {"required": True}, + }, {"name": "CC", "title": "Field C", "type": "string"}, ], } - source = [["aa", "bb", "cc"], ["a", "b", "c"]] - - schema = Schema.from_descriptor(schema_descriptor) - - detector = Detector(schema_sync=True) - - report = frictionless.validate( - source, schema=schema, detector=detector, dialect=Dialect(header_case=False) - ) - assert report.valid - - report = frictionless.validate( - source, - schema=schema, - detector=detector, - ) - assert not report.valid - assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == [ - [None, 4, "AA", "missing-label"] + test_cases = [ + { + "source": [["AA", "bb", "cc"], ["a", "b", "c"]], + "dialect": Dialect(header_case=False), + "expected_valid_report": True, + "expected_flattened_report": [], + }, + { + "source": [["AA", "bb", "cc"], ["a", "b", "c"]], + "dialect": Dialect(header_case=True), + "expected_valid_report": False, + "expected_flattened_report": [ + [None, 4, "aa", "missing-label"], + [None, 5, "BB", "missing-label"], + ], + }, + { + "source": [["bb", "CC"], ["foo", "bar"]], + "dialect": Dialect(header_case=False), + "expected_valid_report": False, + "expected_flattened_report": [[None, 3, "aa", "missing-label"]], + }, ] + + for tc in test_cases: + resource = TableResource( + tc["source"], + schema=Schema.from_descriptor(schema_descriptor), + detector=Detector(schema_sync=True), + dialect=tc["dialect"], + ) + report = frictionless.validate(resource) + assert report.valid == tc["expected_valid_report"] + assert (report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"])) == tc[ + "expected_flattened_report" + ] From 6bbc2c79730916863b7f3501ba8dce543da5e0a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Fri, 9 Feb 2024 15:02:06 +0100 Subject: [PATCH 09/62] Refacto: refactoring test --- .../__spec__/resource/test_schema.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/validator/__spec__/resource/test_schema.py index 69a3acd36b..458d25ca3b 100644 --- a/frictionless/validator/__spec__/resource/test_schema.py +++ b/frictionless/validator/__spec__/resource/test_schema.py @@ -391,31 +391,30 @@ def test_validate_resource_ignoring_header_case_issue_1635(): "type": "string", "constraints": {"required": True}, }, - {"name": "CC", "title": "Field C", "type": "string"}, ], } test_cases = [ { - "source": [["AA", "bb", "cc"], ["a", "b", "c"]], - "dialect": Dialect(header_case=False), + "source": [["AA", "bb"], ["a", "b"]], + "header_case": False, "expected_valid_report": True, "expected_flattened_report": [], }, { - "source": [["AA", "bb", "cc"], ["a", "b", "c"]], - "dialect": Dialect(header_case=True), + "source": [["AA", "bb"], ["a", "b"]], + "header_case": True, "expected_valid_report": False, "expected_flattened_report": [ - [None, 4, "aa", "missing-label"], - [None, 5, "BB", "missing-label"], + [None, 3, "aa", "missing-label"], + [None, 4, "BB", "missing-label"], ], }, { - "source": [["bb", "CC"], ["foo", "bar"]], - "dialect": Dialect(header_case=False), + "source": [["bb"], ["foo"]], + "header_case": False, "expected_valid_report": False, - "expected_flattened_report": [[None, 3, "aa", "missing-label"]], + "expected_flattened_report": [[None, 2, "aa", "missing-label"]], }, ] @@ -424,7 +423,7 @@ def test_validate_resource_ignoring_header_case_issue_1635(): tc["source"], schema=Schema.from_descriptor(schema_descriptor), detector=Detector(schema_sync=True), - dialect=tc["dialect"], + dialect=Dialect(header_case=tc["header_case"]), ) report = frictionless.validate(resource) assert report.valid == tc["expected_valid_report"] From b833a0c1c279ffa2fefefc8bfda6482a164cb31d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Fri, 9 Feb 2024 16:10:58 +0100 Subject: [PATCH 10/62] Refacto: refactoring removing missing required label from field info part in '__open_row_stream()' 'TableResource' method --- frictionless/resources/table.py | 61 +++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 03093cb989..89b9fcbc7d 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -5,6 +5,8 @@ import warnings from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from frictionless.schema.field import Field + from .. import errors, helpers, settings from ..analyzer import Analyzer from ..dialect import Dialect @@ -387,33 +389,50 @@ def row_stream(): # Yield row yield row - # NB: missing required labels are not included in the - # field_info parameter used for row creation if self.detector.schema_sync: + # Missing required labels have in 'field_info' + # parameter used for row creation for field in self.schema.fields: - if self.dialect.header_case: - if ( - field.name not in self.labels - and field.name in field_info["names"] - ): - field_index = field_info["names"].index(field.name) - del field_info["names"][field_index] - del field_info["objects"][field_index] - del field_info["mapping"][field.name] - else: # Ignore case - if field.name.lower() not in [ - label.lower() for label in self.labels - ] and field.name.lower() in [ - field_info_name.lower() for field_info_name in field_info["names"] - ]: - field_index = field_info["names"].index(field.name) - del field_info["names"][field_index] - del field_info["objects"][field_index] - del field_info["mapping"][field.name] + self.remove_missing_required_label_from_field_info(field, field_info) # Create row stream self.__row_stream = row_stream() + def remove_missing_required_label_from_field_info( + self, field: Field, field_info: Dict[str, Any] + ): + is_case_sensitive = self.dialect.header_case + if self.field_is_missing( + field.name, field_info["names"], self.labels, is_case_sensitive + ): + self.remove_field_from_field_info(field.name, field_info) + + @staticmethod + def field_is_missing( + field_name: str, + expected_fields_names: List[str], + table_labels: types.ILabels, + case_sensitive: bool, + ) -> bool: + """Check if a schema field name is missing from the TableResource + labels. + """ + if not case_sensitive: + field_name = field_name.lower() + table_labels = [label.lower() for label in table_labels] + expected_fields_names = [ + field_name.lower() for field_name in expected_fields_names + ] + + return field_name not in table_labels and field_name in expected_fields_names + + @staticmethod + def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]): + field_index = field_info["names"].index(field_name) + del field_info["names"][field_index] + del field_info["objects"][field_index] + del field_info["mapping"][field_name] + # Read def read_cells(self, *, size: Optional[int] = None) -> List[List[Any]]: From 3550cc3e442f589e8bf1478c746dfa138ffe3f1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Fri, 9 Feb 2024 17:41:43 +0100 Subject: [PATCH 11/62] Refacto: WIP --- frictionless/detector/detector.py | 96 ++++++++++++++++++++++--------- 1 file changed, 70 insertions(+), 26 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 7b53f9d23c..72eb2da7e5 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -3,7 +3,7 @@ import codecs import os from copy import copy, deepcopy -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import attrs @@ -409,32 +409,21 @@ def detect_schema( if len(labels) != len(set(labels)): note = '"schema_sync" requires unique labels in the header' raise FrictionlessException(note) - if options["header_case"]: - mapping = {field.name: field for field in schema.fields} # type: ignore - else: - mapping = {field.name.lower(): field for field in schema.fields} # type: ignore - schema.clear_fields() - for name in labels: - if options["header_case"]: - field = mapping.get(name) - else: - field = mapping.get(name.lower()) - if not field: - field = Field.from_descriptor({"name": name, "type": "any"}) - schema.add_field(field) + + case_sensitive = options["header_case"] + + fields_mapped = self.mapped_schema_fields_names( + schema.fields, case_sensitive + ) + + self.add_fields_to_schema_among_labels( + fields_mapped, schema, labels, case_sensitive + ) + # For required fields that are missing - for _, field in mapping.items(): - if options["header_case"]: - if field and field.required and field.name not in labels: - schema.add_field(field) - else: - if ( - field - and field.required - and field.name.lower() - not in [label.lower() for label in labels] - ): - schema.add_field(field) + self.add_missing_required_labels_to_schema_fields( + fields_mapped, schema, labels, options["header_case"] + ) # Patch schema if self.schema_patch: @@ -449,3 +438,58 @@ def detect_schema( schema = Schema.from_descriptor(descriptor) return schema # type: ignore + + @staticmethod + def mapped_schema_fields_names( + fields: Union[List[None], List[Field]], case_sensitive: bool + ) -> Dict[str, Optional[Field]]: + """Create a dictionnary to map fields name with schema fields + considering case sensitivity + + Args: + fields (Union[List[None], List[Field]]): list of original + schema fields + case_sensitive (bool) + + Returns: + Dict[str, Optional[Field]] + """ + if case_sensitive: + return {field.name: field for field in fields} # type:ignore + else: + return {field.name.lower(): field for field in fields} # type: ignore + + @staticmethod + def add_fields_to_schema_among_labels( + fields_mapped: Dict[str, Optional[Field]], + schema: Schema, + labels: List[str], + case_sensitive: bool, + ): + schema.clear_fields() + for name in labels: + default_field = Field.from_descriptor({"name": name, "type": "any"}) + if case_sensitive: + field = fields_mapped.get(name, default_field) + else: + field = fields_mapped.get(name.lower(), default_field) + schema.add_field(field) # type: ignore + + @staticmethod + def add_missing_required_labels_to_schema_fields( + fields_map: Dict[str, Optional[Field]], + schema: Schema, + labels: List[str], + case_sensitive: bool, + ): + for _, field in fields_map.items(): + if case_sensitive: + if field and field.required and field.name not in labels: + schema.add_field(field) + else: + if ( + field + and field.required + and field.name.lower() not in [label.lower() for label in labels] + ): + schema.add_field(field) From e4839f1d68cc769b9e51ba3ae4345ca7d3097dc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Fri, 9 Feb 2024 17:47:57 +0100 Subject: [PATCH 12/62] Refacto: WIP2 --- frictionless/detector/detector.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 72eb2da7e5..10bfc8d267 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -483,6 +483,7 @@ def add_missing_required_labels_to_schema_fields( case_sensitive: bool, ): for _, field in fields_map.items(): + #TODO use self.field_name_not_in_labels if case_sensitive: if field and field.required and field.name not in labels: schema.add_field(field) @@ -493,3 +494,20 @@ def add_missing_required_labels_to_schema_fields( and field.name.lower() not in [label.lower() for label in labels] ): schema.add_field(field) + + @staticmethod + def field_name_not_in_labels( + field: Field, + labels: List[str], + case_sensitive: bool + ) -> bool: + if case_sensitive: + return field and field.required and field.name not in labels + else: + return ( + field + and field.required + and field.name.lower() not in [ + label.lower() for label in labels + ] + ) From 2d5a575bec5680b2250195fa4601b05f16b7686f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 12:02:49 +0100 Subject: [PATCH 13/62] Refacto: refactoring creating schema fields among labels when using 'schema_sync' 'Detector' option --- frictionless/detector/detector.py | 53 +++++++++++++------------------ 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 10bfc8d267..31b02b87e7 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -3,7 +3,7 @@ import codecs import os from copy import copy, deepcopy -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional import attrs @@ -412,17 +412,21 @@ def detect_schema( case_sensitive = options["header_case"] - fields_mapped = self.mapped_schema_fields_names( - schema.fields, case_sensitive + assert schema + assert schema.fields + assert all(isinstance(field, Field) for field in schema.fields) + + mapped_fields = self.mapped_schema_fields_names( + schema.fields, case_sensitive # type: ignore ) self.add_fields_to_schema_among_labels( - fields_mapped, schema, labels, case_sensitive + mapped_fields, schema, labels, case_sensitive # type: ignore ) # For required fields that are missing self.add_missing_required_labels_to_schema_fields( - fields_mapped, schema, labels, options["header_case"] + mapped_fields, schema, labels, case_sensitive # type: ignore ) # Patch schema @@ -441,7 +445,7 @@ def detect_schema( @staticmethod def mapped_schema_fields_names( - fields: Union[List[None], List[Field]], case_sensitive: bool + fields: List[Field], case_sensitive: bool ) -> Dict[str, Optional[Field]]: """Create a dictionnary to map fields name with schema fields considering case sensitivity @@ -455,13 +459,13 @@ def mapped_schema_fields_names( Dict[str, Optional[Field]] """ if case_sensitive: - return {field.name: field for field in fields} # type:ignore + return {field.name: field for field in fields} else: - return {field.name.lower(): field for field in fields} # type: ignore + return {field.name.lower(): field for field in fields} @staticmethod def add_fields_to_schema_among_labels( - fields_mapped: Dict[str, Optional[Field]], + fields_mapped: Dict[str, Field], schema: Schema, labels: List[str], case_sensitive: bool, @@ -473,41 +477,28 @@ def add_fields_to_schema_among_labels( field = fields_mapped.get(name, default_field) else: field = fields_mapped.get(name.lower(), default_field) - schema.add_field(field) # type: ignore + schema.add_field(field) - @staticmethod def add_missing_required_labels_to_schema_fields( - fields_map: Dict[str, Optional[Field]], + self, + fields_map: Dict[str, Field], schema: Schema, labels: List[str], case_sensitive: bool, ): for _, field in fields_map.items(): - #TODO use self.field_name_not_in_labels - if case_sensitive: - if field and field.required and field.name not in labels: - schema.add_field(field) - else: - if ( - field - and field.required - and field.name.lower() not in [label.lower() for label in labels] - ): - schema.add_field(field) + if self.field_name_not_in_labels(field, labels, case_sensitive): + schema.add_field(field) @staticmethod def field_name_not_in_labels( - field: Field, - labels: List[str], - case_sensitive: bool + field: Field, labels: List[str], case_sensitive: bool ) -> bool: if case_sensitive: return field and field.required and field.name not in labels else: return ( - field - and field.required - and field.name.lower() not in [ - label.lower() for label in labels - ] + field + and field.required + and field.name.lower() not in [label.lower() for label in labels] ) From 47072f4c840a2f437c9db58f0ed4fde9624f9215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 12:08:14 +0100 Subject: [PATCH 14/62] Doc: docstring --- frictionless/detector/detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 31b02b87e7..8722f69a10 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -453,7 +453,7 @@ def mapped_schema_fields_names( Args: fields (Union[List[None], List[Field]]): list of original schema fields - case_sensitive (bool) + case_sensitive (bool): True if case sensitive, False else Returns: Dict[str, Optional[Field]] From 424ad2309a822e1d4c53db6dd01fa1edbff63315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 12:11:31 +0100 Subject: [PATCH 15/62] Refacto: remve useless lines --- frictionless/detector/detector.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 8722f69a10..ba269d0a8a 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -495,10 +495,9 @@ def field_name_not_in_labels( field: Field, labels: List[str], case_sensitive: bool ) -> bool: if case_sensitive: - return field and field.required and field.name not in labels + return field.required and field.name not in labels else: return ( - field - and field.required + field.required and field.name.lower() not in [label.lower() for label in labels] ) From b08aaf219bdf90083c549259691a49ea90aed02a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Thu, 8 Feb 2024 10:16:04 +0100 Subject: [PATCH 16/62] TDD: add test case for missing label in TableResource corresponding to a required field and primary key schema: test fails --- frictionless/table/__spec__/test_header.py | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index e3fce0a921..c72fcfc6d9 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -1,3 +1,4 @@ +import frictionless from frictionless import Schema, fields from frictionless.resources import TableResource @@ -37,3 +38,25 @@ def test_missing_label(): assert header == ["id", "name", "extra"] assert header.labels == ["id", "name"] assert header.valid is False + + +def test_missing_primary_key_label_with_shema_sync_issue_1633(): + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [{"name": "A", "constraints": {"required": True}}], + "primaryKey": ["A"], + } + + source = [["B"], ["foo"]] + + resource = TableResource( + source, + schema=Schema.from_descriptor(schema_descriptor), + detector=frictionless.Detector(schema_sync=True), + ) + + report = frictionless.validate(resource) + + assert len(report.tasks[0].errors) == 1 + assert report.tasks[0].errors[0].type == "missing-label" + assert not report.valid From 3f3b08dcba19f9cad42cd69ea2a8ec67c4236b03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Thu, 8 Feb 2024 14:19:46 +0100 Subject: [PATCH 17/62] TDD: fix unexpected key-error: test passes --- frictionless/resources/table.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 89b9fcbc7d..4ae97f06c1 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -334,19 +334,24 @@ def row_stream(): # Primary Key Error if is_integrity and self.schema.primary_key: - cells = tuple(row[name] for name in self.schema.primary_key) - if set(cells) == {None}: - note = 'cells composing the primary keys are all "None"' - error = errors.PrimaryKeyError.from_row(row, note=note) - row.errors.append(error) + try: + cells = tuple(row[name] for name in self.schema.primary_key) + except KeyError: + # Row does not have primary_key as key + pass else: - match = memory_primary.get(cells) - memory_primary[cells] = row.row_number - if match: + if set(cells) == {None}: + note = 'cells composing the primary keys are all "None"' + error = errors.PrimaryKeyError.from_row(row, note=note) + row.errors.append(error) + else: + match = memory_primary.get(cells) + memory_primary[cells] = row.row_number if match: - note = "the same as in the row at position %s" % match - error = errors.PrimaryKeyError.from_row(row, note=note) - row.errors.append(error) + if match: + note = "the same as in the row at position %s" % match + error = errors.PrimaryKeyError.from_row(row, note=note) + row.errors.append(error) # Foreign Key Error if is_integrity and foreign_groups: From 03f65d04eb9375d42d5ba63e15ca077e099924d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Thu, 8 Feb 2024 14:25:14 +0100 Subject: [PATCH 18/62] TDD: add test case for missing label in TableRecource columns corresponding to a primary key schema: test fails --- frictionless/table/__spec__/test_header.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index c72fcfc6d9..98271e3338 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -60,3 +60,22 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): assert len(report.tasks[0].errors) == 1 assert report.tasks[0].errors[0].type == "missing-label" assert not report.valid + + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [{"name": "A"}], + "primaryKey": ["A"], + } + + source = [["B"], ["foo"]] + + resource = TableResource( + source, + schema=Schema.from_descriptor(schema_descriptor), + detector=frictionless.Detector(schema_sync=True), + ) + + report = frictionless.validate(resource) + print(report) + assert report.tasks[0].errors[0].type == "primary-key" + assert not report.valid From 8870f26a9cb591812deb2b9fbc5d5e8c4ec9ed93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Thu, 8 Feb 2024 14:39:58 +0100 Subject: [PATCH 19/62] TDD: append PrimaryKeyError in case of missing label corresponding to primary key schema field: test passes --- frictionless/resources/table.py | 11 +++++++++-- frictionless/table/__spec__/test_header.py | 11 ++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 4ae97f06c1..4780d6f79d 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -338,7 +338,12 @@ def row_stream(): cells = tuple(row[name] for name in self.schema.primary_key) except KeyError: # Row does not have primary_key as key - pass + # There is a missing label corresponding to the schema + note = ( + f"Inexistant label for primary key {self.schema.primary_key}" + ) + error = errors.PrimaryKeyError.from_row(row, note=note) + row.errors.append(error) else: if set(cells) == {None}: note = 'cells composing the primary keys are all "None"' @@ -350,7 +355,9 @@ def row_stream(): if match: if match: note = "the same as in the row at position %s" % match - error = errors.PrimaryKeyError.from_row(row, note=note) + error = errors.PrimaryKeyError.from_row( + row, note=note + ) row.errors.append(error) # Foreign Key Error diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index 98271e3338..29c6d229dc 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -57,10 +57,11 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): report = frictionless.validate(resource) - assert len(report.tasks[0].errors) == 1 - assert report.tasks[0].errors[0].type == "missing-label" assert not report.valid - + assert len(report.tasks[0].errors) == 2 + assert report.tasks[0].errors[0].type == "missing-label" + assert report.tasks[0].errors[1].type == "primary-key" + schema_descriptor = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [{"name": "A"}], @@ -76,6 +77,6 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): ) report = frictionless.validate(resource) - print(report) - assert report.tasks[0].errors[0].type == "primary-key" assert not report.valid + assert len(report.tasks[0].errors) == 1 + assert report.tasks[0].errors[0].type == "primary-key" From 1736f54d87793f7e58c674a81e8a624908c7852e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Thu, 8 Feb 2024 15:04:23 +0100 Subject: [PATCH 20/62] Refacto: refactoring tests --- frictionless/table/__spec__/test_header.py | 71 ++++++++++------------ 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index 29c6d229dc..645a34c9e2 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -41,42 +41,37 @@ def test_missing_label(): def test_missing_primary_key_label_with_shema_sync_issue_1633(): - schema_descriptor = { - "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - "fields": [{"name": "A", "constraints": {"required": True}}], - "primaryKey": ["A"], - } - source = [["B"], ["foo"]] - - resource = TableResource( - source, - schema=Schema.from_descriptor(schema_descriptor), - detector=frictionless.Detector(schema_sync=True), - ) - - report = frictionless.validate(resource) - - assert not report.valid - assert len(report.tasks[0].errors) == 2 - assert report.tasks[0].errors[0].type == "missing-label" - assert report.tasks[0].errors[1].type == "primary-key" - - schema_descriptor = { - "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - "fields": [{"name": "A"}], - "primaryKey": ["A"], - } - - source = [["B"], ["foo"]] - - resource = TableResource( - source, - schema=Schema.from_descriptor(schema_descriptor), - detector=frictionless.Detector(schema_sync=True), - ) - - report = frictionless.validate(resource) - assert not report.valid - assert len(report.tasks[0].errors) == 1 - assert report.tasks[0].errors[0].type == "primary-key" + test_cases = [ + { + "constraints": {"required": True}, + "nb_errors": 2, + "types_errors_expected": ["missing-label", "primary-key"], + }, + { + "constraints": {}, + "nb_errors": 1, + "types_errors_expected": ["primary-key"], + } + ] + + for tc in test_cases: + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [{"name": "A", "constraints": tc["constraints"]}], + "primaryKey": ["A"], + } + + resource = TableResource( + source=[["B"], ["foo"]], + schema=Schema.from_descriptor(schema_descriptor), + detector=frictionless.Detector(schema_sync=True), + ) + + report = frictionless.validate(resource) + errors = report.tasks[0].errors + + assert not report.valid + assert len(errors) == tc["nb_errors"] + for error, type_expected in zip(errors, tc["types_errors_expected"]): + assert error.type == type_expected From b5157833a4547dcd124623e2c0892ff2649b9107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Thu, 8 Feb 2024 15:17:07 +0100 Subject: [PATCH 21/62] TDD: add test case for insensitive case on label corresponding to primary key schema: test fails --- frictionless/resources/table.py | 6 +++++- frictionless/table/__spec__/test_header.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 4780d6f79d..ec689862fd 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -335,7 +335,11 @@ def row_stream(): # Primary Key Error if is_integrity and self.schema.primary_key: try: - cells = tuple(row[name] for name in self.schema.primary_key) + if self.dialect.header_case: + cells = tuple(row[name] for name in self.schema.primary_key) + else: # ignore case + primary_key_lowers = [pk.lower() for pk in self.schema.primary_key] + cells = tuple(row[name] for name in primary_key_lowers) except KeyError: # Row does not have primary_key as key # There is a missing label corresponding to the schema diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index 645a34c9e2..e4aa307797 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -75,3 +75,19 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): assert len(errors) == tc["nb_errors"] for error, type_expected in zip(errors, tc["types_errors_expected"]): assert error.type == type_expected + + # Ignore header_case + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [{"name": "A"}], + "primaryKey": ["A"], + } + + resource = TableResource( + source=[["a"], ["foo"]], + schema=Schema.from_descriptor(schema_descriptor), + detector=frictionless.Detector(schema_sync=True), + dialect=frictionless.Dialect(header_case=False) + ) + report = frictionless.validate(resource) + assert report.valid From 9eb9970bfdbc3c0ca11ccbe671e19db49ed7ede3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Thu, 8 Feb 2024 16:04:38 +0100 Subject: [PATCH 22/62] TDD: fix case insentive with label corresponding to primary key: test passes --- frictionless/resources/table.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index ec689862fd..cdae58b4ed 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -337,9 +337,13 @@ def row_stream(): try: if self.dialect.header_case: cells = tuple(row[name] for name in self.schema.primary_key) - else: # ignore case - primary_key_lowers = [pk.lower() for pk in self.schema.primary_key] - cells = tuple(row[name] for name in primary_key_lowers) + else: # ignore case + cells = () + lower_primary_keys = [pk.lower() for pk in self.schema.primary_key] + # cells = tuple(row[label] if (label.lower() for label in row.field_names) in lower_primary_keys ) + for label in row.field_names: + if label.lower() in lower_primary_keys: + cells = cells + (row[label],) except KeyError: # Row does not have primary_key as key # There is a missing label corresponding to the schema From 6542f0b70a0f6300e91bf797ad369c25e14c3e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Thu, 8 Feb 2024 16:19:03 +0100 Subject: [PATCH 23/62] TDD: edit test_cases to expected beahoviour: expect only missing-label error when missing label corresponding to a schema primary key: test fails --- frictionless/table/__spec__/test_header.py | 25 +++++++++++----------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index e4aa307797..7d640f0602 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -41,18 +41,17 @@ def test_missing_label(): def test_missing_primary_key_label_with_shema_sync_issue_1633(): - test_cases = [ { "constraints": {"required": True}, - "nb_errors": 2, - "types_errors_expected": ["missing-label", "primary-key"], - }, - { - "constraints": {}, "nb_errors": 1, - "types_errors_expected": ["primary-key"], - } + "types_errors_expected": ["missing-label"], + }, + # { + # "constraints": {}, + # "nb_errors": 1, + # "types_errors_expected": ["missing-label"], + # }, ] for tc in test_cases: @@ -78,16 +77,16 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): # Ignore header_case schema_descriptor = { - "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - "fields": [{"name": "A"}], - "primaryKey": ["A"], - } + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [{"name": "A"}], + "primaryKey": ["A"], + } resource = TableResource( source=[["a"], ["foo"]], schema=Schema.from_descriptor(schema_descriptor), detector=frictionless.Detector(schema_sync=True), - dialect=frictionless.Dialect(header_case=False) + dialect=frictionless.Dialect(header_case=False), ) report = frictionless.validate(resource) assert report.valid From 23a512569c9255396657fb6e171cbc01f475d773 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Thu, 8 Feb 2024 16:23:11 +0100 Subject: [PATCH 24/62] TDD: fix expected behaviour: test passes --- frictionless/resources/table.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index cdae58b4ed..bbc83a505c 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -337,29 +337,28 @@ def row_stream(): try: if self.dialect.header_case: cells = tuple(row[name] for name in self.schema.primary_key) - else: # ignore case + else: # Ignore case cells = () - lower_primary_keys = [pk.lower() for pk in self.schema.primary_key] + lower_primary_keys = [ + pk.lower() for pk in self.schema.primary_key + ] # cells = tuple(row[label] if (label.lower() for label in row.field_names) in lower_primary_keys ) for label in row.field_names: if label.lower() in lower_primary_keys: cells = cells + (row[label],) except KeyError: # Row does not have primary_key as key - # There is a missing label corresponding to the schema - note = ( - f"Inexistant label for primary key {self.schema.primary_key}" - ) - error = errors.PrimaryKeyError.from_row(row, note=note) - row.errors.append(error) + # There should already be a missing-label error in + # in self.header corresponding to the schema primary key + assert not self.header.valid else: if set(cells) == {None}: note = 'cells composing the primary keys are all "None"' error = errors.PrimaryKeyError.from_row(row, note=note) row.errors.append(error) else: - match = memory_primary.get(cells) - memory_primary[cells] = row.row_number + match = memory_primary.get(cells) # type: ignore + memory_primary[cells] = row.row_number # type: ignore if match: if match: note = "the same as in the row at position %s" % match From d72bab02b6908662671ba040a4cdb615ef26e1fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Thu, 8 Feb 2024 16:24:33 +0100 Subject: [PATCH 25/62] TDD: restore one tet case: test fails --- frictionless/table/__spec__/test_header.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index 7d640f0602..7583e84517 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -47,11 +47,11 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): "nb_errors": 1, "types_errors_expected": ["missing-label"], }, - # { - # "constraints": {}, - # "nb_errors": 1, - # "types_errors_expected": ["missing-label"], - # }, + { + "constraints": {}, + "nb_errors": 1, + "types_errors_expected": ["missing-label"], + }, ] for tc in test_cases: From 01f7fcc01f767d6bb014fa5734cc5f40f08f0652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 12:37:35 +0100 Subject: [PATCH 26/62] WIP: isolate test case to fix --- frictionless/table/__spec__/test_header.py | 28 ++++++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index 7583e84517..d4d5d4fbe4 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -47,11 +47,11 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): "nb_errors": 1, "types_errors_expected": ["missing-label"], }, - { - "constraints": {}, - "nb_errors": 1, - "types_errors_expected": ["missing-label"], - }, + # { # This test case raise the unexpected KeyError we want to fix + # "constraints": {}, + # "nb_errors": 1, + # "types_errors_expected": ["missing-label"], + # }, ] for tc in test_cases: @@ -75,6 +75,24 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): for error, type_expected in zip(errors, tc["types_errors_expected"]): assert error.type == type_expected + # TODO: fix this isolated test case and refactoring + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [{"name": "A"}], + "primaryKey": ["A"], + } + + resource = TableResource( + source=[["B"], ["foo"]], + schema=Schema.from_descriptor(schema_descriptor), + detector=frictionless.Detector(schema_sync=True), + ) + + report = frictionless.validate(resource) + errors = report.tasks[0].errors + assert len(errors) == 1 + assert errors[0].type == "missing-label" + # Ignore header_case schema_descriptor = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", From a9f4a7cacc6e504d472329d8fc5bac4bec0352e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 12:56:38 +0100 Subject: [PATCH 27/62] Refacto: WIP --- frictionless/resources/table.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index bbc83a505c..5c160cb802 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -335,17 +335,8 @@ def row_stream(): # Primary Key Error if is_integrity and self.schema.primary_key: try: - if self.dialect.header_case: - cells = tuple(row[name] for name in self.schema.primary_key) - else: # Ignore case - cells = () - lower_primary_keys = [ - pk.lower() for pk in self.schema.primary_key - ] - # cells = tuple(row[label] if (label.lower() for label in row.field_names) in lower_primary_keys ) - for label in row.field_names: - if label.lower() in lower_primary_keys: - cells = cells + (row[label],) + cells = self.primary_key_cells(row, + self.dialect.header_case) except KeyError: # Row does not have primary_key as key # There should already be a missing-label error in @@ -453,7 +444,27 @@ def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]): del field_info["mapping"][field_name] # Read + def primary_key_cells( + self, + row: Row, + case_sensitive: bool + ) -> Tuple[Row, Any]: + """Create a tuple containg all cells related to primary_key + """ + if case_sensitive: + cells = tuple(row[name] for name in self.schema.primary_key) + else: # Ignore case + cells = () + lower_primary_keys = [ + pk.lower() for pk in self.schema.primary_key + ] + # cells = tuple(row[label] if (label.lower() for label in row.field_names) in lower_primary_keys ) + for label in row.field_names: + if label.lower() in lower_primary_keys: + cells = cells + (row[label],) + return cells + # Read def read_cells(self, *, size: Optional[int] = None) -> List[List[Any]]: """Read lists into memory From 8ca2c1c6dc4db14d67393929ad8fd3aa08b27e10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 16:58:52 +0100 Subject: [PATCH 28/62] Refacto: WIP2 --- frictionless/resources/table.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 5c160cb802..9b3471343a 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -454,14 +454,11 @@ def primary_key_cells( if case_sensitive: cells = tuple(row[name] for name in self.schema.primary_key) else: # Ignore case - cells = () lower_primary_keys = [ pk.lower() for pk in self.schema.primary_key ] - # cells = tuple(row[label] if (label.lower() for label in row.field_names) in lower_primary_keys ) - for label in row.field_names: - if label.lower() in lower_primary_keys: - cells = cells + (row[label],) + labels_primary_key = [label for label in row.field_names if label.lower() in lower_primary_keys] + cells = tuple(row[label] for label in labels_primary_key) return cells # Read From 6773f9fa492889acd1d62b2c47e1d830e4df24af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 17:17:48 +0100 Subject: [PATCH 29/62] Refacto: introduce intermediary methods to create cells in case of primary key constraint and considering case-sensitivity --- frictionless/resources/table.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 9b3471343a..eaf96544ba 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -448,18 +448,29 @@ def primary_key_cells( self, row: Row, case_sensitive: bool - ) -> Tuple[Row, Any]: + ) -> Tuple[str, Any]: """Create a tuple containg all cells related to primary_key """ + return tuple(row[label] for label in + self.labels_related_to_primary_key(row, case_sensitive)) + + def labels_related_to_primary_key( + self, + row: Row, + case_sensitive: bool, + ) -> List[str]: + """Create a list of TabelResource labels which correspond to the + primary_key schema considering case-sensitivity + """ if case_sensitive: - cells = tuple(row[name] for name in self.schema.primary_key) - else: # Ignore case - lower_primary_keys = [ + labels_primary_key = self.schema.primary_key + else: + lower_primary_key = [ pk.lower() for pk in self.schema.primary_key ] - labels_primary_key = [label for label in row.field_names if label.lower() in lower_primary_keys] - cells = tuple(row[label] for label in labels_primary_key) - return cells + labels_primary_key = [label for label in row.field_names + if label.lower() in lower_primary_key] + return labels_primary_key # Read def read_cells(self, *, size: Optional[int] = None) -> List[List[Any]]: From df32fdaf53f7d1c0f5ec6064614afe91ef1827e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 18:08:25 +0100 Subject: [PATCH 30/62] TDD: fix test case comment ignore_case test case: test passes --- frictionless/detector/detector.py | 5 +++- frictionless/table/__spec__/test_header.py | 30 +++++++++++----------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index ba269d0a8a..c7569dca30 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -424,7 +424,6 @@ def detect_schema( mapped_fields, schema, labels, case_sensitive # type: ignore ) - # For required fields that are missing self.add_missing_required_labels_to_schema_fields( mapped_fields, schema, labels, case_sensitive # type: ignore ) @@ -487,8 +486,12 @@ def add_missing_required_labels_to_schema_fields( case_sensitive: bool, ): for _, field in fields_map.items(): + # For required fields that are missing if self.field_name_not_in_labels(field, labels, case_sensitive): schema.add_field(field) + # For primary field that are missing + if field and not field.required and field.name in schema.primary_key and field.name not in labels: + schema.add_field(field) @staticmethod def field_name_not_in_labels( diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index d4d5d4fbe4..b83618cfe8 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -93,18 +93,18 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): assert len(errors) == 1 assert errors[0].type == "missing-label" - # Ignore header_case - schema_descriptor = { - "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - "fields": [{"name": "A"}], - "primaryKey": ["A"], - } - - resource = TableResource( - source=[["a"], ["foo"]], - schema=Schema.from_descriptor(schema_descriptor), - detector=frictionless.Detector(schema_sync=True), - dialect=frictionless.Dialect(header_case=False), - ) - report = frictionless.validate(resource) - assert report.valid + # # Ignore header_case + # schema_descriptor = { + # "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + # "fields": [{"name": "A", "constraints": {"required": True}}], + # "primaryKey": ["A"], + # } + + # resource = TableResource( + # source=[["a"], ["foo"]], + # schema=Schema.from_descriptor(schema_descriptor), + # detector=frictionless.Detector(schema_sync=True), + # dialect=frictionless.Dialect(header_case=False), + # ) + # report = frictionless.validate(resource) + # assert report.valid From 7abea122cefb3119c1fd79945c16a76899179c0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 18:10:07 +0100 Subject: [PATCH 31/62] Refacto: refactoring test --- frictionless/table/__spec__/test_header.py | 28 ++++------------------ 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index b83618cfe8..2189baeb2e 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -47,11 +47,11 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): "nb_errors": 1, "types_errors_expected": ["missing-label"], }, - # { # This test case raise the unexpected KeyError we want to fix - # "constraints": {}, - # "nb_errors": 1, - # "types_errors_expected": ["missing-label"], - # }, + { + "constraints": {}, + "nb_errors": 1, + "types_errors_expected": ["missing-label"], + }, ] for tc in test_cases: @@ -75,24 +75,6 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): for error, type_expected in zip(errors, tc["types_errors_expected"]): assert error.type == type_expected - # TODO: fix this isolated test case and refactoring - schema_descriptor = { - "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - "fields": [{"name": "A"}], - "primaryKey": ["A"], - } - - resource = TableResource( - source=[["B"], ["foo"]], - schema=Schema.from_descriptor(schema_descriptor), - detector=frictionless.Detector(schema_sync=True), - ) - - report = frictionless.validate(resource) - errors = report.tasks[0].errors - assert len(errors) == 1 - assert errors[0].type == "missing-label" - # # Ignore header_case # schema_descriptor = { # "$schema": "https://frictionlessdata.io/schemas/table-schema.json", From fbad612de1fb0e7bc002647112415b276ea11d6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 18:11:48 +0100 Subject: [PATCH 32/62] TDD: restore insensitive-case test case: test fails --- frictionless/table/__spec__/test_header.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index 2189baeb2e..7caad5b424 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -75,18 +75,18 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): for error, type_expected in zip(errors, tc["types_errors_expected"]): assert error.type == type_expected - # # Ignore header_case - # schema_descriptor = { - # "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - # "fields": [{"name": "A", "constraints": {"required": True}}], - # "primaryKey": ["A"], - # } + # Ignore header_case + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [{"name": "A", "constraints": {"required": True}}], + "primaryKey": ["A"], + } - # resource = TableResource( - # source=[["a"], ["foo"]], - # schema=Schema.from_descriptor(schema_descriptor), - # detector=frictionless.Detector(schema_sync=True), - # dialect=frictionless.Dialect(header_case=False), - # ) - # report = frictionless.validate(resource) - # assert report.valid + resource = TableResource( + source=[["a"], ["foo"]], + schema=Schema.from_descriptor(schema_descriptor), + detector=frictionless.Detector(schema_sync=True), + dialect=frictionless.Dialect(header_case=False), + ) + report = frictionless.validate(resource) + assert report.valid From 89fa688f70f56ffb6e13ca2a862bb035de28f6f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 19:26:51 +0100 Subject: [PATCH 33/62] TDD: fix ignore header case test case: test passes --- frictionless/detector/detector.py | 36 ++++++++++++++++++---- frictionless/resources/table.py | 28 +++++++---------- frictionless/table/__spec__/test_header.py | 16 ++++++++++ 3 files changed, 57 insertions(+), 23 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index c7569dca30..547f891f33 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -490,8 +490,9 @@ def add_missing_required_labels_to_schema_fields( if self.field_name_not_in_labels(field, labels, case_sensitive): schema.add_field(field) # For primary field that are missing - if field and not field.required and field.name in schema.primary_key and field.name not in labels: - schema.add_field(field) + self.add_missing_primary_key_to_schema_fields( + field, schema, labels, case_sensitive # type: ignore + ) @staticmethod def field_name_not_in_labels( @@ -500,7 +501,30 @@ def field_name_not_in_labels( if case_sensitive: return field.required and field.name not in labels else: - return ( - field.required - and field.name.lower() not in [label.lower() for label in labels] - ) + return field.required and field.name.lower() not in [ + label.lower() for label in labels + ] + + @staticmethod + def add_missing_primary_key_to_schema_fields( + field: Field, + schema: Schema, + labels: List[str], + case_sensitive: bool, + ): + if case_sensitive: + if ( + not field.required + and field.name in schema.primary_key + and field.name not in labels + ): + schema.add_field(field) + else: + lower_primary_key = [pk.lower() for pk in schema.primary_key] + lower_labels = [label.lower() for label in labels] + if ( + not field.required + and field.name.lower() in lower_primary_key + and field.name.lower() not in lower_labels + ): + schema.add_field(field) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index eaf96544ba..085add9648 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -335,8 +335,7 @@ def row_stream(): # Primary Key Error if is_integrity and self.schema.primary_key: try: - cells = self.primary_key_cells(row, - self.dialect.header_case) + cells = self.primary_key_cells(row, self.dialect.header_case) except KeyError: # Row does not have primary_key as key # There should already be a missing-label error in @@ -443,17 +442,13 @@ def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]): del field_info["objects"][field_index] del field_info["mapping"][field_name] - # Read - def primary_key_cells( - self, - row: Row, - case_sensitive: bool - ) -> Tuple[str, Any]: - """Create a tuple containg all cells related to primary_key - """ - return tuple(row[label] for label in - self.labels_related_to_primary_key(row, case_sensitive)) - + def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[str, Any]: + """Create a tuple containg all cells related to primary_key""" + return tuple( + row[label] + for label in self.labels_related_to_primary_key(row, case_sensitive) + ) + def labels_related_to_primary_key( self, row: Row, @@ -465,11 +460,10 @@ def labels_related_to_primary_key( if case_sensitive: labels_primary_key = self.schema.primary_key else: - lower_primary_key = [ - pk.lower() for pk in self.schema.primary_key + lower_primary_key = [pk.lower() for pk in self.schema.primary_key] + labels_primary_key = [ + label for label in row.field_names if label.lower() in lower_primary_key ] - labels_primary_key = [label for label in row.field_names - if label.lower() in lower_primary_key] return labels_primary_key # Read diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index 7caad5b424..a1ea94e9b5 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -76,6 +76,22 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): assert error.type == type_expected # Ignore header_case + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [{"name": "A"}], + "primaryKey": ["A"], + } + + resource = TableResource( + source=[["a"], ["foo"]], + schema=Schema.from_descriptor(schema_descriptor), + detector=frictionless.Detector(schema_sync=True), + dialect=frictionless.Dialect(header_case=False), + ) + report = frictionless.validate(resource) + assert report.valid + + # TODO to solve this test case needs to merge with working branch Fix-1635-... schema_descriptor = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [{"name": "A", "constraints": {"required": True}}], From 618725d2fb7adb50d7e9d91c8dbe3158089d7179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 12 Feb 2024 10:59:12 +0100 Subject: [PATCH 34/62] Refacto: refactoring tests --- frictionless/table/__spec__/test_header.py | 54 ++++++++++------------ 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index a1ea94e9b5..0630e00c07 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -76,33 +76,27 @@ def test_missing_primary_key_label_with_shema_sync_issue_1633(): assert error.type == type_expected # Ignore header_case - schema_descriptor = { - "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - "fields": [{"name": "A"}], - "primaryKey": ["A"], - } - - resource = TableResource( - source=[["a"], ["foo"]], - schema=Schema.from_descriptor(schema_descriptor), - detector=frictionless.Detector(schema_sync=True), - dialect=frictionless.Dialect(header_case=False), - ) - report = frictionless.validate(resource) - assert report.valid - - # TODO to solve this test case needs to merge with working branch Fix-1635-... - schema_descriptor = { - "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - "fields": [{"name": "A", "constraints": {"required": True}}], - "primaryKey": ["A"], - } - - resource = TableResource( - source=[["a"], ["foo"]], - schema=Schema.from_descriptor(schema_descriptor), - detector=frictionless.Detector(schema_sync=True), - dialect=frictionless.Dialect(header_case=False), - ) - report = frictionless.validate(resource) - assert report.valid + test_cases = [ + { + "constraints": {"required": True}, + }, + { + "constraints": {}, + }, + ] + + for tc in test_cases: + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [{"name": "A", "constraints": tc["constraints"]}], + "primaryKey": ["A"], + } + + resource = TableResource( + source=[["a"], ["foo"]], + schema=Schema.from_descriptor(schema_descriptor), + detector=frictionless.Detector(schema_sync=True), + dialect=frictionless.Dialect(header_case=False), + ) + report = frictionless.validate(resource) + assert report.valid From b3d8bf82801c2f564ee14f28af86570291952f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 12 Feb 2024 11:41:54 +0100 Subject: [PATCH 35/62] Refacto: refactoring tests 2 --- frictionless/table/__spec__/test_header.py | 86 ++++++++-------------- 1 file changed, 31 insertions(+), 55 deletions(-) diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index 0630e00c07..94a36231b8 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -1,3 +1,5 @@ +import pytest + import frictionless from frictionless import Schema, fields from frictionless.resources import TableResource @@ -40,63 +42,37 @@ def test_missing_label(): assert header.valid is False -def test_missing_primary_key_label_with_shema_sync_issue_1633(): - test_cases = [ - { - "constraints": {"required": True}, - "nb_errors": 1, - "types_errors_expected": ["missing-label"], - }, - { - "constraints": {}, - "nb_errors": 1, - "types_errors_expected": ["missing-label"], - }, - ] +@pytest.mark.parametrize( + "source, required, valid_report, nb_errors, types_errors_expected, header_case", + [ + ([["B"], ["foo"]], {"required": True}, False, 1, ["missing-label"], True), + ([["B"], ["foo"]], {}, False, 1, ["missing-label"], True), + ([["a"], ["foo"]], {"required": True}, True, 0, [], False), # Ignore header_case + ([["a"], ["foo"]], {}, True, 0, [], False), # Ignore header_case + ], +) +def test_missing_primary_key_label_with_shema_sync_issue_1633( + source, required, valid_report, nb_errors, types_errors_expected, header_case +): + schema_descriptor = { + "$schema": "https://frictionlessdata.io/schemas/table-schema.json", + "fields": [{"name": "A", "constraints": required}], + "primaryKey": ["A"], + } + + resource = TableResource( + source=source, + schema=Schema.from_descriptor(schema_descriptor), + detector=frictionless.Detector(schema_sync=True), + dialect=frictionless.Dialect(header_case=header_case), + ) - for tc in test_cases: - schema_descriptor = { - "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - "fields": [{"name": "A", "constraints": tc["constraints"]}], - "primaryKey": ["A"], - } + report = frictionless.validate(resource) - resource = TableResource( - source=[["B"], ["foo"]], - schema=Schema.from_descriptor(schema_descriptor), - detector=frictionless.Detector(schema_sync=True), - ) + assert report.valid == valid_report - report = frictionless.validate(resource) + if not report.valid: errors = report.tasks[0].errors - - assert not report.valid - assert len(errors) == tc["nb_errors"] - for error, type_expected in zip(errors, tc["types_errors_expected"]): + assert len(errors) == nb_errors + for error, type_expected in zip(errors, types_errors_expected): assert error.type == type_expected - - # Ignore header_case - test_cases = [ - { - "constraints": {"required": True}, - }, - { - "constraints": {}, - }, - ] - - for tc in test_cases: - schema_descriptor = { - "$schema": "https://frictionlessdata.io/schemas/table-schema.json", - "fields": [{"name": "A", "constraints": tc["constraints"]}], - "primaryKey": ["A"], - } - - resource = TableResource( - source=[["a"], ["foo"]], - schema=Schema.from_descriptor(schema_descriptor), - detector=frictionless.Detector(schema_sync=True), - dialect=frictionless.Dialect(header_case=False), - ) - report = frictionless.validate(resource) - assert report.valid From 8de569104560aa37a7d6f87d5fac4c800302a207 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Sat, 10 Feb 2024 19:37:23 +0100 Subject: [PATCH 36/62] Refacto: introduce intermediary 'Dialect' methods to add missing primary key label to the schema fields --- frictionless/detector/detector.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 547f891f33..e6e1f80ba8 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -505,26 +505,39 @@ def field_name_not_in_labels( label.lower() for label in labels ] - @staticmethod def add_missing_primary_key_to_schema_fields( + self, field: Field, schema: Schema, labels: List[str], case_sensitive: bool, ): + if self.primary_key_field_not_in_labels( + field, + schema, + labels, + case_sensitive, + ): + schema.add_field(field) + + @staticmethod + def primary_key_field_not_in_labels( + field: Field, + schema: Schema, + labels: List[str], + case_sensitive: bool, + ) -> bool: if case_sensitive: - if ( + return ( not field.required and field.name in schema.primary_key and field.name not in labels - ): - schema.add_field(field) + ) else: lower_primary_key = [pk.lower() for pk in schema.primary_key] lower_labels = [label.lower() for label in labels] - if ( + return ( not field.required and field.name.lower() in lower_primary_key and field.name.lower() not in lower_labels - ): - schema.add_field(field) + ) From 1af01239ad976caa7a6b0368a4149dea51a08f63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 12 Feb 2024 11:52:10 +0100 Subject: [PATCH 37/62] Refacto: refactoring 'Dialect' methods recently introduced --- frictionless/detector/detector.py | 40 +++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index e6e1f80ba8..c05bb82804 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -487,12 +487,12 @@ def add_missing_required_labels_to_schema_fields( ): for _, field in fields_map.items(): # For required fields that are missing - if self.field_name_not_in_labels(field, labels, case_sensitive): - schema.add_field(field) - # For primary field that are missing - self.add_missing_primary_key_to_schema_fields( + if self.field_name_not_in_labels( + field, labels, case_sensitive + ) or self.primary_key_field_name_not_in_labels( field, schema, labels, case_sensitive # type: ignore - ) + ): + schema.add_field(field) @staticmethod def field_name_not_in_labels( @@ -505,23 +505,23 @@ def field_name_not_in_labels( label.lower() for label in labels ] - def add_missing_primary_key_to_schema_fields( - self, - field: Field, - schema: Schema, - labels: List[str], - case_sensitive: bool, - ): - if self.primary_key_field_not_in_labels( - field, - schema, - labels, - case_sensitive, - ): - schema.add_field(field) + # def add_missing_primary_key_to_schema_fields( + # self, + # field: Field, + # schema: Schema, + # labels: List[str], + # case_sensitive: bool, + # ): + # if self.primary_key_field_not_in_labels( + # field, + # schema, + # labels, + # case_sensitive, + # ): + # schema.add_field(field) @staticmethod - def primary_key_field_not_in_labels( + def primary_key_field_name_not_in_labels( field: Field, schema: Schema, labels: List[str], From e8fb44db1a019a3e101ad809d1dac16b0fe93f89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 12 Feb 2024 11:53:43 +0100 Subject: [PATCH 38/62] TDD: add new test cases: test passes --- frictionless/table/__spec__/test_header.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/frictionless/table/__spec__/test_header.py b/frictionless/table/__spec__/test_header.py index 94a36231b8..b4c43600b1 100644 --- a/frictionless/table/__spec__/test_header.py +++ b/frictionless/table/__spec__/test_header.py @@ -47,8 +47,13 @@ def test_missing_label(): [ ([["B"], ["foo"]], {"required": True}, False, 1, ["missing-label"], True), ([["B"], ["foo"]], {}, False, 1, ["missing-label"], True), - ([["a"], ["foo"]], {"required": True}, True, 0, [], False), # Ignore header_case - ([["a"], ["foo"]], {}, True, 0, [], False), # Ignore header_case + ([["a"], ["foo"]], {"required": True}, False, 1, ["missing-label"], True), + ([["a"], ["foo"]], {}, False, 1, ["missing-label"], True), + # Ignore header_case + ([["B"], ["foo"]], {"required": True}, False, 1, ["missing-label"], False), + ([["B"], ["foo"]], {}, False, 1, ["missing-label"], False), + ([["a"], ["foo"]], {"required": True}, True, 0, [], False), + ([["a"], ["foo"]], {}, True, 0, [], False), ], ) def test_missing_primary_key_label_with_shema_sync_issue_1633( From 69547530f701f66822f1b118ffd560f869c0102b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 12 Feb 2024 11:55:40 +0100 Subject: [PATCH 39/62] Refacto: remove useless lines --- frictionless/detector/detector.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index c05bb82804..3639380600 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -505,21 +505,6 @@ def field_name_not_in_labels( label.lower() for label in labels ] - # def add_missing_primary_key_to_schema_fields( - # self, - # field: Field, - # schema: Schema, - # labels: List[str], - # case_sensitive: bool, - # ): - # if self.primary_key_field_not_in_labels( - # field, - # schema, - # labels, - # case_sensitive, - # ): - # schema.add_field(field) - @staticmethod def primary_key_field_name_not_in_labels( field: Field, From b03e1c43214f77acdc25a535368c19d8b595362b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 12 Feb 2024 12:04:06 +0100 Subject: [PATCH 40/62] Refacto: format file --- frictionless/resources/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 085add9648..1a28cde2a2 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -399,7 +399,7 @@ def row_stream(): yield row if self.detector.schema_sync: - # Missing required labels have in 'field_info' + # Missing required labels have in 'field_info' # parameter used for row creation for field in self.schema.fields: self.remove_missing_required_label_from_field_info(field, field_info) From 7b27133e4bfd6f74037d3c11a88c363b14ff7828 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Mon, 12 Feb 2024 14:30:42 +0100 Subject: [PATCH 41/62] Doc: docstring --- frictionless/detector/detector.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 3639380600..0ab22b6552 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -485,12 +485,15 @@ def add_missing_required_labels_to_schema_fields( labels: List[str], case_sensitive: bool, ): + """This method aims to add missing required labels and + primary key field not in labels to schema fields + """ for _, field in fields_map.items(): - # For required fields that are missing + # For required fields or primary key that are missing # if self.field_name_not_in_labels( field, labels, case_sensitive ) or self.primary_key_field_name_not_in_labels( - field, schema, labels, case_sensitive # type: ignore + field, schema, labels, case_sensitive ): schema.add_field(field) From a8c7ed2a168e8c61032439423cfc344a5bef0fc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 10:56:29 +0100 Subject: [PATCH 42/62] Refactoring: remove incorrect doc --- frictionless/detector/detector.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 0ab22b6552..f5e9d78207 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -3,7 +3,7 @@ import codecs import os from copy import copy, deepcopy -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import attrs @@ -450,12 +450,8 @@ def mapped_schema_fields_names( considering case sensitivity Args: - fields (Union[List[None], List[Field]]): list of original - schema fields - case_sensitive (bool): True if case sensitive, False else - - Returns: - Dict[str, Optional[Field]] + fields: list of original schema fields + case_sensitive: True if case sensitive, False else """ if case_sensitive: return {field.name: field for field in fields} From 181f8113e427ad2abd132c40ef9dcf1d18027f56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 10:58:48 +0100 Subject: [PATCH 43/62] Refactoring: rename 'add_fields_to_schema_among_labels' method into 'rearrange_schema_fields_given_labels' --- frictionless/detector/detector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index f5e9d78207..f341a44aea 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -420,7 +420,7 @@ def detect_schema( schema.fields, case_sensitive # type: ignore ) - self.add_fields_to_schema_among_labels( + self.rearrange_schema_fields_given_labels( mapped_fields, schema, labels, case_sensitive # type: ignore ) @@ -459,7 +459,7 @@ def mapped_schema_fields_names( return {field.name.lower(): field for field in fields} @staticmethod - def add_fields_to_schema_among_labels( + def rearrange_schema_fields_given_labels( fields_mapped: Dict[str, Field], schema: Schema, labels: List[str], From 0e359674fefbf68f701e2e4783d081981be429a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 11:02:41 +0100 Subject: [PATCH 44/62] Refactoring: rename 'fields_mapped'and 'fields_map' agruments into 'fields_mapping' --- frictionless/detector/detector.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index f341a44aea..dd84b5345a 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -460,7 +460,7 @@ def mapped_schema_fields_names( @staticmethod def rearrange_schema_fields_given_labels( - fields_mapped: Dict[str, Field], + fields_mapping: Dict[str, Field], schema: Schema, labels: List[str], case_sensitive: bool, @@ -469,14 +469,14 @@ def rearrange_schema_fields_given_labels( for name in labels: default_field = Field.from_descriptor({"name": name, "type": "any"}) if case_sensitive: - field = fields_mapped.get(name, default_field) + field = fields_mapping.get(name, default_field) else: - field = fields_mapped.get(name.lower(), default_field) + field = fields_mapping.get(name.lower(), default_field) schema.add_field(field) def add_missing_required_labels_to_schema_fields( self, - fields_map: Dict[str, Field], + fields_mapping: Dict[str, Field], schema: Schema, labels: List[str], case_sensitive: bool, @@ -484,7 +484,7 @@ def add_missing_required_labels_to_schema_fields( """This method aims to add missing required labels and primary key field not in labels to schema fields """ - for _, field in fields_map.items(): + for _, field in fields_mapping.items(): # For required fields or primary key that are missing # if self.field_name_not_in_labels( field, labels, case_sensitive From d29b1e78875f0126bcc7e9828bf571efe7f98e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 11:06:43 +0100 Subject: [PATCH 45/62] Refactoring: introduce 'index_name' variable to factorize 'rearrange_schema_fields_given_labels' method --- frictionless/detector/detector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index dd84b5345a..a73c61638c 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -469,9 +469,10 @@ def rearrange_schema_fields_given_labels( for name in labels: default_field = Field.from_descriptor({"name": name, "type": "any"}) if case_sensitive: - field = fields_mapping.get(name, default_field) + index_name = name else: - field = fields_mapping.get(name.lower(), default_field) + index_name = name.lower() + field = fields_mapping.get(index_name, default_field) schema.add_field(field) def add_missing_required_labels_to_schema_fields( From feae99c8b03fdba0be3475aa41fe664654ab5ae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= <119412389+amelie-rondot@users.noreply.github.com> Date: Wed, 14 Feb 2024 11:13:48 +0100 Subject: [PATCH 46/62] Refactoring: edit comment with appropriate vocabulary Co-authored-by: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> --- frictionless/resources/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 1a28cde2a2..5ec614f01f 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -337,7 +337,7 @@ def row_stream(): try: cells = self.primary_key_cells(row, self.dialect.header_case) except KeyError: - # Row does not have primary_key as key + # Row does not have primary_key as label # There should already be a missing-label error in # in self.header corresponding to the schema primary key assert not self.header.valid From 8d1fb9a093d83772ad460fb2615c19345b1d5d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 11:20:10 +0100 Subject: [PATCH 47/62] Refacto: remove useless import --- frictionless/detector/detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index a73c61638c..31a86eceb7 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -3,7 +3,7 @@ import codecs import os from copy import copy, deepcopy -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional import attrs From 9a7d04a934658317c2f7d2467e48130ad8b7ce8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 11:21:28 +0100 Subject: [PATCH 48/62] Refacto: Fix signature 'primary_key_cells' method and remove useless 'type: ignore' --- frictionless/resources/table.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 5ec614f01f..4cce3af465 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -347,8 +347,8 @@ def row_stream(): error = errors.PrimaryKeyError.from_row(row, note=note) row.errors.append(error) else: - match = memory_primary.get(cells) # type: ignore - memory_primary[cells] = row.row_number # type: ignore + match = memory_primary.get(cells) + memory_primary[cells] = row.row_number if match: if match: note = "the same as in the row at position %s" % match @@ -442,7 +442,7 @@ def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]): del field_info["objects"][field_index] del field_info["mapping"][field_name] - def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[str, Any]: + def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any]: """Create a tuple containg all cells related to primary_key""" return tuple( row[label] From 9c85a8d852094561734f64c3b2d618c358d429a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 11:25:02 +0100 Subject: [PATCH 49/62] Refacto: remove useless 'if match' condition --- frictionless/resources/table.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 4cce3af465..e14b3a61f4 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -350,12 +350,11 @@ def row_stream(): match = memory_primary.get(cells) memory_primary[cells] = row.row_number if match: - if match: - note = "the same as in the row at position %s" % match - error = errors.PrimaryKeyError.from_row( - row, note=note - ) - row.errors.append(error) + note = "the same as in the row at position %s" % match + error = errors.PrimaryKeyError.from_row( + row, note=note + ) + row.errors.append(error) # Foreign Key Error if is_integrity and foreign_groups: From 47d5b8b6b15b87ba400a58f45bb15c919186505a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 11:27:42 +0100 Subject: [PATCH 50/62] Refacto: edit comment --- frictionless/resources/table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index e14b3a61f4..df3b2ba464 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -398,8 +398,8 @@ def row_stream(): yield row if self.detector.schema_sync: - # Missing required labels have in 'field_info' - # parameter used for row creation + # Missing required labels are not included in the + # field_info parameter used for row creation for field in self.schema.fields: self.remove_missing_required_label_from_field_info(field, field_info) From 9dc992d51c87df20bedcad29e48864adf6ef4a3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 12:18:03 +0100 Subject: [PATCH 51/62] Refacto: rename 'expected_fields_names' into 'expected_field_names' variable --- frictionless/resources/table.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index df3b2ba464..003767dd63 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -418,7 +418,7 @@ def remove_missing_required_label_from_field_info( @staticmethod def field_is_missing( field_name: str, - expected_fields_names: List[str], + expected_field_names: List[str], table_labels: types.ILabels, case_sensitive: bool, ) -> bool: @@ -428,11 +428,11 @@ def field_is_missing( if not case_sensitive: field_name = field_name.lower() table_labels = [label.lower() for label in table_labels] - expected_fields_names = [ - field_name.lower() for field_name in expected_fields_names + expected_field_names = [ + field_name.lower() for field_name in expected_field_names ] - return field_name not in table_labels and field_name in expected_fields_names + return field_name not in table_labels and field_name in expected_field_names @staticmethod def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]): From bd50358e16b21e0a8a2180c543f2934521f17541 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 16:46:27 +0100 Subject: [PATCH 52/62] Refacto: introduce new method field_is_required and remove 'primary_key_field_name_not_in_label' method --- frictionless/detector/detector.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 31a86eceb7..cab8d0b914 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -487,10 +487,10 @@ def add_missing_required_labels_to_schema_fields( """ for _, field in fields_mapping.items(): # For required fields or primary key that are missing # - if self.field_name_not_in_labels( + if self.field_is_required( + field, schema, case_sensitive + ) and self.field_name_not_in_labels( field, labels, case_sensitive - ) or self.primary_key_field_name_not_in_labels( - field, schema, labels, case_sensitive ): schema.add_field(field) @@ -499,30 +499,23 @@ def field_name_not_in_labels( field: Field, labels: List[str], case_sensitive: bool ) -> bool: if case_sensitive: - return field.required and field.name not in labels + return field.name not in labels else: - return field.required and field.name.lower() not in [ + return field.name.lower() not in [ label.lower() for label in labels ] @staticmethod - def primary_key_field_name_not_in_labels( + def field_is_required( field: Field, schema: Schema, - labels: List[str], case_sensitive: bool, ) -> bool: if case_sensitive: - return ( - not field.required - and field.name in schema.primary_key - and field.name not in labels - ) + return field.required or field.name in schema.primary_key else: lower_primary_key = [pk.lower() for pk in schema.primary_key] - lower_labels = [label.lower() for label in labels] return ( - not field.required - and field.name.lower() in lower_primary_key - and field.name.lower() not in lower_labels + field.required + or field.name.lower() in lower_primary_key ) From 89b3f7317c3135b1675b977423099142378419a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 16:47:35 +0100 Subject: [PATCH 53/62] Refacto: format files --- frictionless/detector/detector.py | 13 +++---------- frictionless/resources/table.py | 4 +--- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index cab8d0b914..b57607aae9 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -489,9 +489,7 @@ def add_missing_required_labels_to_schema_fields( # For required fields or primary key that are missing # if self.field_is_required( field, schema, case_sensitive - ) and self.field_name_not_in_labels( - field, labels, case_sensitive - ): + ) and self.field_name_not_in_labels(field, labels, case_sensitive): schema.add_field(field) @staticmethod @@ -501,9 +499,7 @@ def field_name_not_in_labels( if case_sensitive: return field.name not in labels else: - return field.name.lower() not in [ - label.lower() for label in labels - ] + return field.name.lower() not in [label.lower() for label in labels] @staticmethod def field_is_required( @@ -515,7 +511,4 @@ def field_is_required( return field.required or field.name in schema.primary_key else: lower_primary_key = [pk.lower() for pk in schema.primary_key] - return ( - field.required - or field.name.lower() in lower_primary_key - ) + return field.required or field.name.lower() in lower_primary_key diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 003767dd63..f01cf16dc2 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -351,9 +351,7 @@ def row_stream(): memory_primary[cells] = row.row_number if match: note = "the same as in the row at position %s" % match - error = errors.PrimaryKeyError.from_row( - row, note=note - ) + error = errors.PrimaryKeyError.from_row(row, note=note) row.errors.append(error) # Foreign Key Error From 2dc35add8e6f5c2ff7d0e763ccd5e553f085cd31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Wed, 14 Feb 2024 16:59:53 +0100 Subject: [PATCH 54/62] Refacto: fix pyright errors and remove 'type: ignore' --- frictionless/detector/detector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index b57607aae9..d417ff5b0e 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -421,11 +421,11 @@ def detect_schema( ) self.rearrange_schema_fields_given_labels( - mapped_fields, schema, labels, case_sensitive # type: ignore + mapped_fields, schema, labels, case_sensitive ) self.add_missing_required_labels_to_schema_fields( - mapped_fields, schema, labels, case_sensitive # type: ignore + mapped_fields, schema, labels, case_sensitive ) # Patch schema @@ -445,7 +445,7 @@ def detect_schema( @staticmethod def mapped_schema_fields_names( fields: List[Field], case_sensitive: bool - ) -> Dict[str, Optional[Field]]: + ) -> Dict[str, Field]: """Create a dictionnary to map fields name with schema fields considering case sensitivity From cd7d7aa940af235a4812cae561ce6e3a2d884511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Am=C3=A9lie=20Rondot?= Date: Fri, 23 Feb 2024 12:37:41 +0100 Subject: [PATCH 55/62] Refacto: remove useless assesment lines --- frictionless/detector/detector.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index d417ff5b0e..2287392196 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -412,10 +412,6 @@ def detect_schema( case_sensitive = options["header_case"] - assert schema - assert schema.fields - assert all(isinstance(field, Field) for field in schema.fields) - mapped_fields = self.mapped_schema_fields_names( schema.fields, case_sensitive # type: ignore ) @@ -440,7 +436,7 @@ def detect_schema( field_descriptor.update(field_patch) schema = Schema.from_descriptor(descriptor) - return schema # type: ignore + return schema @staticmethod def mapped_schema_fields_names( From 7681f73605924cf2de3f3cdaca8c655fe55d61ea Mon Sep 17 00:00:00 2001 From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> Date: Mon, 22 Apr 2024 16:58:45 +0200 Subject: [PATCH 56/62] documentation --- frictionless/detector/detector.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 2287392196..3ffa6f8e95 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -413,7 +413,8 @@ def detect_schema( case_sensitive = options["header_case"] mapped_fields = self.mapped_schema_fields_names( - schema.fields, case_sensitive # type: ignore + schema.fields, # type: ignore + case_sensitive, ) self.rearrange_schema_fields_given_labels( @@ -442,13 +443,7 @@ def detect_schema( def mapped_schema_fields_names( fields: List[Field], case_sensitive: bool ) -> Dict[str, Field]: - """Create a dictionnary to map fields name with schema fields - considering case sensitivity - - Args: - fields: list of original schema fields - case_sensitive: True if case sensitive, False else - """ + """Create a dictionnary to map field names with schema fields""" if case_sensitive: return {field.name: field for field in fields} else: @@ -461,7 +456,10 @@ def rearrange_schema_fields_given_labels( labels: List[str], case_sensitive: bool, ): + """Rearrange fields according to the order of labels. All fields + missing from labels are dropped""" schema.clear_fields() + for name in labels: default_field = Field.from_descriptor({"name": name, "type": "any"}) if case_sensitive: From 6cc454e0555cfa96693b728de70050025401ff2e Mon Sep 17 00:00:00 2001 From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> Date: Mon, 22 Apr 2024 17:28:38 +0200 Subject: [PATCH 57/62] refactorings --- frictionless/detector/detector.py | 39 ++++++++++++------------------- frictionless/resources/table.py | 8 +++---- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index 3ffa6f8e95..b96610ec8f 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -406,19 +406,24 @@ def detect_schema( # Sync schema if self.schema_sync: if labels: + case_sensitive = options["header_case"] + + if case_sensitive: + labels = [label.lower() for label in labels] + if len(labels) != len(set(labels)): note = '"schema_sync" requires unique labels in the header' raise FrictionlessException(note) - case_sensitive = options["header_case"] - mapped_fields = self.mapped_schema_fields_names( schema.fields, # type: ignore case_sensitive, ) self.rearrange_schema_fields_given_labels( - mapped_fields, schema, labels, case_sensitive + mapped_fields, + schema, + labels, ) self.add_missing_required_labels_to_schema_fields( @@ -454,7 +459,6 @@ def rearrange_schema_fields_given_labels( fields_mapping: Dict[str, Field], schema: Schema, labels: List[str], - case_sensitive: bool, ): """Rearrange fields according to the order of labels. All fields missing from labels are dropped""" @@ -462,11 +466,7 @@ def rearrange_schema_fields_given_labels( for name in labels: default_field = Field.from_descriptor({"name": name, "type": "any"}) - if case_sensitive: - index_name = name - else: - index_name = name.lower() - field = fields_mapping.get(index_name, default_field) + field = fields_mapping.get(name, default_field) schema.add_field(field) def add_missing_required_labels_to_schema_fields( @@ -477,24 +477,15 @@ def add_missing_required_labels_to_schema_fields( case_sensitive: bool, ): """This method aims to add missing required labels and - primary key field not in labels to schema fields + primary key field not in labels to schema fields. """ - for _, field in fields_mapping.items(): - # For required fields or primary key that are missing # - if self.field_is_required( - field, schema, case_sensitive - ) and self.field_name_not_in_labels(field, labels, case_sensitive): + for name, field in fields_mapping.items(): + if ( + self.field_is_required(field, schema, case_sensitive) + and name not in labels + ): schema.add_field(field) - @staticmethod - def field_name_not_in_labels( - field: Field, labels: List[str], case_sensitive: bool - ) -> bool: - if case_sensitive: - return field.name not in labels - else: - return field.name.lower() not in [label.lower() for label in labels] - @staticmethod def field_is_required( field: Field, diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index f01cf16dc2..1cf83cbb1c 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -408,13 +408,13 @@ def remove_missing_required_label_from_field_info( self, field: Field, field_info: Dict[str, Any] ): is_case_sensitive = self.dialect.header_case - if self.field_is_missing( + if self.label_is_missing( field.name, field_info["names"], self.labels, is_case_sensitive ): self.remove_field_from_field_info(field.name, field_info) @staticmethod - def field_is_missing( + def label_is_missing( field_name: str, expected_field_names: List[str], table_labels: types.ILabels, @@ -439,7 +439,7 @@ def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]): del field_info["objects"][field_index] del field_info["mapping"][field_name] - def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any]: + def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any, ...]: """Create a tuple containg all cells related to primary_key""" return tuple( row[label] @@ -451,7 +451,7 @@ def labels_related_to_primary_key( row: Row, case_sensitive: bool, ) -> List[str]: - """Create a list of TabelResource labels which correspond to the + """Create a list of TableResource labels which correspond to the primary_key schema considering case-sensitivity """ if case_sensitive: From 0ef13951c4177a53837e923d9b2705c5adc90a96 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> Date: Mon, 22 Apr 2024 22:21:15 +0200 Subject: [PATCH 58/62] format --- frictionless/resource/__spec__/test_security.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/frictionless/resource/__spec__/test_security.py b/frictionless/resource/__spec__/test_security.py index e44916e727..36c2fc2b3b 100644 --- a/frictionless/resource/__spec__/test_security.py +++ b/frictionless/resource/__spec__/test_security.py @@ -27,9 +27,11 @@ def test_resource_source_path_error_bad_path_not_safe_traversing(): Resource( { "name": "name", - "path": "data/../data/table.csv" - if not platform.type == "windows" - else "data\\..\\table.csv", + "path": ( + "data/../data/table.csv" + if not platform.type == "windows" + else "data\\..\\table.csv" + ), } ) error = excinfo.value.error From 585869ef29cba976a8eb7b118d1042e78c42c511 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> Date: Tue, 23 Apr 2024 16:57:57 +0200 Subject: [PATCH 59/62] fixup! documentation --- frictionless/resources/table.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 1cf83cbb1c..fdf9ffce4b 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -440,20 +440,16 @@ def remove_field_from_field_info(field_name: str, field_info: Dict[str, Any]): del field_info["mapping"][field_name] def primary_key_cells(self, row: Row, case_sensitive: bool) -> Tuple[Any, ...]: - """Create a tuple containg all cells related to primary_key""" - return tuple( - row[label] - for label in self.labels_related_to_primary_key(row, case_sensitive) - ) + """Create a tuple containg all cells from a given row associated to primary + keys""" + return tuple(row[label] for label in self.primary_key_labels(row, case_sensitive)) - def labels_related_to_primary_key( + def primary_key_labels( self, row: Row, case_sensitive: bool, ) -> List[str]: - """Create a list of TableResource labels which correspond to the - primary_key schema considering case-sensitivity - """ + """Create a list of TableResource labels that are primary keys""" if case_sensitive: labels_primary_key = self.schema.primary_key else: From 37f16726d2aba4a9998791afb89624f34a2fb8b8 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> Date: Mon, 29 Apr 2024 16:08:47 +0200 Subject: [PATCH 60/62] =?UTF-8?q?=F0=9F=94=B5=20linting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../validator/__spec__/resource/test_schema.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/validator/__spec__/resource/test_schema.py index 458d25ca3b..58211ee535 100644 --- a/frictionless/validator/__spec__/resource/test_schema.py +++ b/frictionless/validator/__spec__/resource/test_schema.py @@ -3,8 +3,14 @@ import pytest import frictionless -from frictionless import Checklist, Detector, Dialect, FrictionlessException, Schema -from frictionless import fields +from frictionless import ( + Checklist, + Detector, + Dialect, + FrictionlessException, + Schema, + fields, +) from frictionless.resources import TableResource # General From b2391826f7a176157fa3d8b70df6e44723281752 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> Date: Fri, 3 May 2024 09:55:13 +0200 Subject: [PATCH 61/62] =?UTF-8?q?=F0=9F=94=B5=20renames?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frictionless/detector/detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frictionless/detector/detector.py b/frictionless/detector/detector.py index b96610ec8f..fc3e47e0a1 100644 --- a/frictionless/detector/detector.py +++ b/frictionless/detector/detector.py @@ -408,7 +408,7 @@ def detect_schema( if labels: case_sensitive = options["header_case"] - if case_sensitive: + if not case_sensitive: labels = [label.lower() for label in labels] if len(labels) != len(set(labels)): From d464802d4ccae01ceef85948baf472df233aa2a8 Mon Sep 17 00:00:00 2001 From: Pierre Camilleri <22995923+pierrecamilleri@users.noreply.github.com> Date: Fri, 3 May 2024 09:55:18 +0200 Subject: [PATCH 62/62] fix --- .../validator/__spec__/resource/test_schema.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/frictionless/validator/__spec__/resource/test_schema.py b/frictionless/validator/__spec__/resource/test_schema.py index 58211ee535..b386f05c3b 100644 --- a/frictionless/validator/__spec__/resource/test_schema.py +++ b/frictionless/validator/__spec__/resource/test_schema.py @@ -317,7 +317,7 @@ def test_resource_validate_less_actual_fields_with_required_constraint_issue_950 def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_1611(): - schema_descriptor_1 = { + schema_A_required = { "$schema": "https://frictionlessdata.io/schemas/table-schema.json", "fields": [ { @@ -331,20 +331,20 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 ], } - schema_descriptor_2 = deepcopy(schema_descriptor_1) + schema_AC_required = deepcopy(schema_A_required) # Add required constraint on "C" field - schema_descriptor_2["fields"][2]["constraints"] = {"required": True} + schema_AC_required["fields"][2]["constraints"] = {"required": True} test_cases = [ { - "schema": schema_descriptor_1, + "schema": schema_A_required, "source": [["B", "C"], ["b", "c"]], "expected_flattened_report": [ [None, 3, "A", "missing-label"], ], }, { - "schema": schema_descriptor_2, + "schema": schema_AC_required, "source": [["B"], ["b"]], "expected_flattened_report": [ [None, 2, "A", "missing-label"], @@ -352,7 +352,7 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 ], }, { - "schema": schema_descriptor_2, + "schema": schema_AC_required, "source": [ ["A", "B"], ["a", "b"], @@ -369,12 +369,15 @@ def test_resource_with_missing_required_header_with_schema_sync_is_true_issue_16 ], }, ] + for tc in test_cases: schema = Schema.from_descriptor(tc["schema"]) resource = TableResource( tc["source"], schema=schema, detector=Detector(schema_sync=True) ) + report = frictionless.validate(resource) + assert ( report.flatten(["rowNumber", "fieldNumber", "fieldName", "type"]) == tc["expected_flattened_report"]