From d8cff029db0ea5f79f1d041800db4bdf0b98917c Mon Sep 17 00:00:00 2001 From: Tim Dikland Date: Tue, 14 Oct 2025 09:57:04 +0200 Subject: [PATCH] add null island check --- docs/dqx/docs/reference/quality_checks.mdx | 17 ++++++++++++- src/databricks/labs/dqx/geo/check_funcs.py | 29 ++++++++++++++++++++++ tests/integration/test_apply_checks.py | 11 ++++++++ tests/integration/test_row_checks_geo.py | 24 ++++++++++++++++++ tests/perf/test_apply_checks.py | 12 +++++++++ tests/resources/all_row_geo_checks.yaml | 7 ++++++ 6 files changed, 99 insertions(+), 1 deletion(-) diff --git a/docs/dqx/docs/reference/quality_checks.mdx b/docs/dqx/docs/reference/quality_checks.mdx index 5020e6b3d..f9d5e698c 100644 --- a/docs/dqx/docs/reference/quality_checks.mdx +++ b/docs/dqx/docs/reference/quality_checks.mdx @@ -61,6 +61,7 @@ You can also define your own custom checks (see [Creating custom checks](#creati | `is_geometrycollection` | Checks whether the values in the input column are geometrycollection geometries/geographies. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | | `is_ogc_valid` | Checks whether the values in the input column are valid geometries in the OGC sense. I.e a bowtie polygon is invalid because it has a self intersection. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | | `is_non_empty_geometry` | Checks whether the values in the input column are non-empty geometries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `is_not_null_island` | Checks whether the values in the input column are not null island geometries (POINT(0 0)). This function requires Databricks serverless compute or runtime >= 17.1. 
| `column`: column to check (can be a string column name or a column expression) | | `has_dimension` | Checks whether the values in the input column are geometries of the specified dimension (2D projected dimension). This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `dimension`: dimension to check | | `has_x_coordinate_between` | Checks whether the values in the input column are geometries with x coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | | `has_y_coordinate_between` | Checks whether the values in the input column are geometries with y coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | @@ -578,7 +579,14 @@ For brevity, the `name` field in the examples is omitted and it will be auto-gen function: is_non_empty_geometry arguments: column: point_geom - + +# is_not_null_island check +- criticality: error + check: + function: is_not_null_island + arguments: + column: point_geom + # has_dimension check - criticality: error check: @@ -1042,6 +1050,13 @@ checks = [ column="point_geom" ), + # is_not_null_island check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_not_null_island, + column="point_geom" + ), + # has_dimension check DQRowRule( criticality="error", diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 612f7a717..354dd0626 100644 --- 
a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -357,6 +357,35 @@ def is_non_empty_geometry(column: str | Column) -> Column: ) +@register_rule("row") +def is_not_null_island(column: str | Column) -> Column: + """Checks whether the values in the input column are not NULL island geometries (POINT(0 0)). + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are not NULL island geometries + + Note: + This function requires Databricks serverless compute or runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry`, `st_geometrytype`, `st_x`, and `st_y` functions. + geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + is_point_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) = '{POINT_TYPE}'") + is_zero_zero = F.expr(f"st_x(try_to_geometry({col_str_norm})) = 0.0 AND st_y(try_to_geometry({col_str_norm})) = 0.0") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(~geom_cond & is_point_cond & is_zero_zero) + condition_str = f"column `{col_expr_str}` contains a null island" + + return make_condition( + condition, + F.lit(condition_str), + f"{col_str_norm}_contains_null_island", + ) + + @register_rule("row") def has_dimension(column: str | Column, dimension: int) -> Column: """Checks whether the geometries/geographies in the input column have a given dimension. 
diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py index 8ee1fe1f3..2e918859e 100644 --- a/tests/integration/test_apply_checks.py +++ b/tests/integration/test_apply_checks.py @@ -5819,6 +5819,17 @@ def test_apply_checks_all_geo_checks_using_classes(skip_if_runtime_not_geo_compa check_func=geo_check_funcs.is_non_empty_geometry, column=F.col("point_geom"), ), + # is_not_null_island check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_not_null_island, + column="point_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_not_null_island, + column=F.col("point_geom"), + ), # has_dimension check DQRowRule( criticality="error", diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index a12ebcebe..0cb016c68 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -13,6 +13,7 @@ is_multilinestring, is_multipoint, is_multipolygon, + is_not_null_island, is_point, is_polygon, is_ogc_valid, @@ -333,6 +334,29 @@ def test_is_non_empty_geometry(skip_if_runtime_not_geo_compatible, spark): assert_df_equality(actual, expected, ignore_nullable=True) +def test_is_not_null_island(skip_if_runtime_not_geo_compatible, spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["POINT(1 1)"], ["POINT(0 0)"], ["LINESTRING(0 0, 1 1)"], ["nonsense"], [None]], + input_schema, + ) + + actual = test_df.select(is_not_null_island("geom")) + + checked_schema = "geom_contains_null_island: string" + expected = spark.createDataFrame( + [ + [None], + ["column `geom` contains a null island"], + [None], + [None], + [None], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + + def test_has_dimension(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( diff --git a/tests/perf/test_apply_checks.py 
b/tests/perf/test_apply_checks.py index 1f223d476..526384067 100644 --- a/tests/perf/test_apply_checks.py +++ b/tests/perf/test_apply_checks.py @@ -1509,6 +1509,18 @@ def test_benchmark_is_non_empty_geometry(skip_if_runtime_not_geo_compatible, ben actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS +def test_benchmark_is_not_null_island(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_not_null_island, + column="point_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS def test_benchmark_has_dimension(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) diff --git a/tests/resources/all_row_geo_checks.yaml b/tests/resources/all_row_geo_checks.yaml index 8cc66ed7c..3d88016e7 100644 --- a/tests/resources/all_row_geo_checks.yaml +++ b/tests/resources/all_row_geo_checks.yaml @@ -92,6 +92,13 @@ arguments: column: point_geom +# is_not_null_island check +- criticality: error + check: + function: is_not_null_island + arguments: + column: point_geom + # has_dimension check - criticality: error check: