Fixed a bug where pyarrow conversions caused the string accessor to fail in search (#718)

charles-turner-1 · web-flow · commit bf1810fb4590 · 2025-05-08T16:00:35.000+10:00
diff --git a/intake_esm/_search.py b/intake_esm/_search.py
@@ -45,7 +45,10 @@ def search(
         column_has_iterables = column in columns_with_iterables
         for value in values:
             if column_has_iterables:
-                mask = df[column].str.contains(value, regex=False)
+                try:
+                    mask = df[column].str.contains(value, regex=False)
+                except AttributeError:
+                    mask = df[column].apply(tuple).str.contains(value, regex=False)
             elif column_is_stringtype and is_pattern(value):
                 mask = df[column].str.contains(value, regex=True, case=True, flags=0)
             elif pd.isna(value):
diff --git a/tests/sample-catalogs/access-columns-with-iterables.csv.gz b/tests/sample-catalogs/access-columns-with-iterables.csv.gz
diff --git a/tests/sample-catalogs/access-columns-with-iterables.json b/tests/sample-catalogs/access-columns-with-iterables.json
@@ -0,0 +1,81 @@
+{
+  "esmcat_version": "0.0.1",
+  "attributes": [
+    {
+      "column_name": "filename",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "file_id",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "path",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "filename_timestamp",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "frequency",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "start_date",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "end_date",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "variable",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "variable_long_name",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "variable_standard_name",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "variable_cell_methods",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "variable_units",
+      "vocabulary": ""
+    },
+    {
+      "column_name": "realm",
+      "vocabulary": ""
+    }
+  ],
+  "assets": {
+    "column_name": "path",
+    "format": "netcdf",
+    "format_column_name": null
+  },
+  "aggregation_control": {
+    "variable_column_name": "variable",
+    "groupby_attrs": ["file_id", "frequency"],
+    "aggregations": [
+      {
+        "type": "join_existing",
+        "attribute_name": "start_date",
+        "options": {
+          "dim": "time",
+          "combine": "by_coords"
+        }
+      }
+    ]
+  },
+  "id": "01deg_jra55v13_ryf9091",
+  "description": "0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091  repeat year forcing (May 1990 to Apr 1991)",
+  "title": null,
+  "last_updated": "2025-03-04T01:25:35Z",
+  "catalog_file": "access-columns-with-iterables.csv.gz"
+}
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -19,6 +19,7 @@
 import intake_esm
 
 from .utils import (
+    access_columns_with_iterables_cat,
     catalog_dict_records,
     cdf_cat_sample_cesmle,
     cdf_cat_sample_cmip5,
@@ -201,6 +202,18 @@ def test_catalog_search(path, query, expected_size):
     assert len(new_cat) == expected_size
 
 
+@pytest.mark.parametrize(
+    'path, columns_with_iterables, query, expected_size',
+    [
+        (access_columns_with_iterables_cat, ['variable'], {'variable': ['aice_m']}, 1),
+    ],
+)
+def test_catalog_search_columns_with_iterables(path, columns_with_iterables, query, expected_size):
+    cat = intake.open_esm_datastore(path, columns_with_iterables=columns_with_iterables)
+    new_cat = cat.search(**query)
+    assert len(new_cat) == expected_size
+
+
 def test_catalog_with_registry_search():
     cat = intake.open_esm_datastore(zarr_cat_aws_cesm, registry=registry)
     new_cat = cat.search(variable='FOO')
diff --git a/tests/utils.py b/tests/utils.py
@@ -16,6 +16,9 @@
     'https://raw.githubusercontent.com/NCAR/cesm-lens-aws/master/intake-catalogs/aws-cesm1-le.json'
 )
 mixed_cat_sample_cmip6 = os.path.join(here, 'sample-catalogs/cmip6-bcc-mixed-formats.json')
+access_columns_with_iterables_cat = os.path.join(
+    here, 'sample-catalogs/access-columns-with-iterables.json'
+)
 
 
 sample_df = pd.DataFrame(

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,9 @@`
`16`	`16`	`'https://raw.githubusercontent.com/NCAR/cesm-lens-aws/master/intake-catalogs/aws-cesm1-le.json'`
`17`	`17`	`)`
`18`	`18`	`mixed_cat_sample_cmip6 = os.path.join(here, 'sample-catalogs/cmip6-bcc-mixed-formats.json')`
	`19`	`+access_columns_with_iterables_cat = os.path.join(`
	`20`	`+ here, 'sample-catalogs/access-columns-with-iterables.json'`
	`21`	`+)`
`19`	`22`
`20`	`23`
`21`	`24`	`sample_df = pd.DataFrame(`