From e29cfa72009f72a35a89fa573b833ccd388611f6 Mon Sep 17 00:00:00 2001 From: HuangShaoyu Date: Sun, 6 Apr 2025 19:35:42 +0000 Subject: [PATCH 1/4] Fix: ambigious ValueError when data with duplicate columns passed to PairGrid --- seaborn/axisgrid.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/seaborn/axisgrid.py b/seaborn/axisgrid.py index 17d333bc89..293544aa9f 100644 --- a/seaborn/axisgrid.py +++ b/seaborn/axisgrid.py @@ -1248,12 +1248,23 @@ def __init__( data = handle_data_source(data) # Sort out the variables that define the grid - numeric_cols = self._find_numeric_cols(data) - if hue in numeric_cols: - numeric_cols.remove(hue) - if vars is not None: + + # It will crash in `_find_numeric_cols` if there are duplicated columns in the data, + # even if these columns are not in the `vars` variable. + # If `vars` is provided, we don't need `_find_numeric_cols`, which will cause extra crash. + # If `vars` is not provided, we need `_find_numeric_cols`, but it crashes with ambigious ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all(). + # My fix is to skip `_find_numeric_cols` when `vars` is provided. + # And raise early error if data.columns is not unique when `vars` is not provided.[I suppose duplicated columns are not expected in PairGrid] + + if vars is not None: # user provide vars x_vars = list(vars) y_vars = list(vars) + else: + if not data.columns.is_unique: + raise ValueError(f"Columns: {data.columns.duplicated()} are duplicated.") + numeric_cols = self._find_numeric_cols(data) + if hue in numeric_cols: + numeric_cols.remove(hue) if x_vars is None: x_vars = numeric_cols if y_vars is None: From 783fa9d8cccfc59ea5d00e1e678c985a154b78c5 Mon Sep 17 00:00:00 2001 From: HuangShaoyu Date: Sun, 6 Apr 2025 20:52:47 +0000 Subject: [PATCH 2/4] lint optimization and add new tests --- seaborn/axisgrid.py | 30 ++++++++++++++++++++++++------ tests/test_axisgrid.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/seaborn/axisgrid.py b/seaborn/axisgrid.py index 293544aa9f..fc743c94ef 100644 --- a/seaborn/axisgrid.py +++ b/seaborn/axisgrid.py @@ -1249,19 +1249,37 @@ def __init__( # Sort out the variables that define the grid - # It will crash in `_find_numeric_cols` if there are duplicated columns in the data, + # `_find_numeric_cols` will crash if there are duplicated columns in the data, # even if these columns are not in the `vars` variable. - # If `vars` is provided, we don't need `_find_numeric_cols`, which will cause extra crash. - # If `vars` is not provided, we need `_find_numeric_cols`, but it crashes with ambigious ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all(). + # If `vars` is provided, we don't need `_find_numeric_cols`. + # If `vars` is not provided, we need `_find_numeric_cols`, + # but it crashes with ambigious ValueError: The truth value of a DataFrame ... # My fix is to skip `_find_numeric_cols` when `vars` is provided. - # And raise early error if data.columns is not unique when `vars` is not provided.[I suppose duplicated columns are not expected in PairGrid] + # And raise early error if data_to_plot.columns is duplicated. + # [I suppose duplicated columns are not expected in PairGrid] - if vars is not None: # user provide vars + if vars is not None: # user provide vars x_vars = list(vars) y_vars = list(vars) + if len(set(vars)) Date: Sun, 6 Apr 2025 20:58:40 +0000 Subject: [PATCH 3/4] fix errors in last commit --- tests/test_axisgrid.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_axisgrid.py b/tests/test_axisgrid.py index f71eeca744..f9ad25e7bc 100644 --- a/tests/test_axisgrid.py +++ b/tests/test_axisgrid.py @@ -787,14 +787,14 @@ def test_remove_hue_from_default(self): def test_duplicates_in_df_columns_without_vars(self): # should fail with clear msg - df_with_dupe = self.df.copy() + df_with_dupe = self.df.loc[:,["x", "y", "z"]].copy() df_with_dupe.columns = ["x", "y", "y"] with pytest.raises(ValueError, match=r"Columns: .* are duplicated\."): a = ag.PairGrid(df_with_dupe) def test_duplicates_in_df_columns_with_related_vars(self): # should fail with clear msg - df_with_dupe = self.df.copy() + df_with_dupe = self.df.loc[:,["x", "y", "z"]].copy() df_with_dupe.columns = ["x", "y", "y"] with pytest.raises(ValueError, match=r"Columns: .* are duplicated\."): a = ag.PairGrid(df_with_dupe,vars = ['x','y']) From 0ac0ef53823a7aa3bcc5f41aab5076ad61ec9433 Mon Sep 17 00:00:00 2001 From: HuangShaoyu Date: Sun, 6 Apr 2025 21:11:59 +0000 Subject: [PATCH 4/4] Optimize lint --- seaborn/axisgrid.py | 10 ++++++---- tests/test_axisgrid.py | 20 +++++++++----------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/seaborn/axisgrid.py b/seaborn/axisgrid.py index fc743c94ef..6c7fe4de77 100644 --- a/seaborn/axisgrid.py +++ b/seaborn/axisgrid.py @@ -1261,7 +1261,7 @@ def __init__( if vars is not None: # user provide vars x_vars = list(vars) y_vars = list(vars) - if len(set(vars))