From b1de27d839aadaa7848796493bd0908e4f78e2fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 24 Oct 2025 13:42:06 +0200 Subject: [PATCH 01/33] qc_metrics methods improved with new metrics, tests and docstrings updated --- ehrapy/preprocessing/_quality_control.py | 117 ++++++++++++++++++++ tests/preprocessing/test_quality_control.py | 28 +++++ 2 files changed, 145 insertions(+) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index fcd70b78e..3e4dc86dc 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd from lamin_utils import logger +from scipy.stats import kurtosis, skew from thefuzz import process from ehrapy._compat import DaskArray, _raise_array_type_not_implemented, function_2D_only, use_ehrdata @@ -46,16 +47,31 @@ def qc_metrics( - `missing_values_abs`: Absolute amount of missing values. - `missing_values_pct`: Relative amount of missing values in percent. + - `unique_values_abs`: Absolute amount of unique values. + - `unique_values_ratio`: Relative amount of unique values in percent. + - `entropy_of_missingness`: Entropy of the missingness pattern for each observation. Higher values indicate a more heterogeneous (less structured) missingness pattern. Feature level metrics include: - `missing_values_abs`: Absolute amount of missing values. - `missing_values_pct`: Relative amount of missing values in percent. + - `unique_values_abs`: Absolute amount of unique values. + - `unique_values_ratio`: Relative amount of unique values in percent. + - `entropy_of_missingness`: Entropy of the missingness pattern for each feature. Higher values indicate a more heterogeneous (less structured) missingness pattern. + - `mean`: Mean value of the features. - `median`: Median value of the features. - `std`: Standard deviation of the features. - `min`: Minimum value of the features. 
- `max`: Maximum value of the features. + - `coefficient_of_variation`: Coefficient of variation of the features. + - `is_constant`: Whether the feature is constant (with near zero variance). + - `constant_variable_ratio`: Relative amount of constant features in percent. + - `range_ratio`: Relative dispersion of features values respective to their mean. + - `skewness`: Skewness of the feature distribution. + - `kurtosis`: Kurtosis of the feature distribution. + - `iqr_outliers`: Whether the feature contains outliers based on the interquartile range (IQR) method. + Examples: >>> import ehrapy as ep @@ -90,6 +106,49 @@ def _(mtx: DaskArray, axis) -> np.ndarray: return da.isnull(mtx).sum(axis).compute() +@singledispatch +def _compute_unique_values(mtx, axis): + _raise_array_type_not_implemented(_compute_unique_values, type(mtx)) + + +@_compute_unique_values.register +def _(mtx: np.ndarray, axis) -> np.ndarray: + return pd.DataFrame(mtx).nunique(axis=axis, dropna=True).to_numpy() + + +@_compute_unique_values.register +def _(mtx: DaskArray, axis) -> np.ndarray: + import dask.array as da + + def nunique_block(block, axis): + return pd.DataFrame(block).nunique(axis=axis, dropna=True).to_numpy() + + return da.map_blocks(nunique_block, mtx, axis=axis, dtype=int).compute() + + +@singledispatch +def _compute_entropy_of_missingness(mtx, axis): + _raise_array_type_not_implemented(_compute_entropy_of_missingness, type(mtx)) + + +@_compute_entropy_of_missingness.register +def _(mtx: np.ndarray, axis) -> np.ndarray: + missing_mask = pd.isnull(mtx) + p_miss = missing_mask.mean(axis=axis) + p = np.clip(p_miss, 1e-10, 1 - 1e-10) # avoid log(0) + return -(p * np.log2(p) + (1 - p) * np.log2(1 - p)) + + +@_compute_entropy_of_missingness.register +def _(mtx: DaskArray, axis) -> np.ndarray: + import dask.array as da + + missing_mask = da.isnan(mtx) + p_miss = missing_mask.mean(axis=axis) + p = da.clip(p_miss, 1e-10, 1 - 1e-10) # avoid log(0) + return -(p * da.log2(p) + (1 - p) * da.log2(1 
- p)).compute() + + def _compute_obs_metrics( mtx, edata: EHRData | AnnData, @@ -131,6 +190,19 @@ def _compute_obs_metrics( obs_metrics["missing_values_abs"] = _compute_missing_values(mtx, axis=1) obs_metrics["missing_values_pct"] = (obs_metrics["missing_values_abs"] / mtx.shape[1]) * 100 + obs_metrics["unique_values_abs"] = _compute_unique_values(mtx, axis=1) + valid_counts = mtx.shape[1] - obs_metrics["missing_values_abs"] + obs_metrics["unique_values_ratio"] = ( + np.where( + valid_counts > 0, + obs_metrics["unique_values_abs"] / valid_counts, + np.nan, + ) + * 100 + ) + + obs_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=1) + # Specific QC metrics for qc_var in qc_vars: obs_metrics[f"total_features_{qc_var}"] = np.ravel(mtx[:, edata.var[qc_var].values].sum(axis=1)) @@ -180,14 +252,34 @@ def _compute_var_metrics( var_metrics["missing_values_abs"] = _compute_missing_values(mtx, axis=0) var_metrics["missing_values_pct"] = (var_metrics["missing_values_abs"] / mtx.shape[0]) * 100 + var_metrics["unique_values_abs"] = _compute_unique_values(mtx, axis=0) + valid_counts = mtx.shape[0] - var_metrics["missing_values_abs"] + var_metrics["unique_values_ratio"] = ( + np.where( + valid_counts > 0, + var_metrics["unique_values_abs"] / valid_counts, + np.nan, + ) + * 100 + ) + + var_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=0) + var_metrics["mean"] = np.nan var_metrics["median"] = np.nan var_metrics["standard_deviation"] = np.nan var_metrics["min"] = np.nan var_metrics["max"] = np.nan + var_metrics["coefficient_of_variation"] = np.nan + var_metrics["is_constant"] = np.nan + var_metrics["constant_variable_ratio"] = np.nan + var_metrics["range_ratio"] = np.nan + var_metrics["skewness"] = np.nan + var_metrics["kurtosis"] = np.nan var_metrics["iqr_outliers"] = np.nan try: + # Calculate statistics for non-categorical variables var_metrics.loc[non_categorical_indices, "mean"] = np.nanmean( mtx[:, 
non_categorical_indices].astype(np.float64), axis=0 ) @@ -203,7 +295,32 @@ def _compute_var_metrics( var_metrics.loc[non_categorical_indices, "max"] = np.nanmax( mtx[:, non_categorical_indices].astype(np.float64), axis=0 ) + var_metrics.loc[non_categorical_indices, "coefficient_of_variation"] = ( + var_metrics.loc[non_categorical_indices, "standard_deviation"] + / var_metrics.loc[non_categorical_indices, "mean"] + ).replace([np.inf, -np.inf], np.nan) + + # Constant column detection + constant_mask = (var_metrics.loc[non_categorical_indices, "standard_deviation"] == 0) | ( + var_metrics.loc[non_categorical_indices, "max"] == var_metrics.loc[non_categorical_indices, "min"] + ) + var_metrics.loc[non_categorical_indices, "is_constant"] = constant_mask + var_metrics["constant_variable_ratio"] = constant_mask.mean() * 100 + + # Calculate range ratio + var_metrics.loc[non_categorical_indices, "range_ratio"] = ( + (var_metrics.loc[non_categorical_indices, "max"] - var_metrics.loc[non_categorical_indices, "min"]) + / var_metrics.loc[non_categorical_indices, "mean"] + ).replace([np.inf, -np.inf], np.nan) * 100 + + # Calculate skewness and kurtosis + var_metrics.loc[non_categorical_indices, "skewness"] = skew( + mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" + ) + var_metrics.loc[non_categorical_indices, "kurtosis"] = kurtosis( + mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" + ) # Calculate IQR and define IQR outliers q1 = np.nanpercentile(mtx[:, non_categorical_indices], 25, axis=0) q3 = np.nanpercentile(mtx[:, non_categorical_indices], 75, axis=0) diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index 297c171cd..8b0c99de0 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -24,13 +24,27 @@ def test_qc_metrics_vanilla(array_type, missing_values_edata): assert 
np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 66.6667])) + assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([2, 1])) + assert np.allclose(obs_metrics["unique_values_ratio"].values, np.array([100.0, 100.0])) + assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([1, 2, 0])) assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 100.0, 0.0])) + assert np.array_equal(var_metrics["unique_values_abs"].values, np.array([1, 0, 2])) + assert np.allclose(var_metrics["unique_values_ratio"].values, np.array([100.0, np.nan, 100.0]), equal_nan=True) + assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 0.0, 0.0])) assert np.allclose(var_metrics["mean"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["median"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True) assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True) + assert np.allclose(var_metrics["coefficient_of_variation"].values, np.array([0.0, np.nan, 0.70263]), equal_nan=True) + assert np.array_equal(var_metrics["is_constant"].values, np.array([True, False, False])) + assert np.allclose( + var_metrics["constant_variable_ratio"].values, np.array([33.3333, 33.3333, 33.3333]), equal_nan=True + ) + assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, 140.52698]), equal_nan=True) + assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) + assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) assert (~var_metrics["iqr_outliers"]).all() # check 
that none of the columns were modified @@ -48,6 +62,9 @@ def test_obs_qc_metrics(array_type, missing_values_edata): assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 66.6667])) + assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([2, 1])) + assert np.allclose(obs_metrics["unique_values_ratio"].values, np.array([100.0, 100.0])) + assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) @pytest.mark.parametrize("array_type", ARRAY_TYPES) @@ -58,10 +75,21 @@ def test_var_qc_metrics(array_type, missing_values_edata): assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([1, 2, 0])) assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 100.0, 0.0])) + assert np.array_equal(var_metrics["unique_values_abs"].values, np.array([1, 0, 2])) + assert np.allclose(var_metrics["unique_values_ratio"].values, np.array([100.0, np.nan, 100.0]), equal_nan=True) + assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 0.0, 0.0])) assert np.allclose(var_metrics["mean"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["median"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True) assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True) + assert np.allclose(var_metrics["coefficient_of_variation"].values, np.array([0.0, np.nan, 0.70263]), equal_nan=True) + assert np.array_equal(var_metrics["is_constant"].values, np.array([True, False, False])) + assert np.allclose( + var_metrics["constant_variable_ratio"].values, np.array([33.3333, 33.3333, 33.3333]), equal_nan=True + ) + assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, 140.52698]), equal_nan=True) + 
assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) + assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) assert (~var_metrics["iqr_outliers"]).all() From dd3e67ac7494604171db2c4a9b92c140e4b344b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 24 Oct 2025 14:31:52 +0200 Subject: [PATCH 02/33] skewness and kurtosis left out --- ehrapy/preprocessing/_quality_control.py | 9 ++++----- tests/preprocessing/test_quality_control.py | 8 ++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 3e4dc86dc..119d23748 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -68,8 +68,6 @@ def qc_metrics( - `is_constant`: Whether the feature is constant (with near zero variance). - `constant_variable_ratio`: Relative amount of constant features in percent. - `range_ratio`: Relative dispersion of features values respective to their mean. - - `skewness`: Skewness of the feature distribution. - - `kurtosis`: Kurtosis of the feature distribution. - `iqr_outliers`: Whether the feature contains outliers based on the interquartile range (IQR) method. 
@@ -274,8 +272,8 @@ def _compute_var_metrics( var_metrics["is_constant"] = np.nan var_metrics["constant_variable_ratio"] = np.nan var_metrics["range_ratio"] = np.nan - var_metrics["skewness"] = np.nan - var_metrics["kurtosis"] = np.nan + # var_metrics["skewness"] = np.nan + # var_metrics["kurtosis"] = np.nan var_metrics["iqr_outliers"] = np.nan try: @@ -315,12 +313,13 @@ def _compute_var_metrics( ).replace([np.inf, -np.inf], np.nan) * 100 # Calculate skewness and kurtosis - var_metrics.loc[non_categorical_indices, "skewness"] = skew( + """var_metrics.loc[non_categorical_indices, "skewness"] = skew( mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" ) var_metrics.loc[non_categorical_indices, "kurtosis"] = kurtosis( mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" ) + """ # Calculate IQR and define IQR outliers q1 = np.nanpercentile(mtx[:, non_categorical_indices], 25, axis=0) q3 = np.nanpercentile(mtx[:, non_categorical_indices], 75, axis=0) diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index 8b0c99de0..f4d6eba58 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -43,8 +43,8 @@ def test_qc_metrics_vanilla(array_type, missing_values_edata): var_metrics["constant_variable_ratio"].values, np.array([33.3333, 33.3333, 33.3333]), equal_nan=True ) assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, 140.52698]), equal_nan=True) - assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) - assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) + # assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) + # assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) assert 
(~var_metrics["iqr_outliers"]).all() # check that none of the columns were modified @@ -88,8 +88,8 @@ def test_var_qc_metrics(array_type, missing_values_edata): var_metrics["constant_variable_ratio"].values, np.array([33.3333, 33.3333, 33.3333]), equal_nan=True ) assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, 140.52698]), equal_nan=True) - assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) - assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) + # assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) + # assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) assert (~var_metrics["iqr_outliers"]).all() From 149b6fe859103caf412cd2f9c2aacd5c8b0cd9f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 24 Oct 2025 17:01:11 +0200 Subject: [PATCH 03/33] is_constant metric for categorical variables datatype set to boolean --- ehrapy/preprocessing/_quality_control.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 119d23748..e423e8a12 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -304,6 +304,8 @@ def _compute_var_metrics( ) var_metrics.loc[non_categorical_indices, "is_constant"] = constant_mask + var_metrics["is_constant"] = var_metrics["is_constant"].astype("boolean") + var_metrics["constant_variable_ratio"] = constant_mask.mean() * 100 # Calculate range ratio From e9d3fc68367c3b61115b6d9fc962d42154570abd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 24 Oct 2025 17:40:15 +0200 Subject: [PATCH 04/33] small error fixed --- ehrapy/preprocessing/_quality_control.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index e423e8a12..7164d6307 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -141,7 +141,7 @@ def _(mtx: np.ndarray, axis) -> np.ndarray: def _(mtx: DaskArray, axis) -> np.ndarray: import dask.array as da - missing_mask = da.isnan(mtx) + missing_mask = da.isnull(mtx) p_miss = missing_mask.mean(axis=axis) p = da.clip(p_miss, 1e-10, 1 - 1e-10) # avoid log(0) return -(p * da.log2(p) + (1 - p) * da.log2(1 - p)).compute() From de05373614ebb7fc37c3b16c141221497b2e895b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 22 Nov 2025 13:25:40 +0000 Subject: [PATCH 05/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ehrapy/preprocessing/_quality_control.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 50a11030e..99de07b4d 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -7,8 +7,8 @@ import numpy as np import pandas as pd -from scipy.stats import kurtosis, skew from ehrdata._logger import logger +from scipy.stats import kurtosis, skew from thefuzz import process from ehrapy._compat import DaskArray, _raise_array_type_not_implemented, function_2D_only, use_ehrdata From e2fc04ad207046ba8c74f854fc46e44d91776ca3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 24 Oct 2025 13:42:06 +0200 Subject: [PATCH 06/33] qc_metrics methods improved with new metrics, tests and docstrings updated --- ehrapy/plot/_scanpy_pl_api.py | 2 +- ehrapy/preprocessing/_quality_control.py | 116 ++++++++++++++++++++ tests/preprocessing/test_quality_control.py | 28 +++++ 3 files changed, 145 insertions(+), 1 deletion(-) diff --git 
a/ehrapy/plot/_scanpy_pl_api.py b/ehrapy/plot/_scanpy_pl_api.py index 99b14b8bc..d770f0481 100644 --- a/ehrapy/plot/_scanpy_pl_api.py +++ b/ehrapy/plot/_scanpy_pl_api.py @@ -478,7 +478,7 @@ def tracksplot( # noqa: D417 gene_symbols=feature_symbols, var_group_positions=var_group_positions, var_group_labels=var_group_labels, - layer=layer, + layer=layer, show=show, save=save, figsize=figsize, diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index a3cbcbbe3..c96b61c68 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -46,16 +46,31 @@ def qc_metrics( - `missing_values_abs`: Absolute amount of missing values. - `missing_values_pct`: Relative amount of missing values in percent. + - `unique_values_abs`: Absolute amount of unique values. + - `unique_values_ratio`: Relative amount of unique values in percent. + - `entropy_of_missingness`: Entropy of the missingness pattern for each observation. Higher values indicate a more heterogeneous (less structured) missingness pattern. Feature level metrics include: - `missing_values_abs`: Absolute amount of missing values. - `missing_values_pct`: Relative amount of missing values in percent. + - `unique_values_abs`: Absolute amount of unique values. + - `unique_values_ratio`: Relative amount of unique values in percent. + - `entropy_of_missingness`: Entropy of the missingness pattern for each feature. Higher values indicate a more heterogeneous (less structured) missingness pattern. + - `mean`: Mean value of the features. - `median`: Median value of the features. - `std`: Standard deviation of the features. - `min`: Minimum value of the features. - `max`: Maximum value of the features. + - `coefficient_of_variation`: Coefficient of variation of the features. + - `is_constant`: Whether the feature is constant (with near zero variance). + - `constant_variable_ratio`: Relative amount of constant features in percent. 
+ - `range_ratio`: Relative dispersion of features values respective to their mean. + - `skewness`: Skewness of the feature distribution. + - `kurtosis`: Kurtosis of the feature distribution. + - `iqr_outliers`: Whether the feature contains outliers based on the interquartile range (IQR) method. + Examples: >>> import ehrapy as ep @@ -90,6 +105,49 @@ def _(mtx: DaskArray, axis) -> np.ndarray: return da.isnull(mtx).sum(axis).compute() +@singledispatch +def _compute_unique_values(mtx, axis): + _raise_array_type_not_implemented(_compute_unique_values, type(mtx)) + + +@_compute_unique_values.register +def _(mtx: np.ndarray, axis) -> np.ndarray: + return pd.DataFrame(mtx).nunique(axis=axis, dropna=True).to_numpy() + + +@_compute_unique_values.register +def _(mtx: DaskArray, axis) -> np.ndarray: + import dask.array as da + + def nunique_block(block, axis): + return pd.DataFrame(block).nunique(axis=axis, dropna=True).to_numpy() + + return da.map_blocks(nunique_block, mtx, axis=axis, dtype=int).compute() + + +@singledispatch +def _compute_entropy_of_missingness(mtx, axis): + _raise_array_type_not_implemented(_compute_entropy_of_missingness, type(mtx)) + + +@_compute_entropy_of_missingness.register +def _(mtx: np.ndarray, axis) -> np.ndarray: + missing_mask = pd.isnull(mtx) + p_miss = missing_mask.mean(axis=axis) + p = np.clip(p_miss, 1e-10, 1 - 1e-10) # avoid log(0) + return -(p * np.log2(p) + (1 - p) * np.log2(1 - p)) + + +@_compute_entropy_of_missingness.register +def _(mtx: DaskArray, axis) -> np.ndarray: + import dask.array as da + + missing_mask = da.isnan(mtx) + p_miss = missing_mask.mean(axis=axis) + p = da.clip(p_miss, 1e-10, 1 - 1e-10) # avoid log(0) + return -(p * da.log2(p) + (1 - p) * da.log2(1 - p)).compute() + + def _compute_obs_metrics( mtx, edata: EHRData | AnnData, @@ -131,6 +189,19 @@ def _compute_obs_metrics( obs_metrics["missing_values_abs"] = _compute_missing_values(mtx, axis=1) obs_metrics["missing_values_pct"] = (obs_metrics["missing_values_abs"] / 
mtx.shape[1]) * 100 + obs_metrics["unique_values_abs"] = _compute_unique_values(mtx, axis=1) + valid_counts = mtx.shape[1] - obs_metrics["missing_values_abs"] + obs_metrics["unique_values_ratio"] = ( + np.where( + valid_counts > 0, + obs_metrics["unique_values_abs"] / valid_counts, + np.nan, + ) + * 100 + ) + + obs_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=1) + # Specific QC metrics for qc_var in qc_vars: obs_metrics[f"total_features_{qc_var}"] = np.ravel(mtx[:, edata.var[qc_var].values].sum(axis=1)) @@ -180,14 +251,34 @@ def _compute_var_metrics( var_metrics["missing_values_abs"] = _compute_missing_values(mtx, axis=0) var_metrics["missing_values_pct"] = (var_metrics["missing_values_abs"] / mtx.shape[0]) * 100 + var_metrics["unique_values_abs"] = _compute_unique_values(mtx, axis=0) + valid_counts = mtx.shape[0] - var_metrics["missing_values_abs"] + var_metrics["unique_values_ratio"] = ( + np.where( + valid_counts > 0, + var_metrics["unique_values_abs"] / valid_counts, + np.nan, + ) + * 100 + ) + + var_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=0) + var_metrics["mean"] = np.nan var_metrics["median"] = np.nan var_metrics["standard_deviation"] = np.nan var_metrics["min"] = np.nan var_metrics["max"] = np.nan + var_metrics["coefficient_of_variation"] = np.nan + var_metrics["is_constant"] = np.nan + var_metrics["constant_variable_ratio"] = np.nan + var_metrics["range_ratio"] = np.nan + var_metrics["skewness"] = np.nan + var_metrics["kurtosis"] = np.nan var_metrics["iqr_outliers"] = np.nan try: + # Calculate statistics for non-categorical variables var_metrics.loc[non_categorical_indices, "mean"] = np.nanmean( mtx[:, non_categorical_indices].astype(np.float64), axis=0 ) @@ -203,7 +294,32 @@ def _compute_var_metrics( var_metrics.loc[non_categorical_indices, "max"] = np.nanmax( mtx[:, non_categorical_indices].astype(np.float64), axis=0 ) + var_metrics.loc[non_categorical_indices, 
"coefficient_of_variation"] = ( + var_metrics.loc[non_categorical_indices, "standard_deviation"] + / var_metrics.loc[non_categorical_indices, "mean"] + ).replace([np.inf, -np.inf], np.nan) + + # Constant column detection + constant_mask = (var_metrics.loc[non_categorical_indices, "standard_deviation"] == 0) | ( + var_metrics.loc[non_categorical_indices, "max"] == var_metrics.loc[non_categorical_indices, "min"] + ) + var_metrics.loc[non_categorical_indices, "is_constant"] = constant_mask + var_metrics["constant_variable_ratio"] = constant_mask.mean() * 100 + + # Calculate range ratio + var_metrics.loc[non_categorical_indices, "range_ratio"] = ( + (var_metrics.loc[non_categorical_indices, "max"] - var_metrics.loc[non_categorical_indices, "min"]) + / var_metrics.loc[non_categorical_indices, "mean"] + ).replace([np.inf, -np.inf], np.nan) * 100 + + # Calculate skewness and kurtosis + var_metrics.loc[non_categorical_indices, "skewness"] = skew( + mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" + ) + var_metrics.loc[non_categorical_indices, "kurtosis"] = kurtosis( + mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" + ) # Calculate IQR and define IQR outliers q1 = np.nanpercentile(mtx[:, non_categorical_indices], 25, axis=0) q3 = np.nanpercentile(mtx[:, non_categorical_indices], 75, axis=0) diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index 73247f4c5..938797008 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -24,13 +24,27 @@ def test_qc_metrics_vanilla(array_type, missing_values_edata): assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 66.6667])) + assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([2, 1])) + assert 
np.allclose(obs_metrics["unique_values_ratio"].values, np.array([100.0, 100.0])) + assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([1, 2, 0])) assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 100.0, 0.0])) + assert np.array_equal(var_metrics["unique_values_abs"].values, np.array([1, 0, 2])) + assert np.allclose(var_metrics["unique_values_ratio"].values, np.array([100.0, np.nan, 100.0]), equal_nan=True) + assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 0.0, 0.0])) assert np.allclose(var_metrics["mean"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["median"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True) assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True) + assert np.allclose(var_metrics["coefficient_of_variation"].values, np.array([0.0, np.nan, 0.70263]), equal_nan=True) + assert np.array_equal(var_metrics["is_constant"].values, np.array([True, False, False])) + assert np.allclose( + var_metrics["constant_variable_ratio"].values, np.array([33.3333, 33.3333, 33.3333]), equal_nan=True + ) + assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, 140.52698]), equal_nan=True) + assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) + assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) assert (~var_metrics["iqr_outliers"]).all() # check that none of the columns were modified @@ -48,6 +62,9 @@ def test_obs_qc_metrics(array_type, missing_values_edata): assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) assert np.allclose(obs_metrics["missing_values_pct"].values, 
np.array([33.3333, 66.6667])) + assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([2, 1])) + assert np.allclose(obs_metrics["unique_values_ratio"].values, np.array([100.0, 100.0])) + assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) @pytest.mark.parametrize("array_type", ARRAY_TYPES) @@ -58,10 +75,21 @@ def test_var_qc_metrics(array_type, missing_values_edata): assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([1, 2, 0])) assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 100.0, 0.0])) + assert np.array_equal(var_metrics["unique_values_abs"].values, np.array([1, 0, 2])) + assert np.allclose(var_metrics["unique_values_ratio"].values, np.array([100.0, np.nan, 100.0]), equal_nan=True) + assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 0.0, 0.0])) assert np.allclose(var_metrics["mean"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["median"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True) assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True) + assert np.allclose(var_metrics["coefficient_of_variation"].values, np.array([0.0, np.nan, 0.70263]), equal_nan=True) + assert np.array_equal(var_metrics["is_constant"].values, np.array([True, False, False])) + assert np.allclose( + var_metrics["constant_variable_ratio"].values, np.array([33.3333, 33.3333, 33.3333]), equal_nan=True + ) + assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, 140.52698]), equal_nan=True) + assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) + assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) assert (~var_metrics["iqr_outliers"]).all() From 
9e41a831c9be7bb029f6ea876d11f6d7328f7326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 24 Oct 2025 14:31:52 +0200 Subject: [PATCH 07/33] skewness and kurtosis left out --- ehrapy/preprocessing/_quality_control.py | 9 ++++----- tests/preprocessing/test_quality_control.py | 8 ++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index c96b61c68..d6985173a 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -67,8 +67,6 @@ def qc_metrics( - `is_constant`: Whether the feature is constant (with near zero variance). - `constant_variable_ratio`: Relative amount of constant features in percent. - `range_ratio`: Relative dispersion of features values respective to their mean. - - `skewness`: Skewness of the feature distribution. - - `kurtosis`: Kurtosis of the feature distribution. - `iqr_outliers`: Whether the feature contains outliers based on the interquartile range (IQR) method. 
@@ -273,8 +271,8 @@ def _compute_var_metrics( var_metrics["is_constant"] = np.nan var_metrics["constant_variable_ratio"] = np.nan var_metrics["range_ratio"] = np.nan - var_metrics["skewness"] = np.nan - var_metrics["kurtosis"] = np.nan + # var_metrics["skewness"] = np.nan + # var_metrics["kurtosis"] = np.nan var_metrics["iqr_outliers"] = np.nan try: @@ -314,12 +312,13 @@ def _compute_var_metrics( ).replace([np.inf, -np.inf], np.nan) * 100 # Calculate skewness and kurtosis - var_metrics.loc[non_categorical_indices, "skewness"] = skew( + """var_metrics.loc[non_categorical_indices, "skewness"] = skew( mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" ) var_metrics.loc[non_categorical_indices, "kurtosis"] = kurtosis( mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" ) + """ # Calculate IQR and define IQR outliers q1 = np.nanpercentile(mtx[:, non_categorical_indices], 25, axis=0) q3 = np.nanpercentile(mtx[:, non_categorical_indices], 75, axis=0) diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index 938797008..83f775565 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -43,8 +43,8 @@ def test_qc_metrics_vanilla(array_type, missing_values_edata): var_metrics["constant_variable_ratio"].values, np.array([33.3333, 33.3333, 33.3333]), equal_nan=True ) assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, 140.52698]), equal_nan=True) - assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) - assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) + # assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) + # assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) assert 
(~var_metrics["iqr_outliers"]).all() # check that none of the columns were modified @@ -88,8 +88,8 @@ def test_var_qc_metrics(array_type, missing_values_edata): var_metrics["constant_variable_ratio"].values, np.array([33.3333, 33.3333, 33.3333]), equal_nan=True ) assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, 140.52698]), equal_nan=True) - assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) - assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) + # assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) + # assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) assert (~var_metrics["iqr_outliers"]).all() From dd2bee916b4e7b06807cfeb8ee4a5b7d3c5cfaf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 24 Oct 2025 17:01:11 +0200 Subject: [PATCH 08/33] is_constant metric for categorical variables datatype set to boolean --- ehrapy/preprocessing/_quality_control.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index d6985173a..5aacf1148 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -303,6 +303,8 @@ def _compute_var_metrics( ) var_metrics.loc[non_categorical_indices, "is_constant"] = constant_mask + var_metrics["is_constant"] = var_metrics["is_constant"].astype("boolean") + var_metrics["constant_variable_ratio"] = constant_mask.mean() * 100 # Calculate range ratio From 5e143c28ff97ebd21f3445c11f252113f701aa77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 24 Oct 2025 17:40:15 +0200 Subject: [PATCH 09/33] small error fixed --- ehrapy/preprocessing/_quality_control.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 5aacf1148..d68126266 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -140,7 +140,7 @@ def _(mtx: np.ndarray, axis) -> np.ndarray: def _(mtx: DaskArray, axis) -> np.ndarray: import dask.array as da - missing_mask = da.isnan(mtx) + missing_mask = da.isnull(mtx) p_miss = missing_mask.mean(axis=axis) p = da.clip(p_miss, 1e-10, 1 - 1e-10) # avoid log(0) return -(p * da.log2(p) + (1 - p) * da.log2(1 - p)).compute() From f5f36f18b697826e2b910ab8c1b99fa876c4b866 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 24 Nov 2025 17:48:25 +0000 Subject: [PATCH 10/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ehrapy/plot/_scanpy_pl_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ehrapy/plot/_scanpy_pl_api.py b/ehrapy/plot/_scanpy_pl_api.py index d770f0481..99b14b8bc 100644 --- a/ehrapy/plot/_scanpy_pl_api.py +++ b/ehrapy/plot/_scanpy_pl_api.py @@ -478,7 +478,7 @@ def tracksplot( # noqa: D417 gene_symbols=feature_symbols, var_group_positions=var_group_positions, var_group_labels=var_group_labels, - layer=layer, + layer=layer, show=show, save=save, figsize=figsize, From a95cfe9205c860450be2a72ab24467205a7c9669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Tue, 25 Nov 2025 23:06:11 +0100 Subject: [PATCH 11/33] new argument advanced added to qc_metrics, distinction between categorical & numerical vars, tests applied to both advanced=True and advanced=False --- ehrapy/preprocessing/_quality_control.py | 189 +++++++++++++------- tests/conftest.py | 19 ++ tests/preprocessing/test_quality_control.py | 48 ++++- 3 files changed, 183 insertions(+), 73 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py 
index 99de07b4d..23019054b 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -29,6 +29,7 @@ def qc_metrics( qc_vars: Collection[str] = (), *, layer: str | None = None, + advanced: bool = False, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Calculates various quality control metrics. @@ -39,36 +40,42 @@ def qc_metrics( edata: Central data object. qc_vars: Optional List of vars to calculate additional metrics for. layer: Layer to use to calculate the metrics. + advanced: Determines if the advanced metrics should be calculated that require feature type information. If it is set to `True`, ehrdata.infer_feature_types must be run first. Returns: Two Pandas DataFrames of all calculated QC metrics for `obs` and `var` respectively. - Observation level metrics include: + Default observation level metrics include: - `missing_values_abs`: Absolute amount of missing values. - `missing_values_pct`: Relative amount of missing values in percent. + - `entropy_of_missingness`: Entropy of the missingness pattern for each observation. Higher values indicate a more heterogeneous (less structured) missingness pattern. + + Advanced observation level metrics include: - `unique_values_abs`: Absolute amount of unique values. - `unique_values_ratio`: Relative amount of unique values in percent. - - `entropy_of_missingness`: Entropy of the missingness pattern for each observation. Higher values indicate a more heterogeneous (less structured) missingness pattern. Feature level metrics include: - `missing_values_abs`: Absolute amount of missing values. - `missing_values_pct`: Relative amount of missing values in percent. - - `unique_values_abs`: Absolute amount of unique values. - - `unique_values_ratio`: Relative amount of unique values in percent. - `entropy_of_missingness`: Entropy of the missingness pattern for each feature. Higher values indicate a more heterogeneous (less structured) missingness pattern. 
- - `mean`: Mean value of the features. - `median`: Median value of the features. - `std`: Standard deviation of the features. - `min`: Minimum value of the features. - `max`: Maximum value of the features. + - `iqr_outliers`: Whether the feature contains outliers based on the interquartile range (IQR) method. + + + Advanced feature level metrics include: + + - `unique_values_abs`: Absolute amount of unique values. + - `unique_values_ratio`: Relative amount of unique values in percent. - `coefficient_of_variation`: Coefficient of variation of the features. - `is_constant`: Whether the feature is constant (with near zero variance). - `constant_variable_ratio`: Relative amount of constant features in percent. - `range_ratio`: Relative dispersion of features values respective to their mean. - - `iqr_outliers`: Whether the feature contains outliers based on the interquartile range (IQR) method. Examples: @@ -77,9 +84,17 @@ def qc_metrics( >>> obs_qc, var_qc = ep.pp.qc_metrics(edata) >>> obs_qc["missing_values_pct"].plot(kind="hist", bins=20) """ + if advanced: + feature_type = edata.var.get("feature_type", None) + if feature_type is None: + raise ValueError( + "Advanced variable metrics require `edata.var['feature_type']`. " + "Please run `infer_feature_types(edata)` first." 
+ ) + mtx = edata.X if layer is None else edata.layers[layer] - var_metrics = _compute_var_metrics(mtx, edata) - obs_metrics = _compute_obs_metrics(mtx, edata, qc_vars=qc_vars, log1p=True) + var_metrics = _compute_var_metrics(mtx, edata, advanced=advanced) + obs_metrics = _compute_obs_metrics(mtx, edata, qc_vars=qc_vars, log1p=True, advanced=advanced) edata.var[var_metrics.columns] = var_metrics edata.obs[obs_metrics.columns] = obs_metrics @@ -92,12 +107,12 @@ def _compute_missing_values(mtx, axis): _raise_array_type_not_implemented(_compute_missing_values, type(mtx)) -@_compute_missing_values.register +@_compute_missing_values.register(np.ndarray) def _(mtx: np.ndarray, axis) -> np.ndarray: return pd.isnull(mtx).sum(axis) -@_compute_missing_values.register +@_compute_missing_values.register(DaskArray) def _(mtx: DaskArray, axis) -> np.ndarray: import dask.array as da @@ -109,12 +124,12 @@ def _compute_unique_values(mtx, axis): _raise_array_type_not_implemented(_compute_unique_values, type(mtx)) -@_compute_unique_values.register +@_compute_unique_values.register(np.ndarray) def _(mtx: np.ndarray, axis) -> np.ndarray: return pd.DataFrame(mtx).nunique(axis=axis, dropna=True).to_numpy() -@_compute_unique_values.register +@_compute_unique_values.register(DaskArray) def _(mtx: DaskArray, axis) -> np.ndarray: import dask.array as da @@ -129,7 +144,7 @@ def _compute_entropy_of_missingness(mtx, axis): _raise_array_type_not_implemented(_compute_entropy_of_missingness, type(mtx)) -@_compute_entropy_of_missingness.register +@_compute_entropy_of_missingness.register(np.ndarray) def _(mtx: np.ndarray, axis) -> np.ndarray: missing_mask = pd.isnull(mtx) p_miss = missing_mask.mean(axis=axis) @@ -137,7 +152,7 @@ def _(mtx: np.ndarray, axis) -> np.ndarray: return -(p * np.log2(p) + (1 - p) * np.log2(1 - p)) -@_compute_entropy_of_missingness.register +@_compute_entropy_of_missingness.register(DaskArray) def _(mtx: DaskArray, axis) -> np.ndarray: import dask.array as da @@ -153,6 
+168,7 @@ def _compute_obs_metrics( *, qc_vars: Collection[str] = (), log1p: bool = True, + advanced: bool = False, ): """Calculates quality control metrics for observations. @@ -163,6 +179,7 @@ def _compute_obs_metrics( edata: Central data object. qc_vars: A list of previously calculated QC metrics to calculate summary statistics for. log1p: Whether to apply log1p normalization for the QC metrics. Only used with parameter 'qc_vars'. + advanced: Whether to calculate further metrics that require feature type information. Returns: A Pandas DataFrame with the calculated metrics. @@ -187,19 +204,35 @@ def _compute_obs_metrics( obs_metrics["missing_values_abs"] = _compute_missing_values(mtx, axis=1) obs_metrics["missing_values_pct"] = (obs_metrics["missing_values_abs"] / mtx.shape[1]) * 100 + obs_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=1) - obs_metrics["unique_values_abs"] = _compute_unique_values(mtx, axis=1) - valid_counts = mtx.shape[1] - obs_metrics["missing_values_abs"] - obs_metrics["unique_values_ratio"] = ( - np.where( - valid_counts > 0, - obs_metrics["unique_values_abs"] / valid_counts, - np.nan, - ) - * 100 - ) + if advanced and "feature_type" in edata.var: + feature_type = edata.var["feature_type"] + categorical_mask = feature_type == "categorical" - obs_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=1) + if np.any(categorical_mask): + cat_mask_np = np.asarray(categorical_mask) + mtx_cat = mtx[:, cat_mask_np] + + unique_val_abs = _compute_unique_values(mtx_cat, axis=1) + missing_cat = _compute_missing_values(mtx_cat, axis=1) + valid_counts = mtx_cat.shape[1] - missing_cat + + unique_val_ratio = ( + np.where( + valid_counts > 0, + unique_val_abs / valid_counts, + np.nan, + ) + * 100 + ) + else: + n_obs = mtx.shape[0] + unique_val_abs = np.full(n_obs, np.nan) + unique_val_ratio = np.full(n_obs, np.nan) + + obs_metrics["unique_values_abs"] = unique_val_abs + obs_metrics["unique_values_ratio"] 
= unique_val_ratio # Specific QC metrics for qc_var in qc_vars: @@ -217,12 +250,14 @@ def _compute_obs_metrics( def _compute_var_metrics( mtx, edata: EHRData | AnnData, + advanced: bool = False, ): """Compute variable metrics for quality control. Args: mtx: Data array. edata: Central data object. + advanced: Whether to calculate further metrics that require feature type information. """ categorical_indices = np.ndarray([0], dtype=int) var_metrics = pd.DataFrame(index=edata.var_names) @@ -249,19 +284,36 @@ def _compute_var_metrics( var_metrics["missing_values_abs"] = _compute_missing_values(mtx, axis=0) var_metrics["missing_values_pct"] = (var_metrics["missing_values_abs"] / mtx.shape[0]) * 100 + var_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=0) - var_metrics["unique_values_abs"] = _compute_unique_values(mtx, axis=0) - valid_counts = mtx.shape[0] - var_metrics["missing_values_abs"] - var_metrics["unique_values_ratio"] = ( - np.where( - valid_counts > 0, - var_metrics["unique_values_abs"] / valid_counts, - np.nan, - ) - * 100 - ) + if advanced and "feature_type" in edata.var: + feature_type = edata.var["feature_type"] + categorical_mask = feature_type == "categorical" - var_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=0) + n_vars = mtx.shape[1] + unique_val_abs_full = np.full(n_vars, np.nan) + unique_val_ratio_full = np.full(n_vars, np.nan) + + if np.any(categorical_mask): + cat_mask_np = np.asarray(categorical_mask) + + mtx_cat = mtx[:, cat_mask_np] + + unique_val_abs = _compute_unique_values(mtx_cat, axis=0) + missing_cat = _compute_missing_values(mtx_cat, axis=0) + valid_counts = mtx_cat.shape[0] - missing_cat + + unique_val_ratio = np.where( + valid_counts > 0, + unique_val_abs / valid_counts * 100, + np.nan, + ) + + unique_val_abs_full[cat_mask_np] = unique_val_abs + unique_val_ratio_full[cat_mask_np] = unique_val_ratio + + var_metrics["unique_values_abs"] = unique_val_abs_full + 
var_metrics["unique_values_ratio"] = unique_val_ratio_full var_metrics["mean"] = np.nan var_metrics["median"] = np.nan @@ -293,35 +345,7 @@ def _compute_var_metrics( var_metrics.loc[non_categorical_indices, "max"] = np.nanmax( mtx[:, non_categorical_indices].astype(np.float64), axis=0 ) - var_metrics.loc[non_categorical_indices, "coefficient_of_variation"] = ( - var_metrics.loc[non_categorical_indices, "standard_deviation"] - / var_metrics.loc[non_categorical_indices, "mean"] - ).replace([np.inf, -np.inf], np.nan) - - # Constant column detection - constant_mask = (var_metrics.loc[non_categorical_indices, "standard_deviation"] == 0) | ( - var_metrics.loc[non_categorical_indices, "max"] == var_metrics.loc[non_categorical_indices, "min"] - ) - - var_metrics.loc[non_categorical_indices, "is_constant"] = constant_mask - var_metrics["is_constant"] = var_metrics["is_constant"].astype("boolean") - var_metrics["constant_variable_ratio"] = constant_mask.mean() * 100 - - # Calculate range ratio - var_metrics.loc[non_categorical_indices, "range_ratio"] = ( - (var_metrics.loc[non_categorical_indices, "max"] - var_metrics.loc[non_categorical_indices, "min"]) - / var_metrics.loc[non_categorical_indices, "mean"] - ).replace([np.inf, -np.inf], np.nan) * 100 - - # Calculate skewness and kurtosis - """var_metrics.loc[non_categorical_indices, "skewness"] = skew( - mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" - ) - var_metrics.loc[non_categorical_indices, "kurtosis"] = kurtosis( - mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" - ) - """ # Calculate IQR and define IQR outliers q1 = np.nanpercentile(mtx[:, non_categorical_indices], 25, axis=0) q3 = np.nanpercentile(mtx[:, non_categorical_indices], 75, axis=0) @@ -335,6 +359,43 @@ def _compute_var_metrics( ) # Fill all non_categoricals with False because else we have a dtype object Series which h5py cannot save var_metrics["iqr_outliers"] = 
var_metrics["iqr_outliers"].astype(bool).fillna(False) + + if advanced: + feature_type = edata.var["feature_type"] + numeric_mask = feature_type == "numeric" + + non_categorical_indices = np.asarray(numeric_mask) + + if np.any(non_categorical_indices): + var_metrics.loc[non_categorical_indices, "coefficient_of_variation"] = ( + var_metrics.loc[non_categorical_indices, "standard_deviation"] + / var_metrics.loc[non_categorical_indices, "mean"] + ).replace([np.inf, -np.inf], np.nan) + + # Constant column detection + constant_mask = (var_metrics.loc[non_categorical_indices, "standard_deviation"] == 0) | ( + var_metrics.loc[non_categorical_indices, "max"] == var_metrics.loc[non_categorical_indices, "min"] + ) + + var_metrics.loc[non_categorical_indices, "is_constant"] = constant_mask + + var_metrics["constant_variable_ratio"] = constant_mask.mean() * 100 + + # Calculate range ratio + var_metrics.loc[non_categorical_indices, "range_ratio"] = ( + (var_metrics.loc[non_categorical_indices, "max"] - var_metrics.loc[non_categorical_indices, "min"]) + / var_metrics.loc[non_categorical_indices, "mean"] + ).replace([np.inf, -np.inf], np.nan) * 100 + + # Calculate skewness and kurtosis + """var_metrics.loc[non_categorical_indices, "skewness"] = skew( + mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" + ) + var_metrics.loc[non_categorical_indices, "kurtosis"] = kurtosis( + mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" + ) + """ + var_metrics = var_metrics.infer_objects() except (TypeError, ValueError): # We assume that the data just hasn't been encoded yet diff --git a/tests/conftest.py b/tests/conftest.py index a134caf29..3e5c1dc7e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -52,6 +52,16 @@ def var_data(): } +@pytest.fixture +def var_data_adv(): + return { + "alive": ["yes", "no", "maybe"], + "hospital": ["hospital 1", "hospital 2", "hospital 1"], + "crazy": ["yes", "yes", "yes"], + 
"feature_type": ["numeric", "numeric", "categorical"], + } + + @pytest.fixture def edata_feature_type_specifications(): df = pd.DataFrame( @@ -79,6 +89,15 @@ def missing_values_edata(obs_data, var_data): ) +@pytest.fixture +def missing_values_edata_adv(obs_data, var_data_adv): + return ed.EHRData( + X=np.array([[0.21, np.nan, 41.42], [np.nan, np.nan, 7.234]], dtype=np.float32), + obs=pd.DataFrame(data=obs_data), + var=pd.DataFrame(data=var_data_adv, index=["Acetaminophen", "hospital", "crazy"]), + ) + + @pytest.fixture def lab_measurements_simple_edata(obs_data, var_data): X = np.array([[73, 0.02, 1.00], [148, 0.25, 3.55]], dtype=np.float32) diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index b2388cb68..6749048c3 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -62,7 +62,18 @@ def test_obs_qc_metrics(array_type, missing_values_edata): assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 66.6667])) - assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([2, 1])) + assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) + + +@pytest.mark.parametrize("array_type", ARRAY_TYPES_NONNUMERIC) +def test_obs_qc_metrics_advanced(array_type, missing_values_edata_adv): + missing_values_edata_adv.X = array_type(missing_values_edata_adv.X) + mtx = missing_values_edata_adv.X + obs_metrics = _compute_obs_metrics(mtx, missing_values_edata_adv, advanced=True) + + assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) + assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 66.6667])) + assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([1, 1])) assert np.allclose(obs_metrics["unique_values_ratio"].values, np.array([100.0, 100.0])) 
assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) @@ -75,19 +86,38 @@ def test_var_qc_metrics(array_type, missing_values_edata): assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([1, 2, 0])) assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 100.0, 0.0])) - assert np.array_equal(var_metrics["unique_values_abs"].values, np.array([1, 0, 2])) - assert np.allclose(var_metrics["unique_values_ratio"].values, np.array([100.0, np.nan, 100.0]), equal_nan=True) assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 0.0, 0.0])) assert np.allclose(var_metrics["mean"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["median"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True) assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True) - assert np.allclose(var_metrics["coefficient_of_variation"].values, np.array([0.0, np.nan, 0.70263]), equal_nan=True) - assert np.array_equal(var_metrics["is_constant"].values, np.array([True, False, False])) - assert np.allclose( - var_metrics["constant_variable_ratio"].values, np.array([33.3333, 33.3333, 33.3333]), equal_nan=True - ) - assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, 140.52698]), equal_nan=True) + assert (~var_metrics["iqr_outliers"]).all() + + +@pytest.mark.parametrize("array_type", ARRAY_TYPES_NONNUMERIC) +def test_var_qc_metrics_advanced(array_type, missing_values_edata_adv): + missing_values_edata_adv.X = array_type(missing_values_edata_adv.X) + mtx = missing_values_edata_adv.X + var_metrics = _compute_var_metrics(mtx, missing_values_edata_adv, advanced=True) + + assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([1, 2, 0])) + assert 
np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 100.0, 0.0])) + assert np.allclose(var_metrics["unique_values_abs"].values, np.array([np.nan, np.nan, 2.0]), equal_nan=True) + assert np.allclose(var_metrics["unique_values_ratio"].values, np.array([np.nan, np.nan, 100.0]), equal_nan=True) + assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 0.0, 0.0])) + assert np.allclose(var_metrics["mean"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) + assert np.allclose(var_metrics["median"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) + assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True) + assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True) + assert np.allclose(var_metrics["coefficient_of_variation"].values, np.array([0.0, np.nan, np.nan]), equal_nan=True) + + is_const = var_metrics["is_constant"].values + assert is_const[0] is True + assert is_const[1] is False + assert np.isnan(is_const[2]) + + assert np.allclose(var_metrics["constant_variable_ratio"].values, np.array([50.0, 50.0, 50.0]), equal_nan=True) + assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, np.nan]), equal_nan=True) # assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) # assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) assert (~var_metrics["iqr_outliers"]).all() From 3839c2eadb2420c3dc548eef2fd85e41d702f67b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Tue, 25 Nov 2025 23:19:30 +0100 Subject: [PATCH 12/33] array types moved from conftest.py to _compat.py --- ehrapy/_compat.py | 23 +++++++++++++++++++++++ tests/conftest.py | 30 +++++++----------------------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/ehrapy/_compat.py b/ehrapy/_compat.py index 6777136a0..cd0084b0e 100644 
--- a/ehrapy/_compat.py +++ b/ehrapy/_compat.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, ParamSpec, TypeVar, cast import numpy as np +import scipy.sparse as sp P = ParamSpec("P") R = TypeVar("R") @@ -248,3 +249,25 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: return wrapper return decorator + + +def asarray(a): + import numpy as np + + return np.asarray(a) + + +def as_dense_dask_array(a, chunk_size=1000): + import dask.array as da + + return da.from_array(a, chunks=chunk_size) + + +ARRAY_TYPES_NUMERIC = ( + asarray, + as_dense_dask_array, + sp.csr_array, + sp.csc_array, +) # add coo_array once supported in AnnData +ARRAY_TYPES_NUMERIC_3D_ABLE = (asarray, as_dense_dask_array) # add coo_array once supported in AnnData +ARRAY_TYPES_NONNUMERIC = (asarray, as_dense_dask_array) diff --git a/tests/conftest.py b/tests/conftest.py index 3e5c1dc7e..cff014e48 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,12 +9,18 @@ import numpy as np import pandas as pd import pytest -import scipy.sparse as sp from anndata import AnnData from ehrdata.core.constants import CATEGORICAL_TAG, DEFAULT_TEM_LAYER_NAME, FEATURE_TYPE_KEY, NUMERIC_TAG from matplotlib.testing.compare import compare_images import ehrapy as ep +from ehrapy._compat import ( + ARRAY_TYPES_NONNUMERIC, + ARRAY_TYPES_NUMERIC, + ARRAY_TYPES_NUMERIC_3D_ABLE, + as_dense_dask_array, + asarray, +) if TYPE_CHECKING: import os @@ -439,25 +445,3 @@ def clean_up_plots(): plt.clf() plt.cla() plt.close("all") - - -def asarray(a): - import numpy as np - - return np.asarray(a) - - -def as_dense_dask_array(a, chunk_size=1000): - import dask.array as da - - return da.from_array(a, chunks=chunk_size) - - -ARRAY_TYPES_NUMERIC = ( - asarray, - as_dense_dask_array, - sp.csr_array, - sp.csc_array, -) # add coo_array once supported in AnnData -ARRAY_TYPES_NUMERIC_3D_ABLE = (asarray, as_dense_dask_array) # add coo_array once supported in AnnData -ARRAY_TYPES_NONNUMERIC = (asarray, as_dense_dask_array) From 
5ba8e6bded1486fc82217000fce1ec365fb03ea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Tue, 25 Nov 2025 23:29:58 +0100 Subject: [PATCH 13/33] vanilla test advanced added and default fixed --- tests/preprocessing/test_quality_control.py | 50 ++++++++++++++++----- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index 6749048c3..3d7d4dc06 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -24,27 +24,55 @@ def test_qc_metrics_vanilla(array_type, missing_values_edata): assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 66.6667])) - assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([2, 1])) + assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) + + assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([1, 2, 0])) + assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 100.0, 0.0])) + assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 0.0, 0.0])) + assert np.allclose(var_metrics["mean"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) + assert np.allclose(var_metrics["median"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) + assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True) + assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True) + assert (~var_metrics["iqr_outliers"]).all() + + # check that none of the columns were modified + for key in modification_copy.obs.keys(): + assert np.array_equal(modification_copy.obs[key], adata.obs[key]) + for key in modification_copy.var.keys(): + assert np.array_equal(modification_copy.var[key], 
adata.var[key]) + + +@pytest.mark.parametrize("array_type", ARRAY_TYPES_NONNUMERIC) +def test_qc_metrics_vanilla_advanced(array_type, missing_values_edata_adv): + adata = missing_values_edata_adv + adata.X = array_type(adata.X) + modification_copy = adata.copy() + obs_metrics, var_metrics = ep.pp.qc_metrics(adata, advanced=True) + + assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) + assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 66.6667])) + assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([1, 1])) assert np.allclose(obs_metrics["unique_values_ratio"].values, np.array([100.0, 100.0])) assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([1, 2, 0])) assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 100.0, 0.0])) - assert np.array_equal(var_metrics["unique_values_abs"].values, np.array([1, 0, 2])) - assert np.allclose(var_metrics["unique_values_ratio"].values, np.array([100.0, np.nan, 100.0]), equal_nan=True) + assert np.allclose(var_metrics["unique_values_abs"].values, np.array([np.nan, np.nan, 2.0]), equal_nan=True) + assert np.allclose(var_metrics["unique_values_ratio"].values, np.array([np.nan, np.nan, 100.0]), equal_nan=True) assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 0.0, 0.0])) assert np.allclose(var_metrics["mean"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["median"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True) assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True) - assert np.allclose(var_metrics["coefficient_of_variation"].values, np.array([0.0, np.nan, 0.70263]), equal_nan=True) - assert 
np.array_equal(var_metrics["is_constant"].values, np.array([True, False, False])) - assert np.allclose( - var_metrics["constant_variable_ratio"].values, np.array([33.3333, 33.3333, 33.3333]), equal_nan=True - ) - assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, 140.52698]), equal_nan=True) - # assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) - # assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) + assert np.allclose(var_metrics["coefficient_of_variation"].values, np.array([0.0, np.nan, np.nan]), equal_nan=True) + + is_const = var_metrics["is_constant"].values + assert is_const[0] is True + assert is_const[1] is False + assert np.isnan(is_const[2]) + + assert np.allclose(var_metrics["constant_variable_ratio"].values, np.array([50.0, 50.0, 50.0]), equal_nan=True) + assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, np.nan]), equal_nan=True) assert (~var_metrics["iqr_outliers"]).all() # check that none of the columns were modified From ddb893107d4cae9351c9f51213e1764324591684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 28 Nov 2025 12:27:40 +0100 Subject: [PATCH 14/33] fixed small things considering the reviews --- ehrapy/preprocessing/_quality_control.py | 82 ++++++++++++++---------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 23019054b..1f7e703ec 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -1,6 +1,7 @@ from __future__ import annotations import copy +import warnings from functools import singledispatch from pathlib import Path from typing import TYPE_CHECKING, Literal @@ -34,28 +35,29 @@ def qc_metrics( """Calculates various quality control metrics. 
Uses the original values to calculate the metrics and not the encoded ones. - Look at the return type for a more in depth description of the calculated metrics. + Look at the return type for a more in depth description of the basic and advanced calculated metrics. + Args: edata: Central data object. qc_vars: Optional List of vars to calculate additional metrics for. layer: Layer to use to calculate the metrics. - advanced: Determines if the advanced metrics should be calculated that require feature type information. If it is set to `True`, ehrdata.infer_feature_types must be run first. + advanced: Determines if the advanced metrics should be calculated that require feature type information. If it is set to `True`, ehrdata.infer_feature_types must be run first. Default is `False`. Returns: Two Pandas DataFrames of all calculated QC metrics for `obs` and `var` respectively. - Default observation level metrics include: + Basic (default) observation level metrics include: - `missing_values_abs`: Absolute amount of missing values. - `missing_values_pct`: Relative amount of missing values in percent. - `entropy_of_missingness`: Entropy of the missingness pattern for each observation. Higher values indicate a more heterogeneous (less structured) missingness pattern. - Advanced observation level metrics include: + Advanced observation level metrics include (only computed if advanced is set to `True`): - `unique_values_abs`: Absolute amount of unique values. - `unique_values_ratio`: Relative amount of unique values in percent. - Feature level metrics include: + Basic (default) feature level metrics include: - `missing_values_abs`: Absolute amount of missing values. - `missing_values_pct`: Relative amount of missing values in percent. @@ -68,7 +70,7 @@ def qc_metrics( - `iqr_outliers`: Whether the feature contains outliers based on the interquartile range (IQR) method. 
- Advanced feature level metrics include: + Advanced feature level metrics include (only computed if advanced is set to `True`): - `unique_values_abs`: Absolute amount of unique values. - `unique_values_ratio`: Relative amount of unique values in percent. @@ -84,17 +86,21 @@ def qc_metrics( >>> obs_qc, var_qc = ep.pp.qc_metrics(edata) >>> obs_qc["missing_values_pct"].plot(kind="hist", bins=20) """ + feature_type = edata.var.get("feature_type", None) + if_advanced = advanced if advanced: - feature_type = edata.var.get("feature_type", None) if feature_type is None: - raise ValueError( - "Advanced variable metrics require `edata.var['feature_type']`. " - "Please run `infer_feature_types(edata)` first." + warnings.warn( + "Advanced QC metrics require `edata.var['feature_type']`." + "Only basic metrics will be computed." + "Please run `infer_feature_types(edata)` first to enable advanced metrics.", + stacklevel=2, ) + if_advanced = False mtx = edata.X if layer is None else edata.layers[layer] - var_metrics = _compute_var_metrics(mtx, edata, advanced=advanced) - obs_metrics = _compute_obs_metrics(mtx, edata, qc_vars=qc_vars, log1p=True, advanced=advanced) + var_metrics = _compute_var_metrics(mtx, edata, advanced=if_advanced) + obs_metrics = _compute_obs_metrics(mtx, edata, qc_vars=qc_vars, log1p=True, advanced=if_advanced) edata.var[var_metrics.columns] = var_metrics edata.obs[obs_metrics.columns] = obs_metrics @@ -206,7 +212,12 @@ def _compute_obs_metrics( obs_metrics["missing_values_pct"] = (obs_metrics["missing_values_abs"] / mtx.shape[1]) * 100 obs_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=1) - if advanced and "feature_type" in edata.var: + if advanced and "feature_type" not in edata.var: + raise ValueError( + "Advanced QC metrics require `edata.var['feature_type']`.Please run `infer_feature_types(edata)` first" + ) + + if advanced: feature_type = edata.var["feature_type"] categorical_mask = feature_type == "categorical" @@ -218,13 
+229,10 @@ def _compute_obs_metrics( missing_cat = _compute_missing_values(mtx_cat, axis=1) valid_counts = mtx_cat.shape[1] - missing_cat - unique_val_ratio = ( - np.where( - valid_counts > 0, - unique_val_abs / valid_counts, - np.nan, - ) - * 100 + unique_val_ratio = np.where( + valid_counts > 0, + unique_val_abs / valid_counts * 100, + np.nan, ) else: n_obs = mtx.shape[0] @@ -286,7 +294,12 @@ def _compute_var_metrics( var_metrics["missing_values_pct"] = (var_metrics["missing_values_abs"] / mtx.shape[0]) * 100 var_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=0) - if advanced and "feature_type" in edata.var: + if advanced and "feature_type" not in edata.var: + raise ValueError( + "Advanced QC metrics require `edata.var['feature_type']`. Please run `infer_feature_types(edata)` first" + ) + + if advanced: feature_type = edata.var["feature_type"] categorical_mask = feature_type == "categorical" @@ -364,35 +377,34 @@ def _compute_var_metrics( feature_type = edata.var["feature_type"] numeric_mask = feature_type == "numeric" - non_categorical_indices = np.asarray(numeric_mask) + numeric_indices = np.asarray(numeric_mask) - if np.any(non_categorical_indices): + if np.any(numeric_indices): var_metrics.loc[non_categorical_indices, "coefficient_of_variation"] = ( - var_metrics.loc[non_categorical_indices, "standard_deviation"] - / var_metrics.loc[non_categorical_indices, "mean"] + var_metrics.loc[numeric_indices, "standard_deviation"] / var_metrics.loc[numeric_indices, "mean"] ).replace([np.inf, -np.inf], np.nan) # Constant column detection - constant_mask = (var_metrics.loc[non_categorical_indices, "standard_deviation"] == 0) | ( - var_metrics.loc[non_categorical_indices, "max"] == var_metrics.loc[non_categorical_indices, "min"] + constant_mask = (var_metrics.loc[numeric_indices, "standard_deviation"] == 0) | ( + var_metrics.loc[numeric_indices, "max"] == var_metrics.loc[numeric_indices, "min"] ) - var_metrics.loc[non_categorical_indices, 
"is_constant"] = constant_mask + var_metrics.loc[numeric_indices, "is_constant"] = constant_mask var_metrics["constant_variable_ratio"] = constant_mask.mean() * 100 # Calculate range ratio - var_metrics.loc[non_categorical_indices, "range_ratio"] = ( - (var_metrics.loc[non_categorical_indices, "max"] - var_metrics.loc[non_categorical_indices, "min"]) - / var_metrics.loc[non_categorical_indices, "mean"] + var_metrics.loc[numeric_indices, "range_ratio"] = ( + (var_metrics.loc[numeric_indices, "max"] - var_metrics.loc[numeric_indices, "min"]) + / var_metrics.loc[numeric_indices, "mean"] ).replace([np.inf, -np.inf], np.nan) * 100 # Calculate skewness and kurtosis - """var_metrics.loc[non_categorical_indices, "skewness"] = skew( - mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" + """var_metrics.loc[numeric_indices, "skewness"] = skew( + mtx[:, numeric_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" ) - var_metrics.loc[non_categorical_indices, "kurtosis"] = kurtosis( - mtx[:, non_categorical_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" + var_metrics.loc[numeric_indices, "kurtosis"] = kurtosis( + mtx[:, numeric_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" ) """ From 736d8bcc8555b0bdf3cea0fa1005238aaeca9793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 28 Nov 2025 12:50:40 +0100 Subject: [PATCH 15/33] _apply_over_time_axis decorator for the functions added --- ehrapy/preprocessing/_quality_control.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 1f7e703ec..8bb54b75f 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -12,7 +12,7 @@ from scipy.stats import kurtosis, skew from thefuzz import process -from ehrapy._compat import DaskArray, 
_raise_array_type_not_implemented, function_2D_only, use_ehrdata +from ehrapy._compat import DaskArray, _raise_array_type_not_implemented, function_2D_only, use_ehrdata, _apply_over_time_axis from ehrapy.anndata import anndata_to_df from ehrapy.preprocessing._encoding import _get_encoded_features @@ -24,7 +24,6 @@ @use_ehrdata(deprecated_after="1.0.0") -@function_2D_only() def qc_metrics( edata: EHRData | AnnData, qc_vars: Collection[str] = (), @@ -114,11 +113,13 @@ def _compute_missing_values(mtx, axis): @_compute_missing_values.register(np.ndarray) +@_apply_over_time_axis def _(mtx: np.ndarray, axis) -> np.ndarray: return pd.isnull(mtx).sum(axis) @_compute_missing_values.register(DaskArray) +@_apply_over_time_axis def _(mtx: DaskArray, axis) -> np.ndarray: import dask.array as da @@ -131,11 +132,13 @@ def _compute_unique_values(mtx, axis): @_compute_unique_values.register(np.ndarray) +@_apply_over_time_axis def _(mtx: np.ndarray, axis) -> np.ndarray: return pd.DataFrame(mtx).nunique(axis=axis, dropna=True).to_numpy() @_compute_unique_values.register(DaskArray) +@_apply_over_time_axis def _(mtx: DaskArray, axis) -> np.ndarray: import dask.array as da @@ -151,6 +154,7 @@ def _compute_entropy_of_missingness(mtx, axis): @_compute_entropy_of_missingness.register(np.ndarray) +@_apply_over_time_axis def _(mtx: np.ndarray, axis) -> np.ndarray: missing_mask = pd.isnull(mtx) p_miss = missing_mask.mean(axis=axis) @@ -159,6 +163,7 @@ def _(mtx: np.ndarray, axis) -> np.ndarray: @_compute_entropy_of_missingness.register(DaskArray) +@_apply_over_time_axis def _(mtx: DaskArray, axis) -> np.ndarray: import dask.array as da From cb3bc2aefa453eb8036b9c99ee631fefabf0c181 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 28 Nov 2025 11:51:06 +0000 Subject: [PATCH 16/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ehrapy/preprocessing/_quality_control.py | 8 
+++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 8bb54b75f..7c5da9e3f 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -12,7 +12,13 @@ from scipy.stats import kurtosis, skew from thefuzz import process -from ehrapy._compat import DaskArray, _raise_array_type_not_implemented, function_2D_only, use_ehrdata, _apply_over_time_axis +from ehrapy._compat import ( + DaskArray, + _apply_over_time_axis, + _raise_array_type_not_implemented, + function_2D_only, + use_ehrdata, +) from ehrapy.anndata import anndata_to_df from ehrapy.preprocessing._encoding import _get_encoded_features From 5d86fc002a3df71740708d847056185149e6e689 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 28 Nov 2025 12:52:29 +0100 Subject: [PATCH 17/33] forgot to add after pre-commit made changes --- ehrapy/preprocessing/_quality_control.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 8bb54b75f..7c5da9e3f 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -12,7 +12,13 @@ from scipy.stats import kurtosis, skew from thefuzz import process -from ehrapy._compat import DaskArray, _raise_array_type_not_implemented, function_2D_only, use_ehrdata, _apply_over_time_axis +from ehrapy._compat import ( + DaskArray, + _apply_over_time_axis, + _raise_array_type_not_implemented, + function_2D_only, + use_ehrdata, +) from ehrapy.anndata import anndata_to_df from ehrapy.preprocessing._encoding import _get_encoded_features From 5ad274c0ec03a048297a5d3e07127733715ca971 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 28 Nov 2025 14:26:23 +0100 Subject: [PATCH 18/33] undo the decorator for the metric functions, since it doesnt 
work in this case --- ehrapy/preprocessing/_quality_control.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 7c5da9e3f..a2229d0bd 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -30,6 +30,7 @@ @use_ehrdata(deprecated_after="1.0.0") +@function_2D_only() def qc_metrics( edata: EHRData | AnnData, qc_vars: Collection[str] = (), @@ -119,13 +120,11 @@ def _compute_missing_values(mtx, axis): @_compute_missing_values.register(np.ndarray) -@_apply_over_time_axis def _(mtx: np.ndarray, axis) -> np.ndarray: return pd.isnull(mtx).sum(axis) @_compute_missing_values.register(DaskArray) -@_apply_over_time_axis def _(mtx: DaskArray, axis) -> np.ndarray: import dask.array as da @@ -138,13 +137,11 @@ def _compute_unique_values(mtx, axis): @_compute_unique_values.register(np.ndarray) -@_apply_over_time_axis def _(mtx: np.ndarray, axis) -> np.ndarray: return pd.DataFrame(mtx).nunique(axis=axis, dropna=True).to_numpy() @_compute_unique_values.register(DaskArray) -@_apply_over_time_axis def _(mtx: DaskArray, axis) -> np.ndarray: import dask.array as da @@ -160,7 +157,6 @@ def _compute_entropy_of_missingness(mtx, axis): @_compute_entropy_of_missingness.register(np.ndarray) -@_apply_over_time_axis def _(mtx: np.ndarray, axis) -> np.ndarray: missing_mask = pd.isnull(mtx) p_miss = missing_mask.mean(axis=axis) @@ -169,7 +165,6 @@ def _(mtx: np.ndarray, axis) -> np.ndarray: @_compute_entropy_of_missingness.register(DaskArray) -@_apply_over_time_axis def _(mtx: DaskArray, axis) -> np.ndarray: import dask.array as da From f5e220da671b884742880fba66ded6dd1f4c0fab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 28 Nov 2025 16:36:34 +0100 Subject: [PATCH 19/33] 3d enabled qc metrics --- ehrapy/preprocessing/_quality_control.py | 48 ++++++++++++++++++++---- 1 file changed, 40 insertions(+), 8 
deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index a2229d0bd..e5ccd1cc3 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -199,6 +199,8 @@ def _compute_obs_metrics( obs_metrics = pd.DataFrame(index=edata.obs_names) var_metrics = pd.DataFrame(index=edata.var_names) + original_mtx = mtx + if "encoding_mode" in edata.var: for original_values_categorical in _get_encoded_features(edata): mtx = mtx.astype(object) @@ -214,13 +216,19 @@ def _compute_obs_metrics( ) ) - obs_metrics["missing_values_abs"] = _compute_missing_values(mtx, axis=1) - obs_metrics["missing_values_pct"] = (obs_metrics["missing_values_abs"] / mtx.shape[1]) * 100 - obs_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=1) + if mtx.ndim == 3: + n_obs, n_vars, n_time = mtx.shape + flat_mtx = mtx.reshape(n_obs, n_vars * n_time) + if mtx.ndim == 2: + flat_mtx = mtx + + obs_metrics["missing_values_abs"] = _compute_missing_values(flat_mtx, axis=1) + obs_metrics["missing_values_pct"] = (obs_metrics["missing_values_abs"] / flat_mtx.shape[1]) * 100 + obs_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(flat_mtx, axis=1) if advanced and "feature_type" not in edata.var: raise ValueError( - "Advanced QC metrics require `edata.var['feature_type']`.Please run `infer_feature_types(edata)` first" + "Advanced QC metrics require `edata.var['feature_type']`. 
Please run `infer_feature_types(edata)` first" ) if advanced: @@ -229,11 +237,28 @@ def _compute_obs_metrics( if np.any(categorical_mask): cat_mask_np = np.asarray(categorical_mask) - mtx_cat = mtx[:, cat_mask_np] + if original_mtx.ndim == 2: + mtx_cat = mtx[:, cat_mask_np] + + unique_val_abs = _compute_unique_values(mtx_cat, axis=1) + missing_cat = _compute_missing_values(mtx_cat, axis=1) + valid_counts = mtx_cat.shape[1] - missing_cat - unique_val_abs = _compute_unique_values(mtx_cat, axis=1) - missing_cat = _compute_missing_values(mtx_cat, axis=1) - valid_counts = mtx_cat.shape[1] - missing_cat + elif original_mtx.ndim == 3: + mtx_cat_3d = original_mtx[:, cat_mask_np, :] + n_obs, n_cat, n_time = mtx_cat_3d.shape + + unique_per_time = np.empty((n_obs, n_time)) + valid_per_time = np.empty((n_obs, n_time)) + + for t in range(n_time): + slice_t = mtx_cat_3d[:, :, t] # (n_obs, n_cat_vars) + unique_per_time[:, t] = _compute_unique_values(slice_t, axis=1) + missing_t = _compute_missing_values(slice_t, axis=1) + valid_per_time[:, t] = n_cat - missing_t + + unique_val_abs = unique_per_time.sum(axis=1) + valid_counts = valid_per_time.sum(axis=1) unique_val_ratio = np.where( valid_counts > 0, @@ -250,6 +275,9 @@ def _compute_obs_metrics( # Specific QC metrics for qc_var in qc_vars: + if mtx.ndim == 3: + raise ValueError("Only 2D matrices are supported for qc_vars argument") + obs_metrics[f"total_features_{qc_var}"] = np.ravel(mtx[:, edata.var[qc_var].values].sum(axis=1)) if log1p: obs_metrics[f"log1p_total_features_{qc_var}"] = np.log1p(obs_metrics[f"total_features_{qc_var}"]) @@ -276,6 +304,10 @@ def _compute_var_metrics( categorical_indices = np.ndarray([0], dtype=int) var_metrics = pd.DataFrame(index=edata.var_names) + if mtx.ndim == 3: + n_obs, n_vars, n_time = mtx.shape + mtx = np.moveaxis(mtx, 1, 2).reshape(-1, n_vars) + if "encoding_mode" in edata.var.keys(): for original_values_categorical in _get_encoded_features(edata): mtx = copy.deepcopy(mtx.astype(object)) 
From a8aff81af145dc73328ffe042aa661d68ee71fd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 28 Nov 2025 17:18:48 +0100 Subject: [PATCH 20/33] tests for 3d qc_metrics --- tests/conftest.py | 14 +++++ tests/preprocessing/test_quality_control.py | 59 +++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index cff014e48..cfcce1cd4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -104,6 +104,20 @@ def missing_values_edata_adv(obs_data, var_data_adv): ) +@pytest.fixture +def missing_values_edata_3d(obs_data, var_data_adv): + layer = np.array( + [[[0.21, 0.55], [np.nan, 1.23], [41.42, np.nan]], [[np.nan, np.nan], [np.nan, 3.14], [7.234, 9.99]]], + dtype=np.float32, + ) + return ed.EHRData( + layers={"layer_1": layer}, + obs=pd.DataFrame(data=obs_data), + var=pd.DataFrame(data=var_data_adv, index=["Acetaminophen", "hospital", "crazy"]), + tem=pd.DataFrame(index=["t0", "t1"]), + ) + + @pytest.fixture def lab_measurements_simple_edata(obs_data, var_data): X = np.array([[73, 0.02, 1.00], [148, 0.25, 3.55]], dtype=np.float32) diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index 3d7d4dc06..25550c38c 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -93,6 +93,15 @@ def test_obs_qc_metrics(array_type, missing_values_edata): assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) +def test_obs_qc_metrics_3D(missing_values_edata_3d): + mtx = missing_values_edata_3d.layers["layer_1"] + obs_metrics = _compute_obs_metrics(mtx, missing_values_edata_3d) + + assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([2, 3])) + assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 50.0])) + assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 1.0])) + + 
@pytest.mark.parametrize("array_type", ARRAY_TYPES_NONNUMERIC) def test_obs_qc_metrics_advanced(array_type, missing_values_edata_adv): missing_values_edata_adv.X = array_type(missing_values_edata_adv.X) @@ -106,6 +115,17 @@ def test_obs_qc_metrics_advanced(array_type, missing_values_edata_adv): assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) +def test_obs_qc_metrics_advanced_3D(missing_values_edata_3d): + mtx = missing_values_edata_3d.layers["layer_1"] + obs_metrics = _compute_obs_metrics(mtx, missing_values_edata_3d, advanced=True) + + assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([2, 3])) + assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 50.0])) + assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([1, 2])) + assert np.allclose(obs_metrics["unique_values_ratio"].values, np.array([100.0, 100.0])) + assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 1.0])) + + @pytest.mark.parametrize("array_type", ARRAY_TYPES_NONNUMERIC) def test_var_qc_metrics(array_type, missing_values_edata): missing_values_edata.X = array_type(missing_values_edata.X) @@ -122,6 +142,20 @@ def test_var_qc_metrics(array_type, missing_values_edata): assert (~var_metrics["iqr_outliers"]).all() +def test_var_qc_metrics_3D(missing_values_edata_3d): + mtx = missing_values_edata_3d.layers["layer_1"] + var_metrics = _compute_var_metrics(mtx, missing_values_edata_3d) + + assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([2, 2, 1])) + assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 50.0, 25.0])) + assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 1.0, 0.811278])) + assert np.allclose(var_metrics["mean"].values, np.array([0.38, 2.185, 19.547999])) + assert np.allclose(var_metrics["median"].values, np.array([0.38, 2.185, 9.98999])) + assert 
np.allclose(var_metrics["min"].values, np.array([0.21, 1.23, 7.234])) + assert np.allclose(var_metrics["max"].values, np.array([0.55, 3.14, 41.42])) + assert (~var_metrics["iqr_outliers"]).all() + + @pytest.mark.parametrize("array_type", ARRAY_TYPES_NONNUMERIC) def test_var_qc_metrics_advanced(array_type, missing_values_edata_adv): missing_values_edata_adv.X = array_type(missing_values_edata_adv.X) @@ -151,6 +185,31 @@ def test_var_qc_metrics_advanced(array_type, missing_values_edata_adv): assert (~var_metrics["iqr_outliers"]).all() +def test_var_qc_metrics_advanced_3D(missing_values_edata_3d): + mtx = missing_values_edata_3d.layers["layer_1"] + var_metrics = _compute_var_metrics(mtx, missing_values_edata_3d, advanced=True) + + assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([2, 2, 1])) + assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 50.0, 25.0])) + assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 1.0, 0.811278])) + assert np.allclose(var_metrics["mean"].values, np.array([0.38, 2.185, 19.547999])) + assert np.allclose(var_metrics["median"].values, np.array([0.38, 2.185, 9.98999])) + assert np.allclose(var_metrics["min"].values, np.array([0.21, 1.23, 7.234])) + assert np.allclose(var_metrics["max"].values, np.array([0.55, 3.14, 41.42])) + assert np.allclose( + var_metrics["coefficient_of_variation"].values, np.array([0.44737, 0.43707, np.nan]), equal_nan=True + ) + + is_const = var_metrics["is_constant"].values + assert is_const[0] is False + assert is_const[1] is False + assert np.isnan(is_const[2]) + + assert np.allclose(var_metrics["constant_variable_ratio"].values, np.array([0, 0, 0])) + assert np.allclose(var_metrics["range_ratio"].values, np.array([89.473688, 87.414189, np.nan]), equal_nan=True) + assert (~var_metrics["iqr_outliers"]).all() + + @pytest.mark.parametrize( "array_type, expected_error", [ From 968d12b6235a9a3b3be8f3da3a92c5b4d4ab54e1 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Wed, 3 Dec 2025 02:29:43 +0100 Subject: [PATCH 21/33] updated according to the comments --- ehrapy/preprocessing/_quality_control.py | 51 ++--- tests/conftest.py | 36 +--- tests/preprocessing/test_quality_control.py | 217 +++++++++----------- 3 files changed, 123 insertions(+), 181 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index e5ccd1cc3..55d2688f7 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -30,7 +30,6 @@ @use_ehrdata(deprecated_after="1.0.0") -@function_2D_only() def qc_metrics( edata: EHRData | AnnData, qc_vars: Collection[str] = (), @@ -174,6 +173,19 @@ def _(mtx: DaskArray, axis) -> np.ndarray: return -(p * da.log2(p) + (1 - p) * da.log2(1 - p)).compute() +@_apply_over_time_axis +def _row_unique(arr_2d: np.ndarray, axis) -> np.ndarray: + uniques = _compute_unique_values(arr_2d, axis=axis) + return np.broadcast_to(uniques[:, None], arr_2d.shape) + + +@_apply_over_time_axis +def _row_valid(arr_2d: np.ndarray, axis) -> np.ndarray: + missing = _compute_missing_values(arr_2d, axis=axis) + valid = arr_2d.shape[axis] - missing + return np.broadcast_to(valid[:, None], arr_2d.shape) + + def _compute_obs_metrics( mtx, edata: EHRData | AnnData, @@ -237,25 +249,22 @@ def _compute_obs_metrics( if np.any(categorical_mask): cat_mask_np = np.asarray(categorical_mask) - if original_mtx.ndim == 2: - mtx_cat = mtx[:, cat_mask_np] - unique_val_abs = _compute_unique_values(mtx_cat, axis=1) - missing_cat = _compute_missing_values(mtx_cat, axis=1) - valid_counts = mtx_cat.shape[1] - missing_cat + if original_mtx.ndim == 2: + mtx_cat = mtx[:, cat_mask_np] # (n_obs, n_cat_var) + else: # ndim == 3 + mtx_cat = original_mtx[:, cat_mask_np, :] # (n_obs, n_cat_var, n_time) - elif original_mtx.ndim == 3: - mtx_cat_3d = original_mtx[:, cat_mask_np, :] - n_obs, n_cat, n_time = mtx_cat_3d.shape + unique_arr = 
_row_unique(mtx_cat, axis=1) + valid_arr = _row_valid(mtx_cat, axis=1) - unique_per_time = np.empty((n_obs, n_time)) - valid_per_time = np.empty((n_obs, n_time)) + if unique_arr.ndim == 2: + unique_val_abs = unique_arr[:, 0] + valid_counts = valid_arr[:, 0] - for t in range(n_time): - slice_t = mtx_cat_3d[:, :, t] # (n_obs, n_cat_vars) - unique_per_time[:, t] = _compute_unique_values(slice_t, axis=1) - missing_t = _compute_missing_values(slice_t, axis=1) - valid_per_time[:, t] = n_cat - missing_t + else: + unique_per_time = unique_arr[:, 0, :] + valid_per_time = valid_arr[:, 0, :] unique_val_abs = unique_per_time.sum(axis=1) valid_counts = valid_per_time.sum(axis=1) @@ -303,6 +312,7 @@ def _compute_var_metrics( """ categorical_indices = np.ndarray([0], dtype=int) var_metrics = pd.DataFrame(index=edata.var_names) + mtx = np.asarray(mtx) if mtx.ndim == 3: n_obs, n_vars, n_time = mtx.shape @@ -437,15 +447,6 @@ def _compute_var_metrics( / var_metrics.loc[numeric_indices, "mean"] ).replace([np.inf, -np.inf], np.nan) * 100 - # Calculate skewness and kurtosis - """var_metrics.loc[numeric_indices, "skewness"] = skew( - mtx[:, numeric_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" - ) - var_metrics.loc[numeric_indices, "kurtosis"] = kurtosis( - mtx[:, numeric_indices].astype(np.float64), axis=0, bias=False, nan_policy="omit" - ) - """ - var_metrics = var_metrics.infer_objects() except (TypeError, ValueError): # We assume that the data just hasn't been encoded yet diff --git a/tests/conftest.py b/tests/conftest.py index cfcce1cd4..e68875540 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -58,16 +58,6 @@ def var_data(): } -@pytest.fixture -def var_data_adv(): - return { - "alive": ["yes", "no", "maybe"], - "hospital": ["hospital 1", "hospital 2", "hospital 1"], - "crazy": ["yes", "yes", "yes"], - "feature_type": ["numeric", "numeric", "categorical"], - } - - @pytest.fixture def edata_feature_type_specifications(): df = pd.DataFrame( @@ -95,29 
+85,6 @@ def missing_values_edata(obs_data, var_data): ) -@pytest.fixture -def missing_values_edata_adv(obs_data, var_data_adv): - return ed.EHRData( - X=np.array([[0.21, np.nan, 41.42], [np.nan, np.nan, 7.234]], dtype=np.float32), - obs=pd.DataFrame(data=obs_data), - var=pd.DataFrame(data=var_data_adv, index=["Acetaminophen", "hospital", "crazy"]), - ) - - -@pytest.fixture -def missing_values_edata_3d(obs_data, var_data_adv): - layer = np.array( - [[[0.21, 0.55], [np.nan, 1.23], [41.42, np.nan]], [[np.nan, np.nan], [np.nan, 3.14], [7.234, 9.99]]], - dtype=np.float32, - ) - return ed.EHRData( - layers={"layer_1": layer}, - obs=pd.DataFrame(data=obs_data), - var=pd.DataFrame(data=var_data_adv, index=["Acetaminophen", "hospital", "crazy"]), - tem=pd.DataFrame(index=["t0", "t1"]), - ) - - @pytest.fixture def lab_measurements_simple_edata(obs_data, var_data): X = np.array([[73, 0.02, 1.00], [148, 0.25, 3.55]], dtype=np.float32) @@ -206,7 +173,8 @@ def edata_mini_3D_missing_values(): ], dtype=object, ) - return ed.EHRData(layers={DEFAULT_TEM_LAYER_NAME: tiny_mixed_array}) + n_obs, n_vars, _ = tiny_mixed_array.shape + return ed.EHRData(shape=(n_obs, n_vars), layers={DEFAULT_TEM_LAYER_NAME: tiny_mixed_array}) @pytest.fixture diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index 25550c38c..5a869cc4e 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -43,9 +43,11 @@ def test_qc_metrics_vanilla(array_type, missing_values_edata): @pytest.mark.parametrize("array_type", ARRAY_TYPES_NONNUMERIC) -def test_qc_metrics_vanilla_advanced(array_type, missing_values_edata_adv): - adata = missing_values_edata_adv - adata.X = array_type(adata.X) +def test_qc_metrics_vanilla_advanced(array_type, missing_values_edata): + adata = missing_values_edata + + adata.var["feature_type"] = ["numeric", "numeric", "categorical"] + adata.X = array_type(missing_values_edata.X) 
modification_copy = adata.copy() obs_metrics, var_metrics = ep.pp.qc_metrics(adata, advanced=True) @@ -82,133 +84,110 @@ def test_qc_metrics_vanilla_advanced(array_type, missing_values_edata_adv): assert np.array_equal(modification_copy.var[key], adata.var[key]) -@pytest.mark.parametrize("array_type", ARRAY_TYPES_NONNUMERIC) -def test_obs_qc_metrics(array_type, missing_values_edata): - missing_values_edata.X = array_type(missing_values_edata.X) - mtx = missing_values_edata.X - obs_metrics = _compute_obs_metrics(mtx, missing_values_edata) - - assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) - assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 66.6667])) - assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) - - -def test_obs_qc_metrics_3D(missing_values_edata_3d): - mtx = missing_values_edata_3d.layers["layer_1"] - obs_metrics = _compute_obs_metrics(mtx, missing_values_edata_3d) - - assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([2, 3])) - assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 50.0])) - assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 1.0])) - - -@pytest.mark.parametrize("array_type", ARRAY_TYPES_NONNUMERIC) -def test_obs_qc_metrics_advanced(array_type, missing_values_edata_adv): - missing_values_edata_adv.X = array_type(missing_values_edata_adv.X) - mtx = missing_values_edata_adv.X - obs_metrics = _compute_obs_metrics(mtx, missing_values_edata_adv, advanced=True) - - assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) - assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 66.6667])) - assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([1, 1])) - assert np.allclose(obs_metrics["unique_values_ratio"].values, np.array([100.0, 100.0])) - assert 
np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 0.9183])) - - -def test_obs_qc_metrics_advanced_3D(missing_values_edata_3d): - mtx = missing_values_edata_3d.layers["layer_1"] - obs_metrics = _compute_obs_metrics(mtx, missing_values_edata_3d, advanced=True) - - assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([2, 3])) - assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 50.0])) - assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([1, 2])) - assert np.allclose(obs_metrics["unique_values_ratio"].values, np.array([100.0, 100.0])) - assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.9183, 1.0])) - - -@pytest.mark.parametrize("array_type", ARRAY_TYPES_NONNUMERIC) -def test_var_qc_metrics(array_type, missing_values_edata): - missing_values_edata.X = array_type(missing_values_edata.X) - mtx = missing_values_edata.X - var_metrics = _compute_var_metrics(mtx, missing_values_edata) - - assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([1, 2, 0])) - assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 100.0, 0.0])) - assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 0.0, 0.0])) - assert np.allclose(var_metrics["mean"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) - assert np.allclose(var_metrics["median"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) - assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True) - assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True) - assert (~var_metrics["iqr_outliers"]).all() - - -def test_var_qc_metrics_3D(missing_values_edata_3d): - mtx = missing_values_edata_3d.layers["layer_1"] - var_metrics = _compute_var_metrics(mtx, missing_values_edata_3d) - - assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([2, 2, 1])) - 
assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 50.0, 25.0])) - assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 1.0, 0.811278])) - assert np.allclose(var_metrics["mean"].values, np.array([0.38, 2.185, 19.547999])) - assert np.allclose(var_metrics["median"].values, np.array([0.38, 2.185, 9.98999])) - assert np.allclose(var_metrics["min"].values, np.array([0.21, 1.23, 7.234])) - assert np.allclose(var_metrics["max"].values, np.array([0.55, 3.14, 41.42])) - assert (~var_metrics["iqr_outliers"]).all() - - -@pytest.mark.parametrize("array_type", ARRAY_TYPES_NONNUMERIC) -def test_var_qc_metrics_advanced(array_type, missing_values_edata_adv): - missing_values_edata_adv.X = array_type(missing_values_edata_adv.X) - mtx = missing_values_edata_adv.X - var_metrics = _compute_var_metrics(mtx, missing_values_edata_adv, advanced=True) +def test_qc_metrics_3d_vanilla(edata_mini_3D_missing_values): + edata = edata_mini_3D_missing_values + edata = edata[:, :4] + modification_copy = edata.copy() - assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([1, 2, 0])) - assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 100.0, 0.0])) - assert np.allclose(var_metrics["unique_values_abs"].values, np.array([np.nan, np.nan, 2.0]), equal_nan=True) - assert np.allclose(var_metrics["unique_values_ratio"].values, np.array([np.nan, np.nan, 100.0]), equal_nan=True) - assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 0.0, 0.0])) - assert np.allclose(var_metrics["mean"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) - assert np.allclose(var_metrics["median"].values, np.array([0.21, np.nan, 24.327]), equal_nan=True) - assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True) - assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True) - assert 
np.allclose(var_metrics["coefficient_of_variation"].values, np.array([0.0, np.nan, np.nan]), equal_nan=True) + obs_metrics, var_metrics = ep.pp.qc_metrics(edata, layer=DEFAULT_TEM_LAYER_NAME) - is_const = var_metrics["is_constant"].values - assert is_const[0] is True - assert is_const[1] is False - assert np.isnan(is_const[2]) + assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 0, 1, 1])) + assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([12.5, 0.0, 12.5, 12.5])) + assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.54356, 0, 0.54356, 0.54356])) - assert np.allclose(var_metrics["constant_variable_ratio"].values, np.array([50.0, 50.0, 50.0]), equal_nan=True) - assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, np.nan]), equal_nan=True) - # assert np.allclose(var_metrics["skewness"].values, np.array([np.nan, np.nan, 0.0]), equal_nan=True) - # assert np.allclose(var_metrics["kurtosis"].values, np.array([np.nan, np.nan, -2.0]), equal_nan=True) + assert np.array_equal( + var_metrics["missing_values_abs"].values, + np.array( + [ + 0, + 1, + 2, + 0, + ] + ), + ) + assert np.allclose(var_metrics["missing_values_pct"].values, np.array([0.0, 12.5, 25.0, 0.0])) + assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([0, 0.54356, 0.811278, 0])) + assert np.allclose(var_metrics["mean"].values, np.array([144.5, 79.0, 78.16667, 1.25])) + assert np.allclose(var_metrics["median"].values, np.array([144.5, 79.0, 76.5, 1.0])) + assert np.allclose( + var_metrics["standard_deviation"].values, + np.array([5.12347538, 1.30930734, 18.16972451, 0.96824584]), + equal_nan=True, + ) + assert np.allclose(var_metrics["min"].values, np.array([138.0, 77.0, 56.0, 0.0])) + assert np.allclose(var_metrics["max"].values, np.array([151.0, 81.0, 110.0, 3.0])) assert (~var_metrics["iqr_outliers"]).all() - -def test_var_qc_metrics_advanced_3D(missing_values_edata_3d): - mtx = 
missing_values_edata_3d.layers["layer_1"] - var_metrics = _compute_var_metrics(mtx, missing_values_edata_3d, advanced=True) - - assert np.array_equal(var_metrics["missing_values_abs"].values, np.array([2, 2, 1])) - assert np.allclose(var_metrics["missing_values_pct"].values, np.array([50.0, 50.0, 25.0])) - assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([1.0, 1.0, 0.811278])) - assert np.allclose(var_metrics["mean"].values, np.array([0.38, 2.185, 19.547999])) - assert np.allclose(var_metrics["median"].values, np.array([0.38, 2.185, 9.98999])) - assert np.allclose(var_metrics["min"].values, np.array([0.21, 1.23, 7.234])) - assert np.allclose(var_metrics["max"].values, np.array([0.55, 3.14, 41.42])) + # check that none of the columns were modified + for key in modification_copy.obs.keys(): + assert np.array_equal(modification_copy.obs[key], edata.obs[key]) + for key in modification_copy.var.keys(): + assert np.array_equal(modification_copy.var[key], edata.var[key]) + + +def test_qc_metrics_3d_vanilla_advanced(edata_mini_3D_missing_values): + edata = edata_mini_3D_missing_values.copy() + edata = edata[:, :4] + edata.var["feature_type"] = ["numeric", "numeric", "numeric", "categorical"] + modification_copy = edata.copy() + + obs_metrics, var_metrics = ep.pp.qc_metrics(edata, layer=DEFAULT_TEM_LAYER_NAME, advanced=True) + + assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 0, 1, 1])) + assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([12.5, 0.0, 12.5, 12.5])) + assert np.allclose(obs_metrics["entropy_of_missingness"].values, np.array([0.54356, 0, 0.54356, 0.54356])) + assert np.array_equal(obs_metrics["unique_values_abs"].values, np.array([2, 2, 2, 2])) + assert np.allclose(obs_metrics["unique_values_ratio"].values, np.array([100.0, 100.0, 100.0, 100.0])) + + assert np.array_equal( + var_metrics["missing_values_abs"].values, + np.array( + [ + 0, + 1, + 2, + 0, + ] + ), + ) + assert 
np.allclose(var_metrics["missing_values_pct"].values, np.array([0.0, 12.5, 25.0, 0.0])) + assert np.allclose(var_metrics["entropy_of_missingness"].values, np.array([0, 0.54356, 0.811278, 0])) + assert np.allclose(var_metrics["mean"].values, np.array([144.5, 79.0, 78.16667, 1.25])) + assert np.allclose(var_metrics["median"].values, np.array([144.5, 79.0, 76.5, 1.0])) + assert np.allclose( + var_metrics["standard_deviation"].values, + np.array([5.12347538, 1.30930734, 18.16972451, 0.96824584]), + equal_nan=True, + ) + assert np.allclose(var_metrics["min"].values, np.array([138.0, 77.0, 56.0, 0.0])) + assert np.allclose(var_metrics["max"].values, np.array([151.0, 81.0, 110.0, 3.0])) + assert np.allclose(var_metrics["unique_values_abs"].values, np.array([np.nan, np.nan, np.nan, 4.0]), equal_nan=True) assert np.allclose( - var_metrics["coefficient_of_variation"].values, np.array([0.44737, 0.43707, np.nan]), equal_nan=True + var_metrics["unique_values_ratio"].values, np.array([np.nan, np.nan, np.nan, 50.0]), equal_nan=True + ) + assert np.allclose( + var_metrics["coefficient_of_variation"].values, + np.array([0.03545658, 0.01657351, 0.2324485, np.nan]), + equal_nan=True, ) is_const = var_metrics["is_constant"].values assert is_const[0] is False assert is_const[1] is False - assert np.isnan(is_const[2]) + assert is_const[2] is False + assert np.isnan(is_const[3]) - assert np.allclose(var_metrics["constant_variable_ratio"].values, np.array([0, 0, 0])) - assert np.allclose(var_metrics["range_ratio"].values, np.array([89.473688, 87.414189, np.nan]), equal_nan=True) + assert np.allclose(var_metrics["constant_variable_ratio"].values, np.array([0, 0, 0, 0])) + assert np.allclose(var_metrics["range_ratio"].values, np.array([8.9965, 5.0633, 69.0832, np.nan]), equal_nan=True) assert (~var_metrics["iqr_outliers"]).all() + # check that none of the columns were modified + for key in modification_copy.obs.keys(): + assert np.array_equal(modification_copy.obs[key], edata.obs[key]) + 
for key in modification_copy.var.keys(): + assert np.array_equal(modification_copy.var[key], edata.var[key]) + @pytest.mark.parametrize( "array_type, expected_error", @@ -284,12 +263,6 @@ def test_calculate_qc_metrics(missing_values_edata): assert missing_values_edata.var.missing_values_abs is not None -def test_encode_3D_edata(edata_blob_small): - ep.pp.qc_metrics(edata_blob_small, layer="layer_2") - with pytest.raises(ValueError, match=r"only supports 2D data"): - ep.pp.qc_metrics(edata_blob_small, layer=DEFAULT_TEM_LAYER_NAME) - - def test_qc_lab_measurements_simple(lab_measurements_simple_edata): expected_obs_data = pd.Series( data={ From 943f8e04a07fc54b90c38aebafd0bc2015c262e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Wed, 3 Dec 2025 10:25:12 +0100 Subject: [PATCH 22/33] advanced argument removed, calculation of advanced metrics dependent on feature_type var --- ehrapy/preprocessing/_quality_control.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 55d2688f7..c7bae198e 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -35,19 +35,18 @@ def qc_metrics( qc_vars: Collection[str] = (), *, layer: str | None = None, - advanced: bool = False, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Calculates various quality control metrics. Uses the original values to calculate the metrics and not the encoded ones. Look at the return type for a more in depth description of the basic and advanced calculated metrics. + If ehrdata.infer_feature_types is run first, then advanced metrics are calculated in addition to basic metrics that require feature type information. Args: edata: Central data object. qc_vars: Optional List of vars to calculate additional metrics for. layer: Layer to use to calculate the metrics. 
- advanced: Determines if the advanced metrics should be calculated that require feature type information. If it is set to `True`, ehrdata.infer_feature_types must be run first. Default is `False`. Returns: Two Pandas DataFrames of all calculated QC metrics for `obs` and `var` respectively. @@ -75,7 +74,7 @@ def qc_metrics( - `iqr_outliers`: Whether the feature contains outliers based on the interquartile range (IQR) method. - Advanced feature level metrics include (only computed if advanced is set to `True`): + Advanced feature level metrics include (only computed if ehrdata.infer_feature_types is run first): - `unique_values_abs`: Absolute amount of unique values. - `unique_values_ratio`: Relative amount of unique values in percent. @@ -92,16 +91,9 @@ def qc_metrics( >>> obs_qc["missing_values_pct"].plot(kind="hist", bins=20) """ feature_type = edata.var.get("feature_type", None) - if_advanced = advanced - if advanced: - if feature_type is None: - warnings.warn( - "Advanced QC metrics require `edata.var['feature_type']`." - "Only basic metrics will be computed." 
- "Please run `infer_feature_types(edata)` first to enable advanced metrics.", - stacklevel=2, - ) - if_advanced = False + if_advanced = True + if feature_type is None: + if_advanced = False mtx = edata.X if layer is None else edata.layers[layer] var_metrics = _compute_var_metrics(mtx, edata, advanced=if_advanced) From afdaca0920ffe38563f1b18a1709f2dc958fb584 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Wed, 3 Dec 2025 11:06:25 +0100 Subject: [PATCH 23/33] is_constant type changed to float --- ehrapy/preprocessing/_quality_control.py | 4 +--- tests/preprocessing/test_quality_control.py | 19 ++++--------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index c7bae198e..da58b81b5 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -377,8 +377,6 @@ def _compute_var_metrics( var_metrics["is_constant"] = np.nan var_metrics["constant_variable_ratio"] = np.nan var_metrics["range_ratio"] = np.nan - # var_metrics["skewness"] = np.nan - # var_metrics["kurtosis"] = np.nan var_metrics["iqr_outliers"] = np.nan try: @@ -429,7 +427,7 @@ def _compute_var_metrics( var_metrics.loc[numeric_indices, "max"] == var_metrics.loc[numeric_indices, "min"] ) - var_metrics.loc[numeric_indices, "is_constant"] = constant_mask + var_metrics.loc[numeric_indices, "is_constant"] = constant_mask.astype(float) var_metrics["constant_variable_ratio"] = constant_mask.mean() * 100 diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index 5a869cc4e..f760bfd35 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -49,7 +49,7 @@ def test_qc_metrics_vanilla_advanced(array_type, missing_values_edata): adata.var["feature_type"] = ["numeric", "numeric", "categorical"] adata.X = array_type(missing_values_edata.X) 
modification_copy = adata.copy() - obs_metrics, var_metrics = ep.pp.qc_metrics(adata, advanced=True) + obs_metrics, var_metrics = ep.pp.qc_metrics(adata) assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([33.3333, 66.6667])) @@ -67,12 +67,7 @@ def test_qc_metrics_vanilla_advanced(array_type, missing_values_edata): assert np.allclose(var_metrics["min"].values, np.array([0.21, np.nan, 7.234]), equal_nan=True) assert np.allclose(var_metrics["max"].values, np.array([0.21, np.nan, 41.419998]), equal_nan=True) assert np.allclose(var_metrics["coefficient_of_variation"].values, np.array([0.0, np.nan, np.nan]), equal_nan=True) - - is_const = var_metrics["is_constant"].values - assert is_const[0] is True - assert is_const[1] is False - assert np.isnan(is_const[2]) - + assert np.array_equal(var_metrics["is_constant"].values, np.array([1, 0, np.nan]), equal_nan=True) assert np.allclose(var_metrics["constant_variable_ratio"].values, np.array([50.0, 50.0, 50.0]), equal_nan=True) assert np.allclose(var_metrics["range_ratio"].values, np.array([0.0, np.nan, np.nan]), equal_nan=True) assert (~var_metrics["iqr_outliers"]).all() @@ -132,7 +127,7 @@ def test_qc_metrics_3d_vanilla_advanced(edata_mini_3D_missing_values): edata.var["feature_type"] = ["numeric", "numeric", "numeric", "categorical"] modification_copy = edata.copy() - obs_metrics, var_metrics = ep.pp.qc_metrics(edata, layer=DEFAULT_TEM_LAYER_NAME, advanced=True) + obs_metrics, var_metrics = ep.pp.qc_metrics(edata, layer=DEFAULT_TEM_LAYER_NAME) assert np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 0, 1, 1])) assert np.allclose(obs_metrics["missing_values_pct"].values, np.array([12.5, 0.0, 12.5, 12.5])) @@ -171,13 +166,7 @@ def test_qc_metrics_3d_vanilla_advanced(edata_mini_3D_missing_values): np.array([0.03545658, 0.01657351, 0.2324485, np.nan]), equal_nan=True, ) - - is_const = 
var_metrics["is_constant"].values - assert is_const[0] is False - assert is_const[1] is False - assert is_const[2] is False - assert np.isnan(is_const[3]) - + assert np.array_equal(var_metrics["is_constant"].values, np.array([0, 0, 0, np.nan]), equal_nan=True) assert np.allclose(var_metrics["constant_variable_ratio"].values, np.array([0, 0, 0, 0])) assert np.allclose(var_metrics["range_ratio"].values, np.array([8.9965, 5.0633, 69.0832, np.nan]), equal_nan=True) assert (~var_metrics["iqr_outliers"]).all() From 04457d9ee4af8928ee4ae9b56b1a714f6397f88b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Wed, 3 Dec 2025 12:19:08 +0100 Subject: [PATCH 24/33] solve imputation errors attempt --- ehrapy/preprocessing/_quality_control.py | 1 - tests/preprocessing/test_quality_control.py | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index da58b81b5..50d145b80 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -304,7 +304,6 @@ def _compute_var_metrics( """ categorical_indices = np.ndarray([0], dtype=int) var_metrics = pd.DataFrame(index=edata.var_names) - mtx = np.asarray(mtx) if mtx.ndim == 3: n_obs, n_vars, n_time = mtx.shape diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index f760bfd35..d11b44509 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -4,6 +4,7 @@ import pandas as pd import pytest from ehrdata.core.constants import DEFAULT_TEM_LAYER_NAME +from scipy import sparse as sp import ehrapy as ep from ehrapy.io._read import read_csv @@ -20,6 +21,7 @@ def test_qc_metrics_vanilla(array_type, missing_values_edata): adata = missing_values_edata adata.X = array_type(adata.X) modification_copy = adata.copy() + obs_metrics, var_metrics = ep.pp.qc_metrics(adata) assert 
np.array_equal(obs_metrics["missing_values_abs"].values, np.array([1, 2])) @@ -80,8 +82,7 @@ def test_qc_metrics_vanilla_advanced(array_type, missing_values_edata): def test_qc_metrics_3d_vanilla(edata_mini_3D_missing_values): - edata = edata_mini_3D_missing_values - edata = edata[:, :4] + edata = edata_mini_3D_missing_values[:, :4].copy() modification_copy = edata.copy() obs_metrics, var_metrics = ep.pp.qc_metrics(edata, layer=DEFAULT_TEM_LAYER_NAME) @@ -122,8 +123,7 @@ def test_qc_metrics_3d_vanilla(edata_mini_3D_missing_values): def test_qc_metrics_3d_vanilla_advanced(edata_mini_3D_missing_values): - edata = edata_mini_3D_missing_values.copy() - edata = edata[:, :4] + edata = edata_mini_3D_missing_values[:, :4].copy() edata.var["feature_type"] = ["numeric", "numeric", "numeric", "categorical"] modification_copy = edata.copy() From ba86ed60cb55a2f34280a36e55f7903f96a4d749 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Wed, 3 Dec 2025 12:43:16 +0100 Subject: [PATCH 25/33] minimal fix --- ehrapy/preprocessing/_quality_control.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 50d145b80..bc984efbc 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -367,16 +367,17 @@ def _compute_var_metrics( var_metrics["unique_values_abs"] = unique_val_abs_full var_metrics["unique_values_ratio"] = unique_val_ratio_full + var_metrics["coefficient_of_variation"] = np.nan + var_metrics["is_constant"] = np.nan + var_metrics["constant_variable_ratio"] = np.nan + var_metrics["range_ratio"] = np.nan + var_metrics["iqr_outliers"] = np.nan + var_metrics["mean"] = np.nan var_metrics["median"] = np.nan var_metrics["standard_deviation"] = np.nan var_metrics["min"] = np.nan var_metrics["max"] = np.nan - var_metrics["coefficient_of_variation"] = np.nan - var_metrics["is_constant"] = np.nan - 
var_metrics["constant_variable_ratio"] = np.nan - var_metrics["range_ratio"] = np.nan - var_metrics["iqr_outliers"] = np.nan try: # Calculate statistics for non-categorical variables From fb0ceb330b6ac20a0bd4605690e486876027ccd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Wed, 3 Dec 2025 12:50:35 +0100 Subject: [PATCH 26/33] minimal fix --- ehrapy/preprocessing/_quality_control.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index bc984efbc..508cb1213 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -371,13 +371,13 @@ def _compute_var_metrics( var_metrics["is_constant"] = np.nan var_metrics["constant_variable_ratio"] = np.nan var_metrics["range_ratio"] = np.nan - var_metrics["iqr_outliers"] = np.nan var_metrics["mean"] = np.nan var_metrics["median"] = np.nan var_metrics["standard_deviation"] = np.nan var_metrics["min"] = np.nan var_metrics["max"] = np.nan + var_metrics["iqr_outliers"] = np.nan try: # Calculate statistics for non-categorical variables From 9b5e4778e9238ed11302e8fad7a8a66b4458803b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Wed, 3 Dec 2025 17:51:10 +0100 Subject: [PATCH 27/33] infer_feature_types linked, example with head() outputs --- ehrapy/preprocessing/_quality_control.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 508cb1213..02ffd0d9f 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -40,7 +40,7 @@ def qc_metrics( Uses the original values to calculate the metrics and not the encoded ones. Look at the return type for a more in depth description of the basic and advanced calculated metrics. 
- If ehrdata.infer_feature_types is run first, then advanced metrics are calculated in addition to basic metrics that require feature type information. + If :func:`~ehrdata.infer_feature_types` is run first, then advanced metrics are calculated in addition to basic metrics that require feature type information. Args: @@ -57,7 +57,7 @@ def qc_metrics( - `missing_values_pct`: Relative amount of missing values in percent. - `entropy_of_missingness`: Entropy of the missingness pattern for each observation. Higher values indicate a more heterogeneous (less structured) missingness pattern. - Advanced observation level metrics include (only computed if advanced is set to `True`): + Advanced observation level metrics include (only computed if :func:`~ehrdata.infer_feature_types` is run first): - `unique_values_abs`: Absolute amount of unique values. - `unique_values_ratio`: Relative amount of unique values in percent. @@ -74,7 +74,7 @@ def qc_metrics( - `iqr_outliers`: Whether the feature contains outliers based on the interquartile range (IQR) method. - Advanced feature level metrics include (only computed if ehrdata.infer_feature_types is run first): + Advanced feature level metrics include (only computed if :func:`~ehrdata.infer_feature_types` is run first): - `unique_values_abs`: Absolute amount of unique values. - `unique_values_ratio`: Relative amount of unique values in percent. 
@@ -88,7 +88,8 @@ def qc_metrics( >>> import ehrapy as ep >>> edata = ed.dt.mimic_2() >>> obs_qc, var_qc = ep.pp.qc_metrics(edata) - >>> obs_qc["missing_values_pct"].plot(kind="hist", bins=20) + >>> obs_qc.head() + >>> var_qc.head() """ feature_type = edata.var.get("feature_type", None) if_advanced = True From 63a6526c120cf5c4d13b8c4fa6bd5e1aeee9ec77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Wed, 3 Dec 2025 18:02:07 +0100 Subject: [PATCH 28/33] NaN output for numerical features added to docstring --- ehrapy/preprocessing/_quality_control.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 02ffd0d9f..169fceaa3 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -58,8 +58,8 @@ def qc_metrics( - `entropy_of_missingness`: Entropy of the missingness pattern for each observation. Higher values indicate a more heterogeneous (less structured) missingness pattern. Advanced observation level metrics include (only computed if :func:`~ehrdata.infer_feature_types` is run first): - - `unique_values_abs`: Absolute amount of unique values. - - `unique_values_ratio`: Relative amount of unique values in percent. + - `unique_values_abs`: Absolute amount of unique values. Returned as ``NaN`` for numeric features. + - `unique_values_ratio`: Relative amount of unique values in percent. Returned as ``NaN`` for numeric features. Basic (default) feature level metrics include: @@ -76,8 +76,8 @@ def qc_metrics( Advanced feature level metrics include (only computed if :func:`~ehrdata.infer_feature_types` is run first): - - `unique_values_abs`: Absolute amount of unique values. - - `unique_values_ratio`: Relative amount of unique values in percent. + - `unique_values_abs`: Absolute amount of unique values. 
Returned as ``NaN`` for numeric features + - `unique_values_ratio`: Relative amount of unique values in percent. Returned as ``NaN`` for numeric features - `coefficient_of_variation`: Coefficient of variation of the features. - `is_constant`: Whether the feature is constant (with near zero variance). - `constant_variable_ratio`: Relative amount of constant features in percent. @@ -233,7 +233,7 @@ def _compute_obs_metrics( if advanced and "feature_type" not in edata.var: raise ValueError( - "Advanced QC metrics require `edata.var['feature_type']`. Please run `infer_feature_types(edata)` first" + "Advanced QC metrics require `edata.var['feature_type']`. Please run `ehrdata.infer_feature_types(edata)` first" ) if advanced: From 676a8a49fc1ceaf436c6310bf10c63ee8dc7b219 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 5 Dec 2025 10:23:22 +0100 Subject: [PATCH 29/33] _quality_control comments addressed --- ehrapy/preprocessing/_quality_control.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index 169fceaa3..aeb4c2323 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -254,7 +254,6 @@ def _compute_obs_metrics( if unique_arr.ndim == 2: unique_val_abs = unique_arr[:, 0] valid_counts = valid_arr[:, 0] - else: unique_per_time = unique_arr[:, 0, :] valid_per_time = valid_arr[:, 0, :] @@ -336,7 +335,7 @@ def _compute_var_metrics( if advanced and "feature_type" not in edata.var: raise ValueError( - "Advanced QC metrics require `edata.var['feature_type']`. Please run `infer_feature_types(edata)` first" + "Advanced QC metrics require `edata.var['feature_type']`. 
Please run `ehrdata.infer_feature_types(edata)` first" ) if advanced: From 430e2ce6db511c824c2c187e7d53939cb432216e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 5 Dec 2025 13:53:37 +0100 Subject: [PATCH 30/33] failure case added, array types moved from compat.py to _types.py --- ehrapy/_compat.py | 17 +------- ehrapy/_types.py | 14 +++++++ ehrapy/preprocessing/_quality_control.py | 45 ++++++++++++++++----- tests/conftest.py | 2 +- tests/preprocessing/test_quality_control.py | 9 +++++ 5 files changed, 61 insertions(+), 26 deletions(-) diff --git a/ehrapy/_compat.py b/ehrapy/_compat.py index cd0084b0e..22d236607 100644 --- a/ehrapy/_compat.py +++ b/ehrapy/_compat.py @@ -251,23 +251,8 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: return decorator -def asarray(a): - import numpy as np - - return np.asarray(a) - - def as_dense_dask_array(a, chunk_size=1000): + """Convert input to a dense Dask array.""" import dask.array as da return da.from_array(a, chunks=chunk_size) - - -ARRAY_TYPES_NUMERIC = ( - asarray, - as_dense_dask_array, - sp.csr_array, - sp.csc_array, -) # add coo_array once supported in AnnData -ARRAY_TYPES_NUMERIC_3D_ABLE = (asarray, as_dense_dask_array) # add coo_array once supported in AnnData -ARRAY_TYPES_NONNUMERIC = (asarray, as_dense_dask_array) diff --git a/ehrapy/_types.py b/ehrapy/_types.py index edf460325..754f42ddb 100644 --- a/ehrapy/_types.py +++ b/ehrapy/_types.py @@ -1,13 +1,27 @@ from __future__ import annotations from collections.abc import Sequence +from functools import partial from typing import Literal import numpy as np import scipy.sparse as sp +from fast_array_utils.conv import to_dense + +from ehrapy._compat import as_dense_dask_array KnownTransformer = Literal["pynndescent", "sklearn"] CSBase = sp.csr_matrix | sp.csc_matrix RNGLike = np.random.Generator | np.random.BitGenerator SeedLike = int | np.integer | Sequence[int] | np.random.SeedSequence AnyRandom = int | np.random.RandomState 
| None + +asarray = partial(to_dense, to_cpu_memory=True) +ARRAY_TYPES_NUMERIC = ( + asarray, + as_dense_dask_array, + sp.csr_array, + sp.csc_array, +) # add coo_array once supported in AnnData +ARRAY_TYPES_NUMERIC_3D_ABLE = (asarray, as_dense_dask_array) # add coo_array once supported in AnnData +ARRAY_TYPES_NONNUMERIC = (asarray, as_dense_dask_array) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index aeb4c2323..d50dbccd0 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -26,7 +26,7 @@ from collections.abc import Collection from anndata import AnnData - from ehrdata import EHRData +from ehrdata import EHRData @use_ehrdata(deprecated_after="1.0.0") @@ -39,8 +39,8 @@ def qc_metrics( """Calculates various quality control metrics. Uses the original values to calculate the metrics and not the encoded ones. - Look at the return type for a more in depth description of the basic and advanced calculated metrics. - If :func:`~ehrdata.infer_feature_types` is run first, then advanced metrics are calculated in addition to basic metrics that require feature type information. + Look at the return type for a more in depth description of the default and extended metrics. + If :func:`~ehrdata.infer_feature_types` is run first, then extended metrics that require feature type information are calculated in addition to default metrics. Args: @@ -51,17 +51,17 @@ def qc_metrics( Returns: Two Pandas DataFrames of all calculated QC metrics for `obs` and `var` respectively. - Basic (default) observation level metrics include: + Default observation level metrics include: - `missing_values_abs`: Absolute amount of missing values. - `missing_values_pct`: Relative amount of missing values in percent. - `entropy_of_missingness`: Entropy of the missingness pattern for each observation. Higher values indicate a more heterogeneous (less structured) missingness pattern. 
- Advanced observation level metrics include (only computed if :func:`~ehrdata.infer_feature_types` is run first): + Extended observation level metrics include (only computed if :func:`~ehrdata.infer_feature_types` is run first): - `unique_values_abs`: Absolute amount of unique values. Returned as ``NaN`` for numeric features. - `unique_values_ratio`: Relative amount of unique values in percent. Returned as ``NaN`` for numeric features. - Basic (default) feature level metrics include: + Default feature level metrics include: - `missing_values_abs`: Absolute amount of missing values. - `missing_values_pct`: Relative amount of missing values in percent. @@ -74,7 +74,7 @@ def qc_metrics( - `iqr_outliers`: Whether the feature contains outliers based on the interquartile range (IQR) method. - Advanced feature level metrics include (only computed if :func:`~ehrdata.infer_feature_types` is run first): + Extended feature level metrics include (only computed if :func:`~ehrdata.infer_feature_types` is run first): - `unique_values_abs`: Absolute amount of unique values. Returned as ``NaN`` for numeric features - `unique_values_ratio`: Relative amount of unique values in percent. 
Returned as ``NaN`` for numeric features @@ -91,12 +91,39 @@ def qc_metrics( >>> obs_qc.head() >>> var_qc.head() """ + if not isinstance(edata, EHRData): + raise ValueError(f"Central data object should be an EHRData object, but received {type(edata).__name__}") + feature_type = edata.var.get("feature_type", None) if_advanced = True if feature_type is None: if_advanced = False mtx = edata.X if layer is None else edata.layers[layer] + + if mtx.ndim == 3: + mtx_check = mtx[:, :, 0] + else: + mtx_check = mtx + + mtx_df = pd.DataFrame(mtx_check) + mixed = [] + for col in mtx_df.columns: + s = mtx_df[col].dropna() + if s.empty: + continue + types = {type(v) for v in s} + + if all(issubclass(t, (int, float, bool)) for t in types): + continue + + if all(isinstance(v, str) for v in s): + continue + + mixed.append(col) + if mixed: + raise ValueError(f"Mixed or unsupported types are found in columns {mixed}Columns must be homogeneous") + var_metrics = _compute_var_metrics(mtx, edata, advanced=if_advanced) obs_metrics = _compute_obs_metrics(mtx, edata, qc_vars=qc_vars, log1p=True, advanced=if_advanced) @@ -233,7 +260,7 @@ def _compute_obs_metrics( if advanced and "feature_type" not in edata.var: raise ValueError( - "Advanced QC metrics require `edata.var['feature_type']`. Please run `ehrdata.infer_feature_types(edata)` first" + "Extended QC metrics require `edata.var['feature_type']`. Please run `ehrdata.infer_feature_types(edata)` first" ) if advanced: @@ -335,7 +362,7 @@ def _compute_var_metrics( if advanced and "feature_type" not in edata.var: raise ValueError( - "Advanced QC metrics require `edata.var['feature_type']`. Please run `ehrdata.infer_feature_types(edata)` first" + "Extended QC metrics require `edata.var['feature_type']`. 
Please run `ehrdata.infer_feature_types(edata)` first" ) if advanced: diff --git a/tests/conftest.py b/tests/conftest.py index e68875540..1021e1d96 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,7 +14,7 @@ from matplotlib.testing.compare import compare_images import ehrapy as ep -from ehrapy._compat import ( +from ehrapy._types import ( ARRAY_TYPES_NONNUMERIC, ARRAY_TYPES_NUMERIC, ARRAY_TYPES_NUMERIC_3D_ABLE, diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index d11b44509..614d09ea9 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -1,5 +1,6 @@ from pathlib import Path +import ehrdata as ed import numpy as np import pandas as pd import pytest @@ -178,6 +179,14 @@ def test_qc_metrics_3d_vanilla_advanced(edata_mini_3D_missing_values): assert np.array_equal(modification_copy.var[key], edata.var[key]) +def test_qc_metrics_heterogeneous_columns(): + mtx = np.array([[11, "a"], [True, 22]], dtype=object) + + edata = ed.EHRData(shape=(2, 2), layers={"tem_data": mtx}) + with pytest.raises(ValueError): + ep.pp.qc_metrics(edata, layer="tem_data") + + @pytest.mark.parametrize( "array_type, expected_error", [ From 57652105f9766be29306c5a9c68ac880b13ab09d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 5 Dec 2025 14:48:30 +0100 Subject: [PATCH 31/33] mixed columns error fixed, proper asarray with fast_array_utils --- ehrapy/_types.py | 8 ++- ehrapy/preprocessing/_quality_control.py | 68 +++++++++++++++--------- 2 files changed, 49 insertions(+), 27 deletions(-) diff --git a/ehrapy/_types.py b/ehrapy/_types.py index 754f42ddb..92d24490f 100644 --- a/ehrapy/_types.py +++ b/ehrapy/_types.py @@ -1,7 +1,6 @@ from __future__ import annotations from collections.abc import Sequence -from functools import partial from typing import Literal import numpy as np @@ -16,7 +15,12 @@ SeedLike = int | np.integer | Sequence[int] | 
np.random.SeedSequence AnyRandom = int | np.random.RandomState | None -asarray = partial(to_dense, to_cpu_memory=True) + +def asarray(a): + """Convert input to a dense NumPy array in CPU memory using fast-array-utils.""" + return to_dense(a, to_cpu_memory=True) + + ARRAY_TYPES_NUMERIC = ( asarray, as_dense_dask_array, diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index d50dbccd0..b416749ea 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -25,7 +25,8 @@ if TYPE_CHECKING: from collections.abc import Collection - from anndata import AnnData + +from anndata import AnnData from ehrdata import EHRData @@ -91,8 +92,10 @@ def qc_metrics( >>> obs_qc.head() >>> var_qc.head() """ - if not isinstance(edata, EHRData): - raise ValueError(f"Central data object should be an EHRData object, but received {type(edata).__name__}") + if not isinstance(edata, EHRData) or not isinstance(edata, AnnData): + raise ValueError( + f"Central data object should be an EHRData or an AnnData object, but received {type(edata).__name__}" + ) feature_type = edata.var.get("feature_type", None) if_advanced = True @@ -101,28 +104,7 @@ def qc_metrics( mtx = edata.X if layer is None else edata.layers[layer] - if mtx.ndim == 3: - mtx_check = mtx[:, :, 0] - else: - mtx_check = mtx - - mtx_df = pd.DataFrame(mtx_check) - mixed = [] - for col in mtx_df.columns: - s = mtx_df[col].dropna() - if s.empty: - continue - types = {type(v) for v in s} - - if all(issubclass(t, (int, float, bool)) for t in types): - continue - - if all(isinstance(v, str) for v in s): - continue - - mixed.append(col) - if mixed: - raise ValueError(f"Mixed or unsupported types are found in columns {mixed}Columns must be homogeneous") + _raise_error_when_heterogeneous(mtx) var_metrics = _compute_var_metrics(mtx, edata, advanced=if_advanced) obs_metrics = _compute_obs_metrics(mtx, edata, qc_vars=qc_vars, log1p=True, advanced=if_advanced) 
@@ -206,6 +188,42 @@ def _row_valid(arr_2d: np.ndarray, axis) -> np.ndarray: return np.broadcast_to(valid[:, None], arr_2d.shape) +@singledispatch +def _raise_error_when_heterogeneous(mtx): + _raise_array_type_not_implemented(_raise_error_when_heterogeneous, type(mtx)) + + +@_raise_error_when_heterogeneous.register(np.ndarray) +@_raise_error_when_heterogeneous.register(DaskArray) +def _(mtx: np.ndarray | DaskArray): + if mtx.ndim == 3: + mtx_check = mtx[:, :, 0] + else: + mtx_check = mtx + try: + mtx_check = mtx_check.compute() + except AttributeError: + # numpy arrays don't have .compute() + pass + + mtx_df = pd.DataFrame(mtx_check) + mixed = [] + for col in mtx_df.columns: + s = mtx_df[col].dropna() + if s.empty: + continue + types = {type(v) for v in s} + + if all(issubclass(t, (int, float, bool)) for t in types): + continue + if all(isinstance(v, str) for v in s): + continue + + mixed.append(col) + if mixed: + raise ValueError(f"Mixed or unsupported types are found in columns {mixed}. 
Columns must be homogeneous") + + def _compute_obs_metrics( mtx, edata: EHRData | AnnData, From 7491b5576232c9dbf859645e2f045d3f3dea0acf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 5 Dec 2025 14:58:14 +0100 Subject: [PATCH 32/33] minimal fix --- ehrapy/preprocessing/_quality_control.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index b416749ea..e2c5a0fda 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -92,7 +92,7 @@ def qc_metrics( >>> obs_qc.head() >>> var_qc.head() """ - if not isinstance(edata, EHRData) or not isinstance(edata, AnnData): + if not isinstance(edata, EHRData) and not isinstance(edata, AnnData): raise ValueError( f"Central data object should be an EHRData or an AnnData object, but received {type(edata).__name__}" ) From f65d66ed035335684b71ab84c4c83eb918f565d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96yk=C3=BC=20S=C3=BCoglu?= Date: Fri, 5 Dec 2025 15:44:16 +0100 Subject: [PATCH 33/33] code terminology updated with new wording, pytest error message --- ehrapy/preprocessing/_quality_control.py | 26 ++++++++++----------- tests/preprocessing/test_quality_control.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ehrapy/preprocessing/_quality_control.py b/ehrapy/preprocessing/_quality_control.py index e2c5a0fda..129b68cba 100644 --- a/ehrapy/preprocessing/_quality_control.py +++ b/ehrapy/preprocessing/_quality_control.py @@ -98,16 +98,16 @@ def qc_metrics( ) feature_type = edata.var.get("feature_type", None) - if_advanced = True + extended = True if feature_type is None: - if_advanced = False + extended = False mtx = edata.X if layer is None else edata.layers[layer] _raise_error_when_heterogeneous(mtx) - var_metrics = _compute_var_metrics(mtx, edata, advanced=if_advanced) - obs_metrics = _compute_obs_metrics(mtx, edata, 
qc_vars=qc_vars, log1p=True, advanced=if_advanced) + var_metrics = _compute_var_metrics(mtx, edata, extended=extended) + obs_metrics = _compute_obs_metrics(mtx, edata, qc_vars=qc_vars, log1p=True, extended=extended) edata.var[var_metrics.columns] = var_metrics edata.obs[obs_metrics.columns] = obs_metrics @@ -230,7 +230,7 @@ def _compute_obs_metrics( *, qc_vars: Collection[str] = (), log1p: bool = True, - advanced: bool = False, + extended: bool = False, ): """Calculates quality control metrics for observations. @@ -241,7 +241,7 @@ def _compute_obs_metrics( edata: Central data object. qc_vars: A list of previously calculated QC metrics to calculate summary statistics for. log1p: Whether to apply log1p normalization for the QC metrics. Only used with parameter 'qc_vars'. - advanced: Whether to calculate further metrics that require feature type information. + extended: Whether to calculate further metrics that require feature type information. Returns: A Pandas DataFrame with the calculated metrics. @@ -276,12 +276,12 @@ def _compute_obs_metrics( obs_metrics["missing_values_pct"] = (obs_metrics["missing_values_abs"] / flat_mtx.shape[1]) * 100 obs_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(flat_mtx, axis=1) - if advanced and "feature_type" not in edata.var: + if extended and "feature_type" not in edata.var: raise ValueError( "Extended QC metrics require `edata.var['feature_type']`. Please run `ehrdata.infer_feature_types(edata)` first" ) - if advanced: + if extended: feature_type = edata.var["feature_type"] categorical_mask = feature_type == "categorical" @@ -338,14 +338,14 @@ def _compute_obs_metrics( def _compute_var_metrics( mtx, edata: EHRData | AnnData, - advanced: bool = False, + extended: bool = False, ): """Compute variable metrics for quality control. Args: mtx: Data array. edata: Central data object. - advanced: Whether to calculate further metrics that require feature type information. 
+ extended: Whether to calculate further metrics that require feature type information. """ categorical_indices = np.ndarray([0], dtype=int) var_metrics = pd.DataFrame(index=edata.var_names) @@ -378,12 +378,12 @@ def _compute_var_metrics( var_metrics["missing_values_pct"] = (var_metrics["missing_values_abs"] / mtx.shape[0]) * 100 var_metrics["entropy_of_missingness"] = _compute_entropy_of_missingness(mtx, axis=0) - if advanced and "feature_type" not in edata.var: + if extended and "feature_type" not in edata.var: raise ValueError( "Extended QC metrics require `edata.var['feature_type']`. Please run `ehrdata.infer_feature_types(edata)` first" ) - if advanced: + if extended: feature_type = edata.var["feature_type"] categorical_mask = feature_type == "categorical" @@ -456,7 +456,7 @@ def _compute_var_metrics( # Fill all non_categoricals with False because else we have a dtype object Series which h5py cannot save var_metrics["iqr_outliers"] = var_metrics["iqr_outliers"].astype(bool).fillna(False) - if advanced: + if extended: feature_type = edata.var["feature_type"] numeric_mask = feature_type == "numeric" diff --git a/tests/preprocessing/test_quality_control.py b/tests/preprocessing/test_quality_control.py index 614d09ea9..498aae2b9 100644 --- a/tests/preprocessing/test_quality_control.py +++ b/tests/preprocessing/test_quality_control.py @@ -183,7 +183,7 @@ def test_qc_metrics_heterogeneous_columns(): mtx = np.array([[11, "a"], [True, 22]], dtype=object) edata = ed.EHRData(shape=(2, 2), layers={"tem_data": mtx}) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Mixed or unsupported"): ep.pp.qc_metrics(edata, layer="tem_data")