Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

### Added
- Better documentation for `sopa.io.visium_hd` and a warning if the full res image is not loaded (#254)
- `no_overlap` argument in `sopa.aggregate` to avoid cells from overlapping when aggregating the bins and the channels

## [2.0.7] - 2025-05-19

Expand Down
2 changes: 2 additions & 0 deletions docs/api/misc.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ Use `sopa.settings.gene_exclude_pattern = None` to keep all genes.

::: sopa.segmentation.shapes.expand_radius

::: sopa.segmentation.shapes.remove_overlap

## Xenium Explorer

::: sopa.io.explorer.write
Expand Down
13 changes: 7 additions & 6 deletions sopa/aggregation/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,7 @@

from .._constants import ATTRS_KEY, SopaAttrs, SopaKeys
from ..io.explorer.utils import str_cell_id
from ..utils import (
add_spatial_element,
get_boundaries,
get_spatial_element,
get_spatial_image,
)
from ..utils import add_spatial_element, get_boundaries, get_spatial_element, get_spatial_image
from . import aggregate_bins, count_transcripts
from . import aggregate_channels as _aggregate_channels

Expand All @@ -34,6 +29,7 @@ def aggregate(
expand_radius_ratio: float | None = None,
min_transcripts: int = 0,
min_intensity_ratio: float = 0.1,
no_overlap: bool = False,
key_added: str | None = "table",
):
"""Aggregate gene counts and/or channel intensities over a `SpatialData` object to create an `AnnData` table (saved in `sdata["table"]`).
Expand All @@ -57,6 +53,7 @@ def aggregate(
expand_radius_ratio: Ratio to expand the cells polygons for channels averaging. For instance, a ratio of 0.5 expands the shape radius by 50%. If `None` (default), use 1 if we aggregate bins data, and 0 otherwise.
min_transcripts: Min number of transcripts to keep a cell.
min_intensity_ratio: Min ratio of the 90th quantile of the mean channel intensity to keep a cell.
no_overlap: *Experimental feature*: If `True`, the (expanded) cells will not overlap for channels and bins aggregation.
key_added: Key to save the table in `sdata.tables`. If `None`, it will be `f"{shapes_key}_table"`.
"""
assert points_key is None or bins_key is None, "Provide either `points_key` or `bins_key`, not both."
Expand Down Expand Up @@ -86,6 +83,7 @@ def aggregate(
min_transcripts=min_transcripts,
min_intensity_ratio=min_intensity_ratio,
gene_column=gene_column,
no_overlap=no_overlap,
key_added=key_added,
)

Expand Down Expand Up @@ -154,6 +152,7 @@ def compute_table(
min_intensity_ratio: float = 0,
average_intensities: bool | None = None, # deprecated argument
points_key: str | None = None, # deprecated argument
no_overlap: bool = False,
key_added: str = SopaKeys.TABLE,
):
aggregate_genes, aggregate_channels = self._legacy_arguments(
Expand All @@ -175,6 +174,7 @@ def compute_table(
self.shapes_key,
self.bins_key,
expand_radius_ratio=1 if expand_radius_ratio is None else expand_radius_ratio,
no_overlap=no_overlap,
)
elif not self.already_has_valid_table(key_added):
self.table = count_transcripts(
Expand All @@ -190,6 +190,7 @@ def compute_table(
image_key=self.image_key,
shapes_key=self.shapes_key,
expand_radius_ratio=expand_radius_ratio or 0,
no_overlap=no_overlap,
)

if min_intensity_ratio > 0:
Expand Down
4 changes: 3 additions & 1 deletion sopa/aggregation/bins.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def aggregate_bins(
shapes_key: str,
bins_key: str,
expand_radius_ratio: float = 0,
no_overlap: bool = False,
) -> AnnData:
"""Aggregate bins (for instance, from Visium HD data) into cells.

Expand All @@ -25,6 +26,7 @@ def aggregate_bins(
shapes_key: Key of the shapes containing the cell boundaries
bins_key: Key of the table containing the bin-by-gene counts
expand_radius_ratio: Cells polygons will be expanded by `expand_radius_ratio * mean_radius`. This help better aggregate bins from the cytoplasm.
no_overlap: If `True`, the (expanded) cells will not overlap.

Returns:
An `AnnData` object of shape with the cell-by-gene count matrix
Expand All @@ -37,7 +39,7 @@ def aggregate_bins(
bins = gpd.GeoDataFrame(geometry=bins.centroid.values) # bins as points

cells = to_intrinsic(sdata, shapes_key, bins_shapes_key).reset_index(drop=True)
cells = expand_radius(cells, expand_radius_ratio)
cells = expand_radius(cells, expand_radius_ratio, no_overlap=no_overlap)

bin_within_cell = gpd.sjoin(bins, cells)

Expand Down
4 changes: 3 additions & 1 deletion sopa/aggregation/channels.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def aggregate_channels(
shapes_key: str | None = None,
expand_radius_ratio: float = 0,
mode: str = "average",
no_overlap: bool = False,
) -> np.ndarray:
"""Aggregate the channel intensities per cell (either `"average"`, or take the `"min"` / `"max"`).

Expand All @@ -40,6 +41,7 @@ def aggregate_channels(
shapes_key: Key of `sdata` containing the cell boundaries. If only one `shapes` element, this does not have to be provided.
expand_radius_ratio: Cells polygons will be expanded by `expand_radius_ratio * mean_radius`. This help better aggregate boundary stainings.
mode: Aggregation mode. One of `"average"`, `"min"`, `"max"`. By default, average the intensity inside the cell mask.
no_overlap: If `True`, the (expanded) cells will not overlap.

Returns:
A numpy `ndarray` of shape `(n_cells, n_channels)`
Expand All @@ -50,7 +52,7 @@ def aggregate_channels(

geo_df = get_boundaries(sdata, key=shapes_key)
geo_df = to_intrinsic(sdata, geo_df, image)
geo_df = expand_radius(geo_df, expand_radius_ratio)
geo_df = expand_radius(geo_df, expand_radius_ratio, no_overlap=no_overlap)

return _aggregate_channels_aligned(image, geo_df, mode)

Expand Down
130 changes: 129 additions & 1 deletion sopa/segmentation/shapes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import shapely
import shapely.affinity
from shapely.errors import GEOSException
from shapely.geometry import GeometryCollection, MultiPolygon, Polygon

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -161,19 +162,146 @@ def rasterize(cell: Polygon | MultiPolygon, shape: tuple[int, int], xy_min: tupl
return rasterized_image


def expand_radius(geo_df: gpd.GeoDataFrame, expand_radius_ratio: float | None) -> gpd.GeoDataFrame:
def expand_radius(
geo_df: gpd.GeoDataFrame, expand_radius_ratio: float | None, no_overlap: bool = False, inplace: bool = False
) -> gpd.GeoDataFrame:
"""Expand the radius of the cells by a given ratio.

Args:
geo_df: A GeoDataFrame containing the cells or shapes.
expand_radius_ratio: Ratio to expand the cells polygons for channels averaging. For instance, a ratio of 0.5 expands the shape radius by 50%. If `None`, doesn't expand cells.
no_overlap: *Experimental feature*: if `True`, ensures that the expanded cells do not overlap by computing the Voronoi diagram of the centroids of the cells.
inplace: If `True`, modifies the input GeoDataFrame in place. If `False`, returns a new GeoDataFrame.

Returns:
A GeoDataFrame with the expanded cells.
"""
if not inplace:
geo_df = geo_df.copy()

if not expand_radius_ratio:
return geo_df

expand_radius_ = expand_radius_ratio * np.mean(np.sqrt(geo_df.area / np.pi))
geo_df.geometry = geo_df.buffer(expand_radius_)

if no_overlap:
log.warning("Computing Voronoi polygons to ensure no overlap between shapes is still experimental.")
geo_df.geometry = remove_overlap(geo_df, as_gdf=False)

return geo_df


def remove_overlap(geo_df: gpd.GeoDataFrame, as_gdf: bool = True) -> gpd.GeoSeries | gpd.GeoDataFrame:
"""Remove overlapping areas from a GeoDataFrame by computing the Voronoi polygons of the shapes.

Args:
geo_df: A GeoDataFrame containing the shapes.
as_gdf: Whether to return a GeoDataFrame or a GeoSeries.

Returns:
A GeoSeries or GeoDataFrame with the overlapping areas removed.
"""
geo_df["_index"] = geo_df.index # to keep track of the index after the overlay

overlay = geo_df.overlay(geo_df, how="intersection")
overlap = overlay[overlay["_index_1"] != overlay["_index_2"]].union_all()

del geo_df["_index"]

if overlap.is_empty:
return geo_df.geometry

shapes_no_overlap = geo_df.difference(overlap).buffer(-1e-4) # to avoid touching polygons on single points
_voronoi = voronoi_frames(shapes_no_overlap)

geometry = geo_df.intersection(_voronoi)

if not as_gdf:
return geometry

geo_df = geo_df.copy()
geo_df.geometry = geometry
return geo_df


def voronoi_frames(
geometry: gpd.GeoSeries | gpd.GeoDataFrame,
clip: str | shapely.Geometry | None = "bounding_box",
grid_size: float = 1e-5,
) -> gpd.GeoSeries:
"""
Copied and simplified from https://pysal.org/libpysal/generated/libpysal.cg.voronoi_frames.html
"""
# Check if the input geometry is in a geographic CRS
if geometry.crs and geometry.crs.is_geographic:
raise ValueError(
"Geometry is in a geographic CRS. "
"Use 'GeoSeries.to_crs()' to re-project geometries to a "
"projected CRS before using voronoi_polygons.",
)

# Set precision of the input geometry (avoids GEOS precision issues)
objects: gpd.GeoSeries = shapely.set_precision(geometry.geometry.copy(), grid_size)

geom_types = objects.geom_type
assert geom_types.isin(["Polygon", "MultiPolygon"]).all(), (
"Only Polygon and MultiPolygon geometries are supported to remove overlaps."
)

limit = _get_limit(objects, clip)

# Compute Voronoi polygons
voronoi = shapely.voronoi_polygons(shapely.GeometryCollection(objects.values), extend_to=limit)
# Get individual polygons out of the collection
polygons = gpd.GeoSeries(shapely.make_valid(shapely.get_parts(voronoi)), crs=geometry.crs)

# temporary fix for libgeos/geos#1062
if not (polygons.geom_type == "Polygon").all():
polygons = polygons.explode(ignore_index=True)
polygons = polygons[polygons.geom_type == "Polygon"]

ids_objects, ids_polygons = polygons.sindex.query(objects, predicate="intersects")

# Dissolve polygons
polygons = polygons.iloc[ids_polygons].groupby(objects.index.take(ids_objects)).agg(_union_with_fallback)
if geometry.crs is not None:
polygons = polygons.set_crs(geometry.crs)

# ensure validity as union can occasionally produce invalid polygons that may
# break the intersection below
if not polygons.is_valid.all():
polygons = polygons.make_valid()

# Clip polygons if limit is provided
if limit is not None:
to_be_clipped = polygons.sindex.query(limit.boundary, "intersects")
polygons.iloc[to_be_clipped] = polygons.iloc[to_be_clipped].intersection(limit)

return polygons


def _union_with_fallback(arr):
"""
Coverage union is finnicky with floating point precision and tends to occasionally
raise an error from within GEOS we have no control over. It is not a data issue
typically. Falling back to unary union if that happens.
"""
try:
r = shapely.coverage_union_all(arr)
except GEOSException:
r = shapely.union_all(arr)

return r


def _get_limit(points, clip: shapely.Geometry | str | None | bool) -> shapely.Geometry | None:
if isinstance(clip, shapely.Geometry):
return clip
if clip is None or clip is False:
return None
if clip.lower() == "bounding_box":
return shapely.box(*points.total_bounds)
raise ValueError(
f"Clip type '{clip}' not understood. Try one of the supported options: [None, 'bounding_box', shapely.Polygon]."
)
37 changes: 37 additions & 0 deletions tests/test_shapes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import geopandas as gpd
from shapely.geometry import Point, box

from sopa.segmentation.shapes import expand_radius, remove_overlap

gdf_squares = gpd.GeoDataFrame(
{"color": [0, 1, 2, 3, 4]},
geometry=[
box(0, 0, 0.8, 0.8),
box(1, 1, 2, 2),
box(0.5, 0.5, 1.5, 1.5),
box(2.5, 2.5, 3.4, 3.4),
box(1.4, 0.5, 2, 1.1),
],
)

gdf_circles = gpd.GeoDataFrame(
{"color": [0, 1, 2]},
geometry=[Point(0, 0).buffer(1), Point(0, 1.75).buffer(1), Point(0.5, 1).buffer(0.3)],
)


def test_remove_overlap():
for gdf in [gdf_squares, gdf_circles]:
res = remove_overlap(gdf)

assert res.union_all().area / gdf.union_all().area > 0.999 # should conserve the original area

res.geometry = res.geometry.buffer(-1e-3) # to avoid polygons touching on single points

assert len(gpd.sjoin(res, res)) == len(res) # should not have overlaps

res = expand_radius(gdf, 0.5, no_overlap=True) # should not raise an error

res.geometry = res.geometry.buffer(-1e-3)

assert len(gpd.sjoin(res, res)) == len(res)
Loading
Loading