Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions cubedash/_stac.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,11 +283,10 @@ def as_stac_collection(res: CollectionItem) -> pystac.Collection:
providers=[],
extent=Extent(
pystac.SpatialExtent(
bboxes=[
res.footprint_wgs84.bounds
if res.footprint_wgs84
else [-180.0, -90.0, 180.0, 90.0]
]
# TODO: Find a nicer way to make the typechecker happier
# pystac is too specific in wanting a list[float | int]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not going to help right now, but it's possible pystac would accept a PR that changes the type annotation from list to Sequence.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, I was thinking that's the best solution too.

# odc-geo BoundingBox class is a Sequence[float]
bboxes=[list(res.bbox) if res.bbox else [-180.0, -90.0, 180.0, 90.0]]
),
temporal=pystac.TemporalExtent(
intervals=[
Expand Down
4 changes: 0 additions & 4 deletions cubedash/index/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,6 @@ def put_summary(
@abstractmethod
def product_summary_cols(self, product_name: str) -> Row: ...

@abstractmethod
def collection_cols(self) -> Select:
"""Get all columns necessary for creating a Collection"""

@abstractmethod
def collections_search_query(
self,
Expand Down
79 changes: 43 additions & 36 deletions cubedash/index/postgis/_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
union_all,
update,
)
from sqlalchemy.dialects.postgresql import TSTZRANGE, insert
from sqlalchemy.dialects.postgresql import TSTZRANGE, array, insert
from sqlalchemy.orm import aliased
from sqlalchemy.sql import ColumnElement
from sqlalchemy.types import TIMESTAMP
Expand Down Expand Up @@ -346,27 +346,6 @@ def product_time_summary(
)
)

@override
def collection_cols(self) -> Select:
product_overview = (
select(
ProductSpatial.name,
TimeOverview.footprint_geometry,
ProductSpatial.time_earliest,
ProductSpatial.time_latest,
TimeOverview.period_type,
)
.select_from(TimeOverview)
.join(ProductSpatial)
.cte("product_overview")
)

return (
select(ODC_PRODUCT.definition, product_overview)
.select_from(product_overview)
.join(ODC_PRODUCT, product_overview.c.name == ODC_PRODUCT.name)
)

@override
def collections_search_query(
self,
Expand All @@ -376,41 +355,69 @@ def collections_search_query(
bbox: tuple[float, float, float, float] | None = None,
time: tuple[datetime, datetime] | None = None,
q: Sequence[str] | None = None,
) -> Result:
collection = self.collection_cols().subquery()
query = select(collection).where(collection.c.period_type == "all")

) -> list[Row]:
# STAC Collections only hold a bounding box in EPSG:4326, no polygons
# Calculate the bounding box on the server; it's far more efficient.

# The Cubedash Product (which maps to a STAC Collection) doesn't have
# any bounding box or geometry attached, all the geometries are in the
# TimeOverview table, grouped by different `period_types`. In this case
# we use the `period_type=="all"` to get the one that covers all time.
collection_bbox = func.Box2D(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The RHS takes its data from TimeOverview, shouldn't this be called t_o_bbox (or overview_bbox, but that is longer)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm. I might add some comments.

The Cubedash Product (which maps to a STAC Collection) doesn't have any bounding box or geometry attached, all the geometries are in the TimeOverview table, grouped by different period_types. In this case we use the period_type=="all" to get the one that covers all of time.

func.ST_Transform(TimeOverview.footprint_geometry, 4326)
)
bbox_array = array(
[
func.ST_XMin(collection_bbox),
func.ST_YMin(collection_bbox),
func.ST_XMax(collection_bbox),
func.ST_YMax(collection_bbox),
]
)
query = (
select(
ODC_PRODUCT.definition,
ProductSpatial.name,
case(
(func.ST_XMin(collection_bbox).is_(None), None), else_=bbox_array
).label("bbox"),
ProductSpatial.time_earliest,
ProductSpatial.time_latest,
)
.select_from(TimeOverview)
.join(ProductSpatial, ProductSpatial.id == TimeOverview.product_ref)
.join(ODC_PRODUCT, ProductSpatial.name == ODC_PRODUCT.name)
.where(TimeOverview.period_type == "all")
)
if name:
query = query.where(collection.c.name == name)
query = query.where(ProductSpatial.name == name)

if bbox:
query = query.where(
collection.c.footprint_geometry.intersects(func.ST_MakeEnvelope(*bbox))
TimeOverview.footprint_geometry.intersects(func.ST_MakeEnvelope(*bbox))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no test coverage for most of these if-branches, but I'll trust you know what you are doing with these changes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, there's at least one bug that I found in these if-branches. The 'time' filter was crashing by trying to apply a Python datetime function to a SQLAlchemy Column.

I only added one test in this PR. There's room for more...

In this case, collection was a subquery that held the required columns. I've removed the subquery since it was unnecessary obfuscation, and now apply the WHERE clause directly against the table.

)

if time:
query = query.where(
and_(
default_utc(time[0]) <= default_utc(collection.c.time_latest),
default_utc(collection.c.time_earliest) <= default_utc(time[1]),
)
default_utc(time[0]) <= ProductSpatial.time_latest,
ProductSpatial.time_earliest <= default_utc(time[1]),
)

if q:
title = SimpleDocField(
name="title",
description="product title",
alchemy_column=collection.c.definition,
alchemy_column=ODC_PRODUCT.definition,
indexed=False,
offset=("metadata", "title"),
)

description = SimpleDocField(
name="description",
description="product description",
alchemy_column=collection.c.definition,
alchemy_column=ODC_PRODUCT.definition,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The corresponding postgres change also fixes the type of the offset= parameter.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that SimpleDocField turns a plain string into a length 1 tuple... but I'm not sure.

This is ugly though, I'll make them match.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fixed type annotations around that (and a bunch of usages) in datacube-core; I'm quite sure you need the tuple. A bit surprised the type checker wasn't complaining about these lines.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

pyright was complaining, but mypy wasn't for some reason.

It's fixed now regardless.

indexed=False,
offset=("description"),
offset=("description",),
)

expressions = []
Expand All @@ -419,7 +426,7 @@ def collections_search_query(
[
title.alchemy_expression.icontains(value),
description.alchemy_expression.icontains(value),
collection.c.name.icontains(value),
ODC_PRODUCT.name.icontains(value),
]
)
query = query.where(or_(*expressions))
Expand Down
80 changes: 47 additions & 33 deletions cubedash/index/postgres/_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
text,
union_all,
)
from sqlalchemy.dialects.postgresql import TSTZRANGE, insert
from sqlalchemy.dialects.postgresql import TSTZRANGE, array, insert
from sqlalchemy.sql import ColumnElement

import cubedash.summary._schema as _schema
Expand Down Expand Up @@ -392,26 +392,6 @@ def product_time_summary(
)
)

@override
def collection_cols(self) -> Select:
product_overview = (
select(
PRODUCT.c.name,
TIME_OVERVIEW.c.footprint_geometry,
PRODUCT.c.time_earliest,
PRODUCT.c.time_latest,
TIME_OVERVIEW.c.period_type,
)
.select_from(TIME_OVERVIEW.join(PRODUCT))
.cte("product_overview")
)

return select(ODC_PRODUCT.c.definition, product_overview).select_from(
product_overview.join(
ODC_PRODUCT, product_overview.c.name == ODC_PRODUCT.c.name
)
)

@override
def collections_search_query(
self,
Expand All @@ -422,40 +402,74 @@ def collections_search_query(
time: tuple[datetime, datetime] | None = None,
q: Sequence[str] | None = None,
) -> Result:
collection = self.collection_cols().alias("collection")
query = select(collection).where(collection.c.period_type == "all")
# STAC Collections only hold a bounding box in EPSG:4326, no polygons
# Calculate the bounding box on the server, it's far more efficient.

# The Cubedash Product (which maps to a STAC Collection) doesn't have
# any bounding box or geometry attached, all the geometries are in the
# TimeOverview table, grouped by different `period_types`. In this case
# we use the `period_type=="all"` to get the one that covers all time.

collection_bbox = func.Box2D(
func.ST_Transform(TIME_OVERVIEW.c.footprint_geometry, 4326)
)
bbox_array = array(
[
func.ST_XMin(collection_bbox),
func.ST_YMin(collection_bbox),
func.ST_XMax(collection_bbox),
func.ST_YMax(collection_bbox),
]
)
query = (
select(
ODC_PRODUCT.c.definition,
PRODUCT.c.name,
case(
(func.ST_XMin(collection_bbox).is_(None), None), else_=bbox_array
).label("bbox"),
PRODUCT.c.time_earliest,
PRODUCT.c.time_latest,
)
.select_from(
TIME_OVERVIEW.join(
PRODUCT, PRODUCT.c.id == TIME_OVERVIEW.c.product_ref
).join(ODC_PRODUCT, PRODUCT.c.name == ODC_PRODUCT.c.name)
)
.where(TIME_OVERVIEW.c.period_type == "all")
)

if name:
query = query.where(collection.c.name == name)
query = query.where(PRODUCT.c.name == name)

if bbox:
query = query.where(
collection.c.footprint_geometry.intersects(func.ST_MakeEnvelope(*bbox))
TIME_OVERVIEW.c.footprint_geometry.intersects(
func.ST_MakeEnvelope(*bbox)
)
)

if time:
query = query.where(
and_(
default_utc(time[0]) <= default_utc(collection.c.time_latest),
default_utc(collection.c.time_earliest) <= default_utc(time[1]),
)
default_utc(time[0]) <= PRODUCT.c.time_latest,
PRODUCT.c.time_earliest <= default_utc(time[1]),
)

if q:
title = SimpleDocField(
name="title",
description="product title",
alchemy_column=collection.c.definition,
alchemy_column=ODC_PRODUCT.c.definition,
indexed=False,
offset=("metadata", "title"),
)

description = SimpleDocField(
name="description",
description="product description",
alchemy_column=collection.c.definition,
alchemy_column=ODC_PRODUCT.c.definition,
indexed=False,
offset=("description"),
offset=("description",),
)

expressions = []
Expand All @@ -464,7 +478,7 @@ def collections_search_query(
[
title.alchemy_expression.icontains(value),
description.alchemy_expression.icontains(value),
collection.c.name.icontains(value),
ODC_PRODUCT.c.name.icontains(value),
]
)
query = query.where(or_(*expressions))
Expand Down
34 changes: 6 additions & 28 deletions cubedash/summary/_stores.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@
from geoalchemy2 import WKBElement
from geoalchemy2 import shape as geo_shape
from geoalchemy2.shape import from_shape, to_shape
from odc.geo import MaybeCRS
from odc.geo import BoundingBox, MaybeCRS
from pygeofilter.backends.sqlalchemy.evaluate import (
SQLAlchemyFilterEvaluator as FilterEvaluator,
)
from pygeofilter.parsers.cql2_json import parse as parse_cql2_json
from pygeofilter.parsers.cql2_text import parse as parse_cql2_text
from shapely.geometry import MultiPolygon
from shapely.geometry.base import BaseGeometry
from sqlalchemy import Row, RowMapping, func, select
from sqlalchemy.dialects.postgresql import TSTZRANGE
Expand Down Expand Up @@ -187,22 +186,7 @@ class CollectionItem:
definition: dict[str, Any]
time_earliest: datetime | None
time_latest: datetime | None
footprint_geometry: Geometry | None
footprint_crs: str | None

@property
def footprint_wgs84(self) -> MultiPolygon | None:
if not self.footprint_geometry:
return None
if not self.footprint_crs:
_LOG.warning(f"Geometry without a crs for {self.name}", stacklevel=2)
return None

return (
Geometry(self.footprint_geometry, crs=self.footprint_crs)
.to_crs("EPSG:4326", wrapdateline=True)
.geom
)
bbox: BoundingBox | None

@property
def title(self) -> str:
Expand Down Expand Up @@ -1602,6 +1586,9 @@ def _summary_from_row(
def _row_to_collection(
res: Row, grouping_timezone: tzinfo = default_timezone
) -> CollectionItem:
# the 'res' at the moment has
# ('definition', 'name', 'bbox', 'time_earliest', 'time_latest')

return CollectionItem(
name=res.name,
time_earliest=res.time_earliest.astimezone(grouping_timezone)
Expand All @@ -1610,16 +1597,7 @@ def _row_to_collection(
time_latest=res.time_latest.astimezone(grouping_timezone)
if res.time_latest
else None,
footprint_geometry=(
None
if res.footprint_geometry is None
else geo_shape.to_shape(res.footprint_geometry)
),
footprint_crs=(
None
if res.footprint_geometry is None or res.footprint_geometry.srid == -1
else "EPSG:{}".format(res.footprint_geometry.srid)
),
bbox=res.bbox,
definition=res.definition,
)

Expand Down
2 changes: 1 addition & 1 deletion integration_tests/asserts.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def get_json(client: FlaskClient, url: str, expect_status_code=200) -> dict:
f"Expected status {expect_status_code} not {rv.status_code}."
f"\nGot:\n{indent(rv.data.decode('utf-8'), ' ' * 6)}"
)
assert rv.is_json, "Expected json content type in response"
assert rv.is_json, "Expected JSON content type in response"
data = rv.json
assert data is not None, "Empty response from server"
except AssertionError:
Expand Down
Loading