Skip to content

Commit 2a34f39

Browse files
authored
yolo: multiple collections (#13)
* deps: update * feat: bit of a yolo fix for multiple collections * fix: typing
1 parent 23b57ab commit 2a34f39

File tree

6 files changed

+100
-88
lines changed

6 files changed

+100
-88
lines changed

data/collections.json

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,5 +32,39 @@
3232
"type": "application/vnd.apache.parquet"
3333
}
3434
}
35+
},
36+
{
37+
"type": "Collection",
38+
"stac_version": "1.1.0",
39+
"id": "openaerialmap",
40+
"description": "This collection was generated by rustac v0.12.0 from 17702 items",
41+
"license": "other",
42+
"extent": {
43+
"spatial": {
44+
"bbox": [
45+
[
46+
-175.25390625,
47+
-43.683527,
48+
177.537581,
49+
60.172121352329114
50+
]
51+
]
52+
},
53+
"temporal": {
54+
"interval": [
55+
[
56+
"1944-12-31T13:00:00Z",
57+
"2025-03-30T21:16:00Z"
58+
]
59+
]
60+
}
61+
},
62+
"links": [],
63+
"assets": {
64+
"data": {
65+
"href": "./openaerialmap.parquet",
66+
"type": "application/vnd.apache.parquet"
67+
}
68+
}
3569
}
3670
]

data/openaerialmap.parquet

1.43 MB
Binary file not shown.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ dependencies = [
99
"fastapi>=0.115.8",
1010
"geojson-pydantic>=1.2.0",
1111
"pydantic>=2.10.4",
12-
"rustac @ git+https://github.com/stac-utils/rustac-py",
12+
"rustac==0.7.0b2",
1313
"stac-fastapi-api>=5.0.2",
1414
"stac-fastapi-extensions>=5.0.2",
1515
"stac-fastapi-types>=5.0.2",

src/stac_fastapi/geoparquet/client.py

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -164,42 +164,38 @@ async def search(
164164
client = cast(DuckdbClient, request.state.client)
165165
hrefs = cast(dict[str, list[str]], request.state.hrefs)
166166

167-
hrefs_to_search = []
168167
if search.collections:
169-
for collection in search.collections:
170-
if collection_hrefs := hrefs.get(collection):
171-
hrefs_to_search.extend(collection_hrefs)
168+
collections = search.collections
172169
else:
173-
for collection_hrefs in hrefs.values():
174-
hrefs_to_search.extend(collection_hrefs)
175-
176-
if len(hrefs) > 1:
177-
raise ValidationError(
178-
"Cannot search multiple geoparquet files (don't know how to page)"
179-
)
180-
elif len(hrefs) == 0:
181-
return ItemCollection()
182-
else:
183-
href = hrefs_to_search.pop()
170+
collections = list(hrefs.keys())
184171

185172
search_dict = search.model_dump(exclude_none=True)
186-
search_dict.update(**kwargs)
187-
items = client.search(
188-
href,
189-
**search_dict,
190-
)
173+
search_dict["offset"] = kwargs.get("offset", 0)
174+
items: list[dict[str, Any]] = []
175+
while collections and not (search.limit and len(items) >= search.limit):
176+
collection = collections.pop(0)
177+
if collection_hrefs := hrefs.get(collection):
178+
collection_search_dict = copy.deepcopy(search_dict)
179+
collection_search_dict["collections"] = [collection]
180+
for href in collection_hrefs:
181+
items.extend(client.search(href, **collection_search_dict))
182+
if search.limit and len(items) >= search.limit:
183+
collections.insert(0, collection)
184+
break
185+
search_dict["offset"] = 0
186+
191187
item_collection = {
192188
"type": "FeatureCollection",
193189
"features": [self.item_with_links(item, request) for item in items],
194190
}
195191
num_items = len(item_collection["features"])
196-
limit = int(search_dict.get("limit", None) or num_items)
197192
offset = int(search_dict.get("offset", None) or 0)
198193

199-
if limit <= num_items:
194+
if search.limit and search.limit <= num_items:
200195
next_search = copy.deepcopy(search_dict)
201-
next_search["limit"] = limit
202-
next_search["offset"] = offset + limit
196+
next_search["limit"] = search.limit
197+
next_search["offset"] = offset + search.limit
198+
next_search["collections"] = collections
203199
else:
204200
next_search = None
205201

@@ -220,6 +216,8 @@ async def search(
220216
}
221217
)
222218
if next_search:
219+
if "collections" in next_search:
220+
next_search["collections"] = ",".join(collections)
223221
links.append(
224222
{
225223
"href": url + "?" + urllib.parse.urlencode(next_search),

tests/test_search.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@ async def test_paging(client: TestClient) -> None:
2222
(link for link in response.json()["links"] if link["rel"] == "next")
2323
)
2424
url = urllib.parse.urlparse(next_link["href"])
25-
assert urllib.parse.parse_qs(url.query) == {"limit": ["1"], "offset": ["1"]}
25+
assert urllib.parse.parse_qs(url.query) == {
26+
"limit": ["1"],
27+
"offset": ["1"],
28+
"collections": ["naip,openaerialmap"],
29+
}
2630
response = client.get("/search", params=url.query)
2731
assert response.status_code == 200
2832
assert response.json()["features"][0]["id"] == "ne_m_4110263_sw_13_060_20220820"

0 commit comments

Comments (0)