stac-utils
diff --git a/‎Cargo.lock‎
Lines changed: 281 additions & 259 deletions b/‎Cargo.lock‎
Lines changed: 281 additions & 259 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 5 additions & 0 deletions b/‎Cargo.toml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/api/parquet.md‎
Lines changed: 8 additions & 0 deletions b/‎docs/api/parquet.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎docs/api/stac.md‎
Lines changed: 0 additions & 8 deletions b/‎docs/api/stac.md‎
Lines changed: 0 additions & 8 deletions
diff --git a/‎docs/notebooks/stac-geoparquet.ipynb‎
Lines changed: 107 additions & 46 deletions b/‎docs/notebooks/stac-geoparquet.ipynb‎
Lines changed: 107 additions & 46 deletions
diff --git a/‎mkdocs.yml‎
Lines changed: 1 addition & 0 deletions b/‎mkdocs.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/rustac/__init__.py‎
Lines changed: 34 additions & 0 deletions b/‎python/rustac/__init__.py‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎python/rustac/rustac.pyi‎
Lines changed: 32 additions & 0 deletions b/‎python/rustac/rustac.pyi‎
Lines changed: 32 additions & 0 deletions
@@ -43,3 +43,8 @@ tracing = "0.1.41"
 
 [build-dependencies]
 cargo-lock = "10"
+
+[patch.crates-io]
+stac = { git = 'https://github.com/stac-utils/rustac.git' }
+stac-io = { git = 'https://github.com/stac-utils/rustac.git' }
+rustac = { git = 'https://github.com/stac-utils/rustac.git' }
@@ -0,0 +1,8 @@
+---
+description: Write stac-geoparquet
+---
+
+# parquet
+
+::: rustac.GeoparquetWriter
+::: rustac.geoparquet_writer
@@ -22,15 +22,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": 1,
    "id": "37025933",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "150.2 kB\n"
+      "74.0 kB\n"
      ]
     }
    ],
@@ -89,7 +89,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 2,
    "id": "164ecaee",
    "metadata": {},
    "outputs": [
@@ -134,6 +134,67 @@
     "print(json.dumps(item_collection[\"features\"][0], indent=2))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "9325e2af",
+   "metadata": {},
+   "source": [
+    "### Writing in chunks\n",
+    "\n",
+    "If you have a lot of items, you might not want to load them all into memory at once.\n",
+    "We provide a context manager for iteratively writing **stac-geoparquet**.\n",
+    "This example is a bit contrived, but you get the idea."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9045b4b4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 499 items\n",
+      "Writing batch of 20 items\n",
+      "Read back 10000 items\n"
+     ]
+    }
+   ],
+   "source": [
+    "import itertools\n",
+    "\n",
+    "iterator = itertools.batched(items, 499)\n",
+    "\n",
+    "with rustac.geoparquet_writer(list(next(iterator)), \"items-batched.parquet\") as writer:\n",
+    "    for item_batch in iterator:\n",
+    "        print(f\"Writing batch of {len(item_batch)} items\")\n",
+    "        writer.write(list(item_batch))\n",
+    "\n",
+    "\n",
+    "item_collection = await rustac.read(\"items-batched.parquet\")\n",
+    "print(\"Read back\", len(item_collection[\"features\"]), \"items\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "2223d4ce",
@@ -162,16 +223,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": 4,
    "id": "870cbebb",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "That took 0.37 seconds to read\n",
-      "That took 1.20 seconds to write\n",
+      "That took 0.08 seconds to read\n",
+      "That took 0.28 seconds to write\n",
       "9999 items have a 'foo' property\n"
      ]
     }
@@ -219,46 +280,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 5,
    "id": "0fabaa18",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "┌───────────┬──────────────────────────┬───────────────────────────┐\n",
-       "│    id     │         datetime         │         geometry          │\n",
-       "│  varchar  │ timestamp with time zone │         geometry          │\n",
-       "├───────────┼──────────────────────────┼───────────────────────────┤\n",
-       "│ item-0    │ 2023-12-31 17:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-1    │ 2023-12-31 18:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-2    │ 2023-12-31 19:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-3    │ 2023-12-31 20:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-4    │ 2023-12-31 21:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-5    │ 2023-12-31 22:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-6    │ 2023-12-31 23:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-7    │ 2024-01-01 00:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-8    │ 2024-01-01 01:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-9    │ 2024-01-01 02:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│   ·       │           ·              │             ·             │\n",
-       "│   ·       │           ·              │             ·             │\n",
-       "│   ·       │           ·              │             ·             │\n",
-       "│ item-9990 │ 2025-02-19 23:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-9991 │ 2025-02-20 00:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-9992 │ 2025-02-20 01:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-9993 │ 2025-02-20 02:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-9994 │ 2025-02-20 03:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-9995 │ 2025-02-20 04:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-9996 │ 2025-02-20 05:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-9997 │ 2025-02-20 06:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-9998 │ 2025-02-20 07:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "│ item-9999 │ 2025-02-20 08:00:00-07   │ POINT (-105.1019 40.1672) │\n",
-       "├───────────┴──────────────────────────┴───────────────────────────┤\n",
-       "│ ? rows (>9999 rows, 20 shown)                          3 columns │\n",
-       "└──────────────────────────────────────────────────────────────────┘"
+       "┌───────────┬──────────────────────────┬────────────────────────────────────────────────────────────────────┐\n",
+       "│    id     │         datetime         │                              geometry                              │\n",
+       "│  varchar  │ timestamp with time zone │                                blob                                │\n",
+       "├───────────┼──────────────────────────┼────────────────────────────────────────────────────────────────────┤\n",
+       "│ item-0    │ 2023-12-31 17:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-1    │ 2023-12-31 18:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-2    │ 2023-12-31 19:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-3    │ 2023-12-31 20:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-4    │ 2023-12-31 21:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-5    │ 2023-12-31 22:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-6    │ 2023-12-31 23:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-7    │ 2024-01-01 00:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-8    │ 2024-01-01 01:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-9    │ 2024-01-01 02:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│   ·       │           ·              │                                 ·                                  │\n",
+       "│   ·       │           ·              │                                 ·                                  │\n",
+       "│   ·       │           ·              │                                 ·                                  │\n",
+       "│ item-9990 │ 2025-02-19 23:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-9991 │ 2025-02-20 00:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-9992 │ 2025-02-20 01:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-9993 │ 2025-02-20 02:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-9994 │ 2025-02-20 03:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-9995 │ 2025-02-20 04:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-9996 │ 2025-02-20 05:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-9997 │ 2025-02-20 06:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-9998 │ 2025-02-20 07:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "│ item-9999 │ 2025-02-20 08:00:00-07   │ \\x01\\x01\\x00\\x00\\x00\\x98\\xDD\\x93\\x87\\x85FZ\\xC0\\x13\\xF2A\\xCFf\\x15D@ │\n",
+       "├───────────┴──────────────────────────┴────────────────────────────────────────────────────────────────────┤\n",
+       "│ ? rows (>9999 rows, 20 shown)                                                                   3 columns │\n",
+       "└───────────────────────────────────────────────────────────────────────────────────────────────────────────┘"
       ]
      },
-     "execution_count": 71,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -282,7 +343,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": 6,
    "id": "c01c0ef5",
    "metadata": {},
    "outputs": [
@@ -297,7 +358,7 @@
        "└──────────────┘"
       ]
      },
-     "execution_count": 72,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -318,7 +379,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 7,
    "id": "18bc3a4b",
    "metadata": {},
    "outputs": [
@@ -327,10 +388,10 @@
      "evalue": "Binder Error: Referenced column \"foo\" not found in FROM clause!\nCandidate bindings: \"bbox\"",
      "output_type": "error",
      "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mBinderException\u001b[0m                           Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[73], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mduckdb\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mselect id, foo from read_parquet([\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mitems.parquet\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m, \u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnew-items.parquet\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m])\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
-      "\u001b[0;31mBinderException\u001b[0m: Binder Error: Referenced column \"foo\" not found in FROM clause!\nCandidate bindings: \"bbox\""
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mBinderException\u001b[39m                           Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mduckdb\u001b[49m\u001b[43m.\u001b[49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mselect id, foo from read_parquet([\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mitems.parquet\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m, \u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mnew-items.parquet\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m])\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
+      "\u001b[31mBinderException\u001b[39m: Binder Error: Referenced column \"foo\" not found in FROM clause!\nCandidate bindings: \"bbox\""
      ]
     }
    ],
@@ -341,7 +402,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "rustac-py",
    "language": "python",
    "name": "python3"
   },
 
@@ -31,6 +31,7 @@ nav:
       - arrow: api/arrow.md
       - duckdb: api/duckdb.md
       - migrate: api/migrate.md
+      - parquet: api/parquet.md
       - read: api/read.md
       - search: api/search.md
       - stac: api/stac.md
 
@@ -1,8 +1,42 @@
 from __future__ import annotations
+from pathlib import Path
+from collections.abc import Generator
+from contextlib import contextmanager
+from typing import Any
 
 from .rustac import *
 from . import store
 
+@contextmanager
+def geoparquet_writer(items: list[dict[str, Any]], path: str, drop_invalid_attributes: bool = True) -> Generator[GeoparquetWriter]:
+    """Open a geoparquet writer in a context manager.
+
+    The items provided to the initial call will be used to build the geoparquet
+    schema. All subsequent items must have the same schema.
+
+    The underlying parquet writer will group batches of items into row groups
+    based upon it's default configuration; the row groups are _not_ determined
+    by the size of the item lists passed to the writer.
+
+    Args:
+        items: The STAC items
+        path: The path for the stac-geoparquet file
+        drop_invalid_attributes: If true, invalid attributes (e.g. an `id` in
+            the `properties` field) will be dropped. If false, raise an error if
+            an invalid attribute is encountered.
+
+    Examples:
+
+        >>> with geoparquet_writer(item_batches[0], "out.parquet") as w:
+        ...     for items in item_batches[1:]:
+        ...         w.write(items)
+        ...
+        >>>
+    """
+    writer = GeoparquetWriter(items, path, drop_invalid_attributes)
+    yield writer
+    writer.finish()
+
 __doc__ = rustac.__doc__
 if hasattr(rustac, "__all__"):
     __all__ = rustac.__all__
 
@@ -11,6 +11,38 @@ from rustac.store import ObjectStore
 
 AnyObjectStore = ObjectStore | ObstoreObjectStore
 
+class GeoparquetWriter:
+    """A helper class to write geoparquet from batches of items."""
+
+    def __init__(
+        self,
+        items: list[dict[str, Any]],
+        path: str,
+        drop_invalid_attributes: bool = True,
+    ) -> None:
+        """Creates a new writer for the provided items and the path.
+
+        Args:
+            items: The STAC items to write to geoparquet. The schema of these
+                items will be used for the output file, and any additional items
+                added to the writer need to have the same schema.
+            path: The filesystem path to write the stac-geoparquet to.
+            drop_invalid_attributes: Whether to drop invalid attributes in the
+                items' `properties` (e.g. an additional `id` property). If false,
+                raise an error instead.
+        """
+
+    def write(self, items: list[dict[str, Any]]) -> None:
+        """Writes more items to the geoparquet.
+
+        Args:
+            items: The items to write. They must have the same schema as the
+                items used to initialize the writer.
+        """
+
+    def finish(self) -> None:
+        """Finishes writing the stac-geoparquet file."""
+
 class RustacError(Exception):
     """A package-specific exception."""