diff --git a/README.md b/README.md index 8833e16..5929c73 100644 --- a/README.md +++ b/README.md @@ -270,3 +270,43 @@ These should match the relevant column type. Mapped variables are substituted in A `variables` map usually isn't needed for simple queries. The basic condition string should automatically get converted to a meaningful type, but when this fails replacing tricky elements with a variable may help. + +### Remote registration [Experimental] + +For **OMERO Plus** installations which support TileDB as the OMERO.tables backend, +it is possible to register tables in-place in a similar manner to in-place image +imports (otherwise table data is stored in the ManagedRepository). + +If you don't know what table backend your OMERO Plus server is using, you +probably don't have this feature available. If you have access to the server +machine you can check by running `omero config get omero.tables.module`; +if the response is `omero_plus.run_tables_pytables_or_tiledb` then TileDB is +available. + +This feature is currently in active development. The current version of +omero2pandas can export tables locally in TileDB format to be registered with +OMERO using external tooling. + + +For this mode to be available, extra dependencies must also be installed as follows: + +```bash +pip install omero2pandas[remote] +``` + +To activate this mode use `omero2pandas.upload_table` with arguments as +follows: + +```python +import omero2pandas +db_path = omero2pandas.upload_table("/path/to/my_data.csv", "Name for table", + local_path="/path/to/mytable.tiledb") +# Returns the path to the created TileDB file +``` + +Similar to regular table uploads, the input can be a dataframe in memory or a +CSV file on disk. + +A `remote_path` argument is also available. In future versions this will be +used if the remote table path is different from the server's point of view (e.g. +network drives are mapped at another location). 
\ No newline at end of file diff --git a/omero2pandas/__init__.py b/omero2pandas/__init__.py index e9bd95a..bf71db7 100644 --- a/omero2pandas/__init__.py +++ b/omero2pandas/__init__.py @@ -7,6 +7,7 @@ # If the file is missing please request a copy by contacting # support@glencoesoftware.com. import collections +from importlib.util import find_spec import logging import os import sys @@ -19,6 +20,10 @@ from omero2pandas.connect import OMEROConnection from omero2pandas.upload import create_table +if find_spec("tiledb"): + from omero2pandas.remote import register_table +else: + register_table = None logging.basicConfig( format="%(asctime)s %(levelname)-7s [%(name)16s] %(message)s", @@ -185,7 +190,8 @@ def read_table(file_id=None, annotation_id=None, column_names=(), rows=None, def upload_table(source, table_name, parent_id=None, parent_type='Image', links=None, chunk_size=None, omero_connector=None, - server=None, port=4064, username=None, password=None): + server=None, port=4064, username=None, password=None, + local_path=None, remote_path=None): """ Upload a pandas dataframe to a new OMERO table. 
For the connection, supply either an active client object or server @@ -205,6 +211,10 @@ def upload_table(source, table_name, parent_id=None, parent_type='Image', :param server: Address of the server :param port: Port the server runs on (default 4064) :param username: Username for server login + :param local_path: [TileDB only], construct table at this file path and + register remotely + :param remote_path: [TileDB only], mapping for local_path on the server + (if different from local system) :param password: Password for server login :return: File Annotation ID of the new table """ @@ -220,7 +230,7 @@ def upload_table(source, table_name, parent_id=None, parent_type='Image', if parent_id is not None: if (parent_type, parent_id) not in links: links.append((parent_type, parent_id)) - if not links: + if not links and not local_path: raise ValueError("No OMERO objects to link the table to") elif not isinstance(links, Iterable): raise ValueError(f"Links should be an iterable list of " @@ -229,7 +239,14 @@ def upload_table(source, table_name, parent_id=None, parent_type='Image', port=port, client=omero_connector) as connector: conn = connector.get_gateway() conn.SERVICE_OPTS.setOmeroGroup('-1') - ann_id = create_table(source, table_name, links, conn, chunk_size) + if local_path or remote_path: + if not register_table: + raise ValueError("Remote table support is not installed") + ann_id = register_table(source, local_path, + remote_path=remote_path, + chunk_size=chunk_size) + else: + ann_id = create_table(source, table_name, links, conn, chunk_size) if ann_id is None: LOGGER.warning("Failed to create OMERO table") return ann_id diff --git a/omero2pandas/remote.py b/omero2pandas/remote.py new file mode 100644 index 0000000..637073d --- /dev/null +++ b/omero2pandas/remote.py @@ -0,0 +1,65 @@ +# encoding: utf-8 +# +# Copyright (c) Glencoe Software, Inc. All rights reserved. 
+# +# This software is distributed under the terms described by the LICENCE file +# you can find at the root of the distribution bundle. +# If the file is missing please request a copy by contacting +# support@glencoesoftware.com. +import logging +from pathlib import Path, PurePosixPath +import time + +import pandas as pd +import tiledb +from tqdm.auto import tqdm + +LOGGER = logging.getLogger(__name__) + +OMERO_TILEDB_VERSION = '3' # Version of the omero table implementation + + +def register_table(source, local_path, remote_path=None, chunk_size=1000): + LOGGER.info("Registering remote table") + # Default filters from tiledb.from_pandas() + write_path = Path(local_path or remote_path).with_suffix(".tiledb") + # Assume the server will be running on Linux + remote_path = PurePosixPath( + remote_path or local_path).with_suffix(".tiledb") + LOGGER.debug(f"Remote path would be {str(remote_path)}") + if write_path.exists(): + raise ValueError(f"Table file {write_path} already exists") + # path.as_uri() exists but mangles any spaces in the path! 
+ write_path = str(write_path) + # Use a default chunk size if not set + chunk_size = chunk_size or 1000 + LOGGER.info("Writing data to TileDB") + # Export table + if isinstance(source, (str, Path)): + data_iterator = pd.read_csv(source, chunksize=chunk_size) + total_rows = None + else: + data_iterator = (source.iloc[i:i + chunk_size] + for i in range(0, len(source), chunk_size)) + total_rows = len(source) + progress_monitor = tqdm( + desc="Generating TileDB file...", initial=1, dynamic_ncols=True, + total=total_rows, + bar_format='{desc}: {percentage:3.0f}%|{bar}| ' + '{n_fmt}/{total_fmt} rows, {elapsed} {postfix}') + row_idx = 0 + for chunk in data_iterator: + tiledb.from_pandas(write_path, chunk, sparse=True, full_domain=True, + tile=10000, attr_filters=None, + row_start_idx=row_idx, allows_duplicates=False, + mode="append" if row_idx else "ingest") + progress_monitor.update(len(chunk)) + row_idx += len(chunk) + progress_monitor.close() + LOGGER.debug("Appending metadata to TileDB") + # Append omero metadata + with tiledb.open(write_path, mode="w") as array: + array.meta['__version'] = OMERO_TILEDB_VERSION + array.meta['__initialized'] = time.time() + LOGGER.info("Table saved successfully") + return write_path diff --git a/omero2pandas/upload.py b/omero2pandas/upload.py index 579a2ee..bb9b31c 100644 --- a/omero2pandas/upload.py +++ b/omero2pandas/upload.py @@ -1,6 +1,6 @@ # encoding: utf-8 # -# Copyright (c) 2023 Glencoe Software, Inc. All rights reserved. +# Copyright (c) Glencoe Software, Inc. All rights reserved. # # This software is distributed under the terms described by the LICENCE file # you can find at the root of the distribution bundle. 
@@ -9,6 +9,7 @@ import logging import math import os +from pathlib import Path import omero import omero.grid @@ -170,7 +171,7 @@ def create_table(source, table_name, links, conn, chunk_size): bar_format='{desc}: {percentage:3.0f}%|{bar}| ' '{n_fmt}/{total_fmt} rows, {elapsed} {postfix}') - if isinstance(source, str): + if isinstance(source, (str, Path)): assert os.path.exists(source), f"Could not find file {source}" columns, str_cols, total_rows, chunk_size = generate_omero_columns_csv( source, chunk_size) diff --git a/pyproject.toml b/pyproject.toml index e550e56..6e8b39d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,12 +8,6 @@ name = "omero2pandas" description = "OMERO.tables to pandas bridge" readme = "README.md" license = {file = "LICENSE.txt"} -dependencies = [ - 'omero-py>=5.19.5', - 'pandas>2', - 'tqdm', -] -requires-python = ">=3.9" authors = [ {name = "Glencoe Software, Inc.", email="info@glencoesoftware.com"}, ] @@ -25,10 +19,19 @@ classifiers = [ 'Intended Audience :: End Users/Desktop', 'Programming Language :: Python :: 3', ] - +requires-python = ">=3.9" +dependencies = [ + 'omero-py>=5.19.5', + 'pandas>2', + 'tqdm', +] [project.optional-dependencies] token = ["omero-user-token>=0.3.0"] +remote = [ + "pyarrow>=19.0.0", + "tiledb>=0.33.2", +] [project.urls] github = "https://github.com/glencoesoftware/omero2pandas"