Skip to content

Add initial remote table registration implementation #25

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,3 +270,43 @@ These should match the relevant column type. Mapped variables are substituted in

A `variables` map usually isn't needed for simple queries. The basic condition string should automatically get converted to a meaningful type, but when this fails,
replacing tricky elements with a variable may help.

### Remote registration [Experimental]

For **OMERO Plus** installations which support TileDB as the OMERO.tables backend
it is possible to register tables in-place in a similar manner to in-place image
imports (otherwise table data is stored in the ManagedRepository).

If you don't know what table backend your OMERO Plus server is using, you
probably don't have this feature available. If you have access to the server
machine you can check by running `omero config get omero.tables.module`.
If the response is `omero_plus.run_tables_pytables_or_tiledb` then TileDB is
available.

This feature is currently in active development. The current version of
omero2pandas can export tables locally in TileDB format to be registered with
OMERO using external tooling.


For this mode to be available, extra dependencies must also be installed as follows:

```bash
pip install omero2pandas[remote]
```

To activate this mode use `omero2pandas.upload_table` with arguments as
follows:

```python
import omero2pandas
db_path = omero2pandas.upload_table("/path/to/my_data.csv", "Name for table",
local_path="/path/to/mytable.tiledb")
# Returns the path to the created tiledb file
```

Similar to regular table uploads, the input can be a dataframe in memory or a
csv file on disk.

A `remote_path` argument is also available. In future versions this will be
used when the table's path as seen by the server differs from the local path
(e.g. network drives are mapped at another location).
23 changes: 20 additions & 3 deletions omero2pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# If the file is missing please request a copy by contacting
# support@glencoesoftware.com.
import collections
from importlib.util import find_spec
import logging
import os
import sys
Expand All @@ -19,6 +20,10 @@

from omero2pandas.connect import OMEROConnection
from omero2pandas.upload import create_table
if find_spec("tiledb"):
from omero2pandas.remote import register_table
else:
register_table = None

logging.basicConfig(
format="%(asctime)s %(levelname)-7s [%(name)16s] %(message)s",
Expand Down Expand Up @@ -185,7 +190,8 @@ def read_table(file_id=None, annotation_id=None, column_names=(), rows=None,

def upload_table(source, table_name, parent_id=None, parent_type='Image',
links=None, chunk_size=None, omero_connector=None,
server=None, port=4064, username=None, password=None):
server=None, port=4064, username=None, password=None,
local_path=None, remote_path=None):
"""
Upload a pandas dataframe to a new OMERO table.
For the connection, supply either an active client object or server
Expand All @@ -205,6 +211,10 @@ def upload_table(source, table_name, parent_id=None, parent_type='Image',
:param server: Address of the server
:param port: Port the server runs on (default 4064)
:param username: Username for server login
:param local_path: [TileDB only], construct table at this file path and
register remotely
:param remote_path: [TileDB only], mapping for local_path on the server
(if different from local system)
:param password: Password for server login
:return: File Annotation ID of the new table
"""
Expand All @@ -220,7 +230,7 @@ def upload_table(source, table_name, parent_id=None, parent_type='Image',
if parent_id is not None:
if (parent_type, parent_id) not in links:
links.append((parent_type, parent_id))
if not links:
if not links and not local_path:
raise ValueError("No OMERO objects to link the table to")
elif not isinstance(links, Iterable):
raise ValueError(f"Links should be an iterable list of "
Expand All @@ -229,7 +239,14 @@ def upload_table(source, table_name, parent_id=None, parent_type='Image',
port=port, client=omero_connector) as connector:
conn = connector.get_gateway()
conn.SERVICE_OPTS.setOmeroGroup('-1')
ann_id = create_table(source, table_name, links, conn, chunk_size)
if local_path or remote_path:
if not register_table:
raise ValueError("Remote table support is not installed")
ann_id = register_table(source, local_path,
remote_path=remote_path,
chunk_size=chunk_size)
else:
ann_id = create_table(source, table_name, links, conn, chunk_size)
if ann_id is None:
LOGGER.warning("Failed to create OMERO table")
return ann_id
Expand Down
65 changes: 65 additions & 0 deletions omero2pandas/remote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# encoding: utf-8
#
# Copyright (c) Glencoe Software, Inc. All rights reserved.
#
# This software is distributed under the terms described by the LICENCE file
# you can find at the root of the distribution bundle.
# If the file is missing please request a copy by contacting
# support@glencoesoftware.com.
import logging
from pathlib import Path, PurePosixPath
import time

import pandas as pd
import tiledb
from tqdm.auto import tqdm

LOGGER = logging.getLogger(__name__)

OMERO_TILEDB_VERSION = '3' # Version of the omero table implementation


def register_table(source, local_path, remote_path=None, chunk_size=1000):
LOGGER.info("Registering remote table")
# Default filters from tiledb.from_pandas()
write_path = Path(local_path or remote_path).with_suffix(".tiledb")
# Assume the server will be running on Linux
remote_path = PurePosixPath(
remote_path or local_path).with_suffix(".tiledb")
LOGGER.debug(f"Remote path would be {str(remote_path)}")
if write_path.exists():
raise ValueError(f"Table file {write_path} already exists")
# path.as_uri() exists but mangles any spaces in the path!
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what this comment means. Is it an explanation of why you use str(write_path)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. The first argument of tiledb.from_pandas is formally called uri, but it doesn't seem to handle escaped special characters like spaces. I wanted to head off people asking "why don't you just use the pathlib as_uri method?"

write_path = str(write_path)
# Use a default chunk size if not set
chunk_size = chunk_size or 1000
Comment on lines +34 to +35
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should probably just use a default value in the function signature instead.

Copy link
Member Author

@DavidStirling DavidStirling Jan 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I'd normally do this. However, in this context we may receive chunk_size=None from the higher level omero2pandas.upload_table function. This is used to indicate that the chunk size should be calculated automatically. With local TileDB we don't need to worry about Ice message size limits, so we don't run the pre-scan that figures out chunk size like we do with normal table uploads; this line simply provides a fallback default.

Nonetheless I'll add some defaults to this function's signature so that people using it in isolation have an easier time.

LOGGER.info("Writing data to TileDB")
# Export table
if isinstance(source, (str, Path)):
data_iterator = pd.read_csv(source, chunksize=chunk_size)
total_rows = None
else:
data_iterator = (source.iloc[i:i + chunk_size]
for i in range(0, len(source), chunk_size))
total_rows = len(source)
progress_monitor = tqdm(
desc="Generating TileDB file...", initial=1, dynamic_ncols=True,
total=total_rows,
bar_format='{desc}: {percentage:3.0f}%|{bar}| '
'{n_fmt}/{total_fmt} rows, {elapsed} {postfix}')
row_idx = 0
for chunk in data_iterator:
tiledb.from_pandas(write_path, chunk, sparse=True, full_domain=True,
tile=10000, attr_filters=None,
row_start_idx=row_idx, allows_duplicates=False,
mode="append" if row_idx else "ingest")
progress_monitor.update(len(chunk))
row_idx += len(chunk)
progress_monitor.close()
LOGGER.debug("Appending metadata to TileDB")
# Append omero metadata
with tiledb.open(write_path, mode="w") as array:
array.meta['__version'] = OMERO_TILEDB_VERSION
array.meta['__initialized'] = time.time()
LOGGER.info("Table saved successfully")
return write_path
5 changes: 3 additions & 2 deletions omero2pandas/upload.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# encoding: utf-8
#
# Copyright (c) 2023 Glencoe Software, Inc. All rights reserved.
# Copyright (c) Glencoe Software, Inc. All rights reserved.
#
# This software is distributed under the terms described by the LICENCE file
# you can find at the root of the distribution bundle.
Expand All @@ -9,6 +9,7 @@
import logging
import math
import os
from pathlib import Path

import omero
import omero.grid
Expand Down Expand Up @@ -170,7 +171,7 @@ def create_table(source, table_name, links, conn, chunk_size):
bar_format='{desc}: {percentage:3.0f}%|{bar}| '
'{n_fmt}/{total_fmt} rows, {elapsed} {postfix}')

if isinstance(source, str):
if isinstance(source, (str, Path)):
assert os.path.exists(source), f"Could not find file {source}"
columns, str_cols, total_rows, chunk_size = generate_omero_columns_csv(
source, chunk_size)
Expand Down
17 changes: 10 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,6 @@ name = "omero2pandas"
description = "OMERO.tables to pandas bridge"
readme = "README.md"
license = {file = "LICENSE.txt"}
dependencies = [
'omero-py>=5.19.5',
'pandas>2',
'tqdm',
]
requires-python = ">=3.9"
authors = [
{name = "Glencoe Software, Inc.", email="info@glencoesoftware.com"},
]
Expand All @@ -25,10 +19,19 @@ classifiers = [
'Intended Audience :: End Users/Desktop',
'Programming Language :: Python :: 3',
]

requires-python = ">=3.9"
dependencies = [
'omero-py>=5.19.5',
'pandas>2',
'tqdm',
]

[project.optional-dependencies]
token = ["omero-user-token>=0.3.0"]
remote = [
"pyarrow>=19.0.0",
"tiledb>=0.33.2",
]

[project.urls]
github = "https://github.com/glencoesoftware/omero2pandas"
Expand Down