-
Notifications
You must be signed in to change notification settings - Fork 4
Add initial remote table registration implementation #25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
66a9f85
cc19a77
e142dc5
3ae1482
9399e67
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# encoding: utf-8 | ||
# | ||
# Copyright (c) Glencoe Software, Inc. All rights reserved. | ||
# | ||
# This software is distributed under the terms described by the LICENCE file | ||
# you can find at the root of the distribution bundle. | ||
# If the file is missing please request a copy by contacting | ||
# support@glencoesoftware.com. | ||
import logging | ||
from pathlib import Path, PurePosixPath | ||
import time | ||
|
||
import pandas as pd | ||
import tiledb | ||
from tqdm.auto import tqdm | ||
|
||
LOGGER = logging.getLogger(__name__) | ||
|
||
OMERO_TILEDB_VERSION = '3' # Version of the omero table implementation | ||
|
||
|
||
def register_table(source, local_path, remote_path=None, chunk_size=1000): | ||
LOGGER.info("Registering remote table") | ||
# Default filters from tiledb.from_pandas() | ||
write_path = Path(local_path or remote_path).with_suffix(".tiledb") | ||
# Assume the server will be running on Linux | ||
remote_path = PurePosixPath( | ||
remote_path or local_path).with_suffix(".tiledb") | ||
LOGGER.debug(f"Remote path would be {str(remote_path)}") | ||
if write_path.exists(): | ||
raise ValueError(f"Table file {write_path} already exists") | ||
# path.as_uri() exists but mangles any spaces in the path! | ||
write_path = str(write_path) | ||
# Use a default chunk size if not set | ||
chunk_size = chunk_size or 1000 | ||
Comment on lines +34 to +35
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should probably just use a default value in the function signature instead. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I'd normally do this. However, in this context we may receive `None`. Nonetheless, I'll add some defaults to this function's signature so that people using it in isolation have an easier time. |
||
LOGGER.info("Writing data to TileDB") | ||
# Export table | ||
if isinstance(source, (str, Path)): | ||
data_iterator = pd.read_csv(source, chunksize=chunk_size) | ||
total_rows = None | ||
else: | ||
data_iterator = (source.iloc[i:i + chunk_size] | ||
for i in range(0, len(source), chunk_size)) | ||
total_rows = len(source) | ||
progress_monitor = tqdm( | ||
desc="Generating TileDB file...", initial=1, dynamic_ncols=True, | ||
total=total_rows, | ||
bar_format='{desc}: {percentage:3.0f}%|{bar}| ' | ||
'{n_fmt}/{total_fmt} rows, {elapsed} {postfix}') | ||
row_idx = 0 | ||
for chunk in data_iterator: | ||
tiledb.from_pandas(write_path, chunk, sparse=True, full_domain=True, | ||
tile=10000, attr_filters=None, | ||
row_start_idx=row_idx, allows_duplicates=False, | ||
mode="append" if row_idx else "ingest") | ||
progress_monitor.update(len(chunk)) | ||
row_idx += len(chunk) | ||
progress_monitor.close() | ||
LOGGER.debug("Appending metadata to TileDB") | ||
# Append omero metadata | ||
with tiledb.open(write_path, mode="w") as array: | ||
array.meta['__version'] = OMERO_TILEDB_VERSION | ||
array.meta['__initialized'] = time.time() | ||
LOGGER.info("Table saved successfully") | ||
return write_path |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure what this comment means. Is it an explanation of why you use `str(write_path)`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. The first argument of `tiledb.from_pandas` is formally called `uri`, but it doesn't seem to handle escaped special characters like spaces. I wanted to head off people asking "why don't you just use the Pathlib `to_uri` method?"