-
Notifications
You must be signed in to change notification settings - Fork 4
Add initial remote table registration implementation #25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
66a9f85
cc19a77
e142dc5
3ae1482
9399e67
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# encoding: utf-8 | ||
# | ||
# Copyright (c) Glencoe Software, Inc. All rights reserved. | ||
# | ||
# This software is distributed under the terms described by the LICENCE file | ||
# you can find at the root of the distribution bundle. | ||
# If the file is missing please request a copy by contacting | ||
# support@glencoesoftware.com. | ||
import logging | ||
from pathlib import Path, PurePosixPath | ||
import time | ||
|
||
import pandas as pd | ||
import tiledb | ||
from tqdm.auto import tqdm | ||
|
||
LOGGER = logging.getLogger(__name__) | ||
|
||
OMERO_TILEDB_VERSION = '3' # Version of the omero table implementation | ||
|
||
|
||
def register_table(source, local_path, remote_path=None, chunk_size=1000):
    """Write tabular data to a new TileDB array, one chunk at a time.

    :param source: Either a path (``str`` or ``pathlib.Path``) to a CSV file,
        which is read lazily with ``pandas.read_csv``, or an in-memory table
        supporting ``len()`` and ``.iloc`` slicing (presumably a
        ``pandas.DataFrame`` - confirm against callers).
    :param local_path: Location the array is written to. Its suffix is
        replaced with ``.tiledb``. If falsy, ``remote_path`` is used instead.
    :param remote_path: Location of the array as seen by the server. Only
        used for a debug log message here. Defaults to ``local_path``.
    :param chunk_size: Number of rows written per batch. Falsy values
        (e.g. ``None``) fall back to 1000.
    :return: The path of the written array, as a string.
    :raises ValueError: If the target path already exists, or if ``source``
        yields no rows (in which case no array is created).
    """
    LOGGER.info("Registering remote table")
    write_path = Path(local_path or remote_path).with_suffix(".tiledb")
    # Assume the server will be running on Linux
    remote_path = PurePosixPath(
        remote_path or local_path).with_suffix(".tiledb")
    LOGGER.debug("Remote path would be %s", remote_path)
    if write_path.exists():
        raise ValueError(f"Table file {write_path} already exists")
    # Hand TileDB a plain string path: path.as_uri() exists but mangles any
    # spaces in the path!
    write_path = str(write_path)
    # Callers may explicitly pass a falsy chunk size (e.g. None), so the
    # default in the signature alone is not enough
    chunk_size = chunk_size or 1000
    LOGGER.info("Writing data to TileDB")
    # Export table
    if isinstance(source, (str, Path)):
        data_iterator = pd.read_csv(source, chunksize=chunk_size)
        # Row count is unknown until the whole CSV has been read
        total_rows = None
    else:
        data_iterator = (source.iloc[i:i + chunk_size]
                         for i in range(0, len(source), chunk_size))
        total_rows = len(source)
    row_idx = 0
    # The context manager closes the progress bar even if a write fails.
    # The counter starts at 0 so that it finishes on exactly total_rows.
    with tqdm(
            desc="Generating TileDB file...", dynamic_ncols=True,
            total=total_rows,
            bar_format='{desc}: {percentage:3.0f}%|{bar}| '
                       '{n_fmt}/{total_fmt} rows, {elapsed} {postfix}'
    ) as progress_monitor:
        for chunk in data_iterator:
            # attr_filters=None: default filters from tiledb.from_pandas().
            # The first chunk creates the array, later chunks append to it.
            tiledb.from_pandas(
                write_path, chunk, sparse=True, full_domain=True,
                tile=10000, attr_filters=None,
                row_start_idx=row_idx, allows_duplicates=False,
                mode="append" if row_idx else "ingest")
            progress_monitor.update(len(chunk))
            row_idx += len(chunk)
    if not row_idx:
        # Nothing was written, so there is no array to attach metadata to
        raise ValueError(
            f"Source contains no rows, table file {write_path} "
            f"was not created")
    LOGGER.debug("Appending metadata to TileDB")
    # Append omero metadata
    with tiledb.open(write_path, mode="w") as array:
        array.meta['__version'] = OMERO_TILEDB_VERSION
        array.meta['__initialized'] = time.time()
    LOGGER.info("Table saved successfully")
    return write_path
Uh oh!
There was an error while loading. Please reload this page.