Skip to content

Commit 60d4770

Browse files
Add more logging for variant chunk memory
This is likely as good as we can do here; we just have to document the problem. Closes #141
1 parent 4a8e0ff commit 60d4770

File tree

2 files changed

+13
-11
lines changed

2 files changed

+13
-11
lines changed

bio2zarr/core.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ def flush(self):
128128
sync_flush_2d_array(
129129
self.buff[: self.buffer_row], self.array, self.array_offset
130130
)
131-
# FIXME the array.name doesn't seem to be working here for some reason
132131
logger.debug(
133132
f"Flushed <{self.array.name} {self.array.shape} "
134133
f"{self.array.dtype}> "

bio2zarr/vcf.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1392,11 +1392,15 @@ def variant_chunk_nbytes(self):
13921392
"""
13931393
Returns the nbytes for a single variant chunk of this array.
13941394
"""
1395-
# TODO WARNING IF this is a string
13961395
chunk_items = self.chunks[0]
13971396
for size in self.shape[1:]:
13981397
chunk_items *= size
13991398
dt = np.dtype(self.dtype)
1399+
if dt.kind == "O":
1400+
logger.warning(
1401+
f"Field {self.name} is a string; max memory usage may "
1402+
"be a significant underestimate"
1403+
)
14001404
return chunk_items * dt.itemsize
14011405

14021406

@@ -1890,13 +1894,15 @@ def encode_partition(self, partition_index):
18901894
os.rename(partition_path, final_path)
18911895

18921896
def init_partition_array(self, partition_index, name):
1893-
wip_path = self.wip_partition_array_path(partition_index, name)
18941897
# Create an empty array like the definition
18951898
src = self.arrays_path / name
18961899
# Overwrite any existing WIP files
1900+
wip_path = self.wip_partition_array_path(partition_index, name)
18971901
shutil.copytree(src, wip_path, dirs_exist_ok=True)
1898-
array = zarr.open(wip_path)
1899-
logger.debug(f"Opened empty array {array} @ {wip_path}")
1902+
store = zarr.DirectoryStore(self.wip_partition_path(partition_index))
1903+
wip_root = zarr.group(store=store)
1904+
array = wip_root[name]
1905+
logger.debug(f"Opened empty array {array.name} <{array.dtype}> @ {wip_path}")
19001906
return array
19011907

19021908
def finalise_partition_array(self, partition_index, name):
@@ -2109,12 +2115,9 @@ def get_max_encoding_memory(self):
21092115
"""
21102116
Return the approximate maximum memory used to encode a variant chunk.
21112117
"""
2112-
# NOTE This size number is also not quite enough, you need a bit of
2113-
# headroom with it (probably 10% or so). We should include this.
2114-
# FIXME this is actively wrong for String columns. See if we can do better.
2115-
max_encoding_mem = max(
2116-
col.variant_chunk_nbytes for col in self.schema.fields.values()
2117-
)
2118+
max_encoding_mem = 0
2119+
for col in self.schema.fields.values():
2120+
max_encoding_mem = max(max_encoding_mem, col.variant_chunk_nbytes)
21182121
gt_mem = 0
21192122
if "call_genotype" in self.schema.fields:
21202123
encoded_together = [

0 commit comments

Comments
 (0)