@@ -1392,11 +1392,15 @@ def variant_chunk_nbytes(self):
1392
1392
"""
1393
1393
Returns the nbytes for a single variant chunk of this array.
1394
1394
"""
1395
- # TODO WARNING IF this is a string
1396
1395
chunk_items = self .chunks [0 ]
1397
1396
for size in self .shape [1 :]:
1398
1397
chunk_items *= size
1399
1398
dt = np .dtype (self .dtype )
1399
+ if dt .kind == "O" :
1400
+ logger .warning (
1401
+ f"Field { self .name } is a string; max memory usage may "
1402
+ "be a significant underestimate"
1403
+ )
1400
1404
return chunk_items * dt .itemsize
1401
1405
1402
1406
@@ -1890,13 +1894,15 @@ def encode_partition(self, partition_index):
1890
1894
os .rename (partition_path , final_path )
1891
1895
1892
1896
def init_partition_array (self , partition_index , name ):
1893
- wip_path = self .wip_partition_array_path (partition_index , name )
1894
1897
# Create an empty array like the definition
1895
1898
src = self .arrays_path / name
1896
1899
# Overwrite any existing WIP files
1900
+ wip_path = self .wip_partition_array_path (partition_index , name )
1897
1901
shutil .copytree (src , wip_path , dirs_exist_ok = True )
1898
- array = zarr .open (wip_path )
1899
- logger .debug (f"Opened empty array { array } @ { wip_path } " )
1902
+ store = zarr .DirectoryStore (self .wip_partition_path (partition_index ))
1903
+ wip_root = zarr .group (store = store )
1904
+ array = wip_root [name ]
1905
+ logger .debug (f"Opened empty array { array .name } <{ array .dtype } > @ { wip_path } " )
1900
1906
return array
1901
1907
1902
1908
def finalise_partition_array (self , partition_index , name ):
@@ -2109,12 +2115,9 @@ def get_max_encoding_memory(self):
2109
2115
"""
2110
2116
Return the approximate maximum memory used to encode a variant chunk.
2111
2117
"""
2112
- # NOTE This size number is also not quite enough, you need a bit of
2113
- # headroom with it (probably 10% or so). We should include this.
2114
- # FIXME this is actively wrong for String columns. See if we can do better.
2115
- max_encoding_mem = max (
2116
- col .variant_chunk_nbytes for col in self .schema .fields .values ()
2117
- )
2118
+ max_encoding_mem = 0
2119
+ for col in self .schema .fields .values ():
2120
+ max_encoding_mem = max (max_encoding_mem , col .variant_chunk_nbytes )
2118
2121
gt_mem = 0
2119
2122
if "call_genotype" in self .schema .fields :
2120
2123
encoded_together = [
0 commit comments