Roughly working distributed encode

jeromekelleher · jeromekelleher · commit a616dcf67702 · 2024-04-22T23:46:39.000+01:00
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -5,6 +5,7 @@
 import json
 import logging
 import math
+import os
 import pathlib
 import pickle
 import shutil
@@ -1592,7 +1593,8 @@ def fromdict(d):
                 f"{d['format_version']} != {VZW_METADATA_FORMAT_VERSION}"
             )
         ret = VcfZarrWriterMetadata(**d)
-        ret.schema = VcfZarrSchema(**ret.schema)
+        ret.schema = VcfZarrSchema.fromdict(ret.schema)
+        ret.partitions = [VcfZarrPartition(**p) for p in ret.partitions]
         return ret
 
 
@@ -1751,35 +1753,28 @@ def load_metadata(self):
 
     def encode_partition(self, partition, *, show_progress=False, worker_processes=1):
         self.load_metadata()
-        partition_path = self.partitions_path / f"wip_{partition}"
-        # If the partition path exists already, keep going. Let's assume
-        # that it's an earlier failed attempt. Not worrying about having
-        # concurrent encode_partition runs happening for now, but I guess
-        # we could make the partition_path depend on a uuid or something
-        # to make sure it's unique.
-        logger.debug(f"Copying empty arrays in {self.arrays_path} to {partition_path}")
-        shutil.copytree(self.arrays_path, partition_path, dirs_exist_ok=True)
-        # self.root = partition_path
-        # self.encode_partition_slice([partition], show_progress=show_progress,
-        #         worker_processes=worker_processes)
-        #
-        # NOTE not sure what to do here, started making some changes and
-        # got too tired.
-
-    # def get_array(self, name):
-    #     return self.root[name]
-
-    #     def finalise_array(self, variable_name):
-    #         source = self.path / ("wip_" + variable_name)
-    #         dest = self.path / variable_name
-    #         # Atomic swap
-    #         os.rename(source, dest)
-    #         logger.info(f"Finalised {variable_name}")
-
-    def encode_array_partition(self, column, partition):
+        partition_path = self.partitions_path / f"{partition}"
+        partition_path.mkdir(exist_ok=True)
+        logger.debug(f"Creating partition dir {partition_path}")
+
+        for col in self.metadata.schema.columns.values():
+            if col.vcf_field is not None:
+                self.encode_array_partition(col, partition, partition_path)
+
+    def encode_array_partition(self, column, partition_index, partition_path):
+        wip_path = partition_path / f"wip_{column.name}"
+        final_path = partition_path / column.name
+        # Copy the empty array definition into this partition's wip dir
+        src = self.arrays_path / column.name
+        # TODO add overwrite here
+        shutil.copytree(src, wip_path)
+
+        array = zarr.open(wip_path)
+        logger.debug(f"Opened empty array {array}")
+
+        partition = self.metadata.partitions[partition_index]
+        ba = core.BufferedArray(array, partition.start_index)
         source_col = self.icf.columns[column.vcf_field]
-        array = self.get_array(column.name)
-        ba = core.BufferedArray(array, start)
         sanitiser = source_col.sanitiser_factory(ba.buff.shape)
 
         for value in source_col.iter_values(
@@ -1790,11 +1785,41 @@ def encode_array_partition(self, column, partition):
             j = ba.next_buffer_row()
             sanitiser(ba.buff, j, value)
         ba.flush()
+
+        # Atomic swap
+        os.rename(wip_path, final_path)
         logger.debug(
-            f"Encoded {column.name} chunk slice "
+            f"Encoded {column.name} partition "
             f"{partition.start_chunk}:{partition.stop_chunk}"
         )
 
+    #######################
+    # finalise
+    #######################
+
+    def finalise_array(self, name):
+        logger.debug(f"Finalising {name}")
+        for partition in range(len(self.metadata.partitions)):
+            # Move all the files in partition dir to dest dir
+
+            partition_path = self.partitions_path / f"{partition}"
+            src = partition_path / name
+            dest = self.arrays_path / name
+            for chunk_file in list(src.iterdir()):
+                if not chunk_file.name.startswith("."):
+                    # TODO check for a count of the number of files
+                    os.rename(chunk_file, dest / chunk_file.name)
+        # Finally, once all the chunks have moved into the arrays dir,
+        # we move it out of wip
+        os.rename(self.arrays_path / name, self.path / name)
+
+    def finalise(self):
+        self.load_metadata()
+
+        for col in self.metadata.schema.columns.values():
+            if col.vcf_field is not None:
+                self.finalise_array(col.name)
+
 
 #     def encode_genotypes_slice(self, start, stop):
 #         source_col = self.icf.columns["FORMAT/GT"]