Some non-working thinking through stuff changes

jeromekelleher · jeromekelleher · commit 24b171144ca4 · 2024-04-21T22:30:44.000+01:00
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -1752,37 +1752,49 @@ def load_metadata(self):
     def encode_partition(self, partition, *, show_progress=False, worker_processes=1):
         self.load_metadata()
         partition_path = self.partitions_path / f"wip_{partition}"
-        partition_path.mkdir()
-        # Now - copy the arrays from arrays_path to partition_path somehow (shutil?)
-        # and then set self.root to that parition_path.
-        # Then we should be able to encode just the slice for this partition
-        # into those arrays. Once that's fully completed, we rename the
-        # partition_path to just {partition}
-
-
-#     def get_array(self, name):
-#         return self.root["wip_" + name]
-
-#     def finalise_array(self, variable_name):
-#         source = self.path / ("wip_" + variable_name)
-#         dest = self.path / variable_name
-#         # Atomic swap
-#         os.rename(source, dest)
-#         logger.info(f"Finalised {variable_name}")
-
-#     def encode_array_slice(self, column, start, stop):
-#         source_col = self.icf.columns[column.vcf_field]
-#         array = self.get_array(column.name)
-#         ba = core.BufferedArray(array, start)
-#         sanitiser = source_col.sanitiser_factory(ba.buff.shape)
+        # If the partition path exists already, keep going. Let's assume
+        # that it's an earlier failed attempt. Not worrying about having
+        # concurrent encode_partition runs happening for now, but I guess
+        # we could make the partition_path depend on a uuid or something
+        # to make sure it's unique.
+        logger.debug(f"Copying empty arrays in {self.arrays_path} to {partition_path}")
+        shutil.copytree(self.arrays_path, partition_path, dirs_exist_ok=True)
+        # self.root = partition_path
+        # self.encode_partition_slice([partition], show_progress=show_progress,
+        #         worker_processes=worker_processes)
+        #
+        # NOTE not sure what to do here, started making some changes and
+        # got too tired.
+
+    # def get_array(self, name):
+    #     return self.root[name]
+
+    #     def finalise_array(self, variable_name):
+    #         source = self.path / ("wip_" + variable_name)
+    #         dest = self.path / variable_name
+    #         # Atomic swap
+    #         os.rename(source, dest)
+    #         logger.info(f"Finalised {variable_name}")
+
+    def encode_array_partition(self, column, partition):
+        """
+        Encode the values of ``column`` that belong to ``partition`` into
+        the array returned by ``self.get_array(column.name)``.
+
+        Values are read from the ICF column ``column.vcf_field`` using
+        ``partition.start_index`` and ``partition.stop_index``, written one
+        row at a time into a ``core.BufferedArray`` and flushed at the end.
+        """
+        # NOTE(review): get_array only appears as commented-out code in the
+        # visible part of this class - confirm it is defined before use.
+        source_col = self.icf.columns[column.vcf_field]
+        array = self.get_array(column.name)
+        # The bare name ``start`` is not defined in this method. Use the same
+        # index that iter_values starts reading from below (presumably the
+        # second argument is the row offset at which buffered writes begin).
+        ba = core.BufferedArray(array, partition.start_index)
+        sanitiser = source_col.sanitiser_factory(ba.buff.shape)
+
+        for value in source_col.iter_values(
+            partition.start_index, partition.stop_index
+        ):
+            # We write directly into the buffer in the sanitiser function
+            # to make it easier to reason about dimension padding
+            j = ba.next_buffer_row()
+            sanitiser(ba.buff, j, value)
+        # Write out whatever is still held in the buffer.
+        ba.flush()
+        logger.debug(
+            f"Encoded {column.name} chunk slice "
+            f"{partition.start_chunk}:{partition.stop_chunk}"
+        )
 
-#         for value in source_col.iter_values(start, stop):
-#             # We write directly into the buffer in the sanitiser function
-#             # to make it easier to reason about dimension padding
-#             j = ba.next_buffer_row()
-#             sanitiser(ba.buff, j, value)
-#         ba.flush()
-#         logger.debug(f"Encoded {column.name} slice {start}:{stop}")
 
 #     def encode_genotypes_slice(self, start, stop):
 #         source_col = self.icf.columns["FORMAT/GT"]