@@ -144,6 +144,7 @@ class VcfPartition:
144144 num_records : int = - 1
145145
146146
147+ # TODO bump this before current PR is done!
147148ICF_METADATA_FORMAT_VERSION = "0.2"
148149ICF_DEFAULT_COMPRESSOR = numcodecs .Blosc (
149150 cname = "zstd" , clevel = 7 , shuffle = numcodecs .Blosc .NOSHUFFLE
@@ -903,6 +904,40 @@ def num_columns(self):
903904 return len (self .columns )
904905
905906
907+ @dataclasses .dataclass
908+ class IcfPartitionMetadata :
909+ num_records : int
910+ last_position : int
911+ field_summaries : dict
912+
913+ def asdict (self ):
914+ return dataclasses .asdict (self )
915+
916+ def asjson (self ):
917+ return json .dumps (self .asdict (), indent = 4 )
918+
919+ @staticmethod
920+ def fromdict (d ):
921+ md = IcfPartitionMetadata (** d )
922+ for k , v in md .field_summaries .items ():
923+ md .field_summaries [k ] = VcfFieldSummary .fromdict (v )
924+ return md
925+
926+
927+ def check_overlapping_partitions (partitions ):
928+ for i in range (1 , len (partitions )):
929+ prev_region = partitions [i - 1 ].region
930+ current_region = partitions [i ].region
931+ if prev_region .contig == current_region .contig :
932+ assert prev_region .end is not None
933+ # Regions are *inclusive*
934+ if prev_region .end >= current_region .start :
935+ raise ValueError (
936+ f"Overlapping VCF regions in partitions { i - 1 } and { i } : "
937+ f"{ prev_region } and { current_region } "
938+ )
939+
940+
906941class IntermediateColumnarFormatWriter :
907942 def __init__ (self , path ):
908943 self .path = pathlib .Path (path )
@@ -974,11 +1009,8 @@ def load_partition_summaries(self):
9741009 not_found = []
9751010 for j in range (self .num_partitions ):
9761011 try :
977- with open (self .wip_path / f"p{ j } _summary.json" ) as f :
978- summary = json .load (f )
979- for k , v in summary ["field_summaries" ].items ():
980- summary ["field_summaries" ][k ] = VcfFieldSummary .fromdict (v )
981- summaries .append (summary )
1012+ with open (self .wip_path / f"p{ j } .json" ) as f :
1013+ summaries .append (IcfPartitionMetadata .fromdict (json .load (f )))
9821014 except FileNotFoundError :
9831015 not_found .append (j )
9841016 if len (not_found ) > 0 :
@@ -995,7 +1027,7 @@ def load_metadata(self):
9951027
9961028 def process_partition (self , partition_index ):
9971029 self .load_metadata ()
998- summary_path = self .wip_path / f"p{ partition_index } _summary .json"
1030+ summary_path = self .wip_path / f"p{ partition_index } .json"
9991031 # If someone is rewriting a summary path (for whatever reason), make sure it
10001032 # doesn't look like it's already been completed.
10011033 # NOTE to do this properly we probably need to take a lock on this file - but
@@ -1016,6 +1048,7 @@ def process_partition(self, partition_index):
10161048 else :
10171049 format_fields .append (field )
10181050
1051+ last_position = None
10191052 with IcfPartitionWriter (
10201053 self .metadata ,
10211054 self .path ,
@@ -1025,6 +1058,7 @@ def process_partition(self, partition_index):
10251058 num_records = 0
10261059 for variant in ivcf .variants (partition .region ):
10271060 num_records += 1
1061+ last_position = variant .POS
10281062 tcw .append ("CHROM" , variant .CHROM )
10291063 tcw .append ("POS" , variant .POS )
10301064 tcw .append ("QUAL" , variant .QUAL )
@@ -1049,15 +1083,16 @@ def process_partition(self, partition_index):
10491083 f"flushing buffers"
10501084 )
10511085
1052- partition_metadata = {
1053- "num_records" : num_records ,
1054- "field_summaries" : {k : v .asdict () for k , v in tcw .field_summaries .items ()},
1055- }
1086+ partition_metadata = IcfPartitionMetadata (
1087+ num_records = num_records ,
1088+ last_position = last_position ,
1089+ field_summaries = tcw .field_summaries ,
1090+ )
10561091 with open (summary_path , "w" ) as f :
1057- json . dump (partition_metadata , f , indent = 4 )
1092+ f . write (partition_metadata . asjson () )
10581093 logger .info (
1059- f"Finish p{ partition_index } { partition .vcf_path } __{ partition .region } = "
1060- f"{ num_records } records"
1094+ f"Finish p{ partition_index } { partition .vcf_path } __{ partition .region } "
1095+ f"{ num_records } records last_pos= { last_position } "
10611096 )
10621097
10631098 def explode (self , * , worker_processes = 1 , show_progress = False ):
@@ -1099,8 +1134,9 @@ def finalise(self):
10991134 partition_summaries = self .load_partition_summaries ()
11001135 total_records = 0
11011136 for index , summary in enumerate (partition_summaries ):
1102- partition_records = summary [ " num_records" ]
1137+ partition_records = summary . num_records
11031138 self .metadata .partitions [index ].num_records = partition_records
1139+ self .metadata .partitions [index ].region .end = summary .last_position
11041140 total_records += partition_records
11051141 if not np .isinf (self .metadata .num_records ):
11061142 # Note: this is just telling us that there's a bug in the
@@ -1110,9 +1146,11 @@ def finalise(self):
11101146 assert total_records == self .metadata .num_records
11111147 self .metadata .num_records = total_records
11121148
1149+ check_overlapping_partitions (self .metadata .partitions )
1150+
11131151 for field in self .metadata .fields :
11141152 for summary in partition_summaries :
1115- field .summary .update (summary [ " field_summaries" ] [field .full_name ])
1153+ field .summary .update (summary . field_summaries [field .full_name ])
11161154
11171155 logger .info ("Finalising metadata" )
11181156 with open (self .path / "metadata.json" , "w" ) as f :
@@ -1756,7 +1794,7 @@ def encode_partition(self, partition_index):
17561794 final_path = self .partition_path (partition_index )
17571795 logger .info (f"Finalising { partition_index } at { final_path } " )
17581796 if final_path .exists ():
1759- logger .warning ("Removing existing partition at {final_path}" )
1797+ logger .warning (f "Removing existing partition at { final_path } " )
17601798 shutil .rmtree (final_path )
17611799 os .rename (partition_path , final_path )
17621800
0 commit comments