Skip to content

Commit 7c0cd3a

Browse files
Merge pull request #121 from jeromekelleher/trim-empty-regions
Trim empty regions
2 parents 33cf089 + d17ee5d commit 7c0cd3a

File tree

3 files changed

+20
-9
lines changed

3 files changed

+20
-9
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
# 0.0.5 2024-04-XX
1+
# 0.0.5 2024-04-17
22

33
- Fix bug in schema handling (compressor settings ignored)
44
- Move making ICF field partition directories into per-partition processing.
55
Remove progress on the init mkdirs step.
66
- Turn off progress monitor on dexplode-partition
7+
- Fix empty partition bug
78

89
# 0.0.4 2024-04-08
910

bio2zarr/vcf_utils.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,22 @@ def variants(self, region):
435435
if var.POS >= start:
436436
yield var
437437

438+
def _filter_empty(self, regions):
439+
"""
440+
Return all regions in the specified list that have one or more records.
441+
442+
Sometimes with Tabix indexes these seem to crop up:
443+
444+
- https://github.com/sgkit-dev/bio2zarr/issues/45
445+
- https://github.com/sgkit-dev/bio2zarr/issues/120
446+
"""
447+
ret = []
448+
for region in regions:
449+
variants = self.variants(region)
450+
if next(variants, None) is not None:
451+
ret.append(region)
452+
return ret
453+
438454
def partition_into_regions(
439455
self,
440456
num_parts: Optional[int] = None,
@@ -511,4 +527,4 @@ def partition_into_regions(
511527
if self.index.record_counts[ri] > 0:
512528
regions.append(Region(self.sequence_names[ri]))
513529

514-
return regions
530+
return self._filter_empty(regions)

tests/test_vcf_utils.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,7 @@
99

1010

1111
def assert_part_counts_non_zero(part_counts, index_file):
12-
# We may have one zero count value at the end in Tabix indexes.
13-
# Should probably try to get rid of it, but probably no harm
14-
# https://github.com/jeromekelleher/bio2zarr/issues/45
15-
if index_file.endswith(".tbi"):
16-
assert np.all(part_counts[:-1] > 0)
17-
else:
18-
assert np.all(part_counts > 0)
12+
assert np.all(part_counts > 0)
1913

2014

2115
class TestIndexedVcf:

0 commit comments

Comments
 (0)