Skip to content

Commit a8ef611

Browse files
Merge pull request #178 from jeromekelleher/one-based
Support one-based coordinates in explode/encode partition
2 parents 55801f7 + 6ae035e commit a8ef611

File tree

5 files changed

+81
-34
lines changed

5 files changed

+81
-34
lines changed

bio2zarr/cli.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@ def list_commands(self, ctx):
5858
help="Force overwriting of existing directories",
5959
)
6060

61+
one_based = click.option(
62+
"--one-based",
63+
is_flag=True,
64+
flag_value=True,
65+
help="Partition indexes are interpreted as one-based",
66+
)
67+
6168
version = click.version_option(version=f"{provenance.__version__}")
6269

6370
worker_processes = click.option(
@@ -226,13 +233,18 @@ def dexplode_init(
226233
@icf_path
227234
@partition
228235
@verbose
229-
def dexplode_partition(icf_path, partition, verbose):
236+
@one_based
237+
def dexplode_partition(icf_path, partition, verbose, one_based):
230238
"""
231-
Convert a VCF partition to intermediate columnar format. Must be called *after*
232-
the ICF path has been initialised with dexplode_init. Partition indexes must be
233-
from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
239+
Convert a VCF partition to intermediate columnar format. Must be called
240+
after the ICF path has been initialised with dexplode_init. By default,
241+
partition indexes are from 0 to the number of partitions N (returned by
242+
dexplode_init), exclusive. If the --one-based option is specified,
243+
partition indexes are in the range 1 to N, inclusive.
234244
"""
235245
setup_logging(verbose)
246+
if one_based:
247+
partition -= 1
236248
vcf.explode_partition(icf_path, partition)
237249

238250

@@ -371,14 +383,17 @@ def dencode_init(
371383
@zarr_path
372384
@partition
373385
@verbose
374-
def dencode_partition(zarr_path, partition, verbose):
375-
"""
376-
Convert a partition from intermediate columnar format to VCF Zarr.
377-
Must be called *after* the Zarr path has been initialised with dencode_init.
378-
Partition indexes must be from 0 (inclusive) to the number of paritions
379-
returned by dencode_init (exclusive).
386+
@one_based
387+
def dencode_partition(zarr_path, partition, verbose, one_based):
380388
"""
389+
Convert a partition from intermediate columnar format to VCF Zarr. Must be
390+
called after the Zarr path has been initialised with dencode_init. By
391+
default, partition indexes are from 0 to the number of partitions N
392+
(returned by dencode_init), exclusive. If the --one-based option is
393+
specified, partition indexes are in the range 1 to N, inclusive."""
381394
setup_logging(verbose)
395+
if one_based:
396+
partition -= 1
382397
vcf.encode_partition(zarr_path, partition)
383398

384399

bio2zarr/vcf.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -382,8 +382,8 @@ def sanitise_value_bool(buff, j, value):
382382
def sanitise_value_float_scalar(buff, j, value):
383383
x = value
384384
if value is None:
385-
x = FLOAT32_MISSING
386-
buff[j] = x
385+
x = [FLOAT32_MISSING]
386+
buff[j] = x[0]
387387

388388

389389
def sanitise_value_int_scalar(buff, j, value):
@@ -392,7 +392,7 @@ def sanitise_value_int_scalar(buff, j, value):
392392
# print("MISSING", INT_MISSING, INT_FILL)
393393
x = [INT_MISSING]
394394
else:
395-
x = sanitise_int_array([value], ndmin=1, dtype=np.int32)
395+
x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
396396
buff[j] = x[0]
397397

398398

@@ -1148,9 +1148,7 @@ def explode(self, *, worker_processes=1, show_progress=False):
11481148
def explode_partition(self, partition):
11491149
self.load_metadata()
11501150
if partition < 0 or partition >= self.num_partitions:
1151-
raise ValueError(
1152-
"Partition index must be in the range 0 <= index < num_partitions"
1153-
)
1151+
raise ValueError("Partition index not in the valid range")
11541152
self.process_partition(partition)
11551153

11561154
def finalise(self):
@@ -1801,9 +1799,7 @@ def partition_array_path(self, partition_index, name):
18011799
def encode_partition(self, partition_index):
18021800
self.load_metadata()
18031801
if partition_index < 0 or partition_index >= self.num_partitions:
1804-
raise ValueError(
1805-
"Partition index must be in the range 0 <= index < num_partitions"
1806-
)
1802+
raise ValueError("Partition index not in the valid range")
18071803
partition_path = self.wip_partition_path(partition_index)
18081804
partition_path.mkdir(exist_ok=True)
18091805
logger.info(f"Encoding partition {partition_index} to {partition_path}")

tests/test_cli.py

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,23 @@ def test_vcf_dexplode_partition(self, mocked, tmp_path):
314314
str(icf_path), 1, **DEFAULT_DEXPLODE_PARTITION_ARGS
315315
)
316316

317+
@mock.patch("bio2zarr.vcf.explode_partition")
318+
def test_vcf_dexplode_partition_one_based(self, mocked, tmp_path):
319+
runner = ct.CliRunner(mix_stderr=False)
320+
icf_path = tmp_path / "icf"
321+
icf_path.mkdir()
322+
result = runner.invoke(
323+
cli.vcf2zarr,
324+
f"dexplode-partition {icf_path} 1 --one-based",
325+
catch_exceptions=False,
326+
)
327+
assert result.exit_code == 0
328+
assert len(result.stdout) == 0
329+
assert len(result.stderr) == 0
330+
mocked.assert_called_once_with(
331+
str(icf_path), 0, **DEFAULT_DEXPLODE_PARTITION_ARGS
332+
)
333+
317334
@mock.patch("bio2zarr.vcf.explode_partition")
318335
def test_vcf_dexplode_partition_missing_dir(self, mocked, tmp_path):
319336
runner = ct.CliRunner(mix_stderr=False)
@@ -436,6 +453,23 @@ def test_vcf_dencode_partition(self, mocked, tmp_path):
436453
str(zarr_path), 1, **DEFAULT_DENCODE_PARTITION_ARGS
437454
)
438455

456+
@mock.patch("bio2zarr.vcf.encode_partition")
457+
def test_vcf_dencode_partition_one_based(self, mocked, tmp_path):
458+
runner = ct.CliRunner(mix_stderr=False)
459+
zarr_path = tmp_path / "zarr"
460+
zarr_path.mkdir()
461+
result = runner.invoke(
462+
cli.vcf2zarr,
463+
f"dencode-partition {zarr_path} 1 --one-based",
464+
catch_exceptions=False,
465+
)
466+
assert result.exit_code == 0
467+
assert len(result.stdout) == 0
468+
assert len(result.stderr) == 0
469+
mocked.assert_called_once_with(
470+
str(zarr_path), 0, **DEFAULT_DENCODE_PARTITION_ARGS
471+
)
472+
439473
@mock.patch("bio2zarr.vcf.encode_finalise")
440474
def test_vcf_dencode_finalise(self, mocked, tmp_path):
441475
runner = ct.CliRunner(mix_stderr=False)
@@ -489,7 +523,8 @@ def test_convert_plink(self, mocked):
489523
class TestVcfEndToEnd:
490524
vcf_path = "tests/data/vcf/sample.vcf.gz"
491525

492-
def test_dexplode(self, tmp_path):
526+
@pytest.mark.parametrize("one_based", [False, True])
527+
def test_dexplode(self, tmp_path, one_based):
493528
icf_path = tmp_path / "icf"
494529
runner = ct.CliRunner(mix_stderr=False)
495530
result = runner.invoke(
@@ -501,11 +536,11 @@ def test_dexplode(self, tmp_path):
501536
assert result.stdout.strip() == "3"
502537

503538
for j in range(3):
504-
result = runner.invoke(
505-
cli.vcf2zarr,
506-
f"dexplode-partition {icf_path} {j}",
507-
catch_exceptions=False,
508-
)
539+
if one_based:
540+
cmd = f"dexplode-partition {icf_path} {j + 1} --one-based"
541+
else:
542+
cmd = f"dexplode-partition {icf_path} {j}"
543+
result = runner.invoke(cli.vcf2zarr, cmd, catch_exceptions=False)
509544
assert result.exit_code == 0
510545
result = runner.invoke(
511546
cli.vcf2zarr, f"dexplode-finalise {icf_path}", catch_exceptions=False
@@ -552,7 +587,8 @@ def test_encode(self, tmp_path):
552587
# Arbitrary check
553588
assert "variant_position" in result.stdout
554589

555-
def test_dencode(self, tmp_path):
590+
@pytest.mark.parametrize("one_based", [False, True])
591+
def test_dencode(self, tmp_path, one_based):
556592
icf_path = tmp_path / "icf"
557593
zarr_path = tmp_path / "zarr"
558594
runner = ct.CliRunner(mix_stderr=False)
@@ -569,12 +605,12 @@ def test_dencode(self, tmp_path):
569605
assert result.stdout.split()[0] == "3"
570606

571607
for j in range(3):
572-
result = runner.invoke(
573-
cli.vcf2zarr,
574-
f"dencode-partition {zarr_path} {j}",
575-
catch_exceptions=False,
576-
)
577-
assert result.exit_code == 0
608+
if one_based:
609+
cmd = f"dencode-partition {zarr_path} {j + 1} --one-based"
610+
else:
611+
cmd = f"dencode-partition {zarr_path} {j}"
612+
result = runner.invoke(cli.vcf2zarr, cmd, catch_exceptions=False)
613+
assert result.exit_code == 0
578614

579615
result = runner.invoke(
580616
cli.vcf2zarr, f"dencode-finalise {zarr_path}", catch_exceptions=False

tests/test_icf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ def test_double_explode_partition(self, tmp_path):
170170
def test_explode_partition_out_of_range(self, tmp_path, partition):
171171
icf_path = tmp_path / "x.icf"
172172
vcf.explode_init(icf_path, [self.data_path])
173-
with pytest.raises(ValueError, match="Partition index must be in the range"):
173+
with pytest.raises(ValueError, match="Partition index not in the valid range"):
174174
vcf.explode_partition(icf_path, partition)
175175

176176
def test_explode_same_file_twice(self, tmp_path):

tests/test_vcf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -502,5 +502,5 @@ def test_double_encode_partition(self, icf_path, tmp_path, caplog):
502502
def test_encode_partition_out_of_range(self, icf_path, tmp_path, partition):
503503
zarr_path = tmp_path / "x.zarr"
504504
vcf.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3)
505-
with pytest.raises(ValueError, match="Partition index must be in the range"):
505+
with pytest.raises(ValueError, match="Partition index not in the valid range"):
506506
vcf.encode_partition(zarr_path, partition)

0 commit comments

Comments
 (0)