diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 54f344d8..670a323f 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -58,6 +58,13 @@ def list_commands(self, ctx): help="Force overwriting of existing directories", ) +one_based = click.option( + "--one-based", + is_flag=True, + flag_value=True, + help="Partition indexes are interpreted as one-based", +) + version = click.version_option(version=f"{provenance.__version__}") worker_processes = click.option( @@ -226,13 +233,18 @@ def dexplode_init( @icf_path @partition @verbose -def dexplode_partition(icf_path, partition, verbose): +@one_based +def dexplode_partition(icf_path, partition, verbose, one_based): """ - Convert a VCF partition to intermediate columnar format. Must be called *after* - the ICF path has been initialised with dexplode_init. Partition indexes must be - from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive). + Convert a VCF partition to intermediate columnar format. Must be called + after the ICF path has been initialised with dexplode_init. By default, + partition indexes are from 0 to the number of partitions N (returned by + dexplode_init), exclusive. If the --one-based option is specifed, + partition indexes are in the range 1 to N, inclusive. """ setup_logging(verbose) + if one_based: + partition -= 1 vcf.explode_partition(icf_path, partition) @@ -371,14 +383,17 @@ def dencode_init( @zarr_path @partition @verbose -def dencode_partition(zarr_path, partition, verbose): - """ - Convert a partition from intermediate columnar format to VCF Zarr. - Must be called *after* the Zarr path has been initialised with dencode_init. - Partition indexes must be from 0 (inclusive) to the number of paritions - returned by dencode_init (exclusive). +@one_based +def dencode_partition(zarr_path, partition, verbose, one_based): """ + Convert a partition from intermediate columnar format to VCF Zarr. Must be + called after the Zarr path has been initialised with dencode_init. By + default, partition indexes are from 0 to the number of partitions N + (returned by dencode_init), exclusive. If the --one-based option is + specifed, partition indexes are in the range 1 to N, inclusive.""" setup_logging(verbose) + if one_based: + partition -= 1 vcf.encode_partition(zarr_path, partition) diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py index a2a090fd..c642ad3e 100644 --- a/bio2zarr/vcf.py +++ b/bio2zarr/vcf.py @@ -382,8 +382,8 @@ def sanitise_value_bool(buff, j, value): def sanitise_value_float_scalar(buff, j, value): x = value if value is None: - x = FLOAT32_MISSING - buff[j] = x + x = [FLOAT32_MISSING] + buff[j] = x[0] def sanitise_value_int_scalar(buff, j, value): @@ -392,7 +392,7 @@ def sanitise_value_int_scalar(buff, j, value): # print("MISSING", INT_MISSING, INT_FILL) x = [INT_MISSING] else: - x = sanitise_int_array([value], ndmin=1, dtype=np.int32) + x = sanitise_int_array(value, ndmin=1, dtype=np.int32) buff[j] = x[0] @@ -1148,9 +1148,7 @@ def explode(self, *, worker_processes=1, show_progress=False): def explode_partition(self, partition): self.load_metadata() if partition < 0 or partition >= self.num_partitions: - raise ValueError( - "Partition index must be in the range 0 <= index < num_partitions" - ) + raise ValueError("Partition index not in the valid range") self.process_partition(partition) def finalise(self): @@ -1801,9 +1799,7 @@ def partition_array_path(self, partition_index, name): def encode_partition(self, partition_index): self.load_metadata() if partition_index < 0 or partition_index >= self.num_partitions: - raise ValueError( - "Partition index must be in the range 0 <= index < num_partitions" - ) + raise ValueError("Partition index not in the valid range") partition_path = self.wip_partition_path(partition_index) partition_path.mkdir(exist_ok=True) logger.info(f"Encoding partition {partition_index} to {partition_path}") diff --git a/tests/test_cli.py b/tests/test_cli.py index 9a9421aa..f37a3fc7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -314,6 +314,23 @@ def test_vcf_dexplode_partition(self, mocked, tmp_path): str(icf_path), 1, **DEFAULT_DEXPLODE_PARTITION_ARGS ) + @mock.patch("bio2zarr.vcf.explode_partition") + def test_vcf_dexplode_partition_one_based(self, mocked, tmp_path): + runner = ct.CliRunner(mix_stderr=False) + icf_path = tmp_path / "icf" + icf_path.mkdir() + result = runner.invoke( + cli.vcf2zarr, + f"dexplode-partition {icf_path} 1 --one-based", + catch_exceptions=False, + ) + assert result.exit_code == 0 + assert len(result.stdout) == 0 + assert len(result.stderr) == 0 + mocked.assert_called_once_with( + str(icf_path), 0, **DEFAULT_DEXPLODE_PARTITION_ARGS + ) + @mock.patch("bio2zarr.vcf.explode_partition") def test_vcf_dexplode_partition_missing_dir(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) @@ -436,6 +453,23 @@ def test_vcf_dencode_partition(self, mocked, tmp_path): str(zarr_path), 1, **DEFAULT_DENCODE_PARTITION_ARGS ) + @mock.patch("bio2zarr.vcf.encode_partition") + def test_vcf_dencode_partition_one_based(self, mocked, tmp_path): + runner = ct.CliRunner(mix_stderr=False) + zarr_path = tmp_path / "zarr" + zarr_path.mkdir() + result = runner.invoke( + cli.vcf2zarr, + f"dencode-partition {zarr_path} 1 --one-based", + catch_exceptions=False, + ) + assert result.exit_code == 0 + assert len(result.stdout) == 0 + assert len(result.stderr) == 0 + mocked.assert_called_once_with( + str(zarr_path), 0, **DEFAULT_DENCODE_PARTITION_ARGS + ) + @mock.patch("bio2zarr.vcf.encode_finalise") def test_vcf_dencode_finalise(self, mocked, tmp_path): runner = ct.CliRunner(mix_stderr=False) @@ -489,7 +523,8 @@ def test_convert_plink(self, mocked): class TestVcfEndToEnd: vcf_path = "tests/data/vcf/sample.vcf.gz" - def test_dexplode(self, tmp_path): + @pytest.mark.parametrize("one_based", [False, True]) + def test_dexplode(self, tmp_path, one_based): icf_path = tmp_path / "icf" runner = ct.CliRunner(mix_stderr=False) result = runner.invoke( @@ -501,11 +536,11 @@ def test_dexplode(self, tmp_path): assert result.stdout.strip() == "3" for j in range(3): - result = runner.invoke( - cli.vcf2zarr, - f"dexplode-partition {icf_path} {j}", - catch_exceptions=False, - ) + if one_based: + cmd = f"dexplode-partition {icf_path} {j + 1} --one-based" + else: + cmd = f"dexplode-partition {icf_path} {j}" + result = runner.invoke(cli.vcf2zarr, cmd, catch_exceptions=False) assert result.exit_code == 0 result = runner.invoke( cli.vcf2zarr, f"dexplode-finalise {icf_path}", catch_exceptions=False @@ -552,7 +587,8 @@ def test_encode(self, tmp_path): # Arbitrary check assert "variant_position" in result.stdout - def test_dencode(self, tmp_path): + @pytest.mark.parametrize("one_based", [False, True]) + def test_dencode(self, tmp_path, one_based): icf_path = tmp_path / "icf" zarr_path = tmp_path / "zarr" runner = ct.CliRunner(mix_stderr=False) @@ -569,12 +605,12 @@ def test_dencode(self, tmp_path): assert result.stdout.split()[0] == "3" for j in range(3): - result = runner.invoke( - cli.vcf2zarr, - f"dencode-partition {zarr_path} {j}", - catch_exceptions=False, - ) - assert result.exit_code == 0 + if one_based: + cmd = f"dencode-partition {zarr_path} {j + 1} --one-based" + else: + cmd = f"dencode-partition {zarr_path} {j}" + result = runner.invoke(cli.vcf2zarr, cmd, catch_exceptions=False) + assert result.exit_code == 0 result = runner.invoke( cli.vcf2zarr, f"dencode-finalise {zarr_path}", catch_exceptions=False diff --git a/tests/test_icf.py b/tests/test_icf.py index 45800001..2cc21432 100644 --- a/tests/test_icf.py +++ b/tests/test_icf.py @@ -170,7 +170,7 @@ def test_double_explode_partition(self, tmp_path): def test_explode_partition_out_of_range(self, tmp_path, partition): icf_path = tmp_path / "x.icf" vcf.explode_init(icf_path, [self.data_path]) - with pytest.raises(ValueError, match="Partition index must be in the range"): + with pytest.raises(ValueError, match="Partition index not in the valid range"): vcf.explode_partition(icf_path, partition) def test_explode_same_file_twice(self, tmp_path): diff --git a/tests/test_vcf.py b/tests/test_vcf.py index 40723dc1..c71b666d 100644 --- a/tests/test_vcf.py +++ b/tests/test_vcf.py @@ -502,5 +502,5 @@ def test_double_encode_partition(self, icf_path, tmp_path, caplog): def test_encode_partition_out_of_range(self, icf_path, tmp_path, partition): zarr_path = tmp_path / "x.zarr" vcf.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3) - with pytest.raises(ValueError, match="Partition index must be in the range"): + with pytest.raises(ValueError, match="Partition index not in the valid range"): vcf.encode_partition(zarr_path, partition)