Skip to content

Support one-based coordinates in explode/encode partition #178

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 25 additions & 10 deletions bio2zarr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ def list_commands(self, ctx):
help="Force overwriting of existing directories",
)

one_based = click.option(
"--one-based",
is_flag=True,
flag_value=True,
help="Partition indexes are interpreted as one-based",
)

version = click.version_option(version=f"{provenance.__version__}")

worker_processes = click.option(
Expand Down Expand Up @@ -226,13 +233,18 @@ def dexplode_init(
@icf_path
@partition
@verbose
def dexplode_partition(icf_path, partition, verbose):
@one_based
def dexplode_partition(icf_path, partition, verbose, one_based):
"""
Convert a VCF partition to intermediate columnar format. Must be called *after*
the ICF path has been initialised with dexplode_init. Partition indexes must be
from 0 (inclusive) to the number of paritions returned by dexplode_init (exclusive).
Convert a VCF partition to intermediate columnar format. Must be called
after the ICF path has been initialised with dexplode_init. By default,
partition indexes are from 0 to the number of partitions N (returned by
dexplode_init), exclusive. If the --one-based option is specifed,
partition indexes are in the range 1 to N, inclusive.
"""
setup_logging(verbose)
if one_based:
partition -= 1
vcf.explode_partition(icf_path, partition)


Expand Down Expand Up @@ -371,14 +383,17 @@ def dencode_init(
@zarr_path
@partition
@verbose
def dencode_partition(zarr_path, partition, verbose):
"""
Convert a partition from intermediate columnar format to VCF Zarr.
Must be called *after* the Zarr path has been initialised with dencode_init.
Partition indexes must be from 0 (inclusive) to the number of paritions
returned by dencode_init (exclusive).
@one_based
def dencode_partition(zarr_path, partition, verbose, one_based):
"""
Convert a partition from intermediate columnar format to VCF Zarr. Must be
called after the Zarr path has been initialised with dencode_init. By
default, partition indexes are from 0 to the number of partitions N
(returned by dencode_init), exclusive. If the --one-based option is
specifed, partition indexes are in the range 1 to N, inclusive."""
setup_logging(verbose)
if one_based:
partition -= 1
vcf.encode_partition(zarr_path, partition)


Expand Down
14 changes: 5 additions & 9 deletions bio2zarr/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,8 +382,8 @@ def sanitise_value_bool(buff, j, value):
def sanitise_value_float_scalar(buff, j, value):
x = value
if value is None:
x = FLOAT32_MISSING
buff[j] = x
x = [FLOAT32_MISSING]
buff[j] = x[0]


def sanitise_value_int_scalar(buff, j, value):
Expand All @@ -392,7 +392,7 @@ def sanitise_value_int_scalar(buff, j, value):
# print("MISSING", INT_MISSING, INT_FILL)
x = [INT_MISSING]
else:
x = sanitise_int_array([value], ndmin=1, dtype=np.int32)
x = sanitise_int_array(value, ndmin=1, dtype=np.int32)
buff[j] = x[0]


Expand Down Expand Up @@ -1148,9 +1148,7 @@ def explode(self, *, worker_processes=1, show_progress=False):
def explode_partition(self, partition):
self.load_metadata()
if partition < 0 or partition >= self.num_partitions:
raise ValueError(
"Partition index must be in the range 0 <= index < num_partitions"
)
raise ValueError("Partition index not in the valid range")
self.process_partition(partition)

def finalise(self):
Expand Down Expand Up @@ -1801,9 +1799,7 @@ def partition_array_path(self, partition_index, name):
def encode_partition(self, partition_index):
self.load_metadata()
if partition_index < 0 or partition_index >= self.num_partitions:
raise ValueError(
"Partition index must be in the range 0 <= index < num_partitions"
)
raise ValueError("Partition index not in the valid range")
partition_path = self.wip_partition_path(partition_index)
partition_path.mkdir(exist_ok=True)
logger.info(f"Encoding partition {partition_index} to {partition_path}")
Expand Down
62 changes: 49 additions & 13 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,23 @@ def test_vcf_dexplode_partition(self, mocked, tmp_path):
str(icf_path), 1, **DEFAULT_DEXPLODE_PARTITION_ARGS
)

@mock.patch("bio2zarr.vcf.explode_partition")
def test_vcf_dexplode_partition_one_based(self, mocked, tmp_path):
runner = ct.CliRunner(mix_stderr=False)
icf_path = tmp_path / "icf"
icf_path.mkdir()
result = runner.invoke(
cli.vcf2zarr,
f"dexplode-partition {icf_path} 1 --one-based",
catch_exceptions=False,
)
assert result.exit_code == 0
assert len(result.stdout) == 0
assert len(result.stderr) == 0
mocked.assert_called_once_with(
str(icf_path), 0, **DEFAULT_DEXPLODE_PARTITION_ARGS
)

@mock.patch("bio2zarr.vcf.explode_partition")
def test_vcf_dexplode_partition_missing_dir(self, mocked, tmp_path):
runner = ct.CliRunner(mix_stderr=False)
Expand Down Expand Up @@ -436,6 +453,23 @@ def test_vcf_dencode_partition(self, mocked, tmp_path):
str(zarr_path), 1, **DEFAULT_DENCODE_PARTITION_ARGS
)

@mock.patch("bio2zarr.vcf.encode_partition")
def test_vcf_dencode_partition_one_based(self, mocked, tmp_path):
runner = ct.CliRunner(mix_stderr=False)
zarr_path = tmp_path / "zarr"
zarr_path.mkdir()
result = runner.invoke(
cli.vcf2zarr,
f"dencode-partition {zarr_path} 1 --one-based",
catch_exceptions=False,
)
assert result.exit_code == 0
assert len(result.stdout) == 0
assert len(result.stderr) == 0
mocked.assert_called_once_with(
str(zarr_path), 0, **DEFAULT_DENCODE_PARTITION_ARGS
)

@mock.patch("bio2zarr.vcf.encode_finalise")
def test_vcf_dencode_finalise(self, mocked, tmp_path):
runner = ct.CliRunner(mix_stderr=False)
Expand Down Expand Up @@ -489,7 +523,8 @@ def test_convert_plink(self, mocked):
class TestVcfEndToEnd:
vcf_path = "tests/data/vcf/sample.vcf.gz"

def test_dexplode(self, tmp_path):
@pytest.mark.parametrize("one_based", [False, True])
def test_dexplode(self, tmp_path, one_based):
icf_path = tmp_path / "icf"
runner = ct.CliRunner(mix_stderr=False)
result = runner.invoke(
Expand All @@ -501,11 +536,11 @@ def test_dexplode(self, tmp_path):
assert result.stdout.strip() == "3"

for j in range(3):
result = runner.invoke(
cli.vcf2zarr,
f"dexplode-partition {icf_path} {j}",
catch_exceptions=False,
)
if one_based:
cmd = f"dexplode-partition {icf_path} {j + 1} --one-based"
else:
cmd = f"dexplode-partition {icf_path} {j}"
result = runner.invoke(cli.vcf2zarr, cmd, catch_exceptions=False)
assert result.exit_code == 0
result = runner.invoke(
cli.vcf2zarr, f"dexplode-finalise {icf_path}", catch_exceptions=False
Expand Down Expand Up @@ -552,7 +587,8 @@ def test_encode(self, tmp_path):
# Arbitrary check
assert "variant_position" in result.stdout

def test_dencode(self, tmp_path):
@pytest.mark.parametrize("one_based", [False, True])
def test_dencode(self, tmp_path, one_based):
icf_path = tmp_path / "icf"
zarr_path = tmp_path / "zarr"
runner = ct.CliRunner(mix_stderr=False)
Expand All @@ -569,12 +605,12 @@ def test_dencode(self, tmp_path):
assert result.stdout.split()[0] == "3"

for j in range(3):
result = runner.invoke(
cli.vcf2zarr,
f"dencode-partition {zarr_path} {j}",
catch_exceptions=False,
)
assert result.exit_code == 0
if one_based:
cmd = f"dencode-partition {zarr_path} {j + 1} --one-based"
else:
cmd = f"dencode-partition {zarr_path} {j}"
result = runner.invoke(cli.vcf2zarr, cmd, catch_exceptions=False)
assert result.exit_code == 0

result = runner.invoke(
cli.vcf2zarr, f"dencode-finalise {zarr_path}", catch_exceptions=False
Expand Down
2 changes: 1 addition & 1 deletion tests/test_icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def test_double_explode_partition(self, tmp_path):
def test_explode_partition_out_of_range(self, tmp_path, partition):
icf_path = tmp_path / "x.icf"
vcf.explode_init(icf_path, [self.data_path])
with pytest.raises(ValueError, match="Partition index must be in the range"):
with pytest.raises(ValueError, match="Partition index not in the valid range"):
vcf.explode_partition(icf_path, partition)

def test_explode_same_file_twice(self, tmp_path):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,5 +502,5 @@ def test_double_encode_partition(self, icf_path, tmp_path, caplog):
def test_encode_partition_out_of_range(self, icf_path, tmp_path, partition):
zarr_path = tmp_path / "x.zarr"
vcf.encode_init(icf_path, zarr_path, 3, variants_chunk_size=3)
with pytest.raises(ValueError, match="Partition index must be in the range"):
with pytest.raises(ValueError, match="Partition index not in the valid range"):
vcf.encode_partition(zarr_path, partition)
Loading