Skip to content

Commit 6e1ec0d

Browse files
Merge pull request #247 from jeromekelleher/num-partitions-named-arg
Change num_partitions to option
2 parents e19455b + 3d2c1b7 commit 6e1ec0d

File tree

5 files changed

+70
-26
lines changed

5 files changed

+70
-26
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,14 @@ jobs:
5454
python -m bio2zarr vcf2zarr encode sample.icf sample.vcz -f
5555
- name: Run distributed explode example
5656
run: |
57-
python -m bio2zarr vcf2zarr dexplode-init tests/data/vcf/sample.vcf.gz sample.icf 3 -f
57+
python -m bio2zarr vcf2zarr dexplode-init tests/data/vcf/sample.vcf.gz sample.icf -fn 3
5858
python -m bio2zarr vcf2zarr dexplode-partition sample.icf 0
5959
python -m bio2zarr vcf2zarr dexplode-partition sample.icf 1
6060
python -m bio2zarr vcf2zarr dexplode-partition sample.icf 2
6161
python -m bio2zarr vcf2zarr dexplode-finalise sample.icf
6262
- name: Run distributed encode example
6363
run: |
64-
python -m bio2zarr vcf2zarr dencode-init sample.icf sample.vcz 3 -f --variants-chunk-size=3
64+
python -m bio2zarr vcf2zarr dencode-init sample.icf sample.vcz -fn 3 --variants-chunk-size=3
6565
python -m bio2zarr vcf2zarr dencode-partition sample.vcz 0
6666
python -m bio2zarr vcf2zarr dencode-partition sample.vcz 1
6767
python -m bio2zarr vcf2zarr dencode-partition sample.vcz 2

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# 0.1.0 2024-06-10
22

3+
- Initial production-ready version.
34
- Add -Q/--no-progress flag to CLI
5+
- Change num-partitions argument in dexplode-init and dencode-init
6+
to a named option.
47

58
# 0.0.10 2024-05-15
69
- Change output format of dexplode-init and dencode-init

bio2zarr/cli.py

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,13 @@ def list_commands(self, ctx):
4444
"zarr_path", type=click.Path(exists=True, file_okay=False, dir_okay=True)
4545
)
4646

47-
num_partitions = click.argument("num_partitions", type=click.IntRange(min=1))
47+
num_partitions = click.option(
48+
"-n",
49+
"--num-partitions",
50+
type=click.IntRange(min=1),
51+
default=None,
52+
help="Target number of partitions to split into",
53+
)
4854

4955
partition = click.argument("partition", type=click.IntRange(min=0))
5056

@@ -172,6 +178,15 @@ def check_overwrite_dir(path, force):
172178
shutil.rmtree(tmp_delete_path)
173179

174180

181+
def check_partitions(num_partitions):
182+
if num_partitions is None:
183+
raise click.UsageError(
184+
"-n/--num-partitions must currently be specified. Future versions "
185+
"will provide reasonable defaults or other means of specifying "
186+
"partitions."
187+
)
188+
189+
175190
def get_compressor(cname):
176191
if cname is None:
177192
return None
@@ -249,10 +264,11 @@ def dexplode_init(
249264
):
250265
"""
251266
Initial step for distributed conversion of VCF(s) to intermediate columnar format
252-
over the requested number of paritions.
267+
over some number of partitions.
253268
"""
254269
setup_logging(verbose)
255270
check_overwrite_dir(icf_path, force)
271+
check_partitions(num_partitions)
256272
work_summary = vcf2zarr.explode_init(
257273
icf_path,
258274
vcfs,
@@ -388,7 +404,7 @@ def dencode_init(
388404
"""
389405
Initialise conversion of intermediate format to VCF Zarr. This will
390406
set up the specified ZARR_PATH to perform this conversion over
391-
NUM_PARTITIONS.
407+
some number of partitions.
392408
393409
The output of this command is the actual number of partitions generated
394410
(which may be less than the requested number, if there is not sufficient
@@ -400,6 +416,7 @@ def dencode_init(
400416
"""
401417
setup_logging(verbose)
402418
check_overwrite_dir(zarr_path, force)
419+
check_partitions(num_partitions)
403420
work_summary = vcf2zarr.encode_init(
404421
icf_path,
405422
zarr_path,
@@ -545,21 +562,15 @@ def plink2zarr():
545562
@version
546563
@click.argument("vcf_path", type=click.Path(exists=True, dir_okay=False))
547564
@verbose
548-
@click.option(
549-
"-n",
550-
"--num-parts",
551-
type=int,
552-
default=None,
553-
help="Target number of partitions to split the VCF into",
554-
)
565+
@num_partitions
555566
@click.option(
556567
"-s",
557-
"--part-size",
568+
"--partition-size",
558569
type=str,
559570
default=None,
560571
help="Target (compressed) size of VCF partitions, e.g. 100KB, 10MiB, 1G.",
561572
)
562-
def vcfpartition(vcf_path, verbose, num_parts, part_size):
573+
def vcfpartition(vcf_path, verbose, num_partitions, partition_size):
563574
"""
564575
Output bcftools region strings that partition an indexed VCF/BCF file
565576
into either an approximate number of parts (-n), or parts of approximately
@@ -574,12 +585,14 @@ def vcfpartition(vcf_path, verbose, num_parts, part_size):
574585
of records that they contain.
575586
"""
576587
setup_logging(verbose)
577-
if num_parts is None and part_size is None:
578-
raise click.UsageError("Either --num-parts or --part-size must be specified")
588+
if num_partitions is None and partition_size is None:
589+
raise click.UsageError(
590+
"Either --num-partitions or --partition-size must be specified"
591+
)
579592

580593
indexed_vcf = vcf_utils.IndexedVcf(vcf_path)
581594
regions = indexed_vcf.partition_into_regions(
582-
num_parts=num_parts, target_part_size=part_size
595+
num_parts=num_partitions, target_part_size=partition_size
583596
)
584597
for region in regions:
585598
click.echo(f"{region}\t{vcf_path}")

docs/vcf2zarr/tutorial.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ Let's go through the example above using the distributed commands. First, we
214214
rm -fR sample-dist.icf
215215
```
216216
```{code-cell}
217-
vcf2zarr dexplode-init sample.vcf.gz sample-dist.icf -Q 5
217+
vcf2zarr dexplode-init sample.vcf.gz sample-dist.icf -n 5 -Q
218218
```
219219

220220
Here we asked ``dexplode-init`` to set up an ICF store in which the data

tests/test_cli.py

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def test_vcf_dexplode_init_compressor(self, mocked, tmp_path, compressor):
116116
runner = ct.CliRunner(mix_stderr=False)
117117
result = runner.invoke(
118118
cli.vcf2zarr_main,
119-
f"dexplode-init {self.vcf_path} {icf_path} 1 -C {compressor}",
119+
f"dexplode-init {self.vcf_path} {icf_path} -n 1 -C {compressor}",
120120
catch_exceptions=False,
121121
)
122122
assert result.exit_code == 0
@@ -294,7 +294,7 @@ def test_vcf_dexplode_init(self, mocked, tmp_path, progress, flag):
294294
icf_path = tmp_path / "icf"
295295
result = runner.invoke(
296296
cli.vcf2zarr_main,
297-
f"dexplode-init {self.vcf_path} {icf_path} 5 {flag}",
297+
f"dexplode-init {self.vcf_path} {icf_path} -n 5 {flag}",
298298
catch_exceptions=False,
299299
)
300300
assert result.exit_code == 0
@@ -309,7 +309,7 @@ def test_vcf_dexplode_init(self, mocked, tmp_path, progress, flag):
309309
**args,
310310
)
311311

312-
@pytest.mark.parametrize("num_partitions", ["-- -1", "0", "asdf", "1.112"])
312+
@pytest.mark.parametrize("num_partitions", ["-1", "0", "asdf", "1.112"])
313313
@mock.patch("bio2zarr.vcf2zarr.explode_init", return_value=5)
314314
def test_vcf_dexplode_init_bad_num_partitions(
315315
self, mocked, tmp_path, num_partitions
@@ -318,11 +318,24 @@ def test_vcf_dexplode_init_bad_num_partitions(
318318
icf_path = tmp_path / "icf"
319319
result = runner.invoke(
320320
cli.vcf2zarr_main,
321-
f"dexplode-init {self.vcf_path} {icf_path} {num_partitions}",
321+
f"dexplode-init {self.vcf_path} {icf_path} -n {num_partitions}",
322322
catch_exceptions=False,
323323
)
324324
assert result.exit_code == 2
325-
assert "Invalid value for 'NUM_PARTITIONS'" in result.stderr
325+
assert "Invalid value for '-n'" in result.stderr
326+
mocked.assert_not_called()
327+
328+
@mock.patch("bio2zarr.vcf2zarr.explode_init", return_value=5)
329+
def test_vcf_dexplode_init_no_partitions(self, mocked, tmp_path):
330+
runner = ct.CliRunner(mix_stderr=False)
331+
icf_path = tmp_path / "icf"
332+
result = runner.invoke(
333+
cli.vcf2zarr_main,
334+
f"dexplode-init {self.vcf_path} {icf_path}",
335+
catch_exceptions=False,
336+
)
337+
assert result.exit_code == 2
338+
assert "-n/--num-partitions must currently be specified" in result.stderr
326339
mocked.assert_not_called()
327340

328341
@mock.patch("bio2zarr.vcf2zarr.explode_partition")
@@ -457,7 +470,7 @@ def test_dencode_init(self, mocked, tmp_path, progress, flag):
457470
runner = ct.CliRunner(mix_stderr=False)
458471
result = runner.invoke(
459472
cli.vcf2zarr_main,
460-
f"dencode-init {icf_path} {zarr_path} 10 {flag}",
473+
f"dencode-init {icf_path} {zarr_path} -n 10 {flag}",
461474
catch_exceptions=False,
462475
)
463476
assert result.exit_code == 0
@@ -472,6 +485,21 @@ def test_dencode_init(self, mocked, tmp_path, progress, flag):
472485
**args,
473486
)
474487

488+
@mock.patch("bio2zarr.vcf2zarr.encode_init", return_value=5)
489+
def test_vcf_dencode_init_no_partitions(self, mocked, tmp_path):
490+
runner = ct.CliRunner(mix_stderr=False)
491+
icf_path = tmp_path / "icf"
492+
icf_path.mkdir()
493+
zarr_path = tmp_path / "zarr"
494+
result = runner.invoke(
495+
cli.vcf2zarr_main,
496+
f"dencode-init {icf_path} {zarr_path}",
497+
catch_exceptions=False,
498+
)
499+
assert result.exit_code == 2
500+
assert "-n/--num-partitions must currently be specified" in result.stderr
501+
mocked.assert_not_called()
502+
475503
@mock.patch("bio2zarr.vcf2zarr.encode_partition")
476504
def test_vcf_dencode_partition(self, mocked, tmp_path):
477505
runner = ct.CliRunner(mix_stderr=False)
@@ -601,7 +629,7 @@ def test_dexplode(self, tmp_path, one_based):
601629
runner = ct.CliRunner(mix_stderr=False)
602630
result = runner.invoke(
603631
cli.vcf2zarr_main,
604-
f"dexplode-init {self.vcf_path} {icf_path} 5 --json -Q",
632+
f"dexplode-init {self.vcf_path} {icf_path} -n 5 --json -Q",
605633
catch_exceptions=False,
606634
)
607635
assert result.exit_code == 0
@@ -678,7 +706,7 @@ def test_dencode(self, tmp_path, one_based):
678706
assert result.exit_code == 0
679707
result = runner.invoke(
680708
cli.vcf2zarr_main,
681-
f"dencode-init {icf_path} {zarr_path} 5 --variants-chunk-size=3 --json",
709+
f"dencode-init {icf_path} {zarr_path} -n 5 --variants-chunk-size=3 --json",
682710
catch_exceptions=False,
683711
)
684712
assert result.exit_code == 0

0 commit comments

Comments
 (0)