Skip to content

Commit 5bf2635

Browse files
Initial CLI skeleton for dexplode
1 parent 90bd40a commit 5bf2635

File tree

3 files changed

+219
-23
lines changed

3 files changed

+219
-23
lines changed

bio2zarr/cli.py

Lines changed: 102 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,14 @@ def list_commands(self, ctx):
3939
"zarr_path", type=click.Path(file_okay=False, dir_okay=True)
4040
)
4141

42+
# Positional argument for a Zarr directory that must already exist on disk
# (exists=True) and must be a directory rather than a file.
zarr_path = click.argument(
    "zarr_path",
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
)

# Positional argument: number of partitions; values below 1 are rejected.
num_partitions = click.argument(
    "num_partitions",
    type=click.IntRange(min=1),
)

# Positional argument: partition index; negative values are rejected.
partition = click.argument(
    "partition",
    type=click.IntRange(min=0),
)
49+
4250
verbose = click.option("-v", "--verbose", count=True, help="Increase verbosity")
4351

4452
force = click.option(
@@ -92,6 +100,27 @@ def list_commands(self, ctx):
92100
help="Chunk size in the samples dimension",
93101
)
94102

103+
# Optional path to a schema file; when given, the file must exist.
schema = click.option(
    "-s",
    "--schema",
    default=None,
    type=click.Path(exists=True),
)

# Optional integer cap on the number of chunks in the variants dimension.
max_variant_chunks = click.option(
    "-V",
    "--max-variant-chunks",
    type=int,
    default=None,
    help=(
        "Truncate the output in the variants dimension to have this "
        "number of chunks. "
        "Mainly intended to help with schema tuning."
    ),
)
116+
117+
# Reusable -M/--max-memory option decorator. No type is declared, so the
# value reaches the command as the raw string typed by the user.
max_memory = click.option(
    "-M",
    "--max-memory",
    default=None,
    # The help text previously ended with a stray comma; end the sentence.
    help="An approximate bound on overall memory usage (e.g. 10G).",
)
123+
95124

96125
def setup_logging(verbosity):
97126
level = "WARNING"
@@ -158,7 +187,7 @@ def explode(
158187
@click.command
159188
@vcfs
160189
@new_icf_path
161-
@click.argument("num_partitions", type=click.IntRange(min=1))
190+
@num_partitions
162191
@force
163192
@column_chunk_size
164193
@compressor
@@ -194,7 +223,7 @@ def dexplode_init(
194223

195224
@click.command
196225
@icf_path
197-
@click.argument("partition", type=click.IntRange(min=0))
226+
@partition
198227
@verbose
199228
def dexplode_partition(icf_path, partition, verbose):
200229
"""
@@ -207,14 +236,14 @@ def dexplode_partition(icf_path, partition, verbose):
207236

208237

209238
@click.command
210-
@click.argument("path", type=click.Path(), required=True)
239+
@icf_path
211240
@verbose
212-
def dexplode_finalise(path, verbose):
241+
def dexplode_finalise(icf_path, verbose):
213242
"""
214243
Final step for distributed conversion of VCF(s) to intermediate columnar format.
215244
"""
216245
setup_logging(verbose)
217-
vcf.explode_finalise(path)
246+
vcf.explode_finalise(icf_path)
218247

219248

220249
@click.command
@@ -244,26 +273,11 @@ def mkschema(icf_path):
244273
@new_zarr_path
245274
@force
246275
@verbose
247-
@click.option("-s", "--schema", default=None, type=click.Path(exists=True))
276+
@schema
248277
@variants_chunk_size
249278
@samples_chunk_size
250-
@click.option(
251-
"-V",
252-
"--max-variant-chunks",
253-
type=int,
254-
default=None,
255-
help=(
256-
"Truncate the output in the variants dimension to have "
257-
"this number of chunks. Mainly intended to help with "
258-
"schema tuning."
259-
),
260-
)
261-
@click.option(
262-
"-M",
263-
"--max-memory",
264-
default=None,
265-
help="An approximate bound on overall memory usage (e.g. 10G),",
266-
)
279+
@max_variant_chunks
280+
@max_memory
267281
@worker_processes
268282
def encode(
269283
icf_path,
@@ -295,6 +309,68 @@ def encode(
295309
)
296310

297311

312+
@click.command
@icf_path
@new_zarr_path
@num_partitions
@force
@schema
@variants_chunk_size
@samples_chunk_size
@max_variant_chunks
@verbose
def dencode_init(
    icf_path,
    zarr_path,
    num_partitions,
    force,
    schema,
    variants_chunk_size,
    samples_chunk_size,
    max_variant_chunks,
    verbose,
):
    """
    Initial step for distributed conversion of intermediate columnar format
    to Zarr, targeting NUM_PARTITIONS partitions.

    The partition count reported by the initialisation is written to stdout.
    """
    setup_logging(verbose)
    check_overwrite_dir(zarr_path, force)
    # NUM_PARTITIONS is only passed on as a target; whatever count the
    # library hands back is what gets echoed for the caller to use.
    partition_count = vcf.encode_init(
        icf_path,
        zarr_path,
        target_num_partitions=num_partitions,
        schema_path=schema,
        variants_chunk_size=variants_chunk_size,
        samples_chunk_size=samples_chunk_size,
        max_v_chunks=max_variant_chunks,
        show_progress=True,
    )
    click.echo(partition_count)
349+
350+
351+
@click.command
@zarr_path
@partition
@verbose
def dencode_partition(zarr_path, partition, verbose):
    """
    Encode the partition with index PARTITION as one step of the distributed
    conversion of intermediate columnar format to Zarr.

    ZARR_PATH must be an existing directory.
    """
    setup_logging(verbose)
    # The progress display is switched off for per-partition runs.
    vcf.encode_partition(zarr_path, partition, show_progress=False)
361+
362+
363+
@click.command
@zarr_path
@verbose
def dencode_finalise(zarr_path, verbose):
    """
    Final step for distributed conversion of intermediate columnar format
    to Zarr.

    ZARR_PATH must be an existing directory.
    """
    setup_logging(verbose)
    vcf.encode_finalise(zarr_path)
372+
373+
298374
@click.command(name="convert")
299375
@vcfs
300376
@new_zarr_path
@@ -382,6 +458,9 @@ def vcf2zarr():
382458
vcf2zarr.add_command(dexplode_init)
383459
vcf2zarr.add_command(dexplode_partition)
384460
vcf2zarr.add_command(dexplode_finalise)
461+
vcf2zarr.add_command(dencode_init)
462+
vcf2zarr.add_command(dencode_partition)
463+
vcf2zarr.add_command(dencode_finalise)
385464

386465

387466
@click.command(name="convert")

bio2zarr/vcf.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1947,6 +1947,65 @@ def encode(
19471947
vzw.finalise()
19481948

19491949

1950+
def encode_init(
    icf_path,
    zarr_path,
    target_num_partitions,
    *,
    schema_path=None,
    variants_chunk_size=None,
    samples_chunk_size=None,
    max_v_chunks=None,
    dimension_separator=None,
    max_memory=None,
    worker_processes=1,
    show_progress=False,
):
    """
    Placeholder for the initialisation step of distributed encoding.

    Not implemented yet: every argument is accepted and ignored, nothing is
    read or written, and the function returns None.
    """
    # TODO: implement. The commented-out draft below is kept for reference.
    # icf = IntermediateColumnarFormat(icf_path)
    # if schema_path is None:
    #     schema = VcfZarrSchema.generate(
    #         icf,
    #         variants_chunk_size=variants_chunk_size,
    #         samples_chunk_size=samples_chunk_size,
    #     )
    # else:
    #     logger.info(f"Reading schema from {schema_path}")
    #     if variants_chunk_size is not None or samples_chunk_size is not None:
    #         raise ValueError(
    #             "Cannot specify schema along with chunk sizes"
    #         )  # NEEDS TEST
    #     with open(schema_path) as f:
    #         schema = VcfZarrSchema.fromjson(f.read())
    # zarr_path = pathlib.Path(zarr_path)
    # if zarr_path.exists():
    #     logger.warning(f"Deleting existing {zarr_path}")
    #     shutil.rmtree(zarr_path)
    # vzw = VcfZarrWriter(zarr_path, icf, schema,
    #     dimension_separator=dimension_separator)
    # vzw.init()
    # vzw.encode(
    #     max_v_chunks=max_v_chunks,
    #     worker_processes=worker_processes,
    #     max_memory=max_memory,
    #     show_progress=show_progress,
    # )
    # vzw.finalise()
1995+
1996+
1997+
def encode_partition(zarr_path, partition, *, show_progress=False, worker_processes=1):
    """
    Build a VcfZarrWriter for ``zarr_path`` and ask it to encode ``partition``.

    ``show_progress`` and ``worker_processes`` are forwarded unchanged to the
    writer's ``encode_partition`` method. Returns None.
    """
    VcfZarrWriter(zarr_path).encode_partition(
        partition,
        show_progress=show_progress,
        worker_processes=worker_processes,
    )
2002+
2003+
2004+
def encode_finalise(zarr_path):
    """
    Build a VcfZarrWriter for ``zarr_path`` and call its ``finalise`` method.

    Returns None.
    """
    VcfZarrWriter(zarr_path).finalise()
2007+
2008+
19502009
def convert(
19512010
vcfs,
19522011
out_path,

tests/test_cli.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
compressor=None,
2323
show_progress=True,
2424
)
25+
2526
DEFAULT_ENCODE_ARGS = dict(
2627
schema_path=None,
2728
variants_chunk_size=None,
@@ -32,6 +33,14 @@
3233
show_progress=True,
3334
)
3435

36+
# Keyword arguments (besides target_num_partitions) that the dencode-init
# test expects to be forwarded to vcf.encode_init when no options are given.
DEFAULT_DENCODE_INIT_ARGS = {
    "schema_path": None,
    "variants_chunk_size": None,
    "samples_chunk_size": None,
    "max_v_chunks": None,
    "show_progress": True,
}
43+
3544

3645
class TestWithMocks:
3746
vcf_path = "tests/data/vcf/sample.vcf.gz"
@@ -385,6 +394,55 @@ def test_encode(self, mocked, tmp_path):
385394
**DEFAULT_ENCODE_ARGS,
386395
)
387396

397+
@mock.patch("bio2zarr.vcf.encode_init", return_value=10)
def test_dencode(self, mocked, tmp_path):
    """dencode-init forwards its arguments and echoes the returned count."""
    # The ICF directory is created up front; the Zarr output path is not.
    icf_path = tmp_path / "icf"
    icf_path.mkdir()
    zarr_path = tmp_path / "zarr"

    cli_runner = ct.CliRunner(mix_stderr=False)
    command_line = f"dencode-init {icf_path} {zarr_path} 10"
    outcome = cli_runner.invoke(cli.vcf2zarr, command_line, catch_exceptions=False)

    assert outcome.exit_code == 0
    # The mocked return value (10) is what the command prints.
    assert outcome.stdout == "10\n"
    assert len(outcome.stderr) == 0
    mocked.assert_called_once_with(
        str(icf_path),
        str(zarr_path),
        target_num_partitions=10,
        **DEFAULT_DENCODE_INIT_ARGS,
    )
417+
418+
@mock.patch("bio2zarr.vcf.encode_partition")
def test_vcf_dencode_partition(self, mocked, tmp_path):
    """dencode-partition forwards the path and partition index, prints nothing."""
    # The Zarr directory is created before the command is invoked.
    zarr_path = tmp_path / "zarr"
    zarr_path.mkdir()

    cli_runner = ct.CliRunner(mix_stderr=False)
    command_line = f"dencode-partition {zarr_path} 1"
    outcome = cli_runner.invoke(cli.vcf2zarr, command_line, catch_exceptions=False)

    assert outcome.exit_code == 0
    assert len(outcome.stdout) == 0
    assert len(outcome.stderr) == 0
    # NOTE(review): this reuses the *dexplode* partition defaults for a
    # dencode command; confirm they match what dencode_partition passes.
    mocked.assert_called_once_with(
        str(zarr_path),
        1,
        **DEFAULT_DEXPLODE_PARTITION_ARGS,
    )
434+
435+
@mock.patch("bio2zarr.vcf.encode_finalise")
def test_vcf_dencode_finalise(self, mocked, tmp_path):
    """dencode-finalise forwards the path to vcf.encode_finalise, prints nothing."""
    cli_runner = ct.CliRunner(mix_stderr=False)
    command_line = f"dencode-finalise {tmp_path}"
    outcome = cli_runner.invoke(cli.vcf2zarr, command_line, catch_exceptions=False)

    assert outcome.exit_code == 0
    assert len(outcome.stdout) == 0
    assert len(outcome.stderr) == 0
    mocked.assert_called_once_with(str(tmp_path))
445+
388446
@mock.patch("bio2zarr.vcf.convert")
389447
def test_convert_vcf(self, mocked):
390448
runner = ct.CliRunner(mix_stderr=False)

0 commit comments

Comments
 (0)