Skip to content

Commit 307f2c0

Browse files
committed
Address comments
1 parent 1f3cd17 commit 307f2c0

File tree

4 files changed

+78
-49
lines changed

4 files changed

+78
-49
lines changed

bio2zarr/plink.py

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,19 @@ def generate_schema(
5858
m = self.bed.sid_count
5959
logging.info(f"Scanned plink with {n} samples and {m} variants")
6060

61-
# FIXME
62-
if samples_chunk_size is None:
63-
samples_chunk_size = 1000
64-
if variants_chunk_size is None:
65-
variants_chunk_size = 10_000
61+
schema_instance = schema.VcfZarrSchema(
62+
format_version=schema.ZARR_SCHEMA_FORMAT_VERSION,
63+
samples_chunk_size=samples_chunk_size,
64+
variants_chunk_size=variants_chunk_size,
65+
fields=[],
66+
samples=[schema.Sample(id=sample) for sample in self.bed.iid],
67+
contigs=[],
68+
filters=[],
69+
)
6670

6771
logger.info(
68-
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
72+
"Generating schema with chunks="
73+
f"{schema_instance.variants_chunk_size, schema_instance.samples_chunk_size}"
6974
)
7075

7176
array_specs = [
@@ -75,7 +80,7 @@ def generate_schema(
7580
dtype="i4",
7681
shape=[m],
7782
dimensions=["variants"],
78-
chunks=[variants_chunk_size],
83+
chunks=[schema_instance.variants_chunk_size],
7984
description=None,
8085
),
8186
schema.ZarrArraySpec.new(
@@ -84,7 +89,7 @@ def generate_schema(
8489
dtype="O",
8590
shape=[m, 2],
8691
dimensions=["variants", "alleles"],
87-
chunks=[variants_chunk_size, 2],
92+
chunks=[schema_instance.variants_chunk_size, 2],
8893
description=None,
8994
),
9095
schema.ZarrArraySpec.new(
@@ -93,7 +98,10 @@ def generate_schema(
9398
dtype="bool",
9499
shape=[m, n],
95100
dimensions=["variants", "samples"],
96-
chunks=[variants_chunk_size, samples_chunk_size],
101+
chunks=[
102+
schema_instance.variants_chunk_size,
103+
schema_instance.samples_chunk_size,
104+
],
97105
description=None,
98106
),
99107
schema.ZarrArraySpec.new(
@@ -102,7 +110,11 @@ def generate_schema(
102110
dtype="i1",
103111
shape=[m, n, 2],
104112
dimensions=["variants", "samples", "ploidy"],
105-
chunks=[variants_chunk_size, samples_chunk_size, 2],
113+
chunks=[
114+
schema_instance.variants_chunk_size,
115+
schema_instance.samples_chunk_size,
116+
2,
117+
],
106118
description=None,
107119
),
108120
schema.ZarrArraySpec.new(
@@ -111,20 +123,16 @@ def generate_schema(
111123
dtype="bool",
112124
shape=[m, n, 2],
113125
dimensions=["variants", "samples", "ploidy"],
114-
chunks=[variants_chunk_size, samples_chunk_size, 2],
126+
chunks=[
127+
schema_instance.variants_chunk_size,
128+
schema_instance.samples_chunk_size,
129+
2,
130+
],
115131
description=None,
116132
),
117133
]
118-
119-
return schema.VcfZarrSchema(
120-
format_version=schema.ZARR_SCHEMA_FORMAT_VERSION,
121-
samples_chunk_size=samples_chunk_size,
122-
variants_chunk_size=variants_chunk_size,
123-
fields=array_specs,
124-
samples=[schema.Sample(id=sample) for sample in self.bed.iid],
125-
contigs=[],
126-
filters=[],
127-
)
134+
schema_instance.fields = array_specs
135+
return schema_instance
128136

129137

130138
def convert(

bio2zarr/schema.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,28 @@ class VcfZarrSchema(core.JsonDataclass):
184184
filters: list
185185
fields: list
186186

187+
def __init__(
    self,
    format_version: str,
    samples: list,
    contigs: list,
    filters: list,
    fields: list,
    variants_chunk_size: "int | None" = None,
    samples_chunk_size: "int | None" = None,
):
    """
    Store the schema components, filling in default chunk sizes.

    The chunk sizes may be passed as None (the ``generate_schema``
    callers forward their own unset options that way). In that case
    ``variants_chunk_size`` falls back to 1000 and ``samples_chunk_size``
    to 10_000. Any other value is stored exactly as given; no validation
    is performed here.

    :param format_version: Schema format version string.
    :param samples: List of sample entries.
    :param contigs: List of contig entries.
    :param filters: List of filter entries.
    :param fields: List of array specs; may be assigned after construction.
    :param variants_chunk_size: Chunk size along the variants dimension,
        or None for the default of 1000.
    :param samples_chunk_size: Chunk size along the samples dimension,
        or None for the default of 10_000.
    """
    # The chunk-size annotations are quoted so that the explicit
    # "or None" can be spelled without evaluating ``int | None`` at
    # definition time, which older interpreters do not support.
    self.format_version = format_version
    self.samples = samples
    self.contigs = contigs
    self.filters = filters
    self.fields = fields
    # Only None selects the default; the test is an identity check, not
    # a truthiness check, so every non-None value passes through as is.
    if variants_chunk_size is None:
        variants_chunk_size = 1000
    self.variants_chunk_size = variants_chunk_size
    if samples_chunk_size is None:
        samples_chunk_size = 10_000
    self.samples_chunk_size = samples_chunk_size
208+
187209
def validate(self):
188210
"""
189211
Checks that the schema is well-formed and within required limits.

bio2zarr/vcf2zarr/icf.py

Lines changed: 25 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@
1515
import numcodecs
1616
import numpy as np
1717

18-
from bio2zarr import schema
19-
20-
from .. import constants, core, provenance, vcf_utils, writer
18+
from .. import constants, core, provenance, schema, vcf_utils, writer
2119

2220
logger = logging.getLogger(__name__)
2321

@@ -1029,28 +1027,33 @@ def iter_genotypes(self, shape, start, stop):
10291027
def generate_schema(
10301028
self, variants_chunk_size=None, samples_chunk_size=None, local_alleles=None
10311029
):
1032-
# Import schema here to avoid circular import
1033-
from bio2zarr import schema
1034-
10351030
m = self.num_records
10361031
n = self.num_samples
1037-
if samples_chunk_size is None:
1038-
samples_chunk_size = 10_000
1039-
if variants_chunk_size is None:
1040-
variants_chunk_size = 1000
10411032
if local_alleles is None:
10421033
local_alleles = False
1034+
1035+
schema_instance = schema.VcfZarrSchema(
1036+
format_version=schema.ZARR_SCHEMA_FORMAT_VERSION,
1037+
samples_chunk_size=samples_chunk_size,
1038+
variants_chunk_size=variants_chunk_size,
1039+
fields=[],
1040+
samples=self.metadata.samples,
1041+
contigs=self.metadata.contigs,
1042+
filters=self.metadata.filters,
1043+
)
1044+
10431045
logger.info(
1044-
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
1046+
"Generating schema with chunks="
1047+
f"{schema_instance.variants_chunk_size, schema_instance.samples_chunk_size}"
10451048
)
10461049

10471050
def spec_from_field(field, array_name=None):
10481051
return schema.ZarrArraySpec.from_field(
10491052
field,
10501053
num_samples=n,
10511054
num_variants=m,
1052-
samples_chunk_size=samples_chunk_size,
1053-
variants_chunk_size=variants_chunk_size,
1055+
samples_chunk_size=schema_instance.samples_chunk_size,
1056+
variants_chunk_size=schema_instance.variants_chunk_size,
10541057
array_name=array_name,
10551058
)
10561059

@@ -1069,7 +1072,7 @@ def fixed_field_spec(
10691072
shape=shape,
10701073
description="",
10711074
dimensions=dimensions,
1072-
chunks=chunks or [variants_chunk_size],
1075+
chunks=chunks or [schema_instance.variants_chunk_size],
10731076
)
10741077

10751078
alt_field = self.fields["ALT"]
@@ -1085,14 +1088,14 @@ def fixed_field_spec(
10851088
dtype="bool",
10861089
shape=(m, self.metadata.num_filters),
10871090
dimensions=["variants", "filters"],
1088-
chunks=(variants_chunk_size, self.metadata.num_filters),
1091+
chunks=(schema_instance.variants_chunk_size, self.metadata.num_filters),
10891092
),
10901093
fixed_field_spec(
10911094
name="variant_allele",
10921095
dtype="O",
10931096
shape=(m, max_alleles),
10941097
dimensions=["variants", "alleles"],
1095-
chunks=(variants_chunk_size, max_alleles),
1098+
chunks=(schema_instance.variants_chunk_size, max_alleles),
10961099
),
10971100
fixed_field_spec(
10981101
name="variant_id",
@@ -1127,7 +1130,10 @@ def fixed_field_spec(
11271130
if gt_field is not None and n > 0:
11281131
ploidy = max(gt_field.summary.max_number - 1, 1)
11291132
shape = [m, n]
1130-
chunks = [variants_chunk_size, samples_chunk_size]
1133+
chunks = [
1134+
schema_instance.variants_chunk_size,
1135+
schema_instance.samples_chunk_size,
1136+
]
11311137
dimensions = ["variants", "samples"]
11321138
array_specs.append(
11331139
schema.ZarrArraySpec.new(
@@ -1169,15 +1175,8 @@ def fixed_field_spec(
11691175
if local_alleles:
11701176
array_specs = convert_local_allele_field_types(array_specs)
11711177

1172-
return schema.VcfZarrSchema(
1173-
format_version=schema.ZARR_SCHEMA_FORMAT_VERSION,
1174-
samples_chunk_size=samples_chunk_size,
1175-
variants_chunk_size=variants_chunk_size,
1176-
fields=array_specs,
1177-
samples=self.metadata.samples,
1178-
contigs=self.metadata.contigs,
1179-
filters=self.metadata.filters,
1180-
)
1178+
schema_instance.fields = array_specs
1179+
return schema_instance
11811180

11821181

11831182
@dataclasses.dataclass

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,9 @@ line-length = 88
8282
indent-width = 4
8383

8484
[tool.ruff.lint]
85-
select = ["E", "F", "B", "W", "I", "N", "UP", "A", "RUF", "PT"]
85+
select = ["E", "F", "B", "W", "I", "N", "UP", "A", "PT"]
8686
#Allow uppercase names for e.g. call_AD
87-
ignore = ["N806", "N802", "A001", "A002"]
87+
ignore = ["N806", "N802", "A001", "A002", "RUF"]
8888

8989
fixable = ["ALL"]
9090
unfixable = []

0 commit comments

Comments
 (0)