Skip to content

Remove schema data #343

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# 0.1.6 2025-0X-XX

Breaking changes

- Remove explicit sample, contig and filter lists from the schema.
Existing ICFs will need to be recreated. (#343)

# 0.1.5 2025-03-31

- Add support for merging contig IDs across multiple VCFs (#335)
Expand Down
32 changes: 16 additions & 16 deletions bio2zarr/icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,9 +877,9 @@ def convert_local_allele_field_types(fields):
return [*fields, la]


class IntermediateColumnarFormat(collections.abc.Mapping):
class IntermediateColumnarFormat(vcz.Source):
def __init__(self, path):
self.path = pathlib.Path(path)
self._path = pathlib.Path(path)
# TODO raise a more informative error here telling people this
# directory is either a WIP or the wrong format.
with open(self.path / "metadata.json") as f:
Expand All @@ -902,20 +902,11 @@ def __init__(self, path):

def __repr__(self):
return (
f"IntermediateColumnarFormat(fields={len(self)}, "
f"IntermediateColumnarFormat(fields={len(self.fields)}, "
f"partitions={self.num_partitions}, "
f"records={self.num_records}, path={self.path})"
)

def __getitem__(self, key):
return self.fields[key]

def __iter__(self):
return iter(self.fields)

def __len__(self):
return len(self.fields)

def summary_table(self):
data = []
for name, icf_field in self.fields.items():
Expand All @@ -934,6 +925,10 @@ def summary_table(self):
data.append(d)
return data

@property
def path(self):
    # Path to the ICF directory; backs the abstract `path` required by
    # the vcz.Source interface this class now implements.
    return self._path

@property
def num_records(self):
    # Total variant record count, as recorded in the ICF metadata.json.
    return self.metadata.num_records
Expand All @@ -944,7 +939,15 @@ def num_partitions(self):

@property
def samples(self):
return [sample.id for sample in self.metadata.samples]
return self.metadata.samples

@property
def contigs(self):
    # Contig definitions stored in the ICF metadata (replaces the contig
    # list formerly carried on the schema).
    return self.metadata.contigs

@property
def filters(self):
    # FILTER definitions stored in the ICF metadata (replaces the filter
    # list formerly carried on the schema).
    return self.metadata.filters

@property
def num_samples(self):
Expand Down Expand Up @@ -1037,9 +1040,6 @@ def generate_schema(
samples_chunk_size=samples_chunk_size,
variants_chunk_size=variants_chunk_size,
fields=[],
samples=self.metadata.samples,
contigs=self.metadata.contigs,
filters=self.metadata.filters,
)

logger.info(
Expand Down
27 changes: 18 additions & 9 deletions bio2zarr/plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,26 @@
logger = logging.getLogger(__name__)


class PlinkFormat:
class PlinkFormat(vcz.Source):
def __init__(self, path):
self.path = path
self._path = pathlib.Path(path)
self.bed = bed_reader.open_bed(path, num_threads=1, count_A1=False)
self.num_records = self.bed.sid_count
self.samples = list(self.bed.iid)
self.num_samples = len(self.samples)
self.root_attrs = {}

@property
def path(self):
    # Path to the plink fileset; backs the abstract `path` required by
    # the vcz.Source interface this class now implements.
    return self._path

@property
def num_records(self):
    # One record per variant. NOTE(review): assumes bed_reader's
    # `sid_count` is the variant count — confirm against bed_reader docs.
    return self.bed.sid_count

@property
def samples(self):
    # Builds a fresh list of vcz.Sample objects from the bed individual
    # IDs (`iid`) on every access — callers iterating repeatedly may want
    # to cache the result.
    return [vcz.Sample(id=sample) for sample in self.bed.iid]

@property
def num_samples(self):
    """Number of samples (individuals) in the plink fileset.

    Counts ``self.bed.iid`` directly instead of ``len(self.samples)``:
    the ``samples`` property materializes a throwaway list of
    ``vcz.Sample`` objects on every access, which is wasted work when
    only the length is needed. The value is identical, since ``samples``
    builds exactly one entry per element of ``self.bed.iid``.
    """
    return len(self.bed.iid)

def iter_alleles(self, start, stop, num_alleles):
ref_field = self.bed.allele_1
Expand Down Expand Up @@ -62,9 +74,6 @@ def generate_schema(
samples_chunk_size=samples_chunk_size,
variants_chunk_size=variants_chunk_size,
fields=[],
samples=[vcz.Sample(id=sample) for sample in self.bed.iid],
contigs=[],
filters=[],
)

logger.info(
Expand Down
102 changes: 75 additions & 27 deletions bio2zarr/vcz.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import abc
import dataclasses
import json
import logging
Expand All @@ -13,7 +14,7 @@

logger = logging.getLogger(__name__)

ZARR_SCHEMA_FORMAT_VERSION = "0.4"
ZARR_SCHEMA_FORMAT_VERSION = "0.5"
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)

_fixed_field_descriptions = {
Expand All @@ -28,6 +29,62 @@
}


class Source(abc.ABC):
    """Abstract interface for inputs convertible to VCF Zarr.

    Implementations in this change are IntermediateColumnarFormat
    (icf.py) and PlinkFormat (plink.py). The encoder consults the
    optional ``contigs``/``filters`` properties and skips the
    corresponding arrays when they return None.
    """

    @property
    @abc.abstractmethod
    def path(self):
        # Filesystem path of the underlying source data.
        pass

    @property
    @abc.abstractmethod
    def num_records(self):
        # Total number of variant records in the source.
        pass

    @property
    @abc.abstractmethod
    def num_samples(self):
        # Number of samples in the source.
        pass

    @property
    @abc.abstractmethod
    def samples(self):
        # Sequence of sample objects (e.g. vcz.Sample, as returned by
        # PlinkFormat.samples) — one per sample.
        pass

    @property
    def contigs(self):
        # Optional: sequence of contig objects, or None when the source
        # has no contig information (encoder then skips contig arrays).
        return None

    @property
    def filters(self):
        # Optional: sequence of filter objects, or None when the source
        # has no FILTER information (encoder then skips filter arrays).
        return None

    @property
    def root_attrs(self):
        # Extra attributes to set on the root Zarr group; none by default.
        return {}

    @abc.abstractmethod
    def iter_alleles(self, start, stop, num_alleles):
        # Iterate allele data for records in [start, stop).
        pass

    @abc.abstractmethod
    def iter_genotypes(self, start, stop, num_alleles):
        # Iterate genotype data for records in [start, stop).
        pass

    def iter_id(self, start, stop):
        # Default no-op for sources without variant IDs; returns None.
        # NOTE(review): callers may expect an iterable here — confirm how
        # the encoder handles the None case.
        return

    def iter_contig(self, start, stop):
        # Default no-op for sources without per-record contig indexes;
        # returns None (same caveat as iter_id).
        return

    @abc.abstractmethod
    def iter_field(self, field_name, shape, start, stop):
        # Iterate values of the named field for records in [start, stop).
        pass

    @abc.abstractmethod
    def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
        # Produce the VcfZarrSchema describing the arrays to create.
        pass


@dataclasses.dataclass
class ZarrArraySpec:
name: str
Expand Down Expand Up @@ -182,25 +239,16 @@ class VcfZarrSchema(core.JsonDataclass):
format_version: str
samples_chunk_size: int
variants_chunk_size: int
samples: list
contigs: list
filters: list
fields: list

def __init__(
self,
format_version: str,
samples: list,
contigs: list,
filters: list,
fields: list,
variants_chunk_size: int = None,
samples_chunk_size: int = None,
):
self.format_version = format_version
self.samples = samples
self.contigs = contigs
self.filters = filters
self.fields = fields
if variants_chunk_size is None:
variants_chunk_size = 1000
Expand Down Expand Up @@ -238,9 +286,6 @@ def fromdict(d):
f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
)
ret = VcfZarrSchema(**d)
ret.samples = [Sample(**sd) for sd in d["samples"]]
ret.contigs = [Contig(**sd) for sd in d["contigs"]]
ret.filters = [Filter(**sd) for sd in d["filters"]]
ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
return ret

Expand Down Expand Up @@ -474,8 +519,10 @@ def init(

# Doing this synchronously - this is fine surely
self.encode_samples(root)
self.encode_filter_id(root)
self.encode_contig_id(root)
if self.source.filters is not None:
self.encode_filter_id(root)
if self.source.contigs is not None:
self.encode_contigs(root)

self.wip_path.mkdir()
self.arrays_path.mkdir()
Expand All @@ -502,33 +549,33 @@ def init(
)

def encode_samples(self, root):
if [s.id for s in self.schema.samples] != self.source.samples:
raise ValueError("Subsetting or reordering samples not supported currently")
samples = self.source.samples
array = root.array(
"sample_id",
data=[sample.id for sample in self.schema.samples],
shape=len(self.schema.samples),
data=[sample.id for sample in samples],
shape=len(samples),
dtype="str",
compressor=DEFAULT_ZARR_COMPRESSOR,
chunks=(self.schema.samples_chunk_size,),
)
array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
logger.debug("Samples done")

def encode_contig_id(self, root):
def encode_contigs(self, root):
contigs = self.source.contigs
array = root.array(
"contig_id",
data=[contig.id for contig in self.schema.contigs],
shape=len(self.schema.contigs),
data=[contig.id for contig in contigs],
shape=len(contigs),
dtype="str",
compressor=DEFAULT_ZARR_COMPRESSOR,
)
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
if all(contig.length is not None for contig in self.schema.contigs):
if all(contig.length is not None for contig in contigs):
array = root.array(
"contig_length",
data=[contig.length for contig in self.schema.contigs],
shape=len(self.schema.contigs),
data=[contig.length for contig in contigs],
shape=len(contigs),
dtype=np.int64,
compressor=DEFAULT_ZARR_COMPRESSOR,
)
Expand All @@ -537,10 +584,11 @@ def encode_contig_id(self, root):
def encode_filter_id(self, root):
# TODO need a way to store description also
# https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
filters = self.source.filters
array = root.array(
"filter_id",
data=[filt.id for filt in self.schema.filters],
shape=len(self.schema.filters),
data=[filt.id for filt in filters],
shape=len(filters),
dtype="str",
compressor=DEFAULT_ZARR_COMPRESSOR,
)
Expand Down
Loading