Skip to content

Report correct file sizes in inspect #143

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions bio2zarr/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import dataclasses
import logging
import multiprocessing
import os
import os.path
import threading
import time

Expand Down Expand Up @@ -45,6 +47,22 @@ def chunk_aligned_slices(z, n, max_chunks=None):
return slices


def du(path):
    """
    Return the total bytes stored at this path.

    The total is the size of ``path`` itself plus the size of every
    directory entry and file found beneath it, which is the same accounting
    as ``du -sb``. For a regular file the walk yields nothing, so the result
    is just the size of that file.

    Entries found during the walk are measured with ``os.lstat`` so that
    symbolic links are counted as links rather than followed: a dangling
    link no longer raises ``FileNotFoundError`` and a link pointing outside
    the tree does not pull the size of its target into the total.
    """
    total = os.path.getsize(path)
    # pathlib walk method doesn't exist until 3.12 :(
    for root, dirs, files in os.walk(path):
        # Directories occupy bytes too, so count them alongside the files.
        for name in dirs + files:
            total += os.lstat(os.path.join(root, name)).st_size
    # Lazy %-formatting: the message is only built if DEBUG is enabled.
    logger.debug("du(%s) = %s", path, total)
    return total


class SynchronousExecutor(cf.Executor):
def submit(self, fn, /, *args, **kwargs):
future = cf.Future()
Expand Down
7 changes: 3 additions & 4 deletions bio2zarr/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import logging
import math
import os
import os.path
import pathlib
import pickle
import shutil
Expand Down Expand Up @@ -1509,14 +1510,12 @@ class VcfZarr:
def __init__(self, path):
if not (path / ".zmetadata").exists():
raise ValueError("Not in VcfZarr format") # NEEDS TEST
self.path = path
self.root = zarr.open(path, mode="r")

def __repr__(self):
return repr(self.root) # NEEDS TEST

def summary_table(self):
data = []
arrays = [(a.nbytes_stored, a) for _, a in self.root.arrays()]
arrays = [(core.du(self.path / a.basename), a) for _, a in self.root.arrays()]
arrays.sort(key=lambda x: x[0])
for stored, array in reversed(arrays):
d = {
Expand Down
16 changes: 16 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,19 @@ def test_5_chunk_1(self, n, expected):
z = zarr.array(np.arange(5), chunks=1, dtype=int)
result = core.chunk_aligned_slices(z, n)
assert result == expected


@pytest.mark.parametrize(
("path", "expected"),
[
# NOTE: this data was generated using du -sb on a Linux system.
# It *might* work in CI, but it may well not either, as it's
# probably dependent on a whole bunch of things. Expect to fail
# at some point.
("tests/data", 4630726),
("tests/data/vcf", 4618589),
("tests/data/vcf/sample.vcf.gz", 1089),
],
)
def test_du(path, expected):
"""
Check that core.du(path) equals the hard-coded byte total for each
parametrized path (two directories and one single file). The paths are
relative, so the result depends on the working directory the tests are
run from.
"""
assert core.du(path) == expected
Loading