From 8e97bc3f845977dcc0b0ed77af54075c14127a0a Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 17 Sep 2024 16:10:59 +0200 Subject: [PATCH 01/17] Draft bed2zarr code Draft code to convert Bed3-format to Zarr. --- bio2zarr/__main__.py | 1 + bio2zarr/bed2zarr.py | 116 ++++++++++++++++++ bio2zarr/cli.py | 37 +++++- pyproject.toml | 1 + .../1kg_2020_chr20_annotations_mask.bed.gz | Bin 0 -> 95 bytes ...1kg_2020_chr20_annotations_mask.bed.gz.csi | Bin 0 -> 122 bytes tests/data/bed/sample_mask.bed.gz | Bin 0 -> 84 bytes tests/data/bed/sample_mask.bed.gz.csi | Bin 0 -> 149 bytes tests/test_bed.py | 50 ++++++++ 9 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 bio2zarr/bed2zarr.py create mode 100644 tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz create mode 100644 tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi create mode 100644 tests/data/bed/sample_mask.bed.gz create mode 100644 tests/data/bed/sample_mask.bed.gz.csi create mode 100644 tests/test_bed.py diff --git a/bio2zarr/__main__.py b/bio2zarr/__main__.py index cab080b6..024b9ab2 100644 --- a/bio2zarr/__main__.py +++ b/bio2zarr/__main__.py @@ -15,6 +15,7 @@ def bio2zarr(): # is handy for development and for those whose PATHs aren't set # up in the right way. bio2zarr.add_command(cli.vcf2zarr_main) +bio2zarr.add_command(cli.bed2zarr_main) bio2zarr.add_command(cli.plink2zarr) bio2zarr.add_command(cli.vcfpartition) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py new file mode 100644 index 00000000..8328c252 --- /dev/null +++ b/bio2zarr/bed2zarr.py @@ -0,0 +1,116 @@ +import dataclasses +import numpy as np +import pathlib +import gzip +import zarr +from . import core + + +# see https://samtools.github.io/hts-specs/BEDv1.pdf +@dataclasses.dataclass +class Bed3: + """BED3 genomic region with chromosome, start, and end. Intervals + are 0-based, half-open.""" + + chrom: str + start: int + end: int + + @property + def width(self): + """Width of the region.""" + return self.end - self.start + + def __len__(self): + return self.width + + def mask(self, invert=False): + """Create a mask for the region. The mask is an array of 1's + (0's if inverted).""" + func = np.zeros if invert else np.ones + return func(self.width, dtype=np.uint8) + + +class BedReader: + def __init__(self, bed_path): + self.bed_path = pathlib.Path(bed_path) + + def __enter__(self): + if self.bed_path.suffix == ".gz": + self.fh = gzip.open(self.bed_path, "rt") + else: + self.fh = self.bed_path.open("rt") + + return self + + def __exit__(self, *args): + self.fh.close() + + +# Here we are assuming that we write a mask. However, the BED file +# could represent other things, such as scores, and there could be up +# to 9 columns, in which case more fields (aka data arrays?) would be +# needed. +def bed2zarr( + bed_path, + zarr_path, + bed_array="bed_mask", # More generic name? + show_progress=False, +): + # 1. Make sure the bed file is gzipped and indexed + bed_path = pathlib.Path(bed_path) + + if bed_path.suffix != ".gz": + raise ValueError("BED file must be gzipped.") + if ( + not bed_path.with_suffix(".gz.csi").exists() + or not bed_path.with_suffix(".gz.tbi").exists() + ): + raise ValueError("BED file must be indexed.") + + # 2. Make sure there are contig lengths + store = zarr.open(zarr_path) + if "contig_length" not in store: + raise ValueError( + ( + "No contig lengths in Zarr store. Contig lengths must be" + " present in the Zarr store before writing Bed entries." + ) + ) + # 2b. 
Make chromosome to integer mapping + chrom_d = { + k: v for k, v in zip(store["contig_id"], np.arange(len(store["contig_id"]))) + } + # 2c. Make cumulative index of contig lengths + contig_indices = np.insert(np.cumsum(store["contig_length"])[:-1], 0, 0) + + # 3. Init the zarr group with the contig lengths + # bed_array and bed_array_contig are of equal lengths = total genome + if bed_array not in store: + bed_array_contig = f"{bed_array}_contig" + dtype = core.min_int_dtype(0, len(store["contig_id"])) + n_bases = np.sum(store["contig_length"]) + + store.create_dataset(bed_array, fill_value=0, dtype=dtype, shape=(n_bases,)) + store.create_dataset( + bed_array_contig, + data=np.repeat( + np.arange(len(store["contig_id"])), store["contig_length"] + ).astype(dtype), + dtype=dtype, + shape=(n_bases,), + ) + + # 4. Read the bed file and write the mask to the zarr dataset, + # updating for each entry; many I/O operations; better read entire + # file, store regions by chromosomes and generate index by + # chromosome for all regions? + with BedReader(bed_path) as br: + for line in br.fh: + chrom, start, end = line.strip().split("\t") + i = chrom_d[chrom] + start = int(start) + contig_indices[i] + end = int(end) + contig_indices[i] + bed = Bed3(chrom, start, end) + mask = bed.mask() + store[bed_array][start:end] = mask diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 0722521d..816de45b 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -8,7 +8,7 @@ import numcodecs import tabulate -from . import plink, provenance, vcf2zarr, vcf_utils +from . import plink, provenance, vcf2zarr, vcf_utils, bed2zarr from .vcf2zarr import icf as icf_mod logger = logging.getLogger(__name__) @@ -574,6 +574,41 @@ def plink2zarr(): plink2zarr.add_command(convert_plink) +@click.command +@version +@click.argument( + "bed_path", + type=click.Path(exists=True, dir_okay=False), +) +@zarr_path +@click.argument( + "zarr_field", + type=str, +) +@verbose +@force +@progress +def bed2zarr_main(bed_path, zarr_path, bed_array, verbose, force, progress): + """ + Convert BED file to the Zarr format. The BED regions will be + converted to binary-encoded arrays whose length is equal to the + length of the reference genome. The BED file regions are used to + mask the reference genome, where the masked regions are set to 1 + and the unmasked regions are set to 0. + + The BED file must be compressed and tabix-indexed. 
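+
+    Example (hypothetical paths; the arguments are the BED file, the
+    Zarr store, and the name of the mask array to write):
+
+        bed2zarr sample_mask.bed.gz sample.zarr bed_mask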
+ """ + setup_logging(verbose) + path = pathlib.Path(zarr_path) / bed_array + check_overwrite_dir(path, force) + bed2zarr( + bed_path, + zarr_path, + bed_array, + show_progress=progress, + ) + + @click.command @version @vcfs diff --git a/pyproject.toml b/pyproject.toml index 08bc9742..b57758b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ repository = "https://github.com/sgkit-dev/bio2zarr" documentation = "https://sgkit-dev.github.io/bio2zarr/" [project.scripts] +bed2zarr = "bio2zarr.cli:bed2zarr" vcf2zarr = "bio2zarr.cli:vcf2zarr_main" vcfpartition = "bio2zarr.cli:vcfpartition" diff --git a/tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz b/tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz new file mode 100644 index 0000000000000000000000000000000000000000..8713cdfcc239aef2781152275d05ae7496a2d52d GIT binary patch literal 95 zcmb2|=3rp}f&Xj_PR>jWP7JOm7BV&%2(TE`&Q-JdBDrGOuHW1$6C;05@$oS3-RkeT eI{en#`Ry?YHQfgS7|nq?<jWu?)EjW<_xS2hP(_4JO|$S`FcM%!1-_L1U=2GI+?3Wu2g!QV|S`t T{6-O|TpmrIGy^l(7!UyfvL6%P literal 0 HcmV?d00001 diff --git a/tests/data/bed/sample_mask.bed.gz.csi b/tests/data/bed/sample_mask.bed.gz.csi new file mode 100644 index 0000000000000000000000000000000000000000..2d96b9276eaa8425768351223ab2b83d822bc8f3 GIT binary patch literal 149 zcmb2|=3rp}f&Xj_PR>jW6%3_k4EdT31Q-r(ysY4QiaFykyUNN*R-#)>_H>IYGqRm- z%6(U|`u6_|t?W`N0xT|fK5V)4;P2;3$?4TD=BhIvJlJ=(&g-}3ydR(3FY46ZyWO4k lcKxiS`Q^WDbFb~oOG`_)eOF+^$iN_vX0bE_GuRy<0s!@KH-rEH literal 0 HcmV?d00001 diff --git a/tests/test_bed.py b/tests/test_bed.py new file mode 100644 index 00000000..689a60bd --- /dev/null +++ b/tests/test_bed.py @@ -0,0 +1,50 @@ +import pytest +import sgkit as sg +import xarray.testing as xt +import zarr +from bio2zarr import bed2zarr, vcf2zarr + + +class Test1kgBed: + data_path = "tests/data/vcf/1kg_2020_chr20_annotations.bcf" + bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" + csi_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi" + + @pytest.fixture(scope="module") + def icf(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "1kg_2020.exploded" + vcf2zarr.explode(out, [self.data_path]) + return out + + @pytest.fixture(scope="module") + def zarr(self, icf, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "1kg_2020.zarr" + vcf2zarr.encode(icf, out) + return out + + def test_add_mask_chr20(self, zarr): + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr, show_progress=True) + + +class TestSampleBed: + data_path = "tests/data/vcf/sample.bcf" + bed_path = "tests/data/bed/sample_mask.bed.gz" + csi_path = "tests/data/bed/sample_mask.bed.gz.csi" + + @pytest.fixture(scope="module") + def icf(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "sample.exploded" + vcf2zarr.explode(out, [self.data_path]) + return out + + @pytest.fixture(scope="module") + def zarr(self, icf, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "sample.zarr" + vcf2zarr.encode(icf, out) + return out + + def test_add_mask_sample(self, zarr): + with pytest.raises(ValueError): + bed2zarr.bed2zarr( + bed_path=self.bed_path, zarr_path=zarr, show_progress=True + ) From 49b6f6af911b40299c1621db684403ce688b9429 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 17 Sep 2024 16:34:54 +0200 Subject: [PATCH 02/17] Fix bed2zarr call --- bio2zarr/bed2zarr.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 8328c252..852fa7d5 100644 --- 
a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -91,7 +91,7 @@ def bed2zarr( dtype = core.min_int_dtype(0, len(store["contig_id"])) n_bases = np.sum(store["contig_length"]) - store.create_dataset(bed_array, fill_value=0, dtype=dtype, shape=(n_bases,)) + store.create_dataset(bed_array, fill_value=0, dtype="u1", shape=(n_bases,)) store.create_dataset( bed_array_contig, data=np.repeat( diff --git a/pyproject.toml b/pyproject.toml index b57758b8..2883186f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ repository = "https://github.com/sgkit-dev/bio2zarr" documentation = "https://sgkit-dev.github.io/bio2zarr/" [project.scripts] -bed2zarr = "bio2zarr.cli:bed2zarr" +bed2zarr = "bio2zarr.cli:bed2zarr_main" vcf2zarr = "bio2zarr.cli:vcf2zarr_main" vcfpartition = "bio2zarr.cli:vcfpartition" From 1c195c6807c42b1a4e1a87e711f2f1a84f5229f5 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 17 Sep 2024 16:37:27 +0200 Subject: [PATCH 03/17] Rename zarr_field to bed_array --- bio2zarr/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 816de45b..10addbf3 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -582,7 +582,7 @@ def plink2zarr(): ) @zarr_path @click.argument( - "zarr_field", + "bed_array", type=str, ) @verbose From 1e3efa1d9273dae179ea2cb52c1c953b24a731ca Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Wed, 18 Sep 2024 22:49:36 +0200 Subject: [PATCH 04/17] Refactor bed2zarr code base - use pandas for reading - write to isolated zarr archive - map BED columns to arrays named after BED specification (hts-specs) --- bio2zarr/bed2zarr.py | 179 ++++++++++++++++++++----------------------- bio2zarr/cli.py | 16 ++-- tests/test_bed.py | 51 ++++-------- 3 files changed, 103 insertions(+), 143 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 852fa7d5..a2c91698 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -1,116 +1,105 @@ import dataclasses -import numpy as np +import logging import pathlib -import gzip + +import numcodecs +import pandas as pd import zarr -from . import core +from . import core, provenance -# see https://samtools.github.io/hts-specs/BEDv1.pdf -@dataclasses.dataclass -class Bed3: - """BED3 genomic region with chromosome, start, and end. Intervals - are 0-based, half-open.""" +logger = logging.getLogger(__name__) - chrom: str - start: int - end: int +DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7) - @property - def width(self): - """Width of the region.""" - return self.end - self.start - def __len__(self): - return self.width +@dataclasses.dataclass +class BedMetadata(core.JsonDataclass): + fields: list - def mask(self, invert=False): - """Create a mask for the region. The mask is an array of 1's - (0's if inverted).""" - func = np.zeros if invert else np.ones - return func(self.width, dtype=np.uint8) +@dataclasses.dataclass +class BedField: + category: str + name: str + + +class BedZarrWriter: + def __init__(self, path): + self.path = pathlib.Path(path) + self.metadata = None + self.data = None + + def init( + self, + bed_path, + *, + show_progress=False, + ): + self.read(bed_path) + fields = mandatory_bed_field_definitions() + # FIXME: add optional fields; depends on the number of columns + # in the bed file. BED3 is the minimum. 
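+        # (One possible shape for that fix, sketched as a comment only:
+        #  have read() record the column count, then extend the list with
+        #  fields += optional_bed_field_definitions(num_cols - 3), using
+        #  the optional-field helper that a later patch in this series
+        #  introduces.)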
+ self.metadata = BedMetadata(fields) + self.path.mkdir() + store = zarr.DirectoryStore(self.path) + root = zarr.group(store=store) + root.attrs.update( + { + "bed_zarr_version": "0.1", + "source": f"bio2zarr-{provenance.__version__}", + } + ) + self.encode_mandatory_fields(root) + + # FIXME: error checking, number of columns, etc. + def read(self, bed_path): + logger.info("Reading bed file %s", bed_path) + first = pd.read_table(bed_path, header=None, nrows=1) + num_cols = len(first.columns) + if num_cols < 3: + raise ValueError(f"Expected at least 3 columns in bed file, got {num_cols}") + + # FIXME: support chunked reading + self.data = pd.read_table(bed_path, header=None).rename( + columns={0: "chrom", 1: "chromStart", 2: "chromEnd"} + ) -class BedReader: - def __init__(self, bed_path): - self.bed_path = pathlib.Path(bed_path) + def encode_mandatory_fields(self, root): + for field, dtype in zip( + ["chrom", "chromStart", "chromEnd"], ["str", "int", "int"] + ): + # FIXME: Check schema for chunks + array = root.array( + field, + self.data[field].values, + dtype=dtype, + compressor=DEFAULT_ZARR_COMPRESSOR, + chunks=(1000,), + ) + logger.debug("%s done", field) - def __enter__(self): - if self.bed_path.suffix == ".gz": - self.fh = gzip.open(self.bed_path, "rt") - else: - self.fh = self.bed_path.open("rt") - return self +# See https://samtools.github.io/hts-specs/BEDv1.pdf +def mandatory_bed_field_definitions(): + def make_field_def(name): + return BedField( + category="mandatory", + name=name, + ) - def __exit__(self, *args): - self.fh.close() + fields = [ + make_field_def("chrom"), + make_field_def("chromStart"), + make_field_def("chromEnd"), + ] + return fields -# Here we are assuming that we write a mask. However, the BED file -# could represent other things, such as scores, and there could be up -# to 9 columns, in which case more fields (aka data arrays?) would be -# needed. def bed2zarr( bed_path, zarr_path, - bed_array="bed_mask", # More generic name? show_progress=False, ): - # 1. Make sure the bed file is gzipped and indexed - bed_path = pathlib.Path(bed_path) - - if bed_path.suffix != ".gz": - raise ValueError("BED file must be gzipped.") - if ( - not bed_path.with_suffix(".gz.csi").exists() - or not bed_path.with_suffix(".gz.tbi").exists() - ): - raise ValueError("BED file must be indexed.") - - # 2. Make sure there are contig lengths - store = zarr.open(zarr_path) - if "contig_length" not in store: - raise ValueError( - ( - "No contig lengths in Zarr store. Contig lengths must be" - " present in the Zarr store before writing Bed entries." - ) - ) - # 2b. Make chromosome to integer mapping - chrom_d = { - k: v for k, v in zip(store["contig_id"], np.arange(len(store["contig_id"]))) - } - # 2c. Make cumulative index of contig lengths - contig_indices = np.insert(np.cumsum(store["contig_length"])[:-1], 0, 0) - - # 3. Init the zarr group with the contig lengths - # bed_array and bed_array_contig are of equal lengths = total genome - if bed_array not in store: - bed_array_contig = f"{bed_array}_contig" - dtype = core.min_int_dtype(0, len(store["contig_id"])) - n_bases = np.sum(store["contig_length"]) - - store.create_dataset(bed_array, fill_value=0, dtype="u1", shape=(n_bases,)) - store.create_dataset( - bed_array_contig, - data=np.repeat( - np.arange(len(store["contig_id"])), store["contig_length"] - ).astype(dtype), - dtype=dtype, - shape=(n_bases,), - ) - - # 4. 
Read the bed file and write the mask to the zarr dataset, - # updating for each entry; many I/O operations; better read entire - # file, store regions by chromosomes and generate index by - # chromosome for all regions? - with BedReader(bed_path) as br: - for line in br.fh: - chrom, start, end = line.strip().split("\t") - i = chrom_d[chrom] - start = int(start) + contig_indices[i] - end = int(end) + contig_indices[i] - bed = Bed3(chrom, start, end) - mask = bed.mask() - store[bed_array][start:end] = mask + writer = BedZarrWriter(zarr_path) + writer.init(bed_path, show_progress=show_progress) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 10addbf3..fe60971e 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -8,7 +8,7 @@ import numcodecs import tabulate -from . import plink, provenance, vcf2zarr, vcf_utils, bed2zarr +from . import bed2zarr, plink, provenance, vcf2zarr, vcf_utils from .vcf2zarr import icf as icf_mod logger = logging.getLogger(__name__) @@ -580,15 +580,11 @@ def plink2zarr(): "bed_path", type=click.Path(exists=True, dir_okay=False), ) -@zarr_path -@click.argument( - "bed_array", - type=str, -) +@new_zarr_path @verbose @force @progress -def bed2zarr_main(bed_path, zarr_path, bed_array, verbose, force, progress): +def bed2zarr_main(bed_path, zarr_path, verbose, force, progress): """ Convert BED file to the Zarr format. The BED regions will be converted to binary-encoded arrays whose length is equal to the @@ -599,12 +595,10 @@ def bed2zarr_main(bed_path, zarr_path, bed_array, verbose, force, progress): The BED file must be compressed and tabix-indexed. """ setup_logging(verbose) - path = pathlib.Path(zarr_path) / bed_array - check_overwrite_dir(path, force) - bed2zarr( + check_overwrite_dir(zarr_path, force) + bed2zarr.bed2zarr( bed_path, zarr_path, - bed_array, show_progress=progress, ) diff --git a/tests/test_bed.py b/tests/test_bed.py index 689a60bd..72d8ad93 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -1,50 +1,27 @@ import pytest -import sgkit as sg -import xarray.testing as xt import zarr -from bio2zarr import bed2zarr, vcf2zarr +from bio2zarr import bed2zarr -class Test1kgBed: - data_path = "tests/data/vcf/1kg_2020_chr20_annotations.bcf" - bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" - csi_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi" - - @pytest.fixture(scope="module") - def icf(self, tmp_path_factory): - out = tmp_path_factory.mktemp("data") / "1kg_2020.exploded" - vcf2zarr.explode(out, [self.data_path]) - return out +class TestBed: + bed_path = "tests/data/bed/sample_mask.bed.gz" - @pytest.fixture(scope="module") - def zarr(self, icf, tmp_path_factory): - out = tmp_path_factory.mktemp("data") / "1kg_2020.zarr" - vcf2zarr.encode(icf, out) + @pytest.fixture() + def zarr(self, tmp_path): + out = tmp_path / "sample_mask.zarr" return out - def test_add_mask_chr20(self, zarr): + def test_bed2zarr(self, zarr): bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr, show_progress=True) -class TestSampleBed: - data_path = "tests/data/vcf/sample.bcf" - bed_path = "tests/data/bed/sample_mask.bed.gz" - csi_path = "tests/data/bed/sample_mask.bed.gz.csi" - - @pytest.fixture(scope="module") - def icf(self, tmp_path_factory): - out = tmp_path_factory.mktemp("data") / "sample.exploded" - vcf2zarr.explode(out, [self.data_path]) - return out +class Test1kgBed: + bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" - @pytest.fixture(scope="module") - def zarr(self, icf, tmp_path_factory): - out = 
tmp_path_factory.mktemp("data") / "sample.zarr" - vcf2zarr.encode(icf, out) + @pytest.fixture() + def zarr(self, tmp_path): + out = tmp_path / "1kg_2020_chr20_annotations_mask.zarr" return out - def test_add_mask_sample(self, zarr): - with pytest.raises(ValueError): - bed2zarr.bed2zarr( - bed_path=self.bed_path, zarr_path=zarr, show_progress=True - ) + def test_bed2zarr(self, zarr): + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr, show_progress=True) From f94234e9f7f8a1f8d9ad9efd3d78c28bfe89bfd3 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 20 Sep 2024 14:34:52 +0200 Subject: [PATCH 05/17] Add initial support for BED3-BED12 - guess BED file type - add draft schema - add tests --- bio2zarr/bed2zarr.py | 366 ++++++++++++++++++++++++++++++++++++++----- bio2zarr/cli.py | 4 +- tests/test_bed.py | 112 ++++++++++++- 3 files changed, 437 insertions(+), 45 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index a2c91698..c5e77b04 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -1,6 +1,9 @@ import dataclasses +import json import logging import pathlib +import shutil +from enum import Enum import numcodecs import pandas as pd @@ -13,35 +16,220 @@ DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7) +class BedType(Enum): + BED3 = 3 + BED4 = 4 + BED5 = 5 + BED6 = 6 + BED7 = 7 + BED8 = 8 + BED9 = 9 + BED12 = 12 + + +class BedColumns(Enum): + contig = 0 + start = 1 + end = 2 + name = 3 + score = 4 + strand = 5 + thickStart = 6 + thickEnd = 7 + itemRgb = 8 + blockCount = 9 + blockSizes = 10 + blockStarts = 11 + + @dataclasses.dataclass -class BedMetadata(core.JsonDataclass): +class ZarrArraySpec: + name: str + dtype: str + shape: list + chunks: list + dimensions: tuple + description: str + compressor: dict + filters: list + + def __post_init__(self): + self.shape = tuple(self.shape) + self.chunks = tuple(self.chunks) + self.dimensions = tuple(self.dimensions) + self.filters = tuple(self.filters) + + @staticmethod + def new(**kwargs): + spec = ZarrArraySpec( + **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[] + ) + spec._choose_compressor_settings() + return spec + + def _choose_compressor_settings(self): + pass + + @staticmethod + def from_field( + bed_field, + *, + num_records, + records_chunk_size, + ): + shape = [num_records] + dimensions = ["records"] + chunks = [records_chunk_size] + return ZarrArraySpec.new( + name=bed_field.name, + dtype=bed_field.dtype, + shape=shape, + chunks=chunks, + dimensions=dimensions, + description=bed_field.description, + ) + + +ZARR_SCHEMA_FORMAT_VERSION = "0.1" + + +@dataclasses.dataclass +class BedZarrSchema(core.JsonDataclass): + format_version: str fields: list + bed_type: str + records_chunk_size: int + + def validate(self): + """ + Checks that the schema is well-formed and within required limits. 
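+
+        Currently a stub: a fuller check might verify, for example,
+        that records_chunk_size is positive and that each field's
+        shape matches the total number of records.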
+ """ + + @staticmethod + def fromdict(d): + if d["format_version"] != ZARR_SCHEMA_FORMAT_VERSION: + raise ValueError( + "BedZarrSchema format version mismatch: " + f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}" + ) + ret = BedZarrSchema(**d) + return ret + + @staticmethod + def fromjson(s): + return BedZarrSchema.fromdict(json.loads(s)) + + @staticmethod + def generate(num_records, bed_type, records_chunk_size=None): + if records_chunk_size is None: + records_chunk_size = 1000 + logger.info("Generating schema with chunks=%d", records_chunk_size) + + def spec_from_field(field): + return ZarrArraySpec.from_field( + field, + num_records=num_records, + records_chunk_size=records_chunk_size, + ) + + fields = mkfields(bed_type) + specs = [spec_from_field(field) for field in fields] + return BedZarrSchema( + format_version=ZARR_SCHEMA_FORMAT_VERSION, + fields=specs, + bed_type=bed_type.name, + records_chunk_size=records_chunk_size, + ) @dataclasses.dataclass class BedField: category: str name: str + alias: str + description: str + dtype: str + + +def guess_bed_file_type(path): + """Check the number of columns in a BED file and guess BED + type.""" + first = pd.read_table(path, header=None, nrows=1) + num_cols = first.shape[1] + if num_cols < 3: + raise ValueError(f"Expected at least 3 columns in BED file, got {num_cols}") + if num_cols > 12: + raise ValueError(f"Expected at most 12 columns in BED file, got {num_cols}") + if num_cols in (10, 11): + raise ValueError(f"BED10 and BED11 are prohibited, got {num_cols} columns") + return BedType(num_cols) + + +BZW_METADATA_FORMAT_VERSION = "0.1" + + +@dataclasses.dataclass +class BedZarrWriterMetadata(core.JsonDataclass): + format_version: str + fields: list + bed_type: str + schema: BedZarrSchema + + @staticmethod + def fromdict(d): + if d["format_version"] != BZW_METADATA_FORMAT_VERSION: + raise ValueError( + "BedZarrWriterMetadata format version mismatch: " + f"{d['format_version']} != {BZW_METADATA_FORMAT_VERSION}" + ) + ret = BedZarrWriterMetadata(**d) + ret.schema = BedZarrSchema.fromdict(d["schema"]) + return ret class BedZarrWriter: def __init__(self, path): self.path = pathlib.Path(path) + self.wip_path = self.path / "wip" self.metadata = None self.data = None + self.bed_path = None + self.bed_type = None + + @property + def schema(self): + return self.metadata.schema def init( self, - bed_path, + data, *, - show_progress=False, + bed_path, + bed_type, + schema, ): - self.read(bed_path) - fields = mandatory_bed_field_definitions() - # FIXME: add optional fields; depends on the number of columns - # in the bed file. BED3 is the minimum. - self.metadata = BedMetadata(fields) + self.data = data + self.bed_type = bed_type + self.bed_path = bed_path + + if self.path.exists(): + raise ValueError("Zarr path already exists") + schema.validate() + + fields = mkfields(self.bed_type) + self.metadata = BedZarrWriterMetadata( + format_version=BZW_METADATA_FORMAT_VERSION, + bed_type=self.bed_type.name, + fields=fields, + schema=schema, + ) self.path.mkdir() + self.wip_path.mkdir() + logger.info("Writing WIP metadata") + with open(self.wip_path / "metadata.json", "w") as f: + json.dump(self.metadata.asdict(), f, indent=4) + + def write(self): store = zarr.DirectoryStore(self.path) root = zarr.group(store=store) root.attrs.update( @@ -50,56 +238,158 @@ def init( "source": f"bio2zarr-{provenance.__version__}", } ) - self.encode_mandatory_fields(root) - - # FIXME: error checking, number of columns, etc. 
- def read(self, bed_path): - logger.info("Reading bed file %s", bed_path) - first = pd.read_table(bed_path, header=None, nrows=1) - num_cols = len(first.columns) - if num_cols < 3: - raise ValueError(f"Expected at least 3 columns in bed file, got {num_cols}") - - # FIXME: support chunked reading - self.data = pd.read_table(bed_path, header=None).rename( - columns={0: "chrom", 1: "chromStart", 2: "chromEnd"} - ) + d = {i: f.name for i, f in enumerate(self.schema.fields)} + self.data.rename(columns=d, inplace=True) + + # 1. Create contig name <-> contig id mapping - def encode_mandatory_fields(self, root): - for field, dtype in zip( - ["chrom", "chromStart", "chromEnd"], ["str", "int", "int"] - ): - # FIXME: Check schema for chunks + # 2. update self.data.contig based on mapping (-> also means + # schema needs modification since we change from string to + # int!) + + self.encode_fields(root) + + def finalise(self): + logger.debug("Removing %s", self.wip_path) + shutil.rmtree(self.wip_path) + + def load_metadata(self): + if self.metadata is None: + with open(self.wip_path / "metadata.json") as f: + self.metadata = BedZarrWriterMetadata.fromdict(json.load(f)) + + # FIXME: fields 9-12 are multi-dimensional + def encode_fields(self, root): + for field in self.schema.fields: + object_codec = None + if field.dtype == "O": + object_codec = numcodecs.VLenUTF8() array = root.array( - field, - self.data[field].values, - dtype=dtype, + field.name, + self.data[field.name].values, + shape=field.shape, + dtype=field.dtype, compressor=DEFAULT_ZARR_COMPRESSOR, - chunks=(1000,), + chunks=(self.schema.records_chunk_size,), + object_codec=object_codec, ) + array.attrs["_ARRAY_DIMENSIONS"] = ["records"] logger.debug("%s done", field) # See https://samtools.github.io/hts-specs/BEDv1.pdf def mandatory_bed_field_definitions(): - def make_field_def(name): + def make_field_def(name, alias, dtype, description=""): return BedField( category="mandatory", name=name, + alias=alias, + description=description, + dtype=dtype, ) fields = [ - make_field_def("chrom"), - make_field_def("chromStart"), - make_field_def("chromEnd"), + make_field_def("contig", "chrom", "O", "Chromosome name"), + make_field_def("start", "chromStart", "i8", "Feature start position"), + make_field_def("end", "chromEnd", "i8", "Feature end position"), ] return fields +def optional_bed_field_definitions(num_fields=0): + def make_field_def(name, alias, dtype=None, *, description=""): + return BedField( + category="optional", + name=name, + alias=alias, + description=description, + dtype=dtype, + ) + + fields = [ + make_field_def("name", "name", "O", description="Feature description"), + make_field_def("score", "score", "i2", description="A numerical value"), + make_field_def("strand", "strand", "O", description="Feature strand"), + make_field_def( + "thickStart", "thickStart", "i8", description="Thick start position" + ), + make_field_def("thickEnd", "thickEnd", "i8", description="Thick end position"), + make_field_def("itemRgb", "itemRgb", description="Display"), + make_field_def("blockCount", "blockCount", description="Number of blocks"), + make_field_def("blockSizes", "blockSizes", description="Block sizes"), + make_field_def( + "blockStarts", "blockStarts", description="Block start positions" + ), + ] + + return fields[:num_fields] + + +def mkschema(bed_path, out): + bed_type = guess_bed_file_type(bed_path) + data = pd.read_table(bed_path, header=None) + spec = BedZarrSchema.generate( + data.shape[0], + bed_type, + ) + out.write(spec.asjson()) + + 
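+# A possible round trip between mkschema() above and bed2zarr() below
+# (hypothetical file names): dump the generated schema, adjust it by
+# hand, then pass it back in via schema_path:
+#
+#     with open("regions.schema.json", "w") as out:
+#         mkschema("regions.bed.gz", out)
+#     bed2zarr("regions.bed.gz", "regions.zarr",
+#              schema_path="regions.schema.json")
+
+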
+def mkfields(bed_type): + mandatory = mandatory_bed_field_definitions() + optional = optional_bed_field_definitions(bed_type.value - BedType.BED3.value) + return mandatory + optional + + +def bed2zarr_init( + bed_path, + bed_type, + zarr_path, + *, + schema_path=None, + records_chunk_size=None, +): + data = pd.read_table(bed_path, header=None) + + if schema_path is None: + schema = BedZarrSchema.generate( + data.shape[0], + bed_type, + records_chunk_size=records_chunk_size, + ) + else: + logger.info("Reading schema from %s", schema_path) + if records_chunk_size is not None: + raise ValueError( + "Cannot specify schema along with chunk sizes" + ) # NEEDS TEST + with open(schema_path) as f: + schema = BedZarrSchema.fromjson(f.read()) + + # 2. init store + bedzw = BedZarrWriter(zarr_path) + bedzw.init( + data, + bed_path=bed_path, + bed_type=bed_type, + schema=schema, + ) + return bedzw + + def bed2zarr( bed_path, zarr_path, - show_progress=False, + schema_path=None, + records_chunk_size=None, ): - writer = BedZarrWriter(zarr_path) - writer.init(bed_path, show_progress=show_progress) + bed_type = guess_bed_file_type(bed_path) + bedzw = bed2zarr_init( + bed_path, + bed_type, + zarr_path, + schema_path=schema_path, + records_chunk_size=records_chunk_size, + ) + bedzw.write() + bedzw.finalise() diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index fe60971e..9179a08a 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -583,8 +583,7 @@ def plink2zarr(): @new_zarr_path @verbose @force -@progress -def bed2zarr_main(bed_path, zarr_path, verbose, force, progress): +def bed2zarr_main(bed_path, zarr_path, verbose, force): """ Convert BED file to the Zarr format. The BED regions will be converted to binary-encoded arrays whose length is equal to the @@ -599,7 +598,6 @@ def bed2zarr_main(bed_path, zarr_path, verbose, force, progress): bed2zarr.bed2zarr( bed_path, zarr_path, - show_progress=progress, ) diff --git a/tests/test_bed.py b/tests/test_bed.py index 72d8ad93..57df4e7b 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -1,9 +1,113 @@ +import pandas as pd import pytest -import zarr + from bio2zarr import bed2zarr +ALLOWED_BED_FORMATS = [3, 4, 5, 6, 7, 8, 9, 12] +DISALLOWED_BED_FORMATS = [2, 10, 11, 13] +ALL_BED_FORMATS = ALLOWED_BED_FORMATS + DISALLOWED_BED_FORMATS + + +@pytest.fixture() +def bed_data(request): + data = [ + [ + "chr22", + 1000, + 5000, + "cloneA", + 960, + "+", + 1000, + 5000, + 0, + 2, + "567,488", + "0,3512", + "foo", + ], + [ + "chr22", + 2000, + 6000, + "cloneB", + 900, + "-", + 2000, + 6000, + 0, + 2, + "433,399", + "0,3601", + "bar", + ], + ] + return [row[0 : request.param] for row in data] + + +@pytest.fixture() +def bed_df(bed_data): + return pd.DataFrame(bed_data) + + +@pytest.fixture() +def bed_path(bed_data, tmp_path): + out = tmp_path / "sample_mask.bed" + with open(out, "w") as fh: + for row in bed_data: + fh.write("\t".join(map(str, row)) + "\n") + return out + + +@pytest.fixture() +def schema_path(bed_path, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "example.schema.json" + with open(out, "w") as f: + bed2zarr.mkschema(bed_path, f) + return out + + +@pytest.fixture() +def schema(schema_path): + with open(schema_path) as f: + return bed2zarr.BedZarrSchema.fromjson(f.read()) + + +class TestDefaultSchema: + @pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) + def test_format_version(self, schema): + assert schema.format_version == bed2zarr.ZARR_SCHEMA_FORMAT_VERSION + + +class TestSchema: + 
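+    # Exercises BedZarrSchema.generate directly for each allowed BED
+    # type, reusing the parametrized bed_data/bed_df fixtures above.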
@pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) + def test_generate_schema(self, bed_df, request): + bedspec = request.node.callspec.params["bed_data"] + bed_type = bed2zarr.BedType(bedspec) + schema = bed2zarr.BedZarrSchema.generate(bed_df.shape[0], bed_type) + assert schema.bed_type == bed_type.name + assert schema.records_chunk_size == 1000 + + +class TestBedData: + @pytest.mark.parametrize("bed_data", ALL_BED_FORMATS, indirect=True) + def test_guess_bed_type_from_path(self, bed_path, request): + bedspec = request.node.callspec.params["bed_data"] + if bedspec in [2, 10, 11, 13]: + with pytest.raises(ValueError): + bed2zarr.guess_bed_file_type(bed_path) + else: + bed_type = bed2zarr.guess_bed_file_type(bed_path) + assert bed_type.value == bedspec + + @pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) + def test_bed_fields(self, bed_path, request): + bedspec = request.node.callspec.params["bed_data"] + fields = bed2zarr.mkfields(bed2zarr.BedType(bedspec)) + assert len(fields) == bedspec + -class TestBed: +class TestSampleMaskBed: bed_path = "tests/data/bed/sample_mask.bed.gz" @pytest.fixture() @@ -12,7 +116,7 @@ def zarr(self, tmp_path): return out def test_bed2zarr(self, zarr): - bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr, show_progress=True) + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr) class Test1kgBed: @@ -24,4 +128,4 @@ def zarr(self, tmp_path): return out def test_bed2zarr(self, zarr): - bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr, show_progress=True) + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr) From aff19f2feda392bbfc5941199aba6c84f2b73daf Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 20 Sep 2024 15:06:18 +0200 Subject: [PATCH 06/17] Minor refactorization --- bio2zarr/bed2zarr.py | 12 ++++++------ tests/test_bed.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index c5e77b04..267ed457 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -325,6 +325,12 @@ def make_field_def(name, alias, dtype=None, *, description=""): return fields[:num_fields] +def mkfields(bed_type): + mandatory = mandatory_bed_field_definitions() + optional = optional_bed_field_definitions(bed_type.value - BedType.BED3.value) + return mandatory + optional + + def mkschema(bed_path, out): bed_type = guess_bed_file_type(bed_path) data = pd.read_table(bed_path, header=None) @@ -335,12 +341,6 @@ def mkschema(bed_path, out): out.write(spec.asjson()) -def mkfields(bed_type): - mandatory = mandatory_bed_field_definitions() - optional = optional_bed_field_definitions(bed_type.value - BedType.BED3.value) - return mandatory + optional - - def bed2zarr_init( bed_path, bed_type, diff --git a/tests/test_bed.py b/tests/test_bed.py index 57df4e7b..c8993b94 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -93,7 +93,7 @@ class TestBedData: @pytest.mark.parametrize("bed_data", ALL_BED_FORMATS, indirect=True) def test_guess_bed_type_from_path(self, bed_path, request): bedspec = request.node.callspec.params["bed_data"] - if bedspec in [2, 10, 11, 13]: + if bedspec in DISALLOWED_BED_FORMATS: with pytest.raises(ValueError): bed2zarr.guess_bed_file_type(bed_path) else: From 731cfa622ea9022f7178514233a46699e36e1025 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Mon, 14 Oct 2024 16:58:24 +0200 Subject: [PATCH 07/17] Simplify main bed2zarr, encode features and contigs as ints - simplification of main bed2zarr function - encode features, 
contigs as Categories - add specs for BED9-BED12 --- bio2zarr/bed2zarr.py | 302 ++++++++++++++++++++++++++++++++----------- tests/test_bed.py | 36 +++++- 2 files changed, 265 insertions(+), 73 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 267ed457..3427ff03 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -1,11 +1,14 @@ import dataclasses import json import logging +import math import pathlib import shutil from enum import Enum +from typing import Any import numcodecs +import numpy as np import pandas as pd import zarr @@ -82,7 +85,7 @@ def from_field( chunks = [records_chunk_size] return ZarrArraySpec.new( name=bed_field.name, - dtype=bed_field.dtype, + dtype=bed_field.smallest_dtype(), shape=shape, chunks=chunks, dimensions=dimensions, @@ -93,10 +96,40 @@ def from_field( ZARR_SCHEMA_FORMAT_VERSION = "0.1" +@dataclasses.dataclass +class Contig: + id: str + + +@dataclasses.dataclass +class Feature: + id: str + + +@dataclasses.dataclass +class BedMetadata(core.JsonDataclass): + contigs: list + features: list + fields: list + bed_type: BedType + num_records: int = -1 + records_chunk_size: int = 1000 + + @property + def num_contigs(self): + return len(self.contigs) + + @property + def num_features(self): + return len(self.features) + + @dataclasses.dataclass class BedZarrSchema(core.JsonDataclass): format_version: str fields: list + contigs: list + features: list bed_type: str records_chunk_size: int @@ -113,6 +146,9 @@ def fromdict(d): f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}" ) ret = BedZarrSchema(**d) + ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]] + ret.contigs = [Contig(**sd) for sd in d["contigs"]] + ret.features = [Feature(**sd) for sd in d["features"]] return ret @staticmethod @@ -120,7 +156,7 @@ def fromjson(s): return BedZarrSchema.fromdict(json.loads(s)) @staticmethod - def generate(num_records, bed_type, records_chunk_size=None): + def generate(num_records, bed_type, fields, contigs, features, records_chunk_size=None): if records_chunk_size is None: records_chunk_size = 1000 logger.info("Generating schema with chunks=%d", records_chunk_size) @@ -132,23 +168,80 @@ def spec_from_field(field): records_chunk_size=records_chunk_size, ) - fields = mkfields(bed_type) - specs = [spec_from_field(field) for field in fields] + specs = [spec_from_field(f) for f in fields] + + contig_spec = ZarrArraySpec.new( + name="contig_id", + dtype="O", + shape=[len(contigs)], + chunks=[len(contigs)], + dimensions=["contigs"], + description="Contig ID", + ) + feature_spec = ZarrArraySpec.new( + name="feature_id", + dtype="O", + shape=[len(features)], + chunks=[len(features)], + dimensions=["features"], + description="Feature ID", + ) + fields.extend([contig_spec, feature_spec]) return BedZarrSchema( format_version=ZARR_SCHEMA_FORMAT_VERSION, fields=specs, + contigs=contigs, + features=features, bed_type=bed_type.name, records_chunk_size=records_chunk_size, ) +@dataclasses.dataclass +class BedFieldSummary(core.JsonDataclass): + num_chunks: int = 0 + compressed_size: int = 0 + uncompressed_size: int = 0 + max_value: Any = -math.inf + min_value: Any = math.inf + + def update(self, other): + self.num_chunks = array.nchunks + self.compressed_size = array.nbytes + self.uncompressed_size = array.nbytes + self.min_value = min(self.min_value, other.min_value) + self.max_value = max(self.max_value, other.max_value) + + @staticmethod + def fromdict(d): + return BedFieldSummary(**d) + + @dataclasses.dataclass class BedField: category: str name: 
str alias: str description: str - dtype: str + bed_dtype: str + summary: BedFieldSummary + + def smallest_dtype(self): + """Return the smallest dtype suitable for this field based on type and values""" + s = self.summary + if self.bed_dtype == "Integer": + if not math.isfinite(s.max_value): + ret = "i1" + else: + ret = core.min_int_dtype(s.min_value, s.max_value) + elif self.bed_dtype == "Character": + ret = "U1" + elif self.bed_dtype == "Category": + ret = core.min_int_dtype(s.min_value, s.max_value) + else: + assert self.bed_dtype == "String" + ret = "O" + return ret def guess_bed_file_type(path): @@ -172,6 +265,8 @@ def guess_bed_file_type(path): class BedZarrWriterMetadata(core.JsonDataclass): format_version: str fields: list + contigs: list + features: list bed_type: str schema: BedZarrSchema @@ -191,36 +286,40 @@ class BedZarrWriter: def __init__(self, path): self.path = pathlib.Path(path) self.wip_path = self.path / "wip" - self.metadata = None self.data = None + self.metadata = None self.bed_path = None self.bed_type = None + self.fields = None @property def schema(self): + if self.metadata is None: + return return self.metadata.schema def init( self, data, *, + metadata, bed_path, - bed_type, schema, ): self.data = data - self.bed_type = bed_type + self.bed_metadata = metadata self.bed_path = bed_path if self.path.exists(): raise ValueError("Zarr path already exists") schema.validate() - fields = mkfields(self.bed_type) self.metadata = BedZarrWriterMetadata( format_version=BZW_METADATA_FORMAT_VERSION, - bed_type=self.bed_type.name, - fields=fields, + bed_type=self.bed_metadata.bed_type.name, + fields=self.fields, + contigs=self.bed_metadata.contigs, + features=self.bed_metadata.features, schema=schema, ) self.path.mkdir() @@ -240,15 +339,31 @@ def write(self): ) d = {i: f.name for i, f in enumerate(self.schema.fields)} self.data.rename(columns=d, inplace=True) + self.encode_fields(root) + self.encode_contig_id(root) + self.encode_feature_id(root) + + def encode_contig_id(self, root): + print(self.schema.contigs) + array = root.array( + "contig_id", + [contig.id for contig in self.schema.contigs], + dtype="str", + compressor=DEFAULT_ZARR_COMPRESSOR, + ) + array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"] - # 1. Create contig name <-> contig id mapping - - # 2. update self.data.contig based on mapping (-> also means - # schema needs modification since we change from string to - # int!) 
- self.encode_fields(root) + def encode_feature_id(self, root): + array = root.array( + "feature_id", + [feature.id for feature in self.schema.features], + dtype="str", + compressor=DEFAULT_ZARR_COMPRESSOR, + ) + array.attrs["_ARRAY_DIMENSIONS"] = ["features"] + def finalise(self): logger.debug("Removing %s", self.wip_path) shutil.rmtree(self.wip_path) @@ -258,7 +373,6 @@ def load_metadata(self): with open(self.wip_path / "metadata.json") as f: self.metadata = BedZarrWriterMetadata.fromdict(json.load(f)) - # FIXME: fields 9-12 are multi-dimensional def encode_fields(self, root): for field in self.schema.fields: object_codec = None @@ -279,46 +393,48 @@ def encode_fields(self, root): # See https://samtools.github.io/hts-specs/BEDv1.pdf def mandatory_bed_field_definitions(): - def make_field_def(name, alias, dtype, description=""): + def make_field_def(name, alias, bed_dtype, description=""): return BedField( category="mandatory", name=name, alias=alias, description=description, - dtype=dtype, + bed_dtype=bed_dtype, + summary=BedFieldSummary(), ) fields = [ - make_field_def("contig", "chrom", "O", "Chromosome name"), - make_field_def("start", "chromStart", "i8", "Feature start position"), - make_field_def("end", "chromEnd", "i8", "Feature end position"), + make_field_def("contig", "chrom", "Category", "Chromosome name"), + make_field_def("start", "chromStart", "Integer", "Feature start position"), + make_field_def("end", "chromEnd", "Integer", "Feature end position"), ] return fields def optional_bed_field_definitions(num_fields=0): - def make_field_def(name, alias, dtype=None, *, description=""): + def make_field_def(name, alias, bed_dtype, description=""): return BedField( category="optional", name=name, alias=alias, description=description, - dtype=dtype, + bed_dtype=bed_dtype, + summary=BedFieldSummary(), ) fields = [ - make_field_def("name", "name", "O", description="Feature description"), - make_field_def("score", "score", "i2", description="A numerical value"), - make_field_def("strand", "strand", "O", description="Feature strand"), + make_field_def("name", "name", "Category", "Feature description"), + make_field_def("score", "score", "Integer", "A numerical value"), + make_field_def("strand", "strand", "Character", "Feature strand"), make_field_def( - "thickStart", "thickStart", "i8", description="Thick start position" + "thickStart", "thickStart", "Integer", "Thick start position" ), - make_field_def("thickEnd", "thickEnd", "i8", description="Thick end position"), - make_field_def("itemRgb", "itemRgb", description="Display"), - make_field_def("blockCount", "blockCount", description="Number of blocks"), - make_field_def("blockSizes", "blockSizes", description="Block sizes"), + make_field_def("thickEnd", "thickEnd", "Integer", "Thick end position"), + make_field_def("itemRgb", "itemRgb", "Integer", "Display"), + make_field_def("blockCount", "blockCount", "Integer", "Number of blocks"), + make_field_def("blockSizes", "blockSizes", "Integer", "Block sizes"), make_field_def( - "blockStarts", "blockStarts", description="Block start positions" + "blockStarts", "blockStarts", "Integer", "Block start positions" ), ] @@ -331,51 +447,78 @@ def mkfields(bed_type): return mandatory + optional -def mkschema(bed_path, out): - bed_type = guess_bed_file_type(bed_path) - data = pd.read_table(bed_path, header=None) - spec = BedZarrSchema.generate( - data.shape[0], - bed_type, - ) - out.write(spec.asjson()) +def read_bed(bed_path, bed_type): + """Read BED file.""" + fields = mkfields(bed_type) + data = 
pd.read_table(bed_path, header=None, + names=[f.name for f in fields]) + return data -def bed2zarr_init( - bed_path, - bed_type, - zarr_path, - *, - schema_path=None, - records_chunk_size=None, -): - data = pd.read_table(bed_path, header=None) - +def encode_categoricals(data, bed_type): + """Convert categoricals to integer encodings.""" + contig = pd.Categorical( + data["contig"], categories=data["contig"].unique(), ordered=True + ) + contig_id = contig.categories.values + data["contig"] = contig.codes + if bed_type.value >= BedType.BED4.value: + feature = pd.Categorical( + data["name"], categories=data["name"].unique(), ordered=True + ) + feature_id = feature.categories.values + data["name"] = feature.codes + else: + feature_id = None + return data, contig_id, feature_id + + +def update_field_bounds(bed_type, data): + """Update field bounds based on data.""" + ret = [] + fields = mkfields(bed_type) + for f in fields: + if f.bed_dtype == "Integer": + if f.name in ("itemRgb", "blockSizes", "blockStarts"): + if data[f.name].dtype == "O": + values = np.concatenate( + data[f.name].str.split(",").apply( + lambda x: np.array([int(y) for y in x]) + )) + else: + values = data[f.name] + f.summary.min_value = min(values) + f.summary.max_value = max(values) + else: + f.summary.min_value = data[f.name].min() + f.summary.max_value = data[f.name].max() + elif f.bed_dtype == "Category": + f.summary.min_value = 0 + f.summary.max_value = max(data[f.name].unique()) + ret.append(f) + return ret + + +def mkschema(schema_path, metadata): + """Read schema or make schema from fields.""" if schema_path is None: schema = BedZarrSchema.generate( - data.shape[0], - bed_type, - records_chunk_size=records_chunk_size, + num_records=metadata.num_records, + bed_type=metadata.bed_type, + fields=metadata.fields, + contigs=metadata.contigs, + features=metadata.features, + records_chunk_size=metadata.records_chunk_size, ) else: logger.info("Reading schema from %s", schema_path) - if records_chunk_size is not None: + if metadata.records_chunk_size is not None: raise ValueError( "Cannot specify schema along with chunk sizes" ) # NEEDS TEST with open(schema_path) as f: schema = BedZarrSchema.fromjson(f.read()) - - # 2. 
init store - bedzw = BedZarrWriter(zarr_path) - bedzw.init( - data, - bed_path=bed_path, - bed_type=bed_type, - schema=schema, - ) - return bedzw - + return schema def bed2zarr( bed_path, @@ -384,12 +527,27 @@ def bed2zarr( records_chunk_size=None, ): bed_type = guess_bed_file_type(bed_path) - bedzw = bed2zarr_init( - bed_path, - bed_type, - zarr_path, - schema_path=schema_path, + data = read_bed(bed_path, bed_type) + data, contig_id, feature_id = encode_categoricals(data, bed_type) + fields = update_field_bounds(bed_type, data) + metadata = BedMetadata( + contigs=[Contig(c) for c in contig_id], + features=[Feature(f) for f in feature_id], + fields=fields, + bed_type=bed_type, + num_records=data.shape[0], records_chunk_size=records_chunk_size, ) + schema = mkschema( + schema_path, + metadata, + ) + bedzw = BedZarrWriter(zarr_path) + bedzw.init( + data=data, + metadata=metadata, + bed_path=bed_path, + schema=schema, + ) bedzw.write() bedzw.finalise() diff --git a/tests/test_bed.py b/tests/test_bed.py index c8993b94..3d97da30 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -84,7 +84,11 @@ class TestSchema: def test_generate_schema(self, bed_df, request): bedspec = request.node.callspec.params["bed_data"] bed_type = bed2zarr.BedType(bedspec) - schema = bed2zarr.BedZarrSchema.generate(bed_df.shape[0], bed_type) + fields = bed2zarr.mkfields(bed_type) + bed_df.columns = [f.name for f in fields] + bed_df, contig_id, feature_id = bed2zarr.encode_categoricals(bed_df, bed_type) + fields = bed2zarr.update_field_bounds(bed_type, bed_df) + schema = bed2zarr.BedZarrSchema.generate(bed_df.shape[0], bed_type, fields) assert schema.bed_type == bed_type.name assert schema.records_chunk_size == 1000 @@ -129,3 +133,33 @@ def zarr(self, tmp_path): def test_bed2zarr(self, zarr): bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr) + + +class TestContigRename: + bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" + bed_data = [ + "chr21\t50000\t60000\texon", + "chr21\t60000\t90000\tfive_prime_utr", + "chr20\t50000\t60000\texon", + "chr20\t60000\t80000\tintron", + ] + + @pytest.fixture() + def bed(self, tmp_path): + out = tmp_path / "test.bed" + with open(out, "w") as fh: + for row in self.bed_data: + fh.write(row + "\n") + return out + + def test_bed2zarr(self, bed, tmp_path): + zarr = tmp_path / "test.zarr" + + data = pd.read_table(bed, header=None) + print(data) + print(data.describe(percentiles=[])) + bedzw = bed2zarr.BedZarrWriter(zarr) + print(bedzw) + #print(bedzw.schema) + print(bedzw.data) + From eda56542435f91259c5adac2e0076650d2e461bb Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 15 Oct 2024 09:42:22 +0200 Subject: [PATCH 08/17] Add tests and withdraw support for BED9, BED12 --- bio2zarr/bed2zarr.py | 193 ++++++++++++++++++++++++------------------- tests/test_bed.py | 99 +++++++++++----------- tests/test_core.py | 2 +- 3 files changed, 160 insertions(+), 134 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 3427ff03..cfa4e3cb 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -102,14 +102,14 @@ class Contig: @dataclasses.dataclass -class Feature: +class Name: id: str @dataclasses.dataclass class BedMetadata(core.JsonDataclass): contigs: list - features: list + names: list fields: list bed_type: BedType num_records: int = -1 @@ -120,8 +120,8 @@ def num_contigs(self): return len(self.contigs) @property - def num_features(self): - return len(self.features) + def num_names(self): + return len(self.names) @dataclasses.dataclass @@ 
-129,7 +129,7 @@ class BedZarrSchema(core.JsonDataclass): format_version: str fields: list contigs: list - features: list + names: list bed_type: str records_chunk_size: int @@ -147,8 +147,8 @@ def fromdict(d): ) ret = BedZarrSchema(**d) ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]] - ret.contigs = [Contig(**sd) for sd in d["contigs"]] - ret.features = [Feature(**sd) for sd in d["features"]] + ret.contig_ids = [Contig(**sd) for sd in d["contigs"]] + ret.name_ids = [Name(**sd) for sd in d["names"]] return ret @staticmethod @@ -156,43 +156,45 @@ def fromjson(s): return BedZarrSchema.fromdict(json.loads(s)) @staticmethod - def generate(num_records, bed_type, fields, contigs, features, records_chunk_size=None): + def generate(metadata, records_chunk_size=None): if records_chunk_size is None: records_chunk_size = 1000 logger.info("Generating schema with chunks=%d", records_chunk_size) + fields = metadata.fields def spec_from_field(field): return ZarrArraySpec.from_field( field, - num_records=num_records, + num_records=metadata.num_records, records_chunk_size=records_chunk_size, ) specs = [spec_from_field(f) for f in fields] - contig_spec = ZarrArraySpec.new( + # Contig_id and name_id specs unnecessary? + contig_id_spec = ZarrArraySpec.new( name="contig_id", dtype="O", - shape=[len(contigs)], - chunks=[len(contigs)], + shape=[metadata.num_contigs], + chunks=[metadata.num_contigs], dimensions=["contigs"], description="Contig ID", ) - feature_spec = ZarrArraySpec.new( - name="feature_id", + name_id_spec = ZarrArraySpec.new( + name="name_id", dtype="O", - shape=[len(features)], - chunks=[len(features)], - dimensions=["features"], - description="Feature ID", + shape=[metadata.num_names], + chunks=[metadata.num_names], + dimensions=["names"], + description="Name ID", ) - fields.extend([contig_spec, feature_spec]) + specs.extend([contig_id_spec, name_id_spec]) return BedZarrSchema( format_version=ZARR_SCHEMA_FORMAT_VERSION, fields=specs, - contigs=contigs, - features=features, - bed_type=bed_type.name, + contigs=metadata.contigs, + names=metadata.names, + bed_type=metadata.bed_type.name, records_chunk_size=records_chunk_size, ) @@ -206,9 +208,9 @@ class BedFieldSummary(core.JsonDataclass): min_value: Any = math.inf def update(self, other): - self.num_chunks = array.nchunks - self.compressed_size = array.nbytes - self.uncompressed_size = array.nbytes + self.num_chunks = other.num_chunks + self.compressed_size = other.compressed_size + self.uncompressed_size = other.uncompressed_size self.min_value = min(self.min_value, other.min_value) self.max_value = max(self.max_value, other.max_value) @@ -216,7 +218,7 @@ def update(self, other): def fromdict(d): return BedFieldSummary(**d) - + @dataclasses.dataclass class BedField: category: str @@ -227,7 +229,8 @@ class BedField: summary: BedFieldSummary def smallest_dtype(self): - """Return the smallest dtype suitable for this field based on type and values""" + """Return the smallest dtype suitable for this field based on + type and values""" s = self.summary if self.bed_dtype == "Integer": if not math.isfinite(s.max_value): @@ -255,6 +258,8 @@ def guess_bed_file_type(path): raise ValueError(f"Expected at most 12 columns in BED file, got {num_cols}") if num_cols in (10, 11): raise ValueError(f"BED10 and BED11 are prohibited, got {num_cols} columns") + if num_cols in (9, 12): + raise ValueError("BED9 and BED12 are valid but currently unsupported formats") return BedType(num_cols) @@ -266,7 +271,7 @@ class BedZarrWriterMetadata(core.JsonDataclass): 
format_version: str fields: list contigs: list - features: list + names: list bed_type: str schema: BedZarrSchema @@ -319,7 +324,7 @@ def init( bed_type=self.bed_metadata.bed_type.name, fields=self.fields, contigs=self.bed_metadata.contigs, - features=self.bed_metadata.features, + names=self.bed_metadata.names, schema=schema, ) self.path.mkdir() @@ -337,14 +342,14 @@ def write(self): "source": f"bio2zarr-{provenance.__version__}", } ) - d = {i: f.name for i, f in enumerate(self.schema.fields)} + datafields = self.schema.fields[0 : BedType[self.schema.bed_type].value] + d = {i: f.name for i, f in enumerate(datafields)} self.data.rename(columns=d, inplace=True) - self.encode_fields(root) + self.encode_fields(root, datafields) self.encode_contig_id(root) - self.encode_feature_id(root) + self.encode_name_id(root) def encode_contig_id(self, root): - print(self.schema.contigs) array = root.array( "contig_id", [contig.id for contig in self.schema.contigs], @@ -353,28 +358,29 @@ def encode_contig_id(self, root): ) array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"] - - def encode_feature_id(self, root): + def encode_name_id(self, root): array = root.array( - "feature_id", - [feature.id for feature in self.schema.features], + "name_id", + [name.id for name in self.schema.names], dtype="str", compressor=DEFAULT_ZARR_COMPRESSOR, ) - array.attrs["_ARRAY_DIMENSIONS"] = ["features"] + array.attrs["_ARRAY_DIMENSIONS"] = ["names"] - def finalise(self): + self.load_metadata() logger.debug("Removing %s", self.wip_path) shutil.rmtree(self.wip_path) + logger.info("Consolidating Zarr metadata") + zarr.consolidate_metadata(self.path) def load_metadata(self): if self.metadata is None: with open(self.wip_path / "metadata.json") as f: self.metadata = BedZarrWriterMetadata.fromdict(json.load(f)) - def encode_fields(self, root): - for field in self.schema.fields: + def encode_fields(self, root, datafields): + for field in datafields: object_codec = None if field.dtype == "O": object_codec = numcodecs.VLenUTF8() @@ -405,8 +411,8 @@ def make_field_def(name, alias, bed_dtype, description=""): fields = [ make_field_def("contig", "chrom", "Category", "Chromosome name"), - make_field_def("start", "chromStart", "Integer", "Feature start position"), - make_field_def("end", "chromEnd", "Integer", "Feature end position"), + make_field_def("start", "chromStart", "Integer", "Name start position"), + make_field_def("end", "chromEnd", "Integer", "Name end position"), ] return fields @@ -423,12 +429,10 @@ def make_field_def(name, alias, bed_dtype, description=""): ) fields = [ - make_field_def("name", "name", "Category", "Feature description"), + make_field_def("name", "name", "Category", "Name description"), make_field_def("score", "score", "Integer", "A numerical value"), - make_field_def("strand", "strand", "Character", "Feature strand"), - make_field_def( - "thickStart", "thickStart", "Integer", "Thick start position" - ), + make_field_def("strand", "strand", "Character", "Name strand"), + make_field_def("thickStart", "thickStart", "Integer", "Thick start position"), make_field_def("thickEnd", "thickEnd", "Integer", "Thick end position"), make_field_def("itemRgb", "itemRgb", "Integer", "Display"), make_field_def("blockCount", "blockCount", "Integer", "Number of blocks"), @@ -447,11 +451,41 @@ def mkfields(bed_type): return mandatory + optional -def read_bed(bed_path, bed_type): - """Read BED file.""" - fields = mkfields(bed_type) - data = pd.read_table(bed_path, header=None, - names=[f.name for f in fields]) +def 
parse_bed(bed_path, records_chunk_size=None): + """Read and parse BED file and return data frame and metadata.""" + bed_type = guess_bed_file_type(bed_path) + fields = mkfields(bed_type) + data = pd.read_table(bed_path, header=None, names=[f.name for f in fields]) + data, contig_id, name_id = encode_categoricals(data, bed_type) + data = parse_csv_fields(data, bed_type) + fields = update_field_bounds(bed_type, data) + metadata = BedMetadata( + contigs=[Contig(c) for c in contig_id], + names=[Name(f) for f in name_id], + fields=fields, + bed_type=bed_type, + num_records=data.shape[0], + records_chunk_size=records_chunk_size, + ) + return data, metadata + + +def parse_csv_fields(data, bed_type): + if bed_type.value < BedType.BED8.value: + return data + + def _convert_csv_data(values): + if values.dtype == "O": + ret = values.str.split(",").apply(lambda x: np.array([int(y) for y in x])) + else: + ret = values + return ret + + if bed_type.value >= BedType.BED9.value: + data["itemRgb"] = _convert_csv_data(data["itemRgb"]) + if bed_type.value == BedType.BED12.value: + data["blockSizes"] = _convert_csv_data(data["blockSizes"]) + data["blockStarts"] = _convert_csv_data(data["blockStarts"]) return data @@ -463,14 +497,14 @@ def encode_categoricals(data, bed_type): contig_id = contig.categories.values data["contig"] = contig.codes if bed_type.value >= BedType.BED4.value: - feature = pd.Categorical( + name = pd.Categorical( data["name"], categories=data["name"].unique(), ordered=True ) - feature_id = feature.categories.values - data["name"] = feature.codes + name_id = name.categories.values + data["name"] = name.codes else: - feature_id = None - return data, contig_id, feature_id + name_id = [] + return data, contig_id, name_id def update_field_bounds(bed_type, data): @@ -481,10 +515,7 @@ def update_field_bounds(bed_type, data): if f.bed_dtype == "Integer": if f.name in ("itemRgb", "blockSizes", "blockStarts"): if data[f.name].dtype == "O": - values = np.concatenate( - data[f.name].str.split(",").apply( - lambda x: np.array([int(y) for y in x]) - )) + values = np.concatenate(data[f.name]) else: values = data[f.name] f.summary.min_value = min(values) @@ -499,16 +530,19 @@ def update_field_bounds(bed_type, data): return ret -def mkschema(schema_path, metadata): - """Read schema or make schema from fields.""" +def mkschema(bed_path, out): + """Write schema to file""" + _, metadata = parse_bed(bed_path) + spec = BedZarrSchema.generate( + metadata, records_chunk_size=metadata.records_chunk_size + ) + out.write(spec.asjson()) + + +def init_schema(schema_path, metadata): if schema_path is None: schema = BedZarrSchema.generate( - num_records=metadata.num_records, - bed_type=metadata.bed_type, - fields=metadata.fields, - contigs=metadata.contigs, - features=metadata.features, - records_chunk_size=metadata.records_chunk_size, + metadata, records_chunk_size=metadata.records_chunk_size ) else: logger.info("Reading schema from %s", schema_path) @@ -520,28 +554,15 @@ def mkschema(schema_path, metadata): schema = BedZarrSchema.fromjson(f.read()) return schema + def bed2zarr( bed_path, zarr_path, schema_path=None, records_chunk_size=None, ): - bed_type = guess_bed_file_type(bed_path) - data = read_bed(bed_path, bed_type) - data, contig_id, feature_id = encode_categoricals(data, bed_type) - fields = update_field_bounds(bed_type, data) - metadata = BedMetadata( - contigs=[Contig(c) for c in contig_id], - features=[Feature(f) for f in feature_id], - fields=fields, - bed_type=bed_type, - num_records=data.shape[0], - 
records_chunk_size=records_chunk_size, - ) - schema = mkschema( - schema_path, - metadata, - ) + data, metadata = parse_bed(bed_path, records_chunk_size) + schema = init_schema(schema_path, metadata) bedzw = BedZarrWriter(zarr_path) bedzw.init( data=data, diff --git a/tests/test_bed.py b/tests/test_bed.py index 3d97da30..b04d4aa9 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -1,9 +1,12 @@ +import numpy as np import pandas as pd import pytest +import zarr from bio2zarr import bed2zarr ALLOWED_BED_FORMATS = [3, 4, 5, 6, 7, 8, 9, 12] +SUPPORTED_BED_FORMATS = [3, 4, 5, 6, 7, 8] DISALLOWED_BED_FORMATS = [2, 10, 11, 13] ALL_BED_FORMATS = ALLOWED_BED_FORMATS + DISALLOWED_BED_FORMATS @@ -52,7 +55,7 @@ def bed_df(bed_data): @pytest.fixture() def bed_path(bed_data, tmp_path): - out = tmp_path / "sample_mask.bed" + out = tmp_path / "sample.bed" with open(out, "w") as fh: for row in bed_data: fh.write("\t".join(map(str, row)) + "\n") @@ -74,37 +77,69 @@ def schema(schema_path): class TestDefaultSchema: - @pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) + @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) def test_format_version(self, schema): assert schema.format_version == bed2zarr.ZARR_SCHEMA_FORMAT_VERSION class TestSchema: - @pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) - def test_generate_schema(self, bed_df, request): + @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) + def test_generate_schema(self, bed_path, request): bedspec = request.node.callspec.params["bed_data"] bed_type = bed2zarr.BedType(bedspec) - fields = bed2zarr.mkfields(bed_type) - bed_df.columns = [f.name for f in fields] - bed_df, contig_id, feature_id = bed2zarr.encode_categoricals(bed_df, bed_type) - fields = bed2zarr.update_field_bounds(bed_type, bed_df) - schema = bed2zarr.BedZarrSchema.generate(bed_df.shape[0], bed_type, fields) + data, metadata = bed2zarr.parse_bed(bed_path) + schema = bed2zarr.BedZarrSchema.generate( + metadata, records_chunk_size=metadata.records_chunk_size + ) assert schema.bed_type == bed_type.name assert schema.records_chunk_size == 1000 + assert len(schema.contigs) == 1 + assert schema.contigs[0].id == "chr22" + if bed_type.value >= bed2zarr.BedType.BED4.value: + assert len(schema.names) == 2 + assert schema.names[0].id == "cloneA" + assert schema.names[1].id == "cloneB" + if bed_type == bed2zarr.BedType.BED12: + assert len(schema.fields) == 12 + + +class TestBed2Zarr: + @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) + def test_bed2zarr(self, bed_path, bed_df, tmp_path, request): + bedspec = request.node.callspec.params["bed_data"] + bed_type = bed2zarr.BedType(bedspec) + zarr_path = tmp_path / "test.zarr" + bed2zarr.bed2zarr(bed_path=bed_path, zarr_path=zarr_path) + root = zarr.open(zarr_path) + np.testing.assert_array_equal(root["contig"][:], [0, 0]) + np.testing.assert_array_equal(root["contig_id"][:], ["chr22"]) + np.testing.assert_array_equal(root["start"][:], bed_df[1].values) + np.testing.assert_array_equal(root["end"][:], bed_df[2].values) + if bed_type.value >= bed2zarr.BedType.BED4.value: + np.testing.assert_array_equal(root["name"][:], [0, 1]) + np.testing.assert_array_equal(root["name_id"][:], bed_df[3].values) + if bed_type.value >= bed2zarr.BedType.BED5.value: + np.testing.assert_array_equal(root["score"][:], bed_df[4].values) + if bed_type.value >= bed2zarr.BedType.BED6.value: + np.testing.assert_array_equal(root["strand"][:], bed_df[5].values) + 
if bed_type.value >= bed2zarr.BedType.BED7.value: + np.testing.assert_array_equal(root["thickStart"][:], bed_df[6].values) + if bed_type.value >= bed2zarr.BedType.BED8.value: + np.testing.assert_array_equal(root["thickEnd"][:], bed_df[7].values) class TestBedData: @pytest.mark.parametrize("bed_data", ALL_BED_FORMATS, indirect=True) def test_guess_bed_type_from_path(self, bed_path, request): bedspec = request.node.callspec.params["bed_data"] - if bedspec in DISALLOWED_BED_FORMATS: + if bedspec not in SUPPORTED_BED_FORMATS: with pytest.raises(ValueError): bed2zarr.guess_bed_file_type(bed_path) else: bed_type = bed2zarr.guess_bed_file_type(bed_path) assert bed_type.value == bedspec - @pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) + @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) def test_bed_fields(self, bed_path, request): bedspec = request.node.callspec.params["bed_data"] fields = bed2zarr.mkfields(bed2zarr.BedType(bedspec)) @@ -115,51 +150,21 @@ class TestSampleMaskBed: bed_path = "tests/data/bed/sample_mask.bed.gz" @pytest.fixture() - def zarr(self, tmp_path): + def zarr_path(self, tmp_path): out = tmp_path / "sample_mask.zarr" return out - def test_bed2zarr(self, zarr): - bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr) + def test_bed2zarr(self, zarr_path): + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr_path) class Test1kgBed: bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" @pytest.fixture() - def zarr(self, tmp_path): + def zarr_path(self, tmp_path): out = tmp_path / "1kg_2020_chr20_annotations_mask.zarr" return out - def test_bed2zarr(self, zarr): - bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr) - - -class TestContigRename: - bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" - bed_data = [ - "chr21\t50000\t60000\texon", - "chr21\t60000\t90000\tfive_prime_utr", - "chr20\t50000\t60000\texon", - "chr20\t60000\t80000\tintron", - ] - - @pytest.fixture() - def bed(self, tmp_path): - out = tmp_path / "test.bed" - with open(out, "w") as fh: - for row in self.bed_data: - fh.write(row + "\n") - return out - - def test_bed2zarr(self, bed, tmp_path): - zarr = tmp_path / "test.zarr" - - data = pd.read_table(bed, header=None) - print(data) - print(data.describe(percentiles=[])) - bedzw = bed2zarr.BedZarrWriter(zarr) - print(bedzw) - #print(bedzw.schema) - print(bedzw.data) - + def test_bed2zarr(self, zarr_path): + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr_path) diff --git a/tests/test_core.py b/tests/test_core.py index 16e2c0c0..8ec47443 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -202,7 +202,7 @@ def test_5_chunk_1(self, n, expected): # It works in CI on Linux, but it'll probably break at some point. 
# It's also necessary to update these numbers each time a new data # file gets added - ("tests/data", 4977391), + ("tests/data", 4989995), ("tests/data/vcf", 4965254), ("tests/data/vcf/sample.vcf.gz", 1089), ], From 0230ec0435a15fa4a1e7a47e25e79ecb7bd4a1ae Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 15 Oct 2024 09:53:58 +0200 Subject: [PATCH 09/17] Fix lint errors --- bio2zarr/bed2zarr.py | 15 --------------- tests/test_bed.py | 3 ++- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index cfa4e3cb..e9210571 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -30,21 +30,6 @@ class BedType(Enum): BED12 = 12 -class BedColumns(Enum): - contig = 0 - start = 1 - end = 2 - name = 3 - score = 4 - strand = 5 - thickStart = 6 - thickEnd = 7 - itemRgb = 8 - blockCount = 9 - blockSizes = 10 - blockStarts = 11 - - @dataclasses.dataclass class ZarrArraySpec: name: str diff --git a/tests/test_bed.py b/tests/test_bed.py index b04d4aa9..28f5f5d2 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -132,8 +132,9 @@ class TestBedData: @pytest.mark.parametrize("bed_data", ALL_BED_FORMATS, indirect=True) def test_guess_bed_type_from_path(self, bed_path, request): bedspec = request.node.callspec.params["bed_data"] + match_str = rf"(got {bedspec}|prohibited|unsupported)" if bedspec not in SUPPORTED_BED_FORMATS: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=match_str): bed2zarr.guess_bed_file_type(bed_path) else: bed_type = bed2zarr.guess_bed_file_type(bed_path) From d13e93fdd560a3ded67396660f4e941b45d4c957 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 15 Oct 2024 09:57:16 +0200 Subject: [PATCH 10/17] Add pandas dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2883186f..d3c07fef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ # colouredlogs pulls in humanfriendly", "cyvcf2", "bed_reader", + "pandas", ] requires-python = ">=3.9" classifiers = [ From e113eb223a60a993560f4725dec78ef0073eaf5e Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 15 Oct 2024 10:10:41 +0200 Subject: [PATCH 11/17] Update du stats --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 8ec47443..0484fd22 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -202,7 +202,7 @@ def test_5_chunk_1(self, n, expected): # It works in CI on Linux, but it'll probably break at some point. # It's also necessary to update these numbers each time a new data # file gets added - ("tests/data", 4989995), + ("tests/data", 4989501), ("tests/data/vcf", 4965254), ("tests/data/vcf/sample.vcf.gz", 1089), ], From bc7813a10ca2b6d216fb50c79b65be5df9057177 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 6 Dec 2024 08:55:52 +0100 Subject: [PATCH 12/17] Fix cli docstring. --- bio2zarr/cli.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 9179a08a..2a44cfea 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -585,13 +585,11 @@ def plink2zarr(): @force def bed2zarr_main(bed_path, zarr_path, verbose, force): """ - Convert BED file to the Zarr format. The BED regions will be - converted to binary-encoded arrays whose length is equal to the - length of the reference genome. 
The BED file regions are used to - mask the reference genome, where the masked regions are set to 1 - and the unmasked regions are set to 0. + Convert BED file to the Zarr format. Each BED column will be + converted to a Zarr array with appropriate encoding. - The BED file must be compressed and tabix-indexed. + The BED file must be compressed and tabix-indexed. BED9 and BED12 + formats are currently not supported. """ setup_logging(verbose) check_overwrite_dir(zarr_path, force) From 3a7f17bdfdfd5f9e9305fad72cddc2a6986c2367 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Mon, 9 Dec 2024 22:28:53 +0100 Subject: [PATCH 13/17] Add records chunk size and schema options to cli --- bio2zarr/cli.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 2a44cfea..3a4623f5 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -128,6 +128,14 @@ def list_commands(self, ctx): help="Chunk size in the samples dimension", ) +records_chunk_size = click.option( + "-r", + "--records-chunk-size", + type=int, + default=None, + help="Chunk size in the records dimension", +) + schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True)) max_variant_chunks = click.option( @@ -581,9 +589,18 @@ def plink2zarr(): type=click.Path(exists=True, dir_okay=False), ) @new_zarr_path +@schema +@records_chunk_size @verbose @force -def bed2zarr_main(bed_path, zarr_path, verbose, force): +def bed2zarr_main( + bed_path, + zarr_path, + schema, + records_chunk_size, + verbose, + force, +): """ Convert BED file to the Zarr format. Each BED column will be converted to a Zarr array with appropriate encoding. @@ -596,6 +613,8 @@ def bed2zarr_main(bed_path, zarr_path, verbose, force): bed2zarr.bed2zarr( bed_path, zarr_path, + records_chunk_size=records_chunk_size, + schema_path=schema, ) From 4137a756cd244f11c07a83788926c4f95f64d5ea Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 13 Dec 2024 09:32:40 +0100 Subject: [PATCH 14/17] Simplify bed2zarr conversion - read BED as data frame - rely on xarray.Dataset.to_zarr to convert --- bio2zarr/bed2zarr.py | 391 +++---------------------------------------- bio2zarr/cli.py | 3 - tests/test_bed.py | 41 ----- 3 files changed, 26 insertions(+), 409 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index e9210571..66365341 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -10,6 +10,7 @@ import numcodecs import numpy as np import pandas as pd +import xarray as xr import zarr from . 
import core, provenance @@ -30,160 +31,6 @@ class BedType(Enum): BED12 = 12 -@dataclasses.dataclass -class ZarrArraySpec: - name: str - dtype: str - shape: list - chunks: list - dimensions: tuple - description: str - compressor: dict - filters: list - - def __post_init__(self): - self.shape = tuple(self.shape) - self.chunks = tuple(self.chunks) - self.dimensions = tuple(self.dimensions) - self.filters = tuple(self.filters) - - @staticmethod - def new(**kwargs): - spec = ZarrArraySpec( - **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[] - ) - spec._choose_compressor_settings() - return spec - - def _choose_compressor_settings(self): - pass - - @staticmethod - def from_field( - bed_field, - *, - num_records, - records_chunk_size, - ): - shape = [num_records] - dimensions = ["records"] - chunks = [records_chunk_size] - return ZarrArraySpec.new( - name=bed_field.name, - dtype=bed_field.smallest_dtype(), - shape=shape, - chunks=chunks, - dimensions=dimensions, - description=bed_field.description, - ) - - -ZARR_SCHEMA_FORMAT_VERSION = "0.1" - - -@dataclasses.dataclass -class Contig: - id: str - - -@dataclasses.dataclass -class Name: - id: str - - -@dataclasses.dataclass -class BedMetadata(core.JsonDataclass): - contigs: list - names: list - fields: list - bed_type: BedType - num_records: int = -1 - records_chunk_size: int = 1000 - - @property - def num_contigs(self): - return len(self.contigs) - - @property - def num_names(self): - return len(self.names) - - -@dataclasses.dataclass -class BedZarrSchema(core.JsonDataclass): - format_version: str - fields: list - contigs: list - names: list - bed_type: str - records_chunk_size: int - - def validate(self): - """ - Checks that the schema is well-formed and within required limits. - """ - - @staticmethod - def fromdict(d): - if d["format_version"] != ZARR_SCHEMA_FORMAT_VERSION: - raise ValueError( - "BedZarrSchema format version mismatch: " - f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}" - ) - ret = BedZarrSchema(**d) - ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]] - ret.contig_ids = [Contig(**sd) for sd in d["contigs"]] - ret.name_ids = [Name(**sd) for sd in d["names"]] - return ret - - @staticmethod - def fromjson(s): - return BedZarrSchema.fromdict(json.loads(s)) - - @staticmethod - def generate(metadata, records_chunk_size=None): - if records_chunk_size is None: - records_chunk_size = 1000 - logger.info("Generating schema with chunks=%d", records_chunk_size) - fields = metadata.fields - - def spec_from_field(field): - return ZarrArraySpec.from_field( - field, - num_records=metadata.num_records, - records_chunk_size=records_chunk_size, - ) - - specs = [spec_from_field(f) for f in fields] - - # Contig_id and name_id specs unnecessary? 
- contig_id_spec = ZarrArraySpec.new( - name="contig_id", - dtype="O", - shape=[metadata.num_contigs], - chunks=[metadata.num_contigs], - dimensions=["contigs"], - description="Contig ID", - ) - name_id_spec = ZarrArraySpec.new( - name="name_id", - dtype="O", - shape=[metadata.num_names], - chunks=[metadata.num_names], - dimensions=["names"], - description="Name ID", - ) - specs.extend([contig_id_spec, name_id_spec]) - return BedZarrSchema( - format_version=ZARR_SCHEMA_FORMAT_VERSION, - fields=specs, - contigs=metadata.contigs, - names=metadata.names, - bed_type=metadata.bed_type.name, - records_chunk_size=records_chunk_size, - ) - - @dataclasses.dataclass class BedFieldSummary(core.JsonDataclass): num_chunks: int = 0 @@ -248,140 +95,6 @@ def guess_bed_file_type(path): return BedType(num_cols) -BZW_METADATA_FORMAT_VERSION = "0.1" - - -@dataclasses.dataclass -class BedZarrWriterMetadata(core.JsonDataclass): - format_version: str - fields: list - contigs: list - names: list - bed_type: str - schema: BedZarrSchema - - @staticmethod - def fromdict(d): - if d["format_version"] != BZW_METADATA_FORMAT_VERSION: - raise ValueError( - "BedZarrWriterMetadata format version mismatch: " - f"{d['format_version']} != {BZW_METADATA_FORMAT_VERSION}" - ) - ret = BedZarrWriterMetadata(**d) - ret.schema = BedZarrSchema.fromdict(d["schema"]) - return ret - - -class BedZarrWriter: - def __init__(self, path): - self.path = pathlib.Path(path) - self.wip_path = self.path / "wip" - self.data = None - self.metadata = None - self.bed_path = None - self.bed_type = None - self.fields = None - - @property - def schema(self): - if self.metadata is None: - return - return self.metadata.schema - - def init( - self, - data, - *, - metadata, - bed_path, - schema, - ): - self.data = data - self.bed_metadata = metadata - self.bed_path = bed_path - - if self.path.exists(): - raise ValueError("Zarr path already exists") - schema.validate() - - self.metadata = BedZarrWriterMetadata( - format_version=BZW_METADATA_FORMAT_VERSION, - bed_type=self.bed_metadata.bed_type.name, - fields=self.fields, - contigs=self.bed_metadata.contigs, - names=self.bed_metadata.names, - schema=schema, - ) - self.path.mkdir() - self.wip_path.mkdir() - logger.info("Writing WIP metadata") - with open(self.wip_path / "metadata.json", "w") as f: - json.dump(self.metadata.asdict(), f, indent=4) - - def write(self): - store = zarr.DirectoryStore(self.path) - root = zarr.group(store=store) - root.attrs.update( - { - "bed_zarr_version": "0.1", - "source": f"bio2zarr-{provenance.__version__}", - } - ) - datafields = self.schema.fields[0 : BedType[self.schema.bed_type].value] - d = {i: f.name for i, f in enumerate(datafields)} - self.data.rename(columns=d, inplace=True) - self.encode_fields(root, datafields) - self.encode_contig_id(root) - self.encode_name_id(root) - - def encode_contig_id(self, root): - array = root.array( - "contig_id", - [contig.id for contig in self.schema.contigs], - dtype="str", - compressor=DEFAULT_ZARR_COMPRESSOR, - ) - array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"] - - def encode_name_id(self, root): - array = root.array( - "name_id", - [name.id for name in self.schema.names], - dtype="str", - compressor=DEFAULT_ZARR_COMPRESSOR, - ) - array.attrs["_ARRAY_DIMENSIONS"] = ["names"] - - def finalise(self): - self.load_metadata() - logger.debug("Removing %s", self.wip_path) - shutil.rmtree(self.wip_path) - logger.info("Consolidating Zarr metadata") - zarr.consolidate_metadata(self.path) - - def load_metadata(self): - if self.metadata is None: 
-            with open(self.wip_path / "metadata.json") as f:
-                self.metadata = BedZarrWriterMetadata.fromdict(json.load(f))
-
-    def encode_fields(self, root, datafields):
-        for field in datafields:
-            object_codec = None
-            if field.dtype == "O":
-                object_codec = numcodecs.VLenUTF8()
-            array = root.array(
-                field.name,
-                self.data[field.name].values,
-                shape=field.shape,
-                dtype=field.dtype,
-                compressor=DEFAULT_ZARR_COMPRESSOR,
-                chunks=(self.schema.records_chunk_size,),
-                object_codec=object_codec,
-            )
-            array.attrs["_ARRAY_DIMENSIONS"] = ["records"]
-            logger.debug("%s done", field)
-
-
 # See https://samtools.github.io/hts-specs/BEDv1.pdf
 def mandatory_bed_field_definitions():
     def make_field_def(name, alias, bed_dtype, description=""):
@@ -436,63 +149,27 @@ def mkfields(bed_type):
     return mandatory + optional
 
 
-def parse_bed(bed_path, records_chunk_size=None):
-    """Read and parse BED file and return data frame and metadata."""
-    bed_type = guess_bed_file_type(bed_path)
-    fields = mkfields(bed_type)
-    data = pd.read_table(bed_path, header=None, names=[f.name for f in fields])
-    data, contig_id, name_id = encode_categoricals(data, bed_type)
-    data = parse_csv_fields(data, bed_type)
-    fields = update_field_bounds(bed_type, data)
-    metadata = BedMetadata(
-        contigs=[Contig(c) for c in contig_id],
-        names=[Name(f) for f in name_id],
-        fields=fields,
-        bed_type=bed_type,
-        num_records=data.shape[0],
-        records_chunk_size=records_chunk_size,
-    )
-    return data, metadata
-
-
-def parse_csv_fields(data, bed_type):
-    if bed_type.value < BedType.BED8.value:
-        return data
-
-    def _convert_csv_data(values):
-        if values.dtype == "O":
-            ret = values.str.split(",").apply(lambda x: np.array([int(y) for y in x]))
-        else:
-            ret = values
-        return ret
-
-    if bed_type.value >= BedType.BED9.value:
-        data["itemRgb"] = _convert_csv_data(data["itemRgb"])
-    if bed_type.value == BedType.BED12.value:
-        data["blockSizes"] = _convert_csv_data(data["blockSizes"])
-        data["blockStarts"] = _convert_csv_data(data["blockStarts"])
-    return data
-
-
 def encode_categoricals(data, bed_type):
     """Convert categoricals to integer encodings."""
     contig = pd.Categorical(
         data["contig"], categories=data["contig"].unique(), ordered=True
     )
     contig_id = contig.categories.values
+    contig_id = contig_id.astype(f"<U{len(max(contig_id, key=len))}")
     data["contig"] = contig.codes
     if bed_type.value >= BedType.BED4.value:
         name = pd.Categorical(
             data["name"], categories=data["name"].unique(), ordered=True
         )
         name_id = name.categories.values
+        name_id = name_id.astype(f"<U{len(max(name_id, key=len))}")
         data["name"] = name.codes
     else:
         name_id = []
     return data, contig_id, name_id
 
 
-def update_field_bounds(bed_type, data):
+def update_field_bounds(data, bed_type):
     """Update field bounds based on data."""
     ret = []
     fields = mkfields(bed_type)
@@ -516,41 +186,29 @@ def update_field_bounds(bed_type, data):
     return ret
 
 
-def mkschema(bed_path, out):
-    """Write schema to file"""
-    _, metadata = parse_bed(bed_path)
-    spec = BedZarrSchema.generate(
-        metadata, records_chunk_size=metadata.records_chunk_size
-    )
-    out.write(spec.asjson())
-
-
-def init_schema(schema_path, metadata):
-    if schema_path is None:
-        schema = BedZarrSchema.generate(
-            metadata, records_chunk_size=metadata.records_chunk_size
-        )
-    else:
-        logger.info("Reading schema from %s", schema_path)
-        with open(schema_path) as f:
-            schema = BedZarrSchema.fromjson(f.read())
-    return schema
-
-
 def bed2zarr(
     bed_path,
     zarr_path,
-    schema_path=None,
     records_chunk_size=None,
 ):
-    data, metadata = parse_bed(bed_path, records_chunk_size)
-    schema = init_schema(schema_path, metadata)
-    bedzw = BedZarrWriter(zarr_path)
-    bedzw.init(
-        data=data,
-        metadata=metadata,
-        bed_path=bed_path,
-        schema=schema,
-    )
-    bedzw.write()
-    bedzw.finalise()
+    bed_type = guess_bed_file_type(bed_path)
+    fields = mkfields(bed_type)
+    data = pd.read_table(bed_path, header=None, names=[f.name for f in fields])
+    data, contig_id, name_id = encode_categoricals(data, bed_type)
+    fields = update_field_bounds(data, bed_type)
+    dtypes = {f.name: f.smallest_dtype() for f in fields}
+    data = data.astype(dtypes)
+    data.index.name = "records"
+    ds = xr.Dataset.from_dataframe(data)
+    if records_chunk_size is None:
+        records_chunk_size = len(data)
+    chunks = {
+        "records": records_chunk_size,
+        "contigs": len(contig_id),
+    }
+    ds["contig_id"] = xr.DataArray(contig_id, dims=["contigs"])
+    if bed_type.value >= BedType.BED4.value:
+        ds["name_id"] = xr.DataArray(name_id, dims=["names"])
+        chunks["names"] = len(name_id)
+    ds = ds.chunk(chunks)
+    ds.to_zarr(zarr_path, mode="w")
diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
index 3a4623f5..dc0f2cbb 100644
--- a/bio2zarr/cli.py
+++ b/bio2zarr/cli.py
@@ -589,14 +589,12 @@ def plink2zarr():
     type=click.Path(exists=True, dir_okay=False),
 )
 @new_zarr_path
-@schema
 @records_chunk_size
 @verbose
 @force
 def bed2zarr_main(
     bed_path,
     zarr_path,
-    schema,
     records_chunk_size,
     verbose,
     force,
@@ -614,7 +612,6 @@ def bed2zarr_main(
         bed_path,
         zarr_path,
         records_chunk_size=records_chunk_size,
-        schema_path=schema,
     )
 
 
diff --git a/tests/test_bed.py b/tests/test_bed.py
index 28f5f5d2..a122f3dd 100644
--- a/tests/test_bed.py
+++ b/tests/test_bed.py
@@ -62,47 +62,6 @@ def bed_path(bed_data, tmp_path):
     return out
 
 
-@pytest.fixture()
-def schema_path(bed_path, tmp_path_factory):
-    out = tmp_path_factory.mktemp("data") / "example.schema.json"
-    with open(out, "w") as f:
-        bed2zarr.mkschema(bed_path, f)
-    return out
-
-
-@pytest.fixture()
-def schema(schema_path):
-
with open(schema_path) as f: - return bed2zarr.BedZarrSchema.fromjson(f.read()) - - -class TestDefaultSchema: - @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) - def test_format_version(self, schema): - assert schema.format_version == bed2zarr.ZARR_SCHEMA_FORMAT_VERSION - - -class TestSchema: - @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) - def test_generate_schema(self, bed_path, request): - bedspec = request.node.callspec.params["bed_data"] - bed_type = bed2zarr.BedType(bedspec) - data, metadata = bed2zarr.parse_bed(bed_path) - schema = bed2zarr.BedZarrSchema.generate( - metadata, records_chunk_size=metadata.records_chunk_size - ) - assert schema.bed_type == bed_type.name - assert schema.records_chunk_size == 1000 - assert len(schema.contigs) == 1 - assert schema.contigs[0].id == "chr22" - if bed_type.value >= bed2zarr.BedType.BED4.value: - assert len(schema.names) == 2 - assert schema.names[0].id == "cloneA" - assert schema.names[1].id == "cloneB" - if bed_type == bed2zarr.BedType.BED12: - assert len(schema.fields) == 12 - - class TestBed2Zarr: @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) def test_bed2zarr(self, bed_path, bed_df, tmp_path, request): From a398a9196f689682350545ed6ec606b60f9c7c2f Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 13 Dec 2024 09:38:54 +0100 Subject: [PATCH 15/17] Set smallest dtypes on DataSet --- bio2zarr/bed2zarr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 66365341..0fbfd599 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -203,9 +203,10 @@ def bed2zarr( data, contig_id, name_id = encode_categoricals(data, bed_type) fields = update_field_bounds(data, bed_type) dtypes = {f.name: f.smallest_dtype() for f in fields} - data = data.astype(dtypes) data.index.name = "records" ds = xr.Dataset.from_dataframe(data) + for k, v in dtypes.items(): + ds[k] = ds[k].astype(v) if records_chunk_size is None: records_chunk_size = len(data) chunks = { From d6e9b82501acdd1a2229fa5dbb6a909fff66d523 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 13 Dec 2024 10:10:08 +0100 Subject: [PATCH 16/17] Linting --- bio2zarr/bed2zarr.py | 6 +----- bio2zarr/cli.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 0fbfd599..f73f6861 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -1,9 +1,6 @@ import dataclasses -import json import logging import math -import pathlib -import shutil from enum import Enum from typing import Any @@ -11,9 +8,8 @@ import numpy as np import pandas as pd import xarray as xr -import zarr -from . import core, provenance +from . 
import core
 
 logger = logging.getLogger(__name__)
 
diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
index dc0f2cbb..aec6811c 100644
--- a/bio2zarr/cli.py
+++ b/bio2zarr/cli.py
@@ -595,7 +595,7 @@ def bed2zarr_main(
     bed_path,
     zarr_path,
-    records_chunk_size, 
+    records_chunk_size,
     verbose,
     force,
 ):

From 1bf2b00a35f95fae9b38b6809a2fb1fadada4ae3 Mon Sep 17 00:00:00 2001
From: Per Unneberg
Date: Fri, 13 Dec 2024 14:21:18 +0100
Subject: [PATCH 17/17] Remove xarray dependency

---
 bio2zarr/bed2zarr.py | 44 ++++++++++++++++++++++++++++----------------
 tests/test_bed.py    |  1 +
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py
index f73f6861..de36a3c0 100644
--- a/bio2zarr/bed2zarr.py
+++ b/bio2zarr/bed2zarr.py
@@ -7,13 +7,14 @@
 import numcodecs
 import numpy as np
 import pandas as pd
-import xarray as xr
+import zarr
 
-from . import core
+from . import core, provenance
 
 logger = logging.getLogger(__name__)
 
 DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
+BED_ZARR_VERSION = 0.1
 
 
 class BedType(Enum):
@@ -200,18 +201,29 @@ def bed2zarr(
     fields = update_field_bounds(data, bed_type)
     dtypes = {f.name: f.smallest_dtype() for f in fields}
     data.index.name = "records"
-    ds = xr.Dataset.from_dataframe(data)
-    for k, v in dtypes.items():
-        ds[k] = ds[k].astype(v)
-    if records_chunk_size is None:
-        records_chunk_size = len(data)
-    chunks = {
-        "records": records_chunk_size,
-        "contigs": len(contig_id),
-    }
-    ds["contig_id"] = xr.DataArray(contig_id, dims=["contigs"])
+    data = data.astype(dtypes)
+    store = zarr.DirectoryStore(zarr_path)
+    root = zarr.group(store=store)
+    root.attrs.update(
+        {
+            "bed_zarr_version": f"{BED_ZARR_VERSION}",
+            "source": f"bio2zarr-{provenance.__version__}",
+        }
+    )
+    for field in fields[0 : bed_type.value]:
+        if field.name == "strand":
+            root.array(
+                field.name,
+                data[field.name].values,
+                chunks=(records_chunk_size,),
+                dtype="<U1",
+            )
+        else:
+            root.array(
+                field.name,
+                data[field.name].values,
+                chunks=(records_chunk_size,),
+            )
+    root.array("contig_id", contig_id, chunks=(len(contig_id),))
     if bed_type.value >= BedType.BED4.value:
-        ds["name_id"] = xr.DataArray(name_id, dims=["names"])
-        chunks["names"] = len(name_id)
-    ds = ds.chunk(chunks)
-    ds.to_zarr(zarr_path, mode="w")
+        root.array("name_id", name_id, chunks=(len(name_id),))
diff --git a/tests/test_bed.py b/tests/test_bed.py
index a122f3dd..3f162679 100644
--- a/tests/test_bed.py
+++ b/tests/test_bed.py
@@ -85,6 +85,7 @@ def test_bed2zarr(self, bed_path, bed_df, tmp_path, request):
             np.testing.assert_array_equal(root["thickStart"][:], bed_df[6].values)
         if bed_type.value >= bed2zarr.BedType.BED8.value:
             np.testing.assert_array_equal(root["thickEnd"][:], bed_df[7].values)
+        print(zarr_path)
 
 
 class TestBedData:
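
A minimal end-to-end sketch of the converter as it stands after PATCH 17, using the sample_mask.bed.gz fixture added in PATCH 01. The call mirrors the one made by the CLI wrapper above, and the array names follow the assertions in tests/test_bed.py; the output path is illustrative only.

    import zarr

    from bio2zarr import bed2zarr

    # Convert a gzipped BED file; one Zarr array is written per column.
    bed2zarr.bed2zarr(
        bed_path="tests/data/bed/sample_mask.bed.gz",
        zarr_path="/tmp/sample_mask.zarr",
        records_chunk_size=1000,
    )

    root = zarr.open("/tmp/sample_mask.zarr", mode="r")
    # Mandatory BED3 columns, stored along the "records" dimension.
    print(root["start"][:], root["end"][:])
    # Contig names are stored once; "contig" holds their integer codes.
    print(root["contig_id"][:], root["contig"][:])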
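The update_field_bounds/smallest_dtype pair is what keeps these arrays compact: integer columns are scanned for their observed minimum and maximum, and each column is cast to the narrowest integer dtype that can hold them. A self-contained sketch of the same idea, with numpy's min_scalar_type standing in for the core helper the module delegates to:

    import numpy as np
    import pandas as pd

    # A BED "score" column: values in 0..1000 fit in uint16, not int64.
    scores = pd.Series([0, 250, 850, 1000])
    dtype = np.promote_types(
        np.min_scalar_type(int(scores.min())),
        np.min_scalar_type(int(scores.max())),
    )
    print(dtype)                        # uint16
    print(scores.astype(dtype).values)  # on-disk form of the column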