From 8e97bc3f845977dcc0b0ed77af54075c14127a0a Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 17 Sep 2024 16:10:59 +0200 Subject: [PATCH 01/17] Draft bed2zarr code Draft code to convert Bed3-format to Zarr. --- bio2zarr/__main__.py | 1 + bio2zarr/bed2zarr.py | 116 ++++++++++++++++++ bio2zarr/cli.py | 37 +++++- pyproject.toml | 1 + .../1kg_2020_chr20_annotations_mask.bed.gz | Bin 0 -> 95 bytes ...1kg_2020_chr20_annotations_mask.bed.gz.csi | Bin 0 -> 122 bytes tests/data/bed/sample_mask.bed.gz | Bin 0 -> 84 bytes tests/data/bed/sample_mask.bed.gz.csi | Bin 0 -> 149 bytes tests/test_bed.py | 50 ++++++++ 9 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 bio2zarr/bed2zarr.py create mode 100644 tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz create mode 100644 tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi create mode 100644 tests/data/bed/sample_mask.bed.gz create mode 100644 tests/data/bed/sample_mask.bed.gz.csi create mode 100644 tests/test_bed.py diff --git a/bio2zarr/__main__.py b/bio2zarr/__main__.py index cab080b6..024b9ab2 100644 --- a/bio2zarr/__main__.py +++ b/bio2zarr/__main__.py @@ -15,6 +15,7 @@ def bio2zarr(): # is handy for development and for those whose PATHs aren't set # up in the right way. bio2zarr.add_command(cli.vcf2zarr_main) +bio2zarr.add_command(cli.bed2zarr_main) bio2zarr.add_command(cli.plink2zarr) bio2zarr.add_command(cli.vcfpartition) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py new file mode 100644 index 00000000..8328c252 --- /dev/null +++ b/bio2zarr/bed2zarr.py @@ -0,0 +1,116 @@ +import dataclasses +import numpy as np +import pathlib +import gzip +import zarr +from . import core + + +# see https://samtools.github.io/hts-specs/BEDv1.pdf +@dataclasses.dataclass +class Bed3: + """BED3 genomic region with chromosome, start, and end. Intervals + are 0-based, half-open.""" + + chrom: str + start: int + end: int + + @property + def width(self): + """Width of the region.""" + return self.end - self.start + + def __len__(self): + return self.width + + def mask(self, invert=False): + """Create a mask for the region. The mask is an array of 1's + (0's if inverted).""" + func = np.zeros if invert else np.ones + return func(self.width, dtype=np.uint8) + + +class BedReader: + def __init__(self, bed_path): + self.bed_path = pathlib.Path(bed_path) + + def __enter__(self): + if self.bed_path.suffix == ".gz": + self.fh = gzip.open(self.bed_path, "rt") + else: + self.fh = self.bed_path.open("rt") + + return self + + def __exit__(self, *args): + self.fh.close() + + +# Here we are assuming that we write a mask. However, the BED file +# could represent other things, such as scores, and there could be up +# to 9 columns, in which case more fields (aka data arrays?) would be +# needed. +def bed2zarr( + bed_path, + zarr_path, + bed_array="bed_mask", # More generic name? + show_progress=False, +): + # 1. Make sure the bed file is gzipped and indexed + bed_path = pathlib.Path(bed_path) + + if bed_path.suffix != ".gz": + raise ValueError("BED file must be gzipped.") + if ( + not bed_path.with_suffix(".gz.csi").exists() + or not bed_path.with_suffix(".gz.tbi").exists() + ): + raise ValueError("BED file must be indexed.") + + # 2. Make sure there are contig lengths + store = zarr.open(zarr_path) + if "contig_length" not in store: + raise ValueError( + ( + "No contig lengths in Zarr store. Contig lengths must be" + " present in the Zarr store before writing Bed entries." + ) + ) + # 2b. 
Make chromosome to integer mapping + chrom_d = { + k: v for k, v in zip(store["contig_id"], np.arange(len(store["contig_id"]))) + } + # 2c. Make cumulative index of contig lengths + contig_indices = np.insert(np.cumsum(store["contig_length"])[:-1], 0, 0) + + # 3. Init the zarr group with the contig lengths + # bed_array and bed_array_contig are of equal lengths = total genome + if bed_array not in store: + bed_array_contig = f"{bed_array}_contig" + dtype = core.min_int_dtype(0, len(store["contig_id"])) + n_bases = np.sum(store["contig_length"]) + + store.create_dataset(bed_array, fill_value=0, dtype=dtype, shape=(n_bases,)) + store.create_dataset( + bed_array_contig, + data=np.repeat( + np.arange(len(store["contig_id"])), store["contig_length"] + ).astype(dtype), + dtype=dtype, + shape=(n_bases,), + ) + + # 4. Read the bed file and write the mask to the zarr dataset, + # updating for each entry; many I/O operations; better read entire + # file, store regions by chromosomes and generate index by + # chromosome for all regions? + with BedReader(bed_path) as br: + for line in br.fh: + chrom, start, end = line.strip().split("\t") + i = chrom_d[chrom] + start = int(start) + contig_indices[i] + end = int(end) + contig_indices[i] + bed = Bed3(chrom, start, end) + mask = bed.mask() + store[bed_array][start:end] = mask diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 0722521d..816de45b 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -8,7 +8,7 @@ import numcodecs import tabulate -from . import plink, provenance, vcf2zarr, vcf_utils +from . import plink, provenance, vcf2zarr, vcf_utils, bed2zarr from .vcf2zarr import icf as icf_mod logger = logging.getLogger(__name__) @@ -574,6 +574,41 @@ def plink2zarr(): plink2zarr.add_command(convert_plink) +@click.command +@version +@click.argument( + "bed_path", + type=click.Path(exists=True, dir_okay=False), +) +@zarr_path +@click.argument( + "zarr_field", + type=str, +) +@verbose +@force +@progress +def bed2zarr_main(bed_path, zarr_path, bed_array, verbose, force, progress): + """ + Convert BED file to the Zarr format. The BED regions will be + converted to binary-encoded arrays whose length is equal to the + length of the reference genome. The BED file regions are used to + mask the reference genome, where the masked regions are set to 1 + and the unmasked regions are set to 0. + + The BED file must be compressed and tabix-indexed. 
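+
+    Example (hypothetical paths; the arguments are the BED file, the
+    Zarr store, and the name of the mask array to write):
+
+        bed2zarr sample_mask.bed.gz sample.zarr bed_mask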
+ """ + setup_logging(verbose) + path = pathlib.Path(zarr_path) / bed_array + check_overwrite_dir(path, force) + bed2zarr( + bed_path, + zarr_path, + bed_array, + show_progress=progress, + ) + + @click.command @version @vcfs diff --git a/pyproject.toml b/pyproject.toml index 08bc9742..b57758b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ repository = "https://github.com/sgkit-dev/bio2zarr" documentation = "https://sgkit-dev.github.io/bio2zarr/" [project.scripts] +bed2zarr = "bio2zarr.cli:bed2zarr" vcf2zarr = "bio2zarr.cli:vcf2zarr_main" vcfpartition = "bio2zarr.cli:vcfpartition" diff --git a/tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz b/tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz new file mode 100644 index 0000000000000000000000000000000000000000..8713cdfcc239aef2781152275d05ae7496a2d52d GIT binary patch literal 95 zcmb2|=3rp}f&Xj_PR>jWP7JOm7BV&%2(TE`&Q-JdBDrGOuHW1$6C;05@$oS3-RkeT eI{en#`Ry?YHQfgS7|nq?<jWu?)EjW<_xS2hP(_4JO|$S`FcM%!1-_L1U=2GI+?3Wu2g!QV|S`t T{6-O|TpmrIGy^l(7!UyfvL6%P literal 0 HcmV?d00001 diff --git a/tests/data/bed/sample_mask.bed.gz.csi b/tests/data/bed/sample_mask.bed.gz.csi new file mode 100644 index 0000000000000000000000000000000000000000..2d96b9276eaa8425768351223ab2b83d822bc8f3 GIT binary patch literal 149 zcmb2|=3rp}f&Xj_PR>jW6%3_k4EdT31Q-r(ysY4QiaFykyUNN*R-#)>_H>IYGqRm- z%6(U|`u6_|t?W`N0xT|fK5V)4;P2;3$?4TD=BhIvJlJ=(&g-}3ydR(3FY46ZyWO4k lcKxiS`Q^WDbFb~oOG`_)eOF+^$iN_vX0bE_GuRy<0s!@KH-rEH literal 0 HcmV?d00001 diff --git a/tests/test_bed.py b/tests/test_bed.py new file mode 100644 index 00000000..689a60bd --- /dev/null +++ b/tests/test_bed.py @@ -0,0 +1,50 @@ +import pytest +import sgkit as sg +import xarray.testing as xt +import zarr +from bio2zarr import bed2zarr, vcf2zarr + + +class Test1kgBed: + data_path = "tests/data/vcf/1kg_2020_chr20_annotations.bcf" + bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" + csi_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi" + + @pytest.fixture(scope="module") + def icf(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "1kg_2020.exploded" + vcf2zarr.explode(out, [self.data_path]) + return out + + @pytest.fixture(scope="module") + def zarr(self, icf, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "1kg_2020.zarr" + vcf2zarr.encode(icf, out) + return out + + def test_add_mask_chr20(self, zarr): + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr, show_progress=True) + + +class TestSampleBed: + data_path = "tests/data/vcf/sample.bcf" + bed_path = "tests/data/bed/sample_mask.bed.gz" + csi_path = "tests/data/bed/sample_mask.bed.gz.csi" + + @pytest.fixture(scope="module") + def icf(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "sample.exploded" + vcf2zarr.explode(out, [self.data_path]) + return out + + @pytest.fixture(scope="module") + def zarr(self, icf, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "sample.zarr" + vcf2zarr.encode(icf, out) + return out + + def test_add_mask_sample(self, zarr): + with pytest.raises(ValueError): + bed2zarr.bed2zarr( + bed_path=self.bed_path, zarr_path=zarr, show_progress=True + ) From 49b6f6af911b40299c1621db684403ce688b9429 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 17 Sep 2024 16:34:54 +0200 Subject: [PATCH 02/17] Fix bed2zarr call --- bio2zarr/bed2zarr.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 8328c252..852fa7d5 100644 --- 
a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -91,7 +91,7 @@ def bed2zarr( dtype = core.min_int_dtype(0, len(store["contig_id"])) n_bases = np.sum(store["contig_length"]) - store.create_dataset(bed_array, fill_value=0, dtype=dtype, shape=(n_bases,)) + store.create_dataset(bed_array, fill_value=0, dtype="u1", shape=(n_bases,)) store.create_dataset( bed_array_contig, data=np.repeat( diff --git a/pyproject.toml b/pyproject.toml index b57758b8..2883186f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ repository = "https://github.com/sgkit-dev/bio2zarr" documentation = "https://sgkit-dev.github.io/bio2zarr/" [project.scripts] -bed2zarr = "bio2zarr.cli:bed2zarr" +bed2zarr = "bio2zarr.cli:bed2zarr_main" vcf2zarr = "bio2zarr.cli:vcf2zarr_main" vcfpartition = "bio2zarr.cli:vcfpartition" From 1c195c6807c42b1a4e1a87e711f2f1a84f5229f5 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 17 Sep 2024 16:37:27 +0200 Subject: [PATCH 03/17] Rename zarr_field to bed_array --- bio2zarr/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 816de45b..10addbf3 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -582,7 +582,7 @@ def plink2zarr(): ) @zarr_path @click.argument( - "zarr_field", + "bed_array", type=str, ) @verbose From 1e3efa1d9273dae179ea2cb52c1c953b24a731ca Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Wed, 18 Sep 2024 22:49:36 +0200 Subject: [PATCH 04/17] Refactor bed2zarr code base - use pandas for reading - write to isolated zarr archive - map BED columns to arrays named after BED specification (hts-specs) --- bio2zarr/bed2zarr.py | 179 ++++++++++++++++++++----------------------- bio2zarr/cli.py | 16 ++-- tests/test_bed.py | 51 ++++-------- 3 files changed, 103 insertions(+), 143 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 852fa7d5..a2c91698 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -1,116 +1,105 @@ import dataclasses -import numpy as np +import logging import pathlib -import gzip + +import numcodecs +import pandas as pd import zarr -from . import core +from . import core, provenance -# see https://samtools.github.io/hts-specs/BEDv1.pdf -@dataclasses.dataclass -class Bed3: - """BED3 genomic region with chromosome, start, and end. Intervals - are 0-based, half-open.""" +logger = logging.getLogger(__name__) - chrom: str - start: int - end: int +DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7) - @property - def width(self): - """Width of the region.""" - return self.end - self.start - def __len__(self): - return self.width +@dataclasses.dataclass +class BedMetadata(core.JsonDataclass): + fields: list - def mask(self, invert=False): - """Create a mask for the region. The mask is an array of 1's - (0's if inverted).""" - func = np.zeros if invert else np.ones - return func(self.width, dtype=np.uint8) +@dataclasses.dataclass +class BedField: + category: str + name: str + + +class BedZarrWriter: + def __init__(self, path): + self.path = pathlib.Path(path) + self.metadata = None + self.data = None + + def init( + self, + bed_path, + *, + show_progress=False, + ): + self.read(bed_path) + fields = mandatory_bed_field_definitions() + # FIXME: add optional fields; depends on the number of columns + # in the bed file. BED3 is the minimum. 
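+        # (One possible shape for that fix, sketched as a comment only:
+        #  have read() record the column count, then extend the list with
+        #  fields += optional_bed_field_definitions(num_cols - 3), using
+        #  the optional-field helper that a later patch in this series
+        #  introduces.)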
+ self.metadata = BedMetadata(fields) + self.path.mkdir() + store = zarr.DirectoryStore(self.path) + root = zarr.group(store=store) + root.attrs.update( + { + "bed_zarr_version": "0.1", + "source": f"bio2zarr-{provenance.__version__}", + } + ) + self.encode_mandatory_fields(root) + + # FIXME: error checking, number of columns, etc. + def read(self, bed_path): + logger.info("Reading bed file %s", bed_path) + first = pd.read_table(bed_path, header=None, nrows=1) + num_cols = len(first.columns) + if num_cols < 3: + raise ValueError(f"Expected at least 3 columns in bed file, got {num_cols}") + + # FIXME: support chunked reading + self.data = pd.read_table(bed_path, header=None).rename( + columns={0: "chrom", 1: "chromStart", 2: "chromEnd"} + ) -class BedReader: - def __init__(self, bed_path): - self.bed_path = pathlib.Path(bed_path) + def encode_mandatory_fields(self, root): + for field, dtype in zip( + ["chrom", "chromStart", "chromEnd"], ["str", "int", "int"] + ): + # FIXME: Check schema for chunks + array = root.array( + field, + self.data[field].values, + dtype=dtype, + compressor=DEFAULT_ZARR_COMPRESSOR, + chunks=(1000,), + ) + logger.debug("%s done", field) - def __enter__(self): - if self.bed_path.suffix == ".gz": - self.fh = gzip.open(self.bed_path, "rt") - else: - self.fh = self.bed_path.open("rt") - return self +# See https://samtools.github.io/hts-specs/BEDv1.pdf +def mandatory_bed_field_definitions(): + def make_field_def(name): + return BedField( + category="mandatory", + name=name, + ) - def __exit__(self, *args): - self.fh.close() + fields = [ + make_field_def("chrom"), + make_field_def("chromStart"), + make_field_def("chromEnd"), + ] + return fields -# Here we are assuming that we write a mask. However, the BED file -# could represent other things, such as scores, and there could be up -# to 9 columns, in which case more fields (aka data arrays?) would be -# needed. def bed2zarr( bed_path, zarr_path, - bed_array="bed_mask", # More generic name? show_progress=False, ): - # 1. Make sure the bed file is gzipped and indexed - bed_path = pathlib.Path(bed_path) - - if bed_path.suffix != ".gz": - raise ValueError("BED file must be gzipped.") - if ( - not bed_path.with_suffix(".gz.csi").exists() - or not bed_path.with_suffix(".gz.tbi").exists() - ): - raise ValueError("BED file must be indexed.") - - # 2. Make sure there are contig lengths - store = zarr.open(zarr_path) - if "contig_length" not in store: - raise ValueError( - ( - "No contig lengths in Zarr store. Contig lengths must be" - " present in the Zarr store before writing Bed entries." - ) - ) - # 2b. Make chromosome to integer mapping - chrom_d = { - k: v for k, v in zip(store["contig_id"], np.arange(len(store["contig_id"]))) - } - # 2c. Make cumulative index of contig lengths - contig_indices = np.insert(np.cumsum(store["contig_length"])[:-1], 0, 0) - - # 3. Init the zarr group with the contig lengths - # bed_array and bed_array_contig are of equal lengths = total genome - if bed_array not in store: - bed_array_contig = f"{bed_array}_contig" - dtype = core.min_int_dtype(0, len(store["contig_id"])) - n_bases = np.sum(store["contig_length"]) - - store.create_dataset(bed_array, fill_value=0, dtype="u1", shape=(n_bases,)) - store.create_dataset( - bed_array_contig, - data=np.repeat( - np.arange(len(store["contig_id"])), store["contig_length"] - ).astype(dtype), - dtype=dtype, - shape=(n_bases,), - ) - - # 4. 
Read the bed file and write the mask to the zarr dataset, - # updating for each entry; many I/O operations; better read entire - # file, store regions by chromosomes and generate index by - # chromosome for all regions? - with BedReader(bed_path) as br: - for line in br.fh: - chrom, start, end = line.strip().split("\t") - i = chrom_d[chrom] - start = int(start) + contig_indices[i] - end = int(end) + contig_indices[i] - bed = Bed3(chrom, start, end) - mask = bed.mask() - store[bed_array][start:end] = mask + writer = BedZarrWriter(zarr_path) + writer.init(bed_path, show_progress=show_progress) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 10addbf3..fe60971e 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -8,7 +8,7 @@ import numcodecs import tabulate -from . import plink, provenance, vcf2zarr, vcf_utils, bed2zarr +from . import bed2zarr, plink, provenance, vcf2zarr, vcf_utils from .vcf2zarr import icf as icf_mod logger = logging.getLogger(__name__) @@ -580,15 +580,11 @@ def plink2zarr(): "bed_path", type=click.Path(exists=True, dir_okay=False), ) -@zarr_path -@click.argument( - "bed_array", - type=str, -) +@new_zarr_path @verbose @force @progress -def bed2zarr_main(bed_path, zarr_path, bed_array, verbose, force, progress): +def bed2zarr_main(bed_path, zarr_path, verbose, force, progress): """ Convert BED file to the Zarr format. The BED regions will be converted to binary-encoded arrays whose length is equal to the @@ -599,12 +595,10 @@ def bed2zarr_main(bed_path, zarr_path, bed_array, verbose, force, progress): The BED file must be compressed and tabix-indexed. """ setup_logging(verbose) - path = pathlib.Path(zarr_path) / bed_array - check_overwrite_dir(path, force) - bed2zarr( + check_overwrite_dir(zarr_path, force) + bed2zarr.bed2zarr( bed_path, zarr_path, - bed_array, show_progress=progress, ) diff --git a/tests/test_bed.py b/tests/test_bed.py index 689a60bd..72d8ad93 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -1,50 +1,27 @@ import pytest -import sgkit as sg -import xarray.testing as xt import zarr -from bio2zarr import bed2zarr, vcf2zarr +from bio2zarr import bed2zarr -class Test1kgBed: - data_path = "tests/data/vcf/1kg_2020_chr20_annotations.bcf" - bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" - csi_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz.csi" - - @pytest.fixture(scope="module") - def icf(self, tmp_path_factory): - out = tmp_path_factory.mktemp("data") / "1kg_2020.exploded" - vcf2zarr.explode(out, [self.data_path]) - return out +class TestBed: + bed_path = "tests/data/bed/sample_mask.bed.gz" - @pytest.fixture(scope="module") - def zarr(self, icf, tmp_path_factory): - out = tmp_path_factory.mktemp("data") / "1kg_2020.zarr" - vcf2zarr.encode(icf, out) + @pytest.fixture() + def zarr(self, tmp_path): + out = tmp_path / "sample_mask.zarr" return out - def test_add_mask_chr20(self, zarr): + def test_bed2zarr(self, zarr): bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr, show_progress=True) -class TestSampleBed: - data_path = "tests/data/vcf/sample.bcf" - bed_path = "tests/data/bed/sample_mask.bed.gz" - csi_path = "tests/data/bed/sample_mask.bed.gz.csi" - - @pytest.fixture(scope="module") - def icf(self, tmp_path_factory): - out = tmp_path_factory.mktemp("data") / "sample.exploded" - vcf2zarr.explode(out, [self.data_path]) - return out +class Test1kgBed: + bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" - @pytest.fixture(scope="module") - def zarr(self, icf, tmp_path_factory): - out = 
tmp_path_factory.mktemp("data") / "sample.zarr" - vcf2zarr.encode(icf, out) + @pytest.fixture() + def zarr(self, tmp_path): + out = tmp_path / "1kg_2020_chr20_annotations_mask.zarr" return out - def test_add_mask_sample(self, zarr): - with pytest.raises(ValueError): - bed2zarr.bed2zarr( - bed_path=self.bed_path, zarr_path=zarr, show_progress=True - ) + def test_bed2zarr(self, zarr): + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr, show_progress=True) From f94234e9f7f8a1f8d9ad9efd3d78c28bfe89bfd3 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 20 Sep 2024 14:34:52 +0200 Subject: [PATCH 05/17] Add initial support for BED3-BED12 - guess BED file type - add draft schema - add tests --- bio2zarr/bed2zarr.py | 366 ++++++++++++++++++++++++++++++++++++++----- bio2zarr/cli.py | 4 +- tests/test_bed.py | 112 ++++++++++++- 3 files changed, 437 insertions(+), 45 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index a2c91698..c5e77b04 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -1,6 +1,9 @@ import dataclasses +import json import logging import pathlib +import shutil +from enum import Enum import numcodecs import pandas as pd @@ -13,35 +16,220 @@ DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7) +class BedType(Enum): + BED3 = 3 + BED4 = 4 + BED5 = 5 + BED6 = 6 + BED7 = 7 + BED8 = 8 + BED9 = 9 + BED12 = 12 + + +class BedColumns(Enum): + contig = 0 + start = 1 + end = 2 + name = 3 + score = 4 + strand = 5 + thickStart = 6 + thickEnd = 7 + itemRgb = 8 + blockCount = 9 + blockSizes = 10 + blockStarts = 11 + + @dataclasses.dataclass -class BedMetadata(core.JsonDataclass): +class ZarrArraySpec: + name: str + dtype: str + shape: list + chunks: list + dimensions: tuple + description: str + compressor: dict + filters: list + + def __post_init__(self): + self.shape = tuple(self.shape) + self.chunks = tuple(self.chunks) + self.dimensions = tuple(self.dimensions) + self.filters = tuple(self.filters) + + @staticmethod + def new(**kwargs): + spec = ZarrArraySpec( + **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[] + ) + spec._choose_compressor_settings() + return spec + + def _choose_compressor_settings(self): + pass + + @staticmethod + def from_field( + bed_field, + *, + num_records, + records_chunk_size, + ): + shape = [num_records] + dimensions = ["records"] + chunks = [records_chunk_size] + return ZarrArraySpec.new( + name=bed_field.name, + dtype=bed_field.dtype, + shape=shape, + chunks=chunks, + dimensions=dimensions, + description=bed_field.description, + ) + + +ZARR_SCHEMA_FORMAT_VERSION = "0.1" + + +@dataclasses.dataclass +class BedZarrSchema(core.JsonDataclass): + format_version: str fields: list + bed_type: str + records_chunk_size: int + + def validate(self): + """ + Checks that the schema is well-formed and within required limits. 
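+
+        Currently a stub: a fuller check might verify, for example,
+        that records_chunk_size is positive and that each field's
+        shape matches the total number of records.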
+ """ + + @staticmethod + def fromdict(d): + if d["format_version"] != ZARR_SCHEMA_FORMAT_VERSION: + raise ValueError( + "BedZarrSchema format version mismatch: " + f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}" + ) + ret = BedZarrSchema(**d) + return ret + + @staticmethod + def fromjson(s): + return BedZarrSchema.fromdict(json.loads(s)) + + @staticmethod + def generate(num_records, bed_type, records_chunk_size=None): + if records_chunk_size is None: + records_chunk_size = 1000 + logger.info("Generating schema with chunks=%d", records_chunk_size) + + def spec_from_field(field): + return ZarrArraySpec.from_field( + field, + num_records=num_records, + records_chunk_size=records_chunk_size, + ) + + fields = mkfields(bed_type) + specs = [spec_from_field(field) for field in fields] + return BedZarrSchema( + format_version=ZARR_SCHEMA_FORMAT_VERSION, + fields=specs, + bed_type=bed_type.name, + records_chunk_size=records_chunk_size, + ) @dataclasses.dataclass class BedField: category: str name: str + alias: str + description: str + dtype: str + + +def guess_bed_file_type(path): + """Check the number of columns in a BED file and guess BED + type.""" + first = pd.read_table(path, header=None, nrows=1) + num_cols = first.shape[1] + if num_cols < 3: + raise ValueError(f"Expected at least 3 columns in BED file, got {num_cols}") + if num_cols > 12: + raise ValueError(f"Expected at most 12 columns in BED file, got {num_cols}") + if num_cols in (10, 11): + raise ValueError(f"BED10 and BED11 are prohibited, got {num_cols} columns") + return BedType(num_cols) + + +BZW_METADATA_FORMAT_VERSION = "0.1" + + +@dataclasses.dataclass +class BedZarrWriterMetadata(core.JsonDataclass): + format_version: str + fields: list + bed_type: str + schema: BedZarrSchema + + @staticmethod + def fromdict(d): + if d["format_version"] != BZW_METADATA_FORMAT_VERSION: + raise ValueError( + "BedZarrWriterMetadata format version mismatch: " + f"{d['format_version']} != {BZW_METADATA_FORMAT_VERSION}" + ) + ret = BedZarrWriterMetadata(**d) + ret.schema = BedZarrSchema.fromdict(d["schema"]) + return ret class BedZarrWriter: def __init__(self, path): self.path = pathlib.Path(path) + self.wip_path = self.path / "wip" self.metadata = None self.data = None + self.bed_path = None + self.bed_type = None + + @property + def schema(self): + return self.metadata.schema def init( self, - bed_path, + data, *, - show_progress=False, + bed_path, + bed_type, + schema, ): - self.read(bed_path) - fields = mandatory_bed_field_definitions() - # FIXME: add optional fields; depends on the number of columns - # in the bed file. BED3 is the minimum. - self.metadata = BedMetadata(fields) + self.data = data + self.bed_type = bed_type + self.bed_path = bed_path + + if self.path.exists(): + raise ValueError("Zarr path already exists") + schema.validate() + + fields = mkfields(self.bed_type) + self.metadata = BedZarrWriterMetadata( + format_version=BZW_METADATA_FORMAT_VERSION, + bed_type=self.bed_type.name, + fields=fields, + schema=schema, + ) self.path.mkdir() + self.wip_path.mkdir() + logger.info("Writing WIP metadata") + with open(self.wip_path / "metadata.json", "w") as f: + json.dump(self.metadata.asdict(), f, indent=4) + + def write(self): store = zarr.DirectoryStore(self.path) root = zarr.group(store=store) root.attrs.update( @@ -50,56 +238,158 @@ def init( "source": f"bio2zarr-{provenance.__version__}", } ) - self.encode_mandatory_fields(root) - - # FIXME: error checking, number of columns, etc. 
- def read(self, bed_path): - logger.info("Reading bed file %s", bed_path) - first = pd.read_table(bed_path, header=None, nrows=1) - num_cols = len(first.columns) - if num_cols < 3: - raise ValueError(f"Expected at least 3 columns in bed file, got {num_cols}") - - # FIXME: support chunked reading - self.data = pd.read_table(bed_path, header=None).rename( - columns={0: "chrom", 1: "chromStart", 2: "chromEnd"} - ) + d = {i: f.name for i, f in enumerate(self.schema.fields)} + self.data.rename(columns=d, inplace=True) + + # 1. Create contig name <-> contig id mapping - def encode_mandatory_fields(self, root): - for field, dtype in zip( - ["chrom", "chromStart", "chromEnd"], ["str", "int", "int"] - ): - # FIXME: Check schema for chunks + # 2. update self.data.contig based on mapping (-> also means + # schema needs modification since we change from string to + # int!) + + self.encode_fields(root) + + def finalise(self): + logger.debug("Removing %s", self.wip_path) + shutil.rmtree(self.wip_path) + + def load_metadata(self): + if self.metadata is None: + with open(self.wip_path / "metadata.json") as f: + self.metadata = BedZarrWriterMetadata.fromdict(json.load(f)) + + # FIXME: fields 9-12 are multi-dimensional + def encode_fields(self, root): + for field in self.schema.fields: + object_codec = None + if field.dtype == "O": + object_codec = numcodecs.VLenUTF8() array = root.array( - field, - self.data[field].values, - dtype=dtype, + field.name, + self.data[field.name].values, + shape=field.shape, + dtype=field.dtype, compressor=DEFAULT_ZARR_COMPRESSOR, - chunks=(1000,), + chunks=(self.schema.records_chunk_size,), + object_codec=object_codec, ) + array.attrs["_ARRAY_DIMENSIONS"] = ["records"] logger.debug("%s done", field) # See https://samtools.github.io/hts-specs/BEDv1.pdf def mandatory_bed_field_definitions(): - def make_field_def(name): + def make_field_def(name, alias, dtype, description=""): return BedField( category="mandatory", name=name, + alias=alias, + description=description, + dtype=dtype, ) fields = [ - make_field_def("chrom"), - make_field_def("chromStart"), - make_field_def("chromEnd"), + make_field_def("contig", "chrom", "O", "Chromosome name"), + make_field_def("start", "chromStart", "i8", "Feature start position"), + make_field_def("end", "chromEnd", "i8", "Feature end position"), ] return fields +def optional_bed_field_definitions(num_fields=0): + def make_field_def(name, alias, dtype=None, *, description=""): + return BedField( + category="optional", + name=name, + alias=alias, + description=description, + dtype=dtype, + ) + + fields = [ + make_field_def("name", "name", "O", description="Feature description"), + make_field_def("score", "score", "i2", description="A numerical value"), + make_field_def("strand", "strand", "O", description="Feature strand"), + make_field_def( + "thickStart", "thickStart", "i8", description="Thick start position" + ), + make_field_def("thickEnd", "thickEnd", "i8", description="Thick end position"), + make_field_def("itemRgb", "itemRgb", description="Display"), + make_field_def("blockCount", "blockCount", description="Number of blocks"), + make_field_def("blockSizes", "blockSizes", description="Block sizes"), + make_field_def( + "blockStarts", "blockStarts", description="Block start positions" + ), + ] + + return fields[:num_fields] + + +def mkschema(bed_path, out): + bed_type = guess_bed_file_type(bed_path) + data = pd.read_table(bed_path, header=None) + spec = BedZarrSchema.generate( + data.shape[0], + bed_type, + ) + out.write(spec.asjson()) + + 
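+# A possible round trip between mkschema() above and bed2zarr() below
+# (hypothetical file names): dump the generated schema, adjust it by
+# hand, then pass it back in via schema_path:
+#
+#     with open("regions.schema.json", "w") as out:
+#         mkschema("regions.bed.gz", out)
+#     bed2zarr("regions.bed.gz", "regions.zarr",
+#              schema_path="regions.schema.json")
+
+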
+def mkfields(bed_type): + mandatory = mandatory_bed_field_definitions() + optional = optional_bed_field_definitions(bed_type.value - BedType.BED3.value) + return mandatory + optional + + +def bed2zarr_init( + bed_path, + bed_type, + zarr_path, + *, + schema_path=None, + records_chunk_size=None, +): + data = pd.read_table(bed_path, header=None) + + if schema_path is None: + schema = BedZarrSchema.generate( + data.shape[0], + bed_type, + records_chunk_size=records_chunk_size, + ) + else: + logger.info("Reading schema from %s", schema_path) + if records_chunk_size is not None: + raise ValueError( + "Cannot specify schema along with chunk sizes" + ) # NEEDS TEST + with open(schema_path) as f: + schema = BedZarrSchema.fromjson(f.read()) + + # 2. init store + bedzw = BedZarrWriter(zarr_path) + bedzw.init( + data, + bed_path=bed_path, + bed_type=bed_type, + schema=schema, + ) + return bedzw + + def bed2zarr( bed_path, zarr_path, - show_progress=False, + schema_path=None, + records_chunk_size=None, ): - writer = BedZarrWriter(zarr_path) - writer.init(bed_path, show_progress=show_progress) + bed_type = guess_bed_file_type(bed_path) + bedzw = bed2zarr_init( + bed_path, + bed_type, + zarr_path, + schema_path=schema_path, + records_chunk_size=records_chunk_size, + ) + bedzw.write() + bedzw.finalise() diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index fe60971e..9179a08a 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -583,8 +583,7 @@ def plink2zarr(): @new_zarr_path @verbose @force -@progress -def bed2zarr_main(bed_path, zarr_path, verbose, force, progress): +def bed2zarr_main(bed_path, zarr_path, verbose, force): """ Convert BED file to the Zarr format. The BED regions will be converted to binary-encoded arrays whose length is equal to the @@ -599,7 +598,6 @@ def bed2zarr_main(bed_path, zarr_path, verbose, force, progress): bed2zarr.bed2zarr( bed_path, zarr_path, - show_progress=progress, ) diff --git a/tests/test_bed.py b/tests/test_bed.py index 72d8ad93..57df4e7b 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -1,9 +1,113 @@ +import pandas as pd import pytest -import zarr + from bio2zarr import bed2zarr +ALLOWED_BED_FORMATS = [3, 4, 5, 6, 7, 8, 9, 12] +DISALLOWED_BED_FORMATS = [2, 10, 11, 13] +ALL_BED_FORMATS = ALLOWED_BED_FORMATS + DISALLOWED_BED_FORMATS + + +@pytest.fixture() +def bed_data(request): + data = [ + [ + "chr22", + 1000, + 5000, + "cloneA", + 960, + "+", + 1000, + 5000, + 0, + 2, + "567,488", + "0,3512", + "foo", + ], + [ + "chr22", + 2000, + 6000, + "cloneB", + 900, + "-", + 2000, + 6000, + 0, + 2, + "433,399", + "0,3601", + "bar", + ], + ] + return [row[0 : request.param] for row in data] + + +@pytest.fixture() +def bed_df(bed_data): + return pd.DataFrame(bed_data) + + +@pytest.fixture() +def bed_path(bed_data, tmp_path): + out = tmp_path / "sample_mask.bed" + with open(out, "w") as fh: + for row in bed_data: + fh.write("\t".join(map(str, row)) + "\n") + return out + + +@pytest.fixture() +def schema_path(bed_path, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "example.schema.json" + with open(out, "w") as f: + bed2zarr.mkschema(bed_path, f) + return out + + +@pytest.fixture() +def schema(schema_path): + with open(schema_path) as f: + return bed2zarr.BedZarrSchema.fromjson(f.read()) + + +class TestDefaultSchema: + @pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) + def test_format_version(self, schema): + assert schema.format_version == bed2zarr.ZARR_SCHEMA_FORMAT_VERSION + + +class TestSchema: + 
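+    # Exercises BedZarrSchema.generate directly for each allowed BED
+    # type, reusing the parametrized bed_data/bed_df fixtures above.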
@pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) + def test_generate_schema(self, bed_df, request): + bedspec = request.node.callspec.params["bed_data"] + bed_type = bed2zarr.BedType(bedspec) + schema = bed2zarr.BedZarrSchema.generate(bed_df.shape[0], bed_type) + assert schema.bed_type == bed_type.name + assert schema.records_chunk_size == 1000 + + +class TestBedData: + @pytest.mark.parametrize("bed_data", ALL_BED_FORMATS, indirect=True) + def test_guess_bed_type_from_path(self, bed_path, request): + bedspec = request.node.callspec.params["bed_data"] + if bedspec in [2, 10, 11, 13]: + with pytest.raises(ValueError): + bed2zarr.guess_bed_file_type(bed_path) + else: + bed_type = bed2zarr.guess_bed_file_type(bed_path) + assert bed_type.value == bedspec + + @pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) + def test_bed_fields(self, bed_path, request): + bedspec = request.node.callspec.params["bed_data"] + fields = bed2zarr.mkfields(bed2zarr.BedType(bedspec)) + assert len(fields) == bedspec + -class TestBed: +class TestSampleMaskBed: bed_path = "tests/data/bed/sample_mask.bed.gz" @pytest.fixture() @@ -12,7 +116,7 @@ def zarr(self, tmp_path): return out def test_bed2zarr(self, zarr): - bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr, show_progress=True) + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr) class Test1kgBed: @@ -24,4 +128,4 @@ def zarr(self, tmp_path): return out def test_bed2zarr(self, zarr): - bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr, show_progress=True) + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr) From aff19f2feda392bbfc5941199aba6c84f2b73daf Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 20 Sep 2024 15:06:18 +0200 Subject: [PATCH 06/17] Minor refactorization --- bio2zarr/bed2zarr.py | 12 ++++++------ tests/test_bed.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index c5e77b04..267ed457 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -325,6 +325,12 @@ def make_field_def(name, alias, dtype=None, *, description=""): return fields[:num_fields] +def mkfields(bed_type): + mandatory = mandatory_bed_field_definitions() + optional = optional_bed_field_definitions(bed_type.value - BedType.BED3.value) + return mandatory + optional + + def mkschema(bed_path, out): bed_type = guess_bed_file_type(bed_path) data = pd.read_table(bed_path, header=None) @@ -335,12 +341,6 @@ def mkschema(bed_path, out): out.write(spec.asjson()) -def mkfields(bed_type): - mandatory = mandatory_bed_field_definitions() - optional = optional_bed_field_definitions(bed_type.value - BedType.BED3.value) - return mandatory + optional - - def bed2zarr_init( bed_path, bed_type, diff --git a/tests/test_bed.py b/tests/test_bed.py index 57df4e7b..c8993b94 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -93,7 +93,7 @@ class TestBedData: @pytest.mark.parametrize("bed_data", ALL_BED_FORMATS, indirect=True) def test_guess_bed_type_from_path(self, bed_path, request): bedspec = request.node.callspec.params["bed_data"] - if bedspec in [2, 10, 11, 13]: + if bedspec in DISALLOWED_BED_FORMATS: with pytest.raises(ValueError): bed2zarr.guess_bed_file_type(bed_path) else: From 731cfa622ea9022f7178514233a46699e36e1025 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Mon, 14 Oct 2024 16:58:24 +0200 Subject: [PATCH 07/17] Simplify main bed2zarr, encode features and contigs as ints - simplification of main bed2zarr function - encode features, 
contigs as Categories - add specs for BED9-BED12 --- bio2zarr/bed2zarr.py | 302 ++++++++++++++++++++++++++++++++----------- tests/test_bed.py | 36 +++++- 2 files changed, 265 insertions(+), 73 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 267ed457..3427ff03 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -1,11 +1,14 @@ import dataclasses import json import logging +import math import pathlib import shutil from enum import Enum +from typing import Any import numcodecs +import numpy as np import pandas as pd import zarr @@ -82,7 +85,7 @@ def from_field( chunks = [records_chunk_size] return ZarrArraySpec.new( name=bed_field.name, - dtype=bed_field.dtype, + dtype=bed_field.smallest_dtype(), shape=shape, chunks=chunks, dimensions=dimensions, @@ -93,10 +96,40 @@ def from_field( ZARR_SCHEMA_FORMAT_VERSION = "0.1" +@dataclasses.dataclass +class Contig: + id: str + + +@dataclasses.dataclass +class Feature: + id: str + + +@dataclasses.dataclass +class BedMetadata(core.JsonDataclass): + contigs: list + features: list + fields: list + bed_type: BedType + num_records: int = -1 + records_chunk_size: int = 1000 + + @property + def num_contigs(self): + return len(self.contigs) + + @property + def num_features(self): + return len(self.features) + + @dataclasses.dataclass class BedZarrSchema(core.JsonDataclass): format_version: str fields: list + contigs: list + features: list bed_type: str records_chunk_size: int @@ -113,6 +146,9 @@ def fromdict(d): f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}" ) ret = BedZarrSchema(**d) + ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]] + ret.contigs = [Contig(**sd) for sd in d["contigs"]] + ret.features = [Feature(**sd) for sd in d["features"]] return ret @staticmethod @@ -120,7 +156,7 @@ def fromjson(s): return BedZarrSchema.fromdict(json.loads(s)) @staticmethod - def generate(num_records, bed_type, records_chunk_size=None): + def generate(num_records, bed_type, fields, contigs, features, records_chunk_size=None): if records_chunk_size is None: records_chunk_size = 1000 logger.info("Generating schema with chunks=%d", records_chunk_size) @@ -132,23 +168,80 @@ def spec_from_field(field): records_chunk_size=records_chunk_size, ) - fields = mkfields(bed_type) - specs = [spec_from_field(field) for field in fields] + specs = [spec_from_field(f) for f in fields] + + contig_spec = ZarrArraySpec.new( + name="contig_id", + dtype="O", + shape=[len(contigs)], + chunks=[len(contigs)], + dimensions=["contigs"], + description="Contig ID", + ) + feature_spec = ZarrArraySpec.new( + name="feature_id", + dtype="O", + shape=[len(features)], + chunks=[len(features)], + dimensions=["features"], + description="Feature ID", + ) + fields.extend([contig_spec, feature_spec]) return BedZarrSchema( format_version=ZARR_SCHEMA_FORMAT_VERSION, fields=specs, + contigs=contigs, + features=features, bed_type=bed_type.name, records_chunk_size=records_chunk_size, ) +@dataclasses.dataclass +class BedFieldSummary(core.JsonDataclass): + num_chunks: int = 0 + compressed_size: int = 0 + uncompressed_size: int = 0 + max_value: Any = -math.inf + min_value: Any = math.inf + + def update(self, other): + self.num_chunks = array.nchunks + self.compressed_size = array.nbytes + self.uncompressed_size = array.nbytes + self.min_value = min(self.min_value, other.min_value) + self.max_value = max(self.max_value, other.max_value) + + @staticmethod + def fromdict(d): + return BedFieldSummary(**d) + + @dataclasses.dataclass class BedField: category: str name: 
str alias: str description: str - dtype: str + bed_dtype: str + summary: BedFieldSummary + + def smallest_dtype(self): + """Return the smallest dtype suitable for this field based on type and values""" + s = self.summary + if self.bed_dtype == "Integer": + if not math.isfinite(s.max_value): + ret = "i1" + else: + ret = core.min_int_dtype(s.min_value, s.max_value) + elif self.bed_dtype == "Character": + ret = "U1" + elif self.bed_dtype == "Category": + ret = core.min_int_dtype(s.min_value, s.max_value) + else: + assert self.bed_dtype == "String" + ret = "O" + return ret def guess_bed_file_type(path): @@ -172,6 +265,8 @@ def guess_bed_file_type(path): class BedZarrWriterMetadata(core.JsonDataclass): format_version: str fields: list + contigs: list + features: list bed_type: str schema: BedZarrSchema @@ -191,36 +286,40 @@ class BedZarrWriter: def __init__(self, path): self.path = pathlib.Path(path) self.wip_path = self.path / "wip" - self.metadata = None self.data = None + self.metadata = None self.bed_path = None self.bed_type = None + self.fields = None @property def schema(self): + if self.metadata is None: + return return self.metadata.schema def init( self, data, *, + metadata, bed_path, - bed_type, schema, ): self.data = data - self.bed_type = bed_type + self.bed_metadata = metadata self.bed_path = bed_path if self.path.exists(): raise ValueError("Zarr path already exists") schema.validate() - fields = mkfields(self.bed_type) self.metadata = BedZarrWriterMetadata( format_version=BZW_METADATA_FORMAT_VERSION, - bed_type=self.bed_type.name, - fields=fields, + bed_type=self.bed_metadata.bed_type.name, + fields=self.fields, + contigs=self.bed_metadata.contigs, + features=self.bed_metadata.features, schema=schema, ) self.path.mkdir() @@ -240,15 +339,31 @@ def write(self): ) d = {i: f.name for i, f in enumerate(self.schema.fields)} self.data.rename(columns=d, inplace=True) + self.encode_fields(root) + self.encode_contig_id(root) + self.encode_feature_id(root) + + def encode_contig_id(self, root): + print(self.schema.contigs) + array = root.array( + "contig_id", + [contig.id for contig in self.schema.contigs], + dtype="str", + compressor=DEFAULT_ZARR_COMPRESSOR, + ) + array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"] - # 1. Create contig name <-> contig id mapping - - # 2. update self.data.contig based on mapping (-> also means - # schema needs modification since we change from string to - # int!) 
- self.encode_fields(root) + def encode_feature_id(self, root): + array = root.array( + "feature_id", + [feature.id for feature in self.schema.features], + dtype="str", + compressor=DEFAULT_ZARR_COMPRESSOR, + ) + array.attrs["_ARRAY_DIMENSIONS"] = ["features"] + def finalise(self): logger.debug("Removing %s", self.wip_path) shutil.rmtree(self.wip_path) @@ -258,7 +373,6 @@ def load_metadata(self): with open(self.wip_path / "metadata.json") as f: self.metadata = BedZarrWriterMetadata.fromdict(json.load(f)) - # FIXME: fields 9-12 are multi-dimensional def encode_fields(self, root): for field in self.schema.fields: object_codec = None @@ -279,46 +393,48 @@ def encode_fields(self, root): # See https://samtools.github.io/hts-specs/BEDv1.pdf def mandatory_bed_field_definitions(): - def make_field_def(name, alias, dtype, description=""): + def make_field_def(name, alias, bed_dtype, description=""): return BedField( category="mandatory", name=name, alias=alias, description=description, - dtype=dtype, + bed_dtype=bed_dtype, + summary=BedFieldSummary(), ) fields = [ - make_field_def("contig", "chrom", "O", "Chromosome name"), - make_field_def("start", "chromStart", "i8", "Feature start position"), - make_field_def("end", "chromEnd", "i8", "Feature end position"), + make_field_def("contig", "chrom", "Category", "Chromosome name"), + make_field_def("start", "chromStart", "Integer", "Feature start position"), + make_field_def("end", "chromEnd", "Integer", "Feature end position"), ] return fields def optional_bed_field_definitions(num_fields=0): - def make_field_def(name, alias, dtype=None, *, description=""): + def make_field_def(name, alias, bed_dtype, description=""): return BedField( category="optional", name=name, alias=alias, description=description, - dtype=dtype, + bed_dtype=bed_dtype, + summary=BedFieldSummary(), ) fields = [ - make_field_def("name", "name", "O", description="Feature description"), - make_field_def("score", "score", "i2", description="A numerical value"), - make_field_def("strand", "strand", "O", description="Feature strand"), + make_field_def("name", "name", "Category", "Feature description"), + make_field_def("score", "score", "Integer", "A numerical value"), + make_field_def("strand", "strand", "Character", "Feature strand"), make_field_def( - "thickStart", "thickStart", "i8", description="Thick start position" + "thickStart", "thickStart", "Integer", "Thick start position" ), - make_field_def("thickEnd", "thickEnd", "i8", description="Thick end position"), - make_field_def("itemRgb", "itemRgb", description="Display"), - make_field_def("blockCount", "blockCount", description="Number of blocks"), - make_field_def("blockSizes", "blockSizes", description="Block sizes"), + make_field_def("thickEnd", "thickEnd", "Integer", "Thick end position"), + make_field_def("itemRgb", "itemRgb", "Integer", "Display"), + make_field_def("blockCount", "blockCount", "Integer", "Number of blocks"), + make_field_def("blockSizes", "blockSizes", "Integer", "Block sizes"), make_field_def( - "blockStarts", "blockStarts", description="Block start positions" + "blockStarts", "blockStarts", "Integer", "Block start positions" ), ] @@ -331,51 +447,78 @@ def mkfields(bed_type): return mandatory + optional -def mkschema(bed_path, out): - bed_type = guess_bed_file_type(bed_path) - data = pd.read_table(bed_path, header=None) - spec = BedZarrSchema.generate( - data.shape[0], - bed_type, - ) - out.write(spec.asjson()) +def read_bed(bed_path, bed_type): + """Read BED file.""" + fields = mkfields(bed_type) + data = 
pd.read_table(bed_path, header=None, + names=[f.name for f in fields]) + return data -def bed2zarr_init( - bed_path, - bed_type, - zarr_path, - *, - schema_path=None, - records_chunk_size=None, -): - data = pd.read_table(bed_path, header=None) - +def encode_categoricals(data, bed_type): + """Convert categoricals to integer encodings.""" + contig = pd.Categorical( + data["contig"], categories=data["contig"].unique(), ordered=True + ) + contig_id = contig.categories.values + data["contig"] = contig.codes + if bed_type.value >= BedType.BED4.value: + feature = pd.Categorical( + data["name"], categories=data["name"].unique(), ordered=True + ) + feature_id = feature.categories.values + data["name"] = feature.codes + else: + feature_id = None + return data, contig_id, feature_id + + +def update_field_bounds(bed_type, data): + """Update field bounds based on data.""" + ret = [] + fields = mkfields(bed_type) + for f in fields: + if f.bed_dtype == "Integer": + if f.name in ("itemRgb", "blockSizes", "blockStarts"): + if data[f.name].dtype == "O": + values = np.concatenate( + data[f.name].str.split(",").apply( + lambda x: np.array([int(y) for y in x]) + )) + else: + values = data[f.name] + f.summary.min_value = min(values) + f.summary.max_value = max(values) + else: + f.summary.min_value = data[f.name].min() + f.summary.max_value = data[f.name].max() + elif f.bed_dtype == "Category": + f.summary.min_value = 0 + f.summary.max_value = max(data[f.name].unique()) + ret.append(f) + return ret + + +def mkschema(schema_path, metadata): + """Read schema or make schema from fields.""" if schema_path is None: schema = BedZarrSchema.generate( - data.shape[0], - bed_type, - records_chunk_size=records_chunk_size, + num_records=metadata.num_records, + bed_type=metadata.bed_type, + fields=metadata.fields, + contigs=metadata.contigs, + features=metadata.features, + records_chunk_size=metadata.records_chunk_size, ) else: logger.info("Reading schema from %s", schema_path) - if records_chunk_size is not None: + if metadata.records_chunk_size is not None: raise ValueError( "Cannot specify schema along with chunk sizes" ) # NEEDS TEST with open(schema_path) as f: schema = BedZarrSchema.fromjson(f.read()) - - # 2. 
init store - bedzw = BedZarrWriter(zarr_path) - bedzw.init( - data, - bed_path=bed_path, - bed_type=bed_type, - schema=schema, - ) - return bedzw - + return schema def bed2zarr( bed_path, @@ -384,12 +527,27 @@ def bed2zarr( records_chunk_size=None, ): bed_type = guess_bed_file_type(bed_path) - bedzw = bed2zarr_init( - bed_path, - bed_type, - zarr_path, - schema_path=schema_path, + data = read_bed(bed_path, bed_type) + data, contig_id, feature_id = encode_categoricals(data, bed_type) + fields = update_field_bounds(bed_type, data) + metadata = BedMetadata( + contigs=[Contig(c) for c in contig_id], + features=[Feature(f) for f in feature_id], + fields=fields, + bed_type=bed_type, + num_records=data.shape[0], records_chunk_size=records_chunk_size, ) + schema = mkschema( + schema_path, + metadata, + ) + bedzw = BedZarrWriter(zarr_path) + bedzw.init( + data=data, + metadata=metadata, + bed_path=bed_path, + schema=schema, + ) bedzw.write() bedzw.finalise() diff --git a/tests/test_bed.py b/tests/test_bed.py index c8993b94..3d97da30 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -84,7 +84,11 @@ class TestSchema: def test_generate_schema(self, bed_df, request): bedspec = request.node.callspec.params["bed_data"] bed_type = bed2zarr.BedType(bedspec) - schema = bed2zarr.BedZarrSchema.generate(bed_df.shape[0], bed_type) + fields = bed2zarr.mkfields(bed_type) + bed_df.columns = [f.name for f in fields] + bed_df, contig_id, feature_id = bed2zarr.encode_categoricals(bed_df, bed_type) + fields = bed2zarr.update_field_bounds(bed_type, bed_df) + schema = bed2zarr.BedZarrSchema.generate(bed_df.shape[0], bed_type, fields) assert schema.bed_type == bed_type.name assert schema.records_chunk_size == 1000 @@ -129,3 +133,33 @@ def zarr(self, tmp_path): def test_bed2zarr(self, zarr): bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr) + + +class TestContigRename: + bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" + bed_data = [ + "chr21\t50000\t60000\texon", + "chr21\t60000\t90000\tfive_prime_utr", + "chr20\t50000\t60000\texon", + "chr20\t60000\t80000\tintron", + ] + + @pytest.fixture() + def bed(self, tmp_path): + out = tmp_path / "test.bed" + with open(out, "w") as fh: + for row in self.bed_data: + fh.write(row + "\n") + return out + + def test_bed2zarr(self, bed, tmp_path): + zarr = tmp_path / "test.zarr" + + data = pd.read_table(bed, header=None) + print(data) + print(data.describe(percentiles=[])) + bedzw = bed2zarr.BedZarrWriter(zarr) + print(bedzw) + #print(bedzw.schema) + print(bedzw.data) + From eda56542435f91259c5adac2e0076650d2e461bb Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 15 Oct 2024 09:42:22 +0200 Subject: [PATCH 08/17] Add tests and withdraw support for BED9, BED12 --- bio2zarr/bed2zarr.py | 193 ++++++++++++++++++++++++------------------- tests/test_bed.py | 99 +++++++++++----------- tests/test_core.py | 2 +- 3 files changed, 160 insertions(+), 134 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 3427ff03..cfa4e3cb 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -102,14 +102,14 @@ class Contig: @dataclasses.dataclass -class Feature: +class Name: id: str @dataclasses.dataclass class BedMetadata(core.JsonDataclass): contigs: list - features: list + names: list fields: list bed_type: BedType num_records: int = -1 @@ -120,8 +120,8 @@ def num_contigs(self): return len(self.contigs) @property - def num_features(self): - return len(self.features) + def num_names(self): + return len(self.names) @dataclasses.dataclass @@ 
-129,7 +129,7 @@ class BedZarrSchema(core.JsonDataclass): format_version: str fields: list contigs: list - features: list + names: list bed_type: str records_chunk_size: int @@ -147,8 +147,8 @@ def fromdict(d): ) ret = BedZarrSchema(**d) ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]] - ret.contigs = [Contig(**sd) for sd in d["contigs"]] - ret.features = [Feature(**sd) for sd in d["features"]] + ret.contig_ids = [Contig(**sd) for sd in d["contigs"]] + ret.name_ids = [Name(**sd) for sd in d["names"]] return ret @staticmethod @@ -156,43 +156,45 @@ def fromjson(s): return BedZarrSchema.fromdict(json.loads(s)) @staticmethod - def generate(num_records, bed_type, fields, contigs, features, records_chunk_size=None): + def generate(metadata, records_chunk_size=None): if records_chunk_size is None: records_chunk_size = 1000 logger.info("Generating schema with chunks=%d", records_chunk_size) + fields = metadata.fields def spec_from_field(field): return ZarrArraySpec.from_field( field, - num_records=num_records, + num_records=metadata.num_records, records_chunk_size=records_chunk_size, ) specs = [spec_from_field(f) for f in fields] - contig_spec = ZarrArraySpec.new( + # Contig_id and name_id specs unnecessary? + contig_id_spec = ZarrArraySpec.new( name="contig_id", dtype="O", - shape=[len(contigs)], - chunks=[len(contigs)], + shape=[metadata.num_contigs], + chunks=[metadata.num_contigs], dimensions=["contigs"], description="Contig ID", ) - feature_spec = ZarrArraySpec.new( - name="feature_id", + name_id_spec = ZarrArraySpec.new( + name="name_id", dtype="O", - shape=[len(features)], - chunks=[len(features)], - dimensions=["features"], - description="Feature ID", + shape=[metadata.num_names], + chunks=[metadata.num_names], + dimensions=["names"], + description="Name ID", ) - fields.extend([contig_spec, feature_spec]) + specs.extend([contig_id_spec, name_id_spec]) return BedZarrSchema( format_version=ZARR_SCHEMA_FORMAT_VERSION, fields=specs, - contigs=contigs, - features=features, - bed_type=bed_type.name, + contigs=metadata.contigs, + names=metadata.names, + bed_type=metadata.bed_type.name, records_chunk_size=records_chunk_size, ) @@ -206,9 +208,9 @@ class BedFieldSummary(core.JsonDataclass): min_value: Any = math.inf def update(self, other): - self.num_chunks = array.nchunks - self.compressed_size = array.nbytes - self.uncompressed_size = array.nbytes + self.num_chunks = other.num_chunks + self.compressed_size = other.compressed_size + self.uncompressed_size = other.uncompressed_size self.min_value = min(self.min_value, other.min_value) self.max_value = max(self.max_value, other.max_value) @@ -216,7 +218,7 @@ def update(self, other): def fromdict(d): return BedFieldSummary(**d) - + @dataclasses.dataclass class BedField: category: str @@ -227,7 +229,8 @@ class BedField: summary: BedFieldSummary def smallest_dtype(self): - """Return the smallest dtype suitable for this field based on type and values""" + """Return the smallest dtype suitable for this field based on + type and values""" s = self.summary if self.bed_dtype == "Integer": if not math.isfinite(s.max_value): @@ -255,6 +258,8 @@ def guess_bed_file_type(path): raise ValueError(f"Expected at most 12 columns in BED file, got {num_cols}") if num_cols in (10, 11): raise ValueError(f"BED10 and BED11 are prohibited, got {num_cols} columns") + if num_cols in (9, 12): + raise ValueError("BED9 and BED12 are valid but currently unsupported formats") return BedType(num_cols) @@ -266,7 +271,7 @@ class BedZarrWriterMetadata(core.JsonDataclass): 
format_version: str fields: list contigs: list - features: list + names: list bed_type: str schema: BedZarrSchema @@ -319,7 +324,7 @@ def init( bed_type=self.bed_metadata.bed_type.name, fields=self.fields, contigs=self.bed_metadata.contigs, - features=self.bed_metadata.features, + names=self.bed_metadata.names, schema=schema, ) self.path.mkdir() @@ -337,14 +342,14 @@ def write(self): "source": f"bio2zarr-{provenance.__version__}", } ) - d = {i: f.name for i, f in enumerate(self.schema.fields)} + datafields = self.schema.fields[0 : BedType[self.schema.bed_type].value] + d = {i: f.name for i, f in enumerate(datafields)} self.data.rename(columns=d, inplace=True) - self.encode_fields(root) + self.encode_fields(root, datafields) self.encode_contig_id(root) - self.encode_feature_id(root) + self.encode_name_id(root) def encode_contig_id(self, root): - print(self.schema.contigs) array = root.array( "contig_id", [contig.id for contig in self.schema.contigs], @@ -353,28 +358,29 @@ def encode_contig_id(self, root): ) array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"] - - def encode_feature_id(self, root): + def encode_name_id(self, root): array = root.array( - "feature_id", - [feature.id for feature in self.schema.features], + "name_id", + [name.id for name in self.schema.names], dtype="str", compressor=DEFAULT_ZARR_COMPRESSOR, ) - array.attrs["_ARRAY_DIMENSIONS"] = ["features"] + array.attrs["_ARRAY_DIMENSIONS"] = ["names"] - def finalise(self): + self.load_metadata() logger.debug("Removing %s", self.wip_path) shutil.rmtree(self.wip_path) + logger.info("Consolidating Zarr metadata") + zarr.consolidate_metadata(self.path) def load_metadata(self): if self.metadata is None: with open(self.wip_path / "metadata.json") as f: self.metadata = BedZarrWriterMetadata.fromdict(json.load(f)) - def encode_fields(self, root): - for field in self.schema.fields: + def encode_fields(self, root, datafields): + for field in datafields: object_codec = None if field.dtype == "O": object_codec = numcodecs.VLenUTF8() @@ -405,8 +411,8 @@ def make_field_def(name, alias, bed_dtype, description=""): fields = [ make_field_def("contig", "chrom", "Category", "Chromosome name"), - make_field_def("start", "chromStart", "Integer", "Feature start position"), - make_field_def("end", "chromEnd", "Integer", "Feature end position"), + make_field_def("start", "chromStart", "Integer", "Name start position"), + make_field_def("end", "chromEnd", "Integer", "Name end position"), ] return fields @@ -423,12 +429,10 @@ def make_field_def(name, alias, bed_dtype, description=""): ) fields = [ - make_field_def("name", "name", "Category", "Feature description"), + make_field_def("name", "name", "Category", "Name description"), make_field_def("score", "score", "Integer", "A numerical value"), - make_field_def("strand", "strand", "Character", "Feature strand"), - make_field_def( - "thickStart", "thickStart", "Integer", "Thick start position" - ), + make_field_def("strand", "strand", "Character", "Name strand"), + make_field_def("thickStart", "thickStart", "Integer", "Thick start position"), make_field_def("thickEnd", "thickEnd", "Integer", "Thick end position"), make_field_def("itemRgb", "itemRgb", "Integer", "Display"), make_field_def("blockCount", "blockCount", "Integer", "Number of blocks"), @@ -447,11 +451,41 @@ def mkfields(bed_type): return mandatory + optional -def read_bed(bed_path, bed_type): - """Read BED file.""" - fields = mkfields(bed_type) - data = pd.read_table(bed_path, header=None, - names=[f.name for f in fields]) +def 
parse_bed(bed_path, records_chunk_size=None): + """Read and parse BED file and return data frame and metadata.""" + bed_type = guess_bed_file_type(bed_path) + fields = mkfields(bed_type) + data = pd.read_table(bed_path, header=None, names=[f.name for f in fields]) + data, contig_id, name_id = encode_categoricals(data, bed_type) + data = parse_csv_fields(data, bed_type) + fields = update_field_bounds(bed_type, data) + metadata = BedMetadata( + contigs=[Contig(c) for c in contig_id], + names=[Name(f) for f in name_id], + fields=fields, + bed_type=bed_type, + num_records=data.shape[0], + records_chunk_size=records_chunk_size, + ) + return data, metadata + + +def parse_csv_fields(data, bed_type): + if bed_type.value < BedType.BED8.value: + return data + + def _convert_csv_data(values): + if values.dtype == "O": + ret = values.str.split(",").apply(lambda x: np.array([int(y) for y in x])) + else: + ret = values + return ret + + if bed_type.value >= BedType.BED9.value: + data["itemRgb"] = _convert_csv_data(data["itemRgb"]) + if bed_type.value == BedType.BED12.value: + data["blockSizes"] = _convert_csv_data(data["blockSizes"]) + data["blockStarts"] = _convert_csv_data(data["blockStarts"]) return data @@ -463,14 +497,14 @@ def encode_categoricals(data, bed_type): contig_id = contig.categories.values data["contig"] = contig.codes if bed_type.value >= BedType.BED4.value: - feature = pd.Categorical( + name = pd.Categorical( data["name"], categories=data["name"].unique(), ordered=True ) - feature_id = feature.categories.values - data["name"] = feature.codes + name_id = name.categories.values + data["name"] = name.codes else: - feature_id = None - return data, contig_id, feature_id + name_id = [] + return data, contig_id, name_id def update_field_bounds(bed_type, data): @@ -481,10 +515,7 @@ def update_field_bounds(bed_type, data): if f.bed_dtype == "Integer": if f.name in ("itemRgb", "blockSizes", "blockStarts"): if data[f.name].dtype == "O": - values = np.concatenate( - data[f.name].str.split(",").apply( - lambda x: np.array([int(y) for y in x]) - )) + values = np.concatenate(data[f.name]) else: values = data[f.name] f.summary.min_value = min(values) @@ -499,16 +530,19 @@ def update_field_bounds(bed_type, data): return ret -def mkschema(schema_path, metadata): - """Read schema or make schema from fields.""" +def mkschema(bed_path, out): + """Write schema to file""" + _, metadata = parse_bed(bed_path) + spec = BedZarrSchema.generate( + metadata, records_chunk_size=metadata.records_chunk_size + ) + out.write(spec.asjson()) + + +def init_schema(schema_path, metadata): if schema_path is None: schema = BedZarrSchema.generate( - num_records=metadata.num_records, - bed_type=metadata.bed_type, - fields=metadata.fields, - contigs=metadata.contigs, - features=metadata.features, - records_chunk_size=metadata.records_chunk_size, + metadata, records_chunk_size=metadata.records_chunk_size ) else: logger.info("Reading schema from %s", schema_path) @@ -520,28 +554,15 @@ def mkschema(schema_path, metadata): schema = BedZarrSchema.fromjson(f.read()) return schema + def bed2zarr( bed_path, zarr_path, schema_path=None, records_chunk_size=None, ): - bed_type = guess_bed_file_type(bed_path) - data = read_bed(bed_path, bed_type) - data, contig_id, feature_id = encode_categoricals(data, bed_type) - fields = update_field_bounds(bed_type, data) - metadata = BedMetadata( - contigs=[Contig(c) for c in contig_id], - features=[Feature(f) for f in feature_id], - fields=fields, - bed_type=bed_type, - num_records=data.shape[0], - 
records_chunk_size=records_chunk_size, - ) - schema = mkschema( - schema_path, - metadata, - ) + data, metadata = parse_bed(bed_path, records_chunk_size) + schema = init_schema(schema_path, metadata) bedzw = BedZarrWriter(zarr_path) bedzw.init( data=data, diff --git a/tests/test_bed.py b/tests/test_bed.py index 3d97da30..b04d4aa9 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -1,9 +1,12 @@ +import numpy as np import pandas as pd import pytest +import zarr from bio2zarr import bed2zarr ALLOWED_BED_FORMATS = [3, 4, 5, 6, 7, 8, 9, 12] +SUPPORTED_BED_FORMATS = [3, 4, 5, 6, 7, 8] DISALLOWED_BED_FORMATS = [2, 10, 11, 13] ALL_BED_FORMATS = ALLOWED_BED_FORMATS + DISALLOWED_BED_FORMATS @@ -52,7 +55,7 @@ def bed_df(bed_data): @pytest.fixture() def bed_path(bed_data, tmp_path): - out = tmp_path / "sample_mask.bed" + out = tmp_path / "sample.bed" with open(out, "w") as fh: for row in bed_data: fh.write("\t".join(map(str, row)) + "\n") @@ -74,37 +77,69 @@ def schema(schema_path): class TestDefaultSchema: - @pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) + @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) def test_format_version(self, schema): assert schema.format_version == bed2zarr.ZARR_SCHEMA_FORMAT_VERSION class TestSchema: - @pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) - def test_generate_schema(self, bed_df, request): + @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) + def test_generate_schema(self, bed_path, request): bedspec = request.node.callspec.params["bed_data"] bed_type = bed2zarr.BedType(bedspec) - fields = bed2zarr.mkfields(bed_type) - bed_df.columns = [f.name for f in fields] - bed_df, contig_id, feature_id = bed2zarr.encode_categoricals(bed_df, bed_type) - fields = bed2zarr.update_field_bounds(bed_type, bed_df) - schema = bed2zarr.BedZarrSchema.generate(bed_df.shape[0], bed_type, fields) + data, metadata = bed2zarr.parse_bed(bed_path) + schema = bed2zarr.BedZarrSchema.generate( + metadata, records_chunk_size=metadata.records_chunk_size + ) assert schema.bed_type == bed_type.name assert schema.records_chunk_size == 1000 + assert len(schema.contigs) == 1 + assert schema.contigs[0].id == "chr22" + if bed_type.value >= bed2zarr.BedType.BED4.value: + assert len(schema.names) == 2 + assert schema.names[0].id == "cloneA" + assert schema.names[1].id == "cloneB" + if bed_type == bed2zarr.BedType.BED12: + assert len(schema.fields) == 12 + + +class TestBed2Zarr: + @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) + def test_bed2zarr(self, bed_path, bed_df, tmp_path, request): + bedspec = request.node.callspec.params["bed_data"] + bed_type = bed2zarr.BedType(bedspec) + zarr_path = tmp_path / "test.zarr" + bed2zarr.bed2zarr(bed_path=bed_path, zarr_path=zarr_path) + root = zarr.open(zarr_path) + np.testing.assert_array_equal(root["contig"][:], [0, 0]) + np.testing.assert_array_equal(root["contig_id"][:], ["chr22"]) + np.testing.assert_array_equal(root["start"][:], bed_df[1].values) + np.testing.assert_array_equal(root["end"][:], bed_df[2].values) + if bed_type.value >= bed2zarr.BedType.BED4.value: + np.testing.assert_array_equal(root["name"][:], [0, 1]) + np.testing.assert_array_equal(root["name_id"][:], bed_df[3].values) + if bed_type.value >= bed2zarr.BedType.BED5.value: + np.testing.assert_array_equal(root["score"][:], bed_df[4].values) + if bed_type.value >= bed2zarr.BedType.BED6.value: + np.testing.assert_array_equal(root["strand"][:], bed_df[5].values) + 
if bed_type.value >= bed2zarr.BedType.BED7.value: + np.testing.assert_array_equal(root["thickStart"][:], bed_df[6].values) + if bed_type.value >= bed2zarr.BedType.BED8.value: + np.testing.assert_array_equal(root["thickEnd"][:], bed_df[7].values) class TestBedData: @pytest.mark.parametrize("bed_data", ALL_BED_FORMATS, indirect=True) def test_guess_bed_type_from_path(self, bed_path, request): bedspec = request.node.callspec.params["bed_data"] - if bedspec in DISALLOWED_BED_FORMATS: + if bedspec not in SUPPORTED_BED_FORMATS: with pytest.raises(ValueError): bed2zarr.guess_bed_file_type(bed_path) else: bed_type = bed2zarr.guess_bed_file_type(bed_path) assert bed_type.value == bedspec - @pytest.mark.parametrize("bed_data", ALLOWED_BED_FORMATS, indirect=True) + @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) def test_bed_fields(self, bed_path, request): bedspec = request.node.callspec.params["bed_data"] fields = bed2zarr.mkfields(bed2zarr.BedType(bedspec)) @@ -115,51 +150,21 @@ class TestSampleMaskBed: bed_path = "tests/data/bed/sample_mask.bed.gz" @pytest.fixture() - def zarr(self, tmp_path): + def zarr_path(self, tmp_path): out = tmp_path / "sample_mask.zarr" return out - def test_bed2zarr(self, zarr): - bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr) + def test_bed2zarr(self, zarr_path): + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr_path) class Test1kgBed: bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" @pytest.fixture() - def zarr(self, tmp_path): + def zarr_path(self, tmp_path): out = tmp_path / "1kg_2020_chr20_annotations_mask.zarr" return out - def test_bed2zarr(self, zarr): - bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr) - - -class TestContigRename: - bed_path = "tests/data/bed/1kg_2020_chr20_annotations_mask.bed.gz" - bed_data = [ - "chr21\t50000\t60000\texon", - "chr21\t60000\t90000\tfive_prime_utr", - "chr20\t50000\t60000\texon", - "chr20\t60000\t80000\tintron", - ] - - @pytest.fixture() - def bed(self, tmp_path): - out = tmp_path / "test.bed" - with open(out, "w") as fh: - for row in self.bed_data: - fh.write(row + "\n") - return out - - def test_bed2zarr(self, bed, tmp_path): - zarr = tmp_path / "test.zarr" - - data = pd.read_table(bed, header=None) - print(data) - print(data.describe(percentiles=[])) - bedzw = bed2zarr.BedZarrWriter(zarr) - print(bedzw) - #print(bedzw.schema) - print(bedzw.data) - + def test_bed2zarr(self, zarr_path): + bed2zarr.bed2zarr(bed_path=self.bed_path, zarr_path=zarr_path) diff --git a/tests/test_core.py b/tests/test_core.py index 16e2c0c0..8ec47443 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -202,7 +202,7 @@ def test_5_chunk_1(self, n, expected): # It works in CI on Linux, but it'll probably break at some point. 
# It's also necessary to update these numbers each time a new data # file gets added - ("tests/data", 4977391), + ("tests/data", 4989995), ("tests/data/vcf", 4965254), ("tests/data/vcf/sample.vcf.gz", 1089), ], From 0230ec0435a15fa4a1e7a47e25e79ecb7bd4a1ae Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 15 Oct 2024 09:53:58 +0200 Subject: [PATCH 09/17] Fix lint errors --- bio2zarr/bed2zarr.py | 15 --------------- tests/test_bed.py | 3 ++- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index cfa4e3cb..e9210571 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -30,21 +30,6 @@ class BedType(Enum): BED12 = 12 -class BedColumns(Enum): - contig = 0 - start = 1 - end = 2 - name = 3 - score = 4 - strand = 5 - thickStart = 6 - thickEnd = 7 - itemRgb = 8 - blockCount = 9 - blockSizes = 10 - blockStarts = 11 - - @dataclasses.dataclass class ZarrArraySpec: name: str diff --git a/tests/test_bed.py b/tests/test_bed.py index b04d4aa9..28f5f5d2 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -132,8 +132,9 @@ class TestBedData: @pytest.mark.parametrize("bed_data", ALL_BED_FORMATS, indirect=True) def test_guess_bed_type_from_path(self, bed_path, request): bedspec = request.node.callspec.params["bed_data"] + match_str = rf"(got {bedspec}|prohibited|unsupported)" if bedspec not in SUPPORTED_BED_FORMATS: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=match_str): bed2zarr.guess_bed_file_type(bed_path) else: bed_type = bed2zarr.guess_bed_file_type(bed_path) From d13e93fdd560a3ded67396660f4e941b45d4c957 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 15 Oct 2024 09:57:16 +0200 Subject: [PATCH 10/17] Add pandas dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2883186f..d3c07fef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ # colouredlogs pulls in humanfriendly", "cyvcf2", "bed_reader", + "pandas", ] requires-python = ">=3.9" classifiers = [ From e113eb223a60a993560f4725dec78ef0073eaf5e Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Tue, 15 Oct 2024 10:10:41 +0200 Subject: [PATCH 11/17] Update du stats --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 8ec47443..0484fd22 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -202,7 +202,7 @@ def test_5_chunk_1(self, n, expected): # It works in CI on Linux, but it'll probably break at some point. # It's also necessary to update these numbers each time a new data # file gets added - ("tests/data", 4989995), + ("tests/data", 4989501), ("tests/data/vcf", 4965254), ("tests/data/vcf/sample.vcf.gz", 1089), ], From bc7813a10ca2b6d216fb50c79b65be5df9057177 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 6 Dec 2024 08:55:52 +0100 Subject: [PATCH 12/17] Fix cli docstring. --- bio2zarr/cli.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 9179a08a..2a44cfea 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -585,13 +585,11 @@ def plink2zarr(): @force def bed2zarr_main(bed_path, zarr_path, verbose, force): """ - Convert BED file to the Zarr format. The BED regions will be - converted to binary-encoded arrays whose length is equal to the - length of the reference genome. 
The BED file regions are used to - mask the reference genome, where the masked regions are set to 1 - and the unmasked regions are set to 0. + Convert BED file to the Zarr format. Each BED column will be + converted to a Zarr array with appropriate encoding. - The BED file must be compressed and tabix-indexed. + The BED file must be compressed and tabix-indexed. BED9 and BED12 + formats are currently not supported. """ setup_logging(verbose) check_overwrite_dir(zarr_path, force) From 3a7f17bdfdfd5f9e9305fad72cddc2a6986c2367 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Mon, 9 Dec 2024 22:28:53 +0100 Subject: [PATCH 13/17] Add records chunk size and schema options to cli --- bio2zarr/cli.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py index 2a44cfea..3a4623f5 100644 --- a/bio2zarr/cli.py +++ b/bio2zarr/cli.py @@ -128,6 +128,14 @@ def list_commands(self, ctx): help="Chunk size in the samples dimension", ) +records_chunk_size = click.option( + "-r", + "--records-chunk-size", + type=int, + default=None, + help="Chunk size in the records dimension", +) + schema = click.option("-s", "--schema", default=None, type=click.Path(exists=True)) max_variant_chunks = click.option( @@ -581,9 +589,18 @@ def plink2zarr(): type=click.Path(exists=True, dir_okay=False), ) @new_zarr_path +@schema +@records_chunk_size @verbose @force -def bed2zarr_main(bed_path, zarr_path, verbose, force): +def bed2zarr_main( + bed_path, + zarr_path, + schema, + records_chunk_size, + verbose, + force, +): """ Convert BED file to the Zarr format. Each BED column will be converted to a Zarr array with appropriate encoding. @@ -596,6 +613,8 @@ def bed2zarr_main(bed_path, zarr_path, verbose, force): bed2zarr.bed2zarr( bed_path, zarr_path, + records_chunk_size=records_chunk_size, + schema_path=schema, ) From 4137a756cd244f11c07a83788926c4f95f64d5ea Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 13 Dec 2024 09:32:40 +0100 Subject: [PATCH 14/17] Simplify bed2zarr conversion - read BED as data frame - rely on xarray.Dataset.to_zarr to convert --- bio2zarr/bed2zarr.py | 391 +++---------------------------------------- bio2zarr/cli.py | 3 - tests/test_bed.py | 41 ----- 3 files changed, 26 insertions(+), 409 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index e9210571..66365341 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -10,6 +10,7 @@ import numcodecs import numpy as np import pandas as pd +import xarray as xr import zarr from . 
import core, provenance @@ -30,160 +31,6 @@ class BedType(Enum): BED12 = 12 -@dataclasses.dataclass -class ZarrArraySpec: - name: str - dtype: str - shape: list - chunks: list - dimensions: tuple - description: str - compressor: dict - filters: list - - def __post_init__(self): - self.shape = tuple(self.shape) - self.chunks = tuple(self.chunks) - self.dimensions = tuple(self.dimensions) - self.filters = tuple(self.filters) - - @staticmethod - def new(**kwargs): - spec = ZarrArraySpec( - **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[] - ) - spec._choose_compressor_settings() - return spec - - def _choose_compressor_settings(self): - pass - - @staticmethod - def from_field( - bed_field, - *, - num_records, - records_chunk_size, - ): - shape = [num_records] - dimensions = ["records"] - chunks = [records_chunk_size] - return ZarrArraySpec.new( - name=bed_field.name, - dtype=bed_field.smallest_dtype(), - shape=shape, - chunks=chunks, - dimensions=dimensions, - description=bed_field.description, - ) - - -ZARR_SCHEMA_FORMAT_VERSION = "0.1" - - -@dataclasses.dataclass -class Contig: - id: str - - -@dataclasses.dataclass -class Name: - id: str - - -@dataclasses.dataclass -class BedMetadata(core.JsonDataclass): - contigs: list - names: list - fields: list - bed_type: BedType - num_records: int = -1 - records_chunk_size: int = 1000 - - @property - def num_contigs(self): - return len(self.contigs) - - @property - def num_names(self): - return len(self.names) - - -@dataclasses.dataclass -class BedZarrSchema(core.JsonDataclass): - format_version: str - fields: list - contigs: list - names: list - bed_type: str - records_chunk_size: int - - def validate(self): - """ - Checks that the schema is well-formed and within required limits. - """ - - @staticmethod - def fromdict(d): - if d["format_version"] != ZARR_SCHEMA_FORMAT_VERSION: - raise ValueError( - "BedZarrSchema format version mismatch: " - f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}" - ) - ret = BedZarrSchema(**d) - ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]] - ret.contig_ids = [Contig(**sd) for sd in d["contigs"]] - ret.name_ids = [Name(**sd) for sd in d["names"]] - return ret - - @staticmethod - def fromjson(s): - return BedZarrSchema.fromdict(json.loads(s)) - - @staticmethod - def generate(metadata, records_chunk_size=None): - if records_chunk_size is None: - records_chunk_size = 1000 - logger.info("Generating schema with chunks=%d", records_chunk_size) - fields = metadata.fields - - def spec_from_field(field): - return ZarrArraySpec.from_field( - field, - num_records=metadata.num_records, - records_chunk_size=records_chunk_size, - ) - - specs = [spec_from_field(f) for f in fields] - - # Contig_id and name_id specs unnecessary? 
- contig_id_spec = ZarrArraySpec.new( - name="contig_id", - dtype="O", - shape=[metadata.num_contigs], - chunks=[metadata.num_contigs], - dimensions=["contigs"], - description="Contig ID", - ) - name_id_spec = ZarrArraySpec.new( - name="name_id", - dtype="O", - shape=[metadata.num_names], - chunks=[metadata.num_names], - dimensions=["names"], - description="Name ID", - ) - specs.extend([contig_id_spec, name_id_spec]) - return BedZarrSchema( - format_version=ZARR_SCHEMA_FORMAT_VERSION, - fields=specs, - contigs=metadata.contigs, - names=metadata.names, - bed_type=metadata.bed_type.name, - records_chunk_size=records_chunk_size, - ) - - @dataclasses.dataclass class BedFieldSummary(core.JsonDataclass): num_chunks: int = 0 @@ -248,140 +95,6 @@ def guess_bed_file_type(path): return BedType(num_cols) -BZW_METADATA_FORMAT_VERSION = "0.1" - - -@dataclasses.dataclass -class BedZarrWriterMetadata(core.JsonDataclass): - format_version: str - fields: list - contigs: list - names: list - bed_type: str - schema: BedZarrSchema - - @staticmethod - def fromdict(d): - if d["format_version"] != BZW_METADATA_FORMAT_VERSION: - raise ValueError( - "BedZarrWriterMetadata format version mismatch: " - f"{d['format_version']} != {BZW_METADATA_FORMAT_VERSION}" - ) - ret = BedZarrWriterMetadata(**d) - ret.schema = BedZarrSchema.fromdict(d["schema"]) - return ret - - -class BedZarrWriter: - def __init__(self, path): - self.path = pathlib.Path(path) - self.wip_path = self.path / "wip" - self.data = None - self.metadata = None - self.bed_path = None - self.bed_type = None - self.fields = None - - @property - def schema(self): - if self.metadata is None: - return - return self.metadata.schema - - def init( - self, - data, - *, - metadata, - bed_path, - schema, - ): - self.data = data - self.bed_metadata = metadata - self.bed_path = bed_path - - if self.path.exists(): - raise ValueError("Zarr path already exists") - schema.validate() - - self.metadata = BedZarrWriterMetadata( - format_version=BZW_METADATA_FORMAT_VERSION, - bed_type=self.bed_metadata.bed_type.name, - fields=self.fields, - contigs=self.bed_metadata.contigs, - names=self.bed_metadata.names, - schema=schema, - ) - self.path.mkdir() - self.wip_path.mkdir() - logger.info("Writing WIP metadata") - with open(self.wip_path / "metadata.json", "w") as f: - json.dump(self.metadata.asdict(), f, indent=4) - - def write(self): - store = zarr.DirectoryStore(self.path) - root = zarr.group(store=store) - root.attrs.update( - { - "bed_zarr_version": "0.1", - "source": f"bio2zarr-{provenance.__version__}", - } - ) - datafields = self.schema.fields[0 : BedType[self.schema.bed_type].value] - d = {i: f.name for i, f in enumerate(datafields)} - self.data.rename(columns=d, inplace=True) - self.encode_fields(root, datafields) - self.encode_contig_id(root) - self.encode_name_id(root) - - def encode_contig_id(self, root): - array = root.array( - "contig_id", - [contig.id for contig in self.schema.contigs], - dtype="str", - compressor=DEFAULT_ZARR_COMPRESSOR, - ) - array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"] - - def encode_name_id(self, root): - array = root.array( - "name_id", - [name.id for name in self.schema.names], - dtype="str", - compressor=DEFAULT_ZARR_COMPRESSOR, - ) - array.attrs["_ARRAY_DIMENSIONS"] = ["names"] - - def finalise(self): - self.load_metadata() - logger.debug("Removing %s", self.wip_path) - shutil.rmtree(self.wip_path) - logger.info("Consolidating Zarr metadata") - zarr.consolidate_metadata(self.path) - - def load_metadata(self): - if self.metadata is None: 
-            with open(self.wip_path / "metadata.json") as f:
-                self.metadata = BedZarrWriterMetadata.fromdict(json.load(f))
-
-    def encode_fields(self, root, datafields):
-        for field in datafields:
-            object_codec = None
-            if field.dtype == "O":
-                object_codec = numcodecs.VLenUTF8()
-            array = root.array(
-                field.name,
-                self.data[field.name].values,
-                shape=field.shape,
-                dtype=field.dtype,
-                compressor=DEFAULT_ZARR_COMPRESSOR,
-                chunks=(self.schema.records_chunk_size,),
-                object_codec=object_codec,
-            )
-            array.attrs["_ARRAY_DIMENSIONS"] = ["records"]
-            logger.debug("%s done", field)
-
-
 # See https://samtools.github.io/hts-specs/BEDv1.pdf
 def mandatory_bed_field_definitions():
     def make_field_def(name, alias, bed_dtype, description=""):
@@ -436,63 +149,27 @@ def mkfields(bed_type):
     return mandatory + optional
 
 
-def parse_bed(bed_path, records_chunk_size=None):
-    """Read and parse BED file and return data frame and metadata."""
-    bed_type = guess_bed_file_type(bed_path)
-    fields = mkfields(bed_type)
-    data = pd.read_table(bed_path, header=None, names=[f.name for f in fields])
-    data, contig_id, name_id = encode_categoricals(data, bed_type)
-    data = parse_csv_fields(data, bed_type)
-    fields = update_field_bounds(bed_type, data)
-    metadata = BedMetadata(
-        contigs=[Contig(c) for c in contig_id],
-        names=[Name(f) for f in name_id],
-        fields=fields,
-        bed_type=bed_type,
-        num_records=data.shape[0],
-        records_chunk_size=records_chunk_size,
-    )
-    return data, metadata
-
-
-def parse_csv_fields(data, bed_type):
-    if bed_type.value < BedType.BED8.value:
-        return data
-
-    def _convert_csv_data(values):
-        if values.dtype == "O":
-            ret = values.str.split(",").apply(lambda x: np.array([int(y) for y in x]))
-        else:
-            ret = values
-        return ret
-
-    if bed_type.value >= BedType.BED9.value:
-        data["itemRgb"] = _convert_csv_data(data["itemRgb"])
-    if bed_type.value == BedType.BED12.value:
-        data["blockSizes"] = _convert_csv_data(data["blockSizes"])
-        data["blockStarts"] = _convert_csv_data(data["blockStarts"])
-    return data
-
-
 def encode_categoricals(data, bed_type):
     """Convert categoricals to integer encodings."""
     contig = pd.Categorical(
         data["contig"], categories=data["contig"].unique(), ordered=True
     )
     contig_id = contig.categories.values
+    contig_id = contig_id.astype(f"<U{len(max(contig_id, key=len))}")
     data["contig"] = contig.codes
     if bed_type.value >= BedType.BED4.value:
         name = pd.Categorical(
             data["name"], categories=data["name"].unique(), ordered=True
         )
         name_id = name.categories.values
+        name_id = name_id.astype(f"<U{len(max(name_id, key=len))}")
         data["name"] = name.codes
     else:
         name_id = []
     return data, contig_id, name_id
 
 
-def update_field_bounds(bed_type, data):
+def update_field_bounds(data, bed_type):
     """Update field bounds based on data."""
     ret = []
     fields = mkfields(bed_type)
@@ -516,41 +186,29 @@ def update_field_bounds(bed_type, data):
     return ret
 
 
-def mkschema(bed_path, out):
-    """Write schema to file"""
-    _, metadata = parse_bed(bed_path)
-    spec = BedZarrSchema.generate(
-        metadata, records_chunk_size=metadata.records_chunk_size
-    )
-    out.write(spec.asjson())
-
-
-def init_schema(schema_path, metadata):
-    if schema_path is None:
-        schema = BedZarrSchema.generate(
-            metadata, records_chunk_size=metadata.records_chunk_size
-        )
-    else:
-        logger.info("Reading schema from %s", schema_path)
-        with open(schema_path) as f:
-            schema = BedZarrSchema.fromjson(f.read())
-    return schema
-
-
 def bed2zarr(
     bed_path,
     zarr_path,
-    schema_path=None,
     records_chunk_size=None,
 ):
-    data, metadata = parse_bed(bed_path, records_chunk_size)
-    schema = init_schema(schema_path, metadata)
-    bedzw = BedZarrWriter(zarr_path)
-    bedzw.init(
-        data=data,
-        metadata=metadata,
-        bed_path=bed_path,
-        schema=schema,
-    )
-    bedzw.write()
-    bedzw.finalise()
+    bed_type = guess_bed_file_type(bed_path)
+    fields = mkfields(bed_type)
+    data = pd.read_table(bed_path, header=None, names=[f.name for f in fields])
+    data, contig_id, name_id = encode_categoricals(data, bed_type)
+    fields = update_field_bounds(data, bed_type)
+    dtypes = {f.name: f.smallest_dtype() for f in fields}
+    data = data.astype(dtypes)
+    data.index.name = "records"
+    ds = xr.Dataset.from_dataframe(data)
+    if records_chunk_size is None:
+        records_chunk_size = len(data)
+    chunks = {
+        "records": records_chunk_size,
+        "contigs": len(contig_id),
+    }
+    ds["contig_id"] = xr.DataArray(contig_id, dims=["contigs"])
+    if bed_type.value >= BedType.BED4.value:
+        ds["name_id"] = xr.DataArray(name_id, dims=["names"])
+        chunks["names"] = len(name_id)
+    ds = ds.chunk(chunks)
+    ds.to_zarr(zarr_path, mode="w")
diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
index 3a4623f5..dc0f2cbb 100644
--- a/bio2zarr/cli.py
+++ b/bio2zarr/cli.py
@@ -589,14 +589,12 @@ def plink2zarr():
     type=click.Path(exists=True, dir_okay=False),
 )
 @new_zarr_path
-@schema
 @records_chunk_size
 @verbose
 @force
 def bed2zarr_main(
     bed_path,
     zarr_path,
-    schema,
     records_chunk_size,
     verbose,
     force,
@@ -614,7 +612,6 @@ def bed2zarr_main(
         bed_path,
         zarr_path,
         records_chunk_size=records_chunk_size,
-        schema_path=schema,
     )
 
 
diff --git a/tests/test_bed.py b/tests/test_bed.py
index 28f5f5d2..a122f3dd 100644
--- a/tests/test_bed.py
+++ b/tests/test_bed.py
@@ -62,47 +62,6 @@ def bed_path(bed_data, tmp_path):
     return out
 
 
-@pytest.fixture()
-def schema_path(bed_path, tmp_path_factory):
-    out = tmp_path_factory.mktemp("data") / "example.schema.json"
-    with open(out, "w") as f:
-        bed2zarr.mkschema(bed_path, f)
-    return out
-
-
-@pytest.fixture()
-def schema(schema_path):
-
with open(schema_path) as f: - return bed2zarr.BedZarrSchema.fromjson(f.read()) - - -class TestDefaultSchema: - @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) - def test_format_version(self, schema): - assert schema.format_version == bed2zarr.ZARR_SCHEMA_FORMAT_VERSION - - -class TestSchema: - @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) - def test_generate_schema(self, bed_path, request): - bedspec = request.node.callspec.params["bed_data"] - bed_type = bed2zarr.BedType(bedspec) - data, metadata = bed2zarr.parse_bed(bed_path) - schema = bed2zarr.BedZarrSchema.generate( - metadata, records_chunk_size=metadata.records_chunk_size - ) - assert schema.bed_type == bed_type.name - assert schema.records_chunk_size == 1000 - assert len(schema.contigs) == 1 - assert schema.contigs[0].id == "chr22" - if bed_type.value >= bed2zarr.BedType.BED4.value: - assert len(schema.names) == 2 - assert schema.names[0].id == "cloneA" - assert schema.names[1].id == "cloneB" - if bed_type == bed2zarr.BedType.BED12: - assert len(schema.fields) == 12 - - class TestBed2Zarr: @pytest.mark.parametrize("bed_data", SUPPORTED_BED_FORMATS, indirect=True) def test_bed2zarr(self, bed_path, bed_df, tmp_path, request): From a398a9196f689682350545ed6ec606b60f9c7c2f Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 13 Dec 2024 09:38:54 +0100 Subject: [PATCH 15/17] Set smallest dtypes on DataSet --- bio2zarr/bed2zarr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 66365341..0fbfd599 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -203,9 +203,10 @@ def bed2zarr( data, contig_id, name_id = encode_categoricals(data, bed_type) fields = update_field_bounds(data, bed_type) dtypes = {f.name: f.smallest_dtype() for f in fields} - data = data.astype(dtypes) data.index.name = "records" ds = xr.Dataset.from_dataframe(data) + for k, v in dtypes.items(): + ds[k] = ds[k].astype(v) if records_chunk_size is None: records_chunk_size = len(data) chunks = { From d6e9b82501acdd1a2229fa5dbb6a909fff66d523 Mon Sep 17 00:00:00 2001 From: Per Unneberg Date: Fri, 13 Dec 2024 10:10:08 +0100 Subject: [PATCH 16/17] Linting --- bio2zarr/bed2zarr.py | 6 +----- bio2zarr/cli.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py index 0fbfd599..f73f6861 100644 --- a/bio2zarr/bed2zarr.py +++ b/bio2zarr/bed2zarr.py @@ -1,9 +1,6 @@ import dataclasses -import json import logging import math -import pathlib -import shutil from enum import Enum from typing import Any @@ -11,9 +8,8 @@ import numpy as np import pandas as pd import xarray as xr -import zarr -from . import core, provenance +from . 
import core
 
 logger = logging.getLogger(__name__)
 
diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
index dc0f2cbb..aec6811c 100644
--- a/bio2zarr/cli.py
+++ b/bio2zarr/cli.py
@@ -595,7 +595,7 @@ def bed2zarr_main(
     bed_path,
     zarr_path,
-    records_chunk_size, 
+    records_chunk_size,
     verbose,
     force,
 ):

From 1bf2b00a35f95fae9b38b6809a2fb1fadada4ae3 Mon Sep 17 00:00:00 2001
From: Per Unneberg
Date: Fri, 13 Dec 2024 14:21:18 +0100
Subject: [PATCH 17/17] Remove xarray dependency

---
 bio2zarr/bed2zarr.py | 44 ++++++++++++++++++++++++++++----------------
 tests/test_bed.py    |  1 +
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/bio2zarr/bed2zarr.py b/bio2zarr/bed2zarr.py
index f73f6861..de36a3c0 100644
--- a/bio2zarr/bed2zarr.py
+++ b/bio2zarr/bed2zarr.py
@@ -7,13 +7,14 @@
 import numcodecs
 import numpy as np
 import pandas as pd
-import xarray as xr
+import zarr
 
-from . import core
+from . import core, provenance
 
 logger = logging.getLogger(__name__)
 
 DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
+BED_ZARR_VERSION = 0.1
 
 
 class BedType(Enum):
@@ -200,18 +201,29 @@ def bed2zarr(
     fields = update_field_bounds(data, bed_type)
     dtypes = {f.name: f.smallest_dtype() for f in fields}
     data.index.name = "records"
-    ds = xr.Dataset.from_dataframe(data)
-    for k, v in dtypes.items():
-        ds[k] = ds[k].astype(v)
-    if records_chunk_size is None:
-        records_chunk_size = len(data)
-    chunks = {
-        "records": records_chunk_size,
-        "contigs": len(contig_id),
-    }
-    ds["contig_id"] = xr.DataArray(contig_id, dims=["contigs"])
+    data = data.astype(dtypes)
+    store = zarr.DirectoryStore(zarr_path)
+    root = zarr.group(store=store)
+    root.attrs.update(
+        {
+            "bed_zarr_version": f"{BED_ZARR_VERSION}",
+            "source": f"bio2zarr-{provenance.__version__}",
+        }
+    )
+    for field in fields[0 : bed_type.value]:
+        if field.name == "strand":
+            root.array(
+                field.name,
+                data[field.name].values,
+                chunks=(records_chunk_size,),
+                dtype="<U1",
+            )
+        else:
+            root.array(
+                field.name,
+                data[field.name].values,
+                chunks=(records_chunk_size,),
+            )
+    root.array("contig_id", contig_id, chunks=(len(contig_id),))
     if bed_type.value >= BedType.BED4.value:
-        ds["name_id"] = xr.DataArray(name_id, dims=["names"])
-        chunks["names"] = len(name_id)
-    ds = ds.chunk(chunks)
-    ds.to_zarr(zarr_path, mode="w")
+        root.array("name_id", name_id, chunks=(len(name_id),))
diff --git a/tests/test_bed.py b/tests/test_bed.py
index a122f3dd..3f162679 100644
--- a/tests/test_bed.py
+++ b/tests/test_bed.py
@@ -85,6 +85,7 @@ def test_bed2zarr(self, bed_path, bed_df, tmp_path, request):
             np.testing.assert_array_equal(root["thickStart"][:], bed_df[6].values)
         if bed_type.value >= bed2zarr.BedType.BED8.value:
             np.testing.assert_array_equal(root["thickEnd"][:], bed_df[7].values)
+        print(zarr_path)
 
 
 class TestBedData:
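
A minimal end-to-end sketch of the converter as it stands after PATCH 17, using the sample_mask.bed.gz fixture added in PATCH 01. The call mirrors the one made by the CLI wrapper above, and the array names follow the assertions in tests/test_bed.py; the output path is illustrative only.

    import zarr

    from bio2zarr import bed2zarr

    # Convert a gzipped BED file; one Zarr array is written per column.
    bed2zarr.bed2zarr(
        bed_path="tests/data/bed/sample_mask.bed.gz",
        zarr_path="/tmp/sample_mask.zarr",
        records_chunk_size=1000,
    )

    root = zarr.open("/tmp/sample_mask.zarr", mode="r")
    # Mandatory BED3 columns, stored along the "records" dimension.
    print(root["start"][:], root["end"][:])
    # Contig names are stored once; "contig" holds their integer codes.
    print(root["contig_id"][:], root["contig"][:])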
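The update_field_bounds/smallest_dtype pair is what keeps these arrays compact: integer columns are scanned for their observed minimum and maximum, and each column is cast to the narrowest integer dtype that can hold them. A self-contained sketch of the same idea, with numpy's min_scalar_type standing in for the core helper the module delegates to:

    import numpy as np
    import pandas as pd

    # A BED "score" column: values in 0..1000 fit in uint16, not int64.
    scores = pd.Series([0, 250, 850, 1000])
    dtype = np.promote_types(
        np.min_scalar_type(int(scores.min())),
        np.min_scalar_type(int(scores.max())),
    )
    print(dtype)                        # uint16
    print(scores.astype(dtype).values)  # on-disk form of the column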