From 8eefcf097955d08beb9695a34a4f3472be45c3f7 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 15 Feb 2024 17:16:54 +0000 Subject: [PATCH] Basic tests working --- bio2zarr/vcf.py | 2 +- tests/test_vcf.py | 2029 ++++----------------------------------------- 2 files changed, 180 insertions(+), 1851 deletions(-) diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py index 8a7f3a7..8194877 100644 --- a/bio2zarr/vcf.py +++ b/bio2zarr/vcf.py @@ -74,7 +74,7 @@ def assert_prefix_integer_equal_2d(vcf_val, zarr_val): # Will need to hand-craft from examples to test def assert_prefix_float_equal_1d(vcf_val, zarr_val): v = np.array(vcf_val, dtype=np.float32, ndmin=1) - vi = v.view(np.int32) + # vi = v.view(np.int32) z = np.array(zarr_val, dtype=np.float32, ndmin=1) zi = z.view(np.int32) assert np.sum(zi == FLOAT32_MISSING_AS_INT32) == 0 diff --git a/tests/test_vcf.py b/tests/test_vcf.py index 4e14405..d2964c8 100644 --- a/tests/test_vcf.py +++ b/tests/test_vcf.py @@ -1,1891 +1,220 @@ -import os -import tempfile -from os import listdir -from os.path import join -from typing import MutableMapping - import numpy as np +import numpy.testing as nt import pytest -import xarray as xr -import zarr -from numcodecs import Blosc, Delta, FixedScaleOffset, PackBits, VLenUTF8 -from numpy.testing import assert_allclose, assert_array_equal, assert_array_almost_equal - -from sgkit import load_dataset, save_dataset -from sgkit.io.utils import FLOAT32_FILL, FLOAT32_MISSING, INT_FILL, INT_MISSING -from sgkit.io.vcf import ( - MaxAltAllelesExceededWarning, - partition_into_regions, - read_vcf, - vcf_to_zarr, -) -from sgkit.io.vcf.vcf_reader import ( - FloatFormatFieldWarning, - merge_zarr_array_sizes, - zarr_array_sizes, -) -from sgkit.io.vcf.vcf_converter import convert_vcf, validate -from sgkit.model import get_contigs, get_filters, num_contigs -from sgkit.tests.io.test_dataset import assert_identical +import sgkit as sg -from .utils import path_for_test +from bio2zarr import vcf -@pytest.mark.parametrize( - "read_chunk_length", - [None, 1], -) -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.parametrize("method", ["to_zarr", "convert", "load"]) -@pytest.mark.filterwarnings("ignore::xarray.coding.variables.SerializationWarning") -def test_vcf_to_zarr__small_vcf( - shared_datadir, - is_path, - read_chunk_length, - tmp_path, - method, -): - path = path_for_test(shared_datadir, "sample.vcf.gz", is_path) - output = tmp_path.joinpath("vcf.zarr").as_posix() - fields = [ - "INFO/NS", - "INFO/AN", - "INFO/AA", - "INFO/DB", - "INFO/AC", - "INFO/AF", - "FORMAT/GT", - "FORMAT/DP", - "FORMAT/HQ", - ] - field_defs = { - "FORMAT/HQ": {"dimension": "ploidy"}, - "INFO/AF": {"Number": "2", "dimension": "AF"}, - "INFO/AC": {"Number": "2", "dimension": "AC"}, - } - if method == "to_zarr": - vcf_to_zarr( - path, - output, - max_alt_alleles=3, - chunk_length=5, - chunk_width=2, - read_chunk_length=read_chunk_length, - fields=fields, - field_defs=field_defs, - ) - ds = xr.open_zarr(output) +class TestSmallExampleValues: + @pytest.fixture(scope="class") + def ds(self, tmp_path_factory): + path = "tests/data/vcf/sample.vcf.gz" + out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" + vcf.convert_vcf([path], out) + return sg.load_dataset(out) - elif method == "convert": - convert_vcf( - [path], - output, - chunk_length=5, - chunk_width=2, - ) - ds = xr.open_zarr(output) - else: - ds = read_vcf( - path, chunk_length=5, chunk_width=2, fields=fields, field_defs=field_defs + def test_filters(self, ds): + 
nt.assert_array_equal(ds["filter_id"], ["PASS", "s50", "q10"]) + nt.assert_array_equal( + ds["variant_filter"], + [ + [False, False, False], + [False, False, False], + [True, False, False], + [False, False, True], + [True, False, False], + [True, False, False], + [True, False, False], + [False, False, False], + [True, False, False], + ], ) - assert_array_equal(ds["filter_id"], ["PASS", "s50", "q10"]) - assert_array_equal( - ds["variant_filter"], - [ - [False, False, False], - [False, False, False], - [True, False, False], - [False, False, True], - [True, False, False], - [True, False, False], - [True, False, False], - [False, False, False], - [True, False, False], - ], - ) - assert_array_equal(ds["contig_id"], ["19", "20", "X"]) - assert "contig_length" not in ds - assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2]) - assert ds["variant_contig"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_position"], - [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10], - ) - assert ds["variant_position"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_NS"], - [-1, -1, 3, 3, 2, 3, 3, -1, -1], - ) - assert ds["variant_NS"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_AN"], - [-1, -1, -1, -1, -1, -1, 6, -1, -1], - ) - assert ds["variant_AN"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_AA"], - [ - ".", - ".", - ".", - ".", - "T", - "T", - "G", - ".", - ".", - ], - ) - assert ds["variant_AN"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_DB"], - [ - False, - False, - True, - False, - True, - False, - False, - False, - False, - ], - ) - assert ds["variant_AN"].chunks[0][0] == 5 - - variant_AF = np.full((9, 2), FLOAT32_MISSING, dtype=np.float32) - variant_AF[2, 0] = 0.5 - variant_AF[3, 0] = 0.017 - variant_AF[4, 0] = 0.333 - variant_AF[4, 1] = 0.667 - assert_array_almost_equal(ds["variant_AF"], variant_AF, 3) - assert ds["variant_AF"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_AC"], - [ - [-1, -1], - [-1, -1], - [-1, -1], - [-1, -1], - [-1, -1], - [-1, -1], - [3, 1], - [-1, -1], - [-1, -1], - ], - ) - assert ds["variant_AC"].chunks[0][0] == 5 - - assert_array_equal( - ds["variant_allele"].values.tolist(), - [ - ["A", "C", "", ""], - ["A", "G", "", ""], - ["G", "A", "", ""], - ["T", "A", "", ""], - ["A", "G", "T", ""], - ["T", "", "", ""], - ["G", "GA", "GAC", ""], - ["T", "", "", ""], - ["AC", "A", "ATG", "C"], - ], - ) - assert ds["variant_allele"].chunks[0][0] == 5 - assert ds["variant_allele"].dtype == "O" - assert_array_equal( - ds["variant_id"].values.tolist(), - [".", ".", "rs6054257", ".", "rs6040355", ".", "microsat1", ".", "rsTest"], - ) - assert ds["variant_id"].chunks[0][0] == 5 - assert ds["variant_id"].dtype == "O" - assert_array_equal( - ds["variant_id_mask"], - [True, True, False, True, False, True, False, True, False], - ) - assert ds["variant_id_mask"].chunks[0][0] == 5 - - assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"]) - assert ds["sample_id"].chunks[0][0] == 2 - - call_genotype = np.array( - [ - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [1, 0], [1, 1]], - [[0, 0], [0, 1], [0, 0]], - [[1, 2], [2, 1], [2, 2]], - [[0, 0], [0, 0], [0, 0]], - [[0, 1], [0, 2], [-1, -1]], - [[0, 0], [0, 0], [-1, -1]], - # NOTE: inconsistency here on pad vs missing. I think this is a - # pad value. 
- [[0, -2], [0, 1], [0, 2]], - ], - dtype="i1", - ) - call_genotype_phased = np.array( - [ - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [False, False, False], - [False, True, False], - [True, False, True], - ], - dtype=bool, - ) - call_DP = [ - [-1, -1, -1], - [-1, -1, -1], - [1, 8, 5], - [3, 5, 3], - [6, 0, 4], - [-1, 4, 2], - [4, 2, 3], - [-1, -1, -1], - [-1, -1, -1], - ] - call_HQ = [ - [[10, 15], [10, 10], [3, 3]], - [[10, 10], [10, 10], [3, 3]], - [[51, 51], [51, 51], [-1, -1]], - [[58, 50], [65, 3], [-1, -1]], - [[23, 27], [18, 2], [-1, -1]], - [[56, 60], [51, 51], [-1, -1]], - [[-1, -1], [-1, -1], [-1, -1]], - [[-1, -1], [-1, -1], [-1, -1]], - [[-1, -1], [-1, -1], [-1, -1]], - ] + def test_contigs(self, ds): + nt.assert_array_equal(ds["contig_id"], ["19", "20", "X"]) + assert "contig_length" not in ds + nt.assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2]) - # print(np.array2string(ds["call_HQ"].values, separator=",")) - # print(np.array2string(ds["call_genotype"].values < 0, separator=",")) - - assert_array_equal(ds["call_genotype"], call_genotype) - assert_array_equal(ds["call_genotype_mask"], call_genotype < 0) - assert_array_equal(ds["call_genotype_phased"], call_genotype_phased) - assert_array_equal(ds["call_DP"], call_DP) - assert_array_equal(ds["call_HQ"], call_HQ) - - for name in ["call_genotype", "call_genotype_mask", "call_HQ"]: - assert ds[name].chunks == ((5, 4), (2, 1), (2,)) - - for name in ["call_genotype_phased", "call_DP"]: - assert ds[name].chunks == ((5, 4), (2, 1)) - - # save and load again to test https://github.com/pydata/xarray/issues/3476 - path2 = tmp_path / "ds2.zarr" - if not is_path: - path2 = str(path2) - save_dataset(ds, path2) - assert_identical(ds, load_dataset(path2)) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz", is_path) - output = tmp_path.joinpath("vcf.zarr").as_posix() + def test_position(self, ds): + nt.assert_array_equal( + ds["variant_position"], + [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10], + ) - with pytest.warns(MaxAltAllelesExceededWarning): - max_alt_alleles = 1 - vcf_to_zarr( - path, output, chunk_length=5, chunk_width=2, max_alt_alleles=max_alt_alleles + def test_int_info_fields(self, ds): + nt.assert_array_equal( + ds["variant_NS"], + [-1, -1, 3, 3, 2, 3, 3, -1, -1], + ) + nt.assert_array_equal( + ds["variant_AN"], + [-1, -1, -1, -1, -1, -1, 6, -1, -1], ) - ds = xr.open_zarr(output) - # extra alt alleles are dropped - assert_array_equal( - ds["variant_allele"].values.tolist(), + nt.assert_array_equal( + ds["variant_AC"], [ - ["A", "C"], - ["A", "G"], - ["G", "A"], - ["T", "A"], - ["A", "G"], - ["T", ""], - ["G", "GA"], - ["T", ""], - ["AC", "A"], + [-1, -1], + [-1, -1], + [-1, -1], + [-1, -1], + [-1, -1], + [-1, -1], + [3, 1], + [-1, -1], + [-1, -1], ], ) - # genotype calls are truncated - assert np.all(ds["call_genotype"].values <= max_alt_alleles) - - # the maximum number of alt alleles actually seen is stored as an attribute - assert ds.attrs["max_alt_alleles_seen"] == 3 - - -@pytest.mark.parametrize( - "read_chunk_length", - [None, 1_000], -) -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__large_vcf(shared_datadir, is_path, 
read_chunk_length, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output, chunk_length=5_000, read_chunk_length=read_chunk_length) - ds = xr.open_zarr(output) - - assert_array_equal(ds["contig_id"], ["20", "21"]) - assert_array_equal(ds["contig_length"], [63025520, 48129895]) - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds["variant_allele"].dtype == "O" - assert ds["variant_id"].dtype == "O" - - # check underlying zarr chunk size is 1 in samples dim - za = zarr.open(output) - assert za["sample_id"].chunks == (1,) - assert za["call_genotype"].chunks == (5000, 1, 2) - - -def test_vcf_to_zarr__plain_vcf_with_no_index(shared_datadir, tmp_path): - path = path_for_test( - shared_datadir, - "mixed.vcf", - ) - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output, truncate_calls=True) - ds = xr.open_zarr(output) - assert ds["sample_id"].shape == (3,) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__mutable_mapping(shared_datadir, is_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output: MutableMapping[str, bytes] = {} - - vcf_to_zarr(path, output, chunk_length=5_000) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds["variant_allele"].dtype == "O" - assert ds["variant_id"].dtype == "O" - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__compressor_and_filters(shared_datadir, is_path, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz", is_path) - output = tmp_path.joinpath("vcf.zarr").as_posix() - - compressor = Blosc("zlib", 1, Blosc.NOSHUFFLE) - variant_id_compressor = Blosc("zlib", 2, Blosc.NOSHUFFLE) - encoding = dict( - variant_id=dict(compressor=variant_id_compressor), - variant_id_mask=dict(filters=None), - ) - vcf_to_zarr( - path, - output, - chunk_length=5, - chunk_width=2, - compressor=compressor, - encoding=encoding, - ) - - # look at actual Zarr store to check compressor and filters - z = zarr.open(output) - assert z["call_genotype"].compressor == compressor - assert z["call_genotype"].filters is None # sgkit default - assert z["call_genotype"].chunks == (5, 2, 2) - assert z["call_genotype_mask"].compressor == compressor - assert z["call_genotype_mask"].filters == [PackBits()] # sgkit default - assert z["call_genotype_mask"].chunks == (5, 2, 2) - - assert z["variant_id"].compressor == variant_id_compressor - assert z["variant_id"].filters == [VLenUTF8()] # sgkit default - assert 
z["variant_id"].chunks == (5,) - assert z["variant_id_mask"].compressor == compressor - assert z["variant_id_mask"].filters is None - assert z["variant_id_mask"].chunks == (5,) - - assert z["variant_position"].filters == [ - Delta(dtype="i4", astype="i4") - ] # sgkit default - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__parallel_compressor_and_filters( - shared_datadir, is_path, tmp_path -): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - regions = ["20", "21"] - - compressor = Blosc("zlib", 1, Blosc.NOSHUFFLE) - variant_id_compressor = Blosc("zlib", 2, Blosc.NOSHUFFLE) - encoding = dict( - variant_id=dict(compressor=variant_id_compressor), - variant_id_mask=dict(filters=None), - ) - vcf_to_zarr( - path, - output, - regions=regions, - chunk_length=5_000, - compressor=compressor, - encoding=encoding, - ) - - # look at actual Zarr store to check compressor and filters - z = zarr.open(output) - assert z["call_genotype"].compressor == compressor - assert z["call_genotype"].filters is None # sgkit default - assert z["call_genotype"].chunks == (5000, 1, 2) - assert z["call_genotype_mask"].compressor == compressor - assert z["call_genotype_mask"].filters == [PackBits()] # sgkit default - assert z["call_genotype_mask"].chunks == (5000, 1, 2) - - assert z["variant_id"].compressor == variant_id_compressor - assert z["variant_id"].filters == [VLenUTF8()] # sgkit default - assert z["variant_id"].chunks == (5000,) - assert z["variant_id_mask"].compressor == compressor - assert z["variant_id_mask"].filters is None - assert z["variant_id_mask"].chunks == (5000,) - - assert z["variant_position"].filters == [ - Delta(dtype="i4", astype="i4") - ] # sgkit default - - -def test_vcf_to_zarr__float_format_field_warning(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "simple.output.mixed_depth.likelihoods.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - with pytest.warns(FloatFormatFieldWarning): - vcf_to_zarr( - path, - output, - ploidy=4, - max_alt_alleles=3, - fields=["FORMAT/GL"], + def test_float_info_fields(self, ds): + missing = vcf.FLOAT32_MISSING + fill = vcf.FLOAT32_FILL + variant_AF = np.array( + [ + [missing, missing], + [missing, missing], + [0.5, fill], + [0.017, fill], + [0.333, 0.667], + [missing, missing], + [missing, missing], + [missing, missing], + [missing, missing], + ], + dtype=np.float32, ) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.parametrize( - "output_is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__parallel(shared_datadir, is_path, output_is_path, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output = tmp_path.joinpath("vcf_concat.zarr") - if not output_is_path: - output = output.as_posix() - - regions = ["20", "21"] - - vcf_to_zarr( - path, - output, - regions=regions, - chunk_length=5_000, - ) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - 
assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds["variant_allele"].dtype == "O" - assert ds["variant_id"].dtype == "O" - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_vcf_to_zarr__empty_region(shared_datadir, is_path, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - regions = "23" - - vcf_to_zarr(path, output, regions=regions) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (0, 1, 2) - assert ds["call_genotype_mask"].shape == (0, 1, 2) - assert ds["call_genotype_phased"].shape == (0, 1) - assert ds["variant_allele"].shape == (0, 4) - assert ds["variant_contig"].shape == (0,) - assert ds["variant_id"].shape == (0,) - assert ds["variant_id_mask"].shape == (0,) - assert ds["variant_position"].shape == (0,) - - -@pytest.mark.parametrize( - "is_path", - [False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__parallel_temp_chunk_length(shared_datadir, is_path, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", is_path) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - regions = ["20", "21"] - - # Use a temp_chunk_length that is smaller than chunk_length - # Open the temporary parts to check that they have the right temp chunk length - with tempfile.TemporaryDirectory() as tempdir: - vcf_to_zarr( - path, - output, - regions=regions, - chunk_length=5_000, - temp_chunk_length=2_500, - tempdir=tempdir, - retain_temp_files=True, + values = ds["variant_AF"].values + nt.assert_array_almost_equal(values, variant_AF, 3) + nans = np.isnan(variant_AF) + nt.assert_array_equal( + variant_AF.view(np.int32)[nans], values.view(np.int32)[nans] ) - inner_temp_dir = join(tempdir, listdir(tempdir)[0]) - parts_dir = join(inner_temp_dir, listdir(inner_temp_dir)[0]) - part = xr.open_zarr(join(parts_dir, "part-0.zarr")) - assert part["call_genotype"].chunks[0][0] == 2_500 - assert part["variant_position"].chunks[0][0] == 2_500 - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype"].chunks[0][0] == 5_000 - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - assert ds["variant_position"].chunks[0][0] == 5_000 - assert ds["variant_allele"].dtype == "O" - assert ds["variant_id"].dtype == "O" - - -def test_vcf_to_zarr__parallel_temp_chunk_length_not_divisible( - shared_datadir, tmp_path -): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz", False) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - regions = ["20", "21"] - - with pytest.raises( - ValueError, - match=r"Temporary chunk length in variant dimension \(4000\) must evenly divide target chunk length 5000", - ): - # Use a temp_chunk_length that does not divide into chunk_length - vcf_to_zarr( - path, output, regions=regions, chunk_length=5_000, temp_chunk_length=4_000 + def test_string_info_fields(self, ds): + nt.assert_array_equal( + 
ds["variant_AA"], + [ + ".", + ".", + ".", + ".", + "T", + "T", + "G", + ".", + ".", + ], ) - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__parallel_partitioned(shared_datadir, is_path, tmp_path): - path = path_for_test( - shared_datadir, - "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz", - is_path, - ) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - regions = partition_into_regions(path, num_parts=4) - - vcf_to_zarr(path, output, regions=regions, chunk_length=1_000, chunk_width=1_000) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (2535,) - assert ds["variant_id"].shape == (1406,) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__parallel_partitioned_by_size(shared_datadir, is_path, tmp_path): - path = path_for_test( - shared_datadir, - "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz", - is_path, - ) - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - vcf_to_zarr( - path, output, target_part_size="4MB", chunk_length=1_000, chunk_width=1_000 - ) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (2535,) - assert ds["variant_id"].shape == (1406,) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__multiple(shared_datadir, is_path, tmp_path): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path), - ] - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - vcf_to_zarr(paths, output, target_part_size=None, chunk_length=5_000) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds.chunks["variants"] == (5000, 5000, 5000, 4910) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__multiple_partitioned(shared_datadir, is_path, tmp_path): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path), - ] - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - regions = [partition_into_regions(path, num_parts=2) for path in paths] - - vcf_to_zarr(paths, output, regions=regions, chunk_length=5_000) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds.chunks["variants"] == (5000, 5000, 5000, 4910) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def 
test_vcf_to_zarr__multiple_partitioned_by_size(shared_datadir, is_path, tmp_path): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path), - ] - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - vcf_to_zarr(paths, output, target_part_size="40KB", chunk_length=5_000) - ds = xr.open_zarr(output) - - assert ds["sample_id"].shape == (1,) - assert ds["call_genotype"].shape == (19910, 1, 2) - assert ds["call_genotype_mask"].shape == (19910, 1, 2) - assert ds["call_genotype_phased"].shape == (19910, 1) - assert ds["variant_allele"].shape == (19910, 4) - assert ds["variant_contig"].shape == (19910,) - assert ds["variant_id"].shape == (19910,) - assert ds["variant_id_mask"].shape == (19910,) - assert ds["variant_position"].shape == (19910,) - - assert ds.chunks["variants"] == (5000, 5000, 5000, 4910) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__mutiple_partitioned_invalid_regions( - shared_datadir, is_path, tmp_path -): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path), - ] - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - # invalid regions, should be a sequence of sequences - regions = partition_into_regions(paths[0], num_parts=2) - - with pytest.raises( - ValueError, - match=r"multiple input regions must be a sequence of sequence of strings", - ): - vcf_to_zarr(paths, output, regions=regions, chunk_length=5_000) - - -@pytest.mark.parametrize( - "is_path", - [True, False], -) -def test_vcf_to_zarr__multiple_max_alt_alleles(shared_datadir, is_path, tmp_path): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz", is_path), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz", is_path), - ] - output = tmp_path.joinpath("vcf_concat.zarr").as_posix() - - with pytest.warns(MaxAltAllelesExceededWarning): - vcf_to_zarr( - paths, - output, - target_part_size="40KB", - chunk_length=5_000, - max_alt_alleles=1, + def test_flag_info_fields(self, ds): + nt.assert_array_equal( + ds["variant_DB"], + [ + False, + False, + True, + False, + True, + False, + False, + False, + False, + ], ) - ds = xr.open_zarr(output) - - # the maximum number of alt alleles actually seen is stored as an attribute - assert ds.attrs["max_alt_alleles_seen"] == 7 - -@pytest.mark.parametrize( - "max_alt_alleles,dtype,warning", - [ - (2, np.int8, True), - (127, np.int8, True), - (128, np.int16, True), - (145, np.int16, True), - (164, np.int16, False), - ], -) -def test_vcf_to_zarr__call_genotype_dtype( - shared_datadir, tmp_path, max_alt_alleles, dtype, warning -): - path = path_for_test(shared_datadir, "allele_overflow.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - if warning: - with pytest.warns(MaxAltAllelesExceededWarning): - vcf_to_zarr(path, output, max_alt_alleles=max_alt_alleles) - else: - vcf_to_zarr(path, output, max_alt_alleles=max_alt_alleles) - ds = load_dataset(output) - assert ds.call_genotype.dtype == dtype - assert ds.call_genotype.values.max() <= max_alt_alleles - - -@pytest.mark.parametrize( - "ploidy,mixed_ploidy,truncate_calls,regions", - [ - (2, False, True, None), - (4, False, False, None), - (4, False, False, ["CHR1:0-5", "CHR1:5-10"]), - (4, True, False, None), - (4, True, False, ["CHR1:0-5", "CHR1:5-10"]), - (5, True, False, None), - ], -) -def test_vcf_to_zarr__mixed_ploidy_vcf( - 
shared_datadir, tmp_path, ploidy, mixed_ploidy, truncate_calls, regions -): - path = path_for_test(shared_datadir, "mixed.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr( - path, - output, - regions=regions, - chunk_length=5, - chunk_width=2, - ploidy=ploidy, - mixed_ploidy=mixed_ploidy, - truncate_calls=truncate_calls, - ) - ds = load_dataset(output) - - variant_dtype = "O" - assert_array_equal(ds["contig_id"], ["CHR1", "CHR2", "CHR3"]) - assert_array_equal(ds["variant_contig"], [0, 0]) - assert_array_equal(ds["variant_position"], [2, 7]) - assert_array_equal( - ds["variant_allele"].values.tolist(), - np.array( + def test_allele(self, ds): + fill = vcf.STR_FILL + nt.assert_array_equal( + ds["variant_allele"].values.tolist(), [ - ["A", "T", "", ""], - ["A", "C", "", ""], + ["A", "C", fill, fill], + ["A", "G", fill, fill], + ["G", "A", fill, fill], + ["T", "A", fill, fill], + ["A", "G", "T", fill], + ["T", fill, fill, fill], + ["G", "GA", "GAC", fill], + ["T", fill, fill, fill], + ["AC", "A", "ATG", "C"], ], - dtype=variant_dtype, - ), - ) - assert ds["variant_allele"].dtype == variant_dtype # type: ignore[comparison-overlap] - assert_array_equal( - ds["variant_id"], - np.array([".", "."], dtype=variant_dtype), - ) - assert ds["variant_id"].dtype == variant_dtype # type: ignore[comparison-overlap] - assert_array_equal( - ds["variant_id_mask"], - [True, True], - ) - assert_array_equal(ds["sample_id"], ["SAMPLE1", "SAMPLE2", "SAMPLE3"]) - - assert ds["call_genotype"].attrs["mixed_ploidy"] == mixed_ploidy - pad = -2 if mixed_ploidy else -1 # -2 indicates a fill (non-allele) value - call_genotype = np.array( - [ - [[0, 0, 1, 1, pad], [0, 0, pad, pad, pad], [0, 0, 0, 1, pad]], - [[0, 0, 1, 1, pad], [0, 1, pad, pad, pad], [0, 1, -1, -1, pad]], - ], - dtype="i1", - ) - # truncate row vectors if lower ploidy - call_genotype = call_genotype[:, :, 0:ploidy] - - assert_array_equal(ds["call_genotype"], call_genotype) - assert_array_equal(ds["call_genotype_mask"], call_genotype < 0) - if mixed_ploidy: - assert_array_equal(ds["call_genotype_fill"], call_genotype < -1) - - -@pytest.mark.parametrize( - "ploidy,mixed_ploidy,truncate_calls", - [ - (2, False, False), - (3, True, False), - ], -) -def test_vcf_to_zarr__mixed_ploidy_vcf_exception( - shared_datadir, tmp_path, ploidy, mixed_ploidy, truncate_calls -): - path = path_for_test(shared_datadir, "mixed.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - with pytest.raises(ValueError) as excinfo: - vcf_to_zarr( - path, - output, - ploidy=ploidy, - mixed_ploidy=mixed_ploidy, - truncate_calls=truncate_calls, ) - assert "Genotype call longer than ploidy." 
== str(excinfo.value) - - -def test_vcf_to_zarr__no_genotypes(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "no_genotypes.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output) - - ds = xr.open_zarr(output) + assert ds["variant_allele"].dtype == "O" - assert "call_genotype" not in ds - assert "call_genotype_mask" not in ds - assert "call_genotype_phased" not in ds - - assert ds["sample_id"].shape == (0,) - assert ds["variant_allele"].shape == (26, 4) - assert ds["variant_contig"].shape == (26,) - assert ds["variant_id"].shape == (26,) - assert ds["variant_id_mask"].shape == (26,) - assert ds["variant_position"].shape == (26,) - - -def test_vcf_to_zarr__no_genotypes_with_gt_header(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "no_genotypes_with_gt_header.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output) - - ds = xr.open_zarr(output) - - assert_array_equal(ds["call_genotype"], -1) - assert_array_equal(ds["call_genotype_mask"], 1) - assert_array_equal(ds["call_genotype_phased"], 0) - - assert ds["sample_id"].shape == (0,) - assert ds["variant_allele"].shape == (26, 4) - assert ds["variant_contig"].shape == (26,) - assert ds["variant_id"].shape == (26,) - assert ds["variant_id_mask"].shape == (26,) - assert ds["variant_position"].shape == (26,) - - -def test_vcf_to_zarr__contig_not_defined_in_header(shared_datadir, tmp_path): - # sample.vcf does not define the contigs in the header, and isn't indexed - path = path_for_test(shared_datadir, "sample.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - with pytest.raises( - ValueError, - match=r"Contig '19' is not defined in the header.", - ): - vcf_to_zarr(path, output) - - -def test_vcf_to_zarr__filter_not_defined_in_header(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "no_filter_defined.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - with pytest.raises( - ValueError, - match=r"Filter 'FAIL' is not defined in the header.", - ): - vcf_to_zarr(path, output) - - -def test_vcf_to_zarr__info_name_clash(shared_datadir, tmp_path): - # info_name_clash.vcf has an info field called 'id' which would be mapped to - # 'variant_id', clashing with the fixed field of the same name - path = path_for_test(shared_datadir, "info_name_clash.vcf") - output = tmp_path.joinpath("info_name_clash.zarr").as_posix() - - vcf_to_zarr(path, output) # OK if problematic field is ignored - - with pytest.raises( - ValueError, - match=r"Generated name for INFO field 'id' clashes with 'variant_id' from fixed VCF fields.", - ): - vcf_to_zarr(path, output, fields=["INFO/id"]) - - -def test_vcf_to_zarr__large_number_of_contigs(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "Homo_sapiens_assembly38.headerOnly.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output) - - ds = xr.open_zarr(output) - - assert len(ds["contig_id"]) == 3366 - assert ds["variant_contig"].dtype == np.int16 # needs larger dtype than np.int8 - - -def test_vcf_to_zarr__fields(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr( - path, - output, - chunk_length=5, - chunk_width=2, - fields=["INFO/DP", "INFO/AA", "INFO/DB", "FORMAT/DP"], - ) - ds = xr.open_zarr(output) - - missing, fill = INT_MISSING, INT_FILL - assert_array_equal(ds["variant_DP"], [fill, fill, 14, 11, 10, 13, 9, fill, fill]) - assert 
ds["variant_DP"].attrs["comment"] == "Total Depth" - - assert_array_equal( - ds["variant_AA"], - np.array(["", "", "", "", "T", "T", "G", "", ""], dtype="O"), - ) - assert ds["variant_AA"].attrs["comment"] == "Ancestral Allele" - - assert_array_equal( - ds["variant_DB"], [False, False, True, False, True, False, False, False, False] - ) - assert ds["variant_DB"].attrs["comment"] == "dbSNP membership, build 129" - - dp = np.array( - [ - [fill, fill, fill], - [fill, fill, fill], - [1, 8, 5], - [3, 5, 3], - [6, 0, 4], - [missing, 4, 2], - [4, 2, 3], - [fill, fill, fill], - [fill, fill, fill], - ], - dtype="i4", - ) - assert_array_equal(ds["call_DP"], dp) - assert ds["call_DP"].attrs["comment"] == "Read Depth" - - -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__parallel_with_fields(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - regions = ["20", "21"] - - vcf_to_zarr( - path, - output, - regions=regions, - chunk_length=5_000, - temp_chunk_length=2_500, - fields=["INFO/MQ", "FORMAT/PGT"], - ) - ds = xr.open_zarr(output) + def test_id(self, ds): + nt.assert_array_equal( + ds["variant_id"].values.tolist(), + [".", ".", "rs6054257", ".", "rs6040355", ".", "microsat1", ".", "rsTest"], + ) + assert ds["variant_id"].dtype == "O" + nt.assert_array_equal( + ds["variant_id_mask"], + [True, True, False, True, False, True, False, True, False], + ) - # select a small region to check - ds = ds.set_index(variants=("variant_contig", "variant_position")).sel( - variants=slice((0, 10001661), (0, 10001670)) - ) + def test_samples(self, ds): + nt.assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"]) - # check strings have not been truncated after concat_zarrs - assert_array_equal( - ds["variant_allele"], - np.array( + def test_call_genotype(self, ds): + call_genotype = np.array( [ - ["T", "C", "", ""], - ["T", "", "", ""], - ["T", "G", "", ""], + [[0, 0], [0, 0], [0, 1]], + [[0, 0], [0, 0], [0, 1]], + [[0, 0], [1, 0], [1, 1]], + [[0, 0], [0, 1], [0, 0]], + [[1, 2], [2, 1], [2, 2]], + [[0, 0], [0, 0], [0, 0]], + [[0, 1], [0, 2], [-1, -1]], + [[0, 0], [0, 0], [-1, -1]], + # FIXME this depends on "mixed ploidy" interpretation. 
+ [[0, -2], [0, 1], [0, 2]], ], - dtype="O", - ), - ) - - # convert floats to ints to check nan type - fill = FLOAT32_FILL - assert_allclose( - ds["variant_MQ"].values.view("i4"), - np.array([58.33, fill, 57.45], dtype="f4").view("i4"), - ) - assert ds["variant_MQ"].attrs["comment"] == "RMS Mapping Quality" - - assert_array_equal(ds["call_PGT"], np.array([["0|1"], [""], ["0|1"]], dtype="O")) - assert ( - ds["call_PGT"].attrs["comment"] - == "Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another" - ) - - -def test_vcf_to_zarr__field_defs(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr( - path, - output, - fields=["INFO/DP"], - field_defs={"INFO/DP": {"Description": "Combined depth across samples"}}, - ) - ds = xr.open_zarr(output) - - fill = INT_FILL - assert_array_equal(ds["variant_DP"], [fill, fill, 14, 11, 10, 13, 9, fill, fill]) - assert ds["variant_DP"].attrs["comment"] == "Combined depth across samples" - - vcf_to_zarr( - path, - output, - fields=["INFO/DP"], - field_defs={"INFO/DP": {"Description": ""}}, # blank description - ) - ds = xr.open_zarr(output) - - assert_array_equal(ds["variant_DP"], [fill, fill, 14, 11, 10, 13, 9, fill, fill]) - assert "comment" not in ds["variant_DP"].attrs - - -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__field_number_A(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr( - path, - output, - max_alt_alleles=2, - fields=["INFO/AC"], - field_defs={"INFO/AC": {"Number": "A"}}, - ) - ds = xr.open_zarr(output) - - fill = INT_FILL - assert_array_equal( - ds["variant_AC"], - [ - [fill, fill], - [fill, fill], - [fill, fill], - [fill, fill], - [fill, fill], - [fill, fill], - [3, 1], - [fill, fill], - [fill, fill], - ], - ) - assert ( - ds["variant_AC"].attrs["comment"] - == "Allele count in genotypes, for each ALT allele, in the same order as listed" - ) - - -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__field_number_R(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr( - path, - output, - fields=["FORMAT/AD"], - field_defs={"FORMAT/AD": {"Number": "R"}}, - ) - ds = xr.open_zarr(output) - - # select a small region to check - ds = ds.set_index(variants="variant_position").sel( - variants=slice(10002764, 10002793) - ) - - fill = INT_FILL - ad = np.array( - [ - [[40, 14, 0, fill]], - [[fill, fill, fill, fill]], - [[65, 8, 5, 0]], - [[fill, fill, fill, fill]], - ], - ) - assert_array_equal(ds["call_AD"], ad) - assert ( - ds["call_AD"].attrs["comment"] - == "Allelic depths for the ref and alt alleles in the order listed" - ) - - -@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning") -def test_vcf_to_zarr__field_number_G(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - vcf_to_zarr(path, output, fields=["FORMAT/PL"]) - ds = xr.open_zarr(output) - - # select a small region to check - ds = ds.set_index(variants="variant_position").sel( - variants=slice(10002764, 10002793) - ) - - fill = INT_FILL - pl = np.array( - [ - [[319, 0, 1316, 440, 1358, 1798, 
fill, fill, fill, fill]], - [[0, 120, 1800, fill, fill, fill, fill, fill, fill, fill]], - [[8, 0, 1655, 103, 1743, 2955, 184, 1653, 1928, 1829]], - [[0, 0, 2225, fill, fill, fill, fill, fill, fill, fill]], - ], - ) - assert_array_equal(ds["call_PL"], pl) - assert ( - ds["call_PL"].attrs["comment"] - == "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification" - ) - - -def test_vcf_to_zarr__field_number_G_non_diploid(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "simple.output.mixed_depth.likelihoods.vcf") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - # store GL field as 2dp - encoding = { - "call_GL": { - "filters": [FixedScaleOffset(offset=0, scale=100, dtype="f4", astype="u1")] - } - } - vcf_to_zarr( - path, - output, - ploidy=4, - max_alt_alleles=3, - fields=["FORMAT/GL"], - encoding=encoding, - ) - ds = xr.open_zarr(output) - - # comb(n_alleles + ploidy - 1, ploidy) = comb(4 + 4 - 1, 4) = comb(7, 4) = 35 - assert_array_equal(ds["call_GL"].shape, (4, 3, 35)) - assert ds["call_GL"].attrs["comment"] == "Genotype likelihoods" - - -@pytest.mark.filterwarnings( - "ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning" -) -def test_vcf_to_zarr__field_number_fixed(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - # HQ Number is 2, and a dimension is automatically assigned (FORMAT_HQ_dim) - vcf_to_zarr( - path, - output, - fields=["FORMAT/HQ"], - ) - ds = xr.open_zarr(output) - - missing, fill = INT_MISSING, INT_FILL - assert_array_equal( - ds["call_HQ"], - [ - [[10, 15], [10, 10], [3, 3]], - [[10, 10], [10, 10], [3, 3]], - [[51, 51], [51, 51], [missing, missing]], - [[58, 50], [65, 3], [missing, missing]], - [[23, 27], [18, 2], [missing, missing]], - [[56, 60], [51, 51], [missing, missing]], - [[fill, fill], [fill, fill], [fill, fill]], - [[fill, fill], [fill, fill], [fill, fill]], - [[fill, fill], [fill, fill], [fill, fill]], - ], - ) - assert ds["call_HQ"].dims == ("variants", "samples", "FORMAT_HQ_dim") - assert ds["call_HQ"].attrs["comment"] == "Haplotype Quality" - - -def test_vcf_to_zarr__fields_errors(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - with pytest.raises( - ValueError, - match=r"VCF field must be prefixed with 'INFO/' or 'FORMAT/'", - ): - vcf_to_zarr(path, output, fields=["DP"]) - - with pytest.raises( - ValueError, - match=r"INFO field 'XX' is not defined in the header.", - ): - vcf_to_zarr(path, output, fields=["INFO/XX"]) - - with pytest.raises( - ValueError, - match=r"FORMAT field 'XX' is not defined in the header.", - ): - vcf_to_zarr(path, output, fields=["FORMAT/XX"]) - - with pytest.raises( - ValueError, - match=r"FORMAT field 'XX' is not defined in the header.", - ): - vcf_to_zarr(path, output, exclude_fields=["FORMAT/XX"]) - - with pytest.raises( - ValueError, - match=r"INFO field 'AC' is defined as Number '.', which is not supported. 
Consider specifying `field_defs` to provide a concrete size for this field.", - ): - vcf_to_zarr(path, output, fields=["INFO/AC"]) - - with pytest.raises( - ValueError, - match=r"INFO field 'AN' is defined as Type 'Blah', which is not supported.", - ): - vcf_to_zarr( - path, - output, - fields=["INFO/AN"], - field_defs={"INFO/AN": {"Type": "Blah"}}, + dtype="i1", ) + nt.assert_array_equal(ds["call_genotype"], call_genotype) + nt.assert_array_equal(ds["call_genotype_mask"], call_genotype < 0) - -@pytest.mark.parametrize( - "vcf_file, expected_sizes", - [ - ( - "sample.vcf.gz", - { - "max_alt_alleles": 3, - "field_defs": {"INFO/AC": {"Number": 2}, "INFO/AF": {"Number": 2}}, - "ploidy": 2, - }, - ), - ("mixed.vcf.gz", {"max_alt_alleles": 1, "ploidy": 4}), - ("no_genotypes.vcf", {"max_alt_alleles": 1}), - ( - "CEUTrio.20.21.gatk3.4.g.vcf.bgz", - { - "max_alt_alleles": 7, - "field_defs": {"FORMAT/AD": {"Number": 8}}, - "ploidy": 2, - }, - ), - ], -) -def test_zarr_array_sizes(shared_datadir, vcf_file, expected_sizes): - path = path_for_test(shared_datadir, vcf_file) - sizes = zarr_array_sizes(path) - assert sizes == expected_sizes - - -def test_zarr_array_sizes__parallel(shared_datadir): - path = path_for_test(shared_datadir, "CEUTrio.20.21.gatk3.4.g.vcf.bgz") - regions = ["20", "21"] - sizes = zarr_array_sizes(path, regions=regions) - assert sizes == { - "max_alt_alleles": 7, - "field_defs": {"FORMAT/AD": {"Number": 8}}, - "ploidy": 2, - } - - -def test_zarr_array_sizes__multiple(shared_datadir): - paths = [ - path_for_test(shared_datadir, "CEUTrio.20.gatk3.4.g.vcf.bgz"), - path_for_test(shared_datadir, "CEUTrio.21.gatk3.4.g.vcf.bgz"), - ] - sizes = zarr_array_sizes(paths, target_part_size=None) - assert sizes == { - "max_alt_alleles": 7, - "field_defs": {"FORMAT/AD": {"Number": 8}}, - "ploidy": 2, - } - - -def test_zarr_array_sizes__parallel_partitioned_by_size(shared_datadir): - path = path_for_test( - shared_datadir, - "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz", - ) - sizes = zarr_array_sizes(path, target_part_size="4MB") - assert sizes == { - "max_alt_alleles": 3, - "field_defs": {"FORMAT/AD": {"Number": 4}}, - "ploidy": 2, - } - - -@pytest.mark.parametrize( - "all_kwargs, expected_sizes", - [ - ([{"max_alt_alleles": 1}, {"max_alt_alleles": 2}], {"max_alt_alleles": 2}), - ( - [{"max_alt_alleles": 1, "ploidy": 3}, {"max_alt_alleles": 2}], - {"max_alt_alleles": 2, "ploidy": 3}, - ), - ( + def test_call_genotype_phased(self, ds): + call_genotype_phased = np.array( [ - {"max_alt_alleles": 1, "field_defs": {"FORMAT/AD": {"Number": 8}}}, - {"max_alt_alleles": 2, "field_defs": {"FORMAT/AD": {"Number": 6}}}, + [True, True, False], + [True, True, False], + [True, True, False], + [True, True, False], + [True, True, False], + [True, True, False], + [False, False, False], + [False, True, False], + [True, False, True], ], - {"max_alt_alleles": 2, "field_defs": {"FORMAT/AD": {"Number": 8}}}, - ), - ], -) -def test_merge_zarr_array_sizes(all_kwargs, expected_sizes): - assert merge_zarr_array_sizes(all_kwargs) == expected_sizes - - -def check_field(group, name, ndim, shape, dimension_names, dtype): - assert group[name].ndim == ndim - assert group[name].shape == shape - assert group[name].attrs["_ARRAY_DIMENSIONS"] == dimension_names - if dtype == str: - assert group[name].dtype == np.object_ - assert VLenUTF8() in group[name].filters - else: - assert group[name].dtype == dtype - - -@pytest.mark.filterwarnings( - "ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning" -) 
-def test_spec(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample_multiple_filters.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - - kwargs = zarr_array_sizes(path) - vcf_to_zarr( - path, - output, - chunk_length=5, - fields=["INFO/*", "FORMAT/*"], - mixed_ploidy=True, - **kwargs, - ) - - variants = 9 - alt_alleles = 3 - samples = 3 - ploidy = 2 - - group = zarr.open_group(output) - - # VCF Zarr group attributes - assert group.attrs["vcf_zarr_version"] == "0.2" - assert group.attrs["vcf_header"].startswith("##fileformat=VCFv4.0") - assert group.attrs["contigs"] == ["19", "20", "X"] - - # VCF Zarr arrays - assert set(list(group.array_keys())) == set( - [ - "variant_contig", - "variant_position", - "variant_id", - "variant_id_mask", - "variant_allele", - "variant_quality", - "variant_filter", - "variant_AA", - "variant_AC", - "variant_AF", - "variant_AN", - "variant_DB", - "variant_DP", - "variant_H2", - "variant_NS", - "call_DP", - "call_GQ", - "call_genotype", - "call_genotype_mask", - "call_genotype_fill", - "call_genotype_phased", - "call_HQ", - "contig_id", - "filter_id", - "sample_id", - ] - ) - - # Fixed fields - check_field( - group, - "variant_contig", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.int8, - ) - check_field( - group, - "variant_position", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.int32, - ) - check_field( - group, - "variant_id", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=str, - ) - check_field( - group, - "variant_allele", - ndim=2, - shape=(variants, alt_alleles + 1), - dimension_names=["variants", "alleles"], - dtype=str, - ) - check_field( - group, - "variant_quality", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.float32, - ) - check_field( - group, - "variant_filter", - ndim=2, - shape=(variants, 3), - dimension_names=["variants", "filters"], - dtype=bool, - ) - - # INFO fields - check_field( - group, - "variant_AA", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=str, - ) - check_field( - group, - "variant_AC", - ndim=2, - shape=(variants, 2), - dimension_names=["variants", "INFO_AC_dim"], - dtype=np.int32, - ) - check_field( - group, - "variant_AF", - ndim=2, - shape=(variants, 2), - dimension_names=["variants", "INFO_AF_dim"], - dtype=np.float32, - ) - check_field( - group, - "variant_AN", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.int32, - ) - check_field( - group, - "variant_DB", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=bool, - ) - check_field( - group, - "variant_DP", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.int32, - ) - check_field( - group, - "variant_H2", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=bool, - ) - check_field( - group, - "variant_NS", - ndim=1, - shape=(variants,), - dimension_names=["variants"], - dtype=np.int32, - ) - - # FORMAT fields - check_field( - group, - "call_DP", - ndim=2, - shape=(variants, samples), - dimension_names=["variants", "samples"], - dtype=np.int32, - ) - check_field( - group, - "call_GQ", - ndim=2, - shape=(variants, samples), - dimension_names=["variants", "samples"], - dtype=np.int32, - ) - check_field( - group, - "call_HQ", - ndim=3, - shape=(variants, samples, 2), - dimension_names=["variants", "samples", "FORMAT_HQ_dim"], - dtype=np.int32, - ) - check_field( - group, - "call_genotype", - ndim=3, - shape=(variants, 
samples, ploidy), - dimension_names=["variants", "samples", "ploidy"], - dtype=np.int8, - ) - check_field( - group, - "call_genotype_phased", - ndim=2, - shape=(variants, samples), - dimension_names=["variants", "samples"], - dtype=bool, - ) - - # Sample information - check_field( - group, - "sample_id", - ndim=1, - shape=(samples,), - dimension_names=["samples"], - dtype=str, - ) - - # Array values - assert_array_equal(group["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2]) - assert_array_equal( - group["variant_position"], - [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10], - ) - assert_array_equal( - group["variant_id"], - [".", ".", "rs6054257", ".", "rs6040355", ".", "microsat1", ".", "rsTest"], - ) - assert_array_equal( - group["variant_allele"], - [ - ["A", "C", "", ""], - ["A", "G", "", ""], - ["G", "A", "", ""], - ["T", "A", "", ""], - ["A", "G", "T", ""], - ["T", "", "", ""], - ["G", "GA", "GAC", ""], - ["T", "", "", ""], - ["AC", "A", "ATG", "C"], - ], - ) - assert_allclose( - group["variant_quality"], [9.6, 10.0, 29.0, 3.0, 67.0, 47.0, 50.0, np.nan, 10.0] - ) - assert ( - group["variant_quality"][:].view(np.int32)[7] - == np.array([0x7F800001], dtype=np.int32).item() - ) # missing nan - assert_array_equal( - group["variant_filter"], - [ - [False, False, False], - [False, False, False], - [True, False, False], - [False, True, True], - [True, False, False], - [True, False, False], - [True, False, False], - [False, False, False], - [True, False, False], - ], - ) - - assert_array_equal( - group["variant_NS"], - [INT_FILL, INT_FILL, 3, 3, 2, 3, 3, INT_FILL, INT_FILL], - ) + dtype=bool, + ) + nt.assert_array_equal(ds["call_genotype_phased"], call_genotype_phased) - assert_array_equal( - group["call_DP"], - [ - [INT_FILL, INT_FILL, INT_FILL], - [INT_FILL, INT_FILL, INT_FILL], + def test_call_DP(self, ds): + call_DP = [ + [-1, -1, -1], + [-1, -1, -1], [1, 8, 5], [3, 5, 3], [6, 0, 4], - [INT_MISSING, 4, 2], + [-1, 4, 2], [4, 2, 3], - [INT_FILL, INT_FILL, INT_FILL], - [INT_FILL, INT_FILL, INT_FILL], - ], - ) - assert_array_equal( - group["call_genotype"], - [ - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [0, 0], [0, 1]], - [[0, 0], [1, 0], [1, 1]], - [[0, 0], [0, 1], [0, 0]], - [[1, 2], [2, 1], [2, 2]], - [[0, 0], [0, 0], [0, 0]], - [[0, 1], [0, 2], [-1, -1]], - [[0, 0], [0, 0], [-1, -1]], - [[0, -2], [0, 1], [0, 2]], - ], - ) - assert_array_equal( - group["call_genotype_phased"], - [ - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [True, True, False], - [False, False, False], - [False, True, False], - [True, False, True], - ], - ) - - assert_array_equal(group["sample_id"], ["NA00001", "NA00002", "NA00003"]) - - -@pytest.mark.parametrize( - "retain_temp_files", - [True, False], -) -def test_vcf_to_zarr__retain_files(shared_datadir, tmp_path, retain_temp_files): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - temp_path = tmp_path.joinpath("temp").as_posix() - - vcf_to_zarr( - path, - output, - chunk_length=5, - chunk_width=2, - tempdir=temp_path, - retain_temp_files=retain_temp_files, - target_part_size="500B", - ) - ds = xr.open_zarr(output) - assert_array_equal(ds["contig_id"], ["19", "20", "X"]) - assert (len(os.listdir(temp_path)) == 0) != retain_temp_files - - -def test_vcf_to_zarr__legacy_contig_and_filter_attrs(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "sample.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - 
- vcf_to_zarr(path, output, chunk_length=5, chunk_width=2) - ds = xr.open_zarr(output) - - # drop new contig_id and filter_id variables - ds = ds.drop_vars(["contig_id", "filter_id"]) - - # check that contigs and filters can still be retrieved (with a warning) - assert num_contigs(ds) == 3 - with pytest.warns(DeprecationWarning): - assert_array_equal(get_contigs(ds), np.array(["19", "20", "X"], dtype="S")) - with pytest.warns(DeprecationWarning): - assert_array_equal(get_filters(ds), np.array(["PASS", "s50", "q10"], dtype="S")) - - -def test_vcf_to_zarr__no_samples(shared_datadir, tmp_path): - path = path_for_test(shared_datadir, "no_samples.vcf.gz") - output = tmp_path.joinpath("vcf.zarr").as_posix() - vcf_to_zarr(path, output) - # Run with many parts to test concat_zarrs path also accepts no samples - vcf_to_zarr(path, output, target_part_size="1k") - ds = xr.open_zarr(output) - assert_array_equal(ds["sample_id"], []) - assert_array_equal(ds["contig_id"], ["1"]) - assert ds.sizes["variants"] == 973 - - -# TODO take out some of these, they take far too long -@pytest.mark.parametrize( - "vcf_name", - [ - "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz", - "CEUTrio.20.21.gatk3.4.csi.g.vcf.bgz", - "CEUTrio.20.21.gatk3.4.g.bcf", - "CEUTrio.20.21.gatk3.4.g.vcf.bgz", - "CEUTrio.20.gatk3.4.g.vcf.bgz", - "CEUTrio.21.gatk3.4.g.vcf.bgz", - "sample_multiple_filters.vcf.gz", - "sample.vcf.gz", - "allele_overflow.vcf.gz", - ], -) -def test_compare_vcf_to_zarr_convert(shared_datadir, tmp_path, vcf_name): - vcf_path = path_for_test(shared_datadir, vcf_name) - zarr1_path = tmp_path.joinpath("vcf1.zarr").as_posix() - zarr2_path = tmp_path.joinpath("vcf2.zarr").as_posix() - - # Convert gets the actual number of alleles by default, so use this as the - # input for - convert_vcf([vcf_path], zarr2_path) - ds2 = load_dataset(zarr2_path) - vcf_to_zarr( - vcf_path, - zarr1_path, - mixed_ploidy=True, - max_alt_alleles=ds2.variant_allele.shape[1] - 1, - ) - ds1 = load_dataset(zarr1_path) - - # convert reads all variables by default. - base_vars = list(ds1) - ds2 = load_dataset(zarr2_path) - # print(ds1.call_genotype.values) - # print(ds2.call_genotype.values) - xr.testing.assert_equal(ds1, ds2[base_vars]) - - -@pytest.mark.parametrize( - "vcf_name", - [ - "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz", - "CEUTrio.20.21.gatk3.4.csi.g.vcf.bgz", - "CEUTrio.20.21.gatk3.4.g.bcf", - "CEUTrio.20.21.gatk3.4.g.vcf.bgz", - "CEUTrio.20.gatk3.4.g.vcf.bgz", - "CEUTrio.21.gatk3.4.g.vcf.bgz", - "sample_multiple_filters.vcf.gz", - "sample.vcf.gz", - "allele_overflow.vcf.gz", - ], -) -def test_validate_vcf(shared_datadir, tmp_path, vcf_name): - vcf_path = path_for_test(shared_datadir, vcf_name) - zarr_path = os.path.join("tmp/converted/", vcf_name, ".vcf.zarr") - # zarr_path = tmp_path.joinpath("vcf.zarr").as_posix() - print("converting", zarr_path) - convert_vcf([vcf_path], zarr_path) - # validate([vcf_path], zarr_path) + [-1, -1, -1], + [-1, -1, -1], + ] + nt.assert_array_equal(ds["call_DP"], call_DP) + def test_call_HQ(self, ds): + call_HQ = [ + [[10, 15], [10, 10], [3, 3]], + [[10, 10], [10, 10], [3, 3]], + [[51, 51], [51, 51], [-1, -1]], + [[58, 50], [65, 3], [-1, -1]], + [[23, 27], [18, 2], [-1, -1]], + [[56, 60], [51, 51], [-1, -1]], + [[-1, -1], [-1, -1], [-1, -1]], + [[-1, -1], [-1, -1], [-1, -1]], + [[-1, -1], [-1, -1], [-1, -1]], + ] + nt.assert_array_equal(ds["call_HQ"], call_HQ)
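
For reference, a minimal sketch of the conversion flow exercised by the class-scoped "ds" fixture above, runnable outside pytest. The calls vcf.convert_vcf and sg.load_dataset and the input path tests/data/vcf/sample.vcf.gz are taken directly from the new tests; the /tmp output path is only an illustrative assumption.

    import sgkit as sg
    from bio2zarr import vcf

    # Convert the bundled sample VCF into a VCF-Zarr store.
    # The output location is a scratch path chosen for illustration.
    out = "/tmp/sample.vcf.zarr"
    vcf.convert_vcf(["tests/data/vcf/sample.vcf.gz"], out)

    # Load the store as an xarray dataset via sgkit and inspect one of the
    # fields the tests assert on, e.g. the variant positions.
    ds = sg.load_dataset(out)
    print(ds["variant_position"].values)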