add gtsdb and stsd datasets (#354)
* add gtsdb and stsd datasets

* use object instead of np.object

* update tests

* add setuptools
ad12 authored May 8, 2023
1 parent bf1f0df commit 9ba766e
Showing 5 changed files with 234 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -18,3 +18,4 @@ jinja2
 pydata-sphinx-theme==0.8.1
 sphinx==4.5.0
 sphinx_remove_toctrees
+setuptools
4 changes: 4 additions & 0 deletions meerkat/datasets/__init__.py
@@ -6,6 +6,7 @@
 from .coco import coco
 from .expw import expw
 from .fer import fer
+from .gtsdb import gtsdb
 from .imagenet import imagenet
 from .imagenette import imagenette
 from .lra import pathfinder
@@ -17,6 +18,7 @@
 from .registry import datasets
 from .rfw import rfw
 from .siim_cxr import siim_cxr
+from .stsd import stsd
 from .torchaudio import yesno

 __all__ = [
@@ -35,6 +37,8 @@
     "yesno",
     "siim_cxr",
     "pathfinder",
+    "gtsdb",
+    "stsd",
 ]

 DOWNLOAD_MODES = ["force", "extract", "reuse", "skip"]
96 changes: 96 additions & 0 deletions meerkat/datasets/gtsdb/__init__.py
@@ -0,0 +1,96 @@
import os
import re

import pandas as pd

import meerkat as mk

from ..abstract import DatasetBuilder
from ..info import DatasetInfo
from ..registry import datasets
from ..utils import download_url, extract

_URL = "https://sid.erda.dk/public/archives/ff17dc924eba88d5d01a807357d6614c/FullIJCNN{version}.zip" # noqa: E501


@datasets.register()
class gtsdb(DatasetBuilder):
"""German Traffic Sign Detection Benchmark GTSDB."""

VERSIONS = ["2013"]

info = DatasetInfo(
name="gtsdb",
full_name="German Traffic Sign Detection Benchmark GTSDB",
description=("Image data set to detect street signs."),
homepage="https://sid.erda.dk/public/archives/ff17dc924eba88d5d01a807357d6614c/published-archive.html", # noqa: E501
tags=["image", "object recognition"],
citation=None,
)

    def build(self):
        """Build the GTSDB dataframe from the extracted FullIJCNN archive."""
        folder = os.path.join(self.dataset_dir, f"FullIJCNN{self.version}")
        gt_ann = pd.read_csv(os.path.join(folder, "gt.txt"), sep=";", header=None)

        # Format categories
        readme = os.path.join(folder, "ReadMe.txt")
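        # Category lines in the ReadMe look like "0 = speed limit 20 (prohibitory)";
        # the trailing parenthesized token becomes the supercategory.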
        with open(readme, "r") as f:
            lines = [
                x.strip() for x in f.readlines() if re.match("^[0-9]* = .*$", x.strip())
            ]

        categories = []
        for line in lines:
            category_id, category_full_name = line.split(" = ")
            category_id = int(category_id)
            category_full_name = category_full_name.strip()

            category_name, supercategory = category_full_name.rsplit(" ", 1)
            category_name = category_name.strip()
            supercategory = supercategory.strip().strip("(").strip(")")

            categories.append(
                {
                    "category_id": category_id,
                    "category": category_name,
                    "supercategory": supercategory,
                }
            )

        categories = pd.DataFrame(categories)

        # Format dataframe
        df = gt_ann.rename(
            {0: "filename", 1: "x1", 2: "y1", 3: "x2", 4: "y2", 5: "category_id"},
            axis=1,
        )
        df = df.merge(categories, on="category_id")

        # Split
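        # The first 600 images form the train split; the remainder are test.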
        images_files = sorted([x for x in os.listdir(folder) if x.endswith(".ppm")])
        image_df = pd.DataFrame({"filename": images_files})
        image_df["split"] = "train"
        image_df.loc[600:, "split"] = "test"
        df = df.merge(image_df, on="filename")

        df = mk.DataFrame.from_pandas(df).drop("index")
        df["image"] = mk.files(df["filename"], base_dir=folder, type="image")
        df["image_crop"] = mk.defer(df, crop)
        return df

    def download(self):
        downloaded_path = download_url(
            _URL.format(version=self.version), self.dataset_dir
        )
        extract(downloaded_path, self.dataset_dir)

    def is_downloaded(self):
        return os.path.exists(self.dataset_dir) and os.path.exists(
            os.path.join(self.dataset_dir, f"FullIJCNN{self.version}")
        )


def crop(image, x1, y1, x2, y2):
    out = image.crop((x1, y1, x2, y2))
    return out
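For orientation, a minimal usage sketch (illustrative, not part of this commit) — it assumes meerkat's registry-backed `mk.get` entry point, through which the other builders in this package are loaded:

import meerkat as mk

# First call downloads and extracts FullIJCNN2013.zip, then runs build().
df = mk.get("gtsdb", version="2013")

# One row per annotated sign; "image" loads the full frame lazily and
# "image_crop" defers the bounding-box crop until the cell is accessed.
print(df[["filename", "category", "supercategory", "split"]].head())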
130 changes: 130 additions & 0 deletions meerkat/datasets/stsd/__init__.py
@@ -0,0 +1,130 @@
# Swedish Traffic Signs Dataset (STSD)

import os

import pandas as pd
from tqdm.auto import tqdm

import meerkat as mk

from ..abstract import DatasetBuilder
from ..info import DatasetInfo
from ..registry import datasets
from ..utils import download_url, extract

_SETS_TO_URLS = {
"Set1/annotations.txt": "http://www.isy.liu.se/cvl/research/trafficSigns/swedishSignsSummer/Set1/annotations.txt", # noqa: E501
"Set2/annotations.txt": "http://www.isy.liu.se/cvl/research/trafficSigns/swedishSignsSummer/Set2/annotations.txt", # noqa: E501
"Set1/Set1Part0": "http://www.isy.liu.se/cvl/research/trafficSigns/swedishSignsSummer/Set1/Set1Part0.zip", # noqa: E501
"Set2/Set2Part0": "http://www.isy.liu.se/cvl/research/trafficSigns/swedishSignsSummer/Set2/Set2Part0.zip", # noqa: E501
}


@datasets.register()
class stsd(DatasetBuilder):
"""Swedish Traffic Sign Dataset STSD."""

VERSIONS = ["2019"]

info = DatasetInfo(
name="stsd",
full_name="Swedish Traffic Sign Dataset STSD",
description=("Image data set to detect street signs."),
homepage="https://www.cvl.isy.liu.se/en/research/datasets/traffic-signs-dataset/download/", # noqa: E501
tags=["image", "object recognition"],
citation=None,
)

    def build(self):
        """Build the STSD dataframe from the downloaded annotations and images."""
        annotations = []
        for set_name in ["Set1", "Set2"]:
            ann_file = os.path.join(self.dataset_dir, f"{set_name}/annotations.txt")
            df = _format_annotations(ann_file)
            df["path"] = df["filename"].apply(
                lambda x: os.path.join(
                    self.dataset_dir, set_name, f"{set_name}Part0", x
                )
            )
            annotations.append(df)
        df = pd.concat(annotations).reset_index(drop=True)

        df = mk.DataFrame.from_pandas(df).drop("index")
        df["image"] = mk.files(
            df["path"],
            type="image",
        )
        df["image_crop"] = mk.defer(df, crop)
        return df

    def download(self):
        for relative_path, url in tqdm(_SETS_TO_URLS.items()):
            downloaded_path = download_url(url, self.dataset_dir)
            path = os.path.join(self.dataset_dir, relative_path)
            if url.endswith(".zip"):
                os.makedirs(path, exist_ok=True)
                extract(downloaded_path, path)
            else:
                os.makedirs(os.path.dirname(path), exist_ok=True)
                os.rename(downloaded_path, path)

    def is_downloaded(self):
        return os.path.exists(self.dataset_dir) and all(
            os.path.exists(os.path.join(self.dataset_dir, x)) for x in _SETS_TO_URLS
        )


def crop(image, x1, y1, x2, y2):
    # Don't crop the image if the crop coordinates aren't valid.
    if any(v == -1 for v in [x1, y1, x2, y2]):
        return image.copy()
    out = image.crop((x1, y1, x2, y2))
    return out


def _format_annotations(ann_file):
    annotations = []
    with open(ann_file, "r") as f:
        lines = f.readlines()
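    # Each annotation line has the form
    #   <filename>:<visibility>,<x2>,<y2>,<x1>,<y1>,<sign_type>,<name>;...
    # where an annotation may instead be the literal token MISC_SIGNS.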

    for line in lines:
        filename, anns = (x.strip() for x in line.split(":"))
        for ann in anns.split(";"):
            ann = ann.strip()
            if len(ann) == 0:
                continue
            if ann == "MISC_SIGNS":
                annotations.append(
                    {
                        "filename": filename,
                        "visibility": "N/A",
                        "x1": -1,
                        "y1": -1,
                        "x2": -1,
                        "y2": -1,
                        "sign_type": "MISC_SIGNS",
                        "category": "MISC_SIGNS",
                    }
                )
                continue
            visibility, x2, y2, x1, y1, sign_type, name = (
                x.strip() for x in ann.split(",")
            )
            # The annotation file is malformed for a few entries: a stray "l"
            # is appended to some coordinates, so keep only the part before it.
            x2, y2, x1, y1 = [
                x.split("l")[0] if "l" in x else x for x in [x2, y2, x1, y1]
            ]
            annotations.append(
                {
                    "filename": filename,
                    "visibility": visibility,
                    "x1": float(x1),
                    "y1": float(y1),
                    "x2": float(x2),
                    "y2": float(y2),
                    "sign_type": sign_type,
                    "category": name,
                }
            )
    return pd.DataFrame(annotations)
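A quick sanity check of `_format_annotations` (illustrative, not part of this commit; the sample lines are made up but follow the format the parser expects):

import tempfile

sample = (
    "image1.jpg:VISIBLE,100.5,200.5,50.5,150.5,INFORMATION,PASS_RIGHT_SIDE;\n"
    "image2.jpg:MISC_SIGNS\n"
)
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write(sample)

df = _format_annotations(f.name)
# Expect two rows: one real box, plus a MISC_SIGNS placeholder with -1 coords.
print(df[["filename", "category", "x1", "y1", "x2", "y2"]])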
4 changes: 3 additions & 1 deletion tests/meerkat/test_dataframe.py
@@ -929,6 +929,8 @@ def test_from_jsonl():
         data_to_compare = df_new[k]._data
         if k == "d":
             assert data_to_compare == data[k]
+        elif k == "b":
+            assert list(data_to_compare) == data[k]
         else:
             assert (data_to_compare == np.array(data[k])).all()
     temp_f.close()
@@ -1107,7 +1109,7 @@ def test_json_io(testbed, tmpdir):
             assert name not in df2
         else:
             assert name in df2
-            if col.to_numpy().dtype == np.object:
+            if col.to_numpy().dtype == "object":
                 assert np.all(df2[name].to_numpy() == col.to_numpy())
             else:
                 assert np.allclose(df2[name].to_numpy(), col.to_numpy())
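Context for the one-line fix in this hunk (illustrative, not part of the diff): `np.object` was a deprecated alias for the builtin `object`, removed in NumPy 1.24, while comparing a dtype against the string "object" works on every NumPy version:

import numpy as np

arr = np.array(["a", 1], dtype=object)
assert arr.dtype == "object"  # portable across NumPy versions
# arr.dtype == np.object      # AttributeError on NumPy >= 1.24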
