add gtsdb and stsd datasets (#354)
* add gtsdb and stsd datasets

* use object instead of np.object

* update tests

* add setuptools
ad12 authored May 8, 2023
1 parent bf1f0df commit 9ba766e
Showing 5 changed files with 234 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -18,3 +18,4 @@ jinja2
 pydata-sphinx-theme==0.8.1
 sphinx==4.5.0
 sphinx_remove_toctrees
+setuptools
4 changes: 4 additions & 0 deletions meerkat/datasets/__init__.py
@@ -6,6 +6,7 @@
 from .coco import coco
 from .expw import expw
 from .fer import fer
+from .gtsdb import gtsdb
 from .imagenet import imagenet
 from .imagenette import imagenette
 from .lra import pathfinder
@@ -17,6 +18,7 @@
 from .registry import datasets
 from .rfw import rfw
 from .siim_cxr import siim_cxr
+from .stsd import stsd
 from .torchaudio import yesno

 __all__ = [
@@ -35,6 +37,8 @@
     "yesno",
     "siim_cxr",
     "pathfinder",
+    "gtsdb",
+    "stsd",
 ]

 DOWNLOAD_MODES = ["force", "extract", "reuse", "skip"]
96 changes: 96 additions & 0 deletions meerkat/datasets/gtsdb/__init__.py
@@ -0,0 +1,96 @@
import os
import re

import pandas as pd

import meerkat as mk

from ..abstract import DatasetBuilder
from ..info import DatasetInfo
from ..registry import datasets
from ..utils import download_url, extract

_URL = "https://sid.erda.dk/public/archives/ff17dc924eba88d5d01a807357d6614c/FullIJCNN{version}.zip" # noqa: E501


@datasets.register()
class gtsdb(DatasetBuilder):
"""German Traffic Sign Detection Benchmark GTSDB."""

VERSIONS = ["2013"]

info = DatasetInfo(
name="gtsdb",
full_name="German Traffic Sign Detection Benchmark GTSDB",
description=("Image data set to detect street signs."),
homepage="https://sid.erda.dk/public/archives/ff17dc924eba88d5d01a807357d6614c/published-archive.html", # noqa: E501
tags=["image", "object recognition"],
citation=None,
)

    def build(self):
        """Build the GTSDB dataframe from the extracted FullIJCNN archive."""
        folder = os.path.join(self.dataset_dir, f"FullIJCNN{self.version}")
        gt_ann = pd.read_csv(os.path.join(folder, "gt.txt"), sep=";", header=None)

        # Format categories
        readme = os.path.join(folder, "ReadMe.txt")
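        # Category lines in the ReadMe look like "0 = speed limit 20 (prohibitory)";
        # the trailing parenthesized token becomes the supercategory.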
        with open(readme, "r") as f:
            lines = [
                x.strip() for x in f.readlines() if re.match("^[0-9]* = .*$", x.strip())
            ]

        categories = []
        for line in lines:
            category_id, category_full_name = line.split(" = ")
            category_id = int(category_id)
            category_full_name = category_full_name.strip()

            category_name, supercategory = category_full_name.rsplit(" ", 1)
            category_name = category_name.strip()
            supercategory = supercategory.strip().strip("(").strip(")")

            categories.append(
                {
                    "category_id": category_id,
                    "category": category_name,
                    "supercategory": supercategory,
                }
            )

        categories = pd.DataFrame(categories)

        # Format dataframe
        df = gt_ann.rename(
            {0: "filename", 1: "x1", 2: "y1", 3: "x2", 4: "y2", 5: "category_id"},
            axis=1,
        )
        df = df.merge(categories, on="category_id")

        # Split
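        # The first 600 images form the train split; the remainder are test.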
        images_files = sorted([x for x in os.listdir(folder) if x.endswith(".ppm")])
        image_df = pd.DataFrame({"filename": images_files})
        image_df["split"] = "train"
        image_df.loc[600:, "split"] = "test"
        df = df.merge(image_df, on="filename")

        df = mk.DataFrame.from_pandas(df).drop("index")
        df["image"] = mk.files(df["filename"], base_dir=folder, type="image")
        df["image_crop"] = mk.defer(df, crop)
        return df

    def download(self):
        downloaded_path = download_url(
            _URL.format(version=self.version), self.dataset_dir
        )
        extract(downloaded_path, self.dataset_dir)

    def is_downloaded(self):
        return os.path.exists(self.dataset_dir) and os.path.exists(
            os.path.join(self.dataset_dir, f"FullIJCNN{self.version}")
        )


def crop(image, x1, y1, x2, y2):
    out = image.crop((x1, y1, x2, y2))
    return out
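For orientation, a minimal usage sketch (illustrative, not part of this commit) — it assumes meerkat's registry-backed `mk.get` entry point, through which the other builders in this package are loaded:

import meerkat as mk

# First call downloads and extracts FullIJCNN2013.zip, then runs build().
df = mk.get("gtsdb", version="2013")

# One row per annotated sign; "image" loads the full frame lazily and
# "image_crop" defers the bounding-box crop until the cell is accessed.
print(df[["filename", "category", "supercategory", "split"]].head())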
130 changes: 130 additions & 0 deletions meerkat/datasets/stsd/__init__.py
@@ -0,0 +1,130 @@
# Swedish Traffic Signs Dataset (STSD)

import os

import pandas as pd
from tqdm.auto import tqdm

import meerkat as mk

from ..abstract import DatasetBuilder
from ..info import DatasetInfo
from ..registry import datasets
from ..utils import download_url, extract

_SETS_TO_URLS = {
"Set1/annotations.txt": "http://www.isy.liu.se/cvl/research/trafficSigns/swedishSignsSummer/Set1/annotations.txt", # noqa: E501
"Set2/annotations.txt": "http://www.isy.liu.se/cvl/research/trafficSigns/swedishSignsSummer/Set2/annotations.txt", # noqa: E501
"Set1/Set1Part0": "http://www.isy.liu.se/cvl/research/trafficSigns/swedishSignsSummer/Set1/Set1Part0.zip", # noqa: E501
"Set2/Set2Part0": "http://www.isy.liu.se/cvl/research/trafficSigns/swedishSignsSummer/Set2/Set2Part0.zip", # noqa: E501
}


@datasets.register()
class stsd(DatasetBuilder):
"""Swedish Traffic Sign Dataset STSD."""

VERSIONS = ["2019"]

info = DatasetInfo(
name="stsd",
full_name="Swedish Traffic Sign Dataset STSD",
description=("Image data set to detect street signs."),
homepage="https://www.cvl.isy.liu.se/en/research/datasets/traffic-signs-dataset/download/", # noqa: E501
tags=["image", "object recognition"],
citation=None,
)

    def build(self):
        """Build the STSD dataframe from the downloaded annotations and images."""
        annotations = []
        for set_name in ["Set1", "Set2"]:
            ann_file = os.path.join(self.dataset_dir, f"{set_name}/annotations.txt")
            df = _format_annotations(ann_file)
            df["path"] = df["filename"].apply(
                lambda x: os.path.join(
                    self.dataset_dir, set_name, f"{set_name}Part0", x
                )
            )
            annotations.append(df)
        df = pd.concat(annotations).reset_index(drop=True)

        df = mk.DataFrame.from_pandas(df).drop("index")
        df["image"] = mk.files(
            df["path"],
            type="image",
        )
        df["image_crop"] = mk.defer(df, crop)
        return df

    def download(self):
        for relative_path, url in tqdm(_SETS_TO_URLS.items()):
            downloaded_path = download_url(url, self.dataset_dir)
            path = os.path.join(self.dataset_dir, relative_path)
            if url.endswith(".zip"):
                os.makedirs(path, exist_ok=True)
                extract(downloaded_path, path)
            else:
                os.makedirs(os.path.dirname(path), exist_ok=True)
                os.rename(downloaded_path, path)

    def is_downloaded(self):
        return os.path.exists(self.dataset_dir) and all(
            os.path.exists(os.path.join(self.dataset_dir, x)) for x in _SETS_TO_URLS
        )


def crop(image, x1, y1, x2, y2):
    # Don't crop the image if the crop coordinates aren't valid.
    if any(v == -1 for v in [x1, y1, x2, y2]):
        return image.copy()
    out = image.crop((x1, y1, x2, y2))
    return out


def _format_annotations(ann_file):
    annotations = []
    with open(ann_file, "r") as f:
        lines = f.readlines()
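    # Each annotation line has the form
    #   <filename>:<visibility>,<x2>,<y2>,<x1>,<y1>,<sign_type>,<name>;...
    # where an annotation may instead be the literal token MISC_SIGNS.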

    for line in lines:
        filename, anns = (x.strip() for x in line.split(":"))
        for ann in anns.split(";"):
            ann = ann.strip()
            if len(ann) == 0:
                continue
            if ann == "MISC_SIGNS":
                annotations.append(
                    {
                        "filename": filename,
                        "visibility": "N/A",
                        "x1": -1,
                        "y1": -1,
                        "x2": -1,
                        "y2": -1,
                        "sign_type": "MISC_SIGNS",
                        "category": "MISC_SIGNS",
                    }
                )
                continue
            visibility, x2, y2, x1, y1, sign_type, name = (
                x.strip() for x in ann.split(",")
            )
            # The annotation file is malformed for a few entries: a stray "l"
            # is appended to some coordinates, so keep only the part before it.
            x2, y2, x1, y1 = [
                x.split("l")[0] if "l" in x else x for x in [x2, y2, x1, y1]
            ]
            annotations.append(
                {
                    "filename": filename,
                    "visibility": visibility,
                    "x1": float(x1),
                    "y1": float(y1),
                    "x2": float(x2),
                    "y2": float(y2),
                    "sign_type": sign_type,
                    "category": name,
                }
            )
    return pd.DataFrame(annotations)
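A quick sanity check of `_format_annotations` (illustrative, not part of this commit; the sample lines are made up but follow the format the parser expects):

import tempfile

sample = (
    "image1.jpg:VISIBLE,100.5,200.5,50.5,150.5,INFORMATION,PASS_RIGHT_SIDE;\n"
    "image2.jpg:MISC_SIGNS\n"
)
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write(sample)

df = _format_annotations(f.name)
# Expect two rows: one real box, plus a MISC_SIGNS placeholder with -1 coords.
print(df[["filename", "category", "x1", "y1", "x2", "y2"]])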
4 changes: 3 additions & 1 deletion tests/meerkat/test_dataframe.py
@@ -929,6 +929,8 @@ def test_from_jsonl():
         data_to_compare = df_new[k]._data
         if k == "d":
             assert data_to_compare == data[k]
+        elif k == "b":
+            assert list(data_to_compare) == data[k]
         else:
             assert (data_to_compare == np.array(data[k])).all()
     temp_f.close()
@@ -1107,7 +1109,7 @@ def test_json_io(testbed, tmpdir):
             assert name not in df2
         else:
             assert name in df2
-            if col.to_numpy().dtype == np.object:
+            if col.to_numpy().dtype == "object":
                 assert np.all(df2[name].to_numpy() == col.to_numpy())
             else:
                 assert np.allclose(df2[name].to_numpy(), col.to_numpy())
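Context for the one-line fix in this hunk (illustrative, not part of the diff): `np.object` was a deprecated alias for the builtin `object`, removed in NumPy 1.24, while comparing a dtype against the string "object" works on every NumPy version:

import numpy as np

arr = np.array(["a", 1], dtype=object)
assert arr.dtype == "object"  # portable across NumPy versions
# arr.dtype == np.object      # AttributeError on NumPy >= 1.24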
