diff --git a/parsers/soda/.gitignore b/parsers/soda/.gitignore
new file mode 100644
index 0000000..4548f02
--- /dev/null
+++ b/parsers/soda/.gitignore
@@ -0,0 +1,140 @@
+# vscode
+.vscode/
+images/
+annotations/
+*gz
+*zip
+cifar-10-batches-py/
+main.py
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+
+**/.DS_Store
\ No newline at end of file
diff --git a/parsers/soda/README.md b/parsers/soda/README.md
new file mode 100644
index 0000000..b3b5049
--- /dev/null
+++ b/parsers/soda/README.md
@@ -0,0 +1,43 @@
+# SODA10M Dataset
+
+## Get the Dataset
+You can download the labeled dataset from this [link](https://drive.google.com/file/d/1oSJ0rbqNHLmlOOzpmQqXLDraCCQss4Q4/view?usp=sharing). The link contains both the training and the validation datasets along with their annotations.
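+
+## Annotation Format
+The parser reads the COCO-style `instance_train.json` / `instance_val.json` files. The sketch below is inferred from `parser.py` and only shows the fields the parser actually touches; the concrete values are placeholders, not real SODA10M entries.
+
+```python
+# Minimal shape of the annotation JSON that SODAParser.parse_annotations expects.
+annotation_file = {
+    "images": [
+        # one entry per image; the parser matches annotations against the
+        # 1-based position of the image in this list
+        {"file_name": "example_image.jpg", "width": 1920, "height": 1080},
+    ],
+    "annotations": [
+        # bbox is [x, y, width, height]; category_id is a 1-based index
+        # into the "categories" list below
+        {"image_id": 1, "category_id": 1, "bbox": [100, 200, 50, 80]},
+    ],
+    "categories": [
+        {"name": "Pedestrian"},
+    ],
+}
+```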
+
+## Usage
+
+```python
+from parsers.soda.parser import SODAParser
+
+parser = SODAParser(
+    images_dir='path_to_image_dir/SSLAD-2d/labelled/train',
+    annotation_dir='path_to_image_dir/SSLAD-2d/labelled/annotations',
+    dataset_name='soda10m',
+)
+
+# parse the annotation file and build one record per image
+parsed_annotations = parser.parse_annotations()
+
+# write the parsed annotations to Darwin JSON files
+parser.save_to_json(
+    path_to_save='path',
+    dir_name="annotationFolder",
+)
+
+# upload the images and annotations to Darwin
+api_key = "your_v7_api_key"
+parser.upload_to_darwin(
+    api_key=api_key,
+    image_dir='path_to_image_dir/SSLAD-2d/labelled/train',
+    json_dir='path/annotationFolder',
+)
+
+# list the parsed filenames
+print(parser.filename)
+
+# get the annotation record for a single image by index
+print(parser.get_annotations(100))
+```
+
diff --git a/parsers/soda/__init__.py b/parsers/soda/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/parsers/soda/parser.py b/parsers/soda/parser.py
new file mode 100644
index 0000000..d4e0ee3
--- /dev/null
+++ b/parsers/soda/parser.py
@@ -0,0 +1,116 @@
+import json
+import os
+from dataclasses import asdict
+from glob import glob
+from pathlib import Path
+
+import darwin.exceptions
+import darwin.importer as importer
+from darwin.client import Client
+from darwin.importer import formats
+from tqdm import tqdm
+
+from ..datatypes import Annotation, BoundingBox, Image, ImageAnnotationFile
+
+
+class SODAParser:
+    ALLOWED_PATHS = ["/train", "/test", "/val"]
+
+    def __init__(
+        self,
+        images_dir: Path,
+        annotation_dir: Path,
+        dataset_name: str,
+        path: str = "/train",
+    ):
+        self.images_dir = images_dir
+        self.annotation_dir = annotation_dir
+        self.dataset_name = dataset_name
+        self.annotations = {}
+        self.filename = []
+
+        if path not in self.ALLOWED_PATHS:
+            raise ValueError(f"path should be one of {self.ALLOWED_PATHS}")
+
+        self.path = path
+
+    def parse_annotations(self, data_type="train", save_annotation=True) -> dict:
+        """Parse the COCO-style annotation file and build one record per image."""
+        if data_type == "train":
+            json_data = f"{self.annotation_dir}/instance_train.json"
+        else:
+            json_data = f"{self.annotation_dir}/instance_val.json"
+
+        with open(json_data) as f:
+            data = json.load(f)
+
+        for idx, file in tqdm(
+            enumerate(data["images"], 1), total=len(data["images"]), desc="parsing"
+        ):
+            coor = []
+            self.filename.append(file["file_name"].split(".")[0])
+            for an in data["annotations"]:
+                if an and an["image_id"] == idx:
+                    coor.append(
+                        Annotation(
+                            name=data["categories"][an["category_id"] - 1]["name"]
+                        ).add_data(
+                            BoundingBox(
+                                x=int(an["bbox"][0]),
+                                y=int(an["bbox"][1]),
+                                w=int(an["bbox"][2]),
+                                h=int(an["bbox"][3]),
+                            )
+                        )
+                    )
+            ann = ImageAnnotationFile(
+                dataset=self.dataset_name,
+                image=Image(
+                    width=int(file["width"]),
+                    height=int(file["height"]),
+                    original_filename=file["file_name"],
+                    filename=file["file_name"],
+                    path=self.path,
+                ),
+                annotations=coor,
+            )
+            if save_annotation:
+                self.annotations[file["file_name"].split(".")[0]] = ann
+        return self.annotations
+
+    def get_annotations(self, idx: int) -> ImageAnnotationFile:
+        """Return the parsed annotation record for the image at position idx."""
+        return self.annotations[self.filename[idx]]
+
+    def save_to_json(self, path_to_save="", dir_name="annotationFolder"):
+        """Write every parsed annotation to its own Darwin JSON file."""
+        path = f"{path_to_save}/{dir_name}"
+        os.makedirs(path, exist_ok=True)
+        for idx in tqdm(range(len(self.filename)), desc="Creating JSON file"):
+            ann = self.get_annotations(idx)
+            filename = self.filename[idx]
+            json_object = json.dumps(asdict(ann), indent=4)
+            with open(f"{path}/{filename}.json", "w") as outfile:
+                outfile.write(json_object)
+
+    def upload_to_darwin(self, api_key: str, image_dir: Path, json_dir: Path):
+        """Push the images to Darwin and import the generated JSON annotations."""
+        images = glob(f"{image_dir}/*.jpg")
+        annotations = glob(f"{json_dir}/*.json")
+        client = Client.from_api_key(api_key)
+        dataset_identifier = f"{client.default_team}/{self.dataset_name}"
+        try:
+            dataset = client.create_dataset(self.dataset_name)
+        except darwin.exceptions.NameTaken:
+            dataset = client.get_remote_dataset(dataset_identifier)
+        dataset.push(images, path=self.path)
+        importer.import_annotations(
+            dataset,
+            formats.darwin.parse_file,
+            annotations,
+            append=True,
+        )