Soda10m parser #6

Open
wants to merge 15 commits into main
140 changes: 140 additions & 0 deletions parsers/soda/.gitignore
@@ -0,0 +1,140 @@
# vscode
.vscode/

# dataset files and local scratch
images/
annotations/
*.gz
*.zip
cifar-10-batches-py/
main.py

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/


**/.DS_Store
43 changes: 43 additions & 0 deletions parsers/soda/README.md
@@ -0,0 +1,43 @@
# SODA10M Dataset

## Get the Dataset
You can download the labeled dataset from this [link](https://drive.google.com/file/d/1oSJ0rbqNHLmlOOzpmQqXLDraCCQss4Q4/view?usp=sharing). The archive contains both the training and validation sets, along with their annotations.
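
After extracting the archive, the examples below assume the `SSLAD-2d/labelled` layout with `train`, `val`, and `annotations` folders. A minimal sketch to sanity-check the extracted tree (the root path is a placeholder, and the `val` folder name is an assumption based on the validation annotation file):

```python
from pathlib import Path

# placeholder: point this at your extracted archive
root = Path('path_to_image_dir/SSLAD-2d/labelled')

# train/ and annotations/ follow the usage example below; val/ is assumed
for sub in ('train', 'val', 'annotations'):
    assert (root / sub).is_dir(), f'missing {root / sub}'

print(len(list((root / 'train').glob('*.jpg'))), 'training images found')
```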

## Usage

```python
from parsers.soda.parser import SODAParser

parser = SODAParser(
    images_dir='path_to_image_dir/SSLAD-2d/labelled/train',
    annotation_dir='path_to_image_dir/SSLAD-2d/labelled/annotations',
    dataset_name='soda10m',
)

# parse the annotations into one ImageAnnotationFile per image
parsed_annotations = parser.parse_annotations()

# write the annotations to JSON, one file per image
parser.save_to_json(
    path_to_save='path',
    dir_name='annotationFolder',
)

# upload images and annotations to Darwin
parser.upload_to_darwin(
    api_key=api_key,
    image_dir='path_to_image_dir/SSLAD-2d/labelled/train',
    json_dir='path/annotationFolder',
)

# list the parsed image filenames (without extensions)
print(parser.filename)

# get the annotations for a single image by index
print(parser.get_annotations(100))
```
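
`save_to_json` writes one JSON file per image. A minimal sketch for inspecting the output, assuming the dataclasses serialize bounding boxes under a `bounding_box` key as in Darwin's JSON annotation format:

```python
import json
from pathlib import Path

out_dir = Path('path/annotationFolder')

# grab any generated file and inspect it
sample = next(out_dir.glob('*.json'))
ann = json.loads(sample.read_text())

print(ann['dataset'], ann['image']['filename'])
for a in ann['annotations']:
    # 'bounding_box' is assumed from Darwin's JSON format;
    # check the datatypes module for the exact field name
    print(a['name'], a.get('bounding_box'))
```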

Empty file added parsers/soda/__init__.py
Empty file.
116 changes: 116 additions & 0 deletions parsers/soda/parser.py
@@ -0,0 +1,116 @@
import json
import os
from dataclasses import asdict
from glob import glob
from pathlib import Path

import darwin.importer as importer
from darwin.client import Client
from darwin.exceptions import NameTaken
from darwin.importer import formats
from tqdm import tqdm

from ..datatypes import Annotation, BoundingBox, Image, ImageAnnotationFile


class SODAParser:
    ALLOWED_PATHS = ["/train", "/test", "/val"]

    def __init__(
        self,
        images_dir: Path,
        annotation_dir: Path,
        dataset_name: str,
        path: str = "/train",
    ):
        self.images_dir = images_dir
        self.annotation_dir = annotation_dir
        self.dataset_name = dataset_name
        # per-instance state (class-level mutable defaults would be shared
        # across every SODAParser instance)
        self.annotations = {}
        self.filename = []

        if path not in self.ALLOWED_PATHS:
            raise ValueError(f"path should be one of {self.ALLOWED_PATHS}")

        self.path = path

    def parse_annotations(self, data_type="train", save_annotation=True) -> dict:
        """Parse the COCO-style instance file into one ImageAnnotationFile per image."""
        if data_type == "train":
            json_path = f"{self.annotation_dir}/instance_train.json"
        else:
            json_path = f"{self.annotation_dir}/instance_val.json"

        with open(json_path) as f:
            data = json.load(f)

        for idx, file in tqdm(
            enumerate(data["images"], 1), total=len(data["images"]), desc="parsing"
        ):
            stem = file["file_name"].split(".")[0]
            self.filename.append(stem)

            # collect every bounding box that belongs to this image
            # (matching assumes image ids in the file are 1-based and sequential)
            boxes = []
            for an in data["annotations"]:
                if an and an["image_id"] == idx:
                    boxes.append(
                        Annotation(
                            name=data["categories"][an["category_id"] - 1]["name"]
                        ).add_data(
                            BoundingBox(
                                x=int(an["bbox"][0]),
                                y=int(an["bbox"][1]),
                                w=int(an["bbox"][2]),
                                h=int(an["bbox"][3]),
                            )
                        )
                    )

            ann = ImageAnnotationFile(
                dataset=self.dataset_name,
                image=Image(
                    width=int(file["width"]),
                    height=int(file["height"]),
                    original_filename=file["file_name"],
                    filename=file["file_name"],
                    path=self.path,
                ),
                annotations=boxes,
            )
            if save_annotation:
                self.annotations[stem] = ann

        return self.annotations

    def get_annotations(self, idx: int) -> ImageAnnotationFile:
        """Return the parsed annotations for the idx-th image."""
        return self.annotations[self.filename[idx]]

    def save_to_json(self, path_to_save="", dir_name="annotationFolder"):
        """Write one Darwin JSON file per parsed image."""
        path = f"{path_to_save}/{dir_name}"
        os.makedirs(path, exist_ok=True)
        for idx in tqdm(range(len(self.filename)), desc="Creating JSON files"):
            ann = self.get_annotations(idx)
            filename = self.filename[idx]
            with open(f"{path}/{filename}.json", "w") as outfile:
                json.dump(asdict(ann), outfile, indent=4)

    def upload_to_darwin(self, api_key: str, image_dir: Path, json_dir: Path):
        """Push the images to Darwin and import the generated annotations."""
        images = glob(f"{image_dir}/*.jpg")
        annotations = glob(f"{json_dir}/*.json")

        client = Client.from_api_key(api_key)
        dataset_identifier = f"{client.default_team}/{self.dataset_name}"
        try:
            dataset = client.create_dataset(self.dataset_name)
        except NameTaken:
            # the dataset already exists on the team, so reuse it
            dataset = client.get_remote_dataset(dataset_identifier)

        dataset.push(images, path=self.path)
        importer.import_annotations(
            dataset,
            formats.darwin.parse_file,
            annotations,
            append=True,
        )