This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

from_tensors support for VideoClassification (#1389)
Co-authored-by: Ethan Harris <[email protected]>
krshrimali and ethanwharris authored Sep 1, 2022
1 parent 0e9fdc0 commit 4674aba
Showing 5 changed files with 441 additions and 16 deletions.
123 changes: 121 additions & 2 deletions flash/video/classification/data.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union
from typing import Any, Callable, Collection, Dict, List, Optional, Sequence, Type, Union

import pandas as pd
import torch
@@ -41,6 +41,8 @@
VideoClassificationFilesInput,
VideoClassificationFoldersInput,
VideoClassificationPathsPredictInput,
VideoClassificationTensorsInput,
VideoClassificationTensorsPredictInput,
)
from flash.video.classification.input_transform import VideoClassificationInputTransform

@@ -63,6 +65,7 @@
"VideoClassificationData.from_folders",
"VideoClassificationData.from_data_frame",
"VideoClassificationData.from_csv",
"VideoClassificationData.from_tensors",
]
if not _VIDEO_EXTRAS_TESTING:
__doctest_skip__ += ["VideoClassificationData.from_fiftyone"]
@@ -395,7 +398,6 @@ def from_data_frame(
predict_data_frame: Optional[pd.DataFrame] = None,
predict_videos_root: Optional[str] = None,
predict_resolver: Optional[Callable[[str, str], str]] = None,
target_formatter: Optional[TargetFormatter] = None,
clip_sampler: Union[str, "ClipSampler"] = "random",
clip_duration: float = 2,
clip_sampler_kwargs: Dict[str, Any] = None,
@@ -404,6 +406,7 @@ def from_data_frame(
decoder: str = "pyav",
input_cls: Type[Input] = VideoClassificationDataFrameInput,
predict_input_cls: Type[Input] = VideoClassificationDataFramePredictInput,
target_formatter: Optional[TargetFormatter] = None,
transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
**data_module_kwargs: Any,
@@ -566,6 +569,122 @@ def from_data_frame(
**data_module_kwargs,
)

@classmethod
def from_tensors(
cls,
train_data: Optional[Union[Collection[torch.Tensor], torch.Tensor]] = None,
train_targets: Optional[Collection[Any]] = None,
val_data: Optional[Union[Collection[torch.Tensor], torch.Tensor]] = None,
val_targets: Optional[Sequence[Any]] = None,
test_data: Optional[Collection[torch.Tensor]] = None,
test_targets: Optional[Sequence[Any]] = None,
predict_data: Optional[Union[Collection[torch.Tensor], torch.Tensor]] = None,
target_formatter: Optional[TargetFormatter] = None,
video_sampler: Type[Sampler] = torch.utils.data.SequentialSampler,
input_cls: Type[Input] = VideoClassificationTensorsInput,
predict_input_cls: Type[Input] = VideoClassificationTensorsPredictInput,
transform: INPUT_TRANSFORM_TYPE = VideoClassificationInputTransform,
transform_kwargs: Optional[Dict] = None,
**data_module_kwargs: Any,
) -> "VideoClassificationData":
"""Load the :class:`~flash.video.classification.data.VideoClassificationData` from a dictionary containing
PyTorch tensors representing input video frames and their corresponding targets.
Input tensor(s) will be extracted from the ``input_field`` in the ``dict``.
The targets will be extracted from the ``target_fields`` in the ``dict`` and can be in any of our
:ref:`supported classification target formats <formatting_classification_targets>`.
To learn how to customize the transforms applied for each stage, read our
:ref:`customizing transforms guide <customizing_transforms>`.
Args:
train_data: The torch tensor or list of tensors to use when training.
train_targets: The list of targets to use when training.
val_data: The torch tensor or list of tensors to use when validating.
val_targets: The list of targets to use when validating.
test_data: The torch tensor or list of tensors to use when testing.
test_targets: The list of targets to use when testing.
predict_data: The torch tensor or list of tensors to use when predicting.
target_formatter: Optionally provide a :class:`~flash.core.data.utilities.classification.TargetFormatter` to
control how targets are handled. See :ref:`formatting_classification_targets` for more details.
video_sampler: Sampler for the internal video container. This defines the order in which the tensors are
used and, if necessary, the distributed split.
input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the data.
predict_input_cls: The :class:`~flash.core.data.io.input.Input` type to use for loading the prediction data.
transform: The :class:`~flash.core.data.io.input_transform.InputTransform` type to use.
transform_kwargs: Dict of keyword arguments to be provided when instantiating the transforms.
data_module_kwargs: Additional keyword arguments to provide to the
:class:`~flash.core.data.data_module.DataModule` constructor.
Returns:
The constructed :class:`~flash.video.classification.data.VideoClassificationData`.
Examples
________
.. doctest::
>>> import torch
>>> from flash import Trainer
>>> from flash.video import VideoClassifier, VideoClassificationData
>>> frame = torch.randint(low=0, high=255, size=(3, 5, 10, 10), dtype=torch.uint8, device="cpu")
>>> datamodule = VideoClassificationData.from_tensors(
... train_data=[frame, frame, frame],
... train_targets=["fruit", "vegetable", "fruit"],
... val_data=[frame, frame],
... val_targets=["vegetable", "fruit"],
... predict_data=[frame],
... batch_size=1,
... )
>>> datamodule.num_classes
2
>>> datamodule.labels
['fruit', 'vegetable']
>>> model = VideoClassifier(backbone="x3d_xs", num_classes=datamodule.num_classes)
>>> trainer = Trainer(fast_dev_run=True)
>>> trainer.fit(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
Training...
>>> trainer.predict(model, datamodule=datamodule) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
Predicting...
.. testcleanup::
>>> del frame
"""

train_input = input_cls(
RunningStage.TRAINING,
train_data,
train_targets,
video_sampler=video_sampler,
target_formatter=target_formatter,
)
target_formatter = getattr(train_input, "target_formatter", None)

return cls(
train_input,
input_cls(
RunningStage.VALIDATING,
val_data,
val_targets,
video_sampler=video_sampler,
target_formatter=target_formatter,
),
input_cls(
RunningStage.TESTING,
test_data,
test_targets,
video_sampler=video_sampler,
target_formatter=target_formatter,
),
predict_input_cls(RunningStage.PREDICTING, predict_data),
transform=transform,
transform_kwargs=transform_kwargs,
**data_module_kwargs,
)
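
A minimal usage sketch, not part of this commit: based on the tensor handling added in ``flash/video/classification/input.py`` below, ``from_tensors`` should also accept a single stacked tensor of shape ``(num_videos, C, T, H, W)`` in place of a list. Shapes and labels here are illustrative assumptions.

>>> import torch
>>> from flash.video import VideoClassificationData
>>> stack = torch.randint(low=0, high=255, size=(3, 3, 5, 10, 10), dtype=torch.uint8)  # (videos, C, T, H, W)
>>> datamodule = VideoClassificationData.from_tensors(
...     train_data=stack,  # a 5-D stack is split into one tensor per video
...     train_targets=["fruit", "vegetable", "fruit"],
...     batch_size=1,
... )
>>> datamodule.num_classes
2
>>> datamodule.labels
['fruit', 'vegetable']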

@classmethod
def from_csv(
cls,
Expand Down
104 changes: 101 additions & 3 deletions flash/video/classification/input.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Any, Callable, Dict, List, Optional, Type, Union
from typing import Any, Callable, Collection, Dict, List, Optional, Type, Union

import pandas as pd
import torch
@@ -21,7 +21,7 @@

from flash.core.data.io.classification_input import ClassificationInputMixin
from flash.core.data.io.input import DataKeys, Input, IterableInput
from flash.core.data.utilities.classification import MultiBinaryTargetFormatter, TargetFormatter
from flash.core.data.utilities.classification import _is_list_like, MultiBinaryTargetFormatter, TargetFormatter
from flash.core.data.utilities.data_frame import resolve_files, resolve_targets
from flash.core.data.utilities.loading import load_data_frame
from flash.core.data.utilities.paths import list_valid_files, make_dataset, PATH_TYPE
@@ -40,8 +40,17 @@
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.data.labeled_video_dataset import LabeledVideoDataset
from pytorchvideo.data.labeled_video_paths import LabeledVideoPaths

from flash.video.classification.utils import LabeledVideoTensorDataset

else:
ClipSampler, LabeledVideoDataset, EncodedVideo, ApplyTransformToKey = None, None, None, None
ClipSampler, LabeledVideoDataset, LabeledVideoTensorDataset, EncodedVideo, ApplyTransformToKey = (
None,
None,
None,
None,
None,
)


def _make_clip_sampler(
@@ -87,6 +96,43 @@ def load_sample(self, sample):
return sample


class VideoClassificationTensorsBaseInput(IterableInput, ClassificationInputMixin):
def load_data(
self,
inputs: Optional[Union[Collection[torch.Tensor], torch.Tensor]],
targets: Union[List[Any], Any],
video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
target_formatter: Optional[TargetFormatter] = None,
) -> "LabeledVideoTensorDataset":
if isinstance(inputs, torch.Tensor):
# In case of (number of videos x CTHW) format
if inputs.ndim == 5:
inputs = list(inputs)
elif inputs.ndim == 4:
inputs = [inputs]
else:
raise ValueError(
f"Got dimension of the input tensor: {inputs.ndim}"
" for stack of tensors - dimension should be 5 or for a single tensor, dimension should be 4.",
)
elif not _is_list_like(inputs):
raise TypeError(f"Expected either a list/tuple of torch.Tensor or torch.Tensor, but got: {type(inputs)}.")

# Note: We take whatever is the shortest out of inputs and targets
dataset = LabeledVideoTensorDataset(list(zip(inputs, targets)), video_sampler=video_sampler)
if not self.predicting:
self.load_target_metadata(
[sample[1] for sample in dataset._labeled_videos], target_formatter=target_formatter
)
return dataset

def load_sample(self, sample):
sample["label"] = self.format_target(sample["label"])
sample[DataKeys.INPUT] = sample.pop("video")
sample[DataKeys.TARGET] = sample.pop("label")
return sample
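
A rough standalone sketch of the shape handling in ``load_data`` above, with hypothetical names (illustrative only): a 5-D tensor is treated as a stack of videos and split along dim 0, a 4-D tensor is wrapped as a single video, and anything else is expected to already be list-like.

>>> import torch
>>> def _normalize_videos(inputs):
...     # Mirrors the branching above: 5-D -> list of videos, 4-D -> single video.
...     if isinstance(inputs, torch.Tensor):
...         if inputs.ndim == 5:
...             return list(inputs)
...         if inputs.ndim == 4:
...             return [inputs]
...         raise ValueError("expected a 4-D video or a 5-D stack of videos")
...     return inputs
>>> len(_normalize_videos(torch.zeros(2, 3, 5, 8, 8)))
2
>>> len(_normalize_videos(torch.zeros(3, 5, 8, 8)))
1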


class VideoClassificationFoldersInput(VideoClassificationInput):
def load_data(
self,
@@ -178,6 +224,34 @@ def load_data(
return result


class VideoClassificationTensorsInput(VideoClassificationTensorsBaseInput):
labels: list

def load_data(
self,
tensors: Any,
targets: Optional[List[Any]] = None,
video_sampler: Type[Sampler] = torch.utils.data.RandomSampler,
target_formatter: Optional[TargetFormatter] = None,
) -> "LabeledVideoTensorDataset":
result = super().load_data(
tensors,
targets,
video_sampler=video_sampler,
target_formatter=target_formatter,
)

# If we had binary multi-class targets then we also know the labels (column names)
if (
self.training
and isinstance(self.target_formatter, MultiBinaryTargetFormatter)
and isinstance(targets, List)
):
self.labels = targets

return result


class VideoClassificationCSVInput(VideoClassificationDataFrameInput):
def load_data(
self,
@@ -316,6 +390,30 @@ def predict_load_data(
)


class VideoClassificationTensorsPredictInput(Input):
def predict_load_data(self, data: Union[torch.Tensor, List[Any], Any]):
if _is_list_like(data):
return data
else:
if not isinstance(data, torch.Tensor):
raise TypeError(f"Expected either a list/tuple of torch.Tensor or torch.Tensor, but got: {type(data)}.")
if data.ndim == 5:
return list(data)
elif data.ndim == 4:
return [data]
else:
raise ValueError(
f"Got dimension of the input tensor: {data.ndim},"
" for stack of tensors - dimension should be 5 or for a single tensor, dimension should be 4."
)

def predict_load_sample(self, sample: torch.Tensor) -> Dict[str, Any]:
return {
DataKeys.INPUT: sample,
"video_index": 0,
}
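
Illustrative only, assuming the same entry points as the docstring example in ``data.py``: because the predict input above applies the same normalization, a 5-D stack can be passed directly as ``predict_data``.

>>> import torch
>>> from flash.video import VideoClassificationData
>>> clips = torch.randint(low=0, high=255, size=(2, 3, 5, 10, 10), dtype=torch.uint8)
>>> datamodule = VideoClassificationData.from_tensors(
...     train_data=[clips[0], clips[1]],
...     train_targets=["fruit", "vegetable"],
...     predict_data=clips,  # handled by VideoClassificationTensorsPredictInput
...     batch_size=1,
... )
>>> datamodule.num_classes
2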


class VideoClassificationCSVPredictInput(VideoClassificationDataFramePredictInput):
def predict_load_data(
self,
