Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make write only write columns in datapanel #240

Merged
merged 2 commits into from
May 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions meerkat/block/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def apply(self, method_name: str = "_get", *args, **kwargs) -> BlockManager:
results.reorder(self.keys())
return results

def consolidate(self):
def consolidate(self, consolidate_unitary_groups: bool = False):
column_order = list(
self._columns.keys()
) # need to maintain order after consolidate
Expand All @@ -88,7 +88,7 @@ def consolidate(self):
block_ref_groups[block_ref.block.signature].append(block_ref)

for block_refs in block_ref_groups.values():
if len(block_refs) == 1:
if (not consolidate_unitary_groups) and len(block_refs) == 1:
# if there is only one block ref in the group, do not consolidate
continue

Expand Down Expand Up @@ -241,7 +241,9 @@ def write(self, path: str):
os.makedirs(columns_dir)

# consolidate before writing
self.consolidate()
# we also want to consolidate unitary groups (i.e. groups with only one block
# ref) so that we don't write any data not actually in the dataframe
self.consolidate(consolidate_unitary_groups=True)
for block_id, block_ref in self._block_refs.items():
block: AbstractBlock = block_ref.block
block_dir = os.path.join(blocks_dir, str(block_id))
Expand Down
11 changes: 6 additions & 5 deletions meerkat/columns/audio_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@


class AudioColumn(FileColumn):
"""A lambda column where each cell represents an audio file on disk. The underlying
data is a `PandasSeriesColumn` of strings, where each string is the path to an
image. The column materializes the images into memory when indexed. If the column
is lazy indexed with the ``lz`` indexer, the images are not materialized and an
``FileCell`` or an ``AudioColumn`` is returned instead.
"""A lambda column where each cell represents an audio file on disk. The
underlying data is a `PandasSeriesColumn` of strings, where each string is
the path to an audio file. The column materializes the audio into memory when
indexed. If the column is lazy indexed with the ``lz`` indexer, the audio
is not materialized and a ``FileCell`` or an ``AudioColumn`` is returned
instead.

Args:
data (Sequence[str]): A list of filepaths to audio files.
Expand Down
13 changes: 6 additions & 7 deletions meerkat/columns/file_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,12 @@ def __repr__(self):


class FileColumn(FileLoaderMixin, LambdaColumn):
"""A column where each cell represents an file stored on disk or the web. The
underlying data is a `PandasSeriesColumn` of strings, where each string is the path
to a file. The column materializes the files into memory when indexed. If the column
is lazy indexed with the ``lz`` indexer, the files are not materialized and a
``FileCell`` or a ``FileColumn`` is returned instead.
"""A column where each cell represents an file stored on disk or the web.
The underlying data is a `PandasSeriesColumn` of strings, where each string
is the path to a file. The column materializes the files into memory when
indexed. If the column is lazy indexed with the ``lz`` indexer, the files
are not materialized and a ``FileCell`` or a ``FileColumn`` is returned
instead.

Args:
data (Sequence[str]): A list of paths to files.
Expand All @@ -93,8 +94,6 @@ class FileColumn(FileLoaderMixin, LambdaColumn):

base_dir (str): A base directory that the paths in ``data`` are relative to. If
``None``, the paths are assumed to be absolute.


"""

def __init__(
Expand Down
13 changes: 6 additions & 7 deletions meerkat/columns/image_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@


class ImageColumn(FileColumn):
"""A column where each cell represents an image stored on disk. The underlying data
is a `PandasSeriesColumn` of strings, where each string is the path to an image.
The column materializes the images into memory when indexed. If the column is
lazy indexed with the ``lz`` indexer, the images are not materialized and an
``ImageCell`` or an ``ImageColumn`` is returned instead.
"""A column where each cell represents an image stored on disk. The
underlying data is a `PandasSeriesColumn` of strings, where each string is
the path to an image. The column materializes the images into memory when
indexed. If the column is lazy indexed with the ``lz`` indexer, the images
are not materialized and an ``ImageCell`` or an ``ImageColumn`` is returned
instead.

Args:
data (Sequence[str]): A list of filepaths to images.
Expand All @@ -38,8 +39,6 @@ class ImageColumn(FileColumn):

base_dir (str): A base directory that the paths in ``data`` are relative to. If
``None``, the paths are assumed to be absolute.


"""

@staticmethod
Expand Down
17 changes: 7 additions & 10 deletions meerkat/contrib/audioset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,11 @@ def build_audioset_dp(
audio_column: bool = True,
overwrite: bool = False,
) -> Dict[str, mk.DataPanel]:
"""
Build DataPanels for the audioset dataset downloaded to ``dataset_dir``. By
default, the resulting DataPanels will be written to ``dataset_dir`` under the
filenames "audioset_examples.mk" and "audioset_labels.mk". If these files already
exist and ``overwrite`` is False, the DataPanels will not be built anew, and instead
will be simply loaded from disk.
"""Build DataPanels for the audioset dataset downloaded to ``dataset_dir``.
By default, the resulting DataPanels will be written to ``dataset_dir``
under the filenames "audioset_examples.mk" and "audioset_labels.mk". If
these files already exist and ``overwrite`` is False, the DataPanels will
not be built anew, and instead will be simply loaded from disk.

Args:
dataset_dir: The directory where the dataset is stored
Expand Down Expand Up @@ -99,8 +98,7 @@ def build_audioset_dp(


def build_ontology_dp(dataset_dir: str) -> Dict[str, mk.DataPanel]:
"""
Build a DataPanel from the ontology.json file
"""Build a DataPanel from the ontology.json file.

Args:
dataset_dir: The directory where the ontology.json file is stored
Expand All @@ -124,8 +122,7 @@ def find_submids(
relations: mk.DataPanel = None,
dataset_dir: str = None,
) -> List[str]:
"""
Returns a list of IDs of all subcategories of an audio category
"""Returns a list of IDs of all subcategories of an audio category.

Args:
ids: ID or list of IDs for which to find the subcategories
Expand Down
3 changes: 1 addition & 2 deletions meerkat/contrib/inaturalist.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@
def build_inaturalist_dp(
dataset_dir: str, download: bool = True, splits: List[str] = None
) -> mk.DataPanel:
"""
Build a DataPanel from the inaturalist dataset.
"""Build a DataPanel from the inaturalist dataset.

Args:
dataset_dir: The directory to store the dataset in.
Expand Down
2 changes: 1 addition & 1 deletion meerkat/mixins/cloneable.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def __init__(self, *args, **kwargs):

@classmethod
def _state_keys(cls) -> set:
""" """
""""""
raise NotImplementedError()

@classmethod
Expand Down
5 changes: 3 additions & 2 deletions meerkat/tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@


class MeerkatLoader(yaml.FullLoader):
"""PyYaml does not load unimported modules for safety reasons. We want to allow
importing only meerkat modules
"""PyYaml does not load unimported modules for safety reasons.

We want to allow importing only meerkat modules.
"""

def find_python_module(self, name: str, mark, unsafe=False):
Expand Down