Skip to content

Commit

Permalink
Merge pull request #925 from thomasyu888/SYNPY-1226-use-viewbase-for-…
Browse files Browse the repository at this point in the history
…datasets

[SYNPY-1226] use viewbase for datasets
  • Loading branch information
thomasyu888 authored Aug 5, 2022
2 parents 6d096c1 + eb761d9 commit 0e9f7c4
Showing 1 changed file with 112 additions and 105 deletions.
217 changes: 112 additions & 105 deletions synapseclient/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,10 @@ class EntityViewType(enum.Enum):
FOLDER = 0x08
VIEW = 0x10
DOCKER = 0x20
SUBMISSION_VIEW = 0x40
DATASET = 0x80
DATASET_COLLECTION = 0x100
MATERIALIZED_VIEW = 0x200


def _get_view_type_mask(types_to_include):
Expand Down Expand Up @@ -781,7 +785,94 @@ def __init__(self, name=None, columns=None, parent=None, definingSQL=None, prope
)


class Dataset(SchemaBase):
class ViewBase(SchemaBase):
    """
    Shared base class for the view-like schemas in this module
    (EntityViewSchema, SubmissionViewSchema and Dataset).

    It provides scope handling, de-duplication of columns, and the pre-store
    hook that pulls in the default view columns and the annotation columns.
    """
    _synapse_entity_type = ""
    _property_keys = SchemaBase._property_keys + ['viewTypeMask', 'scopeIds']
    _local_keys = SchemaBase._local_keys + [
        'addDefaultViewColumns',
        'addAnnotationColumns',
        'ignoredAnnotationColumnNames',
    ]

    def add_scope(self, entities):
        """
        Append one or several entities to ``scopeIds``.

        :param entities: a Project, Folder, Evaluation object or its ID, can also be a list of them
        """
        if not isinstance(entities, list):
            self.scopeIds.append(id_of(entities))
            return

        # Resolve every id before touching scopeIds, so that an exception raised by id_of()
        # half-way through the list cannot leave scopeIds partially modified.
        resolved_ids = [id_of(item) for item in entities]
        self.scopeIds.extend(resolved_ids)

    def _filter_duplicate_columns(self, syn, columns_to_add):
        """
        Drop the columns that are already present on this schema.

        A column is a duplicate when an existing column (either already referenced through
        ``columnIds`` or still pending in ``columns_to_store``) has both the same name and the
        same column type. Duplicates inside ``columns_to_add`` itself are dropped as well; the
        first occurrence wins.

        :param syn:             a :py:class:`synapseclient.client.Synapse` object that is logged in
        :param columns_to_add:  iterable collection of type :py:class:`synapseclient.table.Column` objects

        :return: a filtered list of columns to add (``columns_to_add`` itself when it is empty)
        """
        # Nothing to add: skip the calls that retrieve the existing columns.
        if not columns_to_add:
            return columns_to_add

        if self.columns_to_store:
            # Columns the user added locally but that are not created in Synapse yet count as existing.
            known_columns = itertools.chain(syn.getColumns(self.columnIds), self.columns_to_store)
        else:
            known_columns = syn.getColumns(self.columnIds)

        # (column type, column name) pairs that are already taken
        taken = set()
        for existing in known_columns:
            existing_name = existing['name']
            existing_type = existing['columnType']
            taken.add((existing_type, existing_name))

        accepted = []
        for candidate in columns_to_add:
            candidate_name = candidate['name']
            candidate_type = candidate['columnType']
            signature = (candidate_type, candidate_name)
            if signature in taken:
                continue
            taken.add(signature)
            accepted.append(candidate)
        return accepted

    def _before_synapse_store(self, syn):
        """
        Add the default view columns and the annotation columns (when requested) before storing.

        :param syn: a :py:class:`synapseclient.client.Synapse` object that is logged in
        """
        # The view type is the lower-cased last segment of the concrete entity type,
        # e.g. "org.sagebionetworks.repo.model.table.Dataset" -> "dataset".
        view_type = self._synapse_entity_type.split(".")[-1].lower()
        mask = self.get("viewTypeMask")

        extra_columns = []
        if self.addDefaultViewColumns:
            extra_columns.extend(syn._get_default_view_columns(view_type, view_type_mask=mask))

        if self.addAnnotationColumns:
            # annotation columns found in the scope, minus the ones the caller asked to ignore
            annotation_columns = [
                column
                for column in syn._get_annotation_view_columns(self.scopeIds, view_type, view_type_mask=mask)
                if column['name'] not in self.ignoredAnnotationColumnNames
            ]
            extra_columns.extend(annotation_columns)

        self.addColumns(self._filter_duplicate_columns(syn, extra_columns))

        # Clear both flags so that the columns are not added again on the next store.
        self.addDefaultViewColumns = False
        self.addAnnotationColumns = False

        super(ViewBase, self)._before_synapse_store(syn)


class Dataset(ViewBase):
"""
A Dataset is an :py:class:`synapseclient.entity.Entity` that defines a
flat list of entities as a tableview (a.k.a. a "dataset").
Expand All @@ -803,7 +894,7 @@ class Dataset(SchemaBase):
# Create a Dataset with pre-defined DatasetItems. Default Dataset columns
# are used if no schema is provided.
dataset_items = [
{'entityId': "syn000", 'versionNumber: 1},
{'entityId': "syn000", 'versionNumber': 1},
{...},
]
dataset = syn.store(Dataset(
Expand Down Expand Up @@ -866,28 +957,35 @@ class Dataset(SchemaBase):
comment="This is version 1")
"""
_synapse_entity_type: str = "org.sagebionetworks.repo.model.table.Dataset"
_property_keys: List[str] = SchemaBase._property_keys + ['datasetItems']
_local_keys: List[str] = SchemaBase._local_keys + ['folders_to_add', 'force']
_property_keys: List[str] = ViewBase._property_keys + ['datasetItems']
_local_keys: List[str] = ViewBase._local_keys + ['folders_to_add', 'force']

def __init__(self, name=None, columns=None, parent=None, properties=None,
addDefaultViewColumns=True, addAnnotationColumns=True, ignoredAnnotationColumnNames=[],
annotations=None, local_state=None, dataset_items=None,
folders=None, force=None, **kwargs):
folders=None, force=False, **kwargs):
self.properties.setdefault('datasetItems', [])
self.__dict__.setdefault('folders_to_add', set())
self.__dict__.setdefault('force', False)
self.ignoredAnnotationColumnNames = set(ignoredAnnotationColumnNames)
self.viewTypeMask = EntityViewType.DATASET.value
super(Dataset, self).__init__(
name=name, columns=columns, properties=properties,
annotations=annotations, local_state=local_state, parent=parent,
**kwargs
)

if force:
self.force = True
self.force = force
if dataset_items:
self.add_items(dataset_items, force)
if folders:
self.add_folders(folders, force)

# HACK: make sure we don't try to add columns to schemas that we retrieve from synapse
is_from_normal_constructor = not (properties or local_state)
# allowing annotations because user might want to update annotations all at once
self.addDefaultViewColumns = addDefaultViewColumns and is_from_normal_constructor
self.addAnnotationColumns = addAnnotationColumns and is_from_normal_constructor

def __len__(self):
    """Return the number of entries currently held in ``properties.datasetItems``."""
    dataset_items = self.properties.datasetItems
    return len(dataset_items)

Expand Down Expand Up @@ -956,8 +1054,8 @@ def add_folder(self, folder: str, force: bool = True):
if not self.__dict__.get('folders_to_add', None):
self.__dict__['folders_to_add'] = set()
self.__dict__['folders_to_add'].add(folder)
if self.force != force:
self.force = force
# if self.force != force:
self.force = force

def add_folders(self, folders: List[str], force: bool = True):
"""
Expand Down Expand Up @@ -988,114 +1086,23 @@ def _add_folder_files(self, syn, folder):
return files

def _before_synapse_store(self, syn):
# Add default Dataset columns if schema is not provided.
if not self.properties.columnIds and not self.columns_to_store:
self.addColumns(syn._get_default_view_columns("dataset"))

super()._before_synapse_store(syn)

# Add files from folders (if any) before storing dataset.
if self.folders_to_add:
for folder in self.folders_to_add:
items_to_add = self._add_folder_files(syn, folder)
self.add_items(items_to_add, self.force)
self.folders_to_add = set()

# scopeIds must be set before calling the parent hook, because it is used
# to get all annotations from the entities in this dataset
self.scopeIds = [item['entityId'] for item in self.properties.datasetItems]
super()._before_synapse_store(syn)
# Reset attribute to force-add items from folders.
self.force = True

# Remap `datasetItems` back to `items` before storing (since `items`
# is the accepted field name in the API, not `datasetItems`).
self.properties.items = self.properties.datasetItems


class ViewBase(SchemaBase):
    """
    Shared base class for the view-like schemas in this module
    (EntityViewSchema, SubmissionViewSchema and Dataset).

    It provides scope handling, de-duplication of columns, and the pre-store
    hook that pulls in the default view columns and the annotation columns.
    """
    _synapse_entity_type = ""
    _property_keys = SchemaBase._property_keys + ['viewTypeMask', 'scopeIds']
    _local_keys = SchemaBase._local_keys + [
        'addDefaultViewColumns',
        'addAnnotationColumns',
        'ignoredAnnotationColumnNames',
    ]

    def add_scope(self, entities):
        """
        Append one or several entities to ``scopeIds``.

        :param entities: a Project, Folder, Evaluation object or its ID, can also be a list of them
        """
        if not isinstance(entities, list):
            self.scopeIds.append(id_of(entities))
            return

        # Resolve every id before touching scopeIds, so that an exception raised by id_of()
        # half-way through the list cannot leave scopeIds partially modified.
        resolved_ids = [id_of(item) for item in entities]
        self.scopeIds.extend(resolved_ids)

    def _filter_duplicate_columns(self, syn, columns_to_add):
        """
        Drop the columns that are already present on this schema.

        A column is a duplicate when an existing column (either already referenced through
        ``columnIds`` or still pending in ``columns_to_store``) has both the same name and the
        same column type. Duplicates inside ``columns_to_add`` itself are dropped as well; the
        first occurrence wins.

        :param syn:             a :py:class:`synapseclient.client.Synapse` object that is logged in
        :param columns_to_add:  iterable collection of type :py:class:`synapseclient.table.Column` objects

        :return: a filtered list of columns to add (``columns_to_add`` itself when it is empty)
        """
        # Nothing to add: skip the calls that retrieve the existing columns.
        if not columns_to_add:
            return columns_to_add

        if self.columns_to_store:
            # Columns the user added locally but that are not created in Synapse yet count as existing.
            known_columns = itertools.chain(syn.getColumns(self.columnIds), self.columns_to_store)
        else:
            known_columns = syn.getColumns(self.columnIds)

        # (column type, column name) pairs that are already taken
        taken = set()
        for existing in known_columns:
            existing_name = existing['name']
            existing_type = existing['columnType']
            taken.add((existing_type, existing_name))

        accepted = []
        for candidate in columns_to_add:
            candidate_name = candidate['name']
            candidate_type = candidate['columnType']
            signature = (candidate_type, candidate_name)
            if signature in taken:
                continue
            taken.add(signature)
            accepted.append(candidate)
        return accepted

    def _before_synapse_store(self, syn):
        """
        Add the default view columns and the annotation columns (when requested) before storing.

        :param syn: a :py:class:`synapseclient.client.Synapse` object that is logged in
        """
        # The view type is the lower-cased last segment of the concrete entity type,
        # e.g. "org.sagebionetworks.repo.model.table.Dataset" -> "dataset".
        view_type = self._synapse_entity_type.split(".")[-1].lower()
        mask = self.get("viewTypeMask")

        extra_columns = []
        if self.addDefaultViewColumns:
            extra_columns.extend(syn._get_default_view_columns(view_type, view_type_mask=mask))

        if self.addAnnotationColumns:
            # annotation columns found in the scope, minus the ones the caller asked to ignore
            annotation_columns = [
                column
                for column in syn._get_annotation_view_columns(self.scopeIds, view_type, view_type_mask=mask)
                if column['name'] not in self.ignoredAnnotationColumnNames
            ]
            extra_columns.extend(annotation_columns)

        self.addColumns(self._filter_duplicate_columns(syn, extra_columns))

        # Clear both flags so that the columns are not added again on the next store.
        self.addDefaultViewColumns = False
        self.addAnnotationColumns = False

        super(ViewBase, self)._before_synapse_store(syn)


class EntityViewSchema(ViewBase):
"""
A EntityViewSchema is a :py:class:`synapseclient.entity.Entity` that displays all files/projects
Expand Down

0 comments on commit 0e9f7c4

Please sign in to comment.