ENH: Speed up indexing by bulk committing to database #1013

Merged
11 commits merged on Aug 7, 2023
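In outline, the change stops calling session.add() and session.commit() for every file and tag during the filesystem walk; instead it accumulates BIDSFile objects and plain tag dicts and writes them out in one pass with SQLAlchemy's bulk API (bulk_save_objects / bulk_insert_mappings), followed by a single commit. Below is a minimal, self-contained sketch of that pattern, not part of the diff, using a toy table rather than the real pybids models.

# Toy sketch (not pybids code) of the bulk-write pattern this PR adopts:
# build plain dicts while scanning, then insert them all in one transaction.
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import declarative_base, Session

Base = declarative_base()

class Tag(Base):  # stand-in for bids.layout.models.Tag
    __tablename__ = 'tags'
    id = Column(Integer, primary_key=True)
    file_path = Column(String)
    entity_name = Column(String)
    value = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

tag_dicts = [
    {'file_path': f'sub-{i:02d}/anat/sub-{i:02d}_T1w.nii.gz',
     'entity_name': 'subject', 'value': f'{i:02d}'}
    for i in range(1, 101)
]

with Session(engine) as session:
    # Old approach: session.add(Tag(...)) plus session.commit() per tag.
    # New approach: one executemany-style INSERT and a single commit.
    session.bulk_insert_mappings(Tag, tag_dicts)
    session.commit()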
2 changes: 1 addition & 1 deletion bids-examples
Submodule bids-examples updated 200 files
87 changes: 49 additions & 38 deletions bids/layout/index.py
@@ -12,10 +12,9 @@
from ..utils import listify, make_bidsfile
from ..exceptions import BIDSConflictingValuesError

from .models import Config, Entity, Tag, FileAssociation
from .models import Config, Entity, Tag, FileAssociation, _create_tag_dict
from .validation import validate_indexing_args


def _regexfy(patt, root=None):
if hasattr(patt, "search"):
return patt
@@ -145,7 +144,12 @@
_regexfy(patt, root=self._layout._root) for patt in listify(ignore)
]

self._index_dir(self._layout._root, self._config)
all_bfs, all_tag_dicts = self._index_dir(self._layout._root, self._config)

self.session.bulk_save_objects(all_bfs)
self.session.bulk_insert_mappings(Tag, all_tag_dicts)
self.session.commit()

if self.index_metadata:
self._index_metadata()

@@ -182,7 +186,7 @@

# Derivative directories must always be added separately
if self._layout._root.joinpath('derivatives') in abs_path.parents:
return
return [], []

config = list(config) # Shallow copy

@@ -205,13 +209,15 @@
if self.config_filename in filenames:
filenames.remove(self.config_filename)

all_bfs = []
all_tag_dicts = []
for f in filenames:
abs_fn = path / f
# Skip files that fail validation, unless forcibly indexing
if force or self._validate_file(abs_fn):
self._index_file(abs_fn, config_entities)

self.session.commit()
bf, tag_dicts = self._index_file(abs_fn, config_entities)
all_tag_dicts += tag_dicts
all_bfs.append(bf)

# Recursively index subdirectories
for d in dirnames:
@@ -223,12 +229,15 @@
root=self._layout._root,
)
if force is not False:
self._index_dir(d, config, force=force)
dir_bfs, dir_tag_dicts = self._index_dir(d, config, force=force)
all_bfs += dir_bfs
all_tag_dicts += dir_tag_dicts

return all_bfs, all_tag_dicts

def _index_file(self, abs_fn, entities):
"""Create DB record for file and its tags. """
bf = make_bidsfile(abs_fn)
self.session.add(bf)

# Extract entity values
match_vals = {}
@@ -240,12 +249,12 @@
match_vals[e.name] = (e, m)

# Create Entity <=> BIDSFile mappings
if match_vals:
for _, (ent, val) in match_vals.items():
tag = Tag(bf, ent, str(val), ent._dtype)
self.session.add(tag)
tag_dicts = [
_create_tag_dict(bf, ent, val, ent._dtype)
for ent, val in match_vals.values()
]

return bf
return bf, tag_dicts

def _index_metadata(self):
"""Index metadata for all files in the BIDS dataset.
@@ -255,8 +264,7 @@
if filters:
# ensure we are returning objects
filters['return_type'] = 'object'
# until 0.11.0, user can specify extension or extensions
ext_key = 'extensions' if 'extensions' in filters else 'extension'

if filters.get(ext_key):
filters[ext_key] = listify(filters[ext_key])
# ensure json files are being indexed
@@ -299,40 +307,44 @@
f"from file {path}"
) from e

filenames = []
for bf in all_files:
file_ents = bf.entities.copy()
suffix = file_ents.pop('suffix', None)
ext = file_ents.pop('extension', None)

if suffix is not None and ext is not None:
if 'suffix' in bf.entities and 'extension' in bf.entities:
file_ents = bf.entities.copy()
suffix = file_ents.pop('suffix')
ext = file_ents.pop('extension')
key = "{}/{}".format(ext, suffix)
if key not in file_data:
file_data[key] = defaultdict(list)

payload = None
if ext == '.json':
payload = partial(load_json, bf.path)
else:
filenames.append(bf)

to_store = (file_ents, payload, bf.path)
file_data[key][bf._dirname].append(to_store)

# To avoid integrity errors, track primary keys we've seen
seen_assocs = set()

def create_association_pair(src, dst, kind, kind2=None):
objs = []
kind2 = kind2 or kind
pk1 = '#'.join([src, dst, kind])
if pk1 not in seen_assocs:
self.session.add(FileAssociation(src=src, dst=dst, kind=kind))
objs.append(FileAssociation(src=src, dst=dst, kind=kind))
seen_assocs.add(pk1)
pk2 = '#'.join([dst, src, kind2])
if pk2 not in seen_assocs:
self.session.add(FileAssociation(src=dst, dst=src, kind=kind2))
objs.append((FileAssociation(src=dst, dst=src, kind=kind2)))
seen_assocs.add(pk2)

# TODO: Efficiency of everything in this loop could be improved
filenames = [bf for bf in all_files if not bf.path.endswith('.json')]
return objs

all_objs = []
all_tag_dicts = []
for bf in filenames:
file_ents = bf.entities.copy()
suffix = file_ents.pop('suffix', None)
@@ -394,7 +406,7 @@

# Create DB records for metadata associations
js_file = payloads[0][1]
create_association_pair(js_file, bf.path, 'Metadata')
all_objs += create_association_pair(js_file, bf.path, 'Metadata')

# Consolidate metadata by looping over inherited JSON files
file_md = {}
@@ -414,14 +426,14 @@
for i, (pl, js_file) in enumerate(payloads):
if (i + 1) < n_pl:
other = payloads[i + 1][1]
create_association_pair(js_file, other, 'Child', 'Parent')
all_objs += create_association_pair(js_file, other, 'Child', 'Parent')

# Inheritance for current file
n_pl = len(ancestors)
for i, src in enumerate(ancestors):
if (i + 1) < n_pl:
dst = ancestors[i + 1]
create_association_pair(src, dst, 'Child', 'Parent')
all_objs += create_association_pair(src, dst, 'Child', 'Parent')

# Files with IntendedFor field always get mapped to targets
intended = listify(file_md.get('IntendedFor', []))
@@ -430,7 +442,7 @@
target = self._layout._root.joinpath(
'sub-{}'.format(bf.entities['subject']),
target)
create_association_pair(bf.path, str(target), 'IntendedFor',
all_objs += create_association_pair(bf.path, str(target), 'IntendedFor',
'InformedBy')

# Link files to BOLD runs
@@ -439,7 +451,7 @@
extension=['.nii', '.nii.gz'], suffix='bold',
return_type='filename', **file_ents)
for img in images:
create_association_pair(bf.path, img, 'IntendedFor',
all_objs += create_association_pair(bf.path, img, 'IntendedFor',
'InformedBy')

# Link files to DWI runs
@@ -448,7 +460,7 @@
extension=['.nii', '.nii.gz'], suffix='dwi',
return_type='filename', **file_ents)
for img in images:
create_association_pair(bf.path, img, 'IntendedFor',
all_objs += create_association_pair(bf.path, img, 'IntendedFor',
'InformedBy')

# Create Tag <-> Entity mappings, and any newly discovered Entities
@@ -469,10 +481,9 @@
if md_key not in all_entities:
all_entities[md_key] = Entity(md_key)
self.session.add(all_entities[md_key])
tag = Tag(bf, all_entities[md_key], md_val, is_metadata=True)
self.session.add(tag)

if len(self.session.new) >= 1000:
self.session.commit()

tag = _create_tag_dict(bf, all_entities[md_key], md_val, is_metadata=True)
all_tag_dicts.append(tag)

self.session.bulk_save_objects(all_objs)
self.session.bulk_insert_mappings(Tag, all_tag_dicts)
self.session.commit()
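Taken together, the index.py changes reshape the walk so that _index_file and _index_dir return what they produced instead of committing as they go, and the top-level call flushes everything once. The following runnable toy, not part of the diff, mirrors that accumulate-then-write-once control flow; the directory layout and the plain "record" dicts are illustrative only, not pybids objects.

# Illustrative only: recursion returns accumulated results; nothing is
# committed inside the walk, mirroring the new _index_dir control flow.
from pathlib import Path
import tempfile

def index_dir(path: Path):
    records, tag_dicts = [], []
    for entry in sorted(path.iterdir()):
        if entry.is_file():
            records.append({'path': str(entry)})
            tag_dicts.append({'file_path': str(entry), 'entity_name': 'stem',
                              '_value': entry.stem})
        elif entry.is_dir():
            sub_records, sub_tags = index_dir(entry)  # recurse, no commit here
            records += sub_records
            tag_dicts += sub_tags
    return records, tag_dicts

root = Path(tempfile.mkdtemp())
(root / 'sub-01' / 'anat').mkdir(parents=True)
(root / 'sub-01' / 'anat' / 'sub-01_T1w.nii.gz').touch()
(root / 'dataset_description.json').touch()

all_records, all_tag_dicts = index_dir(root)
# At this point the real indexer calls bulk_save_objects / bulk_insert_mappings
# once, then a single session.commit().
print(len(all_records), len(all_tag_dicts))  # 2 files collected in one pass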
78 changes: 46 additions & 32 deletions bids/layout/models.py
@@ -631,7 +631,13 @@
val = self.dtype(val)
return val


type_map = {
'str': str,
'int': PaddedInt,
'float': float,
'bool': bool,
'json': 'json',
}
class Tag(Base):
"""Represents an association between a File and an Entity.

@@ -669,52 +675,60 @@
"tags", collection_class=attribute_mapped_collection("file_path")))

def __init__(self, file, entity, value, dtype=None, is_metadata=False):
data = _create_tag_dict(file, entity, value, dtype, is_metadata)

if dtype is None:
dtype = type(value)

self.value = value
self.is_metadata = is_metadata

if not isinstance(dtype, str):
dtype = dtype.__name__
if dtype not in ('str', 'float', 'int', 'bool'):
# Try serializing to JSON first
try:
value = json.dumps(value)
dtype = 'json'
except TypeError as e:
raise ValueError(
f"Passed value has an invalid dtype ({dtype}). Must be one of "
"int, float, bool, or str.") from e
value = str(value)
self.file_path = file.path
self.entity_name = entity.name

self._value = value
self._dtype = dtype
self.file_path = data['file_path']
self.entity_name = data['entity_name']
self._dtype = data['_dtype']
self._value = data['_value']
self.is_metadata = data['is_metadata']

self._init_on_load()
self.dtype = type_map[self._dtype]
if self._dtype != 'json':
self.value = self.dtype(value)
else:
self.value = value

def __repr__(self):
msg = "<Tag file:{!r} entity:{!r} value:{!r}>"
return msg.format(self.file_path, self.entity_name, self.value)

@reconstructor
def _init_on_load(self):
if self._dtype not in ('str', 'float', 'int', 'bool', 'json'):
raise ValueError("Invalid dtype '{}'. Must be one of 'int', "
"'float', 'bool', 'str', or 'json'.".format(self._dtype))
if self._dtype == 'json':
self.value = json.loads(self._value)
self.dtype = 'json'
elif self._dtype == 'int':
self.dtype = PaddedInt
self.value = self.dtype(self._value)
else:
self.dtype = eval(self._dtype)
self.dtype = type_map[self._dtype]
self.value = self.dtype(self._value)

def _create_tag_dict(file, entity, value, dtype=None, is_metadata=False):
data = {}
if dtype is None:
dtype = type(value)

if not isinstance(dtype, str):
dtype = dtype.__name__

if dtype in ['list', 'dict']:
_dtype = 'json'
_value = json.dumps(value)
else:
_dtype = dtype
_value = str(value)
if _dtype not in ('str', 'float', 'int', 'bool', 'json'):
raise ValueError(
f"Passed value has an invalid dtype ({dtype}). Must be one of "
"int, float, bool, or str.")

data['is_metadata'] = is_metadata
data['file_path'] = file.path
data['entity_name'] = entity.name
data['_dtype'] = _dtype
data['_value'] = _value

return data


class FileAssociation(Base):
__tablename__ = 'associations'
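On the models.py side, the value-normalization logic moves out of Tag.__init__ into _create_tag_dict, so the same code path can build either a Tag instance or the plain dicts fed to bulk_insert_mappings. A simplified, hypothetical re-implementation of that helper is shown below for illustration; the real function takes the file and entity objects themselves and reads file.path and entity.name rather than plain strings.

# Toy re-implementation of the idea behind _create_tag_dict (simplified):
# normalize a value into column-level fields once, so the result can feed
# either Tag(...) or Session.bulk_insert_mappings(Tag, [...]).
import json

def create_tag_dict(file_path, entity_name, value, dtype=None, is_metadata=False):
    if dtype is None:
        dtype = type(value).__name__
    elif not isinstance(dtype, str):
        dtype = dtype.__name__
    if dtype in ('list', 'dict'):
        # Non-scalar values are serialized to JSON text for storage
        _dtype, _value = 'json', json.dumps(value)
    else:
        _dtype, _value = dtype, str(value)
    if _dtype not in ('str', 'float', 'int', 'bool', 'json'):
        raise ValueError(
            f"Passed value has an invalid dtype ({dtype}). Must be one of "
            "int, float, bool, str, or JSON-serializable.")
    return {'file_path': file_path, 'entity_name': entity_name,
            '_dtype': _dtype, '_value': _value, 'is_metadata': is_metadata}

print(create_tag_dict('sub-01/anat/sub-01_T1w.nii.gz', 'subject', '01'))
print(create_tag_dict('sub-01/func/bold.json', 'SliceTiming', [0.0, 0.5],
                      is_metadata=True))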