Skip to content

Commit

Permalink
Adding logging, removing an obvious list copy.
Browse files: browse the repository at this point in the history
- Attempting to gather data about why rebuild_manifest is slow
- Also remove an obvious and unnecessary list copy from HSCDataSet
  • Loading branch information
mtauraso committed Nov 12, 2024
1 parent a7c642b commit 42c200a
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 10 deletions.
49 changes: 40 additions & 9 deletions src/fibad/data_sets/hsc_data_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ def _init_from_path(

self.cutout_shape = cutout_shape

self.pruned_objects = {}
self._prune_objects(filters_ref)

if self.cutout_shape is None:
Expand Down Expand Up @@ -370,13 +371,15 @@ def _scan_file_names(self, filters: Optional[list[str]] = None) -> files_dict:
filter_name -> file name. Corresponds to self.files
"""

logger.info(f"Scanning files in directory {self.path}")

object_id_regex = r"[0-9]{17}"
filter_regex = r"HSC-[GRIZY]" if filters is None else "|".join(filters)
full_regex = f"({object_id_regex})_.*_({filter_regex}).fits"

files = {}
# Go scan the path for object ID's so we have a list.
for filepath in Path(self.path).iterdir():
for index, filepath in enumerate(Path(self.path).iterdir()):
filename = filepath.name

# If we are filtering based off a user-provided catalog of object ids, Filter out any
Expand Down Expand Up @@ -404,6 +407,11 @@ def _scan_file_names(self, filters: Optional[list[str]] = None) -> files_dict:
msg += "and will not be included in the data set."
logger.error(msg)

if index != 0 and index % 100_000 == 0:
logger.info(f"Processed {index} files.")

Check warning on line 411 in src/fibad/data_sets/hsc_data_set.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/data_sets/hsc_data_set.py#L411

Added line #L411 was not covered by tests
else:
logger.info(f"Processed {index+1} files")

return files

def _read_filter_catalog(
Expand Down Expand Up @@ -458,10 +466,16 @@ def _read_filter_catalog(

def _scan_file_dimensions(self) -> dim_dict:
# Scan the filesystem to get the widths and heights of all images into a dict
return {
object_id: [self._fits_file_dims(filepath) for filepath in self._object_files(object_id)]
for object_id in self.ids()
}
logger.info("Scanning for dimensions...")

retval = {}
for index, object_id in enumerate(self.ids()):
retval[object_id] = [self._fits_file_dims(filepath) for filepath in self._object_files(object_id)]
if index != 0 and index % 100_000 == 0:
logger.info(f"Scanned {index} objects for dimensions")

Check warning on line 475 in src/fibad/data_sets/hsc_data_set.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/data_sets/hsc_data_set.py#L475

Added line #L475 was not covered by tests
else:
logger.info(f"Scanned {index+1} objects for dimensions")
return retval

def _prune_objects(self, filters_ref: list[str]):
"""Class initialization helper. Prunes objects from the list of objects.
Expand All @@ -484,12 +498,12 @@ def _prune_objects(self, filters_ref: list[str]):
"""
filters_ref = sorted(filters_ref)
self.prune_count = 0
for object_id, filters in list(self.files.items()):
for index, (object_id, filters) in enumerate(self.files.items()):
# Drop objects with missing filters
filters = sorted(list(filters))
if filters != filters_ref:
msg = f"HSCDataSet in {self.path} has the wrong group of filters for object {object_id}."
self._prune_object(object_id, msg)
self._mark_for_prune(object_id, msg)
logger.info(f"Filters for object {object_id} were {filters}")
logger.debug(f"Reference filters were {filters_ref}")

Expand All @@ -500,8 +514,16 @@ def _prune_objects(self, filters_ref: list[str]):
msg = f"A file for object {object_id} has shape ({shape[1]}px, {shape[1]}px)"
msg += " this is too small for the given cutout size of "
msg += f"({self.cutout_shape[0]}px, {self.cutout_shape[1]}px)"
self._prune_object(object_id, msg)
self._mark_for_prune(object_id, msg)
break
if index != 0 and index % 100_000 == 0:
logger.info(f"Processed {index} objects for pruning")

Check warning on line 520 in src/fibad/data_sets/hsc_data_set.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/data_sets/hsc_data_set.py#L520

Added line #L520 was not covered by tests
else:
logger.info(f"Processed {index + 1} objects for pruning")

# Prune marked objects
for object_id, reason in self.pruned_objects.items():
self._prune_object(object_id, reason)

# Log about the pruning process
pre_prune_object_count = len(self.files) + self.prune_count
Expand All @@ -512,6 +534,9 @@ def _prune_objects(self, filters_ref: list[str]):
logger.warning("Greater than 1% of objects in the data directory were pruned.")
logger.info(f"Pruned {self.prune_count} out of {pre_prune_object_count} objects")

def _mark_for_prune(self, object_id, reason):
self.pruned_objects[object_id] = reason

def _prune_object(self, object_id, reason: str):
logger.warning(reason)
logger.warning(f"Dropping object {object_id} from the dataset")
Expand Down Expand Up @@ -542,6 +567,8 @@ def _check_file_dimensions(self) -> tuple[int, int]:
The minimum width and height in pixels of the entire dataset. In other words: the maximal image
size in pixels that can be generated from ALL cutout images via cropping.
"""
logger.info("Checking file dimensions to determine standard cutout size...")

# Find the maximal cutout size that all images can support
all_widths = [shape[0] for shape_list in self.dims.values() for shape in shape_list]
cutout_width = np.min(all_widths)
Expand Down Expand Up @@ -613,7 +640,7 @@ def _rebuild_manifest(self, config):
"variance": parse_bool(config["download"]["variance"]),
}

for object_id, filter, filename, dim in self._all_files_full():
for index, (object_id, filter, filename, dim) in enumerate(self._all_files_full()):
for static_col in static_column_names:
columns[static_col].append(static_values[static_col])

Check warning on line 645 in src/fibad/data_sets/hsc_data_set.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/data_sets/hsc_data_set.py#L643-L645

Added lines #L643 - L645 were not covered by tests

Expand Down Expand Up @@ -643,6 +670,10 @@ def _rebuild_manifest(self, config):
# which will be hit when someone alters dynamic column names above without also
# writing an implementation.
raise RuntimeError(f"No implementation to process column {dynamic_col}")
if index != 0 and index % 100_000 == 0:
logger.info(f"Addeed {index} objects to manifest")

Check warning on line 674 in src/fibad/data_sets/hsc_data_set.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/data_sets/hsc_data_set.py#L672-L674

Added lines #L672 - L674 were not covered by tests
else:
logger.info(f"Addeed {index+1} objects to manifest")

Check warning on line 676 in src/fibad/data_sets/hsc_data_set.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/data_sets/hsc_data_set.py#L676

Added line #L676 was not covered by tests

logger.info("Writing rebuilt manifest...")
manifest_table = Table(columns)
Expand Down
4 changes: 3 additions & 1 deletion src/fibad/rebuild_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ def run(config):

_, data_set = setup_model_and_dataset(config, split=config["train"]["split"])

Check warning on line 18 in src/fibad/rebuild_manifest.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/rebuild_manifest.py#L18

Added line #L18 was not covered by tests

logger.info("Starting rebuild of manifest")

Check warning on line 20 in src/fibad/rebuild_manifest.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/rebuild_manifest.py#L20

Added line #L20 was not covered by tests

data_set.rebuild_manifest(config)

Check warning on line 22 in src/fibad/rebuild_manifest.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/rebuild_manifest.py#L22

Added line #L22 was not covered by tests

logger.info("Finished Prepare")
logger.info("Finished Rebuild Manifest")

Check warning on line 24 in src/fibad/rebuild_manifest.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/rebuild_manifest.py#L24

Added line #L24 was not covered by tests

0 comments on commit 42c200a

Please sign in to comment.