Skip to content

Commit

Permalink
Rebuild a fits manifest from an HSC data directory.
Browse files Browse the repository at this point in the history
- Added a new verb rebuild_manifest
- When run with the HSC dataset class this verb will:
  0) Scan the data directory and ingest HSC cutout files
  1) Read in the original catalog file configured for download for metadata
  2) Write out rebuilt_manifest.fits in the data directory

- Fixed up config resolution so that fibad_config.toml in the cwd
  works again for CLI invocations.
  • Loading branch information
mtauraso committed Nov 9, 2024
1 parent e666dbe commit a7c642b
Show file tree
Hide file tree
Showing 5 changed files with 202 additions and 43 deletions.
63 changes: 31 additions & 32 deletions src/fibad/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,15 @@ def __init__(
runtime_config_filepath: Union[Path, str] = None,
default_config_filepath: Union[Path, str] = DEFAULT_CONFIG_FILEPATH,
):
self.fibad_default_config = self._read_runtime_config(default_config_filepath)
self.fibad_default_config = ConfigManager._read_runtime_config(default_config_filepath)

self.runtime_config_filepath = runtime_config_filepath
if self.runtime_config_filepath is None:
self.runtime_config_filepath = ConfigManager.resolve_runtime_config(runtime_config_filepath)
if self.runtime_config_filepath is DEFAULT_CONFIG_FILEPATH:
self.user_specific_config = ConfigDict()
else:
self.user_specific_config = self._read_runtime_config(self.runtime_config_filepath)
self.user_specific_config = ConfigManager._read_runtime_config(self.runtime_config_filepath)

self.external_library_config_paths = self._find_external_library_default_config_paths(
self.external_library_config_paths = ConfigManager._find_external_library_default_config_paths(
self.user_specific_config
)

Expand All @@ -93,7 +93,7 @@ def __init__(

self.config = self.merge_configs(self.overall_default_config, self.user_specific_config)
if not self.config["general"]["dev_mode"]:
self._validate_runtime_config(self.config, self.overall_default_config)
ConfigManager._validate_runtime_config(self.config, self.overall_default_config)

@staticmethod
def _read_runtime_config(config_filepath: Union[Path, str] = DEFAULT_CONFIG_FILEPATH) -> ConfigDict:
Expand Down Expand Up @@ -232,38 +232,37 @@ def _validate_runtime_config(runtime_config: ConfigDict, default_config: ConfigD
raise RuntimeError(msg)
ConfigManager._validate_runtime_config(runtime_config[key], default_config[key])

@staticmethod
def resolve_runtime_config(runtime_config_filepath: Union[Path, str, None] = None) -> Path:
"""Resolve a user-supplied runtime config to where we will actually pull config from.
def resolve_runtime_config(runtime_config_filepath: Union[Path, str, None] = None) -> Path:
"""Resolve a user-supplied runtime config to where we will actually pull config from.
1) If a runtime config file is specified, we will use that file
2) If no file is specified and there is a file named "fibad_config.toml" in the cwd we will use that file
3) If no file is specified and there is no file named "fibad_config.toml" in the current working directory
we will exclusively work off the configuration defaults in the packaged "fibad_default_config.toml"
file.
1) If a runtime config file is specified, we will use that file.
2) If no file is specified and there is a file named "fibad_config.toml" in the cwd we will use it.
3) If no file is specified and there is no file named "fibad_config.toml" in the cwd we will
exclusively work off the configuration defaults in the packaged "fibad_default_config.toml" file.
Parameters
----------
runtime_config_filepath : Union[Path, str, None], optional
Location of the supplied config file, by default None
Parameters
----------
runtime_config_filepath : Union[Path, str, None], optional
Location of the supplied config file, by default None
Returns
-------
Path
Path to the configuration file ultimately used for config resolution. When we fall back to the
package supplied default config file, the Path to that file is returned.
"""
if isinstance(runtime_config_filepath, str):
runtime_config_filepath = Path(runtime_config_filepath)
Returns
-------
Path
Path to the configuration file ultimately used for config resolution. When we fall back to the
package supplied default config file, the Path to that file is returned.
"""
if isinstance(runtime_config_filepath, str):
runtime_config_filepath = Path(runtime_config_filepath)

# If a named config exists in cwd, and no config specified on cmdline, use cwd.
if runtime_config_filepath is None and DEFAULT_USER_CONFIG_FILEPATH.exists():
runtime_config_filepath = DEFAULT_USER_CONFIG_FILEPATH
# If a named config exists in cwd, and no config specified on cmdline, use cwd.
if runtime_config_filepath is None and DEFAULT_USER_CONFIG_FILEPATH.exists():
runtime_config_filepath = DEFAULT_USER_CONFIG_FILEPATH

Check warning on line 260 in src/fibad/config_utils.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/config_utils.py#L260

Added line #L260 was not covered by tests

if runtime_config_filepath is None:
runtime_config_filepath = DEFAULT_CONFIG_FILEPATH
if runtime_config_filepath is None:
runtime_config_filepath = DEFAULT_CONFIG_FILEPATH

Check warning on line 263 in src/fibad/config_utils.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/config_utils.py#L263

Added line #L263 was not covered by tests

return runtime_config_filepath
return runtime_config_filepath


def create_results_dir(config: ConfigDict, postfix: Union[Path, str]) -> Path:
Expand Down
137 changes: 133 additions & 4 deletions src/fibad/data_sets/hsc_data_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@
from torch.utils.data import Dataset
from torchvision.transforms.v2 import CenterCrop, Compose, Lambda

from fibad.download import Downloader
from fibad.downloadCutout.downloadCutout import (
parse_bool,
parse_degree,
parse_latitude,
parse_longitude,
parse_rerun,
parse_tract_opt,
parse_type,
)

from .data_set_registry import fibad_data_set

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -94,6 +105,9 @@ def __getitem__(self, idx: int) -> torch.Tensor:
def __len__(self) -> int:
return len(self.current_split)

def rebuild_manifest(self, config):
    """Rebuild the download manifest for this data set.

    This is a thin delegate: all of the work is done by the ``_rebuild_manifest``
    method of ``self.container``.

    Parameters
    ----------
    config : dict
        The parsed config, passed through unchanged to the container.

    Returns
    -------
    Whatever ``self.container._rebuild_manifest(config)`` returns.
    """
    container = self.container
    return container._rebuild_manifest(config)

Check warning on line 109 in src/fibad/data_sets/hsc_data_set.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/data_sets/hsc_data_set.py#L109

Added line #L109 was not covered by tests


class HSCDataSetSplit(Dataset):
def __init__(
Expand Down Expand Up @@ -553,6 +567,88 @@ def _check_file_dimensions(self) -> tuple[int, int]:

return cutout_width, cutout_height

def _rebuild_manifest(self, config):
    """Rebuild a download manifest from the cutout files tracked by this object.

    Reads the ``object_id``, ``ra`` and ``dec`` columns from the catalog file named by
    ``config["download"]["fits_file"]``, combines them with every file yielded by
    ``self._all_files_full()`` and with the download settings in ``config["download"]``,
    and writes the resulting table to ``rebuilt_manifest.fits`` inside
    ``config["general"]["data_dir"]``. An existing file of that name is overwritten.

    Parameters
    ----------
    config : dict
        The parsed config as a nested dict. The ``download`` section must provide
        ``fits_file``, ``sw``, ``sh``, ``rerun``, ``type``, ``image``, ``mask`` and
        ``variance``; the ``general`` section must provide ``data_dir``.

    Raises
    ------
    RuntimeError
        If ``self.filter_catalog`` is set, or if a manifest column has neither a
        config-derived (static) value nor a per-file (dynamic) implementation.
    KeyError
        If a file's object_id does not appear in the catalog file.
    """
    if self.filter_catalog:
        raise RuntimeError("Cannot rebuild manifest. Set the filter_catalog=false and rerun")

    logger.info("Reading in catalog file... ")
    location_table = Downloader.filterfits(
        Path(config["download"]["fits_file"]).resolve(), ["object_id", "ra", "dec"]
    )

    # Keyed by the object_id rendered as a string, because that is the form in which
    # _all_files_full() hands object ids back to us below.
    catalog_ids = [str(catalog_id) for catalog_id in location_table["object_id"]]
    obj_to_ra = dict(zip(catalog_ids, location_table["ra"]))
    obj_to_dec = dict(zip(catalog_ids, location_table["dec"]))

    # Only the two lookup dicts are needed from here on.
    del location_table
    del catalog_ids

    logger.info("Assembling Manifest...")

    # These are the column names expected in a manifest file by the downloader
    column_names = Downloader.MANIFEST_COLUMN_NAMES
    columns = {column_name: [] for column_name in column_names}

    # These vary for every object and must be implemented in the loop below
    dynamic_column_names = ["object_id", "filter", "dim", "tract", "ra", "dec", "filename"]
    # These are pulled from config ("sw", "sh", "rerun", "type", "image", "mask", and "variance")
    static_column_names = [name for name in column_names if name not in dynamic_column_names]

    static_values = {
        "sw": parse_degree(config["download"]["sw"]),
        "sh": parse_degree(config["download"]["sh"]),
        "rerun": parse_rerun(config["download"]["rerun"]),
        "type": parse_type(config["download"]["type"]),
        "image": parse_bool(config["download"]["image"]),
        "mask": parse_bool(config["download"]["mask"]),
        "variance": parse_bool(config["download"]["variance"]),
    }

    # Check that every column we need for a manifest has an implementation. Any column that is
    # not dynamic is static by construction, so the meaningful check is whether we actually
    # have a value for each static column.
    for column_name in static_column_names:
        if column_name not in static_values:
            raise RuntimeError(f"Error Assembling manifest {column_name} not implemented")

    for object_id, filter_name, filename, dim in self._all_files_full():
        for static_col in static_column_names:
            columns[static_col].append(static_values[static_col])

        for dynamic_col in dynamic_column_names:
            if dynamic_col == "object_id":
                columns[dynamic_col].append(int(object_id))
            elif dynamic_col == "filter":
                columns[dynamic_col].append(filter_name)
            elif dynamic_col == "dim":
                columns[dynamic_col].append(dim)
            elif dynamic_col == "tract":
                # There's value in pulling tract from the filename rather than the download
                # catalog: in case the catalog had it wrong, the filename will have the value
                # the cutout server provided.
                tract = filename.split("_")[4]
                columns[dynamic_col].append(parse_tract_opt(tract))
            elif dynamic_col == "ra":
                ra = obj_to_ra[object_id]
                columns[dynamic_col].append(parse_longitude(ra))
            elif dynamic_col == "dec":
                dec = obj_to_dec[object_id]
                columns[dynamic_col].append(parse_latitude(dec))
            elif dynamic_col == "filename":
                columns[dynamic_col].append(filename)
            else:
                # The tower of if statements exists entirely to create this failure path,
                # which will be hit when someone alters dynamic_column_names above without
                # also writing an implementation.
                raise RuntimeError(f"No implementation to process column {dynamic_col}")

    logger.info("Writing rebuilt manifest...")
    manifest_table = Table(columns)
    rebuilt_manifest_path = Path(config["general"]["data_dir"]) / "rebuilt_manifest.fits"
    manifest_table.write(rebuilt_manifest_path, overwrite=True, format="fits")

Check warning on line 650 in src/fibad/data_sets/hsc_data_set.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/data_sets/hsc_data_set.py#L647-L650

Added lines #L647 - L650 were not covered by tests

def shape(self) -> tuple[int, int, int]:
"""Shape of the individual cutouts this will give to a model
Expand Down Expand Up @@ -641,6 +737,25 @@ def ids(self):
for object_id in self.files:
yield object_id

def _all_files_full(self):
"""
Private read-only iterator over all files that enforces a strict total order across
objects and filters. Will not work prior to self.files, and self.path initialization in __init__
Yields
------
Tuple[object_id, filter, filename, dim]
Members of this tuple are
- The object_id as a string
- The filter name as a string
- The filename relative to self.path
- A tuple containing the dimensions of the fits file in pixels.
"""
for object_id in self.ids():
dims = self.dims[object_id]
for idx, (filter, filename) in enumerate(self._filter_filename(object_id)):
yield (object_id, filter, filename, dims[idx])

Check warning on line 757 in src/fibad/data_sets/hsc_data_set.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/data_sets/hsc_data_set.py#L754-L757

Added lines #L754 - L757 were not covered by tests

def _all_files(self):
"""
Private read-only iterator over all files that enforces a strict total order across
Expand All @@ -655,6 +770,22 @@ def _all_files(self):
for filename in self._object_files(object_id):
yield filename

def _filter_filename(self, object_id):
"""
Private read-only iterator over all files for a given object. This enforces a strict total order
across filters. Will not work prior to self.files initialization in __init__
Yields
------
filter_name, file name
The name of a filter and the file name for the fits file.
The file name is relative to self.path
"""
filters = self.files[object_id]
filter_names = sorted(list(filters))
for filter_name in filter_names:
yield filter_name, filters[filter_name]

def _object_files(self, object_id):
"""
Private read-only iterator over all files for a given object. This enforces a strict total order
Expand All @@ -665,10 +796,8 @@ def _object_files(self, object_id):
Path
The path to the file.
"""
filters = self.files[object_id]
filter_names = sorted(list(filters))
for filter in filter_names:
yield self._file_to_path(filters[filter])
for _, filename in self._filter_filename(object_id):
yield self._file_to_path(filename)

def _file_to_path(self, filename: str) -> Path:
"""Turns a filename into a full path suitable for open. Equivalent to:
Expand Down
7 changes: 4 additions & 3 deletions src/fibad/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ class Downloader:
# of the immutable fields that we rely on for hash checks are also included.
RECT_COLUMN_NAMES = list(dict.fromkeys(VARIABLE_FIELDS + dC.Rect.immutable_fields + ["dim"]))

MANIFEST_COLUMN_NAMES = RECT_COLUMN_NAMES + ["filename", "object_id"]

MANIFEST_FILE_NAME = "manifest.fits"

def __init__(self, config):
Expand Down Expand Up @@ -280,9 +282,8 @@ def _write_manifest(self):
logger.info(f"Writing out download manifest with {len(combined_manifest)} entries.")

# Convert the combined manifest into an astropy table by building a dict of {column_name: column_data}
# for all the fields in a rect, plus our object_id and filename.
column_names = Downloader.RECT_COLUMN_NAMES + ["filename", "object_id"]
columns = {column_name: [] for column_name in column_names}
# for all the fields we require in a manifest
columns = {column_name: [] for column_name in Downloader.MANIFEST_COLUMN_NAMES}

Check warning on line 286 in src/fibad/download.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/download.py#L286

Added line #L286 was not covered by tests

for rect, msg in combined_manifest.items():
# This parsing relies on the name format set up in create_rects to work properly
Expand Down
16 changes: 12 additions & 4 deletions src/fibad/fibad.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from typing import Union

from .config_utils import ConfigManager, resolve_runtime_config
from .config_utils import ConfigManager


class Fibad:
Expand All @@ -14,7 +14,7 @@ class Fibad:
CLI functions in fibad_cli are implemented by calling this class
"""

verbs = ["train", "predict", "download", "prepare"]
verbs = ["train", "predict", "download", "prepare", "rebuild_manifest"]

def __init__(self, *, config_file: Union[Path, str] = None, setup_logging: bool = True):
"""Initialize fibad. Always applies the default config, and merges it with any provided config file.
Expand Down Expand Up @@ -88,7 +88,7 @@ def __init__(self, *, config_file: Union[Path, str] = None, setup_logging: bool
# Setup our handlers from config
self._initialize_log_handlers()

self.logger.info(f"Runtime Config read from: {resolve_runtime_config(config_file)}")
self.logger.info(f"Runtime Config read from: {ConfigManager.resolve_runtime_config(config_file)}")

Check warning on line 91 in src/fibad/fibad.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/fibad.py#L91

Added line #L91 was not covered by tests

def _initialize_log_handlers(self):
"""Private initialization helper, Adds handlers and level setting to the global self.logger object"""
Expand Down Expand Up @@ -180,8 +180,16 @@ def predict(self, **kwargs):

def prepare(self, **kwargs):
"""
See Fibad.predict.run()
See Fibad.prepare.run()
"""
from .prepare import run

return run(config=self.config, **kwargs)

def rebuild_manifest(self, **kwargs):
    """Run the ``rebuild_manifest`` verb using this object's config.

    The implementation is the ``run()`` function of the sibling ``rebuild_manifest``
    module; see that function for details. Keyword arguments are forwarded to it
    unchanged, alongside ``config=self.config``.
    """
    # Import happens at call time, inside the method, rather than at module load.
    from .rebuild_manifest import run as run_rebuild_manifest

    return run_rebuild_manifest(config=self.config, **kwargs)

Check warning on line 195 in src/fibad/fibad.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/fibad.py#L195

Added line #L195 was not covered by tests
22 changes: 22 additions & 0 deletions src/fibad/rebuild_manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import logging

Check warning on line 1 in src/fibad/rebuild_manifest.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/rebuild_manifest.py#L1

Added line #L1 was not covered by tests

from fibad.pytorch_ignite import setup_model_and_dataset

Check warning on line 3 in src/fibad/rebuild_manifest.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/rebuild_manifest.py#L3

Added line #L3 was not covered by tests

logger = logging.getLogger(__name__)

Check warning on line 5 in src/fibad/rebuild_manifest.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/rebuild_manifest.py#L5

Added line #L5 was not covered by tests


def run(config):
    """Rebuild a broken download manifest.

    Sets up the configured data set and asks it to rebuild its manifest.

    Parameters
    ----------
    config : dict
        The parsed config file as a nested dict. ``config["train"]["split"]`` selects the
        split used when constructing the data set.
    """

    # Only the data set is needed here; the model returned alongside it is discarded.
    _, data_set = setup_model_and_dataset(config, split=config["train"]["split"])

    data_set.rebuild_manifest(config)

    # Name the verb that actually ran; the previous message said "Prepare".
    logger.info("Finished Rebuild Manifest")

Check warning on line 22 in src/fibad/rebuild_manifest.py

View check run for this annotation

Codecov / codecov/patch

src/fibad/rebuild_manifest.py#L22

Added line #L22 was not covered by tests

0 comments on commit a7c642b

Please sign in to comment.