feat pydantic .env file config support
afloresep committed Nov 28, 2024 · 1 parent 107a704 · commit 50a7dea
Showing 4 changed files with 73 additions and 34 deletions.
4 changes: 2 additions & 2 deletions chelombus/data_handler.py
```diff
@@ -12,8 +12,9 @@ def __init__(self, file_path, chunksize, smiles_col_index=0, header=0):
         self.smiles_col_index= smiles_col_index
         self.header = header
         self.datatype = find_input_type(file_path)
 
-    def get_total_chunks(self, file_path, chunksize):
+    @staticmethod
+    def get_total_chunks(file_path, chunksize):
         """
         Calculate number of chunks based on self.chunksize for tqdm
         Maybe avoid for files that are too large >150 GB? Takes about ~2 minutes for such size
@@ -28,7 +29,6 @@ def get_total_lines(self):
         """Calculate the total number of lines in the file."""
         with open(self.file_path, 'r', encoding='utf-8') as f:
             return sum(1 for _ in f)
-
 
     def load_data(self):
         """Returns correct generator to load data based on file input type"""
```
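Since `get_total_chunks` no longer touches instance state, making it a `@staticmethod` with explicit `file_path` and `chunksize` arguments is a sensible change. A minimal sketch of the chunk-count arithmetic the method implies — a hypothetical stand-in, not the project's actual implementation:

```python
import math

# Hypothetical stand-in for DataHandler.get_total_chunks: count the lines
# in a plain-text file, then round up to whole chunks.
def get_total_chunks(file_path: str, chunksize: int) -> int:
    with open(file_path, "r", encoding="utf-8") as f:
        total_lines = sum(1 for _ in f)
    return math.ceil(total_lines / chunksize)

# The static method can now be called without constructing a DataHandler:
# total = DataHandler.get_total_chunks("data/input.csv", chunksize=100_000)
```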
2 changes: 2 additions & 0 deletions chelombus/utils/config_loader.py
```diff
@@ -10,6 +10,7 @@ class Config(BaseSettings):
     DATA_PATH: str = "data/"
     OUTPUT_PATH: str = "data/output/"
     CHUNKSIZE: int = 100_000
+    IPCA_MODEL: str = None
     PCA_N_COMPONENTS: int = 3
     STEPS_LIST: List[int] = [50, 50, 50]
     N_JOBS: int = os.cpu_count()
@@ -32,6 +33,7 @@ def load_config(user_config_path=None) -> Config:
     """
     Load and merge configurations from defaults, a user-provided file, and evironment variables
     """
+    print(user_config_path)
     if user_config_path and os.path.exists(user_config_path):
         config = Config(_env_file=user_config_path)
     else:
```
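For context, the merging behavior `load_config` relies on comes straight from pydantic: keys found in the `_env_file` override the class defaults, and anything missing keeps its default. A minimal sketch, assuming pydantic v1-style `BaseSettings` (in pydantic v2 the same class lives in the separate `pydantic-settings` package):

```python
from pydantic import BaseSettings  # pydantic v2: from pydantic_settings import BaseSettings

class DemoSettings(BaseSettings):  # hypothetical class for illustration
    CHUNKSIZE: int = 100_000
    OUTPUT_PATH: str = "data/output/"

# Keys present in the env file win; missing keys fall back to the defaults above.
settings = DemoSettings(_env_file="user_config.env")  # hypothetical file path
print(settings.CHUNKSIZE, settings.OUTPUT_PATH)
```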
78 changes: 55 additions & 23 deletions docs/README.md
````diff
@@ -163,41 +163,73 @@ Chelombus includes utilities that allows you to monitor the execution time and m
 
 ## **5. Configuration**
 
-### **Default Configuration**
+Chelombus provides flexible configuration management using `pydantic`, allowing users to customize default settings efficiently. The default configuration values are defined in the `Config` class, which inherits from `BaseSettings`. Below is a list of the default values:
 
-The default configuration is defined in `config.py`. Below is an example `user_config.yml` file:
-
-```yaml
-DATA_FILE_PATH: "data/input.csv"
-OUTPUT_FILE_PATH: "results/"
-CHUNKSIZE: 100000
-PCA_N_COMPONENTS: 3
-STEPS_LIST: [50, 50, 50]
-LOGGING_LEVEL: "INFO"
-```
+```python
+BASE_DIR: str = os.getcwd() # Base directory for project execution
+DATA_PATH: str = "data/" # Path for input data
+OUTPUT_PATH: str = "data/output/" # Path for output data
+CHUNKSIZE: int = 100_000 # Size of data chunks for memory-intensive operations
+IPCA_MODEL: str = None # Path to a pre-trained IPCA model (if applicable)
+PCA_N_COMPONENTS: int = 3 # Number of principal components for dimensionality reduction
+STEPS_LIST: List[int] = [50, 50, 50] # Number of buckets per PCA dimension
+N_JOBS: int = os.cpu_count() # Number of CPU cores to utilize (defaults to all cores)
+RANDOM_STATE: int = 42 # Random seed for reproducibility
+TMAP_NAME: str = "tmap" # Name prefix for generated t-maps
+PERMUTATIONS: int = 512 # Number of hashing permutations
+TMAP_K: int = 20 # Number of neighbors considered in t-map
+TMAP_NODE_SIZE: int = 5 # Size of nodes in the t-map visualization
+TMAP_POINT_SCALE: float = 1.0 # Scale factor for t-map points
+LOG_FILE_PATH: str = "logs/app.log" # Log file path
+LOGGING_LEVEL: str = "INFO" # Logging verbosity level
+LOGGING_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" # Logging format
+```
 
-`config_loader.py` in `utils/` dynamically load and merges a user-specified configuration with the defaults.
+### **Modifying Configuration Values**
 
-You can update all scripts or modules that direclty import from config.py to instead use `load_config()`
+Chelombus allows users to customize the configuration in two ways:
 
-```python
-from utils.config_loader import load_config
+---
+
+#### **1. Changing Configuration Values via CLI**
+
+You can override specific default values directly from the command line using flags. For example, to set a custom output directory (`OUTPUT_PATH`), use the corresponding flag, such as `--output-dir`. Only the specified value will be updated; all other settings will retain their defaults.
+
+---
 
-# Load the configuration
-config = load_config("path/to/user_config.yml")
+#### **2. Changing Configuration Values via an `.env` File**
 
-# Example of usage
-output_path = config["OUTPUT_FILE_PATH"]
-chunksize = config["CHUNKSIZE"]
-```
+Alternatively, you can use a configuration file by specifying its path with the `--config` flag. This allows you to define multiple settings in a single file instead of specifying each value via CLI flags.
+
+When using a configuration file:
+- The file should follow the `.env` format, which consists of key-value pairs.
+- Any settings missing from the file will default to the values defined in the `Config` class.
+- Use the same attribute names as in the `Config` class, prefixed with `CHELOMBUS_`.
+
+For example, a `.env` file named `user_config.env` might look like this:
+
+```env
+CHELOMBUS_DATA_PATH=data/input.csv
+CHELOMBUS_OUTPUT_PATH=results/
+CHELOMBUS_CHUNKSIZE=200000
+CHELOMBUS_PCA_N_COMPONENTS=5
+CHELOMBUS_STEPS_LIST=[100, 200, 300]
+CHELOMBUS_LOGGING_LEVEL=DEBUG
+```
 
-### Run Scripts with Custom Config
+#### **Usage Example**
 
-One can also pass the path to the user config file via CLI:
+To load the configuration file, pass it to the `--config` flag:
 
-`python scripts/fingerprint.py --config user_config.yml`
+```bash
+perform-pca --config user_config.env
+```
+
+---
 
-> If no user config is provided, the defaults from `config.py`are used.
+### **YAML Files and Compatibility**
+
+Currently, Chelombus supports `.env` files for configuration. YAML files are not natively supported by `pydantic`. If you prefer YAML, consider converting the file to an `.env` format.
+
+---
````
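The `CHELOMBUS_` prefix the new README describes is the kind of mapping pydantic expresses with its `env_prefix` option; whether Chelombus wires it up exactly this way is an assumption, but a minimal v1-style sketch looks like:

```python
from pydantic import BaseSettings

class ChelombusConfig(BaseSettings):  # hypothetical name for illustration
    CHUNKSIZE: int = 100_000
    LOGGING_LEVEL: str = "INFO"

    class Config:  # pydantic v1 settings options
        env_prefix = "CHELOMBUS_"  # assumed prefix wiring, per the README text

# With CHELOMBUS_CHUNKSIZE=200000 exported (or present in an .env file),
# ChelombusConfig().CHUNKSIZE == 200000.
```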
23 changes: 14 additions & 9 deletions scripts/pca.py
```diff
@@ -34,9 +34,10 @@ def main() -> None:
 
     # Load configuration
     try:
+        logging.info(args.config)
         config = load_config(args.config)
     except ValidationError as e:
-        logging.error(f"COnfiguration error: {e}")
+        logging.error(f"Configuration error: {e}")
         sys.exit(1)
 
     # Override configuration with CLI args if provided
@@ -46,32 +47,29 @@
         "CHUNKSIZE":args.chunksize or config.CHUNKSIZE,
         "PCA_N_COMPONENTS": args.pca_components or config.PCA_N_COMPONENTS,
         "N_JOBS": args.n_jobs or config.N_JOBS,
+        "IPCA_MODEL": args.ipca_model or config.IPCA_MODEL
     })
 
     os.makedirs(config.OUTPUT_PATH, exist_ok=True)
 
     logging.info(f"Input path: {config.DATA_PATH}")
     logging.info(f"Output directory: {config.OUTPUT_PATH}")
+    logging.info(f"IPCA model: {config.IPCA_MODEL}")
     logging.info(f"Chunk size: {config.CHUNKSIZE}")
     logging.info(f"Using {config.N_JOBS} CPU cores")
     logging.info(f"Number of PCA Components: {config.PCA_N_COMPONENTS}")
 
     start = time.time()
 
-    if args.ipca_model != None:
-        try:
-            ipca = joblib.load(args.ipca_model)
-        except Exception as e:
-            logging.error(f"iPCA model {args.ipca_model} could not be loaded. Error {e}")
-            sys.exit(1)
-    else:
+    if config.IPCA_MODEL == None:
         ipca = IncrementalPCA(n_components=config.PCA_N_COMPONENTS)
         for file_path in process_input(config.DATA_PATH):
             logging.info(f"Loading {file_path}")
             # TODO: Think best way to solve when user has different col_index and wants to change it.
             data_handler = DataHandler(file_path=file_path, chunksize=config.CHUNKSIZE, smiles_col_index=1, header=None)
             output_gen = OutputGenerator()
 
-            total_chunks = data_handler.get_total_chunks()
+            total_chunks = data_handler.get_total_chunks(file_path=file_path, chunksize=config.CHUNKSIZE)
 
             start = time.time()
             for idx in tqdm(range(args.resume_chunk,total_chunks), desc="Loading Fingerprints and iPCA partial fitting"):
@@ -89,6 +87,13 @@ def main() -> None:
             end = time.time()
             logging.info(f"iPCA fitting done in {config.DATA_PATH} took {int((end - start) // 3600)} hours, {int(((end - start) % 3600) // 60)} minutes, and {((end - start) % 60):.2f} seconds")
 
+    else:
+        try:
+            ipca = joblib.load(args.ipca_model)
+        except Exception as e:
+            logging.error(f"iPCA model {args.ipca_model} could not be loaded. Error {e}")
+            sys.exit(1)
+
     for file_path in process_input(config.DATA_PATH):
         data_handler = DataHandler(file_path=file_path, chunksize=config.CHUNKSIZE, smiles_col_index=1, header=None) # TODO: Think best way to solve when user has different col_index and wants to change it.
         total_chunks = data_handler.get_total_lines()
```
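The restructured branch reads more naturally: fit a fresh `IncrementalPCA` chunk by chunk when no model is configured, otherwise load the pre-trained one with `joblib`. A minimal sketch of that chunked `partial_fit` pattern, with random arrays standing in for fingerprint chunks:

```python
import numpy as np
from sklearn.decomposition import IncrementalPCA

ipca = IncrementalPCA(n_components=3)
for _ in range(5):                     # one iteration per data chunk
    chunk = np.random.rand(1_000, 64)  # stand-in: 1,000 fingerprints of length 64
    ipca.partial_fit(chunk)            # update the fit without holding all chunks in memory

coords = ipca.transform(np.random.rand(10, 64))  # project new samples onto the 3 components
print(coords.shape)  # (10, 3)
```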
