feat pydantic .env file config support
afloresep committed Nov 28, 2024 · 1 parent 107a704 · commit 50a7dea
Showing 4 changed files with 73 additions and 34 deletions.
4 changes: 2 additions & 2 deletions chelombus/data_handler.py
```diff
@@ -12,8 +12,9 @@ def __init__(self, file_path, chunksize, smiles_col_index=0, header=0):
         self.smiles_col_index= smiles_col_index
         self.header = header
         self.datatype = find_input_type(file_path)
 
-    def get_total_chunks(self, file_path, chunksize):
+    @staticmethod
+    def get_total_chunks(file_path, chunksize):
         """
         Calculate number of chunks based on self.chunksize for tqdm
         Maybe avoid for files that are too large >150 GB? Takes about ~2 minutes for such size
@@ -28,7 +29,6 @@ def get_total_lines(self):
         """Calculate the total number of lines in the file."""
         with open(self.file_path, 'r', encoding='utf-8') as f:
             return sum(1 for _ in f)
-
 
     def load_data(self):
         """Returns correct generator to load data based on file input type"""
```
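Since `get_total_chunks` no longer touches instance state, making it a `@staticmethod` with explicit `file_path` and `chunksize` arguments is a sensible change. A minimal sketch of the chunk-count arithmetic the method implies — a hypothetical stand-in, not the project's actual implementation:

```python
import math

# Hypothetical stand-in for DataHandler.get_total_chunks: count the lines
# in a plain-text file, then round up to whole chunks.
def get_total_chunks(file_path: str, chunksize: int) -> int:
    with open(file_path, "r", encoding="utf-8") as f:
        total_lines = sum(1 for _ in f)
    return math.ceil(total_lines / chunksize)

# The static method can now be called without constructing a DataHandler:
# total = DataHandler.get_total_chunks("data/input.csv", chunksize=100_000)
```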
2 changes: 2 additions & 0 deletions chelombus/utils/config_loader.py
```diff
@@ -10,6 +10,7 @@ class Config(BaseSettings):
     DATA_PATH: str = "data/"
     OUTPUT_PATH: str = "data/output/"
     CHUNKSIZE: int = 100_000
+    IPCA_MODEL: str = None
     PCA_N_COMPONENTS: int = 3
     STEPS_LIST: List[int] = [50, 50, 50]
     N_JOBS: int = os.cpu_count()
@@ -32,6 +33,7 @@ def load_config(user_config_path=None) -> Config:
     """
     Load and merge configurations from defaults, a user-provided file, and evironment variables
     """
+    print(user_config_path)
     if user_config_path and os.path.exists(user_config_path):
         config = Config(_env_file=user_config_path)
     else:
```
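For context, the merging behavior `load_config` relies on comes straight from pydantic: keys found in the `_env_file` override the class defaults, and anything missing keeps its default. A minimal sketch, assuming pydantic v1-style `BaseSettings` (in pydantic v2 the same class lives in the separate `pydantic-settings` package):

```python
from pydantic import BaseSettings  # pydantic v2: from pydantic_settings import BaseSettings

class DemoSettings(BaseSettings):  # hypothetical class for illustration
    CHUNKSIZE: int = 100_000
    OUTPUT_PATH: str = "data/output/"

# Keys present in the env file win; missing keys fall back to the defaults above.
settings = DemoSettings(_env_file="user_config.env")  # hypothetical file path
print(settings.CHUNKSIZE, settings.OUTPUT_PATH)
```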
78 changes: 55 additions & 23 deletions docs/README.md
````diff
@@ -163,41 +163,73 @@ Chelombus includes utilities that allows you to monitor the execution time and m
 
 ## **5. Configuration**
 
-### **Default Configuration**
+Chelombus provides flexible configuration management using `pydantic`, allowing users to customize default settings efficiently. The default configuration values are defined in the `Config` class, which inherits from `BaseSettings`. Below is a list of the default values:
 
-The default configuration is defined in `config.py`. Below is an example `user_config.yml` file:
-
-```yaml
-DATA_FILE_PATH: "data/input.csv"
-OUTPUT_FILE_PATH: "results/"
-CHUNKSIZE: 100000
-PCA_N_COMPONENTS: 3
-STEPS_LIST: [50, 50, 50]
-LOGGING_LEVEL: "INFO"
-```
+```python
+BASE_DIR: str = os.getcwd() # Base directory for project execution
+DATA_PATH: str = "data/" # Path for input data
+OUTPUT_PATH: str = "data/output/" # Path for output data
+CHUNKSIZE: int = 100_000 # Size of data chunks for memory-intensive operations
+IPCA_MODEL: str = None # Path to a pre-trained IPCA model (if applicable)
+PCA_N_COMPONENTS: int = 3 # Number of principal components for dimensionality reduction
+STEPS_LIST: List[int] = [50, 50, 50] # Number of buckets per PCA dimension
+N_JOBS: int = os.cpu_count() # Number of CPU cores to utilize (defaults to all cores)
+RANDOM_STATE: int = 42 # Random seed for reproducibility
+TMAP_NAME: str = "tmap" # Name prefix for generated t-maps
+PERMUTATIONS: int = 512 # Number of hashing permutations
+TMAP_K: int = 20 # Number of neighbors considered in t-map
+TMAP_NODE_SIZE: int = 5 # Size of nodes in the t-map visualization
+TMAP_POINT_SCALE: float = 1.0 # Scale factor for t-map points
+LOG_FILE_PATH: str = "logs/app.log" # Log file path
+LOGGING_LEVEL: str = "INFO" # Logging verbosity level
+LOGGING_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" # Logging format
+```
 
-`config_loader.py` in `utils/` dynamically load and merges a user-specified configuration with the defaults.
+### **Modifying Configuration Values**
 
-You can update all scripts or modules that direclty import from config.py to instead use `load_config()`
+Chelombus allows users to customize the configuration in two ways:
 
-```python
-from utils.config_loader import load_config
+---
+
+#### **1. Changing Configuration Values via CLI**
+
+You can override specific default values directly from the command line using flags. For example, to set a custom output directory (`OUTPUT_PATH`), use the corresponding flag, such as `--output-dir`. Only the specified value will be updated; all other settings will retain their defaults.
+
+---
 
-# Load the configuration
-config = load_config("path/to/user_config.yml")
+#### **2. Changing Configuration Values via an `.env` File**
 
-# Example of usage
-output_path = config["OUTPUT_FILE_PATH"]
-chunksize = config["CHUNKSIZE"]
-```
+Alternatively, you can use a configuration file by specifying its path with the `--config` flag. This allows you to define multiple settings in a single file instead of specifying each value via CLI flags.
+
+When using a configuration file:
+- The file should follow the `.env` format, which consists of key-value pairs.
+- Any settings missing from the file will default to the values defined in the `Config` class.
+- Use the same attribute names as in the `Config` class, prefixed with `CHELOMBUS_`.
+
+For example, a `.env` file named `user_config.env` might look like this:
+
+```env
+CHELOMBUS_DATA_PATH=data/input.csv
+CHELOMBUS_OUTPUT_PATH=results/
+CHELOMBUS_CHUNKSIZE=200000
+CHELOMBUS_PCA_N_COMPONENTS=5
+CHELOMBUS_STEPS_LIST=[100, 200, 300]
+CHELOMBUS_LOGGING_LEVEL=DEBUG
+```
 
-### Run Scripts with Custom Config
+#### **Usage Example**
 
-One can also pass the path to the user config file via CLI:
+To load the configuration file, pass it to the `--config` flag:
 
-`python scripts/fingerprint.py --config user_config.yml`
+```bash
+perform-pca --config user_config.env
+```
+
+---
 
-> If no user config is provided, the defaults from `config.py`are used.
+### **YAML Files and Compatibility**
+
+Currently, Chelombus supports `.env` files for configuration. YAML files are not natively supported by `pydantic`. If you prefer YAML, consider converting the file to an `.env` format.
+
+---
````
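The `CHELOMBUS_` prefix the new README describes is the kind of mapping pydantic expresses with its `env_prefix` option; whether Chelombus wires it up exactly this way is an assumption, but a minimal v1-style sketch looks like:

```python
from pydantic import BaseSettings

class ChelombusConfig(BaseSettings):  # hypothetical name for illustration
    CHUNKSIZE: int = 100_000
    LOGGING_LEVEL: str = "INFO"

    class Config:  # pydantic v1 settings options
        env_prefix = "CHELOMBUS_"  # assumed prefix wiring, per the README text

# With CHELOMBUS_CHUNKSIZE=200000 exported (or present in an .env file),
# ChelombusConfig().CHUNKSIZE == 200000.
```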
23 changes: 14 additions & 9 deletions scripts/pca.py
```diff
@@ -34,9 +34,10 @@ def main() -> None:
 
     # Load configuration
     try:
+        logging.info(args.config)
         config = load_config(args.config)
     except ValidationError as e:
-        logging.error(f"COnfiguration error: {e}")
+        logging.error(f"Configuration error: {e}")
         sys.exit(1)
 
     # Override configuration with CLI args if provided
@@ -46,32 +47,29 @@
         "CHUNKSIZE":args.chunksize or config.CHUNKSIZE,
         "PCA_N_COMPONENTS": args.pca_components or config.PCA_N_COMPONENTS,
         "N_JOBS": args.n_jobs or config.N_JOBS,
+        "IPCA_MODEL": args.ipca_model or config.IPCA_MODEL
     })
 
     os.makedirs(config.OUTPUT_PATH, exist_ok=True)
 
     logging.info(f"Input path: {config.DATA_PATH}")
     logging.info(f"Output directory: {config.OUTPUT_PATH}")
+    logging.info(f"IPCA model: {config.IPCA_MODEL}")
     logging.info(f"Chunk size: {config.CHUNKSIZE}")
     logging.info(f"Using {config.N_JOBS} CPU cores")
     logging.info(f"Number of PCA Components: {config.PCA_N_COMPONENTS}")
 
     start = time.time()
 
-    if args.ipca_model != None:
-        try:
-            ipca = joblib.load(args.ipca_model)
-        except Exception as e:
-            logging.error(f"iPCA model {args.ipca_model} could not be loaded. Error {e}")
-            sys.exit(1)
-    else:
+    if config.IPCA_MODEL == None:
         ipca = IncrementalPCA(n_components=config.PCA_N_COMPONENTS)
         for file_path in process_input(config.DATA_PATH):
             logging.info(f"Loading {file_path}")
             # TODO: Think best way to solve when user has different col_index and wants to change it.
             data_handler = DataHandler(file_path=file_path, chunksize=config.CHUNKSIZE, smiles_col_index=1, header=None)
             output_gen = OutputGenerator()
 
-            total_chunks = data_handler.get_total_chunks()
+            total_chunks = data_handler.get_total_chunks(file_path=file_path, chunksize=config.CHUNKSIZE)
 
             start = time.time()
             for idx in tqdm(range(args.resume_chunk,total_chunks), desc="Loading Fingerprints and iPCA partial fitting"):
@@ -89,6 +87,13 @@ def main() -> None:
             end = time.time()
             logging.info(f"iPCA fitting done in {config.DATA_PATH} took {int((end - start) // 3600)} hours, {int(((end - start) % 3600) // 60)} minutes, and {((end - start) % 60):.2f} seconds")
 
+    else:
+        try:
+            ipca = joblib.load(args.ipca_model)
+        except Exception as e:
+            logging.error(f"iPCA model {args.ipca_model} could not be loaded. Error {e}")
+            sys.exit(1)
+
     for file_path in process_input(config.DATA_PATH):
         data_handler = DataHandler(file_path=file_path, chunksize=config.CHUNKSIZE, smiles_col_index=1, header=None) # TODO: Think best way to solve when user has different col_index and wants to change it.
         total_chunks = data_handler.get_total_lines()
```
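The restructured branch reads more naturally: fit a fresh `IncrementalPCA` chunk by chunk when no model is configured, otherwise load the pre-trained one with `joblib`. A minimal sketch of that chunked `partial_fit` pattern, with random arrays standing in for fingerprint chunks:

```python
import numpy as np
from sklearn.decomposition import IncrementalPCA

ipca = IncrementalPCA(n_components=3)
for _ in range(5):                     # one iteration per data chunk
    chunk = np.random.rand(1_000, 64)  # stand-in: 1,000 fingerprints of length 64
    ipca.partial_fit(chunk)            # update the fit without holding all chunks in memory

coords = ipca.transform(np.random.rand(10, 64))  # project new samples onto the 3 components
print(coords.shape)  # (10, 3)
```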
