diff --git a/.gitignore b/.gitignore index 907f3fd..db6d490 100644 --- a/.gitignore +++ b/.gitignore @@ -593,4 +593,5 @@ scripts/*.json *stats_*.json !build.rs .ruff_cache -hg_deepchopper \ No newline at end of file +hg_deepchopper +analysis_data \ No newline at end of file diff --git a/README.md b/README.md index ab83f60..da22f13 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ # logo **DeepChopper** [![social](https://img.shields.io/github/stars/ylab-hi/DeepChopper?style=social)](https://github.com/ylab-hi/DeepChopper/stargazers) [![pypi](https://img.shields.io/pypi/v/deepchopper.svg)](https://pypi.python.org/pypi/deepchopper) +[![PyPI - Wheel](https://img.shields.io/pypi/wheel/deepchopper)](https://pypi.org/project/deepchopper/#files) [![license](https://img.shields.io/pypi/l/deepchopper.svg)](https://github.com/ylab-hi/DeepChopper/blob/main/LICENSE) [![pypi version](https://img.shields.io/pypi/pyversions/deepchopper.svg)](https://pypi.python.org/pypi/deepbiop) -[![Actions status](https://github.com/ylab-hi/DeepChopper/actions/workflows/release-python.yml/badge.svg)](https://github.com/ylab-hi/DeepChopper/actions) [![platform](https://img.shields.io/badge/platform-linux%20%7C%20osx%20%7C%20win-blue)](https://pypi.org/project/deepchopper/#files) +[![Actions status](https://github.com/ylab-hi/DeepChopper/actions/workflows/release-python.yml/badge.svg)](https://github.com/ylab-hi/DeepChopper/actions) [![Space](https://huggingface.co/datasets/huggingface/badges/resolve/main/open-in-hf-spaces-md.svg)](https://huggingface.co/spaces/yangliz5/deepchopper) diff --git a/deepchopper/cli.py b/deepchopper/cli.py index dc7d52e..8a10fe4 100644 --- a/deepchopper/cli.py +++ b/deepchopper/cli.py @@ -66,6 +66,9 @@ def encode( if verbose: set_logging_level(logging.INFO) + if isinstance(fastq_path, str): + fastq_path = Path(fastq_path) + if not fastq_path.exists(): msg = f"Folder {fastq_path} does not exist." logging.error(msg) @@ -109,6 +112,9 @@ def predict( if verbose: set_logging_level(logging.INFO) + if isinstance(data_path, str): + data_path = Path(data_path) + tokenizer = deepchopper.models.llm.load_tokenizer_from_hyena_model(model_name="hyenadna-small-32k-seqlen") datamodule: LightningDataModule = deepchopper.data.fq_datamodule.FqDataModule( train_data_path="dummy.parquet", @@ -129,7 +135,7 @@ def predict( trainer = lightning.pytorch.trainer.Trainer( accelerator=accelerator, - devices=gpus, + devices=devices, callbacks=callbacks, deterministic=False, logger=False, @@ -141,19 +147,17 @@ def predict( def chop( predicts: list[Path] = typer.Argument(..., help="Paths to prediction files"), fq: Path = typer.Argument(..., help="Path to FASTQ file"), - smooth_window_size: int = typer.Option(21, "--smooth-window", "-s", help="Smooth window size"), - min_interval_size: int = typer.Option(13, "--min-interval", "-i", help="Minimum interval size"), - approved_interval_number: int = typer.Option(20, "--approved-intervals", "-a", help="Number of approved intervals"), - max_process_intervals: int = typer.Option(4, "--max-process", "-p", help="Maximum process intervals"), - min_read_length_after_chop: int = typer.Option( - 20, "--min-read-length", "-l", help="Minimum read length after chop" - ), - output_chopped_seqs: bool = typer.Option(False, "--output-chopped", "-o", help="Output chopped sequences"), - chop_type: str = typer.Option("all", "--chop-type", "-t", help="Chop type"), - threads: int = typer.Option(2, "--threads", "-n", help="Number of threads"), - output_prefix: str | None = typer.Option(None, "--prefix", "-x", help="Output prefix"), - max_batch_size: int | None = typer.Option(None, "--max-batch", "-b", help="Maximum batch size"), + smooth_window_size: int = typer.Option(21, "--smooth-window", help="Smooth window size"), + min_interval_size: int = typer.Option(13, "--min-interval-size", help="Minimum interval size"), + approved_interval_number: int = typer.Option(20, "--approved-intervals", help="Number of approved intervals"), + max_process_intervals: int = typer.Option(4, "--max-process-intervals", help="Maximum process intervals"), + min_read_length_after_chop: int = typer.Option(20, "--min-read-length", help="Minimum read length after chop"), + output_chopped_seqs: bool = typer.Option(False, "--output-chopped", help="Output chopped sequences"), + chop_type: str = typer.Option("all", "--chop-type", help="Chop type"), + threads: int = typer.Option(2, "--threads", help="Number of threads"), verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output"), + output_prefix: str | None = typer.Option(None, "--prefix", "-o", help="Output prefix"), + max_batch_size: int | None = typer.Option(None, "--max-batch", help="Maximum batch size"), ): """Chop sequences based on predictions.""" if verbose: @@ -168,37 +172,19 @@ def chop( predict_files = " ".join([f"--pdt {predict}" for predict in predicts]) - commands = [ - [ - "deepchopper-chop", - predict_files, - "--fq", - fq, - "-t", - threads, - "-s", - smooth_window_size, - "--mis", - min_interval_size, - "-a", - approved_interval_number, - "--mpi", - max_process_intervals, - "--mcr", - min_read_length_after_chop, - "--ocq", - output_chopped_seqs, - "--ct", - chop_type, - "-o", - output_prefix, - "-m", - max_batch_size, - ], - ] + command = f"deepchopper-chop {predict_files} --fq {fq} -t {threads} -s {smooth_window_size} --mis {min_interval_size} -a {approved_interval_number} --mpi {max_process_intervals} --mcr {min_read_length_after_chop} --ct {chop_type} " + + if output_chopped_seqs: + command += "--ocq " + + if output_prefix is not None: + command += f"-o {output_prefix} " + + if max_batch_size is not None: + command += f"-m {max_batch_size} " try: - subprocess.run(commands, check=True) + subprocess.run(command.split(), check=True) except subprocess.CalledProcessError as e: logging.error(f"Error: Chopping failed with exit code {e.returncode}") raise e