Skip to content

Commit

Permalink
Work on bbuploader:
Browse files Browse the repository at this point in the history
1) Added clearer overwrite arguments (overwrite, overwrite_bedset).
2) Added a preload option that downloads bed files before uploading.
  • Loading branch information
khoroshevskyi committed Nov 19, 2024
1 parent c0a54f5 commit f674080
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 17 deletions.
26 changes: 25 additions & 1 deletion bedboss/bbuploader/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,18 @@ def upload_all(
None,
help="Reference genome [Default: None] (e.g. hg38) - if None, all genomes will be processed",
),
preload: bool = typer.Option(
True, help="Download bedfile before caching it. [Default: True]"
),
create_bedset: bool = typer.Option(
True, help="Create bedset from bed files. [Default: True]"
),
overwrite: bool = typer.Option(
False, help="Overwrite existing bedfiles. [Default: False]"
),
overwrite_bedset: bool = typer.Option(
False, help="Overwrite existing bedset. [Default: False]"
),
rerun: bool = typer.Option(False, help="Re-run all the samples. [Default: False]"),
run_skipped: bool = typer.Option(
True, help="Run skipped projects. [Default: False]"
Expand Down Expand Up @@ -65,12 +74,15 @@ def upload_all(
download_limit=download_limit,
genome=genome,
create_bedset=create_bedset,
preload=preload,
rerun=rerun,
run_skipped=run_skipped,
run_failed=run_failed,
standardize_pep=standardize_pep,
use_skipper=use_skipper,
reinit_skipper=reinit_skipper,
overwrite=overwrite,
overwrite_bedset=overwrite_bedset,
)


Expand All @@ -88,17 +100,26 @@ def upload_gse(
None,
help=" reference genome to upload to database. If None, all genomes will be processed",
),
preload: bool = typer.Option(
True, help="Download bedfile before caching it. [Default: True]"
),
rerun: bool = typer.Option(True, help="Re-run all the samples. [Default: False]"),
run_skipped: bool = typer.Option(
True, help="Run skipped projects. [Default: False]"
),
run_failed: bool = typer.Option(True, help="Run failed projects. [Default: False]"),
overwrite: bool = typer.Option(
False, help="Overwrite existing bedfiles. [Default: False]"
),
overwrite_bedset: bool = typer.Option(
True, help="Overwrite existing bedset. [Default: False]"
),
standardize_pep: bool = typer.Option(
False, help="Standardize pep with BEDMESS. [Default: False]"
),
use_skipper: bool = typer.Option(
True,
help="Use skipper to skip projects if they were processed locally [Default: False]",
help="Use local skipper to skip projects if they were processed locally [Default: False]",
),
reinit_skipper: bool = typer.Option(
False, help="Reinitialize skipper. [Default: False]"
Expand All @@ -112,12 +133,15 @@ def upload_gse(
gse=gse,
create_bedset=create_bedset,
genome=genome,
preload=preload,
rerun=rerun,
run_skipped=run_skipped,
run_failed=run_failed,
standardize_pep=standardize_pep,
use_skipper=use_skipper,
reinit_skipper=reinit_skipper,
overwrite=overwrite,
overwrite_bedset=overwrite_bedset,
)


Expand Down
2 changes: 1 addition & 1 deletion bedboss/bbuploader/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
PKG_NAME = "bbuploader"

FILE_FOLDER_NAME = "files"
FILE_FOLDER_NAME = "geo_files"

DEFAULT_GEO_TAG = "samples"

Expand Down
56 changes: 44 additions & 12 deletions bedboss/bbuploader/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pephubclient import PEPHubClient
from pephubclient.helpers import MessageHandler
from pephubclient.models import SearchReturnModel
from setuptools.command.egg_info import overwrite_arg
from sqlalchemy import and_, select
from sqlalchemy.orm import Session

Expand All @@ -21,8 +22,10 @@
from bedboss.bedbuncher.bedbuncher import run_bedbuncher
from bedboss.exceptions import BedBossException
from bedboss.utils import standardize_genome_name
from bedboss.utils import standardize_pep as pep_standardizer
from bedboss.utils import standardize_pep as pep_standardizer, download_file
from bedboss.skipper import Skipper
from bedboss.bbuploader.constants import FILE_FOLDER_NAME
from bedboss.bbuploader.utils import create_gsm_sub_name

_LOGGER = logging.getLogger(PKG_NAME)
_LOGGER.setLevel(logging.DEBUG)
Expand All @@ -38,12 +41,15 @@ def upload_all(
download_limit: int = 100,
genome: str = None,
create_bedset: bool = True,
preload=True,
rerun: bool = False,
run_skipped: bool = False,
run_failed: bool = True,
standardize_pep: bool = False,
use_skipper=True,
reinit_skipper=False,
overwrite=False,
overwrite_bedset=False,
):
"""
This is main function that is responsible for processing bed files from PEPHub.
Expand All @@ -57,9 +63,10 @@ def upload_all(
:param download_limit: limit of GSE projects to be downloaded (used for testing purposes) [Default: 100]
:param genome: reference genome [Default: None] (e.g. hg38) - if None, all genomes will be processed
:param create_bedset: create bedset from bed files
:param rerun: rerun processing of the series
:param run_skipped: rerun files that were skipped
:param run_failed: rerun failed files
:param preload: pre - download files to the local folder (used for faster reproducibility)
:param rerun: rerun processing of the series. Used in logging system. If you want to reupload file use overwrite
:param run_skipped: rerun files that were skipped. Used in logging system. If you want to reupload file use overwrite
:param run_failed: rerun failed files. Used in logging system. If you want to reupload file use overwrite
:param standardize_pep: standardize pep metadata using BEDMS
:param use_skipper: use skipper to skip already processed logged locally. Skipper creates local log of processed
and failed files.
Expand Down Expand Up @@ -137,9 +144,12 @@ def upload_all(
sa_session=session,
gse_status_sa_model=gse_status,
standardize_pep=standardize_pep,
rerun=rerun,
# rerun=rerun,
use_skipper=use_skipper,
reinit_skipper=reinit_skipper,
preload=preload,
overwrite=overwrite,
overwrite_bedset=overwrite_bedset,
)
except Exception as err:
_LOGGER.error(
Expand Down Expand Up @@ -260,12 +270,15 @@ def upload_gse(
outfolder: str = os.getcwd(),
create_bedset: bool = True,
genome: str = None,
preload: bool = True,
rerun: bool = False,
run_skipped: bool = False,
run_failed: bool = True,
standardize_pep: bool = False,
use_skipper=True,
reinit_skipper=False,
overwrite=False,
overwrite_bedset=False,
):
"""
Upload bed files from GEO series to BedBase
Expand All @@ -275,13 +288,16 @@ def upload_gse(
:param outfolder: working directory, where files will be downloaded, processed and statistics will be saved
:param create_bedset: create bedset from bed files
:param genome: reference genome to upload to database. If None, all genomes will be processed
:param preload: pre - download files to the local folder (used for faster reproducibility)
:param rerun: rerun processing of the series
:param run_skipped: rerun files that were skipped
:param run_failed: rerun failed files
:param standardize_pep: standardize pep metadata using BEDMS
:param use_skipper: use skipper to skip already processed logged locally. Skipper creates local log of processed
and failed files.
:param reinit_skipper: reinitialize skipper, if set to True, skipper will be reinitialized and all logs files will be cleaned
:param overwrite: overwrite existing bedfiles
:param overwrite_bedset: overwrite existing bedset
:return: None
"""
Expand Down Expand Up @@ -327,7 +343,9 @@ def upload_gse(
sa_session=session,
gse_status_sa_model=gse_status,
standardize_pep=standardize_pep,
rerun=rerun,
preload=preload,
overwrite=overwrite,
overwrite_bedset=overwrite_bedset,
use_skipper=use_skipper,
reinit_skipper=reinit_skipper,
)
Expand Down Expand Up @@ -376,9 +394,11 @@ def _upload_gse(
sa_session: Session = None,
gse_status_sa_model: GeoGseStatus = None,
standardize_pep: bool = False,
rerun: bool = False,
overwrite: bool = False,
overwrite_bedset: bool = False,
use_skipper: bool = True,
reinit_skipper: bool = False,
preload: bool = True,
) -> ProjectProcessingStatus:
"""
Upload bed files from GEO series to BedBase
Expand All @@ -391,11 +411,12 @@ def _upload_gse(
:param sa_session: opened session to the database
:param gse_status_sa_model: sqlalchemy model for project status
:param standardize_pep: standardize pep metadata using BEDMS
:param rerun: force overwrite data in the database
:param overwrite: overwrite existing bedfiles
:param overwrite_bedset: overwrite existing bedset
:param use_skipper: use skipper to skip already processed logged locally. Skipper creates local log of processed
and failed files.
:param reinit_skipper: reinitialize skipper, if set to True, skipper will be reinitialized and all logs will be cleaned
:param preload: pre - download files to the local folder (used for faster reproducibility)
:return: None
"""
if isinstance(bedbase_config, str):
Expand Down Expand Up @@ -486,10 +507,21 @@ def _upload_gse(
sample_status.status = STATUS.PROCESSING
sa_session.commit()

if preload:
gsm_folder = create_gsm_sub_name(sample_gsm)
files_path = os.path.join(outfolder, FILE_FOLDER_NAME, gsm_folder)
os.makedirs(files_path, exist_ok=True)
file_abs_path = os.path.abspath(
os.path.join(files_path, project_sample.file)
)
download_file(project_sample.file_url, file_abs_path, no_fail=True)
else:
file_abs_path = required_metadata.file_path

try:
file_digest = run_all(
name=required_metadata.title,
input_file=required_metadata.file_path,
input_file=file_abs_path,
input_type=required_metadata.type,
outfolder=os.path.join(outfolder, "outputs"),
genome=required_metadata.ref_genome,
Expand All @@ -499,7 +531,7 @@ def _upload_gse(
upload_pephub=True,
upload_s3=True,
upload_qdrant=True,
force_overwrite=rerun,
force_overwrite=overwrite,
)
uploaded_files.append(file_digest)
if skipper_obj:
Expand Down Expand Up @@ -530,7 +562,7 @@ def _upload_gse(
upload_pephub=True,
upload_s3=True,
no_fail=True,
force_overwrite=rerun,
force_overwrite=overwrite_bedset,
)

else:
Expand Down
24 changes: 23 additions & 1 deletion bedboss/bbuploader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
_LOGGER = logging.getLogger(PKG_NAME)


# This function is not used in the code anymore
def download_file(file_url: str, local_file_path: str, force: bool = False) -> None:
"""
Download file using ftp url
Expand All @@ -22,3 +21,26 @@ def download_file(file_url: str, local_file_path: str, force: bool = False) -> N
urllib.request.urlretrieve(file_url, local_file_path)
else:
_LOGGER.info(f"File {local_file_path} already exists. Skipping downloading.")


def create_gsm_sub_name(name: str) -> str:
    """
    Create gsm subfolder name.

    The last three characters of the name are replaced with "nnn". Names that
    are six characters long or shorter all map to the single "gsmnnn" folder.
    e.g.
        gsm1234567 -> gsm1234nnn
        gsm1234 -> gsm1nnn
        gsm123 -> gsmnnn
        gsm1 -> gsmnnn

    ! This function was copied from geopephub utils

    :param name: gsm name
    :return: gsm subfolder name
    """
    # Short names have nothing left before the masked part, so they share
    # one fixed folder.
    if len(name) <= 6:
        return "gsmnnn"

    # Mask the last three characters; the original casing of the prefix is kept.
    return name[:-3] + "nnn"
23 changes: 21 additions & 2 deletions scripts/bbuploader/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,13 @@ def another_test():
from bedboss.bbuploader.main import upload_gse

upload_gse(
gse="gse218680",
# gse="gse246900",
# gse="gse247593",
# gse="gse241222",
# gse="gse266130",
# gse="gse209627",
gse="gse266949",
# gse="gse266949",
# gse="gse240325", # TODO: check if qc works
# gse="gse229592", # mice
bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
Expand All @@ -47,7 +48,25 @@ def another_test():
)


def upload_time():
    """
    Run `upload_all` with a fixed set of hard-coded arguments.

    Uses the config and output folder paths written below, starts from
    2024/06/01 (no end date is passed), restricts processing to hg38 and
    turns on both `rerun` and `run_skipped`.

    :return: None
    """
    from bedboss.bbuploader.main import upload_all

    # Collected first so the whole run configuration can be read in one place.
    run_options = {
        "bedbase_config": "/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
        "outfolder": "/home/bnt4me/virginia/repos/bbuploader/data",
        "start_date": "2024/06/01",
        "search_limit": 1000,
        "download_limit": 10000,
        "search_offset": 0,
        "genome": "hg38",
        "rerun": True,
        "run_skipped": True,
    }

    upload_all(**run_options)


if __name__ == "__main__":
# runn()

another_test()
# another_test()
upload_time()

0 comments on commit f674080

Please sign in to comment.