Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Requests #15

Closed
wants to merge 13 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,7 @@ pipeline_cache/
app_sources/
!app_sources/annotated_inputs/.gitkeep
!app_sources/coordinate_files/.gitkeep
!app_sources/immunopeptidomes/.gitkeep
!app_sources/immunopeptidomes/.gitkeep

# Ignore release cache file
src/SOPRANO/data/ensembl.releases
5 changes: 3 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/python/black
rev: 23.7.0
rev: 23.9.1
hooks:
- id: black
pass_filenames: true
Expand All @@ -13,7 +13,8 @@ repos:
rev: 'v1.5.1'
hooks:
- id: mypy
additional_dependencies: ['types-requests']
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.287
rev: v0.0.292
hooks:
- id: ruff
22 changes: 12 additions & 10 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,28 @@ description = "SOPRANO: Selection On PRotein ANnotated regiOns"
readme = "README.md"
authors = [
{ name = "Luis Zapata", email = "[email protected]" },
{ name = "ICR Scientific Software Group", email = "[email protected]"},
{ name = "ICR Scientific Software Group", email = "[email protected]" },
]
maintainers = [
{ name = "Luis Zapata", email = "[email protected]" },
{ name = "ICR Scientific Software Group", email = "[email protected]"},
{ name = "ICR Scientific Software Group", email = "[email protected]" },
]
requires-python = ">=3.8"
dependencies = [
"pandas",
"numpy",
"streamlit == 1.27.0"
"streamlit == 1.27.0",
"requests",
"types-requests"
]
classifiers = [
"Development Status :: 3 - Alpha",
"License :: OSI Approved :: BSD License",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.11",
"Programming Language :: Perl",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Intended Audience :: Science/Research"
"Development Status :: 3 - Alpha",
"License :: OSI Approved :: BSD License",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.11",
"Programming Language :: Perl",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Intended Audience :: Science/Research"
]
dynamic = ["version"]

Expand Down
14 changes: 1 addition & 13 deletions src/SOPRANO/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,22 +248,10 @@ def with_tab_genomes(tab: DeltaGenerator):
if st.button("Download", disabled=True):
# TODO: Probably easiest to rewrite the download via the
# requests library. Getting a bit convoluted with
# shell. See comment below
# shell. See utils.url_utils
pass


# def download_file(url):
# response = requests.get(url)
# if "content-disposition" in response.headers:
# content_disposition = response.headers["content-disposition"]
# filename = content_disposition.split("filename=")[1]
# else:
# filename = url.split("/")[-1]
# with open(filename, mode="wb") as file:
# file.write(response.content)
# print(f"Downloaded file {filename}")


def with_tab_annotator(tab: DeltaGenerator):
with tab:
st.title("Annotate VCF File")
Expand Down
202 changes: 202 additions & 0 deletions src/SOPRANO/core/objects.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,20 @@
import pathlib
from argparse import Namespace
from dataclasses import dataclass
from typing import Set

from SOPRANO.utils.path_utils import Directories, genome_pars_to_paths
from SOPRANO.utils.url_utils import (
build_ensembl_urls,
check_ensembl_file_url,
compute_chrom_sizes,
compute_fasta_index,
decompress,
download_from_url,
filename_from_url,
find_earliest_release,
find_latest_release,
)


@dataclass(frozen=True)
Expand Down Expand Up @@ -254,3 +266,193 @@ def from_namespace(cls, namespace: Namespace):

class SOPRANOError(Exception):
    """Exception type specific to the SOPRANO package."""


class _GatherReferences:
# Urls
toplevel_url: str
primary_assembly_url: str

# Params
species: str
reference: str

# Status
toplevel_gz_done: Set[int] = set()
toplevel_fa_done: Set[int] = set()
toplevel_fai_done: Set[int] = set()
primary_assembly_gz_done: Set[int] = set()
primary_assembly_fa_done: Set[int] = set()
primary_assembly_fai_done: Set[int] = set()
sizes_done: Set[int] = set()

def _dest_directory(self, release: int):
return Directories.data(self.species) / f"{release}_{self.reference}"

def _dest_fa_gz(self, release: int, _toplevel: bool):
return self._dest_directory(release) / filename_from_url(
self.toplevel_url if _toplevel else self.primary_assembly_url
).format(RELEASE=release)

def _dest_fa(self, release: int, _toplevel: bool):
return self._dest_fa_gz(release, _toplevel).with_suffix("")

def _dest_fai(self, release: int, _toplevel: bool):
return self._dest_fa_gz(release, _toplevel).with_suffix(".fai")

def dest_chrom(self, release: int, _toplevel: bool):
return self._dest_fa(release, _toplevel).with_suffix(".chrom")

def toplevel_fa_gz_path(self, release: int):
return self._dest_fa_gz(release, _toplevel=True)

def toplevel_fa_path(self, release: int):
return self._dest_fa(release, _toplevel=True)

def toplevel_fai_path(self, release: int):
return self._dest_fai(release, _toplevel=True)

def toplevel_chrom_path(self, release: int):
return self.toplevel_fa_path(release).with_suffix(".chrom")

def primary_assembly_fa_gz_path(self, release: int):
return self._dest_fa_gz(release, _toplevel=False)

def primary_assembly_fa_path(self, release: int):
return self._dest_fa(release, _toplevel=False)

def primary_assembly_fai_path(self, release: int):
return self._dest_fai(release, _toplevel=False)

def _download(self, release: int, _toplevel):
if _toplevel:
source_url = self.toplevel_url
dest_path = self.toplevel_fa_gz_path(release)
decompressed_path = self.toplevel_fa_path(release)
else:
source_url = self.primary_assembly_url
dest_path = self.primary_assembly_fa_gz_path(release)
decompressed_path = self.toplevel_fa_path(release)

if not (decompressed_path.exists() or dest_path.exists()):
dest_path.parent.mkdir(parents=True, exist_ok=True)
check_ensembl_file_url(source_url, release)
download_from_url(
source_url.format(RELEASE=release),
target_path=dest_path,
)

def _check_release_ok(self, release):
min_release = find_earliest_release(self.toplevel_url)
max_release = find_latest_release(self.toplevel_url)

if not (min_release <= release <= max_release):
raise ValueError(release)

def download_toplevel(self, release):
if release not in self.toplevel_gz_done:
self._check_release_ok(release)

if not self.toplevel_fa_gz_path(release).exists():
self._download(release, _toplevel=True)

self.toplevel_gz_done.add(release)

def download_primary_assembly(self, release):
if release not in self.primary_assembly_gz_done:
self._check_release_ok(release)

if self.primary_assembly_fa_gz_path(release).exists():
self._download(release, _toplevel=False)

self.primary_assembly_gz_done.add(release)

def decompress_toplevel(self, release):
if release not in self.toplevel_fa_done:
if not self.toplevel_fa_path(release).exists():
decompress(self.toplevel_fa_gz_path(release))

self.toplevel_fa_done.add(release)

def decompress_primary_assembly(self, release):
if release not in self.primary_assembly_fa_done:
if not self.primary_assembly_fa_path(release).exists():
decompress(self.primary_assembly_fa_gz_path(release))

self.primary_assembly_fa_done.add(release)

def compute_chrom_sizes(self, release):
if release not in self.sizes_done:
if not self.toplevel_chrom_path(release).exists():
compute_chrom_sizes(self.toplevel_fai_path(release))

self.sizes_done.add(release)

def compute_fasta_index_toplevel(self, release):
if release not in self.toplevel_fai_done:
if not self.toplevel_fai_path(release).exists():
compute_fasta_index(self.toplevel_fa_path(release))

self.toplevel_fai_done.add(release)

def compute_fasta_index_primary_assembly(self, release):
if release not in self.primary_assembly_fai_done:
if not self.primary_assembly_fai_path(release).exists():
compute_fasta_index(self.primary_assembly_fa_path(release))

self.primary_assembly_fai_done.add(release)

def compute_all_toplevel(self, release):
self.download_toplevel(release)
self.decompress_toplevel(release)
self.compute_fasta_index_toplevel(release)
self.compute_chrom_sizes(release)

def compute_all_primary_assembly(self, release):
self.download_primary_assembly(release)
self.decompress_primary_assembly(release)
self.compute_fasta_index_primary_assembly(release)

def get_genome_reference_paths(self, release):
return GenomePaths(
sizes=self.toplevel_chrom_path(release),
fasta=self.toplevel_fa_path(release),
)


class EnsemblData(_GatherReferences):
    """Reference-gathering object for a given species and genome reference.

    The toplevel and primary assembly URL templates are either looked up via
    ``build_ensembl_urls`` or, when ``_init_urls`` is false, left for the
    caller to assign afterwards.
    """

    def __init__(self, species: str, reference: str, _init_urls=True):
        self.species = species
        self.reference = reference

        if not _init_urls:
            # Caller takes responsibility for assigning the URL templates.
            return

        urls = build_ensembl_urls(species, reference)
        self.toplevel_url = urls["toplevel"]
        self.primary_assembly_url = urls["primary_assembly"]

    @classmethod
    def homo_sapiens_GRCh38(cls):
        """Build the object for homo_sapiens / GRCh38 with looked-up URLs."""
        return cls("homo_sapiens", "GRCh38")

    @classmethod
    def homo_sapiens_GRCh37(cls):
        """Build the object for homo_sapiens / GRCh37.

        GRCh37 has a deviant URL structure, so the URL templates are set
        manually here instead of going through ``build_ensembl_urls``.
        """
        url_prefix = (
            "https://ftp.ensembl.org/pub/grch37/release-{RELEASE}/"
            "fasta/homo_sapiens/dna/"
        )

        instance = cls("homo_sapiens", "GRCh37", _init_urls=False)
        instance.toplevel_url = (
            url_prefix + "Homo_sapiens.GRCh37.dna.toplevel.fa.gz"
        )
        instance.primary_assembly_url = (
            url_prefix + "Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz"
        )
        return instance
Loading
Loading