Skip to content

Commit

Permalink
Merge branch 'updates-for-ijv' of https://github.com/RNAcentral/rnace…
Browse files Browse the repository at this point in the history
…ntral-import-pipeline into dev

# Conflicts:
#	.pre-commit-config.yaml
#	poetry.lock
#	rnacentral_pipeline/databases/data/regions.py
  • Loading branch information
carlosribas committed Jan 9, 2024
2 parents 12076d3 + 4bcad44 commit 42fbed7
Show file tree
Hide file tree
Showing 13 changed files with 366 additions and 75 deletions.
2 changes: 0 additions & 2 deletions files/genome-mapping/find_species.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,4 @@ COPY (
taxid,
division
FROM ensembl_assembly
WHERE
division NOT IN ('EnsemblProtists', 'EnsemblFungi')
) TO STDOUT CSV;
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ python = "^3.11"
PyMySQL = "^1.0.2"
attrs = "^21.4.0"
beautifulsoup4 = "^4.11.0"
biopython = "^1.79"
biopython = "^1.81"
click = "^8.1.3"
click-aliases = "^1.0.1"
furl = "^2.1.3"
Expand Down
5 changes: 3 additions & 2 deletions rnacentral_pipeline/cli/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,19 @@ def cli():


@cli.command("urls-for")
@click.option("--kind", default=None)
@click.argument("division", type=click.Choice(Division.names(), case_sensitive=False))
@click.argument("ftp")
@click.argument("output", default="-", type=click.File("w"))
def vert_url(division, ftp, output):
def vert_url(division, ftp, output, kind=None):
"""
This is a command to generate a CSV file of urls to fetch to get Ensembl
data from. The urls may be globs suitable for fetching with wget.
"""
division = Division.from_name(division)
writer = csv.writer(output, lineterminator="\n")
rows = urls.urls_for(division, ftp)
writer.writerows(row.writeable() for row in rows)
writer.writerows(row.writeable(kind=kind) for row in rows)


@cli.command("parse")
Expand Down
20 changes: 16 additions & 4 deletions rnacentral_pipeline/cli/genome_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import click

from rnacentral_pipeline.rnacentral import attempted
from rnacentral_pipeline.rnacentral.genome_mapping import blat, urls
from rnacentral_pipeline.rnacentral.genome_mapping import blat, urls, igv


@click.group("genome-mapping")
Expand Down Expand Up @@ -76,16 +76,17 @@ def select_hits(hits, output, sort=False):


@cli.command("url-for")
@click.option("--kind", default="fa", type=click.Choice(["fa", "gff3"]))
@click.option("--host", default="ensembl")
@click.argument("species")
@click.argument("assembly_id")
@click.argument("output", default="-", type=click.File("w"))
def find_remote_url(species, assembly_id, output, host=None):
def find_remote_url(species, assembly_id, output, host=None, kind=None):
"""
Determine the remote URL to fetch a the genome for a given species/assembly.
Determine the remote URL to fetch the genome or coordinates for a given species/assembly.
The url is written to the output file and may include '*'.
"""
url = urls.url_for(species, assembly_id, host=host)
url = urls.url_for(species, assembly_id, kind=kind, host=host)
output.write(url)


Expand All @@ -107,3 +108,14 @@ def find_remote_urls(filename, output):
@click.argument("output", type=click.File("w"))
def parse_attempted_sequences(filename, assembly_id, output):
attempted.genome_mapping(filename, assembly_id, output)


@cli.command("igv")
@click.argument("species")
@click.argument("assembly_id")
@click.argument("output", default="-", type=click.File("w"))
def find_url_and_create_json(species, assembly_id, output):
"""
Check which files are available to be used by IGV
"""
igv.create_json(species, assembly_id, output)
32 changes: 17 additions & 15 deletions rnacentral_pipeline/databases/data/regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@

import enum
import operator as op
import typing as ty

import attr
from attr.validators import instance_of as is_a
from attrs import field, frozen


class UnknownStrand(Exception):
Expand Down Expand Up @@ -109,8 +111,8 @@ def __str__(self):
raise ValueError("No name for %s" % self)


@attr.s(frozen=True, hash=True, slots=True)
class CoordinateSystem(object):
@frozen(hash=True, slots=True)
class CoordinateSystem:
"""
This is meant to represent how a database numbers a genome. Some databases
will start counting at zeros and others one, this is called the basis here.
Expand All @@ -122,8 +124,8 @@ class CoordinateSystem(object):
http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/
"""

basis = attr.ib(validator=is_a(CoordinateStart))
close_status = attr.ib(validator=is_a(CloseStatus))
basis: CoordinateStart = field(validator=is_a(CoordinateStart))
close_status: CloseStatus = field(validator=is_a(CloseStatus))

@classmethod
def build(cls, value):
Expand Down Expand Up @@ -210,17 +212,17 @@ def normalize(self, location):
return self.as_one_based(location)


@attr.s(frozen=True, hash=True, slots=True)
class Exon(object):
start = attr.ib(validator=is_a(int))
stop = attr.ib(validator=is_a(int))
@frozen(hash=True, slots=True)
class Exon:
start: int = field(validator=is_a(int))
stop: int = field(validator=is_a(int))

@classmethod
def from_dict(cls, raw):
return cls(start=raw["exon_start"], stop=raw["exon_stop"])

@stop.validator
def greater_than_start(self, attribute, value):
def greater_than_start(self, _attribute, value):
if value < self.start:
raise ValueError("stop (%i) must be >= start (%i)" % (value, self.start))

Expand All @@ -235,13 +237,13 @@ def as_sorted_exons(raw):
return tuple(sorted(exons, key=op.attrgetter("start")))


@attr.s(frozen=True, hash=True, slots=True)
@frozen(hash=True, slots=True)
class SequenceRegion:
assembly_id = attr.ib(validator=is_a(str), converter=str)
chromosome = attr.ib(validator=is_a(str), converter=str)
strand = attr.ib(validator=is_a(Strand), converter=Strand.build)
exons = attr.ib(validator=is_a(tuple), converter=as_sorted_exons)
coordinate_system = attr.ib(
assembly_id: str = field(validator=is_a(str), converter=str)
chromosome: str = field(validator=is_a(str), converter=str)
strand: Strand = field(validator=is_a(Strand), converter=Strand.build)
exons: ty.Tuple[Exon] = field(validator=is_a(tuple), converter=as_sorted_exons)
coordinate_system: CoordinateSystem = field(
validator=is_a(CoordinateSystem),
converter=CoordinateSystem.build,
)
Expand Down
12 changes: 6 additions & 6 deletions rnacentral_pipeline/databases/ensembl/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@

from rnacentral_pipeline.databases.data import SequenceRegion
from rnacentral_pipeline.databases.data.utils import as_so_term
from rnacentral_pipeline.databases.helpers import embl
from rnacentral_pipeline.databases.ensembl import helpers
from rnacentral_pipeline.databases.helpers import embl

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -76,12 +76,12 @@ def from_feature(cls, record, feature) -> TranscriptInfo:

@attr.s()
class FtpInfo:
division = attr.ib(validator=is_a(Division))
species = attr.ib(validator=is_a(str))
data_files = attr.ib(validator=is_a(str))
gff_file = attr.ib(validator=is_a(str))
division: Division = attr.ib(validator=is_a(Division))
species: str = attr.ib(validator=is_a(str))
data_files: str = attr.ib(validator=is_a(str))
gff_file: str = attr.ib(validator=is_a(str))

def writeable(self) -> ty.Tuple[str, str, str, str]:
def writeable(self, kind=None) -> ty.Tuple[str, str, str, str]:
return (self.division.name, self.species, self.data_files, self.gff_file)


Expand Down
12 changes: 0 additions & 12 deletions rnacentral_pipeline/databases/ensembl/genomes/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,18 +71,6 @@ def generate_paths(
gff_path = f"{base}/{release}/gff3/{name}/{organism_name}.gff3.gz"
data_files = f"{base}/{release}/embl/{name}/{organism_name}.*.dat.gz"

# try:
# size = ftp.size(gff_path)
# if size is None:
# LOGGER.warn("GFF file %s is empty, skip %s", gff_path, assembly)
# continue
# except e:
# LOGGER.warn(
# "Could not get data for %s, skipping %s", gff_path, assembly
# )
# print(e)
# continue

yield FtpInfo(
division=division,
species=name,
Expand Down
18 changes: 9 additions & 9 deletions rnacentral_pipeline/rnacentral/genome_mapping/blat.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
import logging
import typing as ty

import attr
from attr.validators import instance_of as is_a
from attrs import frozen


from rnacentral_pipeline import utils
from rnacentral_pipeline.databases.data.regions import Exon
Expand Down Expand Up @@ -56,13 +56,13 @@
]


@attr.s(frozen=True)
class BlatHit(object):
upi = attr.ib(validator=is_a(str), converter=str)
sequence_length = attr.ib(validator=is_a(int))
matches = attr.ib(validator=is_a(int))
target_insertions = attr.ib(validator=is_a(int))
region = attr.ib(validator=is_a(SequenceRegion))
@frozen
class BlatHit:
upi: str
sequence_length: int
matches: int
target_insertions: int
region: SequenceRegion

@classmethod
def build(cls, assembly_id: str, raw: ty.Dict[str, ty.Any]) -> BlatHit:
Expand Down
84 changes: 84 additions & 0 deletions rnacentral_pipeline/rnacentral/genome_mapping/igv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-

"""
Copyright [2009-present] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import json
import logging
from contextlib import contextmanager
from ftplib import FTP
from typing import Dict, List, IO

LOGGER = logging.getLogger(__name__)
FTP_SERVER = "ftp.ebi.ac.uk"


class NoFastaFile(Exception):
"""
Could not find any file to be used by IGV
"""


@contextmanager
def ftp(host):
conn = FTP(host)
conn.login()

yield conn

try:
conn.quit()
except Exception as err:
LOGGER.info("Failed to close FTP connection")
LOGGER.exception(err)


def check_url(file_list: List[str], name: str, path: str, assembly_id: str) -> Dict:
result = {}
fasta_file = [file for file in file_list if file == name + ".fa"]
fasta_index = [file for file in file_list if file == name + ".fa.fai"]
ensembl = [file for file in file_list if file == name + ".ensembl.gff3.gz"]
ensembl_index = [file for file in file_list if
file == name + ".ensembl.gff3.gz.tbi" or file == name + ".ensembl.gff3.gz.csi"]
rnacentral = [file for file in file_list if file == name + ".rnacentral.gff3.gz"]
rnacentral_index = [file for file in file_list if
file == name + ".rnacentral.gff3.gz.tbi" or file == name + ".rnacentral.gff3.gz.csi"]

if (ensembl or rnacentral) and fasta_file and fasta_index:
result["name"] = assembly_id
result["fastaFile"] = "https://" + FTP_SERVER + "/" + path + "/" + fasta_file[0]
result["fastaIndex"] = "https://" + FTP_SERVER + "/" + path + "/" + fasta_index[0]

if ensembl and ensembl_index:
result["ensemblTrack"] = "https://" + FTP_SERVER + "/" + path + "/" + ensembl[0]
result["ensemblIndex"] = "https://" + FTP_SERVER + "/" + path + "/" + ensembl_index[0]

if rnacentral and rnacentral_index:
result["rnacentralTrack"] = "https://" + FTP_SERVER + "/" + path + "/" + rnacentral[0]
result["rnacentralIndex"] = "https://" + FTP_SERVER + "/" + path + "/" + rnacentral_index[0]

return result


def create_json(species: str, assembly_id: str, output: IO[str]) -> None:
with ftp(FTP_SERVER) as conn:
path = "pub/databases/RNAcentral/.genome-browser"
conn.cwd(path)
file_list = conn.nlst()
name = f"{species}.{assembly_id}"
result = check_url(file_list, name, path, assembly_id)

if result:
json.dump(result, output)
else:
raise NoFastaFile(f"{species}.{assembly_id}")
Loading

0 comments on commit 42fbed7

Please sign in to comment.