Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

R2dt rescan #153

Draft
wants to merge 33 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
d0ce5e9
Changes to extract and propagate r2dt version info for hits
afg1 Oct 25, 2022
2fd83ff
Version information for attempted
afg1 Oct 25, 2022
82e0683
Tweak workflow to extract and propagate version info
afg1 Oct 25, 2022
3d7c140
Remove version from r2dt hit info. It will only be stored in the atte…
afg1 Oct 25, 2022
d044b85
Add cell location as an optional parameter
afg1 Oct 26, 2022
7dd45e8
Rfam cm parser to rnac database entry
afg1 Oct 26, 2022
684031b
Fix gtrnadb parser for up to date model spec
afg1 Oct 26, 2022
12a0ef3
CM scanner for CRW
afg1 Oct 26, 2022
205efcd
Working parsers with correct forwarding of db_url
afg1 Oct 26, 2022
54c894e
Use crw metadata file rather than database query and tidy up unused s…
afg1 Oct 26, 2022
09a8289
Fix RNAse-p parser
afg1 Oct 26, 2022
e51c9e1
Fix ribovision parser and add lookup for more taxa
afg1 Oct 26, 2022
a29ed6a
Add metadata url to crw parser
afg1 Oct 26, 2022
7883e8a
Fix bad escapes in r2dt version extraction
afg1 Oct 26, 2022
98537a5
Workflow to scan the database with r2dt
afg1 Oct 26, 2022
5e362e1
Update models on conflict
afg1 Oct 26, 2022
ffa4026
Switch pipeline back to use metadata
afg1 Oct 26, 2022
13ff828
Fix sed command
afg1 Oct 26, 2022
e3485ba
Fix cms paths
afg1 Oct 26, 2022
9443e2e
Fix cms path
afg1 Oct 27, 2022
cf91e65
Add SO term to name lookup and reorganise writing
afg1 Oct 27, 2022
e8aa886
Fix filenames and field names in ctl
afg1 Oct 27, 2022
10ee204
Add extra parsing to handle LSU and SSU for mt_rRNA
afg1 Oct 27, 2022
30862c9
Update SO-rnaType name lookup with Rfam types
afg1 Oct 28, 2022
9faf6c0
Adds another missing type to the mapping
afg1 Oct 28, 2022
70ec167
Check for .16. in model name properly
afg1 Oct 28, 2022
f06f883
Add sequence limit for r2dt processing
afg1 Nov 11, 2022
6309e33
Add some missing SO -> name lookups
afg1 Nov 11, 2022
7cfffea
Fix model loading ctl to be consistent with table
afg1 Nov 11, 2022
5df46d3
Pass version correctly to create-attempted
afg1 Nov 11, 2022
34e31c5
Ignore version in input to publish layout
afg1 Nov 11, 2022
8045287
Black reformatting
afg1 Nov 14, 2022
b7cd92b
Add version information to attempted ctl
afg1 Nov 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions files/r2dt/attempted.ctl
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,27 @@ DROP TABLE IF EXISTS load_traveler_attempted;
$$,
$$
CREATE TABLE load_traveler_attempted (
urs text primary key
urs text primary key,
r2dt_version text
);
$$

AFTER LOAD DO
$$
INSERT INTO pipeline_tracking_traveler (
urs,
last_run
last_run,
r2dt_version
) (
SELECT
load.urs,
NOW()
NOW(),
load.r2dt_version
FROM load_traveler_attempted load
) ON CONFLICT (urs) DO UPDATE
SET
SET
last_run = EXCLUDED.last_run,
r2dt_version = EXCLUDED.r2dt_version
;
$$
;
16 changes: 11 additions & 5 deletions files/r2dt/load.ctl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ HAVING FIELDS (
sequence_start,
sequence_stop,
sequence_coverage,
inferred_should_show
inferred_should_show,
r2dt_version
) INTO {{PGDATABASE}}?load_secondary
TARGET COLUMNS (
urs,
Expand All @@ -24,7 +25,8 @@ TARGET COLUMNS (
sequence_start,
sequence_stop,
sequence_coverage,
inferred_should_show
inferred_should_show,
r2dt_version
)

WITH
Expand All @@ -49,7 +51,8 @@ create table load_secondary (
sequence_start int,
sequence_stop int,
sequence_coverage float,
inferred_should_show bool
inferred_should_show bool,
r2dt_version text
);
$$

Expand All @@ -66,7 +69,8 @@ INSERT INTO rnc_secondary_structure_layout (
sequence_start,
sequence_stop,
sequence_coverage,
inferred_should_show
inferred_should_show,
r2dt_version
) (
SELECT
urs,
Expand All @@ -79,7 +83,8 @@ SELECT
sequence_start,
sequence_stop,
sequence_coverage,
inferred_should_show
inferred_should_show,
r2dt_version
FROM load_secondary
) ON CONFLICT (urs) DO UPDATE
SET
Expand All @@ -93,6 +98,7 @@ SET
sequence_stop = EXCLUDED.sequence_stop,
sequence_coverage = EXCLUDED.sequence_coverage,
inferred_should_show = EXCLUDED.inferred_should_show,
r2dt_version = EXCLUDED.r2dt_version
;
$$,
$$
Expand Down
14 changes: 10 additions & 4 deletions rnacentral_pipeline/cli/r2dt.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,17 @@ def cli():
@click.option("--allow-missing", is_flag=True, default=False)
@click.argument("model_info", type=click.File("r"))
@click.argument("directory", type=click.Path())
@click.argument("version", type=click.File("r"))
@click.argument("output", type=click.File("w"))
def process_svgs(model_info, directory, output, allow_missing=False):
def process_svgs(model_info, directory, version, output, allow_missing=False):
"""
Process all SVG secondary structures in the given directory and produce a
single data file that can be imported into the database.
"""
r2dt.write(model_info, directory, output, allow_missing=allow_missing)
version_string = version.read().strip()
r2dt.write(
model_info, directory, version_string, output, allow_missing=allow_missing
)


@cli.group("should-show")
Expand Down Expand Up @@ -192,9 +196,11 @@ def rnase_p_model_info(filename, output):

@cli.command("create-attempted")
@click.argument("filename", type=click.File("r"))
@click.argument("version", type=click.File("r"))
@click.argument("output", default="-", type=click.File("w"))
def r2dt_create_attempted(filename, output):
attempted.r2dt(filename, output)
def r2dt_create_attempted(filename, version, output):
version_string = version.read().strip()
attempted.r2dt(filename, version_string, output)


@cli.command("publish")
Expand Down
10 changes: 7 additions & 3 deletions rnacentral_pipeline/rnacentral/attempted.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,14 @@ def parse_rfam_version(handle: ty.IO) -> str:
raise ValueError(f"Could not find version in file {handle}")


def write(data: ty.Iterable[ty.List[str]], output: ty.IO, require_attempt=True):
def write(
data: ty.Iterable[ty.List[str]], output: ty.IO, require_attempt=True, version=None
):
writer = csv.writer(output)
seen = False
for row in data:
if version:
row.append(version)
writer.writerow(row)
seen = True
if not seen:
Expand All @@ -88,6 +92,6 @@ def qa(handle: ty.IO, name: str, version_file: ty.IO, output: ty.IO):
write(data, output)


def r2dt(handle: ty.IO, output: ty.IO):
def r2dt(handle: ty.IO, version: str, output: ty.IO):
data = fasta_parser(handle)
write(data, output)
write(data, output, version=version)
27 changes: 16 additions & 11 deletions rnacentral_pipeline/rnacentral/r2dt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,29 +21,34 @@

import joblib

from rnacentral_pipeline.rnacentral.r2dt import parser
from rnacentral_pipeline.rnacentral.r2dt import should_show
from rnacentral_pipeline.rnacentral.r2dt.models import crw
from rnacentral_pipeline.rnacentral.r2dt.models import gtrnadb
from rnacentral_pipeline.rnacentral.r2dt.models import ribovision
from rnacentral_pipeline.rnacentral.r2dt.models import rnase_p
from rnacentral_pipeline.rnacentral.r2dt.models import rfam
from rnacentral_pipeline.rnacentral.r2dt import parser, should_show
from rnacentral_pipeline.rnacentral.r2dt.models import (
crw,
gtrnadb,
rfam,
ribovision,
rnase_p,
)


def parse(model_mapping: ty.TextIO, directory: str, allow_missing=False):
def parse(model_mapping: ty.TextIO, directory: str, version: str, allow_missing=False):
path = Path(directory)
return parser.parse(model_mapping, path, allow_missing=allow_missing)
return parser.parse(model_mapping, path, version, allow_missing=allow_missing)


def write(
model_mapping: ty.TextIO, directory: str, output: ty.TextIO, allow_missing=False
model_mapping: ty.TextIO,
directory: str,
version: str,
output: ty.TextIO,
allow_missing=False,
):
"""
Parse all the secondary structure data from the given directory and write
it to the given file.
"""

parsed = parse(model_mapping, directory, allow_missing=allow_missing)
parsed = parse(model_mapping, directory, version, allow_missing=allow_missing)
writeable = (e.writeable() for e in parsed)
csv.writer(output).writerows(writeable)

Expand Down
9 changes: 7 additions & 2 deletions rnacentral_pipeline/rnacentral/r2dt/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,10 @@
import typing as ty
from pathlib import Path

from Bio import SeqIO

import attr
from attr.validators import instance_of as is_a
from attr.validators import optional
from Bio import SeqIO

from rnacentral_pipeline.databases.data import RibovoreResult

Expand Down Expand Up @@ -141,6 +140,7 @@ class R2DTResultInfo(object):
db_info = attr.ib(validator=is_a(ModelDatabaseInfo))
source = attr.ib(validator=is_a(Source))
path = attr.ib(validator=is_a(Path))
version = attr.ib(validator=is_a(str))

@property
def model_name(self):
Expand Down Expand Up @@ -269,6 +269,10 @@ def from_info(cls, info: R2DTResultInfo, hit_info=None):
def urs(self):
return self.info.urs

@property
def r2dt_version(self):
return self.info.version

@property
def model_id(self):
return self.info.model_db_id
Expand Down Expand Up @@ -341,6 +345,7 @@ def writeable(self):
sequence_stop,
sequence_coverage,
True,
self.r2dt_version,
]


Expand Down
4 changes: 2 additions & 2 deletions rnacentral_pipeline/rnacentral/r2dt/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def load_hit_info(base: Path, allow_missing: bool):


def parse(
info_path: ty.TextIO, base: Path, allow_missing=False
info_path: ty.TextIO, base: Path, version: str, allow_missing=False
) -> ty.Iterator[data.R2DTResult]:

if not base.exists():
Expand All @@ -82,7 +82,7 @@ def parse(
raise ValueError("No info for model %s" % model_name)

minfo = model_info[model_name]
info = data.R2DTResultInfo(urs, minfo, source, result_base)
info = data.R2DTResultInfo(urs, minfo, source, result_base, version)
if info in seen:
LOGGER.warning("Duplicate line in metadata for %s", info)
continue
Expand Down
7 changes: 4 additions & 3 deletions workflows/r2dt.nf
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,18 @@ process layout_sequences {
memory params.r2dt.layout.memory
container params.r2dt.container
containerOptions "--bind ${params.r2dt.cms_path}:/rna/r2dt/data/cms"
errorStrategy { task.exitStatus = 130 ? 'ignore' : 'terminate' }
errorStrategy { task.exitStatus == 130 ? 'ignore' : 'finish' }

input:
path(sequences)

output:
tuple path("$sequences"), path('output')
tuple path("$sequences"), path('output'), path('version')

"""
esl-sfetch --index $sequences
r2dt.py draw $sequences output/
r2dt.py version | perl -ne 'm/(\\d+\\.\\d+)/ && print "\$1\\n"' > version
"""
}

Expand All @@ -94,7 +95,7 @@ process publish_layout {

process parse_layout {
input:
tuple path(sequences), path(to_parse), path(mapping)
tuple path(sequences), path(to_parse), path(version), path(mapping)
errorStrategy "ignore"

output:
Expand Down