RNAcentral · blakesweeney · Oct 25, 2022 · Oct 25, 2022 · Oct 25, 2022 · Oct 25, 2022
diff --git a/files/r2dt/attempted.ctl b/files/r2dt/attempted.ctl
@@ -19,23 +19,27 @@ DROP TABLE IF EXISTS load_traveler_attempted;
 $$,
 $$
 CREATE TABLE load_traveler_attempted (
-  urs text primary key
+  urs text primary key,
+  r2dt_version text -- version string loaded alongside each attempted urs
 );
 $$
 
 AFTER LOAD DO
 $$
 INSERT INTO pipeline_tracking_traveler (
   urs,
-  last_run
+  last_run,
+  r2dt_version
 ) (
 SELECT
   load.urs,
-  NOW()
+  NOW(),
+  load.r2dt_version
 FROM load_traveler_attempted load
 ) ON CONFLICT (urs) DO UPDATE
-SET 
-  last_run = EXCLUDED.last_run
+SET
+  last_run = EXCLUDED.last_run,
+  r2dt_version = EXCLUDED.r2dt_version -- a re-run overwrites the stored version
 ;
 $$
 ;
diff --git a/files/r2dt/load.ctl b/files/r2dt/load.ctl
@@ -11,7 +11,8 @@ HAVING FIELDS (
     sequence_start,
     sequence_stop,
     sequence_coverage,
-    inferred_should_show
+    inferred_should_show,
+    r2dt_version
 ) INTO {{PGDATABASE}}?load_secondary
 TARGET COLUMNS (
     urs,
@@ -24,7 +25,8 @@ TARGET COLUMNS (
     sequence_start,
     sequence_stop,
     sequence_coverage,
-    inferred_should_show
+    inferred_should_show,
+    r2dt_version
 )
 
 WITH
@@ -49,7 +51,8 @@ create table load_secondary (
     sequence_start int,
     sequence_stop int,
     sequence_coverage float,
-    inferred_should_show bool
+    inferred_should_show bool,
+    r2dt_version text
 );
 $$
 
@@ -66,7 +69,8 @@ INSERT INTO rnc_secondary_structure_layout (
     sequence_start,
     sequence_stop,
     sequence_coverage,
-    inferred_should_show
+    inferred_should_show,
+    r2dt_version
 ) (
 SELECT
     urs,
@@ -79,7 +83,8 @@ SELECT
     sequence_start,
     sequence_stop,
     sequence_coverage,
-    inferred_should_show
+    inferred_should_show,
+    r2dt_version
 FROM load_secondary
 ) ON CONFLICT (urs) DO UPDATE
 SET
@@ -93,6 +98,7 @@ SET
     sequence_stop = EXCLUDED.sequence_stop,
     sequence_coverage = EXCLUDED.sequence_coverage,
-    inferred_should_show = EXCLUDED.inferred_should_show
+    inferred_should_show = EXCLUDED.inferred_should_show,
+    r2dt_version = EXCLUDED.r2dt_version -- take the version from the incoming row
 ;
 $$,
 $$

diff --git a/rnacentral_pipeline/cli/r2dt.py b/rnacentral_pipeline/cli/r2dt.py
@@ -33,13 +33,17 @@ def cli():
 @click.option("--allow-missing", is_flag=True, default=False)
 @click.argument("model_info", type=click.File("r"))
 @click.argument("directory", type=click.Path())
+@click.argument("version", type=click.File("r"))
 @click.argument("output", type=click.File("w"))
-def process_svgs(model_info, directory, output, allow_missing=False):
+def process_svgs(model_info, directory, version, output, allow_missing=False):
     """
     Process all SVG secondary structures in the given directory and produce a
     single data file that can be imported into the database.
     """
-    r2dt.write(model_info, directory, output, allow_missing=allow_missing)
+    version_string = version.read().strip()
+    r2dt.write(
+        model_info, directory, version_string, output, allow_missing=allow_missing
+    )
 
 
 @cli.group("should-show")
@@ -192,9 +196,11 @@ def rnase_p_model_info(filename, output):
 
 @cli.command("create-attempted")
 @click.argument("filename", type=click.File("r"))
+@click.argument("version", type=click.File("r"))
 @click.argument("output", default="-", type=click.File("w"))
-def r2dt_create_attempted(filename, output):
-    attempted.r2dt(filename, output)
+def r2dt_create_attempted(filename, version, output):
+    version_string = version.read().strip()
+    attempted.r2dt(filename, version_string, output)
 
 
 @cli.command("publish")

diff --git a/rnacentral_pipeline/rnacentral/attempted.py b/rnacentral_pipeline/rnacentral/attempted.py
@@ -62,10 +62,14 @@ def parse_rfam_version(handle: ty.IO) -> str:
     raise ValueError(f"Could not find version in file {handle}")
 
 
-def write(data: ty.Iterable[ty.List[str]], output: ty.IO, require_attempt=True):
+def write(
+    data: ty.Iterable[ty.List[str]], output: ty.IO, require_attempt=True, version=None
+):
     writer = csv.writer(output)
     seen = False
     for row in data:
+        if version:
+            row.append(version)
         writer.writerow(row)
         seen = True
     if not seen:
@@ -88,6 +92,6 @@ def qa(handle: ty.IO, name: str, version_file: ty.IO, output: ty.IO):
     write(data, output)
 
 
-def r2dt(handle: ty.IO, output: ty.IO):
+def r2dt(handle: ty.IO, version: str, output: ty.IO):
     data = fasta_parser(handle)
-    write(data, output)
+    write(data, output, version=version)
diff --git a/rnacentral_pipeline/rnacentral/r2dt/__init__.py b/rnacentral_pipeline/rnacentral/r2dt/__init__.py
@@ -21,29 +21,34 @@
 
 import joblib
 
-from rnacentral_pipeline.rnacentral.r2dt import parser
-from rnacentral_pipeline.rnacentral.r2dt import should_show
-from rnacentral_pipeline.rnacentral.r2dt.models import crw
-from rnacentral_pipeline.rnacentral.r2dt.models import gtrnadb
-from rnacentral_pipeline.rnacentral.r2dt.models import ribovision
-from rnacentral_pipeline.rnacentral.r2dt.models import rnase_p
-from rnacentral_pipeline.rnacentral.r2dt.models import rfam
+from rnacentral_pipeline.rnacentral.r2dt import parser, should_show
+from rnacentral_pipeline.rnacentral.r2dt.models import (
+    crw,
+    gtrnadb,
+    rfam,
+    ribovision,
+    rnase_p,
+)
 
 
-def parse(model_mapping: ty.TextIO, directory: str, allow_missing=False):
+def parse(model_mapping: ty.TextIO, directory: str, version: str, allow_missing=False):
     path = Path(directory)
-    return parser.parse(model_mapping, path, allow_missing=allow_missing)
+    return parser.parse(model_mapping, path, version, allow_missing=allow_missing)
 
 
 def write(
-    model_mapping: ty.TextIO, directory: str, output: ty.TextIO, allow_missing=False
+    model_mapping: ty.TextIO,
+    directory: str,
+    version: str,
+    output: ty.TextIO,
+    allow_missing=False,
 ):
     """
     Parse all the secondary structure data from the given directory and write
     it to the given file.
     """
 
-    parsed = parse(model_mapping, directory, allow_missing=allow_missing)
+    parsed = parse(model_mapping, directory, version, allow_missing=allow_missing)
     writeable = (e.writeable() for e in parsed)
     csv.writer(output).writerows(writeable)
 

diff --git a/rnacentral_pipeline/rnacentral/r2dt/data.py b/rnacentral_pipeline/rnacentral/r2dt/data.py
@@ -20,11 +20,10 @@
 import typing as ty
 from pathlib import Path
 
-from Bio import SeqIO
-
 import attr
 from attr.validators import instance_of as is_a
 from attr.validators import optional
+from Bio import SeqIO
 
 from rnacentral_pipeline.databases.data import RibovoreResult
 
@@ -141,6 +140,7 @@ class R2DTResultInfo(object):
     db_info = attr.ib(validator=is_a(ModelDatabaseInfo))
     source = attr.ib(validator=is_a(Source))
     path = attr.ib(validator=is_a(Path))
+    version = attr.ib(validator=is_a(str))
 
     @property
     def model_name(self):
@@ -269,6 +269,10 @@ def from_info(cls, info: R2DTResultInfo, hit_info=None):
     def urs(self):
         return self.info.urs
 
+    @property
+    def r2dt_version(self):
+        return self.info.version
+
     @property
     def model_id(self):
         return self.info.model_db_id
@@ -341,6 +345,7 @@ def writeable(self):
             sequence_stop,
             sequence_coverage,
             True,
+            self.r2dt_version,
         ]
 
 

diff --git a/rnacentral_pipeline/rnacentral/r2dt/parser.py b/rnacentral_pipeline/rnacentral/r2dt/parser.py
@@ -60,7 +60,7 @@ def load_hit_info(base: Path, allow_missing: bool):
 
 
 def parse(
-    info_path: ty.TextIO, base: Path, allow_missing=False
+    info_path: ty.TextIO, base: Path, version: str, allow_missing=False
 ) -> ty.Iterator[data.R2DTResult]:
 
     if not base.exists():
@@ -82,7 +82,7 @@ def parse(
                 raise ValueError("No info for model %s", model_name)
 
             minfo = model_info[model_name]
-            info = data.R2DTResultInfo(urs, minfo, source, result_base)
+            info = data.R2DTResultInfo(urs, minfo, source, result_base, version)
             if info in seen:
                 LOGGER.warn("Dupcliate line in metadata for, %s", info)
                 continue

diff --git a/workflows/r2dt.nf b/workflows/r2dt.nf
@@ -59,17 +59,18 @@ process layout_sequences {
   memory params.r2dt.layout.memory
   container params.r2dt.container
   containerOptions "--bind ${params.r2dt.cms_path}:/rna/r2dt/data/cms"
-  errorStrategy { task.exitStatus = 130 ? 'ignore' : 'terminate' }
+  errorStrategy { task.exitStatus == 130 ? 'ignore' : 'finish' }  // compare the exit code; '=' assigned and always yielded 'ignore'
 
   input:
   path(sequences)
 
   output:
-  tuple path("$sequences"), path('output')
+  tuple path("$sequences"), path('output'), path('version')
 
   """
   esl-sfetch --index $sequences
   r2dt.py draw $sequences output/
+  r2dt.py version | perl -ne 'm/(\\d+\\.\\d+)/ && print "\$1\\n"' > version
   """
 }
 
@@ -94,7 +95,7 @@ process publish_layout {
 
 process parse_layout {
   input:
-  tuple path(sequences), path(to_parse), path(mapping)
+  tuple path(sequences), path(to_parse), path(version), path(mapping)
   errorStrategy "ignore"
 
   output: