Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat!: Modify annotations and schemas #76

Merged
merged 7 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 11 additions & 28 deletions schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -370,11 +370,7 @@
"title": "Version"
},
"code": {
"allOf": [
{
"$ref": "#/$defs/Code"
}
],
"$ref": "#/$defs/Code",
"description": "A symbol uniquely identifying the concept, as in a syntax defined by the code system. CURIE format is preferred where possible (e.g. 'SO:0000704' is the CURIE form of the Sequence Ontology code for 'gene')."
}
},
Expand Down Expand Up @@ -412,19 +408,11 @@
"description": "A mapping to a concept in a terminology or code system.",
"properties": {
"coding": {
"allOf": [
{
"$ref": "#/$defs/Coding"
}
],
"$ref": "#/$defs/Coding",
"description": "A structured representation of a code for a defined concept in a terminology or code system."
},
"relation": {
"allOf": [
{
"$ref": "#/$defs/Relation"
}
],
"$ref": "#/$defs/Relation",
"description": "A mapping relation between concepts as defined by the Simple Knowledge Organization System (SKOS)."
}
},
Expand All @@ -439,11 +427,7 @@
"description": "Representation of a variation by a specified nomenclature or syntax for a\nVariation object. Common examples of expressions for the description of molecular\nvariation include the HGVS and ISCN nomenclatures.",
"properties": {
"syntax": {
"allOf": [
{
"$ref": "#/$defs/Syntax"
}
],
"$ref": "#/$defs/Syntax",
"description": "The syntax used to describe the variation. The value should be one of the supported syntaxes."
},
"value": {
Expand Down Expand Up @@ -751,11 +735,7 @@
"title": "Mappings"
},
"sequence": {
"allOf": [
{
"$ref": "#/$defs/SequenceString"
}
],
"$ref": "#/$defs/SequenceString",
"description": "the literal sequence"
}
},
Expand Down Expand Up @@ -972,21 +952,24 @@
"pre_mapped": {
"anyOf": [
{
"$ref": "#/$defs/Allele"
"$ref": "#/$defs/CisPhasedBlock"
},
{
"$ref": "#/$defs/CisPhasedBlock"
"$ref": "#/$defs/Allele"
}
],
"title": "Pre Mapped"
},
"post_mapped": {
"anyOf": [
{
"$ref": "#/$defs/CisPhasedBlock"
},
{
"$ref": "#/$defs/Allele"
},
{
"$ref": "#/$defs/CisPhasedBlock"
"type": "null"
}
],
"title": "Post Mapped"
Expand Down
9 changes: 8 additions & 1 deletion src/dcd_mapping/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,16 @@ def _get_blat_output(metadata: ScoresetMetadata, silent: bool) -> QueryResult:
"""
with tempfile.NamedTemporaryFile() as query_file:
query_file = _build_query_file(metadata, Path(query_file.name))
if len(metadata.target_sequence) > 25000:
msg = f"Target sequence for {metadata.urn} must have a length <= 25000 to run BLAT"
raise AlignmentError(msg)

if metadata.target_sequence_type == TargetSequenceType.PROTEIN:
target_args = "-q=prot -t=dnax"
elif metadata.target_gene_category == TargetType.PROTEIN_CODING:
elif (
metadata.target_gene_category == TargetType.PROTEIN_CODING
and len(metadata.target_sequence) <= 10000
):
target_args = "-q=dnax -t=dnax"
else:
target_args = ""
Expand Down
97 changes: 78 additions & 19 deletions src/dcd_mapping/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
CisPhasedBlock,
Expression,
LiteralSequenceExpression,
SequenceString,
)

from dcd_mapping.lookup import (
Expand Down Expand Up @@ -53,6 +54,27 @@ def _get_vrs_1_3_ext(allele: Allele) -> Extension:
)


def _get_va_digest(allele: Allele) -> Extension:
"""Return the VA digest for a pre-mapped allele
:param allele: A pre-mapped variant
:return A VRS extension reporting the pre-mapped digest
"""
return Extension(name="pre_mapped_id", value=allele.id)


def _is_valid_allele(allele: Allele, align_result: AlignmentResult) -> bool:
"""Check if a post-mapped allele occurs within the alignment coverage
:param allele: A post-mapped allele
:param align_result: Alignment data
:return True if position occurs in coverage, False if not
"""
return (
align_result.query_range.start
<= allele.location.start
<= align_result.query_range.end
)


def _offset_allele_ref_seq(ss: str, start: int, end: int) -> tuple[int, int]:
"""Handle known edge cases in start and end coordinates for vrs_ref_allele_seq."""
if ss.startswith("urn:mavedb:00000060-a-1"):
Expand Down Expand Up @@ -94,7 +116,7 @@ def _get_vrs_ref_allele_seq(
ref = sr.get_sequence(seq, start, end)
if ref is None:
raise ValueError
return Extension(name="vrs_ref_allele_seq", value=ref)
return SequenceString(root=ref)


def _get_hgvs_string(allele: Allele, accession: str) -> tuple[str, Syntax]:
Expand Down Expand Up @@ -179,16 +201,19 @@ def _annotate_allele_mapping(
mapped_score: MappedScore,
tx_results: TxSelectResult | None,
metadata: ScoresetMetadata,
align_result: AlignmentResult,
) -> ScoreAnnotationWithLayer:
"""Perform annotations for allele mappings."""
pre_mapped: Allele = mapped_score.pre_mapped
post_mapped: Allele = mapped_score.post_mapped

# get vrs_ref_allele_seq for pre-mapped variants
pre_mapped.extensions = [
_get_vrs_ref_allele_seq(pre_mapped, metadata, tx_results),
_get_vrs_1_3_ext(pre_mapped),
]
pre_mapped.location.sequence = _get_vrs_ref_allele_seq(
pre_mapped, metadata, tx_results
)

# Determine reference sequence
if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
Expand All @@ -206,17 +231,29 @@ def _annotate_allele_mapping(
sr = get_seqrepo()
loc = mapped_score.post_mapped.location
sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}"
ref = sr.get_sequence(sequence_id, loc.start, loc.end)
post_mapped.location.sequence = SequenceString(
root=sr.get_sequence(sequence_id, loc.start, loc.end)
)
post_mapped.extensions = [
Extension(name="vrs_ref_allele_seq", value=ref),
_get_vrs_1_3_ext(post_mapped),
_get_va_digest(pre_mapped),
]
hgvs_string, syntax = _get_hgvs_string(post_mapped, accession)
post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)]

namespace = metadata.urn
val = mapped_score.accession_id.split("#")[1]

# Check if post-mapped allele is valid
if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
post_mapped = (
post_mapped if _is_valid_allele(pre_mapped, align_result) else None
)

# Remove extra digest attributes
pre_mapped.digest = None
post_mapped.digest = None

return ScoreAnnotationWithLayer(
pre_mapped=pre_mapped,
post_mapped=post_mapped,
Expand All @@ -240,17 +277,21 @@ def _get_vrs_1_3_haplotype_id(cpb: CisPhasedBlock) -> str:


def _annotate_cpb_mapping(
mapping: MappedScore, tx_results: TxSelectResult | None, metadata: ScoresetMetadata
mapping: MappedScore,
tx_results: TxSelectResult | None,
metadata: ScoresetMetadata,
align_result: AlignmentResult,
) -> ScoreAnnotationWithLayer:
"""Perform annotations and create VRS 1.3 equivalents for CisPhasedBlock mappings."""
pre_mapped: CisPhasedBlock = mapping.pre_mapped # type: ignore
post_mapped: CisPhasedBlock = mapping.post_mapped # type: ignore
# get vrs_ref_allele_seq for pre-mapped variants
for allele in pre_mapped.members:
allele.extensions = [
_get_vrs_ref_allele_seq(allele, metadata, tx_results),
_get_vrs_1_3_ext(allele),
]
allele.location.sequence = _get_vrs_ref_allele_seq(allele, metadata, tx_results)
allele.digest = None
# Determine reference sequence
if mapping.annotation_layer == AnnotationLayer.GENOMIC:
sequence_id = (
Expand All @@ -267,23 +308,37 @@ def _annotate_cpb_mapping(
accession = tx_results.np

sr = get_seqrepo()
for allele in post_mapped.members:
loc = allele.location
valid_post_mapped_alleles = []
for post_mapped_allele, pre_mapped_allele in zip(
post_mapped.members, pre_mapped.members, strict=True
):
loc = post_mapped_allele.location
sequence_id = f"ga4gh:{loc.sequenceReference.refgetAccession}"
ref = sr.get_sequence(sequence_id, loc.start, loc.end)
allele.extensions = [
Extension(name="vrs_ref_allele_seq", value=ref),
_get_vrs_1_3_ext(allele),
post_mapped_allele.location.sequence = SequenceString(
root=sr.get_sequence(sequence_id, loc.start, loc.end)
)
post_mapped_allele.extensions = [
_get_vrs_1_3_ext(post_mapped_allele),
_get_va_digest(pre_mapped_allele),
]
hgvs, syntax = _get_hgvs_string(allele, accession)
allele.expressions = [Expression(syntax=syntax, value=hgvs)]
hgvs, syntax = _get_hgvs_string(post_mapped_allele, accession)
post_mapped_allele.expressions = [Expression(syntax=syntax, value=hgvs)]
if mapping.annotation_layer == AnnotationLayer.PROTEIN or _is_valid_allele(
pre_mapped_allele, align_result
):
valid_post_mapped_alleles.append(post_mapped_allele)
post_mapped_allele.digest = None
post_mapped.members = valid_post_mapped_alleles

pre_mapped.extensions = [
Extension(name="vrs_v1.3_id", value=_get_vrs_1_3_haplotype_id(pre_mapped))
]
post_mapped.extensions = [
Extension(name="vrs_v1.3_id", value=_get_vrs_1_3_haplotype_id(post_mapped))
]
if len(post_mapped.members) >= 2:
post_mapped.extensions = [
Extension(name="vrs_v1.3_id", value=_get_vrs_1_3_haplotype_id(post_mapped)),
]
else:
post_mapped = post_mapped.members[0]

namespace = metadata.urn
val = mapping.accession_id.split("#")[1]
Expand All @@ -301,6 +356,7 @@ def annotate(
mapped_scores: list[MappedScore],
tx_results: TxSelectResult | None,
metadata: ScoresetMetadata,
align_result: AlignmentResult,
) -> list[ScoreAnnotationWithLayer]:
"""Given a list of mappings, add additional contextual data:

Expand All @@ -316,6 +372,7 @@ def annotate(
:param vrs_results: in-progress variant mappings
:param tx_select_results: transcript selection if available
:param metadata: MaveDB scoreset metadata
:param align_result: Alignment data
:return: annotated mappings objects
"""
score_annotations = []
Expand All @@ -324,13 +381,15 @@ def annotate(
mapped_score.post_mapped, CisPhasedBlock
):
score_annotations.append(
_annotate_cpb_mapping(mapped_score, tx_results, metadata)
_annotate_cpb_mapping(mapped_score, tx_results, metadata, align_result)
)
elif isinstance(mapped_score.pre_mapped, Allele) and isinstance(
mapped_score.post_mapped, Allele
):
score_annotations.append(
_annotate_allele_mapping(mapped_score, tx_results, metadata)
_annotate_allele_mapping(
mapped_score, tx_results, metadata, align_result
)
)
else:
ValueError("inconsistent variant structure")
Expand Down
2 changes: 1 addition & 1 deletion src/dcd_mapping/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ async def map_scoreset(
_emit_info("VRS mapping complete.", silent)

_emit_info("Annotating metadata and saving to file...", silent)
vrs_results = annotate(vrs_results, transcript, metadata)
vrs_results = annotate(vrs_results, transcript, metadata, alignment_result)
final_output = save_mapped_output_json(
metadata.urn,
vrs_results,
Expand Down
10 changes: 5 additions & 5 deletions src/dcd_mapping/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ class TargetSequenceType(str, Enum):
class TargetType(str, Enum):
"""Define target gene types."""

PROTEIN_CODING = "Protein coding"
REGULATORY = "Regulatory"
OTHER_NC = "Other noncoding"
PROTEIN_CODING = "protein_coding"
REGULATORY = "regulatory"
OTHER_NC = "other_noncoding"


class UniProtRef(BaseModel):
Expand Down Expand Up @@ -145,7 +145,7 @@ class MappedScore(BaseModel):
annotation_layer: AnnotationLayer
score: str | None
pre_mapped: Allele | CisPhasedBlock
post_mapped: Allele | CisPhasedBlock | None
post_mapped: Allele | CisPhasedBlock


class ScoreAnnotation(BaseModel):
Expand All @@ -155,7 +155,7 @@ class ScoreAnnotation(BaseModel):
"""

pre_mapped: CisPhasedBlock | Allele
post_mapped: CisPhasedBlock | Allele
post_mapped: CisPhasedBlock | Allele | None
mavedb_id: StrictStr
relation: Literal["SO:is_homologous_to"] = "SO:is_homologous_to"
score: float | None
Expand Down
15 changes: 1 addition & 14 deletions src/dcd_mapping/vrs_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,20 +462,7 @@ def _get_variation(

# Run ga4gh_identify to assign VA digest
allele.id = ga4gh_identify(allele)

# Check if the start of an allele is covered by the alignment block for
# post-mapped genomic variants
if layer == AnnotationLayer.GENOMIC:
if pre_map:
alleles.append(allele)
else:
if (
allele.location.start >= alignment.hit_range.start
and allele.location.start < alignment.hit_range.end
):
alleles.append(allele)
else:
alleles.append(allele)
alleles.append(allele)

if not alleles:
return None
Expand Down
Loading
Loading