From d4fffce3772ed33c9d12c81b983e3e152f0c2d83 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Wed, 15 Mar 2023 10:59:39 +0100 Subject: [PATCH] feat: add support for SV TSV files (#75) (#78) --- clinvar_this/batches.py | 44 +- clinvar_this/cli.py | 9 +- clinvar_this/io/tsv.py | 557 ++++++++++++++++-- docs/file_formats.rst | 6 +- .../data/io_tsv/example_sv.bad.tsv | 1 + tests/clinvar_this/data/io_tsv/example_sv.tsv | 2 + tests/clinvar_this/test_cli.py | 43 +- tests/clinvar_this/test_io_tsv.py | 81 ++- 8 files changed, 636 insertions(+), 107 deletions(-) create mode 100644 tests/clinvar_this/data/io_tsv/example_sv.bad.tsv create mode 100644 tests/clinvar_this/data/io_tsv/example_sv.tsv diff --git a/clinvar_this/batches.py b/clinvar_this/batches.py index 4ccbcf6..9ccd985 100644 --- a/clinvar_this/batches.py +++ b/clinvar_this/batches.py @@ -122,18 +122,26 @@ def import_(config: config.Config, name: str, path: str, metadata: typing.Tuple[ logger.info("Creating new payload only") previous_submission_container = None if path.endswith(".tsv") or path.endswith(".txt"): - tsv_records = tsv.read_tsv(path=path) - batch_metadata = tsv.batch_metadata_from_mapping(metadata, use_defaults=True) - new_submission_container = tsv.tsv_records_to_submission_container( - tsv_records, batch_metadata - ) - if previous_submission_container: - submission_container = _merge_submission_container( - base=previous_submission_container, - patch=new_submission_container, - ) + tsv_type = tsv.guess_tsv_type(path) + if tsv_type in (tsv.TsvType.SEQ_VAR, tsv.TsvType.STRUC_VAR): + batch_metadata = tsv.batch_metadata_from_mapping(metadata, use_defaults=True) + if tsv_type == tsv.TsvType.SEQ_VAR: + new_submission_container = tsv.seq_var_tsv_records_to_submission_container( + tsv.read_seq_var_tsv(path=path), batch_metadata + ) + else: # tsv_type == tsv.TsvType.STRUC_VAR + new_submission_container = tsv.struc_var_tsv_records_to_submission_container( + tsv.read_struc_var_tsv(path=path), batch_metadata + ) + if previous_submission_container: + submission_container = _merge_submission_container( + base=previous_submission_container, + patch=new_submission_container, + ) + else: + submission_container = new_submission_container else: - submission_container = new_submission_container + raise exceptions.IOException(f"Could not guess TSV file type from header for {path}") _write_payload(submission_container, config.profile, name) else: # pragma: no cover raise exceptions.IOException(f"File extension of {path} cannot be handled.") @@ -152,7 +160,9 @@ def _load_latest_payload(profile: str, name: str): return common.CONVERTER.structure(payload_unstructured, models.SubmissionContainer) -def export(config: config.Config, name: str, path: str, force: bool = False): +def export( + config: config.Config, name: str, path: str, force: bool = False, struc_var: bool = False +): """Export the batch with the given ``name`` to the file at ``path``.""" if pathlib.Path(path).exists() and not force: raise exceptions.IOException( @@ -160,8 +170,14 @@ def export(config: config.Config, name: str, path: str, force: bool = False): ) if path.endswith(".tsv") or path.endswith(".txt"): payload = _load_latest_payload(config.profile, name) - tsv_records = tsv.submission_container_to_tsv_records(payload) - tsv.write_tsv(tsv_records, path=path) + if struc_var: + tsv.write_struc_var_tsv( + tsv_records=tsv.submission_container_to_struc_var_tsv_records(payload), path=path + ) + else: + tsv.write_seq_var_tsv( + tsv_records=tsv.submission_container_to_seq_var_tsv_records(payload), path=path + ) else: # pragma: no cover raise exceptions.IOException(f"File extension of {path} cannot be handled.") diff --git a/clinvar_this/cli.py b/clinvar_this/cli.py index ac4d59b..c94f2bd 100644 --- a/clinvar_this/cli.py +++ b/clinvar_this/cli.py @@ -118,16 +118,23 @@ def batch_import( @click.argument("name") @click.argument("path") @click.option("--force/--no-force", required=False, default=False, help="Overwrite existing files") +@click.option( + "--struc-var/--no-struc-var", + required=False, + default=False, + help="Export structural variants rather than sequence variants", +) @click.pass_context def batch_export( ctx: click.Context, name: str, path: str, force: bool = False, + struc_var: bool = False, ): """Export batch data to a given file""" config_obj = load_config(ctx.obj["profile"]) - batches.export(config_obj, name, path, force) + batches.export(config_obj, name, path, force, struc_var) @batch.command("update-metadata") diff --git a/clinvar_this/io/tsv.py b/clinvar_this/io/tsv.py index 6d92c33..ec163ad 100644 --- a/clinvar_this/io/tsv.py +++ b/clinvar_this/io/tsv.py @@ -38,13 +38,14 @@ from clinvar_api.msg.sub_payload import ( ClinicalFeaturesAffectedStatus, ClinicalFeaturesDb, + VariantType, ) from clinvar_this import exceptions -@attrs.define(frozen=True) -class TsvRecord: - """Record for reading.""" +@attrs.frozen +class SeqVarTsvRecord: + """Record for reading sequence variant TSV.""" #: Assembly assembly: Assembly @@ -75,7 +76,41 @@ class TsvRecord: @attrs.frozen -class HeaderColumn: +class StrucVarTsvRecord: + """Record for reading structural variant TSV.""" + + #: Assembly + assembly: Assembly + #: Chromosome + chromosome: Chromosome + #: Start position + start: int + #: Stop position + stop: int + #: Variant type + sv_type: VariantType + #: OMIM ID + omim: typing.List[str] + #: Mode of inheritance + inheritance: typing.Optional[ModeOfInheritance] + #: Clinical significance + clinical_significance_description: ClinicalSignificanceDescription + #: Local identifier of variant-condition pair. + local_key: typing.Optional[str] = None + #: Additional columns + extra_data: typing.Dict[str, str] = attrs.field(factory=dict) + #: Date of last evaluation of clinical significance + clinical_significance_date_last_evaluated: typing.Optional[str] = None + #: Additional comment of clinical significance + clinical_significance_comment: typing.Optional[str] = None + #: HPO terms for clinical features + hpo_terms: typing.Optional[typing.List[str]] = None + + +@attrs.frozen +class SeqVarHeaderColumn: + """Header column of sequence variant TSV.""" + #: Interpreted header names from TSV header_names: typing.Tuple[str] #: The corresponding key in in ``TsvRecord`` @@ -85,7 +120,28 @@ class HeaderColumn: #: Type converter on import converter: typing.Callable[[str], typing.Any] #: Extractor on export - extractor: typing.Callable[[TsvRecord], str] + extractor: typing.Callable[[SeqVarTsvRecord], str] + + @property + def canonical_name(self): + """The first entry in ``header_names`` is the canonical one.""" + return self.header_names[0] + + +@attrs.frozen +class StrucVarHeaderColumn: + """Header column of structural variant TSV.""" + + #: Interpreted header names from TSV + header_names: typing.Tuple[str] + #: The corresponding key in in ``TsvRecord`` + key: str + #: Whether the header is required + required: bool + #: Type converter on import + converter: typing.Callable[[str], typing.Any] + #: Extractor on export + extractor: typing.Callable[[StrucVarTsvRecord], str] @property def canonical_name(self): @@ -95,7 +151,10 @@ def canonical_name(self): def _str_list(val: str, pat: str = r"[;,]") -> typing.List[str]: """Split a string and return list of trimmed entries""" - return [x.strip() for x in re.split(pat, val)] + if not val: + return [] + else: + return [x.strip() for x in re.split(pat, val)] def _uuid4_if_falsy(value: typing.Optional[str] = None) -> typing.Union[str, uuid.UUID]: @@ -129,86 +188,174 @@ def _join_list(xs: typing.List[typing.Any]) -> str: return ",".join([str(x).strip() for x in xs]) -#: The header columns for TSV files. -HEADER_COLUMNS: typing.Tuple[HeaderColumn, ...] = ( - HeaderColumn( +#: The header columns for sequence variant TSV files. +SEQ_VAR_HEADER_COLUMNS: typing.Tuple[SeqVarHeaderColumn, ...] = ( + SeqVarHeaderColumn( header_names=("ASSEMBLY",), key="assembly", required=True, converter=str, extractor=lambda r: _enum_value(r.assembly), ), - HeaderColumn( + SeqVarHeaderColumn( header_names=("CHROM",), key="chromosome", required=True, converter=str, extractor=lambda r: _enum_value(r.chromosome), ), - HeaderColumn( + SeqVarHeaderColumn( header_names=("POS",), key="pos", required=True, converter=int, extractor=lambda r: str(r.pos), ), - HeaderColumn( + SeqVarHeaderColumn( header_names=("REF",), key="ref", required=True, converter=str, extractor=lambda r: str(r.ref), ), - HeaderColumn( + SeqVarHeaderColumn( header_names=("ALT",), key="alt", required=True, converter=str, extractor=lambda r: str(r.alt), ), - HeaderColumn( + SeqVarHeaderColumn( + header_names=("OMIM",), + key="omim", + required=True, + converter=_str_list, + extractor=lambda r: _join_list(r.omim), + ), + SeqVarHeaderColumn( + header_names=("MOI",), + key="inheritance", + required=True, + converter=lambda x: x or None, + extractor=lambda r: _enum_value_or_empty(r.inheritance), + ), + SeqVarHeaderColumn( + header_names=("CLIN_SIG",), + key="clinical_significance_description", + required=True, + converter=str, + extractor=lambda r: str(r.clinical_significance_description), + ), + SeqVarHeaderColumn( + header_names=("CLIN_EVAL",), + key="clinical_significance_date_last_evaluated", + required=False, + converter=_today_if_falsy, + extractor=lambda r: str(r.clinical_significance_date_last_evaluated or ""), + ), + SeqVarHeaderColumn( + header_names=("CLIN_COMMENT",), + key="clinical_significance_comment", + required=False, + converter=lambda x: x or None, + extractor=lambda r: str(r.clinical_significance_comment or ""), + ), + SeqVarHeaderColumn( + header_names=("KEY",), + key="local_key", + required=False, + converter=_uuid4_if_falsy, + extractor=lambda r: str(r.local_key), + ), + SeqVarHeaderColumn( + header_names=("HPO",), + key="hpo_terms", + required=False, + converter=_str_list, + extractor=lambda r: _join_list(r.omim), + ), +) + +#: The header columns for structural variant TSV files. +STRUC_VAR_HEADER_COLUMNS: typing.Tuple[StrucVarHeaderColumn, ...] = ( + StrucVarHeaderColumn( + header_names=("ASSEMBLY",), + key="assembly", + required=True, + converter=str, + extractor=lambda r: _enum_value(r.assembly), + ), + StrucVarHeaderColumn( + header_names=("CHROM",), + key="chromosome", + required=True, + converter=str, + extractor=lambda r: _enum_value(r.chromosome), + ), + StrucVarHeaderColumn( + header_names=("START",), + key="start", + required=True, + converter=int, + extractor=lambda r: str(r.start), + ), + StrucVarHeaderColumn( + header_names=("STOP",), + key="stop", + required=True, + converter=int, + extractor=lambda r: str(r.stop), + ), + StrucVarHeaderColumn( + header_names=("SV_TYPE",), + key="sv_type", + required=True, + converter=str, + extractor=lambda r: str(r.sv_type), + ), + StrucVarHeaderColumn( header_names=("OMIM",), key="omim", required=True, converter=_str_list, extractor=lambda r: _join_list(r.omim), ), - HeaderColumn( + StrucVarHeaderColumn( header_names=("MOI",), key="inheritance", required=True, converter=lambda x: x or None, extractor=lambda r: _enum_value_or_empty(r.inheritance), ), - HeaderColumn( + StrucVarHeaderColumn( header_names=("CLIN_SIG",), key="clinical_significance_description", required=True, converter=str, extractor=lambda r: str(r.clinical_significance_description), ), - HeaderColumn( + StrucVarHeaderColumn( header_names=("CLIN_EVAL",), key="clinical_significance_date_last_evaluated", required=False, converter=_today_if_falsy, extractor=lambda r: str(r.clinical_significance_date_last_evaluated or ""), ), - HeaderColumn( + StrucVarHeaderColumn( header_names=("CLIN_COMMENT",), key="clinical_significance_comment", required=False, converter=lambda x: x or None, extractor=lambda r: str(r.clinical_significance_comment or ""), ), - HeaderColumn( + StrucVarHeaderColumn( header_names=("KEY",), key="local_key", required=False, converter=_uuid4_if_falsy, extractor=lambda r: str(r.local_key), ), - HeaderColumn( + StrucVarHeaderColumn( header_names=("HPO",), key="hpo_terms", required=False, @@ -218,13 +365,41 @@ def _join_list(xs: typing.List[typing.Any]) -> str: ) -def _map_header(header: typing.List[str]) -> typing.List[typing.Optional[HeaderColumn]]: - """Map header row from TSV file to header columns +class TsvType(enum.Enum): + """Type of TSV file.""" + + #: Sequence variants. + SEQ_VAR = "seqvar" + #: Structural variants. + STRUC_VAR = "strucvar" + + +def guess_tsv_type(path: str) -> typing.Optional[TsvType]: + """Guess TSV type.""" + with open(path, "rt") as inputf: + arr = inputf.readline().strip().split("\t") + try: + _map_seq_var_header(arr) + return TsvType.SEQ_VAR + except exceptions.InvalidFormat: + try: + _map_struc_var_header(arr) + return TsvType.STRUC_VAR + except exceptions.InvalidFormat: + return None + + +def _map_seq_var_header( + header: typing.List[str], +) -> typing.List[typing.Optional[SeqVarHeaderColumn]]: + """Map header row from sequence variant TSV file to header columns Map to ``None`` for extra data columns. Raises if a required column is missing. """ - seen_required = {column.canonical_name: False for column in HEADER_COLUMNS if column.required} - by_name = {name: column for column in HEADER_COLUMNS for name in column.header_names} + seen_required = { + column.canonical_name: False for column in SEQ_VAR_HEADER_COLUMNS if column.required + } + by_name = {name: column for column in SEQ_VAR_HEADER_COLUMNS for name in column.header_names} result = [] for entry in header: column = by_name.get(entry) @@ -239,8 +414,65 @@ def _map_header(header: typing.List[str]) -> typing.List[typing.Optional[HeaderC return result -def _read_tsv_file(inputf: typing.TextIO) -> typing.List[TsvRecord]: - """Read TSV from the given file.""" +def _map_struc_var_header( + header: typing.List[str], +) -> typing.List[typing.Optional[StrucVarHeaderColumn]]: + """Map header row from structural variant TSV file to header columns + + Map to ``None`` for extra data columns. Raises if a required column is missing. + """ + seen_required = { + column.canonical_name: False for column in STRUC_VAR_HEADER_COLUMNS if column.required + } + by_name = {name: column for column in STRUC_VAR_HEADER_COLUMNS for name in column.header_names} + result = [] + for entry in header: + column = by_name.get(entry) + if column: + seen_required[column.canonical_name] = True + result.append(column) + + missing_columns = [name for name, seen in seen_required.items() if not seen] + if missing_columns: + raise exceptions.InvalidFormat(f"Missing columns in TSV file: {missing_columns}") + + return result + + +def _read_seq_var_tsv_file(inputf: typing.TextIO) -> typing.List[SeqVarTsvRecord]: + """Read sequence variant TSV from the given file.""" + + def row_empty(row: typing.List[str]) -> bool: + return not row or not [val.strip() for val in row if val.strip()] + + reader = csv.reader(inputf, delimiter="\t") + header_row = None + headers = None + + result: typing.List[SeqVarTsvRecord] = [] + for lineno, row in enumerate(reader): + if row_empty(row): + continue # skip empty lines + if header_row: + raw_record = {} + extra_data = {} + if len(row) != len(header_row): + raise exceptions.InvalidFormat(f"Wrong number of rows in line {lineno+1}") + for value, header, header_name in zip(row, headers, header_row): + if header: + raw_record[header.key] = header.converter(value) + else: + extra_data[header_name] = value + record = cattrs.structure(raw_record, SeqVarTsvRecord) + result.append(attrs.evolve(record, extra_data=extra_data)) + else: + header_row = row + headers = _map_seq_var_header(row) + return result + + +def _read_struc_var_tsv_file(inputf: typing.TextIO) -> typing.List[StrucVarTsvRecord]: + """Read structural variant TSV from the given file.""" def row_empty(row: typing.List[str]) -> bool: return not row or not [val.strip() for val in row if val.strip()] @@ -249,7 +481,7 @@ def row_empty(row: typing.List[str]) -> bool: header_row = None headers = None - result: typing.List[TsvRecord] = [] + result: typing.List[StrucVarTsvRecord] = [] for lineno, row in enumerate(reader): if row_empty(row): continue # skip empty lines @@ -263,31 +495,65 @@ def row_empty(row: typing.List[str]) -> bool: raw_record[header.key] = header.converter(value) else: extra_data[header_name] = value - record = cattrs.structure(raw_record, TsvRecord) + record = cattrs.structure(raw_record, StrucVarTsvRecord) result.append(attrs.evolve(record, extra_data=extra_data)) else: header_row = row - headers = _map_header(row) + headers = _map_struc_var_header(row) return result -def read_tsv( +def read_seq_var_tsv( *, file: typing.Optional[typing.TextIO] = None, path: typing.Union[None, str, pathlib.Path] = None, -) -> typing.List[TsvRecord]: - """Read TSV from either file or path""" +) -> typing.List[SeqVarTsvRecord]: + """Read sequence variant TSV from either file or path""" if file: - return _read_tsv_file(file) + return _read_seq_var_tsv_file(file) elif path: with pathlib.Path(path).open("rt") as inputf: - return _read_tsv_file(inputf) + return _read_seq_var_tsv_file(inputf) else: raise TypeError("You have to provide either file or path") -def _write_tsv_file(tsv_records: typing.Iterable[TsvRecord], outputf: typing.TextIO): - """Write records as TSV to the given file.""" +def read_struc_var_tsv( + *, + file: typing.Optional[typing.TextIO] = None, + path: typing.Union[None, str, pathlib.Path] = None, +) -> typing.List[StrucVarTsvRecord]: + """Read structural variant TSV from either file or path""" + if file: + return _read_struc_var_tsv_file(file) + elif path: + with pathlib.Path(path).open("rt") as inputf: + return _read_struc_var_tsv_file(inputf) + else: + raise TypeError("You have to provide either file or path") + + +def _write_seq_var_tsv_file(tsv_records: typing.Iterable[SeqVarTsvRecord], outputf: typing.TextIO): + """Write sequence variant records as TSV to the given file.""" + extra_keys = [] + for record in tsv_records: + if record.extra_data: + for key in record.extra_data: + if key not in extra_keys: + extra_keys.append(key) + writer = csv.writer(outputf, delimiter="\t") + writer.writerow([h.canonical_name for h in SEQ_VAR_HEADER_COLUMNS] + extra_keys) + for record in tsv_records: + row = [hc.extractor(record) for hc in SEQ_VAR_HEADER_COLUMNS] + [ + record.extra_data.get(extra_key, "") for extra_key in extra_keys + ] + writer.writerow(row) + + +def _write_struc_var_tsv_file( + tsv_records: typing.Iterable[StrucVarTsvRecord], outputf: typing.TextIO +): + """Write structural variant records as TSV to the given file.""" extra_keys = [] for record in tsv_records: if record.extra_data: @@ -295,26 +561,42 @@ def _write_tsv_file(tsv_records: typing.Iterable[TsvRecord], outputf: typing.Tex if key not in extra_keys: extra_keys.append(key) writer = csv.writer(outputf, delimiter="\t") - writer.writerow([h.canonical_name for h in HEADER_COLUMNS] + extra_keys) + writer.writerow([h.canonical_name for h in STRUC_VAR_HEADER_COLUMNS] + extra_keys) for record in tsv_records: - row = [hc.extractor(record) for hc in HEADER_COLUMNS] + [ + row = [hc.extractor(record) for hc in STRUC_VAR_HEADER_COLUMNS] + [ record.extra_data.get(extra_key, "") for extra_key in extra_keys ] writer.writerow(row) -def write_tsv( - tsv_records: typing.Iterable[TsvRecord], +def write_seq_var_tsv( + tsv_records: typing.Iterable[SeqVarTsvRecord], *, file: typing.Optional[typing.TextIO] = None, path: typing.Union[None, str, pathlib.Path] = None, ): - """Write TSV to either file or path""" + """Write sequence variant TSV to either file or path""" if file: - return _write_tsv_file(tsv_records, file) + return _write_seq_var_tsv_file(tsv_records, file) elif path: with pathlib.Path(path).open("wt") as outputf: - return _write_tsv_file(tsv_records, outputf) + return _write_seq_var_tsv_file(tsv_records, outputf) + else: + raise TypeError("You have to provide either file or path") + + +def write_struc_var_tsv( + tsv_records: typing.Iterable[StrucVarTsvRecord], + *, + file: typing.Optional[typing.TextIO] = None, + path: typing.Union[None, str, pathlib.Path] = None, +): + """Write structural variant TSV to either file or path""" + if file: + return _write_struc_var_tsv_file(tsv_records, file) + elif path: + with pathlib.Path(path).open("wt") as outputf: + return _write_struc_var_tsv_file(tsv_records, outputf) else: raise TypeError("You have to provide either file or path") @@ -366,13 +648,13 @@ def batch_metadata_from_mapping( return BatchMetadata(**kwargs) -def tsv_records_to_submission_container( - tsv_records: typing.List[TsvRecord], +def seq_var_tsv_records_to_submission_container( + tsv_records: typing.List[SeqVarTsvRecord], batch_metadata: BatchMetadata, ) -> SubmissionContainer: - """Convert TSV records to submission container data structure.""" + """Convert seq. var. TSV records to submission container data structure.""" - def record_condition(record: TsvRecord) -> SubmissionCondition: + def record_condition(record: SeqVarTsvRecord) -> SubmissionCondition: """Construct ``SubmissionCondition`` from ``TsvRecord``.""" if not record.omim or record.omim == ["not provided"]: return SubmissionCondition(name="not provided") @@ -380,7 +662,7 @@ def record_condition(record: TsvRecord) -> SubmissionCondition: return SubmissionCondition(db=ConditionDb.OMIM, id=record.omim[0]) def record_clinical_features( - record: TsvRecord, + record: SeqVarTsvRecord, ) -> typing.Optional[typing.List[SubmissionClinicalFeature]]: """Construct ``typing.Optional[typing.List[SubmissionClinicalFeature]]`` from ``TsvRecord``.""" if record.hpo_terms: @@ -449,9 +731,91 @@ def record_clinical_features( ) -def submission_container_to_tsv_records( +def struc_var_tsv_records_to_submission_container( + tsv_records: typing.List[StrucVarTsvRecord], + batch_metadata: BatchMetadata, +) -> SubmissionContainer: + """Convert struc. var. TSV records to submission container data structure.""" + + def record_condition(record: StrucVarTsvRecord) -> SubmissionCondition: + """Construct ``SubmissionCondition`` from ``TsvRecord``.""" + if not record.omim or record.omim == ["not provided"]: + return SubmissionCondition(name="not provided") + else: + return SubmissionCondition(db=ConditionDb.OMIM, id=record.omim[0]) + + def record_clinical_features( + record: StrucVarTsvRecord, + ) -> typing.Optional[typing.List[SubmissionClinicalFeature]]: + """Construct ``typing.Optional[typing.List[SubmissionClinicalFeature]]`` from ``TsvRecord``.""" + if record.hpo_terms: + return [ + SubmissionClinicalFeature( + clinical_features_affected_status=ClinicalFeaturesAffectedStatus.PRESENT, + db=ClinicalFeaturesDb.HP, + id=hpo_term, + ) + for hpo_term in record.hpo_terms + ] + else: + return None + + allele_origin = batch_metadata.allele_origin or BATCH_METADATA_DEFAULTS["batch_metadata"] + collection_method = ( + batch_metadata.collection_method or BATCH_METADATA_DEFAULTS["collection_method"] + ) + release_status = batch_metadata.release_status or BATCH_METADATA_DEFAULTS["release_status"] + + return SubmissionContainer( + assertion_criteria=SubmissionAssertionCriteria( + # The following should come from the profile, cf. + # + # https://github.com/bihealth/clinvar-this/issues/36 + db=CitationDb.PUBMED, + id="25741868", + ), + clinvar_submission_release_status=release_status, + clinvar_submission=[ + SubmissionClinvarSubmission( + local_id=str(_uuid4_if_falsy()), + local_key=record.local_key, + condition_set=SubmissionConditionSet(condition=[record_condition(record)]), + observed_in=[ + SubmissionObservedIn( + affected_status=AffectedStatus.YES, + allele_origin=allele_origin, + collection_method=collection_method, + clinical_features=record_clinical_features(record), + ) + ], + clinical_significance=SubmissionClinicalSignificance( + clinical_significance_description=record.clinical_significance_description, + mode_of_inheritance=record.inheritance, + ), + record_status=RecordStatus.NOVEL, + variant_set=SubmissionVariantSet( + variant=[ + SubmissionVariant( + chromosome_coordinates=SubmissionChromosomeCoordinates( + assembly=record.assembly, + chromosome=record.chromosome, + start=record.start, + stop=record.stop, + ), + variant_type=record.sv_type, + ) + ] + ), + extra_data=record.extra_data or None, # prefer ``None`` over ``{}`` + ) + for record in tsv_records + ], + ) + + +def submission_container_to_seq_var_tsv_records( submission_container: SubmissionContainer, -) -> typing.List[TsvRecord]: +) -> typing.List[SeqVarTsvRecord]: def _condition(submission: SubmissionClinvarSubmission) -> typing.List[str]: if not submission.condition_set.condition: raise exceptions.ClinvarThisException( @@ -471,7 +835,9 @@ def _inheritance(submission: SubmissionClinvarSubmission) -> typing.Optional[Mod else: return None - def submission_to_tsv_record(submission: SubmissionClinvarSubmission) -> TsvRecord: + def submission_to_seq_var_tsv_record( + submission: SubmissionClinvarSubmission, + ) -> SeqVarTsvRecord: if not submission.variant_set: raise exceptions.ClinvarThisException( "Problem with internal data structure - no variant set" @@ -505,7 +871,7 @@ def submission_to_tsv_record(submission: SubmissionClinvarSubmission) -> TsvReco if submission.extra_data: extra_data.update(submission.extra_data) - return TsvRecord( + return SeqVarTsvRecord( assembly=chromosome_coordinates.assembly, chromosome=chromosome_coordinates.chromosome, pos=chromosome_coordinates.start, @@ -523,4 +889,89 @@ def submission_to_tsv_record(submission: SubmissionClinvarSubmission) -> TsvReco clinvar_submissions = submission_container.clinvar_submission or [] - return [submission_to_tsv_record(submission) for submission in clinvar_submissions] + return [submission_to_seq_var_tsv_record(submission) for submission in clinvar_submissions] + + +def submission_container_to_struc_var_tsv_records( + submission_container: SubmissionContainer, +) -> typing.List[StrucVarTsvRecord]: + def _condition(submission: SubmissionClinvarSubmission) -> typing.List[str]: + if not submission.condition_set.condition: + raise exceptions.ClinvarThisException( + "Problem with internal data structure - condition cannot be empty" + ) + if submission.condition_set.condition[0].name: + return [] # not provided + else: + if submission.condition_set.condition[0].id: + return [submission.condition_set.condition[0].id] + else: + return [] + + def _inheritance(submission: SubmissionClinvarSubmission) -> typing.Optional[ModeOfInheritance]: + if submission.clinical_significance.mode_of_inheritance: + return submission.clinical_significance.mode_of_inheritance + else: + return None + + def submission_to_struc_var_tsv_record( + submission: SubmissionClinvarSubmission, + ) -> StrucVarTsvRecord: + if not submission.variant_set: + raise exceptions.ClinvarThisException( + "Problem with internal data structure - no variant set" + ) + elif not submission.variant_set.variant: + raise exceptions.ClinvarThisException( + "Problem with internal data structure - no variant" + ) + elif not submission.variant_set.variant[0].chromosome_coordinates: + raise exceptions.ClinvarThisException( + "Problem with internal data structure - no chromosome coordinates" + ) + else: + chromosome_coordinates: SubmissionChromosomeCoordinates = ( + submission.variant_set.variant[0].chromosome_coordinates + ) + variant_type: typing.Optional[VariantType] = submission.variant_set.variant[ + 0 + ].variant_type + if not ( + chromosome_coordinates.assembly + and chromosome_coordinates.chromosome + and chromosome_coordinates.start + and chromosome_coordinates.stop + ): + raise exceptions.ClinvarThisException( + "Problem with internal data structure - incomplete coordinates" + ) + if not variant_type: + raise exceptions.ClinvarThisException( + "Problem with internal data structure - no variant type" + ) + + extra_data = {} + if submission.clinvar_accession: + extra_data["clinvar_accession"] = submission.clinvar_accession # XXX + if submission.extra_data: + extra_data.update(submission.extra_data) + + return StrucVarTsvRecord( + assembly=chromosome_coordinates.assembly, + chromosome=chromosome_coordinates.chromosome, + start=chromosome_coordinates.start, + stop=chromosome_coordinates.stop, + sv_type=variant_type, + omim=_condition(submission), + inheritance=_inheritance(submission), + clinical_significance_description=submission.clinical_significance.clinical_significance_description, + local_key=submission.local_key or "", + extra_data=extra_data, + clinical_significance_date_last_evaluated=submission.clinical_significance.date_last_evaluated + or "", + clinical_significance_comment=submission.clinical_significance.comment or "", + ) + + clinvar_submissions = submission_container.clinvar_submission or [] + + return [submission_to_struc_var_tsv_record(submission) for submission in clinvar_submissions] diff --git a/docs/file_formats.rst b/docs/file_formats.rst index 94befa7..9c01a4f 100644 --- a/docs/file_formats.rst +++ b/docs/file_formats.rst @@ -10,9 +10,9 @@ In the case of the non-native TSV format, this section documents how the file fo Overall, the aim of clinvar-this is to support you in submitting data easily with restrictions (see :ref:`limitations`). If you need the full functionality of the NCBI ClinVar API then please consider using the ``clinvar_api`` Python module. --------------------------- -Small Variant TSV (Native) --------------------------- +----------------------------- +Sequence Variant TSV (Native) +----------------------------- The following headers are required. Clinvar-this will recognize the TSV file format based on these headers. diff --git a/tests/clinvar_this/data/io_tsv/example_sv.bad.tsv b/tests/clinvar_this/data/io_tsv/example_sv.bad.tsv new file mode 100644 index 0000000..67be85f --- /dev/null +++ b/tests/clinvar_this/data/io_tsv/example_sv.bad.tsv @@ -0,0 +1 @@ +bad diff --git a/tests/clinvar_this/data/io_tsv/example_sv.tsv b/tests/clinvar_this/data/io_tsv/example_sv.tsv new file mode 100644 index 0000000..b3e0187 --- /dev/null +++ b/tests/clinvar_this/data/io_tsv/example_sv.tsv @@ -0,0 +1,2 @@ +ASSEMBLY CHROM START STOP SV_TYPE OMIM MOI CLIN_SIG HPO +GRCh38 1 844347 4398122 Deletion Autosomal dominant inheritance not provided HP:0001263 diff --git a/tests/clinvar_this/test_cli.py b/tests/clinvar_this/test_cli.py index 48e0f4d..e4e0854 100644 --- a/tests/clinvar_this/test_cli.py +++ b/tests/clinvar_this/test_cli.py @@ -79,26 +79,33 @@ def test_call_config_set_fail_invalid_name(): @pytest.mark.parametrize( - "force", + "force,struc_var", [ - True, - False, + (True, False), + (False, False), + (True, True), + (False, True), ], ) -def test_call_batch_export(fs_config, monkeypatch, force): - """Unit test of ``batch export``, stubs out call of ``batches.list`` and checks results.""" +def test_call_batch_export(fs_config, monkeypatch, force, struc_var): + """Unit test of ``batch export`` with both sequence and structural variant. + + The test stubs out call of ``batches.list`` and checks results. + """ mock_export = MagicMock() monkeypatch.setattr(batches, "export", mock_export) args = ["batch", "export", "batch-name", "out-tsv"] if force: args.append("--force") + if struc_var: + args.append("--struc-var") runner = CliRunner() result = runner.invoke(cli.cli, args) mock_export.assert_called_once() - assert len(mock_export.call_args.args) == 4 + assert len(mock_export.call_args.args) == 5 assert len(mock_export.call_args.kwargs) == 0 assert ( str(mock_export.call_args.args[0]) @@ -107,6 +114,7 @@ def test_call_batch_export(fs_config, monkeypatch, force): assert mock_export.call_args.args[1] == "batch-name" assert mock_export.call_args.args[2] == "out-tsv" assert mock_export.call_args.args[3] == force + assert mock_export.call_args.args[4] == struc_var assert result.exit_code == 0 @@ -119,7 +127,10 @@ def test_call_batch_export(fs_config, monkeypatch, force): ], ) def test_call_batch_import(fs_config, monkeypatch, name, metadata): - """Unit test of ``batch import``, stubs out call of ``batches.import_`` and checks results.""" + """Unit test of ``batch import``. + + The test stubs out call of ``batches.import_`` and checks results. + """ mock_import = MagicMock() monkeypatch.setattr(batches, "import_", mock_import) @@ -149,24 +160,6 @@ def test_call_batch_import(fs_config, monkeypatch, name, metadata): assert result.exit_code == 0 -def test_call_batch_list(fs_config, monkeypatch): - """Unit test of ``batch list``, stubs out call of ``batches.list`` and checks results.""" - mock_list = MagicMock() - monkeypatch.setattr(batches, "list_", mock_list) - - runner = CliRunner() - result = runner.invoke(cli.cli, ["batch", "list"]) - - mock_list.assert_called_once() - assert len(mock_list.call_args.args) == 1 - assert len(mock_list.call_args.kwargs) == 0 - assert ( - str(mock_list.call_args.args[0]) - == "Config(profile='default', auth_token='****', verify_ssl=True)" - ) - assert result.exit_code == 0 - - @pytest.mark.parametrize( "name,metadata", [ diff --git a/tests/clinvar_this/test_io_tsv.py b/tests/clinvar_this/test_io_tsv.py index 14166c7..eee7e9f 100644 --- a/tests/clinvar_this/test_io_tsv.py +++ b/tests/clinvar_this/test_io_tsv.py @@ -7,17 +7,25 @@ Chromosome, ClinicalSignificanceDescription, ModeOfInheritance, + VariantType, ) from clinvar_this import exceptions -from clinvar_this.io.tsv import TsvRecord, read_tsv +from clinvar_this.io.tsv import ( + SeqVarTsvRecord, + StrucVarTsvRecord, + TsvType, + guess_tsv_type, + read_seq_var_tsv, + read_struc_var_tsv, +) DATA_DIR = pathlib.Path(__file__).parent / "data/io_tsv" -def test_read_tsv_path(): - actual = read_tsv(path=DATA_DIR / "example.tsv") +def test_read_seq_var_tsv_path(): + actual = read_seq_var_tsv(path=DATA_DIR / "example.tsv") assert actual == [ - TsvRecord( + SeqVarTsvRecord( assembly=Assembly.GRCH37, chromosome=Chromosome.CHR10, pos=115614632, @@ -32,11 +40,28 @@ def test_read_tsv_path(): ] -def test_read_tsv_file(): +def test_read_struc_var_tsv_path(): + actual = read_struc_var_tsv(path=DATA_DIR / "example_sv.tsv") + assert actual == [ + StrucVarTsvRecord( + assembly=Assembly.GRCH38, + chromosome=Chromosome.CHR1, + start=844347, + stop=4398122, + sv_type=VariantType.DELETION, + omim=[], + inheritance=ModeOfInheritance.AUTOSOMAL_DOMINANT_INHERITANCE, + clinical_significance_description=ClinicalSignificanceDescription.NOT_PROVIDED, + hpo_terms=["HP:0001263"], + ) + ] + + +def test_read_seq_var_tsv_file(): with (DATA_DIR / "example.tsv").open("rt") as inputf: - actual = read_tsv(file=inputf) + actual = read_seq_var_tsv(file=inputf) assert actual == [ - TsvRecord( + SeqVarTsvRecord( assembly=Assembly.GRCH37, chromosome=Chromosome.CHR10, pos=115614632, @@ -51,11 +76,45 @@ def test_read_tsv_file(): ] -def test_read_tsv_path_bad(): +def test_read_struc_var_tsv_file(): + with (DATA_DIR / "example_sv.tsv").open("rt") as inputf: + actual = read_struc_var_tsv(file=inputf) + assert actual == [ + StrucVarTsvRecord( + assembly=Assembly.GRCH38, + chromosome=Chromosome.CHR1, + start=844347, + stop=4398122, + sv_type=VariantType.DELETION, + omim=[], + inheritance=ModeOfInheritance.AUTOSOMAL_DOMINANT_INHERITANCE, + clinical_significance_description=ClinicalSignificanceDescription.NOT_PROVIDED, + hpo_terms=["HP:0001263"], + ) + ] + + +def test_read_seq_var_tsv_path_bad(): + with pytest.raises(exceptions.InvalidFormat): + read_seq_var_tsv(path=DATA_DIR / "example.bad.tsv") + + +def test_read_struc_var_tsv_path_bad(): with pytest.raises(exceptions.InvalidFormat): - read_tsv(path=DATA_DIR / "example.bad.tsv") + read_struc_var_tsv(path=DATA_DIR / "example_sv.bad.tsv") -def test_read_tsv_error(): +def test_read_seq_var_tsv_error(): with pytest.raises(TypeError): - read_tsv() + read_seq_var_tsv() + + +def test_read_struc_var_tsv_error(): + with pytest.raises(TypeError): + read_struc_var_tsv() + + +def test_guess_file_type(): + assert guess_tsv_type(path=DATA_DIR / "example.bad.tsv") == None # noqa: E711 + assert guess_tsv_type(path=DATA_DIR / "example.tsv") == TsvType.SEQ_VAR + assert guess_tsv_type(path=DATA_DIR / "example_sv.tsv") == TsvType.STRUC_VAR