broadinstitute · epiercehoffman · Aug 29, 2023 · Aug 17, 2023 · Aug 17, 2023 · Aug 17, 2023
diff --git a/README.md b/README.md
@@ -62,7 +62,15 @@ We still encourage members of the community to adapt GATK-SV for non-GCP backend
 
 ### Data:
 * Illumina short-read whole-genome CRAMs or BAMs, aligned to hg38 with [bwa-mem](https://github.com/lh3/bwa). BAMs must also be indexed.
-* Family structure definitions file in [PED format](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format). Sex aneuploidies (detected in [EvidenceQC](#evidence-qc)) should be entered as sex = 0.
+* Family structure definitions file in [PED format](#ped-format).
+
+#### <a name="ped-format">PED file format</a>
+The PED file format is described [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format). Note that GATK-SV imposes additional requirements:
+* The file must be tab-delimited.
+* The sex column must only contain integer values: 1=Male, 2=Female, 0=Other/Unknown. Sex chromosome aneuploidies (detected in [EvidenceQC](#evidence-qc)) should be entered as sex = 0.
+* All family, individual, and parental IDs must conform to the [sample ID requirements](#sampleids).
+* Missing parental IDs should be entered as 0.
+To validate the PED file, you may use `src/sv-pipeline/scripts/validate_ped.py -p pedigree.ped -s samples.list`.
 
 #### <a name="sample-exclusion">Sample Exclusion</a>
 We recommend filtering out samples with a high percentage of improperly paired reads (>10% or an outlier for your data) as technical outliers prior to running [GatherSampleEvidence](#gather-sample-evidence). A high percentage of improperly paired reads may indicate issues with library prep, degradation, or contamination. Artifactual improperly paired reads could cause incorrect SV calls, and these samples have been observed to have longer runtimes and higher compute costs for [GatherSampleEvidence](#gather-sample-evidence).

diff --git a/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl b/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl
@@ -26,7 +26,7 @@ The following cohort-level or batch-level inputs are also required:
 |---------|--------|--------------|
 |`String`|`sample_set_id`|Batch identifier|
 |`String`|`sample_set_set_id`|Cohort identifier|
-|`File`|`cohort_ped_file`|Path to the GCS location of a family structure definitions file in [PED format](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format). Sex aneuploidies (detected in `02-EvidenceQC`) should be entered as sex = 0.|
+|`File`|`cohort_ped_file`|Path to the GCS location of a family structure definitions file in [PED format](https://github.com/broadinstitute/gatk-sv#ped-format).|
 
 ### Pipeline outputs
 

diff --git a/src/sv-pipeline/scripts/validate_ped.py b/src/sv-pipeline/scripts/validate_ped.py
@@ -0,0 +1,137 @@
+#!/bin/python
+
+import argparse
+import re
+
+
+"""
+Summary: Validates a PED file for use with GATK-SV. Performs some sample ID validations as well.
+
+Usage: python validate_ped.py -p pedigree.ped -s samples.list
+
+Outputs: The script will write to stderr "PED file passes validation!" if successful, and
+    otherwise it will print an error message describing the PED file format violation.
+"""
+
+
+FIELD_NUMBER_ID = 1
+FIELD_NUMBER_SEX = 4
+ILLEGAL_ID_SUBSTRINGS = ["chr", "name", "DEL", "DUP", "CPX", "CHROM"]
+
+
+def validate_id(identifier, id_set, id_type, source_file):
+    # check for empty IDs
+    if identifier is None or identifier == "":
+        raise ValueError(f"Empty {id_type} ID in {source_file}")
+
+    # check all characters are alphanumeric or underscore
+    if not re.match(r'^\w+$', identifier):
+        raise ValueError(f"Invalid {id_type} ID in {source_file}: '{identifier}'. " +
+                         "IDs should only contain alphanumeric and underscore characters.")
+
+    # check for all-numeric IDs
+    if id_type != "family" and identifier.isdigit():
+        raise ValueError(f"Invalid {id_type} ID in {source_file}: {identifier}. " +
+                         "IDs should not contain only numeric characters.")
+
+    # check for illegal substrings
+    for sub in ILLEGAL_ID_SUBSTRINGS:
+        if sub in identifier:
+            raise ValueError(f"Invalid {id_type} ID in {source_file}: {identifier}. " +
+                             f"IDs cannot contain the following substrings: {', '.join(ILLEGAL_ID_SUBSTRINGS)}.")
+
+    # check for duplicate IDs
+    if id_set is not None:
+        if identifier in id_set:
+            raise ValueError(f"Duplicate {id_type} ID in {source_file}: {identifier}")
+
+
+def get_samples(samples_file):
+    # Note that this sample ID validation is incomplete:
+    # * It does not check if sample IDs are substrings of other sample IDs
+    # * It only checks the provided sample list for duplicates, which is not the full cohort in GatherBatchEvidence
+    samples = set()
+    with open(samples_file, 'r') as samp:
+        for line in samp:
+            sample = line.strip()
+            validate_id(sample, samples, "sample", "sample list")
+            samples.add(sample)
+
+    return samples
+
+
+def validate_ped(ped_file, samples):
+    seen_sex_1 = False
+    seen_sex_2 = False
+    samples_found = set()
+    with open(ped_file, 'r') as ped:
+        for line in ped:
+            # specification allows commented lines, which should be removed by SubsetPedFile task
+            if line.startswith("#"):
+                continue
+
+            # we require tab-delimited PED files although the specification does not
+            fields = line.strip().split("\t")
+            if len(fields) < 6:
+                raise ValueError("Invalid PED file. PED file must be tab-delimited and have 6 columns: " +
+                                 "family ID, sample ID, paternal ID, maternal ID, sex, and phenotype.")
+
+            # validate IDs
+            # don't check for duplicates here:
+            # family and parent IDs may appear multiple times, and sample IDs checked elsewhere
+            for identifier, id_type in zip(fields[:FIELD_NUMBER_SEX], ["family", "sample", "parent", "parent"]):
+                if id_type == "parent" and identifier == "0":
+                    continue  # allow maternal and paternal ID to be 0 (otherwise all-numeric IDs not allowed)
+                validate_id(identifier, None, id_type, "PED file")
+
+            # check for at least one appearance each of 1 and 2 in sex column
+            # this check is to ensure M/F are coded as 1 and 2 (not 0 and 1)
+            # and both M and F are present (for sex-specific steps)
+            sample_id = fields[FIELD_NUMBER_ID]
+            sex = fields[FIELD_NUMBER_SEX]
+            if sex == "1":
+                seen_sex_1 = True
+            elif sex == "2":
+                seen_sex_2 = True
+
+            # require integer sex
+            elif not sex.isdigit():
+                raise ValueError(f"Sample {sample_id} has an invalid value for sex: {sex}. " +
+                                 "PED file must use integer values for sex: 1=Male, 2=Female, 0=Unknown/Other.")
+
+            # check all samples in samples list are present in PED file exactly once
+            # no duplicate sample IDs
+            if sample_id in samples_found:
+                raise ValueError(f"Invalid PED file. PED file has duplicate entries for sample {sample_id}.")
+            elif sample_id in samples:
+                samples_found.add(sample_id)
+            # if not in samples list, ignore: ok for PED file to contain samples not in sample list
+
+    # check if all samples in sample list are present in PED file
+    if len(samples_found) < len(samples):
+        missing = samples - samples_found
+        raise ValueError(f"Invalid PED file. PED file is missing sample(s): {','.join(missing)}.")
+
+    if not (seen_sex_2 and seen_sex_1):
+        raise ValueError(f"Invalid PED file. PED file must use integer values for sex: " +
+                         "1=Male, 2=Female, 0=Unknown/Other. PED file must contain at least " +
+                         "one sample with sex=1 and one with sex=2.")
+
+    # passed validation tests
+    sys.stderr.write("PED file passes validation!")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("-p", "--ped-file", required=True, help="PED file to validate")
+    parser.add_argument("-s", "--samples-file", required=True,
+                        help="File containing samples (one per line) that should be in PED file")
+    args = parser.parse_args()
+
+    samples = get_samples(args.samples_file)
+    validate_ped(args.ped_file, samples)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/wdl/GatherBatchEvidence.wdl b/wdl/GatherBatchEvidence.wdl
@@ -161,6 +161,7 @@ workflow GatherBatchEvidence {
     RuntimeAttr? ploidy_score_runtime_attr
     RuntimeAttr? ploidy_build_runtime_attr
     RuntimeAttr? runtime_attr_subset_ped
+    RuntimeAttr? runtime_attr_validate_ped
     RuntimeAttr? add_sample_to_ped_runtime_attr
     RuntimeAttr? condense_counts_runtime_attr
     RuntimeAttr? preprocess_calls_runtime_attr
@@ -214,6 +215,14 @@ workflow GatherBatchEvidence {
   }
 
   Array[String] samples_batch = select_first([ref_panel_samples, samples])
+  call util.ValidatePedFile {
+    input:
+      ped_file = ped_file,
+      sample_list = write_lines(samples_batch),
+      sv_pipeline_docker = sv_pipeline_docker,
+      runtime_attr_override = runtime_attr_validate_ped
+  }
+
   call util.SubsetPedFile {
     input:
       ped_file = ped_file,

diff --git a/wdl/Utils.wdl b/wdl/Utils.wdl
@@ -400,6 +400,44 @@ task SubsetPedFile {
 }
 
 
+task ValidatePedFile {
+  input {
+    File ped_file
+    File sample_list
+    String sv_pipeline_docker
+    RuntimeAttr? runtime_attr_override
+  }
+
+  RuntimeAttr default_attr = object {
+    cpu_cores: 1,
+    mem_gb: 3.75,
+    disk_gb: 10,
+    boot_disk_gb: 10,
+    preemptible_tries: 3,
+    max_retries: 1
+  }
+  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+  command <<<
+
+    set -euo pipefail
+    python opt/sv-pipeline/scripts/validate_ped.py -p ~{ped_file} -s {sample_list}
+    # no outputs - task will either succeed or fail with details in log file
+
+  >>>
+
+  runtime {
+    cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+    memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+    disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+    bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+    docker: sv_pipeline_docker
+    preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+    maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+  }
+}
+
+
 task LocalizeCloudFileWithCredentials {
   input {
     String cloud_file_path

diff --git a/website/docs/gs/input_files.md b/website/docs/gs/input_files.md
@@ -11,8 +11,15 @@ GATK-SV requires the following input data:
   BAMs must also be indexed.
 
 - Family structure definitions file in 
-  [PED format](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format).  
-  **Note**: Sex aneuploidies (detected in EvidenceQC) should be entered as sex = 0.
+  PED format (see description below).
+
+### PED file format
+The PED file format is described [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format). Note that GATK-SV imposes additional requirements:
+* The file must be tab-delimited.
+* The sex column must only contain integer values: 1=Male, 2=Female, 0=Other/Unknown. Sex chromosome aneuploidies (detected in [EvidenceQC](/docs/modules/eqc)) should be entered as sex = 0.
+* All family, individual, and parental IDs must conform to the sample ID requirements.
+* Missing parental IDs should be entered as 0.
+To validate the PED file, you may use `src/sv-pipeline/scripts/validate_ped.py -p pedigree.ped -s samples.list`.
 
 ### Sample Exclusion
 We recommend filtering out samples with a high percentage