diff --git a/pipeline_versions.txt b/pipeline_versions.txt
index c80f1a60b..4d1205506 100644
--- a/pipeline_versions.txt
+++ b/pipeline_versions.txt
@@ -1,42 +1,42 @@
 Pipeline Name	Version	Date of Last Commit
-Optimus	7.6.0	2024-08-06
-Multiome	5.5.0	2024-08-06
-PairedTag	1.5.0	2024-08-06
-atac	2.2.3	2024-08-02
-SlideSeq	3.4.0	2024-08-06
-snm3C	4.0.3	2024-08-05
 MultiSampleSmartSeq2SingleNucleus	1.4.2	2024-08-02
-scATAC	1.3.2	2023-08-03
+MultiSampleSmartSeq2	2.2.21	2023-04-19
+PairedTag	1.6.0	2024-08-02
+Optimus	7.6.0	2024-08-06
+atac	2.3.0	2024-08-29
+snm3C	4.0.4	2024-08-06
 SmartSeq2SingleSample	5.1.20	2023-04-19
+Multiome	5.6.0	2024-08-02
+scATAC	1.3.2	2023-08-03
 BuildIndices	3.0.0	2023-12-06
-MultiSampleSmartSeq2	2.2.21	2023-04-19
-CEMBA	1.1.6	2023-12-18
+SlideSeq	3.4.0	2024-08-06
 BuildCembaReferences	1.0.0	2020-11-15
-UltimaGenomicsWholeGenomeCramOnly	1.0.20	2024-08-02
+CEMBA	1.1.6	2023-12-18
 GDCWholeGenomeSomaticSingleSample	1.3.2	2024-08-02
-ExomeGermlineSingleSample	3.1.22	2024-06-12
-UltimaGenomicsWholeGenomeGermline	1.0.20	2024-08-02
-WholeGenomeGermlineSingleSample	3.2.1	2024-06-12
-VariantCalling	2.2.1	2024-06-12
+UltimaGenomicsWholeGenomeCramOnly	1.0.20	2024-08-02
+JointGenotypingByChromosomePartOne	1.4.12	2023-12-18
+JointGenotypingByChromosomePartTwo	1.4.11	2023-12-18
 UltimaGenomicsJointGenotyping	1.1.7	2023-12-18
 JointGenotyping	1.6.10	2023-12-18
 ReblockGVCF	2.2.1	2024-06-12
-JointGenotypingByChromosomePartTwo	1.4.11	2023-12-18
-JointGenotypingByChromosomePartOne	1.4.12	2023-12-18
-ExternalExomeReprocessing	3.2.2	2024-08-02
-ExternalWholeGenomeReprocessing	2.2.2	2024-08-02
-ExomeReprocessing	3.2.2	2024-08-02
-CramToUnmappedBams	1.1.3	2024-08-02
-WholeGenomeReprocessing	3.2.2	2024-08-02
-IlluminaGenotypingArray	1.12.21	2024-08-02
-Arrays	2.6.27	2024-08-02
-MultiSampleArrays	1.6.2	2024-08-02
+VariantCalling	2.2.1	2024-06-12
+WholeGenomeGermlineSingleSample	3.2.1	2024-06-12
+UltimaGenomicsWholeGenomeGermline	1.0.20	2024-08-02
+ExomeGermlineSingleSample	3.1.22	2024-06-12
 ValidateChip	1.16.5	2024-08-02
+Arrays	2.6.27	2024-08-02
 Imputation	1.1.13	2024-05-21
-RNAWithUMIsPipeline	1.0.16	2023-12-18
+MultiSampleArrays	1.6.2	2024-08-02
 BroadInternalUltimaGenomics	1.0.21	2024-08-02
 BroadInternalArrays	1.1.11	2024-08-02
 BroadInternalImputation	1.1.12	2024-08-02
 BroadInternalRNAWithUMIs	1.0.33	2024-08-02
+CramToUnmappedBams	1.1.3	2024-08-02
+ExternalWholeGenomeReprocessing	2.2.2	2024-08-02
+ExternalExomeReprocessing	3.2.2	2024-08-02
+WholeGenomeReprocessing	3.2.2	2024-08-02
+ExomeReprocessing	3.2.2	2024-08-02
+IlluminaGenotypingArray	1.12.21	2024-08-02
 CheckFingerprint	1.0.20	2024-08-02
 AnnotationFiltration	1.2.5	2023-12-18
+RNAWithUMIsPipeline	1.0.16	2023-12-18
diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md
index ffe875fa0..544fb8ea5 100644
--- a/pipelines/skylab/atac/atac.changelog.md
+++ b/pipelines/skylab/atac/atac.changelog.md
@@ -1,3 +1,10 @@
+# 2.3.0
+2024-08-29 (Date of Last Commit)
+
+* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.
+
+* Updated the memory for the CreateFragmentFile task
+
 # 2.2.3
 2024-08-02 (Date of Last Commit)
diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
index 45f6a7175..b207e393f 100644
--- a/pipelines/skylab/atac/atac.wdl
+++ b/pipelines/skylab/atac/atac.wdl
@@ -46,7 +46,7 @@ workflow ATAC {
     String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
   }

-  String pipeline_version = "2.2.3"
+  String pipeline_version = "2.3.0"

   # Determine docker prefix based on cloud provider
   String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
@@ -58,7 +58,7 @@ workflow ATAC {
   String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919"
   String samtools_docker = "samtools-dist-bwa:3.0.0"
   String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311"
-  String snap_atac_docker = "snapatac2:1.0.9-2.6.3-1715865353"
+  String snap_atac_docker = "snapatac2:1.1.0"

   # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error
   if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
@@ -158,11 +158,13 @@ workflow ATAC {
   File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output])
   File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file])
   File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics])
+  File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics])

   output {
     File bam_aligned_output = bam_aligned_output_atac
     File fragment_file = fragment_file_atac
     File snap_metrics = snap_metrics_atac
+    File library_metrics_file = library_metrics
   }
 }

@@ -505,7 +507,7 @@ task CreateFragmentFile {
     File annotations_gtf
     Boolean preindex
     Int disk_size = 500
-    Int mem_size = 16
+    Int mem_size = 64
     Int nthreads = 4
     String cpuPlatform = "Intel Cascade Lake"
     String docker_path
@@ -547,17 +549,34 @@ task CreateFragmentFile {
     import snapatac2.preprocessing as pp
     import snapatac2 as snap
     import anndata as ad
+    from collections import OrderedDict
+    import csv

     # extract CB or BB (if preindex is true) tag from bam file to create fragment file
     if preindex == "true":
-      pp.make_fragment_file("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="BB")
+      data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
     elif preindex == "false":
-      pp.make_fragment_file("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="CB")
-
+      data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
+
+    # Add NHashID to metrics
+    data = OrderedDict({'NHash_ID': atac_nhash_id, **data})
+    # Flatten the dictionary
+    flattened_data = []
+    for category, metrics in data.items():
+      if isinstance(metrics, dict):
+        for metric, value in metrics.items():
+          flattened_data.append((metric, value))
+      else:
+        flattened_data.append((category, metrics))
+
+    # Write to CSV
+    csv_file_path = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
+    with open(csv_file_path, mode='w', newline='') as file:
+      writer = csv.writer(file)
+      writer.writerows(flattened_data) # Write data
+
+    print(f"Dictionary successfully written to {csv_file_path}")

-    # calculate quality metrics; note min_num_fragments and min_tsse are set to 0 instead of default
-    # those settings allow us to retain all barcodes
-    pp.import_data("~{bam_base_name}.fragments.tsv", file="temp_metrics.h5ad", chrom_sizes=chrom_size_dict, min_num_fragments=0)
     atac_data = ad.read_h5ad("temp_metrics.h5ad")
     # Add nhash_id to h5ad file as unstructured metadata
     atac_data.uns['NHashID'] = atac_nhash_id
@@ -580,5 +599,6 @@ task CreateFragmentFile {
   output {
     File fragment_file = "~{bam_base_name}.fragments.tsv"
    File Snap_metrics = "~{bam_base_name}.metrics.h5ad"
+    File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
   }
 }
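For reference, the flattening step added to `CreateFragmentFile` is easier to read outside the WDL heredoc. A minimal standalone sketch, assuming `recipe_10x_metrics` returns a dictionary whose top-level values are either scalars or one level of nested metric dictionaries (the category and metric names below are illustrative placeholders):

```python
import csv
from collections import OrderedDict

# Assumed shape of the SnapATAC2 metrics dictionary: top-level entries are
# either scalars or dicts of metric -> value (names here are placeholders).
data = OrderedDict({
    "NHash_ID": "example_nhash_id",
    "Sequenced_reads": 1_000_000,
    "Cells": {"Number_of_cells": 5000, "Mean_raw_read_pairs_per_cell": 200.0},
})

# Flatten one level of nesting into (metric, value) rows.
flattened_data = []
for category, metrics in data.items():
    if isinstance(metrics, dict):
        flattened_data.extend(metrics.items())
    else:
        flattened_data.append((category, metrics))

# Write two-column rows, matching the CSV layout the task produces.
with open("example.atac_metrics.csv", mode="w", newline="") as handle:
    csv.writer(handle).writerows(flattened_data)
```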
diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md
index afc52d57f..98904837e 100644
--- a/pipelines/skylab/multiome/Multiome.changelog.md
+++ b/pipelines/skylab/multiome/Multiome.changelog.md
@@ -1,3 +1,8 @@
+# 5.6.0
+2024-08-02 (Date of Last Commit)
+
+* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.
+
 # 5.5.0
 2024-08-06 (Date of Last Commit)
diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl
index 21584c01d..d647e8294 100644
--- a/pipelines/skylab/multiome/Multiome.wdl
+++ b/pipelines/skylab/multiome/Multiome.wdl
@@ -9,7 +9,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

 workflow Multiome {

-  String pipeline_version = "5.5.0"
+  String pipeline_version = "5.6.0"

   input {
@@ -179,6 +179,7 @@ workflow Multiome {
     File fragment_file_atac = JoinBarcodes.atac_fragment_tsv
     File fragment_file_index = JoinBarcodes.atac_fragment_tsv_tbi
     File snap_metrics_atac = JoinBarcodes.atac_h5ad_file
+    File atac_library_metrics = Atac.library_metrics_file

     # optimus outputs
     File genomic_reference_version_gex = Optimus.genomic_reference_version
diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md
index e9da183ec..ba4a05376 100644
--- a/pipelines/skylab/paired_tag/PairedTag.changelog.md
+++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md
@@ -1,3 +1,8 @@
+# 1.6.0
+2024-08-02 (Date of Last Commit)
+
+* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.
+
 # 1.5.0
 2024-08-06 (Date of Last Commit)
diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl
index e35a153de..4206f4fab 100644
--- a/pipelines/skylab/paired_tag/PairedTag.wdl
+++ b/pipelines/skylab/paired_tag/PairedTag.wdl
@@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

 workflow PairedTag {

-  String pipeline_version = "1.5.0"
+  String pipeline_version = "1.6.0"

   input {
@@ -149,6 +149,7 @@ workflow PairedTag {

   File atac_fragment_out = select_first([ParseBarcodes.atac_fragment_tsv,Atac_preindex.fragment_file])
   File atac_h5ad_out = select_first([ParseBarcodes.atac_h5ad_file, Atac_preindex.snap_metrics])
+
   output {
     String pairedtag_pipeline_version_out = pipeline_version
@@ -157,6 +158,7 @@ workflow PairedTag {
     File bam_aligned_output_atac = Atac_preindex.bam_aligned_output
     File fragment_file_atac = atac_fragment_out
     File snap_metrics_atac = atac_h5ad_out
+    File atac_library_final = Atac_preindex.library_metrics_file

     # optimus outputs
     File genomic_reference_version_gex = Optimus.genomic_reference_version
diff --git a/pipelines/skylab/paired_tag/README.md b/pipelines/skylab/paired_tag/README.md
index b00f015d6..97a801a49 100644
--- a/pipelines/skylab/paired_tag/README.md
+++ b/pipelines/skylab/paired_tag/README.md
@@ -1,6 +1,6 @@
 ## Announcing a new site for WARP documentation!

-Paired-tag documentation has moved! Read more about the [Paired-Tag workflow](https://broadinstitute.github.io/warp/docs/Pipelines/PairedTag_Pipeline/README) on the new [WARP documentation site](https://broadinstitute.github.io/warp/)!
+Paired-tag documentation has moved! Read more about the [Paired-Tag workflow](https://broadinstitute.github.io/warp/docs/Pipelines/PairedTag_Pipeline/README) on the new [WARP documentation site](https://broadinstitute.github.io/warp/)!

 ### Paired-Tag summary
diff --git a/pipelines/skylab/snm3C/snm3C.changelog.md b/pipelines/skylab/snm3C/snm3C.changelog.md
index 7690c4ca2..91b7cab98 100644
--- a/pipelines/skylab/snm3C/snm3C.changelog.md
+++ b/pipelines/skylab/snm3C/snm3C.changelog.md
@@ -1,7 +1,12 @@
+# 4.0.4
+2024-08-06 (Date of Last Commit)
+
+* Updated the Demultiplexing task in the snm3C wdl to flag when a file/cell is empty
+
 # 4.0.3
-2024-08-05 (Date of Last Commit)
+2024-08-06 (Date of Last Commit)

-* Updated the demultiplexing task in snm3C wdl to dynamically update the batch number based on the number of fastq files present
+* Updated the Demultiplexing task in snm3C wdl to dynamically update the batch number based on the number of fastq files present

 # 4.0.2
 2024-07-09 (Date of Last Commit)
diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl
index 953d9912f..b7387032f 100644
--- a/pipelines/skylab/snm3C/snm3C.wdl
+++ b/pipelines/skylab/snm3C/snm3C.wdl
@@ -44,7 +44,7 @@ workflow snm3C {
     }

     # version of the pipeline
-    String pipeline_version = "4.0.3"
+    String pipeline_version = "4.0.4"

     call Demultiplexing {
       input:
@@ -154,6 +154,8 @@ task Demultiplexing {
     File random_primer_indexes
     String plate_id
     Int batch_number
+    Int min_threshold = 100
+    Int max_threshold = 10000000
     String docker
     Int disk_size = 1000
@@ -179,7 +181,7 @@ task Demultiplexing {
       $WORKING_DIR/r2.fastq.gz \
      > $WORKING_DIR/~{plate_id}.stats.txt

-    # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz
+    # Remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz
     rm $WORKING_DIR/*-unknown-R{1,2}.fq.gz

     python3 <<CODE
-      if adapter_counts[adapter_name] < threshold:
-        os.remove(file_path)
+      if adapter_name in adapter_counts:
+        if adapter_counts[adapter_name] < ~{min_threshold} or adapter_counts[adapter_name] > ~{max_threshold}:
+          print("Removing ", file_path, " with count equal to ", adapter_counts[adapter_name])
+          os.remove(file_path)
     CODE
+
+    # Check if the number of *R1.fq.gz files is 0
+    if [[ $(ls | grep "\-R1.fq.gz" | wc -l) -eq 0 ]]; then
+      echo "Error: No files found. All fastq files were removed. Exiting."
+      exit 1
+    fi

     # Batch the fastq files into folders of batch_number size
     R1_files=($(ls $WORKING_DIR | grep "\-R1.fq.gz"))
     R2_files=($(ls $WORKING_DIR | grep "\-R2.fq.gz"))
+    batch_number=~{batch_number}
     total_files=${#R1_files[@]}
     echo "Total files: $total_files"

-    batch_number=~{batch_number}
-
     if [[ $total_files -lt $batch_number ]]; then
       echo "Warning: Number of files is less than the batch number. Updating batch number to $total_files."
       batch_number=$total_files
@@ -229,14 +235,14 @@ task Demultiplexing {
       mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs
     done

-    # Counter for the folder index
+    # Counter for the folder index and create emptycells file
     folder_index=1
-    WORKING_DIR=`pwd`

     # Distribute the FASTQ files and create TAR files
     for file in "${R1_files[@]}"; do
       sample_id=$(basename "$file" "-R1.fq.gz")
       r2_file="${sample_id}-R2.fq.gz"
+
       mv $WORKING_DIR/$file batch$((folder_index))/$file
       mv $WORKING_DIR/$r2_file batch$((folder_index))/$r2_file

       # Increment the counter
@@ -249,7 +255,7 @@ task Demultiplexing {
       tar -cf - $WORKING_DIR/batch${i}/*.fq.gz | pigz > ~{plate_id}.${i}.cutadapt_output_files.tar.gz
     done
   >>>
-
+
   runtime {
     docker: docker
     disks: "local-disk ${disk_size} SSD"
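The effect of the new `min_threshold`/`max_threshold` inputs is easier to see outside the bash heredoc. A sketch of the filtering rule, assuming `adapter_counts` maps adapter names to trimmed-read counts parsed from the cutadapt stats file (sample values below are made up):

```python
# Assumed inputs: counts parsed from the cutadapt stats file; thresholds
# mirror the WDL defaults (min_threshold = 100, max_threshold = 10000000).
adapter_counts = {"A1": 42, "A2": 250_000}  # placeholder counts
min_threshold, max_threshold = 100, 10_000_000

def is_flagged(adapter_name: str) -> bool:
    """A cell's fastq pair is flagged for removal when its adapter count is
    below min_threshold (likely an empty cell/file) or above max_threshold."""
    count = adapter_counts.get(adapter_name)
    if count is None:
        return False  # adapter absent from the stats file; keep the files
    return count < min_threshold or count > max_threshold

for name, count in adapter_counts.items():
    if is_flagged(name):
        print("Removing files for", name, "with count equal to", count)
        # the task calls os.remove(...) on the matching fastq files here
```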
diff --git a/scripts/firecloud_api/GetWorkflowOutputs.py b/scripts/firecloud_api/GetWorkflowOutputs.py
new file mode 100644
index 000000000..8970a888d
--- /dev/null
+++ b/scripts/firecloud_api/GetWorkflowOutputs.py
@@ -0,0 +1,53 @@
+import requests
+import argparse
+
+def get_workflow_outputs(token, namespace, workspace_name, submission_id, workflow_id, pipeline_name):
+    # API endpoint to get the workflow outputs
+    url = f"https://api.firecloud.org/api/workspaces/{namespace}/{workspace_name}/submissions/{submission_id}/workflows/{workflow_id}/outputs"
+    print(f"Requesting URL: {url}")
+
+    # Headers including the authorization token
+    headers = {
+        'accept': '*/*',
+        'Authorization': f'Bearer {token}',
+    }
+
+    # Make the GET request
+    response = requests.get(url, headers=headers)
+
+    # Check if the request was successful
+    if response.status_code == 200:
+        json_response = response.json()  # parse the JSON response
+        # extract the outputs section using the task name
+        outputs = json_response.get('tasks', {}).get(pipeline_name, {}).get('outputs', {})
+
+        # Turn the outputs dictionary into a list of values
+        output_values = list(outputs.values())
+
+        return outputs, output_values
+    else:
+        print(f"Failed to retrieve workflow outputs. Status code: {response.status_code}")
+        return None, None
+
+if __name__ == "__main__":
+    # Define the command-line arguments
+    parser = argparse.ArgumentParser(description='Fetch workflow outputs from the API.')
+    parser.add_argument('--token', required=True, help='Authentication token')
+    parser.add_argument('--namespace', required=True, help='Workspace namespace')
+    parser.add_argument('--workspace', required=True, help='Workspace name')
+    parser.add_argument('--submission_id', required=True, help='Submission ID')
+    parser.add_argument('--workflow_id', required=True, help='Workflow ID')
+    parser.add_argument('--pipeline_name', required=True, help='Name of the pipeline')
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Call the function with the parsed arguments
+    outputs, output_values = get_workflow_outputs(args.token, args.namespace, args.workspace, args.submission_id, args.workflow_id, args.pipeline_name)
+
+    if outputs:
+        print("Outputs:")
+        print(outputs)
+
+        print("\nOutput Values:")
+        print(output_values)
\ No newline at end of file
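A hypothetical invocation of `get_workflow_outputs` from Python; every value below is a placeholder, and in practice the bearer token is typically obtained with `gcloud auth print-access-token`:

```python
from GetWorkflowOutputs import get_workflow_outputs

# Placeholder identifiers; substitute real workspace and submission values.
outputs, output_values = get_workflow_outputs(
    token="ya29.placeholder-token",
    namespace="example-namespace",
    workspace_name="example-workspace",
    submission_id="00000000-0000-0000-0000-000000000000",
    workflow_id="11111111-1111-1111-1111-111111111111",
    pipeline_name="Multiome",
)
if outputs:
    for name, value in outputs.items():
        print(name, "->", value)
```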
diff --git a/scripts/firecloud_api/SubmitWorkflowAndGetStatus.py b/scripts/firecloud_api/SubmitWorkflowAndGetStatus.py
new file mode 100644
index 000000000..3276bf415
--- /dev/null
+++ b/scripts/firecloud_api/SubmitWorkflowAndGetStatus.py
@@ -0,0 +1,90 @@
+import requests
+import time
+import argparse
+import json
+
+def create_submission(token, workspace_namespace, workspace_name, submission_data):
+    # API endpoint
+    base_url = f'https://api.firecloud.org/api/workspaces/{workspace_namespace}/{workspace_name}/submissions'
+
+    # Headers to make API requests
+    headers = {
+        'accept': 'application/json',
+        'Authorization': f'Bearer {token}',
+        'Content-Type': 'application/json'
+    }
+
+    # Create the submission
+    # send an HTTP POST request to the API endpoint
+    response = requests.post(base_url, headers=headers, json=submission_data)
+    # convert the response json into a dictionary
+    submission_response = response.json()
+    # extract the submission ID from the response dictionary
+    submission_id = submission_response.get("submissionId")
+
+    if not submission_id:
+        print("Failed to create submission.")
+    else:
+        print(f"Submission created with ID: {submission_id}")
+    return submission_id
+
+def poll_submission_status(token, workspace_namespace, workspace_name, submission_id):
+
+    # Status endpoint
+    status_url = f'https://api.firecloud.org/api/workspaces/{workspace_namespace}/{workspace_name}/submissions/{submission_id}'
+
+    # Headers to make API requests
+    headers = {
+        'accept': 'application/json',
+        'Authorization': f'Bearer {token}'
+    }
+
+    # polling the submission status
+    # create an empty list to store the previous workflow status
+    previous_workflow_status = []
+
+    # loop until the submission is done
+    while True:
+        # send a GET request and convert the response json into a dictionary
+        status_response = requests.get(status_url, headers=headers)
+        status_data = status_response.json()
+
+        # get the submission status
+        submission_status = status_data.get("status")
+        # get the workflow status of each workflow in the submission
+        workflows_status = [workflow.get("status") for workflow in status_data.get("workflows", [])]
+
+        # print the workflow status to stdout if it has changed
+        if workflows_status != previous_workflow_status:
+            print(f"Workflows Status: {workflows_status}")
+            previous_workflow_status = workflows_status
+
+        # Check if the submission has completed
+        if submission_status == "Done" and "Failed" in workflows_status:
+            print("At least one workflow has failed.")
+            break
+        elif submission_status == "Done":
+            break
+
+        # Wait for 10 seconds before polling again
+        time.sleep(10)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Submit and monitor a job.')
+    parser.add_argument('--token', required=True, help='API access token')
+    parser.add_argument('--workspace-namespace', required=True, help='Workspace namespace')
+    parser.add_argument('--workspace-name', required=True, help='Workspace name')
+    parser.add_argument('--submission-data-file', required=True, help='Path to the JSON file containing submission data')
+
+    args = parser.parse_args()
+
+    # load submission data from JSON file
+    with open(args.submission_data_file, 'r') as file:
+        submission_data = json.load(file)
+
+    # create submission and get submission ID
+    submission_id = create_submission(args.token, args.workspace_namespace, args.workspace_name, submission_data)
+
+    if submission_id:
+        # Poll submission status
+        poll_submission_status(args.token, args.workspace_namespace, args.workspace_name, submission_id)
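The script reads its `--submission-data-file` as a JSON payload for the FireCloud submissions endpoint. A sketch of generating one, with field names based on the public FireCloud createSubmission schema (treat them as an assumption to verify) and placeholder values:

```python
import json

# Field names assumed from the public FireCloud/Terra submission schema;
# all values are placeholders.
submission_data = {
    "methodConfigurationNamespace": "example-namespace",
    "methodConfigurationName": "Multiome",
    "entityType": "sample",
    "entityName": "example_sample",
    "useCallCache": True,
    "deleteIntermediateOutputFiles": False,
}

with open("submission_data.json", "w") as handle:
    json.dump(submission_data, handle, indent=2)
```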
diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md
index 1bb2dd639..fc3a985ab 100644
--- a/website/docs/Pipelines/ATAC/README.md
+++ b/website/docs/Pipelines/ATAC/README.md
@@ -93,6 +93,7 @@ To see specific tool parameters, select the task WDL link in the table; then vie
 | bam_aligned_output | `<input_id>.bam` | BAM containing aligned reads from ATAC workflow. |
 | fragment_file | `<input_id>.fragments.tsv` | TSV containing fragment start and stop coordinates per barcode. In order, the columns are "Chromosome", "Start", "Stop", "ATAC Barcode", and "Number Reads". |
 | snap_metrics | `<input_id>.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. |
+| library_metrics_file | `<input_id>_<atac_nhash_id>.atac_metrics.csv` | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md). |

 ## Versioning and testing
diff --git a/website/docs/Pipelines/ATAC/library-metrics.md b/website/docs/Pipelines/ATAC/library-metrics.md
new file mode 100644
index 000000000..184cfeb8e
--- /dev/null
+++ b/website/docs/Pipelines/ATAC/library-metrics.md
@@ -0,0 +1,35 @@
+---
+sidebar_position: 2
+---
+
+# ATAC Library Metrics Overview
+
+The [ATAC pipeline](README.md) uses [SnapATAC2](https://github.com/kaizhang/SnapATAC2) to generate library-level metrics in CSV format.
+
+
+| Metric | Description |
+| --- | --- |
+| NHash_ID | A unique identifier used to track and reference the specific sample or dataset. |
+| Sequenced_reads | The total number of reads generated from the sequencing process, which includes both reads that are mapped and unmapped. |
+| Sequenced_read_pairs | The total number of read pairs (two reads per pair) generated from the sequencing process. This is typically half of the total sequenced reads if all reads are paired. |
+| Fraction_valid_barcode | The fraction of reads that contain a valid barcode, indicating the proportion of reads that are correctly assigned to a specific cell or sample. |
+| Fraction_Q30_bases_in_read_1 | The proportion of bases in Read 1 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. |
+| Fraction_Q30_bases_in_read_2 | The proportion of bases in Read 2 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. |
+| Number_of_cells | The estimated number of cells captured and sequenced in the experiment, based on the barcodes identified. |
+| Mean_raw_read_pairs_per_cell | The average number of raw read pairs associated with each cell, providing an indication of the sequencing depth per cell. |
+| Median_high-quality_fragments_per_cell | The median number of high-quality (e.g., confidently mapped) fragments associated with each cell, representing typical fragment quality across cells. |
+| Fraction_of_high-quality_fragments_in_cells | The fraction of high-quality fragments that are associated with identified cells, indicating the proportion of good-quality data that is cell-associated. |
+| Fraction_of_transposition_events_in_peaks_in_cells | The fraction of transposition events within identified cells that occur within peaks, which are regions of accessible chromatin. |
+| Fraction_duplicates | The fraction of sequenced fragments that are duplicates, which can result from PCR amplification or other factors, indicating the redundancy in the sequencing data. |
+| Fraction_confidently_mapped | The fraction of sequenced fragments that are confidently mapped to the reference genome, indicating the proportion of reads that align well to the genome. |
+| Fraction_unmapped | The fraction of sequenced fragments that could not be mapped to the reference genome, which can indicate sequencing errors, contamination, or regions not covered by the reference. |
+| Fraction_nonnuclear | The fraction of sequenced fragments that are mapped to non-nuclear (e.g., mitochondrial or other organellar) DNA, providing insight into contamination or organellar activity. |
+| Fraction_fragment_in_nucleosome_free_region | The fraction of sequenced fragments that map to nucleosome-free regions, which are indicative of accessible chromatin. |
+| Fraction_fragment_flanking_single_nucleosome | The fraction of sequenced fragments that map to regions flanking single nucleosomes, indicating regions with partial chromatin accessibility. |
+| TSS_enrichment_score | A measure of the enrichment of transposition events at transcription start sites (TSS), indicating the accessibility of promoters across the genome. |
+| Fraction_of_high-quality_fragments_overlapping_TSS | The fraction of high-quality fragments that overlap transcription start sites (TSS), providing insight into promoter accessibility. |
+| Number_of_peaks | The total number of peaks, or regions of accessible chromatin, identified in the dataset, representing potential regulatory elements. |
+| Fraction_of_genome_in_peaks | The fraction of the genome that is covered by identified peaks, indicating the extent of chromatin accessibility across the genome. |
+| Fraction_of_high-quality_fragments_overlapping_peaks | The fraction of high-quality fragments that overlap with identified peaks, providing an indication of the efficiency of the assay in capturing accessible regions. |
+
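Because the metrics file is written as plain `metric,value` rows (see the atac.wdl change above), it can be loaded back into a dictionary for downstream QC. A minimal sketch; the filename and the 5.0 cutoff are illustrative only:

```python
import csv

# The pipeline names the file <bam_base_name>_<atac_nhash_id>.atac_metrics.csv;
# this path is a placeholder.
metrics = {}
with open("example.atac_metrics.csv", newline="") as handle:
    for metric, value in csv.reader(handle):
        metrics[metric] = value

# Example QC gate on one documented metric (threshold chosen arbitrarily).
if float(metrics.get("TSS_enrichment_score", 0)) < 5.0:
    print("Warning: low TSS enrichment for", metrics.get("NHash_ID", "unknown"))
```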
diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md
index d77c5ec3b..afb277766 100644
--- a/website/docs/Pipelines/Multiome_Pipeline/README.md
+++ b/website/docs/Pipelines/Multiome_Pipeline/README.md
@@ -107,6 +107,7 @@ The Multiome workflow calls two WARP subworkflows, one external subworkflow (opt
 | fragment_file_atac | `<input_id>_atac.fragments.sorted.tsv.gz` | Sorted and bgzipped TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "ATAC Barcode", "Number of reads", and "GEX Barcode". |
 | fragment_file_index | `<input_id>_atac.fragments.sorted.tsv.gz.tbi` | tabix index file for the fragment file. |
 | snap_metrics_atac | `<input_id>_atac.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. Also contains the equivalent gene expression barcode for each ATAC barcode in the `gex_barcodes` column of the `h5ad.obs` property. See the [ATAC Count Matrix Overview](../ATAC/count-matrix-overview.md) for more details. |
+| atac_library_metrics | `<input_id>_<atac_nhash_id>.atac_metrics.csv` | CSV with library-level metrics produced by SnapATAC2. See the ATAC [Library Level Metrics Overview](../ATAC/library-metrics.md) for more details. |
 | genomic_reference_version_gex | `<reference_version>.txt` | File containing the Genome build, source and GTF annotation version. |
 | bam_gex | `<input_id>_gex.bam` | BAM file containing aligned reads from Optimus workflow. |
 | matrix_gex | `<input_id>_gex_sparse_counts.npz` | NPZ file containing raw gene by cell counts. |
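A short sketch of inspecting the per-barcode metrics h5ad described above, using the same `anndata` package the pipeline itself uses; the filename is a placeholder:

```python
import anndata as ad

# Placeholder path; Multiome emits <input_id>_atac.metrics.h5ad.
atac_data = ad.read_h5ad("example_atac.metrics.h5ad")

# The ATAC task stores the NHash ID as unstructured metadata.
print("NHashID:", atac_data.uns.get("NHashID"))

# JoinBarcodes adds the matched gene expression barcode per ATAC barcode.
print(atac_data.obs["gex_barcodes"].head())
```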