Merge pull request #411 from d3b-center/probe-annot

Create methyl array probes annotation module
d3b-center · Oct 6, 2023 · 6039915 · 6039915
2 parents 0e6aad3 + 69f9403
commit 6039915
Show file tree

Hide file tree

Showing 11 changed files with 863,659 additions and 3 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -88,6 +88,13 @@ RUN wget https://github.com/samtools/htslib/releases/download/1.9/htslib-1.9.tar
     make install && \
     cd .. && rm -rf htslib-1.9
 
+# GenomeTools
+RUN wget http://genometools.org/pub/genometools-1.6.2.tar.gz  && \
+    tar -zxvf genometools-1.6.2.tar.gz  && rm -f genometools-1.6.2.tar.gz && \
+    cd genometools-1.6.2 && \
+    make cairo=no && \
+    make prefix=/usr/local cairo=no install && \
+    cd .. && rm -rf genometools-1.6.2
 
 #### R packages
 ###############

diff --git a/analyses/methylation-probe-annotations/01-create-bed-files.R b/analyses/methylation-probe-annotations/01-create-bed-files.R
@@ -0,0 +1,160 @@
+# Create GENCODE gene features and Illumina infinium methylation array CpG 
+# probe coordinates bed files
+
+# Eric Wafula for Pediatric OpenTargets
+# 06/26/2023
+
+# Load libraries
+suppressPackageStartupMessages(library(optparse))
+suppressPackageStartupMessages(library(rtracklayer))
+suppressPackageStartupMessages(library(tidyverse))
+
+# set up optparse options
+option_list <- list(
+  make_option(opt_str = "--gencode_gtf", type = "character", default = NULL,
+              help = "GENCODE GTF",
+              metavar = "character"),
+  make_option(opt_str = "--gencode_gff", type = "character", default = NULL,
+              help = "GENCODE GFF with intron coordinates",
+              metavar = "character"),
+  make_option(opt_str = "--cpg_map", type = "character", default = NULL,
+              help = "Illumina infinium methylation array CpG probe coordinates",
+              metavar = "character")
+)
+
+# parse parameter options
+opt <- parse_args(OptionParser(option_list = option_list))
+gencode_gtf <- opt$gencode_gtf
+gencode_gff <- opt$gencode_gff
+cpg_map <- opt$cpg_map
+
+# establish base dir
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+
+# Set module and inputs directory
+module_dir <- file.path(root_dir, "analyses", "methylation-probe-annotations")
+inputs_dir <- file.path(module_dir, "inputs")
+
+# create gene to transcript from gencode gtf
+gene_transcript_map <- rtracklayer::import(con = gencode_gtf) |> 
+  tibble::as_tibble() |> 
+  dplyr::select(gene_id, transcript_id, gene_name) |>
+  tidyr::drop_na() |>
+  dplyr::mutate(transcript_id = stringr::str_extract(transcript_id, "ENST\\d+"), 
+                gene_id = stringr::str_extract(gene_id, "ENSG\\d+")) |> 
+  dplyr::distinct()
+
+# load gencode gff with intron coordinates and covert to bed format or selected
+# nuclear chromosome gene features - exon, intron, five_prime_UTR, and 
+# three_prime_UTR
+gencode_bed <- rtracklayer::readGFF(gencode_gff) |>
+  tibble::as_tibble() |>
+  dplyr::filter(type %in% c("exon", "intron",
+                            "five_prime_UTR", "three_prime_UTR"),
+                !grepl("^chrM|^GL|^KI", seqid)) |>
+  dplyr::select(seqid, start, end, type, Parent, strand) |>
+  dplyr::rename(transcript = Parent) |>
+  dplyr::mutate(seqid = as.character(seqid),
+                type = as.character(type),
+                transcript = as.character(transcript),
+                transcript = stringr::str_extract(transcript, "\\w+")) |>
+  dplyr::distinct()
+
+# filter gene features bed to retain only the most terminal 3'-UTRs and 5'-UTRs 
+# where multiple UTRs have been predicted for a transcript
+exons_introns <- gencode_bed |> 
+  dplyr::filter(!type %in% c("five_prime_UTR", "three_prime_UTR"))
+plus_five_utrs <-gencode_bed |> 
+  dplyr::filter(type == "five_prime_UTR", strand == "+") |> 
+  dplyr::group_by(seqid, type, transcript, strand) |> 
+  dplyr::summarize(start = min(start), end = min(end)) |> 
+  dplyr::arrange(seqid, start, end) |> 
+  ungroup()
+minus_five_utrs <-gencode_bed |> 
+  dplyr::filter(type == "five_prime_UTR", strand == "-") |> 
+  dplyr::group_by(seqid, type, transcript, strand) |> 
+  dplyr::summarize(start = min(start), end = min(end)) |> 
+  dplyr::arrange(seqid, start, end) |> 
+  ungroup()
+plus_three_utrs <-gencode_bed |> 
+  dplyr::filter(type == "three_prime_UTR", strand == "+") |> 
+  dplyr::group_by(seqid, type, transcript, strand) |> 
+  dplyr::summarize(start = min(start), end = min(end)) |> 
+  dplyr::arrange(seqid, start, end) |> 
+  ungroup()
+minus_three_utrs <-gencode_bed |> 
+  dplyr::filter(type == "three_prime_UTR", strand == "-") |> 
+  dplyr::group_by(seqid, type, transcript, strand) |> 
+  dplyr::summarize(start = min(start), end = min(end)) |> 
+  dplyr::arrange(seqid, start, end) |> 
+  ungroup()
+gencode_bed <- dplyr::bind_rows(exons_introns, plus_five_utrs, minus_five_utrs,
+                                plus_three_utrs, minus_three_utrs) |> 
+  dplyr::select(seqid, start, end, type, strand, transcript) |> 
+  dplyr::arrange(seqid, start, end, transcript) |> 
+  dplyr::distinct() 
+
+# # extend 3'-UTR by 1kb
+# gencode_bed <- gencode_bed |>
+#   dplyr::mutate(start = case_when(type == "three_prime_UTR" &
+#                                     strand == "-" ~ as.integer(start - 1000),
+#                                   TRUE ~ as.integer(start)),
+#                 end = case_when(type == "three_prime_UTR" &
+#                                   strand == "+" ~ as.integer(end + 1000),
+#                                 TRUE ~ as.integer(end)))
+
+
+# add three_prime_UTR_extension - 1kb extension beyond the 3'-UTR
+three_prime_UTR_extensions <- gencode_bed |>
+  dplyr::filter(type == "three_prime_UTR") |>
+  dplyr::mutate(type = "three_prime_UTR_extension",
+                start = case_when(strand == "-" ~ as.integer(start - 1001), 
+                                  strand == "+" ~  as.integer(end + 1)),
+                end = case_when(strand == "-" ~ as.integer(start + 1000),
+                                strand == "+" ~ as.integer(end + 1001)))
+
+
+# add promoter region - 1kb extension beyond the 5'-UTR
+promoters <- gencode_bed |>
+  dplyr::filter(type == "five_prime_UTR") |>
+  dplyr::mutate(type = "promoter",
+                start = case_when(strand == "-" ~ as.integer(end + 1), 
+                                  strand == "+" ~  as.integer(start - 1001)),
+                end = case_when(strand == "-" ~ as.integer(end + 1001),
+                                strand == "+" ~ as.integer(start + 1000)))
+
+# merge promoters to all other features
+gencode_features_bed <- gencode_bed |> 
+  dplyr::bind_rows(promoters) |>
+  dplyr::bind_rows(three_prime_UTR_extensions) |>
+  dplyr::mutate(start = case_when(start < 1 ~ as.integer(1),
+                                  TRUE ~ as.integer(start)),
+                end = case_when(end < 1 ~ as.integer(1),
+                                TRUE ~ as.integer(end))) |> 
+  dplyr::arrange(seqid, start, end, transcript) |>
+  dplyr::rename(transcript_id = transcript) |>
+  # dplyr::rename(Chromosome = seqid, Start = start, End = end, Strand = strand, 
+  #               Gene_Feature = type, transcript_id = transcript) |>
+  dplyr::left_join(gene_transcript_map, by = "transcript_id") |>
+  dplyr::distinct()
+
+# write gencode gene features bed format to file
+gencode_features_bed |> 
+  readr::write_tsv(
+    file.path(inputs_dir, 
+              "gencode.v39.primary_assembly.gene_features.annotation.bed"),
+    col_names = FALSE)
+
+# write methylation array CpG probe coordinates to bed format to file
+readr::read_csv(cpg_map, skip = 7) |> 
+  dplyr::select(CHR, MAPINFO, Name) |> 
+  dplyr::mutate(MAPINFO2 = MAPINFO) |>
+  dplyr::select(CHR, MAPINFO, MAPINFO2, Name) |>
+  tidyr::drop_na() |>
+  dplyr::mutate(CHR = paste0("chr", CHR)) |>
+  dplyr::arrange(CHR, MAPINFO) |>
+  dplyr::distinct() |>
+  readr::write_tsv(
+    file.path(inputs_dir,
+              "infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed"),
+    col_names = FALSE)
diff --git a/analyses/methylation-probe-annotations/02-parse-bedtools-intersect.R b/analyses/methylation-probe-annotations/02-parse-bedtools-intersect.R
@@ -0,0 +1,58 @@
+# Reformat GENCODE gene features and Illumina infinium methylation array CpG 
+# probe coordinates Human Build 38 (GRCh38/hg38) liftover bedtools intersect 
+
+# Eric Wafula for Pediatric OpenTargets
+# 06/26/2023
+
+# Load libraries
+suppressPackageStartupMessages(library(optparse))
+suppressPackageStartupMessages(library(tidyverse))
+
+# set up optparse options
+option_list <- list(
+  make_option(opt_str = "--bed_intersect", type = "character", default = NULL,
+              help = "GENCODE gene fetures and CpG liftover intersect bed file",
+              metavar = "character")
+)
+
+# parse parameter options
+opt <- parse_args(OptionParser(option_list = option_list))
+bed_intersect <- opt$bed_intersect
+
+# establish base dir
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+
+# Set module and results directory
+module_dir <- file.path(root_dir, "analyses", "methylation-probe-annotations")
+results_dir <- file.path(module_dir, "results")
+
+
+# Parse bedtools GENCODE gene fetures and CpG liftover intersect bed file
+
+readr::read_tsv(
+  file.path(results_dir, 
+            "infinium.gencode.v39.probe.annotations.bedtools.tsv.gz"), 
+  col_names = FALSE) |> 
+  dplyr::select(X1, X2, X4, X8, X6, X7, X9, X10, X11, X12) |>
+  dplyr::rename(Chromosome = X1, Location = X2, Probe_ID = X4, 
+                Gene_Feature = X8, Start = X6, End = X7, Strand = X9, 
+                transcript_id = X10, targetFromSourceId = X11, 
+                Gene_symbol = X12) |> 
+  dplyr::distinct() |> 
+  dplyr::arrange(Chromosome, Location) |>
+  dplyr::mutate(Gene_Feature = case_when(Gene_Feature == "." ~ "intergenic", 
+                                         TRUE ~ Gene_Feature), 
+                Start = case_when(Start == -1 ~ NA_integer_, 
+                                  TRUE ~ as.integer(Start)), 
+                End = case_when(End == -1 ~ NA_integer_, 
+                                TRUE ~ as.integer(End)), 
+                Strand = case_when(Strand == "." ~ NA_character_, 
+                                   TRUE ~ Strand), 
+                transcript_id = case_when(transcript_id == "." ~ NA_character_,
+                                          TRUE ~ transcript_id), 
+                targetFromSourceId = case_when(targetFromSourceId == "." ~ NA_character_, 
+                                               TRUE ~ targetFromSourceId), 
+                Gene_symbol = case_when(Gene_symbol == "." ~ NA_character_, 
+                                        TRUE ~ Gene_symbol)) |> 
+  readr::write_tsv(
+    file.path(results_dir, "infinium.gencode.v39.probe.annotations.tsv.gz"))
diff --git a/analyses/methylation-probe-annotations/README.md b/analyses/methylation-probe-annotations/README.md
@@ -0,0 +1,65 @@
+# OpenPedCan Methylation Array Probe Annotations
+
+## Purpose
+
+This module create Illumina Infinium Human Methylation array CpG probe annotations based on the `Human Build 38 (GRCh38/hg38)` and `GENCODE v39 release`.
+The [450K and EPIC Illumina Infinium methylation array CpG probe coordinates](https://support.illumina.com/array/array_kits/infinium-methylationepic-beadchip-kit/downloads.html) are based on the `Human Build 37 (GRCh37/hg19)` genome assembly. 
+Probe coordinates are converted to `Human Build 38 (GRCh38/hg38)` using the [ENSEMBL Assembly Converter tool](https://useast.ensembl.org/Homo_sapiens/Tools/AssemblyConverter). 
+A probe annotation file, `infinium.gencode.v39.probe.annotations.tsv` is created by annotating all probes lifted over with associated gene features (i.e., `promoter`, `5' UTR`, `exon`, `intron`, `3'UTR`, `three_prime_UTR_extension`, and `intergenic`) based on [GENCODE v39 release](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/) that is currently utilized in the OpenPedCan data analyses. 
+Intron coordinates, typically not included in the GFF3/GTF genome annotation formats, are added to the GENCODE annotations file using [GenomeTools](http://genometools.org/). Probe locations are then assigned with their intersecting gene annotation features using [bedtools](https://bedtools.readthedocs.io/en/latest/content/bedtools-suite.html). 
+
+
+## Analysis scripts
+1. **`01-create-bed-files.R`** script create GENCODE gene features and Illumina infinium methylation array CpG probe coordinates input BED files
+
+```
+Usage: 01-create-bed-files.R [options]
+
+Options:
+  --gencode_gtf=CHARACTER
+    GENCODE GTF
+
+  --gencode_gff=CHARACTER
+    GENCODE GFF with intron coordinates
+
+  --cpg_map=CHARACTER
+    Illumina infinium methylation array CpG probe coordinates
+
+  -h, --help
+    Show this help message and exit
+```
+
+2. **`02-parse-bedtools-intersect.R`** script reformats the bedtools intersect of the GENCODE gene features and lifted over (GRCh38 to GRCh38) CpG probe coordinates to final OpenPedCan methylation array probe annotations.
+```
+Usage: 02-parse-bedtools-intersect.R [options]
+
+Options:
+  --bed_intersect=CHARACTER
+    GENCODE gene fetures and CpG liftover intersect bed file
+
+  -h, --help
+    Show this help message and exit
+```
+
+3. `run-methyl-probe-annotation.sh` is a wrapper bash script for executing all the other analysis scripts in the module, including creating the GENCODE GFF file inserted intron coordinates and the primary assignment of CpG probe locations to GENCODE gene features. 
+```
+bash run-methyl-probe-annotation.sh
+```
+
+## Input datasets
+OpenPedCan-analysis/blob/dev/download-data.sh). 
+- `../../data/gencode.v39.primary_assembly.annotation.gtf.gz`
+- `../../data/infinium-methylationepic-v-1-0-b5-manifest-file-csv.zip`
+- `inputs/gencode.v39.primary_assembly.annotation.gff3.gz`
+- `inputs/gencode.v39.primary_assembly.introns.annotation.gff3.gz`
+- `inputs/gencode.v39.primary_assembly.gene_features.annotation.bed`
+- `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed`
+- `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed`
+
+**`NOTE:`** The GpG probe coordinates liftover process is performed by running the running `01-create-bed-files.R` script to create GRCh37 array probe CpG BED file, `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed`, which is then uploaded to the [ENSEMBL Assembly Converter tool](https://useast.ensembl.org/Homo_sapiens/Tools/AssemblyConverter) to generate GRCh38 array probe CpG BED file,`inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed`.
+
+
+## Output datasets
+- `results/infinium.gencode.v39.probe.annotations.bedtools.tsv.gz`
+- `results/infinium.gencode.v39.probe.annotations.tsv.gz`
+
diff --git a/analyses/methylation-probe-annotations/inputs/.gitignore b/analyses/methylation-probe-annotations/inputs/.gitignore
@@ -0,0 +1,6 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
+!gencode.v39.primary_assembly.annotation.gff3.gz
+!infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed
diff --git a/...yses/methylation-probe-annotations/inputs/gencode.v39.primary_assembly.annotation.gff3.gz b/...yses/methylation-probe-annotations/inputs/gencode.v39.primary_assembly.annotation.gff3.gz