Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create methyl array probes annotation module #411

Merged
merged 11 commits into from
Oct 6, 2023
7 changes: 7 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,13 @@ RUN wget https://github.com/samtools/htslib/releases/download/1.9/htslib-1.9.tar
make install && \
cd .. && rm -rf htslib-1.9

# GenomeTools
RUN wget http://genometools.org/pub/genometools-1.6.2.tar.gz && \
tar -zxvf genometools-1.6.2.tar.gz && rm -f genometools-1.6.2.tar.gz && \
cd genometools-1.6.2 && \
make cairo=no && \
make prefix=/usr/local cairo=no install && \
cd .. && rm -rf genometools-1.6.2

#### R packages
###############
Expand Down
160 changes: 160 additions & 0 deletions analyses/methylation-probe-annotations/01-create-bed-files.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# Create GENCODE gene features and Illumina infinium methylation array CpG
# probe coordinates bed files

# Eric Wafula for Pediatric OpenTargets
# 06/26/2023

# Load libraries
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(tidyverse))

# set up optparse options
option_list <- list(
make_option(opt_str = "--gencode_gtf", type = "character", default = NULL,
help = "GENCODE GTF",
metavar = "character"),
make_option(opt_str = "--gencode_gff", type = "character", default = NULL,
help = "GENCODE GFF with intron coordinates",
metavar = "character"),
make_option(opt_str = "--cpg_map", type = "character", default = NULL,
help = "Illumina infinium methylation array CpG probe coordinates",
metavar = "character")
)

# parse parameter options
opt <- parse_args(OptionParser(option_list = option_list))
gencode_gtf <- opt$gencode_gtf
gencode_gff <- opt$gencode_gff
cpg_map <- opt$cpg_map

# establish base dir
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))

# Set module and inputs directory
module_dir <- file.path(root_dir, "analyses", "methylation-probe-annotations")
inputs_dir <- file.path(module_dir, "inputs")

# create gene to transcript from gencode gtf
gene_transcript_map <- rtracklayer::import(con = gencode_gtf) |>
tibble::as_tibble() |>
dplyr::select(gene_id, transcript_id, gene_name) |>
tidyr::drop_na() |>
dplyr::mutate(transcript_id = stringr::str_extract(transcript_id, "ENST\\d+"),
gene_id = stringr::str_extract(gene_id, "ENSG\\d+")) |>
dplyr::distinct()

# load gencode gff with intron coordinates and covert to bed format or selected
# nuclear chromosome gene features - exon, intron, five_prime_UTR, and
# three_prime_UTR
gencode_bed <- rtracklayer::readGFF(gencode_gff) |>
tibble::as_tibble() |>
dplyr::filter(type %in% c("exon", "intron",
"five_prime_UTR", "three_prime_UTR"),
!grepl("^chrM|^GL|^KI", seqid)) |>
dplyr::select(seqid, start, end, type, Parent, strand) |>
dplyr::rename(transcript = Parent) |>
dplyr::mutate(seqid = as.character(seqid),
type = as.character(type),
transcript = as.character(transcript),
transcript = stringr::str_extract(transcript, "\\w+")) |>
dplyr::distinct()

# filter gene features bed to retain only the most terminal 3'-UTRs and 5'-UTRs
# where multiple UTRs have been predicted for a transcript
exons_introns <- gencode_bed |>
dplyr::filter(!type %in% c("five_prime_UTR", "three_prime_UTR"))
plus_five_utrs <-gencode_bed |>
dplyr::filter(type == "five_prime_UTR", strand == "+") |>
dplyr::group_by(seqid, type, transcript, strand) |>
dplyr::summarize(start = min(start), end = min(end)) |>
dplyr::arrange(seqid, start, end) |>
ungroup()
minus_five_utrs <-gencode_bed |>
dplyr::filter(type == "five_prime_UTR", strand == "-") |>
dplyr::group_by(seqid, type, transcript, strand) |>
dplyr::summarize(start = min(start), end = min(end)) |>
dplyr::arrange(seqid, start, end) |>
ungroup()
plus_three_utrs <-gencode_bed |>
dplyr::filter(type == "three_prime_UTR", strand == "+") |>
dplyr::group_by(seqid, type, transcript, strand) |>
dplyr::summarize(start = min(start), end = min(end)) |>
dplyr::arrange(seqid, start, end) |>
ungroup()
minus_three_utrs <-gencode_bed |>
dplyr::filter(type == "three_prime_UTR", strand == "-") |>
dplyr::group_by(seqid, type, transcript, strand) |>
dplyr::summarize(start = min(start), end = min(end)) |>
dplyr::arrange(seqid, start, end) |>
ungroup()
gencode_bed <- dplyr::bind_rows(exons_introns, plus_five_utrs, minus_five_utrs,
plus_three_utrs, minus_three_utrs) |>
dplyr::select(seqid, start, end, type, strand, transcript) |>
dplyr::arrange(seqid, start, end, transcript) |>
dplyr::distinct()

# # extend 3'-UTR by 1kb
# gencode_bed <- gencode_bed |>
# dplyr::mutate(start = case_when(type == "three_prime_UTR" &
# strand == "-" ~ as.integer(start - 1000),
# TRUE ~ as.integer(start)),
# end = case_when(type == "three_prime_UTR" &
# strand == "+" ~ as.integer(end + 1000),
# TRUE ~ as.integer(end)))


# add three_prime_UTR_extension - 1kb extension beyond the 3'-UTR
three_prime_UTR_extensions <- gencode_bed |>
dplyr::filter(type == "three_prime_UTR") |>
dplyr::mutate(type = "three_prime_UTR_extension",
start = case_when(strand == "-" ~ as.integer(start - 1001),
strand == "+" ~ as.integer(end + 1)),
end = case_when(strand == "-" ~ as.integer(start + 1000),
jharenza marked this conversation as resolved.
Show resolved Hide resolved
strand == "+" ~ as.integer(end + 1001)))


# add promoter region - 1kb extension beyond the 5'-UTR
promoters <- gencode_bed |>
dplyr::filter(type == "five_prime_UTR") |>
dplyr::mutate(type = "promoter",
start = case_when(strand == "-" ~ as.integer(end + 1),
strand == "+" ~ as.integer(start - 1001)),
end = case_when(strand == "-" ~ as.integer(end + 1001),
strand == "+" ~ as.integer(start + 1000)))
jharenza marked this conversation as resolved.
Show resolved Hide resolved

# merge promoters to all other features
gencode_features_bed <- gencode_bed |>
dplyr::bind_rows(promoters) |>
dplyr::bind_rows(three_prime_UTR_extensions) |>
dplyr::mutate(start = case_when(start < 1 ~ as.integer(1),
TRUE ~ as.integer(start)),
end = case_when(end < 1 ~ as.integer(1),
TRUE ~ as.integer(end))) |>
dplyr::arrange(seqid, start, end, transcript) |>
dplyr::rename(transcript_id = transcript) |>
# dplyr::rename(Chromosome = seqid, Start = start, End = end, Strand = strand,
# Gene_Feature = type, transcript_id = transcript) |>
dplyr::left_join(gene_transcript_map, by = "transcript_id") |>
dplyr::distinct()

# write gencode gene features bed format to file
gencode_features_bed |>
readr::write_tsv(
file.path(inputs_dir,
"gencode.v39.primary_assembly.gene_features.annotation.bed"),
col_names = FALSE)

# write methylation array CpG probe coordinates to bed format to file
readr::read_csv(cpg_map, skip = 7) |>
dplyr::select(CHR, MAPINFO, Name) |>
dplyr::mutate(MAPINFO2 = MAPINFO) |>
dplyr::select(CHR, MAPINFO, MAPINFO2, Name) |>
tidyr::drop_na() |>
dplyr::mutate(CHR = paste0("chr", CHR)) |>
dplyr::arrange(CHR, MAPINFO) |>
dplyr::distinct() |>
readr::write_tsv(
file.path(inputs_dir,
"infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed"),
ewafula marked this conversation as resolved.
Show resolved Hide resolved
col_names = FALSE)
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Reformat GENCODE gene features and Illumina infinium methylation array CpG
# probe coordinates Human Build 38 (GRCh38/hg38) liftover bedtools intersect

# Eric Wafula for Pediatric OpenTargets
# 06/26/2023

# Load libraries
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(tidyverse))

# set up optparse options
option_list <- list(
make_option(opt_str = "--bed_intersect", type = "character", default = NULL,
help = "GENCODE gene fetures and CpG liftover intersect bed file",
metavar = "character")
)

# parse parameter options
opt <- parse_args(OptionParser(option_list = option_list))
bed_intersect <- opt$bed_intersect

# establish base dir
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))

# Set module and results directory
module_dir <- file.path(root_dir, "analyses", "methylation-probe-annotations")
results_dir <- file.path(module_dir, "results")


# Parse bedtools GENCODE gene fetures and CpG liftover intersect bed file

readr::read_tsv(
file.path(results_dir,
"infinium.gencode.v39.probe.annotations.bedtools.tsv.gz"),
col_names = FALSE) |>
dplyr::select(X1, X2, X4, X8, X6, X7, X9, X10, X11, X12) |>
dplyr::rename(Chromosome = X1, Location = X2, Probe_ID = X4,
Gene_Feature = X8, Start = X6, End = X7, Strand = X9,
transcript_id = X10, targetFromSourceId = X11,
Gene_symbol = X12) |>
dplyr::distinct() |>
dplyr::arrange(Chromosome, Location) |>
dplyr::mutate(Gene_Feature = case_when(Gene_Feature == "." ~ "intergenic",
TRUE ~ Gene_Feature),
Start = case_when(Start == -1 ~ NA_integer_,
TRUE ~ as.integer(Start)),
End = case_when(End == -1 ~ NA_integer_,
TRUE ~ as.integer(End)),
Strand = case_when(Strand == "." ~ NA_character_,
TRUE ~ Strand),
transcript_id = case_when(transcript_id == "." ~ NA_character_,
TRUE ~ transcript_id),
targetFromSourceId = case_when(targetFromSourceId == "." ~ NA_character_,
TRUE ~ targetFromSourceId),
Gene_symbol = case_when(Gene_symbol == "." ~ NA_character_,
TRUE ~ Gene_symbol)) |>
readr::write_tsv(
file.path(results_dir, "infinium.gencode.v39.probe.annotations.tsv.gz"))
65 changes: 65 additions & 0 deletions analyses/methylation-probe-annotations/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# OpenPedCan Methylation Array Probe Annotations

## Purpose

This module create Illumina Infinium Human Methylation array CpG probe annotations based on the `Human Build 38 (GRCh38/hg38)` and `GENCODE v39 release`.
The [450K and EPIC Illumina Infinium methylation array CpG probe coordinates](https://support.illumina.com/array/array_kits/infinium-methylationepic-beadchip-kit/downloads.html) are based on the `Human Build 37 (GRCh37/hg19)` genome assembly.
Probe coordinates are converted to `Human Build 38 (GRCh38/hg38)` using the [ENSEMBL Assembly Converter tool](https://useast.ensembl.org/Homo_sapiens/Tools/AssemblyConverter).
A probe annotation file, `infinium.gencode.v39.probe.annotations.tsv` is created by annotating all probes lifted over with associated gene features (i.e., `promoter`, `5' UTR`, `exon`, `intron`, `3'UTR`, `three_prime_UTR_extension`, and `intergenic`) based on [GENCODE v39 release](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/) that is currently utilized in the OpenPedCan data analyses.
Intron coordinates, typically not included in the GFF3/GTF genome annotation formats, are added to the GENCODE annotations file using [GenomeTools](http://genometools.org/). Probe locations are then assigned with their intersecting gene annotation features using [bedtools](https://bedtools.readthedocs.io/en/latest/content/bedtools-suite.html).


## Analysis scripts
1. **`01-create-bed-files.R`** script create GENCODE gene features and Illumina infinium methylation array CpG probe coordinates input BED files

```
Usage: 01-create-bed-files.R [options]

Options:
--gencode_gtf=CHARACTER
GENCODE GTF

--gencode_gff=CHARACTER
GENCODE GFF with intron coordinates

--cpg_map=CHARACTER
Illumina infinium methylation array CpG probe coordinates

-h, --help
Show this help message and exit
```

2. **`02-parse-bedtools-intersect.R`** script reformats the bedtools intersect of the GENCODE gene features and lifted over (GRCh38 to GRCh38) CpG probe coordinates to final OpenPedCan methylation array probe annotations.
```
Usage: 02-parse-bedtools-intersect.R [options]

Options:
--bed_intersect=CHARACTER
GENCODE gene fetures and CpG liftover intersect bed file

-h, --help
Show this help message and exit
```

3. `run-methyl-probe-annotation.sh` is a wrapper bash script for executing all the other analysis scripts in the module, including creating the GENCODE GFF file inserted intron coordinates and the primary assignment of CpG probe locations to GENCODE gene features.
```
bash run-methyl-probe-annotation.sh
```

## Input datasets
OpenPedCan-analysis/blob/dev/download-data.sh).
- `../../data/gencode.v39.primary_assembly.annotation.gtf.gz`
- `../../data/infinium-methylationepic-v-1-0-b5-manifest-file-csv.zip`
- `inputs/gencode.v39.primary_assembly.annotation.gff3.gz`
- `inputs/gencode.v39.primary_assembly.introns.annotation.gff3.gz`
- `inputs/gencode.v39.primary_assembly.gene_features.annotation.bed`
- `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed`
- `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed`

**`NOTE:`** The GpG probe coordinates liftover process is performed by running the running `01-create-bed-files.R` script to create GRCh37 array probe CpG BED file, `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed`, which is then uploaded to the [ENSEMBL Assembly Converter tool](https://useast.ensembl.org/Homo_sapiens/Tools/AssemblyConverter) to generate GRCh38 array probe CpG BED file,`inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed`.


## Output datasets
- `results/infinium.gencode.v39.probe.annotations.bedtools.tsv.gz`
- `results/infinium.gencode.v39.probe.annotations.tsv.gz`

6 changes: 6 additions & 0 deletions analyses/methylation-probe-annotations/inputs/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore
!gencode.v39.primary_assembly.annotation.gff3.gz
!infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed
Binary file not shown.
Loading