Skip to content

Commit

Permalink
Merge pull request #411 from d3b-center/probe-annot
Browse files Browse the repository at this point in the history
Create methyl array probes annotation module
  • Loading branch information
jharenza authored Oct 6, 2023
2 parents 0e6aad3 + 69f9403 commit 6039915
Show file tree
Hide file tree
Showing 11 changed files with 863,659 additions and 3 deletions.
7 changes: 7 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,13 @@ RUN wget https://github.com/samtools/htslib/releases/download/1.9/htslib-1.9.tar
make install && \
cd .. && rm -rf htslib-1.9

# GenomeTools
RUN wget http://genometools.org/pub/genometools-1.6.2.tar.gz && \
tar -zxvf genometools-1.6.2.tar.gz && rm -f genometools-1.6.2.tar.gz && \
cd genometools-1.6.2 && \
make cairo=no && \
make prefix=/usr/local cairo=no install && \
cd .. && rm -rf genometools-1.6.2

#### R packages
###############
Expand Down
160 changes: 160 additions & 0 deletions analyses/methylation-probe-annotations/01-create-bed-files.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# Create GENCODE gene features and Illumina infinium methylation array CpG
# probe coordinates bed files

# Eric Wafula for Pediatric OpenTargets
# 06/26/2023

# Load libraries
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(tidyverse))

# set up optparse options
option_list <- list(
make_option(opt_str = "--gencode_gtf", type = "character", default = NULL,
help = "GENCODE GTF",
metavar = "character"),
make_option(opt_str = "--gencode_gff", type = "character", default = NULL,
help = "GENCODE GFF with intron coordinates",
metavar = "character"),
make_option(opt_str = "--cpg_map", type = "character", default = NULL,
help = "Illumina infinium methylation array CpG probe coordinates",
metavar = "character")
)

# parse parameter options
opt <- parse_args(OptionParser(option_list = option_list))
gencode_gtf <- opt$gencode_gtf
gencode_gff <- opt$gencode_gff
cpg_map <- opt$cpg_map

# establish base dir
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))

# Set module and inputs directory
module_dir <- file.path(root_dir, "analyses", "methylation-probe-annotations")
inputs_dir <- file.path(module_dir, "inputs")

# create gene to transcript from gencode gtf
gene_transcript_map <- rtracklayer::import(con = gencode_gtf) |>
tibble::as_tibble() |>
dplyr::select(gene_id, transcript_id, gene_name) |>
tidyr::drop_na() |>
dplyr::mutate(transcript_id = stringr::str_extract(transcript_id, "ENST\\d+"),
gene_id = stringr::str_extract(gene_id, "ENSG\\d+")) |>
dplyr::distinct()

# load gencode gff with intron coordinates and covert to bed format or selected
# nuclear chromosome gene features - exon, intron, five_prime_UTR, and
# three_prime_UTR
gencode_bed <- rtracklayer::readGFF(gencode_gff) |>
tibble::as_tibble() |>
dplyr::filter(type %in% c("exon", "intron",
"five_prime_UTR", "three_prime_UTR"),
!grepl("^chrM|^GL|^KI", seqid)) |>
dplyr::select(seqid, start, end, type, Parent, strand) |>
dplyr::rename(transcript = Parent) |>
dplyr::mutate(seqid = as.character(seqid),
type = as.character(type),
transcript = as.character(transcript),
transcript = stringr::str_extract(transcript, "\\w+")) |>
dplyr::distinct()

# filter gene features bed to retain only the most terminal 3'-UTRs and 5'-UTRs
# where multiple UTRs have been predicted for a transcript
exons_introns <- gencode_bed |>
dplyr::filter(!type %in% c("five_prime_UTR", "three_prime_UTR"))
plus_five_utrs <-gencode_bed |>
dplyr::filter(type == "five_prime_UTR", strand == "+") |>
dplyr::group_by(seqid, type, transcript, strand) |>
dplyr::summarize(start = min(start), end = min(end)) |>
dplyr::arrange(seqid, start, end) |>
ungroup()
minus_five_utrs <-gencode_bed |>
dplyr::filter(type == "five_prime_UTR", strand == "-") |>
dplyr::group_by(seqid, type, transcript, strand) |>
dplyr::summarize(start = min(start), end = min(end)) |>
dplyr::arrange(seqid, start, end) |>
ungroup()
plus_three_utrs <-gencode_bed |>
dplyr::filter(type == "three_prime_UTR", strand == "+") |>
dplyr::group_by(seqid, type, transcript, strand) |>
dplyr::summarize(start = min(start), end = min(end)) |>
dplyr::arrange(seqid, start, end) |>
ungroup()
minus_three_utrs <-gencode_bed |>
dplyr::filter(type == "three_prime_UTR", strand == "-") |>
dplyr::group_by(seqid, type, transcript, strand) |>
dplyr::summarize(start = min(start), end = min(end)) |>
dplyr::arrange(seqid, start, end) |>
ungroup()
gencode_bed <- dplyr::bind_rows(exons_introns, plus_five_utrs, minus_five_utrs,
plus_three_utrs, minus_three_utrs) |>
dplyr::select(seqid, start, end, type, strand, transcript) |>
dplyr::arrange(seqid, start, end, transcript) |>
dplyr::distinct()

# # extend 3'-UTR by 1kb
# gencode_bed <- gencode_bed |>
# dplyr::mutate(start = case_when(type == "three_prime_UTR" &
# strand == "-" ~ as.integer(start - 1000),
# TRUE ~ as.integer(start)),
# end = case_when(type == "three_prime_UTR" &
# strand == "+" ~ as.integer(end + 1000),
# TRUE ~ as.integer(end)))


# add three_prime_UTR_extension - 1kb extension beyond the 3'-UTR
three_prime_UTR_extensions <- gencode_bed |>
dplyr::filter(type == "three_prime_UTR") |>
dplyr::mutate(type = "three_prime_UTR_extension",
start = case_when(strand == "-" ~ as.integer(start - 1001),
strand == "+" ~ as.integer(end + 1)),
end = case_when(strand == "-" ~ as.integer(start + 1000),
strand == "+" ~ as.integer(end + 1001)))


# add promoter region - 1kb extension beyond the 5'-UTR
promoters <- gencode_bed |>
dplyr::filter(type == "five_prime_UTR") |>
dplyr::mutate(type = "promoter",
start = case_when(strand == "-" ~ as.integer(end + 1),
strand == "+" ~ as.integer(start - 1001)),
end = case_when(strand == "-" ~ as.integer(end + 1001),
strand == "+" ~ as.integer(start + 1000)))

# merge promoters to all other features
gencode_features_bed <- gencode_bed |>
dplyr::bind_rows(promoters) |>
dplyr::bind_rows(three_prime_UTR_extensions) |>
dplyr::mutate(start = case_when(start < 1 ~ as.integer(1),
TRUE ~ as.integer(start)),
end = case_when(end < 1 ~ as.integer(1),
TRUE ~ as.integer(end))) |>
dplyr::arrange(seqid, start, end, transcript) |>
dplyr::rename(transcript_id = transcript) |>
# dplyr::rename(Chromosome = seqid, Start = start, End = end, Strand = strand,
# Gene_Feature = type, transcript_id = transcript) |>
dplyr::left_join(gene_transcript_map, by = "transcript_id") |>
dplyr::distinct()

# write gencode gene features bed format to file
gencode_features_bed |>
readr::write_tsv(
file.path(inputs_dir,
"gencode.v39.primary_assembly.gene_features.annotation.bed"),
col_names = FALSE)

# write methylation array CpG probe coordinates to bed format to file
readr::read_csv(cpg_map, skip = 7) |>
dplyr::select(CHR, MAPINFO, Name) |>
dplyr::mutate(MAPINFO2 = MAPINFO) |>
dplyr::select(CHR, MAPINFO, MAPINFO2, Name) |>
tidyr::drop_na() |>
dplyr::mutate(CHR = paste0("chr", CHR)) |>
dplyr::arrange(CHR, MAPINFO) |>
dplyr::distinct() |>
readr::write_tsv(
file.path(inputs_dir,
"infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed"),
col_names = FALSE)
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Reformat GENCODE gene features and Illumina infinium methylation array CpG
# probe coordinates Human Build 38 (GRCh38/hg38) liftover bedtools intersect

# Eric Wafula for Pediatric OpenTargets
# 06/26/2023

# Load libraries
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(tidyverse))

# set up optparse options
option_list <- list(
make_option(opt_str = "--bed_intersect", type = "character", default = NULL,
help = "GENCODE gene fetures and CpG liftover intersect bed file",
metavar = "character")
)

# parse parameter options
opt <- parse_args(OptionParser(option_list = option_list))
bed_intersect <- opt$bed_intersect

# establish base dir
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))

# Set module and results directory
module_dir <- file.path(root_dir, "analyses", "methylation-probe-annotations")
results_dir <- file.path(module_dir, "results")


# Parse bedtools GENCODE gene fetures and CpG liftover intersect bed file

readr::read_tsv(
file.path(results_dir,
"infinium.gencode.v39.probe.annotations.bedtools.tsv.gz"),
col_names = FALSE) |>
dplyr::select(X1, X2, X4, X8, X6, X7, X9, X10, X11, X12) |>
dplyr::rename(Chromosome = X1, Location = X2, Probe_ID = X4,
Gene_Feature = X8, Start = X6, End = X7, Strand = X9,
transcript_id = X10, targetFromSourceId = X11,
Gene_symbol = X12) |>
dplyr::distinct() |>
dplyr::arrange(Chromosome, Location) |>
dplyr::mutate(Gene_Feature = case_when(Gene_Feature == "." ~ "intergenic",
TRUE ~ Gene_Feature),
Start = case_when(Start == -1 ~ NA_integer_,
TRUE ~ as.integer(Start)),
End = case_when(End == -1 ~ NA_integer_,
TRUE ~ as.integer(End)),
Strand = case_when(Strand == "." ~ NA_character_,
TRUE ~ Strand),
transcript_id = case_when(transcript_id == "." ~ NA_character_,
TRUE ~ transcript_id),
targetFromSourceId = case_when(targetFromSourceId == "." ~ NA_character_,
TRUE ~ targetFromSourceId),
Gene_symbol = case_when(Gene_symbol == "." ~ NA_character_,
TRUE ~ Gene_symbol)) |>
readr::write_tsv(
file.path(results_dir, "infinium.gencode.v39.probe.annotations.tsv.gz"))
65 changes: 65 additions & 0 deletions analyses/methylation-probe-annotations/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# OpenPedCan Methylation Array Probe Annotations

## Purpose

This module create Illumina Infinium Human Methylation array CpG probe annotations based on the `Human Build 38 (GRCh38/hg38)` and `GENCODE v39 release`.
The [450K and EPIC Illumina Infinium methylation array CpG probe coordinates](https://support.illumina.com/array/array_kits/infinium-methylationepic-beadchip-kit/downloads.html) are based on the `Human Build 37 (GRCh37/hg19)` genome assembly.
Probe coordinates are converted to `Human Build 38 (GRCh38/hg38)` using the [ENSEMBL Assembly Converter tool](https://useast.ensembl.org/Homo_sapiens/Tools/AssemblyConverter).
A probe annotation file, `infinium.gencode.v39.probe.annotations.tsv` is created by annotating all probes lifted over with associated gene features (i.e., `promoter`, `5' UTR`, `exon`, `intron`, `3'UTR`, `three_prime_UTR_extension`, and `intergenic`) based on [GENCODE v39 release](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/) that is currently utilized in the OpenPedCan data analyses.
Intron coordinates, typically not included in the GFF3/GTF genome annotation formats, are added to the GENCODE annotations file using [GenomeTools](http://genometools.org/). Probe locations are then assigned with their intersecting gene annotation features using [bedtools](https://bedtools.readthedocs.io/en/latest/content/bedtools-suite.html).


## Analysis scripts
1. **`01-create-bed-files.R`** script create GENCODE gene features and Illumina infinium methylation array CpG probe coordinates input BED files

```
Usage: 01-create-bed-files.R [options]
Options:
--gencode_gtf=CHARACTER
GENCODE GTF
--gencode_gff=CHARACTER
GENCODE GFF with intron coordinates
--cpg_map=CHARACTER
Illumina infinium methylation array CpG probe coordinates
-h, --help
Show this help message and exit
```

2. **`02-parse-bedtools-intersect.R`** script reformats the bedtools intersect of the GENCODE gene features and lifted over (GRCh38 to GRCh38) CpG probe coordinates to final OpenPedCan methylation array probe annotations.
```
Usage: 02-parse-bedtools-intersect.R [options]
Options:
--bed_intersect=CHARACTER
GENCODE gene fetures and CpG liftover intersect bed file
-h, --help
Show this help message and exit
```

3. `run-methyl-probe-annotation.sh` is a wrapper bash script for executing all the other analysis scripts in the module, including creating the GENCODE GFF file inserted intron coordinates and the primary assignment of CpG probe locations to GENCODE gene features.
```
bash run-methyl-probe-annotation.sh
```

## Input datasets
OpenPedCan-analysis/blob/dev/download-data.sh).
- `../../data/gencode.v39.primary_assembly.annotation.gtf.gz`
- `../../data/infinium-methylationepic-v-1-0-b5-manifest-file-csv.zip`
- `inputs/gencode.v39.primary_assembly.annotation.gff3.gz`
- `inputs/gencode.v39.primary_assembly.introns.annotation.gff3.gz`
- `inputs/gencode.v39.primary_assembly.gene_features.annotation.bed`
- `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed`
- `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed`

**`NOTE:`** The GpG probe coordinates liftover process is performed by running the running `01-create-bed-files.R` script to create GRCh37 array probe CpG BED file, `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed`, which is then uploaded to the [ENSEMBL Assembly Converter tool](https://useast.ensembl.org/Homo_sapiens/Tools/AssemblyConverter) to generate GRCh38 array probe CpG BED file,`inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed`.


## Output datasets
- `results/infinium.gencode.v39.probe.annotations.bedtools.tsv.gz`
- `results/infinium.gencode.v39.probe.annotations.tsv.gz`

6 changes: 6 additions & 0 deletions analyses/methylation-probe-annotations/inputs/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore
!gencode.v39.primary_assembly.annotation.gff3.gz
!infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed
Binary file not shown.
Loading

0 comments on commit 6039915

Please sign in to comment.