forked from AlexsLemonade/OpenPBTA-analysis
-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #411 from d3b-center/probe-annot
Create methyl array probes annotation module
- Loading branch information
Showing
11 changed files
with
863,659 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
160 changes: 160 additions & 0 deletions
160
analyses/methylation-probe-annotations/01-create-bed-files.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
# Create GENCODE gene features and Illumina infinium methylation array CpG | ||
# probe coordinates bed files | ||
|
||
# Eric Wafula for Pediatric OpenTargets | ||
# 06/26/2023 | ||
|
||
# Load libraries | ||
suppressPackageStartupMessages(library(optparse)) | ||
suppressPackageStartupMessages(library(rtracklayer)) | ||
suppressPackageStartupMessages(library(tidyverse)) | ||
|
||
# set up optparse options | ||
option_list <- list( | ||
make_option(opt_str = "--gencode_gtf", type = "character", default = NULL, | ||
help = "GENCODE GTF", | ||
metavar = "character"), | ||
make_option(opt_str = "--gencode_gff", type = "character", default = NULL, | ||
help = "GENCODE GFF with intron coordinates", | ||
metavar = "character"), | ||
make_option(opt_str = "--cpg_map", type = "character", default = NULL, | ||
help = "Illumina infinium methylation array CpG probe coordinates", | ||
metavar = "character") | ||
) | ||
|
||
# parse parameter options | ||
opt <- parse_args(OptionParser(option_list = option_list)) | ||
gencode_gtf <- opt$gencode_gtf | ||
gencode_gff <- opt$gencode_gff | ||
cpg_map <- opt$cpg_map | ||
|
||
# establish base dir | ||
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) | ||
|
||
# Set module and inputs directory | ||
module_dir <- file.path(root_dir, "analyses", "methylation-probe-annotations") | ||
inputs_dir <- file.path(module_dir, "inputs") | ||
|
||
# create gene to transcript from gencode gtf | ||
gene_transcript_map <- rtracklayer::import(con = gencode_gtf) |> | ||
tibble::as_tibble() |> | ||
dplyr::select(gene_id, transcript_id, gene_name) |> | ||
tidyr::drop_na() |> | ||
dplyr::mutate(transcript_id = stringr::str_extract(transcript_id, "ENST\\d+"), | ||
gene_id = stringr::str_extract(gene_id, "ENSG\\d+")) |> | ||
dplyr::distinct() | ||
|
||
# load gencode gff with intron coordinates and covert to bed format or selected | ||
# nuclear chromosome gene features - exon, intron, five_prime_UTR, and | ||
# three_prime_UTR | ||
gencode_bed <- rtracklayer::readGFF(gencode_gff) |> | ||
tibble::as_tibble() |> | ||
dplyr::filter(type %in% c("exon", "intron", | ||
"five_prime_UTR", "three_prime_UTR"), | ||
!grepl("^chrM|^GL|^KI", seqid)) |> | ||
dplyr::select(seqid, start, end, type, Parent, strand) |> | ||
dplyr::rename(transcript = Parent) |> | ||
dplyr::mutate(seqid = as.character(seqid), | ||
type = as.character(type), | ||
transcript = as.character(transcript), | ||
transcript = stringr::str_extract(transcript, "\\w+")) |> | ||
dplyr::distinct() | ||
|
||
# filter gene features bed to retain only the most terminal 3'-UTRs and 5'-UTRs | ||
# where multiple UTRs have been predicted for a transcript | ||
exons_introns <- gencode_bed |> | ||
dplyr::filter(!type %in% c("five_prime_UTR", "three_prime_UTR")) | ||
plus_five_utrs <-gencode_bed |> | ||
dplyr::filter(type == "five_prime_UTR", strand == "+") |> | ||
dplyr::group_by(seqid, type, transcript, strand) |> | ||
dplyr::summarize(start = min(start), end = min(end)) |> | ||
dplyr::arrange(seqid, start, end) |> | ||
ungroup() | ||
minus_five_utrs <-gencode_bed |> | ||
dplyr::filter(type == "five_prime_UTR", strand == "-") |> | ||
dplyr::group_by(seqid, type, transcript, strand) |> | ||
dplyr::summarize(start = min(start), end = min(end)) |> | ||
dplyr::arrange(seqid, start, end) |> | ||
ungroup() | ||
plus_three_utrs <-gencode_bed |> | ||
dplyr::filter(type == "three_prime_UTR", strand == "+") |> | ||
dplyr::group_by(seqid, type, transcript, strand) |> | ||
dplyr::summarize(start = min(start), end = min(end)) |> | ||
dplyr::arrange(seqid, start, end) |> | ||
ungroup() | ||
minus_three_utrs <-gencode_bed |> | ||
dplyr::filter(type == "three_prime_UTR", strand == "-") |> | ||
dplyr::group_by(seqid, type, transcript, strand) |> | ||
dplyr::summarize(start = min(start), end = min(end)) |> | ||
dplyr::arrange(seqid, start, end) |> | ||
ungroup() | ||
gencode_bed <- dplyr::bind_rows(exons_introns, plus_five_utrs, minus_five_utrs, | ||
plus_three_utrs, minus_three_utrs) |> | ||
dplyr::select(seqid, start, end, type, strand, transcript) |> | ||
dplyr::arrange(seqid, start, end, transcript) |> | ||
dplyr::distinct() | ||
|
||
# # extend 3'-UTR by 1kb | ||
# gencode_bed <- gencode_bed |> | ||
# dplyr::mutate(start = case_when(type == "three_prime_UTR" & | ||
# strand == "-" ~ as.integer(start - 1000), | ||
# TRUE ~ as.integer(start)), | ||
# end = case_when(type == "three_prime_UTR" & | ||
# strand == "+" ~ as.integer(end + 1000), | ||
# TRUE ~ as.integer(end))) | ||
|
||
|
||
# add three_prime_UTR_extension - 1kb extension beyond the 3'-UTR | ||
three_prime_UTR_extensions <- gencode_bed |> | ||
dplyr::filter(type == "three_prime_UTR") |> | ||
dplyr::mutate(type = "three_prime_UTR_extension", | ||
start = case_when(strand == "-" ~ as.integer(start - 1001), | ||
strand == "+" ~ as.integer(end + 1)), | ||
end = case_when(strand == "-" ~ as.integer(start + 1000), | ||
strand == "+" ~ as.integer(end + 1001))) | ||
|
||
|
||
# add promoter region - 1kb extension beyond the 5'-UTR | ||
promoters <- gencode_bed |> | ||
dplyr::filter(type == "five_prime_UTR") |> | ||
dplyr::mutate(type = "promoter", | ||
start = case_when(strand == "-" ~ as.integer(end + 1), | ||
strand == "+" ~ as.integer(start - 1001)), | ||
end = case_when(strand == "-" ~ as.integer(end + 1001), | ||
strand == "+" ~ as.integer(start + 1000))) | ||
|
||
# merge promoters to all other features | ||
gencode_features_bed <- gencode_bed |> | ||
dplyr::bind_rows(promoters) |> | ||
dplyr::bind_rows(three_prime_UTR_extensions) |> | ||
dplyr::mutate(start = case_when(start < 1 ~ as.integer(1), | ||
TRUE ~ as.integer(start)), | ||
end = case_when(end < 1 ~ as.integer(1), | ||
TRUE ~ as.integer(end))) |> | ||
dplyr::arrange(seqid, start, end, transcript) |> | ||
dplyr::rename(transcript_id = transcript) |> | ||
# dplyr::rename(Chromosome = seqid, Start = start, End = end, Strand = strand, | ||
# Gene_Feature = type, transcript_id = transcript) |> | ||
dplyr::left_join(gene_transcript_map, by = "transcript_id") |> | ||
dplyr::distinct() | ||
|
||
# write gencode gene features bed format to file | ||
gencode_features_bed |> | ||
readr::write_tsv( | ||
file.path(inputs_dir, | ||
"gencode.v39.primary_assembly.gene_features.annotation.bed"), | ||
col_names = FALSE) | ||
|
||
# write methylation array CpG probe coordinates to bed format to file | ||
readr::read_csv(cpg_map, skip = 7) |> | ||
dplyr::select(CHR, MAPINFO, Name) |> | ||
dplyr::mutate(MAPINFO2 = MAPINFO) |> | ||
dplyr::select(CHR, MAPINFO, MAPINFO2, Name) |> | ||
tidyr::drop_na() |> | ||
dplyr::mutate(CHR = paste0("chr", CHR)) |> | ||
dplyr::arrange(CHR, MAPINFO) |> | ||
dplyr::distinct() |> | ||
readr::write_tsv( | ||
file.path(inputs_dir, | ||
"infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed"), | ||
col_names = FALSE) |
58 changes: 58 additions & 0 deletions
58
analyses/methylation-probe-annotations/02-parse-bedtools-intersect.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# Reformat GENCODE gene features and Illumina infinium methylation array CpG | ||
# probe coordinates Human Build 38 (GRCh38/hg38) liftover bedtools intersect | ||
|
||
# Eric Wafula for Pediatric OpenTargets | ||
# 06/26/2023 | ||
|
||
# Load libraries | ||
suppressPackageStartupMessages(library(optparse)) | ||
suppressPackageStartupMessages(library(tidyverse)) | ||
|
||
# set up optparse options | ||
option_list <- list( | ||
make_option(opt_str = "--bed_intersect", type = "character", default = NULL, | ||
help = "GENCODE gene fetures and CpG liftover intersect bed file", | ||
metavar = "character") | ||
) | ||
|
||
# parse parameter options | ||
opt <- parse_args(OptionParser(option_list = option_list)) | ||
bed_intersect <- opt$bed_intersect | ||
|
||
# establish base dir | ||
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) | ||
|
||
# Set module and results directory | ||
module_dir <- file.path(root_dir, "analyses", "methylation-probe-annotations") | ||
results_dir <- file.path(module_dir, "results") | ||
|
||
|
||
# Parse bedtools GENCODE gene fetures and CpG liftover intersect bed file | ||
|
||
readr::read_tsv( | ||
file.path(results_dir, | ||
"infinium.gencode.v39.probe.annotations.bedtools.tsv.gz"), | ||
col_names = FALSE) |> | ||
dplyr::select(X1, X2, X4, X8, X6, X7, X9, X10, X11, X12) |> | ||
dplyr::rename(Chromosome = X1, Location = X2, Probe_ID = X4, | ||
Gene_Feature = X8, Start = X6, End = X7, Strand = X9, | ||
transcript_id = X10, targetFromSourceId = X11, | ||
Gene_symbol = X12) |> | ||
dplyr::distinct() |> | ||
dplyr::arrange(Chromosome, Location) |> | ||
dplyr::mutate(Gene_Feature = case_when(Gene_Feature == "." ~ "intergenic", | ||
TRUE ~ Gene_Feature), | ||
Start = case_when(Start == -1 ~ NA_integer_, | ||
TRUE ~ as.integer(Start)), | ||
End = case_when(End == -1 ~ NA_integer_, | ||
TRUE ~ as.integer(End)), | ||
Strand = case_when(Strand == "." ~ NA_character_, | ||
TRUE ~ Strand), | ||
transcript_id = case_when(transcript_id == "." ~ NA_character_, | ||
TRUE ~ transcript_id), | ||
targetFromSourceId = case_when(targetFromSourceId == "." ~ NA_character_, | ||
TRUE ~ targetFromSourceId), | ||
Gene_symbol = case_when(Gene_symbol == "." ~ NA_character_, | ||
TRUE ~ Gene_symbol)) |> | ||
readr::write_tsv( | ||
file.path(results_dir, "infinium.gencode.v39.probe.annotations.tsv.gz")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# OpenPedCan Methylation Array Probe Annotations | ||
|
||
## Purpose | ||
|
||
This module create Illumina Infinium Human Methylation array CpG probe annotations based on the `Human Build 38 (GRCh38/hg38)` and `GENCODE v39 release`. | ||
The [450K and EPIC Illumina Infinium methylation array CpG probe coordinates](https://support.illumina.com/array/array_kits/infinium-methylationepic-beadchip-kit/downloads.html) are based on the `Human Build 37 (GRCh37/hg19)` genome assembly. | ||
Probe coordinates are converted to `Human Build 38 (GRCh38/hg38)` using the [ENSEMBL Assembly Converter tool](https://useast.ensembl.org/Homo_sapiens/Tools/AssemblyConverter). | ||
A probe annotation file, `infinium.gencode.v39.probe.annotations.tsv` is created by annotating all probes lifted over with associated gene features (i.e., `promoter`, `5' UTR`, `exon`, `intron`, `3'UTR`, `three_prime_UTR_extension`, and `intergenic`) based on [GENCODE v39 release](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_39/) that is currently utilized in the OpenPedCan data analyses. | ||
Intron coordinates, typically not included in the GFF3/GTF genome annotation formats, are added to the GENCODE annotations file using [GenomeTools](http://genometools.org/). Probe locations are then assigned with their intersecting gene annotation features using [bedtools](https://bedtools.readthedocs.io/en/latest/content/bedtools-suite.html). | ||
|
||
|
||
## Analysis scripts | ||
1. **`01-create-bed-files.R`** script create GENCODE gene features and Illumina infinium methylation array CpG probe coordinates input BED files | ||
|
||
``` | ||
Usage: 01-create-bed-files.R [options] | ||
Options: | ||
--gencode_gtf=CHARACTER | ||
GENCODE GTF | ||
--gencode_gff=CHARACTER | ||
GENCODE GFF with intron coordinates | ||
--cpg_map=CHARACTER | ||
Illumina infinium methylation array CpG probe coordinates | ||
-h, --help | ||
Show this help message and exit | ||
``` | ||
|
||
2. **`02-parse-bedtools-intersect.R`** script reformats the bedtools intersect of the GENCODE gene features and lifted over (GRCh38 to GRCh38) CpG probe coordinates to final OpenPedCan methylation array probe annotations. | ||
``` | ||
Usage: 02-parse-bedtools-intersect.R [options] | ||
Options: | ||
--bed_intersect=CHARACTER | ||
GENCODE gene fetures and CpG liftover intersect bed file | ||
-h, --help | ||
Show this help message and exit | ||
``` | ||
|
||
3. `run-methyl-probe-annotation.sh` is a wrapper bash script for executing all the other analysis scripts in the module, including creating the GENCODE GFF file inserted intron coordinates and the primary assignment of CpG probe locations to GENCODE gene features. | ||
``` | ||
bash run-methyl-probe-annotation.sh | ||
``` | ||
|
||
## Input datasets | ||
OpenPedCan-analysis/blob/dev/download-data.sh). | ||
- `../../data/gencode.v39.primary_assembly.annotation.gtf.gz` | ||
- `../../data/infinium-methylationepic-v-1-0-b5-manifest-file-csv.zip` | ||
- `inputs/gencode.v39.primary_assembly.annotation.gff3.gz` | ||
- `inputs/gencode.v39.primary_assembly.introns.annotation.gff3.gz` | ||
- `inputs/gencode.v39.primary_assembly.gene_features.annotation.bed` | ||
- `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed` | ||
- `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed` | ||
|
||
**`NOTE:`** The GpG probe coordinates liftover process is performed by running the running `01-create-bed-files.R` script to create GRCh37 array probe CpG BED file, `inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch37.bed`, which is then uploaded to the [ENSEMBL Assembly Converter tool](https://useast.ensembl.org/Homo_sapiens/Tools/AssemblyConverter) to generate GRCh38 array probe CpG BED file,`inputs/infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed`. | ||
|
||
|
||
## Output datasets | ||
- `results/infinium.gencode.v39.probe.annotations.bedtools.tsv.gz` | ||
- `results/infinium.gencode.v39.probe.annotations.tsv.gz` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Ignore everything in this directory | ||
* | ||
# Except this file | ||
!.gitignore | ||
!gencode.v39.primary_assembly.annotation.gff3.gz | ||
!infinium-methylationepic-v-1-0-b5-manifest-file-grch38.bed |
Binary file added
BIN
+59.9 MB
...yses/methylation-probe-annotations/inputs/gencode.v39.primary_assembly.annotation.gff3.gz
Binary file not shown.
Oops, something went wrong.