Skip to content

Commit

Permalink
t Merge branch 'master' of github.com:AlexanderLabWHOI/EUKulele
Browse files Browse the repository at this point in the history
  • Loading branch information
akrinos committed Jan 16, 2023
2 parents 646bcfb + 2a52287 commit ecafcb4
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 1 deletion.
2 changes: 1 addition & 1 deletion docs/source/databaseandconfig.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Four databases can be downloaded and formatted automatically when invoking ``EUK
- `PhyloDB <https://drive.google.com/drive/u/0/folders/0B-BsLZUMHrDQfldGeDRIUHNZMEREY0g3ekpEZFhrTDlQSjQtbm5heC1QX2V6TUxBeFlOejQ>`_
- `EukProt <https://figshare.com/articles/EukProt_a_database_of_genome-scale_predicted_proteins_across_the_diversity_of_eukaryotic_life/12417881/2>`_
- `MMETSP <https://zenodo.org/record/1212585#.Xw3PoJNKhTZ>`_
- `MMETSP <https://zenodo.org/record/1212585#.Xw3PoJNKhTZ>`_ and `MMETSP <https://mmp.sfb.uit.no/databases/marref/#/>`_ *Default*
- `MMETSP <https://zenodo.org/record/1212585#.Xw3PoJNKhTZ>`_ and `MarRef <https://mmp.sfb.uit.no/databases/marref/#/>`_ *Default*
- `EukZoo <https://github.com/zxl124/EukZoo-database>`_

Note that the MMETSP database is generated using cleaned MMETSP assemblies originally derived from, but not identical to, the assemblies stored in full at the link above. In order to download the cleaned assemblies used to create the ``EUKulele`` MMETSP database, please follow the instructions recorded in `this GitHub repository <https://github.com/shu251/download-cleaned-mmetsp>`_.
Expand Down
54 changes: 54 additions & 0 deletions scripts/process-output.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
pacman::p_load(ggplot2,dplyr,data.table)

## FUNCTIONS ##

#' Join EUKulele taxonomy estimates to Salmon quantification tables.
#'
#' For every file found in `<eukulele_dir>/taxonomy_estimation`, the
#' `full_classification` column is split on ";" into eight taxonomic levels,
#' the result is full-joined to the matching Salmon `quant.sf` table, and the
#' joined rows are appended to `all_results`.
#'
#' @param eukulele_dir Directory containing a `taxonomy_estimation` folder.
#' @param label_out Value written to the `Type` column of every joined row.
#' @param all_results Data frame of previously accumulated results (may be
#'   an empty `data.frame()`); new rows are appended below it.
#' @param salmon_dir Directory holding one
#'   `<salmon_format><prefix>_quant/quant.sf` file per estimate file, where
#'   `<prefix>` is the part of the estimate file name before the first "_".
#' @param salmon_format Prefix of the per-sample Salmon directory names.
#' @param is_protein If `TRUE`, `transcript_name` is split on ".p" into
#'   `transcript_name` and `protein_id` before joining.
#' @return `all_results` with the rows for every processed file appended.
process_sub_eukulele <- function(eukulele_dir, label_out, all_results, salmon_dir,
                                 salmon_format = "num", is_protein = FALSE) {
  tax_levels <- c("Domain", "Supergroup", "Phylum", "Class", "Order",
                  "Family", "Genus", "Species")
  curr_dir <- file.path(eukulele_dir, "taxonomy_estimation")
  estimate_files <- list.files(curr_dir)

  # Collect one joined table per file and bind once at the end instead of
  # growing `all_results` inside the loop.
  per_file <- vector("list", length(estimate_files))
  for (i in seq_along(estimate_files)) {
    curr_file <- estimate_files[[i]]
    eukulele_estimates <- data.frame(
      data.table::fread(file.path(curr_dir, curr_file), sep = "\t")
    )

    # Rows without any classification are labelled explicitly.
    classification <- as.character(eukulele_estimates[["full_classification"]])
    classification[is.na(classification)] <- "Unclassified"
    eukulele_estimates[["full_classification"]] <- classification

    # fill = "right" pads short lineages with NA exactly as the default does,
    # but without emitting a warning for every short row.
    eukulele_estimates <- tidyr::separate(eukulele_estimates,
                                          full_classification,
                                          sep = ";",
                                          into = tax_levels,
                                          fill = "right")

    # Strip the whitespace left around each level by the ";" split. The
    # columns of the table being processed are trimmed in place (the previous
    # code read them from an undefined `EUKulele_out` object).
    eukulele_estimates[tax_levels] <- lapply(
      eukulele_estimates[tax_levels],
      function(level) trimws(as.character(level))
    )

    if (is_protein) {
      eukulele_estimates <- tidyr::separate(eukulele_estimates,
                                            transcript_name,
                                            sep = "\\.p",
                                            into = c("transcript_name",
                                                     "protein_id"))
    }

    # The text before the first "_" in the file name selects the Salmon run.
    number_file <- unlist(strsplit(curr_file, "_"))[1]
    salmon_file <- read.csv(
      file.path(salmon_dir,
                paste0(salmon_format, as.character(number_file), "_quant"),
                "quant.sf"),
      sep = "\t"
    )

    matched_file <- dplyr::full_join(salmon_file, eukulele_estimates,
                                     by = c("Name" = "transcript_name"))
    matched_file["Sample"] <- curr_file
    matched_file["SplitSamp"] <- number_file
    matched_file["Type"] <- label_out

    # Rows present only in the Salmon table, and levels missing from short
    # lineages, are marked "Uncertain".
    matched_file <- tidyr::replace_na(
      matched_file,
      as.list(setNames(rep("Uncertain", length(tax_levels)), tax_levels))
    )
    per_file[[i]] <- matched_file
  }

  dplyr::bind_rows(all_results, per_file)
}

# Start from an empty table and append the joined Salmon/EUKulele rows for
# the metazoan-database run, with protein-level transcript names.
all_results <- data.frame()
all_results <- process_sub_eukulele(
  eukulele_dir = "CAG_eukulele_pleuro_metazoans",
  label_out = "Metazoan_DB",
  all_results = all_results,
  salmon_dir = "/vortexfs1/omics/alexander/akrinos/2021-remodeling-eukrhythmic/2021-09-ALOHA/intermediate-files/04-compare/09-CAG-mapping/salmon",
  salmon_format = "num",
  is_protein = TRUE
)

0 comments on commit ecafcb4

Please sign in to comment.