-
Notifications
You must be signed in to change notification settings - Fork 5
/
term-weights.R
135 lines (132 loc) · 6.8 KB
/
term-weights.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Builds the corpus-specific query parameters for the KB similarity endpoints.
#
# Returns a named list with elements `path`, `subject_filter_property` and
# `subject_filter_value`. Only the corpora "taxa" and "genes" are supported;
# any other value raises an error. Both corpora share the same `path` and
# differ only in the property/value pair used to filter subjects.
similarity_sparql_query <- function(corpus) {
  # Identical for both supported corpora.
  profile_type_path <- "<http://purl.org/phenoscape/vocab.owl#has_phenotypic_profile>/<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>"

  # Select the subject filter that distinguishes the corpus.
  if (corpus == "taxa") {
    filter_property <- "http://www.w3.org/2000/01/rdf-schema#isDefinedBy"
    filter_value <- "http://purl.obolibrary.org/obo/vto.owl"
  } else if (corpus == "genes") {
    filter_property <- "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    filter_value <- "http://purl.org/phenoscape/vocab.owl#AnnotatedGene"
  } else {
    stop("corpus '", corpus, "' is currently unsupported", call. = FALSE)
  }

  list(
    path = profile_type_path,
    subject_filter_property = filter_property,
    subject_filter_value = filter_value
  )
}
#' Obtains term frequencies for the Phenoscape KB
#'
#' Determines the frequencies for the given input list of terms, based on
#' the selected corpus.
#'
#' Depending on the corpus selected, the frequencies are queried directly
#' from pre-computed counts through the KB API, or are calculated based on
#' matching row counts obtained from query results. Currently, the Phenoscape KB
#' has precomputed counts for corpora "taxa" and "genes", and these are the
#' only corpora for which this function returns frequencies. The counts
#' returned by the API are divided by the size of the corpus (see
#' [corpus_size()]) to yield frequencies.
#' @note
#' Term categories being accurate is vital for obtaining correct counts and
#' thus frequencies. In earlier (<=0.2.x) releases, auto-determining term
#' category was an option, but this is no longer supported, in part because it
#' was potentially time consuming and sometimes inaccurate, in particular for
#' the many post-composed subsumer terms returned by [subsumer_matrix()]. In the
#' KB v2.0 API, auto-determining the category of a post-composed term is no
#' longer supported. If the list of terms is legitimately of different categories,
#' determine (and possibly correct) categories beforehand using [term_category()].
#' In earlier (<=0.2.x) releases the "taxon_annotations" corpus was supported, but
#' this is no longer the case due to the inability to determine an accurate
#' count for post-composed terms with the KB v2.0 API. This also means that the
#' only supported value for the `as` parameter is "phenotype" since "entity" and
#' "quality" were only supported for the "taxon_annotations" corpus.
#' @param x a vector or list of one or more terms, either as IRIs or as term
#'   objects.
#' @param as the category or categories of the input terms (see [term_category()]).
#'   Supported categories are "entity", "quality", and "phenotype". (At present,
#'   support for "entity" and "quality" has been disabled as of v0.3.0, pending
#'   full support from the KB API.)
#'   The value must either be a single category (applying to all terms), or a
#'   vector of categories (of same length as `x`). The default, used when the
#'   argument is omitted, is "phenotype".
#' @param corpus the name of the corpus for which to determine frequencies.
#'   Supported values are "taxon_annotations", "taxa", "gene_annotations", and
#'   "genes". (At present, support for "gene_annotations" and
#'   "taxon_annotations" is disabled, pending support in the Phenoscape KB API;
#'   requesting either raises an error.)
#'   The default is "taxa".
#' @param decodeIRI boolean. This parameter is deprecated (as of v0.3.x) and must be set
#'   to FALSE (the default). If TRUE is passed an error will be raised. In v0.2.x
#'   when TRUE this parameter would attempt to decode post-composed entity IRIs.
#'   Due to changes in the IRI returned by the Phenoscape KB v2.x API decoding
#'   post-composed entity IRIs is no longer possible. Prior to v0.3.x, the default
#'   value for this parameter was TRUE.
#' @param ... additional query parameters to be passed to the function querying
#'   for counts, see [pkb_args_to_query()]. This was only ever used for
#'   corpus "taxon_annotations", where the only useful parameter was
#'   `includeRels` (to include historical and serial homologues in the counts,
#'   or to always include parts for entity terms). Because that corpus is
#'   currently disabled, these arguments are presently ignored.
#' @return a vector of frequencies as floating point numbers (between zero
#'   and 1.0), of the same length (and ordering) as the input list of terms.
#'   Terms for which the query result contains no row yield `NA`.
#' @examples
#' phens <- get_phenotypes(entity = "basihyal bone")
#' term_freqs(phens$id, as = "phenotype", corpus = "taxa")
#' term_freqs(phens$id, as = "phenotype", corpus = "genes")
#' @export
term_freqs <- function(x,
                       as = c("phenotype", "entity", "quality"),
                       corpus = c("taxa", "taxon_annotations", "gene_annotations", "genes"),
                       decodeIRI = FALSE,
                       ...) {
  # With several.ok = TRUE, match.arg() returns *all* choices when the
  # argument is left at its default, so the documented default ("phenotype")
  # has to be applied explicitly.
  if (missing(as)) {
    as <- "phenotype"
  } else {
    as <- match.arg(as, several.ok = TRUE)
  }
  corpus <- match.arg(corpus)

  if (decodeIRI) {
    stop("Decoding an IRI is no longer supported.", call. = FALSE)
  }
  if (length(as) > 1 && length(as) != length(x)) {
    stop("'as' must be a single value, or have the same length as 'x'", call. = FALSE)
  }

  # Reject unsupported requests before issuing any request to the KB API.
  if (!(corpus %in% c("taxa", "genes"))) {
    stop("corpus '", corpus, "' is currently unsupported", call. = FALSE)
  }
  if (any(as != "phenotype")) {
    stop("corpus '", corpus, "' requires phenotype terms", call. = FALSE)
  }

  ctotal <- corpus_size(corpus = corpus)
  query <- similarity_sparql_query(corpus)
  query$terms <- as.character(jsonlite::toJSON(x))
  # The first column of the result is used as row names, so that rows can be
  # looked up by term below.
  freqs <- get_csv_data(pkb_api("/similarity/frequency"), query = query,
                        header = FALSE, row.names = 1, check.names = FALSE)
  # Restore the order of the input terms; terms absent from the result
  # map to NA.
  reordering <- match(x, rownames(freqs))
  freqs <- freqs[reordering, ] / ctotal
  unname(freqs)
}
#' Obtain the size of different corpora
#'
#' Obtains the size of a certain number of predefined corpora. The total size
#' of a corpus is important for calculating term frequencies.
#'
#' Corpus sizes are cached per session after they have first been obtained.
#' Thus, if the Phenoscape KB changes, a session needs to be restarted to
#' have those changes be reflected.
#'
#' @param corpus the name of the corpus, currently one of "taxon_annotations",
#'   "taxa", "gene_annotations", and "genes". (At present "gene_annotations" is
#'   pending support by the Phenoscape API, and requesting it raises an error.)
#'   Unambiguous abbreviations are acceptable.
#' @return the size of the specified corpus as an integer number.
#' @examples
#' corpus_size("taxa")
#' corpus_size("taxon_annotations")
#' @export
corpus_size <- local({
  # The cache is an environment rather than a list: environments are modified
  # by reference, so sizes stored from within the function below persist
  # across calls. (Assigning into a list from inside the function would only
  # change a function-local copy, and nothing would ever be cached.)
  .sizes <- new.env(parent = emptyenv())
  function(corpus = c("taxon_annotations", "taxa", "gene_annotations", "genes")) {
    corpus <- match.arg(corpus)
    if (is.null(.sizes[[corpus]])) {
      if (corpus %in% c("taxa", "genes")) {
        query <- similarity_sparql_query(corpus)
        res <- get_json_data(pkb_api("/similarity/corpus_size"), query)
      } else if (corpus == "taxon_annotations") {
        res <- get_json_data(pkb_api("/taxon/annotations"), list(total = TRUE))
      } else {
        stop("corpus '", corpus, "' is currently unsupported", call. = FALSE)
      }
      assign(corpus, res$total, envir = .sizes)
    }
    .sizes[[corpus]]
  }
})