-
Notifications
You must be signed in to change notification settings - Fork 993
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
New function topn #4187
New function topn #4187
Changes from all commits
0807b1d
aae0158
dfbfb28
051886d
dd1772d
535bae8
656ff75
4e24c8f
8322e9b
8f3c977
ded01ce
d2d1921
6f322e4
c22c140
af37e1c
815729a
467c2c4
f34b759
37f503e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE) | |
|
||
fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na) | ||
fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.list(substitute(list(...)))[-1L]) | ||
topn = function(vec, n=6L, decreasing=FALSE) .Call(CtopnR, vec, n, decreasing) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should the default be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I totally don't mind. The way I implemented it was to be in line with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think More practically, in other places in |
||
colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) | ||
coerceFill = function(x) .Call(CcoerceFillR, x) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
\name{topn} | ||
\alias{topn} | ||
\title{ Top N values index} | ||
\description{ | ||
\code{topn} is used to get the indices of the few values of an input. This is an extension of \code{\link{which.max}}/\code{\link{which.min}} which provide \emph{only} the first such index. | ||
|
||
The output is the same as \code{order(vec)[1:n]}, but internally optimized not to sort the irrelevant elements of the input (and therefore much faster, for small \code{n} relative to input size). | ||
} | ||
\usage{ | ||
topn(vec, n=6L, decreasing=FALSE) | ||
} | ||
\arguments{ | ||
\item{vec}{ A numeric vector of type numeric or integer. Other types are not supported yet. } | ||
\item{n}{ A positive integer value greater or equal to 1. Maximum value is 1000. } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we put such a maximum? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because the algorithm is not build fir large number. Try to remove the max and test the function for large number. You will notice a massive reduction in performance above a given threshold. It can even become 100 times slower than |
||
\item{decreasing}{ A logical value (default \code{FALSE}) to indicate whether to sort \code{vec} in decreasing or increasing order. Equivalent to argument \code{decreasing} in function \code{base::order}. } | ||
} | ||
\value{ | ||
\code{integer} vector of indices of the most extreme (according to \code{decreasing}) \code{n} values in vector \code{vec}. | ||
} | ||
\examples{ | ||
x = rnorm(1e6) | ||
|
||
# Example 1: index of top 6 negative values | ||
topn(x, 6L) | ||
order(x)[1:6] | ||
|
||
# Example 2: index of top 6 positive values | ||
topn(x, 6L, decreasing = TRUE) | ||
order(x, decreasing = TRUE)[1:6] | ||
|
||
# Example 3: top 6 negative values | ||
x[topn(x, 6L)] | ||
sort(x)[1:6] | ||
} | ||
\keyword{ data } |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -344,3 +344,194 @@ SEXP fcaseR(SEXP na, SEXP rho, SEXP args) { | |
UNPROTECT(nprotect); | ||
return ans; | ||
} | ||
|
||
SEXP topnR(SEXP vec, SEXP n, SEXP dec) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is it in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Feel free to add it wherever you wish. I just did not want to create too many files. There are already enough :-) |
||
int nprotect = 0; | ||
int64_t i, j, idx = 0; | ||
int len0 = asInteger(n); | ||
const int64_t len1 = xlength(vec); | ||
|
||
if (isS4(vec) && !INHERITS(vec, char_nanotime)) { | ||
error(_("S4 class objects (excluding nanotime) are not supported.")); | ||
} | ||
if (len0 > 1000) { | ||
error(_("Function 'topn' is not built for large value of 'n'. The algorithm is made for small values. Please prefer the 'order' function if you want to proceed with such large value.")); | ||
} | ||
if (len0 > len1) { | ||
warning(_("'n' is larger than length of 'vec'. 'n' will be set to length of 'vec'.")); | ||
len0 = len1; | ||
} | ||
if (len0 < 1) { | ||
error(_("Please enter a positive integer larger or equal to 1.")); | ||
} | ||
if (!IS_TRUE_OR_FALSE(dec)) { | ||
error(_("Argument 'decreasing' must be TRUE or FALSE and length 1.")); | ||
} | ||
|
||
const bool vdec = LOGICAL(dec)[0]; | ||
SEXPTYPE tvec = TYPEOF(vec); | ||
SEXP ans = PROTECT(allocVector(INTSXP, len0)); nprotect++; | ||
int *restrict pans = INTEGER(ans); | ||
int tmp; | ||
|
||
if (vdec) { | ||
switch(tvec) { | ||
case INTSXP: { | ||
const int *restrict pvec = INTEGER(vec); | ||
int min_value = pvec[0]; | ||
for (i = 0; i < len0; ++i) { | ||
pans[i] = i; | ||
if (pvec[i] <= min_value || pvec[i] == NA_INTEGER) { | ||
min_value = pvec[i]; | ||
idx = i; | ||
} | ||
} | ||
for (i = len0; i < len1; ++i) { | ||
if (pvec[i] == NA_INTEGER) { | ||
continue; | ||
} | ||
if (pvec[i] > min_value) { | ||
min_value = pvec[i]; | ||
pans[idx] = i; | ||
for (j = 0; j <len0; ++j) { | ||
if ((min_value > pvec[pans[j]] || (min_value == pvec[pans[j]] && pans[idx] < pans[j])) || pvec[pans[j]] == NA_INTEGER) { | ||
min_value = pvec[pans[j]]; | ||
idx = j; | ||
} | ||
} | ||
} | ||
} | ||
for (i = 0; i < len0; ++i) { | ||
tmp = pans[i]; | ||
for (j = i; j > 0 && (pvec[tmp] > pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1])); --j) { | ||
pans[j] = pans[j-1]; | ||
} | ||
pans[j] = tmp; | ||
} | ||
for (i =0; i < len0; ++i) { | ||
pans[i]++; | ||
} | ||
} break; | ||
case REALSXP: { | ||
const double *restrict pvec = REAL(vec); | ||
double min_value = pvec[0]; | ||
for (i = 0; i < len0; ++i) { | ||
pans[i] = i; | ||
if (pvec[i] <= min_value || ISNAN(pvec[i])) { | ||
min_value = pvec[i]; | ||
idx = i; | ||
} | ||
} | ||
for (i = len0; i < len1; ++i) { | ||
if (ISNAN(pvec[i])) { | ||
continue; | ||
} | ||
if (pvec[i] > min_value || ISNAN(min_value)) { | ||
min_value = pvec[i]; | ||
pans[idx] = i; | ||
for (j = 0; j <len0; ++j) { | ||
if ((min_value > pvec[pans[j]] || (min_value == pvec[pans[j]] && pans[idx] < pans[j])) || ISNAN(pvec[pans[j]])) { | ||
min_value = pvec[pans[j]]; | ||
idx = j; | ||
} | ||
} | ||
} | ||
} | ||
for (i = 0; i < len0; ++i) { | ||
tmp = pans[i]; | ||
for (j = i; j > 0 && (pvec[tmp] > pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1]) || (!ISNAN(pvec[tmp]) && ISNAN(pvec[pans[j-1]]))); --j) { | ||
pans[j] = pans[j-1]; | ||
} | ||
pans[j] = tmp; | ||
} | ||
for (i =0; i < len0; ++i) { | ||
pans[i]++; | ||
} | ||
} break; | ||
default: | ||
error(_("Type %s is not supported."), type2char(tvec)); | ||
} | ||
} else { | ||
switch(tvec) { | ||
case INTSXP: { | ||
const int *restrict pvec = INTEGER(vec); | ||
int min_value = pvec[0]; | ||
for (i = 0; i < len0; ++i) { | ||
pans[i] = i; | ||
if ((pvec[i] >= min_value && min_value != NA_INTEGER) || pvec[i] == NA_INTEGER) { | ||
min_value = pvec[i]; | ||
idx = i; | ||
} | ||
} | ||
for (i = len0; i < len1; ++i) { | ||
if (pvec[i] == NA_INTEGER) { | ||
continue; | ||
} | ||
if (pvec[i] < min_value || min_value == NA_INTEGER) { | ||
min_value = pvec[i]; | ||
pans[idx] = i; | ||
for (j = 0; j <len0; ++j) { | ||
if (((min_value < pvec[pans[j]] || (min_value == pvec[pans[j]] && pans[idx] < pans[j])) && min_value != NA_INTEGER) || pvec[pans[j]] == NA_INTEGER) { | ||
min_value = pvec[pans[j]]; | ||
idx = j; | ||
} | ||
} | ||
} | ||
} | ||
for (i = 0; i < len0; ++i) { | ||
tmp = pans[i]; | ||
if (pvec[tmp] == NA_INTEGER) { | ||
continue; | ||
} | ||
for (j = i; j > 0 && (pvec[tmp] < pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1]) || pvec[pans[j-1]] == NA_INTEGER); --j) { | ||
pans[j] = pans[j-1]; | ||
} | ||
pans[j] = tmp; | ||
} | ||
for (i =0; i < len0; ++i) { | ||
pans[i]++; | ||
} | ||
} break; | ||
case REALSXP: { | ||
const double *restrict pvec = REAL(vec); | ||
double min_value = pvec[0]; | ||
for (i = 0; i < len0; ++i) { | ||
pans[i] = i; | ||
if (pvec[i] >= min_value || ISNAN(pvec[i])) { | ||
min_value = pvec[i]; | ||
idx = i; | ||
} | ||
} | ||
for (i = len0; i < len1; ++i) { | ||
if (ISNAN(pvec[i])) { | ||
continue; | ||
} | ||
if (pvec[i] < min_value || ISNAN(min_value)) { | ||
min_value = pvec[i]; | ||
pans[idx] = i; | ||
for (j = 0; j <len0; ++j) { | ||
if ((min_value < pvec[pans[j]] || (min_value == pvec[pans[j]] && pans[idx] < pans[j])) || ISNAN(pvec[pans[j]])) { | ||
min_value = pvec[pans[j]]; | ||
idx = j; | ||
} | ||
} | ||
} | ||
} | ||
for (i = 0; i < len0; ++i) { | ||
tmp = pans[i]; | ||
for (j = i; j > 0 && (pvec[tmp] < pvec[pans[j-1]] || (pvec[tmp] == pvec[pans[j-1]] && tmp < pans[j-1]) || (!ISNAN(pvec[tmp]) && ISNAN(pvec[pans[j-1]]))); --j) { | ||
pans[j] = pans[j-1]; | ||
} | ||
pans[j] = tmp; | ||
} | ||
for (i =0; i < len0; ++i) { | ||
pans[i]++; | ||
} | ||
} break; | ||
default: | ||
error(_("Type %s is not supported."), type2char(tvec)); | ||
} | ||
} | ||
UNPROTECT(nprotect); | ||
return ans; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We also have
na.last
argument fordata.table:::forder
, we should include that here if we really want to replaceorder(x)[1:n]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
na
are always last, I did not look atforder
but the behaviour oftopn
should be similar toorder
when it comes toNA
.