Skip to content

Commit

Permalink
FIx cc_outl issue
Browse files Browse the repository at this point in the history
  • Loading branch information
BrunoVilela committed Oct 23, 2023
1 parent cea8cfc commit abec7cd
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 142 deletions.
266 changes: 140 additions & 126 deletions R/cc_outl.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@
#' cc_outl(x, method = "quantile", value = "flagged")
#' cc_outl(x, method = "distance", value = "flagged", tdi = 10000)
#' cc_outl(x, method = "distance", value = "flagged", tdi = 1000)
#' plot(x[, -1], col = cc_outl(x, method = "distance", value = "flagged", tdi = 1000) + 1)
#'
#' @export
#' @importFrom geosphere distm distHaversine
Expand Down Expand Up @@ -157,131 +156,16 @@ cc_outl <- function(x,


# identify points for flagging
flags <- lapply(splist, function(k) {

# Set raster flag inc ase thinning= TRUE,
# or this particualr species has 100000 or more records
if (nrow(k) >= 10000 | thinning) {
raster_flag <- TRUE
} else {
raster_flag <- FALSE
}

# calculate the distance matrix between all points for the outlier tests
## for small datasets and without thinning,
## simply a distance matrix using geospheric distance
if (raster_flag) {
# raster approximation for large datasets and thinning
# get a distance matrix of raster midpoints, with the row
# and colnames giving the cell IDs

# if the raster distance is used due to large dataset and
# not for thinning, account for the number of points per gridcell
if (thinning) {
dist_obj <- ras_dist(x = k,
lat = lat,
lon = lon,
ras = ras,
weights = FALSE)

# the point IDS
pts <- dist_obj$pts

# the distance matrix
dist <- dist_obj$dist

} else {
dist_obj <- ras_dist(x = k,
lat = lat,
lon = lon,
ras = ras,
weights = TRUE)

# the point IDS
pts <- dist_obj$pts

# the distance matrix
dist <- dist_obj$dist

# a weight matrix to weight each distance by the number of points in it
wm <- dist_obj$wm
}
} else {
#distance calculation
dist <- geosphere::distm(k[, c(lon, lat)],
fun = geosphere::distHaversine) / 1000

# set diagonale to NA, so it does not influence the mean
dist[dist == 0] <- NA
}

# calculate the outliers for the different methods
## distance method useing absolute distance
if (method == "distance") {
# Drop an error if the sampling correction is activated
if (sampling_thresh > 0) {
stop("Sampling correction impossible for method 'distance'" )
}

# calculate the minimum distance to any next point
mins <- apply(dist, 1, min, na.rm = TRUE)

# outliers based on absolute distance threshold
out <- which(mins > tdi)
} else { # for the other methods the mean must be claculated
# depending on if the raster method is used
# get row means
if (raster_flag & !thinning) {
# get mean distance to all other points
mins <- apply(dist, 1, sum, na.rm = TRUE) / rowSums(wm, na.rm = TRUE)
} else {
# get mean distance to all other points
mins <- apply(dist, 1, mean, na.rm = TRUE)
}
}

## the quantile method
if (method == "quantile") {
# identify the quaniles
quo <- quantile(mins, c(0.25, 0.75), na.rm = TRUE)

# flag all upper outliers
out <- which(mins > (quo[2] + stats::IQR(mins) * mltpl))
}

## the mad method
if (method == "mad") {
# get the median
quo <- stats::median(mins, na.rm = TRUE)

# get the mad stat
tester <- stats::mad(mins, na.rm = TRUE)

# Identify outliers
out <- which(mins > (quo + tester * mltpl))
}

# Identify the outlier points depending on
# if the raster approximation was used
if (raster_flag) {
# create output object
if (length(out) == 0) {
ret <- NA
}
if (length(out) > 0) {
ret <- rownames(k)[which(pts %in% gsub("X", "", names(out)))]
}
}else{
# create output object
if (length(out) == 0) {
ret <- NA
}
if (length(out) > 0) {
ret <- rownames(k)[out]
}
}
return(ret)
})
flags <- lapply(splist,
.flagging,
thinning = thinning,
lon = lon,
lat = lat,
method = method,
mltpl = mltpl,
ras = ras,
sampling_thresh = sampling_thresh,
tdi = tdi)

# turn to numeric and remove NAs
flags <- as.numeric(as.vector(unlist(flags)))
Expand Down Expand Up @@ -371,3 +255,133 @@ cc_outl <- function(x,
ids = return(init_ids[flags]))
}
}


####------

.flagging <- function(k, thinning, lon, lat, method, mltpl, ras,
sampling_thresh, tdi) {

# Set raster flag inc ase thinning= TRUE,
# or this particualr species has 100000 or more records
if (nrow(k) >= 10000 | thinning) {
raster_flag <- TRUE
} else {
raster_flag <- FALSE
}

# calculate the distance matrix between all points for the outlier tests
## for small datasets and without thinning,
## simply a distance matrix using geospheric distance
if (raster_flag) {
# raster approximation for large datasets and thinning
# get a distance matrix of raster midpoints, with the row
# and colnames giving the cell IDs

# if the raster distance is used due to large dataset and
# not for thinning, account for the number of points per gridcell
if (thinning) {
dist_obj <- ras_dist(x = k,
lat = lat,
lon = lon,
ras = ras,
weights = FALSE)

# the point IDS
pts <- dist_obj$pts

# the distance matrix
dist <- dist_obj$dist

} else {
dist_obj <- ras_dist(x = k,
lat = lat,
lon = lon,
ras = ras,
weights = TRUE)

# the point IDS
pts <- dist_obj$pts

# the distance matrix
dist <- dist_obj$dist

# a weight matrix to weight each distance by the number of points in it
wm <- dist_obj$wm
}
} else {
#distance calculation
dist <- geosphere::distm(k[, c(lon, lat)],
fun = geosphere::distHaversine) / 1000

# set diagonale to NA, so it does not influence the mean
dist[dist == 0] <- NA
}

# calculate the outliers for the different methods
## distance method useing absolute distance
if (method == "distance") {
# Drop an error if the sampling correction is activated
if (sampling_thresh > 0) {
stop("Sampling correction impossible for method 'distance'" )
}

# calculate the minimum distance to any next point
mins <- apply(dist, 1, min, na.rm = TRUE)

# outliers based on absolute distance threshold
out <- which(mins > tdi)
} else { # for the other methods the mean must be claculated
# depending on if the raster method is used
# get row means
if (raster_flag & !thinning) {
# get mean distance to all other points
mins <- apply(dist, 1, sum, na.rm = TRUE) / rowSums(wm, na.rm = TRUE)
} else {
# get mean distance to all other points
mins <- apply(dist, 1, mean, na.rm = TRUE)
}
}

## the quantile method
if (method == "quantile") {
# identify the quaniles
quo <- quantile(mins, c(0.25, 0.75), na.rm = TRUE)

# flag all upper outliers
out <- which(mins > (quo[2] + stats::IQR(mins) * mltpl))
}

## the mad method
if (method == "mad") {
# get the median
quo <- stats::median(mins, na.rm = TRUE)

# get the mad stat
tester <- stats::mad(mins, na.rm = TRUE)

# Identify outliers
out <- which(mins > (quo + tester * mltpl))
}

# Identify the outlier points depending on
# if the raster approximation was used
if (raster_flag) {
# create output object
if (length(out) == 0) {
ret <- NA
}
if (length(out) > 0) {
ret <- rownames(k)[which(pts %in% gsub("X", "", names(out)))]
}
}else{
# create output object
if (length(out) == 0) {
ret <- NA
}
if (length(out) > 0) {
ret <- rownames(k)[out]
}
}
return(ret)
}
41 changes: 26 additions & 15 deletions R/internal_clean_coordinate.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,28 @@ reproj <- function(ref) {

ras_create <- function(x, lat, lon, thinning_res){
# get data extend
ex <- terra::ext(terra::vect(x[, c(lon, lat)], geom=c(lon, lat))) + thinning_res * 2
ex <- terra::ext(terra::vect(x[, c(lon, lat)],
geom = c(lon, lat))) + thinning_res * 2

# check for boundary conditions
if(ex[1] < -180 | ex[2] > 180 | ex[3] < -90 | ex[4] >90){
if (ex[1] < -180 | ex[2] > 180 | ex[3] < -90 | ex[4] > 90) {
warning("fixing raster boundaries, assuming lat/lon projection")

if(ex[1] < -180){ex[1] <- -180}
if (ex[1] < -180) {
ex[1] <- -180
}

if(ex[2] > 180){ex[2] <- 180}
if (ex[2] > 180) {
ex[2] <- 180
}

if(ex[3] < -90){ex[3] <- -90}
if (ex[3] < -90) {
ex[3] <- -90
}

if(ex[4] > 90){ex[4] <- 90}
if (ex[4] > 90) {
ex[4] <- 90
}
}

# create raster
Expand All @@ -60,22 +69,24 @@ ras_create <- function(x, lat, lon, thinning_res){
#output a data.frame with the distances and the cell IDs as row and column names for cc_outl

ras_dist <- function(x, lat, lon, ras, weights) {
# x = a data.frame of point coordinates, ras = a raster with cell IDs as layer,
#weight = logical, shall the distance matrix be weightened by the number of points per cell?
# assign each point to a raster cell
#x = a data.frame of point coordinates, ras = a raster with cell IDs as layer,
#weight = logical, shall the distance matrix be weightened by the number of
#points per cell? assign each point to a raster cell
pts <- terra::extract(x = ras,
y = terra::vect(x[, c(lon, lat)],
geom = c(lon, lat),
crs = ras))

# convert to data.frame
midp <- data.frame(terra::as.points(ras))
midp <- data.frame(terra::as.points(ras),
terra::xyFromCell(ras, 1:terra::ncell(ras)))

# retain only cells that contain points
midp <- midp[midp$lyr.1 %in% unique(pts$lyr.1),]
midp <- midp[midp$lyr.1 %in% unique(pts$lyr.1), , drop = FALSE]

# order
midp <- midp[match(unique(pts$lyr.1), midp$lyr.1),]
midp <- midp[match(unique(pts$lyr.1), midp$lyr.1), , drop = FALSE]


# calculate geospheric distance between raster cells with points
dist <- geosphere::distm(midp[, c("x", "y")],
Expand All @@ -85,18 +96,18 @@ ras_dist <- function(x, lat, lon, ras, weights) {
dist <- as.data.frame(dist, row.names = as.integer(midp$lyr.1))
names(dist) <- midp$lyr.1

if(weights){
if (weights) {
# approximate within cell distance as half
# the cell size, assumin 1 deg = 100km
# this is crude, but doesn't really matter
dist[dist == 0] <- 100 * mean(terra::res(ras)) / 2

# weight matrix to account for the number of points per cell
## the number of points in each cell
cou <- table(pts)
cou <- table(pts$lyr.1)

## order
cou <- cou[match(unique(pts), names(cou))]
cou <- cou[match(unique(pts$lyr.1), names(cou))]

# weight matrix, representing the number of distances between or within the cellse (points cell 1 * points cell 2)
wm <- outer(cou, cou)
Expand Down
1 change: 0 additions & 1 deletion man/cc_outl.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit abec7cd

Please sign in to comment.