-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfd_function.R
293 lines (234 loc) · 13.8 KB
/
fd_function.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
## --------------------------------------------------------------
## Name: fd_function.R
## Description: Function to calculate functional diversity indices
## Author: R.S.C. Cooke, [email protected]
## Date: March 2016 -
## Inputs: species data per ecoregion from eco_species.R script
## site = species composition data for each site
## trait data from function 'trait'
## trait = trait data to be used
## Outputs: Function named "fd_eco"
## Options: corr = correction to be applied
## spp_list = save list of all species used as a .csv?
## tree = save a Newick format version of the functional dendrogram used
## --------------------------------------------------------------
fd_eco <- function(site, trait, corr = "cailliez", spp_list = FALSE, tree = FALSE)
{
# Set up required packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(fossil, data.table, FD, clue, dplyr, qpcR, stats, cowplot, ggdendro, ape, dendextend, methods)
# fossil: used to create matrices # calls: create.matrix
# data.table: used to make data.tables for range areas # calls: as.data.table
# FD: calculates functional indices and gower dissimilarity # calls: gowdis, fdisp, functcomp
# clue: used to process dendrograms and clusters # calls: cl_ensemble, cl_consensus, cl_ultrametric, cl_dissimilarity
# dplyr: used to calculate across rows # calls: mutate, filter, transmute
# qpcR: used to calculate RMSE # calls: RMSE
# stats: used for clustering and data handling # calls: hclust, as.dendrogram, aggregate
# cowplot: simplify ggplots # calls: ggplot, plot_grid
# ggdendro: for functional dendrograms # calls: dendro_data
# ape: used to write trees in Newick format # calls: write.tree
# dendextend: used to cut a non-hclust dendrogram # calls: cutree
# methods: used to create slots in output # calls: setClass
#### Set up data ###
# sum the areas of each species per ecoregion
UK_data_sum <- as.data.table(site[c(-3)])[, lapply(.SD, sum), by = list(id_no, binomial, eco_code)] # sum shape areas by species name and ecoregion
UK_data_sum <- as.data.frame(UK_data_sum) # convert data table to data frame
UK_data_sum <- with(UK_data_sum, UK_data_sum[order(binomial),]) # reorder by name of species
# All mammals for presence = extant; origin = native, reintroduced
am12 <- read.csv("ALL_Mammals_1_2.csv")
am12_sum <- aggregate(shape_Area~binomial, am12, FUN=sum)
am12_merge <- merge(UK_data_sum, am12_sum, by.x = "binomial", by.y="binomial")
names(am12_merge) <- c("binomial", "id_no", "eco_code", "eco_spp_area", "total_spp_area")
am12 <- mutate(am12_merge, prop_spp_area = eco_spp_area/total_spp_area*100) # sum number of species per ecoregion per group
# list of unique species
Species <- unique(site[c("id_no", "binomial")])
Species <- with(Species, Species[order(binomial),]) # reorder by name of species
# create speciesxsite matrix
site_m <- create.matrix(site, tax.name="binomial", locality="eco_code")
site_m <- t(site_m) # transpose
### set up TRAITS data ##
trait$id_no <- NULL
# Turn off IUCN species id numbers (keep in data in case they become useful)
trait[trait==-999] <- NA
# convert -999's to NA
trait$activity <- factor(trait$activity) ; trait$terrestriality <- factor(trait$terrestriality) ; trait$trophic <- factor(trait$trophic)
# set activity, terrestriality and trophic level as factors
########### data cleaning for NAs
species_remove <- trait[!rowSums(is.na(trait))<length(trait),] # returns the rows that have all NAs for traits
spp_col <- unique(grep(paste(as.character(rownames(species_remove)),collapse="|"), colnames(site_m))) # finds column numbers in site data for species with all NAs for traits
# total number of species per ecoregion (including species with missing trait data)
spp_total <- apply(site_m, 1, sum)
# missing data species
id_missing <- site_m[,c(spp_col)] # matrix of missing data species per ecoregion
spp_missing <- rowSums(id_missing) # number of missing data species per ecoregion
# edit site data to match species in trait data
site_m <- site_m[,-c(spp_col)] # removes NA species from site data
trait <- trait[rowSums(is.na(trait))<length(trait),] # returns the rows that have at least one non-NA value for trait data
isTRUE(nrow(trait) == ncol(site_m))
# do the site and trait data sets contain the same number of species? - should be TRUE
# number of species per ecoregion
spp_final <- apply(site_m, 1, sum)
# number of ecoregions assessed
e <- dim(site_m)[1]
############### Set up functional data ##############
# combine trait and site data
UK <- list(trait, site_m) ; names(UK) <- c("trait","site")
# calculate species x species distance matrix based on effect traits
gd <- gowdis(UK$trait, ord = c("podani"))
# applied the Podani 1999 correction to account for ordered traits (Lefcheck et al., 2014)
###############################################################################
######### Plot multiple dengrograms of species based on effect traits #########
###############################################################################
hclust_methods <- c("ward.D", "ward.D2", "single", "complete", "average", "mcquitty")
# average = UPGMA, mcquitty = WPGMA
# omitted UPGMC and WPGMC methods because they are not appropriate for non-metric distances (Lefcheck et al., 2014)
hclust_results <- lapply(hclust_methods, function(m) hclust(gd, method = m))
names(hclust_results) <- hclust_methods
######## Testing the performance of different clustering algorithms ##########
########## matrix 2-norm: dissimilarities (Lefcheck et al., 2014) ############
ud <- lapply(hclust_results, function(m) cl_ultrametric(m))
# convert dendrograms to ultrametric
dendro_en <- cl_ensemble(list = hclust_results)
class(dendro_en)
dendro_con <- cl_consensus(dendro_en)
# build consensus dendrogram
all_ultra <- c(ud, dendro_con[1]); names(all_ultra) <- c(hclust_methods, "consensus")
(ul <- lapply(all_ultra, function(x) cl_dissimilarity(x, gd, method = "spectral")))
# calculate dissimilarity values
ul2 <- do.call(rbind, ul)
min_dendro <- which.min(ul2); names(all_ultra)[min_dendro]
# dendrogram with lowest 2-norm value
v <- names(all_ultra)[min_dendro]
# name of dendrogram with lowest 2-norm value
min_dist <- all_ultra[names(all_ultra) == v][[1]]
# distance matrix for dendrogram with lowest 2-norm value
min_dist <- min_dist/max(min_dist)
# scale between 0-1
dend <- dendro_data(as.dendrogram(min_dist), type = "rectangle")
# extract cluster data from min_dist dendrogram
####### Quantifying the number of clusters (k) ############
###### L method (Salvador & Chan, 2004)
mer <- sort(unique(dend$segment[,"y"]), decreasing = TRUE)
# merge heights of clusters
mer2 <- data.frame(c(NA, mer), 1:(length(mer)+1)); names(mer2) <- c("h","k")
# combine merge heights and number of clusters
b <- max(mer2$k) # n = b-1
c <- seq(from = 3, to = b-2, by = 1)
xl <- lapply(c, function(c) seq(from = min(c), to = max(c), by = 1))
xr <- lapply(c, function(c) seq(from = (min(c+1)), to = max((c+1)), by = 1))
lc <- lapply(xl, function(xl) do.call("lm",list(h ~ k, data = quote(mer2), subset = 2:xl)))
rc <- lapply(xr, function(xr) do.call("lm",list(h ~ k, data = quote(mer2), subset = xr:b)))
RMSEl <- lapply(lc, function(lc) RMSE(lc))
RMSEr <- lapply(rc, function(rc) RMSE(rc))
rmsed <- data.frame(unlist(RMSEl), unlist(RMSEr), c, rep(b, length(RMSEl))); names(rmsed) <- c("RMSEl", "RMSEr", "c", "b")
rmsed <- mutate(rmsed, RMSEc = c-1/b-1*RMSEl + b-c/b-1*RMSEr) ## equation Salvador & Chan: RMSEc = c-1/b-1*RMSEl + b-c/b-1*RMSEr
min_rmsed <- filter(rmsed, RMSEc == min(RMSEc))
c_ <- min_rmsed$c
################ Functional redundancy ##################
# find number of groups and return species assignation to groups
e_gr <- cutree(as.dendrogram(min_dist), k = c_)
# p = number of plots
p <- nrow(UK$site)
# c_ = number of effect groups
gr = c_
gr2 <- unique(e_gr)
gr3 <- gr2[order(gr2)]
t2 <- rep(p,gr)
e_group <- rep(gr3, t2)
site <- rep(row.names(UK$site), gr)
e_gr1 <- rep(e_gr, p)
e_gr_m <- matrix(e_gr1, p, length(e_gr), byrow = T, dimnames = list(rownames(UK$site), colnames(UK$site)))
mats <- list()
FRed1 <- data.frame()
for (i in 1:gr){
t <- ifelse(e_gr_m == i, 1, 0)
mats[[i]] <- t * UK$site
FRed1 <- rbind(FRed1, mats[[i]])
}
FRed <- FRed1 %>% transmute(nbsp_gr = rowSums(FRed1)) # sum number of species per ecoregion per group
results <- data.frame(site, e_group, FRed$nbsp); names(results) <- c("site", "group", "FRed")
res <- results[order(results$site),]; row.names(res) <- NULL
########### Functional dispersion #########
FD <- fdisp(gd, UK$site)
########### Community-weighted means ############
CWM <- functcomp(UK$trait, UK$site) # CWM.type = "all" if frequencies of each ordinal class needed
########### Outputs and Plots ################
if (spp_list == TRUE)
cat(write.csv(Species, file = "Species_list.csv", row.names=FALSE)) # export species list as .csv
###### write newick tree
if (tree == TRUE)
write.tree(as.phylo(as.dendrogram(min_dist)),"Functional_dendrogram.new")
###### Plot: performance of clustering algorithms
uld <- data.frame(unlist(ul)); names(uld) <- c("ultra"); rownames(uld)[which(rownames(uld)=="average")]= "UPGMA"; rownames(uld)[which(rownames(uld)=="mcquitty")]= "WPGMA"; rownames(uld)[which(rownames(uld)=="single")]= "Single"; rownames(uld)[which(rownames(uld)=="complete")]= "Complete"; rownames(uld)[which(rownames(uld)=="consensus")]= "Consensus"; rownames(uld)[which(rownames(uld)=="ward.D2")]= "Ward D2"; rownames(uld)[which(rownames(uld)=="ward.D")]= "Ward D"
uld <- uld[order(uld$ultra), , drop = FALSE] # reorder data frame by increasing dissimilarity
clus <- ggplot(data = uld, aes(x = row.names(uld), y = 1/ultra)) +
geom_bar(stat = "identity", fill = "grey") + # adds bars
scale_x_discrete(limits = row.names(uld)) + # maintains order of row names in ul data frame
labs(title = "Performance of clustering algorithms", x = "Linkage function", y = "Spectral norm (2-norm) similarities") + # adds axis titles and main title
theme(axis.text.y = element_text(size=15), # changes size of axis labels
axis.text.x = element_text(size=15), # changes size of axis labels
axis.title.x = element_text(size=17), # changes size of axis title
axis.title.y = element_text(size=17), # changes size of axis title
plot.title = element_text(size = 20)) # changes size of main title
###### Plot: Evaluation plot
eval <- ggplot(data = mer2, aes(k, h)) + # sets up plot
geom_point(size = 2.5, shape = 16) + # adds points
labs(title = "Evaluation plot", x = "Number of clusters", y = "Merging height") + # adds axis titles and main title
geom_vline(aes(xintercept = c_)) + # adds vertical line at c_
annotate("text", x = 0.5*b, y = 0.7*max(mer2$h, na.rm = TRUE), label = paste("RMSE(min): k =", c_), size = 6) + # adds text to centre of plot
scale_y_continuous(breaks = seq(0, 1, by = 0.2)) +
theme(axis.text.y = element_text(size=15), # changes size of axis labels
axis.text.x = element_text(size=15), # changes size of axis labels
axis.title.x = element_text(size=17), # changes size of axis title
axis.title.y = element_text(size=17), # changes size of axis title
plot.title = element_text(size = 20)) # changes size of main title
###### Plot: Functional dendrogram
# Set up dendrogram
dendro_plot <- ggplot(dend$segments) + # sets up plot
geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) + # adds heights of branchs
ylab("") + # remove label for y axis
coord_flip() + # make dendrogram horizontal
scale_y_reverse(expand = c(0,0), breaks = seq(0, 1, by = 0.2)) + # reverse scale so that 0 is on the right
theme(axis.title.y=element_blank(), axis.text.y=element_blank(),
axis.ticks.y=element_blank(), axis.line.y=element_blank(),
axis.ticks.length = unit(0.5, "lines"),
plot.margin = unit(c(1,0,1,1), "lines")) # remove y axis information
# Add species labels separately to ensure axis is clipped to limits
dendro_text <- ggplot(dend$labels) +
scale_y_continuous(limits = c(0,1)) +
geom_text(data = dend$labels, aes(x = x, y = y, label = label),
hjust = 0, size = 4.5) + # adds species names
coord_flip() + # make dendrogram horizontal
theme(axis.title.y=element_blank(), axis.text.y=element_blank(),
axis.ticks.y=element_blank(), axis.line.y=element_blank(),
axis.title.x=element_blank(), axis.text.x=element_blank(),
axis.ticks.x=element_blank(), axis.line.x=element_blank(),
plot.margin = unit(c(1,1,1,0), "lines")) # remove y axis information
# Combine dendrogram and text plots
dendro <- plot_grid(dendro_plot, dendro_text, align="h", rel_widths = c(3,1))
# Session information
session_info <- devtools::session_info()
# Create list of outputs
stats <- list(
ecoregions = e[1], # @stats$ecoregions
spp_total = spp_total, # @stats$spp_total
spp_missing = spp_missing, # @stats$spp_missing
spp_final = spp_final, # @stats$spp_final
k = c_, # @stats$k
CWM = CWM, # @stats$CWM
FRed = res, # @stats$FRed
FDis = FD$FDis) # @stats$FDis
# Create list of plots
plots <- list(
clus = clus, # @plots$clus
eval = eval, # @plots$eval
dendro = dendro) # @plots$dendro
# Create list for session information
session_info <- list(
session_info = session_info
)
# Combine two lists of outputs into slots
result <- setClass("result", slots = c(stats = "list", plots = "list", session_info = "list"))
result <- result(stats = stats, plots = plots, session_info = session_info)
return(result)
} # end of fd_eco function