-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_Ring_width_read_and_clean.R
235 lines (185 loc) · 10.1 KB
/
1_Ring_width_read_and_clean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
##----------------------------------------------------------------------------------------------------------------------------------#
# Script by: Brendon Reidy
# Special thanks: LLM AI for help writing the function for removing 0's and replacing them with NA's
# Project: 2024 REU project / Living Collections Wood Archive
# Purpose: To read raw data generated by manual measurements of tree ring width using the Tellervo software.
# This script utilizes the dplR package (https://rdrr.io/cran/dplR/) to read files containing ring-width measurements
# and their associated metadata housed on the Forest Ecology Lab Google Drive. It then puts these data into
# a form usable for analysis.
# Inputs: Raw data from measurements collected by Forest Ecology Lab staff. These inputs include .CSV,
# .JSON, or .XML files in the TRiDaS format. This script utilizes .XML files. Files are generated
# using the software Tellervo (http://www.tellervo.org/), designed to collect and manage dendrochronological data and samples.
# Outputs: comb1.rwl
# Notes:
#---# At the time of writing, newly collected data are not automatically uploaded onto the Forest Ecology Lab Google Drive.
#---# Data generated after the writing of this script will need to be uploaded to the Forest Ecology Lab Drive.
#---# This script presupposes a file path local to the user's Google Drive where the data input is housed.
#---# Line 114 of this script removes a column due to user error in data entry; once this error is corrected that line
# must be removed.
######################################
library(dplR)
# Set working directory # CR: we ALWAYS want the working directory to be where our repository is; we want to set up a path to where to read/write files
# Until pathing errors are fixed on Drive, this will need to be the location of these data on the user's local drive
# setwd("~/Google Drive/My Drive/2024_REU_crossdate/Quercus RW Tridas")
# setwd("~/Google Drive/My Drive/URF REU 2024 - Chiong - Oaks/Data/Raw Ring Widths/Quercus REU 2024 - Second Pull/Quercus RW Tridas/")
# This should be your path but is not working on Miranda's file stream so commented out
path.dat <- "~/Google Drive/My Drive/URF REU 2024 - Chiong - Oaks/Data/Raw Ring Widths/Quercus REU 2024 - Second Pull/Quercus RW Tridas/"
path.out <- "~/Google Drive/My Drive/URF REU 2024 - Chiong - Oaks/Data/Raw Ring Widths/organized"
# Paths to make this work on Miranda's computer. Path is local ONLY to Miranda's computer
# path.dat <- "~/Desktop/Data/Raw Ring Widths/Quercus REU 2024 - Second Pull/Quercus RW Tridas/"
# path.out <- "~/Desktop/Data/Raw Ring Widths/organized"
# Create the output folder if missing; use TRUE, never the reassignable shorthand T
if(!dir.exists(path.out)) dir.create(path.out, recursive = TRUE)
# Function to process TRIDAS file # CR: This was good, but not using the function to make it easier to store metadata
# process.tridas.file <- function(file) {
# tridas.data <- read.tridas(file, ids.from.titles = TRUE, ids.from.identifiers = TRUE,
# combine.series = TRUE, trim.whitespace = TRUE, warn.units = TRUE)
# measurements <- tridas.data$measurements
# rwl.object <- as.rwl(measurements)
# return(rwl.object)
# }
# There is an encoding error in several of the files; this
# function cleans non-UTF-8 characters from a file
clean.utf8 <- function(file) {
  # Read the file verbatim as raw bytes so malformed characters survive the read
  n.bytes <- file.info(file)$size
  raw.bytes <- readBin(file, "raw", n.bytes)
  # Re-encode from latin1 to UTF-8, silently dropping anything unmappable
  converted <- iconv(rawToChar(raw.bytes), from = "latin1", to = "UTF-8", sub = "")
  # Write the cleaned text to a throwaway .xml file and hand back its path
  out.file <- tempfile(fileext = ".xml")
  writeLines(converted, out.file, useBytes = TRUE)
  out.file
}
# This section lists the XML files to check and ensure the data are consistent.
# It then cleans and processes these files and stores them as .RWL objects
# tridas.files <- list.files(pattern = "\\.xml$", full.names = TRUE)
tridas.files <- dir(path.dat, "\\.xml$") # Anchored regex: an unescaped "." matches ANY character and an unanchored pattern matches anywhere in the name, so ".xml" could pick up non-XML files
print(tridas.files) # Should list all XML files, expecting 112 from first pull, 126 from second pull
# Create an empty list for storing RWL objects
rwl.objects <- list()
# Adding a data frame to store key metadata (one row per input file)
series.metadata <- data.frame(file=tridas.files, site=NA, taxon=NA, treeID=NA, core=NA, radius=NA, measurement=NA, year.first=NA, year.last=NA, first0Flag=NA, last0Flag=NA)
# first/last 0 Flag being if there are leading/trailing 0s we need to clean up and redate
# names(tridas.data)
# Problem files: QUERCS-134-U61-11-1-CR1b.xml, QUERCS-134-U61-11-1-CR1b.xml
# Good File: UNKNWN-UNK-169-11-1-MC1.xml
fBAD <- "QUERCS-134-U61-11-1-CR1b.xml" # This is our outlier in later stages
fGOOD <- "UNKNWN-UNK-169-11-1-MC1.xml" # This is one from Miranda which isn't that weird
# Loop through each file, clean it, and process it:
# fix the encoding, read it with dplR::read.tridas, store the ring widths, and record metadata
for (FILE in tridas.files) {
# Row of series.metadata that corresponds to this file
rowFile <- which(series.metadata$file==FILE)
# Strip non-UTF-8 bytes first; several raw files have encoding problems that break the XML read
cleaned.file <- clean.utf8(file=file.path(path.dat, FILE))
# checking for errors with read-in text visible in the console
# tridas.data <- tryCatch({
# process.tridas.file(file=cleaned.file)
# }, error = function(e) {
# message("Error reading file:", FILE, "\n", conditionMessage(e))
# NULL
# })
tridas.data <- read.tridas(cleaned.file, ids.from.titles = TRUE, ids.from.identifiers = TRUE,
combine.series = TRUE, trim.whitespace = TRUE, warn.units = TRUE)
# summary(data.frame(tridas.data$measurements))
# series.metadata[rowFile,]
# tridas.data$unit
# NOTE(review): assumes a series whose values are all < 1 with declared unit "millimetres"
# was recorded at the wrong scale and rescales it by 100 -- TODO confirm the true unit.
# Also note max() returns NA if any measurement is NA, which would silently skip this branch.
if(max(tridas.data$measurements)<1 & tridas.data$unit=="millimetres") tridas.data$measurements <- tridas.data$measurements*100 # Things are being weird, but for now lets just hack this fix in
# measurements <- tridas.data$measurements
# Store the ring-width series as an rwl object, keyed by file name
rwl.objects[[FILE]] <- as.rwl(tridas.data$measurements)
# Record site/taxon/series identifiers pulled from the TRiDaS metadata
series.metadata[rowFile, "site"] <- tridas.data$site.title$site.title
series.metadata[rowFile, "taxon"] <- tridas.data$taxon$normal
series.metadata[rowFile, c("treeID", "core", "radius", "measurement")] <- tridas.data$titles
# NOTE(review): range() on character row names compares lexicographically; this matches
# numeric order only while all years have the same digit count (e.g. 4-digit years) -- confirm
series.metadata[rowFile, c("year.first", "year.last")] <- as.numeric(range(row.names(tridas.data$measurements)))
# if (!is.null(tridas.data)) {
# rwl.objects[[file]] <- tridas.data
# cat("RWL object for", file, "created.\n")
# }
}
# Quick sanity checks: expect one metadata row and one RWL object per input file
dim(series.metadata)
length(rwl.objects)
summary(series.metadata)
#####
#####
# Identifying problem cores up front
#####
# Flag series whose last measured year is implausible: ending at/before 1900 or at/after the current year
rowBad <- which(series.metadata$year.last<=1900 | series.metadata$year.last>=lubridate::year(Sys.Date()))
series.metadata[rowBad,]
filesBAD <- series.metadata$file[rowBad]
rwlList <- rwl.objects[!names(rwl.objects) %in% filesBAD] # Just keeping the good ones
# Checking on our weirdo and our okay reference
# NOTE(review): if fBAD or fGOOD ended up in filesBAD these [[ ]] lookups return NULL -- verify both survived the filter
summary(rwlList[[fBAD]])
summary(rwlList[[fGOOD]])
rwlList[[fBAD]][1:20,]
rwlList[[fGOOD]][1:20,]
# Compare early-growth means of the two reference series
mean(rwlList[[fBAD]][1:10,])
mean(rwlList[[fGOOD]][1:10,])
# The measurements look similar, or the good one should be higher!
# summary(rwlList[,c(fBAD, fGOOD)])
#####
#####
# This next section combines the individual RWL objects into a single data
# frame, aligning the measurements of every series on a shared "year" axis
#####
# Union of every year observed across the retained series, ascending
all.years <- sort(as.numeric(unique(unlist(lapply(rwlList, rownames)))))
# Seed the combined table with the year axis; one column per series is appended below
combined.rwl <- data.frame(year = all.years)
# Align each series against the master year axis; years a series lacks become NA
for (file in names(rwlList)) {
  measurements <- rwlList[[file]]
  series.years <- as.numeric(rownames(measurements))
  # Position of each master year within this series (NA where the series has no ring)
  idx <- match(combined.rwl$year, series.years)
  # Append the aligned measurements as a new column named after the file
  combined.rwl[[basename(file)]] <- measurements[idx, 1]
}
# Print combined RWL object names to check
print(colnames(combined.rwl)[1:25])
print(tail(colnames(combined.rwl)))
dim(combined.rwl)
# Promote the "year" column to row names, then drop it from the data columns.
# Using $<- NULL is a safe no-op if "year" is ever absent, whereas the previous
# `-which(colnames(...)=="year")` indexing would silently drop EVERY column in
# that case (a negative zero-length index selects nothing).
rownames(combined.rwl) <- combined.rwl$year
combined.rwl$year <- NULL
# check
# print(colnames(combined.rwl))
head(colnames(combined.rwl))
dim(combined.rwl)
# View(combined.rwl)
summary(combined.rwl[, 1:10])
# Checking on our two reference series --> still looks okay
summary(combined.rwl[, c(fBAD, fGOOD)])
combined.rwl[1:20, c(fBAD, fGOOD)]
# Get rid of leading and trailing 0s in our series
# What Brendon had was a good start, but this will be a bit more robust
for (i in seq_len(ncol(combined.rwl))) {
  fNow <- names(combined.rwl)[i]
  rowsDat <- which(!is.na(combined.rwl[, i]))  # rows with any data (incl. zeros)
  rowsMeas <- which(combined.rwl[, i] > 0)     # rows with real measurements; which() drops NAs
  if (length(rowsDat) == 0 || length(rowsMeas) == 0) {
    # Empty or all-zero series: nothing to trim; leave both flags NA rather
    # than hit min(integer(0)) == Inf warnings as the old code would
    next
  }
  rowfirstDat <- min(rowsDat)
  rowlastDat <- max(rowsDat)
  rowfirstMeas <- min(rowsMeas)
  rowlastMeas <- max(rowsMeas)
  # Deal with leading 0s
  if (rowfirstDat == rowfirstMeas) {
    series.metadata$first0Flag[series.metadata$file == fNow] <- FALSE
  } else {
    print(paste0(fNow, ": removing leading 0s"))
    series.metadata$first0Flag[series.metadata$file == fNow] <- TRUE
    # Blank the WHOLE run of leading zeros (rowfirstDat through the row just
    # before the first real measurement). The old `1:rowfirstDat` only blanked
    # the first zero, leaving any additional leading zeros in place.
    combined.rwl[rowfirstDat:(rowfirstMeas - 1), i] <- NA
  }
  # Deal with trailing 0s
  if (rowlastDat == rowlastMeas) {
    series.metadata$last0Flag[series.metadata$file == fNow] <- FALSE
  } else {
    print(paste0(fNow, ": removing trailing 0s"))
    series.metadata$last0Flag[series.metadata$file == fNow] <- TRUE
    # Blank the WHOLE run of trailing zeros (just after the last real
    # measurement through the last data row); the old `rowlastDat:nrow(...)`
    # only blanked the final zero onward, leaving earlier trailing zeros.
    combined.rwl[(rowlastMeas + 1):rowlastDat, i] <- NA
  }
}
# Trim out rows (years) where every series is missing
rows.dat <- apply(combined.rwl, 1, FUN = function(x) { !all(is.na(x)) })
combined.rwl <- combined.rwl[rows.dat, ]
combined.rwl[1:20, c(fBAD, fGOOD)]
# Print out cores that need leading or trailing 0s fixed (to be redated)
series.metadata[!is.na(series.metadata$first0Flag) & (series.metadata$first0Flag | series.metadata$last0Flag), ]
# Cores needing attention: either a zero-trim flag was raised, or the series
# ends at an implausible year (same criterion used to build filesBAD above)
cores.check <- series.metadata[(!is.na(series.metadata$first0Flag) & (series.metadata$first0Flag | series.metadata$last0Flag)) |
                                 (series.metadata$year.last <= 1900 | series.metadata$year.last >= lubridate::year(Sys.Date())), ]
# Spell out TRUE/FALSE (never T/F, which are reassignable) in the write calls
write.csv(cores.check, file.path(path.out, "Series-with-Errors.csv"), row.names = FALSE)
# Now also write out the metadata and compiled data
write.csv(series.metadata, file.path(path.out, "Series-Metadata_all.csv"), row.names = FALSE)
# Keep row names here: they carry the calendar years for each measurement row
write.csv(combined.rwl, file.path(path.out, "Series-Measurements_all.csv"), row.names = TRUE)