-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_Ring_width_read_and_clean.R
235 lines (185 loc) · 10.1 KB
/
1_Ring_width_read_and_clean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
##----------------------------------------------------------------------------------------------------------------------------------#
# Script by: Brendon Reidy
# Special thanks: LLM AI for help writing the function for removing 0's and replacing them with NA's
# Project: 2024 REU project / Living Collections Wood Archive
# Purpose: To read raw data generated by manual measurements of tree ring width using the Tellervo software.
# This script utilizes the dplR package (https://rdrr.io/cran/dplR/) to read files containing ring-width measurements
# and their associated metadata housed on the Forest Ecology Lab Google Drive. It then puts these data into
# a form usable for analysis.
# Inputs: Raw data from measurements collected by Forest Ecology Lab staff. These inputs include .CSV,
# .JSON, or .XML files in the TRiDaS format. This script utilizes .XML files. Files are generated
# using the software Tellervo (http://www.tellervo.org/), designed to collect and manage dendrochronological data and samples.
# Outputs: comb1.rwl
# Notes:
#---# At the time of writing, newly collected data are not automatically uploaded onto the Forest Ecology Lab Google Drive.
#---# Data generated after the writing of this script will need to be uploaded to the Forest Ecology Lab Drive.
#---# This script presupposes a file path local to the user's Google Drive where the data input is housed.
#---# Line 114 of this script removes a column due to user error in data entry; once this error is corrected that line
# must be removed.
######################################
library(dplR)
# Set working directory # CR: we ALWAYS want the working directory to be where our repository is; we want to set up a path to where to read/write files
# Until pathing errors are fixed on Drive, this will need to be the location of these data on the user's local drive
# setwd("~/Google Drive/My Drive/2024_REU_crossdate/Quercus RW Tridas")
# setwd("~/Google Drive/My Drive/URF REU 2024 - Chiong - Oaks/Data/Raw Ring Widths/Quercus REU 2024 - Second Pull/Quercus RW Tridas/")
# This should be your path but is not working on Miranda's file stream so commented out
path.dat <- "~/Google Drive/My Drive/URF REU 2024 - Chiong - Oaks/Data/Raw Ring Widths/Quercus REU 2024 - Second Pull/Quercus RW Tridas/"
path.out <- "~/Google Drive/My Drive/URF REU 2024 - Chiong - Oaks/Data/Raw Ring Widths/organized"
# Paths to make this work on Miranda's computer. Path is local ONLY to Miranda's computer
# path.dat <- "~/Desktop/Data/Raw Ring Widths/Quercus REU 2024 - Second Pull/Quercus RW Tridas/"
# path.out <- "~/Desktop/Data/Raw Ring Widths/organized"
# Create the output folder if missing; use TRUE, never the reassignable shorthand T
if(!dir.exists(path.out)) dir.create(path.out, recursive = TRUE)
# Function to process TRIDAS file # CR: This was good, but not using the function to make it easier to store metadata
# process.tridas.file <- function(file) {
# tridas.data <- read.tridas(file, ids.from.titles = TRUE, ids.from.identifiers = TRUE,
# combine.series = TRUE, trim.whitespace = TRUE, warn.units = TRUE)
# measurements <- tridas.data$measurements
# rwl.object <- as.rwl(measurements)
# return(rwl.object)
# }
# There is an encoding error in several of the files; this
# function cleans non-UTF-8 characters from a file
clean.utf8 <- function(file) {
  # Read the file verbatim as raw bytes so malformed characters survive the read
  n.bytes <- file.info(file)$size
  raw.bytes <- readBin(file, "raw", n.bytes)
  # Re-encode from latin1 to UTF-8, silently dropping anything unmappable
  converted <- iconv(rawToChar(raw.bytes), from = "latin1", to = "UTF-8", sub = "")
  # Write the cleaned text to a throwaway .xml file and hand back its path
  out.file <- tempfile(fileext = ".xml")
  writeLines(converted, out.file, useBytes = TRUE)
  out.file
}
# This section lists the XML files to check and ensure the data are consistent.
# It then cleans and processes these files and stores them as .RWL objects
# tridas.files <- list.files(pattern = "\\.xml$", full.names = TRUE)
tridas.files <- dir(path.dat, "\\.xml$") # Anchored regex: an unescaped "." matches ANY character and an unanchored pattern matches anywhere in the name, so ".xml" could pick up non-XML files
print(tridas.files) # Should list all XML files, expecting 112 from first pull, 126 from second pull
# Create an empty list for storing RWL objects
rwl.objects <- list()
# Adding a data frame to store key metadata (one row per input file)
series.metadata <- data.frame(file=tridas.files, site=NA, taxon=NA, treeID=NA, core=NA, radius=NA, measurement=NA, year.first=NA, year.last=NA, first0Flag=NA, last0Flag=NA)
# first/last 0 Flag being if there are leading/trailing 0s we need to clean up and redate
# names(tridas.data)
# Problem files: QUERCS-134-U61-11-1-CR1b.xml, QUERCS-134-U61-11-1-CR1b.xml
# Good File: UNKNWN-UNK-169-11-1-MC1.xml
fBAD <- "QUERCS-134-U61-11-1-CR1b.xml" # This is our outlier in later stages
fGOOD <- "UNKNWN-UNK-169-11-1-MC1.xml" # This is one from Miranda which isn't that weird
# Loop through each file, clean it, and process it:
# fix the encoding, read it with dplR::read.tridas, store the ring widths, and record metadata
for (FILE in tridas.files) {
# Row of series.metadata that corresponds to this file
rowFile <- which(series.metadata$file==FILE)
# Strip non-UTF-8 bytes first; several raw files have encoding problems that break the XML read
cleaned.file <- clean.utf8(file=file.path(path.dat, FILE))
# checking for errors with read-in text visible in the console
# tridas.data <- tryCatch({
# process.tridas.file(file=cleaned.file)
# }, error = function(e) {
# message("Error reading file:", FILE, "\n", conditionMessage(e))
# NULL
# })
tridas.data <- read.tridas(cleaned.file, ids.from.titles = TRUE, ids.from.identifiers = TRUE,
combine.series = TRUE, trim.whitespace = TRUE, warn.units = TRUE)
# summary(data.frame(tridas.data$measurements))
# series.metadata[rowFile,]
# tridas.data$unit
# NOTE(review): assumes a series whose values are all < 1 with declared unit "millimetres"
# was recorded at the wrong scale and rescales it by 100 -- TODO confirm the true unit.
# Also note max() returns NA if any measurement is NA, which would silently skip this branch.
if(max(tridas.data$measurements)<1 & tridas.data$unit=="millimetres") tridas.data$measurements <- tridas.data$measurements*100 # Things are being weird, but for now lets just hack this fix in
# measurements <- tridas.data$measurements
# Store the ring-width series as an rwl object, keyed by file name
rwl.objects[[FILE]] <- as.rwl(tridas.data$measurements)
# Record site/taxon/series identifiers pulled from the TRiDaS metadata
series.metadata[rowFile, "site"] <- tridas.data$site.title$site.title
series.metadata[rowFile, "taxon"] <- tridas.data$taxon$normal
series.metadata[rowFile, c("treeID", "core", "radius", "measurement")] <- tridas.data$titles
# NOTE(review): range() on character row names compares lexicographically; this matches
# numeric order only while all years have the same digit count (e.g. 4-digit years) -- confirm
series.metadata[rowFile, c("year.first", "year.last")] <- as.numeric(range(row.names(tridas.data$measurements)))
# if (!is.null(tridas.data)) {
# rwl.objects[[file]] <- tridas.data
# cat("RWL object for", file, "created.\n")
# }
}
# Quick sanity checks: expect one metadata row and one RWL object per input file
dim(series.metadata)
length(rwl.objects)
summary(series.metadata)
#####
#####
# Identifying problem cores up front
#####
# Flag series whose last measured year is implausible: ending at/before 1900 or at/after the current year
rowBad <- which(series.metadata$year.last<=1900 | series.metadata$year.last>=lubridate::year(Sys.Date()))
series.metadata[rowBad,]
filesBAD <- series.metadata$file[rowBad]
rwlList <- rwl.objects[!names(rwl.objects) %in% filesBAD] # Just keeping the good ones
# Checking on our weirdo and our okay reference
# NOTE(review): if fBAD or fGOOD ended up in filesBAD these [[ ]] lookups return NULL -- verify both survived the filter
summary(rwlList[[fBAD]])
summary(rwlList[[fGOOD]])
rwlList[[fBAD]][1:20,]
rwlList[[fGOOD]][1:20,]
# Compare early-growth means of the two reference series
mean(rwlList[[fBAD]][1:10,])
mean(rwlList[[fGOOD]][1:10,])
# The measurements look similar, or the good one should be higher!
# summary(rwlList[,c(fBAD, fGOOD)])
#####
#####
# This next section combines the individual RWL objects into a single data
# frame, aligning the measurements of every series on a shared "year" axis
#####
# Union of every year observed across the retained series, ascending
all.years <- sort(as.numeric(unique(unlist(lapply(rwlList, rownames)))))
# Seed the combined table with the year axis; one column per series is appended below
combined.rwl <- data.frame(year = all.years)
# Align each series against the master year axis; years a series lacks become NA
for (file in names(rwlList)) {
  measurements <- rwlList[[file]]
  series.years <- as.numeric(rownames(measurements))
  # Position of each master year within this series (NA where the series has no ring)
  idx <- match(combined.rwl$year, series.years)
  # Append the aligned measurements as a new column named after the file
  combined.rwl[[basename(file)]] <- measurements[idx, 1]
}
# Print combined RWL object names to check
print(colnames(combined.rwl)[1:25])
print(tail(colnames(combined.rwl)))
dim(combined.rwl)
# Promote the "year" column to row names, then drop it from the data columns.
# Using $<- NULL is a safe no-op if "year" is ever absent, whereas the previous
# `-which(colnames(...)=="year")` indexing would silently drop EVERY column in
# that case (a negative zero-length index selects nothing).
rownames(combined.rwl) <- combined.rwl$year
combined.rwl$year <- NULL
# check
# print(colnames(combined.rwl))
head(colnames(combined.rwl))
dim(combined.rwl)
# View(combined.rwl)
summary(combined.rwl[, 1:10])
# Checking on our two reference series --> still looks okay
summary(combined.rwl[, c(fBAD, fGOOD)])
combined.rwl[1:20, c(fBAD, fGOOD)]
# Get rid of leading and trailing 0s in our series
# What Brendon had was a good start, but this will be a bit more robust
for (i in seq_len(ncol(combined.rwl))) {
  fNow <- names(combined.rwl)[i]
  rowsDat <- which(!is.na(combined.rwl[, i]))  # rows with any data (incl. zeros)
  rowsMeas <- which(combined.rwl[, i] > 0)     # rows with real measurements; which() drops NAs
  if (length(rowsDat) == 0 || length(rowsMeas) == 0) {
    # Empty or all-zero series: nothing to trim; leave both flags NA rather
    # than hit min(integer(0)) == Inf warnings as the old code would
    next
  }
  rowfirstDat <- min(rowsDat)
  rowlastDat <- max(rowsDat)
  rowfirstMeas <- min(rowsMeas)
  rowlastMeas <- max(rowsMeas)
  # Deal with leading 0s
  if (rowfirstDat == rowfirstMeas) {
    series.metadata$first0Flag[series.metadata$file == fNow] <- FALSE
  } else {
    print(paste0(fNow, ": removing leading 0s"))
    series.metadata$first0Flag[series.metadata$file == fNow] <- TRUE
    # Blank the WHOLE run of leading zeros (rowfirstDat through the row just
    # before the first real measurement). The old `1:rowfirstDat` only blanked
    # the first zero, leaving any additional leading zeros in place.
    combined.rwl[rowfirstDat:(rowfirstMeas - 1), i] <- NA
  }
  # Deal with trailing 0s
  if (rowlastDat == rowlastMeas) {
    series.metadata$last0Flag[series.metadata$file == fNow] <- FALSE
  } else {
    print(paste0(fNow, ": removing trailing 0s"))
    series.metadata$last0Flag[series.metadata$file == fNow] <- TRUE
    # Blank the WHOLE run of trailing zeros (just after the last real
    # measurement through the last data row); the old `rowlastDat:nrow(...)`
    # only blanked the final zero onward, leaving earlier trailing zeros.
    combined.rwl[(rowlastMeas + 1):rowlastDat, i] <- NA
  }
}
# Trim out rows (years) where every series is missing
rows.dat <- apply(combined.rwl, 1, FUN = function(x) { !all(is.na(x)) })
combined.rwl <- combined.rwl[rows.dat, ]
combined.rwl[1:20, c(fBAD, fGOOD)]
# Print out cores that need leading or trailing 0s fixed (to be redated)
series.metadata[!is.na(series.metadata$first0Flag) & (series.metadata$first0Flag | series.metadata$last0Flag), ]
# Cores needing attention: either a zero-trim flag was raised, or the series
# ends at an implausible year (same criterion used to build filesBAD above)
cores.check <- series.metadata[(!is.na(series.metadata$first0Flag) & (series.metadata$first0Flag | series.metadata$last0Flag)) |
                                 (series.metadata$year.last <= 1900 | series.metadata$year.last >= lubridate::year(Sys.Date())), ]
# Spell out TRUE/FALSE (never T/F, which are reassignable) in the write calls
write.csv(cores.check, file.path(path.out, "Series-with-Errors.csv"), row.names = FALSE)
# Now also write out the metadata and compiled data
write.csv(series.metadata, file.path(path.out, "Series-Metadata_all.csv"), row.names = FALSE)
# Keep row names here: they carry the calendar years for each measurement row
write.csv(combined.rwl, file.path(path.out, "Series-Measurements_all.csv"), row.names = TRUE)