From 2e10668f178efdadbecd88cab06ad204af0479dc Mon Sep 17 00:00:00 2001
From: iosefa <ipercival@gmail.com>
Date: Wed, 27 Nov 2024 09:43:29 -1000
Subject: [PATCH] Remove author and update paper content to address reviewer
 comments.

Removed Ryan Perroy from author list. Enhanced the summary and statement of need sections to clarify functionalities and emphasize forest structure metric calculation. Clarified the challenges associated with size and the features that the library provides. Added new sections: usage and contributing.
---
 paper/paper.bib | 76 ++++++++++++++++++++++++++++++++++++++++++++++++-
 paper/paper.md  | 19 +++++++++----
 2 files changed, 88 insertions(+), 7 deletions(-)

diff --git a/paper/paper.bib b/paper/paper.bib
index 143a89a..895c59e 100644
--- a/paper/paper.bib
+++ b/paper/paper.bib
@@ -291,4 +291,78 @@ @software{howard_butler_2024_13993879
   version      = {2.8.1},
   doi          = {10.5281/zenodo.13993879},
   url          = {https://doi.org/10.5281/zenodo.13993879}
-}
\ No newline at end of file
+}
+
+@software{manning_entwine,
+  author       = {Manning, Connor},
+  title        = {Entwine: Open Source Point Cloud Indexing},
+  year         = {2024},
+  url          = {https://entwine.io/},
+  note         = {Latest version accessed on 2024-11-22}
+}
+
+@article{doi:10.1111/2041-210X.13901,
+  author       = {Borowiec, Marek L. and Dikow, Rebecca B. and Frandsen, Paul B. and McKeeken, Alexander and Valentini, Gabriele and White, Alexander E.},
+  title        = {Deep learning as a tool for ecology and evolution},
+  journal      = {Methods in Ecology and Evolution},
+  volume       = {13},
+  number       = {8},
+  pages        = {1640--1660},
+  keywords     = {artificial intelligence, automation, computer vision, machine learning, modelling, neural networks, statistics},
+  doi          = {10.1111/2041-210X.13901},
+  url          = {https://besjournals.onlinelibrary.wiley.com/doi/abs/10.1111/2041-210X.13901},
+  eprint       = {https://besjournals.onlinelibrary.wiley.com/doi/pdf/10.1111/2041-210X.13901},
+  abstract     = {Deep learning is driving recent advances behind many everyday technologies, including speech and image recognition, natural language processing and autonomous driving. It is also gaining popularity in biology, where it has been used for automated species identification, environmental monitoring, ecological modelling, behavioural studies, DNA sequencing and population genetics and phylogenetics, among other applications. Deep learning relies on artificial neural networks for predictive modelling and excels at recognizing complex patterns. In this review we synthesize 818 studies using deep learning in the context of ecology and evolution to give a discipline-wide perspective necessary to promote a rethinking of inference approaches in the field. We provide an introduction to machine learning and contrast it with mechanistic inference, followed by a gentle primer on deep learning. We review the applications of deep learning in ecology and evolution and discuss its limitations and efforts to overcome them. We also provide a practical primer for biologists interested in including deep learning in their toolkit and identify its possible future applications. We find that deep learning is being rapidly adopted in ecology and evolution, with 589 studies (64\%) published since the beginning of 2019. Most use convolutional neural networks (496 studies) and supervised learning for image identification but also for tasks using molecular data, sounds, environmental data or video as input. More sophisticated uses of deep learning in biology are also beginning to appear. Operating within the machine learning paradigm, deep learning can be viewed as an alternative to mechanistic modelling. It has desirable properties of good performance and scaling with increasing complexity, while posing unique challenges such as sensitivity to bias in input data. We expect that rapid adoption of deep learning in ecology and evolution will continue, especially in automation of biodiversity monitoring and discovery and inference from genetic data. Increased use of unsupervised learning for discovery and visualization of clusters and gaps, simplification of multi-step analysis pipelines, and integration of machine learning into graduate and postgraduate training are all likely in the near future.},
+  year         = {2022}
+}
+
+@article{doi:10.1111/2041-210X.14040,
+  author       = {Atkins, Jeff W. and Costanza, Jennifer and Dahlin, Kyla M. and Dannenberg, Matthew P. and Elmore, Andrew J. and Fitzpatrick, Matthew C. and Hakkenberg, Christopher R. and Hardiman, Brady S. and Kamoske, Aaron and LaRue, Elizabeth A. and Silva, Carlos Alberto and Stovall, Atticus E. L. and Tielens, Elske K.},
+  title        = {Scale dependency of lidar-derived forest structural diversity},
+  journal      = {Methods in Ecology and Evolution},
+  volume       = {14},
+  number       = {2},
+  pages        = {708--723},
+  keywords     = {ecosystem structure, forest structure, forestry, lidar, remote sensing, representative elementary area, scaling},
+  doi          = {10.1111/2041-210X.14040},
+  url          = {https://besjournals.onlinelibrary.wiley.com/doi/abs/10.1111/2041-210X.14040},
+  eprint       = {https://besjournals.onlinelibrary.wiley.com/doi/pdf/10.1111/2041-210X.14040},
+  abstract     = {Lidar-derived forest structural diversity (FSD) metrics—including measures of forest canopy height, vegetation arrangement, canopy cover (CC), structural complexity and leaf area and density—are increasingly used to describe forest structural characteristics and can be used to infer many ecosystem functions. Despite broad adoption, the importance of spatial resolution (grain and extent) over which these structural metrics are calculated remains largely unconsidered. Often researchers will quantify FSD at the spatial grain size of the process of interest without considering the scale dependency or statistical behaviour of the FSD metric employed. We investigated the appropriate scale of inference for eight lidar-derived spatial metrics—CC, canopy relief ratio, foliar height diversity, leaf area index, mean and median canopy height, mean outer canopy height, and rugosity (RT)—representing five FSD categories—canopy arrangement, CC, canopy height, leaf area and density, and canopy complexity. Optimal scale was determined using the representative elementary area (REA) concept whereby the REA is the smallest grain size representative of the extent. Structural metrics were calculated at increasing canopy spatial grain (from 5 to 1000 m) from aerial lidar data collected at nine different forested ecosystems including sub-boreal, broadleaf temperate, needleleaf temperate, dry tropical, woodland and savanna systems, all sites are part of the National Ecological Observatory Network within the conterminous United States. To identify the REA of each FSD metric, we used changepoint analysis via segmented or piecewise regression which identifies significant changepoints for both the magnitude and variance of each metric. We find that using a spatial grain size between 25 and 75 m sufficiently captures the REA of CC, canopy arrangement, canopy leaf area and canopy complexity metrics across multiple forest types and a grain size of 30–150 m captures the REA of canopy height metrics. However, differences were evident among forest types with higher REA necessary to characterize CC in evergreen needleleaf forests, and canopy height in deciduous broadleaved forests. These findings indicate the appropriate range of spatial grain sizes from which inferences can be drawn from this set of FSD metrics, informing the use of lidar-derived structural metrics for research and management applications.},
+  year         = {2023}
+}
+
+@article{BUTLER2021104680,
+  author       = {Butler, Howard and Chambers, Bradley and Hartzell, Preston and Glennie, Craig},
+  title        = {{PDAL}: An open source library for the processing and analysis of point clouds},
+  journal      = {Computers \& Geosciences},
+  volume       = {148},
+  pages        = {104680},
+  year         = {2021},
+  issn         = {0098-3004},
+  doi          = {10.1016/j.cageo.2020.104680},
+  url          = {https://www.sciencedirect.com/science/article/pii/S0098300420306518},
+  keywords     = {Point clouds, Lidar, Open source software, Geospatial, Iterative closest point},
+  abstract     = {As large point cloud datasets become ubiquitous in the Earth science community, open source libraries and software dedicated to manipulating these data are valuable tools for geospatial scientists and practitioners. We highlight an open source library called the Point Data Abstraction Library, more commonly referred to by its acronym: PDAL. PDAL provides a standalone application for point cloud processing, a C++ library for development of new point cloud applications, and support for Python, MATLAB, Julia, and Java languages. Central to PDAL are the concepts of stages, which implement core capabilities for reading, writing, and filtering point cloud data, and pipelines, which are end-to-end workflows composed of sequential stages for transforming point clouds. We review the motivation for PDAL’s genesis, describe its general structure and functionality, detail several options for conveniently accessing PDAL’s functionality, and provide an example that uses PDAL’s Python extension to estimate earthquake surface deformation from pre- and post-event airborne laser scanning point cloud data using an iterative closest point algorithm.}
+}
+
+@article{DUBAYAH2020100002,
+  author       = {Dubayah, Ralph and Blair, James Bryan and Goetz, Scott and Fatoyinbo, Lola and Hansen, Matthew and Healey, Sean and Hofton, Michelle and Hurtt, George and Kellner, James and Luthcke, Scott and Armston, John and Tang, Hao and Duncanson, Laura and Hancock, Steven and Jantz, Patrick and Marselis, Suzanne and Patterson, Paul L. and Qi, Wenlu and Silva, Carlos},
+  title        = {The {Global Ecosystem Dynamics Investigation}: High-resolution laser ranging of the {Earth’s} forests and topography},
+  journal      = {Science of Remote Sensing},
+  volume       = {1},
+  pages        = {100002},
+  year         = {2020},
+  issn         = {2666-0172},
+  doi          = {10.1016/j.srs.2020.100002},
+  url          = {https://www.sciencedirect.com/science/article/pii/S2666017220300018},
+  keywords     = {Lidar, Ecosystem structure, GEDI, Biomass},
+  abstract     = {Obtaining accurate and widespread measurements of the vertical structure of the Earth’s forests has been a long-sought goal for the ecological community. Such observations are critical for accurately assessing the existing biomass of forests, and how changes in this biomass caused by human activities or variations in climate may impact atmospheric CO2 concentrations. Additionally, the three-dimensional structure of forests is a key component of habitat quality and biodiversity at local to regional scales. The Global Ecosystem Dynamics Investigation (GEDI) was launched to the International Space Station in late 2018 to provide high-quality measurements of forest vertical structure in temperate and tropical forests between 51.6° N \& S latitude. The GEDI instrument is a geodetic-class laser altimeter/waveform lidar comprised of 3 lasers that produce 8 transects of structural information. Over its two-year nominal lifetime GEDI is anticipated to provide over 10 billion waveforms at a footprint resolution of 25 m. These data will be used to derive a variety of footprint and gridded products, including canopy height, canopy foliar profiles, Leaf Area Index (LAI), sub-canopy topography and biomass. Additionally, data from GEDI are used to demonstrate the efficacy of its measurements for prognostic ecosystem modeling, habit and biodiversity studies, and for fusion using radar and other remote sensing instruments. GEDI science and technology are unique: no other space-based mission has been created that is specifically optimized for retrieving vegetation vertical structure. As such, GEDI promises to advance our understanding of the importance of canopy vertical variations within an ecological paradigm based on structure, composition and function.}
+}
+
+@software{copc_format,
+  author       = {Butler, Howard and {Contributors}},
+  title        = {{Cloud Optimized Point Cloud} ({COPC})},
+  year         = {2021},
+  url          = {https://copc.io/},
+  note         = {Latest version accessed on 2024-11-22}
+}
diff --git a/paper/paper.md b/paper/paper.md
index b36e93d..d84604e 100644
--- a/paper/paper.md
+++ b/paper/paper.md
@@ -15,9 +15,6 @@ authors:
   - name: Benjamin Palsa Leamon
     orcid: 0009-0002-4614-2322
     affiliation: "3"
-  - name: Ryan Perroy
-    orcid: 0000-0002-4210-3281
-    affiliation: "1,2"
 affiliations:
  - name: Department of Geography & Environmental Science, University of Hawai‘i at Hilo, Hilo, HI 96720, USA
    index: 1
@@ -32,14 +29,17 @@ bibliography: paper.bib
 
 # Summary
 
-PyForestScan is an open-source Python library designed for calculating forest structural metrics from lidar point cloud data at scale. The software supports input formats including .las, .laz, and .copc files, efficiently handles large-scale lidar datasets, and calculates key ecological metrics such as foliage height diversity (FHD), plant area density (PAD), canopy height, and plant area index (PAI). In addition to metrics computation, the library supports the import and generation of digital terrain models, the generation of GeoTIFF outputs, and integrates with geospatial libraries like PDAL [@howard_butler_2024_13993879], making it a valuable tool for forest monitoring, carbon accounting, and ecological research.
+PyForestScan is an open-source Python library designed for calculating forest structural metrics from Light Detection and Ranging (lidar) point cloud data at scale. The software calculates key ecological metrics such as foliage height diversity (FHD), plant area density (PAD), canopy height, and plant area index (PAI), generates digital terrain models (DTMs), efficiently handles large-scale lidar datasets, and supports input formats including the Entwine Point Tile (EPT) format as well as .las, .laz, and .copc files. In addition to metrics computation, the library supports the generation of GeoTIFF outputs and integrates with geospatial libraries like the Point Data Abstraction Library (PDAL) [@howard_butler_2024_13993879; @BUTLER2021104680], making it a valuable tool for forest monitoring, carbon accounting, and ecological research.
 
 # Statement of Need
 
-Remote sensing data, particularly Light Detection and Ranging (lidar) data from airborne sensors, are becoming increasingly accessible, offering a detailed understanding of forest ecosystems at fine spatial resolutions. This data is critical for calculating forest structural metrics such as canopy height, canopy cover, plant area index (PAI), plant area density (PAD), foliage height diversity (FHD), as well as digital terrain models (DTM), which are essential for forest management, biodiversity conservation, and carbon accounting [@mcelhinnyForestWoodlandStand2005; @drakeEstimationTropicalForest2002; @pascualRoleImprovedGround2020; @guerra-hernandezUsingBitemporalALS2024; @pascualIntegratedAssessmentCarbon2023; @pascualNewRemoteSensingbased2021a]. However, working with large-scale lidar datasets remains a challenge due to the complexity and size of the data. Additionally, despite Python being a powerful language widely used for geospatial and ecological analysis, there is a notable lack of dedicated, open-source tools within the Python ecosystem specifically designed for calculating these forest structural metrics from lidar data. Calculating these metrics is non-trivial, and several steps are often required to process the point clouds in order to generate these metrics. Existing open source solutions are primarily in the R programming language [@rousselLidRPackageAnalysis2020a;  @rousselAirborneLiDARData2024; @dealmeidaLeafRCalculatesLeaf2021] or are proprietary, computationally intensive, or not flexible enough for the variety of ecological contexts in which these metrics are needed [@lastools; @fusion; @globalmapper]. This gap makes it difficult for researchers, ecologists, and forest managers to integrate lidar-based analysis into their workflows efficiently.
+Remote sensing data, particularly point cloud data from airborne lidar sensors, are becoming increasingly accessible, offering a detailed understanding of forest ecosystems at fine spatial resolutions over large areas. These data are useful for calculating forest structural metrics such as canopy height, canopy cover, PAI, PAD, and FHD, as well as DTMs, which are essential for forest management, biodiversity conservation, and carbon accounting [@mcelhinnyForestWoodlandStand2005; @drakeEstimationTropicalForest2002; @pascualRoleImprovedGround2020; @guerra-hernandezUsingBitemporalALS2024; @pascualIntegratedAssessmentCarbon2023; @pascualNewRemoteSensingbased2021a].
+
+Despite Python's prominence as a powerful language for geospatial and ecological analysis, there is a notable lack of dedicated, open-source tools within the Python ecosystem specifically designed for calculating comprehensive forest structural metrics from airborne lidar point-cloud data. This gap is significant given Python's extensive libraries for data science and its increasingly important role in ecology and deep learning [@doi:10.1111/2041-210X.13901]. Existing open-source solutions that offer some of these metrics are primarily available in the R programming language. For instance, lidR [@rousselLidRPackageAnalysis2020a; @rousselAirborneLiDARData2024] provides functions for point cloud manipulation, metric computation, and visualization but lacks native calculations for foliage height diversity (FHD) and plant area index (PAI). Another tool, leafR [@dealmeidaLeafRCalculatesLeaf2021], calculates FHD, leaf area index (LAI), and leaf area density (LAD)—the latter two being closely related to PAI and PAD, respectively—but is limited in processing large datasets due to the absence of tiling functionality. Moreover, the importance of scale in lidar-based analyses of forest structure is well-documented [@doi:10.1111/2041-210X.14040], and leafR does not allow users to modify voxel depth, which can be important for accurate estimation of structural metrics across different forest types and scales. Similarly, canopyLazR [@kamoskeLeafAreaDensity2019] provides tools to calculate LAD and LAI from point cloud lidar data but is limited to these two metrics and also lacks support for tiling mechanisms, limiting its applicability to large datasets. Proprietary solutions like LAStools [@lastools], FUSION [@fusion], and Global Mapper [@globalmapper] offer tools to calculate some of these metrics—mostly canopy height—but may not provide the flexibility required for diverse ecological contexts and are often inaccessible due to licensing costs. This lack of a comprehensive, scalable Python-based solution makes it challenging for researchers, ecologists, and forest managers to integrate point-cloud-based analysis into their Python workflows efficiently. This is particularly problematic when working with large datasets or when integrating analyses with other Python-based tools, such as those used for processing space-based waveform lidar data from the Global Ecosystem Dynamics Investigation (GEDI) mission [@DUBAYAH2020100002]<!-- TODO: also cite ATBD -->, which also provides data on PAI, plant area volume density (PAVD), and FHD.
 
-PyForestScan was developed to fill this gap by providing an open-source, Python-based solution that can handle the complexities of lidar data while remaining accessible and efficient. Designed for point clouds captured by airborne lidar and points generated from structure from motion (SfM), it supports commonly used formats such as .las, .laz, and .copc, and integrates with well-established geospatial frameworks for point clouds like Point Cloud Data Abstraction Library (PDAL) [@howard_butler_2024_13993879]. The more mathematically intensive calculations of PAD, PAI, and FHD are calculated following established methods by @kamoskeLeafAreaDensity2019 and @hurlbertNonconceptSpeciesDiversity1971, and are given by equations (1) - (3). PyForestScan provides tiling mechanisms to calculate metrics across large landscapes, IO support across multiple formats, point cloud processing tools to filter points and create ground surfaces, as well as simple visualization functions for core metrics. PyForestScan brings this functionality to Python, while also introducing capabilities not found in any single existing open-source software. These include canopy height, PAD, PAI, FHD, DTM, and advanced tiling functionality that efficiently handles large datasets enabling analysis of forest metrics across large landscapes. By focusing on forest structural metrics, PyForestScan provides an essential tool for the growing need to analyze forest structure at scale in the context of environmental monitoring, conservation, and climate-related research.
+In addition to the lack of Python-based software for calculating forest structural metrics like PAI, PAD, and FHD, working with large-scale point clouds remains a challenge due to complexities inherent in the size of the data. Lidar datasets can vary widely in point density—from about 2–3 points per square meter in airborne surveys covering vast landscapes to upwards of 2,000 points per square meter in high-resolution drone-based surveys—potentially resulting in terabytes of data. To manage these large volumes, datasets are typically divided into fixed-size tiles, which must be individually loaded into memory for analysis. This approach can introduce inflexibility because analyses may need to conform strictly to tile boundaries, potentially causing boundary effects when calculating metrics that span across tiles. While tools like lidR can handle tiling and mitigate boundary effects natively, they do not fully leverage the advanced spatial indexing provided by formats like EPT [@manning_entwine] and Cloud Optimized Point Cloud (COPC) [@copc_format]. Additionally, fixed tile sizes may not align with varying memory constraints or specific workflow needs, limiting the ability to adjust tile sizes dynamically based on data density and processing requirements. For example, extracting point clouds over specific polygons within tiles or performing exploratory data analysis over a large region consisting of several tiles can be overly time-consuming, as it often requires reading all data into memory.
 
+PyForestScan was developed to fill this gap by providing an open-source, Python-based solution to calculate forest structural metrics that can handle large-scale point-cloud data while remaining accessible and efficient. By leveraging the IO capabilities of PDAL, it handles large-scale analyses by allowing users to work with more efficient point-cloud data structures, such as spatially indexed hierarchical octree formats like EPT or COPC. In addition to lidar-based point clouds, it can also process point clouds derived from structure-from-motion (SfM) in open-canopy forests, provided the SfM data includes a sufficient density of points to capture the full vertical profile of the forest. PyForestScan supports commonly used formats such as .las and .laz, as well as more efficient formats such as COPC and EPT, and integrates with well-established geospatial frameworks for point clouds like PDAL [@howard_butler_2024_13993879; @BUTLER2021104680]. The more mathematically intensive metrics—PAD, PAI, and FHD—are calculated following established methods by @kamoskeLeafAreaDensity2019 and @hurlbertNonconceptSpeciesDiversity1971, and are given by equations (1) - (3). PyForestScan provides native tiling mechanisms to calculate metrics across large landscapes, IO support across multiple formats, point cloud processing tools to filter points and create ground surfaces, as well as simple visualization functions for core metrics. PyForestScan brings this functionality to Python, while also introducing capabilities not found in any single existing open-source software. By focusing on forest structural metrics, PyForestScan provides an essential tool for the growing need to analyze forest structure at scale in the context of environmental monitoring, conservation, and climate-related research.
 
 $$
   PAD_{i-1,i} = \ln\left(\frac{S_e}{S_t}\right) \frac{1}{k \Delta z}
@@ -62,6 +62,13 @@ Equation (2) represents PAI as the vertical summation of PAD across all layers $
 
 In equation (3), $FHD$ is calculated as the Shannon entropy of the vertical distribution of plant material across all layers of the canopy. $p_i$ is the proportion of total plant material in each voxel $i$, relative to the entire vertical column, with $n$ representing the number of vertical layers. 
 
+# Usage
+
+To facilitate usage of the software, we have included software documentation and Jupyter notebooks with example usage.
+
+# Contributions
+J.E.H.P. developed the concept with input from B.P.; J.E.H.P. wrote the software and automated tests; B.P. and J.E.H.P. wrote the software documentation; B.P. created Jupyter notebooks for example usage; J.E.H.P. and B.P. wrote the manuscript.
+
 # Acknowledgements
 
 We would like to express our gratitude to Juan Guerra-Hernandez and Adrian Pascual for providing a noise-free classified point cloud dataset [@guerra-hernandezHighresolutionCanopyHeight2024], which was instrumental in testing and validating the PyForestScan library. This work was enabled in part by funding from the National Science Foundation award: 2149133. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation.