From 747602e36ce76e63df1a7405e17099f0789dc52d Mon Sep 17 00:00:00 2001 From: Ou Ku Date: Tue, 20 Aug 2024 11:37:12 +0200 Subject: [PATCH 1/6] apply changes for joss review --- paper/paper.bib | 69 +++++++++++++++++++++++++++++++++++++++++++++++-- paper/paper.md | 8 +++--- 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 1cba255..4492802 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -6,7 +6,7 @@ @article{shan:2022 pages = {113116}, year = {2022}, issn = {0034-4257}, - doi = {https://doi.org/10.1016/j.rse.2022.113116}, + doi = {10.1016/j.rse.2022.113116}, url = {https://www.sciencedirect.com/science/article/pii/S0034425722002309}, author = {Xu Shan and Susan Steele-Dunne and Manuel Huber and Sebastian Hahn and Wolfgang Wagner and Bertrand Bonan and Clement Albergel and Jean-Christophe Calvet and Ou Ku and Sonja Georgievska}, keywords = {ASCAT, Scatterometry, Radar, Vegetation, Land surface model, Machine learning, Deep Neural Network, Plant water dynamics, Soil moisture}, @@ -31,7 +31,7 @@ @article{XUE:2015 pages = {153-165}, year = {2015}, issn = {0034-4257}, - doi = {https://doi.org/10.1016/j.rse.2015.09.009}, + doi = {doi.org/10.1016/j.rse.2015.09.009}, url = {https://www.sciencedirect.com/science/article/pii/S0034425715301322}, author = {Yuan Xue and Barton A. Forman}, keywords = {Sensitivity analysis, Machine learning, Brightness temperature, Snow, Data assimilation}, @@ -81,3 +81,68 @@ @inproceedings{Rocklin2015DaskPC year={2015}, url={https://api.semanticscholar.org/CorpusID:63554230} } + +@article{REICHLE20081411, +title = {Data assimilation methods in the Earth sciences}, +journal = {Advances in Water Resources}, +volume = {31}, +number = {11}, +pages = {1411-1418}, +year = {2008}, +note = {Hydrologic Remote Sensing}, +issn = {0309-1708}, +doi = {10.1016/j.advwatres.2008.01.001}, +url = {https://www.sciencedirect.com/science/article/pii/S0309170808000043}, +author = {Rolf H. 
Reichle}, +keywords = {Data assimilation, Remote sensing, Land surface hydrology, Variational methods, Kalman filter}, +abstract = {Although remote sensing data are often plentiful, they do not usually satisfy the users’ needs directly. Data assimilation is required to extract information about geophysical fields of interest from the remote sensing observations and to make the data more accessible to users. Remote sensing may provide, for example, measurements of surface soil moisture, snow water equivalent, snow cover, or land surface (skin) temperature. Data assimilation can then be used to estimate variables that are not directly observed from space but are needed for applications, for instance root zone soil moisture or land surface fluxes. The paper provides a brief introduction to modern data assimilation methods in the Earth sciences, their applications, and pertinent research questions. Our general overview is readily accessible to hydrologic remote sensing scientists. Within the general context of Earth science data assimilation, we point to examples of the assimilation of remotely sensed observations in land surface hydrology.} +} + +@article{Carrassi2018, + title={Data assimilation in the geosciences: An overview of methods, issues, and perspectives}, + author={Carrassi, Alberto and Bocquet, Marc and Bertino, Laurent and Evensen, Geir}, + journal={Wiley Interdisciplinary Reviews: Climate Change}, + volume={9}, + number={5}, + pages={e535}, + year={2018}, + publisher={Wiley Online Library}, + doi={10.1002/wcc.535} +} + +@ARTICLE{Evensen2009, + author={Evensen, Geir}, + journal={IEEE Control Systems Magazine}, + title={The ensemble Kalman filter for combined state and parameter estimation}, + year={2009}, + volume={29}, + number={3}, + pages={83-104}, + keywords={Parameter estimation;Particle filters;Atmospheric modeling;Covariance matrix;Nonlinear equations;Error analysis;Kalman filters;Computational modeling;Computational efficiency;Bayesian methods}, + 
doi={10.1109/MCS.2009.932223}} + +@Article{Clement2018, +AUTHOR = {Albergel, Clement and Munier, Simon and Bocher, Aymeric and Bonan, Bertrand and Zheng, Yongjun and Draper, Clara and Leroux, Delphine J. and Calvet, Jean-Christophe}, +TITLE = {LDAS-Monde Sequential Assimilation of Satellite Derived Observations Applied to the Contiguous US: An ERA-5 Driven Reanalysis of the Land Surface Variables}, +JOURNAL = {Remote Sensing}, +VOLUME = {10}, +YEAR = {2018}, +NUMBER = {10}, +ARTICLE-NUMBER = {1627}, +URL = {https://www.mdpi.com/2072-4292/10/10/1627}, +ISSN = {2072-4292}, +ABSTRACT = {Land data assimilation system (LDAS)-Monde, an offline land data assimilation system with global capacity, is applied over the CONtiguous US (CONUS) domain to enhance monitoring accuracy for water and energy states and fluxes. LDAS-Monde ingests satellite-derived surface soil moisture (SSM) and leaf area index (LAI) estimates to constrain the interactions between soil, biosphere, and atmosphere (ISBA) land surface model (LSM) coupled with the CNRM (Centre National de Recherches Météorologiques) version of the total runoff integrating pathways (CTRIP) continental hydrological system (ISBA-CTRIP). LDAS-Monde is forced by the ERA-5 atmospheric reanalysis from the European Center for Medium Range Weather Forecast (ECMWF) from 2010 to 2016 leading to a seven-year, quarter degree spatial resolution offline reanalysis of land surface variables (LSVs) over CONUS. The impact of assimilating LAI and SSM into LDAS-Monde is assessed over North America, by comparison to satellite-driven model estimates of land evapotranspiration from the Global Land Evaporation Amsterdam Model (GLEAM) project, and upscaled ground-based observations of gross primary productivity from the FLUXCOM project. 
Taking advantage of the relatively dense data networks over CONUS, we have also evaluated the impact of the assimilation against in situ measurements of soil moisture from the USCRN (US Climate Reference Network), together with river discharges from the United States Geological Survey (USGS) and the Global Runoff Data Centre (GRDC). Those data sets highlight the added value of assimilating satellite derived observations compared with an open-loop simulation (i.e., no assimilation). It is shown that LDAS-Monde has the ability not only to monitor land surface variables but also to forecast them, by providing improved initial conditions, which impacts persist through time. LDAS-Monde reanalysis also has the potential to be used to monitor extreme events like agricultural drought. Finally, limitations related to LDAS-Monde and current satellite-derived observations are exposed as well as several insights on how to use alternative datasets to analyze soil moisture and vegetation state.}, +DOI = {10.3390/rs10101627} +} + +@article{zhou2008ensemble, + title={An ensemble multiscale filter for large nonlinear data assimilation problems}, + author={Zhou, Yuhua and McLaughlin, Dennis and Entekhabi, Dara and Ng, Gene-Hua Crystal}, + journal={Monthly Weather Review}, + volume={136}, + number={2}, + pages={678--698}, + year={2008}, + publisher={American Meteorological Society}, + DOI = {10.1175/2007MWR2064.1} +} \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md index 620ffb4..7869967 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -34,17 +34,17 @@ bibliography: paper.bib ## Summary -Data assimilation (DA) is an essential procedure in Earth and environmental sciences, enabling physical model states to be constrained using observational data. +Data assimilation (DA) is an essential procedure in Earth and environmental sciences, enabling physical model states to be constrained using observational data. 
[@REICHLE20081411, @Evensen2009, @Clement2018, @Carrassi2018] In the DA process, observations are integrated into the physical model through the application of a Measurement Operator (MO) – a connection model mapping physical model states to observations. Researchers have observed that employing a Machine-Learning (ML) model as a surrogate MO can bypass the limitations associated with using an overly simplified MO [@Forman:2014; @XUE:2015; @Forman:2017]. ## Statement of Need -A surrogate MO, as a ML model is trained with the assumption that a single MO applies when mapping physical model states to observations. When dealing with a large spatio-temporal scale, multiple mapping processes may exist, prompting consideration for training separate MOs for distinct spatial and/or temporal partitions of the dataset. As the number of partitions increases, a challenge arises in distributing these training tasks effectively among the partitions. +The surrogate MO, trained as a Machine-Learning model, is generally considered valid within a specific spatio-temporal range.[@zhou2008ensemble, @REICHLE20081411, @shan:2022] When dealing with a large spatio-temporal scale, multiple mapping processes may exist, prompting consideration for training separate MOs for distinct spatial and/or temporal partitions of the dataset. As the number of partitions increases, a challenge arises in distributing these training tasks effectively among the partitions. -To address this challenge, we developed a novel approach for distributed training of MOs. We present the open Python library `MOTrainer`, which to the best of our knowledge, is the first Python library catering to researchers requiring training independent MOs across extensive spatio-temporal coverage in a distributed manner. `MOTrainer` leverages Xarray's [@Hoyer_xarray_N-D_labeled_2017] support for multi-dimensional datasets to accommodate spatio-temporal features of input/output data of the training tasks. 
It provides user-friendly functionalities implemented with the Dask [@Rocklin2015DaskPC] library, facilitating the partitioning of large spatio-temporal data for independent model training tasks. Additionally, it streamlines the train-test data split based on customized spatio-temporal coordinates. The Jackknife method [@mccuen1998hydrologic] is implemented as an external Cross-Validation (CV) method for Deep Neural Network (DNN) training, with support for Dask parallelization. This feature enables the scaling of training tasks across various computational infrastructures. +To address this challenge, we developed a novel approach for distributed training of MOs. We present the open Python library `MOTrainer`, which to the best of our knowledge, is the first Python library catering to researchers requiring training independent MOs across extensive spatio-temporal coverage in a distributed manner. `MOTrainer` leverages Xarray's [@Hoyer_xarray_N-D_labeled_2017] support for multi-dimensional datasets to accommodate spatio-temporal features of input/output data of the training tasks. It provides user-friendly functionalities implemented with the Dask [@Rocklin2015DaskPC] library, facilitating the partitioning of large spatio-temporal data for independent model training tasks. Additionally, it streamlines the train-test data split based on customized spatio-temporal coordinates. The Jackknife method [@mccuen1998hydrologic] is implemented as an external Cross-Validation method for Deep Neural Network (DNN) training, with support for Dask parallelization. This feature enables the scaling of training tasks across various computational infrastructures. -`MOTrainer` has been employed in a study of vegetation water dynamics [@shan:2022], where it facilitated the mapping of Land-Scape Model (LSM) states to satellite radar observations. 
+`MOTrainer` has been employed in a study of vegetation water dynamics [@shan:2022], where it facilitated the mapping of Land Surface Model states to satellite radar observations. ## Tutorial From 4ee30f3268680ce93d9b0026ba16877e313f179d Mon Sep 17 00:00:00 2001 From: Ou Ku Date: Tue, 20 Aug 2024 11:47:25 +0200 Subject: [PATCH 2/6] add dask doi --- paper/paper.bib | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paper/paper.bib b/paper/paper.bib index 4492802..000b79c 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -79,7 +79,8 @@ @inproceedings{Rocklin2015DaskPC author={Matthew Rocklin}, booktitle={SciPy}, year={2015}, - url={https://api.semanticscholar.org/CorpusID:63554230} + url={https://api.semanticscholar.org/CorpusID:63554230}, + doi={10.25080/majora-7b98e3ed-013} } @article{REICHLE20081411, From 55fdd4b3e89a6e4bd1218a938adeb8b2f8e91f00 Mon Sep 17 00:00:00 2001 From: Ou Ku Date: Tue, 20 Aug 2024 11:56:21 +0200 Subject: [PATCH 3/6] add orcids for all authors --- CITATION.cff | 8 +++++++- paper/paper.md | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index b026bb5..b264d29 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -15,21 +15,27 @@ authors: orcid: 'https://orcid.org/0000-0002-5373-5209' - given-names: Fakhereh (Sarah) family-names: Alidoost + orcid: 'https://orcid.org/0000-0001-8407-6472' affiliation: Netherlands eScience Center - given-names: Xu family-names: Shan + orcid: 'https://orcid.org/0000-0002-0569-4326' affiliation: Delft University of Technology - given-names: Pranav family-names: Chandramouli + orcid: 'https://orcid.org/0000-0002-7896-2969' affiliation: Netherlands eScience Center - given-names: Georgievska family-names: Sonja + orcid: 'https://orcid.org/0000-0002-8094-4532' affiliation: Netherlands eScience Center - given-names: Meiert - affiliation: Netherlands eScience Center family-names: Grootes + orcid: 'https://orcid.org/0000-0002-5733-4795' + affiliation: 
Netherlands eScience Center - given-names: Susan family-names: Steele-Dunne + orcid: 'https://orcid.org/0000-0002-8644-3077' affiliation: Delft University of Technology identifiers: - type: doi diff --git a/paper/paper.md b/paper/paper.md index 7869967..f87e556 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -11,16 +11,22 @@ authors: orcid: 0000-0002-5373-5209 affiliation: 1 - name: Fakhereh Alidoost + orcid: 0000-0001-8407-6472 affiliation: 1 - name: Xu Shan + orcid: 0000-0002-0569-4326 affiliation: 2 - name: Pranav Chandramouli + orcid: 0000-0002-7896-2969 affiliation: 1 - name: Sonja Georgievska + orcid: 0000-0002-8094-4532 affiliation: 1 - name: Meiert W. Grootes + orcid: 0000-0002-5733-4795 affiliation: 1 - name: Susan Steele-Dunne + orcid: 0000-0002-8644-3077 corresponding: true affiliation: 2 affiliations: From 60cdfc89e57484e22c280acaa5abad9d76ae2af1 Mon Sep 17 00:00:00 2001 From: Ou Ku Date: Tue, 20 Aug 2024 13:19:59 +0200 Subject: [PATCH 4/6] fix reference and lint --- paper/paper.md | 6 +++--- pyproject.toml | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index f87e556..6e170fb 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -40,13 +40,13 @@ bibliography: paper.bib ## Summary -Data assimilation (DA) is an essential procedure in Earth and environmental sciences, enabling physical model states to be constrained using observational data. [@REICHLE20081411, @Evensen2009, @Clement2018, @Carrassi2018] +Data assimilation (DA) is an essential procedure in Earth and environmental sciences, enabling physical model states to be constrained using observational data. [@REICHLE20081411; @Evensen2009; @Clement2018; @Carrassi2018] -In the DA process, observations are integrated into the physical model through the application of a Measurement Operator (MO) – a connection model mapping physical model states to observations. 
Researchers have observed that employing a Machine-Learning (ML) model as a surrogate MO can bypass the limitations associated with using an overly simplified MO [@Forman:2014; @XUE:2015; @Forman:2017]. +In the DA process, observations are integrated into the physical model through the application of a Measurement Operator (MO) – a connection model mapping physical model states to observations. Researchers have observed that employing a Machine-Learning (ML) model as a surrogate MO can bypass the limitations associated with using an overly simplified MO. [@Forman:2014; @XUE:2015; @Forman:2017] ## Statement of Need -The surrogate MO, trained as a Machine-Learning model, is generally considered valid within a specific spatio-temporal range.[@zhou2008ensemble, @REICHLE20081411, @shan:2022] When dealing with a large spatio-temporal scale, multiple mapping processes may exist, prompting consideration for training separate MOs for distinct spatial and/or temporal partitions of the dataset. As the number of partitions increases, a challenge arises in distributing these training tasks effectively among the partitions. +The surrogate MO, trained as a ML model, is generally considered valid within a specific spatio-temporal range. [@zhou2008ensemble; @REICHLE20081411; @shan:2022] When dealing with a large spatio-temporal scale, multiple mapping processes may exist, prompting consideration for training separate MOs for distinct spatial and/or temporal partitions of the dataset. As the number of partitions increases, a challenge arises in distributing these training tasks effectively among the partitions. To address this challenge, we developed a novel approach for distributed training of MOs. We present the open Python library `MOTrainer`, which to the best of our knowledge, is the first Python library catering to researchers requiring training independent MOs across extensive spatio-temporal coverage in a distributed manner. 
`MOTrainer` leverages Xarray's [@Hoyer_xarray_N-D_labeled_2017] support for multi-dimensional datasets to accommodate spatio-temporal features of input/output data of the training tasks. It provides user-friendly functionalities implemented with the Dask [@Rocklin2015DaskPC] library, facilitating the partitioning of large spatio-temporal data for independent model training tasks. Additionally, it streamlines the train-test data split based on customized spatio-temporal coordinates. The Jackknife method [@mccuen1998hydrologic] is implemented as an external Cross-Validation method for Deep Neural Network (DNN) training, with support for Dask parallelization. This feature enables the scaling of training tasks across various computational infrastructures. diff --git a/pyproject.toml b/pyproject.toml index 0f1dde9..56812a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,6 +124,8 @@ exclude = [ "node_modules", "venv", "docs", + "exploration", + "site", ] line-length = 88 From b79ddd0b2c54901184847fd4fbabdc496738f83c Mon Sep 17 00:00:00 2001 From: Ou Ku Date: Tue, 20 Aug 2024 13:48:13 +0200 Subject: [PATCH 5/6] fix doi --- paper/paper.bib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.bib b/paper/paper.bib index 000b79c..1b5dada 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -31,7 +31,7 @@ @article{XUE:2015 pages = {153-165}, year = {2015}, issn = {0034-4257}, - doi = {doi.org/10.1016/j.rse.2015.09.009}, + doi = {10.1016/j.rse.2015.09.009}, url = {https://www.sciencedirect.com/science/article/pii/S0034425715301322}, author = {Yuan Xue and Barton A. 
Forman}, keywords = {Sensitivity analysis, Machine learning, Brightness temperature, Snow, Data assimilation}, From 9792a16311501d1d5fc85d5c769a7cd8b14678f4 Mon Sep 17 00:00:00 2001 From: Ou Ku Date: Tue, 20 Aug 2024 13:48:24 +0200 Subject: [PATCH 6/6] fix word --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 6e170fb..cf82ea9 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -46,7 +46,7 @@ In the DA process, observations are integrated into the physical model through t ## Statement of Need -The surrogate MO, trained as a ML model, is generally considered valid within a specific spatio-temporal range. [@zhou2008ensemble; @REICHLE20081411; @shan:2022] When dealing with a large spatio-temporal scale, multiple mapping processes may exist, prompting consideration for training separate MOs for distinct spatial and/or temporal partitions of the dataset. As the number of partitions increases, a challenge arises in distributing these training tasks effectively among the partitions. +A surrogate MO, trained as an ML model, is generally considered valid within a specific spatio-temporal range. [@zhou2008ensemble; @REICHLE20081411; @shan:2022] When dealing with a large spatio-temporal scale, multiple mapping processes may exist, prompting consideration for training separate MOs for distinct spatial and/or temporal partitions of the dataset. As the number of partitions increases, a challenge arises in distributing these training tasks effectively among the partitions. To address this challenge, we developed a novel approach for distributed training of MOs. We present the open Python library `MOTrainer`, which to the best of our knowledge, is the first Python library catering to researchers requiring training independent MOs across extensive spatio-temporal coverage in a distributed manner. 
`MOTrainer` leverages Xarray's [@Hoyer_xarray_N-D_labeled_2017] support for multi-dimensional datasets to accommodate spatio-temporal features of input/output data of the training tasks. It provides user-friendly functionalities implemented with the Dask [@Rocklin2015DaskPC] library, facilitating the partitioning of large spatio-temporal data for independent model training tasks. Additionally, it streamlines the train-test data split based on customized spatio-temporal coordinates. The Jackknife method [@mccuen1998hydrologic] is implemented as an external Cross-Validation method for Deep Neural Network (DNN) training, with support for Dask parallelization. This feature enables the scaling of training tasks across various computational infrastructures.