From 2422438897fa3684c4b29ba2d792d98be0e42a49 Mon Sep 17 00:00:00 2001 From: "brynn.zalmanek@pnnl.gov" Date: Tue, 17 Oct 2023 10:22:03 -0700 Subject: [PATCH 1/5] move files around --- .gitignore | 1 + .../R/NEON_data_exploration.Rmd | 0 {NEON_ph_Rshiny => NEON_ph_by_time/R}/app.R | 0 .../python/neon_time_series_data_with_map.ipynb | 0 .../python/nmdc-hack-a-thon.ipynb | 0 5 files changed, 1 insertion(+) rename NEON_data_exploration.Rmd => NEON_ph_by_time/R/NEON_data_exploration.Rmd (100%) rename {NEON_ph_Rshiny => NEON_ph_by_time/R}/app.R (100%) rename neon_time_series_data_with_map.ipynb => NEON_ph_by_time/python/neon_time_series_data_with_map.ipynb (100%) rename nmdc-hack-a-thon.ipynb => NEON_ph_by_time/python/nmdc-hack-a-thon.ipynb (100%) diff --git a/.gitignore b/.gitignore index 5b6a0652..4920fd2c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .Rhistory .RData .Ruserdata +.ipynb_checkpoints diff --git a/NEON_data_exploration.Rmd b/NEON_ph_by_time/R/NEON_data_exploration.Rmd similarity index 100% rename from NEON_data_exploration.Rmd rename to NEON_ph_by_time/R/NEON_data_exploration.Rmd diff --git a/NEON_ph_Rshiny/app.R b/NEON_ph_by_time/R/app.R similarity index 100% rename from NEON_ph_Rshiny/app.R rename to NEON_ph_by_time/R/app.R diff --git a/neon_time_series_data_with_map.ipynb b/NEON_ph_by_time/python/neon_time_series_data_with_map.ipynb similarity index 100% rename from neon_time_series_data_with_map.ipynb rename to NEON_ph_by_time/python/neon_time_series_data_with_map.ipynb diff --git a/nmdc-hack-a-thon.ipynb b/NEON_ph_by_time/python/nmdc-hack-a-thon.ipynb similarity index 100% rename from nmdc-hack-a-thon.ipynb rename to NEON_ph_by_time/python/nmdc-hack-a-thon.ipynb From d66cb31fb23efba3932cbcdbf21e01ee78e4af1e Mon Sep 17 00:00:00 2001 From: "brynn.zalmanek@pnnl.gov" Date: Tue, 17 Oct 2023 11:44:09 -0700 Subject: [PATCH 2/5] Add README.md --- README.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 
README.md diff --git a/README.md b/README.md new file mode 100644 index 00000000..81778f98 --- /dev/null +++ b/README.md @@ -0,0 +1,28 @@ +# NMDC Data and Metadata R and Python Sample Jupyter Notebooks + +## Overview + +This repository includes Jupyter notebooks that explore and analyze microbiome data from the National Microbiome Data Collaborative's (NMDC) data portal. These notebooks aim to: + +- highlight the NMDC's metadata and data +- demonstrate how the NMDC's API may be used to retrieve metadata and data from various microbiome research studies +- illustrate example use cases of using the NMDC's (meta)data to answer scientific questions +- encourage scientists to programmatically access the NMDC Data Portal +- promote the accessibility of microbiome research by demonstrating various modes of finding, accessing, and reusing existing microbiome data. + +Each folder's scope attempts to explore a scientific question using the NMDC's (meta)data. Each folder includes a `README.md` that outlines the question or analysis posed, as well as two sub-folders, one labeled `R` and the other `python`, that contain the sample notebooks written in the R and Python programming languages, respectively. + +R and Python were chosen since they are popular languages among scientists to explore and visualize data. Jupyter Notebook is used because of its interactive code and data exploration features, effectiveness in teaching, language independence, and ease of sharing code. + +A challenging aspect that has been highlighted with this process is accessing the (meta)data in a user-friendly way via the NMDC API. Because the NMDC metadata schema is highly modular, retrieving metadata is not straightforward without extensive knowledge of the metadata schema's infrastructure, modeling language ([LinkML](https://linkml.io/)), and naming conventions. 
A proposed solution to this challenge is the creation of an R or Python package that would allow users to access NMDC's data in an easier and more straightforward way. + +## Adding new notebooks + +To add a new notebook to this repository: + +1. Create a folder in the base directory + - Name the folder with a short version of the analysis/question that will be explored. + - Make name of folder `snake_case` +2. Create a sub-folder for each language that will be demonstrated + - e.g. one subfolder named `R` and one subfolder named `python` +3. Instantiate a Jupyter Notebook for each folder coded in its corresponding language \ No newline at end of file From ccf828afb6c1438c92f581529002cd96921a405c Mon Sep 17 00:00:00 2001 From: "brynn.zalmanek@pnnl.gov" Date: Tue, 17 Oct 2023 11:45:44 -0700 Subject: [PATCH 3/5] edit README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 81778f98..cb4fc571 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ To add a new notebook to this repository: 1. Create a folder in the base directory - Name the folder with a short version of the analysis/question that will be explored. - Make name of folder `snake_case` -2. Create a sub-folder for each language that will be demonstrated +2. Create a `README.md` in the folder outlining the analysis or question. +3. Create a sub-folder for each language that will be demonstrated - e.g. one subfolder named `R` and one subfolder named `python` -3. Instantiate a Jupyter Notebook for each folder coded in its corresponding language \ No newline at end of file +4. Instantiate a Jupyter Notebook for each folder coded in its corresponding language \ No newline at end of file From 8ea824ae28bfc60b50afe12f55c4ec93a3f99a38 Mon Sep 17 00:00:00 2001 From: "brynn.zalmanek@pnnl.gov" Date: Tue, 17 Oct 2023 11:56:55 -0700 Subject: [PATCH 4/5] neon ph vs. 
time readme --- NEON_ph_by_time/README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 NEON_ph_by_time/README.md diff --git a/NEON_ph_by_time/README.md b/NEON_ph_by_time/README.md new file mode 100644 index 00000000..352d478d --- /dev/null +++ b/NEON_ph_by_time/README.md @@ -0,0 +1,3 @@ +# Interactive application exploring NEON sites of pH vs. time + +This folder includes two notebooks, in R and Python, that look at how soil pH changes over time for the various NEON sites. \ No newline at end of file From 487b94492e1cf2d8d6d0c55fad5e9fbe80f51a51 Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Tue, 17 Oct 2023 13:35:17 -0700 Subject: [PATCH 5/5] KRH small mods on .Rmd --- NEON_data_exploration.Rmd | 52 ++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/NEON_data_exploration.Rmd b/NEON_data_exploration.Rmd index acdd9aa7..194b5802 100644 --- a/NEON_data_exploration.Rmd +++ b/NEON_data_exploration.Rmd @@ -82,20 +82,44 @@ for (i in 1:length(study_ids)){ ```{r} library(lubridate) +my_theme <- theme_bw() df <- dat_all %>% - select(collection_date, ph, geo_loc_name, lat_lon) %>% - unnest(cols = c(collection_date, geo_loc_name, - lat_lon), names_sep = "_") %>% + select( + collection_date, ph, geo_loc_name, lat_lon + ) %>% + unnest( + cols = c( + collection_date, + geo_loc_name, + lat_lon + ), names_sep = "_") %>% rename(collection_date = collection_date_has_raw_value , geo_loc = geo_loc_name_has_raw_value) df2 <- df %>% mutate(collection_date2 = as.Date(collection_date)) +df3 <- df2 %>% + mutate(geo_loc_grouped = geo_loc %>% + factor() %>% + fct_lump(n = 6) + ) %>% + filter(geo_loc_grouped != "Other") + + g <- ggplot(data = df2) + geom_point(aes(x=collection_date, y = ph)) + + my_theme + facet_wrap(facets = vars(geo_loc)) +g <- ggplot(data = df3) + + geom_point(aes(x=collection_date2, y = ph)) + + my_theme + + scale_x_date()+ + facet_wrap(facets = vars(geo_loc_grouped), + labeller = 
label_wrap_gen(width=30)) +g + ``` @@ -103,16 +127,32 @@ g <- ggplot(data = df2) + library("rnaturalearth") library("rnaturalearthdata") +locs_with_ph <- df2 %>% + group_by( + geo_loc + ) %>% + mutate( + count_with_ph = n() + ) %>% + select( + geo_loc, + lat_lon_longitude, + lat_lon_latitude, + count_with_ph + ) %>% + distinct() + world <- ne_countries(scale = "medium", returnclass = "sf") class(world) g2 <- ggplot(data = world) + geom_sf() + geom_point( data = locs_with_ph, - aes(x = lat_lon_longitude, y = lat_lon_latitude, color = geo_loc)) + + aes(x = lat_lon_longitude, y = lat_lon_latitude, + size = count_with_ph)) + my_theme + - labs(x = "Longitude", y = "Latitude")+ - theme(legend.position = "none")+ + labs(x = "Longitude", y = "Latitude", size = "Samples with \n pH measurements")+ + theme()+ coord_sf(xlim = c(-165, -66), ylim = c(17, 72), expand = FALSE) g2