diff --git a/Project.toml b/Project.toml index 42857918..e9e6b0b3 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "SimpleSDMLayers" uuid = "2c645270-77db-11e9-22c3-0f302a89c64c" authors = ["Timothée Poisot ", "Gabriel Dansereau "] -version = "0.4.5" +version = "0.4.6" [deps] ArchGDAL = "c9ce4bd3-c3d5-55b8-8973-c0e20141b8c3" diff --git a/docs/make.jl b/docs/make.jl index 147f2a98..7081745b 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -18,6 +18,7 @@ makedocs( "Examples" => [ "Temperature data" => "examples/temperature.md", "GBIF integration" => "examples/gbif.md", + "DataFrames integration" => "examples/dataframes.md", "Importing raster data" => "examples/import.md", "Sliding window analysis" => "examples/slidingwindow.md", "Landcover data" => "examples/landcover.md", diff --git a/docs/src/examples/dataframes.md b/docs/src/examples/dataframes.md new file mode 100644 index 00000000..2c79d0a5 --- /dev/null +++ b/docs/src/examples/dataframes.md @@ -0,0 +1,146 @@ +# Working with DataFrames + +Both `SimpleSDMLayers.jl` and `GBIF.jl` offer an optional integration with the +`DataFrames.jl` package. Therefore, our [previous +example](https://ecojulia.github.io/SimpleSDMLayers.jl/latest/examples/gbif/) +with the kingfisher *Megaceryle alcyon* could also be approached with a +`DataFrame`-centered workflow. + +We will illustrate this using the same data and producing the same figures as in +the previous example. To do so, we will use `GBIF.jl` to produce the occurrence +`DataFrame` we will use throughout this example. However, it is also possible to +use a `DataFrame` of your choosing instead of one generated by `GBIF.jl`, as +long as it holds one occurrence per row, a column with the latitude coordinates, +and a column with longitude coordinates. For the rest, it can hold whatever +information you like. 
Most of our functions assume by default that the +coordinates are stored in columns named `:latitude` and `:longitude` (the order +doesn't matter), but you can generally specify other names with `latitude = +:lat` in case you don't want to rename them (we will show you how below). + +So let's start by getting our data: + +```julia +# Load packages +using SimpleSDMLayers +using GBIF +using Plots +using Statistics +# Load DataFrames too +using DataFrames + +# Load environmental data +temperature, precipitation = worldclim([1,12]) + +# Get GBIF occurrences +kingfisher = GBIF.taxon("Megaceryle alcyon", strict=true) +kf_occurrences = occurrences(kingfisher, + "hasCoordinate" => "true", + "decimalLatitude" => (0.0, 65.0), + "decimalLongitude" => (-180.0, -50.0), + "limit" => 200) +for i in 1:4 + occurrences!(kf_occurrences) +end +@info kf_occurrences + +``` + +Once the data is loaded, we can easily convert the environmental layers to a +`DataFrame` with the corresponding coordinates. We can do this for a single +layer: + +```julia +temperature_df = DataFrame(temperature) +first(temperature_df, 5) +``` + +Or for multiple layers at the same time: + +```julia +env_layers = [temperature, precipitation] +env_df = DataFrame(env_layers) +rename!(env_df, :x1 => :temperature, :x2 => :precipitation) +first(env_df, 5) +``` + +`GBIF.jl` allows us to convert a set of occurrences to a `DataFrame` just as +easily: + +```julia +kf_df = DataFrame(kf_occurrences) +last(kf_df, 5) +``` + +We can then extract the temperature values for all the occurrences. 
+ +```julia +temperature[kf_df] +``` + +Or we can clip the layers according to the occurrences: + +```julia +temperature_clip = clip(temperature, kf_df) +precipitation_clip = clip(precipitation, kf_df) +``` + +In case your `DataFrame` has different column names for the coordinates, for +example `:lat` and `:lon`, you can clip it like this: + +```julia +kf_df_shortnames = rename(kf_df, :latitude => :lat, :longitude => :lon) +clip(temperature, kf_df_shortnames; latitude = :lat, longitude = :lon) +``` + +We can finally plot the layer and occurrence values in a similar way to any +`DataFrame` or `Array`. Since there are often many `nothing` values in +the layers, it might be necessary to use `filter!` first: + +```julia +filter!(x -> !isnothing(x.temperature) && !isnothing(x.precipitation), env_df); +histogram2d(env_df.temperature, env_df.precipitation, c = :viridis) +scatter!(temperature_clip[kf_df], precipitation_clip[kf_df], + lab= "", c = :white, msc = :orange) +``` + +To plot the occurrence values over space, you can use: + +```julia +contour(temperature_clip, c = :alpine, title = "Temperature", + frame = :box, fill = true) +scatter!(kf_df.longitude, kf_df.latitude, + lab = "", c = :white, msc = :orange, ms = 2) +``` + +We can finally make a layer with the number of observations per cell: + +```julia +abundance = mask(precipitation_clip, kf_occurrences, Float32) +``` + +A useful trick to visualize sites with occurrences, in contrast with sites +without any occurrence, is to use `replace` or `replace!` to set the values +returned as `0` or `false` by the function `mask()` to `nothing`. This allows us +to first plot a background layer with a uniform colour, covering the whole area +to visualize, then plot the occurrence layer on top using a different colour +scale. 
+ +```julia +abundance_nozeros = replace(abundance, 0 => nothing) +plot(precipitation_clip, c = :lightgrey) +plot!(abundance_nozeros, c = :viridis, clim = extrema(abundance_nozeros)) +``` + +Once again, the cells are rather small, and there are few observations, so this +is not necessarily going to be very informative. As in our other example, to +get a better sense of the distribution of observations, we can get the average +number of observations in a radius of 100km around each cell (we will do so for +a zoomed-in part of the map to save time): + +```julia +zoom = abundance[left = -100.0, right = -75.0, top = 43.0, bottom = 20.0] +buffered = slidingwindow(zoom, Statistics.mean, 100.0) +plot(buffered, c = :lapaz, legend = false, frame = :box) +scatter!(kf_df.longitude, kf_df.latitude, + lab = "", c = :white, msc = :orange, ms = 2, alpha = 0.5) +``` \ No newline at end of file diff --git a/src/integrations/DataFrames.jl b/src/integrations/DataFrames.jl index f8f53a7e..73be82c0 100644 --- a/src/integrations/DataFrames.jl +++ b/src/integrations/DataFrames.jl @@ -8,11 +8,13 @@ import SimpleSDMLayers: clip, latitudes, longitudes """ Base.getindex(layer::T, df::DataFrames.DataFrame; latitude = :latitude, longitude = :longitude) where {T <: SimpleSDMLayer} -Returns the values of a layer at all occurrences in a `DataFrame`. +Returns the values of a layer at all occurrences in a `DataFrame`. Note that the +function looks for columns named `:latitude` and `:longitude` by default, but +these can be changed using the `latitude` and `longitude` keyword arguments. 
""" function Base.getindex(layer::T, df::DataFrames.DataFrame; latitude = :latitude, longitude = :longitude) where {T <: SimpleSDMLayer} - lats = df[:, latitude] - lons = df[:, longitude] + lats = df[!, latitude] + lons = df[!, longitude] return [layer[lon, lat] for (lon, lat) in zip(lons, lats)] end @@ -20,11 +22,13 @@ end clip(layer::T, df::DataFrames.DataFrame; latitude = :latitude, longitude = :longitude) where {T <: SimpleSDMLayer} Returns a clipped version (with a 10% margin) around all occurences in a -`DataFrame`. +`DataFrame`. Note that the function looks for columns named `:latitude` and +`:longitude` by default, but these can be changed using the `latitude` and +`longitude` keyword arguments. """ function SimpleSDMLayers.clip(layer::T, df::DataFrames.DataFrame; latitude = :latitude, longitude = :longitude) where {T <: SimpleSDMLayer} - occ_latitudes = filter(!ismissing, df[:, latitude]) - occ_longitudes = filter(!ismissing, df[:, longitude]) + occ_latitudes = filter(!ismissing, df[!, latitude]) + occ_longitudes = filter(!ismissing, df[!, longitude]) lat_min = minimum(occ_latitudes) lat_max = maximum(occ_latitudes) @@ -50,7 +54,7 @@ end DataFrames.DataFrame(layer::T) where {T <: SimpleSDMLayer} Returns a DataFrame from a `SimpleSDMLayer` element, with columns for latitudes, -longitudes and grid values. +longitudes and grid values. """ function DataFrames.DataFrame(layer::T; kw...) where {T <: SimpleSDMLayer} lats = repeat(latitudes(layer), outer = size(layer, 2)) @@ -84,11 +88,15 @@ for ty in (:SimpleSDMResponse, :SimpleSDMPredictor) """ $($ty)(df::DataFrame, col::Symbol, layer::T; latitude::Symbol = :latitude, longitude::Symbol = :longitude) where {T <: SimpleSDMLayer} - Returns a `$($ty)` from a `DataFrame`. + Returns a `$($ty)` from a `DataFrame`. This requires selecting a + column to be returned as a layer, as well as an existing layer from + which to copy the dimensions. 
Note that the function looks for + columns named `:latitude` and `:longitude` by default, but these can + be changed using the `latitude` and `longitude` keyword arguments. """ function SimpleSDMLayers.$ty(df::DataFrames.DataFrame, col::Symbol, layer::SimpleSDMLayer; latitude::Symbol = :latitude, longitude::Symbol = :longitude) - lats = df[:, latitude] - lons = df[:, longitude] + lats = df[!, latitude] + lons = df[!, longitude] uniquelats = unique(lats) uniquelons = unique(lons) @@ -110,4 +118,54 @@ for ty in (:SimpleSDMResponse, :SimpleSDMPredictor) end end, ) +end + +""" + mask!(layer::SimpleSDMResponse{T}, df::DataFrames.DataFrame; latitude::Symbol = :latitude, longitude::Symbol = :longitude) where {T <: Bool} + +Fills a layer (most likely created with `similar`) so that the values are `true` +if an occurrence is found in the cell, `false` if not. Note that the function +looks for columns named `:latitude` and `:longitude` by default, but these can +be changed using the `latitude` and `longitude` keyword arguments. +""" +function mask!(layer::SimpleSDMResponse{T}, df::DataFrames.DataFrame; latitude::Symbol = :latitude, longitude::Symbol = :longitude) where {T <: Bool} + uniquedf = unique(df, [longitude, latitude]) + lons = uniquedf[!, longitude] + lats = uniquedf[!, latitude] + for (lon, lat) in zip(lons, lats) + layer[lon, lat] = true + end + return layer +end + +""" + mask!(layer::SimpleSDMResponse{T}, df::DataFrames.DataFrame; latitude::Symbol = :latitude, longitude::Symbol = :longitude) where {T <: Number} + +Fills a layer (most likely created with `similar`) so that the values reflect +the number of occurrences in the cell. Note that the function looks for columns +named `:latitude` and `:longitude` by default, but these can be changed using +the `latitude` and `longitude` keyword arguments. 
+""" +function mask!(layer::SimpleSDMResponse{T}, df::DataFrames.DataFrame; latitude::Symbol = :latitude, longitude::Symbol = :longitude) where {T <: Number} + lons = df[!, longitude] + lats = df[!, latitude] + for (lon, lat) in zip(lons, lats) + layer[lon, lat] = layer[lon, lat] + one(T) + end + return layer +end + +""" + mask(layer::SimpleSDMLayer, df::DataFrames.DataFrame, element_type::Type=Bool; latitude::Symbol = :latitude, longitude::Symbol = :longitude) + +Create a new layer storing information about the presence of occurrences in the +cells, either counting (numeric types) or presence-absence-ing (boolean types) +them. Note that the function looks for columns named `:latitude` and +`:longitude` by default, but these can be changed using the `latitude` and +`longitude` keyword arguments. +""" +function mask(layer::SimpleSDMLayer, df::DataFrames.DataFrame, element_type::Type=Bool; latitude::Symbol = :latitude, longitude::Symbol = :longitude) + returnlayer = similar(layer, element_type) + mask!(returnlayer, df, latitude = latitude, longitude = longitude) + return returnlayer end \ No newline at end of file diff --git a/test/dataframes.jl b/test/dataframes.jl index 89ff5ecc..2020353a 100644 --- a/test/dataframes.jl +++ b/test/dataframes.jl @@ -21,4 +21,13 @@ temperature_clip = clip(temperature, df) @test typeof(SimpleSDMPredictor(df, :values, temperature_clip)) <: SimpleSDMLayer @test typeof(SimpleSDMPredictor(df, :values, temperature_clip)) <: SimpleSDMPredictor +mbool = mask(temperature_clip, df, Bool) +@test eltype(mbool) == Bool + +mfloat = mask(temperature_clip, df, Float64) +@test eltype(mfloat) == Float64 + +@test sum(mfloat) >= sum(mbool) +@test sum(mfloat) == nrow(df) + end