From 04a284ae2d52cb3a186d374623b2794e5e088707 Mon Sep 17 00:00:00 2001
From: corviday
Date: Wed, 18 Dec 2019 12:39:52 -0800
Subject: [PATCH] Clarify wording

---
 README.md                             | 11 ++++++++
 doc/make.bat                          | 35 ++++++++++++++++++++++++
 doc/source/api/api-index.rst          |  1 -
 doc/source/api/api-overview.md        | 13 ++++++---
 doc/source/api/metadata-api.rst       |  4 +--
 doc/source/api/watershed-api-usage.md | 11 ++++++--
 doc/source/index.rst                  |  2 +-
 doc/source/overview.md                | 39 ++++++++++++++++++++-------
 doc/source/workflow.md                |  4 ++-
 requirements.txt                      |  1 -
 setup.py                              | 11 ++++++++
 11 files changed, 111 insertions(+), 21 deletions(-)
 create mode 100644 doc/make.bat

diff --git a/README.md b/README.md
index 5fd1195c..37b1c405 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,17 @@ $ source venv/bin/activate
 (venv)$ pip install -e .
 ```
 
+### Building the Documentation
+
+Building the docs requires the package to be installed first, as docstrings from installed modules are used to generate code documentation.
+
+```
+pip install -e .
+pyenv/bin/python setup.py build_sphinx
+```
+
+HTML documentation will then be available in the `doc/build/html` directory.
+
 ### Running the dev server
 
 A development server can be run locally by using the Flask command line interface documented [here](http://flask.pocoo.org/docs/0.12/cli/). In general, you need to set one environment variable FLASK_APP=ce.wsgi:app and can optionally set FLASK_DEBUG=1 for live code reloading.

diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 00000000..6247f7e2
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd

diff --git a/doc/source/api/api-index.rst b/doc/source/api/api-index.rst
index da58ec19..ddfb6f90 100644
--- a/doc/source/api/api-index.rst
+++ b/doc/source/api/api-index.rst
@@ -3,7 +3,6 @@ API Documentation
 
 .. mdinclude:: api-overview.md
 
-
 .. toctree::
    :maxdepth: 1
    :caption: The backend exposes the following API endpoints:

diff --git a/doc/source/api/api-overview.md b/doc/source/api/api-overview.md
index 87e44690..da4f6e6c 100644
--- a/doc/source/api/api-overview.md
+++ b/doc/source/api/api-overview.md
@@ -1,4 +1,11 @@
-Documentation for each API endpoint is automatically generated from the code and docstring for that API's main function and may not be entirely user-friendly. There are some very minor differences between arguments for each API function and the parameters needed for a web query.
+Documentation for each API endpoint is automatically generated from the code and docstring for that API's main function and may not be entirely user-friendly. There are some minor differences between the internal workings of the API functions and the process of querying them over the web.
 
-1. Web queries do not supply a `sesh` (database session) as an argument; that will be automatically done by the query parser.
-2. Parameters supplied in a query url should be web-encoded.
\ No newline at end of file
+The query URL is constructed from a base URL ending in a slash, followed by the name of the endpoint, a question mark, and then one or more parameters of the form `attribute=value`, separated by ampersands. Parameters supplied via query URL should be web-encoded so that they will be correctly parsed.
+
+The automatically generated API documentation describes a `sesh` (database session) argument to each API function. Database sessions are supplied by the query parser and do not need to be given in the query URL.
+
+For example, the `multimeta` function has a signature of `ce.api.multimeta(sesh, ensemble_name='ce_files', model='')`.
+
+The query URL `https://base_url/multimeta?ensemble_name=ce_files&model=CanESM2` calls the `multimeta` endpoint and supplies two arguments to the `multimeta` function: `ensemble_name` is "ce_files" and `model` is "CanESM2". `sesh` is not supplied in the query URL.
+
+The API function return values are converted to JSON for the endpoint response.
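+
+As an illustrative sketch (the base URL is a placeholder, as above), the same query can be issued from Python with the `requests` library, which web-encodes the parameters automatically:
+
+```python
+import requests
+
+# Placeholder base URL; substitute the address of a real deployment.
+base_url = "https://base_url/"
+
+# Encoded as ?ensemble_name=ce_files&model=CanESM2; note that no `sesh`
+# argument is supplied.
+response = requests.get(
+    base_url + "multimeta",
+    params={"ensemble_name": "ce_files", "model": "CanESM2"},
+)
+response.raise_for_status()
+metadata = response.json()  # the endpoint response is JSON
+```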
\ No newline at end of file

diff --git a/doc/source/api/metadata-api.rst b/doc/source/api/metadata-api.rst
index 8c1cbed6..8198a8c1 100644
--- a/doc/source/api/metadata-api.rst
+++ b/doc/source/api/metadata-api.rst
@@ -10,8 +10,8 @@ metadata
 .. warning::
 
    Parameter names for this endpoint are not consistent with parameter names for the other
-   endpoints. Every other endpoint uses the word "model" to refer to the global climate
-   model (GCM) that produced a particular dataset.
+   endpoints. Every other endpoint uses the word "model" to refer to the General Circulation
+   Model (GCM) or Regional Climate Model (RCM) that produced a particular dataset.
 
-   This endpoint uses the "model_id" parameter to reger to a dataset's unique identification
+   This endpoint uses the "model_id" parameter to refer to a dataset's unique identification
    string, which is called "id_" in every other endpoint.

diff --git a/doc/source/api/watershed-api-usage.md b/doc/source/api/watershed-api-usage.md
index 0dc2d4f6..2f4a5a10 100644
--- a/doc/source/api/watershed-api-usage.md
+++ b/doc/source/api/watershed-api-usage.md
@@ -2,7 +2,7 @@ This API endpoint provides contextual information about the watershed that drain
 
 Every grid cell is defined as flowing into a single other grid cell, so this data is most reliable for larger watersheds representing at least ten grid cells, and completely inappropriate for creeks or culverts smaller than a single grid cell. At small scales, streamflow variations within grid cells, not capturable by a gridded dataset, play too large a role.
 
-## Hypometric curve
+## Hypsometric curve
 
 The `hypsometric_curve` object defines a histogram of area by elevation.
 * Elevation bins are of equal width, `w = elevation_bin_width`.
 *
 * The sum of areas of all cells with elevation falling into elevation bin `k` is given by `ak = cumulative_areas[k]`.
 * Units of elevation and area are specified by the properties `elevation_units` and `area_units`.
 
-Because elevation data is given as the mean of each grid square it is possible to have nonzero values for two elevation bins `ek` and `ek+2` but a zero value for `ek+1` in steep landscapes.
\ No newline at end of file
+### Gaps in the hypsometric curve, or "empty" bins
+
+For large areas of the Earth and reasonably large elevation bins, we expect to see non-zero cumulative areas for each elevation bin between the minimum and maximum elevation over that area. In other words, there should be at least some area at each elevation in the histogram.
+
+However, for small areas with steep topography, it is common to see some of the elevation bins between the minimum and maximum elevation with zero area. This is not an error in either the computation or the data that feeds it. It is instead a product of the fact that `n` surface grid cells can represent at most `n` elevations.
+
+Consider the most extreme case of `n = 2` cells that happen to be positioned at the edge of a steep-sided valley. One cell is in the valley bottom with an average elevation of 100 m. The other cell, just adjacent to it, mostly covers the highland above with an average elevation of 500 m. In a histogram with 100 m bin width, we'd see non-zero areas for the 100 m bin and the 500 m bin, but zero areas for the 200 m, 300 m, and 400 m elevation bins, and in the graph these would look like gaps.
+
+We can see a similar effect for other small values of `n > 2` in steep terrain too. Once `n` becomes large enough, the likelihood of an elevation bin containing no cells is quite low and these gaps do not appear.
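+
+A minimal sketch of how such gaps arise (hypothetical numbers, not the endpoint's actual implementation), building the histogram for the two-cell example above:
+
+```python
+import numpy as np
+
+# Hypothetical mean elevations (m) and cell areas (km^2) for n = 2 adjacent cells.
+elevations = np.array([100.0, 500.0])
+areas = np.array([16.0, 16.0])
+
+w = 100.0  # elevation_bin_width
+edges = np.arange(0.0, 700.0, w)  # bin edges 0, 100, ..., 600
+cumulative_areas, _ = np.histogram(elevations, bins=edges, weights=areas)
+print(cumulative_areas)  # [ 0. 16.  0.  0.  0. 16.] -- the zeros are the "gaps"
+```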
\ No newline at end of file

diff --git a/doc/source/index.rst b/doc/source/index.rst
index 5cc19e84..df9a5757 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -6,7 +6,7 @@
 PCIC Climate Explorer Backend Documentation
 ============================================
 
-This backend serves climate raster data and metadata to the
+This backend serves climate data and metadata to the
 `PCIC Climate Explorer (PCEX) `_ and other applications.

diff --git a/doc/source/overview.md b/doc/source/overview.md
index e8aaf6db..f546ff4e 100644
--- a/doc/source/overview.md
+++ b/doc/source/overview.md
@@ -1,32 +1,51 @@
 # Overview
 
-## Raster Data
-This set of APIs provide queries for retrieving statistical and summary data from multidimensional raster datasets containing climate data.
+## Data Matrix
+This set of APIs provides queries for retrieving statistical and summary data from a multidimensional data matrix containing climate data.
 
-Raster datasets treat data as a regular matrix. Each dataset has a list of latitudes, a list of longitudes, a list of timestamps, and a list of variables such as temperature, precipitation, or derived values like growing degree days. A value may be accessed for every combination of latitude, longitude, timestamp, and variable, which represents the value of that variable at that particular point in time and space.
+The data is presented as a regular matrix. Each dataset has a list of latitudes, a list of longitudes, a list of timestamps, and a list of variables such as temperature, precipitation, or derived values like growing degree days. A value may be accessed for every combination of latitude, longitude, timestamp, and variable, which represents the value of that variable at that particular point in time and space.
+
+These data are interpolated from real-world observations or generated by General Circulation Models or Regional Climate Models, which are able to provide complete data coverage. Individual points may be masked off as missing, but this is typically only done to mark the boundaries of irregular geographic extents, like coastlines. The data should be assumed to be dense within the spatial and temporal boundaries.
 
 ## Data Access
 
-This system is not designed to provide individual access to every point in the raster data matrix, but calculates and makes available summary data over user-defined spatial and temporal scales to support analysis and visualization. All numerical API endpoints accept a spatial area of interest (specified in URL-encoded WKT as a point or polygon) and return the mean of all cells within the designated area. If no area is designated, the mean across the entire spatial extent of the dataset is returned.
+This system is not designed to provide individual access to every point in the data matrix, but calculates and makes available summary data over user-defined spatial and temporal scales to support analysis and visualization. All numerical API endpoints accept a spatial area of interest (specified in URL-encoded WKT as a point or polygon) and return the mean of all cells within the designated area. If no area is designated, the mean across the entire spatial extent of the dataset is returned.
 
 Datafiles are organized by temporal range and resolution.
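+
+For illustration, a WKT area of interest can be web-encoded with the Python standard library before being placed in a query URL (the parameter name `area` is an assumption here, taken from the generated per-endpoint documentation rather than defined in this overview):
+
+```python
+from urllib.parse import quote
+
+# A hypothetical WKT polygon describing an area of interest.
+wkt = "POLYGON ((-125 50, -125 51, -124 51, -124 50, -125 50))"
+
+# Web-encode it for use as a query parameter value, e.g. &area=...
+print(quote(wkt))
+```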
 
 ## Modeled Data
 
-Most data accessible by the API is climate data from a General Circulation Model (GCM) or Regional Climate Model (RCM). These numerical models represent physical processes in the atmosphere, ocean, cryosphere, and land surface of the earth. They simulate the response of the global climate system. The models output meteorological projections like temperature and precipitation. Other available variables represent statistical indices derived from the meteorological outputs.
+Most data accessible by the API is climate data from a General Circulation Model (GCM) or Regional Climate Model (RCM). These numerical models represent physical processes in the atmosphere, ocean, cryosphere, and land surface of the earth. They simulate the response of the global climate system. Model outputs are meteorological variables such as temperature and precipitation. Temporally, they typically cover the period 1950 to 2100. Data for any dates in the future are termed "projections." Other variables represent statistical indices calculated from the meteorological variables output by models, and are associated with the same model that output the base data.
+
+A small minority of the datasets accessible by this API are not GCM or RCM outputs, but are intended to provide additional context for the GCM and RCM outputs. These contextual data were instead created by models that interpolate observed data to fill spatial and temporal gaps. These non-GCM interpolated datasets have no projection data associated with dates in the future.
 
 ### Emissions Scenarios
 
-Each dataset has an associated emissions scenario, sometimes called `experiment`. Emissions scenarios represent a range of possible future projections for greenhouse gas concentrations. GCMs and RCMs are typically driven by historical data on greenhouse gas concentrations when modeling the past and present, before using an emissions scenario to provide input on greenhouse gas concentrations to drive future projections.
+Each dataset has an associated emissions scenario. Emissions scenarios represent a range of possible future projections for greenhouse gas concentrations. GCMs and RCMs are typically driven by historical data on greenhouse gas concentrations when modeling the past and present, before using an emissions scenario, typically a Representative Concentration Pathway (RCP) scenario defined by the IPCC, to provide input on greenhouse gas concentrations that drive future projections.
+
+Datasets created by interpolative models feature only historical emissions scenarios.
+
+The parameter name `experiment` is used to designate an emissions scenario in the API and code.
 
 ### Runs
 
-Each dataset has an associated run string, sometimes called `ensemble_member`, which represents the initialization settings of the simulating model. A GCM or RCM may be run multiple times with different initialization conditions.
+Each dataset has an associated run string, which represents the initialization settings of the simulating model. A GCM or RCM may be run multiple times with different initialization conditions. A collection of related runs of the same model with different initializations comprises a statistical ensemble, which can be used to give some idea of the range of possible outcomes of the model system.
+
+GCMs and RCMs follow a standard encoding for the members of a statistical ensemble, `rXiXpX`, which is provided to the API as the parameter `ensemble_member`:
+* `rX`, where `X` is an integer representing the realization of the run
+* `iX`, where `X` is an integer representing the initialization method of this run
+* `pX`, where `X` is an integer representing the physics version used for this run
+
+Interpolative models can't typically be run multiple times to create a statistical ensemble, and the concept of an `ensemble_member` code doesn't apply to them. Nevertheless, for uniformity, we have generalized from the common GCM case, and datasets output by interpolative models have an `ensemble_member` string that does not follow the `rXiXpX` encoding, such as "n/a" or "nominal".
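+
+As an illustrative sketch (this helper is hypothetical, not part of the API), the encoding can be unpacked with a regular expression:
+
+```python
+import re
+
+def parse_ensemble_member(code):
+    """Split an rXiXpX run code into (realization, initialization, physics)
+    integers; return None for non-conforming codes such as the "n/a" or
+    "nominal" strings used by interpolative models."""
+    match = re.fullmatch(r"r(\d+)i(\d+)p(\d+)", code)
+    if match is None:
+        return None
+    return tuple(int(x) for x in match.groups())
+
+print(parse_ensemble_member("r1i1p1"))   # (1, 1, 1)
+print(parse_ensemble_member("nominal"))  # None
+```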
+
+## Climatological Aggregates
+
+While GCMs and RCMs output daily or sub-daily values, most data accessible via this API is normalized as multi-year climatological statistics with monthly, seasonal, or annual resolution.
+
+The statistical methods used to generate the monthly, seasonal, or annual values vary with the nature of the data. For example, a monthly mean precipitation value (`pr`) may be calculated by averaging the daily precipitation over every day in the month. A maximum one-day monthly precipitation value (`rx1day`) may be calculated by taking the single largest total precipitation that falls on one day in the month.
 
-## Climatological Means
+Most of the data is then further aggregated between years over a specified span of time, typically 30 years, the standard in climate science. For example, the January value of a climatological aggregate dataset for 1961-1990 represents the statistically aggregated values for January 1961, January 1962, and so on up to January 1990. The February value represents February 1961, February 1962, and so on.
 
-Most data accessible via this API is normalized as multi-year climatological means. These datasets have been averaged between years over a specified amount of time, typically thirty years. For example, the January value of a precipitation climatological mean dataset for 1961-1990 represents the mean precipitation of January 1961, January 1962, and so on up to January 1990.
+A series of these overlapping 30-year aggregated climatologies is generated from the entire timespan of the model output data. The aggregating function used to create multi-year climatological datasets is either `mean`, to show long-term trends in the numerical quantity being measured, or `stdev`, to show long-term variability of the numerical quantity being measured.
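+
+As a sketch of the aggregation logic (synthetic numbers, not the code this system actually runs), a 30-year climatology reduces thirty values per month to one:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(seed=0)
+
+# Synthetic monthly values for 1961-1990: one row per year, one column per month.
+monthly = rng.normal(size=(30, 12))
+
+# Aggregate between years: `mean` shows the long-term trend,
+# `stdev` the long-term variability.
+climatological_mean = monthly.mean(axis=0)    # 12 values, Jan..Dec
+climatological_stdev = monthly.std(axis=0)    # 12 values, Jan..Dec
+```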
 
-A series of overlapping climatologies are generated from the entire timespan of the model output data.
\ No newline at end of file
+
+A small number of non-aggregated datasets are available, but they are not the primary purpose of this system, and many of the queries are not available for these datasets. These datasets are removed from circulation and replaced with aggregated datasets when possible.
\ No newline at end of file

diff --git a/doc/source/workflow.md b/doc/source/workflow.md
index 4cbdfa99..77040376 100644
--- a/doc/source/workflow.md
+++ b/doc/source/workflow.md
@@ -2,7 +2,9 @@
 
 ## List Available Datasets
 
-Datafiles are organized into ensembles, which collect all data needed for a specific purpose. To view a list of all datasets in a particular ensemble, query the `multimeta` API. The `multimeta` API gives each datafile as a unique identifier string and a collection of attributes describing the data contained within that file. After a datafile of interest has been determined from its metadata attributes, its unique identifier string may be used to request the data.
+Datafiles are organized into ensembles containing all data needed for a specific purpose. (The term "ensemble" is a misnomer in this usage; a more appropriate term would be "collection." For historical reasons the term "ensemble" is embedded in the code, and it is not easily changed at this point.)
+
+To view a list of all datasets in a particular ensemble, query the `multimeta` API. The `multimeta` API gives each datafile as a unique identifier string and a collection of attributes describing the data contained within that file. After a datafile of interest has been determined from its metadata attributes, its unique identifier string may be used to request the data.
 
 The `multi_year_mean` attribute is an important attribute of datafiles. Datafiles with `multi_year_mean` equal to `true` represent climatological means. Each individual value in these files represents a mean taken across years. For example, a monthly climatological mean might cover a span of thirty years, 1961-1990, but feature only twelve timestamps. The January timestamp is the mean of the value for January 1961, January 1962, and so on up to January 1990. The February timestamp is the mean of the values for February 1961, February 1962, and so on. Climatological means may be monthly, seasonal, or annual. This API primarily supports analysis of climatological datasets, and more analysis options are available for them.
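+
+As an illustrative sketch (the base URL is a placeholder, and it assumes the JSON response maps each unique identifier string to its collection of attributes, per the description above), climatological datasets can be picked out of a `multimeta` listing by their `multi_year_mean` attribute:
+
+```python
+import requests
+
+response = requests.get("https://base_url/multimeta",
+                        params={"ensemble_name": "ce_files"})
+response.raise_for_status()
+
+climatological = {
+    unique_id: attributes
+    for unique_id, attributes in response.json().items()
+    if attributes.get("multi_year_mean")  # true for climatological means
+}
+print(len(climatological), "climatological datasets")
+```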
diff --git a/requirements.txt b/requirements.txt
index 5e07f0ef..7149f0d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,7 +12,6 @@ contexttimer==0.3.3
 
 # For documentation
 sphinx
-recommonmark
 m2r
 
 # For testing

diff --git a/setup.py b/setup.py
index 3ce64d16..3a3c89cc 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,6 @@
 import os
 from setuptools import setup, find_packages
+from sphinx.setup_command import BuildDoc
 
 
 def recursive_list(pkg_dir, basedir):
@@ -40,5 +41,15 @@ def find():
     package_data={
         'ce': ['tests/data/*.nc', 'templates/*.html'] + recursive_list('ce/', 'ce/static'),
     },
+    cmdclass={
+        'build_sphinx': BuildDoc,
+    },
+    command_options={
+        'build_sphinx': {
+            'project': ('setup.py', 'ce'),
+            'version': ('setup.py', __version__),
+            'source_dir': ('setup.py', 'doc/source'),
+        }
+    },
     zip_safe=False
 )