From a24d6f12dc286d132259e8a5ce41f31ccc78c49c Mon Sep 17 00:00:00 2001
From: corviday <corvi@clockworksquid.com>
Date: Wed, 18 Dec 2019 12:39:52 -0800
Subject: [PATCH] Clarify wording

---
 README.md                             | 11 ++++++++
 doc/make.bat                          | 35 ++++++++++++++++++++++++
 doc/source/api/api-index.rst          |  1 -
 doc/source/api/api-overview.md        | 13 ++++++---
 doc/source/api/metadata-api.rst       |  4 +--
 doc/source/api/multimeta-api-usage.md |  2 +-
 doc/source/api/watershed-api-usage.md | 11 ++++++--
 doc/source/index.rst                  |  2 +-
 doc/source/overview.md                | 39 ++++++++++++++++++++-------
 doc/source/workflow.md                | 16 ++++++-----
 requirements.txt                      |  1 -
 setup.py                              | 11 ++++++++
 12 files changed, 118 insertions(+), 28 deletions(-)
 create mode 100644 doc/make.bat

diff --git a/README.md b/README.md
index 5fd1195c..37b1c405 100644
--- a/README.md
+++ b/README.md
@@ -33,6 +33,17 @@ $ source venv/bin/activate
 (venv)$ pip install -e .
 ```
 
+### Building the Documentation
+
+Building the docs requires the package to be installed first, as docstrings from installed modules are used to generate code documentation. 
+
+```
+pip install -e .
+pyenv/bin/python setup.py build_sphinx
+```
+
+HTML documentation will then be available in the `doc/build/html` directory.
+
 ### Running the dev server
 
 A development server can be run locally by using the Flask command line interface documented [here](http://flask.pocoo.org/docs/0.12/cli/). In general, you need to set one environment variable FLASK_APP=ce.wsgi:app and can optionally set FLASK_DEBUG=1 for live code reloading.
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 00000000..6247f7e2
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/doc/source/api/api-index.rst b/doc/source/api/api-index.rst
index da58ec19..ddfb6f90 100644
--- a/doc/source/api/api-index.rst
+++ b/doc/source/api/api-index.rst
@@ -3,7 +3,6 @@ API Documentation
 
 .. mdinclude:: api-overview.md
 
-
 .. toctree::
    :maxdepth: 1
    :caption: The backend exposes the following API endpoints:
diff --git a/doc/source/api/api-overview.md b/doc/source/api/api-overview.md
index 87e44690..da4f6e6c 100644
--- a/doc/source/api/api-overview.md
+++ b/doc/source/api/api-overview.md
@@ -1,4 +1,11 @@
-Documentation for each API endpoint is automatically generated from the code and docstring for that API's main function and may not be entirely user-friendly. There are some very minor differences between arguments for each API function and the parameters needed for a web query.
+Documentation for each API endpoint is automatically generated from the code and docstring for that API's main function and may not be entirely user-friendly. There are some minor differences between the internal workings of the API functions and the process of querying them over the web.
 
-1. Web queries do not supply a `sesh` (database session) as an argument; that will be automatically done by the query parser.
-2. Parameters supplied in a query url should be web-encoded.
\ No newline at end of file
+The query URL is constructed from a base URL ending in a slash, followed by the name of the endpoint, a question mark, and then one or more parameters of the form `attribute=value`, separated by ampersands. Parameters supplied via query URL should be web-encoded so that they will be correctly parsed.
+
+The automatically generated API documentation describes a `sesh` (database session) argument to each API function. Database sessions are supplied by the query parser and do not need to be given in the query URL.
+
+For example, the `multimeta` function has a signature of `ce.api.multimeta(sesh, ensemble_name='ce_files', model='')`
+
+The query URL `https://base_url/multimeta?ensemble_name=ce_files&model=CanESM2` calls the `multimeta` endpoint and supplies two arguments for the `multimeta` function: `ensemble_name` is "ce_files" and `model` is "CanESM2". `sesh` is not supplied in the query URL.
+
+The API function return values are converted to JSON for the endpoint response.
\ No newline at end of file
diff --git a/doc/source/api/metadata-api.rst b/doc/source/api/metadata-api.rst
index 8c1cbed6..8198a8c1 100644
--- a/doc/source/api/metadata-api.rst
+++ b/doc/source/api/metadata-api.rst
@@ -10,8 +10,8 @@ metadata
 
 .. warning::
      Parameter names for this endpoint are not consistent with parameter names for the other
-     endpoints. Every other endpoint uses the word "model" to refer to the global climate
-     model (GCM) that produced a particular dataset.
+     endpoints. Every other endpoint uses the word "model" to refer to the General Circulation
+     Model (GCM) or Regional Climate Model (RCM) that produced a particular dataset.
      
      This endpoint uses the "model_id" parameter to reger to a dataset's unique identification
      string, which is called "id_" in every other endpoint.
diff --git a/doc/source/api/multimeta-api-usage.md b/doc/source/api/multimeta-api-usage.md
index c3c2f131..2a6ffb42 100644
--- a/doc/source/api/multimeta-api-usage.md
+++ b/doc/source/api/multimeta-api-usage.md
@@ -10,7 +10,7 @@ This endpoint is intended to provide an overview of all available datasets to en
 * `variables`: A list of variables in this dataset, with name and a short description. Variables are the numerical quantities being measured or projected, such as maximum temperature, precipitation, or derived indices.
 * `ensemble_member`: A model may be run multiple times with different initialization conditions; data from these runs is distinguished by the ensemble_member attribute
 * `timescale`: The temporal resolution of the data. `monthly`, `seasonal`, or `yearly`
-* `multi_year_mean`: Whether or not this datafile is a climatological mean. Climatological means 
+* `multi_year_mean`: Whether or not this datafile is a climatological aggregate. In a climatological aggregate dataset, the value at each timestamp represents a combination of values at that timestamp across multiple years. For example, a 1961-1990 climatological mean would have one value for January that represented the mean value of January 1961, January 1962, etc. 
 * `start_date`: The start of the temporal interval described by this dataset 
 * `end_date`: The end of the temporal interval described by this dataset
 * `modtime`: The most recent data this dataset was updated. Useful for determining whether to cache data.
\ No newline at end of file
diff --git a/doc/source/api/watershed-api-usage.md b/doc/source/api/watershed-api-usage.md
index 0dc2d4f6..2f4a5a10 100644
--- a/doc/source/api/watershed-api-usage.md
+++ b/doc/source/api/watershed-api-usage.md
@@ -2,7 +2,7 @@ This API endpoint provides contextual information about the watershed that drain
 
 Every grid cell is defined as flowing into a single other grid cell, so this data is most reliable for larger watersheds representing at least ten grid cells, and completely inappropriate for creeks or culverts smaller than a single grid cell. At small scales, streamflow variations within grid cells, not capturable by a gridded dataset, play too large a role.
 
-## Hypometric curve
+## Hypsometric curve
 The `hypsometric_curve` object defines a histogram of area by elevation. 
 
 * Elevation bins are of equal width, `w = elevation_bin_width`.
@@ -11,4 +11,11 @@ The `hypsometric_curve` object defines a histogram of area by elevation.
 * The sum of areas of all cells with elevation falling into elevation bin `k` is given by `ak = cumulative_areas[k]`.
 * Units of elevation and area are specified by the properties `elevation_units` and `area_units`.
 
-Because elevation data is given as the mean of each grid square it is possible to have nonzero values for two elevation bins `ek` and `ek+2` but a zero value for `ek+1` in steep landscapes.
\ No newline at end of file
+### Gaps in the hypsometric curve, or "empty" bins
+For large areas of the earth and reasonably large elevation bins, we expect to see non-zero cumulative areas for each elevation bin between the minimum and maximum elevation over that area. In other words, there should be at least some area at each elevation in the histogram.
+
+However, for small areas with steep topography, it is common to see some of the elevation bins between min and max elevation with zero area. This is not an error in either the computation or the data that feeds it. It is instead a product of the fact that `n` surface grid cells can represent at most `n` elevations.
+
+Consider the most extreme case of `n = 2` cells that happen to be positioned at the edge of a steep sided valley. One cell is in the valley bottom with an average elevation of 100 m. The other cell, just adjacent to it, mostly covers the highland above, with an average elevation of 500 m. In a histogram with 100 m bin width, we'd see non-zero areas for the 100 m bin and the 500 m bin, but zero areas for the 200 m, 300 m, and 400 m elevation bins, and in the graph these would look like gaps.
+
+We can see a similar effect for other small values of `n > 2` in steep terrain too. Once `n` becomes large enough, then the likelihood of an elevation bin not having some cells is quite low and these gaps do not appear.
\ No newline at end of file
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 5cc19e84..df9a5757 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -6,7 +6,7 @@
 PCIC Climate Explorer Backend Documentation
 ============================================
 
-This backend serves climate raster data and metadata to the 
+This backend serves climate data and metadata to the 
 `PCIC Climate Explorer (PCEX) <https://services.pacificclimate.org/pcex/app/#/data/climo/ce_files>`_
 and other applications.
 
diff --git a/doc/source/overview.md b/doc/source/overview.md
index e8aaf6db..0cfb5e1e 100644
--- a/doc/source/overview.md
+++ b/doc/source/overview.md
@@ -1,32 +1,51 @@
 # Overview
 
-## Raster Data
-This set of APIs provide queries for retrieving statistical and summary data from multidimensional raster datasets containing climate data.
+## Data Matrix
+This set of APIs provides queries for retrieving statistical and summary data from a multidimensional data matrix containing climate data.
 
-Raster datasets treat data as a regular matrix. Each dataset has a list of latitudes, a list of longitudes, a list of timestamps, and a list of variables such as temperature, precipitation, or derived values like growing degree days. A value may be accessed for every combination of latitude, longitude, timestamp, and variable, which represents the value of that variable at that particular point in time and space. 
+The data is presented as a regular matrix. Each dataset has a list of latitudes, a list of longitudes, a list of timestamps, and a list of variables such as temperature, precipitation, or derived values like growing degree days. A value may be accessed for every combination of latitude, longitude, timestamp, and variable, which represents the value of that variable at that particular point in time and space. 
 
 These data are interpolated from real-world observations or generated by General Circulation Models or Regional Climate Models, which are able to provide complete data coverage. Individual points may be masked off as missing, but this is typically only done to mark the boundaries of irregular geographic extents, like coastlines. The data should be assumed to be dense within the spatial and temporal boundaries.
 
 
 ## Data Access
-This system is not designed to provide individual access to every point in the raster data matrix, but calculates and makes available summary data over user-defined spatial and temporal scales to support analysis and visualization. All numerical API endpoints accept a spatial area of interest (specified in URL-encoded WKT as a point or polygon) and return the mean of all cells within the designated area. If no area is designated, the mean across the entire spatial extent of the dataset is returned. 
+This system is not designed to provide individual access to every point in the data matrix, but calculates and makes available summary data over user-defined spatial and temporal scales to support analysis and visualization. All numerical API endpoints accept a spatial area of interest (specified in URL-encoded WKT as a point or polygon) and return the mean (or other specified statistical measure) of all cells within the designated area. If no area is designated, the mean across the entire spatial extent of the dataset is returned. 
 
 Datafiles are organized by temporal range and resolution.
 
 ## Modeled Data
 
-Most data accessible by the API is climate data from a General Circulation Model (GCM) or Regional Climate Model (RCM). These numerical models represent physical processes in the atmosphere, ocean, cryosphere, and land surface of the earth. They simulate the response of the global climate system. The models output meteorological projections like temperature and precipitation. Other available variables represent statistical indices derived from the meteorological outputs.
+Most data accessible by the API is climate data from a General Circulation Model (GCM) or Regional Climate Model (RCM). These numerical models represent physical processes in the atmosphere, ocean, cryosphere, and land surface of the earth. They simulate the response of the global climate system. Model outputs are meteorological variables such as temperature and precipitation. Temporally, they typically cover the period 1950 to 2100. Data for any dates in the future are termed "projections." Other variables represent statistical indices calculated from the meteorological variables output by models, and are associated with the same model that output the base data. 
+
+A small minority of the datasets accessible by this API are not GCM or RCM outputs, but are intended to help provide additional context to the GCM and RCM outputs. These contextual data were instead created by models that interpolate observed data to fill spatial and temporal gaps. These non-GCM interpolated datasets have no projection data associated with dates in the future.
 
 ### Emissions Scenarios
 
-Each dataset has an associated emissions scenario, sometimes called `experiment`. Emissions scenarios represent a range of possible future projections for greenhouse gas concentrations. GCMs and RCMs are typically driven by historical data on greenhouse gas concentrations when modeling the past and present, before using an emissions scenario to provide input on greenhouse gas concentrations to drive future projections.
+Each dataset has an associated emissions scenario. Emissions scenarios represent a range of possible future projections for greenhouse gas concentrations. GCMs and RCMs are typically driven by historical data on greenhouse gas concentrations when modeling the past and present, before using an emissions scenario (typically a Representative Concentration Pathway scenario defined by the IPCC) to provide input on greenhouse gas concentrations to drive future projections.
+
+Datasets created by interpolative models feature only historical emissions scenarios.
+
+The parameter name `experiment` is used to designate an emissions scenario in the API and code.
 
 ### Runs
 
-Each dataset has an associated run string, sometimes called `ensemble_member`, which represents the initialization settings of the simulating model. A GCM or RCM may be run multiple times with different initialization conditions.  
+Each dataset has an associated run string, which represents the initialization settings of the simulating model. A GCM or RCM may be run multiple times with different initialization conditions. A collection of related runs with different initializations of the same model comprises a statistical ensemble, which can be used to give some idea of the range of possible outcomes of the model system.
+
+GCMs and RCMs follow a standard encoding for the members of a statistical ensemble, `rXiXpX`, which is provided to the API as the parameter `ensemble_member`:
+* rX where X is an integer representing the realization of the run
+* iX where X is an integer representing the initialization method of this run
+* pX where X is an integer representing the physics version used for this run.
+
+Interpolative models can't typically be run multiple times to create a statistical ensemble, and the concept of an ensemble_member code doesn't apply. Nevertheless, for uniformity, we have generalized from the common GCM case, and datasets output by interpolative models have an `ensemble_member` string that does not follow the `rXiXpX` encoding, such as "n/a" or "nominal".
+
+## Climatological Aggregates
+
+While GCM and RCM models output daily or sub-daily projections, most data accessible via this API is normalized as multi-year climatological statistics with monthly, seasonal, or annual resolution.
+
+The statistical methods used to generate the monthly, seasonal, or annual values vary with the nature of the data. For example, a monthly mean precipitation value (`pr`) may be calculated by taking the mean daily precipitation for every day in a month. A maximum one-day monthly precipitation value (`rx1day`) may be calculated by taking the single largest total precipitation that falls on one day in the month.
 
-## Climatological Means
+Most of the data is then further aggregated between years over a specified amount of time, typically 30 years, the standard in climate science. For example, the January value of a climatological aggregate dataset for 1961-1990 represents the statistically aggregated values for January 1961, January 1962, and so on up to January 1990. The February value represents February 1961, February 1962, and so on. 
 
-Most data accessible via this API is normalized as multi-year climatological means. These datasets have been averaged between years over a specified amount of time, typically thirty years. For example, the January value of a precipitation climatological mean dataset for 1961-1990 represents the mean precipitation of January 1961, January 1962, and so on up to January 1990. 
+A series of these overlapping 30-year aggregated climatologies is generated from the entire timespan of the model output data. The aggregating function used to create multi-year climatological datasets is either `mean`, to show long term trends in the numerical quantity being measured, or `stdev`, to show long term variability of the numerical quantity being measured.
 
-A series of overlapping climatologies are generated from the entire timespan of the model output data.
\ No newline at end of file
+A small number of non-aggregated datasets are available, but they are not the primary purpose of this system, and many of the queries are not available for these datasets. These datasets are removed from circulation and replaced with aggregated datasets when possible.
\ No newline at end of file
diff --git a/doc/source/workflow.md b/doc/source/workflow.md
index 4cbdfa99..0905a5b4 100644
--- a/doc/source/workflow.md
+++ b/doc/source/workflow.md
@@ -2,21 +2,23 @@
 
 ## List Available Datasets
 
-Datafiles are organized into ensembles, which collect all data needed for a specific purpose. To view a list of all datasets in a particular ensemble, query the `multimeta` API. The `multimeta` API gives each datafile as a unique identifier string and a collection of attributes describing the data contained within that file. After a datafile of interest has been determined from its metadata attributes, its unique identifier string may be used to request the data.
+Datafiles are organized into ensembles containing all data needed for a specific purpose. (The term "ensemble" is a misnomer in this usage; a more appropriate term would be "collection." For historical reasons the term "ensemble" is embedded in the code and it is not easily changed at this point.)
 
-The `multi_year_mean` attribute is an important attribute of datafiles. Datafiles with `multi_year_mean` equal to `true` represent climatological means. Each individual value in these files represents a mean taken across years. For example, a monthly climatological mean might cover a span of thirty years, 1961-1990, but feature only twelve timestamps. The January timestamp is the mean of the value for January 1961, January 1962, and so on up to January 1990. The February timestamp is the mean of the values for February 1961, February 1962, and so on. Climatological means may be monthly, seasonal, or annual. This API primarily supports analysis of climatological datasets, and more analysis options are available for them.
+To view a list of all datasets in a particular ensemble, query the `multimeta` API. The `multimeta` API describes each datafile with a unique identifier string and a collection of attributes describing the data contained within that file. After a datafile of interest has been determined from its metadata attributes, its unique identifier string may be used to request the data.
+
+The `multi_year_mean` attribute is an important attribute of datafiles. Datafiles with `multi_year_mean` equal to `true` represent climatological aggregates. Each individual value in these files represents a mean or standard deviation calculated across multiple years, typically thirty years, which is standard in climate science. For example, a monthly climatological mean might cover 1961-1990, but feature only twelve timestamps. The January timestamp is the mean of the value for January 1961, January 1962, and so on up to January 1990. The February timestamp is the mean of the values for February 1961, February 1962, and so on. Climatological means may be monthly, seasonal, or annual. This API primarily supports analysis of climatological datasets, and more analysis options are available for them.
 
 Datasets with `multi_year_mean` equal to `false` represent nominal time datasets; no aggregation has been done between years. A monthly dataset covering 1961-1990 would feature 360 timestamps.
 
-## Request Numerical Data From A Climatological Mean Datafile
+## Request Numerical Data From A Climatological Aggregate Datafile
 
-The `timeseries` endpoint returns a timeseries with the mean value of every timestamp in the datafile. It requires a datafile's unique identification string, and optionally a spatial area of interest defined as a Well Known Text (WKT) Polygon or Point. For a climatological mean datafile, the resulting timeseries represents an average annual cycle over the period described by the dataset. The annual cycle may have twelve monthly values, four seasonal values, or a single annual value.
+The `timeseries` endpoint returns a timeseries with the mean value for each timestamp in the datafile. It requires a datafile's unique identification string, and optionally a spatial area of interest defined as a Well Known Text (WKT) Polygon or Point. For a climatological aggregate datafile, the resulting timeseries represents an average annual cycle over the period described by the dataset. The annual cycle may have twelve monthly values, four seasonal values, or a single annual value.
 
-The `stats` endpoint returns statistical measures (`mean`, `stdev`, `min`, `max`, and `median`) of a single dataset identified by its unique identification string, a temporal index, and optionally a spatial area of interest defined as a WKT Polygon or Point.  
+The `stats` endpoint returns statistical measures (`mean`, `stdev`, `min`, `max`, and `median`) of a single dataset identified by its unique identification string. The timestep of interest is defined by a temporal index. An optional spatial area of interest may be defined as a WKT Polygon or Point. The statistical measures will be calculated over the time and space extent within the dataset.
 
-You may wish to compare the statistical measure of this particular climatological mean datafile against the statistical measure of climatological means of other timespans to observe change over time. For example, if the file of interest is a climatological mean across the years 1961-1990, there may also be datafiles representing the same data for 1971-2000, 1981-2010, or other spans available. The `multistats` query may be called with parameters that describe a set of datasets by specifying all parameters except the start and end dates, as well as a time index and optional spatial area of interest. It responds with the same information as the `stats` query, but for every datafile that matches the parameters.
+You may wish to compare the statistical measures of a related set of climatological aggregate datafiles. The `multistats` query functions similarly to the `stats` query, but on several files that share common parameters at once. The `multistats` query may be called with parameters that describe a set of datasets by specifying all parameters except the start and end dates, as well as a time index and optional spatial area of interest. It responds with the same information as the `stats` query, but for every datafile that matches the parameters.
 
-Similarly, the `data` API is also queried by submitting parameters that describe a set of datafiles by specifying all parameters except the start and end dates, as well as a time index and optional spatial area of interest. It returns a timeseries constructed from all climatological mean files that meet the parameters. For example, it would return the January value for the 1961-1990 climatology, the January value for the 1971-2000 climatology, etc, to make a timeseries showing projected long-term change over time of the mean value for January.
+Similarly, the `data` API is also queried by submitting parameters that describe a set of datafiles by specifying all parameters except the start and end dates, as well as a time index and optional spatial area of interest. It returns a timeseries constructed from all climatological aggregate files that meet the parameters. For example, it would return the January value for the 1961-1990 climatology, the January value for the 1971-2000 climatology, etc., to make a timeseries showing projected long-term change over time of the mean value for January.
 
 ## Request A Non-Climatological Timeseries
 
diff --git a/requirements.txt b/requirements.txt
index 5e07f0ef..7149f0d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,7 +12,6 @@ contexttimer==0.3.3
 
 # For documentation
 sphinx
-recommonmark
 m2r
 
 # For testing
diff --git a/setup.py b/setup.py
index 3ce64d16..3a3c89cc 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,6 @@
 import os
 from setuptools import setup, find_packages
+from sphinx.setup_command import BuildDoc
 
 
 def recursive_list(pkg_dir, basedir):
@@ -40,5 +41,15 @@ def find():
     package_data={
         'ce': ['tests/data/*.nc', 'templates/*.html'] + recursive_list('ce/', 'ce/static'),
     },
+    cmdclass = {
+        'build_sphinx': BuildDoc
+        },
+    command_options={
+        'build_sphinx': {
+            'project': ('setup.py', "ce"),
+            'version': ('setup.py', __version__),
+            'source_dir': ('setup.py', 'doc/source')
+            }
+        },
     zip_safe=False
 )