From 8ae5f34bafb2c7f94f78c53cd75a95285fc32adb Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Wed, 9 Oct 2024 18:26:03 +0100 Subject: [PATCH 01/10] feat: add cmdstan-based optimizer for augurs-prophet This commit adds an implementation of `Optimizer` to the `augurs-prophet` crate that uses `cmdstan` to optimize or sample from the Prophet model, similar to the Prophet Python package. The `cmdstan` feature is disabled by default since it requires external dependencies and may not work well on all platforms. The `compile-cmdstan` feature can be enabled to build the Prophet model at build time. This requires a working installation of `stan` and setting the `STAN_PATH` environment variable to the path to the `stan` installation. This will embed the Prophet model binary and the libtbb dynamic library into the final executable, which will increase the size of the final executable by about 2MB, but the final binary won't require any additional dependencies. There's an example in the `examples` directory which fits and predicts using cmdstan. It can be runwith something like: ```bash STAN_PATH=/home/ben/.conda/envs/stan cargo run --example prophet_cmdstan ``` (replacing the `STAN_PATH` value with the path to your Stan installation). --- Cargo.toml | 1 + crates/augurs-outlier/Cargo.toml | 2 +- crates/augurs-prophet/.gitignore | 1 + crates/augurs-prophet/Cargo.toml | 9 + crates/augurs-prophet/build.rs | 70 +++ crates/augurs-prophet/build/.gitkeep | 0 crates/augurs-prophet/prophet.stan | 144 ++++++ crates/augurs-prophet/src/cmdstan.rs | 455 ++++++++++++++++++ crates/augurs-prophet/src/lib.rs | 3 + crates/augurs-prophet/src/optimizer.rs | 46 ++ crates/augurs-prophet/src/positive_float.rs | 1 + crates/augurs/Cargo.toml | 2 + examples/forecasting/Cargo.toml | 2 +- .../forecasting/examples/prophet_cmdstan.rs | 20 + 14 files changed, 754 insertions(+), 2 deletions(-) create mode 100644 crates/augurs-prophet/.gitignore create mode 100644 crates/augurs-prophet/build.rs create mode 100644 crates/augurs-prophet/build/.gitkeep create mode 100644 crates/augurs-prophet/prophet.stan create mode 100644 crates/augurs-prophet/src/cmdstan.rs create mode 100644 examples/forecasting/examples/prophet_cmdstan.rs diff --git a/Cargo.toml b/Cargo.toml index 43806cd1..baac1126 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,6 +45,7 @@ rand = "0.8.5" roots = "0.0.8" serde = { version = "1.0.166", features = ["derive"] } statrs = "0.17.1" +serde_json = "1.0.128" thiserror = "1.0.40" tinyvec = "1.6.0" tracing = "0.1.37" diff --git a/crates/augurs-outlier/Cargo.toml b/crates/augurs-outlier/Cargo.toml index ec4d2b19..8b968cbe 100644 --- a/crates/augurs-outlier/Cargo.toml +++ b/crates/augurs-outlier/Cargo.toml @@ -26,7 +26,7 @@ tracing.workspace = true [dev-dependencies] augurs.workspace = true -serde_json = "1.0.128" +serde_json.workspace = true [features] parallel = ["rayon"] diff --git a/crates/augurs-prophet/.gitignore b/crates/augurs-prophet/.gitignore new file mode 100644 index 00000000..796b96d1 --- /dev/null +++ b/crates/augurs-prophet/.gitignore @@ -0,0 +1 @@ +/build diff --git a/crates/augurs-prophet/Cargo.toml b/crates/augurs-prophet/Cargo.toml index 17387796..4fa6c614 100644 --- a/crates/augurs-prophet/Cargo.toml +++ b/crates/augurs-prophet/Cargo.toml @@ -15,6 +15,9 @@ itertools.workspace = true num-traits.workspace = true rand.workspace = true statrs.workspace = true +serde = { workspace = true, optional = true, features = ["derive"] } +serde_json = { workspace = true, optional = true } +tempfile = { version = "3.13.0", optional = true } thiserror.workspace = true [dev-dependencies] @@ -22,5 +25,11 @@ augurs-testing.workspace = true chrono.workspace = true pretty_assertions.workspace = true +[build-dependencies] +tempfile = { version = "3.13.0", optional = true } + [features] bytemuck = ["dep:bytemuck"] +cmdstan = ["dep:tempfile", "dep:serde_json", "serde"] +compile-cmdstan = ["cmdstan", "dep:tempfile"] +serde = ["dep:serde"] diff --git a/crates/augurs-prophet/build.rs b/crates/augurs-prophet/build.rs new file mode 100644 index 00000000..ec5e03c3 --- /dev/null +++ b/crates/augurs-prophet/build.rs @@ -0,0 +1,70 @@ +/// Compile the Prophet model (in prophet.stan) to a binary, +/// using the Makefile in the cmdstan installation. +/// +/// This requires: +/// - The `STAN_PATH` environment variable to be set to the +/// path to the Stan installation. +#[cfg(all(feature = "cmdstan", feature = "compile-cmdstan"))] +fn compile_model() -> Result<(), Box> { + use std::{fs, path::PathBuf, process::Command}; + use tempfile::TempDir; + + println!("cargo::rerun-if-changed=prophet.stan"); + println!("cargo::rerun-if-env-changed=CMDSTAN_PATH"); + + let stan_path: PathBuf = std::env::var("STAN_PATH") + .expect("STAN_PATH not set") + .parse() + .expect("invalid STAN_PATH"); + let cmdstan_bin_path = stan_path.join("bin/cmdstan"); + let model_stan = include_bytes!("./prophet.stan"); + + let build_dir = PathBuf::from("./build"); + fs::create_dir_all(&build_dir).expect("could not create build directory"); + + // Write the Prophet Stan file to a named file in a temporary directory. + let tmp_dir = TempDir::new()?; + let prophet_stan_path = tmp_dir.path().join("prophet.stan"); + eprintln!("Writing Prophet model to {}", prophet_stan_path.display()); + fs::write(tmp_dir.path().join("prophet.stan"), model_stan)?; + + // The Stan Makefile expects to see the path to the final executable + // file (without the .stan extension). It will build the executable + // at this location. + let tmp_exe_path = prophet_stan_path.with_extension(""); + + // Execute the cmdstan make command pointing at the expected + // prophet file. + eprintln!("Compiling Prophet model to {}", tmp_exe_path.display()); + let mut cmd = Command::new("make"); + cmd.current_dir(cmdstan_bin_path).arg(&tmp_exe_path); + eprintln!("Executing {:?}", cmd); + let output = cmd.output()?; + if !output.status.success() { + return Err(format!("make failed: {}", String::from_utf8_lossy(&output.stderr)).into()); + } + eprintln!("Successfully compiled Prophet model"); + + // Copy the executable to the final location. + let dest_exe_path = build_dir.join("prophet"); + std::fs::copy(tmp_exe_path, &dest_exe_path)?; + eprintln!("Copied prophet exe to {}", dest_exe_path.display()); + + // Copy libtbb to the final location. + let libtbb_path = stan_path.join("lib/libtbb.so.12"); + let dest_libtbb_path = build_dir.join("libtbb.so.12"); + std::fs::copy(&libtbb_path, &dest_libtbb_path)?; + eprintln!( + "Copied libtbb.so from {} to {}", + libtbb_path.display(), + dest_libtbb_path.display(), + ); + + Ok(()) +} + +fn main() -> Result<(), Box> { + #[cfg(all(feature = "cmdstan", feature = "compile-cmdstan"))] + compile_model()?; + Ok(()) +} diff --git a/crates/augurs-prophet/build/.gitkeep b/crates/augurs-prophet/build/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/crates/augurs-prophet/prophet.stan b/crates/augurs-prophet/prophet.stan new file mode 100644 index 00000000..7a091b4a --- /dev/null +++ b/crates/augurs-prophet/prophet.stan @@ -0,0 +1,144 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +functions { + matrix get_changepoint_matrix(vector t, vector t_change, int T, int S) { + // Assumes t and t_change are sorted. + matrix[T, S] A; + row_vector[S] a_row; + int cp_idx; + + // Start with an empty matrix. + A = rep_matrix(0, T, S); + a_row = rep_row_vector(0, S); + cp_idx = 1; + + // Fill in each row of A. + for (i in 1:T) { + while ((cp_idx <= S) && (t[i] >= t_change[cp_idx])) { + a_row[cp_idx] = 1; + cp_idx = cp_idx + 1; + } + A[i] = a_row; + } + return A; + } + + // Logistic trend functions + + vector logistic_gamma(real k, real m, vector delta, vector t_change, int S) { + vector[S] gamma; // adjusted offsets, for piecewise continuity + vector[S + 1] k_s; // actual rate in each segment + real m_pr; + + // Compute the rate in each segment + k_s = append_row(k, k + cumulative_sum(delta)); + + // Piecewise offsets + m_pr = m; // The offset in the previous segment + for (i in 1:S) { + gamma[i] = (t_change[i] - m_pr) * (1 - k_s[i] / k_s[i + 1]); + m_pr = m_pr + gamma[i]; // update for the next segment + } + return gamma; + } + + vector logistic_trend( + real k, + real m, + vector delta, + vector t, + vector cap, + matrix A, + vector t_change, + int S + ) { + vector[S] gamma; + + gamma = logistic_gamma(k, m, delta, t_change, S); + return cap .* inv_logit((k + A * delta) .* (t - (m + A * gamma))); + } + + // Linear trend function + + vector linear_trend( + real k, + real m, + vector delta, + vector t, + matrix A, + vector t_change + ) { + return (k + A * delta) .* t + (m + A * (-t_change .* delta)); + } + + // Flat trend function + + vector flat_trend( + real m, + int T + ) { + return rep_vector(m, T); + } +} + +data { + int T; // Number of time periods + int K; // Number of regressors + vector[T] t; // Time + vector[T] cap; // Capacities for logistic trend + vector[T] y; // Time series + int S; // Number of changepoints + vector[S] t_change; // Times of trend changepoints + matrix[T,K] X; // Regressors + vector[K] sigmas; // Scale on seasonality prior + real tau; // Scale on changepoints prior + int trend_indicator; // 0 for linear, 1 for logistic, 2 for flat + vector[K] s_a; // Indicator of additive features + vector[K] s_m; // Indicator of multiplicative features +} + +transformed data { + matrix[T, S] A = get_changepoint_matrix(t, t_change, T, S); + matrix[T, K] X_sa = X .* rep_matrix(s_a', T); + matrix[T, K] X_sm = X .* rep_matrix(s_m', T); +} + +parameters { + real k; // Base trend growth rate + real m; // Trend offset + vector[S] delta; // Trend rate adjustments + real sigma_obs; // Observation noise + vector[K] beta; // Regressor coefficients +} + +transformed parameters { + vector[T] trend; + if (trend_indicator == 0) { + trend = linear_trend(k, m, delta, t, A, t_change); + } else if (trend_indicator == 1) { + trend = logistic_trend(k, m, delta, t, cap, A, t_change, S); + } else if (trend_indicator == 2) { + trend = flat_trend(m, T); + } +} + +model { + //priors + k ~ normal(0, 5); + m ~ normal(0, 5); + delta ~ double_exponential(0, tau); + sigma_obs ~ normal(0, 0.5); + beta ~ normal(0, sigmas); + + // Likelihood + y ~ normal_id_glm( + X_sa, + trend .* (1 + X_sm * beta), + beta, + sigma_obs + ); +} + diff --git a/crates/augurs-prophet/src/cmdstan.rs b/crates/augurs-prophet/src/cmdstan.rs new file mode 100644 index 00000000..d95aa153 --- /dev/null +++ b/crates/augurs-prophet/src/cmdstan.rs @@ -0,0 +1,455 @@ +//! Use `cmdstan` to optimize or sample from the Prophet model. +//! +//! This module provides an [`Optimizer`] implementation that uses `cmdstan` to +//! optimize or sample from the Prophet model. `cmdstan` is a C++ program provided +//! by Stan which is used to compile the Stan model into a binary executable. +//! That executable can then be used to optimize or sample from the model, +//! passing data in and out over the filesystem. +//! +//! # Usage +//! +//! There are two ways to use this optimizer: +//! +//! 1. Use the `compile-cmdstan` feature to build the Prophet Stan model at build time. +//! This requires a working installation of `stan` and setting the `STAN_PATH` +//! environment variable to the path to the `stan` installation. +//! This will embed the Prophet model binary and the libtbb dynamic library into +//! the final executable, which will increase the size of the final executable by about +//! 2MB, but the final binary won't require any additional dependencies. +//! 2. Use [`CmdstanOptimizer::with_custom_prophet`] to create a new `CmdstanOptimizer` using +//! a precompiled Prophet Stan model. This model can be obtained by either manually building +//! the Prophet model (which still requires a working Stan installation) or extracting it from +//! the Prophet Python package for your target platform. +//! +//! # Gotchas +//! +//! - the `compile-cmdstan` feature may not work on all platforms. If you encounter +//! problems, try using the second method instead. +//! - the Prophet model binary is not statically linked to `libtbb`, since doing so is +//! not recommended by the Stan developers or TBB developers. This means that the +//! final binary will require the `libtbb` dynamic library to be present at runtime. +//! The first method takes care of this for you, but if using the second you'll need +//! to make sure the `libtbb` dynamic library is available at runtime. You can do this +//! by copying the `libtbb` dynamic library to a known directory and setting the +//! `LD_LIBRARY_PATH` environment variable to that directory. +use std::{ + io, + num::{ParseFloatError, ParseIntError}, + path::{Path, PathBuf}, + process::Command, +}; + +use rand::{thread_rng, Rng}; + +use crate::{optimizer, Optimizer, PositiveFloat, TryFromFloatError}; + +/// Errors that can occur when trying to run optimization using cmdstan. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// An I/O error occurred when trying to run the Prophet model executable. + #[error("Error running Prophet at {command}: {source}")] + ProphetIOError { + /// The stringified command that was attempted. + command: String, + /// The source error. + #[source] + source: io::Error, + }, + /// An error occurred when running the Prophet model executable. + #[error("Error {}running Prophet command ({command}): {stdout}\n{stderr}", code.map(|c| format!("code {}", c)).unwrap_or_default())] + ProphetExeError { + /// The stringified command that was attempted. + command: String, + /// The status code, if provided by the OS. + code: Option, + /// Stdout from the command. + stdout: String, + /// Stderr from the command. + stderr: String, + }, + /// An error occurred when creating a temporary directory. + #[error("Error creating temporary directory: {0}")] + CreateTempDir( + /// The source error. + #[from] + io::Error, + ), + /// An error occurred when serializing the initial parameters to JSON. + #[error("Error serializing initial parameters to JSON: {0}")] + SerializeInits( + /// The source error. + #[source] + serde_json::Error, + ), + /// An error occurred when writing the initial parameters to a temporary file. + #[error("Error writing initial parameters to temporary file: {0}")] + WriteInits( + /// The source error. + #[source] + io::Error, + ), + /// An error occurred when serializing the data to JSON. + #[error("Error serializing data to JSON: {0}")] + SerializeData( + /// The source error. + #[source] + serde_json::Error, + ), + /// An error occurred when writing the data to a temporary file. + #[error("Error writing data to temporary file: {0}")] + WriteData( + /// The source error. + #[source] + io::Error, + ), + /// An error occurred when reading the output from Stan. + #[error("Error reading output from Prophet at {path}: {source}")] + ReadOutput { + /// The path to the output file. + path: PathBuf, + /// The source error. + #[source] + source: io::Error, + }, + /// No header row was found in the output file. + #[error("No header row was found in the output file")] + NoHeader, + /// No data row was found in the output file. + #[error("No data row was found in the output file")] + NoData, + /// An invalid integer was found in the output header row. + /// + /// Vector parameters should appear in the form `.` + /// where `` is a one-indexed integer. + #[error("Invalid int in output: {source}")] + InvalidInt { + /// The source error. + #[from] + source: ParseIntError, + }, + /// An invalid float was found in the output data row. + #[error("Invalid float in output: {source}")] + InvalidFloat { + /// The source error. + #[from] + source: ParseFloatError, + }, + /// An invalid positive float was found in the output data row. + #[error("Invalid positive float in output: {source}")] + InvalidPositiveFloat { + /// The source error. + #[from] + source: TryFromFloatError, + }, + /// Some parameters were missing from the output. + #[error("Some parameters were not found in the output: {0:?}")] + MissingOptimizedParameters(OptimizedParamsFound), +} + +/// Helper struct to track which parameters were found in the output. +#[derive(Debug, Default, PartialEq, Eq)] +pub struct OptimizedParamsFound { + /// The `k` parameter was found in the output. + pub k: bool, + /// The `m` parameter was found in the output. + pub m: bool, + /// The `sigma_obs` parameter was found in the output. + pub sigma_obs: bool, + /// The `delta` parameter was found in the output. + pub delta: bool, + /// The `beta` parameter was found in the output. + pub beta: bool, + /// The `trend` parameter was found in the output. + pub trend: bool, +} + +#[derive(Debug)] +struct ProphetInstallation { + #[cfg(feature = "compile-cmdstan")] + _dir: Option, + lib_dir: Option, + prophet_binary_path: PathBuf, +} + +impl Clone for ProphetInstallation { + fn clone(&self) -> Self { + Self { + // We can't clone the temporary file but we also don't really need it. + // We just need the path to the Prophet binary and library directory. + #[cfg(feature = "compile-cmdstan")] + _dir: None, + lib_dir: self.lib_dir.clone(), + prophet_binary_path: self.prophet_binary_path.clone(), + } + } +} + +impl ProphetInstallation { + fn command(&self) -> Command { + let mut cmd = Command::new(&self.prophet_binary_path); + if let Some(lib_dir) = &self.lib_dir { + cmd.env("LD_LIBRARY_PATH", lib_dir); + } + cmd + } +} + +struct OptimizeCommand<'a> { + installation: &'a ProphetInstallation, + init: &'a optimizer::InitialParams, + data: &'a optimizer::Data, + opts: &'a optimizer::OptimizeOpts, +} + +impl<'a> OptimizeCommand<'a> { + fn run(&self) -> Result { + // Set up temp dir and files. + let tempdir = tempfile::tempdir()?; + let init_path = tempdir.path().join("init.json"); + std::fs::write( + &init_path, + serde_json::to_vec(&self.init).map_err(Error::SerializeInits)?, + ) + .map_err(Error::WriteInits)?; + let data_path = tempdir.path().join("data.json"); + std::fs::write( + &data_path, + serde_json::to_vec(&self.data).map_err(Error::SerializeData)?, + ) + .map_err(Error::WriteData)?; + let output_path = tempdir.path().join("prophet_output.csv"); + + // Run the command. + let mut command = self.command(&data_path, &init_path, &output_path); + let output = command.output().map_err(|source| Error::ProphetIOError { + command: format!("{:?}", command), + source, + })?; + if !output.status.success() { + return Err(Error::ProphetExeError { + command: format!("{:?}", command), + code: output.status.code(), + stdout: String::from_utf8_lossy(&output.stdout).to_string(), + stderr: String::from_utf8_lossy(&output.stderr).to_string(), + }); + } + + self.parse_output(output_path) + } + + fn parse_output(&self, output_path: PathBuf) -> Result { + let output = std::fs::read_to_string(&output_path).map_err(|source| Error::ReadOutput { + path: output_path, + source, + })?; + let mut lines = output.lines().skip_while(|line| line.starts_with('#')); + let header = lines.next().ok_or(Error::NoHeader)?.split(','); + let data = lines + .next() + .ok_or(Error::NoData)? + .split(',') + .map(|val| val.parse()); + + let mut delta_indices: Vec = Vec::new(); + let mut beta_indices: Vec = Vec::new(); + let mut trend_indices: Vec = Vec::new(); + let mut out = optimizer::OptimizedParams { + k: 0.0, + m: 0.0, + sigma_obs: PositiveFloat::one(), + delta: Vec::new(), + beta: Vec::new(), + trend: Vec::new(), + }; + let mut found = OptimizedParamsFound::default(); + for (name, val) in header.zip(data) { + match name.split_once('.') { + Some(("delta", i)) => { + found.delta = true; + delta_indices.push(i.parse()?); + out.delta.push(val?); + } + Some(("beta", i)) => { + found.beta = true; + beta_indices.push(i.parse()?); + out.beta.push(val?); + } + Some(("trend", i)) => { + found.trend = true; + trend_indices.push(i.parse()?); + out.trend.push(val?); + } + None | Some((_, _)) => match name { + "k" => { + found.k = true; + out.k = val?; + } + "m" => { + found.m = true; + out.m = val?; + } + "sigma_obs" => { + found.sigma_obs = true; + out.sigma_obs = val?.try_into()?; + } + _ => {} + }, + } + } + + if !(found.k && found.m && found.sigma_obs && found.delta && found.beta && found.trend) { + return Err(Error::MissingOptimizedParameters(found)); + } + + // Sort the vector params by their indices. + // We need to subtract 1 from the indices because the params + // returned by Stan are 1-indexed. + out.delta = delta_indices + .into_iter() + .map(|i| out.delta[i - 1]) + .collect(); + out.beta = beta_indices.into_iter().map(|i| out.beta[i - 1]).collect(); + out.trend = trend_indices + .into_iter() + .map(|i| out.trend[i - 1]) + .collect(); + Ok(out) + } + + fn command(&self, data_path: &Path, init_path: &Path, output_path: &Path) -> Command { + let mut command = self.installation.command(); + command.arg("random"); + command.arg(format!( + "seed={}", + self.opts.seed.unwrap_or_else(|| { + let mut rng = thread_rng(); + (&mut rng).gen_range(1..99999) + }) + )); + command.arg("data"); + command.args([ + format!("file={}", data_path.display()), + format!("init={}", init_path.display()), + ]); + command.arg("output"); + command.arg(format!("file={}", output_path.display())); + command.arg("method=optimize"); + command.arg(format!( + "algorithm={}", + self.opts + .algorithm + .unwrap_or(crate::Algorithm::Lbfgs) + .to_string() + )); + command + } +} + +/// Optimizer that calls out to a compiled Stan model using `cmdstan`. +/// +/// See the module level documentation for more information on how to use this. +#[derive(Debug, Clone)] +pub struct CmdstanOptimizer { + prophet_installation: ProphetInstallation, +} + +impl CmdstanOptimizer { + /// Create a new [`CmdstanOptimizer`] using the Prophet model compiled at build-time. + /// + /// This works by embedding the built Prophet model binary into the executable and + /// writing it to a temporary file at runtime when this method is first called. + /// + /// This is only available if the `compile-cmdstan` feature is enabled. + /// + /// # Panics + /// + /// This function will panic if the temporary file could not be created, or if the + /// Prophet model binary could not be written to the temporary file. + #[cfg(feature = "compile-cmdstan")] + pub fn new() -> Self { + static PROPHET_INSTALLATION: std::sync::LazyLock = + std::sync::LazyLock::new(|| { + static PROPHET_BINARY: &[u8] = include_bytes!("../build/prophet"); + static LIBTBB_SO: &[u8] = include_bytes!("../build/libtbb.so.12"); + + let dir = tempfile::tempdir().expect("could not create temporary directory"); + let lib_dir = dir.path().join("lib"); + std::fs::create_dir_all(&lib_dir).expect("could not create lib directory"); + let prophet_binary_path = dir.path().join("prophet"); + let libtbb_path = lib_dir.join("libtbb.so.12"); + + // Write the Prophet model binary to the temporary directory. + std::fs::write(&prophet_binary_path, PROPHET_BINARY) + .expect("could not write prophet model to temporary file"); + // Write the libtbb.so file to the temporary directory. + std::fs::write(&libtbb_path, LIBTBB_SO) + .expect("could not write libtbb to temporary file"); + + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions( + &prophet_binary_path, + std::fs::Permissions::from_mode(0o755), + ) + .expect("could not set permissions on Prophet binary"); + } + + ProphetInstallation { + _dir: Some(dir), + prophet_binary_path, + lib_dir: Some(lib_dir), + } + }); + + Self { + prophet_installation: PROPHET_INSTALLATION.clone(), + } + } + + /// Create a new [`CmdstanOptimizer`] using the Prophet model found the provided path. + pub fn with_custom_prophet(prophet_path: impl Into) -> Result { + let prophet_installation = ProphetInstallation { + #[cfg(feature = "compile-cmdstan")] + _dir: None, + lib_dir: None, + prophet_binary_path: prophet_path.into(), + }; + // Test that the command can be executed. + let mut command = prophet_installation.command(); + command.arg("help"); + let output = command.output().map_err(|source| Error::ProphetIOError { + command: format!("{:?}", command), + source, + })?; + if !output.status.success() { + return Err(Error::ProphetExeError { + command: format!("{:?}", command), + code: output.status.code(), + stdout: String::from_utf8_lossy(&output.stdout).to_string(), + stderr: String::from_utf8_lossy(&output.stderr).to_string(), + }); + } + Ok(Self { + prophet_installation, + }) + } +} + +impl Optimizer for CmdstanOptimizer { + fn optimize( + &self, + init: &optimizer::InitialParams, + data: &optimizer::Data, + opts: &optimizer::OptimizeOpts, + ) -> Result { + OptimizeCommand { + installation: &self.prophet_installation, + init: &init, + data: &data, + opts: &opts, + } + .run() + .map_err(optimizer::Error::custom) + } +} diff --git a/crates/augurs-prophet/src/lib.rs b/crates/augurs-prophet/src/lib.rs index aaf6be2a..5fc9323a 100644 --- a/crates/augurs-prophet/src/lib.rs +++ b/crates/augurs-prophet/src/lib.rs @@ -17,6 +17,9 @@ mod prophet; mod testdata; mod util; +#[cfg(feature = "cmdstan")] +pub mod cmdstan; + /// A timestamp represented as seconds since the epoch. pub type TimestampSeconds = i64; diff --git a/crates/augurs-prophet/src/optimizer.rs b/crates/augurs-prophet/src/optimizer.rs index 316c128f..fba98d91 100644 --- a/crates/augurs-prophet/src/optimizer.rs +++ b/crates/augurs-prophet/src/optimizer.rs @@ -21,10 +21,13 @@ // WASM Components? // TODO: write a pure Rust optimizer for the default case. +use std::fmt; + use crate::positive_float::PositiveFloat; /// The initial parameters for the optimization. #[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct InitialParams { /// Base trend growth rate. pub k: f64, @@ -49,8 +52,40 @@ pub enum TrendIndicator { Flat, } +#[cfg(feature = "serde")] +impl serde::Serialize for TrendIndicator { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_u8(match self { + Self::Linear => 0, + Self::Logistic => 1, + Self::Flat => 2, + }) + } +} + +#[cfg(feature = "serde")] +impl<'de> serde::Deserialize<'de> for TrendIndicator { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + D::Error: serde::de::Error, + { + let value = u8::deserialize(deserializer)?; + match value { + 0 => Ok(Self::Linear), + 1 => Ok(Self::Logistic), + 2 => Ok(Self::Flat), + _ => Err(serde::de::Error::custom("invalid trend indicator")), + } + } +} + /// Data for the Prophet model. #[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[allow(non_snake_case)] pub struct Data { /// Number of time periods. @@ -94,6 +129,17 @@ pub enum Algorithm { Lbfgs, } +impl fmt::Display for Algorithm { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Self::Lbfgs => "lbfgs", + Self::Newton => "newton", + Self::Bfgs => "bfgs", + }; + f.write_str(s) + } +} + /// Arguments for optimization. #[derive(Default, Debug, Clone)] pub struct OptimizeOpts { diff --git a/crates/augurs-prophet/src/positive_float.rs b/crates/augurs-prophet/src/positive_float.rs index f7805cd2..fb65f19f 100644 --- a/crates/augurs-prophet/src/positive_float.rs +++ b/crates/augurs-prophet/src/positive_float.rs @@ -2,6 +2,7 @@ #[repr(transparent)] #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] #[cfg_attr(feature = "bytemuck", derive(bytemuck::Pod, bytemuck::Zeroable))] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct PositiveFloat(f64); /// An invalid float was provided when trying to create a [`PositiveFloat`]. diff --git a/crates/augurs/Cargo.toml b/crates/augurs/Cargo.toml index c6d3c107..fecc8ccb 100644 --- a/crates/augurs/Cargo.toml +++ b/crates/augurs/Cargo.toml @@ -35,5 +35,7 @@ full = ["changepoint", "clustering", "dtw", "ets", "forecaster", "mstl", "outlie mstl = ["augurs-mstl", "augurs-ets?/mstl"] parallel = ["augurs-dtw?/parallel", "augurs-outlier?/parallel"] prophet = ["augurs-prophet"] +prophet-cmdstan = ["augurs-prophet/cmdstan"] +prophet-compile-cmdstan = ["augurs-prophet/compile-cmdstan"] outlier = ["augurs-outlier"] seasons = ["augurs-seasons"] diff --git a/examples/forecasting/Cargo.toml b/examples/forecasting/Cargo.toml index 4cedebb8..8cefdab8 100644 --- a/examples/forecasting/Cargo.toml +++ b/examples/forecasting/Cargo.toml @@ -11,4 +11,4 @@ keywords.workspace = true publish = false [dependencies] -augurs = { workspace = true, features = ["ets", "mstl", "forecaster", "seasons"] } +augurs = { workspace = true, features = ["ets", "mstl", "forecaster", "prophet", "prophet-cmdstan", "prophet-compile-cmdstan", "seasons"] } diff --git a/examples/forecasting/examples/prophet_cmdstan.rs b/examples/forecasting/examples/prophet_cmdstan.rs new file mode 100644 index 00000000..a3f20b9d --- /dev/null +++ b/examples/forecasting/examples/prophet_cmdstan.rs @@ -0,0 +1,20 @@ +use augurs::prophet::{cmdstan::CmdstanOptimizer, Prophet, TrainingData}; + +fn main() -> Result<(), Box> { + let ds = vec![ + 1704067200, 1704871384, 1705675569, 1706479753, 1707283938, 1708088123, 1708892307, + 1709696492, 1710500676, 1711304861, 1712109046, 1712913230, 1713717415, + ]; + let y = vec![ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, + ]; + let data = TrainingData::new(ds, y.clone())?; + let mut prophet = Prophet::new(Default::default(), CmdstanOptimizer::new()); + prophet.fit(data, Default::default())?; + let predictions = prophet.predict(None)?; + assert_eq!(predictions.yhat.point.len(), y.len()); + assert!(predictions.yhat.lower.is_some()); + assert!(predictions.yhat.upper.is_some()); + println!("Predicted values: {:#?}", predictions.yhat); + Ok(()) +} From 528856cfc0700ff1a571584ed5915d8fffbc8ad9 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 10 Oct 2024 10:28:59 +0100 Subject: [PATCH 02/10] Add download bin, rework features to be less annoying --- .github/workflows/rust.yml | 4 + .gitignore | 1 + crates/augurs-prophet/.gitignore | 1 + crates/augurs-prophet/Cargo.toml | 13 ++ crates/augurs-prophet/README.md | 8 + crates/augurs-prophet/build.rs | 18 ++- crates/augurs-prophet/src/bin/main.rs | 144 ++++++++++++++++++ crates/augurs-prophet/src/cmdstan.rs | 59 ++++--- examples/forecasting/Cargo.toml | 2 +- .../forecasting/examples/prophet_cmdstan.rs | 19 ++- justfile | 4 + 11 files changed, 249 insertions(+), 24 deletions(-) create mode 100644 crates/augurs-prophet/src/bin/main.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 83dd2bb3..4afa58a1 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -34,6 +34,10 @@ jobs: with: bins: cargo-nextest,just + - name: Download Prophet Stan model + # Download the Prophet Stan model since an example requires it. + run: just download-prophet-stan-model + - name: Run cargo nextest run: just test - name: Run doc tests diff --git a/.gitignore b/.gitignore index 54eda469..3438770c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ /Cargo.lock .bacon-locations .vscode +prophet_stan_model diff --git a/crates/augurs-prophet/.gitignore b/crates/augurs-prophet/.gitignore index 796b96d1..0d059786 100644 --- a/crates/augurs-prophet/.gitignore +++ b/crates/augurs-prophet/.gitignore @@ -1 +1,2 @@ /build +/prophet_stan_model diff --git a/crates/augurs-prophet/Cargo.toml b/crates/augurs-prophet/Cargo.toml index 4fa6c614..0ee13384 100644 --- a/crates/augurs-prophet/Cargo.toml +++ b/crates/augurs-prophet/Cargo.toml @@ -19,6 +19,8 @@ serde = { workspace = true, optional = true, features = ["derive"] } serde_json = { workspace = true, optional = true } tempfile = { version = "3.13.0", optional = true } thiserror.workspace = true +ureq = { version = "2.10.1", optional = true } +zip = { version = "2.2.0", optional = true } [dev-dependencies] augurs-testing.workspace = true @@ -32,4 +34,15 @@ tempfile = { version = "3.13.0", optional = true } bytemuck = ["dep:bytemuck"] cmdstan = ["dep:tempfile", "dep:serde_json", "serde"] compile-cmdstan = ["cmdstan", "dep:tempfile"] +download = ["dep:ureq", "dep:zip"] +# Ignore cmdstan compilation in the build script. +# This should only be used for developing the library, not by +# end users, or you may end up with a broken build where the +# Prophet model isn't available to be compiled into the binary. +internal-ignore-cmdstan-failure = [] serde = ["dep:serde"] + +[[bin]] +name = "download-stan-model" +path = "src/bin/main.rs" +required-features = ["download"] diff --git a/crates/augurs-prophet/README.md b/crates/augurs-prophet/README.md index 94721347..ee81d486 100644 --- a/crates/augurs-prophet/README.md +++ b/crates/augurs-prophet/README.md @@ -31,6 +31,14 @@ This works fine if you're operating in a desktop or server environment, but poses issues when running in more esoteric environments such as WebAssembly. +The `cmdstan` module of this crate contains an implementation of `Optimizer` +which will use a compiled Stan program to do this. See the `cmdstan` module +for more details on how to use it. + +This requires the `cmdstan` feature to be enabled, and optionally the +`compile-cmdstan` feature to be enabled if you want to compile and embed +the Stan model at build time. + ### `libstan` We could choose to write a `libstan` crate which uses [`cxx`][cxx] to diff --git a/crates/augurs-prophet/build.rs b/crates/augurs-prophet/build.rs index ec5e03c3..21e9b07a 100644 --- a/crates/augurs-prophet/build.rs +++ b/crates/augurs-prophet/build.rs @@ -13,14 +13,14 @@ fn compile_model() -> Result<(), Box> { println!("cargo::rerun-if-env-changed=CMDSTAN_PATH"); let stan_path: PathBuf = std::env::var("STAN_PATH") - .expect("STAN_PATH not set") + .map_err(|_| "STAN_PATH not set")? .parse() - .expect("invalid STAN_PATH"); + .map_err(|_| "invalid STAN_PATH")?; let cmdstan_bin_path = stan_path.join("bin/cmdstan"); let model_stan = include_bytes!("./prophet.stan"); let build_dir = PathBuf::from("./build"); - fs::create_dir_all(&build_dir).expect("could not create build directory"); + fs::create_dir_all(&build_dir).map_err(|_| "could not create build directory")?; // Write the Prophet Stan file to a named file in a temporary directory. let tmp_dir = TempDir::new()?; @@ -64,7 +64,17 @@ fn compile_model() -> Result<(), Box> { } fn main() -> Result<(), Box> { + let _result = Ok::<(), &'static str>(()); #[cfg(all(feature = "cmdstan", feature = "compile-cmdstan"))] - compile_model()?; + let _result = compile_model(); + // This is a complete hack but lets us get away with still using + // the `--all-features` flag of Cargo without everything failing + // if there isn't a Stan installation. + // Basically, if we're + // hard fail, we just want to skip the feature. + // This will cause things to fail at runtime if there isn't a Stan + // installation, but that's okay. + #[cfg(not(feature = "internal-ignore-cmdstan-failure"))] + _result?; Ok(()) } diff --git a/crates/augurs-prophet/src/bin/main.rs b/crates/augurs-prophet/src/bin/main.rs new file mode 100644 index 00000000..391ae531 --- /dev/null +++ b/crates/augurs-prophet/src/bin/main.rs @@ -0,0 +1,144 @@ +//! CLI providing convenience methods for downloading the compiled Prophet +//! model from the PyPI wheel to a local directory. +//! +//! This makes it easier to use `augurs-prophet` with the `cmdstan` feature +//! enabled. +//! +//! When run, it will: +//! +//! - determine the wheel to download based on the local computer's architecture and +//! OS +//! - download the wheel to the `prophet_stan_model` directory +//! - extract the `prophet_model.bin` to that directory +//! - extract the `libtbb-dc01d64d.so.2` to the `prophet_stan_model/lib` subdirectory +//! - remove the wheel. +//! +//! This will fail if there is no wheel for the current architecture and OS, or +//! if anything goes wrong during the download, extraction, or removal. +//! +//! For now, the version of Prophet is hardcoded to 1.1.6 for simplicity. +//! It's very unlikely that the Stan model will ever change, so this should be +//! fine. + +use std::{ + env::consts, + fs::{self, File}, + io::{self, BufReader, Read}, + path::{Path, PathBuf}, +}; + +use anyhow::{bail, Context, Result}; +use zip::read::ZipFile; + +const PROPHET_MODEL_FILENAME: &str = "prophet_model.bin"; +const LIBTBB_SO_FILENAME: &str = "libtbb-dc01d64d.so.2"; + +fn wheel_url() -> Result<&'static str> { + Ok(match (consts::ARCH, consts::OS) { + ("x86_64", "linux") => "https://files.pythonhosted.org/packages/1f/47/f7d10a904756830efd8522700e582822ff44a15f839b464044ee4c53ee36/prophet-1.1.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + ("aarch64", "linux") => "https://files.pythonhosted.org/packages/a1/c5/c6dd58b132653af3139c87e92b484bad79264492a62d70fc5beda837a933/prophet-1.1.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", + ("x86_64", "macos") => "https://files.pythonhosted.org/packages/41/46/75309abde08c10f9be78bcfca581be430b5d8303d847de8d88190f4d5c21/prophet-1.1.6-py3-none-macosx_10_11_x86_64.whl", + ("aarch64", "macos") => "https://files.pythonhosted.org/packages/15/9a/a8d35652e869011a3bae9e0888f4c62157bf9067c9be15535602c73039dd/prophet-1.1.6-py3-none-macosx_11_0_arm64.whl", + ("x86_64", "windows") => "https://files.pythonhosted.org/packages/12/ff/a04156f4ca3d18bd005c73f79e86e0684346fbc2aea856429c3e49f2828e/prophet-1.1.6-py3-none-win_amd64.whl", + _ => bail!("Unsupported platform. Prophet only builds wheels for x86_64 linux, aarch64 linux, x86_64 macos, and x86_64 windows."), + }) +} + +fn download(url: &str, dest: &Path) -> Result<()> { + let resp = ureq::get(url) + .set("User-Agent", "augurs-prophet") + .call() + .with_context(|| format!("error fetching url {url}"))?; + let len: u64 = resp + .header("Content-Length") + .context("missing content-length header")? + .parse() + .context("invalid content-length header")?; + + let mut file = File::create(dest) + .with_context(|| format!("couldn't create file at {}", dest.display()))?; + + let mut body = resp.into_reader().take(len); + io::copy(&mut body, &mut file).context("error writing wheel to file")?; + Ok(()) +} + +fn unzip(dest: &Path) -> Result<()> { + let file = File::open(dest).context("open file")?; + let reader = BufReader::new(file); + let mut zip = zip::ZipArchive::new(reader).context("open wheel file")?; + + let (mut found_prophet, mut found_libtbb) = (false, false); + for i in 0..zip.len() { + let file = zip.by_index(i).expect("valid index"); + match file.enclosed_name() { + Some(x) + if !found_prophet + && x.file_name().and_then(|x| x.to_str()) == Some(PROPHET_MODEL_FILENAME) => + { + let dest = dest.parent().expect("dest has parent since it's a file"); + write(dest, file)?; + found_prophet = true + } + Some(x) + if !found_libtbb + && file.is_file() + && x.file_name() + .and_then(|x| x.to_str()) + .map_or(false, |x| x.contains(LIBTBB_SO_FILENAME)) => + { + let mut dest = dest + .parent() + .expect("dest has parent since it's a file") + .to_path_buf(); + dest.push("lib"); + fs::create_dir_all(&dest).context("create lib dir")?; + write(&dest, file)?; + found_libtbb = true; + } + _ => continue, + } + if found_prophet && found_libtbb { + return Ok(()); + } + } + anyhow::bail!("could not find one or more of prophet_model.bin or libtbb.so"); +} + +fn write(dest: &Path, file: ZipFile<'_>) -> Result<()> { + let file_path = file.enclosed_name().context("invalid file name in wheel")?; + let name = file_path.file_name().context("file has no name?")?; + let path = dest.join(name); + eprintln!("Writing zipped {} to {}", file.name(), path.display()); + let mut out = File::create(&path).context(format!("create {}", path.display()))?; + let mut reader = BufReader::new(file); + io::copy(&mut reader, &mut out).context("copy bytes")?; + + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(&path, std::fs::Permissions::from_mode(0o755)) + .context("could not set permissions on Prophet binary")?; + } + + Ok(()) +} + +fn remove(dest: &Path) -> Result<()> { + fs::remove_file(dest).context("remove file")?; + Ok(()) +} + +fn main() -> Result<()> { + let url = wheel_url()?; + + let mut dest = PathBuf::from("prophet_stan_model"); + fs::create_dir_all(&dest).context("create prophet_stan_model dir")?; + dest.push(url.rsplit_once('/').unwrap().1); + eprintln!("Downloading {} to {}", url, dest.display()); + + download(url, &dest)?; + unzip(&dest)?; + remove(&dest)?; + Ok(()) +} diff --git a/crates/augurs-prophet/src/cmdstan.rs b/crates/augurs-prophet/src/cmdstan.rs index d95aa153..f81425bf 100644 --- a/crates/augurs-prophet/src/cmdstan.rs +++ b/crates/augurs-prophet/src/cmdstan.rs @@ -16,10 +16,22 @@ //! This will embed the Prophet model binary and the libtbb dynamic library into //! the final executable, which will increase the size of the final executable by about //! 2MB, but the final binary won't require any additional dependencies. -//! 2. Use [`CmdstanOptimizer::with_custom_prophet`] to create a new `CmdstanOptimizer` using +//! 2. Use [`CmdstanOptimizer::with_prophet_path`] to create a new `CmdstanOptimizer` using //! a precompiled Prophet Stan model. This model can be obtained by either manually building //! the Prophet model (which still requires a working Stan installation) or extracting it from //! the Prophet Python package for your target platform. +//! The `download-stan-model` binary of this crate can be used to do the latter easily. +//! It will download the precompiled model for the current architecture and OS, and +//! extract it to the `prophet_stan_model` directory. This won't work if there is no +//! wheel for the current architecture and OS, though. +//! +//! For example: +//! +//! ```sh +//! $ cargo install --bin download-stan-model augurs-prophet +//! $ download-stan-model +//! $ ls prophet_stan_model +//! ``` //! //! # Gotchas //! @@ -46,6 +58,9 @@ use crate::{optimizer, Optimizer, PositiveFloat, TryFromFloatError}; /// Errors that can occur when trying to run optimization using cmdstan. #[derive(Debug, thiserror::Error)] pub enum Error { + /// The provided path to the Prophet binary has no parent directory. + #[error("Prophet path {0} has no parent")] + ProphetPathHasNoParent(PathBuf), /// An I/O error occurred when trying to run the Prophet model executable. #[error("Error running Prophet at {command}: {source}")] ProphetIOError { @@ -167,7 +182,7 @@ pub struct OptimizedParamsFound { struct ProphetInstallation { #[cfg(feature = "compile-cmdstan")] _dir: Option, - lib_dir: Option, + lib_dir: PathBuf, prophet_binary_path: PathBuf, } @@ -187,9 +202,7 @@ impl Clone for ProphetInstallation { impl ProphetInstallation { fn command(&self) -> Command { let mut cmd = Command::new(&self.prophet_binary_path); - if let Some(lib_dir) = &self.lib_dir { - cmd.env("LD_LIBRARY_PATH", lib_dir); - } + cmd.env("LD_LIBRARY_PATH", &self.lib_dir); cmd } } @@ -323,7 +336,7 @@ impl<'a> OptimizeCommand<'a> { "seed={}", self.opts.seed.unwrap_or_else(|| { let mut rng = thread_rng(); - (&mut rng).gen_range(1..99999) + rng.gen_range(1..99999) }) )); command.arg("data"); @@ -336,10 +349,7 @@ impl<'a> OptimizeCommand<'a> { command.arg("method=optimize"); command.arg(format!( "algorithm={}", - self.opts - .algorithm - .unwrap_or(crate::Algorithm::Lbfgs) - .to_string() + self.opts.algorithm.unwrap_or(crate::Algorithm::Lbfgs) )); command } @@ -361,12 +371,17 @@ impl CmdstanOptimizer { /// /// This is only available if the `compile-cmdstan` feature is enabled. /// + /// It will fail at compile-time if the Prophet model wasn't built by the build + /// script. Generally this shouldn't ever happen (since the build script will fail), + /// but there is always a chance that the built file is deleted in between + /// the build script running and compilation! + /// /// # Panics /// /// This function will panic if the temporary file could not be created, or if the /// Prophet model binary could not be written to the temporary file. #[cfg(feature = "compile-cmdstan")] - pub fn new() -> Self { + pub fn new_embedded() -> Self { static PROPHET_INSTALLATION: std::sync::LazyLock = std::sync::LazyLock::new(|| { static PROPHET_BINARY: &[u8] = include_bytes!("../build/prophet"); @@ -398,7 +413,7 @@ impl CmdstanOptimizer { ProphetInstallation { _dir: Some(dir), prophet_binary_path, - lib_dir: Some(lib_dir), + lib_dir, } }); @@ -408,12 +423,20 @@ impl CmdstanOptimizer { } /// Create a new [`CmdstanOptimizer`] using the Prophet model found the provided path. - pub fn with_custom_prophet(prophet_path: impl Into) -> Result { + pub fn with_prophet_path(prophet_path: impl Into) -> Result { + let prophet_binary_path = prophet_path.into(); + // Assume that the libtbb library is at the `lib` subdirectory of + // the Prophet binary, as arranged by the `download-stan-model` + // convenience script. + let lib_dir = prophet_binary_path + .parent() + .ok_or_else(|| Error::ProphetPathHasNoParent(prophet_binary_path.to_path_buf()))? + .join("lib"); let prophet_installation = ProphetInstallation { #[cfg(feature = "compile-cmdstan")] _dir: None, - lib_dir: None, - prophet_binary_path: prophet_path.into(), + lib_dir, + prophet_binary_path, }; // Test that the command can be executed. let mut command = prophet_installation.command(); @@ -445,9 +468,9 @@ impl Optimizer for CmdstanOptimizer { ) -> Result { OptimizeCommand { installation: &self.prophet_installation, - init: &init, - data: &data, - opts: &opts, + init, + data, + opts, } .run() .map_err(optimizer::Error::custom) diff --git a/examples/forecasting/Cargo.toml b/examples/forecasting/Cargo.toml index 8cefdab8..ecb8c0f1 100644 --- a/examples/forecasting/Cargo.toml +++ b/examples/forecasting/Cargo.toml @@ -11,4 +11,4 @@ keywords.workspace = true publish = false [dependencies] -augurs = { workspace = true, features = ["ets", "mstl", "forecaster", "prophet", "prophet-cmdstan", "prophet-compile-cmdstan", "seasons"] } +augurs = { workspace = true, features = ["ets", "mstl", "forecaster", "prophet", "prophet-cmdstan", "seasons"] } diff --git a/examples/forecasting/examples/prophet_cmdstan.rs b/examples/forecasting/examples/prophet_cmdstan.rs index a3f20b9d..d3789d1e 100644 --- a/examples/forecasting/examples/prophet_cmdstan.rs +++ b/examples/forecasting/examples/prophet_cmdstan.rs @@ -1,3 +1,14 @@ +//! Example of using the Prophet model with the cmdstan optimizer. +//! +//! To run this example, you must first download the Prophet Stan model +//! and libtbb shared library into the `prophet_stan_model` directory. +//! The easiest way to do this is to run the `download-stan-model` +//! binary in the `augurs-prophet` crate: +//! +//! ```sh +//! $ cargo run --features download --bin download-stan-model +//! $ cargo run --example prophet_cmdstan +//! ``` use augurs::prophet::{cmdstan::CmdstanOptimizer, Prophet, TrainingData}; fn main() -> Result<(), Box> { @@ -9,7 +20,13 @@ fn main() -> Result<(), Box> { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, ]; let data = TrainingData::new(ds, y.clone())?; - let mut prophet = Prophet::new(Default::default(), CmdstanOptimizer::new()); + + let cmdstan = CmdstanOptimizer::with_prophet_path("prophet_stan_model/prophet_model.bin")?; + // If you were using the embedded version of the cmdstan model, you'd use this: + // let cmdstan = CmdstanOptimizer::new_embedded(); + + let mut prophet = Prophet::new(Default::default(), cmdstan); + prophet.fit(data, Default::default())?; let predictions = prophet.predict(None)?; assert_eq!(predictions.yhat.point.len(), y.len()); diff --git a/justfile b/justfile index 73cf9349..18fe2e67 100644 --- a/justfile +++ b/justfile @@ -20,3 +20,7 @@ doc: watch: bacon + +# Download the Prophet Stan model. +download-prophet-stan-model: + cargo run --features download --bin download-stan-model From 564f478124d52d34a51a70feba3d2a86211997d4 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 10 Oct 2024 10:36:36 +0100 Subject: [PATCH 03/10] Create empty files if compile step fails in build script --- crates/augurs-prophet/build.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/crates/augurs-prophet/build.rs b/crates/augurs-prophet/build.rs index 21e9b07a..01149458 100644 --- a/crates/augurs-prophet/build.rs +++ b/crates/augurs-prophet/build.rs @@ -70,10 +70,17 @@ fn main() -> Result<(), Box> { // This is a complete hack but lets us get away with still using // the `--all-features` flag of Cargo without everything failing // if there isn't a Stan installation. - // Basically, if we're - // hard fail, we just want to skip the feature. + // Basically, if have this feature enabled, skip any failures in + // the build process and just create some empty files. // This will cause things to fail at runtime if there isn't a Stan - // installation, but that's okay. + // installation, but that's okay because no-one should ever use this + // feature. + #[cfg(feature = "internal-ignore-cmdstan-failure")] + if _result.is_err() { + std::fs::create_dir_all("./build")?; + std::fs::File::create("./build/prophet")?; + std::fs::File::create("./build/libtbb.so.12")?; + } #[cfg(not(feature = "internal-ignore-cmdstan-failure"))] _result?; Ok(()) From df699b62357733378bcca6b0451fac0659659771 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 10 Oct 2024 10:59:24 +0100 Subject: [PATCH 04/10] Run examples and benchmarks (except iai ones) in CI --- .github/workflows/rust.yml | 2 +- justfile | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4afa58a1..cc6ee77c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -39,7 +39,7 @@ jobs: run: just download-prophet-stan-model - name: Run cargo nextest - run: just test + run: just test-all - name: Run doc tests run: just doctest diff --git a/justfile b/justfile index 18fe2e67..30970131 100644 --- a/justfile +++ b/justfile @@ -8,7 +8,23 @@ publish-npm: wasm-pack publish --access public test: - cargo nextest run --all-features --workspace + cargo nextest run \ + --all-features \ + --workspace \ + --exclude augurs-js \ + --exclude pyaugurs + +# Run all unit and integration tests, plus examples and benchmarks, +# except for those which require `iai` (which isn't available on +# all platforms). +test-all: + cargo nextest run \ + --all-features \ + --all-targets \ + --workspace \ + --exclude augurs-js \ + --exclude pyaugurs \ + -E 'not binary(/iai/)' doctest: # Ignore augurs-js and pyaugurs since they either won't compile with all features enabled From a80e9bc2a7819b4d5be8bdd272db9ce02ce1ea57 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 10 Oct 2024 11:35:51 +0100 Subject: [PATCH 05/10] Handle lib dir more robustly and document better --- crates/augurs-prophet/src/cmdstan.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/crates/augurs-prophet/src/cmdstan.rs b/crates/augurs-prophet/src/cmdstan.rs index f81425bf..2cd03582 100644 --- a/crates/augurs-prophet/src/cmdstan.rs +++ b/crates/augurs-prophet/src/cmdstan.rs @@ -40,10 +40,13 @@ //! - the Prophet model binary is not statically linked to `libtbb`, since doing so is //! not recommended by the Stan developers or TBB developers. This means that the //! final binary will require the `libtbb` dynamic library to be present at runtime. -//! The first method takes care of this for you, but if using the second you'll need -//! to make sure the `libtbb` dynamic library is available at runtime. You can do this -//! by copying the `libtbb` dynamic library to a known directory and setting the -//! `LD_LIBRARY_PATH` environment variable to that directory. +//! The first method takes care of this for you, but if manually providing the path +//! using the second you'll need to make sure the `libtbb` dynamic library is +//! available at runtime. You can do this by copying the `libtbb` dynamic library +//! to a known directory and setting the `LD_LIBRARY_PATH` environment variable +//! to that directory. The `download-stan-model` tool adds this library to the +//! `lib` subdirectory next to the Prophet binary, and the `LD_LIBRARY_PATH` +//! environment variable is set to that directory by default. use std::{ io, num::{ParseFloatError, ParseIntError}, @@ -202,7 +205,16 @@ impl Clone for ProphetInstallation { impl ProphetInstallation { fn command(&self) -> Command { let mut cmd = Command::new(&self.prophet_binary_path); - cmd.env("LD_LIBRARY_PATH", &self.lib_dir); + // Use the provided LD_LIBRARY_PATH if it's set, adding on the configured lib + // dir for good measure. + if let Ok(ld_library_path) = std::env::var("LD_LIBRARY_PATH") { + let path = format!("{}:{}", ld_library_path, self.lib_dir.display()); + cmd.env("LD_LIBRARY_PATH", path); + } + // Otherwise, just use our lib_dir. + else { + cmd.env("LD_LIBRARY_PATH", &self.lib_dir); + } cmd } } From 032b508d30cd5629cef47bae48e5ac6fd0eb3d86 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 10 Oct 2024 11:55:32 +0100 Subject: [PATCH 06/10] Add example to Prophet README --- crates/augurs-prophet/Cargo.toml | 1 + crates/augurs-prophet/README.md | 50 ++++++++++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/crates/augurs-prophet/Cargo.toml b/crates/augurs-prophet/Cargo.toml index 0ee13384..f35f0e5e 100644 --- a/crates/augurs-prophet/Cargo.toml +++ b/crates/augurs-prophet/Cargo.toml @@ -23,6 +23,7 @@ ureq = { version = "2.10.1", optional = true } zip = { version = "2.2.0", optional = true } [dev-dependencies] +augurs.workspace = true augurs-testing.workspace = true chrono.workspace = true pretty_assertions.workspace = true diff --git a/crates/augurs-prophet/README.md b/crates/augurs-prophet/README.md index ee81d486..6eb86280 100644 --- a/crates/augurs-prophet/README.md +++ b/crates/augurs-prophet/README.md @@ -3,6 +3,46 @@ `augurs-prophet` contains an implementation of the [Prophet] time series forecasting library. +## Example + +First, download the Prophet Stan model using the included binary: + +```sh +$ cargo install --bin download-stan-model --features download augurs-prophet +$ download-stan-model +``` + +```rust,no_run +use augurs::prophet::{cmdstan::CmdstanOptimizer, Prophet, TrainingData}; + +fn main() -> Result<(), Box> { + let ds = vec![ + 1704067200, 1704871384, 1705675569, 1706479753, 1707283938, 1708088123, 1708892307, + 1709696492, 1710500676, 1711304861, 1712109046, 1712913230, 1713717415, + ]; + let y = vec![ + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, + ]; + let data = TrainingData::new(ds, y.clone())?; + + let cmdstan = CmdstanOptimizer::with_prophet_path("prophet_stan_model/prophet_model.bin")?; + // If you were using the embedded version of the cmdstan model, you'd enable + // the `compile-cmdstan` feature and use this: + // + // let cmdstan = CmdstanOptimizer::new_embedded(); + + let mut prophet = Prophet::new(Default::default(), cmdstan); + + prophet.fit(data, Default::default())?; + let predictions = prophet.predict(None)?; + assert_eq!(predictions.yhat.point.len(), y.len()); + assert!(predictions.yhat.lower.is_some()); + assert!(predictions.yhat.upper.is_some()); + println!("Predicted values: {:#?}", predictions.yhat); + Ok(()) +} +``` + This crate aims to be low-dependency to enable it to run in as many places as possible. With that said, we need to talk about optimizers… @@ -15,9 +55,13 @@ inference as well as maximum likelihood estimation using optimizers such as L-BF However, it is written in C++ and has non-trivial dependencies, which makes it difficult to interface with from Rust (or, indeed, Python). -`augurs-prophet` (similar to the Python library) abstracts optimization -and sampling implementations using the `Optimizer` and `Sampler` traits. -These are yet to be implemented, but I have a few ideas: +Similar to the Python library, `augurs-prophet` abstracts MLE optimization +using the `Optimizer` and (later) MCMC using the `Sampler` traits. +There is a single implementation of the `Optimizer` which uses +`cmdstan` to run the optimization. See below and the `cmdstan` module +for details. + +Further implementations are possible, with some ideas below. ### `cmdstan` From 59b55471953e2891ef86b23de4c6827fea547176 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 10 Oct 2024 16:13:03 +0100 Subject: [PATCH 07/10] Correctly serialize X (regressors) as nested sequences --- crates/augurs-prophet/src/optimizer.rs | 139 +++++++++++++++++- .../forecasting/examples/prophet_cmdstan.rs | 23 ++- 2 files changed, 159 insertions(+), 3 deletions(-) diff --git a/crates/augurs-prophet/src/optimizer.rs b/crates/augurs-prophet/src/optimizer.rs index fba98d91..ce2d1dc3 100644 --- a/crates/augurs-prophet/src/optimizer.rs +++ b/crates/augurs-prophet/src/optimizer.rs @@ -85,7 +85,6 @@ impl<'de> serde::Deserialize<'de> for TrendIndicator { /// Data for the Prophet model. #[derive(Clone, Debug, PartialEq)] -#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[allow(non_snake_case)] pub struct Data { /// Number of time periods. @@ -110,6 +109,13 @@ pub struct Data { /// Indicator of multiplicative features, length k. pub s_m: Vec, /// Regressors, shape (n, k). + /// + /// This is stored as a `Vec` rather than a nested `Vec>` + /// because passing such a struct by reference is tricky in Rust, since + /// it can't be dereferenced to a `&[&[f64]]` (which would be ideal). + /// + /// However, when serialized to JSON, it is converted to a nested array + /// of arrays, which is what cmdstan expects. pub X: Vec, /// Scale on seasonality prior. pub sigmas: Vec, @@ -118,6 +124,50 @@ pub struct Data { pub tau: PositiveFloat, } +#[cfg(feature = "serde")] +impl serde::Serialize for Data { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + use serde::ser::{SerializeSeq, SerializeStruct}; + + /// A serializer which serializes X, a flat slice of f64s, as an sequence of sequences, + /// with each one having length equal to the second field. + struct XSerializer<'a>(&'a [f64], usize); + + impl<'a> serde::Serialize for XSerializer<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut outer = serializer.serialize_seq(Some(self.0.len() / self.1))?; + for chunk in self.0.chunks(self.1) { + outer.serialize_element(&chunk)?; + } + outer.end() + } + } + + let mut s = serializer.serialize_struct("Data", 13)?; + let x = XSerializer(&self.X, self.K as usize); + s.serialize_field("T", &self.T)?; + s.serialize_field("y", &self.y)?; + s.serialize_field("t", &self.t)?; + s.serialize_field("cap", &self.cap)?; + s.serialize_field("S", &self.S)?; + s.serialize_field("t_change", &self.t_change)?; + s.serialize_field("trend_indicator", &self.trend_indicator)?; + s.serialize_field("K", &self.K)?; + s.serialize_field("s_a", &self.s_a)?; + s.serialize_field("s_m", &self.s_m)?; + s.serialize_field("X", &x)?; + s.serialize_field("sigmas", &self.sigmas)?; + s.serialize_field("tau", &self.tau)?; + s.end() + } +} + /// The algorithm to use for optimization. One of: 'BFGS', 'LBFGS', 'Newton'. #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub enum Algorithm { @@ -303,3 +353,90 @@ pub mod mock_optimizer { } } } + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn serialize_data() { + let data = Data { + T: 3, + y: vec![1.0, 2.0, 3.0], + t: vec![0.0, 1.0, 2.0], + X: vec![1.0, 2.0, 3.0, 1.0, 2.0, 3.0], + sigmas: vec![ + 1.0.try_into().unwrap(), + 2.0.try_into().unwrap(), + 3.0.try_into().unwrap(), + ], + tau: 1.0.try_into().unwrap(), + K: 2, + s_a: vec![1, 1, 1], + s_m: vec![0, 0, 0], + cap: vec![0.0, 0.0, 0.0], + S: 2, + t_change: vec![0.0, 0.0, 0.0], + trend_indicator: TrendIndicator::Linear, + }; + let serialized = serde_json::to_string_pretty(&data).unwrap(); + pretty_assertions::assert_eq!( + serialized, + r#"{ + "T": 3, + "y": [ + 1.0, + 2.0, + 3.0 + ], + "t": [ + 0.0, + 1.0, + 2.0 + ], + "cap": [ + 0.0, + 0.0, + 0.0 + ], + "S": 2, + "t_change": [ + 0.0, + 0.0, + 0.0 + ], + "trend_indicator": 0, + "K": 2, + "s_a": [ + 1, + 1, + 1 + ], + "s_m": [ + 0, + 0, + 0 + ], + "X": [ + [ + 1.0, + 2.0 + ], + [ + 3.0, + 1.0 + ], + [ + 2.0, + 3.0 + ] + ], + "sigmas": [ + 1.0, + 2.0, + 3.0 + ], + "tau": 1.0 +}"# + ); + } +} diff --git a/examples/forecasting/examples/prophet_cmdstan.rs b/examples/forecasting/examples/prophet_cmdstan.rs index d3789d1e..ab4853fc 100644 --- a/examples/forecasting/examples/prophet_cmdstan.rs +++ b/examples/forecasting/examples/prophet_cmdstan.rs @@ -9,7 +9,9 @@ //! $ cargo run --features download --bin download-stan-model //! $ cargo run --example prophet_cmdstan //! ``` -use augurs::prophet::{cmdstan::CmdstanOptimizer, Prophet, TrainingData}; +use std::collections::HashMap; + +use augurs::prophet::{cmdstan::CmdstanOptimizer, Prophet, Regressor, TrainingData}; fn main() -> Result<(), Box> { let ds = vec![ @@ -19,13 +21,30 @@ fn main() -> Result<(), Box> { let y = vec![ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, ]; - let data = TrainingData::new(ds, y.clone())?; + let data = TrainingData::new(ds, y.clone())? + .with_regressors(HashMap::from([ + ( + "foo".to_string(), + vec![ + 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, + ], + ), + ( + "bar".to_string(), + vec![ + 4.0, 5.0, 6.0, 4.0, 5.0, 6.0, 4.0, 5.0, 6.0, 4.0, 5.0, 6.0, 4.0, + ], + ), + ])) + .unwrap(); let cmdstan = CmdstanOptimizer::with_prophet_path("prophet_stan_model/prophet_model.bin")?; // If you were using the embedded version of the cmdstan model, you'd use this: // let cmdstan = CmdstanOptimizer::new_embedded(); let mut prophet = Prophet::new(Default::default(), cmdstan); + prophet.add_regressor("foo".to_string(), Regressor::additive()); + prophet.add_regressor("bar".to_string(), Regressor::additive()); prophet.fit(data, Default::default())?; let predictions = prophet.predict(None)?; From fd0c5a3661e4161b7aac4859d331c41446bf1b32 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 10 Oct 2024 16:51:25 +0100 Subject: [PATCH 08/10] Handle division by zero in X serialization Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- crates/augurs-prophet/src/optimizer.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/crates/augurs-prophet/src/optimizer.rs b/crates/augurs-prophet/src/optimizer.rs index ce2d1dc3..fe18591d 100644 --- a/crates/augurs-prophet/src/optimizer.rs +++ b/crates/augurs-prophet/src/optimizer.rs @@ -141,8 +141,14 @@ impl serde::Serialize for Data { where S: serde::Serializer, { - let mut outer = serializer.serialize_seq(Some(self.0.len() / self.1))?; - for chunk in self.0.chunks(self.1) { + if self.1 == 0 { + return Err(serde::ser::Error::custom( + "Invalid value for K: cannot be zero", + )); + } + let chunk_size = self.1; + let mut outer = serializer.serialize_seq(Some(self.0.len() / chunk_size))?; + for chunk in self.0.chunks(chunk_size) { outer.serialize_element(&chunk)?; } outer.end() From 02c3b17c70f20219465fa68d65c00a1eae0ca167 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 10 Oct 2024 17:00:07 +0100 Subject: [PATCH 09/10] Use correct env name in rerun-if-env-changed --- crates/augurs-prophet/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/augurs-prophet/build.rs b/crates/augurs-prophet/build.rs index 01149458..65b13b93 100644 --- a/crates/augurs-prophet/build.rs +++ b/crates/augurs-prophet/build.rs @@ -10,7 +10,7 @@ fn compile_model() -> Result<(), Box> { use tempfile::TempDir; println!("cargo::rerun-if-changed=prophet.stan"); - println!("cargo::rerun-if-env-changed=CMDSTAN_PATH"); + println!("cargo::rerun-if-env-changed=STAN_PATH"); let stan_path: PathBuf = std::env::var("STAN_PATH") .map_err(|_| "STAN_PATH not set")? From 3396905e3d4ac0fa624f17ce93e9f742bacb20cd Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 10 Oct 2024 17:08:42 +0100 Subject: [PATCH 10/10] Use 'OUT_DIR' instead of 'build' --- crates/augurs-prophet/build.rs | 9 +++++---- crates/augurs-prophet/src/cmdstan.rs | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/crates/augurs-prophet/build.rs b/crates/augurs-prophet/build.rs index 65b13b93..bfa4d90b 100644 --- a/crates/augurs-prophet/build.rs +++ b/crates/augurs-prophet/build.rs @@ -19,7 +19,7 @@ fn compile_model() -> Result<(), Box> { let cmdstan_bin_path = stan_path.join("bin/cmdstan"); let model_stan = include_bytes!("./prophet.stan"); - let build_dir = PathBuf::from("./build"); + let build_dir = PathBuf::from(std::env::var("OUT_DIR")?); fs::create_dir_all(&build_dir).map_err(|_| "could not create build directory")?; // Write the Prophet Stan file to a named file in a temporary directory. @@ -77,9 +77,10 @@ fn main() -> Result<(), Box> { // feature. #[cfg(feature = "internal-ignore-cmdstan-failure")] if _result.is_err() { - std::fs::create_dir_all("./build")?; - std::fs::File::create("./build/prophet")?; - std::fs::File::create("./build/libtbb.so.12")?; + let out_dir = std::path::PathBuf::from(std::env::var("OUT_DIR")?); + std::fs::create_dir_all(&out_dir)?; + std::fs::File::create(out_dir.join("prophet"))?; + std::fs::File::create(out_dir.join("libtbb.so.12"))?; } #[cfg(not(feature = "internal-ignore-cmdstan-failure"))] _result?; diff --git a/crates/augurs-prophet/src/cmdstan.rs b/crates/augurs-prophet/src/cmdstan.rs index 2cd03582..2f6d983e 100644 --- a/crates/augurs-prophet/src/cmdstan.rs +++ b/crates/augurs-prophet/src/cmdstan.rs @@ -396,8 +396,8 @@ impl CmdstanOptimizer { pub fn new_embedded() -> Self { static PROPHET_INSTALLATION: std::sync::LazyLock = std::sync::LazyLock::new(|| { - static PROPHET_BINARY: &[u8] = include_bytes!("../build/prophet"); - static LIBTBB_SO: &[u8] = include_bytes!("../build/libtbb.so.12"); + static PROPHET_BINARY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/prophet")); + static LIBTBB_SO: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libtbb.so.12")); let dir = tempfile::tempdir().expect("could not create temporary directory"); let lib_dir = dir.path().join("lib");