diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..aed25d6 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,3 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +cran-comments.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c833a2c --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..aff819c --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,25 @@ +Package: condusco +Type: Package +Title: Query-Driven Pipeline Execution and Query Templates +Version: 0.1.0 +Author: Roland Stevenson +Maintainer: Roland Stevenson +Description: Runs a function iteratively over each row of either a dataframe + or the results of a query. Use the 'BigQuery' and 'DBI' wrappers to + iteratively pass each row of query results to a function. If a field + contains a 'JSON' string, it will be converted to an object. This is + helpful for queries that return 'JSON' strings that represent objects. + These fields can then be treated as objects by the pipeline. 
+License: GPL-3 +URL: https://github.com/ras44/condusco +BugReports: https://github.com/ras44/condusco/issues +Encoding: UTF-8 +LazyData: true +Suggests: knitr, rmarkdown, whisker, testthat, RSQLite +VignetteBuilder: knitr +Depends: R (>= 3.3.2), + jsonlite, + assertthat, + bigrquery, + DBI +RoxygenNote: 6.0.1.9000 diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..e6354fd --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,9 @@ +# Generated by roxygen2: do not edit by hand + +export(run_pipeline) +export(run_pipeline_dbi) +export(run_pipeline_gbq) +import(DBI) +import(assertthat) +import(bigrquery) +import(jsonlite) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..6cd81c7 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,7 @@ +# News + +## 0.1.0 +condusco now contains the following functions +run_pipeline +run_pipeline_dbi +run_pipeline_gbq diff --git a/R/run_pipeline.R b/R/run_pipeline.R new file mode 100644 index 0000000..1c6408e --- /dev/null +++ b/R/run_pipeline.R @@ -0,0 +1,192 @@ +#' Runs user-provided pipeline for each row of arguments in parameters, converting any JSON +#' strings to objects +#' +#' @param pipeline User-provided function with one argument, a dataframe +#' @param parameters An dataframe of fields to convert to json +#' +#' @import assertthat jsonlite +#' +#' @examples +#' +#' library(whisker) +#' +#' run_pipeline( +#' function(params){ +#' query <- "SELECT result FROM {{table_prefix}}_results;" +#' whisker.render(query,params) +#' }, +#' data.frame( +#' table_prefix = c('batman', 'robin') +#' ) +#') +#' +#' @export +run_pipeline <- function(pipeline, parameters){ + + assert_that(length(parameters)>0) + + #For each row in parameters, convert each column to json object if it contains json + apply(parameters, 1, function(row){ + lr <- as.list(row) + for(n in names(lr)){ + tryCatch({ + lr[[n]] <- fromJSON(get(n,lr), simplifyVector=FALSE) + },error=function(e){ + lr[[n]] <- toString(get(n,lr)) + } + ) + } + + pipeline(lr) + }) + 
+} + + +#' A wrapper for running pipelines with a BigQuery invocation query +#' +#' @param pipeline User-provided function with one argument, one row of query results +#' @param query A query to execute in Google BigQuery +#' @param project The Google BigQuery project to bill +#' @param ... Additional arguments passed to query_exec() +#' +#' @import bigrquery +#' +#' @examples +#' +#'\dontrun{ +#' library(whisker) +#' +#' #Set GBQ project +#' project <- '' +#' +#' #Set the following options for GBQ authentication on a cloud instance +#' options("httr_oauth_cache" = "~/.httr-oauth") +#' options(httr_oob_default=TRUE) +#' +#' #Run the below query to authenticate and write credentials to .httr-oauth file +#' query_exec("SELECT 'foo' as bar",project=project); +#' +#' pipeline <- function(params){ +#' +#' query <- " +#' SELECT +#' {{#list}} +#' SUM(CASE WHEN author.name ='{{name}}' THEN 1 ELSE 0 END) as n_{{name_clean}}, +#' {{/list}} +#' repo_name +#' FROM `bigquery-public-data.github_repos.sample_commits` +#' GROUP BY repo_name +#' ;" +#' +#' res <- query_exec( +#' whisker.render(query,params), +#' project=project, +#' use_legacy_sql = FALSE +#' ); +#' +#' print(res) +#' } +#' +#' run_pipeline_gbq(pipeline, " +#' SELECT CONCAT('[', +#' STRING_AGG( +#' CONCAT('{\"name\":\"',name,'\",' +#' ,'\"name_clean\":\"', REGEXP_REPLACE(name, r'[^[:alpha:]]', ''),'\"}' +#' ) +#' ), +#' ']') as list +#' FROM ( +#' SELECT author.name, +#' COUNT(commit) n_commits +#' FROM `bigquery-public-data.github_repos.sample_commits` +#' GROUP BY 1 +#' ORDER BY 2 DESC +#' LIMIT 10 +#' ) +#' ", +#' project, +#' use_legacy_sql = FALSE +#' ) +#'} +#' @export +run_pipeline_gbq <- function(pipeline, query, project, ... ){ + + #run the query to generate the intitialization table + parameters <- query_exec(query, project=project, ...) 
+ + run_pipeline(pipeline, parameters) + +} + +#' A wrapper for running pipelines with a DBI connection invocation query +#' +#' @param pipeline User-provided function with one argument, one row of query results +#' @param query A query to execute via the DBI connection +#' @param con The DBI connection +#' @param ... Additional arguments passed to dbSendQuery() and dbFetch() +#' +#' @import DBI +#' +#' @examples +#' +#'\dontrun{ +#' library(whisker) +#' library(RSQLite) +#' +#' con <- dbConnect(RSQLite::SQLite(), ":memory:") +#' +#' dbWriteTable(con, "mtcars", mtcars) +#' +#' #for each cylinder count, count the number of top 5 hps it has +#' pipeline <- function(params){ +#' +#' query <- "SELECT +#' {{#list}} +#' SUM(CASE WHEN hp='{{val}}' THEN 1 ELSE 0 END )as n_hp_{{val}}, +#' {{/list}} +#' cyl +#' FROM mtcars +#' GROUP BY cyl +#' ;" +#' +#' +#' dbGetQuery( +#' con, +#' whisker.render(query,params) +#' ) +#' } +#' +#' +#' #pass the top 5 most common hps as val params +#' run_pipeline_dbi( +#' pipeline, +#' ' +#' SELECT "[" || GROUP_CONCAT("{ ""val"": """ || hp || """ }") || "]" AS list +#' FROM ( +#' SELECT +#' CAST(hp as INTEGER) as HP, +#' count(hp) as cnt +#' FROM mtcars +#' GROUP BY hp +#' ORDER BY cnt DESC +#' LIMIT 5 +#' ) +#' ', +#' con +#' ) +#' +#' +#' dbDisconnect(con) +#'} +#' @export +run_pipeline_dbi <- function(pipeline, query, con, ...){ + + rs <- dbSendQuery(con, query, ...) + parameters <- dbFetch(rs, ...) + + dbClearResult(rs, ...) + + run_pipeline(pipeline, parameters) + +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..61f9328 --- /dev/null +++ b/README.md @@ -0,0 +1,255 @@ +# condusco + +## Overview + +condusco lets you run a function iteratively, passing it the rows of a dataframe or the results of a query. + +We call the functions condusco runs pipelines, and define a pipeline as a function that accepts a list of parameters and run a series of customized commands based on the values of the parameters. 
+ +The most common use case for condusco is data pipelines. For data pipelines that primarily run SQL queries, we can template queries with a library (e.g. [whisker](https://github.com/edwindj/whisker)), so that parametrized values are separated from the query logic. We can then render the query with the appropriate values: + +``` +parameters <- source("params.R") + +#define a pipeline +pipeline <- function(parameters){ + query <- "SELECT * FROM {{dataset}}.{{table_prefix}}_results LIMIT {{limit_size}}" + query_with_params <- whisker.render(query, parameters) + run_query(query_with_params) +} + +# run the pipeline with the parameters in 'params.R' +pipeline(parameters) +``` + + +condusco provides the following extensions in functionality to the above design pattern: + - the user can provide a data-frame that contains multiple rows of parameters to be iteratively passed to the pipeline + - the user can provide a query and each row of results is iteratively passed to the pipeline + - any JSON-string parameter will be converted to an object before being passed to the pipeline + + +## Functions + +|function|description| +|:--------------|:--------------| +|run_pipeline(pipeline, parameters)| iteratively pass each row of parameters to a pipeline, converting any JSON parameters to objects| +|run_pipeline_gbq(pipeline, query, project)|calls run_pipeline with the results of query executed via bigrquery| +|run_pipeline_dbi(pipeline, query, con)|calls run_pipeline with the results of query executed via DBI| + + +## Installation + +```{r, eval = FALSE} +install.packages("condusco") +``` + +## Features + +* Name-based substitution of local parameters into pipelines, iterating through rows of parameters: + + ```{r} + run_pipeline( + #the pipeline + function(parameters){ + query <- "SELECT * FROM {{table_prefix}}_results;" + print(whisker.render(query,parameters)) + }, + #the parameters + data.frame( + table_prefix = c('batman', 'robin') + ) + ) + ``` + + + +* Name-based
substitution of query-results into pipelines, iterating through rows of parameters dataframe: + + ```{r} + con <- dbConnect(RSQLite::SQLite(), ":memory:") + + pipeline <- function(parameters){ + + query <-" + SELECT count(*) as n_hits + FROM user_hits + WHERE date(date_time) BETWEEN date('{{{date_low}}}') AND date('{{{date_high}}}') + ;" + + whisker.render(query,parameters) + + } + + run_pipeline_dbi(pipeline, + "SELECT date('now', '-5 days') as date_low, date('now') as date_high", + con + ) + + dbDisconnect(con) + ``` + + +* Dynamic query generation based on JSON strings: + + ```{r} + con <- dbConnect(RSQLite::SQLite(), ":memory:") + mtcars + dbWriteTable(con, "mtcars", mtcars) + + #for each cylinder count, count the number of top 5 hps it has + pipeline <- function(swap){ + + query <- "SELECT + {{#list}} + SUM(CASE WHEN hp='{{val}}' THEN 1 ELSE 0 END )as n_hp_{{val}}, + {{/list}} + cyl + FROM mtcars + GROUP BY cyl + ;" + + print(whisker.render(query,swap)) + + print( + dbGetQuery( + con, + whisker.render(query,swap) + ) + ) + } + + + #pass the top 5 most common hps as val parameters + run_pipeline_dbi( + pipeline, + ' + SELECT "[" || GROUP_CONCAT("{ ""val"": """ || hp || """ }") || "]" AS list + FROM ( + SELECT + CAST(hp as INTEGER) as HP, + count(hp) as cnt + FROM mtcars + GROUP BY hp + ORDER BY cnt DESC + LIMIT 5 + ) + ', + con + ) + + + dbDisconnect(con) + ``` + + + +# Google BigQuery Examples + +This is not available as a vignette because it requires user authentication + +```{r } +library(whisker) +library(bigrquery) +library(condusco) + +#Set GBQ project +project <- '' + +#Set the following options for GBQ authentication on a cloud instance +options("httr_oauth_cache" = "~/.httr-oauth") +options(httr_oob_default=TRUE) + +#Run the below query to authenticate and write credentials to .httr-oauth file +query_exec("SELECT 'foo' as bar",project=project); + +``` + + + +## Dynamically generated queries via JSON +If list is defined, convert the JSON string to an 
object and iterate through name1,name2 pairs. +This illustrates how to dynamically generate a query based on the JSON constructed by another query. +In this example, we create a trivial JSON object manually. We'll use a dynamically generated JSON object in the next example. +```{r} +pipeline <- function(params){ + + query <- "SELECT {{{value}}} as dollars_won, + {{#list}} + '{{name1}}' as {{name2}}, + {{/list}} + {{{field}}} as field + FROM {{table_name}} + LIMIT {{limit_size}} + ;" + + res <- query_exec(whisker.render(query,params), + project=project, + use_legacy_sql = FALSE + ); + + print(res) +} + +project + +run_pipeline_gbq(pipeline, " + SELECT 1000 as value, + 'word' as field, + '[{\"name1\":\"foo\", \"name2\":\"bar\"},{\"name1\":\"foo2\", \"name2\":\"bar2\"}]' as list, + 'publicdata:samples.shakespeare' AS table_name, + 5 AS limit_size +", project) + +``` + + + +## Feature Generation Query +Create features for each of the repos describing how many commits the top 10 committers made to that repo. 
+```{r} +pipeline <- function(params){ + + query <- " + SELECT + {{#list}} + SUM(CASE WHEN author.name ='{{name}}' THEN 1 ELSE 0 END) as n_{{name_clean}}, + {{/list}} + repo_name + FROM `bigquery-public-data.github_repos.sample_commits` + GROUP BY repo_name + ;" + + res <- query_exec( + whisker.render(query,params), + project=project, + use_legacy_sql = FALSE + ); + + print(res) +} + +run_pipeline_gbq(pipeline, " + SELECT CONCAT('[', + STRING_AGG( + CONCAT('{\"name\":\"',name,'\",' + ,'\"name_clean\":\"', REGEXP_REPLACE(name, r'[^[:alpha:]]', ''),'\"}' + ) + ), + ']') as list + FROM ( + SELECT author.name, + COUNT(commit) n_commits + FROM `bigquery-public-data.github_repos.sample_commits` + GROUP BY 1 + ORDER BY 2 DESC + LIMIT 10 + ) +", +project, +use_legacy_sql = FALSE +) + +``` + + diff --git a/condusco.Rproj b/condusco.Rproj new file mode 100644 index 0000000..270314b --- /dev/null +++ b/condusco.Rproj @@ -0,0 +1,21 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/cran-comments.md b/cran-comments.md new file mode 100644 index 0000000..eb23901 --- /dev/null +++ b/cran-comments.md @@ -0,0 +1,53 @@ +# Resubmission +This is a resubmission. In this version I have: + +* removed package name redundancy in description + +## Test environments +* Debian GNU/Linux 8.6 (jessie), R 3.3.2 +* win-builder (devel) + +## R CMD check results +There were no ERRORs or WARNINGs. + +## Downstream dependencies +None + + + + +# Resubmission +This is a resubmission. In this version I have: + +* writen package names and software names in single quotes (e.g. 
'JSON') + +* elaborated the description and updated title + +* added a small executable example in the Rd-files for run_pipeline + +* added examples for run_pipeline_dbi and run_pipeline_gbq + - run_pipeline_gbq example requires user authentication to 'BigQuery' + +## Test environments +* Debian GNU/Linux 8.6 (jessie), R 3.3.2 +* win-builder (devel) + +## R CMD check results +There were no ERRORs or WARNINGs. + +## Downstream dependencies +None + + + +# First submission + +## Test environments +* Debian GNU/Linux 8.6 (jessie), R 3.3.2 +* win-builder (devel) + +## R CMD check results +There were no ERRORs or WARNINGs. + +## Downstream dependencies +None diff --git a/man/run_pipeline.Rd b/man/run_pipeline.Rd new file mode 100644 index 0000000..5fe6732 --- /dev/null +++ b/man/run_pipeline.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/run_pipeline.R +\name{run_pipeline} +\alias{run_pipeline} +\title{Runs user-provided pipeline for each row of arguments in parameters, converting any JSON +strings to objects} +\usage{ +run_pipeline(pipeline, parameters) +} +\arguments{ +\item{pipeline}{User-provided function with one argument, a dataframe} + +\item{parameters}{An dataframe of fields to convert to json} +} +\description{ +Runs user-provided pipeline for each row of arguments in parameters, converting any JSON +strings to objects +} +\examples{ + +library(whisker) + +run_pipeline( + function(params){ + query <- "SELECT result FROM {{table_prefix}}_results;" + whisker.render(query,params) + }, + data.frame( + table_prefix = c('batman', 'robin') + ) +) + +} diff --git a/man/run_pipeline_dbi.Rd b/man/run_pipeline_dbi.Rd new file mode 100644 index 0000000..9538430 --- /dev/null +++ b/man/run_pipeline_dbi.Rd @@ -0,0 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/run_pipeline.R +\name{run_pipeline_dbi} +\alias{run_pipeline_dbi} +\title{A wrapper for running pipelines with a DBI 
connection invocation query} +\usage{ +run_pipeline_dbi(pipeline, query, con, ...) +} +\arguments{ +\item{pipeline}{User-provided function with one argument, one row of query results} + +\item{query}{A query to execute via the DBI connection} + +\item{con}{The DBI connection} + +\item{...}{Additional arguments passed to dbSendQuery() and dbFetch()} +} +\description{ +A wrapper for running pipelines with a DBI connection invocation query +} +\examples{ + +\dontrun{ +library(whisker) +library(RSQLite) + +con <- dbConnect(RSQLite::SQLite(), ":memory:") + +dbWriteTable(con, "mtcars", mtcars) + +#for each cylinder count, count the number of top 5 hps it has +pipeline <- function(params){ + + query <- "SELECT + {{#list}} + SUM(CASE WHEN hp='{{val}}' THEN 1 ELSE 0 END )as n_hp_{{val}}, + {{/list}} + cyl + FROM mtcars + GROUP BY cyl + ;" + + + dbGetQuery( + con, + whisker.render(query,params) + ) +} + + +#pass the top 5 most common hps as val params +run_pipeline_dbi( + pipeline, + ' + SELECT "[" || GROUP_CONCAT("{ ""val"": """ || hp || """ }") || "]" AS list + FROM ( + SELECT + CAST(hp as INTEGER) as HP, + count(hp) as cnt + FROM mtcars + GROUP BY hp + ORDER BY cnt DESC + LIMIT 5 + ) + ', + con +) + + +dbDisconnect(con) +} +} diff --git a/man/run_pipeline_gbq.Rd b/man/run_pipeline_gbq.Rd new file mode 100644 index 0000000..f3c20dc --- /dev/null +++ b/man/run_pipeline_gbq.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/run_pipeline.R +\name{run_pipeline_gbq} +\alias{run_pipeline_gbq} +\title{A wrapper for running pipelines with a BigQuery invocation query} +\usage{ +run_pipeline_gbq(pipeline, query, project, ...) 
+} +\arguments{ +\item{pipeline}{User-provided function with one argument, one row of query results} + +\item{query}{A query to execute in Google BigQuery} + +\item{project}{The Google BigQuery project to bill} + +\item{...}{Additional arguments passed to query_exec()} +} +\description{ +A wrapper for running pipelines with a BigQuery invocation query +} +\examples{ + +\dontrun{ +library(whisker) + +#Set GBQ project +project <- '' + +#Set the following options for GBQ authentication on a cloud instance +options("httr_oauth_cache" = "~/.httr-oauth") +options(httr_oob_default=TRUE) + +#Run the below query to authenticate and write credentials to .httr-oauth file +query_exec("SELECT 'foo' as bar",project=project); + +pipeline <- function(params){ + + query <- " + SELECT + {{#list}} + SUM(CASE WHEN author.name ='{{name}}' THEN 1 ELSE 0 END) as n_{{name_clean}}, + {{/list}} + repo_name + FROM `bigquery-public-data.github_repos.sample_commits` + GROUP BY repo_name + ;" + + res <- query_exec( + whisker.render(query,params), + project=project, + use_legacy_sql = FALSE + ); + + print(res) +} + +run_pipeline_gbq(pipeline, " + SELECT CONCAT('[', + STRING_AGG( + CONCAT('{\\"name\\":\\"',name,'\\",' + ,'\\"name_clean\\":\\"', REGEXP_REPLACE(name, r'[^[:alpha:]]', ''),'\\"}' + ) + ), + ']') as list + FROM ( + SELECT author.name, + COUNT(commit) n_commits + FROM `bigquery-public-data.github_repos.sample_commits` + GROUP BY 1 + ORDER BY 2 DESC + LIMIT 10 + ) +", +project, +use_legacy_sql = FALSE +) +} +} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..fcad17f --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,3 @@ +library(testthat) + +test_check("condusco") diff --git a/tests/testthat/test-data-frame-swaps.R b/tests/testthat/test-data-frame-swaps.R new file mode 100644 index 0000000..afeaf25 --- /dev/null +++ b/tests/testthat/test-data-frame-swaps.R @@ -0,0 +1,75 @@ +context("data frame swaps") +library(whisker) + +test_that(" data.frame '<' 
swaped into {{{three_escapes}}} via whisker == '<' ", { + + if (!isNamespaceLoaded("whisker")) { + stop("Package whisker needed for this function to work. Please install it.") + } + + expect_true( + run_pipeline( + #the pipeline + function(swap){ + query <- "{{{three_escapes}}}" + whisker.render(query,swap) + }, + #the swap + data.frame( + three_escapes = '<' + ) + ) + #should equal '<' + =="<" + ) + +}) + +test_that(" data.frame '<' swaped into {{two_escapes}} via whisker == '<", { + + if (!isNamespaceLoaded("whisker")) { + stop("Package whisker needed for this function to work. Please install it.") + } + + expect_true( + run_pipeline( + #the pipeline + function(swap){ + query <- "{{two_escapes}}" + whisker.render(query,swap) + }, + #the swap + data.frame( + two_escapes = '<' + ) + ) + #should equal '<' + =="<" + ) + +}) + + +test_that("single element dataframe is converted to a named list", { + + if (!isNamespaceLoaded("whisker")) { + stop("Package whisker needed for this function to work. Please install it.") + } + + expect_true( + run_pipeline( + #the pipeline + function(swap){ + query <- "{{two_escapes}}" + whisker.render(query,swap) + }, + #the swap + data.frame( + two_escapes = 1 + ) + ) + #should equal 1 + =="1" + ) + +}) diff --git a/vignettes/dbi.Rmd b/vignettes/dbi.Rmd new file mode 100644 index 0000000..2c58e75 --- /dev/null +++ b/vignettes/dbi.Rmd @@ -0,0 +1,141 @@ +--- +title: "Examples with DBI and SQLite" +author: "Roland Stevenson" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{DBI and SQLite} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r } +library(whisker) +library(DBI) +library(condusco) +``` + +## Simple Example + +Create pipelines that run dynamic queries based on results of a query. A common use case is to dynamically query a range of dates, without hard-coding the any variables. 
+```{r} +con <- dbConnect(RSQLite::SQLite(), ":memory:") + +pipeline <- function(params){ + + query <-" + SELECT count(*) as n_hits + FROM user_hits + WHERE date(date_time) BETWEEN date('{{{date_low}}}') AND date('{{{date_high}}}') + ;" + + whisker.render(query,params) + +} + +run_pipeline_dbi(pipeline, + "SELECT date('now', '-5 days') as date_low, date('now') as date_high", + con +) + +dbDisconnect(con) + +``` + + +## Dynamically generated queries via JSON +If list is defined, convert the JSON string to an object and iterate through name1,name2 pairs. +This dynamically generates a query of variable length, based on the JSON object. +In this example, we create a trivial JSON object manually. We'll use a dynamically generated JSON object in the next example. +```{r} +con <- dbConnect(RSQLite::SQLite(), ":memory:") + +pipeline <- function(params){ + + query <- "SELECT {{{value}}} as dollars_won, + {{#list}} + '{{name1}}' as {{name2}}, + {{/list}} + '{{{field}}}' as field + ;" + + whisker.render(query,params) + +} + + +run_pipeline_dbi( + pipeline, + "SELECT value, + field, + list + FROM ( + SELECT 1000 as value, + 'word' as field, + '[{\"name1\":\"foo1\", \"name2\":\"bar1\"},{\"name1\":\"foo2\", \"name2\":\"bar2\"}]' as list + ) + UNION ALL + SELECT 2000 as value, + 'word' as field, + '[{\"name1\":\"foo1\", \"name2\":\"bar1\"},{\"name1\":\"foo2\", \"name2\":\"bar2\"}]' as list + ", + con +) + + +dbDisconnect(con) + +``` + + +## Feature Generation Query +For the top 5 represented horsepowers in the mtcars dataset, create features for each of those horsepowers for each of the types of cylinders. For example, we dynamically create features like n_hp_110=4, for cyl=6. 
+```{r} +con <- dbConnect(RSQLite::SQLite(), ":memory:") + +dbWriteTable(con, "mtcars", mtcars) + +#for each cylinder count, count the number of top 5 hps it has +pipeline <- function(params){ + + query <- "SELECT + {{#list}} + SUM(CASE WHEN hp='{{val}}' THEN 1 ELSE 0 END )as n_hp_{{val}}, + {{/list}} + cyl + FROM mtcars + GROUP BY cyl + ;" + + + dbGetQuery( + con, + whisker.render(query,params) + ) +} + + +#pass the top 5 most common hps as val params +run_pipeline_dbi( + pipeline, + ' + SELECT "[" || GROUP_CONCAT("{ ""val"": """ || hp || """ }") || "]" AS list + FROM ( + SELECT + CAST(hp as INTEGER) as HP, + count(hp) as cnt + FROM mtcars + GROUP BY hp + ORDER BY cnt DESC + LIMIT 5 + ) + ', + con +) + + +dbDisconnect(con) + +``` + + + diff --git a/vignettes/no-db.Rmd b/vignettes/no-db.Rmd new file mode 100644 index 0000000..148e413 --- /dev/null +++ b/vignettes/no-db.Rmd @@ -0,0 +1,37 @@ +--- +title: "Examples with no database access" +author: "Roland Stevenson" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{No Database} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r } +library(whisker) +library(condusco) +``` + + +## Simple Example + +This example shows how to simply swap out a value using the whisker library. A common use case is two users working on the same logic, but wanting to keep their datasets separate. A namespace can be provided for each user with {{table_prefix}}, and the swap can be set in a user-specific configuration file, thereby allowing users to separate the logic of the pipelines from the user-specific configuration variables. They can commit changes to the logic so that the commits are free of their specific variables. +```{r} + +run_pipeline( + #the pipeline + function(params){ + query <- "SELECT result FROM {{table_prefix}}_results;" + whisker.render(query,params) + }, + #the swap + data.frame( + table_prefix = c('batman', 'robin') + ) +) + + +``` +