From 7c21c6bf3fcf6c68964b41cff397a49a600143e2 Mon Sep 17 00:00:00 2001 From: Carl Boettiger Date: Fri, 2 Feb 2018 16:05:44 -0800 Subject: [PATCH] support for BDB backend for disk-based storage, closes #6 Also extends tests, exports rdf_free, minor tweaks to documentation. Attempt to install bdb on travis for testing as well. --- .travis.yml | 1 + NAMESPACE | 8 +++ NEWS.md | 16 ++++- R/rdf.R | 105 ++++++++++++++++----------- R/utilities.R | 54 ++++++++++++++ inst/examples/storage_types.R | 38 ++++++++++ inst/extdata/ex2.xml | 18 +++++ man/rdf.Rd | 17 +++-- man/rdf_free.Rd | 27 +++++++ tests/testthat/test-rdf.R | 131 ++++++++++++++++++++++++++++------ 10 files changed, 343 insertions(+), 72 deletions(-) create mode 100644 inst/examples/storage_types.R create mode 100644 inst/extdata/ex2.xml create mode 100644 man/rdf_free.Rd diff --git a/.travis.yml b/.travis.yml index 854205b..1d8f310 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,5 +19,6 @@ addons: - librdf0-dev - libv8-dev - libjq-dev + - libdb-dev after_success: - Rscript -e 'covr::codecov()' diff --git a/NAMESPACE b/NAMESPACE index e13f544..2ac9324 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ S3method(format,rdf) S3method(print,rdf) export(rdf) export(rdf_add) +export(rdf_free) export(rdf_parse) export(rdf_query) export(rdf_serialize) @@ -20,11 +21,18 @@ importFrom(jsonld,jsonld_expand) importFrom(jsonld,jsonld_to_rdf) importFrom(methods,as) importFrom(methods,new) +importFrom(utils,capture.output) importFrom(utils,download.file) importMethodsFrom(redland,addStatement) importMethodsFrom(redland,executeQuery) +importMethodsFrom(redland,freeModel) +importMethodsFrom(redland,freeParser) importMethodsFrom(redland,freeQuery) importMethodsFrom(redland,freeQueryResults) +importMethodsFrom(redland,freeSerializer) +importMethodsFrom(redland,freeStatement) +importMethodsFrom(redland,freeStorage) +importMethodsFrom(redland,freeWorld) importMethodsFrom(redland,getNextResult) 
importMethodsFrom(redland,parseFileIntoModel) importMethodsFrom(redland,serializeToFile) diff --git a/NEWS.md b/NEWS.md index 942d46b..870aa31 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,17 @@ -# rdflib 0.0.4 - +# rdflib 0.1.0 + +* add `c()` method to concatenate `rdf` objects +* `rdf_query` now coerces data into the appropriate type + if it recognizes the data URI and can match that + to an R type (a few XMLSchema types are recognized, + otherwise still defaults to character string) +* All methods free memory from any temporary objects they initialize + (e.g. parsers, serializers, query, statement) +* rdf includes explicit pointer to storage object +* rdf constructor supports BDB backend for disk-based triplestore [#6](https://github.com/cboettig/rdflib/issues/6) +* tests free rdf objects +* extend unit tests to cover some of the new functionality +* Add `rdf_free` to free rdf (ideally would be done by GC in redland...) # rdflib 0.0.3 (2018-01-02) diff --git a/R/rdf.R b/R/rdf.R index f6f3905..d80efc1 100644 --- a/R/rdf.R +++ b/R/rdf.R @@ -1,31 +1,78 @@ #' Initialize an `rdf` Object #' +#' @param path directory in which the local database storing RDF triples should be created. +#' Default NULL will store triples in memory and should be best for most use cases. +#' For large databases, give a path on disk. Requires redland package to be +#' built with support for the Berkeley DB (libdb-dev on Ubuntu, berkeley-db on homebrew). +#' #' @return an rdf object #' @details an rdf Object is a list of class 'rdf', consisting of -#' two pointers to external C objects managed by the redland library. -#' These are the `World` object, basically a top-level pointer for -#' all RDF models, and a `Model` object, essentially a storage structure -#' for all RDF triples. `rdflib` defaults to an in-memory hash-based +#' three pointers to external C objects managed by the redland library.
+#' These are the `world` object, basically a top-level pointer for +#' all RDF models; a `model` object, a collection of RDF statements; +#' and a `storage` object, indicating how these statements are stored. +#' `rdflib` defaults to an in-memory hash-based #' storage structure at this time. The primary purpose of the `rdf` #' object is to abstract these low-level details away from the user. #' Typical use will be simply to initialize a container to which #' the user would manually add triples using \code{\link{rdf_add}}. #' +#' +#' @importClassesFrom redland World Model Storage +#' @importMethodsFrom redland freeWorld freeModel freeStorage +#' @importFrom utils capture.output #' @export #' #' @examples #' x <- rdf() #' -rdf <- function(){ +rdf <- function(path = NULL){ world <- new("World") - storage <- new("Storage", world, "hashes", name = "", - options = "hash-type='memory'") + + ## Handle storage type + if(is.character(path)){ + if(has_bdb()){ + ## Store in Berkeley DB + options <- paste0("new='yes',hash-type='bdb',dir='", path, "'") + } else { + warning("BDB driver not found. Falling back on in-memory storage") + options <- "hash-type='memory'" + } + } else { ## Store in memory + options <- "hash-type='memory'" + } + storage <- new("Storage", world, "hashes", name = "rdflib", + options = options) + + model <- new("Model", world = world, storage, options = "") - structure(list(world = world, model = model), + structure(list(world = world, model = model, storage = storage), class = "rdf") } +#' Free Memory Associated with RDF object +#' +#' @param rdf an rdf object +#' @details Free all pointers associated with an rdf object. +#' Frees memory associated with the storage, world, and model +#' objects. After this a user should remove the rdf object +#' from the environment as well with `rm`, since attempting +#' to reference an object after it has been freed can crash +#' R!
+#' @export +#' @examples +#' rdf <- rdf() +#' rdf_free(rdf) +#' rm(rdf) +rdf_free <- function(rdf){ + redland::freeModel(rdf$model) + redland::freeStorage(rdf$storage) + redland::freeWorld(rdf$world) +} + + + #' @export format.rdf <- function(x, format = getOption("rdf_print_format", "nquads"), @@ -56,7 +103,7 @@ print.rdf <- function(x, ...){ #' @return an rdf object, containing the redland world #' and model objects #' @importClassesFrom redland World Storage Model Parser -#' @importMethodsFrom redland parseFileIntoModel +#' @importMethodsFrom redland parseFileIntoModel freeParser #' @importFrom jsonld jsonld_to_rdf #' @export #' @@ -92,7 +139,8 @@ rdf_parse <- function(doc, mimetype <- unname(rdf_mimetypes[format]) parser <- new("Parser", rdf$world, name = format, mimeType = mimetype) redland::parseFileIntoModel(parser, rdf$world, doc, rdf$model) - + redland::freeParser(parser) + rdf } @@ -129,7 +177,7 @@ add_base_uri <- function(doc, tmp = tempfile()){ #' \code{\link{rdf_parse}}. #' @importFrom methods new #' @importClassesFrom redland Serializer -#' @importMethodsFrom redland setNameSpace serializeToFile +#' @importMethodsFrom redland setNameSpace serializeToFile freeSerializer #' #' @export #' @examples @@ -189,6 +237,7 @@ rdf_serialize <- function(rdf, } } + redland::freeSerializer(serializer) invisible(doc) } @@ -250,7 +299,7 @@ rdf_query <- function(rdf, query, ...){ #' to the model object in C code, note that the input object is modified #' directly. 
#' @importClassesFrom redland Statement -#' @importMethodsFrom redland addStatement +#' @importMethodsFrom redland addStatement freeStatement #' @export #' #' @examples @@ -271,8 +320,9 @@ rdf_add <- function(rdf, subject, predicate, object, stmt <- new("Statement", world = rdf$world, subject, predicate, as.character(object), subjectType, objectType, datatype_uri) - addStatement(rdf$model, stmt) + redland::addStatement(rdf$model, stmt) + redland::freeStatement(stmt) ## rdf object is a list of pointers, modified in pass-by-reference invisible(rdf) } @@ -288,36 +338,7 @@ c.rdf <- function(...){ rdf_parse(txt, "nquads") } -# Must match parser name & q 1.0 mimetype listed at: -# http://librdf.org/raptor/api/raptor-formats-types-by-parser.html -# 3 turtle options listed but only text/turtle works. -rdf_mimetypes <- c("nquads" = "text/x-nquads", - "ntriples" = "application/n-triples", - "rdfxml" = "application/rdf+xml", - "trig" = "application/x-trig", - "turtle" = "text/turtle") - -# trig not working right now, not clear why -# Consider adding/testing: -# - n3 (text/n3) -# - rdfa (application/xhtml+xml, or text/html) -# - rss (application/rss+xml or text/rss) - -# rdf functions like working with local files -# this helper function allows us to also use URLs or strings -#' @importFrom utils download.file -text_or_url_to_doc <- function(x, tmp = tempfile()){ - if(file.exists(x)){ - return(x) - } else if(grepl("^https?://", x)) { - utils::download.file(x, tmp) - return(tmp) - } else { - writeLines(x, tmp) - return(tmp) - } -} #' rdflib: Tools to Manipulate and Query Semantic Data #' diff --git a/R/utilities.R b/R/utilities.R index b6a1f4b..e1262a2 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -35,3 +35,57 @@ rectangularize_query_results <- function(out){ names(X) <- vars as.data.frame(X, stringsAsFactors=FALSE) } + + + + + +has_bdb <- function(){ + ## Unfortunately convoluted way to check if we have Berkeley DB Support + world <- new("World") + path <-tempdir() + 
options <- paste0("new='yes',hash-type='bdb',dir='", path, "'") + storage <- new("Storage", world, "hashes", name = "rdflib", + options = options) + + out <- !(utils::capture.output( + base::print.default( + storage@librdf_storage@ref)) == + "") + + redland::freeStorage(storage) + redland::freeWorld(world) + + out +} + +# Must match parser name & q 1.0 mimetype listed at: +# http://librdf.org/raptor/api/raptor-formats-types-by-parser.html +# 3 turtle options listed but only text/turtle works. +rdf_mimetypes <- c("nquads" = "text/x-nquads", + "ntriples" = "application/n-triples", + "rdfxml" = "application/rdf+xml", + "trig" = "application/x-trig", + "turtle" = "text/turtle") + +# trig not working right now, not clear why +# Consider adding/testing: +# - n3 (text/n3) +# - rdfa (application/xhtml+xml, or text/html) +# - rss (application/rss+xml or text/rss) + + +# rdf functions like working with local files +# this helper function allows us to also use URLs or strings +#' @importFrom utils download.file +text_or_url_to_doc <- function(x, tmp = tempfile()){ + if(file.exists(x)){ + return(x) + } else if(grepl("^https?://", x)) { + utils::download.file(x, tmp, quiet = TRUE) + return(tmp) + } else { + writeLines(x, tmp) + return(tmp) + } +} diff --git a/inst/examples/storage_types.R b/inst/examples/storage_types.R new file mode 100644 index 0000000..360bc5b --- /dev/null +++ b/inst/examples/storage_types.R @@ -0,0 +1,38 @@ + +library(redland) +world <- new("World") + +## No error but null pointer returned +bdb_storage <- new("Storage", world, "hashes", name = "db1", + options = "new='yes',hash-type='bdb',dir='.'") +model <- new("Model", world = world, storage = bdb_storage, options = "") + +## error: sqlite not found +sqlite_storage <- new("Storage", world, "sqlite", name = "sqlite1", options = "new='yes'") +## not found +postgres_storage <- new("Storage", world, "postgresql", name = "postgres1", + options = 
"new='yes',host='localhost',database='red',user='foo','password='bar'") + +## Works, in memory, serializes to an rdf/xml file called thing.rdf when freed. +## Not indexed, so will be slow. Suitable for small models. +file_storage <- new("Storage", world, "file", "thing.rdf", "") +storage <- file_storage +model <- new("Model", world = world, storage = storage, options = "") + +## Works, fast write, not indexed, good for only small models, +## no reason to use this instead of hash-based memory (which is indexed) +memory_storage <- new("Storage", world, "memory", "", "") +storage <- memory_storage +model <- new("Model", world = world, storage = storage, options = "") + + +library(rdflib) + +rdf <- structure(list(world = world, model = model, storage = storage), + class = "rdf") + +rdf_add(rdf, + subject="http://www.dajobe.org/", + predicate="http://purl.org/dc/elements/1.1/language", + object="en") +rdf diff --git a/inst/extdata/ex2.xml b/inst/extdata/ex2.xml new file mode 100644 index 0000000..2a997d4 --- /dev/null +++ b/inst/extdata/ex2.xml @@ -0,0 +1,18 @@ + + + + + + + Jane Doe + + + + + + Professor + + + (425) 123-4567 + + diff --git a/man/rdf.Rd b/man/rdf.Rd index cd9b9bc..86a3db6 100644 --- a/man/rdf.Rd +++ b/man/rdf.Rd @@ -4,7 +4,13 @@ \alias{rdf} \title{Initialize an `rdf` Object} \usage{ -rdf() +rdf(path = NULL) +} +\arguments{ +\item{path}{where should local database to store RDF triples be created. +Default NULL will store triples in memory and should be best for most use cases. +Large databases should give a path on disk. Requires redland package to be +built with support for the Berkeley DB (libdb-dev on Ubuntu, berkeley-db on homebrew).} } \value{ an rdf object @@ -14,10 +20,11 @@ Initialize an `rdf` Object } \details{ an rdf Object is a list of class 'rdf', consisting of -two pointers to external C objects managed by the redland library. 
-These are the `World` object, basically a top-level pointer for -all RDF models, and a `Model` object, essentially a storage structure -for all RDF triples. `rdflib` defaults to an in-memory hash-based +three pointers to external C objects managed by the redland library. +These are the `world` object, basically a top-level pointer for +all RDF models; a `model` object, a collection of RDF statements; +and a `storage` object, indicating how these statements are stored. +`rdflib` defaults to an in-memory hash-based storage structure at this time. The primary purpose of the `rdf` object is to abstract these low-level details away from the user. Typical use will be simply to initialize a container to which diff --git a/man/rdf_free.Rd b/man/rdf_free.Rd new file mode 100644 index 0000000..d4de7fb --- /dev/null +++ b/man/rdf_free.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rdf.R +\name{rdf_free} +\alias{rdf_free} +\title{Free Memory Associated with RDF object} +\usage{ +rdf_free(rdf) +} +\arguments{ +\item{rdf}{an rdf object} +} +\description{ +Free Memory Associated with RDF object +} +\details{ +Free all pointers associated with an rdf object. +Frees memory associated with the storage, world, and model +objects. After this a user should remove the rdf object +from the environment as well with `rm`, since attempting +to reference an object after it has been freed can crash +R!
+} +\examples{ +rdf <- rdf() +rdf_free(rdf) +rm(rdf) +} diff --git a/tests/testthat/test-rdf.R b/tests/testthat/test-rdf.R index cc92e86..cde4463 100644 --- a/tests/testthat/test-rdf.R +++ b/tests/testthat/test-rdf.R @@ -3,14 +3,54 @@ testthat::context("test-rdf.R") doc <- system.file("extdata/example.rdf", package="redland") out <- "testing.rdf" + +testthat::test_that("We can initialize and free rdf objects", { + rdf <- rdf() + + testthat::expect_is(rdf, "rdf") + testthat::expect_is(rdf$world, "World") + testthat::expect_is(rdf$model, "Model") + testthat::expect_is(rdf$storage, "Storage") + + rdf_free(rdf) +}) + +testthat::test_that("We warn if we cannot use disk-based storage", { + testthat::skip_if(has_bdb()) + path <- tempdir() + testthat::expect_warning(rdf <- rdf(path), "BDB driver not found") + + ## Falls back on memory-based storage, still creates rdf + testthat::expect_is(rdf, "rdf") + rdf_free(rdf) + +}) + + + +testthat::test_that("We can use BDB storage", { + + testthat::skip_if_not(has_bdb()) + path <- tempdir() + testthat::expect_silent(rdf <- rdf(path)) + testthat::expect_is(rdf, "rdf") + + rdf_free(rdf) + +}) + + testthat::test_that("we can parse (in rdfxml) and serialize (in nquads) a simple rdf graph", { rdf <- rdf_parse(doc) rdf_serialize(rdf, out, "nquads") roundtrip <- rdf_parse(out, "nquads") testthat::expect_is(roundtrip, "rdf") + + rdf_free(rdf) }) +## FIXME check format, check return types testthat::test_that("we can make sparql queries", { sparql <- 'PREFIX dc: @@ -20,17 +60,36 @@ testthat::test_that("we can make sparql queries", { rdf <- rdf_parse(doc) match <- rdf_query(rdf, sparql) testthat::expect_length(match, 2) + + rdf_free(rdf) + }) testthat::test_that("we can initialize add triples to rdf graph", { - x <- rdf() - x <- rdf_add(x, + rdf <- rdf() + rdf <- rdf_add(rdf, subject="http://www.dajobe.org/", predicate="http://purl.org/dc/elements/1.1/language", object="en") - testthat::expect_is(x, "rdf") + testthat::expect_is(rdf, "rdf") + 
rdf_free(rdf) + +}) + + +testthat::test_that("we can concatenate rdfs", { + rdf1 <- rdf_parse(system.file("extdata/ex2.xml", package = "rdflib")) + rdf2 <- rdf_parse(doc) + rdf <- c(rdf1, rdf2) + testthat::expect_is(rdf, "rdf") + + rdf_free(rdf1) + rdf_free(rdf2) + rdf_free(rdf) }) + + testthat::test_that("we can add, parse and serialize json-ld", { #x <- rdf_parse(doc) x <- rdf() @@ -41,23 +100,23 @@ testthat::test_that("we can add, parse and serialize json-ld", { rdf_serialize(x, out, "jsonld") rdf <- rdf_parse(out, format = "jsonld") testthat::expect_is(rdf, "rdf") + rdf_free(x) + rdf_free(rdf) + }) + + + testthat::test_that("print and format work", { rdf <- rdf_parse(doc) txt <- format(rdf, format = "rdfxml") testthat::expect_output(print(rdf), ".*johnsmith.*") testthat::expect_is(txt, "character") + rdf_free(rdf) }) -testthat::test_that("we can parse from a text string", { - rdf <- rdf_parse(doc) - txt <- format(rdf, format = "rdfxml") - testthat::expect_is(txt, "character") - roundtrip <- rdf_parse(txt, format="rdfxml") - testthat::expect_is(roundtrip, "rdf") -}) testthat::test_that("we can add a namespace on serializing", { rdf <- rdf_parse(doc) @@ -67,43 +126,59 @@ testthat::test_that("we can add a namespace on serializing", { prefix = "dc") roundtrip <- rdf_parse(doc) testthat::expect_is(roundtrip, "rdf") + + rdf_free(rdf) + rdf_free(roundtrip) }) +################################################################ + +testthat::context("Test each serialization format") testthat::test_that("we can parse and serialize json-ld", { - x <- rdf_parse(doc) - rdf_serialize(x, out, "jsonld") + rdf <- rdf_parse(doc) + rdf_serialize(rdf, out, "jsonld") roundtrip <- rdf_parse(out, "jsonld") testthat::expect_is(roundtrip, "rdf") - + rdf_free(rdf) + }) testthat::test_that("we can parse and serialize nquads", { - x <- rdf_parse(doc) - rdf_serialize(x, out, "nquads") + rdf <- rdf_parse(doc) + rdf_serialize(rdf, out, "nquads") roundtrip <- rdf_parse(out, "nquads") 
testthat::expect_is(roundtrip, "rdf") + rdf_free(rdf) + }) testthat::test_that("we can parse and serialize ntriples", { - x <- rdf_parse(doc) - rdf_serialize(x, out, "ntriples") + rdf <- rdf_parse(doc) + rdf_serialize(rdf, out, "ntriples") roundtrip <- rdf_parse(out, "ntriples") testthat::expect_is(roundtrip, "rdf") + rdf_free(rdf) + }) testthat::test_that("we can parse and serialize tutle", { - x <- rdf_parse(doc) - rdf_serialize(x, out, "turtle") + rdf <- rdf_parse(doc) + rdf_serialize(rdf, out, "turtle") roundtrip <- rdf_parse(out, "turtle") testthat::expect_is(roundtrip, "rdf") + rdf_free(rdf) }) testthat::test_that("we can parse and serialize rdfxml", { - x <- rdf_parse(doc) - rdf_serialize(x, out, "rdfxml") + rdf <- rdf_parse(doc) + rdf_serialize(rdf, out, "rdfxml") roundtrip <- rdf_parse(out, "rdfxml") testthat::expect_is(roundtrip, "rdf") + rdf_free(rdf) }) +################################################################ + +testthat::context("Parsing different sources (URL, string)") testthat::test_that("we can parse from a url", { # CRAN seems okay with tests requiring an internet connection @@ -111,11 +186,18 @@ testthat::test_that("we can parse from a url", { rdf <- rdf_parse("https://tinyurl.com/ycf95c9h") testthat::expect_is(rdf, "rdf") + rdf_free(rdf) }) +testthat::test_that("we can parse from a text string", { + rdf <- rdf_parse(doc) + txt <- format(rdf, format = "rdfxml") + testthat::expect_is(txt, "character") + roundtrip <- rdf_parse(txt, format="rdfxml") + testthat::expect_is(roundtrip, "rdf") + rdf_free(rdf) -testthat::test_that("we can parse from a string", { - string <- + string <- ' _:b0 "Professor" . _:b0 "Jane Doe" . @@ -126,8 +208,11 @@ testthat::test_that("we can parse from a string", { rdf <- rdf_parse(string, "nquads") testthat::expect_is(rdf, "rdf") + rdf_free(rdf) }) + + unlink(out)