exploring with large datasets

ropensci · Feb 15, 2018 · 55a7c1e · 55a7c1e
1 parent 49a7c7c
commit 55a7c1e
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 13 deletions.
diff --git a/inst/examples/as_rdf.R b/inst/examples/as_rdf.R
@@ -1,19 +1,33 @@
-as_rdf <- function(df, base_uri = NULL) UseMethod("as_rdf")
+as_rdf <- function(df, key = NULL, base_uri = NULL) UseMethod("as_rdf")
+
 
 
 ## tidy data to rdf
-as_rdf.data.frame <- function(df, base_uri = NULL){
+as_rdf.data.frame <- function(df, key = NULL, base_uri = NULL){
 
-  x <- tibble::rowid_to_column(df, "subject")
+  x <- df
+  if(is.null(key)){
+    x <- tibble::rowid_to_column(x, "subject")
+  } else {
+    names(x)[names(x) == key] <- "subject"
+  }
   suppressWarnings(
     x <- tidyr::gather(x, key = predicate, value = object, -subject)
   )
 
-  ## gather looses col-classes
+  ## gather loses col-classes, so pre-compute them (with base R)
   col_classes <- data.frame(datatype = 
-                              vapply(df, rdflib:::xs_class, character(1)))
-  col_classes <- tibble::rownames_to_column(col_classes, "predicate")
-  x <- dplyr::inner_join(x, col_classes, "predicate")
+                              vapply(df, rdflib:::xs_class, character(1)),
+                            stringsAsFactors = FALSE)
+  col_classes$predicate <- rownames(col_classes)
+  rownames(col_classes) <- NULL
+
+
+  x <- merge(x, col_classes, by = "predicate")
+
+  ## NA to blank string
+  x$object[is.na(x$object)] <- ""
+  x$subject[is.na(x$subject)] <- ""
 
   rdf <- rdf()
   for(i in seq_along(x$subject)){

diff --git a/vignettes/rdf_intro.Rmd b/vignettes/rdf_intro.Rmd
@@ -350,14 +350,45 @@ _Still working on writing this section_
 
 ```{r}
 source(system.file("examples/as_rdf.R", package="rdflib"))
-## Testing: Digest some data.frames into RDF and extract back
- library(tidyverse)
- cars <- mtcars %>% rownames_to_column("Model")
- x1 <- as_rdf(iris, "iris:")
- x2 <- as_rdf(cars, "mtcars:")
- rdf <- c(x1,x2)
 ```
 
+Note: looping over `rdf_add` can be a slow way to add hundreds of thousands of triples.  Coercing into RDF via JSON-LD might be much faster.
+
+```{r}
+library(nycflights13)
+library(tidyverse)
+
+df <- flights %>% 
+  left_join(airlines) %>%
+  left_join(planes, by="tailnum") %>% 
+  select(carrier, manufacturer, model)
+  
+```
+
+
+```{r}
+x1 <- as_rdf(airlines, "carrier", "x:")
+x2 <- as_rdf(airports,  "faa", "x:")
+x3 <- as_rdf(planes, "tailnum", "x:")
+system.time(
+x4 <- as_rdf(flights, NULL, "x:")
+)
+
+rdf <- c(x1,x2,x3)
+```
+
+
+```{r}
+sparql <-
+  'SELECT  ?carrier ?manufacturer ?model 
+WHERE {
+ ?s <x:carrier>  ?carrier .
+ ?s <x:manufacturer>  ?manufacturer .
+ ?s <x:model>  ?model . 
+}'
+
+iris2 <- rdf_query(rdf, sparql)
+```
 
 ## SPARQL: Getting back to Tidy Tables!