Skip to content

Commit

Permalink
exploring with large datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
cboettig committed Feb 15, 2018
1 parent 49a7c7c commit 55a7c1e
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 13 deletions.
28 changes: 21 additions & 7 deletions inst/examples/as_rdf.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,33 @@
as_rdf <- function(df, base_uri = NULL) UseMethod("as_rdf")
## S3 generic: dispatches on the class of `df` to the matching as_rdf method.
## `key` and `base_uri` both default to NULL.
as_rdf <- function(df, key = NULL, base_uri = NULL) {
  UseMethod("as_rdf")
}



## tidy data to rdf
as_rdf.data.frame <- function(df, base_uri = NULL){
as_rdf.data.frame <- function(df, key = NULL, base_uri = NULL){

x <- tibble::rowid_to_column(df, "subject")
x <- df
if(is.null(key)){
x <- tibble::rowid_to_column(x, "subject")
} else {
names(x)[names(x) == key] <- "subject"
}
suppressWarnings(
x <- tidyr::gather(x, key = predicate, value = object, -subject)
)

## gather looses col-classes
## gather loses col-classes, so pre-compute them (with base R)
col_classes <- data.frame(datatype =
vapply(df, rdflib:::xs_class, character(1)))
col_classes <- tibble::rownames_to_column(col_classes, "predicate")
x <- dplyr::inner_join(x, col_classes, "predicate")
vapply(df, rdflib:::xs_class, character(1)),
stringsAsFactors = FALSE)
col_classes$predicate <- rownames(col_classes)
rownames(col_classes) <- NULL


x <- merge(x, col_classes, by = "predicate")

## NA to blank string
x$object[is.na(x$object)] <- ""
x$subject[is.na(x$subject)] <- ""

rdf <- rdf()
for(i in seq_along(x$subject)){
Expand Down
43 changes: 37 additions & 6 deletions vignettes/rdf_intro.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -350,14 +350,45 @@ _Still working on writing this section_

```{r}
source(system.file("examples/as_rdf.R", package="rdflib"))
## Testing: Digest some data.frames into RDF and extract back
library(tidyverse)
cars <- mtcars %>% rownames_to_column("Model")
x1 <- as_rdf(iris, "iris:")
x2 <- as_rdf(cars, "mtcars:")
rdf <- c(x1,x2)
```

Note: looping over `rdf_add` can be a slow way to add hundreds of thousands of triples. Coercing into RDF via JSON-LD might be much faster.

```{r}
library(nycflights13)
library(tidyverse)
## Attach airline and plane details to every flight, then keep only the
## carrier, manufacturer and model columns.
## Both join keys are spelled out so the joins do not depend on whichever
## column names the tables happen to share (and emit no "Joining, by" message).
df <- flights %>%
  left_join(airlines, by = "carrier") %>%
  left_join(planes, by = "tailnum") %>%
  select(carrier, manufacturer, model)
```


```{r}
## Convert each lookup table to RDF. The column named by `key` becomes the
## subject of every triple; "x:" is passed as the base URI.
x1 <- as_rdf(airlines, key = "carrier", base_uri = "x:")
x2 <- as_rdf(airports, key = "faa", base_uri = "x:")
x3 <- as_rdf(planes, key = "tailnum", base_uri = "x:")

## flights is converted with key = NULL (row ids become the subjects);
## the call is wrapped in system.time() to report how long it takes.
system.time({
  x4 <- as_rdf(flights, key = NULL, base_uri = "x:")
})

## Combine the three lookup-table graphs; x4 is not included here.
rdf <- c(x1, x2, x3)
```


```{r}
## SPARQL query: for every subject ?s that carries all three predicates
## <x:carrier>, <x:manufacturer> and <x:model>, return the three values.
sparql <-
'SELECT ?carrier ?manufacturer ?model
WHERE {
?s <x:carrier> ?carrier .
?s <x:manufacturer> ?manufacturer .
?s <x:model> ?model .
}'
## Run the query against `rdf` and keep the result.
iris2 <- rdf_query(rdf, sparql)
```

## SPARQL: Getting back to Tidy Tables!

Expand Down

0 comments on commit 55a7c1e

Please sign in to comment.