dataverse_metrics.Rmd

---
title: "Dataverse Metrics Report"
date: "Generated `r Sys.time()`"
output:
  pdf_document: 
    includes:
      in_header: header.tex
  html_document: default
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE, warning=FALSE}
# Chunk defaults for the whole report: hide the code, warnings and messages.
knitr::opts_chunk$set(
  echo = FALSE,
  warning = FALSE,
  message = FALSE
)
# Print NA cells as blanks in kable tables. options() returns the previous
# values, which are kept in `opts`.
opts <- options(knitr.kable.NA = "")
library(httr)
library(readr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggfortify)
source("R/deposits.R")
source("R/mdc.R")

### Configurable variables

# Custom start and end dates: use YYYY-MM-DD format
# These are specified from the workflow through the `period_begin` and
# `period_end` environment variables.
# If these aren't provided, we use year-to-date: the beginning of the current
# year as the start date and today's date as the end date.
period_begin_raw <- Sys.getenv("period_begin")
period_end_raw <- Sys.getenv("period_end")

# Sys.getenv() returns "" for an unset variable. Map that to NA directly so the
# fallback below does not depend on how as.Date() treats an empty string.
period_begin <- if (nzchar(period_begin_raw)) {
  as.Date(period_begin_raw)
} else {
  as.Date(NA)
}
period_end <- if (nzchar(period_end_raw)) {
  as.Date(period_end_raw)
} else {
  as.Date(NA)
}

# Calculate dates: We use YTD if none are given
if (is.na(period_begin)) {
  period_begin <- as.Date(format(Sys.Date(), "%Y-01-01"))
}
if (is.na(period_end)) {
  period_end <- Sys.Date()
}

# Set tokens and file location settings
dataverse_host <- Sys.getenv("DATAVERSE_SERVER")
dataverse_key <- Sys.getenv("DATAVERSE_TOKEN")

# Base URL of the metrics API, and the prefix that turns a persistent
# identifier into a link to its dataset page.
dataverse <- paste0("http://", dataverse_host, "/api/info/metrics")
doi_link <- paste0("http://", dataverse_host, "/dataset.xhtml?persistentId=")

# Dataverse data. Cached locally as rds files
# This way, we don't need to re-scrape for each format (HTML/PDF)

# Request one monthly metrics endpoint for the UCS-Data collection and bind the
# `data` entries of the response into a single data frame.
# Stops on an HTTP error status so that an error response is never parsed and
# written to the cache as if it were data.
metrics_monthly <- function(endpoint) {
  url <- paste0(dataverse, "/", endpoint, "/monthly?parentAlias=UCS-Data")
  response <- GET(url)
  stop_for_status(response)
  bind_rows(content(response)$data)
}

# Both files are read unconditionally below, so the cache only counts as usable
# when both exist; a partial cache triggers a fresh scrape instead of an error.
if (file.exists("deposits.rds") && file.exists("projects.rds")) {
  deposits <- readRDS("deposits.rds")
  projects <- readRDS("projects.rds")
  # Only one of these two files is written by a scrape (see the branch below).
  if (file.exists("downloads.rds")) {
    downloads <- readRDS("downloads.rds")
  }
  if (file.exists("mdc.rds")) {
    mdc <- readRDS("mdc.rds")
  }
} else { # actually scrape
  deposits <- deposits_get(dataverse_host)
  saveRDS(deposits, "deposits.rds")

  projects <- metrics_monthly("uniquedownloads")
  saveRDS(projects, "projects.rds")

  if (mdc_available(dataverse_host)) {
    mdc <- mdc_get(dataverse_host)
    saveRDS(mdc, "mdc.rds")
  } else {
    # No MDC statistics: combine the per-month sum of the per-project unique
    # downloads (`unique`) with the monthly downloads endpoint.
    # The join uses the columns both tables share (presumably just `date`;
    # verify against the API response).
    downloads <- metrics_monthly("downloads")
    downloads <- projects %>%
      group_by(date) %>%
      summarize(unique = sum(count)) %>%
      full_join(downloads)
    saveRDS(downloads, "downloads.rds")
  }
}


```

## Parameters

- Dataverse instance: `r dataverse_host`
- Reporting period: `r period_begin` to `r period_end`.

```{r total_projects}
# Number of rows in `deposits`, reported as the current number of projects.
dataverse_total <- nrow(deposits)

# Count the dates that fall within the reporting period.
# Both boundary dates belong to the period, so a deposit dated exactly on
# `period_begin` (1 January with the year-to-date default) or on `period_end`
# is counted. Missing dates are not counted.
count_in_period <- function(dates) {
  dates <- as.Date(dates)
  sum(dates >= period_begin & dates <= period_end, na.rm = TRUE)
}

# all projects created within specified time frame
deposits_new <- count_in_period(deposits$createdAt)

# all *published* projects within the specified timeframe
published_new <- count_in_period(deposits$published_at)
```

- There are currently `r dataverse_total` projects. 
- `r deposits_new` new deposits were initiated between `r period_begin` and `r period_end`.
- `r published_new` projects were published between `r period_begin` and `r period_end`.

```{r subject}
# all projects by status
deposits %>%
  count(versionState) %>%
  knitr::kable(caption = "Total deposits by publication status",
               col.names = c("Status", "Deposits"))

# TRUE for dates inside the reporting period. Both boundary dates are included
# so that deposits dated exactly on `period_begin` or `period_end` are not
# dropped; missing dates are never inside the period.
in_period <- function(dates) {
  dates <- as.Date(dates)
  !is.na(dates) & dates >= period_begin & dates <= period_end
}

# Count the given deposits per subject. `subjects` is unnested first, so a
# deposit listing several subjects is counted once under each of them.
# Returns one row per subject with the count in the column `count_name`.
count_by_subject <- function(rows, count_name) {
  rows %>%
    select("subjects") %>%
    unnest_longer(subjects) %>%
    count(subjects, name = count_name)
}

# Row selectors. %in% never yields NA, so the any() tests below cannot hand
# an NA condition to if() when `versionState` has missing values.
is_published <- deposits$versionState %in% "Published"
is_unpublished <- deposits$versionState %in% "Unpublished"
published_in_period <- is_published & in_period(deposits$published_at)
unpublished_in_period <- is_unpublished & in_period(deposits$createdAt)

# all published projects by subject
deposits_subj <- count_by_subject(
  deposits[is_published, , drop = FALSE], "published"
)

# published projects by subject within time frame
deposits_subj <- count_by_subject(
  deposits[published_in_period, , drop = FALSE], "period"
) %>%
  full_join(deposits_subj, by = "subjects")

# Any unpublished projects?
if (any(is_unpublished)) {
  deposits_subj <- count_by_subject(
    deposits[is_unpublished, , drop = FALSE], "unpublished"
  ) %>%
    full_join(deposits_subj, by = "subjects")
} else {
  deposits_subj$unpublished <- 0
}

# Any unpublished in the time period specified?
if (any(unpublished_in_period)) {
  deposits_subj <- count_by_subject(
    deposits[unpublished_in_period, , drop = FALSE], "unpublished_period"
  ) %>%
    full_join(deposits_subj, by = "subjects")
} else {
  deposits_subj$unpublished_period <- 0
}

# exclude unpublished columns if missing dataverse key
published_label <- paste("Published between", period_begin, "and", period_end)
if (dataverse_key == "") {
  include_columns <- c("subjects", "period", "published")
  column_names <- c("Subject", published_label, "Total published")
} else {
  include_columns <- c("subjects", "unpublished_period",
                       "unpublished", "period", "published")
  column_names <- c(
    "Subject",
    paste("Unpublished between", period_begin, "and", period_end),
    "Total draft",
    published_label,
    "Total published"
  )
}

# make the table
deposits_subj[, include_columns] %>%
  knitr::kable(caption = "Deposits by subject",
               col.names = column_names)

```

```{r top}
# Markdown link for each persistent identifier, labelled with the project name.
# match() looks every pid up individually, so the names come back in the same
# order as `pid` (a logical `%in%` mask would return them in `deposits` order
# and pair titles with the wrong links). A pid that is absent from `deposits`
# is labelled with the identifier itself.
project_link <- function(pid) {
  title <- deposits$name[match(pid, deposits$global_id)]
  title <- ifelse(is.na(title), pid, title)
  paste0("[", title, "](", doi_link, pid, ")")
}

# Sum the download counts per project and keep the five largest totals,
# replacing the identifier with its markdown link.
top_projects <- function(monthly) {
  monthly %>%
    group_by(pid) %>%
    summarize(downloads = sum(count)) %>%
    arrange(desc(downloads)) %>%
    head(5) %>%
    mutate(pid = project_link(pid))
}

projects %>%
  top_projects() %>%
  knitr::kable(caption = "Top 5 all time downloaded projects",
               row.names = FALSE, col.names = c("Project", "Downloads"))

# The month strings get "-01" appended, so each month is keyed by its first
# day. The bounds are inclusive: with a strict comparison, a period starting on
# the first of a month (the year-to-date default starts on 1 January) would
# drop that whole month.
projects$date <- as.Date(paste0(projects$date, "-01"))
projects %>%
  filter(date >= period_begin & date <= period_end) %>%
  top_projects() %>%
  knitr::kable(caption = paste("Top 5 downloaded projects between ",
                               period_begin, "and", period_end),
               row.names = FALSE, col.names = c("Project", "Downloads"))
```

## Aggregate data

Dataverse reports statistics as "total" and "unique". 

- "Unique" views/downloads correspond to the number of unique sessions in which a data project has been viewed/downloaded. This roughly translates to: "X people have viewed/downloaded this project or its files."
- "Total" views/downloads correspond to the number of times a project has been viewed/downloaded, including, e.g., multiple counts for multiple views/downloads in the same session or by the same user.

```{r mdc}
# Not every dataverse instance provides MDC data (QDR does). Use the MDC table
# when one was loaded, otherwise the regular downloads statistics.
monthly_stats <- if (exists("mdc")) mdc else downloads

# The first row's date, split on "-", gives the start of the monthly series;
# every column except the first becomes one series.
series_start <- strsplit(monthly_stats$date[1], split = "-")[[1]]
stats_ts <- ts(monthly_stats[, -1], frequency = 12, start = series_start)

# All series share one panel; the y axis uses thousands separators.
autoplot(stats_ts, facets = FALSE) +
  labs(title = "Statistics", color = "Statistic") +
  scale_y_continuous(labels = scales::comma)

```