Skip to content
This repository has been archived by the owner on Oct 17, 2020. It is now read-only.

Latest commit

 

History

History
306 lines (215 loc) · 5.64 KB

README.md

File metadata and controls

306 lines (215 loc) · 5.64 KB

enigma

Build Status

An R client for Enigma.io

Enigma holds government data and provides a really nice set of APIs for data, metadata, and stats on each of the datasets. That is, you can request a dataset itself, metadata on the dataset, and summary statistics on the columns of each dataset.

enigma info

LICENSE

MIT, see LICENSE file and MIT text

Quick start

Install

install.packages("devtools")
library("devtools")
install_github("ropengov/enigma")
library("enigma")

Get data

out <- enigma_data(dataset = "us.gov.whitehouse.visitor-list", select = c("namelast",
    "visitee_namelast", "last_updatedby"))

Some metadata on the results

out$info
##    rows_limit total_results   total_pages  current_page
##            50       3577135         71543             1

Look at the data, first 6 rows for readme brevity

head(out$result)
##   namelast visitee_namelast last_updatedby
## 1   BELBAS           OFFICE             T1
## 2     BOUL            POTUS             J7
## 3   VITALE             TING             GB
## 4   VITALE             TING             GB
## 5   VITALE             TING             GB
## 6   VITALE             TING             GB

Statistics on dataset columns

out <- enigma_stats(dataset = "us.gov.whitehouse.visitor-list", select = "total_people")

Some summary stats

out$result[c("sum", "avg", "stddev", "variance", "min", "max")]
## $sum
## [1] "1028567261"
##
## $avg
## [1] "289.0040005540871372"
##
## $stddev
## [1] "520.769872911814"
##
## $variance
## [1] "271201.260532586939"
##
## $min
## [1] "0"
##
## $max
## [1] "5730"

Frequency details

head(out$result$frequency)
##   total_people  count
## 1            1 158256
## 2            2  98349
## 3            6  96922
## 4            3  79896
## 5            4  79100
## 6            5  67575

Metadata on datasets

out <- enigma_metadata(dataset = "us.gov.whitehouse")

Paths

out$info$paths
## [[1]]
## [[1]]$level
## [1] "us"
##
## [[1]]$label
## [1] "United States"
##
## [[1]]$description
## [1] "United States"
##
##
## [[2]]
## [[2]]$level
## [1] "gov"
##
## [[2]]$label
## [1] "U.S. Federal Government"
##
## [[2]]$description
## [1] "Government comprising the Legislative, Executive, and Judicial branches of the United States of America."
##
##
## [[3]]
## [[3]]$level
## [1] "whitehouse"
##
## [[3]]$label
## [1] "The White House"
##
## [[3]]$description
## [1] "Located at 1600 Pennsylvania Avenue in Washington D.C., the White House has served as the home and office for every U.S. president since John Adams."

Immediate nodes

out$info$immediate_nodes
## [[1]]
## [[1]]$datapath
## [1] "us.gov.whitehouse.salaries"
##
## [[1]]$label
## [1] "White House Salaries"
##
## [[1]]$description
## [1] "The White House report to Congress listing the title and salary of every White House Office employee since 1995."

Children tables

out$info$children_tables[[1]]
## $datapath
## [1] "us.gov.whitehouse.nom-and-app"
##
## $label
## [1] "Nominations & Appointments"
##
## $description
## [1] "The nominees and appointees names, positions, agencies under which they are nominated or appointed, the agency's websites, nomination dates, and vote confirmation dates."
##
## $db_boundary_datapath
## [1] "us.gov.whitehouse"
##
## $db_boundary_label
## [1] "The White House"

Use case: Plot frequency of flight distances

First, get columns for the air carrier dataset

dset <- "us.gov.dot.rita.trans-stats.air-carrier-statistics.t100d-market-all-carrier"
head(enigma_metadata(dset)$columns$table[, c(1:4)])
##               id          label         type index
## 1     passengers     Passengers type_varchar     0
## 2        freight Freight (Lbs.) type_varchar     1
## 3           mail    Mail (Lbs.) type_varchar     2
## 4       distance Distance (Mi.) type_varchar     3
## 5 unique_carrier Unique Carrier type_varchar     4
## 6     airline_id     Airline ID type_numeric     5

Looks like there's a column called distance that we can search on. We by default for varchar type columns only frequency bake for the column.

out <- enigma_stats(dset, select = "distance")
head(out$result$frequency)
##   distance count
## 1     0.00 15648
## 2    59.00 12960
## 3   296.00 12748
## 4    16.00 12570
## 5    95.00 11966
## 6    94.00 11964

Then we can do a bit of tidying and make a plot

library("ggplot2")
library("ggthemes")
df <- out$result$frequency
df <- data.frame(distance=as.numeric(df$distance), count=as.numeric(df$count))
ggplot(df, aes(distance, count)) +
  geom_bar(stat="identity") +
  geom_point() +
  theme_grey(base_size = 18) +
  labs(y="flights", x="distance (miles)")

plot of chunk unnamed-chunk-16

Direct dataset download

Enigma provides an endpoint .../export/<datasetid> to download a zipped csv file of the entire dataset.

enigma_fetch() gives you an easy way to download these to a specific place on your machine.

enigma_fetch(dataset='com.crunchbase.info.companies.acquisition')

And a message tells you that a file has been written to disk.

zip file written to
/Users/sacmac/enigma-com-crunchbase-info-companies-acquisition-c610f13ccda70c8051f1aa611766847b.zip