From ea00a1f86450a6565c5641bb96e5e0b083248c58 Mon Sep 17 00:00:00 2001
From: Jenny Bryan <jenny.f.bryan@gmail.com>
Date: Sun, 24 Sep 2023 14:32:22 -0700
Subject: [PATCH] Make it easier to learn about `id`

Closes #505
---
 R/vroom.R    |  6 ++++++
 README.Rmd   |  9 +++++----
 README.md    | 32 +++++++++++++++++---------------
 man/vroom.Rd |  6 ++++++
 4 files changed, 34 insertions(+), 19 deletions(-)
diff --git a/R/vroom.R b/R/vroom.R
index 780da745..248e1f09 100644
--- a/R/vroom.R
+++ b/R/vroom.R
@@ -180,6 +180,12 @@
 #'
 #' # Pass the filenames directly to vroom, they are efficiently combined
 #' vroom(mtcars_by_cyl)
+#'
+#' # If you need to extract data from the filenames, use `id` to request a
+#' # column that reveals the underlying file path
+#' dat <- vroom(mtcars_by_cyl, id = "source")
+#' dat$source <- basename(dat$source)
+#' dat
 vroom <- function(
   file,
   delim = NULL,
diff --git a/README.Rmd b/README.Rmd
index dc660773..e1c6ff3a 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -135,21 +135,22 @@ connections!).
 
 First we generate some files to read by splitting the nycflights dataset by
 airline.
+For the sake of the example, we'll just keep the first 2 rows for each airline.
 ```{r}
 library(nycflights13)
 purrr::iwalk(
   split(flights, flights$carrier),
-  ~ { .x$carrier[[1]]; vroom::vroom_write(.x, glue::glue("flights_{.y}.tsv"), delim = "\t") }
+  ~ { .x$carrier[[1]]; vroom::vroom_write(head(.x, 2), glue::glue("flights_{.y}.tsv"), delim = "\t") }
 )
 ```
 
-Then we can efficiently read them into one tibble by passing the filenames
-directly to vroom.
+Then we can efficiently read them into one tibble by passing the filenames directly to vroom.
+The `id` argument can be used to request a column that reveals the filename that each row originated from.
 
 ```{r}
 files <- fs::dir_ls(glob = "flights*tsv")
 files
-vroom::vroom(files)
+vroom::vroom(files, id = "source")
 ```
 
 ```{r, include = FALSE}
diff --git a/README.md b/README.md
index 863fde56..0ac88c17 100644
--- a/README.md
+++ b/README.md
@@ -112,18 +112,20 @@ vroom natively supports reading from multiple files (or even multiple
 connections!).
 
 First we generate some files to read by splitting the nycflights dataset
-by airline.
+by airline. For the sake of the example, we’ll just keep the first 2
+rows for each airline.
 
 ``` r
 library(nycflights13)
 purrr::iwalk(
   split(flights, flights$carrier),
-  ~ { .x$carrier[[1]]; vroom::vroom_write(.x, glue::glue("flights_{.y}.tsv"), delim = "\t") }
+  ~ { .x$carrier[[1]]; vroom::vroom_write(head(.x, 2), glue::glue("flights_{.y}.tsv"), delim = "\t") }
 )
 ```
 
 Then we can efficiently read them into one tibble by passing the
-filenames directly to vroom.
+filenames directly to vroom. The `id` argument can be used to request a
+column that reveals the filename that each row originated from.
 
 ``` r
 files <- fs::dir_ls(glob = "flights*tsv")
@@ -132,8 +134,8 @@ files
 #> flights_EV.tsv flights_F9.tsv flights_FL.tsv flights_HA.tsv flights_MQ.tsv 
 #> flights_OO.tsv flights_UA.tsv flights_US.tsv flights_VX.tsv flights_WN.tsv 
 #> flights_YV.tsv
-vroom::vroom(files)
-#> Rows: 336776 Columns: 19
+vroom::vroom(files, id = "source")
+#> Rows: 32 Columns: 20
 #> ── Column specification ────────────────────────────────────────────────────────
 #> Delimiter: "\t"
 #> chr   (4): carrier, tailnum, origin, dest
@@ -142,16 +144,16 @@ vroom::vroom(files)
 #> 
 #> ℹ Use `spec()` to retrieve the full column specification for this data.
 #> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
-#> # A tibble: 336,776 × 19
-#>    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#>   <dbl> <dbl> <dbl>    <dbl>          <dbl>     <dbl>    <dbl>          <dbl>
-#> 1  2013     1     1      810            810         0     1048           1037
-#> 2  2013     1     1     1451           1500        -9     1634           1636
-#> 3  2013     1     1     1452           1455        -3     1637           1639
-#> # ℹ 336,773 more rows
-#> # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <dbl>,
-#> #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#> #   hour <dbl>, minute <dbl>, time_hour <dttm>
+#> # A tibble: 32 × 20
+#>   source          year month   day dep_time sched_dep_time dep_delay arr_time
+#>   <chr>          <dbl> <dbl> <dbl>    <dbl>          <dbl>     <dbl>    <dbl>
+#> 1 flights_9E.tsv  2013     1     1      810            810         0     1048
+#> 2 flights_9E.tsv  2013     1     1     1451           1500        -9     1634
+#> 3 flights_AA.tsv  2013     1     1      542            540         2      923
+#> # ℹ 29 more rows
+#> # ℹ 12 more variables: sched_arr_time <dbl>, arr_delay <dbl>, carrier <chr>,
+#> #   flight <dbl>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
+#> #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
 ```
 
 ## Learning more
diff --git a/man/vroom.Rd b/man/vroom.Rd
index 86bdbb03..944535d2 100644
--- a/man/vroom.Rd
+++ b/man/vroom.Rd
@@ -239,4 +239,10 @@ mtcars_by_cyl
 
 # Pass the filenames directly to vroom, they are efficiently combined
 vroom(mtcars_by_cyl)
+
+# If you need to extract data from the filenames, use `id` to request a
+# column that reveals the underlying file path
+dat <- vroom(mtcars_by_cyl, id = "source")
+dat$source <- basename(dat$source)
+dat
 }