From ea00a1f86450a6565c5641bb96e5e0b083248c58 Mon Sep 17 00:00:00 2001 From: Jenny Bryan Date: Sun, 24 Sep 2023 14:32:22 -0700 Subject: [PATCH] Make it easier to learn about `id` Closes #505 --- R/vroom.R | 6 ++++++ README.Rmd | 9 +++++---- README.md | 32 +++++++++++++++++--------------- man/vroom.Rd | 6 ++++++ 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/R/vroom.R b/R/vroom.R index 780da745..248e1f09 100644 --- a/R/vroom.R +++ b/R/vroom.R @@ -180,6 +180,12 @@ #' #' # Pass the filenames directly to vroom, they are efficiently combined #' vroom(mtcars_by_cyl) +#' +#' # If you need to extract data from the filenames, use `id` to request a +#' # column that reveals the underlying file path +#' dat <- vroom(mtcars_by_cyl, id = "source") +#' dat$source <- basename(dat$source) +#' dat vroom <- function( file, delim = NULL, diff --git a/README.Rmd b/README.Rmd index dc660773..e1c6ff3a 100644 --- a/README.Rmd +++ b/README.Rmd @@ -135,21 +135,22 @@ connections!). First we generate some files to read by splitting the nycflights dataset by airline. +For the sake of the example, we'll just take the first 2 lines of each file. ```{r} library(nycflights13) purrr::iwalk( split(flights, flights$carrier), - ~ { .x$carrier[[1]]; vroom::vroom_write(.x, glue::glue("flights_{.y}.tsv"), delim = "\t") } + ~ { .x$carrier[[1]]; vroom::vroom_write(head(.x, 2), glue::glue("flights_{.y}.tsv"), delim = "\t") } ) ``` -Then we can efficiently read them into one tibble by passing the filenames -directly to vroom. +Then we can efficiently read them into one tibble by passing the filenames directly to vroom. +The `id` argument can be used to request a column that reveals the filename that each row originated from. 
```{r} files <- fs::dir_ls(glob = "flights*tsv") files -vroom::vroom(files) +vroom::vroom(files, id = "source") ``` ```{r, include = FALSE} diff --git a/README.md b/README.md index 863fde56..0ac88c17 100644 --- a/README.md +++ b/README.md @@ -112,18 +112,20 @@ vroom natively supports reading from multiple files (or even multiple connections!). First we generate some files to read by splitting the nycflights dataset -by airline. +by airline. For the sake of the example, we’ll just take the first 2 +lines of each file. ``` r library(nycflights13) purrr::iwalk( split(flights, flights$carrier), - ~ { .x$carrier[[1]]; vroom::vroom_write(.x, glue::glue("flights_{.y}.tsv"), delim = "\t") } + ~ { .x$carrier[[1]]; vroom::vroom_write(head(.x, 2), glue::glue("flights_{.y}.tsv"), delim = "\t") } ) ``` Then we can efficiently read them into one tibble by passing the -filenames directly to vroom. +filenames directly to vroom. The `id` argument can be used to request a +column that reveals the filename that each row originated from. ``` r files <- fs::dir_ls(glob = "flights*tsv") @@ -132,8 +134,8 @@ files #> flights_EV.tsv flights_F9.tsv flights_FL.tsv flights_HA.tsv flights_MQ.tsv #> flights_OO.tsv flights_UA.tsv flights_US.tsv flights_VX.tsv flights_WN.tsv #> flights_YV.tsv -vroom::vroom(files) -#> Rows: 336776 Columns: 19 +vroom::vroom(files, id = "source") +#> Rows: 32 Columns: 20 #> ── Column specification ──────────────────────────────────────────────────────── #> Delimiter: "\t" #> chr (4): carrier, tailnum, origin, dest @@ -142,16 +144,16 @@ vroom::vroom(files) #> #> ℹ Use `spec()` to retrieve the full column specification for this data. #> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message. 
-#> # A tibble: 336,776 × 19 -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time -#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> -#> 1 2013 1 1 810 810 0 1048 1037 -#> 2 2013 1 1 1451 1500 -9 1634 1636 -#> 3 2013 1 1 1452 1455 -3 1637 1639 -#> # ℹ 336,773 more rows -#> # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <dbl>, -#> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, -#> # hour <dbl>, minute <dbl>, time_hour <dttm> +#> # A tibble: 32 × 20 +#> source year month day dep_time sched_dep_time dep_delay arr_time +#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> +#> 1 flights_9E.tsv 2013 1 1 810 810 0 1048 +#> 2 flights_9E.tsv 2013 1 1 1451 1500 -9 1634 +#> 3 flights_AA.tsv 2013 1 1 542 540 2 923 +#> # ℹ 29 more rows +#> # ℹ 12 more variables: sched_arr_time <dbl>, arr_delay <dbl>, carrier <chr>, +#> # flight <dbl>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, +#> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> ``` ## Learning more diff --git a/man/vroom.Rd b/man/vroom.Rd index 86bdbb03..944535d2 100644 --- a/man/vroom.Rd +++ b/man/vroom.Rd @@ -239,4 +239,10 @@ mtcars_by_cyl # Pass the filenames directly to vroom, they are efficiently combined vroom(mtcars_by_cyl) + +# If you need to extract data from the filenames, use `id` to request a +# column that reveals the underlying file path +dat <- vroom(mtcars_by_cyl, id = "source") +dat$source <- basename(dat$source) +dat }