% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor}
%
\documentclass[
krantz2]{krantz}
\usepackage{amsmath,amssymb}
\usepackage{lmodern}
\usepackage{iftex}
\ifPDFTeX
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
\usepackage{unicode-math}
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\usepackage{color}
\usepackage{fancyvrb}
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\usepackage{framed}
\definecolor{shadecolor}{RGB}{248,248,248}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.33,0.33,0.33}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textbf{\textit{#1}}}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.61,0.61,0.61}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.06,0.06,0.06}{#1}}
\newcommand{\BuiltInTok}[1]{#1}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.5,0.5,0.5}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textbf{\textit{#1}}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0,0,0}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.27,0.27,0.27}{\textbf{#1}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.27,0.27,0.27}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.06,0.06,0.06}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textbf{\textit{#1}}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.14,0.14,0.14}{\textbf{#1}}}
\newcommand{\ExtensionTok}[1]{#1}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.06,0.06,0.06}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0,0,0}{#1}}
\newcommand{\ImportTok}[1]{#1}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textbf{\textit{#1}}}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.27,0.27,0.27}{\textbf{#1}}}
\newcommand{\NormalTok}[1]{#1}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.43,0.43,0.43}{\textbf{#1}}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
\newcommand{\RegionMarkerTok}[1]{#1}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0,0,0}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.5,0.5,0.5}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.5,0.5,0.5}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0,0,0}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.5,0.5,0.5}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textbf{\textit{#1}}}}
\usepackage{longtable,booktabs,array}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
\usepackage{etoolbox}
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother
\setlength{\emergencystretch}{3em} % prevent overfull lines
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{5}
\usepackage{booktabs}
\usepackage{longtable}
\usepackage[bf,singlelinecheck=off]{caption}
\captionsetup[table]{labelsep=space}
\captionsetup[figure]{labelsep=space}
\usepackage[scale=.8]{sourcecodepro}
\usepackage{framed,color}
\definecolor{shadecolor}{RGB}{248,248,248}
\renewcommand{\textfraction}{0.05}
\renewcommand{\topfraction}{0.8}
\renewcommand{\bottomfraction}{0.8}
\renewcommand{\floatpagefraction}{0.75}
\renewenvironment{quote}{\begin{VF}}{\end{VF}}
% \let\oldhref\href
% \renewcommand{\href}[2]{#2\footnote{\url{#1}}}
\makeatletter
\newenvironment{kframe}{%
\medskip{}
\setlength{\fboxsep}{.8em}
\def\at@end@of@kframe{}%
\ifinner\ifhmode%
\def\at@end@of@kframe{\end{minipage}}%
\begin{minipage}{\columnwidth}%
\fi\fi%
\def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep
\colorbox{shadecolor}{##1}\hskip-\fboxsep
% There is no \\@totalrightmargin, so:
\hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}%
\MakeFramed {\advance\hsize-\width
\@totalleftmargin\z@ \linewidth\hsize
\@setminipage}}%
{\par\unskip\endMakeFramed%
\at@end@of@kframe}
\makeatother
\renewenvironment{Shaded}{\begin{kframe}}{\end{kframe}}
\usepackage{makeidx}
\makeindex
% \urlstyle{tt}
\usepackage{amsthm}
\makeatletter
\def\thm@space@setup{%
\thm@preskip=8pt plus 2pt minus 4pt
\thm@postskip=\thm@preskip
}
\makeatother
\frontmatter
\ifLuaTeX
\usepackage{selnolig} % disable illegal ligatures
\fi
\usepackage[]{natbib}
\bibliographystyle{apalike}
\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same} % disable monospaced font for URLs
\hypersetup{
pdftitle={Hands-on Data Science for Librarians},
pdfauthor={Sarah Lin \& Dorris Scott},
colorlinks=true,
linkcolor={Maroon},
filecolor={Maroon},
citecolor={Blue},
urlcolor={Blue},
pdfcreator={LaTeX via pandoc}}
\title{Hands-on Data Science for Librarians}
\author{Sarah Lin \& Dorris Scott}
\date{2022-09-21}
\begin{document}
\maketitle
% you may need to leave a few empty pages before the dedication page
%\cleardoublepage\newpage\thispagestyle{empty}\null
%\cleardoublepage\newpage\thispagestyle{empty}\null
%\cleardoublepage\newpage
\thispagestyle{empty}
\begin{center}
To my son,
without whom I should have finished this book two years earlier
%\includegraphics{images/dedication.pdf}
\end{center}
\setlength{\abovedisplayskip}{-5pt}
\setlength{\abovedisplayshortskip}{-5pt}
{
\hypersetup{linkcolor=}
\setcounter{tocdepth}{2}
\tableofcontents
}
\listoffigures
\hypertarget{preface}{%
\chapter*{Preface}\label{preface}}
Resources to learn R are all over the internet and most libraries. However, easy access to resources doesn't mean it's easy to learn to do data science in R. This book spends time on an introduction to R and basic data cleaning tasks that are taught elsewhere because we want to provide a gentle, low-stress introduction to key aspects of data science using R. Librarians have varied backgrounds, but for most of us, rigorous education in mathematics, statistics, and computer science is not part of our expertise. That doesn't mean we can't learn to code or do data science in code. Based on our own experiences, we are particularly concerned that you, our reader, are able to access the content in this book with minimal frustration, exasperation, and despair.
The resources at the end of each chapter, in the appendix, and in the bibliography of this book will provide you with next steps to further your data science skills beyond this introductory text. With a basic foundation in data science skills, any of the resources we link to should be comprehensible, if challenging. We wish you well on your data science journey!
\hypertarget{what-youll-need}{%
\section*{What you'll need}\label{what-youll-need}}
Access to a personal computer (desktop or laptop) with permission to install programs, such as the Chrome web browser and extensions.
\hypertarget{software-information-and-conventions}{%
\section*{Software information and conventions}\label{software-information-and-conventions}}
We used the \emph{knitr}\index{knitr} package \citep{xie2015} and the \emph{bookdown}\index{bookdown} package \citep{R-bookdown} to compile this book. Our R session information is shown below:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{xfun}\SpecialCharTok{::}\FunctionTok{session\_info}\NormalTok{()}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Locale: en_US.UTF-8 / en_US.UTF-8 / en_US.UTF-8 / C / en_US.UTF-8 / en_US.UTF-8
##
## Package version:
## base64enc_0.1.3 bookdown_0.28 bslib_0.4.0
## cachem_1.0.6 cli_3.3.0 compiler_4.2.1
## digest_0.6.29 evaluate_0.16 fastmap_1.1.0
## fs_1.5.2 glue_1.6.2 graphics_4.2.1
## grDevices_4.2.1 highr_0.9 htmltools_0.5.3
## jquerylib_0.1.4 jsonlite_1.8.0 knitr_1.39
## magrittr_2.0.3 memoise_2.0.1 methods_4.2.1
## R6_2.5.1 rappdirs_0.3.3 renv_0.15.5
## rlang_1.0.4 rmarkdown_2.14 rstudioapi_0.13
## sass_0.4.2 stats_4.2.1 stringi_1.7.8
## stringr_1.4.0 tinytex_0.40 tools_4.2.1
## utils_4.2.1 xfun_0.32 yaml_2.3.5
\end{verbatim}
Package names are in italic text (e.g., \emph{rmarkdown}), and inline code and filenames are formatted in a typewriter font (e.g., \texttt{knitr::knit(\textquotesingle{}foo.Rmd\textquotesingle{})}). Function names are followed by parentheses (e.g., \texttt{bookdown::render\_book()}).
We use the assignment (\texttt{\textless{}-}) operator in all code chunks to assign and store objects in this book, but you can also use the equals sign (\texttt{=}).
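As a quick illustration, the following two statements are equivalent; this book always uses the first form:
\begin{verbatim}
x <- 5  # assignment with the arrow operator
x = 5   # assignment with the equals sign
\end{verbatim}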
In 2022, the company RStudio, PBC changed its name to Posit, PBC. The open source IDE created by this company is now known as either ``the RStudio IDE'', or simply ``RStudio.'' We use both terms interchangeably in this book.
\hypertarget{acknowledgments}{%
\section*{Acknowledgments}\label{acknowledgments}}
We would like to thank Greg Wilson, Myfawnwy Johnston, Patrick Alston, Emily Nimsakont, Luke Johnston, and Carl Howe.
\hypertarget{about-the-authors}{%
\chapter*{About the Authors}\label{about-the-authors}}
Sarah Lin manages the Enterprise Information Management team at Posit, PBC. A graduate of the University of Illinois iSchool, Sarah worked as a technical services librarian in many different library types before moving into corporate librarianship and information management. She didn't know anything about coding in R before joining Posit.
Dorris Scott is \ldots{}
\mainmatter
\hypertarget{introduction}{%
\chapter{Introduction}\label{introduction}}
\hypertarget{data-science}{%
\section{What is data science?}\label{data-science}}
Data science degree and certificate programs have sprouted at academic institutions around the country, while books, articles, and presentations about data and how to analyze it regularly appear in library conference programs and educational events. The increased visibility of data science belies the fact that data science has been around for a while. Indeed, data collection and the need to make sense of it are not new. R, the programming language used in this book, has been around for decades. However, experts have some back-and-forth about the discipline of data science and its relationship to other subjects.
Rather than take sides, this book takes a broad view of what constitutes data science and highlights five interdependent elements. These include both \emph{mathematics} and \emph{statistics} on the computational side. With or without a graphical user interface, data science is made real through \emph{computer programming}. Practitioners of data science bring extensive \emph{subject matter knowledge}. Their expertise enables them to communicate their conclusions through data \emph{visualizations}, often providing pictures that speak louder than numbers.
\begin{figure}
\centering
\includegraphics{images/DS-AI-CS-Graphic-UPDATED-Aug2021.jpeg}
\caption{Data Science as Discipline Diagram, Data Science Program, Viterbi School of Engineering, University of Southern California, \url{http://datascience.usc.edu}, 2021}
\end{figure}
Data science is a discipline that extracts knowledge from data in various fields, including librarianship. While data science can help make decisions, it is not a substitute for human decision-making. It can provide insights and generalizations from collected observations (data). Aspects of some subjects remain unquantifiable yet comprehensible to human interpretation. Data analysis is fallible; it requires data science practitioners to bring their expertise to bear on interpretation and decision-making.
Whether we realize it or not, data science is a broad discipline that saturates our professional lives. For academic librarians, the faculty, staff, and students they serve learn and perform data science tasks daily, such as data cleaning, management, and visualization. This occurs in the computational sciences as well as the biological, physical, and social sciences, and even in the humanities. In addition, librarians can act as data curators who help researchers publish or deposit their data in data repositories and academic journals.
Corporations and other institutions with special libraries likely have teams using many tools to analyze the market or user behavior. Predictive text in search engines relies upon text mining and machine learning. Humanities and social science professionals use maps, analysis, web scraping, and text mining to create and analyze datasets. These disciplines need to communicate their findings through written reports and dashboards for their stakeholders and constituents. Data science also permeates the public sphere. Users are subject to machine learning algorithms in their daily lives within loan applications, resume screenings, social media feeds, news visualizations, public health data, social services eligibility, and medical care. Public librarians interact with patrons whose complex information needs may result from how data science impacts their lives. Data literacy is required when data science provides input for human decisions, particularly when those decisions affect others' well-being.
\hypertarget{learn-ds}{%
\section{Why learn data science?}\label{learn-ds}}
Librarians have long collected metrics about their collections and their patrons. However, the pervasiveness of data collection and the need to justify or rationalize library expenditures create an environment in which library and information professionals can put data science to work in their own best interests. Because librarians are both consumers of data and teachers of data literacy, they must acquire skills to perform data science and interrogate data analyses to determine their veracity.
Data literacy is the ability to read, interpret, and analyze data, and it is a requirement when people use data to distort the truth\footnote{\url{https://royalsocietypublishing.org/doi/10.1098/rsos.190161}}. Unfortunately, data literacy is a skill we need all too often. Data science enables data literacy and democratizes access to the source material; so much of our personal and professional lives are affected by data, whether created or influenced by data-driven decision-making. Data provides valuable information to help experts make decisions. Beyond just the economy, so much in our society rewards data literacy and penalizes the illiterate. Because of this, data is too valuable to be left only to data scientists, computer scientists, or statisticians. Instead, subject experts need to learn to code because they know their data best and are best suited to analyze it and draw sound, accurate conclusions. Your professional expertise lets you ask the right questions and interpret meaning from the data. When experts in their field add data science skills to their repertoire, data science is further democratized\footnote{\url{https://www.rstudio.com/resources/rstudioconf-2020/data-science-education-in-2022/}}, and data-driven decisions are more impactful.
\hypertarget{use-code}{%
\section{Why use code?}\label{use-code}}
Ever the proponents of literacy, librarians have embraced data literacy and data-driven decision-making for many years. Conference sessions to improve both data collection and analytics presentation abound. When data skills are adopted, it is usually in the context of a commercial spreadsheet or analytics program. Learning to code is not as common among library and information professionals; this book argues that learning to code is doable and provides increased utility and impact. In the long run, learning a programming language for data science is best because it is accessible to all, ensures data analysis is reproducible, and is future-proof as applications change.
If we define programming as being able to talk to computers in a language they understand, then most librarians have already done that and are probably quite good at it. Technical services and cataloging librarians will be familiar with MARC (Machine Readable Cataloging), the special syntax libraries use to catalog their collections so that computer software can read them. More commonly, if you've written formulas in a spreadsheet application, you've dabbled in the basics of computer programming. However, learning to code offers far greater applications and versatility than a spreadsheet application.
The core benefits of doing data science in code are interoperability and reproducibility. Many academic librarians will be familiar with the FAIR Principles\footnote{\url{https://www.go-fair.org/}} through their data curation work; this initiative focuses on making information Findable, Accessible, Interoperable, and Reusable. Doing data science in code ensures that data and data analysis are both interoperable and reproducible, neither of which is possible with proprietary software applications.
Interoperability means that other librarians, who may have completely different software applications on their computers, are able to run anyone else's code. The R programming language is an open-source tool that is free to anyone across the globe and provides transparent data analysis. Additionally, platform-agnostic tools like code can bring together the output of multiple commercial products to rationalize and analyze the data together.
Reproducibility is closely related to interoperability: beyond running on any application configuration, the analysis must be able to be re-run by another person with the same results. In the past few years, there have been stories in the news about spreadsheet errors that led researchers to draw erroneous conclusions. In one case, years of austerity measures around the globe rested on one economics research paper that was missing a few values for some variables\footnote{\url{https://www.businessinsider.com/thomas-herndon-michael-ash-and-robert-pollin-on-reinhart-and-rogoff-2013-4}}. Using code allows researchers to combine their data, code, and analysis, providing transparency into the process of data science. Unfortunately, there have been other examples of reproducibility problems in various scientific disciplines: physics\footnote{\url{https://physicstoday.scitation.org/do/10.1063/PT.6.1.20180822a/full/}}, psychology\footnote{\url{https://www.science.org/doi/10.1126/science.aac4716}}, and medical research\footnote{\url{https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.1002165}} as well. A librarian will often need to re-run an analysis on new iterations of data without repeating the data cleaning and analysis steps manually. Thankfully, code can be run repeatedly with new data as input, saving hours and hours while repeating each step precisely. The ultimate benefit of doing data science using computer programming languages is the ability to share both the raw data and the steps of the analysis.
\hypertarget{vignette}{%
\section{Vignette}\label{vignette}}
This book creates an overarching narrative that presents realistic code examples and valuable outputs centered around a hypothetical outreach librarian in St.~Louis, MO. Envision that you are this outreach librarian and you want to create a partnership with community institutions to address unemployment in St.~Louis. Your goal is to present a report to stakeholders at the library and within the community that analyzes several data sources related to employment and unemployment in St.~Louis. You will employ different data science skills to compile the report. Each chapter in this book touches on a different aspect of your report, with the chapters building upon one another as you learn data science and code each analytical section in R.
The reader is invited to inhabit the role of this librarian, whom we will address as `you' throughout the book; we introduce each chapter with a scenario that describes what the librarian is trying to accomplish with each data science skill.
\hypertarget{book-structure}{%
\section{Structure of this book}\label{book-structure}}
In pursuit of data to justify a community partnership, you will learn R in incremental steps with a topic for each chapter that will produce one aspect of the final report. This book isn't an exhaustive textbook on R or data science but rather a guidebook through the central functional practices of data science in R. The focus is on immediately applicable skill acquisition made easier through library-specific hypothetical tasks. The chapter topics include:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Use RStudio to code in R
\item
Learn to clean data using code
\item
Plot basic visualizations
\item
Scrape websites using code
\item
Visualize data using maps
\item
Use code to mine textual data
\item
Publish your code using R Markdown
\item
Communicate your findings via Flexdashboard
\item
Let stakeholders draw their conclusions from an interactive Shiny application
\item
Understand how AI intersects with employment by understanding how machine learning works
\end{enumerate}
To expand on this list, the first two chapters explain R, the RStudio IDE used to program in R, and how to get started cleaning data. In any data-related project, cleaning data is the first and often the most time-consuming task. Chapters three through nine teach different data science skills: plots/graphs, web scraping, geographic visualizations, text mining, publishing, dashboards, and interactive web applications. The final chapter covers machine learning, explaining the construction of algorithms and their implications for librarians who interact with them. An explanation of how résumé-screening software uses machine learning to accept or reject job applications connects the mechanics of machine learning to the experiences job seekers have through the prospective outreach partnership.
\hypertarget{audience}{%
\section{Who this book is for}\label{audience}}
The anticipated audience for this book is all librarians and information professionals interested in learning data science and applying it to their everyday jobs. Public, academic, medical, legal, special, and corporate librarians can all put the data science skills taught in this book to use in their daily work. The book has been designed with examples adaptable to many job positions and library types, creating a practical introduction to primary data science skills needed in a professional setting. This book does not include in-depth explanations of particular R packages, the statistical and mathematical principles behind package functions, or theoretical foundations of different analysis types. There are several related topics that, while not required, are helpful to learn alongside or following this book. The Appendix includes those topics, and resources to learn more about them.
\hypertarget{rstudio}{%
\chapter{Using RStudio's IDE}\label{rstudio}}
\hypertarget{rstudio-los}{%
\section{Learning Objective: use the RStudio IDE (Integrated Development Environment) for importing data.}\label{rstudio-los}}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Use your computer knowledge to install RStudio.
\item
Describe the function of each pane in the IDE.
\item
Modify IDE settings to your liking.
\item
Use the IDE to import a tabular data file.
\end{enumerate}
\hypertarget{rstudio-terms}{%
\section{Terms You'll Learn}\label{rstudio-terms}}
\begin{itemize}
\tightlist
\item
integrated development environment (IDE)
\item
package
\item
tidyverse
\item
session
\item
working directory
\end{itemize}
\hypertarget{rstudio-scenario}{%
\section{Scenario}\label{rstudio-scenario}}
You want to use R to do data science and publish a data-based report to support your outreach efforts, but you don't know how to code in R or get started.
\hypertarget{rstudio-intro}{%
\section{Introduction}\label{rstudio-intro}}
This chapter aims to get you up and running with programming in R using the RStudio \emph{Integrated Development Environment}, or IDE, which is generally referred to as `RStudio.' An IDE is a computer program that makes it easier to code; while you can use your computer's command line\footnote{the program that enables you to type commands that your computer will follow to complete a task, such as Terminal on macOS} or UNIX shell\footnote{\url{https://librarycarpentry.org/lc-shell/01-intro-shell/index.html}} interface to code, the graphical user interface of an IDE makes it a lot more accessible. The distinction between coding at the command line and using an IDE is a lot like the difference between finding stored files in the command line and using Finder/File Explorer on your work or personal computer. While there are some scenarios where using the command line makes the most sense, for the day-to-day, most computer users use the Finder/File Explorer to more easily navigate through their files and data. IDEs are very common in computer programming, and many different applications exist. We're using RStudio because it was designed specifically for R, though you can use it to program in Python and other languages. It is free and open-source, and using it to program in R is a widely-used way to wrangle and interpret data. We will also cover the basics of R as a programming language and a widely-used collection of packages called the Tidyverse, and then install RStudio to get started with R.
\hypertarget{what-is-r}{%
\section{What is R?}\label{what-is-r}}
Version 1.0 of the R programming language was released publicly in 2000\footnote{\url{https://blog.revolutionanalytics.com/2020/07/the-history-of-r-updated-for-2020.html}}, five years after initial distribution as open-source software. The intellectual genealogy of R comes from the S statistical programming language, created at Bell Labs in the 1970s\footnote{\url{https://youtu.be/jk9S3RTAl38}}. As a programming language, R was designed for statisticians to analyze data interactively. R's statistical and academic origins stand in contrast to other programming languages used for data science.
R is an object-based programming language, where code and outputs are stored as objects to be acted upon later. In algebra, a variable might store a single value or mathematical expression; an R object can hold one value or many. Where algebra uses an equals sign to denote what a variable is, such as \texttt{x\ =\ 5}, R uses \texttt{\textless{}-} in the same way. You can read the left-pointing arrow as the word ``is.'' We can use the \texttt{print()} function to display the value of an object when we put the object we want to see inside of the parentheses.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{x }\OtherTok{\textless{}{-}} \DecValTok{5}
\NormalTok{y }\OtherTok{\textless{}{-}}\NormalTok{ x }\SpecialCharTok{+} \DecValTok{2}
\FunctionTok{print}\NormalTok{(y)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## [1] 7
\end{verbatim}
R lets us work with data interactively through the use of code. When we write code in R, we are usually creating and saving data objects of various classes according to our needs. We can then conduct operations and/or analyses on these data objects in our R session(s).
Common classes (types) for these objects include numeric, character (text), and logical (true/false). Objects of a single class are often collected and stored together as vectors. Vectors can in turn be grouped together to make larger data objects you might already be familiar with, including matrices, arrays, or data frames. In this book we focus on data frames.
The data frame structure is central to data analysis because it requires every column to have the same length, just like a table: each column must have the same number of rows. This consistent table-like structure is vital for many data science functions. Readers who move on to further data science tasks beyond this book will need to understand data classes and structures. Coding errors in R are often traced back to incompatible data structures or inconsistent application of classes.
Please note that any code preceded by a \texttt{\#} functions as a comment because R ignores anything following that character. We can combine multiple values into one object using \texttt{c()} and determine an object's class using \texttt{class()}.
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# numeric vector}
\NormalTok{numbers }\OtherTok{\textless{}{-}} \FunctionTok{c}\NormalTok{(}\DecValTok{8}\NormalTok{, }\DecValTok{6}\NormalTok{, }\DecValTok{7}\NormalTok{, }\DecValTok{5}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{9}\NormalTok{) }
\FunctionTok{class}\NormalTok{(numbers)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## [1] "numeric"
\end{verbatim}
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#logical vector}
\NormalTok{values }\OtherTok{\textless{}{-}} \FunctionTok{c}\NormalTok{(}\ConstantTok{TRUE}\NormalTok{, }\ConstantTok{TRUE}\NormalTok{, }\ConstantTok{TRUE}\NormalTok{, }\ConstantTok{FALSE}\NormalTok{, }\ConstantTok{FALSE}\NormalTok{, }\ConstantTok{FALSE}\NormalTok{, }\ConstantTok{TRUE}\NormalTok{)}
\FunctionTok{class}\NormalTok{(values)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## [1] "logical"
\end{verbatim}
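Vectors like these can serve as the columns of a data frame. Here is a minimal sketch (the values are invented) showing that every column must have the same length:
\begin{verbatim}
# combine three equal-length vectors into a data frame
books <- data.frame(
  title       = c("A", "B", "C"),
  pages       = c(120, 250, 175),
  checked_out = c(TRUE, FALSE, TRUE)
)
class(books)  # "data.frame"
nrow(books)   # 3: every column has three rows
\end{verbatim}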
Designed explicitly to work with data, R works with many object types. Like many other programming languages, when data scientists find a need for specific applications and groups of related functions, they can create and bundle them into what R calls ``packages''. A \emph{package} is a group of associated functions equivalent to what in other languages might be called a ``library.''
The complete list of the thousands and thousands of contributed R packages is on CRAN (\url{https://cran.r-project.org/}). These run the gamut from technical to purely fun. Some packages focus on a particular skill (web scraping) or a specific dataset (the Project Gutenberg books). You'll become familiar with a dozen or so packages throughout this book.
\hypertarget{tidyverse}{%
\section{Introducing the Tidyverse}\label{tidyverse}}
One of the most helpful R packages to become familiar with is the \emph{tidyverse} package, which is a collection of packages\footnote{\url{https://www.tidyverse.org/packages/}} usually referred to as the Tidyverse. Each of these, listed below, focuses on a different aspect of cleaning or tidying data before it's used or analyzed further. While it is possible to use ``base R,'' meaning the functions that come loaded with R when installing it, many R users prefer the Tidyverse because its packages make common tasks in R easier. The Tidyverse packages all work together, and Posit, PBC staff maintain them.
The ``core'' Tidyverse packages include:
\begin{itemize}
\tightlist
\item
ggplot2, for data visualization
\item
dplyr, for data manipulation
\item
tidyr, for data tidying
\item
readr, for importing data from CSV files
\item
purrr, for functional programming (such as repetitive functions)
\item
tibble, for tibbles, a more straightforward way to create data frames
\item
stringr, for manipulating strings\footnote{\url{https://en.wikipedia.org/wiki/String_(computer_science)}}
\item
forcats, for factors (a data structure not used in this book)
\end{itemize}
There are several other packages in the Tidyverse, and we will use several of them in this book:
\begin{itemize}
\tightlist
\item
httr, for web APIs
\item
jsonlite, for JSON files
\item
readxl, for .xls and .xlsx files (not used in this book, but useful for those who use Microsoft Excel frequently)
\item
rvest, for web scraping
\item
xml2, for working with XML formats
\end{itemize}
This book will cover the purpose and functions of these packages as they are needed.
\hypertarget{ide-start}{%
\section{Getting Started with the RStudio IDE}\label{ide-start}}
There are many ways to interface with R on your computer, and you can choose the interface that makes the most sense for you. Millions of R users use the graphical user interface provided by RStudio:
\begin{quote}
The RStudio IDE is a set of integrated tools designed to help you be more productive with R and Python. It includes a console, syntax-highlighting editor that supports direct code execution, and a variety of robust tools for plotting, viewing history, debugging, and managing your workspace\footnote{\url{https://www.rstudio.com/products/rstudio/download/}}.
\end{quote}
RStudio is also open-source software, which means that the code used to create it is freely available to download, use, and modify. In contrast, other statistical analysis software programs have inaccessible code and require paid subscriptions. Additionally, Posit, PBC supports the continued development of RStudio by dedicating a portion of its engineering team to work only on open-source software projects.
\hypertarget{install-r}{%
\subsection{Install R}\label{install-r}}
The RStudio IDE does not come with R; instead, download the latest version of R for your operating system from the Comprehensive R Archive Network, or CRAN\footnote{\url{https://cran.r-project.org/}}. Follow the download and installation instructions for your operating system to install R.
\hypertarget{install-the-rstudio-ide}{%
\subsection{Install the RStudio IDE}\label{install-the-rstudio-ide}}
We will use the open-source desktop version of the IDE, which is available as a free download from Posit's website\footnote{\url{https://www.rstudio.com/products/rstudio/download/\#download}}. On the download page, you should select the correct version of the IDE that matches your operating system.\\
\includegraphics{images/ide-download.png}
After selecting the download button, follow the prompts on your computer to install RStudio.
\hypertarget{navigate-ide}{%
\subsection{Navigating RStudio}\label{navigate-ide}}
The RStudio IDE brings together all the tools you need to do data science: an editor to write code and text, a console to execute code, access to your computer's terminal, a file explorer, a viewer pane for graphs and visualizations, as well as a version control pane for those who use Git or GitHub (see Appendix~\ref{appendix}). While it can accommodate many programming languages, the focus of this book will be using RStudio to code in R. Within the ecosystem of R tools, it includes common code libraries and other tools, like spellcheck, which make the work of data science much more manageable.
\includegraphics{images/rstudio-landing-screen.png}
RStudio has numerous features, but this book covers only those necessary for the tasks at hand. The left-hand pane is called the console, where we can type code directly; the IDE also echoes code run from files there, so we can see a log of our code as it executes. Additional tabs in that pane include the terminal (see Appendix~\ref{appendix}). On the top right is the environment pane, where the R objects you create and use in your session are stored. The bottom right is the files pane, where you can navigate through your computer's file directory. Other useful tabs in that pane are Help and Viewer, which shows any graphs or plots you create.
RStudio uses the concept of project files, which group together all the code and dataset files for one project. Every new data science project should start with a new R project in the RStudio IDE. From the \emph{File} menu, select \emph{New Project} and follow the prompts to create a new project. Each project must have a name, which will create a folder of the same name; all your code and other files are saved within that folder. Naming projects separately keeps project files organized and more easily navigable from a file directory. When you open a project at the start of your work session, the IDE will use the file directory for that project as that session's working directory. Any files created will be automatically saved to that same directory or folder, which helps keep files organized.
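To confirm where a session is pointed, you can run the following in the console (the output will vary by machine):
\begin{verbatim}
getwd()       # print the current working directory
list.files()  # list the files saved there
\end{verbatim}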
Once you create a file or open one, the console moves to the left bottom, and an editor pane opens in the upper left. To store some R code as a file to access or re-run later, create a new R Script file by going to File \textgreater{} New File \textgreater{} R Script.
\includegraphics{images/new-r-script.png}
With four panes, the IDE screen looks like this:
\includegraphics{images/4-panes.png}
\hypertarget{pkgs-download}{%
\section{Packages needed for this book}\label{pkgs-download}}
As you progress in your data science journey, you will install more and more R packages. As with any new project you start, begin by installing all the packages you will need to use. Please see Appendix~\ref{appendix} for instructions on installing additional software that these packages depend on to function properly (commonly called `dependencies'). You might see prompts in the console during this process. If you're asked to install other packages, say `yes.' If you're asked whether you want to compile packages from source, say `no.'
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# install the packages needed by this book; you may need to install dependencies before proceeding or if you encounter an error message}
\FunctionTok{lapply}\NormalTok{(}\FunctionTok{c}\NormalTok{(}\StringTok{\textquotesingle{}xfun\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}tidyverse\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}gapminder\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}tidytext\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}jsonlite\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}units\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}rgdal\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}terra\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}sf\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}tmap\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}tidycensus\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}readr\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}textdata\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}tidymodels\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}flexdashboard\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}DT\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}shiny\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Rcpp\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}raster\textquotesingle{}}\NormalTok{), }\ControlFlowTok{function}\NormalTok{(pkg) \{}
\ControlFlowTok{if}\NormalTok{ (}\FunctionTok{system.file}\NormalTok{(}\AttributeTok{package =}\NormalTok{ pkg) }\SpecialCharTok{==} \StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{) }\FunctionTok{install.packages}\NormalTok{(pkg, }\AttributeTok{repos=}\StringTok{"https://cloud.r{-}project.org"}\NormalTok{)}
\NormalTok{\})}
\CommentTok{\# installing \_rgdal\_ can sometimes result in errors. Please see the Appendix for troubleshooting tips.}
\end{Highlighting}
\end{Shaded}
At the start of all subsequent chapters, you'll notice a code chunk that loads each package into your current session using the \texttt{library()} function. Installing a package happens only once, but loading a package must occur each time you open RStudio or start a new R session.
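As a minimal sketch of that pattern, using the Tidyverse as an example:
\begin{verbatim}
# run once per computer (or after upgrading R)
install.packages("tidyverse")

# run in every new R session that uses the package
library(tidyverse)
\end{verbatim}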
\hypertarget{ide-viewing}{%
\section{Viewing tabular data in RStudio}\label{ide-viewing}}
Let's read some data into R and get more comfortable with RStudio while exploring the data. We'll use COVID stats for the city of St.~Louis that are available at: \url{https://www.stlouis-mo.gov/covid-19/data/\#totalsByDate}. Scroll down to Totals By Specimen Collection Date and click View Data, then save the csv file.
After the file is saved, we can use the \texttt{read.csv()} function to read the file into R and make it available for us to use. (\texttt{read.csv()} comes with base R; the Tidyverse package \emph{readr} provides a similar, faster \texttt{read\_csv()} function.) First, we need to load the Tidyverse packages we already installed with the \texttt{library()} function.
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#load \_readr\_ as part of the \_tidyverse\_ package}
\FunctionTok{library}\NormalTok{(tidyverse) }
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## -- Attaching packages -------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ----------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
\end{verbatim}
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\#create an object to store the csv data we read in}
\NormalTok{stl\_covid }\OtherTok{\textless{}{-}} \FunctionTok{read.csv}\NormalTok{(}\StringTok{"City{-}of{-}St{-}Louis{-}COVID{-}19{-}Case{-}Data.csv"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
Everything we want to work with must be stored as an object, so we created a \texttt{stl\_covid} object that contains the contents of the CSV file we just downloaded. Most COVID datasets are very large, so while we could click on this object in the Environment pane and open it to view the entire file, we can instead use a few R functions to get a sense of what this dataset looks like.
If we want to see the entire file, we can use the \texttt{view()} command (you may also see it capitalized as \texttt{View()}) to open up a spreadsheet view in our editor pane. The file is very large, as expected.
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{view}\NormalTok{(stl\_covid)}
\end{Highlighting}
\end{Shaded}
\includegraphics{images/view-covid.png}
We can also use some built-in base R functions to see snippets of the \texttt{stl\_covid} dataset. To see the first six lines (the default), we can use \texttt{head()}; \texttt{tail()} shows the last six. Both accept an \texttt{n} argument if you want a different number of rows. An additional function is \texttt{summary()}, which will display summary statistics for each column in the data frame.
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{head}\NormalTok{(stl\_covid)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## CONFIRMEDCASECHANGE DATE DEATHCHANGE
## 1 1 03-03-2020 0
## 2 0 03-04-2020 0
## 3 0 03-05-2020 0
## 4 0 03-06-2020 0
## 5 0 03-07-2020 0
## 6 0 03-08-2020 0
## PROBABLECASECHANGE R0 R0CIHIGH R0CILOW
## 1 0 NA NA NA
## 2 0 NA NA NA
## 3 0 NA NA NA
## 4 0 NA NA NA
## 5 0 NA NA NA
## 6 0 NA NA NA
## TOTALCONFIRMEDCASES TOTALDEATHS TOTALPROBABLECASES
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 1 0 0
## 6 1 0 0
\end{verbatim}
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{tail}\NormalTok{(stl\_covid)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## CONFIRMEDCASECHANGE DATE DEATHCHANGE
## 743 8 03-15-2022 1
## 744 11 03-16-2022 0
## 745 11 03-17-2022 0
## 746 7 03-18-2022 0
## 747 5 03-19-2022 0
## 748 0 03-20-2022 0
## PROBABLECASECHANGE R0 R0CIHIGH R0CILOW
## 743 5 NA NA NA
## 744 3 NA NA NA
## 745 0 NA NA NA
## 746 1 NA NA NA
## 747 1 NA NA NA
## 748 0 NA NA NA
## TOTALCONFIRMEDCASES TOTALDEATHS TOTALPROBABLECASES
## 743 45378 746 7504
## 744 45389 746 7507
## 745 45400 746 7507
## 746 45407 746 7508
## 747 45412 746 7509
## 748 45412 746 7509
\end{verbatim}
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{summary}\NormalTok{(stl\_covid)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## CONFIRMEDCASECHANGE DATE
## Min. : 0.0 Length:748
## 1st Qu.: 21.0 Class :character
## Median : 35.0 Mode :character
## Mean : 60.7
## 3rd Qu.: 66.0
## Max. :735.0
##
## DEATHCHANGE PROBABLECASECHANGE R0
## Min. : 0.000 Min. : -2 Min. :0.56
## 1st Qu.: 0.000 1st Qu.: 0 1st Qu.:0.89
## Median : 0.000 Median : 5 Median :0.99
## Mean : 0.997 Mean : 10 Mean :1.04
## 3rd Qu.: 2.000 3rd Qu.: 12 3rd Qu.:1.14
## Max. :12.000 Max. :241 Max. :3.99
## NA's :34
## R0CIHIGH R0CILOW TOTALCONFIRMEDCASES
## Min. :0.61 Min. :0.51 Min. : 1
## 1st Qu.:1.00 1st Qu.:0.76 1st Qu.: 6541
## Median :1.11 Median :0.89 Median :20429
## Mean :1.16 Mean :0.91 Mean :18537
## 3rd Qu.:1.28 3rd Qu.:1.00 3rd Qu.:26597
## Max. :5.36 Max. :2.63 Max. :45412
## NA's :34 NA's :34
## TOTALDEATHS TOTALPROBABLECASES
## Min. : 0 Min. : 0
## 1st Qu.:214 1st Qu.: 97
## Median :438 Median :1724
## Mean :396 Mean :2125
## 3rd Qu.:577 3rd Qu.:3401
## Max. :746 Max. :7509
##
\end{verbatim}
While this dataset originated as a CSV file, there are specific R packages for reading in Microsoft Excel (\emph{readxl}) and Google Sheets (\emph{googlesheets4}) files. This book works only with CSV files, but if you often work with those proprietary formats, these packages can save you from converting to CSV first.
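For instance, reading an Excel workbook with \emph{readxl} looks much like reading a CSV file; the sketch below assumes a hypothetical \texttt{.xlsx} version of our dataset:
\begin{verbatim}
library(readxl)
# read the first sheet of a (hypothetical) Excel workbook
stl_covid_xl <- read_excel("City-of-St-Louis-COVID-19-Case-Data.xlsx")
head(stl_covid_xl)
\end{verbatim}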
\hypertarget{rstudio-summary}{%
\section{Summary}\label{rstudio-summary}}
This chapter took you from no experience coding in R to interacting with data in the RStudio IDE using R functions. R is an object-oriented programming language used within RStudio's graphical user interface alongside several popular code packages, such as the Tidyverse. New users must install R and RStudio before learning the various features the IDE offers for data scientists. There are several ways to view data in RStudio, whether viewing the entire dataset file or using R functions to see snippets of the dataset within the console.
\hypertarget{rstudio-study}{%
\section{Further Practice}\label{rstudio-study}}
\begin{itemize}
\tightlist
\item
Read in a csv file of your own and run the same summary functions: \texttt{head()}, \texttt{tail()}, \texttt{summary()}
\item
Install \emph{janeaustenr} for use in Chapter~\ref{text-study}
\end{itemize}
\hypertarget{rstudio-resources}{%
\section{Additional Resources}\label{rstudio-resources}}
\begin{itemize}
\tightlist
\item
\emph{Hands-On Programming with R}, by Garrett Grolemund
\item
RStudio IDE, Base R, \& data import (\emph{readr}) cheatsheets: \url{https://www.rstudio.com/resources/cheatsheets/}
\item
``Getting Started with R and RStudio'': \url{https://moderndive.netlify.com/1-getting-started.html}
\item
An Introduction to R: \url{https://cran.r-project.org/doc/manuals/R-intro.html}
\end{itemize}
\hypertarget{dplyr}{%
\chapter{\texorpdfstring{Tidying data with \emph{dplyr}}{Tidying data with dplyr}}\label{dplyr}}
\hypertarget{dplyr-los}{%
\section{\texorpdfstring{Learning Objective: write code to perform data scrubbing functions with the Tidyverse's \emph{dplyr} package.}{Learning Objective: write code to perform data scrubbing functions with the Tidyverse's dplyr package.}}\label{dplyr-los}}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Use the IDE to load the \emph{dplyr} package.
\item
Identify data elements in RStudio's IDE that need to be changed.
\item
Summarize the most common functions \emph{dplyr} is used for.
\item
Use \emph{dplyr} functions to normalize fields in a dataset.
\end{enumerate}
\hypertarget{dplyr-terms}{%
\section{Terms You'll Learn}\label{dplyr-terms}}
\begin{itemize}
\tightlist
\item
API
\end{itemize}
\hypertarget{dplyr-scenario}{%
\section{Scenario}\label{dplyr-scenario}}
You need data on unemployment in the city of St.~Louis, and the first step to creating visualizations related to unemployment is to read in the data and tidy it. You'd like to target your outreach to areas of high unemployment, so you will need to prepare the data used to identify those areas. Knowing which occupations have the highest employment would also help you target job-seeker training toward jobs that are in demand.
\hypertarget{dplyr-pkgs}{%
\section{Packages \& Datasets Needed}\label{dplyr-pkgs}}
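This chapter assumes the packages installed in Chapter 2 are available and that the following are loaded at the start of your session:
\begin{verbatim}
library(tidyverse)
library(tidycensus)
\end{verbatim}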
\hypertarget{dplyr-intro}{%
\section{Introduction}\label{dplyr-intro}}
This chapter is focused on census data and learning data tidying functions to create an unemployment dataset for use in subsequent chapters. We are aided in this endeavor by the \emph{tidycensus} package\footnote{\url{https://walker-data.com/tidycensus/}}, which interfaces with US Census data and returns results that are ready to use with Tidyverse packages. \emph{Tidycensus} lets us access census data for many communities, St.~Louis included. The Census collects data about employment, occupation, gender, and location.
\hypertarget{census-setup}{%
\section{Getting started with U.S. Census data}\label{census-setup}}
Census data is available from the Census \emph{API}\footnote{\url{https://en.wikipedia.org/wiki/API}}. An API, or application programming interface, allows our computer to access the computer(s) storing the census data. APIs enable computers to talk to each other; they are a valuable tool for data scientists who want to get a dataset directly from the source. Many data sources provide API access to their databases, which we will visit again in Chapter~\ref{text-api}.
\hypertarget{census-prerequisites}{%
\subsection{Census prerequisites}\label{census-prerequisites}}
Before using \emph{tidycensus} to query the Census database, each user must have a unique identifier: an API key. This unique authorization code from the Census website allows you to access census data\footnote{\url{http://api.census.gov/data/key_signup.html}}.
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Create a Census API key
\end{enumerate}
If you're following along and entering this code into your R console, sign up for your own Census API key, delete the \texttt{\#}, and replace ``your-key-here'' with your own key.
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# census\_api\_key("your{-}key{-}here") }
\end{Highlighting}
\end{Shaded}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
Get FIPS codes
We are limiting our analysis to the city of St.~Louis and need to restrict our data to that area. To do that, we'll use Federal Information Processing Series (FIPS) codes. Thankfully, the \texttt{fips\_codes} dataset is already part of \emph{tidycensus}.
\end{enumerate}
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{head}\NormalTok{(fips\_codes)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## state state_code state_name county_code
## 1 AL 01 Alabama 001
## 2 AL 01 Alabama 003
## 3 AL 01 Alabama 005
## 4 AL 01 Alabama 007
## 5 AL 01 Alabama 009
## 6 AL 01 Alabama 011
## county
## 1 Autauga County
## 2 Baldwin County
## 3 Barbour County
## 4 Bibb County
## 5 Blount County
## 6 Bullock County
\end{verbatim}
When combined with the state, each county has a code that allows us to query the Census database for only the geographic area of interest, like St.~Louis.
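For example, one way to look up the code for the city of St.~Louis (a sketch; it assumes \emph{dplyr} is loaded and that the Census lists the independent city as ``St. Louis city''):
\begin{verbatim}
fips_codes %>%
  filter(state == "MO", county == "St. Louis city")
\end{verbatim}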
\hypertarget{census-variables}{%
\subsection{Census variables}\label{census-variables}}
The Census collects a lot of data about the US population, but we don't need all that data! To narrow our scope to the most applicable data, we must select the Census report year, type, and metadata fields (variables) we want to analyze. The American Community Survey\footnote{\url{https://www.census.gov/programs-surveys/acs}} will provide the most valuable data for our analysis.
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Review all Census variables
We'll use \texttt{load\_variables()} to review the 2019 ACS 5-year survey data variables.
\end{enumerate}
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{var\_2019 }\OtherTok{\textless{}{-}} \FunctionTok{load\_variables}\NormalTok{(}\DecValTok{2019}\NormalTok{, }\StringTok{"acs5"}\NormalTok{)}
\NormalTok{var\_2019}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 27,040 x 4
## name label concept geogr~1
## <chr> <chr> <chr> <chr>
## 1 B01001_001 Estimate!!Total: SEX BY~ block ~
## 2 B01001_002 Estimate!!Total:!!Male: SEX BY~ block ~
## 3 B01001_003 Estimate!!Total:!!Male:!~ SEX BY~ block ~
## 4 B01001_004 Estimate!!Total:!!Male:!~ SEX BY~ block ~
## 5 B01001_005 Estimate!!Total:!!Male:!~ SEX BY~ block ~
## 6 B01001_006 Estimate!!Total:!!Male:!~ SEX BY~ block ~
## 7 B01001_007 Estimate!!Total:!!Male:!~ SEX BY~ block ~
## 8 B01001_008 Estimate!!Total:!!Male:!~ SEX BY~ block ~
## 9 B01001_009 Estimate!!Total:!!Male:!~ SEX BY~ block ~
## 10 B01001_010 Estimate!!Total:!!Male:!~ SEX BY~ block ~
## # ... with 27,030 more rows, and abbreviated variable
## # name 1: geography
## # i Use `print(n = ...)` to see more rows
\end{verbatim}
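Scrolling through 27,040 rows by eye isn't practical. One way to narrow the list is to search the \texttt{concept} column; this sketch uses base R's \texttt{grepl()}, and the search string is simply our choice:
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Keep only rows whose concept column mentions employment status.}
\NormalTok{var\_2019[}\FunctionTok{grepl}\NormalTok{(}\StringTok{"EMPLOYMENT STATUS"}\NormalTok{, var\_2019}\SpecialCharTok{$}\NormalTok{concept), ]}
\end{Highlighting}
\end{Shaded}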
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
Create new object of variables
Having pulled in the FIPS codes that allow us to identify data from St.~Louis and the variable names from the 2019 ACS, we can now create a new object that contains only the data we want:
\end{enumerate}
\begin{itemize}
\tightlist
\item
Survey: 5-year ACS
\item
Year: 2019
\item
Location: St.~Louis city, Missouri (FIPS county code 510)
\item
Variables:

  \begin{itemize}
  \tightlist
  \item
  Total population: B23025\_001
  \item
  Population not in the labor force (unemployed): B23025\_007
  \end{itemize}
\end{itemize}
One Base R function that we'll rely on for this code is \texttt{c()}, which combines values (numbers or text) into a single vector. We'll use it to combine the two variables we're interested in: total population and the number unemployed. The function \texttt{get\_acs()} sends our query to the Census API, returning the data we need for each Census tract. We want each variable spread out across its own columns, so we will use the \texttt{output\ =\ "wide"} setting to adjust the output.
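Before it goes inside \texttt{get\_acs()}, here is \texttt{c()} on its own. The labels on the left of each \texttt{=} (\texttt{total\_pop} and \texttt{unemployed}) are names we chose, not Census fields; \emph{tidycensus} uses them to rename the output columns:
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# c() combines the two Census variable IDs into one named vector.}
\FunctionTok{c}\NormalTok{(}\AttributeTok{total\_pop =} \StringTok{"B23025\_001"}\NormalTok{, }\AttributeTok{unemployed =} \StringTok{"B23025\_007"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
With that vector in hand, the full query looks like this: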
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data }\OtherTok{\textless{}{-}} \FunctionTok{get\_acs}\NormalTok{(}\AttributeTok{geography =} \StringTok{"tract"}\NormalTok{,}
  \AttributeTok{variables =} \FunctionTok{c}\NormalTok{(}\AttributeTok{total\_pop =} \StringTok{"B23025\_001"}\NormalTok{,}
                \AttributeTok{unemployed =} \StringTok{"B23025\_007"}\NormalTok{),}
  \AttributeTok{state =} \StringTok{"MO"}\NormalTok{, }\AttributeTok{county =} \StringTok{"510"}\NormalTok{, }\AttributeTok{year =} \DecValTok{2019}\NormalTok{, }\AttributeTok{output =} \StringTok{"wide"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## Getting data from the 2015-2019 5-year ACS
\end{verbatim}
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
## # A tibble: 106 x 6
## GEOID NAME total~1 total~2 unemp~3 unemp~4
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 29510124200 Census ~ 2536 326 668 194
## 2 29510124300 Census ~ 3043 305 649 142
## 3 29510125500 Census ~ 3881 328 791 177
## 4 29510102100 Census ~ 2300 184 545 107
## 5 29510102400 Census ~ 2086 166 570 152
## 6 29510103800 Census ~ 3269 240 898 131
## 7 29510104200 Census ~ 3000 261 869 218
## 8 29510105500 Census ~ 2265 388 964 215
## 9 29510106500 Census ~ 2275 376 1135 265
## 10 29510107500 Census ~ 1730 312 904 220
## # ... with 96 more rows, and abbreviated variable
## # names 1: total_popE, 2: total_popM,
## # 3: unemployedE, 4: unemployedM
## # i Use `print(n = ...)` to see more rows
\end{verbatim}
\hypertarget{dplyr-tidy-tools}{%
\section{\texorpdfstring{Tidy data tools from \emph{dplyr}}{Tidy data tools from dplyr}}\label{dplyr-tidy-tools}}
The data we've pulled give us the total population and the number of unemployed in each tract, but that's not quite what we need to know. We need an unemployment rate; from there we can determine which areas have the highest and lowest unemployment and set them alongside occupation data. Note, too, that the wide output appended \texttt{E} (estimate) and \texttt{M} (margin of error) to each variable name, which is why the columns are called \texttt{total\_popE}, \texttt{total\_popM}, and so on. To get the rate, we must tidy and modify the \emph{tidycensus} data we have.
The \emph{dplyr} package within the Tidyverse contains a constellation of functions designed for data modification. Some of the actions we'll need to perform are:
\begin{itemize}
\tightlist
\item
renaming columns
\item
creating a new column for the unemployment rate, which involves performing a mathematical operation on other columns
\item
combining columns
\item
sorting column values
\item
chaining several functions together sequentially
\item
choosing specific columns or rows within a table
\item
viewing a snapshot of a dataset
\item
grouping data by column value
\item
filtering a subset of a table
\item
joining datasets based on common column values
\end{itemize}
\hypertarget{dplyr-start}{%
\section{\texorpdfstring{Getting started with \emph{dplyr} functions}{Getting started with dplyr functions}}\label{dplyr-start}}
One of the formative concepts of the Tidyverse, which we will rely upon heavily through the remainder of the book, is the pipe: \texttt{\%\textgreater{}\%}. Within a code chunk, this operator can be read as `then': it lets us chain \emph{dplyr} functions together sequentially, making our code more readable. You will often see Tidyverse code written in this ``object {[}then{]} function'' pattern.
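As a quick illustration, reusing the \texttt{fips\_codes} table from earlier, the two calls below are equivalent: the pipe passes the object on its left in as the first argument of the function on its right.
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# These two lines do the same thing:}
\FunctionTok{head}\NormalTok{(fips\_codes)}
\NormalTok{fips\_codes }\SpecialCharTok{\%\textgreater{}\%} \FunctionTok{head}\NormalTok{()}
\end{Highlighting}
\end{Shaded}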
\hypertarget{create-unemployment-rate}{%
\subsection{Create unemployment rate}\label{create-unemployment-rate}}
We'll use the ``object {[}then{]} function'' pattern to create a new variable and column for the unemployment rate. The ACS doesn't provide an unemployment rate, so we must calculate it from the columns we have: total population and population unemployed. Our task here is two-fold: 1) create a new column and 2) populate each row in the column with its calculated unemployment rate. This is the number unemployed divided by the total population:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{unemployment\_data }\OtherTok{\textless{}{-}}\NormalTok{ data }\SpecialCharTok{\%\textgreater{}\%}
\FunctionTok{mutate}\NormalTok{(}\AttributeTok{unemployment\_rate =} \FunctionTok{as.numeric}\NormalTok{(unemployedE)}\SpecialCharTok{/}\FunctionTok{as.numeric}\NormalTok{(total\_popE)) }
\end{Highlighting}
\end{Shaded}
In plain English, the code above says ``create a new object called \texttt{unemployment\_data}, which takes the \texttt{data} object and then makes a new column in it called \texttt{unemployment\_rate}; fill the rows in that new column with the value of the number unemployed divided by the total population.''
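A quick sanity check never hurts after calculating a new column. Because the rate is a proportion, every value should fall between 0 and 1; base R's \texttt{summary()} makes that easy to verify:
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# The min and max of a proportion should sit between 0 and 1.}
\FunctionTok{summary}\NormalTok{(unemployment\_data}\SpecialCharTok{$}\NormalTok{unemployment\_rate)}
\end{Highlighting}
\end{Shaded}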
\hypertarget{save-unemployment-data}{%
\subsection{Save unemployment data}\label{save-unemployment-data}}
Before going any further, we will save the \texttt{unemployment\_data} object to a CSV file in the \texttt{data/} sub-directory, or folder, using the \texttt{write\_csv()} function from \emph{readr}. Note that the \texttt{data/} folder must already exist in your project; \texttt{write\_csv()} will not create it for you:
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{write\_csv}\NormalTok{(unemployment\_data, }\StringTok{"data/unemployment\_data.csv"}\NormalTok{) }
\end{Highlighting}
\end{Shaded}
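Saving the file means that, in a later session, we can reload the data with \emph{readr}'s \texttt{read\_csv()} instead of querying the Census API again; a minimal sketch, assuming the file sits where we just wrote it:
\begin{Shaded}
\begin{Highlighting}[]
\CommentTok{\# Reload the saved dataset in a future session.}
\NormalTok{unemployment\_data }\OtherTok{\textless{}{-}} \FunctionTok{read\_csv}\NormalTok{(}\StringTok{"data/unemployment\_data.csv"}\NormalTok{)}
\end{Highlighting}
\end{Shaded}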
\hypertarget{find-the-areas-with-the-highest-unemployment}{%
\subsection{Find the areas with the highest unemployment}\label{find-the-areas-with-the-highest-unemployment}}
Getting back to \emph{dplyr}, we need to find the areas with the highest unemployment. We'll use \texttt{arrange()} to sort the dataset by unemployment rate in descending order and then look at only the top 10 locations in the dataset.
\begin{Shaded}