index.html

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>

<meta charset="utf-8">
<meta name="generator" content="quarto-1.2.475">

<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">

<meta name="author" content="Janani Ravi | jravilab.github.io">
<meta name="dcterms.date" content="2023-08-01">

<title>ML for Microbial Genomics</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
  width: 0.8em;
  margin: 0 0.8em 0.2em -1.6em;
  vertical-align: middle;
}
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
    color: #aaaaaa;
  }
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
div.sourceCode
  {   }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { color: #008000; } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { color: #008000; font-weight: bold; } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>


<script src="index_files/libs/clipboard/clipboard.min.js"></script>
<script src="index_files/libs/quarto-html/quarto.js"></script>
<script src="index_files/libs/quarto-html/popper.min.js"></script>
<script src="index_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="index_files/libs/quarto-html/anchor.min.js"></script>
<link href="index_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="index_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="index_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="index_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="index_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">


</head>

<body class="fullcontent">

<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">

<main class="content" id="quarto-document-content">

<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">ML for Microbial Genomics</h1>
</div>


<div class="quarto-title-meta">

    <div>
    <div class="quarto-title-meta-heading">Author</div>
    <div class="quarto-title-meta-contents">
             <p>Janani Ravi | jravilab.github.io </p>
          </div>
  </div>
    
    <div>
    <div class="quarto-title-meta-heading">Published</div>
    <div class="quarto-title-meta-contents">
      <p class="date">August 1, 2023</p>
    </div>
  </div>
  
    
  </div>
  

</header>

<section id="mlhd-icts-aug-02-2023" class="level1">
<h1>MLHD <span class="citation" data-cites="ICTS">@ICTS</span> | Aug 02, 2023</h1>
<blockquote class="blockquote">
<p>This is a companion repo &amp; webpage for the Microbial Genomics and ML workshop, first presented at the MLHD 2023 conference! You can access the material here: <a href="https://jananiravi.github.io/2023-mlhd" class="uri">https://jananiravi.github.io/2023-mlhd</a></p>
</blockquote>
<section id="overview" class="level2">
<h2 class="anchored" data-anchor-id="overview">Overview</h2>
<blockquote class="blockquote">
<p>This session will cover ideas, concepts, and insights needed to get started with building machine learning models in R with high-dimensional data, such as microbial genomics. No prior knowledge in ML is required.</p>
</blockquote>
<section id="acknowledgments" class="level3">
<h3 class="anchored" data-anchor-id="acknowledgments">Acknowledgments</h3>
<ul>
<li>JRaviLab: Jacob Krol, Ethan Wolfe, Evan Brenner, Keenan Manpearl, Joseph Burke, Vignesh Sridhar, Jill Bilodeaux (contributed to the antimicrobial resistance project)</li>
<li>Arjun Krishnan (contributed to the tidymodels qmd primer)</li>
<li>R-Ladies, esp.&nbsp;R-Ladies East Lansing, R-Ladies Aurora; R/Bioconductor; rOpenSci (for all things R!)</li>
<li><code>tidymodels</code> resource by Julia Silge et al., | <a href="https://tidymodels.org" class="uri">https://tidymodels.org</a></li>
</ul>
</section>
</section>
<section id="install-and-load-packages" class="level2">
<h2 class="anchored" data-anchor-id="install-and-load-packages">Install and load packages</h2>
<p>To use the code in this document, you will need to install the following packages: <code>glmnet</code>, <code>tidyverse</code>, and <code>tidymodels</code>.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidyverse)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (&lt;http://conflicted.r-lib.org/&gt;) to force all conflicts to become errors</code></pre>
</div>
<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidymodels)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>── Attaching packages ────────────────────────────────────── tidymodels 1.1.0 ──
✔ broom        1.0.5     ✔ rsample      1.1.1
✔ dials        1.2.0     ✔ tune         1.1.1
✔ infer        1.0.4     ✔ workflows    1.1.3
✔ modeldata    1.1.0     ✔ workflowsets 1.0.1
✔ parsnip      1.1.0     ✔ yardstick    1.2.0
✔ recipes      1.0.6     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Use suppressPackageStartupMessages() to eliminate package startup messages</code></pre>
</div>
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(glmnet) <span class="co"># for LR</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Loaded glmnet 4.1-7</code></pre>
</div>
<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(vip) <span class="co"># to extract important features</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>
Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi</code></pre>
</div>
<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ranger) <span class="co"># for RF</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="explore-your-data" class="level2">
<h2 class="anchored" data-anchor-id="explore-your-data">Explore your data</h2>
<p>Here, we will use microbial genomics data (e.g., gene presence/absence across multiple microbial genomes) wrangled and processed from the <a href="https://bv-brc.org/">BV-BRC</a> to predict the antibiotic resistance phenotype of each sample (genome) based on the presence/absence of genes in that sample.</p>
<p>To make the dataset usable on your local desktop machine, we have pre-processed the data (using custom scripts that use NCBI/BV-BRC data and metadata, NCBI and BV-BRC CLI, Prokka for genome annotation, and Roary/CD-HIT for constructing the gene presence/absence matrix and gene clusters that serve as ML features). For this workshop, we have selected a subset of ~900 genomes from <em>Staphylococcus aureus</em>, and limited the data to <code>n</code> genes after filtering out core (present in &gt;95% of genomes) and unique (present in &lt;5% of genomes) genes.</p>
<p>The data is contained in the file <code>data/staph_penicillin_pangenome.csv</code>, with samples (genomes) along the rows and genes along the columns. To get started, let’s read this data into R using the <code>readr::read_delim</code> function. This file also carries relevant metadata of the genomes and drugs.</p>
<section id="read-in-the-data-file" class="level3">
<h3 class="anchored" data-anchor-id="read-in-the-data-file">Read in the data file</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb10"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Can be set to read csv/tsv: any feature matrix file with metadata</span></span>
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="co"># e.g., gpa-feature-matrix.tsv</span></span>
<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a>gpa_featmat <span class="ot">&lt;-</span> <span class="fu">read_delim</span>(<span class="st">"data/staph_penicillin_pangenome.csv"</span>,</span>
<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a>                          <span class="at">delim =</span> <span class="st">","</span>, <span class="at">col_names =</span> T)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>Rows: 920 Columns: 2328
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr    (4): antibiotic, amr_pheno, drug_class, assembly_accession
dbl (2324): s_no, genome_id, prmA, hisC_1, araB, yqeN, tagH_2, tet(38), lrgB...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.</code></pre>
</div>
</div>
</section>
<section id="data-exploration" class="level3">
<h3 class="anchored" data-anchor-id="data-exploration">Data exploration</h3>
<p>Let’s print the tibble to examine it quickly.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb12"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>gpa_featmat</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 920 × 2,328
    s_no genome_id antibiotic amr_pheno   drug_class assembly_accession  prmA
   &lt;dbl&gt;     &lt;dbl&gt; &lt;chr&gt;      &lt;chr&gt;       &lt;chr&gt;      &lt;chr&gt;              &lt;dbl&gt;
 1     0     1280. penicillin Susceptible penicillin GCA_024925485.1        1
 2     1     1280. penicillin Susceptible penicillin GCA_024925485.1        1
 3     2     1280. penicillin Susceptible penicillin GCA_024972975.1        1
 4     3     1280. penicillin Susceptible penicillin GCA_024972975.1        1
 5     4     1280. penicillin Resistant   penicillin GCA_025232045.1        1
 6     5     1280. penicillin Resistant   penicillin GCA_025232045.1        1
 7     6    46170. penicillin Resistant   penicillin GCA_002204575.1        1
 8     7     1280. penicillin Susceptible penicillin GCA_002089075.2        1
 9     8     1280. penicillin Resistant   penicillin GCA_002089095.2        1
10     9     1280. penicillin Resistant   penicillin GCA_002097595.2        1
# ℹ 910 more rows
# ℹ 2,321 more variables: hisC_1 &lt;dbl&gt;, araB &lt;dbl&gt;, yqeN &lt;dbl&gt;, tagH_2 &lt;dbl&gt;,
#   `tet(38)` &lt;dbl&gt;, lrgB &lt;dbl&gt;, cmtB &lt;dbl&gt;, scmP_2 &lt;dbl&gt;, est_2 &lt;dbl&gt;,
#   glcB &lt;dbl&gt;, ponA &lt;dbl&gt;, clpX &lt;dbl&gt;, yiiM &lt;dbl&gt;, thiN &lt;dbl&gt;, ilvE &lt;dbl&gt;,
#   ydcV &lt;dbl&gt;, menH &lt;dbl&gt;, relA &lt;dbl&gt;, yicL &lt;dbl&gt;, rho &lt;dbl&gt;, guaA &lt;dbl&gt;,
#   hemB &lt;dbl&gt;, hemA &lt;dbl&gt;, glpQ_1 &lt;dbl&gt;, suhB &lt;dbl&gt;, tatC2 &lt;dbl&gt;, groL &lt;dbl&gt;,
#   glpK &lt;dbl&gt;, frdA &lt;dbl&gt;, yycI &lt;dbl&gt;, pepA_1 &lt;dbl&gt;, feuC &lt;dbl&gt;, miaA &lt;dbl&gt;, …</code></pre>
</div>
<div class="sourceCode cell-code" id="cb14"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="fu">dim</span>(gpa_featmat)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1]  920 2328</code></pre>
</div>
</div>
<p>Then, let’s examine the <code>amr_pheno</code> column of this data frame, which tells us the antimicrobial resistance (AMR) phenotype (resistant/susceptible) of each sample (i.e., each row, genome) for different drugs. We can tabulate the number and fraction of genomes per phenotype easily using the <code>count</code> and <code>mutate</code> functions from <code>dplyr</code>.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb16"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a>gpa_featmat <span class="sc">%&gt;%</span></span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">count</span>(amr_pheno) <span class="sc">%&gt;%</span> </span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">prop =</span> n<span class="sc">/</span><span class="fu">sum</span>(n))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 2 × 3
  amr_pheno       n  prop
  &lt;chr&gt;       &lt;int&gt; &lt;dbl&gt;
1 Resistant     481 0.523
2 Susceptible   439 0.477</code></pre>
</div>
</div>
<p>Before we proceed, let’s also try and get a sense of the values in this feature matrix. Since there are thousands of genes, we’ll count the number of genomes in which each gene is present and visualize the distribution of these counts across all the genes using a binned bar plot.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb18"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a>gpa_sum <span class="ot">&lt;-</span> gpa_featmat <span class="sc">|&gt;</span></span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">select</span>(<span class="dv">7</span><span class="sc">:</span><span class="fu">last_col</span>()) <span class="sc">|&gt;</span></span>
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">summarize</span>(<span class="fu">across</span>(<span class="fu">where</span>(is.numeric), sum))</span>
<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a>gpa_sum_long <span class="ot">&lt;-</span> gpa_sum <span class="sc">|&gt;</span> </span>
<span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">pivot_longer</span>(<span class="at">cols =</span> <span class="fu">everything</span>(), <span class="at">names_to =</span> <span class="st">"gene"</span>)</span>
<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(gpa_sum_long, <span class="fu">aes</span>(value)) <span class="sc">+</span></span>
<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a>  <span class="co"># geom_histogram(bins=10) +</span></span>
<span id="cb18-11"><a href="#cb18-11" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_bar</span>() <span class="sc">+</span></span>
<span id="cb18-12"><a href="#cb18-12" aria-hidden="true" tabindex="-1"></a>  <span class="fu">scale_x_binned</span>() <span class="sc">+</span></span>
<span id="cb18-13"><a href="#cb18-13" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb18-14"><a href="#cb18-14" aria-hidden="true" tabindex="-1"></a>  <span class="fu">xlab</span>(<span class="st">"Genes present in X genomes"</span>) <span class="sc">+</span></span>
<span id="cb18-15"><a href="#cb18-15" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ylab</span>(<span class="st">"N Genes with X frequency"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
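<p><img src="index_files/figure-html/unnamed-chunk-5-1.png" class="img-fluid" width="672" alt="Bar plot of the number of genes (y-axis) binned by the number of genomes in which each gene is present (x-axis)"></p>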
</div>
</div>
</section>
</section>
<section id="feature-matrices-ml" class="level2">
<h2 class="anchored" data-anchor-id="feature-matrices-ml">Feature matrices –&gt; ML</h2>
<p>Given there are genomes with R/S from multiple drugs, to make the problem simpler, let’s pick one drug of interest and define the problem as classifying whether a genome is resistant or not to this antibiotic.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb19"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a>pos_pheno <span class="ot">&lt;-</span> <span class="st">"Resistant"</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Then, we need to modify the <code>amr_pheno</code> variable into a binary indicator of whether it is resistant or not and finally convert that variable into a factor so that the model knows to consider it as a way to partition the samples.</p>
<section id="set-up-the-feature-matrix-and-labels-for-the-ml-model" class="level3">
<h3 class="anchored" data-anchor-id="set-up-the-feature-matrix-and-labels-for-the-ml-model">Set up the feature matrix and labels for the ML model</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb20"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a>gpa_featmat_pheno <span class="ot">&lt;-</span></span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a>  gpa_featmat <span class="sc">%&gt;%</span></span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">amr_pheno =</span> <span class="fu">ifelse</span>(amr_pheno<span class="sc">==</span>pos_pheno,</span>
<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a>                            <span class="st">"Resistant"</span>, <span class="st">"Susceptible"</span>)) <span class="sc">%&gt;%</span></span>
<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="fu">across</span>(<span class="fu">where</span>(is.character), as.factor))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>A critical quantity to be fully aware of when setting up an ML problem is class balance, i.e., the relative sizes of the positive (<code>"Resistant"</code>) and negative (<code>"Susceptible"</code>) classes.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb21"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a>gpa_featmat_pheno <span class="sc">%&gt;%</span> </span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">count</span>(amr_pheno) <span class="sc">%&gt;%</span> </span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">prop =</span> n<span class="sc">/</span><span class="fu">sum</span>(n))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 2 × 3
  amr_pheno       n  prop
  &lt;fct&gt;       &lt;int&gt; &lt;dbl&gt;
1 Resistant     481 0.523
2 Susceptible   439 0.477</code></pre>
</div>
</div>
<p>We can see that, in our dataset, ~52% of the samples are “Resistant”, so the two classes are nearly balanced here. When one class is much smaller than the other, the scenario is referred to as <em>class imbalance</em>; it is extremely common in biomedicine and needs careful attention when analyzing and interpreting results.</p>
</section>
<section id="data-splitting" class="level3">
<h3 class="anchored" data-anchor-id="data-splitting">Data splitting</h3>
<p>If we take the data from all samples and train an <em>AMR classification</em> ML model, we cannot easily tell how good the model is. So, let’s reserve 25% of the samples to a <em>test set</em>, which we will hold out until the end of the project, at which point there should only be one or two models under serious consideration. The <em>test set</em> will be used as an unbiased source for measuring final model performance.</p>
<p>This is also the first step where we need to pay attention to class imbalance. To preserve the class proportions of the <code>amr_pheno</code> variable (which matters most when it is highly imbalanced), we need to use <em>stratified</em> random samples so that both the splits contain nearly identical proportions of positive and negative samples.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb23"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="co"># The function `initial_split()` takes the original data and saves the information on how to make the partitions.</span></span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">123</span>)</span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a>splits <span class="ot">&lt;-</span> <span class="fu">initial_split</span>(<span class="at">data =</span> gpa_featmat_pheno,</span>
<span id="cb23-4"><a href="#cb23-4" aria-hidden="true" tabindex="-1"></a>                        <span class="at">strata =</span> amr_pheno)</span>
<span id="cb23-5"><a href="#cb23-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Within initial_split, you can specify proportion using "prop" and</span></span>
<span id="cb23-6"><a href="#cb23-6" aria-hidden="true" tabindex="-1"></a><span class="co"># grouping/datasets to go into the same set using "group"</span></span>
<span id="cb23-7"><a href="#cb23-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-8"><a href="#cb23-8" aria-hidden="true" tabindex="-1"></a><span class="co"># The `training()` and `testing()` functions return the actual datasets.</span></span>
<span id="cb23-9"><a href="#cb23-9" aria-hidden="true" tabindex="-1"></a>gpa_other <span class="ot">&lt;-</span> <span class="fu">training</span>(splits)</span>
<span id="cb23-10"><a href="#cb23-10" aria-hidden="true" tabindex="-1"></a>gpa_test  <span class="ot">&lt;-</span> <span class="fu">testing</span>(splits)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Let’s check if we indeed did achieve stratified data splits.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb24"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a><span class="co"># other set proportions by AMR pheno</span></span>
<span id="cb24-2"><a href="#cb24-2" aria-hidden="true" tabindex="-1"></a>gpa_other <span class="sc">%&gt;%</span></span>
<span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">count</span>(amr_pheno) <span class="sc">%&gt;%</span> </span>
<span id="cb24-4"><a href="#cb24-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">prop =</span> n<span class="sc">/</span><span class="fu">sum</span>(n))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 2 × 3
  amr_pheno       n  prop
  &lt;fct&gt;       &lt;int&gt; &lt;dbl&gt;
1 Resistant     360 0.522
2 Susceptible   329 0.478</code></pre>
</div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb26"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a><span class="co"># test set proportions by R/S ratio</span></span>
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a>gpa_test <span class="sc">%&gt;%</span></span>
<span id="cb26-3"><a href="#cb26-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">count</span>(amr_pheno) <span class="sc">%&gt;%</span> </span>
<span id="cb26-4"><a href="#cb26-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">prop =</span> n<span class="sc">/</span><span class="fu">sum</span>(n))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 2 × 3
  amr_pheno       n  prop
  &lt;fct&gt;       &lt;int&gt; &lt;dbl&gt;
1 Resistant     121 0.524
2 Susceptible   110 0.476</code></pre>
</div>
</div>
<p>What’s up with the <code>gpa_other</code> split that’s not testing? This split will be used to create two new datasets:</p>
<ol type="1">
<li>The set held out for the purpose of measuring performance, called the <em>validation set</em>, and</li>
<li>The remaining data used to fit the model, called the <em>training set</em>.</li>
</ol>
<p>We’ll use the <code>validation_split</code> function to allocate 20% of the <code>gpa_other</code> samples to the validation set and the remaining 80% to the training set. Note that this function too has the <code>strata</code> argument. Do you see why we need it here?</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb28"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">234</span>)</span>
<span id="cb28-2"><a href="#cb28-2" aria-hidden="true" tabindex="-1"></a>gpa_val <span class="ot">&lt;-</span> <span class="fu">validation_split</span>(<span class="at">data =</span> gpa_other,</span>
<span id="cb28-3"><a href="#cb28-3" aria-hidden="true" tabindex="-1"></a>                            <span class="at">strata =</span> amr_pheno, <span class="co"># maintain original data split</span></span>
<span id="cb28-4"><a href="#cb28-4" aria-hidden="true" tabindex="-1"></a>                            <span class="at">prop =</span> <span class="fl">0.80</span>) <span class="co"># 80% training; 20% validation</span></span>
<span id="cb28-5"><a href="#cb28-5" aria-hidden="true" tabindex="-1"></a>gpa_val</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># Validation Set Split (0.8/0.2)  using stratification 
# A tibble: 1 × 2
  splits            id        
  &lt;list&gt;            &lt;chr&gt;     
1 &lt;split [551/138]&gt; validation</code></pre>
</div>
</div>
</section>
<section id="training-ml-models-in-r-penalized-logistic-regression" class="level3">
<h3 class="anchored" data-anchor-id="training-ml-models-in-r-penalized-logistic-regression">Training ML models in R: Penalized logistic regression</h3>
<p>Since our outcome variable <code>amr_pheno</code> is categorical, <a href="https://en.wikipedia.org/wiki/Logistic_regression">logistic regression</a> would be a good first model to start. Let’s use a model that can perform feature selection during training. The <a href="https://cran.r-project.org/web/packages/glmnet/index.html">glmnet</a> R package fits a generalized linear model via penalized maximum likelihood. This method of estimating the logistic regression slope parameters uses a <em>penalty</em> on the process so that the coefficients of less relevant predictors are driven towards a value of zero. One of the <code>glmnet</code> penalization methods, called the <a href="https://en.wikipedia.org/wiki/Lasso_(statistics)">lasso method</a>, can actually set the predictor slopes to zero if a large enough penalty is used.</p>
</section>
<section id="build-the-model" class="level3">
<h3 class="anchored" data-anchor-id="build-the-model">Build the model</h3>
<p>To specify a penalized logistic regression model that uses a feature selection penalty, we will use the <code>parsnip</code> package (part of <code>tidymodels</code>), which provides a tidy, unified interface to models so that you can try a range of models without getting bogged down in the syntactical minutiae of the underlying packages.</p>
<p>Here, let’s use it with the <a href="https://www.tidymodels.org/find/parsnip/">glmnet engine</a>:</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb30"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Build logistic regression model</span></span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a>lr_model <span class="ot">&lt;-</span> </span>
<span id="cb30-3"><a href="#cb30-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">logistic_reg</span>(<span class="at">penalty =</span> <span class="fu">tune</span>(), <span class="co"># strength of regularization/penalty</span></span>
<span id="cb30-4"><a href="#cb30-4" aria-hidden="true" tabindex="-1"></a>               <span class="at">mixture =</span> <span class="dv">1</span>) <span class="sc">%&gt;%</span> <span class="co"># specifies a pure lasso model</span></span>
<span id="cb30-5"><a href="#cb30-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">set_engine</span>(<span class="st">"glmnet"</span>) <span class="co"># set to generalized linear models</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>We’ll set the <code>penalty</code> argument to <code>tune()</code> as a placeholder for now. This is a model <em>hyperparameter</em> that we will <a href="https://www.tidymodels.org/start/tuning/">tune</a> to find the best value for making predictions with our data. Setting <code>mixture</code> to a value of <code>1</code> means that the glmnet model will potentially remove irrelevant predictors and choose a simpler model. This is the lasso (L1) penalty, which penalizes the sum of the absolute values of the beta-coefficients.</p>
<p><em>You can try with <code>mixture=0</code> for L2 ridge regression (or 0-1 for elasticnet combining L1 and L2).</em></p>
</section>
<section id="create-the-recipe" class="level3">
<h3 class="anchored" data-anchor-id="create-the-recipe">Create the recipe</h3>
<p>Next, we’re going to use the <code>recipes</code> package to build <a href="https://dplyr.tidyverse.org/">dplyr</a>-like pipeable sequences of feature engineering steps to get our data ready for modeling. Recipes are built as a series of pre-processing steps, such as:</p>
<ul>
<li><p>converting qualitative predictors to indicator variables (also known as dummy variables),</p></li>
<li><p>transforming data to be on a different scale (e.g., taking the logarithm of a variable),</p></li>
<li><p>transforming whole groups of predictors together,</p></li>
<li><p>extracting key features from raw variables (e.g., getting the day of the week out of a date variable),</p></li>
</ul>
<p>and so on. Here, we’re using it to set up the outcome variable as a function of gene presence and then do two things:</p>
<ul>
<li><p><code>step_zv()</code> removes indicator variables that only contain a single unique value (e.g.&nbsp;all zeros). This is important because, for penalized models, the predictors should be centered and scaled, and a predictor with zero variance cannot be scaled (its standard deviation is zero).</p></li>
<li><p><code>step_normalize()</code> centers and scales numeric variables.</p></li>
</ul>
<div class="cell">
<div class="sourceCode cell-code" id="cb31"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" aria-hidden="true" tabindex="-1"></a>lr_recipe <span class="ot">&lt;-</span> </span>
<span id="cb31-2"><a href="#cb31-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">recipe</span>(amr_pheno <span class="sc">~</span> ., <span class="at">data =</span> gpa_other) <span class="sc">%&gt;%</span> <span class="co"># specify data + labels</span></span>
<span id="cb31-3"><a href="#cb31-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">update_role</span>(<span class="fu">c</span>(s_no, genome_id, assembly_accession, <span class="co"># genome attributes</span></span>
<span id="cb31-4"><a href="#cb31-4" aria-hidden="true" tabindex="-1"></a>                antibiotic, drug_class), <span class="co"># drug attributes</span></span>
<span id="cb31-5"><a href="#cb31-5" aria-hidden="true" tabindex="-1"></a>              <span class="at">new_role =</span> <span class="st">"Supplementary"</span>) <span class="sc">%&gt;%</span> <span class="co"># tag metadata not used for ML</span></span>
<span id="cb31-6"><a href="#cb31-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">step_zv</span>(<span class="fu">all_predictors</span>()) <span class="sc">%&gt;%</span> <span class="co"># remove predictors with only one value</span></span>
<span id="cb31-7"><a href="#cb31-7" aria-hidden="true" tabindex="-1"></a>  <span class="co"># step_nzv(all_predictors()) # for near-zero variance</span></span>
<span id="cb31-8"><a href="#cb31-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">step_normalize</span>(<span class="fu">all_predictors</span>()) <span class="co"># normalize all predictors</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p><em>Try with <code>step_nzv</code> instead of only <code>step_zv</code>.</em></p>
</section>
<section id="create-the-workflow" class="level3">
<h3 class="anchored" data-anchor-id="create-the-workflow">Create the workflow</h3>
<p>Let’s bundle the model and recipe into a single <code>workflow()</code> object to make management of the R objects easier:</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb32"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb32-1"><a href="#cb32-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Standard model recipe for LR | uses our recipe definition from above</span></span>
<span id="cb32-2"><a href="#cb32-2" aria-hidden="true" tabindex="-1"></a>lr_workflow <span class="ot">&lt;-</span> <span class="fu">workflow</span>() <span class="sc">%&gt;%</span> </span>
<span id="cb32-3"><a href="#cb32-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">add_model</span>(lr_model) <span class="sc">%&gt;%</span></span>
<span id="cb32-4"><a href="#cb32-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">add_recipe</span>(lr_recipe)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="create-the-grid-for-tuning" class="level3">
<h3 class="anchored" data-anchor-id="create-the-grid-for-tuning">Create the grid for tuning</h3>
<p>Before we fit this model, we need to set up a grid of <code>penalty</code> values to tune.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb33"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Try values from 0.0001 to 0.1 to penalize for complex models;</span></span>
<span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Minimizing no. of features with non-zero coefficients</span></span>
<span id="cb33-3"><a href="#cb33-3" aria-hidden="true" tabindex="-1"></a>lr_reg_grid <span class="ot">&lt;-</span> <span class="fu">tibble</span>(<span class="at">penalty =</span> <span class="dv">10</span><span class="sc">^</span><span class="fu">seq</span>(<span class="sc">-</span><span class="dv">4</span>, <span class="sc">-</span><span class="dv">1</span>, <span class="at">length.out =</span> <span class="dv">10</span>))</span>
<span id="cb33-4"><a href="#cb33-4" aria-hidden="true" tabindex="-1"></a>lr_reg_grid</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 10 × 1
    penalty
      &lt;dbl&gt;
 1 0.0001  
 2 0.000215
 3 0.000464
 4 0.001   
 5 0.00215 
 6 0.00464 
 7 0.01    
 8 0.0215  
 9 0.0464  
10 0.1     </code></pre>
</div>
</div>
</section>
<section id="train-and-tune-the-model" class="level3">
<h3 class="anchored" data-anchor-id="train-and-tune-the-model">Train and tune the model</h3>
<p>The <code>tune::tune_grid()</code> function will help us train these 10 penalized logistic regression models and save the validation set predictions (via the call to <code>control_grid()</code>) so that diagnostic information will be available after fitting the model. To quantify how well the model performs (on the <em>validation set</em>), let’s first consider the <a href="https://en.wikipedia.org/wiki/Receiver_operating_characteristic">area under the ROC curve</a> across a range of hyperparameters.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb35"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1" aria-hidden="true" tabindex="-1"></a>lr_res <span class="ot">&lt;-</span> </span>
<span id="cb35-2"><a href="#cb35-2" aria-hidden="true" tabindex="-1"></a>  lr_workflow <span class="sc">%&gt;%</span> </span>
<span id="cb35-3"><a href="#cb35-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">tune_grid</span>(<span class="at">resamples =</span> gpa_val, <span class="co"># using validation split</span></span>
<span id="cb35-4"><a href="#cb35-4" aria-hidden="true" tabindex="-1"></a>            <span class="at">grid =</span> lr_reg_grid,</span>
<span id="cb35-5"><a href="#cb35-5" aria-hidden="true" tabindex="-1"></a>            <span class="at">control =</span> <span class="fu">control_grid</span>(<span class="at">save_pred =</span> <span class="cn">TRUE</span>),</span>
<span id="cb35-6"><a href="#cb35-6" aria-hidden="true" tabindex="-1"></a>            <span class="at">metrics =</span> <span class="fu">metric_set</span>(roc_auc))</span>
<span id="cb35-7"><a href="#cb35-7" aria-hidden="true" tabindex="-1"></a><span class="co">#metrics = metric_set(pr_auc)) # if you want to optimize for AUPRC instead</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<section id="tune-the-model-with-cross-validation-instead" class="level4">
<h4 class="anchored" data-anchor-id="tune-the-model-with-cross-validation-instead">Tune the model with cross-validation instead?</h4>
<div class="cell">
<div class="sourceCode cell-code" id="cb36"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb36-1"><a href="#cb36-1" aria-hidden="true" tabindex="-1"></a>lr_res_cv <span class="ot">&lt;-</span> </span>
<span id="cb36-2"><a href="#cb36-2" aria-hidden="true" tabindex="-1"></a>  lr_workflow <span class="sc">%&gt;%</span> </span>
<span id="cb36-3"><a href="#cb36-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">tune_grid</span>(<span class="at">resamples =</span> <span class="fu">vfold_cv</span>(gpa_other), <span class="co"># new CV line</span></span>
<span id="cb36-4"><a href="#cb36-4" aria-hidden="true" tabindex="-1"></a>            <span class="at">grid =</span> lr_reg_grid,</span>
<span id="cb36-5"><a href="#cb36-5" aria-hidden="true" tabindex="-1"></a>            <span class="at">control =</span> <span class="fu">control_grid</span>(<span class="at">save_pred =</span> <span class="cn">TRUE</span>),</span>
<span id="cb36-6"><a href="#cb36-6" aria-hidden="true" tabindex="-1"></a>            <span class="at">metrics =</span> <span class="fu">metric_set</span>(roc_auc))</span>
<span id="cb36-7"><a href="#cb36-7" aria-hidden="true" tabindex="-1"></a><span class="co">#metrics = metric_set(pr_auc)) # if you want to optimize for AUPRC instead</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
</section>
</section>
<section id="evaluation-metrics" class="level2">
<h2 class="anchored" data-anchor-id="evaluation-metrics">Evaluation metrics</h2>
<p>A plot of the area under the ROC curve against the range of penalty values will help us guess which value is best for the problem/dataset at hand.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb37"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb37-1"><a href="#cb37-1" aria-hidden="true" tabindex="-1"></a>lr_plot <span class="ot">&lt;-</span> lr_res <span class="sc">%&gt;%</span> </span>
<span id="cb37-2"><a href="#cb37-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">collect_metrics</span>() <span class="sc">%&gt;%</span> </span>
<span id="cb37-3"><a href="#cb37-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="at">x =</span> penalty, <span class="at">y =</span> mean)) <span class="sc">+</span> </span>
<span id="cb37-4"><a href="#cb37-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_point</span>() <span class="sc">+</span> </span>
<span id="cb37-5"><a href="#cb37-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_line</span>() <span class="sc">+</span> </span>
<span id="cb37-6"><a href="#cb37-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ylab</span>(<span class="st">"Area under the ROC Curve"</span>) <span class="sc">+</span></span>
<span id="cb37-7"><a href="#cb37-7" aria-hidden="true" tabindex="-1"></a>  <span class="co">#ylab("Area under the PR Curve") +</span></span>
<span id="cb37-8"><a href="#cb37-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">scale_x_log10</span>(<span class="at">labels =</span> scales<span class="sc">::</span><span class="fu">label_number</span>()) <span class="sc">+</span></span>
<span id="cb37-9"><a href="#cb37-9" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_bw</span>()</span>
<span id="cb37-10"><a href="#cb37-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb37-11"><a href="#cb37-11" aria-hidden="true" tabindex="-1"></a>lr_plot</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
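<p><img src="index_files/figure-html/unnamed-chunk-19-1.png" class="img-fluid" width="672" alt="Line plot of validation-set area under the ROC curve against the penalty value on a log scale"></p>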
</div>
</div>
<p>What is your interpretation of this plot? Write it here.</p>
<p>We can also tabulate these results to help pick the “best” hyperparameter.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb38"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" aria-hidden="true" tabindex="-1"></a>top_models <span class="ot">&lt;-</span></span>
<span id="cb38-2"><a href="#cb38-2" aria-hidden="true" tabindex="-1"></a>  lr_res <span class="sc">%&gt;%</span> </span>
<span id="cb38-3"><a href="#cb38-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">show_best</span>(<span class="st">"roc_auc"</span>, <span class="at">n =</span> <span class="dv">10</span>) <span class="sc">%&gt;%</span> </span>
<span id="cb38-4"><a href="#cb38-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">arrange</span>(penalty) </span>
<span id="cb38-5"><a href="#cb38-5" aria-hidden="true" tabindex="-1"></a>top_models</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 10 × 7
    penalty .metric .estimator  mean     n std_err .config              
      &lt;dbl&gt; &lt;chr&gt;   &lt;chr&gt;      &lt;dbl&gt; &lt;int&gt;   &lt;dbl&gt; &lt;chr&gt;                
 1 0.0001   roc_auc binary     0.996     1      NA Preprocessor1_Model01
 2 0.000215 roc_auc binary     0.996     1      NA Preprocessor1_Model02
 3 0.000464 roc_auc binary     0.996     1      NA Preprocessor1_Model03
 4 0.001    roc_auc binary     0.996     1      NA Preprocessor1_Model04
 5 0.00215  roc_auc binary     0.996     1      NA Preprocessor1_Model05
 6 0.00464  roc_auc binary     0.997     1      NA Preprocessor1_Model06
 7 0.01     roc_auc binary     0.997     1      NA Preprocessor1_Model07
 8 0.0215   roc_auc binary     0.996     1      NA Preprocessor1_Model08
 9 0.0464   roc_auc binary     0.996     1      NA Preprocessor1_Model09
10 0.1      roc_auc binary     0.979     1      NA Preprocessor1_Model10</code></pre>
</div>
</div>
<p>Let’s select the best value and visualize the validation set ROC curve. Why are we picking the 6<sup>th</sup> value instead of the 1<sup>st</sup> even though they have nearly identical performance metrics?</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb40"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb40-1"><a href="#cb40-1" aria-hidden="true" tabindex="-1"></a>lr_best <span class="ot">&lt;-</span> lr_res <span class="sc">%&gt;%</span> </span>
<span id="cb40-2"><a href="#cb40-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">collect_metrics</span>() <span class="sc">%&gt;%</span> </span>
<span id="cb40-3"><a href="#cb40-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">arrange</span>(penalty, mean) <span class="sc">%&gt;%</span> </span>
<span id="cb40-4"><a href="#cb40-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">slice</span>(<span class="dv">6</span>)</span>
<span id="cb40-5"><a href="#cb40-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb40-6"><a href="#cb40-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Alternatively, you can just use</span></span>
<span id="cb40-7"><a href="#cb40-7" aria-hidden="true" tabindex="-1"></a>lr_best <span class="ot">&lt;-</span> lr_res <span class="sc">|&gt;</span> </span>
<span id="cb40-8"><a href="#cb40-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">select_best</span>(<span class="at">metric =</span> <span class="st">"roc_auc"</span>)</span>
<span id="cb40-9"><a href="#cb40-9" aria-hidden="true" tabindex="-1"></a>lr_best</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 2
  penalty .config              
    &lt;dbl&gt; &lt;chr&gt;                
1 0.00464 Preprocessor1_Model06</code></pre>
</div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb42"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb42-1"><a href="#cb42-1" aria-hidden="true" tabindex="-1"></a>lr_roc <span class="ot">&lt;-</span> lr_res <span class="sc">%&gt;%</span> </span>
<span id="cb42-2"><a href="#cb42-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">collect_predictions</span>(<span class="at">parameters =</span> lr_best) <span class="sc">%&gt;%</span> </span>
<span id="cb42-3"><a href="#cb42-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">roc_curve</span>(amr_pheno, .pred_Resistant) <span class="sc">%&gt;%</span> </span>
<span id="cb42-4"><a href="#cb42-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">model =</span> <span class="st">"Logistic Regression"</span>)</span>
<span id="cb42-5"><a href="#cb42-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb42-6"><a href="#cb42-6" aria-hidden="true" tabindex="-1"></a><span class="fu">autoplot</span>(lr_roc)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="index_files/figure-html/unnamed-chunk-22-1.png" class="img-fluid" width="672" alt="Validation-set ROC curve for the selected penalized logistic regression model"></p>
</div>
<div class="sourceCode cell-code" id="cb43"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb43-1"><a href="#cb43-1" aria-hidden="true" tabindex="-1"></a><span class="do">## Alternatively ... </span></span>
<span id="cb43-2"><a href="#cb43-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Select the best LR model</span></span>
<span id="cb43-3"><a href="#cb43-3" aria-hidden="true" tabindex="-1"></a>final_lr_model <span class="ot">&lt;-</span> <span class="fu">finalize_workflow</span>(lr_workflow, lr_best)</span>
<span id="cb43-4"><a href="#cb43-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Fit the data</span></span>
<span id="cb43-5"><a href="#cb43-5" aria-hidden="true" tabindex="-1"></a>lr_fit <span class="ot">&lt;-</span> final_lr_model <span class="sc">%&gt;%</span> <span class="fu">fit</span>(<span class="at">data =</span> gpa_other)</span>
<span id="cb43-6"><a href="#cb43-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Save predictions</span></span>
<span id="cb43-7"><a href="#cb43-7" aria-hidden="true" tabindex="-1"></a>lr_aug <span class="ot">&lt;-</span> <span class="fu">augment</span>(lr_fit, gpa_test)</span>
<span id="cb43-8"><a href="#cb43-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Calculate AUROC</span></span>
<span id="cb43-9"><a href="#cb43-9" aria-hidden="true" tabindex="-1"></a>auroc <span class="ot">&lt;-</span> lr_aug <span class="sc">%&gt;%</span> <span class="fu">roc_auc</span>(<span class="at">truth =</span> amr_pheno, .pred_Resistant) <span class="sc">%&gt;%</span></span>
<span id="cb43-10"><a href="#cb43-10" aria-hidden="true" tabindex="-1"></a>      <span class="fu">select</span>(.estimate) <span class="sc">%&gt;%</span> <span class="fu">as.numeric</span>()</span>
<span id="cb43-11"><a href="#cb43-11" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(<span class="fu">paste</span>(<span class="st">"AUROC:"</span>, auroc))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "AUROC: 0.993313298271975"</code></pre>
</div>
</div>
<p>The area under the ROC curve has a nice property that it can be interpreted as a probability and has a close connection to a statistical test (<a href="https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">the Mann-Whitney U test</a>).</p>
<section id="selecting-the-top-features" class="level3">
<h3 class="anchored" data-anchor-id="selecting-the-top-features">Selecting the top features</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb45"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb45-1"><a href="#cb45-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Extract top 10 genes</span></span>
<span id="cb45-2"><a href="#cb45-2" aria-hidden="true" tabindex="-1"></a>n_top_genes <span class="ot">&lt;-</span> <span class="dv">10</span></span>
<span id="cb45-3"><a href="#cb45-3" aria-hidden="true" tabindex="-1"></a>top_genes <span class="ot">&lt;-</span> lr_fit <span class="sc">%&gt;%</span> <span class="fu">extract_fit_parsnip</span>() <span class="sc">%&gt;%</span></span>
<span id="cb45-4"><a href="#cb45-4" aria-hidden="true" tabindex="-1"></a>  vip<span class="sc">::</span><span class="fu">vi</span>() <span class="sc">%&gt;%</span> <span class="fu">slice</span>(<span class="dv">1</span><span class="sc">:</span>n_top_genes) <span class="sc">%&gt;%</span></span>
<span id="cb45-5"><a href="#cb45-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">select</span>(<span class="dv">1</span>) <span class="sc">%&gt;%</span> <span class="fu">pull</span>()</span>
<span id="cb45-6"><a href="#cb45-6" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(top_genes)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> [1] "group_7632"  "group_7634"  "blaZ"        "blaR1"       "ugpQ"       
 [6] "blaR1-2"     "bin3"        "group_11618" "cadC"        "group_5831" </code></pre>
</div>
</div>
</section>
<section id="when-you-have-imbalanced-classes" class="level3">
<h3 class="anchored" data-anchor-id="when-you-have-imbalanced-classes">When you have imbalanced classes</h3>
<p>However, the area under the ROC curve is not sensitive to class imbalances and can come out to be high even if the model is making many mistakes in the minor positive class — which is typically of biomedical interest — and getting most of the major negative class correct.</p>
<p>So, the final analysis we’re going to do is to evaluate performance based on another metric called <a href="https://en.wikipedia.org/wiki/Precision_and_recall">area under the Precision-Recall curve</a> that is more sensitive to the minor positive class by focusing on the fraction of top positive predictions that are correct (precision) and the fraction of positive samples that are correctly predicted (recall).</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb47"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb47-1"><a href="#cb47-1" aria-hidden="true" tabindex="-1"></a>lr_res_pr <span class="ot">&lt;-</span> lr_workflow <span class="sc">%&gt;%</span> </span>
<span id="cb47-2"><a href="#cb47-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">tune_grid</span>(<span class="at">resamples =</span> gpa_val,</span>
<span id="cb47-3"><a href="#cb47-3" aria-hidden="true" tabindex="-1"></a>            <span class="at">grid =</span> lr_reg_grid,</span>
<span id="cb47-4"><a href="#cb47-4" aria-hidden="true" tabindex="-1"></a>            <span class="at">control =</span> <span class="fu">control_grid</span>(<span class="at">save_pred =</span> <span class="cn">TRUE</span>),</span>
<span id="cb47-5"><a href="#cb47-5" aria-hidden="true" tabindex="-1"></a>            <span class="at">metrics =</span> <span class="fu">metric_set</span>(pr_auc))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb48"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb48-1"><a href="#cb48-1" aria-hidden="true" tabindex="-1"></a>lr_plot_pr <span class="ot">&lt;-</span> lr_res_pr <span class="sc">%&gt;%</span> </span>
<span id="cb48-2"><a href="#cb48-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">collect_metrics</span>() <span class="sc">%&gt;%</span> </span>
<span id="cb48-3"><a href="#cb48-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="at">x =</span> penalty, <span class="at">y =</span> mean)) <span class="sc">+</span> </span>
<span id="cb48-4"><a href="#cb48-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_point</span>() <span class="sc">+</span> </span>
<span id="cb48-5"><a href="#cb48-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_line</span>() <span class="sc">+</span> </span>
<span id="cb48-6"><a href="#cb48-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ylab</span>(<span class="st">"Area under the PR Curve"</span>) <span class="sc">+</span></span>
<span id="cb48-7"><a href="#cb48-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">scale_x_log10</span>(<span class="at">labels =</span> scales<span class="sc">::</span><span class="fu">label_number</span>()) <span class="sc">+</span></span>
<span id="cb48-8"><a href="#cb48-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_bw</span>()</span>
<span id="cb48-9"><a href="#cb48-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb48-10"><a href="#cb48-10" aria-hidden="true" tabindex="-1"></a>lr_plot_pr</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="index_files/figure-html/unnamed-chunk-25-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb49"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb49-1"><a href="#cb49-1" aria-hidden="true" tabindex="-1"></a>lr_best_pr <span class="ot">&lt;-</span> lr_res_pr <span class="sc">%&gt;%</span> </span>
<span id="cb49-2"><a href="#cb49-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">collect_metrics</span>() <span class="sc">%&gt;%</span> </span>
<span id="cb49-3"><a href="#cb49-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">arrange</span>(penalty) <span class="sc">%&gt;%</span> </span>
<span id="cb49-4"><a href="#cb49-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">slice</span>(<span class="dv">6</span>)</span>
<span id="cb49-5"><a href="#cb49-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb49-6"><a href="#cb49-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Alternatively, you can just use</span></span>
<span id="cb49-7"><a href="#cb49-7" aria-hidden="true" tabindex="-1"></a>lr_best_pr <span class="ot">&lt;-</span> lr_res_pr <span class="sc">|&gt;</span> </span>
<span id="cb49-8"><a href="#cb49-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">select_best</span>(<span class="at">metric =</span> <span class="st">"pr_auc"</span>)</span>
<span id="cb49-9"><a href="#cb49-9" aria-hidden="true" tabindex="-1"></a>lr_best_pr</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 2
  penalty .config              
    &lt;dbl&gt; &lt;chr&gt;                
1 0.00464 Preprocessor1_Model06</code></pre>
</div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb51"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb51-1"><a href="#cb51-1" aria-hidden="true" tabindex="-1"></a>lr_pr <span class="ot">&lt;-</span> lr_res_pr <span class="sc">%&gt;%</span> </span>
<span id="cb51-2"><a href="#cb51-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">collect_predictions</span>(<span class="at">parameters =</span> lr_best_pr) <span class="sc">%&gt;%</span> </span>
<span id="cb51-3"><a href="#cb51-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">pr_curve</span>(amr_pheno, .pred_Resistant) <span class="sc">%&gt;%</span> </span>
<span id="cb51-4"><a href="#cb51-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">model =</span> <span class="st">"Logistic Regression"</span>)</span>
<span id="cb51-5"><a href="#cb51-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb51-6"><a href="#cb51-6" aria-hidden="true" tabindex="-1"></a><span class="fu">autoplot</span>(lr_pr)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="index_files/figure-html/unnamed-chunk-27-1.png" class="img-fluid" width="672" alt="Validation-set precision-recall curve for the penalized logistic regression model"></p>
</div>
</div>
<section id="retrieving-your-top-features" class="level4">
<h4 class="anchored" data-anchor-id="retrieving-your-top-features">Retrieving your top features</h4>
<div class="cell">
<div class="sourceCode cell-code" id="cb52"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb52-1"><a href="#cb52-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Select best LR model</span></span>
<span id="cb52-2"><a href="#cb52-2" aria-hidden="true" tabindex="-1"></a>best_lr_model_pr <span class="ot">&lt;-</span> <span class="fu">select_best</span>(lr_res_pr, <span class="st">"pr_auc"</span>)</span>
<span id="cb52-3"><a href="#cb52-3" aria-hidden="true" tabindex="-1"></a>final_lr_model_pr <span class="ot">&lt;-</span> <span class="fu">finalize_workflow</span>(lr_workflow, best_lr_model_pr)</span>
<span id="cb52-4"><a href="#cb52-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb52-5"><a href="#cb52-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Fit the data</span></span>
<span id="cb52-6"><a href="#cb52-6" aria-hidden="true" tabindex="-1"></a>lr_fit_pr <span class="ot">&lt;-</span> final_lr_model_pr <span class="sc">%&gt;%</span></span>
<span id="cb52-7"><a href="#cb52-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">fit</span>(<span class="at">data =</span> gpa_other)</span>
<span id="cb52-8"><a href="#cb52-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb52-9"><a href="#cb52-9" aria-hidden="true" tabindex="-1"></a><span class="co"># Save predictions</span></span>
<span id="cb52-10"><a href="#cb52-10" aria-hidden="true" tabindex="-1"></a>lr_aug_pr <span class="ot">&lt;-</span> <span class="fu">augment</span>(lr_fit_pr, gpa_test)</span>
<span id="cb52-11"><a href="#cb52-11" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb52-12"><a href="#cb52-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Get AUPRC</span></span>
<span id="cb52-13"><a href="#cb52-13" aria-hidden="true" tabindex="-1"></a>auprc <span class="ot">&lt;-</span> lr_aug_pr <span class="sc">%&gt;%</span></span>
<span id="cb52-14"><a href="#cb52-14" aria-hidden="true" tabindex="-1"></a>  <span class="fu">pr_auc</span>(<span class="at">truth =</span> amr_pheno, .pred_Resistant) <span class="sc">%&gt;%</span></span>
<span id="cb52-15"><a href="#cb52-15" aria-hidden="true" tabindex="-1"></a>  <span class="fu">select</span>(.estimate) <span class="sc">%&gt;%</span> <span class="fu">as.numeric</span>()</span>
<span id="cb52-16"><a href="#cb52-16" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(<span class="fu">paste</span>(<span class="st">"AUPRC:"</span>, auprc))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "AUPRC: 0.99443515870738"</code></pre>
</div>
<div class="sourceCode cell-code" id="cb54"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb54-1"><a href="#cb54-1" aria-hidden="true" tabindex="-1"></a><span class="do">## Extract top 10 genes</span></span>
<span id="cb54-2"><a href="#cb54-2" aria-hidden="true" tabindex="-1"></a>n_top_genes <span class="ot">&lt;-</span> <span class="dv">10</span></span>
<span id="cb54-3"><a href="#cb54-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb54-4"><a href="#cb54-4" aria-hidden="true" tabindex="-1"></a>top_genes_pr <span class="ot">&lt;-</span> lr_fit_pr <span class="sc">|&gt;</span> </span>
<span id="cb54-5"><a href="#cb54-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">extract_fit_parsnip</span>() <span class="sc">|&gt;</span></span>
<span id="cb54-6"><a href="#cb54-6" aria-hidden="true" tabindex="-1"></a> vip<span class="sc">::</span><span class="fu">vi</span>() <span class="sc">|&gt;</span></span>
<span id="cb54-7"><a href="#cb54-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">slice</span>(<span class="dv">1</span><span class="sc">:</span>n_top_genes) <span class="sc">|&gt;</span></span>
<span id="cb54-8"><a href="#cb54-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">select</span>(<span class="dv">1</span>) <span class="sc">|&gt;</span></span>
<span id="cb54-9"><a href="#cb54-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">pull</span>()</span>
<span id="cb54-10"><a href="#cb54-10" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(top_genes_pr)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> [1] "group_7632"  "group_7634"  "blaZ"        "blaR1"       "ugpQ"       
 [6] "blaR1-2"     "bin3"        "group_11618" "cadC"        "group_5831" </code></pre>
</div>
</div>
</section>
</section>
</section>
<section id="predicting-ar-w-random-forest" class="level2">
<h2 class="anchored" data-anchor-id="predicting-ar-w-random-forest">Predicting AR w/ Random Forest</h2>
<div class="cell">
<div class="sourceCode cell-code" id="cb56"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb56-1"><a href="#cb56-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Setting a seed enables our analysis to be reproducible when random numbers are used.</span></span>
<span id="cb56-2"><a href="#cb56-2" aria-hidden="true" tabindex="-1"></a>    <span class="fu">set.seed</span>(<span class="dv">569</span>)</span>
<span id="cb56-3"><a href="#cb56-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb56-4"><a href="#cb56-4" aria-hidden="true" tabindex="-1"></a>    rf_splits <span class="ot">&lt;-</span> <span class="fu">initial_split</span>(gpa_featmat_pheno,</span>
<span id="cb56-5"><a href="#cb56-5" aria-hidden="true" tabindex="-1"></a>                                <span class="co">#prop = train_test_split,</span></span>
<span id="cb56-6"><a href="#cb56-6" aria-hidden="true" tabindex="-1"></a>                                <span class="at">strata =</span> amr_pheno)</span>
<span id="cb56-7"><a href="#cb56-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb56-8"><a href="#cb56-8" aria-hidden="true" tabindex="-1"></a>    <span class="co">#Create separate data frames for the training and testing sets.</span></span>
<span id="cb56-9"><a href="#cb56-9" aria-hidden="true" tabindex="-1"></a>    gpa_train <span class="ot">&lt;-</span> <span class="fu">training</span>(rf_splits)</span>
<span id="cb56-10"><a href="#cb56-10" aria-hidden="true" tabindex="-1"></a>    gpa_test <span class="ot">&lt;-</span> <span class="fu">testing</span>(rf_splits)</span>
<span id="cb56-11"><a href="#cb56-11" aria-hidden="true" tabindex="-1"></a>    </span>
<span id="cb56-12"><a href="#cb56-12" aria-hidden="true" tabindex="-1"></a>    <span class="fu">set.seed</span>(<span class="dv">234</span>)</span>
<span id="cb56-13"><a href="#cb56-13" aria-hidden="true" tabindex="-1"></a>    gpa_val <span class="ot">&lt;-</span> <span class="fu">validation_split</span>(<span class="at">data =</span> gpa_train,</span>
<span id="cb56-14"><a href="#cb56-14" aria-hidden="true" tabindex="-1"></a>                                <span class="at">strata =</span> amr_pheno, <span class="co"># maintain original data split</span></span>
<span id="cb56-15"><a href="#cb56-15" aria-hidden="true" tabindex="-1"></a>                                <span class="at">prop =</span> <span class="fl">0.80</span>) <span class="co"># 80% training; 20% validation</span></span>
<span id="cb56-16"><a href="#cb56-16" aria-hidden="true" tabindex="-1"></a>    gpa_val</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># Validation Set Split (0.8/0.2)  using stratification 
# A tibble: 1 × 2
  splits            id        
  &lt;list&gt;            &lt;chr&gt;     
1 &lt;split [551/138]&gt; validation</code></pre>
</div>
<div class="sourceCode cell-code" id="cb58"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb58-1"><a href="#cb58-1" aria-hidden="true" tabindex="-1"></a>    <span class="co">#Create recipe</span></span>
<span id="cb58-2"><a href="#cb58-2" aria-hidden="true" tabindex="-1"></a>    rf_recipe <span class="ot">&lt;-</span> <span class="fu">recipe</span>(amr_pheno <span class="sc">~</span> ., <span class="at">data =</span> gpa_train) <span class="sc">%&gt;%</span></span>
<span id="cb58-3"><a href="#cb58-3" aria-hidden="true" tabindex="-1"></a>      <span class="co"># To keep these columns but not use them as predictors or outcome</span></span>
<span id="cb58-4"><a href="#cb58-4" aria-hidden="true" tabindex="-1"></a>      <span class="fu">update_role</span>(<span class="fu">c</span>(s_no, genome_id, assembly_accession, <span class="co"># genome attributes</span></span>
<span id="cb58-5"><a href="#cb58-5" aria-hidden="true" tabindex="-1"></a>                    antibiotic, drug_class), <span class="co"># drug attributes</span></span>
<span id="cb58-6"><a href="#cb58-6" aria-hidden="true" tabindex="-1"></a>                  <span class="at">new_role =</span> <span class="st">"Supplementary"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb58-7"><a href="#cb58-7" aria-hidden="true" tabindex="-1"></a>      <span class="fu">step_zv</span>(<span class="fu">all_predictors</span>()) <span class="sc">%&gt;%</span> <span class="co"># remove predictors with only one value</span></span>
<span id="cb58-8"><a href="#cb58-8" aria-hidden="true" tabindex="-1"></a>      <span class="co"># step_nzv(all_predictors()) # for near-zero variance</span></span>
<span id="cb58-9"><a href="#cb58-9" aria-hidden="true" tabindex="-1"></a>      <span class="fu">step_normalize</span>(<span class="fu">all_predictors</span>()) <span class="co"># normalize all predictors</span></span>
<span id="cb58-10"><a href="#cb58-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb58-11"><a href="#cb58-11" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Build random forest model</span></span>
<span id="cb58-12"><a href="#cb58-12" aria-hidden="true" tabindex="-1"></a>    num_trees <span class="ot">&lt;-</span> <span class="dv">1000</span></span>
<span id="cb58-13"><a href="#cb58-13" aria-hidden="true" tabindex="-1"></a>    rf_model <span class="ot">&lt;-</span> <span class="fu">rand_forest</span>(<span class="at">trees =</span> num_trees) <span class="sc">%&gt;%</span></span>
<span id="cb58-14"><a href="#cb58-14" aria-hidden="true" tabindex="-1"></a>      <span class="fu">set_engine</span>(<span class="st">"ranger"</span>, <span class="at">importance =</span> <span class="st">"impurity"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb58-15"><a href="#cb58-15" aria-hidden="true" tabindex="-1"></a>      <span class="fu">set_mode</span>(<span class="st">"classification"</span>)</span>
<span id="cb58-16"><a href="#cb58-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb58-17"><a href="#cb58-17" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Create workflow</span></span>
<span id="cb58-18"><a href="#cb58-18" aria-hidden="true" tabindex="-1"></a>    rf_workflow <span class="ot">&lt;-</span> <span class="fu">workflow</span>() <span class="sc">%&gt;%</span></span>
<span id="cb58-19"><a href="#cb58-19" aria-hidden="true" tabindex="-1"></a>      <span class="fu">add_model</span>(rf_model) <span class="sc">%&gt;%</span></span>
<span id="cb58-20"><a href="#cb58-20" aria-hidden="true" tabindex="-1"></a>      <span class="fu">add_recipe</span>(rf_recipe)</span>
<span id="cb58-21"><a href="#cb58-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb58-22"><a href="#cb58-22" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Specify the hyperparameter tuning grid</span></span>
<span id="cb58-23"><a href="#cb58-23" aria-hidden="true" tabindex="-1"></a>    rf_grid <span class="ot">&lt;-</span> <span class="fu">tibble</span>(<span class="at">mtry =</span> <span class="fu">c</span>(<span class="fl">0.002</span>, <span class="fl">0.02</span>, <span class="fl">0.2</span>), <span class="at">min_n =</span> <span class="fu">c</span>(<span class="dv">2</span>, <span class="dv">6</span>, <span class="dv">12</span>))</span>
<span id="cb58-24"><a href="#cb58-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb58-25"><a href="#cb58-25" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Tune the model using cross-validation;</span></span>
<span id="cb58-26"><a href="#cb58-26" aria-hidden="true" tabindex="-1"></a>    <span class="co"># try the 3 hyperparameter sets in rf_grid; use AUROC (roc_auc) as evaluation metric.</span></span>
<span id="cb58-27"><a href="#cb58-27" aria-hidden="true" tabindex="-1"></a>    rf_res <span class="ot">&lt;-</span> <span class="fu">tune_grid</span>(rf_workflow,</span>
<span id="cb58-28"><a href="#cb58-28" aria-hidden="true" tabindex="-1"></a>                        <span class="at">resamples =</span> <span class="fu">vfold_cv</span>(gpa_train),</span>
<span id="cb58-29"><a href="#cb58-29" aria-hidden="true" tabindex="-1"></a>                        <span class="at">grid =</span> rf_grid,</span>
<span id="cb58-30"><a href="#cb58-30" aria-hidden="true" tabindex="-1"></a>                        <span class="at">control =</span> <span class="fu">control_grid</span>(<span class="at">save_pred =</span> T),</span>
<span id="cb58-31"><a href="#cb58-31" aria-hidden="true" tabindex="-1"></a>                        <span class="at">metrics =</span> <span class="fu">metric_set</span>(roc_auc))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>Warning: No tuning parameters have been detected, performance will be evaluated
using the resamples with no tuning. Did you want to [tune()] parameters?</code></pre>
</div>
<div class="sourceCode cell-code" id="cb60"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb60-1"><a href="#cb60-1" aria-hidden="true" tabindex="-1"></a>    rf_best <span class="ot">&lt;-</span> rf_res <span class="sc">|&gt;</span> </span>
<span id="cb60-2"><a href="#cb60-2" aria-hidden="true" tabindex="-1"></a>      <span class="fu">select_best</span>(<span class="at">metric =</span> <span class="st">"roc_auc"</span>)</span>
<span id="cb60-3"><a href="#cb60-3" aria-hidden="true" tabindex="-1"></a>    </span>
<span id="cb60-4"><a href="#cb60-4" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Plot the ROC curve from the random forest resampling predictions</span></span>
<span id="cb60-5"><a href="#cb60-5" aria-hidden="true" tabindex="-1"></a>    rf_roc <span class="ot">&lt;-</span> rf_res <span class="sc">%&gt;%</span> </span>
<span id="cb60-6"><a href="#cb60-6" aria-hidden="true" tabindex="-1"></a>      <span class="fu">collect_predictions</span>(<span class="at">parameters =</span> rf_best) <span class="sc">%&gt;%</span> </span>
<span id="cb60-7"><a href="#cb60-7" aria-hidden="true" tabindex="-1"></a>      <span class="fu">roc_curve</span>(amr_pheno, .pred_Resistant) <span class="sc">%&gt;%</span> </span>
<span id="cb60-8"><a href="#cb60-8" aria-hidden="true" tabindex="-1"></a>      <span class="fu">mutate</span>(<span class="at">model =</span> <span class="st">"Random Forest"</span>)</span>
<span id="cb60-9"><a href="#cb60-9" aria-hidden="true" tabindex="-1"></a>    <span class="fu">autoplot</span>(rf_roc)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<!-- NOTE(review): the figure below was rendered while this cell still called autoplot(lr_roc); re-render the document to refresh it. -->
<div class="cell-output-display">
<p><img src="index_files/figure-html/unnamed-chunk-29-1.png" class="img-fluid" alt="ROC curve plot" width="672"></p>
</div>
<div class="sourceCode cell-code" id="cb61"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb61-1"><a href="#cb61-1" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Select best RF model</span></span>
<span id="cb61-2"><a href="#cb61-2" aria-hidden="true" tabindex="-1"></a>    best_rf_model <span class="ot">&lt;-</span> <span class="fu">select_best</span>(rf_res, <span class="st">"roc_auc"</span>)</span>
<span id="cb61-3"><a href="#cb61-3" aria-hidden="true" tabindex="-1"></a>    final_rf_model <span class="ot">&lt;-</span> <span class="fu">finalize_workflow</span>(rf_workflow, best_rf_model)</span>
<span id="cb61-4"><a href="#cb61-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb61-5"><a href="#cb61-5" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Fit the data</span></span>
<span id="cb61-6"><a href="#cb61-6" aria-hidden="true" tabindex="-1"></a>    rf_fit <span class="ot">&lt;-</span> final_rf_model <span class="sc">%&gt;%</span> <span class="fu">fit</span>(<span class="at">data =</span> gpa_train)</span>
<span id="cb61-7"><a href="#cb61-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb61-8"><a href="#cb61-8" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Save predictions</span></span>
<span id="cb61-9"><a href="#cb61-9" aria-hidden="true" tabindex="-1"></a>    rf_aug <span class="ot">&lt;-</span> <span class="fu">augment</span>(rf_fit, gpa_test)</span>
<span id="cb61-10"><a href="#cb61-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb61-11"><a href="#cb61-11" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Get AUROC</span></span>
<span id="cb61-12"><a href="#cb61-12" aria-hidden="true" tabindex="-1"></a>    auroc <span class="ot">&lt;-</span> rf_aug <span class="sc">%&gt;%</span></span>
<span id="cb61-13"><a href="#cb61-13" aria-hidden="true" tabindex="-1"></a>      <span class="fu">roc_auc</span>(<span class="at">truth =</span> amr_pheno, .pred_Resistant) <span class="sc">%&gt;%</span></span>
<span id="cb61-14"><a href="#cb61-14" aria-hidden="true" tabindex="-1"></a>      <span class="fu">select</span>(.estimate) <span class="sc">%&gt;%</span></span>
<span id="cb61-15"><a href="#cb61-15" aria-hidden="true" tabindex="-1"></a>      <span class="fu">as.numeric</span>()</span>
<span id="cb61-16"><a href="#cb61-16" aria-hidden="true" tabindex="-1"></a>    <span class="fu">print</span>(<span class="fu">paste</span>(<span class="st">"AUROC:"</span>, auroc))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "AUROC: 0.989857250187829"</code></pre>
</div>
<div class="sourceCode cell-code" id="cb63"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb63-1"><a href="#cb63-1" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Extract the top 10 genes from the fitted random forest (first 10 rows of vip::vi())</span></span>
<span id="cb63-2"><a href="#cb63-2" aria-hidden="true" tabindex="-1"></a>    n_top_genes <span class="ot">&lt;-</span> <span class="dv">10</span></span>
<span id="cb63-3"><a href="#cb63-3" aria-hidden="true" tabindex="-1"></a>    top_genes_rf <span class="ot">&lt;-</span> rf_fit <span class="sc">%&gt;%</span></span>
<span id="cb63-4"><a href="#cb63-4" aria-hidden="true" tabindex="-1"></a>      <span class="fu">extract_fit_parsnip</span>() <span class="sc">%&gt;%</span></span>
<span id="cb63-5"><a href="#cb63-5" aria-hidden="true" tabindex="-1"></a>      vip<span class="sc">::</span><span class="fu">vi</span>() <span class="sc">%&gt;%</span> <span class="fu">slice</span>(<span class="dv">1</span><span class="sc">:</span>n_top_genes) <span class="sc">%&gt;%</span></span>
<span id="cb63-6"><a href="#cb63-6" aria-hidden="true" tabindex="-1"></a>      <span class="fu">select</span>(<span class="dv">1</span>) <span class="sc">%&gt;%</span> <span class="fu">pull</span>()</span>
<span id="cb63-7"><a href="#cb63-7" aria-hidden="true" tabindex="-1"></a>    <span class="fu">print</span>(top_genes_rf)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<!-- NOTE(review): the output below was captured while this cell still called print(top_genes), not print(top_genes_rf); re-render the document to refresh it. -->
<div class="cell-output cell-output-stdout">
<pre><code> [1] "group_7632"  "group_7634"  "blaZ"        "blaR1"       "ugpQ"       
 [6] "blaR1-2"     "bin3"        "group_11618" "cadC"        "group_5831" </code></pre>
</div>
</div>
</section>
<section id="too-many-features" class="level2">
<h2 class="anchored" data-anchor-id="too-many-features">Too many features?</h2>
<p>Try dimensionality reduction with SVD → retrieve top PCs → find contributing features to the top PCs.</p>
</section>
<section id="recap-conclusions" class="level2">
<h2 class="anchored" data-anchor-id="recap-conclusions">Recap &amp; Conclusions</h2>
<ul class="task-list">
<li><p><input type="checkbox" disabled="" checked="">Reproducible docs &amp; code with <code>qmd</code>/<code>rmd</code></p></li>
<li><p><input type="checkbox" disabled="" checked="">basic data cleanup to get it ready for ML models</p></li>
<li><p><input type="checkbox" disabled="" checked="">tidymodels</p></li>
<li><p><input type="checkbox" disabled="" checked="">building recipes and workflows</p></li>
<li><p><input type="checkbox" disabled="" checked="">calculating AUROC and AUPRC</p></li>
<li><p><input type="checkbox" disabled="" checked="">train-validate-test splits to optimize for best hyperparameters</p></li>
<li><p><input type="checkbox" disabled="" checked="">picking the best models based on low penalty and high AUROC/AUPRC</p></li>
<li><p><input type="checkbox" disabled="" checked="">plotting AUROC/AUPRC</p></li>
<li><p><input type="checkbox" disabled="" checked="">Logistic regression with L1 (lasso) regularization (and L2)</p></li>
<li><p><input type="checkbox" disabled="" checked="">Random Forest models</p></li>
</ul>
</section>
<section id="how-to-contact-us" class="level2">
<h2 class="anchored" data-anchor-id="how-to-contact-us">How to contact us</h2>
<ul>
<li><p>Website: <a href="https://jravilab.github.io" class="uri">https://jravilab.github.io</a></p></li>
<li><p>Twitter: @jravilab @janani137</p></li>
<li><p>Email: janani DOT ravi AT cuanschutz DOT edu</p></li>
<li><p>Rendered material: <a href="https://jananiravi.github.io/2023-mlhd" class="uri">https://jananiravi.github.io/2023-mlhd</a></p></li>
</ul>
</section>
</section>

</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
  // Mirror the stylesheet's data-mode onto <body>: "dark" selects quarto-dark,
  // any other value selects quarto-light; the opposite class is removed.
  const toggleBodyColorMode = (bsSheetEl) => {
    const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
    const bodyClasses = window.document.querySelector("body").classList;
    bodyClasses.add(isDark ? "quarto-dark" : "quarto-light");
    bodyClasses.remove(isDark ? "quarto-light" : "quarto-dark");
  }
  // Apply the colour mode of the link#quarto-bootstrap stylesheet to <body>;
  // does nothing when the page has no such element.
  const toggleBodyColorPrimary = () => {
    const primarySheet = window.document.querySelector("link#quarto-bootstrap");
    if (!primarySheet) {
      return;
    }
    toggleBodyColorMode(primarySheet);
  };
  toggleBodyColorPrimary();
  // AnchorJS setup: anchors are placed to the right of each target element.
  // NOTE(review): icon reads as an empty string here — confirm no glyph character was lost.
  const icon = "";
  const anchorJS = new window.AnchorJS();
  anchorJS.options = {
    placement: 'right',
    icon: icon
  };
  // Attach anchors to every element matching the '.anchored' selector.
  anchorJS.add('.anchored');
  // Bind ClipboardJS to every copy button; the element copied is the one
  // immediately preceding the button in the DOM.
  const clipboard = new window.ClipboardJS('.code-copy-button', {
    target: (trigger) => trigger.previousElementSibling
  });
  // After a successful copy: mark the button as "checked", show "Copied!"
  // (as title, plus a Bootstrap tooltip when Bootstrap is loaded), and undo
  // all of it one second later.
  const onCopySuccess = (e) => {
    const copyButton = e.trigger;
    // release focus so the button does not stay highlighted
    copyButton.blur();
    copyButton.classList.add('code-copy-button-checked');
    const originalTitle = copyButton.getAttribute("title");
    copyButton.setAttribute("title", "Copied!");
    let tooltip;
    if (window.bootstrap) {
      copyButton.setAttribute("data-bs-toggle", "tooltip");
      copyButton.setAttribute("data-bs-placement", "left");
      copyButton.setAttribute("data-bs-title", "Copied!");
      tooltip = new bootstrap.Tooltip(copyButton, {
        trigger: "manual",
        customClass: "code-copy-button-tooltip",
        offset: [0, -8]
      });
      tooltip.show();
    }
    // restores the button to its pre-copy state
    const restoreButton = () => {
      if (tooltip) {
        tooltip.hide();
        copyButton.removeAttribute("data-bs-title");
        copyButton.removeAttribute("data-bs-toggle");
        copyButton.removeAttribute("data-bs-placement");
      }
      copyButton.setAttribute("title", originalTitle);
      copyButton.classList.remove('code-copy-button-checked');
    };
    setTimeout(restoreButton, 1000);
    // drop the text selection ClipboardJS made while copying
    e.clearSelection();
  };
  clipboard.on('success', onCopySuccess);
  // Attach a 'quarto'-themed tippy popup to `el`. `contentFn` is handed to
  // tippy as the content source; HTML content is allowed and the popup is
  // appended to the parent of the reference element.
  function tippyHover(el, contentFn) {
    window.tippy(el, {
      allowHTML: true,
      content: contentFn,
      maxWidth: 500,
      delay: 100,
      arrow: false,
      appendTo: (reference) => reference.parentElement,
      interactive: true,
      interactiveBorder: 10,
      theme: 'quarto',
      placement: 'bottom-start'
    });
  }
  // Footnote references: hovering a reference shows the footnote's HTML.
  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
  for (var i=0; i<noterefs.length; i++) {
    const ref = noterefs[i];
    tippyHover(ref, function() {
      // use id or data attribute instead here
      let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
      // a reference with neither attribute has no target to show
      if (!href) {
        return "";
      }
      // absolute URLs are reduced to their fragment; values that do not
      // parse as a URL (e.g. a bare "#fn1") are kept unchanged
      try { href = new URL(href).hash; } catch {}
      const id = href.replace(/^#\/?/, "");
      const note = window.document.getElementById(id);
      // a dangling reference yields an empty popup instead of a TypeError
      return note ? note.innerHTML : "";
    });
  }
  // Climb from `el` until a parent with a non-empty data-cites attribute is
  // found. Returns { el, cites }, where `el` is the child of that parent on
  // the path climbed and `cites` is the attribute split on spaces; returns
  // undefined when no ancestor carries data-cites.
  const findCites = (el) => {
    let node = el;
    while (node.parentElement) {
      const citeList = node.parentElement.dataset.cites;
      if (citeList) {
        return {
          el: node,
          cites: citeList.split(' ')
        };
      }
      node = node.parentElement;
    }
    return undefined;
  };
  // Citation links: hovering shows one entry per cited key, filled from the
  // element with id 'ref-<key>' when it exists (left empty otherwise).
  const bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
  const buildCitePopup = (citeKeys) => {
    const popup = window.document.createElement('div');
    for (const key of citeKeys) {
      const entry = window.document.createElement('div');
      entry.classList.add('hanging-indent', 'csl-entry');
      const source = window.document.getElementById('ref-' + key);
      if (source) {
        entry.innerHTML = source.innerHTML;
      }
      popup.appendChild(entry);
    }
    return popup.innerHTML;
  };
  Array.from(bibliorefs).forEach((ref) => {
    const citeInfo = findCites(ref);
    if (citeInfo) {
      tippyHover(citeInfo.el, () => buildCitePopup(citeInfo.cites));
    }
  });
});
</script>
</div> <!-- /content -->


</body></html>