forked from jananiravi/2023-mlhd
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathindex.html
981 lines (948 loc) · 88.5 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.2.475">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<meta name="author" content="Janani Ravi | jravilab.github.io">
<meta name="dcterms.date" content="2023-08-01">
<title>ML for Microbial Genomics</title>
<style>
/* Document-level stylesheet: text/layout helpers, source-code block layout,
   optional line numbering, and syntax-highlighting token colours. */

/* --- Text and layout helpers --- */
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
/* Negative left margin pulls the checkbox out of the text flow of the list item. */
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1.6em;
vertical-align: middle;
}

/* --- Source-code blocks: each direct child span of the code element is laid out as an inline-block --- */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
/* Give empty spans an explicit height. */
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
/* On screen the wrapper div gets overflow: auto. */
@media screen {
div.sourceCode { overflow: auto; }
}
/* In print, code wraps; negative text-indent plus equal padding-left gives wrapped lines a hanging indent. */
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}

/* --- Line numbers for pre.numberSource, driven by the CSS counter "source-line" --- */
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
/* The number is generated content on the first anchor of each span and is excluded from text selection. */
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
/* Empty rule: declares no properties. */
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}

/* --- Syntax-highlighting token colours (one class per token kind, named in the trailing comment) --- */
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { color: #008000; } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { color: #008000; font-weight: bold; } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
<script src="index_files/libs/clipboard/clipboard.min.js"></script>
<script src="index_files/libs/quarto-html/quarto.js"></script>
<script src="index_files/libs/quarto-html/popper.min.js"></script>
<script src="index_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="index_files/libs/quarto-html/anchor.min.js"></script>
<link href="index_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="index_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="index_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="index_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="index_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
</head>
<body class="fullcontent">
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">ML for Microbial Genomics</h1>
</div>
<div class="quarto-title-meta">
<div>
<div class="quarto-title-meta-heading">Author</div>
<div class="quarto-title-meta-contents">
<p>Janani Ravi | jravilab.github.io </p>
</div>
</div>
<div>
<div class="quarto-title-meta-heading">Published</div>
<div class="quarto-title-meta-contents">
<p class="date">August 1, 2023</p>
</div>
</div>
</div>
</header>
<section id="mlhd-icts-aug-02-2023" class="level1">
<h1>MLHD <span class="citation" data-cites="ICTS">@ICTS</span> | Aug 02, 2023</h1>
<blockquote class="blockquote">
<p>This is a companion repo & webpage for the Microbial Genomics and ML workshop, first presented at the MLHD 2023 conference! You can access the material here: <a href="https://jananiravi.github.io/2023-mlhd" class="uri">https://jananiravi.github.io/2023-mlhd</a></p>
</blockquote>
<section id="overview" class="level2">
<h2 class="anchored" data-anchor-id="overview">Overview</h2>
<blockquote class="blockquote">
<p>This session will cover ideas, concepts, and insights needed to get started with building machine learning models in R with high-dimensional data, such as microbial genomics. No prior knowledge in ML is required.</p>
</blockquote>
<section id="acknowledgments" class="level3">
<h3 class="anchored" data-anchor-id="acknowledgments">Acknowledgments</h3>
<ul>
<li>JRaviLab: Jacob Krol, Ethan Wolfe, Evan Brenner, Keenan Manpearl, Joseph Burke, Vignesh Sridhar, Jill Bilodeaux (contributed to the antimicrobial resistance project)</li>
<li>Arjun Krishnan (contributed to the tidymodels qmd primer)</li>
<li>R-Ladies, esp. R-Ladies East Lansing, R-Ladies Aurora; R/Bioconductor; rOpenSci (for all things R!)</li>
<li><code>tidymodels</code> resource by Julia Silge et al., | <a href="https://tidymodels.org" class="uri">https://tidymodels.org</a></li>
</ul>
</section>
</section>
<section id="install-and-load-packages" class="level2">
<h2 class="anchored" data-anchor-id="install-and-load-packages">Install and load packages</h2>
<p>To use the code in this document, you will need to install the following packages: <code>tidyverse</code>, <code>tidymodels</code>, <code>glmnet</code>, <code>vip</code>, and <code>ranger</code>.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidyverse)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.2 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (&lt;http://conflicted.r-lib.org/&gt;) to force all conflicts to become errors</code></pre>
</div>
<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidymodels)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>── Attaching packages ────────────────────────────────────── tidymodels 1.1.0 ──
✔ broom 1.0.5 ✔ rsample 1.1.1
✔ dials 1.2.0 ✔ tune 1.1.1
✔ infer 1.0.4 ✔ workflows 1.1.3
✔ modeldata 1.1.0 ✔ workflowsets 1.0.1
✔ parsnip 1.1.0 ✔ yardstick 1.2.0
✔ recipes 1.0.6
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter() masks stats::filter()
✖ recipes::fixed() masks stringr::fixed()
✖ dplyr::lag() masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step() masks stats::step()
• Use suppressPackageStartupMessages() to eliminate package startup messages</code></pre>
</div>
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(glmnet) <span class="co"># for LR</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Loaded glmnet 4.1-7</code></pre>
</div>
<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(vip) <span class="co"># to extract important features</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>
Attaching package: 'vip'
The following object is masked from 'package:utils':
vi</code></pre>
</div>
<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ranger) <span class="co"># for RF</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="explore-your-data" class="level2">
<h2 class="anchored" data-anchor-id="explore-your-data">Explore your data</h2>
<p>Here, we will use microbial genomics data (e.g., gene presence/absence across multiple microbial genomes) wrangled and processed from the <a href="https://bv-brc.org/">BV-BRC</a> to predict the antibiotic resistance phenotype of each sample (genome) based on the presence/absence of genes in that sample.</p>
<p>To make the dataset usable on your local desktop machine, we have pre-processed the data (using custom scripts that use NCBI/BV-BRC data and metadata, NCBI and BV-BRC CLI, Prokka for genome annotation, and Roary/CD-HIT for constructing the gene presence/absence matrix and gene clusters that serve as ML features). For this workshop, we have selected a subset of ~900 genomes from <em>Staphylococcus aureus</em>, and limited the data to 2,322 genes after filtering out core (present in &gt;95% of genomes) and unique (present in &lt;5% of genomes) genes.</p>
<p>The data is contained in the file <code>data/staph_penicillin_pangenome.csv</code>, with samples (genomes) along the rows and genes along the columns. To get started, let’s read this data into R using the <code>readr::read_delim</code> function. This file also carries relevant metadata about the genomes and drugs.</p>
<section id="read-in-the-data-file" class="level3">
<h3 class="anchored" data-anchor-id="read-in-the-data-file">Read in the data file</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb10"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Can be set to read csv/tsv: any feature matrix file with metadata</span></span>
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="co"># e.g., gpa-feature-matrix.tsv</span></span>
<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a>gpa_featmat <span class="ot"><-</span> <span class="fu">read_delim</span>(<span class="st">"data/staph_penicillin_pangenome.csv"</span>,</span>
<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a> <span class="at">delim =</span> <span class="st">","</span>, <span class="at">col_names =</span> T)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>Rows: 920 Columns: 2328
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): antibiotic, amr_pheno, drug_class, assembly_accession
dbl (2324): s_no, genome_id, prmA, hisC_1, araB, yqeN, tagH_2, tet(38), lrgB...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.</code></pre>
</div>
</div>
</section>
<section id="data-exploration" class="level3">
<h3 class="anchored" data-anchor-id="data-exploration">Data exploration</h3>
<p>Let’s print the tibble to examine it quickly.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb12"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>gpa_featmat</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 920 × 2,328
s_no genome_id antibiotic amr_pheno drug_class assembly_accession prmA
&lt;dbl&gt; &lt;dbl&gt; &lt;chr&gt; &lt;chr&gt; &lt;chr&gt; &lt;chr&gt; &lt;dbl&gt;
1 0 1280. penicillin Susceptible penicillin GCA_024925485.1 1
2 1 1280. penicillin Susceptible penicillin GCA_024925485.1 1
3 2 1280. penicillin Susceptible penicillin GCA_024972975.1 1
4 3 1280. penicillin Susceptible penicillin GCA_024972975.1 1
5 4 1280. penicillin Resistant penicillin GCA_025232045.1 1
6 5 1280. penicillin Resistant penicillin GCA_025232045.1 1
7 6 46170. penicillin Resistant penicillin GCA_002204575.1 1
8 7 1280. penicillin Susceptible penicillin GCA_002089075.2 1
9 8 1280. penicillin Resistant penicillin GCA_002089095.2 1
10 9 1280. penicillin Resistant penicillin GCA_002097595.2 1
# ℹ 910 more rows
# ℹ 2,321 more variables: hisC_1 &lt;dbl&gt;, araB &lt;dbl&gt;, yqeN &lt;dbl&gt;, tagH_2 &lt;dbl&gt;,
# `tet(38)` &lt;dbl&gt;, lrgB &lt;dbl&gt;, cmtB &lt;dbl&gt;, scmP_2 &lt;dbl&gt;, est_2 &lt;dbl&gt;,
# glcB &lt;dbl&gt;, ponA &lt;dbl&gt;, clpX &lt;dbl&gt;, yiiM &lt;dbl&gt;, thiN &lt;dbl&gt;, ilvE &lt;dbl&gt;,
# ydcV &lt;dbl&gt;, menH &lt;dbl&gt;, relA &lt;dbl&gt;, yicL &lt;dbl&gt;, rho &lt;dbl&gt;, guaA &lt;dbl&gt;,
# hemB &lt;dbl&gt;, hemA &lt;dbl&gt;, glpQ_1 &lt;dbl&gt;, suhB &lt;dbl&gt;, tatC2 &lt;dbl&gt;, groL &lt;dbl&gt;,
# glpK &lt;dbl&gt;, frdA &lt;dbl&gt;, yycI &lt;dbl&gt;, pepA_1 &lt;dbl&gt;, feuC &lt;dbl&gt;, miaA &lt;dbl&gt;, …</code></pre>
</div>
<div class="sourceCode cell-code" id="cb14"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="fu">dim</span>(gpa_featmat)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] 920 2328</code></pre>
</div>
</div>
<p>Then, let’s examine the <code>amr_pheno</code> column of this data frame, which records the antimicrobial resistance (AMR) phenotype (resistant/susceptible) of each sample (i.e., each row, genome) for the drug it was tested against. We can tabulate the number and fraction of genomes per phenotype easily using the <code>count</code> and <code>mutate</code> functions from <code>dplyr</code>.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb16"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a>gpa_featmat <span class="sc">%>%</span></span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(amr_pheno) <span class="sc">%>%</span> </span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">prop =</span> n<span class="sc">/</span><span class="fu">sum</span>(n))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 2 × 3
amr_pheno n prop
&lt;chr&gt; &lt;int&gt; &lt;dbl&gt;
1 Resistant 481 0.523
2 Susceptible 439 0.477</code></pre>
</div>
</div>
<p>Before we proceed, let’s also try and get a sense of the values in this feature matrix. Since there are thousands of genes, we’ll count the number of genomes in which each gene is present and visualize the distribution of these counts across all genes using a binned bar chart.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb18"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a>gpa_sum <span class="ot"><-</span> gpa_featmat <span class="sc">|></span></span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">select</span>(<span class="dv">7</span><span class="sc">:</span><span class="fu">last_col</span>()) <span class="sc">|></span></span>
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">summarize</span>(<span class="fu">across</span>(<span class="fu">where</span>(is.numeric), sum))</span>
<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a>gpa_sum_long <span class="ot"><-</span> gpa_sum <span class="sc">|></span> </span>
<span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">pivot_longer</span>(<span class="at">cols =</span> <span class="fu">everything</span>(), <span class="at">names_to =</span> <span class="st">"gene"</span>)</span>
<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(gpa_sum_long, <span class="fu">aes</span>(value)) <span class="sc">+</span></span>
<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a> <span class="co"># geom_histogram(bins=10) +</span></span>
<span id="cb18-11"><a href="#cb18-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_bar</span>() <span class="sc">+</span></span>
<span id="cb18-12"><a href="#cb18-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_x_binned</span>() <span class="sc">+</span></span>
<span id="cb18-13"><a href="#cb18-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
<span id="cb18-14"><a href="#cb18-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">xlab</span>(<span class="st">"Genes present in X genomes"</span>) <span class="sc">+</span></span>
<span id="cb18-15"><a href="#cb18-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">ylab</span>(<span class="st">"N Genes with X frequency"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<!-- Figure rendered from the ggplot() call in the preceding cell; the alt text summarises the plot for screen-reader users. -->
<p><img src="index_files/figure-html/unnamed-chunk-5-1.png" class="img-fluid" width="672" alt="Bar chart showing the number of genes (y-axis) binned by the number of genomes in which each gene is present (x-axis)."></p>
</div>
</div>
</section>
</section>
<section id="feature-matrices-ml" class="level2">
<h2 class="anchored" data-anchor-id="feature-matrices-ml">Feature matrices –> ML</h2>
<p>Given there are genomes with R/S from multiple drugs, to make the problem simpler, let’s pick one drug of interest and define the problem as classifying whether a genome is resistant or not to this antibiotic.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb19"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a>pos_pheno <span class="ot"><-</span> <span class="st">"Resistant"</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Then, we need to modify the <code>amr_pheno</code> variable into a binary indicator of whether it is resistant or not and finally convert that variable into a factor so that the model knows to consider it as a way to partition the samples.</p>
<section id="set-up-the-feature-matrix-and-labels-for-the-ml-model" class="level3">
<h3 class="anchored" data-anchor-id="set-up-the-feature-matrix-and-labels-for-the-ml-model">Set up the feature matrix and labels for the ML model</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb20"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a>gpa_featmat_pheno <span class="ot"><-</span></span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a> gpa_featmat <span class="sc">%>%</span></span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">amr_pheno =</span> <span class="fu">ifelse</span>(amr_pheno<span class="sc">==</span>pos_pheno,</span>
<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a> <span class="st">"Resistant"</span>, <span class="st">"Susceptible"</span>)) <span class="sc">%>%</span></span>
<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="fu">across</span>(<span class="fu">where</span>(is.character), as.factor))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>A critical quantity to be fully aware of when setting up an ML problem is class balance, i.e., the relative sizes of the positive (<code>"Resistant"</code>) and negative (<code>"Susceptible"</code>) classes.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb21"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a>gpa_featmat_pheno <span class="sc">%>%</span> </span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(amr_pheno) <span class="sc">%>%</span> </span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">prop =</span> n<span class="sc">/</span><span class="fu">sum</span>(n))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 2 × 3
amr_pheno n prop
&lt;fct&gt; &lt;int&gt; &lt;dbl&gt;
1 Resistant 481 0.523
2 Susceptible 439 0.477</code></pre>
</div>
</div>
<p>We can see that, in our dataset, 52.3% of the samples are “Resistant” and 47.7% are “Susceptible”, so the two classes are nearly balanced. Many biomedical datasets are far more skewed: referred to as <em>class imbalance</em>, that scenario is extremely common in biomedicine and needs careful attention when analyzing and interpreting results.</p>
</section>
<section id="data-splitting" class="level3">
<h3 class="anchored" data-anchor-id="data-splitting">Data splitting</h3>
<p>If we take the data from all samples and train an <em>AMR classification</em> ML model, we cannot easily tell how good the model is. So, let’s reserve 25% of the samples to a <em>test set</em>, which we will hold out until the end of the project, at which point there should only be one or two models under serious consideration. The <em>test set</em> will be used as an unbiased source for measuring final model performance.</p>
<p>This is also the first step where we need to pay attention to class balance. Even though the <code>amr_pheno</code> classes are nearly balanced here, we use <em>stratified</em> random sampling so that both splits contain nearly identical proportions of positive and negative samples; this matters even more when the classes are highly imbalanced.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb23"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="co"># The function `initial_split()` takes the original data and saves the information on how to make the partitions.</span></span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">123</span>)</span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a>splits <span class="ot"><-</span> <span class="fu">initial_split</span>(<span class="at">data =</span> gpa_featmat_pheno,</span>
<span id="cb23-4"><a href="#cb23-4" aria-hidden="true" tabindex="-1"></a> <span class="at">strata =</span> amr_pheno)</span>
<span id="cb23-5"><a href="#cb23-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Within initial_split, you can specify proportion using "prop" and</span></span>
<span id="cb23-6"><a href="#cb23-6" aria-hidden="true" tabindex="-1"></a><span class="co"># grouping/datasets to go into the same set using "group"</span></span>
<span id="cb23-7"><a href="#cb23-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-8"><a href="#cb23-8" aria-hidden="true" tabindex="-1"></a><span class="co"># The `training()` and `testing()` functions return the actual datasets.</span></span>
<span id="cb23-9"><a href="#cb23-9" aria-hidden="true" tabindex="-1"></a>gpa_other <span class="ot"><-</span> <span class="fu">training</span>(splits)</span>
<span id="cb23-10"><a href="#cb23-10" aria-hidden="true" tabindex="-1"></a>gpa_test <span class="ot"><-</span> <span class="fu">testing</span>(splits)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Let’s check if we indeed did achieve stratified data splits.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb24"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a><span class="co"># other set proportions by AMR pheno</span></span>
<span id="cb24-2"><a href="#cb24-2" aria-hidden="true" tabindex="-1"></a>gpa_other <span class="sc">%>%</span></span>
<span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(amr_pheno) <span class="sc">%>%</span> </span>
<span id="cb24-4"><a href="#cb24-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">prop =</span> n<span class="sc">/</span><span class="fu">sum</span>(n))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 2 × 3
amr_pheno n prop
&lt;fct&gt; &lt;int&gt; &lt;dbl&gt;
1 Resistant 360 0.522
2 Susceptible 329 0.478</code></pre>
</div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb26"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a><span class="co"># test set proportions by R/S ratio</span></span>
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a>gpa_test <span class="sc">%>%</span></span>
<span id="cb26-3"><a href="#cb26-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">count</span>(amr_pheno) <span class="sc">%>%</span> </span>
<span id="cb26-4"><a href="#cb26-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">prop =</span> n<span class="sc">/</span><span class="fu">sum</span>(n))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 2 × 3
amr_pheno n prop
<fct> <int> <dbl>
1 Resistant 121 0.524
2 Susceptible 110 0.476</code></pre>
</div>
</div>
<p>What about the <code>gpa_other</code> split, i.e., the data not held out for testing? This split will be used to create two new datasets:</p>
<ol type="1">
<li>The set held out for the purpose of measuring performance, called the <em>validation set</em>, and</li>
<li>The remaining data used to fit the model, called the <em>training set</em>.</li>
</ol>
<p>We’ll use the <code>validation_split</code> function to allocate 20% of the <code>gpa_other</code> samples to the validation set and the remaining 80% to the training set. Note that this function too has the <code>strata</code> argument. Do you see why we need it here?</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb28"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">234</span>)</span>
<span id="cb28-2"><a href="#cb28-2" aria-hidden="true" tabindex="-1"></a>gpa_val <span class="ot"><-</span> <span class="fu">validation_split</span>(<span class="at">data =</span> gpa_other,</span>
<span id="cb28-3"><a href="#cb28-3" aria-hidden="true" tabindex="-1"></a> <span class="at">strata =</span> amr_pheno, <span class="co"># maintain original data split</span></span>
<span id="cb28-4"><a href="#cb28-4" aria-hidden="true" tabindex="-1"></a> <span class="at">prop =</span> <span class="fl">0.80</span>) <span class="co"># 80% training; 20% validation</span></span>
<span id="cb28-5"><a href="#cb28-5" aria-hidden="true" tabindex="-1"></a>gpa_val</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># Validation Set Split (0.8/0.2) using stratification
# A tibble: 1 × 2
splits id
<list> <chr>
1 <split [551/138]> validation</code></pre>
</div>
</div>
</section>
<section id="training-ml-models-in-r-penalized-logistic-regression" class="level3">
<h3 class="anchored" data-anchor-id="training-ml-models-in-r-penalized-logistic-regression">Training ML models in R: Penalized logistic regression</h3>
<p>Since our outcome variable <code>amr_pheno</code> is categorical, <a href="https://en.wikipedia.org/wiki/Logistic_regression">logistic regression</a> would be a good first model to start with. Let’s use a model that can perform feature selection during training. The <a href="https://cran.r-project.org/web/packages/glmnet/index.html">glmnet</a> R package fits a generalized linear model via penalized maximum likelihood. This method of estimating the logistic regression slope parameters uses a <em>penalty</em> on the process so that the coefficients of less relevant predictors are driven towards a value of zero. One of the <code>glmnet</code> penalization methods, called the <a href="https://en.wikipedia.org/wiki/Lasso_(statistics)">lasso method</a>, can actually set the predictor slopes to zero if a large enough penalty is used.</p>
</section>
<section id="build-the-model" class="level3">
<h3 class="anchored" data-anchor-id="build-the-model">Build the model</h3>
<p>To specify a penalized logistic regression model that uses a feature selection penalty, we will use the <code>parsnip</code> package (part of <code>tidymodels</code>), which provides a tidy, unified interface to models so that you can try a range of models without getting bogged down in the syntactical minutiae of the underlying packages.</p>
<p>Here, let’s use it with the <a href="https://www.tidymodels.org/find/parsnip/">glmnet engine</a>:</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb30"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Build logistic regression model</span></span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a>lr_model <span class="ot"><-</span> </span>
<span id="cb30-3"><a href="#cb30-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">logistic_reg</span>(<span class="at">penalty =</span> <span class="fu">tune</span>(), <span class="co"># strength of regularization/penalty</span></span>
<span id="cb30-4"><a href="#cb30-4" aria-hidden="true" tabindex="-1"></a> <span class="at">mixture =</span> <span class="dv">1</span>) <span class="sc">%>%</span> <span class="co"># specifies a pure lasso model</span></span>
<span id="cb30-5"><a href="#cb30-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">set_engine</span>(<span class="st">"glmnet"</span>) <span class="co"># set to generalized linear models</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>We’ll set the <code>penalty</code> argument to <code>tune()</code> as a placeholder for now. This is a model <em>hyperparameter</em> that we will <a href="https://www.tidymodels.org/start/tuning/">tune</a> to find the best value for making predictions with our data. Setting <code>mixture</code> to a value of <code>1</code> means that the glmnet model will potentially remove irrelevant predictors and choose a simpler model. With this setting, the penalty term is the sum of the absolute values of the beta coefficients (the L1 norm), which is minimized together with the model’s loss.</p>
<p><em>You can also try <code>mixture=0</code> for L2 (ridge) regression, or a value between 0 and 1 for an elastic net that combines the L1 and L2 penalties.</em></p>
</section>
<section id="create-the-recipe" class="level3">
<h3 class="anchored" data-anchor-id="create-the-recipe">Create the recipe</h3>
<p>Next, we’re going to use the <code>recipes</code> package to build <a href="https://dplyr.tidyverse.org/">dplyr</a>-like pipeable sequences of feature engineering steps to get our data ready for modeling. Recipes are built as a series of pre-processing steps, such as:</p>
<ul>
<li><p>converting qualitative predictors to indicator variables (also known as dummy variables),</p></li>
<li><p>transforming data to be on a different scale (e.g., taking the logarithm of a variable),</p></li>
<li><p>transforming whole groups of predictors together,</p></li>
<li><p>extracting key features from raw variables (e.g., getting the day of the week out of a date variable),</p></li>
</ul>
<p>and so on. Here, we’re using it to set up the outcome variable as a function of gene presence and then do two things:</p>
<ul>
<li><p><code>step_zv()</code> removes indicator variables that only contain a single unique value (e.g. all zeros). This is important because, for penalized models, the predictors should be centered and scaled, and a predictor with zero variance cannot be scaled (its standard deviation is zero).</p></li>
<li><p><code>step_normalize()</code> centers and scales numeric variables.</p></li>
</ul>
<div class="cell">
<div class="sourceCode cell-code" id="cb31"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" aria-hidden="true" tabindex="-1"></a>lr_recipe <span class="ot"><-</span> </span>
<span id="cb31-2"><a href="#cb31-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">recipe</span>(amr_pheno <span class="sc">~</span> ., <span class="at">data =</span> gpa_other) <span class="sc">%>%</span> <span class="co"># specify data + labels</span></span>
<span id="cb31-3"><a href="#cb31-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">update_role</span>(<span class="fu">c</span>(s_no, genome_id, assembly_accession, <span class="co"># genome attributes</span></span>
<span id="cb31-4"><a href="#cb31-4" aria-hidden="true" tabindex="-1"></a> antibiotic, drug_class), <span class="co"># drug attributes</span></span>
<span id="cb31-5"><a href="#cb31-5" aria-hidden="true" tabindex="-1"></a> <span class="at">new_role =</span> <span class="st">"Supplementary"</span>) <span class="sc">%>%</span> <span class="co"># tag metadata not used for ML</span></span>
<span id="cb31-6"><a href="#cb31-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_zv</span>(<span class="fu">all_predictors</span>()) <span class="sc">%>%</span> <span class="co"># remove predictors with only one value</span></span>
<span id="cb31-7"><a href="#cb31-7" aria-hidden="true" tabindex="-1"></a> <span class="co"># step_nzv(all_predictors()) # for near-zero variance</span></span>
<span id="cb31-8"><a href="#cb31-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_normalize</span>(<span class="fu">all_predictors</span>()) <span class="co"># normalize all predictors</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p><em>Try with <code>step_nzv</code> instead of only <code>step_zv</code>.</em></p>
</section>
<section id="create-the-workflow" class="level3">
<h3 class="anchored" data-anchor-id="create-the-workflow">Create the workflow</h3>
<p>Let’s bundle the model and recipe into a single <code>workflow()</code> object to make management of the R objects easier:</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb32"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb32-1"><a href="#cb32-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Standard model recipe for LR | uses our recipe definition from above</span></span>
<span id="cb32-2"><a href="#cb32-2" aria-hidden="true" tabindex="-1"></a>lr_workflow <span class="ot"><-</span> <span class="fu">workflow</span>() <span class="sc">%>%</span> </span>
<span id="cb32-3"><a href="#cb32-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_model</span>(lr_model) <span class="sc">%>%</span></span>
<span id="cb32-4"><a href="#cb32-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_recipe</span>(lr_recipe)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="create-the-grid-for-tuning" class="level3">
<h3 class="anchored" data-anchor-id="create-the-grid-for-tuning">Create the grid for tuning</h3>
<p>Before we fit this model, we need to set up a grid of <code>penalty</code> values to tune.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb33"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Try values from 0.0001 to 0.1 to penalize for complex models;</span></span>
<span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Minimizing no. of features with non-zero coefficients</span></span>
<span id="cb33-3"><a href="#cb33-3" aria-hidden="true" tabindex="-1"></a>lr_reg_grid <span class="ot"><-</span> <span class="fu">tibble</span>(<span class="at">penalty =</span> <span class="dv">10</span><span class="sc">^</span><span class="fu">seq</span>(<span class="sc">-</span><span class="dv">4</span>, <span class="sc">-</span><span class="dv">1</span>, <span class="at">length.out =</span> <span class="dv">10</span>))</span>
<span id="cb33-4"><a href="#cb33-4" aria-hidden="true" tabindex="-1"></a>lr_reg_grid</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 10 × 1
penalty
<dbl>
1 0.0001
2 0.000215
3 0.000464
4 0.001
5 0.00215
6 0.00464
7 0.01
8 0.0215
9 0.0464
10 0.1 </code></pre>
</div>
</div>
</section>
<section id="train-and-tune-the-model" class="level3">
<h3 class="anchored" data-anchor-id="train-and-tune-the-model">Train and tune the model</h3>
<p>The <code>tune::tune_grid()</code> function will help us train these 10 penalized logistic regression models and save the validation set predictions (via the call to <code>control_grid()</code>) so that diagnostic information will be available after fitting the model. To quantify how well the model performs (on the <em>validation set</em>), let’s first consider the <a href="https://en.wikipedia.org/wiki/Receiver_operating_characteristic">area under the ROC curve</a> across a range of hyperparameter values.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb35"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1" aria-hidden="true" tabindex="-1"></a>lr_res <span class="ot"><-</span> </span>
<span id="cb35-2"><a href="#cb35-2" aria-hidden="true" tabindex="-1"></a> lr_workflow <span class="sc">%>%</span> </span>
<span id="cb35-3"><a href="#cb35-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">tune_grid</span>(<span class="at">resamples =</span> gpa_val, <span class="co"># using validation split</span></span>
<span id="cb35-4"><a href="#cb35-4" aria-hidden="true" tabindex="-1"></a> <span class="at">grid =</span> lr_reg_grid,</span>
<span id="cb35-5"><a href="#cb35-5" aria-hidden="true" tabindex="-1"></a> <span class="at">control =</span> <span class="fu">control_grid</span>(<span class="at">save_pred =</span> <span class="cn">TRUE</span>),</span>
<span id="cb35-6"><a href="#cb35-6" aria-hidden="true" tabindex="-1"></a> <span class="at">metrics =</span> <span class="fu">metric_set</span>(roc_auc))</span>
<span id="cb35-7"><a href="#cb35-7" aria-hidden="true" tabindex="-1"></a><span class="co">#metrics = metric_set(pr_auc)) # if you want to optimize for AUPRC instead</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<section id="tune-the-model-with-cross-validation-instead" class="level4">
<h4 class="anchored" data-anchor-id="tune-the-model-with-cross-validation-instead">Tune the model with cross-validation instead?</h4>
<div class="cell">
<div class="sourceCode cell-code" id="cb36"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb36-1"><a href="#cb36-1" aria-hidden="true" tabindex="-1"></a>lr_res_cv <span class="ot"><-</span> </span>
<span id="cb36-2"><a href="#cb36-2" aria-hidden="true" tabindex="-1"></a> lr_workflow <span class="sc">%>%</span> </span>
<span id="cb36-3"><a href="#cb36-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">tune_grid</span>(<span class="at">resamples =</span> <span class="fu">vfold_cv</span>(gpa_other), <span class="co"># new CV line</span></span>
<span id="cb36-4"><a href="#cb36-4" aria-hidden="true" tabindex="-1"></a> <span class="at">grid =</span> lr_reg_grid,</span>
<span id="cb36-5"><a href="#cb36-5" aria-hidden="true" tabindex="-1"></a> <span class="at">control =</span> <span class="fu">control_grid</span>(<span class="at">save_pred =</span> <span class="cn">TRUE</span>),</span>
<span id="cb36-6"><a href="#cb36-6" aria-hidden="true" tabindex="-1"></a> <span class="at">metrics =</span> <span class="fu">metric_set</span>(roc_auc))</span>
<span id="cb36-7"><a href="#cb36-7" aria-hidden="true" tabindex="-1"></a><span class="co">#metrics = metric_set(pr_auc)) # if you want to optimize for AUPRC instead</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
</section>
</section>
<section id="evaluation-metrics" class="level2">
<h2 class="anchored" data-anchor-id="evaluation-metrics">Evaluation metrics</h2>
<p>A plot of the area under the ROC curve against the range of penalty values will help us guess which value is best for the problem/dataset at hand.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb37"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb37-1"><a href="#cb37-1" aria-hidden="true" tabindex="-1"></a>lr_plot <span class="ot"><-</span> lr_res <span class="sc">%>%</span> </span>
<span id="cb37-2"><a href="#cb37-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">collect_metrics</span>() <span class="sc">%>%</span> </span>
<span id="cb37-3"><a href="#cb37-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="at">x =</span> penalty, <span class="at">y =</span> mean)) <span class="sc">+</span> </span>
<span id="cb37-4"><a href="#cb37-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_point</span>() <span class="sc">+</span> </span>
<span id="cb37-5"><a href="#cb37-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_line</span>() <span class="sc">+</span> </span>
<span id="cb37-6"><a href="#cb37-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">ylab</span>(<span class="st">"Area under the ROC Curve"</span>) <span class="sc">+</span></span>
<span id="cb37-7"><a href="#cb37-7" aria-hidden="true" tabindex="-1"></a> <span class="co">#ylab("Area under the PR Curve") +</span></span>
<span id="cb37-8"><a href="#cb37-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_x_log10</span>(<span class="at">labels =</span> scales<span class="sc">::</span><span class="fu">label_number</span>()) <span class="sc">+</span></span>
<span id="cb37-9"><a href="#cb37-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_bw</span>()</span>
<span id="cb37-10"><a href="#cb37-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb37-11"><a href="#cb37-11" aria-hidden="true" tabindex="-1"></a>lr_plot</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
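<p><img src="index_files/figure-html/unnamed-chunk-19-1.png" alt="Line plot of validation-set area under the ROC curve against the penalty value on a log scale" class="img-fluid" width="672"></p>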
</div>
</div>
<p>What is your interpretation of this plot? Write it here.</p>
<p>We can also tabulate these results to help pick the “best” hyperparameter.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb38"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" aria-hidden="true" tabindex="-1"></a>top_models <span class="ot"><-</span></span>
<span id="cb38-2"><a href="#cb38-2" aria-hidden="true" tabindex="-1"></a> lr_res <span class="sc">%>%</span> </span>
<span id="cb38-3"><a href="#cb38-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">show_best</span>(<span class="st">"roc_auc"</span>, <span class="at">n =</span> <span class="dv">10</span>) <span class="sc">%>%</span> </span>
<span id="cb38-4"><a href="#cb38-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">arrange</span>(penalty) </span>
<span id="cb38-5"><a href="#cb38-5" aria-hidden="true" tabindex="-1"></a>top_models</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 10 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0001 roc_auc binary 0.996 1 NA Preprocessor1_Model01
2 0.000215 roc_auc binary 0.996 1 NA Preprocessor1_Model02
3 0.000464 roc_auc binary 0.996 1 NA Preprocessor1_Model03
4 0.001 roc_auc binary 0.996 1 NA Preprocessor1_Model04
5 0.00215 roc_auc binary 0.996 1 NA Preprocessor1_Model05
6 0.00464 roc_auc binary 0.997 1 NA Preprocessor1_Model06
7 0.01 roc_auc binary 0.997 1 NA Preprocessor1_Model07
8 0.0215 roc_auc binary 0.996 1 NA Preprocessor1_Model08
9 0.0464 roc_auc binary 0.996 1 NA Preprocessor1_Model09
10 0.1 roc_auc binary 0.979 1 NA Preprocessor1_Model10</code></pre>
</div>
</div>
<p>Let’s select the best value and visualize the validation set ROC curve. Why are we picking the 6<sup>th</sup> value instead of the 1<sup>st</sup> even though they have nearly identical performance metrics?</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb40"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb40-1"><a href="#cb40-1" aria-hidden="true" tabindex="-1"></a>lr_best <span class="ot"><-</span> lr_res <span class="sc">%>%</span> </span>
<span id="cb40-2"><a href="#cb40-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">collect_metrics</span>() <span class="sc">%>%</span> </span>
<span id="cb40-3"><a href="#cb40-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">arrange</span>(penalty, mean) <span class="sc">%>%</span> </span>
<span id="cb40-4"><a href="#cb40-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">slice</span>(<span class="dv">6</span>)</span>
<span id="cb40-5"><a href="#cb40-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb40-6"><a href="#cb40-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Alternatively, you can just use</span></span>
<span id="cb40-7"><a href="#cb40-7" aria-hidden="true" tabindex="-1"></a>lr_best <span class="ot"><-</span> lr_res <span class="sc">|></span> </span>
<span id="cb40-8"><a href="#cb40-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">select_best</span>(<span class="at">metric =</span> <span class="st">"roc_auc"</span>)</span>
<span id="cb40-9"><a href="#cb40-9" aria-hidden="true" tabindex="-1"></a>lr_best</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 0.00464 Preprocessor1_Model06</code></pre>
</div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb42"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb42-1"><a href="#cb42-1" aria-hidden="true" tabindex="-1"></a>lr_roc <span class="ot"><-</span> lr_res <span class="sc">%>%</span> </span>
<span id="cb42-2"><a href="#cb42-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">collect_predictions</span>(<span class="at">parameters =</span> lr_best) <span class="sc">%>%</span> </span>
<span id="cb42-3"><a href="#cb42-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">roc_curve</span>(amr_pheno, .pred_Resistant) <span class="sc">%>%</span> </span>
<span id="cb42-4"><a href="#cb42-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">model =</span> <span class="st">"Logistic Regression"</span>)</span>
<span id="cb42-5"><a href="#cb42-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb42-6"><a href="#cb42-6" aria-hidden="true" tabindex="-1"></a><span class="fu">autoplot</span>(lr_roc)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="index_files/figure-html/unnamed-chunk-22-1.png" alt="ROC curve for the validation-set predictions of the selected logistic regression model" class="img-fluid" width="672"></p>
</div>
<div class="sourceCode cell-code" id="cb43"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb43-1"><a href="#cb43-1" aria-hidden="true" tabindex="-1"></a><span class="do">## Alternatively ... </span></span>
<span id="cb43-2"><a href="#cb43-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Select the best LR model</span></span>
<span id="cb43-3"><a href="#cb43-3" aria-hidden="true" tabindex="-1"></a>final_lr_model <span class="ot"><-</span> <span class="fu">finalize_workflow</span>(lr_workflow, lr_best)</span>
<span id="cb43-4"><a href="#cb43-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Fit the data</span></span>
<span id="cb43-5"><a href="#cb43-5" aria-hidden="true" tabindex="-1"></a>lr_fit <span class="ot"><-</span> final_lr_model <span class="sc">%>%</span> <span class="fu">fit</span>(<span class="at">data =</span> gpa_other)</span>
<span id="cb43-6"><a href="#cb43-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Save predictions</span></span>
<span id="cb43-7"><a href="#cb43-7" aria-hidden="true" tabindex="-1"></a>lr_aug <span class="ot"><-</span> <span class="fu">augment</span>(lr_fit, gpa_test)</span>
<span id="cb43-8"><a href="#cb43-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Calculate AUROC</span></span>
<span id="cb43-9"><a href="#cb43-9" aria-hidden="true" tabindex="-1"></a>auroc <span class="ot"><-</span> lr_aug <span class="sc">%>%</span> <span class="fu">roc_auc</span>(<span class="at">truth =</span> amr_pheno, .pred_Resistant) <span class="sc">%>%</span></span>
<span id="cb43-10"><a href="#cb43-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">select</span>(.estimate) <span class="sc">%>%</span> <span class="fu">as.numeric</span>()</span>
<span id="cb43-11"><a href="#cb43-11" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(<span class="fu">paste</span>(<span class="st">"AUROC:"</span>, auroc))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "AUROC: 0.993313298271975"</code></pre>
</div>
</div>
<p>The area under the ROC curve has a nice property that it can be interpreted as a probability and has a close connection to a statistical test (<a href="https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">the Mann-Whitney U test</a>).</p>
<section id="selecting-the-top-features" class="level3">
<h3 class="anchored" data-anchor-id="selecting-the-top-features">Selecting the top features</h3>
<div class="cell">
<div class="sourceCode cell-code" id="cb45"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb45-1"><a href="#cb45-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Extract top 10 genes</span></span>
<span id="cb45-2"><a href="#cb45-2" aria-hidden="true" tabindex="-1"></a>n_top_genes <span class="ot"><-</span> <span class="dv">10</span></span>
<span id="cb45-3"><a href="#cb45-3" aria-hidden="true" tabindex="-1"></a>top_genes <span class="ot"><-</span> lr_fit <span class="sc">%>%</span> <span class="fu">extract_fit_parsnip</span>() <span class="sc">%>%</span></span>
<span id="cb45-4"><a href="#cb45-4" aria-hidden="true" tabindex="-1"></a> vip<span class="sc">::</span><span class="fu">vi</span>() <span class="sc">%>%</span> <span class="fu">slice</span>(<span class="dv">1</span><span class="sc">:</span>n_top_genes) <span class="sc">%>%</span></span>
<span id="cb45-5"><a href="#cb45-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">select</span>(<span class="dv">1</span>) <span class="sc">%>%</span> <span class="fu">pull</span>()</span>
<span id="cb45-6"><a href="#cb45-6" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(top_genes)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> [1] "group_7632" "group_7634" "blaZ" "blaR1" "ugpQ"
[6] "blaR1-2" "bin3" "group_11618" "cadC" "group_5831" </code></pre>
</div>
</div>
</section>
<section id="when-you-have-imbalanced-classes" class="level3">
<h3 class="anchored" data-anchor-id="when-you-have-imbalanced-classes">When you have imbalanced classes</h3>
<p>However, the area under the ROC curve is not sensitive to class imbalances: it can come out to be high even if the model is making many mistakes in the minor positive class — which is typically of biomedical interest — as long as it gets most of the major negative class correct.</p>
<p>So, the final analysis we’re going to do is to evaluate performance based on another metric called <a href="https://en.wikipedia.org/wiki/Precision_and_recall">area under the Precision-Recall curve</a> that is more sensitive to the minor positive class by focusing on the fraction of top positive predictions that are correct (precision) and the fraction of positive samples that are correctly predicted (recall).</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb47"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb47-1"><a href="#cb47-1" aria-hidden="true" tabindex="-1"></a>lr_res_pr <span class="ot"><-</span> lr_workflow <span class="sc">%>%</span> </span>
<span id="cb47-2"><a href="#cb47-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">tune_grid</span>(<span class="at">resamples =</span> gpa_val,</span>
<span id="cb47-3"><a href="#cb47-3" aria-hidden="true" tabindex="-1"></a> <span class="at">grid =</span> lr_reg_grid,</span>
<span id="cb47-4"><a href="#cb47-4" aria-hidden="true" tabindex="-1"></a> <span class="at">control =</span> <span class="fu">control_grid</span>(<span class="at">save_pred =</span> <span class="cn">TRUE</span>),</span>
<span id="cb47-5"><a href="#cb47-5" aria-hidden="true" tabindex="-1"></a> <span class="at">metrics =</span> <span class="fu">metric_set</span>(pr_auc))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb48"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb48-1"><a href="#cb48-1" aria-hidden="true" tabindex="-1"></a>lr_plot_pr <span class="ot"><-</span> lr_res_pr <span class="sc">%>%</span> </span>
<span id="cb48-2"><a href="#cb48-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">collect_metrics</span>() <span class="sc">%>%</span> </span>
<span id="cb48-3"><a href="#cb48-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">ggplot</span>(<span class="fu">aes</span>(<span class="at">x =</span> penalty, <span class="at">y =</span> mean)) <span class="sc">+</span> </span>
<span id="cb48-4"><a href="#cb48-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_point</span>() <span class="sc">+</span> </span>
<span id="cb48-5"><a href="#cb48-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">geom_line</span>() <span class="sc">+</span> </span>
<span id="cb48-6"><a href="#cb48-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">ylab</span>(<span class="st">"Area under the PR Curve"</span>) <span class="sc">+</span></span>
<span id="cb48-7"><a href="#cb48-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">scale_x_log10</span>(<span class="at">labels =</span> scales<span class="sc">::</span><span class="fu">label_number</span>()) <span class="sc">+</span></span>
<span id="cb48-8"><a href="#cb48-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_bw</span>()</span>
<span id="cb48-9"><a href="#cb48-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb48-10"><a href="#cb48-10" aria-hidden="true" tabindex="-1"></a>lr_plot</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="index_files/figure-html/unnamed-chunk-25-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb49"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb49-1"><a href="#cb49-1" aria-hidden="true" tabindex="-1"></a>lr_best_pr <span class="ot"><-</span> lr_res_pr <span class="sc">%>%</span> </span>
<span id="cb49-2"><a href="#cb49-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">collect_metrics</span>() <span class="sc">%>%</span> </span>
<span id="cb49-3"><a href="#cb49-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">arrange</span>(penalty) <span class="sc">%>%</span> </span>
<span id="cb49-4"><a href="#cb49-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">slice</span>(<span class="dv">6</span>)</span>
<span id="cb49-5"><a href="#cb49-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb49-6"><a href="#cb49-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Alternatively, you can just use</span></span>
<span id="cb49-7"><a href="#cb49-7" aria-hidden="true" tabindex="-1"></a>lr_best_pr <span class="ot"><-</span> lr_res_pr <span class="sc">|></span> </span>
<span id="cb49-8"><a href="#cb49-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">select_best</span>(<span class="at">metric =</span> <span class="st">"pr_auc"</span>)</span>
<span id="cb49-9"><a href="#cb49-9" aria-hidden="true" tabindex="-1"></a>lr_best</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 0.00464 Preprocessor1_Model06</code></pre>
</div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb51"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb51-1"><a href="#cb51-1" aria-hidden="true" tabindex="-1"></a>lr_pr <span class="ot"><-</span> lr_res_pr <span class="sc">%>%</span> </span>
<span id="cb51-2"><a href="#cb51-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">collect_predictions</span>(<span class="at">parameters =</span> lr_best) <span class="sc">%>%</span> </span>
<span id="cb51-3"><a href="#cb51-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">pr_curve</span>(amr_pheno, .pred_Resistant) <span class="sc">%>%</span> </span>
<span id="cb51-4"><a href="#cb51-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">model =</span> <span class="st">"Logistic Regression"</span>)</span>
<span id="cb51-5"><a href="#cb51-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb51-6"><a href="#cb51-6" aria-hidden="true" tabindex="-1"></a><span class="fu">autoplot</span>(lr_pr)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="index_files/figure-html/unnamed-chunk-27-1.png" class="img-fluid" width="672" alt="Precision-recall curve for the logistic regression model"></p>
</div>
</div>
<section id="retrieving-your-top-features" class="level4">
<h4 class="anchored" data-anchor-id="retrieving-your-top-features">Retrieving your top features</h4>
<div class="cell">
<div class="sourceCode cell-code" id="cb52"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb52-1"><a href="#cb52-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Select best LR model</span></span>
<span id="cb52-2"><a href="#cb52-2" aria-hidden="true" tabindex="-1"></a>best_lr_model_pr <span class="ot"><-</span> <span class="fu">select_best</span>(lr_res_pr, <span class="st">"pr_auc"</span>)</span>
<span id="cb52-3"><a href="#cb52-3" aria-hidden="true" tabindex="-1"></a>final_lr_model_pr <span class="ot"><-</span> <span class="fu">finalize_workflow</span>(lr_workflow, best_lr_model_pr)</span>
<span id="cb52-4"><a href="#cb52-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb52-5"><a href="#cb52-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Fit the data</span></span>
<span id="cb52-6"><a href="#cb52-6" aria-hidden="true" tabindex="-1"></a>lr_fit_pr <span class="ot"><-</span> final_lr_model_pr <span class="sc">%>%</span></span>
<span id="cb52-7"><a href="#cb52-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">fit</span>(<span class="at">data =</span> gpa_other)</span>
<span id="cb52-8"><a href="#cb52-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb52-9"><a href="#cb52-9" aria-hidden="true" tabindex="-1"></a><span class="co"># Save predictions</span></span>
<span id="cb52-10"><a href="#cb52-10" aria-hidden="true" tabindex="-1"></a>lr_aug_pr <span class="ot"><-</span> <span class="fu">augment</span>(lr_fit_pr, gpa_test)</span>
<span id="cb52-11"><a href="#cb52-11" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb52-12"><a href="#cb52-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Get AUPRC</span></span>
<span id="cb52-13"><a href="#cb52-13" aria-hidden="true" tabindex="-1"></a>auprc <span class="ot"><-</span> lr_aug_pr <span class="sc">%>%</span></span>
<span id="cb52-14"><a href="#cb52-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">pr_auc</span>(<span class="at">truth =</span> amr_pheno, .pred_Resistant) <span class="sc">%>%</span></span>
<span id="cb52-15"><a href="#cb52-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">select</span>(.estimate) <span class="sc">%>%</span> <span class="fu">as.numeric</span>()</span>
<span id="cb52-16"><a href="#cb52-16" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(<span class="fu">paste</span>(<span class="st">"AUPRC:"</span>, auprc))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "AUPRC: 0.99443515870738"</code></pre>
</div>
<div class="sourceCode cell-code" id="cb54"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb54-1"><a href="#cb54-1" aria-hidden="true" tabindex="-1"></a><span class="do">## Extract top 10 genes</span></span>
<span id="cb54-2"><a href="#cb54-2" aria-hidden="true" tabindex="-1"></a>n_top_genes <span class="ot"><-</span> <span class="dv">10</span></span>
<span id="cb54-3"><a href="#cb54-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb54-4"><a href="#cb54-4" aria-hidden="true" tabindex="-1"></a>top_genes_pr <span class="ot"><-</span> lr_fit_pr <span class="sc">|></span> </span>
<span id="cb54-5"><a href="#cb54-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">extract_fit_parsnip</span>() <span class="sc">|></span></span>
<span id="cb54-6"><a href="#cb54-6" aria-hidden="true" tabindex="-1"></a> vip<span class="sc">::</span><span class="fu">vi</span>() <span class="sc">|></span></span>
<span id="cb54-7"><a href="#cb54-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">slice</span>(<span class="dv">1</span><span class="sc">:</span>n_top_genes) <span class="sc">|></span></span>
<span id="cb54-8"><a href="#cb54-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">select</span>(<span class="dv">1</span>) <span class="sc">|></span></span>
<span id="cb54-9"><a href="#cb54-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">pull</span>()</span>
<span id="cb54-10"><a href="#cb54-10" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(top_genes_pr)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> [1] "group_7632" "group_7634" "blaZ" "blaR1" "ugpQ"
[6] "blaR1-2" "bin3" "group_11618" "cadC" "group_5831" </code></pre>
</div>
</div>
</section>
</section>
</section>
<section id="predicting-ar-w-random-forest" class="level2">
<h2 class="anchored" data-anchor-id="predicting-ar-w-random-forest">Predicting AR w/ Random Forest</h2>
<div class="cell">
<div class="sourceCode cell-code" id="cb56"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb56-1"><a href="#cb56-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Setting a seed enables our analysis to be reproducible when random numbers are used.</span></span>
<span id="cb56-2"><a href="#cb56-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">set.seed</span>(<span class="dv">569</span>)</span>
<span id="cb56-3"><a href="#cb56-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb56-4"><a href="#cb56-4" aria-hidden="true" tabindex="-1"></a> rf_splits <span class="ot"><-</span> <span class="fu">initial_split</span>(gpa_featmat_pheno,</span>
<span id="cb56-5"><a href="#cb56-5" aria-hidden="true" tabindex="-1"></a> <span class="co">#prop = train_test_split,</span></span>
<span id="cb56-6"><a href="#cb56-6" aria-hidden="true" tabindex="-1"></a> <span class="at">strata =</span> amr_pheno)</span>
<span id="cb56-7"><a href="#cb56-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb56-8"><a href="#cb56-8" aria-hidden="true" tabindex="-1"></a> <span class="co">#Create separate data frames for the training and testing sets.</span></span>
<span id="cb56-9"><a href="#cb56-9" aria-hidden="true" tabindex="-1"></a> gpa_train <span class="ot"><-</span> <span class="fu">training</span>(rf_splits)</span>
<span id="cb56-10"><a href="#cb56-10" aria-hidden="true" tabindex="-1"></a> gpa_test <span class="ot"><-</span> <span class="fu">testing</span>(rf_splits)</span>
<span id="cb56-11"><a href="#cb56-11" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb56-12"><a href="#cb56-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">set.seed</span>(<span class="dv">234</span>)</span>
<span id="cb56-13"><a href="#cb56-13" aria-hidden="true" tabindex="-1"></a> gpa_val <span class="ot"><-</span> <span class="fu">validation_split</span>(<span class="at">data =</span> gpa_train,</span>
<span id="cb56-14"><a href="#cb56-14" aria-hidden="true" tabindex="-1"></a> <span class="at">strata =</span> amr_pheno, <span class="co"># maintain original data split</span></span>
<span id="cb56-15"><a href="#cb56-15" aria-hidden="true" tabindex="-1"></a> <span class="at">prop =</span> <span class="fl">0.80</span>) <span class="co"># 80% training; 20% validation</span></span>
<span id="cb56-16"><a href="#cb56-16" aria-hidden="true" tabindex="-1"></a> gpa_val</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># Validation Set Split (0.8/0.2) using stratification
# A tibble: 1 × 2
splits id
<list> <chr>
1 <split [551/138]> validation</code></pre>
</div>
<div class="sourceCode cell-code" id="cb58"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb58-1"><a href="#cb58-1" aria-hidden="true" tabindex="-1"></a> <span class="co">#Create recipe</span></span>
<span id="cb58-2"><a href="#cb58-2" aria-hidden="true" tabindex="-1"></a> rf_recipe <span class="ot"><-</span> <span class="fu">recipe</span>(amr_pheno <span class="sc">~</span> ., <span class="at">data =</span> gpa_train) <span class="sc">%>%</span></span>
<span id="cb58-3"><a href="#cb58-3" aria-hidden="true" tabindex="-1"></a> <span class="co"># To keep these columns but not use them as predictors or outcome</span></span>
<span id="cb58-4"><a href="#cb58-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">update_role</span>(<span class="fu">c</span>(s_no, genome_id, assembly_accession, <span class="co"># genome attributes</span></span>
<span id="cb58-5"><a href="#cb58-5" aria-hidden="true" tabindex="-1"></a> antibiotic, drug_class), <span class="co"># drug attributes</span></span>
<span id="cb58-6"><a href="#cb58-6" aria-hidden="true" tabindex="-1"></a> <span class="at">new_role =</span> <span class="st">"Supplementary"</span>) <span class="sc">%>%</span></span>
<span id="cb58-7"><a href="#cb58-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_zv</span>(<span class="fu">all_predictors</span>()) <span class="sc">%>%</span> <span class="co"># remove predictors with only one value</span></span>
<span id="cb58-8"><a href="#cb58-8" aria-hidden="true" tabindex="-1"></a> <span class="co"># step_nzv(all_predictors()) # for near-zero variance</span></span>
<span id="cb58-9"><a href="#cb58-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_normalize</span>(<span class="fu">all_predictors</span>()) <span class="co"># normalize all predictors</span></span>
<span id="cb58-10"><a href="#cb58-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb58-11"><a href="#cb58-11" aria-hidden="true" tabindex="-1"></a> <span class="co"># Build random forest model</span></span>
<span id="cb58-12"><a href="#cb58-12" aria-hidden="true" tabindex="-1"></a> num_trees <span class="ot"><-</span> <span class="dv">1000</span></span>
<span id="cb58-13"><a href="#cb58-13" aria-hidden="true" tabindex="-1"></a> rf_model <span class="ot"><-</span> <span class="fu">rand_forest</span>(<span class="at">trees =</span> num_trees) <span class="sc">%>%</span></span>
<span id="cb58-14"><a href="#cb58-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">set_engine</span>(<span class="st">"ranger"</span>, <span class="at">importance =</span> <span class="st">"impurity"</span>) <span class="sc">%>%</span></span>
<span id="cb58-15"><a href="#cb58-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">set_mode</span>(<span class="st">"classification"</span>)</span>
<span id="cb58-16"><a href="#cb58-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb58-17"><a href="#cb58-17" aria-hidden="true" tabindex="-1"></a> <span class="co"># Create workflow</span></span>
<span id="cb58-18"><a href="#cb58-18" aria-hidden="true" tabindex="-1"></a> rf_workflow <span class="ot"><-</span> <span class="fu">workflow</span>() <span class="sc">%>%</span></span>
<span id="cb58-19"><a href="#cb58-19" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_model</span>(rf_model) <span class="sc">%>%</span></span>
<span id="cb58-20"><a href="#cb58-20" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_recipe</span>(rf_recipe)</span>
<span id="cb58-21"><a href="#cb58-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb58-22"><a href="#cb58-22" aria-hidden="true" tabindex="-1"></a> <span class="co"># Specify the hyperparameter tuning grid</span></span>
<span id="cb58-23"><a href="#cb58-23" aria-hidden="true" tabindex="-1"></a> rf_grid <span class="ot"><-</span> <span class="fu">tibble</span>(<span class="at">mtry =</span> <span class="fu">c</span>(<span class="fl">0.002</span>, <span class="fl">0.02</span>, <span class="fl">0.2</span>), <span class="at">min_n =</span> <span class="fu">c</span>(<span class="dv">2</span>, <span class="dv">6</span>, <span class="dv">12</span>))</span>
<span id="cb58-24"><a href="#cb58-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb58-25"><a href="#cb58-25" aria-hidden="true" tabindex="-1"></a> <span class="co"># Tune the model using cross-validation;</span></span>
<span id="cb58-26"><a href="#cb58-26" aria-hidden="true" tabindex="-1"></a> <span class="co"># try each hyperparameter set in rf_grid; use AUROC as evaluation metric.</span></span>
<span id="cb58-27"><a href="#cb58-27" aria-hidden="true" tabindex="-1"></a> rf_res <span class="ot"><-</span> <span class="fu">tune_grid</span>(rf_workflow,</span>
<span id="cb58-28"><a href="#cb58-28" aria-hidden="true" tabindex="-1"></a> <span class="at">resamples =</span> <span class="fu">vfold_cv</span>(gpa_train),</span>
<span id="cb58-29"><a href="#cb58-29" aria-hidden="true" tabindex="-1"></a> <span class="at">grid =</span> rf_grid,</span>
<span id="cb58-30"><a href="#cb58-30" aria-hidden="true" tabindex="-1"></a> <span class="at">control =</span> <span class="fu">control_grid</span>(<span class="at">save_pred =</span> T),</span>
<span id="cb58-31"><a href="#cb58-31" aria-hidden="true" tabindex="-1"></a> <span class="at">metrics =</span> <span class="fu">metric_set</span>(roc_auc))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stderr">
<pre><code>Warning: No tuning parameters have been detected, performance will be evaluated
using the resamples with no tuning. Did you want to [tune()] parameters?</code></pre>
</div>
<div class="sourceCode cell-code" id="cb60"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb60-1"><a href="#cb60-1" aria-hidden="true" tabindex="-1"></a> rf_best <span class="ot"><-</span> rf_res <span class="sc">|></span> </span>
<span id="cb60-2"><a href="#cb60-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">select_best</span>(<span class="at">metric =</span> <span class="st">"roc_auc"</span>)</span>
<span id="cb60-3"><a href="#cb60-3" aria-hidden="true" tabindex="-1"></a> </span>
<span id="cb60-4"><a href="#cb60-4" aria-hidden="true" tabindex="-1"></a> <span class="co"># Plot AUROC</span></span>
<span id="cb60-5"><a href="#cb60-5" aria-hidden="true" tabindex="-1"></a> rf_roc <span class="ot"><-</span> rf_res <span class="sc">%>%</span> </span>
<span id="cb60-6"><a href="#cb60-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">collect_predictions</span>(<span class="at">parameters =</span> rf_best) <span class="sc">%>%</span> </span>
<span id="cb60-7"><a href="#cb60-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">roc_curve</span>(amr_pheno, .pred_Resistant) <span class="sc">%>%</span> </span>
<span id="cb60-8"><a href="#cb60-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">model =</span> <span class="st">"Logistic Regression"</span>)</span>
<span id="cb60-9"><a href="#cb60-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">autoplot</span>(lr_roc)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="index_files/figure-html/unnamed-chunk-29-1.png" class="img-fluid" width="672"></p>
</div>
<div class="sourceCode cell-code" id="cb61"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb61-1"><a href="#cb61-1" aria-hidden="true" tabindex="-1"></a> <span class="co"># Select best RF model</span></span>
<span id="cb61-2"><a href="#cb61-2" aria-hidden="true" tabindex="-1"></a> best_rf_model <span class="ot"><-</span> <span class="fu">select_best</span>(rf_res, <span class="st">"roc_auc"</span>)</span>
<span id="cb61-3"><a href="#cb61-3" aria-hidden="true" tabindex="-1"></a> final_rf_model <span class="ot"><-</span> <span class="fu">finalize_workflow</span>(rf_workflow, best_rf_model)</span>
<span id="cb61-4"><a href="#cb61-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb61-5"><a href="#cb61-5" aria-hidden="true" tabindex="-1"></a> <span class="co"># Fit the data</span></span>
<span id="cb61-6"><a href="#cb61-6" aria-hidden="true" tabindex="-1"></a> rf_fit <span class="ot"><-</span> final_rf_model <span class="sc">%>%</span> <span class="fu">fit</span>(<span class="at">data =</span> gpa_train)</span>
<span id="cb61-7"><a href="#cb61-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb61-8"><a href="#cb61-8" aria-hidden="true" tabindex="-1"></a> <span class="co"># Save predictions</span></span>
<span id="cb61-9"><a href="#cb61-9" aria-hidden="true" tabindex="-1"></a> rf_aug <span class="ot"><-</span> <span class="fu">augment</span>(rf_fit, gpa_test)</span>
<span id="cb61-10"><a href="#cb61-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb61-11"><a href="#cb61-11" aria-hidden="true" tabindex="-1"></a> <span class="co"># Get AUROC</span></span>
<span id="cb61-12"><a href="#cb61-12" aria-hidden="true" tabindex="-1"></a> auroc <span class="ot"><-</span> rf_aug <span class="sc">%>%</span></span>
<span id="cb61-13"><a href="#cb61-13" aria-hidden="true" tabindex="-1"></a> <span class="fu">roc_auc</span>(<span class="at">truth =</span> amr_pheno, .pred_Resistant) <span class="sc">%>%</span></span>
<span id="cb61-14"><a href="#cb61-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">select</span>(.estimate) <span class="sc">%>%</span></span>
<span id="cb61-15"><a href="#cb61-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">as.numeric</span>()</span>
<span id="cb61-16"><a href="#cb61-16" aria-hidden="true" tabindex="-1"></a> <span class="fu">print</span>(<span class="fu">paste</span>(<span class="st">"AUROC:"</span>, auroc))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code>[1] "AUROC: 0.989857250187829"</code></pre>
</div>
<div class="sourceCode cell-code" id="cb63"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb63-1"><a href="#cb63-1" aria-hidden="true" tabindex="-1"></a> <span class="co"># Extract top 10 genes</span></span>
<span id="cb63-2"><a href="#cb63-2" aria-hidden="true" tabindex="-1"></a> n_top_genes <span class="ot"><-</span> <span class="dv">10</span></span>
<span id="cb63-3"><a href="#cb63-3" aria-hidden="true" tabindex="-1"></a> top_genes_rf <span class="ot"><-</span> rf_fit <span class="sc">%>%</span></span>
<span id="cb63-4"><a href="#cb63-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">extract_fit_parsnip</span>() <span class="sc">%>%</span></span>
<span id="cb63-5"><a href="#cb63-5" aria-hidden="true" tabindex="-1"></a> vip<span class="sc">::</span><span class="fu">vi</span>() <span class="sc">%>%</span> <span class="fu">slice</span>(<span class="dv">1</span><span class="sc">:</span>n_top_genes) <span class="sc">%>%</span></span>
<span id="cb63-6"><a href="#cb63-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">select</span>(<span class="dv">1</span>) <span class="sc">%>%</span> <span class="fu">pull</span>()</span>
<span id="cb63-7"><a href="#cb63-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">print</span>(top_genes)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code> [1] "group_7632" "group_7634" "blaZ" "blaR1" "ugpQ"
[6] "blaR1-2" "bin3" "group_11618" "cadC" "group_5831" </code></pre>
</div>
</div>
</section>
<section id="too-many-features" class="level2">
<h2 class="anchored" data-anchor-id="too-many-features">Too many features?</h2>
<p>Try dimensionality reduction with SVD → retrieve top PCs → find contributing features to the top PCs.</p>
</section>
<section id="recap-conclusions" class="level2">
<h2 class="anchored" data-anchor-id="recap-conclusions">Recap & Conclusions</h2>
<ul class="task-list">
<li><p><input type="checkbox" disabled="" checked="">Reproducible docs & code with <code>qmd</code>/<code>rmd</code></p></li>
<li><p><input type="checkbox" disabled="" checked="">basic data cleanup to get it ready for ML models</p></li>
<li><p><input type="checkbox" disabled="" checked="">tidymodels</p></li>
<li><p><input type="checkbox" disabled="" checked="">building recipes and workflows</p></li>
<li><p><input type="checkbox" disabled="" checked="">calculating AUROC and AUPRC</p></li>
<li><p><input type="checkbox" disabled="" checked="">train-validate-test splits to optimize for best hyperparameters</p></li>
<li><p><input type="checkbox" disabled="" checked="">picking the best models based on low penalty and high AUROC/AUPRC</p></li>
<li><p><input type="checkbox" disabled="" checked="">plotting AUROC/AUPRC</p></li>
<li><p><input type="checkbox" disabled="" checked="">Logistic regression with L1 (lasso) regularization (and L2)</p></li>
<li><p><input type="checkbox" disabled="" checked="">Random Forest models</p></li>
</ul>
</section>
<section id="how-to-contact-us" class="level2">
<h2 class="anchored" data-anchor-id="how-to-contact-us">How to contact us</h2>
<ul>
<li><p>Website: <a href="https://jravilab.github.io" class="uri">https://jravilab.github.io</a></p></li>
<li><p>Twitter: @jravilab @janani137</p></li>
<li><p>Email: janani DOT ravi AT cuanschutz DOT edu</p></li>
<li><p>Rendered material: <a href="https://jananiravi.github.io/2023-mlhd" class="uri">https://jananiravi.github.io/2023-mlhd</a></p></li>
</ul>
</section>
</section>
</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
// Mirror a stylesheet's colour mode onto <body> as a `quarto-dark` /
// `quarto-light` class pair.
//   bsSheetEl: element whose `data-mode` attribute names the active mode;
//              any value other than "dark" is treated as light.
const toggleBodyColorMode = (bsSheetEl) => {
  const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
  const { classList } = window.document.querySelector("body");
  // Add the class for the active mode, then drop the opposite one.
  classList.add(isDark ? "quarto-dark" : "quarto-light");
  classList.remove(isDark ? "quarto-light" : "quarto-dark");
}
// Apply the body colour-mode classes based on the `link#quarto-bootstrap`
// stylesheet element; does nothing when that element is absent.
const toggleBodyColorPrimary = () => {
  const primarySheet = window.document.querySelector("link#quarto-bootstrap");
  if (!primarySheet) {
    return;
  }
  toggleBodyColorMode(primarySheet);
}
// Sync the body colour-mode classes once the handler runs.
toggleBodyColorPrimary();
// Attach AnchorJS links to every `.anchored` element, placed on the right.
// NOTE(review): the icon string is empty here — confirm whether an icon-font
// glyph was intended and got lost.
const icon = "";
const anchorJS = new window.AnchorJS();
anchorJS.options = { placement: 'right', icon };
anchorJS.add('.anchored');
// Copy-to-clipboard support: each `.code-copy-button` copies the element that
// immediately precedes it in the DOM.
const clipboard = new window.ClipboardJS('.code-copy-button', {
  target: (trigger) => trigger.previousElementSibling
});
// On a successful copy, show brief "Copied!" feedback, then restore the button.
clipboard.on('success', function(e) {
  const button = e.trigger;
  // Drop focus so the button does not stay highlighted.
  button.blur();
  // Flash the "checked" state and swap the title, remembering the old one.
  button.classList.add('code-copy-button-checked');
  const currentTitle = button.getAttribute("title");
  button.setAttribute("title", "Copied!");
  // The tooltip is only created when `window.bootstrap` is available.
  const tooltipAttrs = [
    ["data-bs-toggle", "tooltip"],
    ["data-bs-placement", "left"],
    ["data-bs-title", "Copied!"]
  ];
  let tooltip;
  if (window.bootstrap) {
    tooltipAttrs.forEach(([name, value]) => button.setAttribute(name, value));
    tooltip = new bootstrap.Tooltip(button, {
      trigger: "manual",
      customClass: "code-copy-button-tooltip",
      offset: [0, -8]
    });
    tooltip.show();
  }
  // After one second, undo all of the temporary feedback.
  setTimeout(function() {
    if (tooltip) {
      tooltip.hide();
      ["data-bs-title", "data-bs-toggle", "data-bs-placement"]
        .forEach((name) => button.removeAttribute(name));
    }
    button.setAttribute("title", currentTitle);
    button.classList.remove('code-copy-button-checked');
  }, 1000);
  // Clear the code selection.
  e.clearSelection();
});
// Attach a Tippy tooltip to `el`.
//   el:        element that triggers the tooltip.
//   contentFn: handed to Tippy as its `content` option.
function tippyHover(el, contentFn) {
  window.tippy(el, {
    allowHTML: true,
    content: contentFn,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // Mount the tooltip inside the parent of the element Tippy passes in.
    appendTo: (reference) => reference.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start'
  });
}
// Show each footnote's content in a hover tooltip on its reference link.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (const ref of noterefs) {
  tippyHover(ref, function() {
    // Prefer the explicit data attribute, falling back to the link's href.
    let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
    if (!href) {
      // The link records no target, so there is nothing to show.
      return "";
    }
    // When the href parses as a URL, keep only its fragment; otherwise
    // `new URL` throws and the href is used unchanged.
    try { href = new URL(href).hash; } catch {}
    const id = href.replace(/^#\/?/, "");
    const note = window.document.getElementById(id);
    // getElementById returns null for an unknown id; show an empty tooltip
    // rather than throwing when the footnote is missing.
    return note ? note.innerHTML : "";
  });
}
// Walk up from `el` until an element whose parent carries `data-cites` is found.
// Returns `{ el, cites }`, where `el` is the element directly below that parent
// and `cites` is the attribute value split on single spaces, or `undefined`
// when no ancestor has a non-empty `data-cites`.
const findCites = (el) => {
  let node = el;
  for (;;) {
    const parentEl = node.parentElement;
    if (!parentEl) {
      return undefined;
    }
    const cites = parentEl.dataset.cites;
    if (cites) {
      return { el: node, cites: cites.split(' ') };
    }
    node = parentEl;
  }
};
// Give every bibliography reference link a hover tooltip listing the entries
// named by the nearest ancestor `data-cites` attribute.
const bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
bibliorefs.forEach((ref) => {
  const citeInfo = findCites(ref);
  if (!citeInfo) {
    return;
  }
  tippyHover(citeInfo.el, () => {
    // Build one entry per cite key; keys without a matching `ref-<key>`
    // element still get an (empty) entry.
    const popup = window.document.createElement('div');
    for (const cite of citeInfo.cites) {
      const citeDiv = window.document.createElement('div');
      citeDiv.classList.add('hanging-indent', 'csl-entry');
      const biblioDiv = window.document.getElementById('ref-' + cite);
      if (biblioDiv) {
        citeDiv.innerHTML = biblioDiv.innerHTML;
      }
      popup.appendChild(citeDiv);
    }
    return popup.innerHTML;
  });
});
});
</script>
</div> <!-- /content -->
</body></html>