From b3be6999be6ad546a958e52d9d53233c2fd6683c Mon Sep 17 00:00:00 2001 From: debbieargueta3 <143042167+debbieargueta3@users.noreply.github.com> Date: Mon, 2 Oct 2023 19:53:19 -0700 Subject: [PATCH] Add files via upload --- lab6.html | 3515 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 3515 insertions(+) create mode 100644 lab6.html diff --git a/lab6.html b/lab6.html new file mode 100644 index 0000000..a8c0346 --- /dev/null +++ b/lab6.html @@ -0,0 +1,3515 @@ + + + + + + + + + +lab 6 + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+

lab 6

+
+ + + +
+ + + + +
+ + +
+ +
+

Set Up

+
+
library(data.table)
+library(dplyr)
+
+

+Attaching package: 'dplyr'
+
+
+
The following objects are masked from 'package:data.table':
+
+    between, first, last
+
+
+
The following objects are masked from 'package:stats':
+
+    filter, lag
+
+
+
The following objects are masked from 'package:base':
+
+    intersect, setdiff, setequal, union
+
+
library(ggplot2)
+library(tidytext)
+
+# Fetch the MTSamples transcriptions once; later renders reuse the local file.
+mtsamples_url <- paste0(
+  "https://raw.githubusercontent.com/USCbiostats/data-science-data/",
+  "master/00_mtsamples/mtsamples.csv"
+)
+mtsamples_path <- "mtsamples.csv"
+
+if (!file.exists(mtsamples_path)) {
+  # download.file() has no `timeout` argument (unknown arguments are ignored);
+  # the limit comes from the `timeout` option, so raise it there and restore
+  # the previous value whether or not the download succeeds.
+  old_opts <- options(timeout = max(60, getOption("timeout")))
+  tryCatch(
+    download.file(mtsamples_url, destfile = mtsamples_path, method = "libcurl"),
+    finally = options(old_opts)
+  )
+}
+
+mtsamples <- data.table::fread(mtsamples_path)
+
+# Rename the columns in place; setnames() is the data.table way to do this.
+data.table::setnames(
+  mtsamples,
+  c(
+    "case_number", "description", "medical_specialty",
+    "sample_name", "transcription", "keywords"
+  )
+)
+
+# NOTE(review): this drops the first row of the table. Confirm that the row is
+# not a real record (e.g. a header line read as data) before keeping this.
+mtsamples <- mtsamples[-1L, ]
+
+
+
+

Question 1: What specialties do we have?

+

We can use count() from dplyr to figure out how many different categories we have. Are these categories related? Overlapping? Evenly distributed?

+
    +
  • In this dataset, you’ll find a diverse range of 40 different medical specialties. These specialties are all connected within the broader field of medicine. Some of them might center around the same body systems or organs. It’s interesting to note that there’s no overlap between these specialties. When we look at the cases in the dataset, surgery stands out with the most cases, followed by consult and cardiovascular/pulmonary specialties.
  • +
+
+
# One row per medical specialty with the number of transcriptions it has;
+# dim() then reports how many specialties (rows) were found.
+medical_categories <- count(mtsamples, medical_specialty)
+dim(medical_categories)
+
+
[1] 40  2
+
+
+
+
+

Question 2

+
    +
  • Tokenize the words in the transcription column

  • +
  • Count the number of times each token appears

  • +
  • Visualize the top 20 most frequent words

  • +
+

Explain what we see from this result. Does it make sense? What insights (if any) do we get?

+
    +
  • The most frequent words in the transcription include common ones like “the,” “and,” “patient,” “she,” and “he.” These words are typically expected to be among the top 20 most frequently used words in the transcriptions. Many of them are function words (articles, conjunctions, pronouns, and prepositions) that convey relationships between words, which are essential in most sentences but say little about the content of the notes. Additionally, the term “patient” is understandably prevalent in the transcriptions as physicians often refer to the patient in their notes.
  • +
+
+
# Name used as the starting point by all of the remaining questions.
+mtsamples2 <- mtsamples
+
+library(forcats)
+
+# Split every transcription into single-word tokens, count each token, and
+# plot the 20 most frequent ones ordered by frequency.
+mtsamples2 %>%
+  unnest_tokens(token, transcription) %>%
+  count(token) %>%
+  # slice_max() supersedes top_n(); like top_n() it keeps ties by default.
+  slice_max(n, n = 20) %>%
+  ggplot(aes(n, fct_reorder(token, n))) +
+  geom_col() +
+  # Replace the default labels ("n", "fct_reorder(token, n)") with readable ones.
+  labs(x = "Number of occurrences", y = "Token")
+
+

+
+
+
+
+

Question 3

+
    +
  • Redo visualization but remove stopwords before

  • +
  • Bonus points if you remove numbers as well

  • +
+

What do we see now that we have removed stop words? Does it give us a better idea of what the text is about?

+
    +
  • After removing stop words, the prominent words in the text are “patient,” “left,” “procedure,” and “pain,” among others. Many of the top 20 words are associated with surgical procedures, including terms like “anesthesia” and “incision.” This aligns with expectations since most entries are from the surgery specialty. These words provide valuable context about the procedures performed on patients.
  • +
+
+
# Same plot as Question 2, after dropping stop words and numeric tokens.
+mtsamples2 %>%
+  unnest_tokens(token, transcription) %>%
+  anti_join(stop_words, by = c("token" = "word")) %>%
+  # Drop numbers, including ones written with decimal or thousands separators
+  # (e.g. "0.5", "1,000"), not only plain runs of digits.
+  filter(!grepl("^\\d+([.,]\\d+)*$", token)) %>%
+  count(token, sort = TRUE) %>%
+  # slice_max() supersedes top_n(); like top_n() it keeps ties by default.
+  slice_max(n, n = 20) %>%
+  ggplot(aes(n, fct_reorder(token, n))) +
+  geom_col() +
+  # Replace the default labels ("n", "fct_reorder(token, n)") with readable ones.
+  labs(x = "Number of occurrences", y = "Token")
+
+

+
+
+
+
+

Question 4

+

Repeat question 2, but this time tokenize into bi-grams. How does the result change if you look at tri-grams?

+
    +
  • In both n-grams, the most frequent phrase is “the patient.” Many phrases also convey location descriptions. Tri-grams, which include an additional word in the phrase, provide even more contextual information, as expected.
  • +
+
+
# Count every pair of consecutive words in the transcriptions and keep the
+# 20 most frequent pairs (ties included).
+bigram_top20 <- mtsamples2 %>%
+  unnest_tokens(ngram, transcription, token = "ngrams", n = 2) %>%
+  count(ngram, sort = TRUE) %>%
+  top_n(20, n)
+
+# Horizontal bar chart, most frequent bi-gram at the top.
+ggplot(bigram_top20, aes(n, fct_reorder(ngram, n))) +
+  geom_col()
+
+

+
+
+
+
# Count every run of three consecutive words in the transcriptions and keep
+# the 20 most frequent runs (ties included).
+trigram_top20 <- mtsamples2 %>%
+  unnest_tokens(ngram, transcription, token = "ngrams", n = 3) %>%
+  count(ngram, sort = TRUE) %>%
+  top_n(20, n)
+
+# Horizontal bar chart, most frequent tri-gram at the top.
+ggplot(trigram_top20, aes(n, fct_reorder(ngram, n))) +
+  geom_col()
+
+

+
+
+
+
+

Question 5

+

Using the results you got from question 4, pick a word and count the words that appear before and after it.

+
+
library(tidyr)
+
+# Words before "patient": split each bi-gram into its two words, keep the
+# bi-grams whose second word is "patient", and count the first words.
+before_patient <- mtsamples2 %>%
+  unnest_ngrams(ngram, transcription, n = 2) %>%
+  separate(ngram, into = c("word1", "word2"), sep = " ") %>%
+  select(word1, word2) %>%
+  filter(word2 == "patient") %>%
+  count(word1, sort = TRUE) %>%
+  top_n(20, n)
+
+# Horizontal bar chart of the 20 most frequent preceding words.
+ggplot(before_patient, aes(n, fct_reorder(word1, n))) +
+  geom_col()
+
+

+
+
+
+
# Words after "patient": split each bi-gram into its two words, keep the
+# bi-grams whose first word is "patient", and count the second words.
+after_patient <- mtsamples2 %>%
+  unnest_ngrams(ngram, transcription, n = 2) %>%
+  separate(ngram, into = c("word1", "word2"), sep = " ") %>%
+  select(word1, word2) %>%
+  filter(word1 == "patient") %>%
+  count(word2, sort = TRUE) %>%
+  top_n(20, n)
+
+# Horizontal bar chart of the 20 most frequent following words.
+ggplot(after_patient, aes(n, fct_reorder(word2, n))) +
+  geom_col()
+
+

+
+
+
+
+

Question 6

+

Which words are most used in each of the specialties? You can use group_by() and top_n() from dplyr to have the calculations be done within each specialty. Remember to remove stopwords. How about the 5 most used words?

+
    +
  • In the field of allergy/immunology, the top 5 frequently used words are “history,” “noted,” “patient,” “allergies,” and “nasal” (tied with “past” for fifth place). In dentistry, the leading words are “patient,” “tooth,” “teeth,” “left,” and “procedure.”
  • +
+
+
  # Five most frequent tokens within each medical specialty, with stop words
+# and all-digit tokens removed. top_n() keeps ties, so a specialty can
+# contribute more than five rows. Rows are listed specialty by specialty.
+top5 <- mtsamples2 %>%
+  group_by(medical_specialty) %>%
+  unnest_tokens(token, transcription) %>%
+  filter(!grepl("^\\d+$", token)) %>%
+  anti_join(stop_words, by = c("token" = "word")) %>%
+  count(token, sort = TRUE) %>%
+  top_n(5, n) %>%
+  arrange(medical_specialty)
+
+top5
+
+
# A tibble: 210 × 3
+# Groups:   medical_specialty [40]
+   medical_specialty    token         n
+   <chr>                <chr>     <int>
+ 1 Allergy / Immunology history      38
+ 2 Allergy / Immunology noted        23
+ 3 Allergy / Immunology patient      22
+ 4 Allergy / Immunology allergies    21
+ 5 Allergy / Immunology nasal        13
+ 6 Allergy / Immunology past         13
+ 7 Autopsy              left         83
+ 8 Autopsy              inch         59
+ 9 Autopsy              neck         55
+10 Autopsy              anterior     47
+# ℹ 200 more rows
+
+
+
+
+

Question 7 - extra

+

Find your own insight in the data:

+

Ideas:

+
    +
  • Interesting ngrams

    +
      +
    • See if certain words are used more in some specialties than others

      +
        +
      • Using 6grams, it is interesting that a lot of the transcripts mention how the patient reacted to the procedure done.
      • +
      +
      +
      # Count every run of six consecutive words and keep the 20 most frequent
      +# runs (ties included).
      +sixgram_top20 <- mtsamples2 %>%
      +  unnest_tokens(ngram, transcription, token = "ngrams", n = 6) %>%
      +  count(ngram, sort = TRUE) %>%
      +  top_n(20, n)
      +
      +# Horizontal bar chart, most frequent 6-gram at the top.
      +ggplot(sixgram_top20, aes(n, fct_reorder(ngram, n))) +
      +  geom_col()
      +
      +

      +
      +
    • +
  • +
+
+ +
+ + +
+ + + + \ No newline at end of file