|
## the script that follows demonstrates some basic techniques for text analysis in R |
|
## for more on the topic see: https://www.tidytextmining.com/ |
|
|
|
## load packages for analysis |
|
# install.packages(c("tidyverse", "tidytext", "topicmodels", "tm", "janitor")) |
|
library(tidyverse) |
|
library(tidytext) |
|
library(topicmodels) |
|
library(tm) |
|
library(janitor) |
|
|
|
## 1: load simulated EMR data |
|
|
|
emr_data <- |
|
## read in csv file using readr::read_csv |
|
read_csv("simulated_emr_data.csv") %>% |
|
## use janitor to clean "messy" column names |
|
clean_names() |
|
|
|
## what do those column names look like after cleaning ... |
|
emr_data %>% |
|
names() |
|
|
|
## NOTE: the line below is equivalent to the %>% syntax above
|
names(emr_data) |
|
|
|
## take a peek at the simulated data |
|
glimpse(emr_data) |
|
|
|
## 2: tokenize the provider notes |
|
|
|
## going to use some built-in data for stop words ... |
|
## ... and a function to "unnest" the tokenized text |
|
## what are those all about? |
|
?stop_words |
|
?unnest_tokens |
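
## a quick toy illustration (made-up text, not from the EMR data) of what
## unnest_tokens() does: each word becomes its own lowercased row
tibble(doc_id = 1, text = "Patient reports mild chest pain") %>%
  unnest_tokens(word, text)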
|
|
|
tokens <- |
|
emr_data %>% |
|
## select relevant columns |
|
select(encounter_id, patient_id, chief_complaint, provider_notes) %>% |
|
## split provider notes into individual words (AKA tokens in this context) |
|
unnest_tokens(word, provider_notes) %>% |
|
## remove anything that exactly matches a stop word |
|
anti_join(stop_words, by = "word") %>% |
|
## filter to keep words that contain letters (i.e., filters out pure numbers) |
|
filter(str_detect(word, "[a-zA-Z]"))
|
|
|
tokens |
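
## optional sanity check (a sketch, not part of the original workflow):
## the most frequent tokens overall, before breaking things out by chief complaint
tokens %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 10)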
|
|
|
## 3: perform sentiment analysis |
|
|
|
## take a look at the sentiment lexicon retrieval function |
|
?get_sentiments |
|
|
|
## get the bing sentiments |
|
bing <- get_sentiments("bing") |
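
## quick peek (optional): the bing lexicon is a two-column tibble of words
## labeled "positive" or "negative"
bing %>%
  count(sentiment)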
|
|
|
sentiment_by_complaint <- |
|
tokens %>% |
|
## join the sentiment to tokens |
|
inner_join(bing, by = "word") %>% |
|
## summarize sentiment by chief complaint |
|
count(chief_complaint, sentiment) %>% |
|
## pivot the data from long to wide
|
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% |
|
## calculate overall sentiment |
|
mutate(sentiment_score = positive - negative) |
|
|
|
sentiment_by_complaint |
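
## a minimal sketch (not part of the original workflow): plot the net sentiment
## score by chief complaint with ggplot2
ggplot(sentiment_by_complaint,
       aes(x = reorder(chief_complaint, sentiment_score), y = sentiment_score)) +
  geom_col() +
  coord_flip() +
  labs(title = "Net Bing Sentiment by Chief Complaint",
       x = "Chief Complaint",
       y = "Sentiment Score (positive - negative)")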
|
|
|
## 4: summarize common words by chief complaint |
|
|
|
common_words <- |
|
tokens %>% |
|
## count the occurrences for every combination of chief complaint and word |
|
count(chief_complaint, word, sort = TRUE) |
|
|
|
top_words <- |
|
common_words %>% |
|
## within each chief complaint ... |
|
group_by(chief_complaint) %>% |
|
## ... find the top 5 words |
|
slice_max(n, n = 5) %>% |
|
## good practice to ungroup after you are "done" with the grouping |
|
ungroup() |
|
|
|
top_words |
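
## a minimal sketch (optional, mirrors the topic plot further below): visualize the
## top words within each chief complaint using tidytext::reorder_within()
top_words %>%
  mutate(word = reorder_within(word, n, chief_complaint)) %>%
  ggplot(aes(x = word, y = n, fill = chief_complaint)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ chief_complaint, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  labs(title = "Top Words per Chief Complaint", x = "Word", y = "Count")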
|
|
|
## 5: topic modeling with LDA |
|
## NOTE: for a primer on topic modeling see https://www.tidytextmining.com/topicmodeling |
|
|
|
## first need to prep a document term matrix |
|
dtm <- |
|
tokens %>% |
|
## count the occurrences of each word within each encounter
|
count(encounter_id, word) %>% |
|
## convert to the DocumentTermMatrix object needed to fit the LDA model
|
cast_dtm(encounter_id, word, n) |
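
## quick check (optional): dimensions of the document-term matrix
## (rows = encounters/documents, columns = distinct terms)
dim(dtm)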
|
|
|
## estimate the "best" number of topics into which the encounter notes will be clustered |
|
## create vector of possible number of topics |
|
ks <- c(2, 3, 4, 5, 6, 8, 10) |
|
|
|
## define a function to compute the perplexity metric for a given value of k
|
get_perplexities <- function(dtm, k) { |
|
fit <- LDA(dtm, k = k, control = list(seed = 1234)) |
|
perplexity_val <- perplexity(fit, dtm) |
|
tibble(k = k, perplexity = perplexity_val) |
|
} |
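
## e.g., a single call could be run first as a quick check (optional; each call
## fits one LDA model, so this may take a moment)
# get_perplexities(dtm, k = 2)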
|
|
|
## iterate across all ks above |
|
## use purrr::map_df to stack all results in a tibble |
|
perplexities <- map_df(ks, function(x) get_perplexities(dtm = dtm, k = x)) |
|
|
|
## plot the perplexity by k with ggplot2 |
|
ggplot(perplexities, aes(x = k, y = perplexity)) + |
|
geom_line() + |
|
geom_point() + |
|
labs(title = "LDA Perplexity vs Number of Topics", |
|
x = "Number of Topics (k)", |
|
y = "Perplexity") |
|
|
|
## pick the "best" number of topics

## NOTE: this is best done by combining the perplexity plot above with human judgment, i.e., it is not a purely "automatic" procedure
|
best_k <- 8 |
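
## if you wanted a purely data-driven starting point, the k with the lowest
## perplexity could be pulled like this (a sketch; still worth eyeballing the plot)
# ks[which.min(perplexities$perplexity)]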
|
|
|
## fit the LDA model with the best k |
|
lda_fit <- LDA(dtm, k = best_k, control = list(seed = 1234)) |
|
|
|
## summarize terms by modeled topic |
|
topic_terms <- |
|
## use the broom-style tidy() method (provided by tidytext for LDA objects) to summarize the model fit
|
tidy(lda_fit, matrix = "beta") %>% |
|
## within each topic, find the top 8 terms by beta
|
group_by(topic) %>% |
|
slice_max(beta, n = 8) %>% |
|
ungroup() %>% |
|
arrange(topic) |
|
|
|
topic_terms |
|
|
|
## visualize topic terms |
|
## NOTE: here we are piping directly into ggplot2 |
|
topic_terms %>% |
|
mutate(term = reorder_within(term, beta, topic)) %>% |
|
ggplot(aes(x = term, y = beta, fill = factor(topic))) + |
|
geom_col(show.legend = FALSE) + |
|
facet_wrap(~ topic, scales = "free") + |
|
coord_flip() + |
|
scale_x_reordered() + |
|
labs(title = "Top Words per Topic", x = "Word", y = "Importance (beta)") |
|
|
|
## assign topics back to documents |
|
|
|
## first get the top topic (highest gamma) for each document
|
top_doc_topics <- |
|
tidy(lda_fit, matrix = "gamma") %>% |
|
group_by(document) %>% |
|
slice_max(gamma, n = 1) %>% |
|
ungroup() |
|
|
|
## join the top topic for each document back to the original emr data
emr_data %>%
  rename(document = encounter_id) %>%
  ## NOTE: the gamma matrix stores document ids as character, so align the type before joining
  mutate(document = as.character(document)) %>%
  left_join(top_doc_topics, by = "document")
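
## optional follow-up (a sketch, not part of the original workflow): store the joined
## result under a hypothetical name and cross-tabulate the modeled topic against the
## chief complaint with janitor::tabyl()
emr_with_topics <-
  emr_data %>%
  rename(document = encounter_id) %>%
  mutate(document = as.character(document)) %>%
  left_join(top_doc_topics, by = "document")

emr_with_topics %>%
  tabyl(chief_complaint, topic)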