|
## the script that follows demonstrates some basic techniques for text analysis in R |
|
## for more on the topic see: https://www.tidytextmining.com/ |
|
|
|
## load packages for analysis |
|
# install.packages(c("tidyverse", "tidytext", "topicmodels", "tm", "janitor")) |
|
library(tidyverse) |
|
library(tidytext) |
|
library(topicmodels) |
|
library(tm) |
|
library(janitor) |
|
|
|
## 1: load simulated EMR data |
|
|
|
emr_data <- |
|
## read in csv file using readr::read_csv |
|
read_csv("simulated_emr_data.csv") %>% |
|
## use janitor to clean "messy" column names |
|
clean_names() |
|
|
|
## what do those column names look like after cleaning ... |
|
emr_data %>% |
|
names() |
|
|
|
## NOTE: the line below is equivalent to the %>% syntax above
|
names(emr_data) |
|
|
|
## take a peek at the simulated data |
|
glimpse(emr_data) |
|
|
|
## 2: tokenize the provider notes |
|
|
|
## going to use some built-in data for stop words ... |
|
## ... and a function to "unnest" the tokenized text |
|
## what are those all about? |
|
?stop_words |
|
?unnest_tokens |
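
## a quick toy illustration (made-up text, not from the EMR data) of what
## unnest_tokens() does: each word becomes its own lowercased row
tibble(doc_id = 1, text = "Patient reports mild chest pain") %>%
  unnest_tokens(word, text)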
|
|
|
tokens <- |
|
emr_data %>% |
|
## select relevant columns |
|
select(encounter_id, patient_id, chief_complaint, provider_notes) %>% |
|
## split provider notes into individual words (AKA tokens in this context) |
|
unnest_tokens(word, provider_notes) %>% |
|
## remove anything that exactly matches a stop word |
|
anti_join(stop_words, by = "word") %>% |
|
## filter to keep words that contain letters (i.e., filters out pure numbers) |
|
filter(str_detect(word, "[a-zA-Z]"))
|
|
|
tokens |
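
## optional sanity check (a sketch, not part of the original workflow):
## the most frequent tokens overall, before breaking things out by chief complaint
tokens %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 10)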
|
|
|
## 3: perform sentiment analysis |
|
|
|
## take a look at the sentiment lexicon retrieval function |
|
?get_sentiments |
|
|
|
## get the bing sentiments |
|
bing <- get_sentiments("bing") |
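
## quick peek (optional): the bing lexicon is a two-column tibble of words
## labeled "positive" or "negative"
bing %>%
  count(sentiment)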
|
|
|
sentiment_by_complaint <- |
|
tokens %>% |
|
## join the sentiment to tokens |
|
inner_join(bing, by = "word") %>% |
|
## summarize sentiment by chief complaint |
|
count(chief_complaint, sentiment) %>% |
|
## pivot the data from long to wide
|
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% |
|
## calculate overall sentiment |
|
mutate(sentiment_score = positive - negative) |
|
|
|
sentiment_by_complaint |
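
## a minimal sketch (not part of the original workflow): plot the net sentiment
## score by chief complaint with ggplot2
ggplot(sentiment_by_complaint,
       aes(x = reorder(chief_complaint, sentiment_score), y = sentiment_score)) +
  geom_col() +
  coord_flip() +
  labs(title = "Net Bing Sentiment by Chief Complaint",
       x = "Chief Complaint",
       y = "Sentiment Score (positive - negative)")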
|
|
|
## 4: summarize common words by chief complaint |
|
|
|
common_words <- |
|
tokens %>% |
|
## count the occurrences for every combination of chief complaint and word |
|
count(chief_complaint, word, sort = TRUE) |
|
|
|
top_words <- |
|
common_words %>% |
|
## within each chief complaint ... |
|
group_by(chief_complaint) %>% |
|
## ... find the top 5 words |
|
slice_max(n, n = 5) %>% |
|
## good practice to ungroup after you are "done" with the grouping |
|
ungroup() |
|
|
|
top_words |
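
## a minimal sketch (optional, mirrors the topic plot further below): visualize the
## top words within each chief complaint using tidytext::reorder_within()
top_words %>%
  mutate(word = reorder_within(word, n, chief_complaint)) %>%
  ggplot(aes(x = word, y = n, fill = chief_complaint)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ chief_complaint, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  labs(title = "Top Words per Chief Complaint", x = "Word", y = "Count")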
|
|
|
## 5: topic modeling with LDA |
|
## NOTE: for a primer on topic modeling see https://www.tidytextmining.com/topicmodeling |
|
|
|
## first need to prep a document term matrix |
|
dtm <- |
|
tokens %>% |
|
## count the occurrences of each word within each encounter
|
count(encounter_id, word) %>% |
|
## convert to the DocumentTermMatrix object needed to fit the LDA model
|
cast_dtm(encounter_id, word, n) |
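
## quick check (optional): dimensions of the document-term matrix
## (rows = encounters/documents, columns = distinct terms)
dim(dtm)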
|
|
|
## estimate the "best" number of topics into which the encounter notes will be clustered |
|
## create vector of possible number of topics |
|
ks <- c(2, 3, 4, 5, 6, 8, 10) |
|
|
|
## define a function to compute the perplexity metric for a given value of k
|
get_perplexities <- function(dtm, k) { |
|
fit <- LDA(dtm, k = k, control = list(seed = 1234)) |
|
perplexity_val <- perplexity(fit, dtm) |
|
tibble(k = k, perplexity = perplexity_val) |
|
} |
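
## e.g., a single call could be run first as a quick check (optional; each call
## fits one LDA model, so this may take a moment)
# get_perplexities(dtm, k = 2)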
|
|
|
## iterate across all ks above |
|
## use purrr::map_df to stack all results in a tibble |
|
perplexities <- map_df(ks, function(x) get_perplexities(dtm = dtm, k = x)) |
|
|
|
## plot the perplexity by k with ggplot2 |
|
ggplot(perplexities, aes(x = k, y = perplexity)) + |
|
geom_line() + |
|
geom_point() + |
|
labs(title = "LDA Perplexity vs Number of Topics", |
|
x = "Number of Topics (k)", |
|
y = "Perplexity") |
|
|
|
## pick the "best" number of topics

## NOTE: this is best done by combining the perplexity plot above with human judgment, i.e., it is not a purely "automatic" procedure
|
best_k <- 8 |
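
## if you wanted a purely data-driven starting point, the k with the lowest
## perplexity could be pulled like this (a sketch; still worth eyeballing the plot)
# ks[which.min(perplexities$perplexity)]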
|
|
|
## fit the LDA model with the best k |
|
lda_fit <- LDA(dtm, k = best_k, control = list(seed = 1234)) |
|
|
|
## summarize terms by modeled topic |
|
topic_terms <- |
|
## use the broom-style tidy() method (provided by tidytext for LDA objects) to summarize the model fit
|
tidy(lda_fit, matrix = "beta") %>% |
|
## within each topic, find the top 8 terms by beta
|
group_by(topic) %>% |
|
slice_max(beta, n = 8) %>% |
|
ungroup() %>% |
|
arrange(topic) |
|
|
|
topic_terms |
|
|
|
## visualize topic terms |
|
## NOTE: here we are piping directly into ggplot2 |
|
topic_terms %>% |
|
mutate(term = reorder_within(term, beta, topic)) %>% |
|
ggplot(aes(x = term, y = beta, fill = factor(topic))) + |
|
geom_col(show.legend = FALSE) + |
|
facet_wrap(~ topic, scales = "free") + |
|
coord_flip() + |
|
scale_x_reordered() + |
|
labs(title = "Top Words per Topic", x = "Word", y = "Importance (beta)") |
|
|
|
## assign topics back to documents |
|
|
|
## first get the top topic (highest gamma) for each document
|
top_doc_topics <- |
|
tidy(lda_fit, matrix = "gamma") %>% |
|
group_by(document) %>% |
|
slice_max(gamma, n = 1) %>% |
|
ungroup() |
|
|
|
## join the top topic for each document back to the original emr data
emr_data %>%
  rename(document = encounter_id) %>%
  ## NOTE: the gamma matrix stores document ids as character, so align the type before joining
  mutate(document = as.character(document)) %>%
  left_join(top_doc_topics, by = "document")
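
## optional follow-up (a sketch, not part of the original workflow): store the joined
## result under a hypothetical name and cross-tabulate the modeled topic against the
## chief complaint with janitor::tabyl()
emr_with_topics <-
  emr_data %>%
  rename(document = encounter_id) %>%
  mutate(document = as.character(document)) %>%
  left_join(top_doc_topics, by = "document")

emr_with_topics %>%
  tabyl(chief_complaint, topic)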