# Load required libraries
libraries <- c(
  "quanteda", "quanteda.textstats", "quanteda.textplots", "topicmodels",
  "readtext", "tidytext", "dplyr", "jsonlite", "udpipe", "stopwords",
  "readxl", "stringr", "LDAvis", "wordcloud", "ggplot2", "gridExtra",
  "RColorBrewer", "forcats", "readr"
)
invisible(lapply(libraries, library, character.only = TRUE))

# Set working directory to the script location (works when the script is source()d)
this_dir <- dirname(parent.frame(2)$ofile)
setwd(this_dir)

# Download and load the UDPipe model for Czech
udmodel <- udpipe_download_model(language = "czech")
udmodel <- udpipe_load_model(file = udmodel$file_model)

# Function to annotate a text file, extract lemmatized nouns, and save them to a file
process_text <- function(file, output_file) {
  text <- read_lines(file)
  annotations <- udpipe_annotate(udmodel, text)
  nouns <- as.data.frame(annotations, detailed = TRUE) %>%
    filter(upos == "NOUN") %>%
    mutate(lemma = tolower(lemma)) %>%
    pull(lemma)
  writeLines(nouns, output_file)
}

# Process the text files and extract nouns
process_text("stazeny_text_Vesela_Zide.txt", "stazeny_text_Vesela_Zide_nouns.txt")
process_text("stazeny_text_magie.txt", "stazeny_text_magie_nouns.txt")

# Load Czech stopwords and extend them with a custom list
stopwords_cs <- stopwords::stopwords("cs", source = "stopwords-iso")
stopwords_cz <- c(stopwords_cs, readLines("stopwords_cz.txt"))

# Read the noun files into a corpus
y <- readtext(c("stazeny_text_Vesela_Zide_nouns.txt", "stazeny_text_magie_nouns.txt"))
data_txt <- corpus(y)

# Tokenization and preprocessing
tokens <- tokens(data_txt) %>%
  tokens_keep(min_nchar = 3) %>%   # keep only tokens with at least 3 characters
  tokens_remove(pattern = c("*-*", "und", "der", "býti")) %>%
  tokens_remove(stopwords_cz)

# Create document-feature matrix (named dfmat to avoid masking quanteda::dfm)
dfmat <- dfm(tokens)
dfm_df <- as.data.frame(as.matrix(t(dfmat)))  # terms as rows, documents as columns

# Topic modeling
dtm <- convert(dfmat, to = "topicmodels")
set.seed(1234)
topic_model <- LDA(dtm, method = "Gibbs", k = 4,
                   control = list(alpha = 0.1, seed = 1234))  # seed in control makes the Gibbs run reproducible

# Extract top words for a specific topic
topic_number <- 3
word_topic_posterior <- posterior(topic_model)$terms[topic_number, ]
top_words_for_topicX <- head(sort(word_topic_posterior, decreasing = TRUE), n = 50)

# Visualize topics
word_topics <- tidy(topic_model, matrix = "beta")  # per-topic word probabilities
wordcloud(names(top_words_for_topicX), top_words_for_topicX)

# Extract document-topic distribution and assign each document to its dominant topic
topic_model_documents <- tidy(topic_model, matrix = "gamma")
document_to_topic <- topic_model_documents %>%
  group_by(document) %>%
  slice_max(gamma, n = 1) %>%
  ungroup()

# Filter documents for a specific topic
topic_number <- 2
document_topic_filtr <- filter(document_to_topic, topic == topic_number)

# Get top terms for each topic
word_topics_TOPterms_df <- as.data.frame(terms(topic_model, 30))

# Compute JSON for the LDAvis visualization
dtm <- dtm[slam::row_sums(dtm) > 0, ]  # safety check; LDA() already requires each document to be non-empty
phi <- as.matrix(posterior(topic_model)$terms)
theta <- as.matrix(posterior(topic_model)$topics)
vocab <- colnames(phi)
doc.length <- slam::row_sums(dtm)
term.freq <- slam::col_sums(dtm)[match(vocab, colnames(dtm))]

json <- createJSON(
  phi = phi,
  theta = theta,
  vocab = vocab,
  doc.length = doc.length,
  term.frequency = term.freq,
  reorder.topics = FALSE
)
serVis(json)
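
# Optional sketch (not part of the original pipeline): a faceted bar chart of the
# top terms per topic, built from the `word_topics` (beta) tibble computed above.
# It relies on the tidytext helpers reorder_within()/scale_y_reordered() and on
# ggplot2, all of which are already loaded; the number of terms (n = 10) is an
# arbitrary choice and can be adjusted.
top_beta_terms <- word_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  mutate(term = reorder_within(term, beta, topic))  # order terms within each topic facet

ggplot(top_beta_terms, aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  scale_y_reordered() +
  labs(x = "Per-topic word probability (beta)", y = NULL)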