# Load required libraries
libraries <- c(
  "quanteda", "quanteda.textstats", "quanteda.textplots", "topicmodels",
  "readtext", "tidytext", "dplyr", "jsonlite", "udpipe", "stopwords",
  "readxl", "stringr", "LDAvis", "wordcloud", "ggplot2", "gridExtra",
  "RColorBrewer", "forcats", "readr"
)
invisible(lapply(libraries, library, character.only = TRUE))

# Set working directory to the script location (works when the script is source()d)
this_dir <- dirname(parent.frame(2)$ofile)
setwd(this_dir)

# Download and load the UDPipe model for Czech
udmodel <- udpipe_download_model(language = "czech")
udmodel <- udpipe_load_model(file = udmodel$file_model)

# Function to annotate a text file, extract lemmatized nouns, and save them to a file
process_text <- function(file, output_file) {
  text <- read_lines(file)
  annotations <- udpipe_annotate(udmodel, text)
  nouns <- as.data.frame(annotations, detailed = TRUE) %>%
    filter(upos == "NOUN") %>%
    mutate(lemma = tolower(lemma)) %>%
    pull(lemma)
  writeLines(nouns, output_file)
}

# Process the text files and extract nouns
process_text("stazeny_text_Vesela_Zide.txt", "stazeny_text_Vesela_Zide_nouns.txt")
process_text("stazeny_text_magie.txt", "stazeny_text_magie_nouns.txt")

# Load Czech stopwords and extend them with a custom list
stopwords_cs <- stopwords::stopwords("cs", source = "stopwords-iso")
stopwords_cz <- c(stopwords_cs, readLines("stopwords_cz.txt"))

# Read the noun files into a corpus
y <- readtext(c("stazeny_text_Vesela_Zide_nouns.txt", "stazeny_text_magie_nouns.txt"))
data_txt <- corpus(y)

# Tokenization and preprocessing
tokens <- tokens(data_txt) %>%
  tokens_keep(min_nchar = 3) %>%   # keep only tokens with at least 3 characters
  tokens_remove(pattern = c("*-*", "und", "der", "býti")) %>%
  tokens_remove(stopwords_cz)

# Create document-feature matrix (named dfmat to avoid masking quanteda::dfm)
dfmat <- dfm(tokens)
dfm_df <- as.data.frame(as.matrix(t(dfmat)))  # terms as rows, documents as columns

# Topic modeling
dtm <- convert(dfmat, to = "topicmodels")
set.seed(1234)
topic_model <- LDA(dtm, method = "Gibbs", k = 4,
                   control = list(alpha = 0.1, seed = 1234))  # seed in control makes the Gibbs run reproducible

# Extract top words for a specific topic
topic_number <- 3
word_topic_posterior <- posterior(topic_model)$terms[topic_number, ]
top_words_for_topicX <- head(sort(word_topic_posterior, decreasing = TRUE), n = 50)

# Visualize topics
word_topics <- tidy(topic_model, matrix = "beta")  # per-topic word probabilities
wordcloud(names(top_words_for_topicX), top_words_for_topicX)

# Extract document-topic distribution and assign each document to its dominant topic
topic_model_documents <- tidy(topic_model, matrix = "gamma")
document_to_topic <- topic_model_documents %>%
  group_by(document) %>%
  slice_max(gamma, n = 1) %>%
  ungroup()

# Filter documents for a specific topic
topic_number <- 2
document_topic_filtr <- filter(document_to_topic, topic == topic_number)

# Get top terms for each topic
word_topics_TOPterms_df <- as.data.frame(terms(topic_model, 30))

# Compute JSON for the LDAvis visualization
dtm <- dtm[slam::row_sums(dtm) > 0, ]  # safety check; LDA() already requires each document to be non-empty
phi <- as.matrix(posterior(topic_model)$terms)
theta <- as.matrix(posterior(topic_model)$topics)
vocab <- colnames(phi)
doc.length <- slam::row_sums(dtm)
term.freq <- slam::col_sums(dtm)[match(vocab, colnames(dtm))]

json <- createJSON(
  phi = phi,
  theta = theta,
  vocab = vocab,
  doc.length = doc.length,
  term.frequency = term.freq,
  reorder.topics = FALSE
)
serVis(json)
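
# Optional sketch (not part of the original pipeline): a faceted bar chart of the
# top terms per topic, built from the `word_topics` (beta) tibble computed above.
# It relies on the tidytext helpers reorder_within()/scale_y_reordered() and on
# ggplot2, all of which are already loaded; the number of terms (n = 10) is an
# arbitrary choice and can be adjusted.
top_beta_terms <- word_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  mutate(term = reorder_within(term, beta, topic))  # order terms within each topic facet

ggplot(top_beta_terms, aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  scale_y_reordered() +
  labs(x = "Per-topic word probability (beta)", y = NULL)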