# Build a Czech word cloud from OCR text: annotate with UDPipe, strip
# stopwords, count lemmas, and render the result with wordcloud2.

# Load libraries for data manipulation, NLP annotation and visualization.
library(readxl)
library(readr)
library(dplyr)
library(udpipe)
library(RColorBrewer)
library(wordcloud2)
library(htmlwidgets)

# Resolve the directory of the currently executing script and move there.
# NOTE(review): parent.frame(2)$ofile is only populated when the file is run
# via source(); under Rscript it is NULL and setwd() will fail -- confirm how
# this script is invoked.
this_dir <- dirname(parent.frame(2)$ofile)
setwd(this_dir)

# Download and load the UDPipe model for Czech.
udmodel <- udpipe_download_model(language = "czech")
udmodel <- udpipe_load_model(file = udmodel$file_model)

# Read the OCR text from the downloaded file, one element per line.
file <- "stazeny_text_Vesela_Zide.txt"
y <- read_lines(file, skip = 0, n_max = -1L)

# Tokenize, tag and lemmatize the text with UDPipe.
x <- udpipe_annotate(udmodel, y)
lv <- as.data.frame(x, detailed = TRUE)

# Normalize lemmas to lowercase so stopword matching and counting agree.
lv$lemma <- tolower(lv$lemma)

# Czech stopword list, one word per line.
stopwords <- readLines("stopwords_cz.txt")

#' Remove stopwords from a whitespace-separated text string.
#'
#' @param text A single character string.
#' @param stopwords Character vector of stopwords; matching is
#'   case-insensitive (the tokens are lowercased before comparison).
#' @return `text` with all stopword tokens removed, remaining tokens
#'   re-joined by single spaces. May be `""` if every token was a stopword.
remove_stopwords <- function(text, stopwords) {
  words <- unlist(strsplit(text, "\\s+"))
  filtered_words <- words[!tolower(words) %in% stopwords]
  paste(filtered_words, collapse = " ")
}

# Strip stopwords from every lemma. vapply() (rather than sapply()) is used
# so the result is guaranteed to be a character vector even when lv has zero
# rows, where sapply() would silently return an empty list.
lv$lemma <- vapply(lv$lemma, remove_stopwords, character(1),
                   stopwords = stopwords, USE.NAMES = FALSE)

# Trim whitespace left over after stopword removal.
lv$lemma <- trimws(lv$lemma)

# Count occurrences of each lemma, most frequent first. (The original code
# converted lemma to factor and back to character around this step; both
# conversions are no-ops for counting and have been dropped.)
lv_framed <- lv %>% count(lemma, sort = TRUE)

# Keep only lemmas of 3+ characters; this also discards the empty strings
# produced when a lemma consisted entirely of stopwords.
df <- lv_framed %>% filter(nchar(lemma) > 2)

# Render the word cloud and save it as an HTML widget.
hw <- wordcloud2(data = df, size = 1.6, color = "random-dark")
saveWidget(hw, "wordcloud_vysledek_Vesela_Zide.html", selfcontained = FALSE)