# Build a Czech word cloud from OCR text: annotate with UDPipe, strip
# stopwords, count lemmas, and render the result with wordcloud2.

# Load libraries for data manipulation, NLP annotation and visualization.
library(readxl)
library(readr)
library(dplyr)
library(udpipe)
library(RColorBrewer)
library(wordcloud2)
library(htmlwidgets)

# Resolve the directory of the currently executing script and move there.
# NOTE(review): parent.frame(2)$ofile is only populated when the file is run
# via source(); under Rscript it is NULL and setwd() will fail -- confirm how
# this script is invoked.
this_dir <- dirname(parent.frame(2)$ofile)
setwd(this_dir)

# Download and load the UDPipe model for Czech.
udmodel <- udpipe_download_model(language = "czech")
udmodel <- udpipe_load_model(file = udmodel$file_model)

# Read the OCR text from the downloaded file, one element per line.
file <- "stazeny_text_Vesela_Zide.txt"
y <- read_lines(file, skip = 0, n_max = -1L)

# Tokenize, tag and lemmatize the text with UDPipe.
x <- udpipe_annotate(udmodel, y)
lv <- as.data.frame(x, detailed = TRUE)

# Normalize lemmas to lowercase so stopword matching and counting agree.
lv$lemma <- tolower(lv$lemma)

# Czech stopword list, one word per line.
stopwords <- readLines("stopwords_cz.txt")

#' Remove stopwords from a whitespace-separated text string.
#'
#' @param text A single character string.
#' @param stopwords Character vector of stopwords; matching is
#'   case-insensitive (the tokens are lowercased before comparison).
#' @return `text` with all stopword tokens removed, remaining tokens
#'   re-joined by single spaces. May be `""` if every token was a stopword.
remove_stopwords <- function(text, stopwords) {
  words <- unlist(strsplit(text, "\\s+"))
  filtered_words <- words[!tolower(words) %in% stopwords]
  paste(filtered_words, collapse = " ")
}

# Strip stopwords from every lemma. vapply() (rather than sapply()) is used
# so the result is guaranteed to be a character vector even when lv has zero
# rows, where sapply() would silently return an empty list.
lv$lemma <- vapply(lv$lemma, remove_stopwords, character(1),
                   stopwords = stopwords, USE.NAMES = FALSE)

# Trim whitespace left over after stopword removal.
lv$lemma <- trimws(lv$lemma)

# Count occurrences of each lemma, most frequent first. (The original code
# converted lemma to factor and back to character around this step; both
# conversions are no-ops for counting and have been dropped.)
lv_framed <- lv %>% count(lemma, sort = TRUE)

# Keep only lemmas of 3+ characters; this also discards the empty strings
# produced when a lemma consisted entirely of stopwords.
df <- lv_framed %>% filter(nchar(lemma) > 2)

# Render the word cloud and save it as an HTML widget.
hw <- wordcloud2(data = df, size = 1.6, color = "random-dark")
saveWidget(hw, "wordcloud_vysledek_Vesela_Zide.html", selfcontained = FALSE)