#' Preprocess Text Data
#'
#' Cleans raw text by lower-casing it and removing stopwords, punctuation,
#' and numbers, then converts the result to a DocumentTermMatrix.
#'
#' Documents that end up empty are deliberately kept (as all-zero rows) so the
#' matrix stays row-aligned with `text_vector`; dropping them is left to the
#' caller.
#'
#' @param text_vector A character vector of documents. `NA` entries are
#'   treated as empty documents.
#' @return A DocumentTermMatrix compatible with topicmodels.
#' @importFrom tm VCorpus VectorSource tm_map content_transformer removePunctuation removeNumbers removeWords stopwords stripWhitespace DocumentTermMatrix
#' @noRd
clean_and_dtm <- function(text_vector) {
  # Missing documents become empty strings so they yield an empty row instead
  # of depending on how tm tokenizes NA.
  text_vector[is.na(text_vector)] <- ""

  # VCorpus (volatile corpus) rather than Corpus/SimpleCorpus: this avoids the
  # "transformation drops documents" warnings raised by tm_map().
  corpus <- tm::VCorpus(tm::VectorSource(text_vector))

  corpus <- tm::tm_map(corpus, tm::content_transformer(tolower))

  # Stopwords must go BEFORE punctuation is stripped: the English list contains
  # contractions such as "don't" and "i'm", which can no longer match once the
  # apostrophe has been removed from the text ("dont", "im").
  corpus <- tm::tm_map(corpus, tm::removeWords, tm::stopwords("en"))
  corpus <- tm::tm_map(corpus, tm::removePunctuation)
  corpus <- tm::tm_map(corpus, tm::removeNumbers)

  # Collapse the runs of whitespace left behind by the removals above.
  corpus <- tm::tm_map(corpus, tm::stripWhitespace)

  # Empty rows are NOT removed here; the main LDA function handles them so
  # that row alignment with the student score matrix is not lost.
  tm::DocumentTermMatrix(corpus)
}

#' Concatenate Student Responses
#'
#' Aggregates all written responses within a testlet for each student by
#' joining the items in each row with a single space. Missing responses
#' (`NA`) are skipped rather than being pasted in as the literal text "NA";
#' a student with no responses at all yields an empty string.
#'
#' @param response_matrix An N x J matrix of character strings (essays).
#' @return A character vector of length N, named by the row names of
#'   `response_matrix` when it has any.
#' @export
aggregate_responses <- function(response_matrix) {
  # Same coercion apply() performs internally; lets data frames through.
  response_matrix <- as.matrix(response_matrix)

  # vapply() guarantees a character result, including character(0) for a
  # zero-row input.
  combined_text <- vapply(
    seq_len(nrow(response_matrix)),
    function(i) {
      answers <- response_matrix[i, ]
      # Drop missing items so paste() does not turn them into "NA".
      paste(answers[!is.na(answers)], collapse = " ")
    },
    character(1)
  )

  # Keep student identifiers, as apply() did for matrices with row names.
  names(combined_text) <- rownames(response_matrix)
  combined_text
}