#' @include partition_class.R TermDocumentMatrix_methods.R
NULL

#' Get Number of Tokens.
#' 
#' The method will get the number of tokens in a corpus or partition,
#' or the dispersion across one or more s-attributes.
#' 
#' One or more s-attributes can be provided to get the dispersion of
#' tokens across one or more dimensions. Two or more s-attributes
#' can lead to reasonable results only if the corpus XML is flat.
#' 
#' @param x object to get size(s) for
#' @param sAttribute character vector with s-attributes (one or more)
#' @param verbose logical, whether to print messages
#' @param ... further arguments
#' @rdname size-method
#' @return an integer vector if sAttribute is NULL, a \code{data.table} otherwise
#' @seealso See \code{\link{dispersion}}-method for counts of hits. The \code{\link{hits}}
#' method calls the \code{size}-method to get sizes of subcorpora.
#' @examples
#' use("polmineR")
#' size("GERMAPARLMINI")
#' size("GERMAPARLMINI", sAttribute = "date")
#' size("GERMAPARLMINI", sAttribute = c("date", "party"))
#' 
#' P <- partition("GERMAPARLMINI", date = "2009-11-11")
#' size(P, sAttribute = "speaker")
#' size(P, sAttribute = "party")
#' size(P, sAttribute = c("speaker", "party"))
# Declare `size` as the generic on which the setMethod() calls in this file
# register their methods. The definition passed here calls UseMethod("size")
# instead of the usual standardGeneric("size").
# NOTE(review): presumably this is deliberate, so that S3 dispatch acts as the
# fallback when no S4 method matches -- confirm before changing it.
setGeneric("size", function(x, ...) UseMethod("size"))

#' @rdname size-method
setMethod("size", "character", function(x, sAttribute = NULL, verbose = TRUE){
  # Note: `verbose` is accepted but not used by this method.

  # No s-attribute requested: report the size of the positional attribute
  # "word" of the corpus.
  if (is.null(sAttribute)) return( CQI$attribute_size(x, "word", type = "p") )

  stopifnot(all(sAttribute %in% sAttributes(x)))

  # One list element (later: one column) per s-attribute, holding the value of
  # every region of that attribute. Strucs are addressed starting at 0. The
  # values are passed through as.nativeEnc() using the encoding reported by
  # getEncoding(x).
  decoded <- lapply(
    sAttribute,
    function(s_attr){
      last_struc <- CQI$attribute_size(x, s_attr, type = "s") - 1
      raw_values <- CQI$struc2str(x, s_attr, 0:last_struc)
      as.nativeEnc(raw_values, from = getEncoding(x))
    }
  )
  names(decoded) <- sAttribute
  dt <- as.data.table(decoded)

  # Region boundaries are looked up for the first s-attribute only; the other
  # columns are assumed to line up with these regions row by row.
  first_attr <- sAttribute[1]
  regions <- RcppCWB::get_region_matrix(
    corpus = x, s_attribute = first_attr,
    strucs = 0L:(CQI$attribute_size(x, first_attr, "s") - 1L),
    registry = Sys.getenv("CORPUS_REGISTRY")
  )

  # Length of a region: second column minus first column, plus one
  # (presumably the right and left corpus positions -- both inclusive).
  dt[, size := regions[, 2] - regions[, 1] + 1L]

  # Sum region lengths for each combination of s-attribute values and key the
  # result by the s-attributes.
  totals <- dt[, list(size = sum(size)), by = eval(sAttribute)]
  setkeyv(totals, cols = sAttribute)
  totals
})

#' @rdname size-method
#' @exportMethod size
setMethod("size", "partition", function(x, sAttribute = NULL){
  # No s-attribute requested: total number of tokens covered by the regions in
  # the cpos matrix (second column minus first column plus one, summed up).
  if (is.null(sAttribute)){
    region_lengths <- as.integer(x@cpos[,2]) - as.integer(x@cpos[,1]) + 1L
    return( sum(region_lengths) )
  }

  stopifnot(all(sAttribute %in% sAttributes(x)))

  # One list element (later: one column) per s-attribute with the value for
  # each struc of the partition, passed through as.nativeEnc() using the
  # encoding stored with the partition.
  decoded <- lapply(
    sAttribute,
    function(s_attr){
      raw_values <- CQI$struc2str(x@corpus, s_attr, x@strucs)
      as.nativeEnc(raw_values, from = x@encoding)
    }
  )
  names(decoded) <- sAttribute
  dt <- as.data.table(decoded)

  # Attach the length of each region, then sum the lengths for each
  # combination of s-attribute values and key the result by the s-attributes.
  dt[, size := x@cpos[,2] - x@cpos[,1] + 1L]
  totals <- dt[, list(size = sum(size)), by = eval(sAttribute)]
  setkeyv(totals, cols = sAttribute)
  totals
})


#' @rdname size-method
setMethod("size", "DocumentTermMatrix", function(x){
  # Documents are the rows of a DocumentTermMatrix, so the counts in x$v are
  # summed by row index x$i. Aggregating over a factor with one level per row
  # keeps the result aligned with the document names even if a document has no
  # entries at all; grouping on the raw index would drop such documents and
  # make the names mismatch.
  # Assumes the simple_triplet_matrix layout (fields i, j, v, nrow, ncol).
  doc_ids <- seq_len(x$nrow)
  sizes <- tapply(x$v, INDEX = factor(x$i, levels = doc_ids), FUN = sum)
  # Documents without any entry come back as NA from tapply(): their size is 0.
  sizes[!doc_ids %in% x$i] <- 0L
  setNames(sizes, x[["dimnames"]][["Docs"]])
})

#' @rdname TermDocumentMatrix
setMethod("size", "TermDocumentMatrix", function(x){
  # Documents are the columns of a TermDocumentMatrix, so the counts in x$v are
  # summed by column index x$j. Aggregating over a factor with one level per
  # column keeps the result aligned with the document names even if a document
  # has no entries at all; grouping on the raw index would drop such documents
  # and make the names mismatch.
  # Assumes the simple_triplet_matrix layout (fields i, j, v, nrow, ncol).
  doc_ids <- seq_len(x$ncol)
  sizes <- tapply(x$v, INDEX = factor(x$j, levels = doc_ids), FUN = sum)
  # Documents without any entry come back as NA from tapply(): their size is 0.
  sizes[!doc_ids %in% x$j] <- 0L
  setNames(sizes, x[["dimnames"]][["Docs"]])
})
