% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dfm_weight.R
\name{dfm_weight}
\alias{dfm_weight}
\alias{dfm_smooth}
\title{Weight the feature frequencies in a dfm}
\usage{
dfm_weight(
  x,
  scheme = c("count", "prop", "propmax", "logcount", "boolean", "augmented", "logave"),
  weights = NULL,
  base = 10,
  k = 0.5,
  force = FALSE
)

dfm_smooth(x, smoothing = 1)
}
\arguments{
\item{x}{document-feature matrix created by \link{dfm}}

\item{scheme}{a label of the weight type:
\describe{
\item{\code{count}}{\eqn{tf_{ij}}, an integer feature count (default when a dfm is created)}
\item{\code{prop}}{the proportion of the feature counts of total feature
counts (aka relative frequency), calculated as \eqn{tf_{ij} / \sum_j tf_{ij}}}
\item{\code{propmax}}{the proportion of the feature counts of the highest
feature count in a document, \eqn{tf_{ij} / \textrm{max}_j tf_{ij}}}
\item{\code{logcount}}{take 1 + the logarithm of each count, for the
given base, or 0 if the count was zero: \eqn{1 +
  \textrm{log}_{base}(tf_{ij})} if \eqn{tf_{ij} > 0}, or 0 otherwise.}
\item{\code{boolean}}{recode all non-zero counts as 1}
\item{\code{augmented}}{equivalent to \eqn{k + (1 - k) *} \code{dfm_weight(x, "propmax")}}
\item{\code{logave}}{(1 + the log of the counts) / (1 + the log of the average count within document), or
\deqn{\frac{1 + \textrm{log}_{base} tf_{ij}}{1 + \textrm{log}_{base}(\sum_j tf_{ij} / N_i)}}}
}}

\item{weights}{if \code{scheme} is unused, then \code{weights} can be a named
numeric vector of weights to be applied to the dfm, where the names of the
vector correspond to feature labels of the dfm, and the weights will be
applied as multipliers to the existing feature counts for the corresponding
named features.  Any features not named will be assigned a weight of 1.0
(meaning they will be unchanged).}

\item{base}{base for the logarithm when \code{scheme} is \code{"logcount"} or
\code{"logave"}}

\item{k}{the k for the augmentation when \code{scheme = "augmented"}}

\item{force}{logical; if \code{TRUE}, apply weighting scheme even if the dfm
has been weighted before.  This can result in invalid weights, such as
weighting by \code{"prop"} after applying \code{"logcount"}, or after
having grouped a dfm using \code{\link[=dfm_group]{dfm_group()}}.}

\item{smoothing}{constant added to the dfm cells for smoothing, default is 1}
}
\value{
\code{dfm_weight} returns the dfm with weighted values.  Note that
because the default weighting scheme is \code{"count"}, simply calling this
function on an unweighted dfm will return the same object.  Many users will
want the normalized dfm consisting of the proportions of the feature counts
within each document, which requires setting \code{scheme = "prop"}.

\code{dfm_smooth} returns a dfm whose values have been smoothed by
adding the \code{smoothing} amount. Note that this effectively converts a
matrix from sparse to dense format, so may exceed memory requirements
depending on the size of your input matrix.
}
\description{
Weight the feature frequencies in a dfm
}
\examples{
dfmat1 <- dfm(data_corpus_inaugural)

dfmat2 <- dfm_weight(dfmat1, scheme = "prop")
topfeatures(dfmat2)
dfmat3 <- dfm_weight(dfmat1)
topfeatures(dfmat3)
dfmat4 <- dfm_weight(dfmat1, scheme = "logcount")
topfeatures(dfmat4)
dfmat5 <- dfm_weight(dfmat1, scheme = "logave")
topfeatures(dfmat5)

# combine these methods for more complex weightings, e.g. as in Section 6.4
# of Introduction to Information Retrieval
head(dfm_tfidf(dfmat1, scheme_tf = "logcount"))

# apply numeric weights
str <- c("apple is better than banana", "banana banana apple much better")
(dfmat6 <- dfm(str, remove = stopwords("english")))
dfm_weight(dfmat6, weights = c(apple = 5, banana = 3, much = 0.5))

# smooth the dfm
dfmat <- dfm(data_corpus_inaugural)
dfm_smooth(dfmat, 0.5)
}
\references{
Manning, C.D., Raghavan, P., & Schütze, H. (2008).
\emph{An Introduction to Information Retrieval}. Cambridge: Cambridge University Press.
\url{https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf}
}
\seealso{
\code{\link[=docfreq]{docfreq()}}
}
\keyword{dfm}
