% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/tokenize.R
\name{tokenize}
\alias{tokenise}
\alias{tokenize}
\alias{tokenize.character}
\alias{tokenize.corpus}
\title{Tokenize a set of texts}
\usage{
tokenize(x, ...)

\method{tokenize}{character}(x, simplify = FALSE, sep = " ", ...)

\method{tokenize}{corpus}(x, ...)
}
\arguments{
\item{x}{The text(s) or corpus to be tokenized}

\item{...}{additional arguments passed to \code{\link{clean}}}

\item{simplify}{If \code{TRUE}, return a character vector of tokens rather
than a list of length \code{\link{ndoc}(x)}, with each element of the
list containing a character vector of the tokens corresponding to that
text.}

\item{sep}{By default, \code{tokenize} expects a "white-space" delimiter
between tokens. Alternatively, \code{sep} can be used to specify another
character that delimits the tokens.}
}
\value{
A list of length \code{\link{ndoc}(x)} of the tokens found in each text or,
if \code{simplify = TRUE}, a character vector of tokens.
}
\description{
Tokenize the texts from a character vector or from a corpus.
}
\examples{
# same for character vectors and for corpus objects
tokensFromChar <- tokenize(inaugTexts[1:3])
tokensFromCorp <- tokenize(subset(inaugCorpus, Year<1798))
identical(tokensFromChar, tokensFromCorp)
str(tokensFromChar)
# returned as a list
head(tokenize(inaugTexts[57])[[1]], 10)
# returned as a character vector using simplify=TRUE
head(tokenize(inaugTexts[57], simplify=TRUE), 10)

# demonstrate some options with clean
head(tokenize(inaugTexts[57], simplify=TRUE, cpp=TRUE), 30)
## NOTE: not the same as
head(tokenize(inaugTexts[57], simplify=TRUE, cpp=FALSE), 30)

## MORE COMPARISONS
tokenize("this is MY <3 4U @myhandle gr8 stuff :-)", removeTwitter=FALSE, cpp=TRUE)
tokenize("this is MY <3 4U @myhandle gr8 stuff :-)", removeTwitter=FALSE, cpp=FALSE)
tokenize("great website http://textasdata.com", removeURL=FALSE, cpp=TRUE)
tokenize("great website http://textasdata.com", removeURL=FALSE, cpp=FALSE)
tokenize("great website http://textasdata.com", removeURL=TRUE, cpp=TRUE)
tokenize("great website http://textasdata.com", removeURL=TRUE, cpp=FALSE)
}
\author{
Kohei Watanabe (C++ code), Ken Benoit, and Paul Nulty
}

