% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/phrases.R
\docType{methods}
\name{phrasetotoken}
\alias{phrasetotoken}
\alias{phrasetotoken,corpus,ANY-method}
\alias{phrasetotoken,textORtokens,dictionary-method}
\alias{phrasetotoken,textORtokens,collocations-method}
\alias{phrasetotoken,character,character-method}
\alias{phrasetotoken,tokenizedTexts,character-method}
\title{convert phrases into single tokens}
\usage{
phrasetotoken(object, phrases, ...)

\S4method{phrasetotoken}{corpus,ANY}(object, phrases, ...)

\S4method{phrasetotoken}{textORtokens,dictionary}(object, phrases, ...)

\S4method{phrasetotoken}{textORtokens,collocations}(object, phrases, ...)

\S4method{phrasetotoken}{character,character}(object, phrases,
  concatenator = "_", valuetype = c("glob", "regex", "fixed"),
  case_insensitive = TRUE, ...)

\S4method{phrasetotoken}{tokenizedTexts,character}(object, phrases,
  concatenator = "_", valuetype = c("glob", "regex", "fixed"),
  case_insensitive = TRUE, ...)
}
\arguments{
\item{object}{source texts: a character vector, a corpus, or a tokenizedTexts object}

\item{phrases}{a \code{\link{dictionary}} object that 
contains some phrases, defined as multiple words delimited by whitespace, 
up to 9 words long; a quanteda collocation object created
by \code{\link{collocations}}; or a character vector of phrases}

\item{...}{additional arguments passed through to the core \code{"character,character"} method}

\item{concatenator}{the concatenation character that will connect the words 
making up the multi-word phrases.  The default \code{_} is highly 
recommended since it will not be removed during normal cleaning and 
tokenization (while nearly all other punctuation characters, at least those
in the Unicode punctuation class [P], will be removed).}

\item{valuetype}{how to interpret keyword expressions: \code{"glob"} for 
"glob"-style wildcard expressions; \code{"regex"} for regular expressions;
or \code{"fixed"} for exact matching. See \link{valuetype} for details.}

\item{case_insensitive}{if \code{TRUE}, ignore case when matching}
}
\value{
character or character vector of texts with phrases replaced by 
  compound "words" joined by the concatenator
}
\description{
Replace multi-word phrases in text(s) with a compound version of the phrases 
concatenated with \code{concatenator} (by default, the "\code{_}" character) to
form a single token.  This prevents tokenization of the phrases during 
subsequent processing by eliminating the whitespace delimiter.
}
\examples{
\dontrun{
mytexts <- c("The new law included a capital gains tax, and an inheritance tax.",
             "New York City has raised a taxes: an income tax and a sales tax.")
mydict <- dictionary(list(tax=c("tax", "income tax", "capital gains tax", "inheritance tax")))
(cw <- phrasetotoken(mytexts, mydict))
dfm(cw, verbose=FALSE)

# when used as a dictionary for dfm creation
mydfm2 <- dfm(cw, dictionary = dictionary(lapply(mydict, function(x) gsub(" ", "_", x))))
mydfm2

# to pick up "taxes" in the second text, set valuetype = "regex"
mydfm3 <- dfm(cw, dictionary = dictionary(lapply(mydict, phrasetotoken, mydict)),
              valuetype = "regex")
mydfm3
## one more token counted for "tax" than before
}
# using a dictionary to pre-process multi-word expressions
myDict <- dictionary(list(negative = c("bad* word*", "negative", "awful text"),
                          positive = c("good stuff", "like? th??")))
txt <- c("I liked this, when we can use bad words, in awful text.",
         "Some damn good stuff, like the text, she likes that too.")
phrasetotoken(txt, myDict)

# on simple text
\donttest{
phrasetotoken("This is a simpler version of multi word expressions.", "multi word expression*")
}
\donttest{
# on tokenized text
toks <- tokenize("Simon sez the multi word expression plural is multi word expressions, Simon sez.")
phrases <- c("multi word expression*", "Simon sez")
phrasetotoken(toks, phrases)
}
}
\author{
Kenneth Benoit
}
\keyword{deprecated}
\keyword{internal}
