% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/corpus.R
\name{corpus}
\alias{+.corpus}
\alias{corpus}
\alias{corpus.VCorpus}
\alias{corpus.character}
\alias{corpus.corpusSource}
\alias{is.corpus}
\title{Constructor for corpus objects}
\usage{
corpus(x, ...)

\method{corpus}{character}(x, enc = NULL, docnames = NULL, docvars = NULL,
  source = NULL, notes = NULL, citation = NULL, ...)

\method{corpus}{corpusSource}(x, enc = NULL, notes = NULL,
  citation = NULL, ...)

\method{corpus}{VCorpus}(x, enc = NULL, notes = NULL, citation = NULL,
  ...)

is.corpus(x)

\method{+}{corpus}(c1, c2)
}
\arguments{
\item{x}{a source of texts to form the documents in the corpus: a character
vector, a \link{corpusSource-class} object created using
\code{\link{textfile}}, or a \pkg{tm} \link[tm]{VCorpus} object.}

\item{...}{additional arguments}

\item{enc}{A string specifying the input encoding for texts in the
corpus.  Must be a valid entry in \code{\link{iconvlist}()}, since the code in
\code{corpus.character} will convert the texts from this encoding to \code{UTF-8} using \code{\link{iconv}}.
Currently only one input encoding can be specified for a collection of input texts,
meaning that you should not mix input text encoding types in a single \code{corpus} call.}

\item{docnames}{Names to be assigned to the texts.  Defaults to the names of the
character vector (if any); otherwise assigns "text1", "text2", etc.}

\item{docvars}{A data frame of attributes that is associated with each text.}

\item{source}{A string specifying the source of the texts, used for referencing.}

\item{notes}{A string containing notes about who created the text, warnings, To Dos, etc.}

\item{citation}{Information on how to cite the corpus.}

\item{c1}{corpus one to be added}

\item{c2}{corpus two to be added}
}
\value{
A corpus class object containing the original texts, document-level
  variables, document-level metadata, corpus-level metadata, and default
  settings for subsequent processing of the corpus.  A corpus consists of a
  list of elements described below, although these should only be accessed
  through accessor and replacement functions, not directly (since the
  internals may be subject to change).  The structure of a corpus classed
  list object is:

  \item{$documents}{A data frame containing the document level information,
  consisting of \code{\link{texts}}, user-named \code{\link{docvars}}
  variables describing attributes of the documents, and \code{metadoc}
  document-level metadata whose names begin with an underscore character,
  such as \code{_language}.}

  \item{$metadata}{A named list set of corpus-level meta-data, including
  \code{source} and \code{created} (both generated automatically unless
  assigned), \code{notes}, and \code{citation}.}

  \item{$settings}{Settings for the corpus which record options that govern
  the subsequent processing of the corpus when it is converted into a
  document-feature matrix (\link{dfm}).  See \link{settings}.}

  \item{$tokens}{An indexed list of tokens and types tabulated by document,
  including information on positions.  Not yet fully implemented.}

\code{is.corpus} returns \code{TRUE} if the object is a corpus.
}
\description{
Creates a corpus from a document source.  The currently available
  document sources are: \itemize{ \item a character vector (as in R class
  \code{character}) of texts; \item a \link{corpusSource-class} object,
  constructed using \code{\link{textfile}}; \item a \pkg{tm}
  \link[tm]{VCorpus} class corpus object, meaning that anything you can use
  to create a \pkg{tm} corpus, including all of the tm plugins plus the
  built-in functions of tm for importing pdf, Word, and XML documents, can be
  used to create a quanteda \link{corpus}. } Corpus-level meta-data can be
  specified at creation, containing (for example) citation information and
  notes, as can document-level variables and document-level meta-data.
}
\details{
The \code{+} operator for a corpus object will combine two corpus
  objects, resolving any non-matching \code{\link{docvars}} or
  \code{\link{metadoc}} fields by making them into \code{NA} values for the
  corpus lacking that field.  Corpus-level meta-data is concatenated, except
  for \code{source} and \code{notes}, which are stamped with information
  pertaining to the creation of the new joined corpus.

  There are some issues that need to be addressed in future revisions of
  quanteda concerning the use of factors to store document variables and
  meta-data.  Currently most or all of these are not recorded as factors:
  the \code{\link{data.frame}} calls that are used to create and store the
  document-level information use \code{stringsAsFactors=FALSE}, since the
  texts should always be stored as character vectors and never as factors.
}
\note{
When \code{x} is a \link[tm]{VCorpus} object, the fixed metadata
  fields from that object are imported as document-level metadata. Currently
  no corpus-level metadata is imported, but we will add that soon.
}
\examples{
#
# create a corpus from texts
corpus(inaugTexts)

# create a corpus from texts and assign meta-data and document variables
ukimmigCorpus <- corpus(ukimmigTexts,
                            docvars=data.frame(party=names(ukimmigTexts)),
                            enc="UTF-8")
\donttest{# the fifth column of this csv file is the text field
mytexts <- textfile("http://www.kenbenoit.net/files/text_example.csv", textField=5)
str(mytexts)
mycorp <- corpus(mytexts)
mycorp2 <- corpus(textfile("http://www.kenbenoit.net/files/text_example.csv", textField="Title"))
identical(texts(mycorp), texts(mycorp2))
identical(docvars(mycorp), docvars(mycorp2))}
#
## import a tm VCorpus
if (require(tm)) {
    data(crude)    # load in a tm example VCorpus
    mytmCorpus <- corpus(crude)
    summary(mytmCorpus, showmeta=TRUE)
}
}
\author{
Kenneth Benoit and Paul Nulty
}
\seealso{
\link{docvars}, \link{metadoc}, \link{metacorpus}, \link{language},
  \link{encoding}, \link{settings}, \link{texts}
}

