% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/preprocessing.r
\name{pre_process}
\alias{pre_center}
\alias{pre_convert}
\alias{pre_pca}
\alias{pre_process}
\alias{pre_remove}
\alias{pre_remove_constant}
\alias{pre_remove_correlated}
\alias{pre_scale}
\alias{pre_split}
\alias{pre_transpose}
\title{Data preprocessing}
\usage{
pre_split(x, y, fold)

pre_convert(data, x_fun, y_fun, ...)

pre_transpose(data)

pre_remove(data, feature)

pre_center(data, y = FALSE, na.rm = TRUE)

pre_scale(data, y = FALSE, na.rm = TRUE, center = TRUE)

pre_remove_constant(data)

pre_remove_correlated(data, cutoff)

pre_pca(data, ncomponent, scale. = TRUE, ...)
}
\arguments{
\item{x}{Dataset.}

\item{y}{Response vector.}

\item{fold}{A logical or numeric vector with \code{TRUE} or positive numbers
for fitting observations, \code{FALSE} or \code{0} for test
observations, and \code{NA} for observations not to be included.}

\item{data}{Fitting and testing data sets, as returned by
\code{\link{pre_split}}.}

\item{x_fun}{Function to apply to the descriptors of the datasets
(e.g. \code{x}). This function will be applied independently to the fitting
and testing sets.}

\item{y_fun}{Function to be applied to the response of the training and test
sets (independently).}

\item{...}{Sent to internal methods, see the code of each function.}

\item{feature}{The features to be removed. Can be integer, logical or
character.}

\item{na.rm}{A logical value indicating whether \code{NA} values should be
ignored.}

\item{center}{Whether to center the data before scaling.}

\item{cutoff}{See \code{\link[caret]{findCorrelation}}.}

\item{ncomponent}{Number of PCA components to use. If missing, all
components are used.}

\item{scale.}{Sent to \code{\link{prcomp}}.}
}
\value{
A list with the following components:
\describe{
    \item{\code{fit}}{Fitting set.}
    \item{\code{test}}{Test set.}
    \item{\code{feature_selection}}{Logical vector indicating which features were kept
        (\code{TRUE}) and discarded (\code{FALSE}).}
    \item{\code{fold}}{The fold that was used to split the data.}
}
}
\description{
These functions are run in \code{\link{evaluate}} just prior to model
fitting, to extract fitting and test sets from the entire dataset and apply
transformations to pre-process the data (for handling missing values,
scaling, compression etc.).
They can also be used to adapt the form of the data to a specific
fitting function, e.g. \code{\link{pre_pamr}} that transposes the dataset
to make it compatible with the \code{pamr} classification method.
}
\details{
When supplied to \code{\link{evaluate}}, pre-processing functions can be
chained (i.e. executed sequentially) after an initiating call to
\code{\link{pre_split}}.
This can either be done using the \code{\link[=chain]{pipe operator}} defined
in the \pkg{magrittr} package or by putting all pre-processing functions in a
regular list (see the examples).

Note that all transformations are defined based on the fitting data only
and then applied to both the fitting set and the test set. The test data must
never be part of the model fitting in any way, including the pre-processing,
since that would risk information leakage and biased results!

The imputation functions can also be used outside of
\code{\link{evaluate}} by not supplying a fold to
\code{\link{pre_split}}.
See the code of \code{\link{impute_median}} for an example.
}
\examples{
# Setup an example to work on
x <- as.matrix(iris[-5])
x[sample(600, 6)] <- NA
y <- iris$Species
cv <- resample("crossvalidation", y, nrepeat=3, nfold=4)
procedure <- modeling_procedure("lda")

# Simple dataset splitting
sets <- pre_split(x, y, cv[[1]])

# Chaining using the pipe operator
sets <- pre_split(x, y, cv[[1]]) \%>\%
    pre_impute_median \%>\%
    pre_scale

# Integration with `evaluate`
result <- evaluate(procedure, x, y, resample=cv,
    pre_process = function(...){
        pre_split(...) \%>\%
        pre_impute_median \%>\%
        pre_scale
    }
)

# or analogously with a list
result <- evaluate(procedure, x, y, resample=cv,
    pre_process = list(pre_split, pre_impute_median, pre_scale))

# Imputing without splitting
x.imputed <- impute_knn(x)

# Using a whole chain without splitting
x.processed <- pre_split(x, y=NULL) \%>\%
    pre_impute_median \%>\%
    pre_scale \%>\%
    (function(data) data$fit$x)

}
\author{
Christofer \enc{Bäcklin}{Backlin}
}
\seealso{
\code{\link{emil}}, \code{\link{pre_impute_knn}}
}

