% Generated by roxygen2 (4.0.1): do not edit by hand
\name{pre.process}
\alias{pre.center}
\alias{pre.impute.median}
\alias{pre.process}
\alias{pre.scale}
\alias{pre.split}
\title{Data preprocessing}
\usage{
pre.split(x, y, fold)

pre.center(x, y, fold)

pre.scale(x, y, fold, scale = TRUE)

pre.impute.median(x, y, fold)
}
\arguments{
\item{x}{Dataset.}

\item{y}{Response vector.}

\item{fold}{A logical vector with \code{FALSE} for fitting observations,
\code{TRUE} for test observations and \code{NA} for observations not
to be included.}

\item{scale}{Whether to scale each feature to have standard deviation = 1.}
}
\value{
A list with the following components:
\describe{
    \item{\code{fit}}{Fitting set.}
    \item{\code{test}}{Test set.}
    \item{\code{features}}{Logical vector indicating which features were kept
        (\code{TRUE}) and which were discarded (\code{FALSE}). This is only
        set when the pre-processing performs variable selection.}
}
}
\description{
These functions are run in \code{\link{batch.model}} just prior to model
fitting and serve two purposes: 1) they extract fitting and test sets from
the entire dataset, and 2) they can at the same time apply a transformation
to pre-process the data, e.g. handling of missing values, scaling or
compression.
They can also be used to modify the form of the data, if required by the
fitting function, e.g. \code{\link{pre.pamr}}, which transposes the dataset
to make it compatible with the \code{pamr} classification method.
}
\details{
Note that all transformations are defined based on the fitting data only
and then applied to both the fitting set and the test set. It is important
that the test data is not in any way part of the model fitting, including
the pre-processing, since that would risk information leakage and biased
results!

The imputation functions can also be used outside of the resampling scheme,
see \code{\link{impute}}.
}
\examples{
# Custom splitter: keeps only the variables whose class-wise means differ by
# more than `d`
my.split <- function(x, y, fold, d=2){
    # Selectors for the fitting and the test observations
    in.fit <- index.fit(fold)
    in.test <- index.test(fold)

    # Per-class mean of every column, computed on the fitting observations
    # only so that the test set does not influence the selection
    x.by.class <- split(x[in.fit,, drop=FALSE], y[in.fit])
    means <- sapply(x.by.class, function(x.class){
        sapply(x.class, mean, na.rm=TRUE)
    })

    # A feature is kept if its class means span a range wider than `d`
    keep <- apply(means, 1, function(m) diff(range(m))) > d

    # The same feature subset is applied to both the fitting and test sets
    fit.set <- list(x = x[in.fit, keep, drop=FALSE], y = y[in.fit])
    test.set <- list(x = x[in.test, keep, drop=FALSE], y = y[in.test])
    list(fit = fit.set, test = test.set, features = keep)
}

# Use it during modeling: pass the custom splitter as the `pre.process`
# argument
proc <- modeling.procedure("lda")
perf <- evaluate.modeling(proc, x = iris[-5], y = iris$Species,
                          pre.process = my.split)

# Example of how the end user can change the `d` parameter without
# redefining the function: wrap it in an anonymous function that forwards
# all of its arguments to my.split and overrides `d`
perf <- evaluate.modeling(proc, x = iris[-5], y = iris$Species,
                          pre.process = function(...) my.split(..., d = 1.3))
}
\author{
Christofer \enc{Bäcklin}{Backlin}
}
\seealso{
\code{\link{emil}}, \code{\link{pre.impute.knn}}
}

