\name{run.cluster.matrix}
\alias{run.cluster.matrix}
\title{Identify Equivalent Peaks from Different Subjects}

\description{
Takes the file generated by \code{\link{run.strong.peaks}}, identifies equivalent peaks in each spectrum,
and fills in missing values.
}

\usage{
run.cluster.matrix(pre.align = FALSE, align.method = c("PL",
                   "spline", "affine", "none"), align.fcn = NA,
                   trans.method = c("shiftedlog", "glog", "none"),
                   add.par = 0, subtract.base = FALSE, 
                   lrg.only = TRUE, calc.all.peaks = FALSE, 
                   masses = NA, isotope.dist = 7, 
                   cluster.method = c("ppm", "constant", "usewidth"), 
                   cluster.constant = 10, num.pts = 5, 
                   R2.thresh = 0.98, oneside.min = 1, min.spect = 1,
                   peak.method = c("parabola", "locmaxes"), 
                   bhbysubj = TRUE, covariates, root.dir = ".",
                   base.dir, peak.dir, lrg.dir,
                   lrg.file = "lrg_peaks.RData", overwrite = FALSE,
                   use.par.file = FALSE, par.file = "parameters.RData")
}

\arguments{
    \item{pre.align}{either \code{FALSE}, or a numeric vector of shifts to apply to spectra, or a three-component list (of the form described in the \code{Note} section below) to be used before identifying peaks from different spectra}
    \item{align.method}{alignment algorithm for peaks}
    \item{align.fcn}{function (and inverse) to apply to masses before (and after) applying \code{align.method}; see below}
    \item{trans.method}{type of transformation to use on spectra before statistical analysis}
    \item{add.par}{additive parameter for \code{"shiftedlog"} or \code{"glog"} options for \code{trans.method}}
    \item{subtract.base}{logical; whether to subtract calculated baseline from spectrum}
    \item{lrg.only}{logical; whether to consider only peaks that have at least one \dQuote{large} peak; i.e., identified by \code{run.lrg.peaks}}
    \item{calc.all.peaks}{logical; whether to calculate all possible peaks or only sufficiently large ones}
    \item{masses}{specific masses to test}
    \item{isotope.dist}{maximum distance for declaring isotopes}
    \item{cluster.method}{method for determining when two peaks from different spectra are the same}
    \item{cluster.constant}{parameter used in running \code{cluster.method}}
    \item{num.pts}{number of consecutive points needed for peak fitting}
    \item{R2.thresh}{\eqn{R^2} value needed for peak fitting}
    \item{oneside.min}{minimum number of points on each side of local maximum for peak fitting}
    \item{min.spect}{minimum number of spectra necessary for peak to be used in \code{\link{run.analysis}}}
    \item{peak.method}{method for locating peaks}
    \item{bhbysubj}{logical; whether to look for number of large peaks by subject (i.e., combining replicates) or by spectrum}
    \item{covariates}{data frame with rownames given by raw data files with extensions (e.g., \dQuote{.txt}) stripped;
        only needed if \code{bhbysubj == TRUE}}
    \item{root.dir}{directory for parameters file and raw data}
    \item{base.dir}{directory for baseline files; default is \code{paste(root.dir, "/Baselines", sep = "")}}
    \item{peak.dir}{directory for peak location files; default is \code{paste(root.dir, "/All_Peaks", sep = "")}}
    \item{lrg.dir}{directory for large peaks file; default is \code{paste(root.dir, "/Large_Peaks", sep = "")}}
    \item{lrg.file}{name of file to store large peaks in}
    \item{overwrite}{logical; whether to replace existing files with new ones}
    \item{use.par.file}{logical; if \code{TRUE}, then parameters are read from \code{par.file} in directory \code{root.dir}}
    \item{par.file}{string containing name of parameters file}
}

\details{
Reads in information from the file created by \code{\link{run.strong.peaks}},
calculates the cluster matrix, fills in missing values, and overwrites the file
named \code{lrg.file} in \code{lrg.dir}.  The resulting file contains variables
\tabular{ll}{ \tab \cr
    \code{amps} \tab data frame of amplitudes created by \code{\link{run.strong.peaks}} \cr
    \code{centers} \tab data frame of centers created by \code{\link{run.strong.peaks}} \cr
    \code{clust.mat} \tab data frame with columns given by samples and rows given by the distinct peaks in the samples \cr
    \code{lrg.mat} \tab data frame of same size as \code{clust.mat} with entries given by \code{TRUE} if the peak was large in that spectrum and \code{FALSE} otherwise \cr
    \code{lrg.peaks} \tab the data frame of significant peaks created by \code{\link{run.lrg.peaks}} \cr
    \code{num.lrg} \tab number of subjects (or spectra if \code{bhbysubj == FALSE}) with a large peak at the corresponding mass \cr
}
and is ready to be used by \code{\link{run.analysis}}.
}

\value{
No value returned; the file is simply created.
}

\references{
Barkauskas, D.A. and D.M. Rocke.  (2009a) \dQuote{A general-purpose baseline 
estimation algorithm for spectroscopic data}.  To appear in \emph{Analytica 
Chimica Acta}.  doi:10.1016/j.aca.2009.10.043

Barkauskas, D.A. \emph{et al}. (2009b) \dQuote{Analysis of MALDI FT-ICR mass 
spectrometry data: A time series approach}.  \emph{Analytica Chimica Acta}, 
\bold{648}:2, 207--214.

Barkauskas, D.A. \emph{et al}. (2009c) \dQuote{Detecting glycan cancer 
biomarkers in serum samples using MALDI FT-ICR mass spectrometry data}.  
\emph{Bioinformatics}, \bold{25}:2, 251--257.

Zhang, L.-K. \emph{et al}. (2005) \dQuote{Accurate mass measurements by Fourier 
transform mass spectrometry}.  \emph{Mass Spectrom Rev}, \bold{24}:2, 286--309.
}

\author{Don Barkauskas (\email{barkda@wald.ucdavis.edu})}

\note{
If \code{use.par.file == TRUE} and other parameters are entered into the function
call, then the parameters entered in the function call overwrite those read in
from the file.  Note that this is opposite from the behavior for
\pkg{\link{FTICRMS}} versions 0.7 and earlier.

\code{align.method}, \code{cluster.method}, \code{peak.method}, and
\code{trans.method} can be abbreviated.

If \code{align.fcn} is not \code{NA}, then it should consist of a list with
components \code{fcn} and \code{inv}, each of class \code{function}.
\code{align.fcn$fcn} should take a vector of masses as its argument and return a
vector of transformed masses.  (Typically, this will be transforming masses to
frequencies; see Zhang \emph{et al}. (2005).)  \code{align.fcn$inv} should be the inverse
function of \code{align.fcn$fcn}.

If \code{align.method == "spline"}, then alignment consists of making the
transformed masses of the strong peaks all agree exactly with their means, then
shifting the rest of the transformed masses via an interpolation spline
generated using \code{\link[splines]{interpSpline}}.  If
\code{align.method == "PL"}, then the same is done but interpolation is done
piecewise linearly between the strong peaks.  If
\code{align.method == "affine"}, then the transformed masses of the strong peaks
are aligned to their means using a least-squares affine fit for each spectrum.
In any of these cases, if there are no strong peaks, \code{align.method} is
changed to \code{"none"} with a warning.  If there is exactly one strong peak,
then alignment is by a simple shift in each spectrum on the transformed masses.
If there are exactly two strong peaks, then the alignment is by a simple affine
transformation on the transformed masses in each spectrum.  If
\code{align.method = "spline"} and there are exactly three strong peaks, then
alignment is piecewise affine on the transformed masses (i.e., identical to
\code{align.method = "PL"}).

If \code{align.method = "affine"}, it is strongly recommended that you supply a
value for \code{align.fcn} that makes the data points (approximately)
equally-spaced.

Defining a value for \code{min.spect} can vastly speed up the run time at the
(small) cost of a little flexibility in doing the statistical analysis in
\code{\link{run.analysis}}.  For exploratory data analysis, this should probably
be left alone, but once the peak criterion has been established, further
analyses will go much more quickly with \code{min.spect} re-defined.  The value
can either be an integer, which is interpreted as the number of spectra; or a
number between 0 and 1, in which case it is interpreted as a fraction of the
total number of spectra.  In either case, the values of \code{clust.mat},
\code{lrg.mat}, and \code{num.lrg} saved in \code{lrg.file} are only those
masses which have at least \code{min.spect} large peaks among the spectra.

\code{pre.align = FALSE} is used if the spectra have already been aligned by the
mass spectroscopists.  If it is not \code{FALSE}, it can either be a vector of
additive shifts to be applied to the spectra, or a list with components
\code{targets}, \code{actual}, and \code{align.method}.  In the last case,
\code{targets} is a vector of target masses, and \code{actual} is a matrix with
\code{length(targets)} columns and a row for each spectrum, \code{actual[i,j]}
being the mass in spectrum \code{i} that should be matched exactly to
\code{targets[j]}, with \code{NA} being a valid entry in \code{actual}.  The
alignment is then done as described above for \code{align.method}, depending
on the number of non-missing values in row \code{i}.

Suppose \code{cluster.constant = K} and we have two peaks in different spectra
with masses \eqn{m_1 < m_2}{m[1]<m[2]}.  If \code{cluster.method == "constant"},
then the peaks are considered to be the same peak if we have
\eqn{m_{2}-m_{1} < K}{m[2]-m[1] < K}.  If \code{cluster.method == "ppm"}, then
the peaks are considered to be the same peak if we have
\eqn{m_{2}-m_{1} < Km_{2}/10^{6}}{m[2]-m[1] < K * m[2] * 1e-6}.  If
\code{cluster.method == "usewidth"}, then the algorithm uses the observation that
\code{log(Width_hat)} and \code{log(Center_hat)} appear to be linearly related.
Tolerances are computed using this relationship.
}

\seealso{
\code{\link{run.lrg.peaks}}, \code{\link{run.strong.peaks}},
\code{\link[splines]{interpSpline}}
}

\examples{}
