% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/smart_mva.R
\name{smart_mva}
\alias{smart_mva}
\title{Smart Multivariate Analyses (wrapper of PCA, PERMANOVA and PERMDISP)}
\arguments{
\item{snp_data}{File name read from working directory.
SNP = rows, samples = columns without row names or column headings.
SNP values must be count data (no decimals allowed).
File extension detected automatically whether text or \code{EIGENSTRAT}.
See details.}

\item{packed_data}{Logical value for \code{EIGENSTRAT}, irrelevant for text data.
Default \code{packed_data = FALSE} assumes uncompressed \code{EIGENSTRAT}.
\code{packed_data = TRUE} for compressed or binary \code{EIGENSTRAT} (\code{PACKEDANCESTRYMAP}).}

\item{sample_group}{Character or numeric vector assigning samples to groups.
Coerced to factor.}

\item{sample_remove}{Logical \code{FALSE} or numeric vector indicating column numbers (samples) to be removed from computations.
Default \code{sample_remove =  FALSE} keeps all samples.}

\item{snp_remove}{Logical \code{FALSE} or numeric vector indicating row numbers (SNPs) to be removed from computations.
Default \code{snp_remove =  FALSE} keeps all SNPs.
See details.}

\item{pca}{Logical indicating if PCA is computed.
Default \code{TRUE}.}

\item{permanova}{Logical indicating if PERMANOVA is computed.
Default \code{TRUE}.}

\item{permdisp}{Logical indicating if PERMDISP is computed.
Default \code{TRUE}.}

\item{missing_value}{Number \code{9} or string \code{NA} indicating missing value.
Default \code{missing_value = 9} as in \code{EIGENSTRAT}.
If no missing values present, no effect on computation.}

\item{missing_impute}{String handling missing values.
Default \code{missing_impute = "mean"} replaces missing values of each SNP by mean of non-missing values across samples.
\code{missing_impute = "remove"} removes SNPs with at least one missing value.
If no missing values present, no effect on computation.}

\item{scaling}{String. Default \code{scaling = "drift"} scales SNPs to control for expected allele frequency dispersion caused by genetic drift (SMARTPCA).
\code{scaling = "center"} for \code{centering} (covariance-based PCA).
\code{scaling = "sd"} for \code{centered} SNPs divided by standard deviation (correlation-based PCA).
\code{scaling = "none"} for no scaling.
See details.}

\item{program_svd}{String indicating R package computing singular value decomposition (SVD).
Default \code{program_svd = "Rspectra"} for \code{\link[RSpectra]{svds}}.
\code{program_svd = "bootSVD"} for \code{\link[bootSVD]{fastSVD}}.
See details.}

\item{sample_project}{Logical \code{FALSE} or numeric vector indicating column numbers (ancient samples) projected onto (modern) PCA space.
Default \code{sample_project = FALSE} implements no projection.
See details.}

\item{pc_project}{Numeric vector indicating the ranks of the PCA axes ancient samples are projected onto. Default \code{pc_project = c(1, 2)} for PCA axes 1 and 2.
If \code{program_svd = "Rspectra"}, \code{length(pc_project)} must be smaller than or equal to \code{pc_axes}.
No effect on computation if no ancient samples are present.}

\item{sample_distance}{Type of inter-sample proximity computed (distance, similarity, dissimilarity).
Default is \code{Euclidean distance}.
See details.}

\item{program_distance}{A string value indicating R package to estimate proximities between pairs of samples.
Default \code{program_distance = "Rfast"} uses function \code{\link[Rfast]{Dist}}; \code{program_distance = "vegan"} uses \code{\link[vegan]{vegdist}}.
See details.}

\item{target_space}{String.
Default \code{target_space = "multidimensional"} applies PERMANOVA and/or PERMDISP to sample-by-sample triangular matrix computed from variable-by-sample data, \code{pc_axes} has no effect on computation. \code{target_space = "pca"} applies PERMANOVA and/or PERMDISP to sample-by-sample data in PCA space, \code{pc_axes} determines number of PCA axes for testing.}

\item{pc_axes}{Number of PCA axes computed always starting with PCA axis 1.
Default \code{pc_axes = 2} computes PCA axes 1 and 2 if \code{target_space = "pca"}.
No effect on computation if \code{target_space = "multidimensional"}.}

\item{pairwise}{Logical.
Default \code{pairwise = FALSE} computes global test. \code{pairwise = TRUE} computes global and pairwise tests.}

\item{pairwise_method}{String specifying type of correction for multiple testing.
Default \code{"holm"}.}

\item{permutation_n}{Number of permutations used to compute the PERMANOVA/PERMDISP test \emph{p} value.
Default \code{9999}.}

\item{permutation_seed}{Number fixing random generator of permutations.
Default \code{1}.}

\item{dispersion_type}{String indicating quantification of group dispersion whether relative to spatial \code{"median"} or \code{"centroid"} in PERMDISP.
Default \code{"median"}.}

\item{samplesize_bias}{Logical. \code{samplesize_bias = TRUE} for dispersion weighted by number of samples per group in PERMDISP.
Default \code{samplesize_bias = FALSE} for no weighting.}
}
\value{
Returns a list containing the following elements:
\describe{
\item{pca.snp_loadings}{Dataframe of principal coefficients of SNPs.
One set of coefficients per PCA axis computed.}
\item{pca.eigenvalues}{Dataframe of eigenvalues, variance and cumulative variance explained.
One eigenvalue per PCA axis computed.}
\item{pca.sample_coordinates}{Dataframe showing PCA sample summary. Column \emph{Group} assigns samples to groups. Column \emph{Class} specifies if samples were "Removed" from PCA or "Projected" onto PCA space.
Sequence of additional columns shows principal components (coordinates) of samples in PCA space (1 column per PCA axis computed, named PC1, PC2, ...).}
\item{test_samples}{Dataframe showing test sample summary.
Column \emph{Group} assigns samples to tested groups.
Column \emph{Class} specifies if samples were used in, or removed from, testing (PERMANOVA and/or PERMDISP).
Column \emph{Sample_dispersion} shows dispersion of individual samples relative to spatial \code{"median"} or \code{"centroid"} used in PERMDISP.}
\item{permanova.global_test}{List showing PERMANOVA table with degrees of freedom, sum of squares, mean sum of squares, \emph{F} statistic, variance explained (\emph{R2}) and \emph{p} value.}
\item{permanova.pairwise_test}{List showing PERMANOVA table with \emph{F} statistic, variance explained (\emph{R2}), \emph{p} value and corrected \emph{p} value per pair of groups.}
\item{permdisp.global_test}{List showing PERMDISP table with degrees of freedom, sum of squares, mean sum of squares, \emph{F} statistic and \emph{p} value.}
\item{permdisp.pairwise_test}{List showing PERMDISP table with \emph{F} statistic, \emph{p} value and corrected \emph{p} value per pair of groups.
Obtained only if \code{pairwise = TRUE}.}
\item{permdisp.bias}{String indicating if PERMDISP dispersion corrected for number of samples per group.}
\item{permdisp.group_location}{Dataframe showing coordinates of spatial \code{"median"} or \code{"centroid"} per group in PERMDISP.}
\item{test.pairwise_correction}{String indicating type of correction for multiple testing in PERMANOVA and/or PERMDISP.}
\item{test.permutation_number}{Number of permutations applied to obtain the distribution of \emph{F} statistic of PERMANOVA and/or PERMDISP.}
\item{test.permutation_seed}{Number fixing random generator of permutations of PERMANOVA and/or PERMDISP for reproducibility of results.}
}
}
\description{
Computes Principal Component Analysis (PCA) for variable x sample genotype data, such as Single Nucleotide Polymorphisms (SNP), in combination with Permutational Multivariate Analysis of Variance (PERMANOVA) and Permutational Multivariate Analysis of Dispersion (PERMDISP).
A wrapper of functions \code{smart_pca}, \code{smart_permanova} and \code{smart_permdisp}.
Genetic markers such as SNPs can be scaled by \code{centering}, z-scores and genetic drift-based dispersion.
The latter follows the SMARTPCA implementation of Patterson, Price and Reich (2006).
Optimized to run fast computation for big datasets.
}
\details{
See details in other functions for conceptualization of PCA (\code{smart_pca}) (Hotelling 1933), SMARTPCA (Patterson, Price and Reich 2006), PERMANOVA (\code{smart_permanova}) (Anderson 2001) and PERMDISP (\code{smart_permdisp}) (Anderson 2006), types of scaling, ancient projection, and correction for multiple testing.\cr

Users can compute any combination of the three analyses by assigning \code{TRUE} or \code{FALSE} to \code{pca} and/or \code{permanova} and/or \code{permdisp}.\cr

PERMANOVA and PERMDISP exclude samples (columns) specified in either \code{sample_remove} or \code{sample_project}.
Projected samples are not used for testing as their PCA coordinates are derived from, and therefore depend on, the coordinates of non-projected samples.\cr

Data read from working directory with SNPs as rows and samples as columns. Two alternative formats: (1) text file of SNPs by samples (file extension and column separators recognized automatically) read using \code{\link[data.table]{fread}}; or (2) duet of \code{EIGENSTRAT} files (see \url{https://reich.hms.harvard.edu/software}) using \code{\link[vroom]{vroom_fwf}}, including a genotype file of SNPs by samples (\code{*.geno}), and a sample file (\code{*.ind}) containing three vectors assigning individual samples to unique user-predefined groups (populations), sexes (or other user-defined descriptor) and alphanumeric identifiers.
For \code{EIGENSTRAT}, vector \code{sample_group} assigns samples to groups retrievable from column 3 of file \code{*.ind}.
SNPs with zero variance removed prior to SVD to optimize computation time and avoid undefined values if \code{scaling = "sd"} or \code{"drift"}.\cr

Users can select subsets of samples or SNPs by introducing a vector including column numbers for samples (\code{sample_remove}) and/or row numbers for SNPs (\code{snp_remove}) to be removed from computations.
Function stops if the final number of SNPs is 1 or 2.
\code{EIGENSOFT} was conceived for the analysis of human genes, so its SMARTPCA suite accepts 22 (autosomal) chromosomes by default.
If >22 chromosomes are provided and the internal parameter \code{numchrom} is not set to the target number of chromosomes of interest, SMARTPCA automatically subsets chromosomes 1 to 22.
In contrast, \code{smart_mva} accepts any number of autosomes with or without the sex chromosomes from an \code{EIGENSTRAT} file.\cr
}
\examples{
# Locate the example genotype matrix "dataSNP" shipped with the package
geno_path <- system.file("extdata", "dataSNP", package = "smartsnp")

# Two groups of 50 samples each (A first, then B), with one plotting color per group
my_groups <- factor(rep(c("A", "B"), each = 50))
cols <- c("red", "blue")

# Run PCA, PERMANOVA and PERMDISP in a single call
mva_res <- smart_mva(snp_data = geno_path, sample_group = my_groups)

# Handles on the PCA results and on the test results
pca_out <- mva_res$pca
test_out <- mva_res$test

# PCA eigenvalues
pca_out$pca.eigenvalues
# Principal coefficients (SNP loadings)
pca_out$pca.snp_loadings
# Principal components (sample position in PCA space)
pca_out$pca.sample_coordinates

# Scatterplot of samples on PCA axes 1 and 2, colored by group
plot(pca_out$pca.sample_coordinates[, c("PC1", "PC2")],
     main = "genotype smartpca", col = cols[my_groups],
     pch = 19, cex = 2)
legend("topleft", legend = levels(my_groups),
       col = cols, text.col = cols, pch = 19, cex = 1)

# PERMANOVA table (global test)
test_out$permanova.global_test

# PERMDISP table (global test)
test_out$permdisp.global_test

# Sample summary and dispersion of individual samples used in PERMDISP
test_out$test_samples
}
\seealso{
\code{\link{smart_pca}},
\code{\link{smart_permanova}},
\code{\link{smart_permdisp}}
}
