% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sim_data.R
\name{sim_data}
\alias{sim_data}
\alias{print.sim_data}
\alias{pool_sim_data}
\title{Simulate Datasets Based on a Model}
\usage{
sim_data(
  nrep = 10,
  ptable = NULL,
  model = NULL,
  pop_es = NULL,
  ...,
  n = 100,
  iseed = NULL,
  number_of_indicators = NULL,
  reliability = NULL,
  x_fun = list(),
  e_fun = list(),
  process_data = NULL,
  parallel = FALSE,
  progress = FALSE,
  ncores = max(1, parallel::detectCores(logical = FALSE) - 1)
)

\method{print}{sim_data}(
  x,
  digits = 3,
  digits_descriptive = 2,
  data_long = TRUE,
  fit_to_all_args = list(),
  est_type = "standardized",
  variances = NULL,
  pure_x = TRUE,
  pure_y = TRUE,
  ...
)

pool_sim_data(object, as_list = FALSE)
}
\arguments{
\item{nrep}{The number of replications
to generate the simulated datasets.
Default is 10.}

\item{ptable}{The output of
\code{\link[=ptable_pop]{ptable_pop()}}, which is a
\code{ptable_pop} object, representing the
population model. If \code{NULL}, the
default, \code{\link[=ptable_pop]{ptable_pop()}} will be
called to generate the \code{ptable_pop}
object, using arguments such as
\code{model} and \code{pop_es}.}

\item{model}{The \code{lavaan} model
syntax of the population model.
Ignored if \code{ptable} is
specified. See \link{ptable_pop} on
how to specify this argument.}

\item{pop_es}{The character to
specify population effect sizes.
See \link{ptable_pop} on
how to specify this argument.
Ignored if \code{ptable} is
specified.}

\item{...}{For \link{sim_data}, parameters
to be passed to \code{\link[=ptable_pop]{ptable_pop()}}. For
\code{\link[=print.sim_data]{print.sim_data()}}, these arguments
are ignored.}

\item{n}{The sample size for each
dataset. Default is 100.}

\item{iseed}{The seed for the random
number generator. Default is \code{NULL}
and the seed is not changed.}

\item{number_of_indicators}{A named
vector to specify the number of
indicators for each factor. See
the help page on how to set this
argument. Default is \code{NULL} and all
variables in the model syntax are
observed variables.
See the help page on how
to use this argument.}

\item{reliability}{A named vector
(for a single-group model) or a
named list of named vectors
(for a multigroup model)
to set the reliability coefficient
of each set of indicators. Default
is \code{NULL}.
See the help page on how
to use this argument.}

\item{x_fun}{The function(s) used to
generate the exogenous variables or
error terms. If
not supplied, or set to \code{list()}, the
default, the variables are generated
from a multivariate normal
distribution. See the help page on how
to use this argument.}

\item{e_fun}{The function(s) used to
generate the error terms of indicators,
if any. If
not supplied, or set to \code{list()}, the
default, the error terms of indicators
are generated
from a multivariate normal
distribution. Specify in the same
way as \code{x_fun}. Refer to the help
page on \code{x_fun} on how to use this
argument.}

\item{process_data}{If not \code{NULL}, it
must be a named list with these
elements: \code{fun} (required), the function
to further process the simulated
data, such as generating missing data using
functions such as \code{\link[mice:ampute]{mice::ampute()}}; \code{args} (optional), a
named list of arguments to be passed
to \code{fun}, except the one for the
source data; \code{sim_data_name} (required) the
name of the argument to receive the
simulated data (e.g., \code{data} for
\code{\link[mice:ampute]{mice::ampute()}}); \code{processed_data_name}
(optional), the name of the data frame
after being processed by \code{fun},
such as the data frame
with missing data in the output of
\code{fun} (e.g., \code{"amp"} for \code{\link[mice:ampute]{mice::ampute()}}),
if omitted, the output of \code{fun} should
be the data frame with missing data.}

\item{parallel}{If \code{TRUE}, parallel
processing will be used to simulate
the datasets. Default is \code{FALSE}.}

\item{progress}{If \code{TRUE}, the progress
of data simulation will be displayed.
Default is \code{FALSE}.}

\item{ncores}{The number of CPU
cores to use if parallel processing
is used.}

\item{x}{The \code{sim_data} object
to be printed.}

\item{digits}{The number of digits
displayed after the decimal.}

\item{digits_descriptive}{The
number of digits displayed after
the decimal for the descriptive
statistics table.}

\item{data_long}{If \code{TRUE}, detailed
information will be printed.}

\item{fit_to_all_args}{A named list
of arguments to be passed to
\code{\link[lavaan:sem]{lavaan::sem()}} when the model is
fitted to a sample combined from
all samples stored.}

\item{est_type}{The type of estimates
to be printed. Can be a character
vector of one to two elements. If
only \code{"standardized"}, then the
standardized estimates are printed.
If only \code{"unstandardized"}, then the
unstandardized estimates are printed.
If a vector like
\code{c("standardized", "unstandardized")},
then both unstandardized and
standardized estimates are printed.}

\item{variances}{Logical. Whether
variances and error variances are printed.
Default depends on \code{est_type}. If
\code{"unstandardized"} is in \code{est_type},
then default is \code{TRUE}. If
only \code{"standardized"} is in \code{est_type},
then default is \code{FALSE}.}

\item{pure_x, pure_y}{Logical. When
printing indirect effects, whether
only "pure" x-variables (variables
not predicted by any other variables)
and/or "pure" y-variables (variables
that do not predict any other variables
other than indicators) will be included
in enumerating the paths.}

\item{object}{Either a \code{sim_data}
object or a \code{power4test} object.
The function extracts the simulated data
and returns them, combined to one
single data frame or, if \code{as_list}
is \code{TRUE}, as a list of data
frames.}

\item{as_list}{Logical. If \code{TRUE},
the simulated datasets are returned as a
list of data frames. If \code{FALSE}, the
default, they are combined and returned
as one single data frame.}
}
\value{
The function \code{\link[=sim_data]{sim_data()}} returns
a list of the class \code{sim_data},
with length \code{nrep}. Each element
is a \code{sim_data_i} object, with
the following major elements:
\itemize{
\item \code{ptable}: A \code{lavaan} parameter
table of the model, with population
values set in the column \code{start}.
(It is the output of the
function \code{\link[=ptable_pop]{ptable_pop()}}.)
\item \code{mm_out}: The population model
represented by model matrices
as in \code{lavaan}. (It is the output
of the function
\code{\link[=model_matrices_pop]{model_matrices_pop()}}.)
\item \code{mm_lm_out}: A list of regression
model formula, one for each
endogenous variable. (It is the
output of the internal function
\code{mm_lm()}.)
\item \code{mm_lm_dat_out}: A simulated dataset
generated from the population model.
(It is the output of the internal
function \code{mm_lm_data()}).
\item \code{model_original}: The original model
syntax (i.e., the argument \code{model}).
\item \code{model_final}: A modified model
syntax if the model is a latent
variable model. Indicators are added
to the syntax.
\item \code{fit0}: The output of \code{\link[lavaan:sem]{lavaan::sem()}}
with \code{ptable} as the model and
\code{do.fit} set to \code{FALSE}. Used for the
easy retrieval of information
about the model.
}

The \code{print} method of \code{sim_data}
returns \code{x} invisibly. It is called for
its side effect.

The function \code{pool_sim_data()} returns
either one data frame or a list
of data frames, depending on the
argument \code{as_list}.
}
\description{
Get a model matrix and
effect size specification and
simulate a number of datasets,
along with other information.

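The function \code{pool_sim_data()}
extracts the simulated datasets and
returns them as one data frame or
as a list of data frames.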
}
\details{
The function \code{\link[=sim_data]{sim_data()}} generates
a list of datasets based on a population
model.
}
\section{The role of \code{sim_data()}}{
The function \code{\link[=sim_data]{sim_data()}} is used by
the all-in-one function
\code{\link[=power4test]{power4test()}}. Users usually do not
call this function directly, though
developers can use this function to
develop other functions for power
analysis, or to build their own
workflows to do the power analysis.
}

\section{Workflow}{
The function \code{\link[=sim_data]{sim_data()}} does two tasks:
\itemize{
\item Determine the actual population
model with population values based
on:
\itemize{
\item A model syntax for the observed
variables (for a path model)
or the latent factors (for a
latent variable model).
\item A textual specification of the
effect sizes of parameters.
\item The number of indicators for
each latent factor if the model
is a latent variable model.
\item The reliability of each latent
factor as measured by the
indicators if the model is a
latent factor model.
}
\item Generate \emph{nrep} simulated datasets
from the population model.
}

The simulated datasets can then be
used to fit a model, test
parameters, and estimate power.

The output is usually used by
\code{\link[=fit_model]{fit_model()}} to fit a target model,
by default the population model, to each
of the datasets.
}

\section{Set 'number_of_indicators' and 'reliability'}{
The arguments \code{number_of_indicators}
and \code{reliability} are used to
specify the number of indicators
(e.g., items) for each factor,
and the population reliability
coefficient of each factor,
if the variables in the model
syntax are latent variables.
\subsection{Single-Group Model}{

If a variable in the model is to be
replaced by indicators in the generated
data, set
\code{number_of_indicators} to a named
numeric vector. The names are the
names of the variables with
indicators, as they appear in the
\code{model} syntax. The value of each
name is the number of indicators.

The
argument \code{reliability} should then be
set to a named numeric vector (or list,
see the section on multigroup models)
to specify the population reliability
coefficient ("omega") of each set of
indicators. The population standardized factor
loadings are then computed to ensure
that the population reliability
coefficient is of the target value.

These are examples for a single group
model:

\preformatted{number_of_indicators = c(m = 3, x = 4, y = 5)}

The numbers of indicators for \code{m},
\code{x}, and \code{y} are 3, 4, and 5,
respectively.

\preformatted{reliability = c(m = .90, x = .80, y = .70)}

The population reliability
coefficients of \code{m}, \code{x}, and \code{y} are
.90, .80, and .70, respectively.
}

\subsection{Multigroup Models}{

Multigroup models are supported.
The number of groups is inferred
from \code{pop_es} (see the help page
of \code{\link[=ptable_pop]{ptable_pop()}}), or directly
from \code{ptable}.

For a multigroup model, the number
of indicators for each variable
must be the same across groups.

However, the population reliability
coefficients can be different
across groups. For a multigroup model
of \emph{k} groups,
with one or more population reliability
coefficients differing across groups,
the argument \code{reliability} should be
set to a named list. The names are
the variables to which the population
reliability coefficients are to be
set. The element for each name is
either a single value for the common
reliability coefficient, or a
numeric vector of the reliability
coefficient of each group.

This is an example of \code{reliability}
for a model with 2 groups:

\preformatted{reliability = list(x = .80, m = c(.70, .80))}

The reliability coefficients of \code{x} are
.80 in all groups, while the
reliability coefficients of \code{m} are
.70 in one group and .80 in another.
}

\subsection{Equal Numbers of Indicators and/or Reliability Coefficients}{

If all variables in the model have
the same number of indicators,
\code{number_of_indicators} can be set
to one single value.

Similarly, if all sets of indicators
have the same population reliability
in all groups, \code{reliability} can also
be set to one single value.
}
}

\section{Specify The Distributions of Exogenous Variables Or Error Terms Using 'x_fun'}{
By default, variables and error
terms are generated
from a multivariate normal distribution.
If desired, users can supply the
function used to generate an exogenous
variable and error term by setting \code{x_fun} to
a named list.

The names of the list are the variables
for which a user function will be used
to generate the data.

Each element of the list must also
be a list. The first element of this
list, which can be unnamed, is the
function to be used. If other arguments
need to be supplied, they should be
included as named elements of this list.

For example:

\preformatted{x_fun = list(x = list(power4mome::rexp_rs),
             w = list(power4mome::rbinary_rs,
                      p1 = .70))}

The variables \code{x} and \code{w} will be
generated by user-supplied functions.

For \code{x}, the function is
\code{power4mome::rexp_rs}. No additional
arguments are passed when calling this function.

For \code{w}, the function is
\code{power4mome::rbinary_rs}. The argument
\code{p1 = .70} will be passed to this
function when generating the values
of \code{w}.

If a variable is an endogenous
variable (e.g., being predicted by
another variable in a model), then
\code{x_fun} is used to generate its
\emph{error term}. Its implied population
distribution may still be different
from that generated by \code{x_fun} because
the distribution also depends on the
distribution of other variables
predicting it.

These are requirements for the
user-functions:
\itemize{
\item They must return a numeric vector.
\item They must have an argument \code{n} for
the number of values.
\item The \emph{population} mean and standard
deviation of the generated values
must be 0 and 1, respectively.
}

The package \code{power4mome} has
helper functions for generating
values from some common nonnormal
distributions and then scaling them
to have population mean and standard
deviation equal to 0 and 1 (by default), respectively.
These are some of them:
\itemize{
\item \code{\link[=rbinary_rs]{rbinary_rs()}}.
\item \code{\link[=rexp_rs]{rexp_rs()}}.
\item \code{\link[=rbeta_rs]{rbeta_rs()}}.
\item \code{\link[=rlnorm_rs]{rlnorm_rs()}}.
\item \code{\link[=rpgnorm_rs]{rpgnorm_rs()}}.
}

To use \code{x_fun}, the variables must
have zero covariances with other
variables in the population. It is
possible to generate nonnormal
multivariate data but we believe this
is rarely needed when estimating
power \emph{before} having the data.
}

\section{Specify the Population Model by 'model'}{
\subsection{Single-Group Model}{

For a single-group model, \code{model}
should be a \code{lavaan} model syntax
string of the \emph{form} of the model.
The population values of the model
parameters are to be determined by
\code{pop_es}.

If the model has latent factors,
the syntax in \code{model} should specify
only the \emph{structural model} for the
\emph{latent factors}. There is no
need to specify the measurement
part. Other functions will generate
the measurement part on top of this
model.

For example, this is a simple mediation
model:

\preformatted{"m ~ x
 y ~ m + x"}

Whether \code{m}, \code{x}, and \code{y} denote
observed variables or latent factors
is determined by other functions,
such as \code{\link[=power4test]{power4test()}}.
}

\subsection{Multigroup Model}{

Because the model is the population
model, equality constraints are
irrelevant and the model syntax
specifies only the \emph{form} of the
model. Therefore, \code{model} is
specified as in the case of single
group models.
}
}

\section{Specify 'pop_es' Using Named Vectors}{
The argument \code{pop_es} is for specifying
the population values of model
parameters. This section describes
how to do this using named vectors.
\subsection{Single-Group Model}{

If \code{pop_es} is specified by a named
vector, it must follow the convention
below.
\itemize{
\item The names of the vectors are
\code{lavaan} names for the selected
parameters. For example, \code{m ~ x}
denotes the path from \code{x} to \code{m}.
\item Alternatively, the names can be
either \code{".beta."} or \code{".cov."}.
Use \code{".beta."} to set the default
values for all regression coefficients.
Use \code{".cov."} to set the default
values for all correlations of
exogenous variables (e.g., predictors).
\item The names can also be of this form:
\code{".ind.(<path>)"}, where \verb{<path>}
denotes a path in the model. For
example, \code{".ind.(x->m->y)"} denotes
the path from \code{x} through \code{m} to
\code{y}. Alternatively, the \code{lavaan}
symbol \code{~} can also be used:
\code{".ind.(y~m~x)"}. This form is used
to set the indirect effect (standardized,
by default) along this path. The
value for this name will override
other settings.
\item If using \code{lavaan} names, users can
specify more than one parameter
using \code{+}. For example, \code{y ~ m + x}
denotes the two paths from \code{m} and
\code{x} to \code{y}.
\item The value of each element can be
the label for the effect size: \code{n}
for nil, \code{s} for small, \code{m} for
medium, and \code{l} for large. The
value for each label is determined
by \code{es1} and \code{es2}. See the section
on specifying these two arguments.
\item The value of \code{pop_es} can also be
set to a value, but it must be
quoted as a string, such as \code{"y ~ x" = ".31"}.
}

This is an example:

\preformatted{c(".beta." = "s",
  "m1 ~ x" = "-m",
  "m2 ~ m1" = "l",
  "y ~ x:w" = "s")}

In this example,
\itemize{
\item All regression coefficients are
set to small (\code{s}) by default, unless
specified otherwise.
\item The path from \code{x} to \code{m1} is
set to medium and negative (\code{-m}).
\item The path from \code{m1} to \code{m2} is set
to large (\code{l}).
\item The coefficient of the product
term \code{x:w} when predicting \code{y} is
set to small (\code{s}).
}
\subsection{Indirect Effect}{

When setting an indirect effect to
a symbol (default: \code{"si"}, \code{"mi"},
\code{"li"}, with \code{"i"} added to differentiate
them from the labels for a direct path),
the corresponding value is used to
determine the population values of
\emph{all} component paths along the pathway.
All the values are assumed to be equal.
Therefore, \code{".ind.(x->m->y)" = ".20"}
is equivalent to setting \code{m ~ x}
and \code{y ~ m} to the square root of
.20, such that the corresponding
indirect effect is equal to the
designated value.

This behavior, though restricted,
is for quick manipulation of the
indirect effect. If different values
are needed along a pathway, set the value for
each path directly.

Only nonnegative values are supported.
Therefore, \code{".ind.(x->m->y)" = "-si"}
and \code{".ind.(x->m->y)" = "-.20"} will
throw an error.
}

}

\subsection{Multigroup Model}{

The argument \code{pop_es} also supports multigroup
models.

For \code{pop_es}, instead of
named vectors, a named \emph{list} of
character vectors should be used.
\itemize{
\item The names are the parameters, or
keywords such as \code{.beta.} and
\code{.cov.}, like specifying the
population values for a single
group model.
\item The elements are character vectors.
If it has only one element (e.g.,
a single string), then it is the
population value for all groups.
If it has more than one element
(e.g., a vector of three strings),
then they are the population values
of the groups. For a model of \emph{k}
groups, each vector must have
either \emph{k} elements or one element.
}

This is an example:

\preformatted{list("m ~ x" = "m",
     "y ~ m" = c("s", "m", "l"))}

In this model, the population value
of the path \code{m ~ x} is medium (\code{m}) for
all groups, while the population
values for the path \code{y ~ m} are
small (\code{s}), medium (\code{m}), and large (\code{l}),
respectively.
}
}

\section{Specify 'pop_es' Using a Multiline String}{
When setting the argument \code{pop_es},
instead of using a named vector
or named list for
\code{pop_es}, the population values of
model parameters can also be
specified using a multiline string,
as illustrated below, to be parsed
by \code{\link[=pop_es_yaml]{pop_es_yaml()}}.
\subsection{Single-Group Model}{

This is an example of the multiline string
for a single-group model:

\preformatted{y ~ m: l
m ~ x: m
y ~ x: nil}

The string must follow this format:
\itemize{
\item Each line starts with \verb{tag:}.
\itemize{
\item \code{tag} can be the name of a
parameter, in \code{lavaan} model
syntax format.
\itemize{
\item For example, \code{m ~ x}
denotes the path from \code{x} to \code{m}.
}
\item A tag in \code{lavaan} model syntax can
specify more than one parameter
using \code{+}.
\itemize{
\item For example, \code{y ~ m + x}
denotes the two paths from \code{m} and
\code{x} to \code{y}.
}
\item Alternatively, the \code{tag} can be
either \code{.beta.} or \code{.cov.}.
\itemize{
\item Use \code{.beta.} to set the default
values for all regression coefficients.
\item Use \code{.cov.} to set the default
values for all correlations of
exogenous variables (e.g., predictors).
}
}
\item After each tag is the
population value:
\itemize{
\item \code{nil} for nil (zero),
\item \code{s} for small,
\item \code{m} for medium, and
\item \code{l} for large.
\item \code{si}, \code{mi}, and \code{li} for
a small, medium, and large
standardized indirect effect,
respectively.
}

Note: \code{n} \emph{cannot} be used in this mode.

The
value for each label is determined
by \code{es1} and \code{es2} as described
in \code{\link[=ptable_pop]{ptable_pop()}}.
\itemize{
\item The value can also be
set to a numeric value, such as
\code{.30} or \code{-.30}.
}
}

This is another example:

\preformatted{.beta.: s
y ~ m: l}

In this example, all regression
coefficients are small (\code{s}), while
the path from \code{m} to \code{y} is large.
}

\subsection{Multigroup Model}{

This is an example of the string
for a multigroup model:

\preformatted{y ~ m: l
m ~ x:
  - nil
  - s
y ~ x: nil}

The format is similar to that for
a single-group model. If a parameter
has the same value for all groups,
then the line can be specified
as in the case of a single-group
model: \code{tag: value}.

If a parameter has different
values across groups, then it must
be in this format:
\itemize{
\item A line starts with the tag, followed
by two or more lines. Each line
starts with a hyphen \code{-} and the
value for a group.
}

For example:

\preformatted{m ~ x:
  - nil
  - s}

This denotes that the model has
two groups. The values of the path
from \code{x} to \code{m} for the two
groups are 0 (\code{nil}) and
small (\code{s}), respectively.

Another equivalent way to specify
the values is to use \verb{[]}, on
the same line as a tag.

For example:

\preformatted{m ~ x: [nil, s]}

The number of groups is inferred
from the number of values for
a parameter. Therefore, if a tag
has more than one value, each tag
must have the same number of values,
or only one value.

The tag \code{.beta.} and \code{.cov.} can
also be used for multigroup models.
}

\subsection{Which Approach To Use}{

Note that using named vectors or
named lists is more reliable. However,
using a multiline string is
more user-friendly. If this method
fails, please use named vectors or
named lists instead.
}

\subsection{Technical Details}{

The multiline string is parsed by \code{\link[yaml:read_yaml]{yaml::read_yaml()}}.
Therefore, the format requirement
is actually that of YAML. Users
knowledgeable of YAML can use other
equivalent ways to specify the string.
}
}

\section{Set the Values for Effect Size Labels ('es1' and 'es2')}{
The vector \code{es1} is for correlations,
regression coefficients, and
indirect effect, and the
vector \code{es2} is for the standardized
moderation effect, the coefficients
of a product term. These labels
are to be used in interpreting
the specification in \code{pop_es}.
}

\examples{

# Specify the model

mod <-
"m ~ x
 y ~ m + x"

# Specify the population values

es <-
"
y ~ m: m
m ~ x: m
y ~ x: nil
"

# Generate the simulated datasets

data_all <- sim_data(nrep = 5,
                     model = mod,
                     pop_es = es,
                     n = 100,
                     iseed = 1234)

data_all

}
\seealso{
\code{\link[=power4test]{power4test()}}
}
