% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/partition.R
\name{partition}
\alias{partition}
\title{Create balanced partitions}
\usage{
partition(
  data,
  p = 0.2,
  cat_col = NULL,
  num_col = NULL,
  id_col = NULL,
  id_aggregation_fn = sum,
  extreme_pairing_levels = 1,
  force_equal = FALSE,
  list_out = TRUE
)
}
\arguments{
\item{data}{Data frame.}

\item{p}{List or vector of partition sizes.
Given as whole number(s) and/or percentage(s) (\code{0} < \code{p} < \code{1}).
E.g. \code{c(0.2, 3, 0.1)}.}

\item{cat_col}{Name of categorical variable to balance between partitions.

E.g. when training and testing a model for predicting a binary variable (a or b),
we usually want both classes represented in both the training set and the test set.

N.B. If also passing an \code{id_col}, \code{cat_col} should be constant within each ID.}

\item{num_col}{Name of numerical variable to balance between partitions.

N.B. When used with \code{id_col}, values in \code{num_col} for each ID are
aggregated using \code{id_aggregation_fn} before being balanced.}

\item{id_col}{Name of factor with IDs. Used to keep all rows that share an ID in
the same partition (if possible).

E.g. If we have measured a participant multiple times and want to see the
effect of time, we want to have all observations of this participant in
the same partition.}

\item{id_aggregation_fn}{Function for aggregating values in \code{num_col} for each ID,
before balancing \code{num_col}.

N.B. Only used when \code{num_col} and \code{id_col} are both specified.}

\item{extreme_pairing_levels}{How many levels of extreme pairing to do
when balancing partitions by a numerical column (i.e. \code{num_col} is specified).

\strong{Extreme pairing}: Rows/pairs are ordered as smallest, largest,
second smallest, second largest, etc. If \code{extreme_pairing_levels > 1},
this is done "recursively" on the extreme pairs. See \code{"Details/num_col"} for more.

N.B. Larger values work best with large datasets. If set too high,
the result might not be stochastic. Always check if an increase
actually makes the partitions more balanced. See example.}

\item{force_equal}{Discard excess data. (Logical)}

\item{list_out}{Return partitions in a list. (Logical)}
}
\value{
If \code{list_out} is \code{TRUE}:

A list of partitions where partitions are data frames.

If \code{list_out} is \code{FALSE}:

A data frame with grouping factor for subsetting.
}
\description{
\Sexpr[results=rd, stage=render]{lifecycle::badge("stable")}

Splits data into partitions.
Balances a given categorical variable and/or numerical variable between partitions and keeps (if possible)
all data points with a shared ID (e.g. participant_id) in the same partition.
}
\details{
\subsection{cat_col}{
\enumerate{
\item Data is subset by \code{cat_col}.
\item Subsets are partitioned and merged.
}
}

\subsection{id_col}{
\enumerate{
\item Partitions are created from unique IDs.
}
}

\subsection{num_col}{
\enumerate{
\item Rows are shuffled.
\strong{Note} that this will only affect rows with the same value in \code{num_col}.
\item Extreme pairing 1: Rows are ordered as smallest, largest, second smallest, second largest, etc.
Each pair gets a group identifier.
\item If \code{extreme_pairing_levels > 1}: The group identifiers are reordered as smallest,
largest, second smallest, second largest, etc., by the sum of \code{num_col} in the represented rows.
These pairs (of pairs) get a new set of group identifiers, and the process is repeated
\code{extreme_pairing_levels-2} times. Note that the group identifiers at the last level will represent
\code{2^extreme_pairing_levels} rows, which is why you should be careful when choosing that setting.
\item The final group identifiers are shuffled, and their order is applied to the full dataset.
\item The ordered dataset is split by the sizes in \code{p}.
}

N.B. When doing extreme pairing of an odd number of rows,
the row with the largest value is placed in a group by itself, and the order is instead:
smallest, second largest, second smallest, third largest, ... , largest.
}

\subsection{cat_col AND id_col}{
\enumerate{
\item Data is subset by \code{cat_col}.
\item Partitions are created from unique IDs in each subset.
\item Subsets are merged.
}
}

\subsection{cat_col AND num_col}{
\enumerate{
\item Data is subset by \code{cat_col}.
\item Subsets are partitioned by \code{num_col}.
\item Subsets are merged.
}
}

\subsection{num_col AND id_col}{
\enumerate{
\item Values in \code{num_col} are aggregated for each ID, using \code{id_aggregation_fn}.
\item The IDs are partitioned, using the aggregated values as "\code{num_col}".
\item The partition identifiers are transferred to the rows of the IDs.
}
}

\subsection{cat_col AND num_col AND id_col}{
\enumerate{
\item Values in \code{num_col} are aggregated for each ID, using \code{id_aggregation_fn}.
\item IDs are subset by \code{cat_col}.
\item The IDs for each subset are partitioned,
by using the aggregated values as "\code{num_col}".
\item The partition identifiers are transferred to the rows of the IDs.
}
}
}
\examples{
# Attach packages
library(groupdata2)
library(dplyr)

# Create data frame
df <- data.frame(
  "participant" = factor(rep(c("1", "2", "3", "4", "5", "6"), 3)),
  "age" = rep(sample(c(1:100), 6), 3),
  "diagnosis" = factor(rep(c("a", "b", "a", "a", "b", "b"), 3)),
  "score" = sample(c(1:100), 3 * 6)
)
df <- df \%>\% arrange(participant)
df$session <- rep(c("1", "2", "3"), 6)

# Using partition()

# Without balancing
partitions <- partition(data = df, p = c(0.2, 0.3))

# With cat_col
partitions <- partition(data = df, p = 0.5, cat_col = "diagnosis")

# With id_col
partitions <- partition(data = df, p = 0.5, id_col = "participant")

# With num_col
partitions <- partition(data = df, p = 0.5, num_col = "score")

# With cat_col and id_col
partitions <- partition(
  data = df,
  p = 0.5,
  cat_col = "diagnosis",
  id_col = "participant"
)

# With cat_col, num_col and id_col
partitions <- partition(
  data = df,
  p = 0.5,
  cat_col = "diagnosis",
  num_col = "score",
  id_col = "participant"
)

# Return data frame with grouping factor
# with list_out = FALSE
partitions <- partition(df, c(0.5), list_out = FALSE)

# Check if additional extreme_pairing_levels
# improve the numerical balance
set.seed(2) # try with seed 1 as well
partitions_1 <- partition(
  data = df,
  p = 0.5,
  num_col = "score",
  extreme_pairing_levels = 1,
  list_out = FALSE
)
partitions_1 \%>\%
  dplyr::group_by(.partitions) \%>\%
  dplyr::summarise(
    sum_score = sum(score),
    mean_score = mean(score)
  )
set.seed(2) # try with seed 1 as well
partitions_2 <- partition(
  data = df,
  p = 0.5,
  num_col = "score",
  extreme_pairing_levels = 2,
  list_out = FALSE
)
partitions_2 \%>\%
  dplyr::group_by(.partitions) \%>\%
  dplyr::summarise(
    sum_score = sum(score),
    mean_score = mean(score)
  )
}
\seealso{
Other grouping functions: 
\code{\link{all_groups_identical}()},
\code{\link{fold}()},
\code{\link{group_factor}()},
\code{\link{group}()},
\code{\link{splt}()}
}
\author{
Ludvig Renbo Olsen, \email{r-pkgs@ludvigolsen.dk}
}
\concept{grouping functions}
