% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_valid_subset.r
\name{get_valid_subset}
\alias{get_valid_subset}
\title{Get the subset that satisfies the missing rate condition}
\usage{
get_valid_subset(
  df,
  row_na_ratio = 0.5,
  col_na_ratio = 0.2,
  row_priority = 1,
  speedup_ratio = 0,
  return_index = FALSE
)
}
\arguments{
\item{df}{A data frame.}

\item{row_na_ratio}{The maximum acceptable missing rate of rows.}

\item{col_na_ratio}{The maximum acceptable missing rate of columns.}

\item{row_priority}{A positive number, the priority given to keeping rows. The higher the value, the higher the priority,
with \code{1} indicating equal priority for rows and columns.}

\item{speedup_ratio}{A non-negative number, the speedup ratio. The higher the value, the greedier the algorithm.}

\item{return_index}{A logical, whether to return only the row and column indices of the subset.}
}
\value{
The subset data frame, or, if \code{return_index} is \code{TRUE}, a list that contains the row and column indices of the subset.
}
\description{
Get the subset of a data frame that satisfies the missing rate condition using a greedy algorithm.
}
\details{
The function is based on a greedy algorithm. It iteratively removes the row or column with
the highest excess missing rate, weighted by the inverse of \code{row_priority}, until the missing rates
of all rows and columns are below the specified thresholds. It then goes back and tries to re-add the removed rows and columns that
do not break the conditions, and finalizes the subset. The result depends heavily on the \code{row_priority} parameter,
so it is recommended to try different \code{row_priority} values to find the most satisfactory one.
}
\examples{
data(cancer, package = "survival")
dim(cancer)
max_missing_rates(cancer)

cancer_valid <- get_valid_subset(cancer, row_na_ratio = 0.2, col_na_ratio = 0.1, row_priority = 1)
dim(cancer_valid)
max_missing_rates(cancer_valid)
}
