% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/record_group.R
\name{record_group}
\alias{record_group}
\title{Multistage deterministic record linkage}
\usage{
record_group(df, sn = NULL, criteria, sub_criteria = NULL,
  strata = NULL, data_source = NULL, group_stats = FALSE,
  display = TRUE, to_s4 = TRUE)
}
\arguments{
\item{df}{\code{data.frame}. One or more datasets appended together.}

\item{sn}{Unique numerical record identifier. Optional.}

\item{criteria}{Column names of attributes to match. Each \code{criteria} is a stage in the process and the order in which they are listed determines the relevance of matches.}

\item{sub_criteria}{Matching sub-criteria. Additional matching conditions for each stage (\code{criteria}).}

\item{strata}{Subsets of the dataset. Record grouping is done separately within each subset. Multiple columns can be supplied as column names.}

\item{data_source}{Unique dataset identifier. Useful when \code{df} contains data from multiple sources.}

\item{group_stats}{If \code{TRUE}, output will include additional columns with useful stats for each record group.}

\item{display}{If \code{TRUE} (default), a progress message is printed on screen.}

\item{to_s4}{If \code{TRUE} (default), record groups are returned as a \code{\link[=pid-class]{pid}} object.}
}
\value{
\code{\link[=pid-class]{pid}} objects (or a \code{data.frame} if \code{to_s4} is \code{FALSE})

\itemize{
\item \code{sn} - unique record identifier as provided (or generated)
\item \code{pid | .Data} - unique group identifier
\item \code{link_id} - unique record identifier of matching records
\item \code{pid_cri} - matching criteria
\item \code{pid_dataset} - data sources in each group
\item \code{pid_total} - number of records in each group
}
}
\description{
Group matching or partially matching records in multiple stages of relevance using different criteria.
}
\details{
Record grouping occurs in stages of matching \code{criteria}.

Records are matched in two ways: an exact match, i.e. the equivalent of \code{==}, or range matching.
An example of range matching is matching a date give or take 5 days, or matching an age give or take 2 years.
To do this, create the range as a \code{\link{number_line}} object and supply it to the \code{criteria} or \code{sub_criteria} argument.
The actual value within each range must be assigned to the \code{gid} slot of the \code{number_line} object.

A match at each stage is considered more relevant than a match at the next stage.
Therefore, \code{criteria} should be listed in order of decreasing relevance or certainty.

\code{sub_criteria} can be used to force additional matching conditions at each stage.
If \code{sub_criteria} is not \code{NULL}, only records with matching \code{criteria} and \code{sub_criteria} values are grouped together.
If a record has a missing value for the \code{criteria} at a given stage, that record is skipped at that stage, and another attempt is made at the next stage.
If a record has no match at any stage, that record is assigned a unique group ID.

When a \code{data_source} identifier is provided,
\code{pid_dataset} is included in the output. This lists the source of every record in each record group.
}
\examples{
library(diyar)

# A small dataset with repeated forenames
three_people <- data.frame(
  forename = c("Obinna", "James", "Ojay", "James", "Obinna"),
  stringsAsFactors = FALSE
)

# Single-stage grouping on `forename`
three_people$pids_a <- record_group(
  three_people,
  criteria = forename,
  to_s4 = TRUE
)
three_people

# Missing or unknown values should first be recoded to NA or "".
three_people$forename[c(1, 4)] <- NA
three_people$pids_b <- record_group(
  three_people,
  criteria = forename,
  to_s4 = TRUE
)
three_people

data(staff_records)
staff_records

# Range matching
dob <- staff_records["sex"]
dob$age <- c(30, 28, 40, 25, 25, 29, 27)

# Range runs from each age to 20 years above it; the age itself is the `gid`
dob$range_a <- number_line(dob$age, dob$age + 20, gid = dob$age)
dob$pids_a <- record_group(
  dob,
  criteria = sex,
  sub_criteria = list(s1a = "range_a"),
  to_s4 = TRUE
)
dob[c("sex", "age", "range_a", "pids_a")]

# Range runs from 20 years below each age to 20 years above it
dob$range_b <- number_line(dob$age - 20, dob$age + 20, gid = dob$age)
dob$pids_b <- record_group(
  dob,
  criteria = sex,
  sub_criteria = list(s1a = "range_b"),
  to_s4 = TRUE
)
dob[c("sex", "age", "range_b", "pids_b")]

# A range can also be supplied directly as the `criteria`
dob$pids_c <- record_group(
  dob,
  criteria = range_b,
  to_s4 = TRUE
)
dob[c("age", "range_b", "pids_c")]


# Multistage record grouping: `forename` is listed first, then `surname`
staff_records$pids_a <- record_group(
  staff_records,
  sn = r_id,
  criteria = c(forename, surname),
  data_source = sex,
  display = FALSE,
  to_s4 = TRUE
)
staff_records

# Combine `surname` and `sex` into one column and use it as the second
# stage, so that a match at that stage is more certain
staff_records$cri_2 <- paste0(staff_records$surname, "-", staff_records$sex)
staff_records$pids_b <- record_group(
  staff_records,
  sn = r_id,
  criteria = c(forename, cri_2),
  data_source = dataset,
  display = FALSE,
  to_s4 = TRUE
)
staff_records

# Using sub-criteria: extra matching conditions given as a named list of
# column names
data(missing_staff_id)
missing_staff_id

missing_staff_id$pids <- record_group(
  missing_staff_id,
  sn = r_id,
  criteria = c(staff_id, age),
  sub_criteria = list(s2a = c("initials", "hair_colour", "branch_office")),
  data_source = source_1,
  to_s4 = TRUE
)

missing_staff_id
}
\seealso{
\code{\link{episode_group}} and \code{\link{number_line}}
}
