% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/extract.R
\name{extract}
\alias{extract}
\title{Extract pattern matches from text}
\usage{
extract(
  data,
  col_name = "text",
  regex_table,
  pattern_col = "pattern",
  data_return_cols = NULL,
  regex_return_cols = NULL,
  date_col = NULL,
  date_start = NULL,
  date_end = NULL,
  remove_acronyms = FALSE,
  do_clean_text = TRUE,
  verbose = TRUE,
  cl = NULL
)
}
\arguments{
\item{data}{A data frame or character vector containing the text to search.}

\item{col_name}{Name of the column in \code{data} that contains the text to search.}

\item{regex_table}{A regex lookup table with a pattern column.}

\item{pattern_col}{Name of the regex pattern column in \code{regex_table}.}

\item{data_return_cols}{Optional vector of column names from \code{data} to include in the output.}

\item{regex_return_cols}{Optional vector of column names from \code{regex_table} to include in the output.}

\item{date_col}{Optional name of the column in \code{data} used for date filtering.}

\item{date_start}{Optional start date for filtering \code{data} on \code{date_col}.}

\item{date_end}{Optional end date for filtering \code{data} on \code{date_col}.}

\item{remove_acronyms}{Logical; if TRUE, removes all-uppercase patterns from \code{regex_table}.}

\item{do_clean_text}{Logical; if TRUE, applies basic text cleaning to the input before matching.}

\item{verbose}{Logical; if TRUE, displays progress messages.}

\item{cl}{A cluster object created by \code{parallel::makeCluster()}, or an integer indicating the number of child processes to use for parallel evaluation (integer values are ignored on Windows). Passed to \code{\link[pbapply:pbapply]{pbapply::pblapply()}}.}
}
\value{
A tibble (data frame) with columns:
\itemize{
\item \code{row_id}: Integer row identifier corresponding to the input data.
\item Additional columns from \code{data}, if \code{data_return_cols} is specified.
\item Additional columns from \code{regex_table}, if \code{regex_return_cols} is specified.
\item \code{pattern}: The matched regex pattern(s).
\item \code{match}: The specific text extracted from the data (original casing preserved).
}
}
\description{
Uses a regex lookup table to extract matches for \strong{all} patterns found in the text, rather than stopping at the first pattern that matches.
}
\details{
Pattern matching is performed using R's regular expression engine and is
case-insensitive by default. For each input row, the function checks every
pattern in \code{regex_table} and returns the first match of each pattern.

The output contains one row per pattern match per input row. If multiple
patterns match the same text, multiple rows will be returned for that text.
}
\examples{
# Create sample data
data <- data.frame(
  id = 1:3,
  text = c("I love apples", "Bananas are great", "Oranges and apples"),
  stringsAsFactors = FALSE
)

# Create regex patterns
patterns <- data.frame(
  pattern = c("apples", "bananas", "oranges"),
  category = c("fruit", "fruit", "fruit")
)

# Extract matches
extract(data, "text", patterns)
}
