% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sim.R
\name{sim_data}
\alias{sim_data}
\title{Download SIM Mortality Microdata}
\usage{
sim_data(
  year,
  vars = NULL,
  uf = NULL,
  cause = NULL,
  decode_age = TRUE,
  parse = TRUE,
  col_types = NULL,
  cache = TRUE,
  cache_dir = NULL,
  lazy = FALSE,
  backend = c("arrow", "duckdb")
)
}
\arguments{
\item{year}{Integer. Year(s) of the data. Required.}

\item{vars}{Character vector. Variables to keep. If NULL (default),
returns all available variables. Use \code{\link[=sim_variables]{sim_variables()}} to see
available variables.}

\item{uf}{Character. Two-letter state abbreviation(s) to download.
If NULL (default), downloads all 27 federative units (the 26 states and
the Federal District).
Example: \code{"SP"}, \code{c("SP", "RJ")}.}

\item{cause}{Character. ICD-10 (CID-10 in Portuguese) code pattern(s) to
filter by cause of death (\code{CAUSABAS}). Matching is by prefix, so a
partial code selects every code that starts with it.
If NULL (default), returns all causes.
Example: \code{"I21"} (acute myocardial infarction), \code{"C"}
(malignant neoplasms, C00-C97).}

\item{decode_age}{Logical. If TRUE (default), adds a numeric column
\code{age_years} with age in years decoded from the \code{IDADE} variable.}

\item{parse}{Logical. If TRUE (default), converts columns to
appropriate types (integer, double, Date) based on the variable
metadata. Use \code{\link[=sim_variables]{sim_variables()}} to see the target type for each
variable. Set to FALSE for backward-compatible all-character output.}

\item{col_types}{Named list. Override the default type for specific
columns. Names are column names, values are type strings:
\code{"character"}, \code{"integer"}, \code{"double"},
\code{"date_dmy"}, \code{"date_ymd"}, \code{"date_ym"}, \code{"date"}.
Example: \code{list(PESO = "character")} to keep PESO as character.}

\item{cache}{Logical. If TRUE (default), caches downloaded data for
faster future access.}

\item{cache_dir}{Character. Directory for caching. Default:
\code{tools::R_user_dir("healthbR", "cache")}.}

\item{lazy}{Logical. If TRUE, returns a lazy query object instead of a
tibble. Requires the \pkg{arrow} package. The lazy object supports
dplyr verbs (filter, select, mutate, etc.) which are pushed down
to the query engine before collecting into memory. Call
\code{dplyr::collect()} to materialize the result. Default: FALSE.}

\item{backend}{Character. Backend for lazy evaluation: \code{"arrow"}
(default) or \code{"duckdb"}. Only used when \code{lazy = TRUE}.
The DuckDB backend requires the \pkg{duckdb} package.}
}
\value{
A tibble with mortality microdata, or a lazy query object when
\code{lazy = TRUE}. Includes columns \code{year} and \code{uf_source}
to identify the source when multiple years/states are combined.
}
\description{
Downloads and returns mortality microdata from DATASUS FTP.
Each row represents one death record (Declaracao de Obito).
Data is downloaded per state (UF) as compressed .dbc files, decompressed
internally, and returned as a tibble.
}
\details{
Data is downloaded from DATASUS FTP as .dbc files (one per state per year).
The .dbc format is decompressed internally using vendored C code from the
blast library. No external dependencies are required.

When \code{uf} is specified, only the requested state(s) are downloaded,
making the operation much faster than downloading the entire country.
}
\examples{
\dontshow{if (interactive()) withAutoprint(\{ # examplesIf}
# all deaths in Acre, 2022
ac_2022 <- sim_data(year = 2022, uf = "AC")

# deaths from acute myocardial infarction (ICD-10 I21) in Sao Paulo, 2020-2022
infarct_sp <- sim_data(year = 2020:2022, uf = "SP", cause = "I21")

# only key variables, Rio de Janeiro, 2022
sim_data(year = 2022, uf = "RJ",
         vars = c("DTOBITO", "SEXO", "IDADE",
                  "RACACOR", "CODMUNRES", "CAUSABAS"))
\dontshow{\}) # examplesIf}
}
\seealso{
\code{\link[=censo_populacao]{censo_populacao()}} for population denominators to calculate
mortality rates.

Other sim: 
\code{\link{sim_cache_status}()},
\code{\link{sim_clear_cache}()},
\code{\link{sim_dictionary}()},
\code{\link{sim_info}()},
\code{\link{sim_variables}()},
\code{\link{sim_years}()}
}
\concept{sim}
