---
title: "LFS"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{LFS}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = nzchar(Sys.getenv("COMPILE_VIG_CANPUMF"))
)
```

```{r setup}
library(dplyr)
library(tidyr)
library(ggplot2)
library(canpumf)
options(canpumf.cache_path = Sys.getenv("COMPILE_VIG_CANPUMF"))
```

The LFS is one of the most-used PUMF series, since January 2021 the LFS PUMF is now easily available for direct download instead of needing to request it via EFT. This makes it very easy to integrate the LFS into reproducible workflows.

The `canpumf` package has two functions to facilitate access to the LFS PUMF. The first lists all LFS pumf versions that are available for direct download.

```{r}
list_canpumf_collection() |> 
  filter(Acronym=="LFS")
```

The second one fetches and loads the LFS data. For example, to download the LFS pumf for 2022 we use

```{r}
lfs_2022 <- get_pumf("LFS","2022")

lfs_2022 |>
  select(1:5) |>
  head(10)
```


By default the data is stored in the temporary session path, generally we want to make sure that the `canpumf.cache_path` option is set to a path to permanently cache the data.

Values come labelled, but columns are not. People working regularly with the LFS data will likely want to keep the short default column names, they can be converted to human readable column labels using `label_pumf_columns` function.

```{r}
lfs_2022 <- lfs_2022 |> label_pumf_columns()
```

With this we can do some simple descriptive analysis. We could use the `add_bootstrap_weights` function to add bootstrap weights if desired. We focus in on February 2022 and add bootstrap weights. By default this adds 500 weights, this will take time to generate for the full LFS sample. For LFS data the bootstrap weight generation will automatically stratify the generation by year and month. Here we shortcut this by only generating bootstrap weights for the February 2022 data after calling collect, so they just get generated in memory for the resulting tibble.

```{r}
lfs_2022_02_data <- lfs_2022 |> 
  filter(`Survey month`==2) |>
  collect() |>
  add_bootstrap_weights(weight_col = "Standard final weight", seed = 42)
```

For this vignette we look at gender-specific labour force status statistics for the 20 to 64 year old population, computing age-adjusted rates to even out age-specific effects.

```{r}
data <- lfs_2022_02_data |>
  filter(substr(`Five-year age group of respondent`,0,2) %in% seq(20,60,5)) |>
  filter(`Labour force status`!="Not in labour force") |>
  summarise(across(matches("Standard final weight|CPBSW\\d+"),sum),
            .by=c(`Labour force status`,`Five-year age group of respondent`,`Gender of respondent`,
                  `Marital status of respondent`)) |>
  pivot_longer(matches("Standard final weight|CPBSW\\d+"),names_to="Weight",values_to="Count") |>
  group_by(`Five-year age group of respondent`,`Gender of respondent`,
           `Marital status of respondent`, Weight) |>
  mutate(Share=ifelse(Count==0,0,Count/sum(Count))) |>
  ungroup()

data_age_adjusted <- data %>%
  left_join((.) |> 
              summarize(Count=sum(Count),
                        .by=c(`Five-year age group of respondent`,`Gender of respondent`,Weight)) |>
              mutate(P_age__gender=Count/sum(Count),
                     .by=c(`Gender of respondent`,Weight)) |>
              select(`Gender of respondent`,`Five-year age group of respondent`,Weight,P_age__gender),
            by=c("Gender of respondent","Five-year age group of respondent","Weight")) |>
  summarise(age_adjusted=sum(Share*P_age__gender),
            .by=c(`Gender of respondent`,`Labour force status`,`Marital status of respondent`, Weight))
  
data_age_adjusted |>
  filter(`Labour force status`=="Unemployed") |>
  ggplot(aes(x=age_adjusted, y=`Marital status of respondent`, fill=`Gender of respondent`)) +
  geom_boxplot() +
  geom_point(shape=21,data=~filter(.,Weight=="Standard final weight"),position=position_dodge(width=0.75)) +
  scale_x_continuous(labels=scales::percent) +
  labs(title="Unemployment rates of 20 to 64 year olds in February 2022",
       x="Age-adjusted unemployment rate",
       caption="StatCan LFS PUMF 2022-02")
```

We can similarly compute the age-adjusted participation rate by gender and marital status.

```{r}
data2 <- lfs_2022_02_data |>
  filter(substr(`Five-year age group of respondent`,0,2) %in% seq(20,60,5)) |>
  summarise(across(matches("Standard final weight|CPBSW\\d+"),sum),
            .by=c(`Labour force status`, `Five-year age group of respondent`,
                  `Gender of respondent`, `Marital status of respondent`)) |>
  pivot_longer(matches("Standard final weight|CPBSW\\d+"),names_to="Weight",values_to="Count") |>
  mutate(Share=ifelse(Count==0,0,Count/sum(Count)),
         .by=c(`Five-year age group of respondent`,`Gender of respondent`,
               `Marital status of respondent`, Weight)) 

data_age_adjusted2 <- data2 %>%
  left_join((.) |> 
              summarize(Count=sum(Count),
                        .by=c(`Five-year age group of respondent`,`Gender of respondent`,Weight)) |>
              mutate(P_age__sex=Count/sum(Count),
                     .by=c(`Gender of respondent`,Weight)) |>
              select(`Gender of respondent`,`Five-year age group of respondent`,Weight,P_age__sex),
            by=c("Gender of respondent","Five-year age group of respondent","Weight")) |>
  summarise(age_adjusted=sum(Share*P_age__sex),
            .by=c(`Gender of respondent`,`Labour force status`,`Marital status of respondent`, Weight))
  
data_age_adjusted2 |>
  filter(`Labour force status`=="Not in labour force") |>
  ggplot(aes(x=1-age_adjusted, y=`Marital status of respondent`, fill=`Gender of respondent`)) +
  geom_boxplot() +
  geom_point(shape=21,data=~filter(.,Weight=="Standard final weight"),position=position_dodge(width=0.75)) +
  scale_x_continuous(labels=scales::percent) +
  labs(title="Labour force participation rates of 20 to 64 year olds in February 2022",
       x="Age-adjusted participation rate",
       caption="StatCan LFS PUMF 2022-02")
```

Narrowing it down a bit to only look at the share of the population employed and at work in February 2022 drops these shares a bit.

```{r}
data_age_adjusted2 |>
  filter(`Labour force status`=="Employed, at work") |>
  ggplot(aes(x=age_adjusted, y=`Marital status of respondent`, fill=`Gender of respondent`)) +
  geom_boxplot() +
  geom_point(shape=21,data=~filter(.,Weight=="Standard final weight"),position=position_dodge(width=0.75)) +
  scale_x_continuous(labels=scales::percent) +
  labs(title="Share of 20 to 64 year olds working in February 2022",
       x="Age-adjusted share at work",
       caption="StatCan LFS PUMF 2022-02")
```

It's good practice to close the database connection after being done with a specific task.

```{r}
lfs_2022 |> close_pumf()
```

Derived connections, like the one to the February 2022 table, will automatically be closed too.

## Timelines

LFS data can also easily be accessed across time.

```{r}
lfs_pumf <- get_pumf("LFS", refresh="auto")
```


We can now easily extract time series data, we want to perform as many operations as possible at the database level. There are several convenience functions when working with the LFS data, one is `add_lfs_SURVDATE` which adds a `SURVDATE` column based on the survey year and month.

```{r}
unemployment_stats <- lfs_pumf |> 
  filter(LFSSTAT !="Not in labour force") |>
  filter(AGE_12 %in% c("25 to 29 years","30 to 34 years", "35 to 39 years")) |>
  mutate(jd=case_when(is.na(DURJLESS) ~ "Not applicable",
                      DURJLESS<12 ~ "Less than one year",
                      TRUE ~ "One year or more")) |>
  add_lfs_SURVDATE() |>
  summarize(Count=sum(FINALWT),.by=c(SURVDATE,jd,AGE_12)) |>
  mutate(Share=Count/sum(Count),.by=c(SURVDATE,AGE_12)) |>
  filter(jd!="Not applicable")


unemployment_stats |>
  ggplot(aes(x=SURVDATE,y=Share,colour=AGE_12)) +
  geom_line() +
  facet_wrap(~jd) +
  scale_y_continuous(labels=scales::percent_format()) +
  labs(title="Unemployment by duration of unemployment",
       y="Unemployment rate",x=NULL,
       colour="Age group",
       caption="StatCan LFS (PUMF)")
```

Before plotting we could call `collect`, but this does not need to be done explicitly. 

Because the data is efficiently organised in DuckDB, this query runs quite fast despite no explicit indexing of the database, taking less than half a second.

```{r}
microbenchmark::microbenchmark(collect(unemployment_stats)) |> 
  boxplot()
```


The SEX variable has been recategorized into the GENDER concept starting in 2011, older LFS PUMF data still uses SEX. We can harmonize this by coalescing the values to create a new `GENDER_SEX` column as done by the convenience function `add_lfs_GENDER_SEX`. 


```{r}
lfs_pumf |> 
  filter(LFSSTAT !="Not in labour force") |>
  add_lfs_SURVDATE() |>
  add_lfs_GENDER_SEX() |>
  summarise(Count=sum(FINALWT),.by=c(SURVDATE,LFSSTAT,GENDER_SEX)) |>
  mutate(Share=Count/sum(Count),.by=c(SURVDATE,GENDER_SEX)) |>
  filter(LFSSTAT=="Unemployed") |>
  ggplot(aes(x=SURVDATE,y=Share,colour=GENDER_SEX)) +
  geom_line() +
  scale_y_continuous(labels=scales::percent_format()) +
  labs(title="Unemployment sex/gender",
       y="Unemployment rate",x=NULL,
       colour="Gender",
       caption="StatCan LFS (PUMF)")
```

```{r}
lfs_pumf |> close_pumf()
```