Using brolgar to understand Mixed Effects Models

library(brolgar)
library(lme4)
library(modelr)
library(ggplot2)

Just as it is important to explore your data before modelling, it is important to explore your data after you fit a model, and during the modelling process.

# Print the `wages` tsibble (not defined in this file; presumably the example
# data shipped with brolgar) to see its keys and columns.
wages
#> # A tsibble: 6,402 x 9 [!]
#> # Key:       id [888]
#>       id ln_wages    xp   ged xp_since_ged black hispanic high_grade
#>    <int>    <dbl> <dbl> <int>        <dbl> <int>    <int>      <int>
#>  1    31     1.49 0.015     1        0.015     0        1          8
#>  2    31     1.43 0.715     1        0.715     0        1          8
#>  3    31     1.47 1.73      1        1.73      0        1          8
#>  4    31     1.75 2.77      1        2.77      0        1          8
#>  5    31     1.93 3.93      1        3.93      0        1          8
#>  6    31     1.71 4.95      1        4.95      0        1          8
#>  7    31     2.09 5.96      1        5.96      0        1          8
#>  8    31     2.13 6.98      1        6.98      0        1          8
#>  9    36     1.98 0.315     1        0.315     0        0          9
#> 10    36     1.80 0.983     1        0.983     0        0          9
#> # ℹ 6,392 more rows
#> # ℹ 1 more variable: unemploy_rate <dbl>

We might explore this by looking at experience against wages, for each individual:


# Base plot: one line per individual (`id`), log wages against experience.
# Stored so that the faceted views below can all build on the same object.
gg_wages_all <- ggplot(
  data = wages,
  mapping = aes(x = xp, y = ln_wages, group = id)
) +
  geom_line(alpha = 0.25)

# Every individual on a single panel.
gg_wages_all

# The same lines, split across panels by facet_sample() with its defaults.
gg_wages_all + facet_sample()

# Split across panels by facet_strata() with its defaults.
gg_wages_all + facet_strata()

# Strata formed along the unemployment rate.
gg_wages_all + facet_strata(along = unemploy_rate)

# Strata formed along experience since GED.
gg_wages_all + facet_strata(along = xp_since_ged)

# One panel per value of high_grade.
gg_wages_all + facet_wrap(vars(high_grade))

So let’s fit a model where we look at the impact of xp, ged, and unemployment rate on log wages, and fit a random intercept and a random slope for xp for each individual.

# lme4 is also attached at the top of the file; attaching it again is a no-op.
library(lme4)

# Mixed effects model for log wages: fixed effects for xp, ged and
# unemploy_rate, plus a random intercept and a random xp slope for each id.
wages_fit_int <- lmer(
  formula = ln_wages ~ xp + ged + unemploy_rate + (xp | id),
  data = wages
)

# modelr is also attached at the top of the file; attaching it again is a no-op.
library(modelr)

# Add the model's predictions (`pred_int`) and residuals (`res_int`) to the
# wages data as new columns.
wages_aug <- add_residuals(
  add_predictions(wages, wages_fit_int, var = "pred_int"),
  wages_fit_int,
  var = "res_int"
)

# Shared base for the prediction plots: predicted log wages against
# experience, one line per individual.
gg_pred_int <- ggplot(
  data = wages_aug,
  mapping = aes(x = xp, y = pred_int, group = id)
)

# All predicted trajectories on one panel.
gg_pred_int +
  geom_line(alpha = 0.4)

# Predicted trajectories split across panels by facet_sample().
gg_pred_int +
  geom_line() +
  facet_sample()

# Predicted trajectories in strata formed along the residuals.
gg_pred_int +
  geom_line() +
  facet_strata(along = res_int)

# Sample keys and compare predictions (lines) with the observed log wages
# (points), one panel per individual.
wages_aug %>%
  sample_n_keys(size = 9) %>%
  ggplot(
    mapping = aes(x = xp, y = pred_int, group = id, colour = factor(id))
  ) +
  geom_line() +
  # x and colour are inherited from the plot-level mapping; only y changes.
  geom_point(mapping = aes(y = ln_wages)) +
  facet_wrap(vars(id)) +
  theme(legend.position = "none")

What if we grabbed a sample of those who have the best, middle, and worst residuals? That is, the individuals whose residuals are closest to these summary values:

# Summary statistics of the model residuals.
summary(wages_aug[["res_int"]])
#>      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#> -1.305694 -0.159728 -0.009445  0.000000  0.136411  2.148938

We can use keys_near() to return those specified keys that are close to these values. Because this is a tsibble object, we don’t need to specify the key variable here.

# Find the keys whose residuals are nearest to the summary values of res_int.
wages_aug_near <- keys_near(wages_aug, var = res_int)

# Print the selected keys and the statistic each one is close to.
wages_aug_near
#> # A tibble: 6 × 5
#>      id  res_int stat  stat_value  stat_diff
#>   <int>    <dbl> <fct>      <dbl>      <dbl>
#> 1   122 -1.31    min     -1.31    0         
#> 2   735  2.15    max      2.15    0         
#> 3  2026 -0.00956 med     -0.00945 0.000115  
#> 4  3961 -0.00933 med     -0.00945 0.000115  
#> 5  5778 -0.160   q_25    -0.160   0.0000113 
#> 6  8246  0.136   q_75     0.136   0.00000363

This shows us the keys whose residuals are closest to the values of the five-number summary.

We can plot this data by joining it back to the wages data with predictions, to see what the spread of predictions is like.

library(dplyr)

# Attach every observation (with predictions) for the selected keys. Both
# tables carry a `res_int` column, so the join is on `id` only.
wages_aug_near_full <- wages_aug_near %>%
  left_join(wages_aug, by = "id")

# Predictions as lines and observed log wages as points, coloured by the
# summary statistic each key is closest to.
gg_wages_near <- ggplot(
  data = wages_aug_near_full,
  mapping = aes(x = xp, y = pred_int, group = id, colour = stat)
) +
  geom_line() +
  geom_point(mapping = aes(y = ln_wages))

# All selected keys on one panel.
gg_wages_near

# One panel per statistic; the legend is redundant with the panel labels.
gg_wages_near +
  facet_wrap(vars(stat)) +
  theme(legend.position = "none")

# Assign keys to 12 strata along res_int, sample keys, and compare predictions
# (lines) with observed log wages (points), one panel per stratum.
# NOTE(review): sample_n_keys(size = 9) appears to draw 9 keys overall rather
# than 9 per stratum, so some strata may have no panel — confirm intended.
wages_aug %>%
  stratify_keys(n_strata = 12, along = res_int) %>%
  sample_n_keys(size = 9) %>%
  ggplot(
    mapping = aes(x = xp, y = pred_int, group = id, colour = factor(id))
  ) +
  geom_line() +
  # x and colour are inherited from the plot-level mapping; only y changes.
  geom_point(mapping = aes(y = ln_wages)) +
  facet_wrap(vars(.strata)) +
  theme(legend.position = "none")