Many small models – the Secret Weapon

how to fit and visualize small multiples

Allen and Andrew true
2022-06-10
substrate <- c("soil", "onion", "radish", "broccoli", "soy", "corn")

genotypes <- c("H", "N")

library(tidyverse)


treatment <- expand_grid(substrate, genotypes)


fly_ids <- paste0("fly", 1:(30*12))

# assign flies to treatments

trt_id <- rep(1:12, times = 30)

design_matrix <- tibble(fly_ids, trt_id) |> 
  rowwise() |> 
  mutate(trt = list(treatment[trt_id,])) |> 
  unnest("trt")

obs_flies <- design_matrix |> 
  rowwise() |> 
  mutate(time = list(c("t1", "t2", "t3"))) |> 
  unnest(time) |> 
  mutate(obs = rnorm(length(fly_ids), mean = 40, sd = sqrt(40)))

# faster simpler 
expand_grid(genotype = c("H", "N"),
            substrate = c("control", "onion", "radish", "broco", "soy", "corn"),
            flies = 1:30,
            time = c("4", "24", "48")) %>%
  group_by(genotype, substrate) %>%
  mutate(fly_id = cur_group_id()) %>%
  ungroup()
# A tibble: 1,080 x 5
   genotype substrate flies time  fly_id
   <chr>    <chr>     <int> <chr>  <int>
 1 H        control       1 4          2
 2 H        control       1 24         2
 3 H        control       1 48         2
 4 H        control       2 4          2
 5 H        control       2 24         2
 6 H        control       2 48         2
 7 H        control       3 4          2
 8 H        control       3 24         2
 9 H        control       3 48         2
10 H        control       4 4          2
# ... with 1,070 more rows

Fit many small models to each one!

fly_model_df <- obs_flies |> 
  mutate(time = factor(time, levels = c("t3", "t1", "t2"))) |> 
  nest_by(substrate, genotypes, .key = "flydata")

fly_model_coefs <- fly_model_df |> 
  # fit models
  mutate(model = list(lm(obs ~ time, data = flydata))) |> 
  # mutate(coef = list(coef(model))) # there's a cooler way! 
  mutate(tb = list(broom::tidy(model))) |> 
  unnest(cols = c("tb"))

fly_model_coefs |> 
  filter(term == "(Intercept)") |> 
  ggplot(aes(y = estimate, x = substrate, col = genotypes, ymin = estimate - std.error, ymax = estimate + std.error)) + 
  geom_pointrange(position = position_dodge(width = .1))

could do it again for another kind of preference: the contrast between t1 and t3

fly_model_coefs |> 
  filter(term == "timet1") |> 
  ggplot(aes(y = estimate, x = substrate, col = genotypes, 
             ymin = estimate - std.error, ymax = estimate + std.error)) + 
  geom_pointrange(position = position_dodge(width = .1))

what about for a zero-inflated model?

library(glmmTMB)

obs_flies_zi <- obs_flies |> 
  mutate(time = factor(time, levels = c("t3", "t1", "t2")),
         obs = if_else(runif(1080)<.1, true = 0, false = obs)) 


obs_flies_zi |> 
  nest_by(substrate, genotypes, .key = "flydata") |> 
  mutate(zi_mod = list(glmmTMB(obs ~ time, ziformula = ~time, data = flydata)))
# A tibble: 12 x 4
# Rowwise:  substrate, genotypes
   substrate genotypes            flydata zi_mod   
   <chr>     <chr>     <list<tibble[,4]>> <list>   
 1 broccoli  H                   [90 x 4] <glmmTMB>
 2 broccoli  N                   [90 x 4] <glmmTMB>
 3 corn      H                   [90 x 4] <glmmTMB>
 4 corn      N                   [90 x 4] <glmmTMB>
 5 onion     H                   [90 x 4] <glmmTMB>
 6 onion     N                   [90 x 4] <glmmTMB>
 7 radish    H                   [90 x 4] <glmmTMB>
 8 radish    N                   [90 x 4] <glmmTMB>
 9 soil      H                   [90 x 4] <glmmTMB>
10 soil      N                   [90 x 4] <glmmTMB>
11 soy       H                   [90 x 4] <glmmTMB>
12 soy       N                   [90 x 4] <glmmTMB>

From easystats documentation

library(parameters)
model <- glmmTMB(
  count ~ spp + mined + (1 | site),
  ziformula = ~mined,
  family = poisson(),
  data = Salamanders
)

simulate_parameters(model, centrality = "mean")
# Fixed Effects

Parameter   | Coefficient |         95% CI |      p
---------------------------------------------------
(Intercept) |       -0.36 | [-0.89,  0.21] | 0.222 
sppPR       |       -1.27 | [-1.72, -0.80] | < .001
sppDM       |        0.27 | [ 0.00,  0.53] | 0.054 
sppEC-A     |       -0.56 | [-0.98, -0.17] | 0.002 
sppEC-L     |        0.67 | [ 0.42,  0.91] | < .001
sppDES-L    |        0.63 | [ 0.38,  0.86] | < .001
sppDF       |        0.12 | [-0.16,  0.39] | 0.438 
minedno     |        1.26 | [ 0.75,  1.79] | < .001

# Zero-Inflated

Parameter      | Coefficient |         95% CI |      p
------------------------------------------------------
(Intercept)_zi |        0.79 | [ 0.28,  1.30] | 0.008 
minedno_zi     |       -1.85 | [-2.43, -1.27] | < .001
simulate_parameters(model, ci = c(.8, .95), component = "zero_inflated")
# Fixed Effects

Parameter   | Coefficient |         80% CI |         95% CI |      p
--------------------------------------------------------------------
(Intercept) |        0.80 | [ 0.44,  1.14] | [ 0.24,  1.33] | 0.006 
minedno     |       -1.85 | [-2.25, -1.45] | [-2.51, -1.20] | < .001

bootstrapping a dataframe

one_sample <- obs_flies_zi |> 
  nest(timeobs = c(time, obs)) |> 
  # group by experimental factors
  group_by(substrate, genotypes) |> 
  sample_frac(size = 1, replace = TRUE)

one_sample |> 
  filter(trt_id == 1) |> 
  arrange(fly_ids)
# A tibble: 30 x 5
# Groups:   substrate, genotypes [1]
   fly_ids trt_id substrate genotypes timeobs         
   <chr>    <int> <chr>     <chr>     <list>          
 1 fly109       1 soil      H         <tibble [3 x 2]>
 2 fly109       1 soil      H         <tibble [3 x 2]>
 3 fly121       1 soil      H         <tibble [3 x 2]>
 4 fly13        1 soil      H         <tibble [3 x 2]>
 5 fly133       1 soil      H         <tibble [3 x 2]>
 6 fly145       1 soil      H         <tibble [3 x 2]>
 7 fly145       1 soil      H         <tibble [3 x 2]>
 8 fly169       1 soil      H         <tibble [3 x 2]>
 9 fly193       1 soil      H         <tibble [3 x 2]>
10 fly205       1 soil      H         <tibble [3 x 2]>
# ... with 20 more rows
one_sample |> 
  unnest(timeobs) |> 
  ungroup()
# A tibble: 1,080 x 6
   fly_ids trt_id substrate genotypes time    obs
   <chr>    <int> <chr>     <chr>     <fct> <dbl>
 1 fly271       7 broccoli  H         t1     39.9
 2 fly271       7 broccoli  H         t2     33.2
 3 fly271       7 broccoli  H         t3     27.4
 4 fly283       7 broccoli  H         t1     41.8
 5 fly283       7 broccoli  H         t2     41.3
 6 fly283       7 broccoli  H         t3     50.9
 7 fly7         7 broccoli  H         t1     42.8
 8 fly7         7 broccoli  H         t2     34.6
 9 fly7         7 broccoli  H         t3     31.2
10 fly187       7 broccoli  H         t1     41.0
# ... with 1,070 more rows

that’s one rep, let’s do a lot!

boot_list <- rerun(30, {
  obs_flies_zi |> 
    nest(timeobs = c(time, obs)) |> 
    # group by experimental factors
    group_by(substrate, genotypes) |> 
    sample_frac(size = 1, replace = TRUE) |> 
    unnest(timeobs) |> 
    ungroup()
})

now just use map to apply a function to each of these

model_list <- boot_list |> 
  map(~ glmmTMB(obs ~ time, ziformula = ~time, data = .x))

# use bootstrapped models to predict on original dataset
model_predictions <- map(model_list, 
                         predict,
                         type = "response",
                         newdata = obs_flies_zi)

# model_predictions |> map(head)

# rowMeans(do.call(cbind, model_predictions))

# get more information though!
boot_pred_df <- model_predictions |> 
  map_dfr(enframe, name = "rownum", value = "pred", .id = "boot_id")

summarize each of these

summarized_data <- boot_pred_df |> 
  nest_by(rownum) |> 
  mutate(med = median(data$pred),
         quant_lo = quantile(data$pred, 0.025),
         quant_hi = quantile(data$pred, 0.975)) |> 
  select(-data)

bind_cols(obs_flies_zi, summarized_data) |> 
  ggplot(aes(x = substrate, y = obs)) + 
  geom_count() + 
  facet_grid(genotypes ~ time) + 
  geom_count(aes(y = med, x = substrate), col = "orange") + 
  geom_pointrange(aes(y = med, ymin = quant_lo, ymax = quant_hi), col = "green")