Visualizing ordinal model probabilities with ribbons

Here is a ggplot2 recipe for plotting cumulative probabilities from ordinal regression models.

We have rating scores of various wines (1–5 point scale), and during the production, temperature (temp) and skin contact were experimentally controlled.

library(tidyverse)
library(ordinal)
wine <- as_tibble(wine)
wine
#> # A tibble: 72 × 6
#>    response rating temp  contact bottle judge
#>       <dbl> <ord>  <fct> <fct>   <fct>  <fct>
#>  1       36 2      cold  no      1      1    
#>  2       48 3      cold  no      2      1    
#>  3       47 3      cold  yes     3      1    
#>  4       67 4      cold  yes     4      1    
#>  5       77 4      warm  no      5      1    
#>  6       60 4      warm  no      6      1    
#>  7       83 5      warm  yes     7      1    
#>  8       90 5      warm  yes     8      1    
#>  9       17 1      cold  no      1      2    
#> 10       22 2      cold  no      2      2    
#> # ℹ 62 more rows

model <- clm(rating ~ temp * contact, data = wine)
pred_grid <- wine |>
  distinct(contact, temp)

predict.clm(type = "cum.prob") predictions provide two sets of cumulative probabilities P(rating) ≤ rating_k and P(rating) < rating_k. The first set yields a probability of 1 for the highest rating, and the second set have probability 0 for the first rating.

pred_grid |> 
  predict(model, newdata = _, type = "cum.prob") 
#> $cprob1
#>             1          2         3         4 5
#> 1 0.196035088 0.75833150 0.9669806 0.9929102 1
#> 2 0.059595928 0.44919920 0.8838723 0.9732608 1
#> 3 0.023374759 0.23547849 0.7419059 0.9321882 1
#> 4 0.004323076 0.05291811 0.3427392 0.7137748 1
#> 
#> $cprob2
#>   1           2          3         4         5
#> 1 0 0.196035088 0.75833150 0.9669806 0.9929102
#> 2 0 0.059595928 0.44919920 0.8838723 0.9732608
#> 3 0 0.023374759 0.23547849 0.7419059 0.9321882
#> 4 0 0.004323076 0.05291811 0.3427392 0.7137748

Each rating’s probability is the difference between these two sets of cumulative probabilities, so ultimately we will visualize them directly as a ribbon between the two cumulative probabilities.

Get a long dataframe of probabilities.

preds <- pred_grid |>
  predict(model, newdata = _, type = "cum.prob") |>
  lapply(as.data.frame) |>
  bind_rows(.id = "prob_type") |>
  cbind(pred_grid, x = _) |>
  rename_with(stringr::str_remove, pattern = "x[.]") |> 
  tidyr::pivot_longer(
    cols = c(-prob_type, -temp, -contact),
    names_to = "rating",
    values_to = "prob"
  ) |>
  tidyr::pivot_wider(
    names_from = prob_type,
    values_from = prob
  ) |>
  rename(prob_lte = cprob1, prob_lt = cprob2)
preds
#> # A tibble: 20 × 5
#>    contact temp  rating prob_lte prob_lt
#>    <fct>   <fct> <chr>     <dbl>   <dbl>
#>  1 no      cold  1       0.196   0      
#>  2 no      cold  2       0.758   0.196  
#>  3 no      cold  3       0.967   0.758  
#>  4 no      cold  4       0.993   0.967  
#>  5 no      cold  5       1       0.993  
#>  6 yes     cold  1       0.0596  0      
#>  7 yes     cold  2       0.449   0.0596 
#>  8 yes     cold  3       0.884   0.449  
#>  9 yes     cold  4       0.973   0.884  
#> 10 yes     cold  5       1       0.973  
#> 11 no      warm  1       0.0234  0      
#> 12 no      warm  2       0.235   0.0234 
#> 13 no      warm  3       0.742   0.235  
#> 14 no      warm  4       0.932   0.742  
#> 15 no      warm  5       1       0.932  
#> 16 yes     warm  1       0.00432 0      
#> 17 yes     warm  2       0.0529  0.00432
#> 18 yes     warm  3       0.343   0.0529 
#> 19 yes     warm  4       0.714   0.343  
#> 20 yes     warm  5       1       0.714

The less-than probabilities (prob_lt) provide thresholds between rating levels.

ggplot(preds) +
  aes(x = temp) +
  geom_line(
    aes(group = rating, y = prob_lt),
    # don't draw solid line at y = 0
    data = function(x) filter(x, prob_lt != 0),
    linetype = "dashed"
  ) +
  facet_wrap("contact")

center

Next, we might fill in the areas. geom_ribbon() only likes continuous x axes, so we note:

preds$temp_num <- as.numeric(preds$temp)
preds |> distinct(temp_num, temp)
#> # A tibble: 2 × 2
#>   temp_num temp 
#>      <dbl> <fct>
#> 1        1 cold 
#> 2        2 warm

Here the two columns of cumulative probabilities pay off because one provides the bottom of the ribbon and the other provides the top of the ribbon.

ggplot(preds) +
  aes(x = as.numeric(temp)) +
  geom_ribbon(
    aes(
      fill = rating, ymin = prob_lt, ymax = prob_lte
    )
  ) +
  geom_line(
    aes(group = rating, y = prob_lt),
    # don't draw solid line at y = 0
    data = function(x) filter(x, prob_lt != 0),
    linetype = "dashed"
  ) +
  facet_wrap("contact") +
  scale_x_continuous(
    breaks = 1:2, labels = c("cold", "warm"),
    expand = expansion(mult = .25)
  ) +
  scale_color_viridis_d(option = "E", aesthetics = "fill")

center

last_plot() +
  stat_summary(
    aes(
      x = 1 + as.numeric(rating) / 5 - .05,
      label = rating,
      color = rating,
      y = (prob_lte + prob_lt) / 2
    ),
    fun = median,
    geom = "text"
  ) +
  scale_color_manual(
    values = scales::viridis_pal(option = "E")(2)[c(2,2,2,1,1)]
  ) +
  guides(color = "none", fill = "none") +
  labs(x = "temperature", y = "cumulative probability")

center

Bayesian version

I once prepared a Bayesian version of the idea. Here is that recipe:

b_model <- brms::brm(
  rating ~ temp * contact, 
  data = wine,
  family = brms::cumulative(), 
  backend = "cmdstanr",
  chains = 1, 
  file = "_caches/2022-03-07-ordinal"
)

Predictions comes as .category probabilities, so we sum them to compute two cumulative probabilities. But we also compute posterior medians to get the thresholds.

data_rvars <- pred_grid |> 
  tidybayes::add_epred_draws(b_model) |> 
  tidybayes::nest_rvars() |> 
  group_by(contact, temp, .row) |> 
  mutate(
    prob_lte = cumsum(.epred),
    # subtracting 1 from 1 to make a 0 rvar
    prob_lt = c(prob_lte[5] - 1, prob_lte[1:4])
  )
#> Loading required namespace: rstan

data_boundaries <- data_rvars |>
  group_by(contact, temp, .category) |>
  ggdist::median_qi(prob_lte, prob_lt)

Here is the trick. Hint at the uncertainty by overlaying 100 ribbons:

data_draws_100 <- data_rvars |> 
  tidybayes::unnest_rvars() |>
  filter(.draw <= 100)
  
# create 100 geom_area() layers
layers <- data_draws_100 |>
  split(~ .draw) |>
  lapply(function(x) {
    geom_ribbon(
      aes(fill = .category, ymin = prob_lt, ymax = prob_lte),
      alpha = .01,
      data = x
    )
  })

ggplot(data_draws_100) +
  aes(x = as.numeric(temp)) +
  layers +
  geom_line(
    aes(y = prob_lte, group = .category),
    data = data_boundaries |> filter(.category != 5),
    linetype = "dashed"
  ) +
  stat_summary(
    aes(
      x = 1 + as.numeric(.category) / 5 - .05,
      label = .category,
      color = .category,
      y = (prob_lte + prob_lt) / 2
    ),
    data = data_boundaries,
    fun = median,
    geom = "text"
  ) +
  facet_wrap("contact") +
  scale_x_continuous(
    breaks = 1:2, labels = c("cold", "warm"),
    expand = expansion(mult = .25)
  ) +
  scale_color_viridis_d(option = "E", aesthetics = "fill") +
  scale_color_manual(
    values = scales::viridis_pal(option = "E")(2)[c(2,2,2,1,1)]
  ) +
  guides(color = "none", fill = "none") +
  labs(x = "temperature", y = "cumulative probability")

center

Bayesian version

Leave a comment