Visualizing Associations and Models

Author

Emorie D Beck

The Data

Code
load(url("https://github.com/emoriebeck/psc290-data-viz-2022/blob/main/04-week4-associations/04-data/week4-data.RData?raw=true"))
pred_data
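
The chunks below lean on the tidyverse and broom throughout; neither is loaded in the chunk above, so if you're following along, load them first:

Code
library(tidyverse) # ggplot2, dplyr, tidyr, purrr, tibble
library(broom)     # tidy(), glance(), augment()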

Part 1: Visualizing Associations Among Quantitative Variables

Scatterplots

  • Scatterplots are pretty ubiquitous
  • From a data visualization standpoint, this makes sense
  • Scatterplots
    • show raw data
    • are common enough that little visualization literacy is needed
    • allow for lots of summaries to be placed atop them
    • this is why they are our entry point for today

Scatterplots - Basics

Code
pred_data %>% 
  select(study, SID, p_value, SRhealth)

Let’s look at a basic scatterplot:

Code
pred_data %>% 
  select(study, SID, p_value, SRhealth) %>%
  ggplot(aes(x = p_value, y = SRhealth)) + 
    geom_point(shape = 21, fill = "grey80", color = "black", size = 2) + 
    labs(
      x = "Agreeableness (POMP; 0-10)"
      , y = "Self-Rated Health (POMP; 0-10)"
    ) + 
    theme_classic()

Now let’s add a trend line:

Code
pred_data %>% 
  select(study, SID, p_value, SRhealth) %>%
  ggplot(aes(x = p_value, y = SRhealth)) + 
    geom_point(shape = 21, fill = "grey80", color = "black", size = 2) + 
    geom_smooth(method = "lm", size = 3, se = F) + 
    labs(
      x = "Conscientiousness (POMP; 0-10)"
      , y = "Self-Rated Health (POMP; 0-10)"
    ) + 
    theme_classic()

But we have multiple studies, so we need to separate them out using facet_wrap():

Code
pred_data %>% 
  select(study, SID, p_value, SRhealth) %>%
  filter(!is.na(SRhealth)) %>%
  ggplot(aes(x = p_value, y = SRhealth)) + 
    geom_point(shape = 21, fill = "grey80", color = "black", size = 2) + 
    scale_fill_manual(values = c("grey80", "seagreen4")) + 
    facet_wrap(~study) +
    labs(
      x = "Conscientiousness (POMP; 0-10)"
      , y = "Self-Rated Health (POMP; 0-10)"
    ) + 
    theme_classic()

Let's add the trend lines back in, fading the points with alpha so the lines stand out:

Code
pred_data %>% 
  select(study, SID, p_value, SRhealth) %>%
  filter(!is.na(SRhealth)) %>%
  ggplot(aes(x = p_value, y = SRhealth)) + 
    geom_point(shape = 21, fill = "grey80", color = "black", size = 2, alpha = .25) + 
    geom_smooth(method = "lm", size = 3, se = F) + 
    scale_fill_manual(values = c("grey80", "seagreen4")) + 
    facet_wrap(~study) +
    labs(
      x = "Conscientiousness (POMP; 0-10)"
      , y = "Self-Rated Health (POMP; 0-10)"
    ) + 
    theme_classic()

But if you remember from your readings, we don’t typically want to show associations without some sort of estimate of error, confidence, etc.

Code
pred_data %>% 
  select(study, SID, p_value, SRhealth) %>%
  filter(!is.na(SRhealth)) %>%
  ggplot(aes(x = p_value, y = SRhealth)) + 
    geom_point(shape = 21, fill = "grey80", color = "black", size = 2, alpha = .25) + 
    geom_smooth(method = "lm", size = 1.5, se = T, color = "black") + 
    scale_fill_manual(values = c("grey80", "seagreen4")) + 
    facet_wrap(~study) +
    labs(
      x = "Conscientiousness (POMP; 0-10)"
      , y = "Self-Rated Health (POMP; 0-10)"
      , title = "Conscientiousness -- Self-Rated Health Associations Across Samples"
    ) + 
    theme_classic()

Correlations and Correlograms

  • Understanding associations is always important, but perhaps never more so than when we do descriptives

  • My hot take is that zero-order correlation matrices should always be included in papers

    • Someone’s meta-analysis will thank you
  • If you’re dumping correlations in supplementary materials, then tables are fine

  • But you (and your brain) will thank yourself if you use heat maps or correlograms to visualize the correlations

    • (Remember how quickly and preattentively we perceive color and size?)
  • There are R packages for this, but where’s the fun in that?

  • All right, let’s estimate some correlation matrices for each sample.

Code
r_data <- pred_data %>%
  select(study, p_value, age, gender, SRhealth, smokes, exercise, BMI, education, parEdu, mortality = o_value) %>%
  mutate_if(is.factor, ~as.numeric(as.character(.))) %>%
  group_by(study) %>%
  nest() %>%
  ungroup() %>%
  mutate(r = map(data, ~cor(., use = "pairwise")))
r_data
  • The thing is that ggplot doesn't like wide-format data, which is exactly what cor() produces
Code
r_data$r[[1]]
               p_value          age       gender    SRhealth       smokes
p_value    1.000000000 -0.005224085  0.053627861  0.15917525 -0.069013463
age       -0.005224085  1.000000000 -0.057243245 -0.22438335 -0.078788619
gender     0.053627861 -0.057243245  1.000000000 -0.03182278  0.022275557
SRhealth   0.159175251 -0.224383351 -0.031822781  1.00000000 -0.129241536
smokes    -0.069013463 -0.078788619  0.022275557 -0.12924154  1.000000000
exercise   0.048576025 -0.361768736  0.061659017  0.34546038 -0.155018841
BMI       -0.019741798  0.036151816  0.012217132 -0.09340105 -0.037713371
education  0.001465775 -0.173399716 -0.001603648  0.11008540 -0.096936630
parEdu     0.019871078 -0.374733606  0.055468171  0.08273023  0.005215303
mortality -0.089637524  0.627069166 -0.092109448 -0.31142292  0.035759332
             exercise         BMI    education       parEdu   mortality
p_value    0.04857602 -0.01974180  0.001465775  0.019871078 -0.08963752
age       -0.36176874  0.03615182 -0.173399716 -0.374733606  0.62706917
gender     0.06165902  0.01221713 -0.001603648  0.055468171 -0.09210945
SRhealth   0.34546038 -0.09340105  0.110085399  0.082730234 -0.31142292
smokes    -0.15501884 -0.03771337 -0.096936630  0.005215303  0.03575933
exercise   1.00000000 -0.06217297  0.210204022  0.176766791 -0.32138385
BMI       -0.06217297  1.00000000 -0.048914825 -0.075000576  0.01643219
education  0.21020402 -0.04891483  1.000000000  0.232321970 -0.17215791
parEdu     0.17676679 -0.07500058  0.232321970  1.000000000 -0.18796244
mortality -0.32138385  0.01643219 -0.172157913 -0.187962436  1.00000000

Reshaping

  • So we need to reshape it into long form
  • We’re going to use a function so we only have to write the code once and can apply it to all the samples
  • Here’s what we’ll do:
    • remove the lower triangle and the diagonal of the correlation matrix
    • make the matrix a data frame
    • make the row names of the matrix a column
    • make the columns long
    • convert the variable columns to factors to retain their original order
Code
r_reshape_fun <- function(r){
  coln <- colnames(r)
  # remove lower tri and diagonal
  r[lower.tri(r, diag = T)] <- NA
  r %>% data.frame() %>%
    rownames_to_column("V1") %>%
    pivot_longer(
      cols = -V1
      , values_to = "r"
      , names_to = "V2"
    ) %>%
    mutate_at(vars(V1, V2), ~factor(., coln))
}

r_data <- r_data %>%
  mutate(r = map(r, r_reshape_fun))
r_data$r[[1]]

Heat Map Time!

This is, technically, a heat map, but I think we can do better!

Code
r_data$r[[1]] %>%
  ggplot(aes(
    x = V1
    , y = V2
    , fill = r
  )) + 
  geom_raster() + 
  theme_minimal()

Colors

Let’s add some intuitive colors using scale_fill_gradient2()

Code
r_data$r[[1]] %>%
  ggplot(aes(x = V1, y = V2, fill = r)) + 
  geom_raster() + 
  scale_fill_gradient2(
    limits = c(-1,1)
    , breaks = c(-1, -.5, 0, .5, 1)
    , low = "blue"
    , high = "red"
    , mid = "white"
    , na.value = "white"
    ) + 
  theme_minimal()

Labels

Do we need axis labels?

Code
r_data$r[[1]] %>%
  ggplot(aes(x = V1, y = V2, fill = r)) + 
  geom_raster() + 
  scale_fill_gradient2(limits = c(-1,1)
    , breaks = c(-1, -.5, 0, .5, 1)
    , low = "blue", high = "red"
    , mid = "white", na.value = "white") + 
  labs(
    x = NULL
    , y = NULL
    , fill = "Zero-Order Correlation"
    , title = "Zero-Order Correlations Among Variables in Sample 1"
    ) + 
  theme_minimal()

Theme Elements

Let’s fix the theme elements. So close!

Code
r_data$r[[1]] %>%
  ggplot(aes(x = V1, y = V2, fill = r)) + 
  geom_raster() + 
  scale_fill_gradient2(limits = c(-1,1)
    , breaks = c(-1, -.5, 0, .5, 1)
    , low = "blue", high = "red"
    , mid = "white", na.value = "white") + 
  labs(
    x = NULL
    , y = NULL
    , fill = "Zero-Order Correlation"
    , title = "Zero-Order Correlations Among Variables"
    , subtitle = "Sample 1"
    ) + 
  theme_classic() + 
  theme(
    legend.position = "bottom"
    , axis.text = element_text(face = "bold")
    , axis.text.x = element_text(angle = 45, hjust = 1)
    , plot.title = element_text(face = "bold", hjust = .5)
    , plot.subtitle = element_text(face = "italic", hjust = .5)
    , panel.background = element_rect(color = "black", size = 1)
  )

Finishing Touches!

One finishing touch: let's print the correlation values on the tiles with geom_text(). So close!

Code
r_data$r[[1]] %>%
  ggplot(aes(x = V1, y = V2, fill = r)) + 
  geom_raster() + 
  geom_text(aes(label = round(r, 2))) + 
  scale_fill_gradient2(limits = c(-1,1)
    , breaks = c(-1, -.5, 0, .5, 1)
    , low = "blue", high = "red"
    , mid = "white", na.value = "white") + 
  labs(
    x = NULL
    , y = NULL
    , fill = "Zero-Order Correlation"
    , title = "Zero-Order Correlations Among Variables"
    , subtitle = "Sample 1"
    ) + 
  theme_classic() + 
  theme(
    legend.position = "bottom"
    , axis.text = element_text(face = "bold")
    , axis.text.x = element_text(angle = 45, hjust = 1)
    , plot.title = element_text(face = "bold", hjust = .5)
    , plot.subtitle = element_text(face = "italic", hjust = .5)
    , panel.background = element_rect(color = "black", size = 1)
  )

Correlogram

A correlogram is basically a heat map that uses size in addition to color.

Code
r_data$r[[1]] %>%
  ggplot(aes(x = V1, y = V2, color = r, size = abs(r))) + 
  geom_point() + 
  theme_classic()

Improvements

We'll skip the intermediate steps we took with the heat map. So close! We just need to get rid of that size legend.

Code
r_data$r[[1]] %>%
  ggplot(aes(x = V1, y = V2, fill = r, size = abs(r))) + 
  geom_point(shape = 21) + 
  scale_fill_gradient2(limits = c(-1,1)
    , breaks = c(-1, -.5, 0, .5, 1)
    , low = "blue", high = "red"
    , mid = "white", na.value = "white") + 
  scale_size_continuous(range = c(3,14)) + 
  labs(
    x = NULL
    , y = NULL
    , fill = "Zero-Order Correlation"
    , title = "Zero-Order Correlations Among Variables"
    , subtitle = "Sample 1"
    ) + 
  theme_classic() + 
  theme(
    legend.position = "bottom"
    , axis.text = element_text(face = "bold")
    , axis.text.x = element_text(angle = 45, hjust = 1)
    , plot.title = element_text(face = "bold", hjust = .5)
    , plot.subtitle = element_text(face = "italic", hjust = .5)
    , panel.background = element_rect(color = "black", size = 1)
  )

Legend

  • To do this, we’ll use the guides() function!
Code
r_data$r[[1]] %>%
  ggplot(aes(x = V1, y = V2, fill = r, size = abs(r))) + 
  geom_point(shape = 21) + 
  scale_fill_gradient2(limits = c(-1,1)
    , breaks = c(-1, -.5, 0, .5, 1)
    , low = "blue", high = "red"
    , mid = "white", na.value = "white") + 
  scale_size_continuous(range = c(3,14)) + 
  labs(
    x = NULL
    , y = NULL
    , fill = "Zero-Order Correlation"
    , title = "Zero-Order Correlations Among Variables"
    , subtitle = "Sample 1"
    ) + 
  guides(size = "none") + 
  theme_classic() + 
  theme(
    legend.position = "bottom"
    , axis.text = element_text(face = "bold")
    , axis.text.x = element_text(angle = 45, hjust = 1)
    , plot.title = element_text(face = "bold", hjust = .5)
    , plot.subtitle = element_text(face = "italic", hjust = .5)
    , panel.background = element_rect(color = "black", size = 1)
  )

Part 2: Visualizing Associations, Parameters, and Predictions from Models

  • The goal of data visualization is to tell a story that tables, words, etc. either can't tell or can't tell simply
  • Data visualization aims to clarify complex patterns in data
  • Thus far, we've mostly focused on building figures from raw data or descriptives of raw data
  • But in most research, we lean on inferential statistics and hypothesis testing (frequentist or Bayesian) to tell our story
  • So next, we’ll talk about how to use data visualization to tell stories with models
    • The reality is that there is no generalizable way to do this
    • So we will focus on models for which we are interested in specific parameters and/or parameterized our questions
    • Why? These have some shared functions across lots of packages in R
    • For models that don’t, that’s a data cleaning problem, not a visualization problem
  • Let’s start with a basic model and predict later all-cause mortality from Conscientiousness in Sample 1.
  • The basic form of the model is:

\[ \text{logit}(\pi_i) = \ln\left(\frac{\pi_i}{1-\pi_i}\right) = b_0 + b_1 C_i \]
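
As a quick aside, the logit link and its inverse are built into base R as qlogis() and plogis(), which makes it easy to sanity-check the link function by hand:

Code
# logit: probability -> log odds; inverse logit: log odds -> probability
qlogis(0.5)           # 0: a 50/50 probability is 0 on the log-odds scale
plogis(qlogis(0.25))  # 0.25: plogis() undoes qlogis()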

Code
ds1 <- pred_data %>% filter(study == "Study1")
m1 <- glm(o_value ~ p_value, data = ds1, family = binomial(link = "logit"))
summary(m1)

Call:
glm(formula = o_value ~ p_value, family = binomial(link = "logit"), 
    data = ds1)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2774  -1.0164  -0.9358   1.3244   1.4866  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)  
(Intercept)  0.23213    0.25920   0.896   0.3705  
p_value     -0.09349    0.03632  -2.574   0.0101 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1117.4  on 830  degrees of freedom
Residual deviance: 1110.7  on 829  degrees of freedom
AIC: 1114.7

Number of Fisher Scoring iterations: 4
  • Models and other objects in R are stored in lists or list-like objects
  • We can explore these lots of ways, but one good one is with str()
Code
str(m1)
List of 30
 $ coefficients     : Named num [1:2] 0.2321 -0.0935
  ..- attr(*, "names")= chr [1:2] "(Intercept)" "p_value"
 $ residuals        : Named num [1:831] -1.68 -2.26 -1.5 -1.64 -1.64 ...
  ..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
 $ fitted.values    : Named num [1:831] 0.403 0.558 0.331 0.391 0.391 ...
  ..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
 $ effects          : Named num [1:831] 5.755 -2.574 -0.729 -0.78 -0.78 ...
  ..- attr(*, "names")= chr [1:831] "(Intercept)" "p_value" "" "" ...
 $ R                : num [1:2, 1:2] -14.1 0 -96.5 27.5
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:2] "(Intercept)" "p_value"
  .. ..$ : chr [1:2] "(Intercept)" "p_value"
 $ rank             : int 2
 $ qr               :List of 5
  ..$ qr   : num [1:831, 1:2] -14.0555 0.0353 0.0335 0.0347 0.0347 ...
  .. ..- attr(*, "dimnames")=List of 2
  .. .. ..$ : chr [1:831] "1" "2" "3" "4" ...
  .. .. ..$ : chr [1:2] "(Intercept)" "p_value"
  ..$ rank : int 2
  ..$ qraux: num [1:2] 1.03 1.12
  ..$ pivot: int [1:2] 1 2
  ..$ tol  : num 1e-11
  ..- attr(*, "class")= chr "qr"
 $ family           :List of 12
  ..$ family    : chr "binomial"
  ..$ link      : chr "logit"
  ..$ linkfun   :function (mu)  
  ..$ linkinv   :function (eta)  
  ..$ variance  :function (mu)  
  ..$ dev.resids:function (y, mu, wt)  
  ..$ aic       :function (y, n, mu, wt, dev)  
  ..$ mu.eta    :function (eta)  
  ..$ initialize: language {     if (NCOL(y) == 1) { ...
  ..$ validmu   :function (mu)  
  ..$ valideta  :function (eta)  
  ..$ simulate  :function (object, nsim)  
  ..- attr(*, "class")= chr "family"
 $ linear.predictors: Named num [1:831] -0.391 0.232 -0.703 -0.443 -0.443 ...
  ..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
 $ deviance         : num 1111
 $ aic              : num 1115
 $ null.deviance    : num 1117
 $ iter             : int 4
 $ weights          : Named num [1:831] 0.241 0.247 0.222 0.238 0.238 ...
  ..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
 $ prior.weights    : Named num [1:831] 1 1 1 1 1 1 1 1 1 1 ...
  ..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
 $ df.residual      : int 829
 $ df.null          : int 830
 $ y                : Named num [1:831] 0 0 0 0 0 0 0 0 0 0 ...
  ..- attr(*, "names")= chr [1:831] "1" "2" "3" "4" ...
 $ converged        : logi TRUE
 $ boundary         : logi FALSE
 $ model            :'data.frame':  831 obs. of  2 variables:
  ..$ o_value: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
  ..$ p_value: num [1:831] 6.67 0 10 7.22 7.22 ...
  ..- attr(*, "terms")=Classes 'terms', 'formula'  language o_value ~ p_value
  .. .. ..- attr(*, "variables")= language list(o_value, p_value)
  .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
  .. .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. .. ..$ : chr [1:2] "o_value" "p_value"
  .. .. .. .. ..$ : chr "p_value"
  .. .. ..- attr(*, "term.labels")= chr "p_value"
  .. .. ..- attr(*, "order")= int 1
  .. .. ..- attr(*, "intercept")= int 1
  .. .. ..- attr(*, "response")= int 1
  .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
  .. .. ..- attr(*, "predvars")= language list(o_value, p_value)
  .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "factor" "numeric"
  .. .. .. ..- attr(*, "names")= chr [1:2] "o_value" "p_value"
 $ call             : language glm(formula = o_value ~ p_value, family = binomial(link = "logit"), data = ds1)
 $ formula          :Class 'formula'  language o_value ~ p_value
  .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
 $ terms            :Classes 'terms', 'formula'  language o_value ~ p_value
  .. ..- attr(*, "variables")= language list(o_value, p_value)
  .. ..- attr(*, "factors")= int [1:2, 1] 0 1
  .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. ..$ : chr [1:2] "o_value" "p_value"
  .. .. .. ..$ : chr "p_value"
  .. ..- attr(*, "term.labels")= chr "p_value"
  .. ..- attr(*, "order")= int 1
  .. ..- attr(*, "intercept")= int 1
  .. ..- attr(*, "response")= int 1
  .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
  .. ..- attr(*, "predvars")= language list(o_value, p_value)
  .. ..- attr(*, "dataClasses")= Named chr [1:2] "factor" "numeric"
  .. .. ..- attr(*, "names")= chr [1:2] "o_value" "p_value"
 $ data             : tibble [831 × 25] (S3: tbl_df/tbl/data.frame)
  ..$ study       : chr [1:831] "Study1" "Study1" "Study1" "Study1" ...
  ..$ o_value     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
  ..$ p_year      : num [1:831] 2005 2005 2005 2005 2005 ...
  ..$ SID         : chr [1:831] "61215" "184965" "488251" "650779" ...
  ..$ p_value     : num [1:831] 6.67 0 10 7.22 7.22 ...
  ..$ age         : num [1:831] -29.925 -22.925 -3.925 -25.925 -0.925 ...
  ..$ gender      : Factor w/ 2 levels "0","1": 2 1 2 2 2 1 1 2 1 2 ...
  ..$ grsWages    : num [1:831] 1.021 1.139 0.717 0.644 0.812 ...
  ..$ parEdu      : Factor w/ 3 levels "1","2","3": 2 2 1 3 2 2 1 1 2 2 ...
  ..$ race        : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
  ..$ physhlthevnt: Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...
  ..$ SRhealth    : num [1:831] 9.23 7.5 6.43 6.92 5.77 ...
  ..$ smokes      : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 2 1 2 1 ...
  ..$ alcohol     : Factor w/ 2 levels "0","1": NA NA NA NA NA NA NA NA NA NA ...
  ..$ exercise    : num [1:831] 3.75 6.25 5 10 4.5 ...
  ..$ BMI         : num [1:831] 2.02 1.84 1.67 1.89 3.29 ...
  ..$ parDivorce  : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
  ..$ PhysFunc    : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
  ..$ religion    : Factor w/ 3 levels "0","1","2": 1 1 2 2 2 1 1 2 2 2 ...
  ..$ education   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
  ..$ married     : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 2 2 1 1 ...
  ..$ numKids     : num [1:831] 0 0 0.588 0 1.765 ...
  ..$ parOccPrstg : num [1:831] 3.67 5.41 1.21 4.66 4.99 ...
  ..$ reliability : num [1:831] 0.511 0.511 0.511 0.511 0.511 ...
  ..$ predInt     : num [1:831] 13 13 13 13 13 13 13 13 13 13 ...
 $ offset           : NULL
 $ control          :List of 3
  ..$ epsilon: num 1e-08
  ..$ maxit  : num 25
  ..$ trace  : logi FALSE
 $ method           : chr "glm.fit"
 $ contrasts        : NULL
 $ xlevels          : Named list()
 - attr(*, "class")= chr [1:2] "glm" "lm"
  • The broom package is great for working with models (and the broom.mixed add-on makes it even better)
  • We’re going to talk about how three of its functions can be used for / improve data visualization:
    • tidy()
    • glance()
    • augment()

Models + broom: tidy()

  • Outside of dplyr/tidyr, tidy() is a close contender with purrr::map() functions as my most used function
  • Why?
    • When you run a model, base R provides the summary(), coef(), etc. to extract various components of the model
    • But these aren’t data.frames, which are core input to a lot of other R functions across packages
    • tidy() provides a data frame with core model coefficients, inferential tests, etc. that can be easily matched and merged across models
  • But with logistic regression with a logit link, we are left with coefficients that have to be interpreted in log odds, which, realistically, almost no one can do intuitively
  • So we have to “undo” the log, which you may remember can be done by exponentiating (exp() is the inverse of the natural log, ln)
  • But we can’t exponentiate the summary() output directly because it’s the wrong class of object
  • We could just exponentiate the coefficients from the coef() function, but this still leaves us with the need to extract estimates of precision, like standard errors, confidence intervals, and more.
Code
coef(m1)
(Intercept)     p_value 
  0.2321341  -0.0934916 
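To get odds ratios, we just exponentiate. A minimal illustration with base R's exp() (the value for p_value works out to about 0.91, i.e., ~9% lower odds of mortality per unit of Conscientiousness):

Code
# log odds -> odds ratios: exp() undoes the natural log
exp(coef(m1))

But this still drops the standard errors; tidy() returns everything as a data frame: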
Code
tidy(m1)
  • Even better, we can easily get confidence intervals
Code
tidy(m1, conf.int = T)
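Better still, tidy()'s method for glm objects has an exponentiate argument that converts the estimate and confidence limits to odds ratios in one step:

Code
tidy(m1, conf.int = TRUE, exponentiate = TRUE)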

Multiple Parameter Plots

  • But when would you ever want to create a plot of just two parameters? Maybe never, but what if we wanted to do it for all 6 samples?
  • Watch! Let’s make a nested data frame that will hold
    • All the data for each sample
    • A model for each sample
    • The tidy() data frame of the parameter estimates for each sample
Code
tidy_ci <- function(m) tidy(m, conf.int = T)

nested_m <- pred_data %>%
  group_by(study) %>%
  nest() %>%
  ungroup() %>%
  mutate(
    m = map(data, ~glm(o_value ~ p_value, data = ., family = binomial(link = "logit")))
    , tidy = map(m, tidy_ci)
  )
nested_m

Now, we’ll drop the data and m columns that we don’t need and unnest() our tidy() data frames

Code
nested_m %>%
  select(study, tidy) %>%
  unnest(tidy)

Basic Plot

Now we have parameters from multiple models that we may want to plot!

Code
nested_m %>%
  select(study, tidy) %>%
  unnest(tidy) %>%
  mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
  ggplot(
    aes(y = study, x = estimate)
  ) + 
    geom_errorbar(
      aes(xmin = conf.low, xmax = conf.high)
      , position = position_dodge(width = .9)
      , width = .1
      ) + 
    geom_point() + 
    theme_classic()

Faceting

Almost, but we have two parameters for each model (Intercept and p_value), so let’s split them into facets:

Code
nested_m %>%
  select(study, tidy) %>%
  unnest(tidy) %>%
  mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
  ggplot(
    aes(y = study, x = estimate)
  ) + 
    geom_errorbar(
      aes(xmin = conf.low, xmax = conf.high)
      , position = position_dodge(width = .9)
      , width = .1
      ) + 
    geom_point() + 
    facet_grid(~term) + 
    theme_classic()

We’ve got some work to do to make this an intuitive figure. Let’s:

  • Add a dashed line at 1 (an odds ratio of 1 is a null effect)
  • Make the points bigger
  • Fix the plot title and axis titles
  • Add some color
  • Fiddle with themes to make it prettier

Null Comparison

Add a dashed line at 1 (an odds ratio of 1 is a null effect)

Code
nested_m %>%
  select(study, tidy) %>%
  unnest(tidy) %>%
  mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
  ggplot(
    aes(y = study, x = estimate)
  ) + 
    geom_vline(aes(xintercept = 1), linetype = "dashed") + 
    geom_errorbar(
      aes(xmin = conf.low, xmax = conf.high)
      , position = position_dodge(width = .9)
      , width = .1
      ) + 
    geom_point() + 
    facet_grid(~term, scales = "free") + 
    theme_classic()

Point Size

  • Make the points bigger
Code
nested_m %>%
  select(study, tidy) %>%
  unnest(tidy) %>%
  mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
  ggplot(
    aes(y = study, x = estimate)
  ) + 
    geom_vline(aes(xintercept = 1), linetype = "dashed") + 
    geom_errorbar(
      aes(xmin = conf.low, xmax = conf.high)
      , position = position_dodge(width = .9)
      , width = .1
      ) + 
    geom_point(size = 3, shape = 15) + 
    facet_grid(~term, scales = "free") + 
    theme_classic()

Titles

  • Fix the titles on the plot and axis titles
Code
nested_m %>%
  select(study, tidy) %>%
  unnest(tidy) %>%
  mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
  ggplot(
    aes(y = study, x = estimate)
  ) + 
    geom_vline(aes(xintercept = 1), linetype = "dashed") + 
    geom_errorbar(
      aes(xmin = conf.low, xmax = conf.high)
      , position = position_dodge(width = .9)
      , width = .1
      ) + 
    geom_point(size = 3, shape = 15) + 
    labs(
      x = "Estimate (CI) in OR"
      , y = NULL
      , title = "Conscientiousness was associated with mortality 50% of samples"
      , subtitle = "Samples with lower mortality risk overall had fewer significant associations"
      ) + 
    facet_grid(~term, scales = "free") + 
    theme_classic()

Color and Themes

  • Add some color
  • Fiddle with themes to make it prettier

Code
nested_m %>%
  select(study, tidy) %>%
  unnest(tidy) %>%
  mutate_at(vars(estimate, conf.low, conf.high), exp) %>%
  ggplot(
    aes(y = study, x = estimate, fill = study)
  ) + 
    geom_vline(aes(xintercept = 1), linetype = "dashed") + 
    geom_errorbar(
      aes(xmin = conf.low, xmax = conf.high)
      , position = position_dodge(width = .9)
      , width = .1
      ) + 
    geom_point(size = 3, shape = 22) + 
    labs(
      x = "Estimate (CI) in OR"
      , y = NULL
      , title = "Conscientiousness was associated with mortality 50% of samples"
      , subtitle = "Samples with lower mortality risk overall had fewer significant associations"
      ) + 
    facet_grid(~term, scales = "free") + 
    theme_classic() + 
    theme(
      legend.position = "none"
      , axis.text = element_text(face = "bold", size = rel(1.1))
      , axis.title = element_text(face = "bold", size = rel(1.2))
      , axis.line = element_blank()
      , strip.text = element_text(face = "bold", size = rel(1.1), color = "white")
      , strip.background = element_rect(fill = "black")
      , plot.title = element_text(face = "bold", size = rel(1.1), hjust = .5)
      , plot.subtitle = element_text(face = "italic", size = rel(1.1))
      , panel.border = element_rect(color = "black", fill = NA, size = 1)
    )

  • This isn’t perfect. But we’re going to come back to this kind of plot when we talk about “piecing plots together.”
  • Personally, I would:
    • Add text with Est. (CI) and N for each sample in the figure
    • Build the two panels separately so that I could order the studies by effect size
    • Then put them back together and re-add the title

Models + broom: glance()

  • When we run models, we need to care about more than just point and interval estimates
  • Often we are interested in comparing models, checking diagnostics, etc.
  • Again, all of these are (mostly) embedded in the model objects
  • The glance() function brings some of these important ones into a single object
  • Here’s what it gives us for our logistic regression model
Code
glance(m1)
  • Let’s also look at a linear model, which may be more familiar to many of you:
Code
m2 <- lm(SRhealth ~ age, data = ds1)
glance(m2)

As before, we can do this with lots of models to compare across samples:

Code
nested_m <- nested_m %>%
  mutate(glance = map(m, glance))
nested_m
Code
nested_m %>%
  select(study, glance) %>%
  unnest(glance)

Realistically, this is the kind of info we’d put in a table, but we can also merge it with the output of tidy():

Code
nested_m %>%
  select(-data, -m) %>%
  unnest(tidy) %>% 
  unnest(glance) %>%
  mutate_if(is.numeric, ~round(., 2))
  • Diagnostics are not just summary statistics!
  • We care a lot about prediction, too
    • Residuals tell us about unexplained variance (i.e., how observed data deviate from model predictions)
    • Model predictions and prediction intervals tell us how our model is doing across levels of our variables

Let’s keep working with our nested data frame. Remember, it looks like this:

Code
nested_m

Models + broom: augment()

  • augment() lets us add to (augment) the raw data we fed the model, based on the fitted model
  • Notice we now have more columns
Code
nested_m <- nested_m %>%
  mutate(data = map2(m, data, augment, se_fit = T))
nested_m

glm() + augment()

  • Here are the columns we used, along with the additional columns augment() adds for a glm:
    • .fitted: fitted / predicted value
    • .se.fit: standard error of the fitted value
    • .resid: observed - fitted
    • .std.resid: standardized residuals
    • .sigma: estimated residual SD when this obs is dropped from the model
    • .cooksd: Cook’s distance (is this observation an outlier?)
Code
nested_m$data[[1]] %>%
  select(o_value, SID, p_value, .fitted:.cooksd)
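
One aside that trips people up: for a glm, the .fitted values from augment() are on the link (log-odds) scale by default. If predicted probabilities are easier to reason about, the type.predict argument of broom's augment() method for glm returns them on the response scale instead:

Code
# Predicted probabilities instead of log odds (Study 1 model)
augment(nested_m$m[[1]], type.predict = "response") %>%
  select(o_value, p_value, .fitted)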

lm() + augment()

For the most part, the diagnostic checks for glms and lms are the same. But it’s a bit easier to wrap your head around lm(), so let’s switch to that:

Code
nested_lm <- pred_data %>%
  select(study, SID, p_value, age, SRhealth) %>%
  drop_na() %>%
  group_by(study) %>%
  nest() %>%
  ungroup() %>%
  mutate(m = map(data, ~lm(SRhealth ~ p_value + age, data = .))
         , tidy = map(m, tidy_ci)
         , glance = map(m, glance)
         , data = map2(m, data, augment, se_fit = T, interval = "confidence"))
nested_lm
  • Here are the columns we used, along with the additional columns augment() adds for an lm:
    • .fitted: fitted / predicted value
    • .se.fit: standard error of the fitted value
    • .lower: lower bound of the confidence/prediction interval
    • .upper: upper bound of the confidence/prediction interval
    • .resid: observed - fitted
    • .std.resid: standardized residuals
    • .sigma: estimated residual SD when this obs is dropped from the model
    • .cooksd: Cook’s distance (is this observation an outlier?)
  • One standard diagnostic plot is to plot fitted values vs. residuals
  • Looks a little wonky (remember, these are results from multiple harmonized studies)
Code
nested_lm %>%
  select(study, data) %>%
  unnest(data) %>%
  ggplot(aes(
    x = .fitted
    , y = .resid
  )) + 
  geom_point() + 
  theme_classic()

Plotting augment() Diagnostics

  • The same fitted-vs.-residual plot is much easier to read once we facet by study and label the axes
  • Still a little wonky (remember, these are results from multiple harmonized studies)
Code
nested_lm %>%
  select(study, data) %>%
  unnest(data) %>%
  ggplot(aes(
    x = .fitted
    , y = .resid
  )) + 
  geom_point() +
  labs(
    x = "Model Fitted Values"
    , y = "Residual") +
  facet_wrap(~study) + 
  theme_classic()

Models + broom: augment()

Another standard check is the raw predictor vs. the residuals:

Code
nested_lm %>%
  select(study, data) %>%
  unnest(data) %>%
  ggplot(aes(
    x = p_value
    , y = .resid
  )) + 
  geom_point() +
  facet_wrap(~study) + 
  theme_classic()

Model Predictions

  • Although we can get the standard error of the prediction for each person, we often want to look at theoretical predictions, adjusting for covariates. We can typically use the built-in predict() or fitted() functions
  • To do this, we need to set theoretical ranges of key variables and grab averages of covariates
  • I use functions for this. We’ll do one sample first, then build it out to all of the samples
Code
m1 <- nested_lm$m[[1]]
d1 <- m1$model

crossing(
  p_value = seq(0, 10, length.out = 100)
  , age = mean(d1$age)
) %>%
  bind_cols(
    .
    , predict(m1, newdata = ., interval = "prediction")
  )
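
One note on the interval argument, since both options appear in this workflow: interval = "prediction" gives bounds for where new observations are expected to fall, while interval = "confidence" (which we passed to augment() above) gives narrower bounds for the fitted line itself. Swapping one word is all it takes to compare:

Code
# Same grid, but uncertainty in the fitted line rather than in new observations
crossing(
  p_value = seq(0, 10, length.out = 100)
  , age = mean(d1$age)
) %>%
  bind_cols(
    .
    , predict(m1, newdata = ., interval = "confidence")
  )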

Plotting That

Code
crossing(
  p_value = seq(0, 10, .1)
  , age = mean(d1$age)
) %>%
  bind_cols(
    .
    , predict(m1, newdata = ., interval = "prediction")
  ) %>%
  ggplot(aes(x = p_value, y = fit)) + 
    geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) + 
    geom_line(color = "seagreen4", size = 2) + 
    theme_classic()

  • This is fine, but it could use some improvements:
    • better scales
    • raw data
    • the usual aesthetics

Better scales

Code
crossing(
  p_value = seq(0, 10, .1)
  , age = mean(d1$age)
) %>%
  bind_cols(
    .
    , predict(m1, newdata = ., interval = "prediction")
  ) %>%
  ggplot(aes(x = p_value, y = fit)) + 
    geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) + 
    geom_line(color = "seagreen4", size = 2) + 
    scale_x_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) + 
    scale_y_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) + 
    theme_classic()

Raw Data

Code
crossing(
  p_value = seq(0, 10, .1)
  , age = mean(d1$age)
) %>%
  bind_cols(., predict(m1, newdata = ., interval = "prediction")) %>%
  ggplot(aes(x = p_value, y = fit)) + 
    geom_point(
      data = d1
      , aes(x = p_value, y = SRhealth)
      , alpha = .4
      , color = "seagreen4"
      ) + 
    geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) + 
    geom_line(color = "seagreen4", size = 2) + 
    scale_x_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) + 
    scale_y_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) + 
    theme_classic()

The usual aesthetics

Code
crossing(
  p_value = seq(0, 10, .1)
  , age = mean(d1$age)
) %>%
  bind_cols(., predict(m1, newdata = ., interval = "prediction")) %>%
  ggplot(aes(x = p_value, y = fit)) + 
    geom_point(data = d1, aes(x = p_value, y = SRhealth)
      , alpha = .4, color = "seagreen4") + 
    geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) + 
    geom_line(color = "seagreen4", size = 2) + 
    scale_x_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) + 
    scale_y_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) + 
    labs(
      x = "Conscientiousness (POMP; 0-10)"
      , y = "Predicted Self-Rated Health (POMP; 0-10)"
      , title = "Conscientiousness and Self-Rated Health\nWere Weakly Associated"
      ) + 
    theme_classic() + 
    theme(
      axis.text = element_text(face = "bold", size = rel(1.1))
      , axis.title = element_text(face = "bold", size = rel(1.1))
      , plot.title = element_text(face = "bold", size = rel(1.2), hjust = .5)
      )

Now let’s do it for all of the samples

Code
pred_fun <- function(m){
  d <- m$model

  crossing(
    p_value = seq(0, 10, length.out = 100)
    , age = mean(d$age)
  ) %>%
    bind_cols(
      .
      , predict(m, newdata = ., interval = "prediction")
    )
}

nested_lm <- nested_lm %>%
  mutate(pred = map(m, pred_fun))
nested_lm

Now let’s do it for all of the samples

Code
nested_lm %>%
  select(study, pred) %>%
  unnest(pred)
  • Now let’s plot the predictions for all of the samples
  • Very close, but our prediction intervals are cut off by the axis limits
Code
nested_lm %>%
  select(study, pred) %>%
  unnest(pred) %>%
  ggplot(aes(x = p_value, y = fit)) + 
    # use each sample's raw data (d1 would repeat Sample 1's points in every facet)
    geom_point(data = nested_lm %>% select(study, data) %>% unnest(data)
      , aes(x = p_value, y = SRhealth)
      , alpha = .2, color = "seagreen4") + 
    geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) + 
    geom_line(color = "seagreen4", size = 2) + 
    scale_x_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) + 
    scale_y_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) + 
    labs(
      x = "Conscientiousness (POMP; 0-10)"
      , y = "Predicted Self-Rated Health (POMP; 0-10)"
      , title = "Conscientiousness and Self-Rated Health\nWere Weakly Associated In Most Samples"
      ) + 
    facet_wrap(~study, ncol = 2) + 
    theme_classic() + 
    theme(
      axis.text = element_text(face = "bold", size = rel(1.1))
      , axis.title = element_text(face = "bold", size = rel(1.1))
      , plot.title = element_text(face = "bold", size = rel(1.2), hjust = .5)
      )

  • The prediction intervals are still cut off by the axis limits
  • So let’s clip the interval bounds to the plotting range before plotting
Code
nested_lm %>%
  select(study, pred) %>%
  unnest(pred) %>%
  mutate(upr = ifelse(upr > 10, 10, upr)
         , lwr = ifelse(lwr < 0, 0, lwr)) %>%
  ggplot(aes(x = p_value, y = fit)) + 
    # again, use each sample's raw data rather than d1 (Sample 1 only)
    geom_point(data = nested_lm %>% select(study, data) %>% unnest(data)
      , aes(x = p_value, y = SRhealth)
      , alpha = .2, color = "seagreen4") + 
    geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "seagreen4", alpha = .2) + 
    geom_line(color = "seagreen4", size = 2) + 
    scale_x_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) + 
    scale_y_continuous(limits = c(0,10.2), breaks = seq(0,10,2)) + 
    labs(
      x = "Conscientiousness (POMP; 0-10)"
      , y = "Predicted Self-Rated Health (POMP; 0-10)"
      , title = "Conscientiousness and Self-Rated Health\nWere Weakly Associated In Most Samples"
      ) + 
    facet_wrap(~study, ncol = 2) + 
    theme_classic() + 
    theme(
      axis.text = element_text(face = "bold", size = rel(1.1))
      , axis.title = element_text(face = "bold", size = rel(1.1))
      , plot.title = element_text(face = "bold", size = rel(1.2), hjust = .5)
      , strip.background = element_rect(fill = "darkseagreen4")
      , strip.text = element_text(face = "bold", color = "white")
      )

Wrapping Up

  • This is a quick introduction to visualizing associations and working with models
  • Here, we focused on doing things very manually to promote understanding
  • But there are lots of packages to automate much of this
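
For example (pointers only, assuming you have them installed): GGally::ggpairs() builds a scatterplot matrix in one call, and corrplot::corrplot() draws a correlogram directly from a correlation matrix:

Code
library(GGally)
library(corrplot)

# Scatterplot matrix of a few focal variables
pred_data %>%
  select(p_value, age, SRhealth) %>%
  ggpairs()

# Correlogram straight from cor()
pred_data %>%
  select(p_value, age, SRhealth) %>%
  cor(use = "pairwise") %>%
  corrplot()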