ANOVA

Table of contents

ANOVA

Problem

You want to compare multiple groups using an ANOVA.

Solution

Suppose this is your data:

data <- read.table(header=T, con <- textConnection('
 subject sex   age before after
       1   F   old    9.5   7.1
       2   M   old   10.3  11.0
       3   M   old    7.5   5.8
       4   F   old   12.4   8.8
       5   M   old   10.2   8.6
       6   M   old   11.0   8.0
       7   M young    9.1   3.0
       8   F young    7.9   5.2
       9   F   old    6.6   3.4
      10   M young    7.7   4.0
      11   M young    9.4   5.3
      12   M   old   11.6  11.3
      13   M young    9.9   4.6
      14   F young    8.6   6.4
      15   F young   14.3  13.5
      16   F   old    9.2   4.7
      17   M young    9.8   5.1
      18   F   old    9.9   7.3
      19   F young   13.0   9.5
      20   M young   10.2   5.4
      21   M young    9.0   3.7
      22   F young    7.9   6.2
      23   M   old   10.1  10.0
      24   M young    9.0   1.7
      25   M young    8.6   2.9
      26   M young    9.4   3.2
      27   M young    9.7   4.7
      28   M young    9.3   4.9
      29   F young   10.7   9.8
      30   M   old    9.3   9.4
'))
close(con)

One way between ANOVA

# One way between:
# IV: sex
# DV: before
aov.before.sex <- aov(before ~ sex, data=data)
summary(aov.before.sex)
#             Df Sum Sq Mean Sq F value Pr(>F)
# sex          1  1.529   1.529   0.573 0.4554
# Residuals   28 74.701   2.668 

# Show the means
model.tables(aov.before.sex, "means")
# Tables of means
# Grand mean
# 9.703333
#
#  sex 
#      F      M
#     10  9.532
# rep 11 19.000

Two way between ANOVA

# 2x2 between:
# IV: sex
# IV: age
# DV: after
# These two calls are equivalent
aov.after.sex.age <- aov(after ~ sex*age, data=data)
aov.after.sex.age <- aov(after ~ sex + age + sex:age, data=data)
summary(aov.after.sex.age)
#             Df  Sum Sq Mean Sq F value   Pr(>F)    
# sex          1  16.078  16.078  4.0384 0.054962 .  
# age          1  38.961  38.961  9.7862 0.004301 ** 
# sex:age      1  89.611  89.611 22.5085  6.6e-05 ***
# Residuals   26 103.512   3.981                     
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

# Show the means
model.tables(aov.after.sex.age, "means")
# Tables of means
# Grand mean
#          
# 6.483333 
# 
#  sex 
#          F      M
#      7.445  5.926
# rep 11.000 19.000
# 
#  age 
#      young    old
#      5.556  7.874
# rep 18.000 12.000
# 
#  sex:age 
#      age
# sex   young  old   
#   F    8.433  6.260
#   rep  6.000  5.000
#   M    4.042  9.157
#   rep 12.000  7.000

Tukey HSD post-hoc test

TukeyHSD(aov.after.sex.age)
#   Tukey multiple comparisons of means
#     95% family-wise confidence level
#
# Fit: aov(formula = after ~ sex + age + sex:age, data = data)
#
# $sex
#          diff       lwr        upr     p adj
# M-F -1.519139 -3.073025 0.03474709 0.0549625
#
# $age
#              diff       lwr      upr     p adj
# old-young 2.317850 0.7893498 3.846349 0.0044215
#
# $`sex:age`
#                       diff        lwr       upr     p adj
# M:young-F:young -4.3916667 -7.1285380 -1.654795 0.0008841
# F:old-F:young   -2.1733333 -5.4878491  1.141182 0.2966111
# M:old-F:young    0.7238095 -2.3214997  3.769119 0.9138789
# F:old-M:young    2.2183333 -0.6952887  5.131955 0.1832890
# M:old-M:young    5.1154762  2.5121923  7.718760 0.0000676
# M:old-F:old      2.8971429 -0.3079526  6.102238 0.0869856

ANOVAs with within-subjects variables

For ANOVAs with within-subjects variables, the data must be in long format. The data supplied above is in wide format, so we have to convert it first. (See ../../Manipulating data/Converting data between wide and long format for more information.)

Also, for ANOVAs with a within-subjects variable, there must be an identifier column. In this case, it is subject. This identifier variable must be a factor. If it is a numeric type, the function will interpret it incorrectly and it won't work properly.

library(reshape)

# This is what the original (wide) data looks like
# subject sex   age before after
#       1   F   old    9.5   7.1
#       2   M   old   10.3  11.0
#       3   M   old    7.5   5.8

# Convert it to long format
data.long <- melt(data, id = c("subject","sex","age"), # keep these columns the same
                  measure = c("before","after"),       # Put these two columns into a new column
                  variable_name="time")                # Name of the new column with the labels
# subject sex   age   time value
#       1   F   old before   9.5
#       2   M   old before  10.3
#       3   M   old before   7.5
#     ...
#       1   F   old  after   7.1
#       2   M   old  after  11.0
#       3   M   old  after   5.8
#     ...

# Make sure subject column is a factor
data$subject <- factor(data$subject)

One-way within ANOVA

First, convert the data to long format and make sure subject is a factor, as shown above.

# IV (within): time
# DV:          value
aov.time <- aov(value ~ time + Error(subject/time), data=data.long)
summary(aov.time)
# Error: subject
#           Df Sum Sq Mean Sq F value Pr(>F)
# Residuals  1 2.6238  2.6238               
#
# Error: subject:time
#      Df Sum Sq Mean Sq
# time  1 136.97  136.97
#
# Error: Within
#           Df Sum Sq Mean Sq F value  Pr(>F)  
# time       1  21.35   21.35   3.748 0.05793 .
# Residuals 56 318.97    5.70                  
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

# This won't work here for some reason (?)
model.tables(aov.time, "means")
# Tables of means
# Grand mean
#         
# 8.093333 
# Warning message:
# In any(efficiency) : coercing argument of type 'double' to logical

Mixed design ANOVA

First, convert the data to long format and make sure subject is a factor, as shown above.

# 2x2 mixed:
# IV between: age
# IV within:  time
# DV:         value
aov.after.age.time <- aov(value ~ age*time + Error(subject/time), data=data.long)
summary(aov.after.age.time)
# Error: subject
#     Df Sum Sq Mean Sq
# age  1 2.6238  2.6238
#
# Error: subject:time
#      Df Sum Sq Mean Sq
# time  1 136.97  136.97
#
# Error: Within
#           Df  Sum Sq Mean Sq F value  Pr(>F)  
# age        1  22.260  22.260  4.2831 0.04329 *
# time       1  21.348  21.348  4.1076 0.04764 *
# age:time   1  16.064  16.064  3.0909 0.08440 .
# Residuals 54 280.648   5.197                  
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

# This won't work here because the data is unbalanced
model.tables(aov.after.age.time, "means")
# Error in model.tables.aovlist(aov.after.age.time, "means") : 
#   design is unbalanced so cannot proceed

More ANOVAs with within-subjects variables

These examples don't operate on the data above, but they should illustrate how to do things. First, convert the data to long format and make sure subject is a factor, as shown above.

# Two within variables
aov.ww <- aov(y ~ w1*w2 + Error(subject/(w1*w2)), data=data.long)

# One between variable and two within variables
aov.bww <- aov(y ~ b1*w1*w2 + Error(subject/(w1*w2)) + b1, data=data.long)

# Two between variables and one within variables
aov.bww <- aov(y ~ b1*b2*w1 + Error(subject/(w1)) + b1*b2, data=data.long)

Note

This is the code used to generate the data above. It is here for safe keeping.

set.seed(123)

N <- 30
subject     <- 1:N
data        <- data.frame(subject)
data$sex    <- factor(vector(length=N), levels=c("F","M"))
data$age    <- factor(vector(length=N), levels=c("young","old"))
data$before <- vector("numeric", length=N)
data$after  <- vector("numeric", length=N)

for (i in 1:N) {
    if (runif(1) < .5)  data$sex[i] <- "F"
    else                data$sex[i] <- "M"

    if (runif(1) < .5)  data$age[i] <- "young"
    else                data$age[i] <- "old"

    # Set different effects based on sex and condition
    if (data$sex[i] == "F") {
        if (data$age[i] == "young")  change.mean <- -2
        else                         change.mean <- -4 
    }
    else {
        if (data$age[i] == "old")  change.mean <- -1
        else                       change.mean <- -5 
    }

    data$before[i] <- round(rnorm(1, mean=10, sd=2), digits=1)
    data$after[i]  <- data$before[i] + round(rnorm(1, mean=change.mean, sd=1), digits=1)

}

For more information on ANOVA in R, see: