Summarizing data

Table of contents

Summarizing data
- Problem
- Solution

Problem

You want to do summarize your data (with mean, standard deviation, etc.), broken down by group.

Solution

There are three ways described here to group data based on some specified variables, and apply a summary function (like mean, standard deviation, etc.) to each group.

The ddply() function. It is the easiest to use, though it requires the plyr package. This is probably what you want to use.
The summarizeBy() function. It is easier to use, though it requires the doBy package.
The aggregate() function. It is more difficult to use but is included in the base install of R.

Suppose you have this data and want to find the N, mean of change, standard deviation, and standard error of the mean for each group, where the groups are specified by each combination of sex and condition: F-placebo, F-aspirin, M-placebo, and M-aspirin.

data <- read.table(header=T, con <- textConnection('
 subject sex condition before after change
       1   F   placebo   10.1   6.9   -3.2
       2   F   placebo    6.3   4.2   -2.1
       3   M   aspirin   12.4   6.3   -6.1
       4   F   placebo    8.1   6.1   -2.0
       5   M   aspirin   15.2   9.9   -5.3
       6   F   aspirin   10.9   7.0   -3.9
       7   F   aspirin   11.6   8.5   -3.1
       8   M   aspirin    9.5   3.0   -6.5
       9   F   placebo   11.5   9.0   -2.5
      10   M   placebo   11.9  11.0   -0.9
      11   F   aspirin   11.4   8.0   -3.4
      12   M   aspirin   10.0   4.4   -5.6
      13   M   aspirin   12.5   5.4   -7.1
      14   M   placebo   10.6  10.6    0.0
      15   M   aspirin    9.1   4.3   -4.8
      16   F   placebo   12.1  10.2   -1.9
      17   F   placebo   11.0   8.8   -2.2
      18   F   placebo   11.9  10.2   -1.7
      19   M   aspirin    9.1   3.6   -5.5
      20   M   placebo   13.5  12.4   -1.1
      21   M   aspirin   12.0   7.5   -4.5
      22   F   placebo    9.1   7.6   -1.5
      23   M   placebo    9.9   8.0   -1.9
      24   F   placebo    7.6   5.2   -2.4
      25   F   placebo   11.8   9.7   -2.1
      26   F   placebo   11.8  10.7   -1.1
      27   F   aspirin   10.1   7.9   -2.2
      28   M   aspirin   11.6   8.3   -3.3
      29   F   aspirin   11.3   6.8   -4.5
      30   F   placebo   10.3   8.3   -2.0
 '))
close(con)

Using ddply

library(plyr)

# Run the functions length, mean, and sd on the value of "change" for each group, 
# broken down by sex + condition
cdata <- ddply(data, .(sex, condition), summarise, 
               N      = length(change),
               change = mean(change),
               sd     = sd(change),
               se     = sd(change) / sqrt(length(change)) )
# sex condition  N    change        sd        se
#   F   aspirin  5 -3.420000 0.8642916 0.3865230
#   F   placebo 12 -2.058333 0.5247655 0.1514867
#   M   aspirin  9 -5.411111 1.1307569 0.3769190
#   M   placebo  4 -0.975000 0.7804913 0.3902456

Handling missing data

If there are NA's in the data, you need to pass the flag na.rm=TRUE to each of the functions. length() doesn't take na.rm as an option, so one way to work around it is to use sum(!is.na(...)) to count how many non-NA's there are.

# Put some NA's in the data
dataNA <- data
dataNA$change[11:14] <- NA

cdata <- ddply(dataNA, .(sex, condition), summarise, 
               N      = sum(!is.na(change)),
               change = mean(change, na.rm=TRUE),
               sd     = sd(change, na.rm=TRUE),
               se     = sd(change, na.rm=TRUE) / sqrt(sum(!is.na(change))) )
# sex condition  N    change        sd        se
#   F   aspirin  4 -3.425000 0.9979145 0.4989572
#   F   placebo 12 -2.058333 0.5247655 0.1514867
#   M   aspirin  7 -5.142857 1.0674848 0.4034713
#   M   placebo  3 -1.300000 0.5291503 0.3055050

A function for mean, count, standard deviation, standard error of the mean, and confidence interval

Instead of manually specifying all the values you want and then calculating the standard error, as shown above, this function will handle all of those details. It will do all the things described here:

Find the mean, standard deviation, and count (N)
Find the standard error of the mean (again, this may not be what you want if you are collapsing over a within-subject variable. See ../../Graphs/Plotting means and error bars (ggplot2) for information on how to make error bars for graphs with within-subjects variables.)
Find a 95% confidence interval (or other value, if desired)
Rename the columns so that the resulting data frame is easier to work with

To use, put this function in your code and call it as demonstrated below.

## Summarizes data.
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE, conf.interval=.95, .drop=TRUE) {
    require(plyr)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # This is does the summary; it's not easy to understand...
    datac <- ddply(data, groupvars, .drop=.drop,
                   .fun= function(xx, col, na.rm) {
                           c( N    = length2(xx[,col], na.rm=na.rm),
                              mean = mean   (xx[,col], na.rm=na.rm),
                              sd   = sd     (xx[,col], na.rm=na.rm)
                              )
                          },
                    measurevar,
                    na.rm
             )

    # Rename the "mean" column    
    datac <- rename(datac, c("mean"=measurevar))

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval: 
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2 + .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}

Example usage (with 95% confidence interval). Instead of doing all the steps manually, as done previously, the summarySE function does it all in one step:

summarySE(data, measurevar="change", groupvars=c("sex", "condition"))
# sex condition  N    change        sd        se        ci
#   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767
#   M   placebo  4 -0.975000 0.7804913 0.3902456 1.2419358

# With a data set with NA's, use na.rm=TRUE
summarySE(dataNA, measurevar="change", groupvars=c("sex","condition"), na.rm=TRUE)
# sex condition  N    change        sd        se        ci
#   F   aspirin  4 -3.425000 0.9979145 0.4989572 1.5879046
#   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#   M   aspirin  7 -5.142857 1.0674848 0.4034713 0.9872588
#   M   placebo  3 -1.300000 0.5291503 0.3055050 1.3144821

Filling empty combinations with zeros

Sometimes there will be empty combinations of factors in the summary data frame -- that is, combinations of factors that are possible, but don't actually occur in the original data frame. It is often useful to automatically fill in those combinations in the summary data frame with NA's. To do this, set .drop=FALSE in the call to ddply or summarySE.

Example usage:

# First remove some all Male+Placebo entries from the data
dataSub <- subset(data, !(sex=="M" & condition=="placebo"))

# If we summarize the data, there will be a missing row for Male+Placebo,
# since there were no cases with this combination.
summarySE(dataSub, measurevar="change", groupvars=c("sex", "condition"))
# sex condition  N    change        sd        se        ci
#   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767

# Set .drop=FALSE to NOT drop those combinations
summarySE(dataSub, measurevar="change", groupvars=c("sex", "condition"), .drop=FALSE)
# sex condition  N    change        sd        se        ci
#   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767
#   M   placebo  0       NaN        NA        NA        NA
# Warning message:
# In qt(p, df, lower.tail, log.p) : NaNs produced

Using summaryBy

To collapse the data using the summarizeBy() function:

library(doBy)

# Run the functions length, mean, and sd on the value of "change" for each group, 
# broken down by sex + condition
cdata <- summaryBy(change ~ sex + condition, data=data, FUN=c(length,mean,sd))
# sex condition change.length change.mean change.sd
#   F   aspirin             5   -3.420000 0.8642916
#   F   placebo            12   -2.058333 0.5247655
#   M   aspirin             9   -5.411111 1.1307569
#   M   placebo             4   -0.975000 0.7804913

# Rename column change.length to just N
names(cdata)[names(cdata)=="change.length"] <- "N"

# Calculate standard error of the mean
cdata$change.se <- cdata$change.sd / sqrt(cdata$N)
# sex condition  N change.mean change.sd change.se
#   F   aspirin  5   -3.420000 0.8642916 0.3865230
#   F   placebo 12   -2.058333 0.5247655 0.1514867
#   M   aspirin  9   -5.411111 1.1307569 0.3769190
#   M   placebo  4   -0.975000 0.7804913 0.3902456

Note that if you have any within-subjects variables, these standard error values may not be useful for comparing groups. See ../../Graphs/Plotting means and error bars (ggplot2) for information on how to make error bars for graphs with within-subjects variables.

Handling missing data

If there are NA's in the data, you need to pass the flag na.rm=TRUE to the functions. Normally you could pass it to summaryBy() and it would get passed to each of the functions called, but length() does not recognize it and so it won't work. One way around it is to define a new length function that handles the NA's.

# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm=FALSE) {
    if (na.rm) sum(!is.na(x))
    else       length(x)
}

# Put some NA's in the data
dataNA <- data
dataNA$change[11:14] <- NA

cdataNA <- summaryBy(change ~ sex + condition, data=dataNA, FUN=c(length2,mean,sd), na.rm=TRUE)
# sex condition change.length2 change.mean change.sd
#   F   aspirin              4   -3.425000 0.9979145
#   F   placebo             12   -2.058333 0.5247655
#   M   aspirin              7   -5.142857 1.0674848
#   M   placebo              3   -1.300000 0.5291503

# Now, do the same as before

A function for mean, count, standard deviation, standard error of the mean, and confidence interval

Find the mean, standard deviation, and count (N)
Find the standard error of the mean (again, this may not be what you want if you are collapsing over a within-subject variable. See ../../Graphs/Plotting means and error bars (ggplot2) for information on how to make error bars for graphs with within-subjects variables.)
Find a 95% confidence interval (or other value, if desired)
Rename the columns so that the resulting data frame is easier to work with

To use, put this function in your code and call it as demonstrated below.

## Summarizes data.
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE, conf.interval=.95) {
    require(doBy)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # Collapse the data
    formula <- as.formula(paste(measurevar, paste(groupvars, collapse=" + "), sep=" ~ "))
    datac <- summaryBy(formula, data=data, FUN=c(length2,mean,sd), na.rm=na.rm)

    # Rename columns
    names(datac)[ names(datac) == paste(measurevar, ".mean",    sep="") ] <- measurevar
    names(datac)[ names(datac) == paste(measurevar, ".sd",      sep="") ] <- "sd"
    names(datac)[ names(datac) == paste(measurevar, ".length2", sep="") ] <- "N"

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval: 
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2 + .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}

Example usage (with 95% confidence interval). Instead of doing all the steps manually, as done previously, the summarySE function does it all in one step:

summarySE(data, measurevar="change", groupvars=c("sex","condition"))
# sex condition  N    change        sd        se        ci
#   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767
#   M   placebo  4 -0.975000 0.7804913 0.3902456 1.2419358

# With a data set with NA's, use na.rm=TRUE
summarySE(dataNA, measurevar="change", groupvars=c("sex","condition"), na.rm=TRUE)
# sex condition  N    change        sd        se        ci
#   F   aspirin  4 -3.425000 0.9979145 0.4989572 1.5879046
#   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#   M   aspirin  7 -5.142857 1.0674848 0.4034713 0.9872588
#   M   placebo  3 -1.300000 0.5291503 0.3055050 1.3144821

Filling empty combinations with zeros

This function will fill in those missing combinations with zeros:

fillMissingCombs <- function(df, factors, measures) {

    # Make a list of the combinations of factor levels
    levelList <- list()
    for (f in factors) {  levelList[[f]] <- levels(df[,f])  }

    fullFactors <- expand.grid(levelList)

    dfFull <- merge(fullFactors, df, all.x=TRUE)

    # Wherever there is an NA in the measure vars, replace with 0
    for (m in measures) {  dfFull[is.na(dfFull[,m]), m] <- 0  }

    return(dfFull)
}

Example usage:

# First remove some all Male+Placebo entries from the data
dataSub <- subset(data, !(sex=="M" & condition=="placebo"))

# If we summarize the data, there will be a missing row for Male+Placebo,
# since there were no cases with this combination.
cdataSub <- summarySE(dataSub, measurevar="change", groupvars=c("sex", "condition"))
# sex condition  N    change        sd        se        ci
#   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767

# This will fill in the missing combinations with zeros
fillMissingCombs(cdataSub, factors=c("sex","condition"), measures=c("N","change","sd","se","ci"))
# sex condition  N    change        sd        se        ci
#   F   aspirin  5 -3.420000 0.8642916 0.3865230 1.0731598
#   F   placebo 12 -2.058333 0.5247655 0.1514867 0.3334201
#   M   aspirin  9 -5.411111 1.1307569 0.3769190 0.8691767
#   M   placebo  0  0.000000 0.0000000 0.0000000 0.0000000

Using aggregate

The aggregate function is more difficult to use, but it is included in the base R installation and does not require the installation of another package.

# Get a count of number of subjects in each category (sex*condition)
cdata <- aggregate(data["subject"], by=data[c("sex","condition")], FUN=length)
# sex condition subject
#   F   aspirin       5
#   M   aspirin       9
#   F   placebo      12
#   M   placebo       4

# Rename "subject" column to "N"
names(cdata)[names(cdata)=="subject"] <- "N"
# sex condition  N
#   F   placebo 12
#   M   placebo  4
#   F   aspirin  5
#   M   aspirin  9

# Sort by sex first
cdata <- cdata[order(cdata$sex),]
# sex condition  N
#   F   placebo 12
#   F   aspirin  5
#   M   placebo  4
#   M   aspirin  9

# We also keep the __before__ and __after__ columns:
# Get the average effect size by sex and condition
cdata.means <- aggregate(data[c("before","after","change")], 
                         by = data[c("sex","condition")], FUN=mean)
# sex condition   before     after    change
#   F   placebo 10.13333  8.075000 -2.058333
#   M   placebo 11.47500 10.500000 -0.975000
#   F   aspirin 11.06000  7.640000 -3.420000
#   M   aspirin 11.26667  5.855556 -5.411111

# Merge the data frames
cdata <- merge(cdata, cdata.means)
# sex condition  N   before     after    change
#   F   aspirin  5 11.06000  7.640000 -3.420000
#   F   placebo 12 10.13333  8.075000 -2.058333
#   M   aspirin  9 11.26667  5.855556 -5.411111
#   M   placebo  4 11.47500 10.500000 -0.975000

# Get the sample (n-1) standard deviation for "change"
cdata.sd <- aggregate(data["change"],
                      by = data[c("sex","condition")], FUN=sd)
# Rename the column to change.sd
names(cdata.sd)[names(cdata.sd)=="change"] <- "change.sd"
# sex condition change.sd
#   F   placebo 0.5247655
#   M   placebo 0.7804913
#   F   aspirin 0.8642916
#   M   aspirin 1.1307569

# Merge
cdata <- merge(cdata, cdata.sd)
# sex condition  N   before     after    change change.sd
#   F   aspirin  5 11.06000  7.640000 -3.420000 0.8642916
#   F   placebo 12 10.13333  8.075000 -2.058333 0.5247655
#   M   aspirin  9 11.26667  5.855556 -5.411111 1.1307569
#   M   placebo  4 11.47500 10.500000 -0.975000 0.7804913

# Calculate standard error of the mean
cdata$change.se <- cdata$change.sd / sqrt(cdata$N)
# sex condition  N   before     after    change change.sd change.se
#   F   aspirin  5 11.06000  7.640000 -3.420000 0.8642916 0.3865230
#   F   placebo 12 10.13333  8.075000 -2.058333 0.5247655 0.1514867
#   M   aspirin  9 11.26667  5.855556 -5.411111 1.1307569 0.3769190
#   M   placebo  4 11.47500 10.500000 -0.975000 0.7804913 0.3902456

If you have NA's in your data and wish to skip them, use na.rm=TRUE:

cdata.means <- aggregate(data[c("before","after","change")], 
                         by = data[c("sex","condition")],
                         FUN=mean, na.rm=TRUE)

Note

This is the code used to generate the data above. It is here for safe keeping.

set.seed(103)

N <- 30
subject        <- 1:N
data           <- data.frame(subject)
data$sex       <- factor(vector(length=N), levels=c("F","M"))
data$condition <- factor(vector(length=N), levels=c("placebo","aspirin"))
data$before    <- vector("numeric", length=N)
data$after     <- vector("numeric", length=N)
data$change    <- vector("numeric", length=N)

for (i in 1:N) {
    if (runif(1) < .5)  data$sex[i] <- "F"
    else                data$sex[i] <- "M"

    if (runif(1) < .5)  data$condition[i] <- "placebo"
    else                data$condition[i] <- "aspirin"

    # Set different effects based on sex and condition
    if (data$sex[i] == "F") {
        if (data$condition[i] == "placebo")  change.mean <- -2
        else                                 change.mean <- -4 
    }
    else {
        if (data$condition[i] == "placebo")  change.mean <- -1
        else                                 change.mean <- -5 
    }

    data$before[i] <- round(rnorm(1, mean=10, sd=2), digits=1)
    data$change[i] <- round(rnorm(1, mean=change.mean, sd=1), digits=1)
    data$after[i]  <- data$before[i] + data$change[i]

}