Grouping and Aggregation

Sorting data

head(iris, 6)
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
15.13.51.40.2setosa
24.931.40.2setosa
34.73.21.30.2setosa
44.63.11.50.2setosa
553.61.40.2setosa
65.43.91.70.4setosa
ridx <- order(iris$Sepal.Length)
head(iris[ridx,], 4)
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
144.331.10.1setosa
94.42.91.40.2setosa
394.431.30.2setosa
434.43.21.30.2setosa
ridx <- order(iris$Sepal.Length, decreasing = TRUE)
head(iris[ridx,], 4)
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
1327.93.86.42virginica
1187.73.86.72.2virginica
1197.72.66.92.3virginica
1237.72.86.72virginica
ridx <- order(iris$Sepal.Length, iris$Petal.Length)
head(iris[ridx,], 4)
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
144.331.10.1setosa
394.431.30.2setosa
434.43.21.30.2setosa
94.42.91.40.2setosa
ridx <- order(iris$Sepal.Length, -iris$Petal.Length)
head(iris[ridx,], 4)
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
144.331.10.1setosa
94.42.91.40.2setosa
394.431.30.2setosa
434.43.21.30.2setosa

Trnasposing data

ir6 <- iris[sample(1:nrow(iris), 6, replace=FALSE),]
ir6
Sepal.LengthSepal.WidthPetal.LengthPetal.WidthSpecies
1126.42.75.31.9virginica
1466.735.22.3virginica
565.72.84.51.3versicolor
325.43.41.50.4setosa
986.22.94.31.3versicolor
1296.42.85.62.1virginica
t(ir6)
112146563298129
Sepal.Length6.46.75.75.46.26.4
Sepal.Width2.73.02.83.42.92.8
Petal.Length5.35.24.51.54.35.6
Petal.Width1.92.31.30.41.32.1
Speciesvirginica virginica versicolorsetosa versicolorvirginica

Aggregation (Subgrouping)

mt10 <- mtcars[sample(1:nrow(mtcars), 10, replace=FALSE),]
mt10
mpgcyldisphpdratwtqsecvsamgearcarb
Dodge Challenger15.583181502.763.5216.870032
Hornet Sportabout18.783601753.153.4417.020032
Fiat 12832.4478.7664.082.219.471141
Merc 450SLC15.28275.81803.073.78180033
Chrysler Imperial14.784402303.235.34517.420034
Volvo 142E21.441211094.112.7818.61142
Toyota Corolla33.9471.1654.221.83519.91141
Merc 450SE16.48275.81803.074.0717.40033
Mazda RX42161601103.92.6216.460144
Porsche 914-2264120.3914.432.1416.70152
with(mtcars, aggregate(mtcars, by=list(cyl=cyl), FUN=mean))
cylmpgcyldisphpdratwtqsecvsamgearcarb
1426.663644105.136482.636364.0709092.28572719.137270.90909090.72727274.0909091.545455
2619.742866183.3143122.28573.5857143.11714317.977140.57142860.42857143.8571433.428571
3815.18353.1209.21433.2292863.99921416.7721400.14285713.2857143.5
with(mtcars, aggregate(mtcars, by=list(cyl=cyl, gear=gear), FUN=median))
cylgearmpgcyldisphpdratwtqsecvsamgearcarb
14321.54120.1973.72.46520.011031
26319.756241.5107.52.923.337519.831031
38315.283551803.0753.8117.350033
44425.85493.5664.082.2619.1851141.5
56420.16163.8116.53.913.157517.660.50.544
64528.24107.71024.11.826516.80.5152
76519.761451753.622.7715.50156
88515.48326299.53.883.3714.550156

Reshaping data

library(reshape2)
library(plyr) # needed for the . function
Warning message:
: package ‘plyr’ was built under R version 3.1.3

Starting data frame

ID <- factor(rep(1:2, each=3))
Time <- rep(1:3, 2)
Gene1 <- c(0,5,10,10,5,0) + rnorm(6)
Gene2 <- c(0,5,10,0,5,10) + rnorm(6)
expt <- data.frame(PID=ID, Time=Time, Gene1=Gene1, Gene2=Gene2)
expt
PIDTimeGene1Gene2
111-1.658798-1.509927
2126.4229992.617535
3138.70391711.20532
42111.80803-0.9115341
5224.6703614.869456
623-0.59828498.871238

“Melt” into a “tall” format with all values in a single column. This reuqires identifying all the columns that are needed to uniquely define a row value. In this csse, the “id” columns are “PID” and “Time”.

m.expt <- melt(expt, id=c("PID", "Time"), variable.name="Gene")
m.expt
PIDTimeGenevalue
111Gene1-1.658798
212Gene16.422999
313Gene18.703917
421Gene111.80803
522Gene14.670361
623Gene1-0.5982849
711Gene2-1.509927
812Gene22.617535
913Gene211.20532
1021Gene2-0.9115341
1122Gene24.869456
1223Gene28.871238

Use dcast to reshape

Show the time series for each (PID, Gene) combination.

dcast(m.expt, PID + Gene ~ Time)
PIDGene123
11Gene1-1.6587986.4229998.703917
21Gene2-1.5099272.61753511.20532
32Gene111.808034.670361-0.5982849
42Gene2-0.91153414.8694568.871238

Show the time series for each (Gene, PID) combination.

dcast(m.expt, Gene + PID ~ Time)
GenePID123
1Gene11-1.6587986.4229998.703917
2Gene1211.808034.670361-0.5982849
3Gene21-1.5099272.61753511.20532
4Gene22-0.91153414.8694568.871238

Recreate the original data set

dcast(m.expt, PID + Time ~ Gene)
PIDTimeGene1Gene2
111-1.658798-1.509927
2126.4229992.617535
3138.70391711.20532
42111.80803-0.9115341
5224.6703614.869456
623-0.59828498.871238

Show all data for each subject in a single row

dcast(m.expt, PID ~ Gene + Time)
PIDGene1_1Gene1_2Gene1_3Gene2_1Gene2_2Gene2_3
11-1.6587986.4229998.703917-1.5099272.61753511.20532
2211.808034.670361-0.5982849-0.91153414.8694568.871238

We can also aggregate while reshaping

What is the average expression value for each gene for each subject over all time points?

dcast(m.expt, PID ~ Gene, mean)
PIDGene1Gene2
114.4893724.104311
225.2933694.276387

What is the average expression value for each gene for each time point over all subjects?

dcast(m.expt, Time ~ Gene, mean)
TimeGene1Gene2
115.074616-1.210731
225.546683.743496
334.05281610.03828

Finally, we can perform subssetting on the named vairables.

For example, restrict the previous query to subject with PID= 1.

dcast(m.expt, Time ~ Gene, mean, subset = .(PID == 1))
TimeGene1Gene2
11-1.658798-1.509927
226.4229992.617535
338.70391711.20532

Merging data

A common task in data analysis is to link data from two or more datasts, for example, to relate assay data to clinical phenoytpe.

Here we will work thought a typical example where the genotype and phenoytpe information come from two different data sets, and the ID information needed to link the two is from a third data set.

phenodat <- read.csv("phenodat.csv")
gdat1 <- read.csv("gdat1.csv")
gdat2 <- read.csv("gdat2.csv")
iddat <- read.csv("iddat.csv")

Eyeball data sett

A quick sanity check to see what the data look like.

(dim(phenodat))
(dim(gdat1))
(dim(gdat2))
(dim(iddat))
  1. 10
  2. 2
  1. 11
  2. 3
  1. 11
  2. 3
  1. 20
  2. 2
head(phenodat, 3)
pidtrt
1pid60
2pid151
3pid80
head(gdat1, 3)
expidgene1gene2
1100020-0.4321298-0.2288958
2100018-1.3189380.7935853
31000131.242919-1.334354
head(gdat2, 3)
expidgene1gene2
1100009-1.220512-0.2416898
21000080.28654861.685887
3100007-0.7717918-1.070068
head(iddat, 3)
pidexpid
1pid20100020
2pid9100009
3pid13100013

Combine gene data from two data sets

Often, we have the same type of data stroed in mulitple data sets, for example, one per batch. In this case, we want to combine rows.

gdat <- rbind(gdat1, gdat2)

Checking for duplicates

show.dups <- function(df) {
    return(df[duplicated(df), ])
    }
show.dups(phenodat)
pidtrt
show.dups(iddat)
pidexpid
show.dups(gdat)
expidgene1gene2
131000080.28654861.685887
151000030.88673610.2760235
17100004-0.151396-1.048976
18100018-1.3189380.7935853
201000110.8001769-0.7729782
21100001-0.59960831.689873

Remove duplicates

gdat <- unique(gdat)
dim(gdat)
  1. 16
  2. 3
show.dups(gdat)
expidgene1gene2

Merging

To combine columns from different data sets, we can perform a merge operation. Rows in the different data set need some common identifier to be merged, typcialy information from one or more “ID” columns.

Merge all rows with information for both phenotype and gene

First we merge phnenoytpe data with the ID data

(df1 <- merge(phenodat, iddat, by="pid", all.x=TRUE))
pidtrtexpid
1pid10100001
2pid121100012
3pid151100015
4pid160100016
5pid170100017
6pid181100018
7pid200100020
8pid60100006
9pid70100007
10pid80100008

Then we merge with gene data

(df2 <- merge(gdat, df1, by="expid"))
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100007-0.7717918-1.070068pid70
31000080.28654861.685887pid80
41000150.39370871.233976pid151
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181
7100020-0.4321298-0.2288958pid200

Note that there are now only 7 rows becasue 3 phenotypes did not have matching gene data.

What if we want to show all genes even if there is no matching phenotype data?

(df3 <- merge(gdat, df1, by="expid", all.x=TRUE))
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100002-0.12941071.228393NANA
31000030.88673610.2760235NANA
4100004-0.151396-1.048976NANA
51000050.3297912-0.5208693NANA
6100007-0.7717918-1.070068pid70
71000080.28654861.685887pid80
8100009-1.220512-0.2416898NANA
91000110.8001769-0.7729782NANA
101000131.242919-1.334354NANA
11100014-0.93438510.4958705NANA
121000150.39370871.233976pid151
13100017-0.88643670.4120223pid170
14100018-1.3189380.7935853pid181
151000190.02884391-0.1524106NANA
16100020-0.4321298-0.2288958pid200

What if we want to show all phenotypes even if there is no matching gene data?

(df4 <- merge(gdat, df1, by="expid", all.y=TRUE))
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100006NANApid60
3100007-0.7717918-1.070068pid70
41000080.28654861.685887pid80
5100012NANApid121
61000150.39370871.233976pid151
7100016NANApid160
8100017-0.88643670.4120223pid170
9100018-1.3189380.7935853pid181
10100020-0.4321298-0.2288958pid200

What if we want to show everything?

(df5 <- merge(gdat, df1, by="expid", all.x=TRUE, all.y=TRUE))
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100002-0.12941071.228393NANA
31000030.88673610.2760235NANA
4100004-0.151396-1.048976NANA
51000050.3297912-0.5208693NANA
6100006NANApid60
7100007-0.7717918-1.070068pid70
81000080.28654861.685887pid80
9100009-1.220512-0.2416898NANA
101000110.8001769-0.7729782NANA
11100012NANApid121
121000131.242919-1.334354NANA
13100014-0.93438510.4958705NANA
141000150.39370871.233976pid151
15100016NANApid160
16100017-0.88643670.4120223pid170
17100018-1.3189380.7935853pid181
181000190.02884391-0.1524106NANA
19100020-0.4321298-0.2288958pid200

Rearrange column order

df2[, c(4,1,2,3,5)]
pidexpidgene1gene2trt
1pid1100001-0.59960831.6898730
2pid7100007-0.7717918-1.0700680
3pid81000080.28654861.6858870
4pid151000150.39370871.2339761
5pid17100017-0.88643670.41202230
6pid18100018-1.3189380.79358531
7pid20100020-0.4321298-0.22889580

Sorting data

df2
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100007-0.7717918-1.070068pid70
31000080.28654861.685887pid80
41000150.39370871.233976pid151
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181
7100020-0.4321298-0.2288958pid200
df2[order(df2$expid),]
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
2100007-0.7717918-1.070068pid70
31000080.28654861.685887pid80
41000150.39370871.233976pid151
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181
7100020-0.4321298-0.2288958pid200
df2[order(df2$pid),]
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
41000150.39370871.233976pid151
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181
7100020-0.4321298-0.2288958pid200
2100007-0.7717918-1.070068pid70
31000080.28654861.685887pid80
df2[order(df2$pid, df2$expid),]
expidgene1gene2pidtrt
1100001-0.59960831.689873pid10
41000150.39370871.233976pid151
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181
7100020-0.4321298-0.2288958pid200
2100007-0.7717918-1.070068pid70
31000080.28654861.685887pid80
df2[order(df2$gene1, decreasing = TRUE),]
expidgene1gene2pidtrt
41000150.39370871.233976pid151
31000080.28654861.685887pid80
7100020-0.4321298-0.2288958pid200
1100001-0.59960831.689873pid10
2100007-0.7717918-1.070068pid70
5100017-0.88643670.4120223pid170
6100018-1.3189380.7935853pid181