head(iris, 6)
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
2 | 4.9 | 3 | 1.4 | 0.2 | setosa |
3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
5 | 5 | 3.6 | 1.4 | 0.2 | setosa |
6 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
ridx <- order(iris$Sepal.Length)
head(iris[ridx,], 4)
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
14 | 4.3 | 3 | 1.1 | 0.1 | setosa |
9 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
39 | 4.4 | 3 | 1.3 | 0.2 | setosa |
43 | 4.4 | 3.2 | 1.3 | 0.2 | setosa |
ridx <- order(iris$Sepal.Length, decreasing = TRUE)
head(iris[ridx,], 4)
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
132 | 7.9 | 3.8 | 6.4 | 2 | virginica |
118 | 7.7 | 3.8 | 6.7 | 2.2 | virginica |
119 | 7.7 | 2.6 | 6.9 | 2.3 | virginica |
123 | 7.7 | 2.8 | 6.7 | 2 | virginica |
ridx <- order(iris$Sepal.Length, iris$Petal.Length)
head(iris[ridx,], 4)
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
14 | 4.3 | 3 | 1.1 | 0.1 | setosa |
39 | 4.4 | 3 | 1.3 | 0.2 | setosa |
43 | 4.4 | 3.2 | 1.3 | 0.2 | setosa |
9 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
ridx <- order(iris$Sepal.Length, -iris$Petal.Length)
head(iris[ridx,], 4)
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
14 | 4.3 | 3 | 1.1 | 0.1 | setosa |
9 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
39 | 4.4 | 3 | 1.3 | 0.2 | setosa |
43 | 4.4 | 3.2 | 1.3 | 0.2 | setosa |
ir6 <- iris[sample(1:nrow(iris), 6, replace=FALSE),]
ir6
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
112 | 6.4 | 2.7 | 5.3 | 1.9 | virginica |
146 | 6.7 | 3 | 5.2 | 2.3 | virginica |
56 | 5.7 | 2.8 | 4.5 | 1.3 | versicolor |
32 | 5.4 | 3.4 | 1.5 | 0.4 | setosa |
98 | 6.2 | 2.9 | 4.3 | 1.3 | versicolor |
129 | 6.4 | 2.8 | 5.6 | 2.1 | virginica |
t(ir6)
112 | 146 | 56 | 32 | 98 | 129 | |
---|---|---|---|---|---|---|
Sepal.Length | 6.4 | 6.7 | 5.7 | 5.4 | 6.2 | 6.4 |
Sepal.Width | 2.7 | 3.0 | 2.8 | 3.4 | 2.9 | 2.8 |
Petal.Length | 5.3 | 5.2 | 4.5 | 1.5 | 4.3 | 5.6 |
Petal.Width | 1.9 | 2.3 | 1.3 | 0.4 | 1.3 | 2.1 |
Species | virginica | virginica | versicolor | setosa | versicolor | virginica |
mt10 <- mtcars[sample(1:nrow(mtcars), 10, replace=FALSE),]
mt10
mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|
Dodge Challenger | 15.5 | 8 | 318 | 150 | 2.76 | 3.52 | 16.87 | 0 | 0 | 3 | 2 |
Hornet Sportabout | 18.7 | 8 | 360 | 175 | 3.15 | 3.44 | 17.02 | 0 | 0 | 3 | 2 |
Fiat 128 | 32.4 | 4 | 78.7 | 66 | 4.08 | 2.2 | 19.47 | 1 | 1 | 4 | 1 |
Merc 450SLC | 15.2 | 8 | 275.8 | 180 | 3.07 | 3.78 | 18 | 0 | 0 | 3 | 3 |
Chrysler Imperial | 14.7 | 8 | 440 | 230 | 3.23 | 5.345 | 17.42 | 0 | 0 | 3 | 4 |
Volvo 142E | 21.4 | 4 | 121 | 109 | 4.11 | 2.78 | 18.6 | 1 | 1 | 4 | 2 |
Toyota Corolla | 33.9 | 4 | 71.1 | 65 | 4.22 | 1.835 | 19.9 | 1 | 1 | 4 | 1 |
Merc 450SE | 16.4 | 8 | 275.8 | 180 | 3.07 | 4.07 | 17.4 | 0 | 0 | 3 | 3 |
Mazda RX4 | 21 | 6 | 160 | 110 | 3.9 | 2.62 | 16.46 | 0 | 1 | 4 | 4 |
Porsche 914-2 | 26 | 4 | 120.3 | 91 | 4.43 | 2.14 | 16.7 | 0 | 1 | 5 | 2 |
with(mtcars, aggregate(mtcars, by=list(cyl=cyl), FUN=mean))
cyl | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 4 | 26.66364 | 4 | 105.1364 | 82.63636 | 4.070909 | 2.285727 | 19.13727 | 0.9090909 | 0.7272727 | 4.090909 | 1.545455 |
2 | 6 | 19.74286 | 6 | 183.3143 | 122.2857 | 3.585714 | 3.117143 | 17.97714 | 0.5714286 | 0.4285714 | 3.857143 | 3.428571 |
3 | 8 | 15.1 | 8 | 353.1 | 209.2143 | 3.229286 | 3.999214 | 16.77214 | 0 | 0.1428571 | 3.285714 | 3.5 |
with(mtcars, aggregate(mtcars, by=list(cyl=cyl, gear=gear), FUN=median))
cyl | gear | mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 4 | 3 | 21.5 | 4 | 120.1 | 97 | 3.7 | 2.465 | 20.01 | 1 | 0 | 3 | 1 |
2 | 6 | 3 | 19.75 | 6 | 241.5 | 107.5 | 2.92 | 3.3375 | 19.83 | 1 | 0 | 3 | 1 |
3 | 8 | 3 | 15.2 | 8 | 355 | 180 | 3.075 | 3.81 | 17.35 | 0 | 0 | 3 | 3 |
4 | 4 | 4 | 25.85 | 4 | 93.5 | 66 | 4.08 | 2.26 | 19.185 | 1 | 1 | 4 | 1.5 |
5 | 6 | 4 | 20.1 | 6 | 163.8 | 116.5 | 3.91 | 3.1575 | 17.66 | 0.5 | 0.5 | 4 | 4 |
6 | 4 | 5 | 28.2 | 4 | 107.7 | 102 | 4.1 | 1.8265 | 16.8 | 0.5 | 1 | 5 | 2 |
7 | 6 | 5 | 19.7 | 6 | 145 | 175 | 3.62 | 2.77 | 15.5 | 0 | 1 | 5 | 6 |
8 | 8 | 5 | 15.4 | 8 | 326 | 299.5 | 3.88 | 3.37 | 14.55 | 0 | 1 | 5 | 6 |
library(reshape2)
library(plyr) # needed for the . function
Warning message:
: package ‘plyr’ was built under R version 3.1.3
ID <- factor(rep(1:2, each=3))
Time <- rep(1:3, 2)
Gene1 <- c(0,5,10,10,5,0) + rnorm(6)
Gene2 <- c(0,5,10,0,5,10) + rnorm(6)
expt <- data.frame(PID=ID, Time=Time, Gene1=Gene1, Gene2=Gene2)
expt
PID | Time | Gene1 | Gene2 | |
---|---|---|---|---|
1 | 1 | 1 | -1.658798 | -1.509927 |
2 | 1 | 2 | 6.422999 | 2.617535 |
3 | 1 | 3 | 8.703917 | 11.20532 |
4 | 2 | 1 | 11.80803 | -0.9115341 |
5 | 2 | 2 | 4.670361 | 4.869456 |
6 | 2 | 3 | -0.5982849 | 8.871238 |
“Melt” into a “tall” format with all values in a single column. This reuqires identifying all the columns that are needed to uniquely define a row value. In this csse, the “id” columns are “PID” and “Time”.
m.expt <- melt(expt, id=c("PID", "Time"), variable.name="Gene")
m.expt
PID | Time | Gene | value | |
---|---|---|---|---|
1 | 1 | 1 | Gene1 | -1.658798 |
2 | 1 | 2 | Gene1 | 6.422999 |
3 | 1 | 3 | Gene1 | 8.703917 |
4 | 2 | 1 | Gene1 | 11.80803 |
5 | 2 | 2 | Gene1 | 4.670361 |
6 | 2 | 3 | Gene1 | -0.5982849 |
7 | 1 | 1 | Gene2 | -1.509927 |
8 | 1 | 2 | Gene2 | 2.617535 |
9 | 1 | 3 | Gene2 | 11.20532 |
10 | 2 | 1 | Gene2 | -0.9115341 |
11 | 2 | 2 | Gene2 | 4.869456 |
12 | 2 | 3 | Gene2 | 8.871238 |
dcast(m.expt, PID + Gene ~ Time)
PID | Gene | 1 | 2 | 3 | |
---|---|---|---|---|---|
1 | 1 | Gene1 | -1.658798 | 6.422999 | 8.703917 |
2 | 1 | Gene2 | -1.509927 | 2.617535 | 11.20532 |
3 | 2 | Gene1 | 11.80803 | 4.670361 | -0.5982849 |
4 | 2 | Gene2 | -0.9115341 | 4.869456 | 8.871238 |
dcast(m.expt, Gene + PID ~ Time)
Gene | PID | 1 | 2 | 3 | |
---|---|---|---|---|---|
1 | Gene1 | 1 | -1.658798 | 6.422999 | 8.703917 |
2 | Gene1 | 2 | 11.80803 | 4.670361 | -0.5982849 |
3 | Gene2 | 1 | -1.509927 | 2.617535 | 11.20532 |
4 | Gene2 | 2 | -0.9115341 | 4.869456 | 8.871238 |
dcast(m.expt, PID + Time ~ Gene)
PID | Time | Gene1 | Gene2 | |
---|---|---|---|---|
1 | 1 | 1 | -1.658798 | -1.509927 |
2 | 1 | 2 | 6.422999 | 2.617535 |
3 | 1 | 3 | 8.703917 | 11.20532 |
4 | 2 | 1 | 11.80803 | -0.9115341 |
5 | 2 | 2 | 4.670361 | 4.869456 |
6 | 2 | 3 | -0.5982849 | 8.871238 |
dcast(m.expt, PID ~ Gene + Time)
PID | Gene1_1 | Gene1_2 | Gene1_3 | Gene2_1 | Gene2_2 | Gene2_3 | |
---|---|---|---|---|---|---|---|
1 | 1 | -1.658798 | 6.422999 | 8.703917 | -1.509927 | 2.617535 | 11.20532 |
2 | 2 | 11.80803 | 4.670361 | -0.5982849 | -0.9115341 | 4.869456 | 8.871238 |
dcast(m.expt, PID ~ Gene, mean)
PID | Gene1 | Gene2 | |
---|---|---|---|
1 | 1 | 4.489372 | 4.104311 |
2 | 2 | 5.293369 | 4.276387 |
dcast(m.expt, Time ~ Gene, mean)
Time | Gene1 | Gene2 | |
---|---|---|---|
1 | 1 | 5.074616 | -1.210731 |
2 | 2 | 5.54668 | 3.743496 |
3 | 3 | 4.052816 | 10.03828 |
For example, restrict the previous query to subject with PID= 1.
dcast(m.expt, Time ~ Gene, mean, subset = .(PID == 1))
Time | Gene1 | Gene2 | |
---|---|---|---|
1 | 1 | -1.658798 | -1.509927 |
2 | 2 | 6.422999 | 2.617535 |
3 | 3 | 8.703917 | 11.20532 |
A common task in data analysis is to link data from two or more datasts, for example, to relate assay data to clinical phenoytpe.
Here we will work thought a typical example where the genotype and phenoytpe information come from two different data sets, and the ID information needed to link the two is from a third data set.
phenodat <- read.csv("phenodat.csv")
gdat1 <- read.csv("gdat1.csv")
gdat2 <- read.csv("gdat2.csv")
iddat <- read.csv("iddat.csv")
A quick sanity check to see what the data look like.
(dim(phenodat))
(dim(gdat1))
(dim(gdat2))
(dim(iddat))
head(phenodat, 3)
pid | trt | |
---|---|---|
1 | pid6 | 0 |
2 | pid15 | 1 |
3 | pid8 | 0 |
head(gdat1, 3)
expid | gene1 | gene2 | |
---|---|---|---|
1 | 100020 | -0.4321298 | -0.2288958 |
2 | 100018 | -1.318938 | 0.7935853 |
3 | 100013 | 1.242919 | -1.334354 |
head(gdat2, 3)
expid | gene1 | gene2 | |
---|---|---|---|
1 | 100009 | -1.220512 | -0.2416898 |
2 | 100008 | 0.2865486 | 1.685887 |
3 | 100007 | -0.7717918 | -1.070068 |
head(iddat, 3)
pid | expid | |
---|---|---|
1 | pid20 | 100020 |
2 | pid9 | 100009 |
3 | pid13 | 100013 |
Often, we have the same type of data stroed in mulitple data sets, for example, one per batch. In this case, we want to combine rows.
gdat <- rbind(gdat1, gdat2)
show.dups <- function(df) {
return(df[duplicated(df), ])
}
show.dups(phenodat)
pid | trt |
---|
show.dups(iddat)
pid | expid |
---|
show.dups(gdat)
expid | gene1 | gene2 | |
---|---|---|---|
13 | 100008 | 0.2865486 | 1.685887 |
15 | 100003 | 0.8867361 | 0.2760235 |
17 | 100004 | -0.151396 | -1.048976 |
18 | 100018 | -1.318938 | 0.7935853 |
20 | 100011 | 0.8001769 | -0.7729782 |
21 | 100001 | -0.5996083 | 1.689873 |
To combine columns from different data sets, we can perform a merge
operation. Rows in the different data set need some common identifier to
be merged, typcialy information from one or more “ID” columns.
(df1 <- merge(phenodat, iddat, by="pid", all.x=TRUE))
pid | trt | expid | |
---|---|---|---|
1 | pid1 | 0 | 100001 |
2 | pid12 | 1 | 100012 |
3 | pid15 | 1 | 100015 |
4 | pid16 | 0 | 100016 |
5 | pid17 | 0 | 100017 |
6 | pid18 | 1 | 100018 |
7 | pid20 | 0 | 100020 |
8 | pid6 | 0 | 100006 |
9 | pid7 | 0 | 100007 |
10 | pid8 | 0 | 100008 |
(df2 <- merge(gdat, df1, by="expid"))
expid | gene1 | gene2 | pid | trt | |
---|---|---|---|---|---|
1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
Note that there are now only 7 rows becasue 3 phenotypes did not have matching gene data.
(df3 <- merge(gdat, df1, by="expid", all.x=TRUE))
expid | gene1 | gene2 | pid | trt | |
---|---|---|---|---|---|
1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
2 | 100002 | -0.1294107 | 1.228393 | NA | NA |
3 | 100003 | 0.8867361 | 0.2760235 | NA | NA |
4 | 100004 | -0.151396 | -1.048976 | NA | NA |
5 | 100005 | 0.3297912 | -0.5208693 | NA | NA |
6 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
7 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
8 | 100009 | -1.220512 | -0.2416898 | NA | NA |
9 | 100011 | 0.8001769 | -0.7729782 | NA | NA |
10 | 100013 | 1.242919 | -1.334354 | NA | NA |
11 | 100014 | -0.9343851 | 0.4958705 | NA | NA |
12 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
13 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
14 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
15 | 100019 | 0.02884391 | -0.1524106 | NA | NA |
16 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
(df4 <- merge(gdat, df1, by="expid", all.y=TRUE))
expid | gene1 | gene2 | pid | trt | |
---|---|---|---|---|---|
1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
2 | 100006 | NA | NA | pid6 | 0 |
3 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
4 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
5 | 100012 | NA | NA | pid12 | 1 |
6 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
7 | 100016 | NA | NA | pid16 | 0 |
8 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
9 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
10 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
(df5 <- merge(gdat, df1, by="expid", all.x=TRUE, all.y=TRUE))
expid | gene1 | gene2 | pid | trt | |
---|---|---|---|---|---|
1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
2 | 100002 | -0.1294107 | 1.228393 | NA | NA |
3 | 100003 | 0.8867361 | 0.2760235 | NA | NA |
4 | 100004 | -0.151396 | -1.048976 | NA | NA |
5 | 100005 | 0.3297912 | -0.5208693 | NA | NA |
6 | 100006 | NA | NA | pid6 | 0 |
7 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
8 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
9 | 100009 | -1.220512 | -0.2416898 | NA | NA |
10 | 100011 | 0.8001769 | -0.7729782 | NA | NA |
11 | 100012 | NA | NA | pid12 | 1 |
12 | 100013 | 1.242919 | -1.334354 | NA | NA |
13 | 100014 | -0.9343851 | 0.4958705 | NA | NA |
14 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
15 | 100016 | NA | NA | pid16 | 0 |
16 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
17 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
18 | 100019 | 0.02884391 | -0.1524106 | NA | NA |
19 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
df2[, c(4,1,2,3,5)]
pid | expid | gene1 | gene2 | trt | |
---|---|---|---|---|---|
1 | pid1 | 100001 | -0.5996083 | 1.689873 | 0 |
2 | pid7 | 100007 | -0.7717918 | -1.070068 | 0 |
3 | pid8 | 100008 | 0.2865486 | 1.685887 | 0 |
4 | pid15 | 100015 | 0.3937087 | 1.233976 | 1 |
5 | pid17 | 100017 | -0.8864367 | 0.4120223 | 0 |
6 | pid18 | 100018 | -1.318938 | 0.7935853 | 1 |
7 | pid20 | 100020 | -0.4321298 | -0.2288958 | 0 |
df2
expid | gene1 | gene2 | pid | trt | |
---|---|---|---|---|---|
1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
df2[order(df2$expid),]
expid | gene1 | gene2 | pid | trt | |
---|---|---|---|---|---|
1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
df2[order(df2$pid),]
expid | gene1 | gene2 | pid | trt | |
---|---|---|---|---|---|
1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
df2[order(df2$pid, df2$expid),]
expid | gene1 | gene2 | pid | trt | |
---|---|---|---|---|---|
1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |
7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
df2[order(df2$gene1, decreasing = TRUE),]
expid | gene1 | gene2 | pid | trt | |
---|---|---|---|---|---|
4 | 100015 | 0.3937087 | 1.233976 | pid15 | 1 |
3 | 100008 | 0.2865486 | 1.685887 | pid8 | 0 |
7 | 100020 | -0.4321298 | -0.2288958 | pid20 | 0 |
1 | 100001 | -0.5996083 | 1.689873 | pid1 | 0 |
2 | 100007 | -0.7717918 | -1.070068 | pid7 | 0 |
5 | 100017 | -0.8864367 | 0.4120223 | pid17 | 0 |
6 | 100018 | -1.318938 | 0.7935853 | pid18 | 1 |