Import count data using basic tools

We want to read in the count files using more basic tools (important if the count files are not generated fro,m htseq-count). We will import the files as matrices. We start with some of the preliminary steps

library(tools)
phfile="/home/owzar001/CURRENT/summercourse-2015/Data/sampletable.txt"
cntdir="/home/owzar001/CURRENT/summercourse-2015/Data/COUNTS"
md5sum(phfile)
phdata=read.table(phfile,sep=",",stringsAsFactor=FALSE)
/home/owzar001/CURRENT/summercourse-2015/Data/sampletable.txt: '10cdaaa6e5fc76ab92b6d845d33a2a2f'
colnames(phdata)=c("filename","sampid","trt")
phdata[["md5sum"]]=md5sum(file.path(cntdir,phdata[["filename"]]))
phdata
filenamesampidtrtmd5sum
1AGTCAA_counts.tsvAGTCAA0a9eaa959aba1b02b3831583c2a9751c8
2AGTTCC_counts.tsvAGTTCC04183767e4eeb75dc582bcf438af13500
3ATGTCA_counts.tsvATGTCA026fbba06520758e5a3acd9bd432ebed4
4CCGTCC_counts.tsvCCGTCC150036a88fd48645f740a31f4f4352cfb
5GTCCGC_counts.tsvGTCCGC1bb1cecd886127159157e9431d072cad5
6GTGAAA_counts.tsvGTGAAA1fa544c0a076eedb54937c7189f4e1fbc
phdata=phdata[c("sampid","filename","trt","md5sum")]
phdata
sampidfilenametrtmd5sum
1AGTCAAAGTCAA_counts.tsv0a9eaa959aba1b02b3831583c2a9751c8
2AGTTCCAGTTCC_counts.tsv04183767e4eeb75dc582bcf438af13500
3ATGTCAATGTCA_counts.tsv026fbba06520758e5a3acd9bd432ebed4
4CCGTCCCCGTCC_counts.tsv150036a88fd48645f740a31f4f4352cfb
5GTCCGCGTCCGC_counts.tsv1bb1cecd886127159157e9431d072cad5
6GTGAAAGTGAAA_counts.tsv1fa544c0a076eedb54937c7189f4e1fbc
phdata[["trt"]]=as.factor(phdata[["trt"]])

This function reads the data from one file and returns the results as a matrix. It is designed to only keep the rows for which the gene id starts with the prefix “GeneID”

readcnts=function(fname,sep="\t",prefix="GeneID",collab="V1",header=FALSE)
    {
        ### Import text file
        dat=read.table(fname,sep=sep,header=header,stringsAsFactor=FALSE)
        ### Only keep rows for which the gene id matches with the prefix
        dat=dat[substr(dat[[collab]],1,nchar(prefix))==prefix,]
        return(dat)
    }

Let’s read in a file

file1=readcnts("/home/owzar001/CURRENT/summercourse-2015/Data/COUNTS/AGTCAA_counts.tsv")
dim(file1)
  1. 4436
  2. 2
head(file1)
V1V2
1GeneID:12930114118
2GeneID:1293011530
3GeneID:1293011615
4GeneID:1293011712
5GeneID:12930118122
6GeneID:1293011960

Now read in all files

## Read in the first file
countdat=readcnts(file.path(cntdir,phdata$filename[1]))
### Assign the sample id as the column name
names(countdat)[2]=phdata$sampid[1]
### Repeat the last two steps for files 2 ... 6 and merge
### along each step
for(i in 2:nrow(phdata))
    {
        dat2=readcnts(file.path(cntdir,phdata$filename[i]))
        names(dat2)[2]=phdata$sampid[i]
        countdat=merge(countdat,dat2,by="V1")
    }

geneid=countdat$V1
countdat=as.matrix(countdat[,-1])
row.names(countdat)=geneid
head(countdat)
AGTCAAAGTTCCATGTCACCGTCCGTCCGCGTGAAA
GeneID:12930114118137149120161174
GeneID:12930115304225183234
GeneID:12930116155537493627
GeneID:1293011712121311 7 6
GeneID:12930118122137 94 48131 69
GeneID:12930119608878534329
library(DESeq2)
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following object is masked from ‘package:stats’:

    xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, as.vector, cbind, colnames,
    do.call, duplicated, eval, evalq, Filter, Find, get, intersect,
    is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax,
    pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rep.int,
    rownames, sapply, setdiff, sort, table, tapply, union, unique,
    unlist, unsplit

Creating a generic function for ‘nchar’ from package ‘base’ in package ‘S4Vectors’
Loading required package: IRanges
Loading required package: GenomicRanges
Loading required package: GenomeInfoDb
Loading required package: Rcpp
Loading required package: RcppArmadillo
dds=DESeqDataSetFromMatrix(countdat,DataFrame(phdata),design=~trt)
dds
class: DESeqDataSet
dim: 4436 6
exptData(0):
assays(1): counts
rownames(4436): GeneID:12930114 GeneID:12930115 ... GeneID:13406005
  GeneID:13406006
rowRanges metadata column names(0):
colnames(6): AGTCAA AGTTCC ... GTCCGC GTGAAA
colData names(4): sampid filename trt md5sum
sessionInfo()
R version 3.2.1 (2015-06-18)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 8 (jessie)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
 [9] LC_ADDRESS=C               LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages:
 [1] parallel  stats4    tools     stats     graphics  grDevices utils
 [8] datasets  methods   base

other attached packages:
[1] DESeq2_1.8.1              RcppArmadillo_0.5.200.1.0
[3] Rcpp_0.11.6               GenomicRanges_1.20.5
[5] GenomeInfoDb_1.4.1        IRanges_2.2.5
[7] S4Vectors_0.6.2           BiocGenerics_0.14.0

loaded via a namespace (and not attached):
 [1] RColorBrewer_1.1-2   futile.logger_1.4.1  plyr_1.8.3
 [4] XVector_0.8.0        futile.options_1.0.0 base64enc_0.1-3
 [7] rpart_4.1-9          digest_0.6.8         uuid_0.1-2
[10] RSQLite_1.0.0        annotate_1.46.1      lattice_0.20-31
[13] jsonlite_0.9.16      evaluate_0.7         gtable_0.1.2
[16] DBI_0.3.1            IRdisplay_0.3        IRkernel_0.4
[19] proto_0.3-10         gridExtra_2.0.0      rzmq_0.7.7
[22] genefilter_1.50.0    cluster_2.0.1        repr_0.3
[25] stringr_1.0.0        locfit_1.5-9.1       nnet_7.3-9
[28] grid_3.2.1           Biobase_2.28.0       AnnotationDbi_1.30.1
[31] XML_3.98-1.3         survival_2.38-2      BiocParallel_1.2.9
[34] foreign_0.8-63       latticeExtra_0.6-26  Formula_1.2-1
[37] geneplotter_1.46.0   ggplot2_1.0.1        reshape2_1.4.1
[40] lambda.r_1.1.7       magrittr_1.5         splines_3.2.1
[43] scales_0.2.5         Hmisc_3.16-0         MASS_7.3-40
[46] xtable_1.7-4         colorspace_1.2-6     stringi_0.5-5
[49] acepack_1.3-3.3      munsell_0.4.2