From 2bca5d9df29f6a2b5b976ba28e9cae9428c47545 Mon Sep 17 00:00:00 2001 From: Fabio Amaral Date: Thu, 17 Sep 2015 20:12:18 +0100 Subject: [PATCH 1/9] fix typo --- vignettes/01_04_rmarkdown.Rmd | 2 +- ...1_transforms.html => 01_04_rmarkdown.html} | 231 +++++++++--------- 2 files changed, 119 insertions(+), 114 deletions(-) rename vignettes/{01_11_transforms.html => 01_04_rmarkdown.html} (53%) diff --git a/vignettes/01_04_rmarkdown.Rmd b/vignettes/01_04_rmarkdown.Rmd index 55c4cc1..3397a35 100644 --- a/vignettes/01_04_rmarkdown.Rmd +++ b/vignettes/01_04_rmarkdown.Rmd @@ -47,7 +47,7 @@ knit_hooks$set(setPch = function(before, options, envir) { opts_chunk$set(setPch = TRUE) ``` -We will again hide this in future documents, but this shows how to set a default figure width and height, as well as setting plot margians. +We will again hide this in future documents, but this shows how to set a default figure width and height, as well as setting plot margins. ```{r global-plot,warning=FALSE, message=FALSE} diff --git a/vignettes/01_11_transforms.html b/vignettes/01_04_rmarkdown.html similarity index 53% rename from vignettes/01_11_transforms.html rename to vignettes/01_04_rmarkdown.html index cc05719..9a44950 100644 --- a/vignettes/01_11_transforms.html +++ b/vignettes/01_04_rmarkdown.html @@ -11,7 +11,7 @@ -Data transforms +R markdown lecture @@ -61,121 +61,133 @@

Dependencies

This document depends on the following packages:

-
  library(devtools)
-  library(Biobase)
-  library(dplyr)
+
library(devtools)

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

-
install.packages(c("devtools"))
-source("http://www.bioconductor.org/biocLite.R")
-biocLite(c("Biobase"))
+
install.packages("devtools")
-
-

General principles

- +
+

Make the code pretty

+

Set the color scheme and pch.

+
## Color scheme inspired by the RSkittleBrewer package
+## https://github.com/alyssafrazee/RSkittleBrewer
+tropical=  c('darkorange', 'dodgerblue', 'hotpink', 'limegreen', 'yellow')
+palette(tropical)
+par(pch=19)
+

Unfortunately for R markdown documents the pch won’t propagate unless you do something a little more messy. We’ll hide this later, but for now this is what you need to do to set the default.

+
## see ch. 10 Hooks of Xie's knitr book
+library(knitr)
+knit_hooks$set(setPch = function(before, options, envir) {
+  if(before) par(pch = 19)
+})
+opts_chunk$set(setPch = TRUE)
+

We will again hide this in future documents, but this shows how to set a default figure width and height, as well as setting plot margians.

+
knitr::opts_chunk$set(fig.width=5, fig.height=5, size="footnotesize",
+                      warning=FALSE, message=FALSE)
+knitr::knit_hooks$set(small.mar = function(before, options, envir) {
+  if (before) graphics::par(mar = c(5,5,1.5,1))
+})
+
+
+

Compiling documents

+
    +
  1. Try compiling this document using the “Knit HTML” button. What files are produced?
  2. +
  3. Edit the output to be “pdf_document” and recompile. What files are produced?
  4. +
  5. Edit the output to be “word_document” and recompile. What files are produced?
  6. +
-
-

Load some data

-

We will use this expression set to look at how we use plots and tables to check for different characteristics

-
con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData")
-load(file=con)
-close(con)
-bm = bodymap.eset
-pdata=pData(bm)
-edata=as.data.frame(exprs(bm))
-fdata = fData(bm)
-ls()
-
## [1] "bm"           "bodymap.eset" "con"          "edata"       
-## [5] "fdata"        "pdata"        "tropical"
+
+

Naming code chunks

+

The label “chunk1” tells you which part of the code was running in case you have errors. If you compile this document you’ll see it under the “R markdown” window on R studio.

+
x = rnorm(100)
+plot(x,col=3)
+

-
-

Skewed distributions

-

We would like continuous data to be nice and symmetric like a normal distribution for two reasons:(1) plots are easier to see this way and (2) most statistical methods are designed to work better for non-skewed data.

-
hist(rnorm(1000),col=2)
-

-

Realistically though most genomic data is skewed

-
hist(edata[,1],col=2,breaks=100)
-

-

One way to address this skew is to use a transformation. One common transformation is the log transform.

-
hist(log(edata[,1]),col=2,breaks=100)
-

-

One thing to be careful with is that values of zero become -inf when you apply the log transform because the log of zero isn’t defined.

-
min(log(edata))
-
## [1] -Inf
-

You can remove this problem by adding a small number to all the counts before taking the log. This doesn’t change the overall distribution much but resolves the zero problem

-
min(log(edata[,1] + 1))
-
## [1] 0
-
hist(log(edata[,1] + 1),breaks=100,col=2)
-

-

A common choice is to use the log2 transform. The reason is that data on the log2 scale can be expressed as fold changes. If log2(x) - log2(y) = z then there is a z fold change difference between x and y.

-
hist(log2(edata[,1] + 1),breaks=100,col=2)
-

-

Let’s zoom in on the values greater than zero (we’ll come back to the zero values in a minute)

-
hist(log2(edata[,1] + 1),breaks=100,col=2,xlim=c(1,15),ylim=c(0,400))
-

-

This is starting to look a little better but we still have the zeros problem.

+
+

Headers

-
-

Removing features without much data

-

In many types of sequencing data we observe that there is not much data for many features.

-
hist(rowSums(edata==0),col=2)
-

-

A common pre-processing technique is to remove features that don’t have much data. One way to identify and remove them is to use rowMeans and rowMedians

-
low_genes = rowMeans(edata) < 5
-table(low_genes)
-
## low_genes
-## FALSE  TRUE 
-## 11413 41167
-
filt_edata = filter(edata,!low_genes)
-dim(filt_edata)
-
## [1] 11413    19
-
low_genes2 = rowMedians(as.matrix(edata)) < 5
-table(low_genes2,low_genes)
-
##           low_genes
-## low_genes2 FALSE  TRUE
-##      FALSE  9844     2
-##      TRUE   1569 41165
-
filt_edata2 = filter(edata,!low_genes2)
-dim(filt_edata2)
-
## [1] 9846   19
-

After filtering we observe that using the log transform makes the distribution much more symmetric but there are still zeros. We could increase the filter to remove this issue if we needed to.

-
hist(log2(filt_edata[,1] + 1),col=2)
-

+
+

This is a primary header.

+
+

This a secondary header

+
+

This is a tertiary header

-
-

More transforms

-

There are a large number of other transforms depending on the type of data you are using. Some common ones are:

+
+
+

Lists

+

You can create bulleted and numbered lists in the following way.

    -
  • Variance stabilizing transforms which seek to remove a mean variance relationship among the data
  • -
  • Box-Cox transforms which seek to make the data approximately Normally distributed
  • -
  • rlog transform - unique to genomics count data, this is a regularized version of the log transform that seeks to minimize differences at low count levels.
  • +
  • Bullet item 1
  • +
  • Bullet item 2
  • +
  • Bullet item 3
+
    +
  1. Numbered item 1
  2. +
  3. Numbered item 2
  4. +
  5. Numbered item 3
  6. +
+
+
+

Figures

+

The main arguments you might want to change are centering and figure height.

+
x = rnorm(100)
+plot(x,col=3,pch=19)
+

+
+
+

Other chunk arguments

+

Add a chunk option of echo=FALSE to hide the code.

+

+

Setting cache=TRUE makes it so that the code won’t take as long to compile the second time. To see this, uncomment the code below and then run the document twice.

+
#Sys.sleep(10)

Session information

-

Here is the session information

+

It is very useful to record the session information when performing analyses so you can collaborate with yourself in the future.

+
sessionInfo()
+
## R version 3.2.2 (2015-08-14)
+## Platform: x86_64-apple-darwin13.4.0 (64-bit)
+## Running under: OS X 10.10.5 (Yosemite)
+## 
+## locale:
+## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
+## 
+## attached base packages:
+## [1] stats     graphics  grDevices utils     datasets  methods   base     
+## 
+## other attached packages:
+## [1] knitr_1.11     devtools_1.9.1
+## 
+## loaded via a namespace (and not attached):
+##  [1] magrittr_1.5    formatR_1.2     tools_3.2.2     htmltools_0.2.6
+##  [5] yaml_2.1.13     memoise_0.2.1   stringi_0.5-5   rmarkdown_0.8  
+##  [9] stringr_1.0.0   digest_0.6.8    evaluate_0.7.2
devtools::session_info()
##  setting  value                       
 ##  version  R version 3.2.2 (2015-08-14)
@@ -186,29 +198,22 @@ 

Session information

## tz Europe/London ## date 2015-09-17 ## -## package * version date source -## assertthat 0.1 2013-12-06 CRAN (R 3.2.0) -## Biobase * 2.28.0 2015-04-17 Bioconductor -## BiocGenerics * 0.14.0 2015-04-17 Bioconductor -## DBI 0.3.1 2014-09-24 CRAN (R 3.2.0) -## devtools * 1.9.1 2015-09-11 CRAN (R 3.2.0) -## digest 0.6.8 2014-12-31 CRAN (R 3.2.0) -## dplyr * 0.4.3 2015-09-01 CRAN (R 3.2.2) -## evaluate 0.7.2 2015-08-13 CRAN (R 3.2.0) -## formatR 1.2 2015-04-21 CRAN (R 3.2.0) -## htmltools 0.2.6 2014-09-08 CRAN (R 3.2.0) -## knitr * 1.11 2015-08-14 CRAN (R 3.2.2) -## lazyeval 0.1.10 2015-01-02 CRAN (R 3.2.0) -## magrittr 1.5 2014-11-22 CRAN (R 3.2.0) -## memoise 0.2.1 2014-04-22 CRAN (R 3.2.0) -## R6 2.1.1 2015-08-19 CRAN (R 3.2.2) -## Rcpp 0.12.1 2015-09-10 CRAN (R 3.2.0) -## rmarkdown 0.8 2015-08-30 CRAN (R 3.2.2) -## stringi 0.5-5 2015-06-29 CRAN (R 3.2.0) -## stringr 1.0.0 2015-04-30 CRAN (R 3.2.0) -## yaml 2.1.13 2014-06-12 CRAN (R 3.2.0)
+## package * version date source +## devtools * 1.9.1 2015-09-11 CRAN (R 3.2.0) +## digest 0.6.8 2014-12-31 CRAN (R 3.2.0) +## evaluate 0.7.2 2015-08-13 CRAN (R 3.2.0) +## formatR 1.2 2015-04-21 CRAN (R 3.2.0) +## htmltools 0.2.6 2014-09-08 CRAN (R 3.2.0) +## knitr * 1.11 2015-08-14 CRAN (R 3.2.2) +## magrittr 1.5 2014-11-22 CRAN (R 3.2.0) +## memoise 0.2.1 2014-04-22 CRAN (R 3.2.0) +## rmarkdown 0.8 2015-08-30 CRAN (R 3.2.2) +## stringi 0.5-5 2015-06-29 CRAN (R 3.2.0) +## stringr 1.0.0 2015-04-30 CRAN (R 3.2.0) +## yaml 2.1.13 2014-06-12 CRAN (R 3.2.0)

It is also useful to compile the time the document was processed. This document was processed on: 2015-09-17.

+
From be68d160e42ae9ca8aa1ff7b9a62c082e641e207 Mon Sep 17 00:00:00 2001 From: Fabio Amaral Date: Thu, 17 Sep 2015 20:16:11 +0100 Subject: [PATCH 2/9] remove html file --- vignettes/01_04_rmarkdown.html | 241 --------------------------------- 1 file changed, 241 deletions(-) delete mode 100644 vignettes/01_04_rmarkdown.html diff --git a/vignettes/01_04_rmarkdown.html b/vignettes/01_04_rmarkdown.html deleted file mode 100644 index 9a44950..0000000 --- a/vignettes/01_04_rmarkdown.html +++ /dev/null @@ -1,241 +0,0 @@ - - - - - - - - - - - - - -R markdown lecture - - - - - - - - - - - - - - - - - - - - - -
- - - - - - -
-

Dependencies

-

This document depends on the following packages:

-
library(devtools)
-

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

-
install.packages("devtools")
-
-
-

Make the code pretty

-

Set the color scheme and pch.

-
## Color scheme inspired by the RSkittleBrewer package
-## https://github.com/alyssafrazee/RSkittleBrewer
-tropical=  c('darkorange', 'dodgerblue', 'hotpink', 'limegreen', 'yellow')
-palette(tropical)
-par(pch=19)
-

Unfortunately for R markdown documents the pch won’t propagate unless you do something a little more messy. We’ll hide this later, but for now this is what you need to do to set the default.

-
## see ch. 10 Hooks of Xie's knitr book
-library(knitr)
-knit_hooks$set(setPch = function(before, options, envir) {
-  if(before) par(pch = 19)
-})
-opts_chunk$set(setPch = TRUE)
-

We will again hide this in future documents, but this shows how to set a default figure width and height, as well as setting plot margians.

-
knitr::opts_chunk$set(fig.width=5, fig.height=5, size="footnotesize",
-                      warning=FALSE, message=FALSE)
-knitr::knit_hooks$set(small.mar = function(before, options, envir) {
-  if (before) graphics::par(mar = c(5,5,1.5,1))
-})
-
-
-

Compiling documents

-
    -
  1. Try compiling this document using the “Knit HTML” button. What files are produced?
  2. -
  3. Edit the output to be “pdf_document” and recompile. What files are produced?
  4. -
  5. Edit the output to be “word_document” and recompile. What files are produced?
  6. -
-
-
-

Naming code chunks

-

The label “chunk1” tells you which part of the code was running in case you have errors. If you compile this document you’ll see it under the “R markdown” window on R studio.

-
x = rnorm(100)
-plot(x,col=3)
-

-
-
-

Headers

-
-
-

This is a primary header.

-
-

This a secondary header

-
-

This is a tertiary header

-
-
-
-

Lists

-

You can create bulleted and numbered lists in the following way.

-
    -
  • Bullet item 1
  • -
  • Bullet item 2
  • -
  • Bullet item 3
  • -
-
    -
  1. Numbered item 1
  2. -
  3. Numbered item 2
  4. -
  5. Numbered item 3
  6. -
-
-
-

Figures

-

The main arguments you might want to change are centering and figure height.

-
x = rnorm(100)
-plot(x,col=3,pch=19)
-

-
-
-

Other chunk arguments

-

Add a chunk option of echo=FALSE to hide the code.

-

-

Setting cache=TRUE makes it so that the code won’t take as long to compile the second time. To see this, uncomment the code below and then run the document twice.

-
#Sys.sleep(10)
-
-
-

Session information

-

It is very useful to record the session information when performing analyses so you can collaborate with yourself in the future.

-
sessionInfo()
-
## R version 3.2.2 (2015-08-14)
-## Platform: x86_64-apple-darwin13.4.0 (64-bit)
-## Running under: OS X 10.10.5 (Yosemite)
-## 
-## locale:
-## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
-## 
-## attached base packages:
-## [1] stats     graphics  grDevices utils     datasets  methods   base     
-## 
-## other attached packages:
-## [1] knitr_1.11     devtools_1.9.1
-## 
-## loaded via a namespace (and not attached):
-##  [1] magrittr_1.5    formatR_1.2     tools_3.2.2     htmltools_0.2.6
-##  [5] yaml_2.1.13     memoise_0.2.1   stringi_0.5-5   rmarkdown_0.8  
-##  [9] stringr_1.0.0   digest_0.6.8    evaluate_0.7.2
-
devtools::session_info()
-
##  setting  value                       
-##  version  R version 3.2.2 (2015-08-14)
-##  system   x86_64, darwin13.4.0        
-##  ui       X11                         
-##  language (EN)                        
-##  collate  en_GB.UTF-8                 
-##  tz       Europe/London               
-##  date     2015-09-17                  
-## 
-##  package   * version date       source        
-##  devtools  * 1.9.1   2015-09-11 CRAN (R 3.2.0)
-##  digest      0.6.8   2014-12-31 CRAN (R 3.2.0)
-##  evaluate    0.7.2   2015-08-13 CRAN (R 3.2.0)
-##  formatR     1.2     2015-04-21 CRAN (R 3.2.0)
-##  htmltools   0.2.6   2014-09-08 CRAN (R 3.2.0)
-##  knitr     * 1.11    2015-08-14 CRAN (R 3.2.2)
-##  magrittr    1.5     2014-11-22 CRAN (R 3.2.0)
-##  memoise     0.2.1   2014-04-22 CRAN (R 3.2.0)
-##  rmarkdown   0.8     2015-08-30 CRAN (R 3.2.2)
-##  stringi     0.5-5   2015-06-29 CRAN (R 3.2.0)
-##  stringr     1.0.0   2015-04-30 CRAN (R 3.2.0)
-##  yaml        2.1.13  2014-06-12 CRAN (R 3.2.0)
-

It is also useful to compile the time the document was processed. This document was processed on: 2015-09-17.

-
-
- - -
- - - - - - - - From 4a47c6db36dbbd8b9f2c92fb1914da2472e01668 Mon Sep 17 00:00:00 2001 From: Fabio Amaral Date: Thu, 17 Sep 2015 20:44:04 +0100 Subject: [PATCH 3/9] sub-item formating --- vignettes/01_10_exploratory-analysis.Rmd | 10 +- vignettes/01_10_exploratory-analysis.html | 432 ++++++++++++++++++++++ 2 files changed, 437 insertions(+), 5 deletions(-) create mode 100644 vignettes/01_10_exploratory-analysis.html diff --git a/vignettes/01_10_exploratory-analysis.Rmd b/vignettes/01_10_exploratory-analysis.Rmd index 03de4cb..9fa13a2 100644 --- a/vignettes/01_10_exploratory-analysis.Rmd +++ b/vignettes/01_10_exploratory-analysis.Rmd @@ -53,11 +53,11 @@ biocLite("alyssafrazee/RSkittleBrewer") * Use plots as often as possible * Use tables for phenotype data -* Look for - * Missing values - * Outlier values - * Mislabeled samples - * Naming consistency +* Look for: + * Missing values + * Outlier values + * Mislabeled samples + * Naming consistency ### Make the plots pretty diff --git a/vignettes/01_10_exploratory-analysis.html b/vignettes/01_10_exploratory-analysis.html new file mode 100644 index 0000000..335d579 --- /dev/null +++ b/vignettes/01_10_exploratory-analysis.html @@ -0,0 +1,432 @@ + + + + + + + + + + + + + +Exploratory analysis + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+

Dependencies

+

This document depends on the following packages:

+
  library(gplots)
+  library(devtools)
+  library(Biobase)
+  library(RSkittleBrewer)
+  library(org.Hs.eg.db)
+  library(AnnotationDbi)
+

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

+
install.packages(c("devtools","gplots"))
+source("http://www.bioconductor.org/biocLite.R")
+biocLite(c("Biobase","org.Hs.eg.db","AnnotationDbi"))
+biocLite("alyssafrazee/RSkittleBrewer")
+
+

General principles

+
    +
  • Use plots as often as possible
  • +
  • Use tables for phenotype data
  • +
  • Look for: +
      +
    • Missing values
    • +
    • Outlier values
    • +
    • Mislabeled samples
    • +
    • Naming consistency
    • +
  • +
+
+
+

Make the plots pretty

+

Typically we will use color to explore data sets and label different values. There are a large number of color options in R. I like the RSkittleBrewer package, but you can also check out Jenny Bryan’s excellent lecture on colors in R for more information.

+

Load the library and set the color palette with the palette function. Now when I type col = 1 it will look for the first color in the trop colors. We also set the character to be a filled dot with par(pch=19).

+
library(RSkittleBrewer)
+# Make the colors pretty
+trop = RSkittleBrewer("tropical")
+palette(trop)
+par(pch=19)
+
+
+

Load some data

+

We will use this expression set to look at how we use plots and tables to check for different characteristics

+
con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData")
+load(file=con)
+close(con)
+bm = bodymap.eset
+pdata=pData(bm)
+edata=exprs(bm)
+fdata = fData(bm)
+ls()
+
## [1] "bm"           "bodymap.eset" "con"          "edata"       
+## [5] "fdata"        "pdata"        "tropical"
+
+
+

Tables for factor/character variables

+

Tables are good for looking at factor or character variables, especially in phenotype data

+
table(pdata$gender)
+
## 
+## F M 
+## 8 8
+
table(pdata$gender,pdata$race)
+
##    
+##     african_american caucasian
+##   F                1         7
+##   M                0         8
+
+
+

Look for missing values

+

First check a summary of the distribution to look for scale, this is also one way to check for NA values.

+
summary(edata)
+
##    ERS025098           ERS025092          ERS025085       
+##  Min.   :      0.0   Min.   :     0.0   Min.   :     0.0  
+##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:     0.0  
+##  Median :      0.0   Median :     0.0   Median :     0.0  
+##  Mean   :    455.6   Mean   :   361.1   Mean   :   399.3  
+##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:     0.0  
+##  Max.   :1584528.0   Max.   :499802.0   Max.   :808641.0  
+##    ERS025088           ERS025089         ERS025082      
+##  Min.   :      0.0   Min.   :      0   Min.   :      0  
+##  1st Qu.:      0.0   1st Qu.:      0   1st Qu.:      0  
+##  Median :      0.0   Median :      0   Median :      0  
+##  Mean   :    445.5   Mean   :    445   Mean   :    509  
+##  3rd Qu.:      0.0   3rd Qu.:      0   3rd Qu.:      0  
+##  Max.   :1014579.0   Max.   :1415741   Max.   :2484692  
+##    ERS025081           ERS025096         ERS025099       
+##  Min.   :      0.0   Min.   :      0   Min.   :     0.0  
+##  1st Qu.:      0.0   1st Qu.:      0   1st Qu.:     0.0  
+##  Median :      0.0   Median :      0   Median :     0.0  
+##  Mean   :    430.4   Mean   :    558   Mean   :   445.5  
+##  3rd Qu.:      0.0   3rd Qu.:      0   3rd Qu.:     0.0  
+##  Max.   :1356643.0   Max.   :3517964   Max.   :572374.0  
+##    ERS025086          ERS025084          ERS025087     
+##  Min.   :     0.0   Min.   :     0.0   Min.   :     0  
+##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0  
+##  Median :     0.0   Median :     0.0   Median :     0  
+##  Mean   :   370.7   Mean   :   592.1   Mean   :  1097  
+##  3rd Qu.:     0.0   3rd Qu.:     0.0   3rd Qu.:     0  
+##  Max.   :458168.0   Max.   :287539.0   Max.   :519683  
+##    ERS025093           ERS025083          ERS025095        
+##  Min.   :      0.0   Min.   :     0.0   Min.   :      0.0  
+##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:      0.0  
+##  Median :      0.0   Median :     0.0   Median :      0.0  
+##  Mean   :    997.7   Mean   :   434.7   Mean   :    479.6  
+##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:      0.0  
+##  Max.   :1332827.0   Max.   :626824.0   Max.   :1605570.0  
+##    ERS025097           ERS025094          ERS025090       
+##  Min.   :      0.0   Min.   :     0.0   Min.   :     0.0  
+##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:     0.0  
+##  Median :      0.0   Median :     0.0   Median :     0.0  
+##  Mean   :    540.1   Mean   :   518.5   Mean   :   465.3  
+##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:     0.0  
+##  Max.   :1888083.0   Max.   :776623.0   Max.   :535410.0  
+##    ERS025091        
+##  Min.   :      0.0  
+##  1st Qu.:      0.0  
+##  Median :      0.0  
+##  Mean   :    530.1  
+##  3rd Qu.:      0.0  
+##  Max.   :1221921.0
+

NA is the most common character for missing values, but sometimes they are coded as spaces, 999, -1 or “missing”. Check for missing values in a variety of ways

+
# Use option useNA to include NA's in table
+table(pdata$age,useNA="ifany")
+
## 
+##   19   29   37   47   58   60   65   68   73   77   86 <NA> 
+##    1    1    1    1    1    3    1    1    2    3    1    3
+
# is.na checks for NA values
+table(is.na(pdata$age))
+
## 
+## FALSE  TRUE 
+##    16     3
+
# Check for other common missing names
+sum(pdata$age==" ")
+
## [1] NA
+
# Check genomic data for NAs
+sum(is.na(edata))
+
## [1] 0
+
# Make the distribution of NA's by genes
+gene_na = rowSums(is.na(edata))
+table(gene_na)
+
## gene_na
+##     0 
+## 52580
+
# Make the distribution of NA's by samples
+sample_na = rowSums(is.na(edata))
+table(sample_na)
+
## sample_na
+##     0 
+## 52580
+
+
+

Make sure dimensions match up

+

The number of rows of the feature data should match the number of rows of the genomic data (both are the number of genes). The number of rows of the phenotype data should match the number of columns of the genomic data (both are the number of samples).

+
dim(fdata)
+
## [1] 52580     1
+
dim(pdata)
+
## [1] 19  6
+
dim(edata)
+
## [1] 52580    19
+
+
+

Look at overall distributions

+

Here we see that there are a lot of outliers

+
boxplot(log2(edata+1),col=2,range=0)
+

+

We can also look at this sample by sample with histograms

+
par(mfrow=c(1,2))
+hist(log2(edata[,1]+1),col=2)
+hist(log2(edata[,2]+1),col=2)
+

+

Or with density plots

+
plot(density(log2(edata[,1]+1)),col=2)
+lines(density(log2(edata[,2]+1)),col=3)
+

+

A very common task is to compare distributions of measurements (say before normalization). You can do this with a qq-plot

+
qqplot(log2(edata[,1]+1), log2(edata[,2]+1),col=3)
+

+

A very widely used plot is what is known as a M-A plot, sometimes called a Bland Altman plot. The basic idea is to plot the sum of the two values on the x-axis and the difference on the y-axis. This can be used to see any difference between the (samples, averages, etc.) and to see if there is any intensity-specific biases.

+
mm = log2(edata[,1]+1) - log2(edata[,2]+1)
+aa = log2(edata[,1]+1) + log2(edata[,2]+1)
+plot(aa,mm,col=2)
+

+

We can remove rows that are mostly zero and notice any differences in the distributions across samples.

+
edata = as.data.frame(edata)
+filt_edata = filter(edata,rowMeans(edata) > 1)
+boxplot(as.matrix(log2(filt_edata+1)),col=2)
+

+
+
+

Check for obvious data mixups

+

Here we are going to do a check to make sure that the men and women are correctly labeled by looking at expression on the Y chromosome. In general you might do several of this type of check to confirm the data are correctly labeled.

+

Get the chromosomes for each gene using the feature data.

+
aeid = as.character(fdata[,1])
+chr = AnnotationDbi::select(org.Hs.eg.db,keys=aeid,keytype="ENSEMBL",columns="CHR")
+head(chr)
+
##           ENSEMBL CHR
+## 1 ENSG00000000003   X
+## 2 ENSG00000000005   X
+## 3 ENSG00000000419  20
+## 4 ENSG00000000457   1
+## 5 ENSG00000000460   1
+## 6 ENSG00000000938   1
+

Filter to the data on chromsome Y and sum up all the counts. A tricky issue is that some genes are annotated to multiple chromsomes. Here we take the first chromsome each is annotated to.

+
dim(chr)
+
## [1] 52724     2
+
dim(edata)
+
## [1] 52580    19
+
# Take non-duplicated chromsomes
+chr = chr[!duplicated(chr[,1]),]
+
+# Confirm that the annotation still is in the right order
+all(chr[,1] == rownames(edata))
+
## [1] TRUE
+
# Select the chromosome Y samples
+edatay = dplyr::filter(edata,chr$CHR=="Y")
+
+# Males have Y chromsome expression as expected
+boxplot(colSums(edatay) ~ pdata$gender)
+points(colSums(edatay) ~ jitter(as.numeric(pdata$gender)),
+        col=as.numeric(pdata$gender),
+        pch=19)
+

+
+
+

Heatmaps

+

A common type of plot for genomics data is a heatmap. They are usually used for visualizing matrices. For example we can look at all genes with an average number of counts greater than 500:

+
ematrix = as.matrix(edata)[rowMeans(edata) > 10000,]
+heatmap(ematrix)
+

+

We might change the coloring since this one is a little hard to see. To do this you have to set up a color palette.

+
colramp = colorRampPalette(c(3,"white",2))(9)
+heatmap(ematrix,col=colramp)
+

+

You might have noticed some automatic clustering here, you can turn that off (we’ll learn more about it in a later lecture)

+
heatmap(ematrix,col=colramp,Rowv=NA,Colv=NA)
+

+

If you load the gplots package you can add a color scale with the heatmap.2 package. Here we have to add some options to make the dendogram disappear, scale the data by rows, and remove a tracing plot.

+
heatmap.2(ematrix,col=colramp,Rowv=NA,Colv=NA,
+          dendrogram="none", scale="row",trace="none")
+

+
+
+
+

Session information

+

Here is the session information

+
devtools::session_info()
+
##  setting  value                       
+##  version  R version 3.2.2 (2015-08-14)
+##  system   x86_64, darwin13.4.0        
+##  ui       X11                         
+##  language (EN)                        
+##  collate  en_GB.UTF-8                 
+##  tz       Europe/London               
+##  date     2015-09-17                  
+## 
+##  package        * version date      
+##  AnnotationDbi  * 1.30.1  2015-04-26
+##  assertthat       0.1     2013-12-06
+##  Biobase        * 2.28.0  2015-04-17
+##  BiocGenerics   * 0.14.0  2015-04-17
+##  bitops           1.0-6   2013-08-17
+##  caTools          1.17.1  2014-09-10
+##  DBI            * 0.3.1   2014-09-24
+##  devtools       * 1.9.1   2015-09-11
+##  digest           0.6.8   2014-12-31
+##  dplyr          * 0.4.3   2015-09-01
+##  evaluate         0.7.2   2015-08-13
+##  formatR          1.2     2015-04-21
+##  gdata            2.17.0  2015-07-04
+##  GenomeInfoDb   * 1.4.2   2015-08-15
+##  gplots         * 2.17.0  2015-05-02
+##  gtools           3.5.0   2015-05-29
+##  htmltools        0.2.6   2014-09-08
+##  IRanges        * 2.2.7   2015-08-09
+##  KernSmooth       2.23-15 2015-06-29
+##  knitr          * 1.11    2015-08-14
+##  lazyeval         0.1.10  2015-01-02
+##  magrittr         1.5     2014-11-22
+##  memoise          0.2.1   2014-04-22
+##  org.Hs.eg.db   * 3.1.2   2015-08-24
+##  R6               2.1.1   2015-08-19
+##  Rcpp             0.12.1  2015-09-10
+##  rmarkdown        0.8     2015-08-30
+##  RSkittleBrewer * 1.1     2015-09-17
+##  RSQLite        * 1.0.0   2014-10-25
+##  S4Vectors      * 0.6.5   2015-09-01
+##  stringi          0.5-5   2015-06-29
+##  stringr          1.0.0   2015-04-30
+##  yaml             2.1.13  2014-06-12
+##  source                                      
+##  Bioconductor                                
+##  CRAN (R 3.2.0)                              
+##  Bioconductor                                
+##  Bioconductor                                
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.2)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  Bioconductor                                
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  Bioconductor                                
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.2)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  Bioconductor                                
+##  CRAN (R 3.2.2)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.2)                              
+##  Github (alyssafrazee/RSkittleBrewer@0a96a20)
+##  CRAN (R 3.2.0)                              
+##  Bioconductor                                
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)
+

It is also useful to compile the time the document was processed. This document was processed on: 2015-09-17.

+
+ + +
+ + + + + + + + From 8bdc23ac95bd6f7c4cbbefb2576203dd700510ff Mon Sep 17 00:00:00 2001 From: Fabio Amaral Date: Thu, 17 Sep 2015 21:10:07 +0100 Subject: [PATCH 4/9] Format plot main title --- vignettes/01_10_exploratory-analysis.Rmd | 4 ++-- vignettes/01_10_exploratory-analysis.html | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vignettes/01_10_exploratory-analysis.Rmd b/vignettes/01_10_exploratory-analysis.Rmd index 9fa13a2..e7d345f 100644 --- a/vignettes/01_10_exploratory-analysis.Rmd +++ b/vignettes/01_10_exploratory-analysis.Rmd @@ -149,8 +149,8 @@ We can also look at this sample by sample with histograms ```{r histograms} par(mfrow=c(1,2)) -hist(log2(edata[,1]+1),col=2) -hist(log2(edata[,2]+1),col=2) +hist(log2(edata[,1]+1),col=2, cex.main=0.8) +hist(log2(edata[,2]+1),col=2, cex.main=0.8) ``` Or with density plots diff --git a/vignettes/01_10_exploratory-analysis.html b/vignettes/01_10_exploratory-analysis.html index 335d579..065aac4 100644 --- a/vignettes/01_10_exploratory-analysis.html +++ b/vignettes/01_10_exploratory-analysis.html @@ -247,9 +247,9 @@

Look at overall distributions

We can also look at this sample by sample with histograms

par(mfrow=c(1,2))
-hist(log2(edata[,1]+1),col=2)
-hist(log2(edata[,2]+1),col=2)
-

+hist(log2(edata[,1]+1),col=2, cex.main=0.8) +hist(log2(edata[,2]+1),col=2, cex.main=0.8) +

Or with density plots

plot(density(log2(edata[,1]+1)),col=2)
 lines(density(log2(edata[,2]+1)),col=3)
@@ -301,7 +301,7 @@

Check for obvious data mixups

points(colSums(edatay) ~ jitter(as.numeric(pdata$gender)), col=as.numeric(pdata$gender), pch=19) -

+

Heatmaps

From bdd2cb6df0539e8f457fbfe4851126fd29bb17ad Mon Sep 17 00:00:00 2001 From: Fabio Amaral Date: Thu, 17 Sep 2015 21:12:11 +0100 Subject: [PATCH 5/9] remove html file --- vignettes/01_10_exploratory-analysis.html | 432 ---------------------- 1 file changed, 432 deletions(-) delete mode 100644 vignettes/01_10_exploratory-analysis.html diff --git a/vignettes/01_10_exploratory-analysis.html b/vignettes/01_10_exploratory-analysis.html deleted file mode 100644 index 065aac4..0000000 --- a/vignettes/01_10_exploratory-analysis.html +++ /dev/null @@ -1,432 +0,0 @@ - - - - - - - - - - - - - -Exploratory analysis - - - - - - - - - - - - - - - - - - - - - -
- - - - - - -
-

Dependencies

-

This document depends on the following packages:

-
  library(gplots)
-  library(devtools)
-  library(Biobase)
-  library(RSkittleBrewer)
-  library(org.Hs.eg.db)
-  library(AnnotationDbi)
-

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

-
install.packages(c("devtools","gplots"))
-source("http://www.bioconductor.org/biocLite.R")
-biocLite(c("Biobase","org.Hs.eg.db","AnnotationDbi"))
-biocLite("alyssafrazee/RSkittleBrewer")
-
-

General principles

-
    -
  • Use plots as often as possible
  • -
  • Use tables for phenotype data
  • -
  • Look for: -
      -
    • Missing values
    • -
    • Outlier values
    • -
    • Mislabeled samples
    • -
    • Naming consistency
    • -
  • -
-
-
-

Make the plots pretty

-

Typically we will use color to explore data sets and label different values. There are a large number of color options in R. I like the RSkittleBrewer package, but you can also check out Jenny Bryan’s excellent lecture on colors in R for more information.

-

Load the library and set the color palette with the palette function. Now when I type col = 1 it will look for the first color in the trop colors. We also set the character to be a filled dot with par(pch=19).

-
library(RSkittleBrewer)
-# Make the colors pretty
-trop = RSkittleBrewer("tropical")
-palette(trop)
-par(pch=19)
-
-
-

Load some data

-

We will use this expression set to look at how we use plots and tables to check for different characteristics

-
con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData")
-load(file=con)
-close(con)
-bm = bodymap.eset
-pdata=pData(bm)
-edata=exprs(bm)
-fdata = fData(bm)
-ls()
-
## [1] "bm"           "bodymap.eset" "con"          "edata"       
-## [5] "fdata"        "pdata"        "tropical"
-
-
-

Tables for factor/character variables

-

Tables are good for looking at factor or character variables, especially in phenotype data

-
table(pdata$gender)
-
## 
-## F M 
-## 8 8
-
table(pdata$gender,pdata$race)
-
##    
-##     african_american caucasian
-##   F                1         7
-##   M                0         8
-
-
-

Look for missing values

-

First check a summary of the distribution to look for scale, this is also one way to check for NA values.

-
summary(edata)
-
##    ERS025098           ERS025092          ERS025085       
-##  Min.   :      0.0   Min.   :     0.0   Min.   :     0.0  
-##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:     0.0  
-##  Median :      0.0   Median :     0.0   Median :     0.0  
-##  Mean   :    455.6   Mean   :   361.1   Mean   :   399.3  
-##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:     0.0  
-##  Max.   :1584528.0   Max.   :499802.0   Max.   :808641.0  
-##    ERS025088           ERS025089         ERS025082      
-##  Min.   :      0.0   Min.   :      0   Min.   :      0  
-##  1st Qu.:      0.0   1st Qu.:      0   1st Qu.:      0  
-##  Median :      0.0   Median :      0   Median :      0  
-##  Mean   :    445.5   Mean   :    445   Mean   :    509  
-##  3rd Qu.:      0.0   3rd Qu.:      0   3rd Qu.:      0  
-##  Max.   :1014579.0   Max.   :1415741   Max.   :2484692  
-##    ERS025081           ERS025096         ERS025099       
-##  Min.   :      0.0   Min.   :      0   Min.   :     0.0  
-##  1st Qu.:      0.0   1st Qu.:      0   1st Qu.:     0.0  
-##  Median :      0.0   Median :      0   Median :     0.0  
-##  Mean   :    430.4   Mean   :    558   Mean   :   445.5  
-##  3rd Qu.:      0.0   3rd Qu.:      0   3rd Qu.:     0.0  
-##  Max.   :1356643.0   Max.   :3517964   Max.   :572374.0  
-##    ERS025086          ERS025084          ERS025087     
-##  Min.   :     0.0   Min.   :     0.0   Min.   :     0  
-##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0  
-##  Median :     0.0   Median :     0.0   Median :     0  
-##  Mean   :   370.7   Mean   :   592.1   Mean   :  1097  
-##  3rd Qu.:     0.0   3rd Qu.:     0.0   3rd Qu.:     0  
-##  Max.   :458168.0   Max.   :287539.0   Max.   :519683  
-##    ERS025093           ERS025083          ERS025095        
-##  Min.   :      0.0   Min.   :     0.0   Min.   :      0.0  
-##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:      0.0  
-##  Median :      0.0   Median :     0.0   Median :      0.0  
-##  Mean   :    997.7   Mean   :   434.7   Mean   :    479.6  
-##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:      0.0  
-##  Max.   :1332827.0   Max.   :626824.0   Max.   :1605570.0  
-##    ERS025097           ERS025094          ERS025090       
-##  Min.   :      0.0   Min.   :     0.0   Min.   :     0.0  
-##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:     0.0  
-##  Median :      0.0   Median :     0.0   Median :     0.0  
-##  Mean   :    540.1   Mean   :   518.5   Mean   :   465.3  
-##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:     0.0  
-##  Max.   :1888083.0   Max.   :776623.0   Max.   :535410.0  
-##    ERS025091        
-##  Min.   :      0.0  
-##  1st Qu.:      0.0  
-##  Median :      0.0  
-##  Mean   :    530.1  
-##  3rd Qu.:      0.0  
-##  Max.   :1221921.0
-

NA is the most common character for missing values, but sometimes they are coded as spaces, 999, -1 or “missing”. Check for missing values in a variety of ways

-
# Use option useNA to include NA's in table
-table(pdata$age,useNA="ifany")
-
## 
-##   19   29   37   47   58   60   65   68   73   77   86 <NA> 
-##    1    1    1    1    1    3    1    1    2    3    1    3
-
# is.na checks for NA values
-table(is.na(pdata$age))
-
## 
-## FALSE  TRUE 
-##    16     3
-
# Check for other common missing names
-sum(pdata$age==" ")
-
## [1] NA
-
# Check genomic data for NAs
-sum(is.na(edata))
-
## [1] 0
-
# Make the distribution of NA's by genes
-gene_na = rowSums(is.na(edata))
-table(gene_na)
-
## gene_na
-##     0 
-## 52580
-
# Make the distribution of NA's by samples
-sample_na = rowSums(is.na(edata))
-table(sample_na)
-
## sample_na
-##     0 
-## 52580
-
-
-

Make sure dimensions match up

-

The number of rows of the feature data should match the number of rows of the genomic data (both are the number of genes). The number of rows of the phenotype data should match the number of columns of the genomic data (both are the number of samples).

-
dim(fdata)
-
## [1] 52580     1
-
dim(pdata)
-
## [1] 19  6
-
dim(edata)
-
## [1] 52580    19
-
-
-

Look at overall distributions

-

Here we see that there are a lot of outliers

-
boxplot(log2(edata+1),col=2,range=0)
-

-

We can also look at this sample by sample with histograms

-
par(mfrow=c(1,2))
-hist(log2(edata[,1]+1),col=2, cex.main=0.8)
-hist(log2(edata[,2]+1),col=2, cex.main=0.8)
-

-

Or with density plots

-
plot(density(log2(edata[,1]+1)),col=2)
-lines(density(log2(edata[,2]+1)),col=3)
-

-

A very common task is to compare distributions of measurements (say before normalization). You can do this with a qq-plot

-
qqplot(log2(edata[,1]+1), log2(edata[,2]+1),col=3)
-

-

A very widely used plot is what is known as a M-A plot, sometimes called a Bland Altman plot. The basic idea is to plot the sum of the two values on the x-axis and the difference on the y-axis. This can be used to see any difference between the (samples, averages, etc.) and to see if there is any intensity-specific biases.

-
mm = log2(edata[,1]+1) - log2(edata[,2]+1)
-aa = log2(edata[,1]+1) + log2(edata[,2]+1)
-plot(aa,mm,col=2)
-

-

We can remove rows that are mostly zero and notice any differences in the distributions across samples.

-
edata = as.data.frame(edata)
-filt_edata = filter(edata,rowMeans(edata) > 1)
-boxplot(as.matrix(log2(filt_edata+1)),col=2)
-

-
-
-

Check for obvious data mixups

-

Here we are going to do a check to make sure that the men and women are correctly labeled by looking at expression on the Y chromosome. In general you might do several of this type of check to confirm the data are correctly labeled.

-

Get the chromosomes for each gene using the feature data.

-
aeid = as.character(fdata[,1])
-chr = AnnotationDbi::select(org.Hs.eg.db,keys=aeid,keytype="ENSEMBL",columns="CHR")
-head(chr)
-
##           ENSEMBL CHR
-## 1 ENSG00000000003   X
-## 2 ENSG00000000005   X
-## 3 ENSG00000000419  20
-## 4 ENSG00000000457   1
-## 5 ENSG00000000460   1
-## 6 ENSG00000000938   1
-

Filter to the data on chromsome Y and sum up all the counts. A tricky issue is that some genes are annotated to multiple chromsomes. Here we take the first chromsome each is annotated to.

-
dim(chr)
-
## [1] 52724     2
-
dim(edata)
-
## [1] 52580    19
-
# Take non-duplicated chromsomes
-chr = chr[!duplicated(chr[,1]),]
-
-# Confirm that the annotation still is in the right order
-all(chr[,1] == rownames(edata))
-
## [1] TRUE
-
# Select the chromosome Y samples
-edatay = dplyr::filter(edata,chr$CHR=="Y")
-
-# Males have Y chromsome expression as expected
-boxplot(colSums(edatay) ~ pdata$gender)
-points(colSums(edatay) ~ jitter(as.numeric(pdata$gender)),
-        col=as.numeric(pdata$gender),
-        pch=19)
-

-
-
-

Heatmaps

-

A common type of plot for genomics data is a heatmap. They are usually used for visualizing matrices. For example we can look at all genes with an average number of counts greater than 500:

-
ematrix = as.matrix(edata)[rowMeans(edata) > 10000,]
-heatmap(ematrix)
-

-

We might change the coloring since this one is a little hard to see. To do this you have to set up a color palette.

-
colramp = colorRampPalette(c(3,"white",2))(9)
-heatmap(ematrix,col=colramp)
-

-

You might have noticed some automatic clustering here, you can turn that off (we’ll learn more about it in a later lecture)

-
heatmap(ematrix,col=colramp,Rowv=NA,Colv=NA)
-

-

If you load the gplots package you can add a color scale with the heatmap.2 package. Here we have to add some options to make the dendogram disappear, scale the data by rows, and remove a tracing plot.

-
heatmap.2(ematrix,col=colramp,Rowv=NA,Colv=NA,
-          dendrogram="none", scale="row",trace="none")
-

-
-
-
-

Session information

-

Here is the session information

-
devtools::session_info()
-
##  setting  value                       
-##  version  R version 3.2.2 (2015-08-14)
-##  system   x86_64, darwin13.4.0        
-##  ui       X11                         
-##  language (EN)                        
-##  collate  en_GB.UTF-8                 
-##  tz       Europe/London               
-##  date     2015-09-17                  
-## 
-##  package        * version date      
-##  AnnotationDbi  * 1.30.1  2015-04-26
-##  assertthat       0.1     2013-12-06
-##  Biobase        * 2.28.0  2015-04-17
-##  BiocGenerics   * 0.14.0  2015-04-17
-##  bitops           1.0-6   2013-08-17
-##  caTools          1.17.1  2014-09-10
-##  DBI            * 0.3.1   2014-09-24
-##  devtools       * 1.9.1   2015-09-11
-##  digest           0.6.8   2014-12-31
-##  dplyr          * 0.4.3   2015-09-01
-##  evaluate         0.7.2   2015-08-13
-##  formatR          1.2     2015-04-21
-##  gdata            2.17.0  2015-07-04
-##  GenomeInfoDb   * 1.4.2   2015-08-15
-##  gplots         * 2.17.0  2015-05-02
-##  gtools           3.5.0   2015-05-29
-##  htmltools        0.2.6   2014-09-08
-##  IRanges        * 2.2.7   2015-08-09
-##  KernSmooth       2.23-15 2015-06-29
-##  knitr          * 1.11    2015-08-14
-##  lazyeval         0.1.10  2015-01-02
-##  magrittr         1.5     2014-11-22
-##  memoise          0.2.1   2014-04-22
-##  org.Hs.eg.db   * 3.1.2   2015-08-24
-##  R6               2.1.1   2015-08-19
-##  Rcpp             0.12.1  2015-09-10
-##  rmarkdown        0.8     2015-08-30
-##  RSkittleBrewer * 1.1     2015-09-17
-##  RSQLite        * 1.0.0   2014-10-25
-##  S4Vectors      * 0.6.5   2015-09-01
-##  stringi          0.5-5   2015-06-29
-##  stringr          1.0.0   2015-04-30
-##  yaml             2.1.13  2014-06-12
-##  source                                      
-##  Bioconductor                                
-##  CRAN (R 3.2.0)                              
-##  Bioconductor                                
-##  Bioconductor                                
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.2)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  Bioconductor                                
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  Bioconductor                                
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.2)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  Bioconductor                                
-##  CRAN (R 3.2.2)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.2)                              
-##  Github (alyssafrazee/RSkittleBrewer@0a96a20)
-##  CRAN (R 3.2.0)                              
-##  Bioconductor                                
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)
-

It is also useful to compile the time the document was processed. This document was processed on: 2015-09-17.

-
- - -
- - - - - - - - From 4a3077195b4ce72e08ea7b9dcc37b9e7e173c000 Mon Sep 17 00:00:00 2001 From: Fabio Amaral Date: Thu, 17 Sep 2015 22:00:09 +0100 Subject: [PATCH 6/9] Format plot labels --- vignettes/01_10_exploratory-analysis.Rmd | 4 +- vignettes/01_10_exploratory-analysis.html | 432 ++++++++++++++++++++++ 2 files changed, 434 insertions(+), 2 deletions(-) create mode 100644 vignettes/01_10_exploratory-analysis.html diff --git a/vignettes/01_10_exploratory-analysis.Rmd b/vignettes/01_10_exploratory-analysis.Rmd index e7d345f..8c0dd5c 100644 --- a/vignettes/01_10_exploratory-analysis.Rmd +++ b/vignettes/01_10_exploratory-analysis.Rmd @@ -142,7 +142,7 @@ dim(edata) Here we see that there are a lot of outliers ```{r boxplot} -boxplot(log2(edata+1),col=2,range=0) +boxplot(log2(edata+1),col=2,range=0, las=2, cex.axis=0.8) ``` We can also look at this sample by sample with histograms @@ -180,7 +180,7 @@ We can remove rows that are mostly zero and notice any differences in the distri ```{r} edata = as.data.frame(edata) filt_edata = filter(edata,rowMeans(edata) > 1) -boxplot(as.matrix(log2(filt_edata+1)),col=2) +boxplot(as.matrix(log2(filt_edata+1)),col=2, las=2, cex.axis=0.8) ``` ### Check for obvious data mixups diff --git a/vignettes/01_10_exploratory-analysis.html b/vignettes/01_10_exploratory-analysis.html new file mode 100644 index 0000000..f8b34a6 --- /dev/null +++ b/vignettes/01_10_exploratory-analysis.html @@ -0,0 +1,432 @@ + + + + + + + + + + + + + +Exploratory analysis + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+

Dependencies

+

This document depends on the following packages:

+
  library(gplots)
+  library(devtools)
+  library(Biobase)
+  library(RSkittleBrewer)
+  library(org.Hs.eg.db)
+  library(AnnotationDbi)
+

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

+
install.packages(c("devtools","gplots"))
+source("http://www.bioconductor.org/biocLite.R")
+biocLite(c("Biobase","org.Hs.eg.db","AnnotationDbi"))
+biocLite("alyssafrazee/RSkittleBrewer")
+
+

General principles

+
    +
  • Use plots as often as possible
  • +
  • Use tables for phenotype data
  • +
  • Look for: +
      +
    • Missing values
    • +
    • Outlier values
    • +
    • Mislabeled samples
    • +
    • Naming consistency
    • +
  • +
+
+
+

Make the plots pretty

+

Typically we will use color to explore data sets and label different values. There are a large number of color options in R. I like the RSkittleBrewer package, but you can also check out Jenny Bryan’s excellent lecture on colors in R for more information.

+

Load the library and set the color palette with the palette function. Now when I type col = 1 it will look for the first color in the trop colors. We also set the character to be a filled dot with par(pch=19).

+
library(RSkittleBrewer)
+# Make the colors pretty
+trop = RSkittleBrewer("tropical")
+palette(trop)
+par(pch=19)
+
+
+

Load some data

+

We will use this expression set to look at how we use plots and tables to check for different characteristics

+
con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData")
+load(file=con)
+close(con)
+bm = bodymap.eset
+pdata=pData(bm)
+edata=exprs(bm)
+fdata = fData(bm)
+ls()
+
## [1] "bm"           "bodymap.eset" "con"          "edata"       
+## [5] "fdata"        "pdata"        "tropical"
+
+
+

Tables for factor/character variables

+

Tables are good for looking at factor or character variables, especially in phenotype data

+
table(pdata$gender)
+
## 
+## F M 
+## 8 8
+
table(pdata$gender,pdata$race)
+
##    
+##     african_american caucasian
+##   F                1         7
+##   M                0         8
+
+
+

Look for missing values

+

First check a summary of the distribution to look for scale, this is also one way to check for NA values.

+
summary(edata)
+
##    ERS025098           ERS025092          ERS025085       
+##  Min.   :      0.0   Min.   :     0.0   Min.   :     0.0  
+##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:     0.0  
+##  Median :      0.0   Median :     0.0   Median :     0.0  
+##  Mean   :    455.6   Mean   :   361.1   Mean   :   399.3  
+##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:     0.0  
+##  Max.   :1584528.0   Max.   :499802.0   Max.   :808641.0  
+##    ERS025088           ERS025089         ERS025082      
+##  Min.   :      0.0   Min.   :      0   Min.   :      0  
+##  1st Qu.:      0.0   1st Qu.:      0   1st Qu.:      0  
+##  Median :      0.0   Median :      0   Median :      0  
+##  Mean   :    445.5   Mean   :    445   Mean   :    509  
+##  3rd Qu.:      0.0   3rd Qu.:      0   3rd Qu.:      0  
+##  Max.   :1014579.0   Max.   :1415741   Max.   :2484692  
+##    ERS025081           ERS025096         ERS025099       
+##  Min.   :      0.0   Min.   :      0   Min.   :     0.0  
+##  1st Qu.:      0.0   1st Qu.:      0   1st Qu.:     0.0  
+##  Median :      0.0   Median :      0   Median :     0.0  
+##  Mean   :    430.4   Mean   :    558   Mean   :   445.5  
+##  3rd Qu.:      0.0   3rd Qu.:      0   3rd Qu.:     0.0  
+##  Max.   :1356643.0   Max.   :3517964   Max.   :572374.0  
+##    ERS025086          ERS025084          ERS025087     
+##  Min.   :     0.0   Min.   :     0.0   Min.   :     0  
+##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0  
+##  Median :     0.0   Median :     0.0   Median :     0  
+##  Mean   :   370.7   Mean   :   592.1   Mean   :  1097  
+##  3rd Qu.:     0.0   3rd Qu.:     0.0   3rd Qu.:     0  
+##  Max.   :458168.0   Max.   :287539.0   Max.   :519683  
+##    ERS025093           ERS025083          ERS025095        
+##  Min.   :      0.0   Min.   :     0.0   Min.   :      0.0  
+##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:      0.0  
+##  Median :      0.0   Median :     0.0   Median :      0.0  
+##  Mean   :    997.7   Mean   :   434.7   Mean   :    479.6  
+##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:      0.0  
+##  Max.   :1332827.0   Max.   :626824.0   Max.   :1605570.0  
+##    ERS025097           ERS025094          ERS025090       
+##  Min.   :      0.0   Min.   :     0.0   Min.   :     0.0  
+##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:     0.0  
+##  Median :      0.0   Median :     0.0   Median :     0.0  
+##  Mean   :    540.1   Mean   :   518.5   Mean   :   465.3  
+##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:     0.0  
+##  Max.   :1888083.0   Max.   :776623.0   Max.   :535410.0  
+##    ERS025091        
+##  Min.   :      0.0  
+##  1st Qu.:      0.0  
+##  Median :      0.0  
+##  Mean   :    530.1  
+##  3rd Qu.:      0.0  
+##  Max.   :1221921.0
+

NA is the most common character for missing values, but sometimes they are coded as spaces, 999, -1 or “missing”. Check for missing values in a variety of ways

+
# Use option useNA to include NA's in table
+table(pdata$age,useNA="ifany")
+
## 
+##   19   29   37   47   58   60   65   68   73   77   86 <NA> 
+##    1    1    1    1    1    3    1    1    2    3    1    3
+
# is.na checks for NA values
+table(is.na(pdata$age))
+
## 
+## FALSE  TRUE 
+##    16     3
+
# Check for other common missing names
+sum(pdata$age==" ")
+
## [1] NA
+
# Check genomic data for NAs
+sum(is.na(edata))
+
## [1] 0
+
# Make the distribution of NA's by genes
+gene_na = rowSums(is.na(edata))
+table(gene_na)
+
## gene_na
+##     0 
+## 52580
+
# Make the distribution of NA's by samples
+sample_na = rowSums(is.na(edata))
+table(sample_na)
+
## sample_na
+##     0 
+## 52580
+
+
+

Make sure dimensions match up

+

The number of rows of the feature data should match the number of rows of the genomic data (both are the number of genes). The number of rows of the phenotype data should match the number of columns of the genomic data (both are the number of samples).

+
dim(fdata)
+
## [1] 52580     1
+
dim(pdata)
+
## [1] 19  6
+
dim(edata)
+
## [1] 52580    19
+
+
+

Look at overall distributions

+

Here we see that there are a lot of outliers

+
boxplot(log2(edata+1),col=2,range=0, las=2, cex.axis=0.8)
+

+

We can also look at this sample by sample with histograms

+
par(mfrow=c(1,2))
+hist(log2(edata[,1]+1),col=2, cex.main=0.8)
+hist(log2(edata[,2]+1),col=2, cex.main=0.8)
+

+

Or with density plots

+
plot(density(log2(edata[,1]+1)),col=2)
+lines(density(log2(edata[,2]+1)),col=3)
+

+

A very common task is to compare distributions of measurements (say before normalization). You can do this with a qq-plot

+
qqplot(log2(edata[,1]+1), log2(edata[,2]+1),col=3)
+

+

A very widely used plot is what is known as a M-A plot, sometimes called a Bland Altman plot. The basic idea is to plot the sum of the two values on the x-axis and the difference on the y-axis. This can be used to see any difference between the (samples, averages, etc.) and to see if there is any intensity-specific biases.

+
mm = log2(edata[,1]+1) - log2(edata[,2]+1)
+aa = log2(edata[,1]+1) + log2(edata[,2]+1)
+plot(aa,mm,col=2)
+

+

We can remove rows that are mostly zero and notice any differences in the distributions across samples.

+
edata = as.data.frame(edata)
+filt_edata = filter(edata,rowMeans(edata) > 1)
+boxplot(as.matrix(log2(filt_edata+1)),col=2, las=2, cex.axis=0.8)
+

+
+
+

Check for obvious data mixups

+

Here we are going to do a check to make sure that the men and women are correctly labeled by looking at expression on the Y chromosome. In general you might do several of this type of check to confirm the data are correctly labeled.

+

Get the chromosomes for each gene using the feature data.

+
aeid = as.character(fdata[,1])
+chr = AnnotationDbi::select(org.Hs.eg.db,keys=aeid,keytype="ENSEMBL",columns="CHR")
+head(chr)
+
##           ENSEMBL CHR
+## 1 ENSG00000000003   X
+## 2 ENSG00000000005   X
+## 3 ENSG00000000419  20
+## 4 ENSG00000000457   1
+## 5 ENSG00000000460   1
+## 6 ENSG00000000938   1
+

Filter to the data on chromsome Y and sum up all the counts. A tricky issue is that some genes are annotated to multiple chromsomes. Here we take the first chromsome each is annotated to.

+
dim(chr)
+
## [1] 52724     2
+
dim(edata)
+
## [1] 52580    19
+
# Take non-duplicated chromsomes
+chr = chr[!duplicated(chr[,1]),]
+
+# Confirm that the annotation still is in the right order
+all(chr[,1] == rownames(edata))
+
## [1] TRUE
+
# Select the chromosome Y samples
+edatay = dplyr::filter(edata,chr$CHR=="Y")
+
+# Males have Y chromsome expression as expected
+boxplot(colSums(edatay) ~ pdata$gender)
+points(colSums(edatay) ~ jitter(as.numeric(pdata$gender)),
+        col=as.numeric(pdata$gender),
+        pch=19)
+

+
+
+

Heatmaps

+

A common type of plot for genomics data is a heatmap. They are usually used for visualizing matrices. For example we can look at all genes with an average number of counts greater than 500:

+
ematrix = as.matrix(edata)[rowMeans(edata) > 10000,]
+heatmap(ematrix)
+

+

We might change the coloring since this one is a little hard to see. To do this you have to set up a color palette.

+
colramp = colorRampPalette(c(3,"white",2))(9)
+heatmap(ematrix,col=colramp)
+

+

You might have noticed some automatic clustering here, you can turn that off (we’ll learn more about it in a later lecture)

+
heatmap(ematrix,col=colramp,Rowv=NA,Colv=NA)
+

+

If you load the gplots package you can add a color scale with the heatmap.2 package. Here we have to add some options to make the dendogram disappear, scale the data by rows, and remove a tracing plot.

+
heatmap.2(ematrix,col=colramp,Rowv=NA,Colv=NA,
+          dendrogram="none", scale="row",trace="none")
+

+
+
+
+

Session information

+

Here is the session information

+
devtools::session_info()
+
##  setting  value                       
+##  version  R version 3.2.2 (2015-08-14)
+##  system   x86_64, darwin13.4.0        
+##  ui       X11                         
+##  language (EN)                        
+##  collate  en_GB.UTF-8                 
+##  tz       Europe/London               
+##  date     2015-09-17                  
+## 
+##  package        * version date      
+##  AnnotationDbi  * 1.30.1  2015-04-26
+##  assertthat       0.1     2013-12-06
+##  Biobase        * 2.28.0  2015-04-17
+##  BiocGenerics   * 0.14.0  2015-04-17
+##  bitops           1.0-6   2013-08-17
+##  caTools          1.17.1  2014-09-10
+##  DBI            * 0.3.1   2014-09-24
+##  devtools       * 1.9.1   2015-09-11
+##  digest           0.6.8   2014-12-31
+##  dplyr          * 0.4.3   2015-09-01
+##  evaluate         0.7.2   2015-08-13
+##  formatR          1.2     2015-04-21
+##  gdata            2.17.0  2015-07-04
+##  GenomeInfoDb   * 1.4.2   2015-08-15
+##  gplots         * 2.17.0  2015-05-02
+##  gtools           3.5.0   2015-05-29
+##  htmltools        0.2.6   2014-09-08
+##  IRanges        * 2.2.7   2015-08-09
+##  KernSmooth       2.23-15 2015-06-29
+##  knitr          * 1.11    2015-08-14
+##  lazyeval         0.1.10  2015-01-02
+##  magrittr         1.5     2014-11-22
+##  memoise          0.2.1   2014-04-22
+##  org.Hs.eg.db   * 3.1.2   2015-08-24
+##  R6               2.1.1   2015-08-19
+##  Rcpp             0.12.1  2015-09-10
+##  rmarkdown        0.8     2015-08-30
+##  RSkittleBrewer * 1.1     2015-09-17
+##  RSQLite        * 1.0.0   2014-10-25
+##  S4Vectors      * 0.6.5   2015-09-01
+##  stringi          0.5-5   2015-06-29
+##  stringr          1.0.0   2015-04-30
+##  yaml             2.1.13  2014-06-12
+##  source                                      
+##  Bioconductor                                
+##  CRAN (R 3.2.0)                              
+##  Bioconductor                                
+##  Bioconductor                                
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.2)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  Bioconductor                                
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  Bioconductor                                
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.2)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  Bioconductor                                
+##  CRAN (R 3.2.2)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.2)                              
+##  Github (alyssafrazee/RSkittleBrewer@0a96a20)
+##  CRAN (R 3.2.0)                              
+##  Bioconductor                                
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)                              
+##  CRAN (R 3.2.0)
+

It is also useful to compile the time the document was processed. This document was processed on: 2015-09-17.

+
+ + +
+ + + + + + + + From a227309a8b01197e6609250d7bee52cd57e2aaf2 Mon Sep 17 00:00:00 2001 From: Fabio Amaral Date: Sat, 19 Sep 2015 14:24:55 +0100 Subject: [PATCH 7/9] add exponent 2 in variance explanation formula --- vignettes/01_10_exploratory-analysis.html | 432 ---------------------- vignettes/02_03_dimension-reduction.html | 236 ++++++++++++ 2 files changed, 236 insertions(+), 432 deletions(-) delete mode 100644 vignettes/01_10_exploratory-analysis.html create mode 100644 vignettes/02_03_dimension-reduction.html diff --git a/vignettes/01_10_exploratory-analysis.html b/vignettes/01_10_exploratory-analysis.html deleted file mode 100644 index f8b34a6..0000000 --- a/vignettes/01_10_exploratory-analysis.html +++ /dev/null @@ -1,432 +0,0 @@ - - - - - - - - - - - - - -Exploratory analysis - - - - - - - - - - - - - - - - - - - - - -
- - - - - - -
-

Dependencies

-

This document depends on the following packages:

-
  library(gplots)
-  library(devtools)
-  library(Biobase)
-  library(RSkittleBrewer)
-  library(org.Hs.eg.db)
-  library(AnnotationDbi)
-

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

-
install.packages(c("devtools","gplots"))
-source("http://www.bioconductor.org/biocLite.R")
-biocLite(c("Biobase","org.Hs.eg.db","AnnotationDbi"))
-biocLite("alyssafrazee/RSkittleBrewer")
-
-

General principles

-
    -
  • Use plots as often as possible
  • -
  • Use tables for phenotype data
  • -
  • Look for: -
      -
    • Missing values
    • -
    • Outlier values
    • -
    • Mislabeled samples
    • -
    • Naming consistency
    • -
  • -
-
-
-

Make the plots pretty

-

Typically we will use color to explore data sets and label different values. There are a large number of color options in R. I like the RSkittleBrewer package, but you can also check out Jenny Bryan’s excellent lecture on colors in R for more information.

-

Load the library and set the color palette with the palette function. Now when I type col = 1 it will look for the first color in the trop colors. We also set the character to be a filled dot with par(pch=19).

-
library(RSkittleBrewer)
-# Make the colors pretty
-trop = RSkittleBrewer("tropical")
-palette(trop)
-par(pch=19)
-
-
-

Load some data

-

We will use this expression set to look at how we use plots and tables to check for different characteristics

-
con = url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/bodymap_eset.RData")
-load(file=con)
-close(con)
-bm = bodymap.eset
-pdata=pData(bm)
-edata=exprs(bm)
-fdata = fData(bm)
-ls()
-
## [1] "bm"           "bodymap.eset" "con"          "edata"       
-## [5] "fdata"        "pdata"        "tropical"
-
-
-

Tables for factor/character variables

-

Tables are good for looking at factor or character variables, especially in phenotype data

-
table(pdata$gender)
-
## 
-## F M 
-## 8 8
-
table(pdata$gender,pdata$race)
-
##    
-##     african_american caucasian
-##   F                1         7
-##   M                0         8
-
-
-

Look for missing values

-

First check a summary of the distribution to look for scale, this is also one way to check for NA values.

-
summary(edata)
-
##    ERS025098           ERS025092          ERS025085       
-##  Min.   :      0.0   Min.   :     0.0   Min.   :     0.0  
-##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:     0.0  
-##  Median :      0.0   Median :     0.0   Median :     0.0  
-##  Mean   :    455.6   Mean   :   361.1   Mean   :   399.3  
-##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:     0.0  
-##  Max.   :1584528.0   Max.   :499802.0   Max.   :808641.0  
-##    ERS025088           ERS025089         ERS025082      
-##  Min.   :      0.0   Min.   :      0   Min.   :      0  
-##  1st Qu.:      0.0   1st Qu.:      0   1st Qu.:      0  
-##  Median :      0.0   Median :      0   Median :      0  
-##  Mean   :    445.5   Mean   :    445   Mean   :    509  
-##  3rd Qu.:      0.0   3rd Qu.:      0   3rd Qu.:      0  
-##  Max.   :1014579.0   Max.   :1415741   Max.   :2484692  
-##    ERS025081           ERS025096         ERS025099       
-##  Min.   :      0.0   Min.   :      0   Min.   :     0.0  
-##  1st Qu.:      0.0   1st Qu.:      0   1st Qu.:     0.0  
-##  Median :      0.0   Median :      0   Median :     0.0  
-##  Mean   :    430.4   Mean   :    558   Mean   :   445.5  
-##  3rd Qu.:      0.0   3rd Qu.:      0   3rd Qu.:     0.0  
-##  Max.   :1356643.0   Max.   :3517964   Max.   :572374.0  
-##    ERS025086          ERS025084          ERS025087     
-##  Min.   :     0.0   Min.   :     0.0   Min.   :     0  
-##  1st Qu.:     0.0   1st Qu.:     0.0   1st Qu.:     0  
-##  Median :     0.0   Median :     0.0   Median :     0  
-##  Mean   :   370.7   Mean   :   592.1   Mean   :  1097  
-##  3rd Qu.:     0.0   3rd Qu.:     0.0   3rd Qu.:     0  
-##  Max.   :458168.0   Max.   :287539.0   Max.   :519683  
-##    ERS025093           ERS025083          ERS025095        
-##  Min.   :      0.0   Min.   :     0.0   Min.   :      0.0  
-##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:      0.0  
-##  Median :      0.0   Median :     0.0   Median :      0.0  
-##  Mean   :    997.7   Mean   :   434.7   Mean   :    479.6  
-##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:      0.0  
-##  Max.   :1332827.0   Max.   :626824.0   Max.   :1605570.0  
-##    ERS025097           ERS025094          ERS025090       
-##  Min.   :      0.0   Min.   :     0.0   Min.   :     0.0  
-##  1st Qu.:      0.0   1st Qu.:     0.0   1st Qu.:     0.0  
-##  Median :      0.0   Median :     0.0   Median :     0.0  
-##  Mean   :    540.1   Mean   :   518.5   Mean   :   465.3  
-##  3rd Qu.:      0.0   3rd Qu.:     0.0   3rd Qu.:     0.0  
-##  Max.   :1888083.0   Max.   :776623.0   Max.   :535410.0  
-##    ERS025091        
-##  Min.   :      0.0  
-##  1st Qu.:      0.0  
-##  Median :      0.0  
-##  Mean   :    530.1  
-##  3rd Qu.:      0.0  
-##  Max.   :1221921.0
-

NA is the most common character for missing values, but sometimes they are coded as spaces, 999, -1 or “missing”. Check for missing values in a variety of ways

-
# Use option useNA to include NA's in table
-table(pdata$age,useNA="ifany")
-
## 
-##   19   29   37   47   58   60   65   68   73   77   86 <NA> 
-##    1    1    1    1    1    3    1    1    2    3    1    3
-
# is.na checks for NA values
-table(is.na(pdata$age))
-
## 
-## FALSE  TRUE 
-##    16     3
-
# Check for other common missing names
-sum(pdata$age==" ")
-
## [1] NA
-
# Check genomic data for NAs
-sum(is.na(edata))
-
## [1] 0
-
# Make the distribution of NA's by genes
-gene_na = rowSums(is.na(edata))
-table(gene_na)
-
## gene_na
-##     0 
-## 52580
-
# Make the distribution of NA's by samples
-sample_na = rowSums(is.na(edata))
-table(sample_na)
-
## sample_na
-##     0 
-## 52580
-
-
-

Make sure dimensions match up

-

The number of rows of the feature data should match the number of rows of the genomic data (both are the number of genes). The number of rows of the phenotype data should match the number of columns of the genomic data (both are the number of samples).

-
dim(fdata)
-
## [1] 52580     1
-
dim(pdata)
-
## [1] 19  6
-
dim(edata)
-
## [1] 52580    19
-
-
-

Look at overall distributions

-

Here we see that there are a lot of outliers

-
boxplot(log2(edata+1),col=2,range=0, las=2, cex.axis=0.8)
-

-

We can also look at this sample by sample with histograms

-
par(mfrow=c(1,2))
-hist(log2(edata[,1]+1),col=2, cex.main=0.8)
-hist(log2(edata[,2]+1),col=2, cex.main=0.8)
-

-

Or with density plots

-
plot(density(log2(edata[,1]+1)),col=2)
-lines(density(log2(edata[,2]+1)),col=3)
-

-

A very common task is to compare distributions of measurements (say before normalization). You can do this with a qq-plot

-
qqplot(log2(edata[,1]+1), log2(edata[,2]+1),col=3)
-

-

A very widely used plot is what is known as a M-A plot, sometimes called a Bland Altman plot. The basic idea is to plot the sum of the two values on the x-axis and the difference on the y-axis. This can be used to see any difference between the (samples, averages, etc.) and to see if there is any intensity-specific biases.

-
mm = log2(edata[,1]+1) - log2(edata[,2]+1)
-aa = log2(edata[,1]+1) + log2(edata[,2]+1)
-plot(aa,mm,col=2)
-

-

We can remove rows that are mostly zero and notice any differences in the distributions across samples.

-
edata = as.data.frame(edata)
-filt_edata = filter(edata,rowMeans(edata) > 1)
-boxplot(as.matrix(log2(filt_edata+1)),col=2, las=2, cex.axis=0.8)
-

-
-
-

Check for obvious data mixups

-

Here we are going to do a check to make sure that the men and women are correctly labeled by looking at expression on the Y chromosome. In general you might do several of this type of check to confirm the data are correctly labeled.

-

Get the chromosomes for each gene using the feature data.

-
aeid = as.character(fdata[,1])
-chr = AnnotationDbi::select(org.Hs.eg.db,keys=aeid,keytype="ENSEMBL",columns="CHR")
-head(chr)
-
##           ENSEMBL CHR
-## 1 ENSG00000000003   X
-## 2 ENSG00000000005   X
-## 3 ENSG00000000419  20
-## 4 ENSG00000000457   1
-## 5 ENSG00000000460   1
-## 6 ENSG00000000938   1
-

Filter to the data on chromsome Y and sum up all the counts. A tricky issue is that some genes are annotated to multiple chromsomes. Here we take the first chromsome each is annotated to.

-
dim(chr)
-
## [1] 52724     2
-
dim(edata)
-
## [1] 52580    19
-
# Take non-duplicated chromsomes
-chr = chr[!duplicated(chr[,1]),]
-
-# Confirm that the annotation still is in the right order
-all(chr[,1] == rownames(edata))
-
## [1] TRUE
-
# Select the chromosome Y samples
-edatay = dplyr::filter(edata,chr$CHR=="Y")
-
-# Males have Y chromsome expression as expected
-boxplot(colSums(edatay) ~ pdata$gender)
-points(colSums(edatay) ~ jitter(as.numeric(pdata$gender)),
-        col=as.numeric(pdata$gender),
-        pch=19)
-

-
-
-

Heatmaps

-

A common type of plot for genomics data is a heatmap. They are usually used for visualizing matrices. For example we can look at all genes with an average number of counts greater than 500:

-
ematrix = as.matrix(edata)[rowMeans(edata) > 10000,]
-heatmap(ematrix)
-

-

We might change the coloring since this one is a little hard to see. To do this you have to set up a color palette.

-
colramp = colorRampPalette(c(3,"white",2))(9)
-heatmap(ematrix,col=colramp)
-

-

You might have noticed some automatic clustering here, you can turn that off (we’ll learn more about it in a later lecture)

-
heatmap(ematrix,col=colramp,Rowv=NA,Colv=NA)
-

-

If you load the gplots package you can add a color scale with the heatmap.2 package. Here we have to add some options to make the dendogram disappear, scale the data by rows, and remove a tracing plot.

-
heatmap.2(ematrix,col=colramp,Rowv=NA,Colv=NA,
-          dendrogram="none", scale="row",trace="none")
-

-
-
-
-

Session information

-

Here is the session information

-
devtools::session_info()
-
##  setting  value                       
-##  version  R version 3.2.2 (2015-08-14)
-##  system   x86_64, darwin13.4.0        
-##  ui       X11                         
-##  language (EN)                        
-##  collate  en_GB.UTF-8                 
-##  tz       Europe/London               
-##  date     2015-09-17                  
-## 
-##  package        * version date      
-##  AnnotationDbi  * 1.30.1  2015-04-26
-##  assertthat       0.1     2013-12-06
-##  Biobase        * 2.28.0  2015-04-17
-##  BiocGenerics   * 0.14.0  2015-04-17
-##  bitops           1.0-6   2013-08-17
-##  caTools          1.17.1  2014-09-10
-##  DBI            * 0.3.1   2014-09-24
-##  devtools       * 1.9.1   2015-09-11
-##  digest           0.6.8   2014-12-31
-##  dplyr          * 0.4.3   2015-09-01
-##  evaluate         0.7.2   2015-08-13
-##  formatR          1.2     2015-04-21
-##  gdata            2.17.0  2015-07-04
-##  GenomeInfoDb   * 1.4.2   2015-08-15
-##  gplots         * 2.17.0  2015-05-02
-##  gtools           3.5.0   2015-05-29
-##  htmltools        0.2.6   2014-09-08
-##  IRanges        * 2.2.7   2015-08-09
-##  KernSmooth       2.23-15 2015-06-29
-##  knitr          * 1.11    2015-08-14
-##  lazyeval         0.1.10  2015-01-02
-##  magrittr         1.5     2014-11-22
-##  memoise          0.2.1   2014-04-22
-##  org.Hs.eg.db   * 3.1.2   2015-08-24
-##  R6               2.1.1   2015-08-19
-##  Rcpp             0.12.1  2015-09-10
-##  rmarkdown        0.8     2015-08-30
-##  RSkittleBrewer * 1.1     2015-09-17
-##  RSQLite        * 1.0.0   2014-10-25
-##  S4Vectors      * 0.6.5   2015-09-01
-##  stringi          0.5-5   2015-06-29
-##  stringr          1.0.0   2015-04-30
-##  yaml             2.1.13  2014-06-12
-##  source                                      
-##  Bioconductor                                
-##  CRAN (R 3.2.0)                              
-##  Bioconductor                                
-##  Bioconductor                                
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.2)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  Bioconductor                                
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  Bioconductor                                
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.2)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  Bioconductor                                
-##  CRAN (R 3.2.2)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.2)                              
-##  Github (alyssafrazee/RSkittleBrewer@0a96a20)
-##  CRAN (R 3.2.0)                              
-##  Bioconductor                                
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)                              
-##  CRAN (R 3.2.0)
-

It is also useful to compile the time the document was processed. This document was processed on: 2015-09-17.

-
- - -
- - - - - - - - diff --git a/vignettes/02_03_dimension-reduction.html b/vignettes/02_03_dimension-reduction.html new file mode 100644 index 0000000..cae8b2e --- /dev/null +++ b/vignettes/02_03_dimension-reduction.html @@ -0,0 +1,236 @@ + + + + + + + + + + + + + +Dimension reduction for genomics + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+

Dependencies

+

This document depends on the following packages:

+
  library(devtools)
+  library(Biobase)
+

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

+
install.packages(c("devtools"))
+source("http://www.bioconductor.org/biocLite.R")
+biocLite(c("Biobase"))
+
+
+

General principles

+
    +
  • Can we find patterns in matrices of data?
  • +
+
+
+

Load some data

+

We will use this expression set that combines two studies Transcriptome genetics using second generation sequencing in a Caucasian population. and Understanding mechanisms underlying human gene expression variation with RNA sequencing.. These studies are different populations but we counted the same genes for both. Then we’ll explore the differences.

+
con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData")
+load(file=con)
+close(con)
+mp = montpick.eset
+pdata=pData(mp)
+edata=as.data.frame(exprs(mp))
+fdata = fData(mp)
+ls()
+
## [1] "con"           "edata"         "fdata"         "montpick.eset"
+## [5] "mp"            "pdata"         "tropical"
+
+
+

Calculate the singular vectors

+

Here we calculate the singular vectors:

+
edata = edata[rowMeans(edata) > 100, ]
+edata = log2(edata + 1)
+edata_centered = edata - rowMeans(edata)
+svd1 = svd(edata_centered)
+names(svd1)
+
## [1] "d" "u" "v"
+
+
+

Look at the percent variance explained

+

The percent of variance explained is given by \(\frac{d_{ii}}{\sum_{j}d_{jj}^2}\)

+
plot(svd1$d,ylab="Singular value",col=2)
+

+
plot(svd1$d^2/sum(svd1$d^2),ylab="Percent Variance Explained",col=2)
+

+
+
+

Plot top two principal components

+
par(mfrow=c(1,2))
+plot(svd1$v[,1],col=2,ylab="1st PC")
+plot(svd1$v[,2],col=2,ylab="2nd PC")
+

+
+
+

Plot PC1 vs. PC2

+

A very common plot is to plot PC1 versus PC2 to see if you can see any “clusters” or “groups”.

+
plot(svd1$v[,1],svd1$v[,2],col=2,ylab="2nd PC",xlab="1st PC")
+

+

One thing you can do is color them by different variables to see if clusters stand out.

+
plot(svd1$v[,1],svd1$v[,2],ylab="2nd PC",
+     xlab="1st PC",col=as.numeric(pdata$study))
+

+

Another common plot is to make boxplots comparing the PC for different levels of known covariates (don’t forget to show the actual data!).

+
boxplot(svd1$v[,1] ~ pdata$study,border=c(1,2))
+points(svd1$v[,1] ~ jitter(as.numeric(pdata$study)),col=as.numeric(pdata$study))
+

+
+
+

PCs versus SVs

+

What we have been plotting is not exactly the principal components.

+
pc1 = prcomp(edata)
+plot(pc1$rotation[,1],svd1$v[,1])
+

+

To get the actual PCs you have to subtract the column means rather than the row means when normalizing.

+
edata_centered2 = t(t(edata) - colMeans(edata))
+svd2 = svd(edata_centered2)
+plot(pc1$rotation[,1],svd2$v[,1],col=2)
+

+

Despite this, it is most common for people to perform row-centering and then plot the singular vectors (sometimes labeling them PCs like I have done in this document)

+
+
+

Outliers

+

What happens if we introduce a single outlying gene

+
edata_outlier = edata_centered
+edata_outlier[6,] = edata_centered[6,] * 10000
+svd3 = svd(edata_outlier)
+plot(svd1$v[,1],svd3$v[,1],xlab="Without outlier",ylab="With outlier")
+

+

It turns out the new top singular vector is perfectly correlated with the outlying gene

+
plot(svd3$v[,1],edata_outlier[6,],col=4)
+

+
+
+

Further resources

+

There are a large number of resources available about PCA and SVD but the lecture notes from Advanced Statistics for the Life Sciences are the best set of lecture notes focused on genomics currently available.

+
+
+

Session information

+

Here is the session information

+
devtools::session_info()
+
##  setting  value                       
+##  version  R version 3.2.2 (2015-08-14)
+##  system   x86_64, darwin13.4.0        
+##  ui       X11                         
+##  language (EN)                        
+##  collate  en_GB.UTF-8                 
+##  tz       Europe/London               
+##  date     2015-09-19                  
+## 
+##  package      * version date       source        
+##  Biobase      * 2.28.0  2015-04-17 Bioconductor  
+##  BiocGenerics * 0.14.0  2015-04-17 Bioconductor  
+##  devtools     * 1.9.1   2015-09-11 CRAN (R 3.2.0)
+##  digest         0.6.8   2014-12-31 CRAN (R 3.2.0)
+##  evaluate       0.8     2015-09-18 CRAN (R 3.2.2)
+##  formatR        1.2.1   2015-09-18 CRAN (R 3.2.2)
+##  htmltools      0.2.6   2014-09-08 CRAN (R 3.2.0)
+##  knitr        * 1.11    2015-08-14 CRAN (R 3.2.2)
+##  magrittr       1.5     2014-11-22 CRAN (R 3.2.0)
+##  memoise        0.2.1   2014-04-22 CRAN (R 3.2.0)
+##  rmarkdown      0.8     2015-08-30 CRAN (R 3.2.2)
+##  stringi        0.5-5   2015-06-29 CRAN (R 3.2.0)
+##  stringr        1.0.0   2015-04-30 CRAN (R 3.2.0)
+##  yaml           2.1.13  2014-06-12 CRAN (R 3.2.0)
+

It is also useful to compile the time the document was processed. This document was processed on: 2015-09-19.

+
+ + +
+ + + + + + + + From d240aac70ff12ec05b4397d23f0e8b6982104f89 Mon Sep 17 00:00:00 2001 From: Fabio Amaral Date: Sat, 19 Sep 2015 14:28:19 +0100 Subject: [PATCH 8/9] remove html file --- vignettes/02_03_dimension-reduction.html | 236 ----------------------- 1 file changed, 236 deletions(-) delete mode 100644 vignettes/02_03_dimension-reduction.html diff --git a/vignettes/02_03_dimension-reduction.html b/vignettes/02_03_dimension-reduction.html deleted file mode 100644 index cae8b2e..0000000 --- a/vignettes/02_03_dimension-reduction.html +++ /dev/null @@ -1,236 +0,0 @@ - - - - - - - - - - - - - -Dimension reduction for genomics - - - - - - - - - - - - - - - - - - - - - -
- - - - - - -
-

Dependencies

-

This document depends on the following packages:

-
  library(devtools)
-  library(Biobase)
-

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

-
install.packages(c("devtools"))
-source("http://www.bioconductor.org/biocLite.R")
-biocLite(c("Biobase"))
-
-
-

General principles

-
    -
  • Can we find patterns in matrices of data?
  • -
-
-
-

Load some data

-

We will use this expression set that combines two studies Transcriptome genetics using second generation sequencing in a Caucasian population. and Understanding mechanisms underlying human gene expression variation with RNA sequencing.. These studies are different populations but we counted the same genes for both. Then we’ll explore the differences.

-
con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData")
-load(file=con)
-close(con)
-mp = montpick.eset
-pdata=pData(mp)
-edata=as.data.frame(exprs(mp))
-fdata = fData(mp)
-ls()
-
## [1] "con"           "edata"         "fdata"         "montpick.eset"
-## [5] "mp"            "pdata"         "tropical"
-
-
-

Calculate the singular vectors

-

Here we calculate the singular vectors:

-
edata = edata[rowMeans(edata) > 100, ]
-edata = log2(edata + 1)
-edata_centered = edata - rowMeans(edata)
-svd1 = svd(edata_centered)
-names(svd1)
-
## [1] "d" "u" "v"
-
-
-

Look at the percent variance explained

-

The percent of variance explained is given by \(\frac{d_{ii}}{\sum_{j}d_{jj}^2}\)

-
plot(svd1$d,ylab="Singular value",col=2)
-

-
plot(svd1$d^2/sum(svd1$d^2),ylab="Percent Variance Explained",col=2)
-

-
-
-

Plot top two principal components

-
par(mfrow=c(1,2))
-plot(svd1$v[,1],col=2,ylab="1st PC")
-plot(svd1$v[,2],col=2,ylab="2nd PC")
-

-
-
-

Plot PC1 vs. PC2

-

A very common plot is to plot PC1 versus PC2 to see if you can see any “clusters” or “groups”.

-
plot(svd1$v[,1],svd1$v[,2],col=2,ylab="2nd PC",xlab="1st PC")
-

-

One thing you can do is color them by different variables to see if clusters stand out.

-
plot(svd1$v[,1],svd1$v[,2],ylab="2nd PC",
-     xlab="1st PC",col=as.numeric(pdata$study))
-

-

Another common plot is to make boxplots comparing the PC for different levels of known covariates (don’t forget to show the actual data!).

-
boxplot(svd1$v[,1] ~ pdata$study,border=c(1,2))
-points(svd1$v[,1] ~ jitter(as.numeric(pdata$study)),col=as.numeric(pdata$study))
-

-
-
-

PCs versus SVs

-

What we have been plotting is not exactly the principal components.

-
pc1 = prcomp(edata)
-plot(pc1$rotation[,1],svd1$v[,1])
-

-

To get the actual PCs you have to subtract the column means rather than the row means when normalizing.

-
edata_centered2 = t(t(edata) - colMeans(edata))
-svd2 = svd(edata_centered2)
-plot(pc1$rotation[,1],svd2$v[,1],col=2)
-

-

Despite this, it is most common for people to perform row-centering and then plot the singular vectors (sometimes labeling them PCs like I have done in this document)

-
-
-

Outliers

-

What happens if we introduce a single outlying gene

-
edata_outlier = edata_centered
-edata_outlier[6,] = edata_centered[6,] * 10000
-svd3 = svd(edata_outlier)
-plot(svd1$v[,1],svd3$v[,1],xlab="Without outlier",ylab="With outlier")
-

-

It turns out the new top singular vector is perfectly correlated with the outlying gene

-
plot(svd3$v[,1],edata_outlier[6,],col=4)
-

-
-
-

Further resources

-

There are a large number of resources available about PCA and SVD but the lecture notes from Advanced Statistics for the Life Sciences are the best set of lecture notes focused on genomics currently available.

-
-
-

Session information

-

Here is the session information

-
devtools::session_info()
-
##  setting  value                       
-##  version  R version 3.2.2 (2015-08-14)
-##  system   x86_64, darwin13.4.0        
-##  ui       X11                         
-##  language (EN)                        
-##  collate  en_GB.UTF-8                 
-##  tz       Europe/London               
-##  date     2015-09-19                  
-## 
-##  package      * version date       source        
-##  Biobase      * 2.28.0  2015-04-17 Bioconductor  
-##  BiocGenerics * 0.14.0  2015-04-17 Bioconductor  
-##  devtools     * 1.9.1   2015-09-11 CRAN (R 3.2.0)
-##  digest         0.6.8   2014-12-31 CRAN (R 3.2.0)
-##  evaluate       0.8     2015-09-18 CRAN (R 3.2.2)
-##  formatR        1.2.1   2015-09-18 CRAN (R 3.2.2)
-##  htmltools      0.2.6   2014-09-08 CRAN (R 3.2.0)
-##  knitr        * 1.11    2015-08-14 CRAN (R 3.2.2)
-##  magrittr       1.5     2014-11-22 CRAN (R 3.2.0)
-##  memoise        0.2.1   2014-04-22 CRAN (R 3.2.0)
-##  rmarkdown      0.8     2015-08-30 CRAN (R 3.2.2)
-##  stringi        0.5-5   2015-06-29 CRAN (R 3.2.0)
-##  stringr        1.0.0   2015-04-30 CRAN (R 3.2.0)
-##  yaml           2.1.13  2014-06-12 CRAN (R 3.2.0)
-

It is also useful to compile the time the document was processed. This document was processed on: 2015-09-19.

-
- - -
- - - - - - - - From 3b5848797861caf93c5e28aa2dcb40174fe98f91 Mon Sep 17 00:00:00 2001 From: Fabio Amaral Date: Sat, 19 Sep 2015 14:32:29 +0100 Subject: [PATCH 9/9] add exponent 2 to percent of variance explained equation --- vignettes/02_03_dimension-reduction.Rmd | 2 +- vignettes/02_03_dimension-reduction.html | 236 +++++++++++++++++++++++ 2 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 vignettes/02_03_dimension-reduction.html diff --git a/vignettes/02_03_dimension-reduction.Rmd b/vignettes/02_03_dimension-reduction.Rmd index 4f121d5..2a520da 100644 --- a/vignettes/02_03_dimension-reduction.Rmd +++ b/vignettes/02_03_dimension-reduction.Rmd @@ -74,7 +74,7 @@ names(svd1) ## Look at the percent variance explained -The percent of variance explained is given by $\frac{d_{ii}}{\sum_{j}d_{jj}^2}$ +The percent of variance explained is given by $\frac{d_{ii}^2}{\sum_{j}d_{jj}^2}$ ```{r} plot(svd1$d,ylab="Singular value",col=2) plot(svd1$d^2/sum(svd1$d^2),ylab="Percent Variance Explained",col=2) diff --git a/vignettes/02_03_dimension-reduction.html b/vignettes/02_03_dimension-reduction.html new file mode 100644 index 0000000..ca6b71d --- /dev/null +++ b/vignettes/02_03_dimension-reduction.html @@ -0,0 +1,236 @@ + + + + + + + + + + + + + +Dimension reduction for genomics + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+

Dependencies

+

This document depends on the following packages:

+
  library(devtools)
+  library(Biobase)
+

To install these packages you can use the code (or if you are compiling the document, remove the eval=FALSE from the chunk.)

+
install.packages(c("devtools"))
+source("http://www.bioconductor.org/biocLite.R")
+biocLite(c("Biobase"))
+
+
+

General principles

+
    +
  • Can we find patterns in matrices of data?
  • +
+
+
+

Load some data

+

We will use this expression set that combines two studies Transcriptome genetics using second generation sequencing in a Caucasian population. and Understanding mechanisms underlying human gene expression variation with RNA sequencing.. These studies are different populations but we counted the same genes for both. Then we’ll explore the differences.

+
con =url("http://bowtie-bio.sourceforge.net/recount/ExpressionSets/montpick_eset.RData")
+load(file=con)
+close(con)
+mp = montpick.eset
+pdata=pData(mp)
+edata=as.data.frame(exprs(mp))
+fdata = fData(mp)
+ls()
+
## [1] "con"           "edata"         "fdata"         "montpick.eset"
+## [5] "mp"            "pdata"         "tropical"
+
+
+

Calculate the singular vectors

+

Here we calculate the singular vectors:

+
edata = edata[rowMeans(edata) > 100, ]
+edata = log2(edata + 1)
+edata_centered = edata - rowMeans(edata)
+svd1 = svd(edata_centered)
+names(svd1)
+
## [1] "d" "u" "v"
+
+
+

Look at the percent variance explained

+

The percent of variance explained is given by \(\frac{d_{ii}^2}{\sum_{j}d_{jj}^2}\)

+
plot(svd1$d,ylab="Singular value",col=2)
+

+
plot(svd1$d^2/sum(svd1$d^2),ylab="Percent Variance Explained",col=2)
+

+
+
+

Plot top two principal components

+
par(mfrow=c(1,2))
+plot(svd1$v[,1],col=2,ylab="1st PC")
+plot(svd1$v[,2],col=2,ylab="2nd PC")
+

+
+
+

Plot PC1 vs. PC2

+

A very common plot is to plot PC1 versus PC2 to see if you can see any “clusters” or “groups”.

+
plot(svd1$v[,1],svd1$v[,2],col=2,ylab="2nd PC",xlab="1st PC")
+

+

One thing you can do is color them by different variables to see if clusters stand out.

+
plot(svd1$v[,1],svd1$v[,2],ylab="2nd PC",
+     xlab="1st PC",col=as.numeric(pdata$study))
+

+

Another common plot is to make boxplots comparing the PC for different levels of known covariates (don’t forget to show the actual data!).

+
boxplot(svd1$v[,1] ~ pdata$study,border=c(1,2))
+points(svd1$v[,1] ~ jitter(as.numeric(pdata$study)),col=as.numeric(pdata$study))
+

+
+
+

PCs versus SVs

+

What we have been plotting is not exactly the principal components.

+
pc1 = prcomp(edata)
+plot(pc1$rotation[,1],svd1$v[,1])
+

+

To get the actual PCs you have to subtract the column means rather than the row means when normalizing.

+
edata_centered2 = t(t(edata) - colMeans(edata))
+svd2 = svd(edata_centered2)
+plot(pc1$rotation[,1],svd2$v[,1],col=2)
+

+

Despite this, it is most common for people to perform row-centering and then plot the singular vectors (sometimes labeling them PCs like I have done in this document)

+
+
+

Outliers

+

What happens if we introduce a single outlying gene

+
edata_outlier = edata_centered
+edata_outlier[6,] = edata_centered[6,] * 10000
+svd3 = svd(edata_outlier)
+plot(svd1$v[,1],svd3$v[,1],xlab="Without outlier",ylab="With outlier")
+

+

It turns out the new top singular vector is perfectly correlated with the outlying gene

+
plot(svd3$v[,1],edata_outlier[6,],col=4)
+

+
+
+

Further resources

+

There are a large number of resources available about PCA and SVD but the lecture notes from Advanced Statistics for the Life Sciences are the best set of lecture notes focused on genomics currently available.

+
+
+

Session information

+

Here is the session information

+
devtools::session_info()
+
##  setting  value                       
+##  version  R version 3.2.2 (2015-08-14)
+##  system   x86_64, darwin13.4.0        
+##  ui       X11                         
+##  language (EN)                        
+##  collate  en_GB.UTF-8                 
+##  tz       Europe/London               
+##  date     2015-09-19                  
+## 
+##  package      * version date       source        
+##  Biobase      * 2.28.0  2015-04-17 Bioconductor  
+##  BiocGenerics * 0.14.0  2015-04-17 Bioconductor  
+##  devtools     * 1.9.1   2015-09-11 CRAN (R 3.2.0)
+##  digest         0.6.8   2014-12-31 CRAN (R 3.2.0)
+##  evaluate       0.8     2015-09-18 CRAN (R 3.2.2)
+##  formatR        1.2.1   2015-09-18 CRAN (R 3.2.2)
+##  htmltools      0.2.6   2014-09-08 CRAN (R 3.2.0)
+##  knitr        * 1.11    2015-08-14 CRAN (R 3.2.2)
+##  magrittr       1.5     2014-11-22 CRAN (R 3.2.0)
+##  memoise        0.2.1   2014-04-22 CRAN (R 3.2.0)
+##  rmarkdown      0.8     2015-08-30 CRAN (R 3.2.2)
+##  stringi        0.5-5   2015-06-29 CRAN (R 3.2.0)
+##  stringr        1.0.0   2015-04-30 CRAN (R 3.2.0)
+##  yaml           2.1.13  2014-06-12 CRAN (R 3.2.0)
+

It is also useful to compile the time the document was processed. This document was processed on: 2015-09-19.

+
+ + +
+ + + + + + + +