Analyse GEO2R data with R and Bioconductor

From BITS wiki
Jump to: navigation, search


RStudio.2x.png

Continue where GEO2R ends and perform more MA analysis steps in RStudio

[ Main_Page | Analyze_GEO_data_with_GEO2R ]


 

Extending the GEO2R analysis

Use as start material the data generated by GEO2R in the previous Analyze GEO data with GEO2R tutorial. Apply Bioconductor packages in RStudio to perform additional classical MA test and plot.

RStudio_start.png

 


original code as copied from GEO2R

# Version info: R 2.14.1, Biobase 2.15.3, GEOquery 2.23.2, limma 3.10.1
# R scripts generated  Wed Apr 2 07:35:46 EDT 2014
 
################################################################
#   Differential expression analysis with limma
library(Biobase)
library(GEOquery)
library(limma)
 
# load series and platform data from GEO
 
gset <- getGEO("GSE1871", GSEMatrix =TRUE)
if (length(gset) > 1) idx <- grep("GPL1261", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
 
# make proper column names to match toptable 
fvarLabels(gset) <- make.names(fvarLabels(gset))
 
# group names for all samples
sml <- c("G0","G0","G0","G1","G1","G1","X","X","X","X","X","X");
 
# eliminate samples marked as "X"
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
 
# log2 transform
ex <- exprs(gset)
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex[which(ex <= 0)] <- NaN
            exprs(gset) <- log2(ex) }
 
# set up the data and proceed with analysis
fl <- as.factor(sml)
gset$description <- fl
design <- model.matrix(~ description + 0, gset)
colnames(design) <- levels(fl)
fit <- lmFit(gset, design)
cont.matrix <- makeContrasts(G1-G0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
 
# load NCBI platform annotation
gpl <- annotation(gset)
platf <- getGEO(gpl, AnnotGPL=TRUE)
ncbifd <- data.frame(attr(dataTable(platf), "table"))
 
# replace original platform annotation
tT <- tT[setdiff(colnames(tT), setdiff(fvarLabels(gset), "ID"))]
tT <- merge(tT, ncbifd, by="ID")
tT <- tT[order(tT$P.Value), ]  # restore correct order
 
tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.symbol","Gene.title"))
write.table(tT, file=stdout(), row.names=F, sep="\t")
 
################################################################
#   Boxplot for selected GEO samples
library(Biobase)
library(GEOquery)
 
# load series and platform data from GEO
 
gset <- getGEO("GSE1871", GSEMatrix =TRUE)
if (length(gset) > 1) idx <- grep("GPL1261", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
 
# group names for all samples in a series
sml <- c("G0","G0","G0","G1","G1","G1","X","X","X","X","X","X")
 
# eliminate samples marked as "X"
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
 
# order samples by group
ex <- exprs(gset)[ , order(sml)]
sml <- sml[order(sml)]
fl <- as.factor(sml)
labels <- c("Control","LPS")
 
# set parameters and draw the plot
palette(c("#f4dfdf","#dfeaf4", "#AABBCC"))
dev.new(width=4+dim(gset)[[2]]/5, height=6)
par(mar=c(2+round(max(nchar(sampleNames(gset)))/2),4,2,1))
title <- paste ("GSE1871", '/', annotation(gset), " selected samples", sep ='')
boxplot(ex, boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=fl)
legend("topleft", labels, fill=palette(), bty="n")

Technical.png In the code sections shown below:

  • added comment lines start with '#+'
  • a modified or removed line of code is commented out using '###' with the new version immediately following it
  • added blocks of code are enclosed between '## added code' & '## end added code' tag lines for clarity

 

Modify GEO2R code to to store results locally

The original code gets data from the internet each time it is run. This can be quite lengthy and does not allow working offline. We therefore slightly modify the code so that the download happens only once and all necessary data and annotation are stored locally in the result folder for easy reuse. The final workspace will also be saved to allow reloading the complete RStudio environment at once at later points in time.

The original GEO2R code is now cut into sections and additions or modifications are shown and explained.

Load libraries and define folders

# Version info: R 2.14.1, Biobase 2.15.3, GEOquery 2.23.2, limma 3.10.1
# R scripts generated  Wed Apr 2 03:03:43 EDT 2014
 
################################################################
#   Differential expression analysis with limma
library(Biobase)
library(GEOquery)
library(limma)

you may need to change the following location

## move to a project folder and create a result folder in it 
basedir <- "~/R.analysis/19.GEO2R.GSE1871"
setwd(basedir)
 
resfolder <- "GEO2R.GSE1871.results"
if (! file.exists(resfolder) ){
    dir.create(resfolder, showWarnings = FALSE, recursive = FALSE, mode = "0777")
    Sys.chmod(resfolder, mode = "0777", use_umask=TRUE)
    }

load (once only) series and platform data from GEO

if (! exists("gset") ){
gset <- getGEO("GSE1871", destdir = resfolder, GSEMatrix =TRUE)
}

Arrange the (normalized) data and apply log2 transformation

This section is mostly untouched except the last line

if (length(gset) > 1) idx <- grep("GPL1261", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
 
# make proper column names to match toptable 
fvarLabels(gset) <- make.names(fvarLabels(gset))
 
# group names for all samples (Simvastatin excluded)
sml <- c("G0","G0","G0","G1","G1","G1","X","X","X","X","X","X");
 
# eliminate samples marked as "X" (Simvastatin not used here)
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
 
# log2 transform
ex <- exprs(gset)
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
 
if (LogC) { ex[which(ex <= 0)] <- NaN
            exprs(gset) <- log2(ex) }

set up the DE contrasts and proceed with analysis

We work here with only two conditions but more conditions can be compared provided that a correctly formatted design is given. The code is left as obtained from GEO2R. Optionally, one could add a few lines to save the plot to file.

fl <- as.factor(sml)
gset$description <- fl
design <- model.matrix(~ description + 0, gset)
colnames(design) <- levels(fl)
fit <- lmFit(gset, design)
 
# the differential expression contrast is set here to "LPS" versus "Control"
cont.matrix <- makeContrasts(G1-G0, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2, 0.01)
# tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=nrow(fit2))
Error creating thumbnail: Unable to save thumbnail to destination
Instead of the top 250 rows, the tT object now contains ALL rows given by 'nrow(fit2)'

Save the resulting table to file for reuse (eg. IPA)

filename <- "GSE1871.data.tsv"
outfile <- paste(basedir, resfolder, filename, sep="/")
write.table(tT, file=outfile, 
            quote = FALSE, dec=",", sep="\t", col.names = NA, row.names = T)

Volcano boxplots to visualize the extend and confidence of the Differential Expression

The volcano plot is useful to visualize confidence and differential expression in a united plot. It sgenerally shows a symetrical profile where down-regulation and up-regulation are similarily represented and where high confidence calls represent a minority of the full table but remain present in sufficient amounts.

If the later is not observed, one can conclude that

  • the experimental conditions were not ideal
  • the number of replicates was too low
  • the stimulus or contrast used in the experiment is not strong enough (biologically) to drift transcript expression levels between reference and test samples.

Volcano plots[1] show distribution of the confidence values against expression fold changes. A symmetrical plot indicates an unbiased result, more points in the low p.value range indicate a higher confidence about the statement of 'differential expression'.

A line at 95% confidence (~5% expected error) indicates the usual limit used by biologists to call differentially expressed genes.

plot(tT$logFC, 1-tT$adj.P.Val, xlim=c(-6, 6), 
     main="Effect of intratracheal exposure to LPS",
     xlab="log2Ratio", ylab="1-adj.P.Val")
abline(h=0.95, col="red")

Volcano-Plot for GSE1871

volcano_plot.png

Technical.png This result is typical of a strong effect for both Up and Down regulation directions

Data mining and clustering of the processed results

Defining differentially expressed signatures is the obliged beginning for more complex data analysis and biological interpretation of the data. Before running into pathway analysis, a number of classical quality control checks need to be passed in order to ensure that the quality of the data is sufficient to motivate the next steps. Several attractive and commonly seen methods can be applied to the data to isolate subsets thereof or just present it in a nice graphical way. Some of these methods are illustrated in the remaining of this document.

Principal Component Analysis (PCA) of the 6 samples

Before proceeding with the GEO2R script, it can be important to control that samples are more similar withing groups than between groups. This should be true when the differential expression induced by the experimental conditions involves a great enough number of genes and causes variance in the data. The PCA analysis is very simple but powerful to detect mislabeled samples or weak experiments.

# copy the ex object created above and transpose it to get rows as samples and columns as probes
pca.data <- ex
 
# rename samples
grps <- c(rep("Cont",3), rep("LPS",3))
grpcol <- c(rep("blue",3), rep("red",3))
colnames(pca.data) <- paste(grps, colnames(pca.data), sep="-")
 
# remove NAs
pca.data <- na.omit(as.matrix(pca.data))
 
# transpose
pca.data <- t(pca.data)
 
# inspect pca.data
#dim(pca.data)
#str(pca.data)
 
# preview first 5 probe-sets
pca.data[,1:5]
 
# compute PCA
pca <- prcomp(pca.data, scale=TRUE)
 
# identify variance in components
summary(pca)
 
# the first 2 component group 64% of the total variance
# the first 3 component group 79% of the total variance
# the first 4 component group 90% of the total variance
 
# components #1 and #2
plot(pca$x[,1], pca$x[,2], xlab="PCA1", ylab="PCA2", main="PCA for components 1&2", type="p", pch=10, col=grpcol)
text(pca$x[,1], pca$x[,2], rownames(pca.data), cex=0.75)
 
# show other component pairs for the example
# components #1 and #3
plot(pca$x[,1], pca$x[,3], xlab="PCA1", ylab="PCA3", main="PCA for components 1&3", type="p", pch=10, col=grpcol)
text(pca$x[,1], pca$x[,3], rownames(pca.data), cex=0.75)
 
# components #2 and #3
plot(pca$x[,2], pca$x[,3], xlab="PCA2", ylab="PCA3", main="PCA for components 2&3", type="p", pch=10, col=grpcol)
text(pca$x[,2], pca$x[,3], rownames(pca.data), cex=0.75)

PCA results

Plot for PCA#1 and #2


PCA_analysis1.png

Plot for PCA#1 and #3


PCA_analysis2.png

Plot for PCA#2 and #3


PCA_analysis3.png
Error creating thumbnail: Unable to save thumbnail to destination
The nice discrimination between Control and LPS samples indicates a good variance between these conditions

Together with the former good box-plot, this supports continuing with this data and analyzing it in your favorite pathway analysis tool (eg DAVID or its expensive and more powerful counterpart IPA)

Discusssion

We show in this tutorial that using the code generated by GEO2R within minutes and adding classical MA analytic steps can be quite straightforward and very rewarding and leads to a complete set of pictures and tables that can be used in IPA or as reference data to support your work with publicly available data.

Error creating thumbnail: Unable to save thumbnail to destination
This kind of approach should be encouraged for scientists who do not have resources for their own MA experiments but are sufficiently 'motivated' to challenge their hypotheses and results with DATA obtained by others scientists

A nice complementary reading about extra functionalities of the GEOquery package can be found in this page[2]

 



Download Howto files here

Data generated during this tutorial can be fetched from the GSE1871 result folder

References:
  1. http://en.wikipedia.org/wiki/Volcano_plot_(statistics)
  2. http://www2.warwick.ac.uk/fac/sci/moac/people/students/peter_cock/r/geo/


[ Main_Page | Analyze_GEO_data_with_GEO2R ]