Analyse GEO2R data with R and Bioconductor
Continue where GEO2R ends and perform more MA analysis steps in RStudio
[ Main_Page | Analyze_GEO_data_with_GEO2R ]
Contents
- 1 Extending the GEO2R analysis
- 1.1 Modify GEO2R code to to store results locally
- 1.2 Load libraries and define folders
- 1.3 load (once only) series and platform data from GEO
- 1.4 Arrange the (normalized) data and apply log2 transformation
- 1.5 set up the DE contrasts and proceed with analysis
- 1.6 Volcano boxplots to visualize the extend and confidence of the Differential Expression
- 2 Data mining and clustering of the processed results
- 3 Discusssion
Extending the GEO2R analysis
Use as start material the data generated by GEO2R in the previous Analyze GEO data with GEO2R tutorial. Apply Bioconductor packages in RStudio to perform additional classical MA test and plot.
original code as copied from GEO2R
# Version info: R 2.14.1, Biobase 2.15.3, GEOquery 2.23.2, limma 3.10.1 # R scripts generated Wed Apr 2 07:35:46 EDT 2014 ################################################################ # Differential expression analysis with limma library(Biobase) library(GEOquery) library(limma) # load series and platform data from GEO gset <- getGEO("GSE1871", GSEMatrix =TRUE) if (length(gset) > 1) idx <- grep("GPL1261", attr(gset, "names")) else idx <- 1 gset <- gset[[idx]] # make proper column names to match toptable fvarLabels(gset) <- make.names(fvarLabels(gset)) # group names for all samples sml <- c("G0","G0","G0","G1","G1","G1","X","X","X","X","X","X"); # eliminate samples marked as "X" sel <- which(sml != "X") sml <- sml[sel] gset <- gset[ ,sel] # log2 transform ex <- exprs(gset) qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) LogC <- (qx[5] > 100) || (qx[6]-qx[1] > 50 && qx[2] > 0) || (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2) if (LogC) { ex[which(ex <= 0)] <- NaN exprs(gset) <- log2(ex) } # set up the data and proceed with analysis fl <- as.factor(sml) gset$description <- fl design <- model.matrix(~ description + 0, gset) colnames(design) <- levels(fl) fit <- lmFit(gset, design) cont.matrix <- makeContrasts(G1-G0, levels=design) fit2 <- contrasts.fit(fit, cont.matrix) fit2 <- eBayes(fit2, 0.01) tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) # load NCBI platform annotation gpl <- annotation(gset) platf <- getGEO(gpl, AnnotGPL=TRUE) ncbifd <- data.frame(attr(dataTable(platf), "table")) # replace original platform annotation tT <- tT[setdiff(colnames(tT), setdiff(fvarLabels(gset), "ID"))] tT <- merge(tT, ncbifd, by="ID") tT <- tT[order(tT$P.Value), ] # restore correct order tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","Gene.symbol","Gene.title")) write.table(tT, file=stdout(), row.names=F, sep="\t") ################################################################ # Boxplot for selected GEO samples library(Biobase) library(GEOquery) # load series and platform data from GEO gset <- getGEO("GSE1871", GSEMatrix =TRUE) if (length(gset) > 1) idx <- grep("GPL1261", attr(gset, "names")) else idx <- 1 gset <- gset[[idx]] # group names for all samples in a series sml <- c("G0","G0","G0","G1","G1","G1","X","X","X","X","X","X") # eliminate samples marked as "X" sel <- which(sml != "X") sml <- sml[sel] gset <- gset[ ,sel] # order samples by group ex <- exprs(gset)[ , order(sml)] sml <- sml[order(sml)] fl <- as.factor(sml) labels <- c("Control","LPS") # set parameters and draw the plot palette(c("#f4dfdf","#dfeaf4", "#AABBCC")) dev.new(width=4+dim(gset)[[2]]/5, height=6) par(mar=c(2+round(max(nchar(sampleNames(gset)))/2),4,2,1)) title <- paste ("GSE1871", '/', annotation(gset), " selected samples", sep ='') boxplot(ex, boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=fl) legend("topleft", labels, fill=palette(), bty="n")
In the code sections shown below:
- added comment lines start with '#+'
- a modified or removed line of code is commented out using '###' with the new version immediately following it
- added blocks of code are enclosed between '## added code' & '## end added code' tag lines for clarity
Modify GEO2R code to to store results locally
The original code gets data from the internet each time it is run. This can be quite lengthy and does not allow working offline. We therefore slightly modify the code so that the download happens only once and all necessary data and annotation are stored locally in the result folder for easy reuse. The final workspace will also be saved to allow reloading the complete RStudio environment at once at later points in time.
The original GEO2R code is now cut into sections and additions or modifications are shown and explained.
Load libraries and define folders
# Version info: R 2.14.1, Biobase 2.15.3, GEOquery 2.23.2, limma 3.10.1 # R scripts generated Wed Apr 2 03:03:43 EDT 2014 ################################################################ # Differential expression analysis with limma library(Biobase) library(GEOquery) library(limma)
you may need to change the following location
## move to a project folder and create a result folder in it basedir <- "~/R.analysis/19.GEO2R.GSE1871" setwd(basedir) resfolder <- "GEO2R.GSE1871.results" if (! file.exists(resfolder) ){ dir.create(resfolder, showWarnings = FALSE, recursive = FALSE, mode = "0777") Sys.chmod(resfolder, mode = "0777", use_umask=TRUE) }
load (once only) series and platform data from GEO
if (! exists("gset") ){ gset <- getGEO("GSE1871", destdir = resfolder, GSEMatrix =TRUE) }
Arrange the (normalized) data and apply log2 transformation
This section is mostly untouched except the last line
if (length(gset) > 1) idx <- grep("GPL1261", attr(gset, "names")) else idx <- 1 gset <- gset[[idx]] # make proper column names to match toptable fvarLabels(gset) <- make.names(fvarLabels(gset)) # group names for all samples (Simvastatin excluded) sml <- c("G0","G0","G0","G1","G1","G1","X","X","X","X","X","X"); # eliminate samples marked as "X" (Simvastatin not used here) sel <- which(sml != "X") sml <- sml[sel] gset <- gset[ ,sel] # log2 transform ex <- exprs(gset) qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) LogC <- (qx[5] > 100) || (qx[6]-qx[1] > 50 && qx[2] > 0) || (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2) if (LogC) { ex[which(ex <= 0)] <- NaN exprs(gset) <- log2(ex) }
set up the DE contrasts and proceed with analysis
We work here with only two conditions but more conditions can be compared provided that a correctly formatted design is given. The code is left as obtained from GEO2R. Optionally, one could add a few lines to save the plot to file.
fl <- as.factor(sml) gset$description <- fl design <- model.matrix(~ description + 0, gset) colnames(design) <- levels(fl) fit <- lmFit(gset, design) # the differential expression contrast is set here to "LPS" versus "Control" cont.matrix <- makeContrasts(G1-G0, levels=design) fit2 <- contrasts.fit(fit, cont.matrix) fit2 <- eBayes(fit2, 0.01) # tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) tT <- topTable(fit2, adjust="fdr", sort.by="B", number=nrow(fit2))
Save the resulting table to file for reuse (eg. IPA)
filename <- "GSE1871.data.tsv" outfile <- paste(basedir, resfolder, filename, sep="/") write.table(tT, file=outfile, quote = FALSE, dec=",", sep="\t", col.names = NA, row.names = T)
Volcano boxplots to visualize the extend and confidence of the Differential Expression
The volcano plot is useful to visualize confidence and differential expression in a united plot. It sgenerally shows a symetrical profile where down-regulation and up-regulation are similarily represented and where high confidence calls represent a minority of the full table but remain present in sufficient amounts.
If the later is not observed, one can conclude that
- the experimental conditions were not ideal
- the number of replicates was too low
- the stimulus or contrast used in the experiment is not strong enough (biologically) to drift transcript expression levels between reference and test samples.
Volcano plots[1] show distribution of the confidence values against expression fold changes. A symmetrical plot indicates an unbiased result, more points in the low p.value range indicate a higher confidence about the statement of 'differential expression'.
A line at 95% confidence (~5% expected error) indicates the usual limit used by biologists to call differentially expressed genes.
plot(tT$logFC, 1-tT$adj.P.Val, xlim=c(-6, 6), main="Effect of intratracheal exposure to LPS", xlab="log2Ratio", ylab="1-adj.P.Val") abline(h=0.95, col="red")
Volcano-Plot for GSE1871
Data mining and clustering of the processed results
Defining differentially expressed signatures is the obliged beginning for more complex data analysis and biological interpretation of the data. Before running into pathway analysis, a number of classical quality control checks need to be passed in order to ensure that the quality of the data is sufficient to motivate the next steps. Several attractive and commonly seen methods can be applied to the data to isolate subsets thereof or just present it in a nice graphical way. Some of these methods are illustrated in the remaining of this document.
Principal Component Analysis (PCA) of the 6 samples
Before proceeding with the GEO2R script, it can be important to control that samples are more similar withing groups than between groups. This should be true when the differential expression induced by the experimental conditions involves a great enough number of genes and causes variance in the data. The PCA analysis is very simple but powerful to detect mislabeled samples or weak experiments.
# copy the ex object created above and transpose it to get rows as samples and columns as probes pca.data <- ex # rename samples grps <- c(rep("Cont",3), rep("LPS",3)) grpcol <- c(rep("blue",3), rep("red",3)) colnames(pca.data) <- paste(grps, colnames(pca.data), sep="-") # remove NAs pca.data <- na.omit(as.matrix(pca.data)) # transpose pca.data <- t(pca.data) # inspect pca.data #dim(pca.data) #str(pca.data) # preview first 5 probe-sets pca.data[,1:5] # compute PCA pca <- prcomp(pca.data, scale=TRUE) # identify variance in components summary(pca) # the first 2 component group 64% of the total variance # the first 3 component group 79% of the total variance # the first 4 component group 90% of the total variance # components #1 and #2 plot(pca$x[,1], pca$x[,2], xlab="PCA1", ylab="PCA2", main="PCA for components 1&2", type="p", pch=10, col=grpcol) text(pca$x[,1], pca$x[,2], rownames(pca.data), cex=0.75) # show other component pairs for the example # components #1 and #3 plot(pca$x[,1], pca$x[,3], xlab="PCA1", ylab="PCA3", main="PCA for components 1&3", type="p", pch=10, col=grpcol) text(pca$x[,1], pca$x[,3], rownames(pca.data), cex=0.75) # components #2 and #3 plot(pca$x[,2], pca$x[,3], xlab="PCA2", ylab="PCA3", main="PCA for components 2&3", type="p", pch=10, col=grpcol) text(pca$x[,2], pca$x[,3], rownames(pca.data), cex=0.75)
PCA results
Plot for PCA#1 and #3
Plot for PCA#2 and #3
Together with the former good box-plot, this supports continuing with this data and analyzing it in your favorite pathway analysis tool (eg DAVID or its expensive and more powerful counterpart IPA)
Discusssion
We show in this tutorial that using the code generated by GEO2R within minutes and adding classical MA analytic steps can be quite straightforward and very rewarding and leads to a complete set of pictures and tables that can be used in IPA or as reference data to support your work with publicly available data.
A nice complementary reading about extra functionalities of the GEOquery package can be found in this page[2]
Download Howto files here
References: