---
title: "Day 3 - Gene Expression"
output: html_document
---
Load the [GEOquery](https://bioconductor.org/package/GEOquery) package.
```{r}
library(GEOquery)
```
Get the data
```{r, cache=TRUE}
x = getGEOSuppFiles("GSE20986")
x
```
Untarring the data
```{r}
untar("GSE20986/GSE20986_RAW.tar", exdir = "data")
```
gunzipping the data
```{r}
cels = list.files("data/", pattern = "[gz]")
sapply(paste("data", cels, sep = "/"), gunzip)
```
creating your phenodata
```{r}
phenodata = matrix(rep(list.files("data"), 2), ncol =2)
class(phenodata)
phenodata <- as.data.frame(phenodata)
colnames(phenodata) <- c("Name", "FileName")
phenodata$Targets <- c("iris",
"retina",
"retina",
"iris",
"retina",
"iris",
"choroid",
"choroid",
"choroid",
"huvec",
"huvec",
"huvec")
write.table(phenodata, "data/phenodata.txt", quote = F, sep = "\t", row.names = F)
```
```{r}
library(simpleaffy)
celfiles <- read.affy(covdesc = "phenodata.txt", path = "data")
boxplot(celfiles)
library(RColorBrewer)
cols = brewer.pal(8, "Set1")
eset <- exprs(celfiles)
samples <- celfiles$Targets
colnames(eset)
colnames(eset) <- samples
boxplot(celfiles, col = cols, las = 2)
distance <- dist(t(eset), method = "maximum")
clusters <- hclust(distance)
plot(clusters)
```
```{r}
require(simpleaffy)
require(affyPLM)
celfiles.gcrma = gcrma(celfiles)
par(mfrow=c(1,2))
boxplot(celfiles.gcrma, col = cols, las = 2, main = "Post-Normalization");
boxplot(celfiles, col = cols, las = 2, main = "Pre-Normalization")
dev.off()
distance <- dist(t(exprs(celfiles.gcrma)), method = "maximum")
clusters <- hclust(distance)
plot(clusters)
```
```{r}
library(limma)
phenodata
samples <- as.factor(samples)
design <- model.matrix(~0+samples)
colnames(design)
colnames(design) <- c("choroid", "huvec", "iris", "retina")
design
contrast.matrix = makeContrasts(
huvec_choroid = huvec - choroid,
huvec_retina = huvec - retina,
huvec_iris <- huvec - iris,
levels = design)
fit = lmFit(celfiles.gcrma, design)
huvec_fit <- contrasts.fit(fit, contrast.matrix)
huvec_ebay <- eBayes(huvec_fit)
library(hgu133plus2.db)
library(annotate)
probenames.list <- rownames(topTable(huvec_ebay, number = 100000))
getsymbols <- getSYMBOL(probenames.list, "hgu133plus2")
results <- topTable(huvec_ebay, number = 100000, coef = "huvec_choroid")
results <- cbind(results, getsymbols)
```
To make thresholds
```{r}
summary(results)
results$threshold <- "1"
a <- subset(results, adj.P.Val < 0.05 & logFC > 5)
results[rownames(a), "threshold"] <- "2"
b <- subset(results, adj.P.Val < 0.05 & logFC < -5)
results[rownames(b), "threshold"] <- "3"
table(results$threshold)
```
To make ggplot
```{r}
library(ggplot2)
volcano <- ggplot(data = results,
aes(x = logFC, y = -1*log10(adj.P.Val),
colour = threshold,
label = getsymbols))
volcano <- volcano +
geom_point() +
scale_color_manual(values = c("black", "red", "green"),
labels = c("Not Significant", "Upregulated", "Downregulated"),
name = "Key/Legend")
volcano +
geom_text(data = subset(results, logFC > 5 & -1*log10(adj.P.Val) > 5), aes(x = logFC, y = -1*log10(adj.P.Val), colour = threshold, label = getsymbols) )
```
[GEO Link](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE20986)