Load Packages

suppressPackageStartupMessages({
    library(here)
    library(oligo)
})

Load the dataset

dataset <- read.celfiles(list.files(here("data/microarray"), full.names = TRUE))
## Loading required package: pd.hg.u95av2
## Loading required package: RSQLite
## Loading required package: DBI
## Platform design info loaded.
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651309.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651310.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651311.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651312.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651313.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651314.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651315.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651316.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651317.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651318.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651319.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651320.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651321.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651322.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651323.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651324.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651325.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651326.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651327.CEL.gz
## Reading in : /home/jialin/Courses/code6150/data/microarray/GSM651328.CEL.gz
#dataset <- dataset[, c(paste0(c("GSM651310", "GSM651320", "GSM651315", "GSM651325"), ".CEL.gz"))]
dataset <- dataset[, c(paste0(c("GSM651311", "GSM651321", "GSM651316", "GSM651326"), ".CEL.gz"))]
sampleNames(dataset) <- c("control_rep1", "control_rep2", "treat_rep1", "treat_rep2")
pData(dataset)$group <- c("control", "control", "treat", "treat")
dataset
## ExpressionFeatureSet (storageMode: lockedEnvironment)
## assayData: 409600 features, 4 samples 
##   element names: exprs 
## protocolData
##   rowNames: control_rep1 control_rep2 treat_rep1 treat_rep2
##   varLabels: exprs dates
##   varMetadata: labelDescription channel
## phenoData
##   rowNames: control_rep1 control_rep2 treat_rep1 treat_rep2
##   varLabels: index group
##   varMetadata: labelDescription channel
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: pd.hg.u95av2

Quality control

Boxplot

boxplot(dataset, target = "core")

MA plot

oligo::MAplot(dataset, pairs = TRUE, ylim = c(-1, 1))

Quality report of the raw data

The following will generate a quality report of the raw microarray data in docs/microarray_qualitymetrics directory.

library(arrayQualityMetrics)
arrayQualityMetrics(expressionset = dataset,
    outdir = "docs/microarray_qualitymetrics",
    force = TRUE, do.logtransform = TRUE,
    intgroup = c("group"))

RMA

The RMA method proceeds with background subtraction, normalization and summarization using a deconvolution method for background correction, quantile normalization and the RMA (robust multichip average) algorithm for summarization.

edata <- oligo::rma(dataset)
## Background correcting
## Normalizing
## Calculating Expression
edata
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 12625 features, 4 samples 
##   element names: exprs 
## protocolData
##   rowNames: control_rep1 control_rep2 treat_rep1 treat_rep2
##   varLabels: exprs dates
##   varMetadata: labelDescription channel
## phenoData
##   rowNames: control_rep1 control_rep2 treat_rep1 treat_rep2
##   varLabels: index group
##   varMetadata: labelDescription channel
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: pd.hg.u95av2

Access the quality after normalization

PCA plot before and after normalization

library(ggplot2)
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
plot_grid(
    local({
        PCA_raw <- prcomp(t(log2(exprs(dataset))), scale = FALSE)
    
        dataGG <- data.frame(PC1 = PCA_raw$x[,1], PC2 = PCA_raw$x[,2],
                         Group = pData(dataset)$group)
            
        qplot(PC1, PC2, data = dataGG, color = Group,
              main = "PCA plot of the raw data (log-transformed)", asp = 1.0, geom = "text",
        label = sampleNames(dataset)) + scale_colour_brewer(palette = "Set2")
    }),
    local({
        PCA <- prcomp(t(exprs(edata)), scale = FALSE)
        dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2],
                  Group = pData(dataset)$group)
        qplot(PC1, PC2, data = dataGG, color =  Group,
            main = "PCA plot of the normalized data", asp = 1.0, geom = "text",
            label = sampleNames(edata)) +
            scale_colour_brewer(palette = "Set2")
    })
)

Boxplot before and after normalization

par(mfrow = c(1,2))
boxplot(dataset)
boxplot(edata)

par(mfrow = c(1,1))

MA plot after normalization

oligo::MAplot(edata, pairs = TRUE, ylim = c(-1, 1))

Heatmap with sample-to-sample distance after normalization

It can not provide too much information for us since the number of samples is limited.

library(RColorBrewer)
library(pheatmap)
dists <- as.matrix(dist(t(exprs(edata)), method = "manhattan"))
diag(dists) <- NA
hmcol <- colorRampPalette(rev(brewer.pal(9, "PuOr")))(255)
pheatmap(dists, col = rev(hmcol), clustering_distance_rows = "manhattan",
                    clustering_distance_cols = "manhattan")

Filter based on intensity

Microarray data commonly show a large number of probes in the background intensity range. They also do not change much across arrays. Hence they combine a low variance with a low intensity. We want to filter these results as they may contribute to false positive results in the differential expression analysis. The bars represent the distribution of median intensities, the red vertical line represents the threshold value. If more than two samples of a gene have intensities larger than the threshold value, the gene will be kept.

edata_medians <- rowMedians(exprs(edata))
hist_res <- hist(edata_medians, 100, col="#e7efd8", freq = FALSE,
                 main = "Histogram of the median intensities",
                 xlab = "Median intensities")
emp_mu <- hist_res$breaks[which.max(hist_res$density)]
emp_sd <- mad(edata_medians)/2
prop_cental <- 0.50
lines(sort(edata_medians),
      prop_cental*dnorm(sort(edata_medians), mean = emp_mu, sd = emp_sd),
    col = "grey10", lwd = 3)

#cut_val <- 0.05 / prop_cental
thresh_median <- qnorm(0.05 / prop_cental, emp_mu, emp_sd)
abline(v = thresh_median, lwd = 4, col = "red")

samples_cutoff <- 2
idx_thresh_median <- apply(exprs(edata), 1, function(x){
                   sum(x > thresh_median) >= samples_cutoff})
table(idx_thresh_median)
## idx_thresh_median
## FALSE  TRUE 
##  4060  8565
edata <- subset(edata, idx_thresh_median)

Identification of differentially expressed genes

Create a design matrix. We will also consider the batch effects between replicates in order to remove them.

library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:oligo':
## 
##     backgroundCorrect
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
f <- factor(c("control", "control", "treat", "treat"))
batch <- factor(c("rep1", "rep2", "rep1", "rep2"))
design <- model.matrix(~ 0 + f + batch)
colnames(design)
## [1] "fcontrol"  "ftreat"    "batchrep2"
colnames(design) <- c("control", "treat", "batch")
design
##   control treat batch
## 1       1     0     0
## 2       1     0     1
## 3       0     1     0
## 4       0     1     1
## attr(,"assign")
## [1] 1 1 2
## attr(,"contrasts")
## attr(,"contrasts")$f
## [1] "contr.treatment"
## 
## attr(,"contrasts")$batch
## [1] "contr.treatment"

We can fit the linear model, define appropriate contrast to test the hypothesis on treatment effect and compute the moderated t–statistics by calling the eBayes function.

data.fit <- lmFit(exprs(edata), design)
head(data.fit$coefficients)
##            control    treat      batch
## 100_g_at  7.797102 8.130695 -0.2555991
## 1000_at   7.593515 7.814241  0.1087141
## 1002_f_at 4.612296 4.664025 -0.1417638
## 1003_s_at 6.091525 6.084982 -0.2208167
## 1004_at   5.848525 5.979026 -0.1795733
## 1005_at   8.166263 9.442745 -0.3113774
contrast.matrix <- makeContrasts(treat-control,levels=design)
data.fit.con <- contrasts.fit(data.fit,contrast.matrix)
data.fit.eb <- eBayes(data.fit.con)

Volcano plot to show the distribution of fold change and p value.

volcanoplot(data.fit.eb,highlight=10)

Then we sort the results by their absolute t-statistics.

top <- topTable(data.fit.eb, number = Inf)
head(top)
##              logFC  AveExpr         t      P.Value    adj.P.Val        B
## 37989_at -4.187867 5.058882 -21.61960 3.275846e-17 2.805762e-13 28.37765
## 38428_at -4.238118 5.624551 -17.23860 5.383859e-15 2.020411e-11 23.91131
## 40522_at  3.109750 8.149514  17.02754 7.076745e-15 2.020411e-11 23.66414
## 36629_at  2.923410 7.890901  16.27198 1.929437e-14 4.131408e-11 22.75146
## 41193_at -2.827488 6.189725 -15.87952 3.299633e-14 5.652271e-11 22.25952
## 1081_at  -3.472805 7.263600 -14.50306 2.373227e-13 3.387781e-10 20.43040

Check how many results can we get if we use a p value cutoff by 0.001 or an adjusted p value cutoff by 0.05.

table(top$adj.P.Val < 0.05)
## 
## FALSE  TRUE 
##  8117   448
table(top$P.Value < 0.001)
## 
## FALSE  TRUE 
##  8205   360

We would like to visualize the distribution of p value with a histogram.

hist(top$P.Value, col = brewer.pal(3, name = "Set2")[1], breaks = 100,
      main = "Control vs Treatment of DEX", xlab = "p-values")

Annotating genes

We need to annotate the gene names by the probe IDs.

suppressPackageStartupMessages({
    library(hgu95av2.db)
})
get_symbol <- function(probeid) {
    ans <- mapIds(hgu95av2.db::hgu95av2.db,
                  keys = probeid, keytype = "PROBEID", column = "SYMBOL", multiVals = "first")
    unname(ans)
}
get_genename <- function(probeid) {
    ans <- mapIds(hgu95av2.db::hgu95av2.db,
                  keys = probeid, keytype = "PROBEID", column = "GENENAME", multiVals = "first")
    unname(ans)
}
top$symbol <- get_symbol(rownames(top))
## 'select()' returned 1:many mapping between keys and columns
top$gene_name <- get_genename(rownames(top))
## 'select()' returned 1:many mapping between keys and columns

Generate a table of differentially expressed genes

We will use a cutoff of adjusted p value by 0.05.

selected <- top[, c("symbol", "gene_name", "logFC", "P.Value", "adj.P.Val")]
#top <- cbind(data.frame(probeID = rownames(top), stringsAsFactors = FALSE), top)
#rownames(top) <- NULL
selected <- selected[selected$adj.P.Val < 0.05,]
nrow(selected)
## [1] 448

We will only show the first 100 genes.

knitr::kable(selected[1:100,])
symbol gene_name logFC P.Value adj.P.Val
37989_at PTHLH parathyroid hormone like hormone -4.187867 0.0e+00 0.0000000
38428_at MMP1 matrix metallopeptidase 1 -4.238118 0.0e+00 0.0000000
40522_at GLUL glutamate-ammonia ligase 3.109750 0.0e+00 0.0000000
36629_at TSC22D3 TSC22 domain family member 3 2.923410 0.0e+00 0.0000000
41193_at DUSP6 dual specificity phosphatase 6 -2.827488 0.0e+00 0.0000000
1081_at ODC1 ornithine decarboxylase 1 -3.472805 0.0e+00 0.0000000
39528_at RRAD RRAD, Ras related glycolysis inhibitor and calcium channel regulator 2.497038 0.0e+00 0.0000000
37701_at RGS2 regulator of G protein signaling 2 2.807652 0.0e+00 0.0000000
39070_at FSCN1 fascin actin-bundling protein 1 -2.405557 0.0e+00 0.0000000
37324_at TFRC transferrin receptor 2.709999 0.0e+00 0.0000000
33272_at SAA1 serum amyloid A1 2.724206 0.0e+00 0.0000000
31859_at MMP9 matrix metallopeptidase 9 -2.271432 0.0e+00 0.0000000
1776_at RRAD RRAD, Ras related glycolysis inhibitor and calcium channel regulator 2.250750 0.0e+00 0.0000000
700_s_at MUC1 mucin 1, cell surface associated 2.965587 0.0e+00 0.0000000
39681_at ZBTB16 zinc finger and BTB domain containing 16 2.393121 0.0e+00 0.0000000
615_s_at PTHLH parathyroid hormone like hormone -2.038927 0.0e+00 0.0000000
37743_at FEZ1 fasciculation and elongation protein zeta 1 -2.043081 0.0e+00 0.0000000
34898_at AREG amphiregulin -2.033047 0.0e+00 0.0000000
37310_at PLAU plasminogen activator, urokinase -1.898265 0.0e+00 0.0000001
34721_at FKBP5 FK506 binding protein 5 2.159865 0.0e+00 0.0000001
39330_s_at ACTN1 actinin alpha 1 -1.959949 0.0e+00 0.0000001
35275_at CA12 carbonic anhydrase 12 1.924336 0.0e+00 0.0000001
36203_at ODC1 ornithine decarboxylase 1 -2.701456 0.0e+00 0.0000001
40202_at KLF9 Kruppel like factor 9 1.946240 0.0e+00 0.0000001
31792_at ANXA3 annexin A3 -2.475384 0.0e+00 0.0000001
1237_at IER3 immediate early response 3 -1.882725 0.0e+00 0.0000001
33058_at KRT75 keratin 75 -1.857103 0.0e+00 0.0000001
36454_at CA12 carbonic anhydrase 12 2.174531 0.0e+00 0.0000001
34091_s_at VIM vimentin -2.243330 0.0e+00 0.0000002
39329_at ACTN1 actinin alpha 1 -1.794896 0.0e+00 0.0000002
37430_at ALOX15B arachidonate 15-lipoxygenase, type B 1.822350 0.0e+00 0.0000002
34265_at SCG5 secretogranin V -1.784317 0.0e+00 0.0000003
38326_at G0S2 G0/G1 switch 2 -2.645540 0.0e+00 0.0000003
34378_at PLIN2 perilipin 2 1.691106 0.0e+00 0.0000003
38783_at MUC1 mucin 1, cell surface associated 2.071283 0.0e+00 0.0000003
38717_at METTL7A methyltransferase like 7A 1.721512 0.0e+00 0.0000005
33410_at ITGA6 integrin subunit alpha 6 -1.720180 0.0e+00 0.0000005
1549_s_at SERPINB4 serpin family B member 4 -1.706609 0.0e+00 0.0000007
40091_at BCL6 B cell CLL/lymphoma 6 2.030188 0.0e+00 0.0000008
39697_at HSD11B2 hydroxysteroid 11-beta dehydrogenase 2 1.926822 0.0e+00 0.0000008
32818_at TNC tenascin C -1.687841 0.0e+00 0.0000008
34281_at VSNL1 visinin like 1 -1.607313 0.0e+00 0.0000008
927_s_at MUC1 mucin 1, cell surface associated 2.001399 0.0e+00 0.0000009
1006_at MMP10 matrix metallopeptidase 10 -1.580954 0.0e+00 0.0000010
41246_at SERPINE2 serpin family E member 2 -1.839290 0.0e+00 0.0000010
41772_at MAOA monoamine oxidase A 1.820624 0.0e+00 0.0000010
34777_at ADM adrenomedullin 1.773652 0.0e+00 0.0000010
38784_g_at MUC1 mucin 1, cell surface associated 1.838938 0.0e+00 0.0000014
35959_at ZNF365 zinc finger protein 365 -1.954610 0.0e+00 0.0000016
36931_at TAGLN transgelin -1.565841 0.0e+00 0.0000019
32242_at CRYAB crystallin alpha B 1.624378 0.0e+00 0.0000019
1788_s_at DUSP4 dual specificity phosphatase 4 -1.510940 0.0e+00 0.0000020
32521_at SFRP1 secreted frizzled related protein 1 1.631606 0.0e+00 0.0000020
41771_g_at MAOA monoamine oxidase A 1.543015 0.0e+00 0.0000021
39950_at SMPDL3A sphingomyelin phosphodiesterase acid like 3A 1.610451 0.0e+00 0.0000029
32632_g_at GBA glucosylceramidase beta 1.761072 0.0e+00 0.0000030
35414_s_at JAG1 jagged 1 -1.462484 0.0e+00 0.0000030
33411_g_at ITGA6 integrin subunit alpha 6 -1.616264 0.0e+00 0.0000035
38411_at SORL1 sortilin related receptor 1 -1.465392 0.0e+00 0.0000038
33752_at IVNS1ABP influenza virus NS1A binding protein -1.470288 0.0e+00 0.0000041
32749_s_at FLNA filamin A -1.423586 0.0e+00 0.0000051
668_s_at MMP7 matrix metallopeptidase 7 1.824695 0.0e+00 0.0000051
1364_at PTPRZ1 protein tyrosine phosphatase, receptor type Z1 -1.553698 0.0e+00 0.0000057
41531_at TM4SF1 transmembrane 4 L six family member 1 -1.423341 1.0e-07 0.0000070
35280_at LAMC2 laminin subunit gamma 2 -1.535877 1.0e-07 0.0000070
40771_at MSN moesin -1.370099 1.0e-07 0.0000082
32570_at HPGD 15-hydroxyprostaglandin dehydrogenase 1.504577 1.0e-07 0.0000097
213_at ROR1 receptor tyrosine kinase like orphan receptor 1 1.360094 1.0e-07 0.0000112
40626_at NA NA 1.412192 1.0e-07 0.0000130
37635_at TCHH trichohyalin -1.342366 1.0e-07 0.0000149
38356_at FST follistatin -1.304834 2.0e-07 0.0000196
36671_at ASNS asparagine synthetase (glutamine-hydrolyzing) -1.293577 2.0e-07 0.0000217
151_s_at TUBB tubulin beta class I -1.284977 2.0e-07 0.0000234
33436_at SOX9 SRY-box 9 -1.275733 2.0e-07 0.0000252
35909_at PHLDA1 pleckstrin homology like domain family A member 1 -1.421403 3.0e-07 0.0000296
32243_g_at CRYAB crystallin alpha B 1.478674 3.0e-07 0.0000340
40631_at TOB1 transducer of ERBB2, 1 1.306595 4.0e-07 0.0000404
39087_at FXYD3 FXYD domain containing ion transport regulator 3 1.250711 4.0e-07 0.0000404
37157_at CALB2 calbindin 2 -1.256502 4.0e-07 0.0000420
36791_g_at TPM1 tropomyosin 1 -1.282536 5.0e-07 0.0000546
36790_at TPM1 tropomyosin 1 -1.372583 5.0e-07 0.0000559
39182_at EMP3 epithelial membrane protein 3 -1.204635 5.0e-07 0.0000567
39597_at ABLIM3 actin binding LIM protein family member 3 1.404302 6.0e-07 0.0000633
32203_at RBCK1 RANBP2-type and C3HC4-type zinc finger containing 1 -1.280468 7.0e-07 0.0000714
1667_s_at CYP4B1 cytochrome P450 family 4 subfamily B member 1 1.204252 8.0e-07 0.0000829
37225_at KANK1 KN motif and ankyrin repeat domains 1 1.227635 9.0e-07 0.0000867
1005_at DUSP1 dual specificity phosphatase 1 1.276482 9.0e-07 0.0000867
32140_at SORL1 sortilin related receptor 1 -1.172159 9.0e-07 0.0000884
574_s_at CASP1 caspase 1 -1.194880 1.0e-06 0.0000981
32382_at UPK1B uroplakin 1B 1.169892 1.0e-06 0.0000981
757_at ANXA2 annexin A2 -1.154000 1.1e-06 0.0000983
36638_at CTGF connective tissue growth factor -1.188337 1.1e-06 0.0000983
41770_at MAOA monoamine oxidase A 1.182567 1.1e-06 0.0000983
160031_at CDH3 cadherin 3 -1.160196 1.1e-06 0.0000983
35947_at TGM1 transglutaminase 1 1.188614 1.2e-06 0.0001047
647_at PROCR protein C receptor -1.175622 1.3e-06 0.0001191
36543_at F3 coagulation factor III, tissue factor -1.131893 1.4e-06 0.0001265
38630_at CERS6 ceramide synthase 6 1.208474 1.5e-06 0.0001332
31830_s_at SMTN smoothelin -1.182221 1.8e-06 0.0001558
38131_at PTGES prostaglandin E synthase 1.123511 1.9e-06 0.0001617

Gene clustering

The following is a heatmap of gene expression values of the differentially expressed genes.

library(pheatmap)
mat <- exprs(edata)[rownames(selected), ]
rownames(mat) <- selected$symbol
pData(edata)$sample_name <- rownames(pData(edata))
anno <- as.data.frame(pData(edata))[, c("group", "sample_name")]
pheatmap(mat, annotation_col = anno, show_rownames = FALSE,
         main = "Heatmap of gene expression levels")

We can roughly divide the genes (probes) into three clusters, and the control/treat samples are also clearly separated.

PCA plot

library(ggfortify)
autoplot(prcomp(exprs(edata)[rownames(selected), ]))

The PCA plot did not provide much useful information.

Pathway analysis

library(ReactomePA)
## 
## ReactomePA v1.24.0  For help: https://guangchuangyu.github.io/ReactomePA
## 
## If you use ReactomePA in published research, please cite:
## Guangchuang Yu, Qing-Yu He. ReactomePA: an R/Bioconductor package for reactome pathway analysis and visualization. Molecular BioSystems 2016, 12(2):477-479
entrez_ids <- mapIds(hgu95av2.db::hgu95av2.db, keys = rownames(selected),
                     keytype = "PROBEID", column = "ENTREZID", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
entrez_ids <- entrez_ids[!is.na(entrez_ids)]
reactome_enrich <- enrichPathway(gene = entrez_ids, organism = "human")
barplot(reactome_enrich, showCategory = Inf)

dotplot(reactome_enrich, showCategory = Inf)

ReactomePA::emapplot(reactome_enrich, showCategory = Inf)

Pathway analysis of up-regulated genes

up_regulated <- selected[selected$logFC > 0, ]
up_regulated <- up_regulated[rownames(up_regulated) %in% names(entrez_ids), ]
ReactomePA::emapplot(
    enrichPathway(gene = entrez_ids[rownames(up_regulated)], organism = "human"),
    showCategory = Inf)
## Using `nicely` as default layout

Pathway analysis of down-regulated genes

down_regulated <- selected[selected$logFC < 0, ]
down_regulated <- down_regulated[rownames(down_regulated) %in% names(entrez_ids), ]
ReactomePA::emapplot(
    enrichPathway(gene = entrez_ids[rownames(down_regulated)], organism = "human"),
    showCategory = Inf)