Bioinformatics Course Board
   
  [DNA Microarray Bioinformatics] Bioconductor demo from the course
  Writer : Seyeon Weon     Date : 10-02     Hit : 8938    
  트랙백 주소 : http://www.bioinformatics.pe.kr/gnuboard/bbs/tb.php/course/50
# An R script to show how to do the following work
# with BioConductor

# 1. Read in data from an experiment set.
# 2. Examine the data and deside the type of normalzation
#    to be conducted.
# 3. Normalization
# 4. Examine the results.
# 5. Remove genes with too many missing values.
# 6. Fill in remaining missing values with KNN.
# 7. Do clustering

# Written by Seyeon Weon
# http://www.bioinformatics.pe.kr/sywon/email.html
# for DNA Microarray Bioinformatics Course
# http://www.bioinformatics.pe.kr/course/ARRAY/
# 2004. 5. 20


#######################################################################
# Getting Some Data to Work On
#######################################################################

# The data file and some part of the scripts were obtained from
# http://www.soe.ucsc.edu/classes/bme210/Winter04/labs/index.html

# Before you start, try to have at least basic understanding of
# how object oriented method works in R from the material provided
# in our course.

# Load marrayTools, which is a part of marray package.
# Bioconductor marray package is to preprocess two color
# spotted array experiments.
library(marrayTools)

# Make sure you understand the format of gal file.
galFile <- "Pae-printA.gal"

# This file describes the experiment set.
experimentsFile <- "Pae-arrays.txt"

# Read in the names of genes for each spot.
geneNames <- read.marrayInfo(fname=galFile, info.id=4:5, labels=5, skip=21)

# Check if it is done right.
geneNames[1:10,1]
# Or, simply by typing
geneNames

# Read in the layout of arrays.
arrayLayout <- read.marrayLayout(fname=galFile, pl.col=2, ctl.col=4, ngr=4, ngc=4, nsr=23, nsc=24, skip=21)

# Check if it is done right.
arrayLayout
slotNames(arrayLayout)
arrayLayout@maNgr
arrayLayout@maNsr

# Read in the experiment set.
experiments <- read.marrayInfo(fname=experimentsFile)

# Finally, read in the data from all 7 arrays of the experiment set.
rawData <- read.GenePix(layout=arrayLayout, targets=experiments, gnames=geneNames)

# Check if it is done right.
rawData
slotNames(rawData)
class(rawData@maRf)
dim(rawData@maRf)
rawData@maRf[1:10, 1:7]


#######################################################################
# Let's Diagnose the Data
#######################################################################

# It is necessary for many reasons.
# Above all, we need it to deside what type of normalization
# has to be conducted for our particular data.

# Load the microarray plotting library facilities
library(marrayPlots, verbose = FALSE)

# To produce a spatial image of background intensities
# for the Cy3 channel
# "tmp" is to receive the summary information.
tmp <- maImage(rawData[, 1], x = "maGb", bar = FALSE)
tmp <- maImage(rawData[, 1], x = "maRb", bar = FALSE)
tmp <- maImage(rawData[, 2], x = "maGb", bar = FALSE)
tmp <- maImage(rawData[, 2], x = "maRb", bar = FALSE)
tmp <- maImage(rawData[, 3], x = "maGb", bar = FALSE)
tmp <- maImage(rawData[, 3], x = "maRb", bar = FALSE)
tmp <- maImage(rawData[, 4], x = "maGb", bar = FALSE)
tmp <- maImage(rawData[, 4], x = "maRb", bar = FALSE)
tmp <- maImage(rawData[, 5], x = "maGb", bar = FALSE)
tmp <- maImage(rawData[, 5], x = "maRb", bar = FALSE)
tmp <- maImage(rawData[, 6], x = "maGb", bar = FALSE)
tmp <- maImage(rawData[, 6], x = "maRb", bar = FALSE)
tmp <- maImage(rawData[, 7], x = "maGb", bar = FALSE)
tmp <- maImage(rawData[, 7], x = "maRb", bar = FALSE)

# For foreground
tmp <- maImage(rawData[, 1], x = "maGf", bar = FALSE)
tmp <- maImage(rawData[, 1], x = "maRf", bar = FALSE)
tmp <- maImage(rawData[, 2], x = "maGf", bar = FALSE)
tmp <- maImage(rawData[, 2], x = "maRf", bar = FALSE)
tmp <- maImage(rawData[, 3], x = "maGf", bar = FALSE)
tmp <- maImage(rawData[, 3], x = "maRf", bar = FALSE)
tmp <- maImage(rawData[, 4], x = "maGf", bar = FALSE)
tmp <- maImage(rawData[, 4], x = "maRf", bar = FALSE)
tmp <- maImage(rawData[, 5], x = "maGf", bar = FALSE)
tmp <- maImage(rawData[, 5], x = "maRf", bar = FALSE)
tmp <- maImage(rawData[, 6], x = "maGf", bar = FALSE)
tmp <- maImage(rawData[, 6], x = "maRf", bar = FALSE)
tmp <- maImage(rawData[, 7], x = "maGf", bar = FALSE)
tmp <- maImage(rawData[, 7], x = "maRf", bar = FALSE)

# How do you think about the experiment set?

# To produce a spatial image of log ratios
tmp <- maImage(rawData[, 1], col = maPalette(low = "blue", high = "yellow"), bar = FALSE)
tmp <- maImage(rawData[, 2], col = maPalette(low = "blue", high = "yellow"), bar = FALSE)
tmp <- maImage(rawData[, 3], col = maPalette(low = "blue", high = "yellow"), bar = FALSE)
tmp <- maImage(rawData[, 4], col = maPalette(low = "blue", high = "yellow"), bar = FALSE)
tmp <- maImage(rawData[, 5], col = maPalette(low = "blue", high = "yellow"), bar = FALSE)
tmp <- maImage(rawData[, 6], col = maPalette(low = "blue", high = "yellow"), bar = FALSE)
tmp <- maImage(rawData[, 7], col = maPalette(low = "blue", high = "yellow"), bar = FALSE)

# To produce boxplots of log ratios by pin group
maBoxplot(rawData[, 1])
maBoxplot(rawData[, 2])
maBoxplot(rawData[, 3])
maBoxplot(rawData[, 4])
maBoxplot(rawData[, 5])
maBoxplot(rawData[, 6])
maBoxplot(rawData[, 7])

# To produce boxplots of log ratios by plate for the experiment set.
maPlate(rawData) <- maCompPlate(rawData, n = 384)
maBoxplot(rawData[, 1], x = "maPlate", names = NULL)
maBoxplot(rawData[, 2], x = "maPlate", names = NULL)
maBoxplot(rawData[, 3], x = "maPlate", names = NULL)
maBoxplot(rawData[, 4], x = "maPlate", names = NULL)
maBoxplot(rawData[, 5], x = "maPlate", names = NULL)
maBoxplot(rawData[, 6], x = "maPlate", names = NULL)
maBoxplot(rawData[, 7], x = "maPlate", names = NULL)

# Which of the 23 384-well plates behavior somewhat differently?
# What is your choice of normalization among, global, pin-group,
# and plate-group?


#######################################################################
# Normalize the Data Using Pin-Group Loess
#######################################################################

# Get all the (normalized) log-ratios and copy
# them into a matrix.
normData  <- maNorm(rawData)

# Check if it is done right.
normData
slotNames(normData)
dim(normData@maM)
class(normData@maM)
normData@maM[1:10, 1:7]
colnames(normData@maM[1:10, 1:7])

# Get the primary data we want to use in subsequent analysis
normM <- normData@maM

# Associate the gene names from the microarray
# with the matrix we just created.
row.names(normM) <- rawData@maGnames@maInfo[,1]

# Check if it is done right.
colnames(normM)
rownames(normM[1:10,])


#######################################################################
# Let's Examine the Normalized Data
#######################################################################

# Create a color palette where low values will be
# painted green and high values will be painted red.
rg.palette <- maPalette(low = "green", high = "red", k = 50)

# Make sure you understand what transpose of a matrix does.
image(t(normM), col=rg.palette)

# MA plots with pin-group normalization
oldpar <- par(mfrow=c(2,1))
maPlot(rawData[,1])
maPlot(normData[,1])
par(oldpar)

# Boxplots of log ratios for all arrays
maBoxplot(rawData[,1:7], names=1:7)
maBoxplot(normData[,1:7], names=1:7)

# Boxplots of log ratios for each arrays in pin-group
# By changing i, you can do plot for different arrays.
i = 1
maBoxplot(normData[,i])

# Boxplots of log ratios for each arrays in plate-group.
i = 1
maBoxplot(normData[,i], x="maPlate")

# What would you say about the normalization results?


#######################################################################
# Removing Genes with Too Many Missing Values
#######################################################################

# Define a filter function that checks genes
# with more than 20% of NAs at default.
# Install BioConductor "analyses" package which includes genefilter,
# if you have not done yet.
source("http://www.bioconductor.org/getBioC.R")
getBioC("analyses")

mmfilt <- function(limit=0.2) {
  function(x) {
    sum(is.na(x)) <= length(x) * limit
  }
}

# Let's make the limit to 30%.
mmfun <- mmfilt(limit=0.3)

# Make it a filterfun class to be used in genefilter.
ffun <- filterfun(mmfun)

# Now genes with too many NAs are removed.
goodGenes <- genefilter(normM, ffun)

# Check if it is done right.
sum(goodGenes)
length(goodGenes)
maNspots(rawData)


#######################################################################
# Guessing the Remaining NAs using KNN
#######################################################################

# install.packages("EMV")
library(EMV)
cleanedM <- knn(normM[goodGenes,], k=10)
cleanedM <- as.matrix(cleanedM$data)

# Check if it worked.
dim(cleanedM)
sum(is.na(cleanedM))
dim(normM[goodGenes,])
sum(is.na(normM[goodGenes,]))
dim(normM)
sum(is.na(normM))


#######################################################################
# Pearson Correlation Coefficient between Genes
#######################################################################

# You need a pretty good computer for this step.
# Otherwise, it will take forever to finish.
# You may need at least 2GB of RAM for prokaryote or yeast
# gene expression data.
# For human and other higher eukaryotes, you may need 4GB or more.
# Of course, the number of spots in an array and the number of arrays
# for the whole data set will determine how much memory you need.
# I also recommend you to buy a main board with dual channel
# memory access, since the memory access speed is most critical
# for performance.
# It is easy to find an inexpensive main board
# that can accommodate 4GB or 6GB.
# Be careful when choosing a main board with dual CPU.
# What is important for microarray data analysis
# is the size of RAM accessible by single CPU and single
# running process, not the total RAM on the main board.
# Also, dual CPU wouldn't help much for microarray data analysis,
# since it is not the type of work requiring multiple processes
# running at the same time.
# In conclusion, get as much RAM as possible for single CPU.
# One last comment on linux.  If your system has bigger than
# 4GB RAM, you need the big memory kernel.  You can easily find
# it from any ftp site for linux.

# Calculate the Pearson correlation coefficient between genes.
r <- cor(t(cleanedM))

# Convert Pearson correlation coefficient to distance.
d <- 1 - r
rm(r)

# Let's name the rows and columns according to the gene names.
goodGeneNames <- rawData@maGnames@maInfo[goodGenes,1]
length(goodGeneNames)
dimnames(d) <- list(as.vector(goodGeneNames), as.vector(goodGeneNames))

# See if we can recover some memory.
ls()

# Is there any object you can remove?
# You can save objects into your hard disk and reload them
# into memory again when they are needed.
save(normM, rawData, file = "savedobjects")
rm(normM, rawData)
load("savedobjects")


#######################################################################
# Pearson Correlation Coefficient between Samples
#######################################################################

# Calculate the Pearson correlation coefficient between samples.
r <- cor(cleanedM)

# Convert Pearson correlation coefficient to distance.
d <- 1 - r

# Give the rows and columns adequate names.
colnames(rawData@maTargets@maInfo)
rawData@maTargets@maInfo
rawData@maTargets@maInfo$file

# Convert part of the file names to the labels for experiments.
experimentNames <- substr(as.character(rawData@maTargets@maInfo$file), 5, 8)
dimnames(r) <- list(as.vector(experimentNames), as.vector(experimentNames))
d <- 1 - r


#######################################################################
# Hierarchical Clustering
#######################################################################

oldpar <- par(mfrow=c(2,2))

# Average linkage
hc1 <- hclust(as.dist(d), method = "average")

# Calculate cophenetic distance as a measure for
# how good the clustering is.
coph1 <- cor(cophenetic(hc1), as.dist(d))

# Plot it with proper labels.
plot(hc1, main = paste("Dendrogram for My Arrays: Coph = ", round(coph1, 2)), sub = paste("Average linkage, Pearson correlation, ", length(cleanedM[,1]), " genes"), xlab="Experiments")

# Cut the dendrogram into three groups.
cthc1 <- cutree(hc1, 3)
table(cthc1, experimentNames)
names(cthc1[cthc1==1])

# Single linkage
hc1 <- hclust(as.dist(d), method = "single")
coph1 <- cor(cophenetic(hc1), as.dist(d))
plot(hc1, main = paste("Dendrogram for My Arrays: Coph = ", round(coph1, 2)), sub = paste("Single linkage, Pearson correlation, ", length(cleanedM[,1]), " genes"), xlab="Experiments")

# Centroid linkage
hc1 <- hclust(as.dist(d), method = "centroid")
coph1 <- cor(cophenetic(hc1), as.dist(d))
plot(hc1, main = paste("Dendrogram for My Arrays: Coph = ", round(coph1, 2)), sub = paste("Centroid linkage, Pearson correlation, ", length(cleanedM[,1]), " genes"), xlab="Experiments")

# Complete linkage
hc1 <- hclust(as.dist(d), method = "complete")
coph1 <- cor(cophenetic(hc1), as.dist(d))
plot(hc1, main = paste("Dendrogram for My Arrays: Coph = ", round(coph1, 2)), sub = paste("Complete linkage, Pearson correlation, ", length(cleanedM[,1]), " genes"), xlab="Experiments")

par(oldpar)

# Which one seems to be the best?


#######################################################################
# k-Means Clustering
#######################################################################

# k = 2
k <- kmeans(as.dist(d), 2, 10)
plot(d, col = k$cluster, pch=15)
text(d, labels=experimentNames, adj=1)
points(k$centers, col = 1:2, pch = 8, )

# k = 3
k <- kmeans(as.dist(d), 3, 10)
plot(d, col = k$cluster, pch=15)
text(d, labels=experimentName, adj=1)
points(k$centers, col = 1:3, pch = 8)

# Extrating cluster 1
table(experimentName, k$cluster)
experimentName[k$cluster==1]

# k = 4
k <- kmeans(as.dist(d), 4, 10)
plot(d, col = k$cluster, pch=15)
text(d, labels=experimentName, adj=1)
points(k$centers, col = 1:4, pch = 8)

# k = 5
k <- kmeans(as.dist(d), 5, 10)
plot(d, col = k$cluster, pch=15)
text(d, labels=experimentName, adj=1)
points(k$centers, col = 1:5, pch = 8)

# k = 6
k <- kmeans(as.dist(d), 6, 10)
plot(d, col = k$cluster, pch=15)
text(d, labels=experimentName, adj=1)
points(k$centers, col = 1:6, pch = 8)

# Which k would be the best one?

# Well, this is about all we can do for our course.
# However, please explore more with R and BioConductor.