Bioinformatics Exercises
  R demo from the course II
  Writer : Seyeon Weon   Updated : 10-07   Hit : 4409   Updates 
# An R script to show how to do the following work
# without using BioConductor

# 1. Read in data from GenePix output
# 2. Normalization with loess
# 3. Plot the results

# (Steps 1 to 3 are in demo1A.R)

# 4. With multiple arrays processed using above procedure,
#    remove genes with too many NAs.
# 5. Fill in remaining missing values with KNN.
# 6. Filter up or down regulated genes
# 7. Do hierarchical clustering
# 8. Obtain gene list from the clusters

# Written by Seyeon Weon
# http://www.bioinformatics.pe.kr/sywon/email.html
# for DNA Microarray Bioinformatics Course
# http://www.bioinformatics.pe.kr/course/ARRAY/
# Updated 2005. 10. 5


#######################################################################
# Getting Some Data to Work On
#######################################################################

ysOrig <- read.delim("complete_dataset.txt", header=TRUE)

class(ysOrig)
dim(ysOrig)
colnames(ysOrig)
ysOrig[1:10,4:5]

# Heat shock experiment set 1 only
exprOrig <- as.matrix(ysOrig[4:11])
exprOrig[1:10,1:2]
dim(exprOrig)
plot(exprOrig[,1])
plot(exprOrig[,2])

# Set aside UID and gene names
UIDOrig <- ysOrig[,1]
geneNameOrig <- ysOrig[,2]

# Discard the original data to save memory space
rm(ysOrig)


#######################################################################
# Removing Rows with Too Many NAs
#######################################################################

too.many.na <- function(v, limit) {
  v <- as.vector(v)
  sum(is.na(v)) >= length(v) * limit
}

is.na.row <- function(m, limit=0.2) {
  m <- as.matrix(m)
  apply(m, 1, too.many.na, limit=limit)
}

# Genes with more than 20% NAs are removed.
dim(exprOrig)
sum(is.na(exprOrig[,1]))
sum(!is.na.row(exprOrig, limit=0.05))
sum(!is.na.row(exprOrig, limit=0.2))
sum(!is.na.row(exprOrig, limit=0.9))

naGenes <- is.na.row(exprOrig, limit=0.2)

exprNARemoved <- exprOrig[!naGenes, ]

dim(exprNARemoved)

# Set aside UID and gene names
UIDNARemoved <- UIDOrig[!naGenes]
geneNameNARemoved <- geneNameOrig[!naGenes]

length(UIDOrig)
length(UIDNARemoved)
rm(exprOrig)
rm(UIDOrig)
rm(geneNameOrig)


#######################################################################
# Recovering Missing Values by KNN
#######################################################################

# Ten closest genes by Euclidean distance are used to estimate
# missing values.
# install.packages("EMV")
library(EMV)
exprImputed <- knn(exprNARemoved, k=10)
exprImputed <- exprImputed$data

dim(exprImputed)
dim(exprNARemoved)
any(is.na(exprImputed))
any(is.na(exprNARemoved))
rm(exprNARemoved)

hist(exprImputed, breaks=30, freq=FALSE)


#######################################################################
# Computing Distances
#######################################################################

# Pearson correlation coefficient
distMat <- cor(t(exprImputed), method="pearson")

dim(distMat)
rm(distMat)

# Or, Euclidean distance using dist function
distMat <- dist(exprImputed)
distMat <- as.matrix(distMat)
distMat <- as.dist(distMat)

rm(distMat)


#######################################################################
# Gene Filtering
#######################################################################

hist(exprImputed[,1], breaks=30, freq=FALSE)
boxplot(data.frame(exprImputed))

updown <- apply(exprImputed, 1, function(v) {sum(v > 2 | v < -2)})
cutoff <- 0
sum(updown > cutoff)

expr <- exprImputed[updown > cutoff,]
UID <- UIDNARemoved[updown > cutoff]
geneName <- geneNameNARemoved[updown > cutoff]

length(UID)


#######################################################################
# Hierarchical Clustering
#######################################################################

# Euclidean distance and average link
# Clustering of genes
hcGenes <- hclust(dist(expr, method="euclidean"), method="average")
plot(hcGenes)

# Clustering of samples
hcSamples <- hclust(dist(t(expr), method="euclidean"), method="average")
plot(hcSamples)

# Pearson correlation distance and complete link
# Clustering of genes
hcGenes <- hclust(as.dist(1 - cor(t(expr), method="pearson")), method="complete")
plot(hcGenes)

# Clustering of samples with single link
hcSamples <- hclust(as.dist(1 - cor(expr, method="pearson")), method="single")
plot(hcSamples)

# Clustering of samples with centroid link
hcSamples <- hclust(as.dist(1 - cor(expr, method="pearson")), method="centroid")
plot(hcSamples)

# Clustering of samples with average link
hcSamples <- hclust(as.dist(1 - cor(expr, method="pearson")), method="average")
plot(hcSamples)

# Clustering of samples with complete link
hcSamples <- hclust(as.dist(1 - cor(expr, method="pearson")), method="complete")
plot(hcSamples)

# Representing in 2D
heatmap(expr, Rowv=as.dendrogram(hcGenes), Colv=as.dendrogram(hcSamples), col=heat.colors(32))


#######################################################################
# Obtaining Gene List from Clusters
#######################################################################

plot(hcGenes)
cutheight <- 0.3
abline(h = cutheight, col="red")
cluster <- cutree(hcGenes, h=cutheight)
table(cluster)
plot(table(cluster))

sortedCluster <- sort(table(cluster), decreasing = TRUE)
geneName[cluster == names(sortedCluster[1])]
UID[cluster == names(sortedCluster[1])]

Up