Bioinformatics Exercises
  R demo from the course I
  Writer : Seyeon Weon   Updated : 10-07   Hit : 5036   Updates 
# An R script to show how to do the following work
# without using BioConductor:

# 1. Read in data from GenePix output
# 2. Normalization with loess
# 3. Plot the results
# 4. With multiple arrays processed using above procedure,
#    remove genes with too many NAs.
# 5. Fill in remaining missing values with KNN.
# 6. Do hierarchical clustering

# Written by Seyeon Weon
# http://www.bioinformatics.pe.kr/sywon/email.html
# for DNA Microarray Bioinformatics Course
# http://www.bioinformatics.pe.kr/course/ARRAY/
# 2004. 5. 18
# Revised 2005. 6. 17.


#######################################################################
# Getting Some Data to Work On
#######################################################################
# Visit http://smd.stanford.edu
# Click "Publications" button in the menu bar.
# Set "Limit to:" to "Saccharomyces cerevisiae",
# and press Re-list button.
# Choose "Segal E, et al. (2003). Nat Genet 34(2):166-176",
# and press the "SMD" icon.
# Download exptsetno_1822.tar.gz by clicking the "Raw Data" icon.
# Also, look around the information about the data by clicking other places.
# Decompress the file.
# Move the data files into a directory of your choice.
# "cd" into the directory.


#######################################################################
# Reading a GenePix File
#######################################################################
# Let's choose 30319.xls file to work on.
# It is heat shock of usv1(=ypl230w=DBY10058) mutant vs. wt (DBY8778)
# Using vi or other editor, look inside the file.
# Be sure to turn off "line wrapping".
# How many comment lines are in the file?
# Which line is the header line for field names?
# How many spots?
# How many pins?
# How many rows and columns for each pin?
ys <- read.delim("30319.xls", header=TRUE, skip=25)

# Check for some attributes.
dim(ys)
class(ys)
mode(ys)
length(ys)
colnames(ys)

# Attach ys to name space.
attach(ys)

# Remember that underscore(_) can be entered by Ctrl-Q + underscore.
# You need R version 1.9.0 or above to use underscores.
# If you are using a previous version, you need to substitute underscores to dots.

# Draw histrogram for cy3 median.
hist(CH1I_MEDIAN, prob = TRUE, breaks=50)

# Draw histrogram for cy5 median.
hist(CH2I_MEDIAN, prob = TRUE, breaks=50)

# Draw histrogram for cy3 backgroud median.
hist(CH1B_MEDIAN, prob = TRUE, breaks=50)

# Draw histrogram for cy5 backgroud median.
hist(CH2B_MEDIAN, prob = TRUE, breaks=50)

# What would think about the quality of this microarray experiment?
# The foreground intensities (CH1I_MEDIAN and CH2I_MEDIAN) are approximately
# lognormal distributions and the background intensities (CH1B_MEDIAN and
# CH2B_MEDIAN) are approximately normal distributions.  Why is it so?

#######################################################################
# Optional Section
#######################################################################

# Let's compare with lognormal curve.
numGenes <- length(CH1I_MEDIAN)

dataRange <- range(CH1I_MEDIAN)
randomLogNormal <- dlnorm(dataRange[1]:dataRange[2],
                          meanlog = mean(log(CH1I_MEDIAN)),
                          sdlog = sd(log(CH1I_MEDIAN)))
hist(CH1I_MEDIAN, prob = TRUE, ylim = range(randomLogNormal), breaks=100)
lines(dataRange[1]:dataRange[2], randomLogNormal, col="red")

hist(CH1I_MEDIAN, breaks=500, xlim=c(0,10000), prob=TRUE)
lines(0:10000, randomLogNormal[1:10001], col="red")

rm(numGenes, dataRange, randomLogNormal)

# Let's create a function for above funtionality
PlotHistLnorm <- function(x, breaks=50, left=range(x)[1], right=range(x)[2]) {
  hist(x, breaks=breaks, prob=TRUE, xlim=c(left, right))
  r <- range(x)
  lx <- log(x)
  d <- dlnorm(seq(r[1], r[2], length=breaks), meanlog=mean(lx), sdlog=sd(lx))
  lines(seq(r[1], r[2], length=breaks), d, col="red")
}

PlotHistLnorm(CH2I_MEDIAN, breaks=1000)

#######################################################################
# End of Optional Section
#######################################################################


#######################################################################
# Background Subtraction and Log Transform
#######################################################################

# Subtract background
i3 <- CH1I_MEDIAN - CH1B_MEDIAN
i5 <- CH2I_MEDIAN - CH2B_MEDIAN

# Draw histrogram after background subtraction
hist(i3, prob = TRUE, breaks=50)
hist(i5, prob = TRUE, breaks=50)

# Plot a scatter plot

plot(i3, i5)

# With small dots
plot(i3, i5, pch=".")

# Log transform using base 2
l3 <- log2(i3)
l5 <- log2(i5)

# Check what has happened
length(i3)
length(l3)

sum(is.na(l3))
sum(is.infinite(l3))
sum(is.na(l5))
sum(is.infinite(l5))

plot(l3, l5)

# Could you notice the difference between before and after
# log transforamtion?

#######################################################################
# Optional Section
#######################################################################

# Compare with normal curve
PlotHistNorm <- function(x, breaks=50) {
  hist(x, breaks=breaks, prob=TRUE)
  r <- range(x)
  d <- dnorm(seq(r[1], r[2], length=breaks), mean=mean(x), sd=sd(x))
  lines(seq(r[1], r[2], length=breaks), d, col="red")
}

naRemoved <- l3[!is.na(l3) & !is.infinite(l3)]

PlotHistNorm(naRemoved, breaks=100)

# Q-Q plot
QqLnorm <- function(x) {
  r <- range(x)
  d <- dlnorm(seq(r[1], r[2]), meanlog=mean(x), sdlog=sd(x))
  qqplot(d, x)
}

QqLnorm(naRemoved)

rm(naRemoved)

#######################################################################
# End of Optional Section
#######################################################################


#######################################################################
# Calculate M and A
#######################################################################

# M means "minus" and A means "add".
# Since l5 and l3 are logarithms, M is the ratio between forground
# and background, and A is the average of them.

M <- l5 - l3
A <- (l5 + l3) / 2


#######################################################################
# Dealing with NAs
#######################################################################

# Dealing with NAs is always a nuisance in data analysis.
# Before you do normalization, you have to remove NAs temporarily
# and put it back after normalization.

#######################################################################
# Remove NAs Temporarily
#######################################################################

# Check for NAs and Infinites
NAsM <- is.na(M) | is.infinite(M)
NAsA <- is.na(A) | is.infinite(A)
# NA (or Infinite) +- Num = NA (or Infinite)
# and the reverse is the same.  Therefore,
sum(NAsM)
sum(NAsA)
which(NAsM)
which(NAsA)

# Create cleaned M and A
cleanedM <- M[!NAsM]
cleanedA <- A[!NAsA]
length(cleanedM)
length(M)
length(cleanedA)
length(A)
 
 
#######################################################################
# Normalization
#######################################################################

# Plot un-normalized data

plot(cleanedM ~ cleanedA)

# Loess
loessMA <- loess(cleanedM ~ cleanedA)
lMA <- predict(loessMA)

# Draw the loess line in red
lines(cleanedA[order(cleanedA)], lMA[order(cleanedA)], col="red")

# This is the normalized data
lM <- cleanedM - lMA
length(lM)

# Check if it is indeed normalized.
plot(lM ~ cleanedA)

# Below is to show two plots at once.
oldpar <- par(mfrow=c(2,1))
plot(lM ~ cleanedA)
plot(cleanedM ~ cleanedA)
par(oldpar)

# Can you do pin-group normalization for yourself?


#######################################################################
# Recovering the original length by including NAs again.
#######################################################################

finalM <- numeric(length(M))
finalM[!NAsM] <- lM
finalM[NAsM] <- NA
finalA <- numeric(length(A))
finalA[!NAsA] <- cleanedA
finalA[NAsA] <- NA

length(finalM)
class(finalM)
length(finalA)
class(finalA)
sum(NAsM)
sum(is.na(finalM))
sum(NAsM)
sum(is.na(finalA))

# In fact, the normalized data is included in 30319.xls,
# which is LOG_RAT2N_MEDIAN.
oldpar <- par(mfrow=c(2,1))
plot(finalM ~ finalA)
plot(LOG_RAT2N_MEDIAN ~ finalA)
par(oldpar)

# Now normalization for one chip is done.
# By repeating above process for each chip, we will get
# "complete_dataset.txt".
# Continued on demo1B.R.

Up