# This library implements a Bayesian t-test for microarray and other laboratory data # where the variance is primarily a function of the signal strength. # # The method is described in "A Bayesian two sample t-test for microarray data", # Fox and Dimmic (2006). # # Copyright (C) 2006 Richard J. Fox, Codexis, Inc. (richard.fox@codexis.com) # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # Main function that peforms a t-test mp <- function(y1,y2,m=8) { # y1 = matrix of first sample (rows=genes, cols=replicates). # y2 = matrix of second sample. # m = number of similarly expressed genes to use for calculating Bayesian variance and prior degrees of freedom. # The default value is currently 8 but the optimal value is likely to be a function of the particular data set. # Users are encouraged to experiment with calibration data sets to help set this value. y1 <- as.matrix(y1) y2 <- as.matrix(y2) y1m <- apply(y1,1,mean,na.rm=TRUE) y2m <- apply(y2,1,mean,na.rm=TRUE) y1m[is.nan(y1m)] <- NA y2m[is.nan(y2m)] <- NA df1 <- apply(!is.na(y1),1,sum)-1 df2 <- apply(!is.na(y2),1,sum)-1 df1 <- replace(df1, df1 < 0, 0) df2 <- replace(df2, df2 < 0, 0) ssq1 <- apply((y1-y1m)^2,1,sum,na.rm=TRUE) ssq2 <- apply((y2-y2m)^2,1,sum,na.rm=TRUE) n1 <- ncol(y1) n2 <- ncol(y2) rows <- nrow(y1) # Calculate similarly expressed sum of squared differences bayes <- sim.ssq(y1m,y2m,ssq1,ssq2,df1,df2,(m/2)) result <- NULL result$diff <- numeric() # Difference in sample means result$ss.bayes <- numeric() # Bayesian sum-squares result$df.bayes <- numeric() # Bayesian degrees of freedom result$var.bayes <- numeric() # Bayesian variance result$t <- numeric() # t-statistic result$p <- numeric() # p-values for the t-test for (i in 1:rows) { ss.bayes <- bayes$ss1[i] + bayes$ss2[i] df.bayes <- bayes$df1[i] + bayes$df2[i] samp1 <- y1[i,] samp2 <- y2[i,] var.bayes <- ss.bayes/df.bayes sd.bayes <- sqrt(var.bayes) diff <- mean(samp1)-mean(samp2) n1.true <- n1 - sum(is.na(samp1)) n2.true <- n2 - sum(is.na(samp2)) tstat <- diff/(sd.bayes*sqrt(1/n1.true+1/n2.true)) p.val <- 1 - pf(tstat^2, 1, df.bayes) result$diff[i] <- diff result$ss.bayes[i] <- ss.bayes result$df.bayes[i] <- df.bayes result$var.bayes[i] <- var.bayes result$t[i] <- tstat result$p[i] <- p.val } return (result) } # Supporting function to calculate sum of squared differences of similary expressed genes sim.ssq <- function(y1m,y2m,ssq1,ssq2,df1,df2,m) { o1 <- order(y1m) o2 <- order(y2m) ssq1sort <- ssq1[o1] ssq2sort <- ssq2[o2] df1sort <- df1[o1] df2sort <- df2[o2] result <- NULL result$ss1 <- numeric() result$ss2 <- numeric() result$df1 <- numeric() result$df2 <- numeric() rows <- length(ssq1) for (i in 1:rows) { lower <- i + ceiling(-m/2) upper <- i + ceiling(m/2) if (lower < 1) { lower <- 1 upper <- m + 1 } if (upper > rows) { lower <- rows - m upper <- rows } result$ss1[o1[i]] <- sum(ssq1sort[lower:upper],na.rm=TRUE) result$ss2[o2[i]] <- sum(ssq2sort[lower:upper],na.rm=TRUE) result$df1[o1[i]] <- sum(df1sort[lower:upper]) result$df2[o2[i]] <- sum(df2sort[lower:upper]) } return (result) }