\section{Simulation of small cell number data}\label{smallCellNumber} \subsection{Preliminaries} <>= library("HD2013SGI") dir.create(file.path("result","Figures"),recursive=TRUE,showWarnings=FALSE) data("featuresPerWell", package="HD2013SGI") data("TargetAnnotation",package="HD2013SGI") data("QueryAnnotation",package="HD2013SGI") data("datamatrix", package="HD2013SGI") @ \subsection{Convert data from plate order to SGI-array} <>= NROW = 15 NCOL = 23 NFIELD = 4 @ % The simulation of a genetic interaction experiment with smaller cell numbers followed the scripts in Sections~\ref{sec:convert} and \ref{sec:main}. First, the plate barcodes were parsed. % <>= plates = featuresPerWell$Anno[seq(1,nrow(featuresPerWell$Anno), by=NFIELD*NCOL*NROW),"plate"] PlateAnnotation = HD2013SGI:::parsePlateBarcodes(plates) @ % The names of all query genes were extracted. % <>= S = which(PlateAnnotation$queryGroup=="sample") tdnames = unique(PlateAnnotation$targetDesign[S]) qnames = unique(PlateAnnotation$queryGene[S]) qdnames = unique(PlateAnnotation$queryDesign[S]) repnames = unique(PlateAnnotation$replicate[S]) @ % The data were converted from the screen plate layout to a multi-dimensional array. <>= D = array(0.0, dim=c(field=NFIELD,col=NCOL,row=NROW, features=dim(featuresPerWell$data)[2], targetDesign=length(tdnames), query=length(qnames),queryDesign=length(qdnames), replicate=length(repnames))) dimnames(D) = list(field=seq_len(NFIELD), col=seq_len(NCOL),row=LETTERS[seq_len(NROW)+1], features=dimnames(featuresPerWell$data)[[2]], targetDesign=tdnames, queryGene=qnames,queryDesign=qdnames,replicate=repnames) z=0 for (td in tdnames) { for (q in qnames) { for (qd in qdnames) { for (r in repnames) { plate = PlateAnnotation$plate[ which((PlateAnnotation$targetDesign == td) & (PlateAnnotation$queryGene == q) & (PlateAnnotation$queryDesign == qd) & (PlateAnnotation$replicate == r) ) ] z=z+1 I = which(featuresPerWell$Anno$plate == plate) D[,,,,td,q,qd,r] = as.vector(featuresPerWell$data[I,]) } } } } D[is.na(D)] = 0.0 @ The data were summarized by their mean value \begin{enumerate} \item considering only one imaging field per well, \item considering two imaging fields per well, and \item considering all four imaging fields per well. \end{enumerate} <>= Dsub1 = D[1,,,,,,,] Dsub2 = (D[1,,,,,,,] + D[2,,,,,,,])/2 D = (D[1,,,,,,,] + D[2,,,,,,,] + D[3,,,,,,,] + D[4,,,,,,,])/4 D = aperm(D,c(1,2,4,5,6,3,7)) dn = dimnames(D) dim(D) = c(prod(dim(D)[1:2]),dim(D)[3:7]) dimnames(D) = c(list(targetGene = sprintf("%s%d",rep(LETTERS[seq_len(NROW)+1],each=NCOL), rep(seq_len(NCOL),times=NROW))), dn[3:7]) Dsub1 = aperm(Dsub1,c(1,2,4,5,6,3,7)) dim(Dsub1) = c(prod(dim(Dsub1)[1:2]),dim(Dsub1)[3:7]) dimnames(Dsub1) = dimnames(D) Dsub2 = aperm(Dsub2,c(1,2,4,5,6,3,7)) dim(Dsub2) = c(prod(dim(Dsub2)[1:2]),dim(Dsub2)[3:7]) dimnames(Dsub2) = dimnames(D) datamatrixfullsub = list(Dsub1 = Dsub1, Dsub2 = Dsub2, Dsub4 = D) @ Get indices for different sets of experiments (samples, controls, ...). <>= IndexSAMPLE = which(TargetAnnotation$group == "sample") IndexCTRL = which(TargetAnnotation$group == "SingleKDctrl") IndexNEG = which(TargetAnnotation$group == "negctrl") IndexSAMPLENEG = c(IndexSAMPLE, IndexNEG) @ Set the color value for controls. <>= colCTRL = rep("gray", nrow(TargetAnnotation)) colCTRL[TargetAnnotation$group == "SingleKDctrl"] = "royalblue" colCTRL[TargetAnnotation$group == "negctrl"] = "red" @ \subsection{Transform features and screen normalization} A function for a generalized log transformation. <>= logtrafo <- function(x,c) { log2((x+sqrt(x^2+c^2))/2) } @ % Transform all features with a generalized log transformation. A 3 percent quantile was used as an additive constant in the glog transformation. The same constant was used for the \Rfunction{glog}-transformation of all three data sets. % <>= for (i in seq_len(dim(D)[5])) { m = quantile(D[,,,,i,],probs=0.03,na.rm=TRUE) D[,,,,i,] = logtrafo(D[,,,,i,],m) Dsub1[,,,,i,] = logtrafo(Dsub1[,,,,i,],m) Dsub2[,,,,i,] = logtrafo(Dsub2[,,,,i,],m) } @ % Normalize median per plate and feature to compensate for global differences between replicates and siRNA designs. % <>= normalize <- function(D) { M = apply(D,c(2:6),median,na.rm=TRUE) M2 = apply(M, c(2,4), mean) M2 = rep(M2, times=8) dim(M2) = dim(M)[c(2,4,1,3,5)] M2 = aperm(M2,c(3,1,4,2,5)) M = M - M2 M = rep(M[], each=dim(D)[1]) dim(M) = dim(D) D = D - M D } D <- normalize(D) Dsub1 <- normalize(Dsub1) Dsub2 <- normalize(Dsub2) @ % Subtract median and divide by median deviation per feature. % <>= for (i in 1:dim(D)[5]) { D[,,,,i,] = (D[,,,,i,] - median(D[,,,,i,],na.rm=TRUE)) / mad(D[,,,,i,],na.rm=TRUE) } for (i in 1:dim(Dsub1)[5]) { Dsub1[,,,,i,] = (Dsub1[,,,,i,] - median(Dsub1[,,,,i,],na.rm=TRUE)) / mad(Dsub1[,,,,i,],na.rm=TRUE) } for (i in 1:dim(Dsub2)[5]) { Dsub2[,,,,i,] = (Dsub2[,,,,i,] - median(Dsub2[,,,,i,],na.rm=TRUE)) / mad(Dsub2[,,,,i,],na.rm=TRUE) } @ \subsection{Quality control of features} The dimension of the data cube before quality control is: \Sexpr{paste(dim(D), collapse=" x ")} (targets x siRNA designs x queries x designs x features x replicates). Take the mean over all four siRNA design pairs and compute for each feature the Pearson correlation between first and second replicate. % <>= C = rep(NA_real_,dim(D)[5]) D2 = (D[,1,,1,,] + D[,2,,1,,] + D[,1,,2,,] + D[,2,,2,,]) / 4 D2sub1 = (Dsub1[,1,,1,,] + Dsub1[,2,,1,,] + Dsub1[,1,,2,,] + Dsub1[,2,,2,,]) / 4 D2sub2 = (Dsub2[,1,,1,,] + Dsub2[,2,,1,,] + Dsub2[,1,,2,,] + Dsub2[,2,,2,,]) / 4 for (i in 1:dim(D)[5]) { C[i] = cor(as.vector(D2[IndexSAMPLE,,i,1]),as.vector(D2[IndexSAMPLE,,i,2])) } @ % Select all features with a correlation of at least 0.6. The number of features passing quality control is \Sexpr{sum(C >= 0.6, na.rm=TRUE)} out of \Sexpr{dim(D)[5]}. The same features as in the main analysis were selected in all three simulated cases. % <>= I = which(C >= 0.6) D = D[,,,,I,,drop=FALSE] Dsub1 = Dsub1[,,,,I,,drop=FALSE] Dsub2 = Dsub2[,,,,I,,drop=FALSE] dim(D) @ % The dimension of the data cube is now: \Sexpr{paste(dim(D), collapse=" x ")} (targets x siRNA designs x queries x designs x features x replicates). \subsection{Quality control of siRNA designs} For each target siRNA the phenotypic profiles were summarized over two query siRNA and two replicates. % <>= D1 = (D[,,,1,,1] + D[,,,1,,2] + D[,,,2,,1] + D[,,,2,,2])/4 @ % For each target gene the Pearson correlation coefficient was computed for phenotypic profiles between the two siRNA designs separately for each feature. The mean of correlation coefficients over all features is reported for each target gene. <>= Cdesign1 = rep(NA_real_,dim(D)[1]) for (k in 1:dim(D)[5]) { for (i in 1:dim(D)[1]) { Cdesign1[i] = cor(as.vector(D1[i,1,,k]),as.vector(D1[i,2,,k])) } if ( k == 1) { Cdesign1all = Cdesign1 } else { Cdesign1all = Cdesign1all + Cdesign1 } } Cdesign1all = Cdesign1all / dim(D)[5] @ % All gene with a correlation of the phenotypic profiles of at least 0.7 are selected for further analysis. The number of target genes passing quality control is \Sexpr{sum(Cdesign1all[IndexSAMPLE] >= 0.7, na.rm=TRUE)} out of \Sexpr{length(IndexSAMPLE)}. Additionally, \Sexpr{length(IndexNEG)} negative control target siRNAs are selected. The same target genes as in the main analysis are selected for all three simulated cases. % <>= I = IndexSAMPLE[Cdesign1all[IndexSAMPLE] >= 0.7] I = c(I, IndexNEG) D = D[I,,,,,,drop=FALSE] Dsub1 = Dsub1[I,,,,,,drop=FALSE] Dsub2 = Dsub2[I,,,,,,drop=FALSE] TargetAnnotation = TargetAnnotation[I,] @ The dimension of the data cube is now: \Sexpr{paste(dim(D),collapse=" x ")} (targets x siRNA designs x queries x designs x features x replicates). \subsection{Selection of non-redundant features} <>= data(stabilitySelection, package="HD2013SGI") Sel = stabilitySelection$ratioPositive >= 0.5 @ % A set of \Sexpr{sum(Sel)} features was considered to contain non-redundant information. The criteria for feature selection are saved and the features are selected. The same features as in the main analysis were selected for all three simulated datasets. % <>= D = D[,,,,stabilitySelection$selected[Sel],,drop=FALSE] dimnames(D)[[1]] = TargetAnnotation$Symbol dimnames(D)[[3]] = QueryAnnotation$Symbol Dsub1 = Dsub1[,,,,stabilitySelection$selected[Sel],,drop=FALSE] Dsub2 = Dsub2[,,,,stabilitySelection$selected[Sel],,drop=FALSE] dimnames(Dsub1) = dimnames(D) dimnames(Dsub2) = dimnames(D) @ After selecting \Sexpr{sum(Sel)} features, the dimension of the data cube is now: \Sexpr{paste(dim(D),collapse=" x ")} (targets x siRNA designs x queries x designs x features x replicates). \subsection{Pairwise interaction scores} Pairwise interaction scores ($\pi$-scores) are estimated using a robust linear fit. The query main effects are lifted such that they equal to the mean of the single knock down measurements of the query genes (target siRNA is a scrambled sequence serving as negative control). <>= getInteractions <- function(D) { pimatrix = datamatrix pimatrix$D[] = NA_real_ mainEffects = list(target = D[,,1,,,], query = D[1,,,,,], overall = D[1,,1,,,], Anno = datamatrix$Anno) for (i in 1:2) { for (j in 1:2) { for (k in 1:dim(D)[5]) { for (l in 1:2) { MP = HD2013SGImaineffects(D[,i,,j,k,l], TargetNeg=which(TargetAnnotation$group == "negctrl")) pimatrix$D[,i,,j,k,l] = MP$pi mainEffects$target[,i,j,k,l] = MP$targetMainEffect mainEffects$query[i,,j,k,l] = MP$queryMainEffect mainEffects$overall[i,j,k,l] = MP$neg } } } } D = pimatrix$D PADJ = D[pimatrix$Anno$target$group == "sample",,,,,1] s = rep(NA_real_, dim(D)[5]) for (i in 1:dim(D)[5]) { Data = D[,,,,i,] Data = Data[pimatrix$Anno$target$group == "sample",,,,] d = dim(Data) dim(Data) = c(prod(d[1:4]),prod(d[5])) Data[abs(Data[,1]-Data[,2]) > 4*mad(Data[,1]-Data[,2],center=0.0),] = NA_real_ s[i] = median(apply(Data,1,sd), na.rm=TRUE) padj = rep(NA_real_, nrow(Data)) K = which(apply(!is.na(Data),1,all)) fit = eBayes(lmFit(Data[K,])) padj[K] = p.adjust(fit$p.value, method="BH") PADJ[,,,,i] = padj cat("i=",i," nr int (1%) = ",sum(padj <= 0.01,na.rm=TRUE)/nrow(Data), " nr int (3%) = ",sum(padj <= 0.03,na.rm=TRUE)/nrow(Data),"\n") } PI = pimatrix$D PI = PI[pimatrix$Anno$target$group == "sample",,,,,] Interactions = list(piscore = PI, scale = s, padj = PADJ, Anno = pimatrix$Anno) Interactions$Anno$target = Interactions$Anno$target[ pimatrix$Anno$target$group == "sample",] Interactions } InteractionsSub4 = getInteractions(D) InteractionsSub1 = getInteractions(Dsub1) InteractionsSub2 = getInteractions(Dsub2) @ The number of significantly interacting siRNA pairs is plotted. <>= field1 = sum(InteractionsSub1$padj <= 0.01,na.rm=TRUE) field2 = sum(InteractionsSub2$padj <= 0.01,na.rm=TRUE) field4 = sum(InteractionsSub4$padj <= 0.01,na.rm=TRUE) <>= pdf(file.path("result","Figures","subsamplingNrInteractions.pdf"), height=5,width=5) @ <>= plot(c(field1,field2,field4),type="b",pch=19, ylab="number of interactions",xlab="", xlim=c(0.5,3.5),ylim=c(0.0,field4), main="",xaxt="n",cex.lab=1.75,cex.axis=1.5) axis(side=1,at=1:3,labels=c("","","")) axis(side=1,at=1:3,labels=c("1775\ncells","3550\ncells","7100\ncells"), line=1.5,lwd=0,cex.axis=1.5) @ <>= dev.off() @ \begin{center} \includegraphics[width=0.5\textwidth]{result/Figures/subsamplingNrInteractions.pdf} \end{center}