# ================================================================================================================== # # CISN-JC69: an integrated program for confidence intervals of the nucleotide substitution number for JC69 model # # ================================================================================================================== # # # # CIK:confidence interval of the substitution number for the constant substitution rate. # # CIK=function(alpha,c,d,seq1,seq2) # # (1) Choose a "alpha" value be the confidence level. Usually, "alpha" can be selected # # to be 0.95 or 0.9. # # (2) If we have empirical information K falling (a,b), "c" can be chosen to be a, and # # "d" can be chosen to be b. If we do not have any empirical information for K, # # "c" can be chosen to be 0, and "d" can be chosen to be any number greater than 1. # # (3) "seq1" and "seq2" are the read data of the two sequences, such as x1 and x2 belows. # # # # CIH:confidence interval of the substitution number for the variable substitution rate. # # CIH=function(alpha,aa,c,d,seq1,seq2) # # (1) Choose a "alpha" value be the confidence level. Usually, "alpha" can be selected # # to be 0.95 or 0.9. # # (2) Choose a "aa" value be the shape parameter of gamma distribution. # # (3) If we have empirical information H falling (a,b), "c" can be chosen to be a, and # # "d" can be chosen to be b. If we do not have any empirical information for H, # # "c" can be chosen to be 0, and "d" can be chosen to be any number greater than 1. # # (4) "seq1" and "seq2" are the read data of the two sequences, such as x1 and x2 belows. # # # # Run the codes: # # First you need to install the R package "seqinr". You can run the codes using either # # of the following tow ways. # # (a)On the R window, open the file, press "ctrl+a", then press "ctrl+r" or # # (b)Save the CISN-JC69.txt in D:\ and on the R window type: source("D:/CISN-JC69.txt"). # # # # You need to load the data of the two sequences and set the name for them. # # Then you can use the CIK and CIH function to find the intervals for two sequences as follows. # # # # For example, # # # # > x1 <- read.fasta(file="D:/AY090666.fasta") # # > x2 <- read.fasta(file="D:/AY090687.fasta") # # # # > CIK(0.95,0.1,0.3,x1,x2) # # The level 0.95 wald, score and adjusted confidence intervals # # for nucleotide substitution number under the constant substitution rate assumption # # and the empirical information ( 0.1 , 0.3 ) # # # # wald confidence interval ( 0.1 , 0.1813532 ) # # score confidence interval ( 0.1010138 , 0.185105 ) # # adjusted confidence interval ( 0.1 , 0.1877515 ) # # # # # # > CIH(0.95,1,0.1,0.3,x1,x2) # # The level 0.95 wald and adjusted confidence intervals # # for nucleotide substitution number under the variable substitution rate assumption # # and the empirical information ( 0.1 , 0.3 ) # # # # wald confidence interval ( 0.1 , 0.2035344 ) # # adjusted confidence interval ( 0.10193 , 0.2116486 ) # # # # ================================================================================================================== # # ================================================================================================================== # library(seqinr) source("http://bioconductor.org/biocLite.R") biocLite("Biostrings") library(Biostrings) CIK <- function(alpha,c,d,seq1,seq2){ s1<-toupper(c2s(seq1[[1]][1:length(seq1[[1]])])) s2<-toupper(c2s(seq2[[1]][1:length(seq2[[1]])])) #alignment of two sequences globalAlign <-pairwiseAlignment(s1, s2, substitutionMatrix = "BLOSUM100", gapOpening = -2 , gapExtension = -8,scoreOnly=FALSE) a1<-pattern(globalAlign) a2<-subject(globalAlign) b1<-as.character(a1) b2<-as.character(a2) S1=substring(b1,1:nchar(b1),1:nchar(b1)) S2=substring(b2,1:nchar(b2),1:nchar(b2)) #compute the different number of the two sequences n=length(S1) x=0 for(i in 1:length(S1)){ if(S1[i]=="-" | S2[i]=="-" | S1[i]=="N" | S2[i]=="N") n=n-1 if(S1[i]!="-" & S2[i]!="-" & S1[i]!="N" & S2[i]!="N" & S1[i]!=S2[i]) x=x+1 } y=-3/4*log(1-4/3*x/n) ph=x/n var1=(ph-ph^2)/(n*(1-4/3*ph)^2) z=qnorm(1-((1-alpha)/2)) #wald interval CL1=y-z*(var1)^(1/2) CU1=y+z*(var1)^(1/2) #score interval CL2=y+z^2/(2*n)-z*(4*y*n+z^2)^(1/2)/(2*n) CU2=y+z^2/(2*n)+z*(4*y*n+z^2)^(1/2)/(2*n) #adjusted interval n1=n+z^2 x1=x+z^2/2 my=-3/4*log(1-4/3*x1/n1) mph=x1/n1 var2=(mph-mph^2)/(n1*(1-4/3*mph)^2) CL3=my-z*(var2)^(1/2) CU3=my+z*(var2)^(1/2) #show the results cat("The level",alpha ,"wald, score and adjusted confidence intervals for nucleotide substitution number under the constant substitution rate assumption and the empirical information (",c,",",d,")","\n\n") if (CU1d){ cat("wald confidence interval ", " " ,"(", CL1 , "," , CU1 , ") or (", c , ",", d ,")","\n")} else{ cat("wald confidence interval ", " " ,"(", max(c,CL1) , "," , min(d,CU1) , ")","\n")} if (CU2d){ cat("score confidence interval ", " " ,"(", CL2 , "," , CU2 , ") or (", c , ",", d ,")","\n")} else{ cat("score confidence interval ", " " ,"(", max(c,CL2) , "," , min(d,CU2) , ")","\n")} if (CU3d){ cat("adjusted confidence interval ", " " ,"(", CL3 , "," , CU3 , ") or (", c , ",", d ,")","\n")} else{ cat("adjusted confidence interval ", " " ,"(", max(c,CL3) , "," , min(d,CU3) , ")","\n")} } # ================================================================================================================== # CIH<-function(alpha,aa,c,d,seq1,seq2){ s1<-toupper(c2s(seq1[[1]][1:length(seq1[[1]])])) s2<-toupper(c2s(seq2[[1]][1:length(seq2[[1]])])) #alignment of two sequences globalAlign <-pairwiseAlignment(s1, s2, substitutionMatrix = "BLOSUM100", gapOpening = -2, gapExtension = -8,scoreOnly=FALSE) a1<-pattern(globalAlign) a2<-subject(globalAlign) b1<-as.character(a1) b2<-as.character(a2) S1=substring(b1,1:nchar(b1),1:nchar(b1)) S2=substring(b2,1:nchar(b2),1:nchar(b2)) n=length(S1) x=0 for(i in 1:length(S1)){ if(S1[i]=="-" | S2[i]=="-" | S1[i]=="N" | S2[i]=="N") n=n-1 if(S1[i]!="-" & S2[i]!="-" & S1[i]!="N" & S2[i]!="N" & S1[i]!=S2[i]) x=x+1 } y=3/4*aa*((1-4/3*x/n)^(-1/aa)-1) ph=x/n var1=(ph-ph^2)/n*(1-4/3*ph)^(-2*(1/aa+1)) z=qnorm(1-((1-alpha)/2)) #wald CL1=y-z*(var1)^(1/2) CU1=y+z*(var1)^(1/2) #adjusted n1=n+z^2 x1=x+z^2/2 mph=x1/n1 my=3/4*aa*((1-4/3*mph)^(-1/aa)-1) var2=(mph-mph^2)/n1*(1-4/3*mph)^(-2*(1/aa+1)) CL3=my-z*(var2)^(1/2) CU3=my+z*(var2)^(1/2) #show the results cat("The level",alpha ,"wald and adjusted confidence intervals for nucleotide substitution number under the variable substitution rate assumption and the empirical information (",c,",",d,")","\n\n") if (CU1d){ cat("wald confidence interval ", " " ,"(", CL1 , "," , CU1 , ") or (", c , ",", d ,")","\n")} else{ cat("wald confidence interval ", " " ,"(", max(c,CL1) , "," , min(d,CU1) , ")","\n")} if (CU3d){ cat("adjusted confidence interval ", " " ,"(", CL3 , "," , CU3 , ") or (", c , ",", d ,")","\n")} else{ cat("adjusted confidence interval ", " " ,"(", max(c,CL3) , "," , min(d,CU3) , ")","\n")} }