# ================================================================================================================== #
#   CISN-JC69: an integrated program for confidence intervals of the nucleotide substitution number for JC69 model   # 
# ================================================================================================================== #
#                                                                                                                    #
# CIK:confidence interval of the substitution number for the constant substitution rate.                             #
# CIK=function(alpha,c,d,seq1,seq2)                                                                                  #
# (1) Choose a "alpha" value be the confidence level. Usually, "alpha" can be selected                               #
#     to be 0.95 or 0.9.                                                                                             #
# (2) If we have empirical information K falling (a,b), "c" can be chosen to be a, and                               #
#     "d" can be chosen to be b. If we do not have any empirical information for K,                                  #
#     "c" can be chosen to be 0, and "d" can be chosen to be any number greater than 1.                              #
# (3) "seq1" and "seq2" are the read data of the two sequences, such as x1 and x2 belows.                            #
#                                                                                                                    #
# CIH:confidence interval of the substitution number for the variable substitution rate.                             #
# CIH=function(alpha,aa,c,d,seq1,seq2)                                                                               #
# (1) Choose a "alpha" value be the confidence level. Usually, "alpha" can be selected                               #
#     to be 0.95 or 0.9.                                                                                             #
# (2) Choose a "aa" value be the shape parameter of gamma distribution.                                              # 
# (3) If we have empirical information H falling (a,b), "c" can be chosen to be a, and                               #
#     "d" can be chosen to be b. If we do not have any empirical information for H,                                  #
#     "c" can be chosen to be 0, and "d" can be chosen to be any number greater than 1.                              #
# (4) "seq1" and "seq2" are the read data of the two sequences, such as x1 and x2 belows.                            #
#                                                                                                                    #
# Run the codes:                                                                                                     #
#    First you need to install the R package "seqinr". You can run the codes using either                            #
#    of the following tow ways.                                                                                      #
#    (a)On the R window, open the file, press "ctrl+a", then press  "ctrl+r" or                                      #
#    (b)Save the CISN-JC69.txt in D:\ and on the R window type: source("D:/CISN-JC69.txt").                          #
#                                                                                                                    #
#     You need to load the data of the two sequences and set the name for them.                                      #
#     Then you can use the CIK and CIH function to find the intervals for two sequences as follows.                  #
#                                                                                                                    #
# For example,                                                                                                       #
#                                                                                                                    #  
#  > x1 <- read.fasta(file="D:/AY090666.fasta")                                                                      #
#  > x2 <- read.fasta(file="D:/AY090687.fasta")                                                                      #
#                                                                                                                    #
#  > CIK(0.95,0.1,0.3,x1,x2)                                                                                         #
#  The level 0.95 wald, score and adjusted confidence intervals                                                      #
#  for nucleotide substitution number under the constant substitution rate assumption                                #
#  and the empirical information ( 0.1 , 0.3 )                                                                       #
#                                                                                                                    #
#  wald confidence interval          ( 0.1 , 0.1813532 )                                                             #
#  score confidence interval         ( 0.1010138 , 0.185105 )                                                        # 
#  adjusted confidence interval      ( 0.1 , 0.1877515 )                                                             #
#                                                                                                                    #
#                                                                                                                    #
#  > CIH(0.95,1,0.1,0.3,x1,x2)                                                                                       #
#  The level 0.95 wald and adjusted confidence intervals                                                             #
#  for nucleotide substitution number under the variable substitution rate assumption                                #
#  and the empirical information ( 0.1 , 0.3 )                                                                       #
#                                                                                                                    #
#  wald confidence interval           ( 0.1 , 0.2035344 )                                                            #
#  adjusted confidence interval       ( 0.10193 , 0.2116486 )                                                        #
#                                                                                                                    #                      
# ================================================================================================================== #

# ================================================================================================================== #

library(seqinr)
source("http://bioconductor.org/biocLite.R")
biocLite("Biostrings")
library(Biostrings)  



CIK <- function(alpha,c,d,seq1,seq2){


s1<-toupper(c2s(seq1[[1]][1:length(seq1[[1]])]))
s2<-toupper(c2s(seq2[[1]][1:length(seq2[[1]])]))
#alignment of two sequences
globalAlign <-pairwiseAlignment(s1, s2, substitutionMatrix = "BLOSUM100", gapOpening = -2
                                       , gapExtension = -8,scoreOnly=FALSE)                
a1<-pattern(globalAlign)
a2<-subject(globalAlign)
b1<-as.character(a1)
b2<-as.character(a2)
S1=substring(b1,1:nchar(b1),1:nchar(b1))
S2=substring(b2,1:nchar(b2),1:nchar(b2))

#compute the different number of the two sequences
n=length(S1)
x=0
for(i in 1:length(S1)){
  if(S1[i]=="-" | S2[i]=="-" | S1[i]=="N" | S2[i]=="N")
      n=n-1
  if(S1[i]!="-" & S2[i]!="-" & S1[i]!="N" & S2[i]!="N" & S1[i]!=S2[i])
      x=x+1
}

y=-3/4*log(1-4/3*x/n)
ph=x/n
var1=(ph-ph^2)/(n*(1-4/3*ph)^2)
z=qnorm(1-((1-alpha)/2))

#wald interval
CL1=y-z*(var1)^(1/2)
CU1=y+z*(var1)^(1/2)


#score interval
CL2=y+z^2/(2*n)-z*(4*y*n+z^2)^(1/2)/(2*n)
CU2=y+z^2/(2*n)+z*(4*y*n+z^2)^(1/2)/(2*n)


#adjusted interval
n1=n+z^2
x1=x+z^2/2
my=-3/4*log(1-4/3*x1/n1)
mph=x1/n1
var2=(mph-mph^2)/(n1*(1-4/3*mph)^2)
CL3=my-z*(var2)^(1/2)
CU3=my+z*(var2)^(1/2)



#show the results
cat("The level",alpha ,"wald, score and adjusted confidence intervals 
for nucleotide substitution number under the constant substitution rate assumption
and the empirical information (",c,",",d,")","\n\n")
if (CU1<c | CL1>d){
cat("wald confidence interval        ", " " ,"(", CL1 , "," , CU1 , ") or (", c , ",", d ,")","\n")}
else{
cat("wald confidence interval        ", " " ,"(", max(c,CL1) , "," , min(d,CU1) , ")","\n")}
if (CU2<c | CL2>d){
cat("score confidence interval       ", " " ,"(", CL2 , "," , CU2 , ") or (", c , ",", d ,")","\n")}
else{
cat("score confidence interval       ", " " ,"(", max(c,CL2) , "," , min(d,CU2) , ")","\n")}
if (CU3<c | CL3>d){
cat("adjusted confidence interval    ", " " ,"(", CL3 , "," , CU3 , ") or (", c , ",", d ,")","\n")}
else{
cat("adjusted confidence interval    ", " " ,"(", max(c,CL3) , "," , min(d,CU3) , ")","\n")}
}


# ================================================================================================================== #

CIH<-function(alpha,aa,c,d,seq1,seq2){


s1<-toupper(c2s(seq1[[1]][1:length(seq1[[1]])]))
s2<-toupper(c2s(seq2[[1]][1:length(seq2[[1]])]))
#alignment of two sequences
globalAlign <-pairwiseAlignment(s1, s2, substitutionMatrix = "BLOSUM100", gapOpening = -2,
                                                       gapExtension = -8,scoreOnly=FALSE)
a1<-pattern(globalAlign)
a2<-subject(globalAlign)
b1<-as.character(a1)
b2<-as.character(a2)
S1=substring(b1,1:nchar(b1),1:nchar(b1))
S2=substring(b2,1:nchar(b2),1:nchar(b2))

n=length(S1)
x=0
for(i in 1:length(S1)){
  if(S1[i]=="-" | S2[i]=="-" | S1[i]=="N" | S2[i]=="N")
      n=n-1
  if(S1[i]!="-" & S2[i]!="-" & S1[i]!="N" & S2[i]!="N" & S1[i]!=S2[i])
      x=x+1
}

y=3/4*aa*((1-4/3*x/n)^(-1/aa)-1)
ph=x/n
var1=(ph-ph^2)/n*(1-4/3*ph)^(-2*(1/aa+1))
z=qnorm(1-((1-alpha)/2))

#wald
CL1=y-z*(var1)^(1/2)
CU1=y+z*(var1)^(1/2)


#adjusted
n1=n+z^2
x1=x+z^2/2
mph=x1/n1
my=3/4*aa*((1-4/3*mph)^(-1/aa)-1)
var2=(mph-mph^2)/n1*(1-4/3*mph)^(-2*(1/aa+1))
CL3=my-z*(var2)^(1/2)
CU3=my+z*(var2)^(1/2)



#show the results
cat("The level",alpha ,"wald and adjusted confidence intervals 
for nucleotide substitution number under the variable substitution rate assumption
and the empirical information (",c,",",d,")","\n\n")
if (CU1<c | CL1>d){
cat("wald confidence interval        ", " " ,"(", CL1 , "," , CU1 , ") or (", c , ",", d ,")","\n")}
else{
cat("wald confidence interval        ", " " ,"(", max(c,CL1) , "," , min(d,CU1) , ")","\n")}
if (CU3<c | CL3>d){
cat("adjusted confidence interval    ", " " ,"(", CL3 , "," , CU3 , ") or (", c , ",", d ,")","\n")}
else{
cat("adjusted confidence interval    ", " " ,"(", max(c,CL3) , "," , min(d,CU3) , ")","\n")}

}