# ================================================================================================================== #
#    An integrated program for variance estimators of the nucleotide substitution number for F84 model               # 
# ================================================================================================================== #
#                                                                                                                    #
# F84:variance estimators of the nucleotide substitution number for F84 model.                                       #
# F84=function(seq1,seq2)                                                                                            #
# (1) "seq1" and "seq2" are the read data of the two sequences, such as x1 and x2 belows.                            #
#                                                                                                                    #
# Run the codes:                                                                                                     #
#    First you need to install the R package "seqinr". You can run the codes using either                            #
#    of the following two ways.                                                                                      #
#    (a)On the R window, open the file, press "ctrl+a", then press "ctrl+r" or                                       #
#    (b)Save the F84.txt in D:\ and on the R window type: source("D:/F84.txt").                                      #
#                                                                                                                    #
#     You need to load the data of the two sequences and set the name for them.                                      #
#     Then you can use the CIK and CIH function to find the intervals for two sequences as follows.                  #
#                                                                                                                    #
# For example,                                                                                                       #
#                                                                                                                    #  
#  > x1 <- read.fasta(file="D:/AY090666.fasta")                                                                      #
#  > x2 <- read.fasta(file="D:/AY090687.fasta")                                                                      #
#                                                                                                                    #
#  > > F84(x1,x2)                                                                                                    #
#      The 4 variance estimators are                                                                                 #
#      V1= 0.0005245847 , V2= 0.000528068 , V3= 2.254411e-07 , and V4= 0.0005243593 .                                #                 
# ================================================================================================================== #

# ================================================================================================================== #

library(seqinr)
source("http://bioconductor.org/biocLite.R")
biocLite("Biostrings")
library(Biostrings)  

F84<-function(seq1,seq2){

s1<-toupper(c2s(seq1[[1]][1:length(seq1[[1]])]))
s2<-toupper(c2s(seq2[[1]][1:length(seq2[[1]])]))
#alignment of two sequences
globalAlign <-pairwiseAlignment(s1, s2, substitutionMatrix = "BLOSUM100", gapOpening = -2
                                       , gapExtension = -8,scoreOnly=FALSE)                
a1<-pattern(globalAlign)
a2<-subject(globalAlign)
b1<-as.character(a1)
b2<-as.character(a2)
SS1=substring(b1,1:nchar(b1),1:nchar(b1))
SS2=substring(b2,1:nchar(b2),1:nchar(b2))


#compute the different number of the two sequences
XAA=0 ; XAT=0 ; XAC=0 ; XAG=0 ; XTA=0 ; XTT=0 ; XTC=0 ; XTG=0 ;XCA=0 ; 
XCT=0 ; XCC=0 ; XCG=0 ;XGA=0 ; XGT=0 ; XGC=0 ; XGG=0 ; X=matrix(0,4,4);
len=0 ; piT=0 ; piC=0 ; piA=0 ; piG=0 ; S1=0 ; S2=0 ; S=0 ; V=0 ;
piY=0 ; piR=0 ; N1=0 ; N2=0 ; N3=0 ; 
M1=0 ; M2=0 ; M3=0 ; M4=0 ; M5=0 ; M6=0 ; M7=0 ; 
T1=0 ; T2=0 ; T3=0 ; T4=0 ; T5=0 ; T6=0 ; T7=0 ; T8=0 ; T9=0 ;
var1=0 ;  var2=0 ;  var3=0 ;  var4=0 ;

for( i in 1 :length(SS1)) {
  if ( SS1[ i ] == "A" & SS2[ i ] == "A")
      XAA = XAA + 1
  if ( SS1[ i ] == "A" & SS2[ i ] == "T")
      XAT = XAT + 1
  if ( SS1[ i ] == "A" & SS2[ i ] == "C")
      XAC = XAC + 1
  if ( SS1[ i ] == "A" & SS2[ i ] == "G")
      XAG = XAG + 1
 
  if ( SS1[ i ] == "T" & SS2[ i ] == "A")
      XTA = XTA + 1
  if ( SS1[ i ] == "T" & SS2[ i ] == "T")
      XTT = XTT + 1
  if ( SS1[ i ] == "T" & SS2[ i ] == "C")
      XTC = XTC + 1
  if ( SS1[ i ] == "T" & SS2[ i ] == "G")
      XTG = XTG + 1
  
  if ( SS1[ i ] == "C" & SS2[ i ] == "A")
      XCA = XCA + 1
  if ( SS1[ i ] == "C" & SS2[ i ] == "T")
      XCT = XCT + 1
  if ( SS1[ i ] == "C" & SS2[ i ] == "C")
      XCC = XCC + 1
  if ( SS1[ i ] == "C" & SS2[ i ] == "G")
      XCG = XCG + 1
 
  if ( SS1[ i ] == "G" & SS2[ i ] == "A")
      XGA = XGA + 1
  if ( SS1[ i ] == "G" & SS2[ i ] == "T")
      XGT = XGT + 1
  if ( SS1[ i ] == "G" & SS2[ i ] == "C")
      XGC = XGC + 1
  if ( SS1[ i ] == "G" & SS2[ i ] == "G")
      XGG = XGG + 1
}
X[1,1]=XTT;X[1,2]=XTC;X[1,3]=XTA;X[1,4]=XTG;
X[2,1]=XCT;X[2,2]=XCC;X[2,3]=XCA;X[2,4]=XCG
X[3,1]=XAT;X[3,2]=XAC;X[3,3]=XAA;X[3,4]=XAG
X[4,1]=XGT;X[4,2]=XGC;X[4,3]=XGA;X[4,4]=XGG
len=X[1,1]+X[1,2]+X[1,3]+X[1,4]+X[2,1]+X[2,2]+X[2,3]+X[2,4]+X[3,1]+X[3,2]+X[3,3]+X[3,4]+X[4,1]+X[4,2]+X[4,3]+X[4,4]

#compute the parameters
piT = (X[1,1]+X[1,2]+X[1,3]+X[1,4]+X[1,1]+X[2,1]+X[3,1]+X[4,1])/(2*len)
piC = (X[2,1]+X[2,2]+X[2,3]+X[2,4]+X[1,2]+X[2,2]+X[3,2]+X[4,2])/(2*len)
piA = (X[3,1]+X[3,2]+X[3,3]+X[3,4]+X[1,3]+X[2,3]+X[3,3]+X[4,3])/(2*len)
piG = (X[4,1]+X[4,2]+X[4,3]+X[4,4]+X[1,4]+X[2,4]+X[3,4]+X[4,4])/(2*len)
S1  = (X[1,2]+X[2,1])/len
S2  = (X[3,4]+X[4,3])/len
S   = S1+S2
V   = (X[1,3]+X[1,4]+X[2,3]+X[2,4]+X[3,1]+X[3,2]+X[4,1]+X[4,2])/len
piY = piT+piC
piR = piA+piG

N1 = (piT*piC/piY)+(piA*piG/piR)
N2 = piT*piC+piA*piG
N3 = piY*piR

M1 = 1/(len*(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3)))
M2 = ((N1-N2)/(N3*len))/(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3))-((N1-N2-N3)/(N3*len))/(1-V/(2*N3))
M3 = 1/(2*N1*len^2*(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3))^2)
M4 = ((N1-N2)^2/(2*N1*N3^2*len^2))/(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3))^2-((N1-N2-N3)/(2*N3^2*len^2))/(1-V/(2*N3))^2
M5 = ((N1-N2)/(2*N1*N3*len^2))/(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3))^2
M6 = -2*N1*log(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3))+2*(N1-N2-N3)*log(1-V/(2*N3))
M7 = M6+len*S*(1-S)/2*M3+len*V*(1-V)/2*M4-len*S*V*M5

T1 = len*S*(1-S)*(1-2*S)
T2 = len*V*(1-V)*(1-2*V)
T3 = len*S*V*(2*V-1)
T4 = len*S*V*(2*S-1)
T5 = 3*len*(len-2)*S^4-6*len*(len-2)*S^3+len*(3*len-7)*S^2+len*S
T6 = 3*len*(len-2)*V^4-6*len*(len-2)*V^3+len*(3*len-7)*V^2+len*V
T7 = 3*len^2*S*V^2*(V-1)+len*(-6*S*V^3+6*S*V^2+S*V)
T8 = 3*len^2*S^2*V*(S-1)+len*(-6*S^3*V+6*S^2*V+S*V)
T9 = len^2*S*V*(3*S*V-S-V+1)+len*S*V*(-6*S*V+2*S+2*V-1)

var1 = len*S*(1-S)*M1^2+len*V*(1-V)*M2^2-2*len*S*V*M1*M2
var2 = len*S*(1-S)*M1^2+len*V*(1-V)*M2^2+len^2*S^2*(1-S)^2*M3^2/4+len^2*V^2*(1-V)^2*M4^2/4-2*len*S*V*M1*M2+
       T1*M1*M3+T2*M2*M4+T3*(M1*M4+2*M2*M5)+T4*(M2*M3+2*M1*M5)+T5*M3^2/4+T6*M4^2/4+
       (T7+len^2*S*V^2*(1-V))*M4*M5+(T8+len^2*S^2*V*(1-S))*M3*M5+(T9-len^2*S^2*V^2)*M5^2+
       (T9-len^2*S*(1-S)*V*(1-V))*M3*M4/2
var3 = (M6-M7)^2
var4 = (M6-M7)^2+len*S*(1-S)*(M1^2+M3*M6-M3*M7)+len*V*(1-V)*(M2^2+M4*M6-M4*M7)-2*len*S*V*(M1*M2+M5*M6-M5*M7)


#show the results
cat("The 4 variance estimators are \n V1=",var1,", V2=",var2,", V3=",var3,", and V4=",var4,".\n")

}