# ================================================================================================================== # # An integrated program for variance estimators of the nucleotide substitution number for TN93 model # # ================================================================================================================== # # # # TN93:variance estimators of the nucleotide substitution number for TN93 model. # # TN93=function(seq1,seq2) # # (1) "seq1" and "seq2" are the read data of the two sequences, such as x1 and x2 belows. # # # # Run the codes: # # First you need to install the R package "seqinr". You can run the codes using either # # of the following two ways. # # (a)On the R window, open the file, press "ctrl+a", then press "ctrl+r" or # # (b)Save the TN93.txt in D:\ and on the R window type: source("D:/TN93.txt"). # # # # You need to load the data of the two sequences and set the name for them. # # Then you can use the CIK and CIH function to find the intervals for two sequences as follows. # # # # For example, # # # # > x1 <- read.fasta(file="D:/AY090666.fasta") # # > x2 <- read.fasta(file="D:/AY090687.fasta") # # # # > TN93(x1,x2) # # The 4 variance estimators are # # V1= 0.0005821599 , V2= 0.0005914688 , V3= 1.291229e-06 , and V4= 0.0005808686 . # # ================================================================================================================== # # ================================================================================================================== # library(seqinr) source("http://bioconductor.org/biocLite.R") biocLite("Biostrings") library(Biostrings) TN93<-function(seq1,seq2){ s1<-toupper(c2s(seq1[[1]][1:length(seq1[[1]])])) s2<-toupper(c2s(seq2[[1]][1:length(seq2[[1]])])) #alignment of two sequences globalAlign <-pairwiseAlignment(s1, s2, substitutionMatrix = "BLOSUM100", gapOpening = -2 , gapExtension = -8,scoreOnly=FALSE) a1<-pattern(globalAlign) a2<-subject(globalAlign) b1<-as.character(a1) b2<-as.character(a2) SS1=substring(b1,1:nchar(b1),1:nchar(b1)) SS2=substring(b2,1:nchar(b2),1:nchar(b2)) #compute the different number of the two sequences XAA=0 ; XAT=0 ; XAC=0 ; XAG=0 ; XTA=0 ; XTT=0 ; XTC=0 ; XTG=0 ;XCA=0 ; XCT=0 ; XCC=0 ; XCG=0 ;XGA=0 ; XGT=0 ; XGC=0 ; XGG=0 ; X=matrix(0,4,4); len=0 ; piT=0 ; piC=0 ; piA=0 ; piG=0 ; S1=0 ; S2=0 ; S=0 ; V=0 ; piY=0 ; piR=0 ; var1=0 ; var2=0 ; var3=0 ; var4=0 ; T1=0 ; T2=0 ; T3=0 ; F1=0 ; F2=0 ; F3=0 ; K1=0 ; K2=0 ; K3=0 ; K11=0 ; K22=0 ; K33=0 ; K12=0 ; K13=0 ; K23=0 ; g_c=0 ; for( i in 1 :length(SS1)) { if ( SS1[ i ] == "A" & SS2[ i ] == "A") XAA = XAA + 1 if ( SS1[ i ] == "A" & SS2[ i ] == "T") XAT = XAT + 1 if ( SS1[ i ] == "A" & SS2[ i ] == "C") XAC = XAC + 1 if ( SS1[ i ] == "A" & SS2[ i ] == "G") XAG = XAG + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "A") XTA = XTA + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "T") XTT = XTT + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "C") XTC = XTC + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "G") XTG = XTG + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "A") XCA = XCA + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "T") XCT = XCT + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "C") XCC = XCC + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "G") XCG = XCG + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "A") XGA = XGA + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "T") XGT = XGT + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "C") XGC = XGC + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "G") XGG = XGG + 1 } X[1,1]=XTT;X[1,2]=XTC;X[1,3]=XTA;X[1,4]=XTG; X[2,1]=XCT;X[2,2]=XCC;X[2,3]=XCA;X[2,4]=XCG X[3,1]=XAT;X[3,2]=XAC;X[3,3]=XAA;X[3,4]=XAG X[4,1]=XGT;X[4,2]=XGC;X[4,3]=XGA;X[4,4]=XGG len=X[1,1]+X[1,2]+X[1,3]+X[1,4]+X[2,1]+X[2,2]+X[2,3]+X[2,4]+X[3,1]+X[3,2]+X[3,3]+X[3,4]+X[4,1]+X[4,2]+X[4,3]+X[4,4] piT = (X[1,1]+X[1,2]+X[1,3]+X[1,4]+X[1,1]+X[2,1]+X[3,1]+X[4,1])/(2*len) piC = (X[2,1]+X[2,2]+X[2,3]+X[2,4]+X[1,2]+X[2,2]+X[3,2]+X[4,2])/(2*len) piA = (X[3,1]+X[3,2]+X[3,3]+X[3,4]+X[1,3]+X[2,3]+X[3,3]+X[4,3])/(2*len) piG = (X[4,1]+X[4,2]+X[4,3]+X[4,4]+X[1,4]+X[2,4]+X[3,4]+X[4,4])/(2*len) S1 = (X[1,2]+X[2,1])/len S2 = (X[3,4]+X[4,3])/len S = S1+S2 V = (X[1,3]+X[1,4]+X[2,3]+X[2,4]+X[3,1]+X[3,2]+X[4,1]+X[4,2])/len piY = piT+piC piR = piA+piG F1 = piT*piC/piY F2 = piA*piG/piR F3 = piY*piR T1 = 1-(S1/(2*F1))-(V/(2*piY)) T2 = 1-(S2/(2*F2))-(V/(2*piR)) T3 = 1-(V/(2*F3)) K1 = 1/(len*T1) K2 = 1/(len*T2) K3 = F1/(piY*len*T1)+F2/(piR*len*T2)+(F3-F1*piR-F2*piY)/(F3*len*T3) K11 = 1/(2*F1*len^2*T1^2) K22 = 1/(2*F2*len^2*T2^2) K33 = F1/(2*piY^2*len^2*T1^2)+F2/(2*piR^2*len^2*T2^2)+(F3-F1*piR-F2*piY)/(F3^2*len^2*T3^2) K12 = 0 K13 = 1/(2*piY*len^2*T1^2) K23 = 1/(2*piR*len^2*T2^2) g_c = -len*S1*(1-S1)*K11/2-len*S2*(1-S2)*K22/2-len*V*(1-V)*K33/2+len*S1*S2*K12+len*S1*V*K13+len*S2*V*K23 var1 = len*S1*(1-S1)*K1^2+len*S2*(1-S2)*K2^2+len*V*(1-V)*K3^2- 2*len*S1*S2*K1*K2-2*len*S1*V*K1*K3-2*len*S2*V*K2*K3 var2 = len*S1*(1-S1)*K1^2+len*S2*(1-S2)*K2^2+len*V*(1-V)*K3^2+ len*S1*(1-S1)*(1-2*S1)*K1*K11+len*S2*(1-S2)*(1-2*S2)*K2*K22+len*V*(1-V)*(1-2*V)*K3*K33+ (3*len*(len-2)*S1^4-6*len*(len-2)*S1^3+len*(3*len-7)*S1^2+len*S1-len^2*S1^2*(1-S1)^2)*K11^2/4+ (3*len*(len-2)*S2^4-6*len*(len-2)*S2^3+len*(3*len-7)*S2^2+len*S2-len^2*S2^2*(1-S2)^2)*K22^2/4+ (3*len*(len-2)*V^4-6*len*(len-2)*V^3+len*(3*len-7)*V^2+len*V-len^2*V^2*(1-V)^2)*K33^2/4- 2*len*S1*S2*K1*K2-2*len*S1*V*K1*K3-2*len*S2*V*K2*K3+ ((len-2)*S1*S2*(3*len*S1*S2-len*S1-len*S2)+len*(len-1)*S1*S2-(len*S1*S2)^2)*K12^2+ ((len-2)*S1*V*(3*len*S1*V-len*S1-len*V)+len*(len-1)*S1*V-(len*S1*V)^2)*K13^2+ ((len-2)*S2*V*(3*len*S2*V-len*S2-len*V)+len*(len-1)*S2*V-(len*S2*V)^2)*K23^2+ ((len-2)*S1*S2*(3*len*S1*S2-len*S1-len*S2)+len*(len-1)*S1*S2-len*S1*(1-S1)*len*S2*(1-S2))*K11*K22/2+ ((len-2)*S1*V*(3*len*S1*V-len*S1-len*V)+len*(len-1)*S1*V-len*S1*(1-S1)*len*V*(1-V))*K11*K33/2+ ((len-2)*S2*V*(3*len*S2*V-len*S2-len*V)+len*(len-1)*S2*V-len*S2*(1-S2)*len*V*(1-V))*K22*K33/2+ len*S1*S2*(2*S2-1)*(K1*K22+2*K2*K12)+len*S1*V*(2*V-1)*(K1*K33+2*K3*K13)+ len*S1*S2*(2*S1-1)*(K2*K11+2*K1*K12)+len*S2*V*(2*V-1)*(K2*K33+2*K3*K23)+ len*S1*V*(2*S1-1)*(K3*K11+2*K1*K13)+len*S2*V*(2*S2-1)*(K3*K22+2*K2*K23)+ (3*len*(len-2)*S1^2*S2*(S1-1)+len*S1*S2+len^2*S1^2*(1-S1)*S2)*K11*K12+ (3*len*(len-2)*S1^2*V*(S1-1)+len*S1*V+len^2*S1^2*(1-S1)*V)*K11*K13+ (3*len*(len-2)*S2^2*S1*(S2-1)+len*S1*S2+len^2*S2^2*(1-S2)*S1)*K22*K12+ (3*len*(len-2)*S2^2*V*(S2-1)+len*S2*V+len^2*S2^2*(1-S2)*V)*K22*K23+ (3*len*(len-2)*V^2*S1*(V-1)+len*S1*V+len^2*V^2*(1-V)*S1)*K33*K13+ (3*len*(len-2)*V^2*S2*(V-1)+len*S2*V+len^2*V^2*(1-V)*S2)*K33*K23+ 2*len*S1*S2*V*(K1*K23+K2*K13+K3*K12)+ len*(len-2)*S1*S2*V*(3*S1-1)*(K11*K23+2*K12*K13)+ len*(len-2)*S1*S2*V*(3*S2-1)*(K22*K13+2*K12*K23)+ len*(len-2)*S1*S2*V*(3*V-1)*(K33*K12+2*K13*K23)+ len^2*S1*S2*V*(1-S1)*K11*K23+len^2*S1*S2*V*(1-S2)*K22*K13+len^2*S1*S2*V*(1-V)*K33*K12- 2*len^2*S1^2*S2*V*K12*K13-2*len^2*S1*S2^2*V*K12*K23-2*len^2*S1*S2*V^2*K13*K23 var3 = g_c^2 var4 = g_c^2+len*S1*(1-S1)*(K1^2+g_c*K11)+len*S2*(1-S2)*(K2^2+g_c*K22)+len*V*(1-V)*(K3^2+g_c*K33)- 2*len*S1*S2*(K1*K2+g_c*K12)-2*len*S1*V*(K1*K3+g_c*K13)-2*len*S2*V*(K2*K3+g_c*K23) #show the results cat("The 4 variance estimators are \n V1=",var1,", V2=",var2,", V3=",var3,", and V4=",var4,".\n") }