# ================================================================================================================== # # An integrated program for variance estimators of the nucleotide substitution number for F84 model # # ================================================================================================================== # # # # F84:variance estimators of the nucleotide substitution number for F84 model. # # F84=function(seq1,seq2) # # (1) "seq1" and "seq2" are the read data of the two sequences, such as x1 and x2 belows. # # # # Run the codes: # # First you need to install the R package "seqinr". You can run the codes using either # # of the following two ways. # # (a)On the R window, open the file, press "ctrl+a", then press "ctrl+r" or # # (b)Save the F84.txt in D:\ and on the R window type: source("D:/F84.txt"). # # # # You need to load the data of the two sequences and set the name for them. # # Then you can use the CIK and CIH function to find the intervals for two sequences as follows. # # # # For example, # # # # > x1 <- read.fasta(file="D:/AY090666.fasta") # # > x2 <- read.fasta(file="D:/AY090687.fasta") # # # # > > F84(x1,x2) # # The 4 variance estimators are # # V1= 0.0005245847 , V2= 0.000528068 , V3= 2.254411e-07 , and V4= 0.0005243593 . # # ================================================================================================================== # # ================================================================================================================== # library(seqinr) source("http://bioconductor.org/biocLite.R") biocLite("Biostrings") library(Biostrings) F84<-function(seq1,seq2){ s1<-toupper(c2s(seq1[[1]][1:length(seq1[[1]])])) s2<-toupper(c2s(seq2[[1]][1:length(seq2[[1]])])) #alignment of two sequences globalAlign <-pairwiseAlignment(s1, s2, substitutionMatrix = "BLOSUM100", gapOpening = -2 , gapExtension = -8,scoreOnly=FALSE) a1<-pattern(globalAlign) a2<-subject(globalAlign) b1<-as.character(a1) b2<-as.character(a2) SS1=substring(b1,1:nchar(b1),1:nchar(b1)) SS2=substring(b2,1:nchar(b2),1:nchar(b2)) #compute the different number of the two sequences XAA=0 ; XAT=0 ; XAC=0 ; XAG=0 ; XTA=0 ; XTT=0 ; XTC=0 ; XTG=0 ;XCA=0 ; XCT=0 ; XCC=0 ; XCG=0 ;XGA=0 ; XGT=0 ; XGC=0 ; XGG=0 ; X=matrix(0,4,4); len=0 ; piT=0 ; piC=0 ; piA=0 ; piG=0 ; S1=0 ; S2=0 ; S=0 ; V=0 ; piY=0 ; piR=0 ; N1=0 ; N2=0 ; N3=0 ; M1=0 ; M2=0 ; M3=0 ; M4=0 ; M5=0 ; M6=0 ; M7=0 ; T1=0 ; T2=0 ; T3=0 ; T4=0 ; T5=0 ; T6=0 ; T7=0 ; T8=0 ; T9=0 ; var1=0 ; var2=0 ; var3=0 ; var4=0 ; for( i in 1 :length(SS1)) { if ( SS1[ i ] == "A" & SS2[ i ] == "A") XAA = XAA + 1 if ( SS1[ i ] == "A" & SS2[ i ] == "T") XAT = XAT + 1 if ( SS1[ i ] == "A" & SS2[ i ] == "C") XAC = XAC + 1 if ( SS1[ i ] == "A" & SS2[ i ] == "G") XAG = XAG + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "A") XTA = XTA + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "T") XTT = XTT + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "C") XTC = XTC + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "G") XTG = XTG + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "A") XCA = XCA + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "T") XCT = XCT + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "C") XCC = XCC + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "G") XCG = XCG + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "A") XGA = XGA + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "T") XGT = XGT + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "C") XGC = XGC + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "G") XGG = XGG + 1 } X[1,1]=XTT;X[1,2]=XTC;X[1,3]=XTA;X[1,4]=XTG; X[2,1]=XCT;X[2,2]=XCC;X[2,3]=XCA;X[2,4]=XCG X[3,1]=XAT;X[3,2]=XAC;X[3,3]=XAA;X[3,4]=XAG X[4,1]=XGT;X[4,2]=XGC;X[4,3]=XGA;X[4,4]=XGG len=X[1,1]+X[1,2]+X[1,3]+X[1,4]+X[2,1]+X[2,2]+X[2,3]+X[2,4]+X[3,1]+X[3,2]+X[3,3]+X[3,4]+X[4,1]+X[4,2]+X[4,3]+X[4,4] #compute the parameters piT = (X[1,1]+X[1,2]+X[1,3]+X[1,4]+X[1,1]+X[2,1]+X[3,1]+X[4,1])/(2*len) piC = (X[2,1]+X[2,2]+X[2,3]+X[2,4]+X[1,2]+X[2,2]+X[3,2]+X[4,2])/(2*len) piA = (X[3,1]+X[3,2]+X[3,3]+X[3,4]+X[1,3]+X[2,3]+X[3,3]+X[4,3])/(2*len) piG = (X[4,1]+X[4,2]+X[4,3]+X[4,4]+X[1,4]+X[2,4]+X[3,4]+X[4,4])/(2*len) S1 = (X[1,2]+X[2,1])/len S2 = (X[3,4]+X[4,3])/len S = S1+S2 V = (X[1,3]+X[1,4]+X[2,3]+X[2,4]+X[3,1]+X[3,2]+X[4,1]+X[4,2])/len piY = piT+piC piR = piA+piG N1 = (piT*piC/piY)+(piA*piG/piR) N2 = piT*piC+piA*piG N3 = piY*piR M1 = 1/(len*(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3))) M2 = ((N1-N2)/(N3*len))/(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3))-((N1-N2-N3)/(N3*len))/(1-V/(2*N3)) M3 = 1/(2*N1*len^2*(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3))^2) M4 = ((N1-N2)^2/(2*N1*N3^2*len^2))/(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3))^2-((N1-N2-N3)/(2*N3^2*len^2))/(1-V/(2*N3))^2 M5 = ((N1-N2)/(2*N1*N3*len^2))/(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3))^2 M6 = -2*N1*log(1-S/(2*N1)-(N1-N2)*V/(2*N1*N3))+2*(N1-N2-N3)*log(1-V/(2*N3)) M7 = M6+len*S*(1-S)/2*M3+len*V*(1-V)/2*M4-len*S*V*M5 T1 = len*S*(1-S)*(1-2*S) T2 = len*V*(1-V)*(1-2*V) T3 = len*S*V*(2*V-1) T4 = len*S*V*(2*S-1) T5 = 3*len*(len-2)*S^4-6*len*(len-2)*S^3+len*(3*len-7)*S^2+len*S T6 = 3*len*(len-2)*V^4-6*len*(len-2)*V^3+len*(3*len-7)*V^2+len*V T7 = 3*len^2*S*V^2*(V-1)+len*(-6*S*V^3+6*S*V^2+S*V) T8 = 3*len^2*S^2*V*(S-1)+len*(-6*S^3*V+6*S^2*V+S*V) T9 = len^2*S*V*(3*S*V-S-V+1)+len*S*V*(-6*S*V+2*S+2*V-1) var1 = len*S*(1-S)*M1^2+len*V*(1-V)*M2^2-2*len*S*V*M1*M2 var2 = len*S*(1-S)*M1^2+len*V*(1-V)*M2^2+len^2*S^2*(1-S)^2*M3^2/4+len^2*V^2*(1-V)^2*M4^2/4-2*len*S*V*M1*M2+ T1*M1*M3+T2*M2*M4+T3*(M1*M4+2*M2*M5)+T4*(M2*M3+2*M1*M5)+T5*M3^2/4+T6*M4^2/4+ (T7+len^2*S*V^2*(1-V))*M4*M5+(T8+len^2*S^2*V*(1-S))*M3*M5+(T9-len^2*S^2*V^2)*M5^2+ (T9-len^2*S*(1-S)*V*(1-V))*M3*M4/2 var3 = (M6-M7)^2 var4 = (M6-M7)^2+len*S*(1-S)*(M1^2+M3*M6-M3*M7)+len*V*(1-V)*(M2^2+M4*M6-M4*M7)-2*len*S*V*(M1*M2+M5*M6-M5*M7) #show the results cat("The 4 variance estimators are \n V1=",var1,", V2=",var2,", V3=",var3,", and V4=",var4,".\n") }