# ================================================================================================================== # # An integrated program for variance estimators of the nucleotide substitution number for F81 model # # ================================================================================================================== # # # # F81:variance estimators of the nucleotide substitution number for F81 model. # # F81=function(seq1,seq2) # # (1) "seq1" and "seq2" are the read data of the two sequences, such as x1 and x2 belows. # # # # Run the codes: # # First you need to install the R package "seqinr". You can run the codes using either # # of the following two ways. # # (a)On the R window, open the file, press "ctrl+a", then press "ctrl+r" or # # (b)Save the F81.txt in D:\ and on the R window type: source("D:/F81.txt"). # # # # You need to load the data of the two sequences and set the name for them. # # Then you can use the CIK and CIH function to find the intervals for two sequences as follows. # # # # For example, # # # # > x1 <- read.fasta(file="D:/AY090666.fasta") # # > x2 <- read.fasta(file="D:/AY090687.fasta") # # # # > F81(x1,x2) # # The 4 variance estimators are # # V1= 0.000524515 , V2= 0.000526922 , V3= 1.298986e-07 , and V4= 0.0005243851 . # # # ================================================================================================================== # # ================================================================================================================== # library(seqinr) source("http://bioconductor.org/biocLite.R") biocLite("Biostrings") library(Biostrings) F81<-function(seq1,seq2){ s1<-toupper(c2s(seq1[[1]][1:length(seq1[[1]])])) s2<-toupper(c2s(seq2[[1]][1:length(seq2[[1]])])) #alignment of two sequences globalAlign <-pairwiseAlignment(s1, s2, substitutionMatrix = "BLOSUM100", gapOpening = -2 , gapExtension = -8,scoreOnly=FALSE) a1<-pattern(globalAlign) a2<-subject(globalAlign) b1<-as.character(a1) b2<-as.character(a2) SS1=substring(b1,1:nchar(b1),1:nchar(b1)) SS2=substring(b2,1:nchar(b2),1:nchar(b2)) #compute the different number of the two sequences XAA=0 ; XAT=0 ; XAC=0 ; XAG=0 ; XTA=0 ; XTT=0 ; XTC=0 ; XTG=0 ;XCA=0 ; XCT=0 ; XCC=0 ; XCG=0 ;XGA=0 ; XGT=0 ; XGC=0 ; XGG=0 ; X=matrix(0,4,4); len=0 ; piT=0 ; piC=0 ; piA=0 ; piG=0 ; S1=0 ; S2=0 ; V=0 ; piY=0 ; piR=0 ; P=0 ; M=0 ; var1=0 ; var2=0 ; var3=0 ; var4=0 ; for( i in 1 :length(SS1)) { if ( SS1[ i ] == "A" & SS2[ i ] == "A") XAA = XAA + 1 if ( SS1[ i ] == "A" & SS2[ i ] == "T") XAT = XAT + 1 if ( SS1[ i ] == "A" & SS2[ i ] == "C") XAC = XAC + 1 if ( SS1[ i ] == "A" & SS2[ i ] == "G") XAG = XAG + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "A") XTA = XTA + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "T") XTT = XTT + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "C") XTC = XTC + 1 if ( SS1[ i ] == "T" & SS2[ i ] == "G") XTG = XTG + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "A") XCA = XCA + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "T") XCT = XCT + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "C") XCC = XCC + 1 if ( SS1[ i ] == "C" & SS2[ i ] == "G") XCG = XCG + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "A") XGA = XGA + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "T") XGT = XGT + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "C") XGC = XGC + 1 if ( SS1[ i ] == "G" & SS2[ i ] == "G") XGG = XGG + 1 } X[1,1]=XTT;X[1,2]=XTC;X[1,3]=XTA;X[1,4]=XTG; X[2,1]=XCT;X[2,2]=XCC;X[2,3]=XCA;X[2,4]=XCG X[3,1]=XAT;X[3,2]=XAC;X[3,3]=XAA;X[3,4]=XAG X[4,1]=XGT;X[4,2]=XGC;X[4,3]=XGA;X[4,4]=XGG len=X[1,1]+X[1,2]+X[1,3]+X[1,4]+X[2,1]+X[2,2]+X[2,3]+X[2,4]+X[3,1]+X[3,2]+X[3,3]+X[3,4]+X[4,1]+X[4,2]+X[4,3]+X[4,4] #compute the parameters piT = (X[1,1]+X[1,2]+X[1,3]+X[1,4]+X[1,1]+X[2,1]+X[3,1]+X[4,1])/(2*len) piC = (X[2,1]+X[2,2]+X[2,3]+X[2,4]+X[1,2]+X[2,2]+X[3,2]+X[4,2])/(2*len) piA = (X[3,1]+X[3,2]+X[3,3]+X[3,4]+X[1,3]+X[2,3]+X[3,3]+X[4,3])/(2*len) piG = (X[4,1]+X[4,2]+X[4,3]+X[4,4]+X[1,4]+X[2,4]+X[3,4]+X[4,4])/(2*len) S1 = (X[1,2]+X[2,1])/len S2 = (X[3,4]+X[4,3])/len V = (X[1,3]+X[1,4]+X[2,3]+X[2,4]+X[3,1]+X[3,2]+X[4,1]+X[4,2])/len piY = piT+piC piR = piA+piG P = S1+S2+V M = piT*piC+piA*piG+piY*piR var1=P*(1-P)/(len*((1-(P/(2*M)))^2)) var2=(P*(1-P)/(len*((1-(P/(2*M)))^2)))+(P*(1-P)*(1-2*P)/(2*len^2*M*(1-(P/(2*M)))^3))+(P^2*(1-P)^2/(8*M^2*len^2*(1-(P/(2*M)))^4)) var3=P^2*(1-P)^2/(16*M^2*len^2*((1-(P/(2*M)))^4)) var4=(P*(1-P)/(len*((1-(P/(2*M)))^2)))-(P^2*(1-P)^2/(16*M^2*len^2*(1-(P/(2*M)))^4)) #show the results cat("The 4 variance estimators are \n V1=",var1,", V2=",var2,", V3=",var3,", and V4=",var4,".\n") }