2 Ziheng Yang, June 2004
\r
4 cc -O2 -DNEB -o PositiveSitesNEB PositiveSites.c -lm
\r
5 cc -O2 -DBEB -o PositiveSitesBEB PositiveSites.c -lm
\r
6 cc -O2 -DBranchSite -o PositiveSitesBS PositiveSites.c -lm
\r
8 cl -O2 -DNEB -FePositiveSitesNEB.exe PositiveSites.c
\r
9 cl -O2 -DBEB -FePositiveSitesBEB.exe PositiveSites.c
\r
10 cl -O2 -DBranchSite -FePositiveSitesBS.exe PositiveSites.c
\r
12 PositiveSitesBEB <#sites> <#repl>
\r
13 PositiveSitesBEB <#sites> <#repl> <Evolverf> <Codemlf>
\r
14 PositiveSitesBS <#sites> <#repl> <Evolverf> <Codemlf> <positive site classes>
\r
16 This compares siteID from evolverNSsites and mlc from codeml to calculate
\r
17 the accuracy, power, and false positive rate of codeml inference of sites
\r
18 under positive selection. The measures are defined as follows (Anisimova et
\r
19 al. 2002; Wong et al. 2004).
\r
23 evolver + N++ N+- N+.
\r
29 FalsePositive = N-+/N-.
\r
31 The program collects N++ (NmatchB & NmatchC), N+. (NEvolver), and
\r
32 N.+ (NCodemlB & NcodemlC), and then calculates the three measures as above.
\r
33 Note that codeml inference depends on cutoff P, hence the B (for binned) and
\r
34 C (for cumulative) difference. All proportions are calculated as the ratio
\r
35 of averages, taking the ratio after counting sites over replicate data sets.
\r
36 Output is on the screen.
\r
51 int model=0, NSsites=3; char *Codemlf="mlc", startCodeml[]="Bayes Empirical", startEvolver[]="replicate ";
\r
53 int model=0, NSsites=3; char *Codemlf="mlc", startCodeml[]="Naive Empirical", startEvolver[]="replicate ";
\r
54 #elif(defined BranchSite)
\r
55 int model=1, NSsites=3; char *Codemlf="mlc", startCodeml[]="Bayes Empirical", startEvolver[]="replicate ";
\r
56 int nPositiveClass, PositiveClass[1000];
\r
60 int main (int argc, char* argv[])
\r
62 int nbin=21, noisy=0, nr=1, ls=100;
\r
63 double PCut[]={.525, .55, .575, .6, .625, .65, .675, .7, .725, .75, .775, .8, .825, .85, .875, .9, .925, .95, .975, .99, 1}, PCut0=0.5;
\r
65 int nbin=11, noisy=0, nr=1, ls=100;
\r
66 double PCut[]={0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99, 1}, PCut0=0.5;
\r
68 int ir, ib, i,j,k, ch, nmatch0;
\r
69 double NmatchB[50], NCodemlB[50], NEvolver=0;
\r
70 double NmatchC[50], NCodemlC[50];
\r
71 int *siteEvolver, *siteCodeml, *ibin, nsiteEvolver, nsiteCodeml, lline=1000;
\r
72 int nPositiveClass, PositiveClass[100];
\r
73 char *Evolverf="siterates.txt", line[1001];
\r
74 double p, AccuracyB, AccuracyC, Power, FalsePositive;
\r
75 FILE *fEvolver, *fCodeml;
\r
78 puts("Usage:\n\tPositiveSitesBR <#sites> <#repl> <Evolverf> <Codemlf> <positive site classes>");
\r
80 puts("Usage:\n\tPositiveSitesBEB <#sites> <#repl>");
\r
81 puts("Usage:\n\tPositiveSitesBEB <#sites> <#repl> <Evolverf> <Codemlf>");
\r
84 if(argc<3) exit(-1);
\r
85 if(argc>1) sscanf(argv[1],"%d", &ls);
\r
86 if(argc>2) sscanf(argv[2],"%d", &nr);
\r
87 if(argc>3) Evolverf=argv[3];
\r
88 if(argc>4) Codemlf=argv[4];
\r
89 printf("%d sites, %d replicate.\n", ls,nr);
\r
90 printf("codeml file %s starts with '%s' for each replicate.\n", Codemlf, startCodeml);
\r
92 if(nr>20) { printf("Hit Enter to continue."); getchar(); }
\r
94 fEvolver=fopen(Evolverf,"r"); fCodeml=fopen(Codemlf,"r");
\r
95 if(!fEvolver || !fCodeml) { puts("file error"); exit(-1); }
\r
97 siteEvolver=(int*)malloc(ls*3*sizeof(int));
\r
98 if(siteEvolver==NULL) { puts("oom"); exit(-1); }
\r
99 siteCodeml=siteEvolver+ls; ibin=siteCodeml+ls;
\r
101 for(i=0; i<nbin; i++) {
\r
102 NmatchB[i]=NCodemlB[i]=0;
\r
103 NmatchC[i]=NCodemlC[i]=0;
\r
106 if(model && NSsites) { /* branch-site model */
\r
107 nPositiveClass = argc-5;
\r
108 for(i=0; i<nPositiveClass; i++)
\r
109 sscanf(argv[5+i], "%d", &PositiveClass[i]);
\r
110 printf("%d site classes are under positive selection: ", nPositiveClass);
\r
111 for(i=0; i<nPositiveClass; i++) printf(" %2d", PositiveClass[i]);
\r
114 for(ir=0; ir<nr; ir++) {
\r
115 /* Read true sites from evovler siteID */
\r
117 if(fgets(line, lline, fEvolver)==NULL) break;
\r
118 if(strstr(line, startEvolver)) break;
\r
121 if(NSsites && !model) { /* site models */
\r
122 if(!strchr(line,':')) { puts("did not find ':' in line."); exit(-1); }
\r
123 sscanf(strchr(line,':')+1, "%d", &nsiteEvolver);
\r
124 if(nsiteEvolver>ls) { puts("Too many sites. ls wrong?"); exit(-1); }
\r
125 for(i=0; i<nsiteEvolver; i++)
\r
126 fscanf(fEvolver, "%d", &siteEvolver[i]);
\r
128 else { /* branch-site models */
\r
129 for(i=0,nsiteEvolver=0; i<ls; i++) {
\r
130 fscanf(fEvolver, "%d", &k);
\r
131 for(j=0; j<nPositiveClass; j++)
\r
132 if(k==PositiveClass[j]) {
\r
133 siteEvolver[nsiteEvolver++]=i+1;
\r
139 NEvolver+=nsiteEvolver;
\r
141 printf("\n\n%d sites from Evolver:\n", nsiteEvolver);
\r
142 for(i=0;i<nsiteEvolver; i++) printf(" %3d", siteEvolver[i]);
\r
145 /* Read inferred sites and probs from codeml mlc or rst, bin probs */
\r
147 if(fgets(line, lline, fCodeml)==NULL) break;
\r
148 if(strstr(line, startCodeml)) break;
\r
150 for(i=0; i<3; i++) fgets(line, lline, fCodeml);
\r
151 for(i=nsiteCodeml=0; i<ls; i++,nsiteCodeml++) {
\r
152 if(fscanf(fCodeml, "%d %c%lf", &siteCodeml[i], &ch, &p)!=3) break;
\r
153 fgets(line, lline, fCodeml);
\r
154 for(j=0; j<nbin-1; j++) if(p<=PCut[j]) break;
\r
158 printf("\n%d sites from codeml at 50%%:\n", nsiteCodeml);
\r
159 for(i=0; i<nsiteCodeml; i++) printf("%4d", siteCodeml[i]);
\r
162 /* count matches by going through codeml sites */
\r
163 for(i=0,nmatch0=0; i<nsiteCodeml; i++) {
\r
166 for(j=0; j<=ib; j++) NCodemlC[j]++;
\r
167 for(j=0; j<nsiteEvolver; j++)
\r
168 if(siteCodeml[i]==siteEvolver[j]) break;
\r
169 if(j<nsiteEvolver) { /* a match */
\r
172 for(j=0; j<=ib; j++)
\r
177 printf("\nReplicate %3d: %3d evolver sites, %3d codeml sites at 50%%, %3d matches",
\r
178 ir+1,nsiteEvolver,nsiteCodeml,nmatch0);
\r
181 printf("\n\n%6s%22s%10s%17s%10s%10s\n\n",
\r
182 "P", "AccuracyBin", "Pcut", "AccuracyCum", "Power", "FalsePos");
\r
183 for(j=0; j<nbin; j++) {
\r
184 AccuracyB = (NmatchB[j] ? NmatchB[j]/NCodemlB[j] : 0);
\r
185 AccuracyC = (NmatchC[j] ? NmatchC[j]/NCodemlC[j] : 0);
\r
186 Power = (NmatchC[j] ? NmatchC[j]/NEvolver : 0);
\r
187 FalsePositive = NCodemlC[j]-NmatchC[j];
\r
188 if(FalsePositive) FalsePositive/=(ls*nr-NEvolver);
\r
190 p = (j==0 ? PCut0 : PCut[j-1]);
\r
191 printf("%5.3f - %5.3f: %7.3f (%5.0f) ", p, PCut[j], AccuracyB, NCodemlB[j]);
\r
192 printf( " >%4.3f: %7.3f (%5.0f) %7.3f %7.3f\n", p, AccuracyC, NCodemlC[j], Power, FalsePositive);
\r
194 printf( "\nTrue positive sites from evolver: %5.0f out of %5d total sites\n", NEvolver,ls*nr);
\r
195 fclose(fEvolver); fclose(fCodeml);
\r