]> git.donarmstrong.com Git - mothur.git/blob - metastats2.c
fixed makefile
[mothur.git] / metastats2.c
1
2 #include "metastats.h"
3
4 //The following code has been modified using the original Metastats program from White, J.R., Nagarajan, N. & Pop, M. Statistical methods for detecting differentially abundant features in clinical metagenomic samples. PLoS Comput Biol 5, e1000352 (2009).
5
6 int metastat_main (char* outputFileName, int numRows, int numCols, double threshold, int numPermutations, double** data, int secondGroupingStart){
7         
8         int size,c=0,i=0,j=0,counter=0, bflag=0; 
9         int B=numPermutations;
10         int row = numRows;
11         int col = numCols;
12         int g = secondGroupingStart;
13         double thresh=threshold;
14         double min=0; 
15         
16         char output[1024];
17         strcpy(output, outputFileName);
18         FILE *out;
19         
20         if (g>=col || g<=0){
21                 printf("Check your g value\n"); 
22         }
23         
24         // Initialize the matrices
25         size = row*col;
26         double matrix[row][col];
27         double pmatrix[size],permuted[size];  
28         double storage[row][9];
29         
30         for (i=0;i<row;i++){
31                 for (j =0;j<9;j++){
32                         storage[i][j]=0;                
33                 }       
34         }
35         
36         for(i=0; i<row; i++){
37                 for(j=0; j<col;j++){
38                         matrix[i][j]=data[i][j];
39                         pmatrix[c]=0; // initializing to zero
40                         permuted[c]=0;
41                         c++;
42                 }
43         }
44         
45         // Produces the sum of each column
46         double total[col],total1=0,total2=0;
47         double ratio[col];
48         
49         for(i=0;i<col;i++){
50                 total[i]=0;
51                 ratio[i]=0; }
52         
53         for(i=0; i<col; i++){
54                 for(j=0;j<row;j++){
55                         total[i]=total[i]+matrix[j][i];                 
56                 }
57         }
58         
59         for(i=0;i<g-1;i++){
60                 total1=total1+total[i];}
61         
62         for(i=g-1;i<col;i++){
63                 total2=total2+total[i];}
64         
65         
66         // Creates the ratios by first finding the minimum of totals
67         min = total[0];
68         if (col==2){
69                 if (total[0]<total[1]){
70                         min = total[1];}        
71         }
72         if (col >2){
73                 for(i=1;i<col;i++){
74                         if (min > total[i]){
75                                 min = total[i];}
76                 }
77         }
78         if (min<=0){
79                 printf("Error, the sum of one of the columns <= 0.");
80                 return 0;       
81         }
82         
83         
84         // Ratio time...
85         for(i=0;i<col;i++){
86                 ratio[i]=total[i]/min;
87         }
88         
89         //Change matrix into an array as received by R for compatibility.
90         
91         c=0;
92         for(i=0;i<col;i++){
93                 for(j=0;j<row;j++){
94                         pmatrix[c]=matrix[j][i];
95                         c++;
96                 }
97         }
98         
99         if(row == 1){
100                 for (i =0; i<col;i++){
101                         pmatrix[i]=pmatrix[i]/ratio[i];
102                 }
103         }
104         else {
105                 counter = 0;
106                 j=-1;
107                 for (i=0; i<size; i++) {
108                         if (counter % row == 0) {
109                                 j++;
110                         }
111                         pmatrix[i]=pmatrix[i]/ratio[j];
112                         counter++; 
113                 }   
114         }
115         // pass everything to the rest of the code using pointers. then 
116         // write to output file. below pointers for most of the values are 
117         // created to send everything by reference.
118         
119         int ptt_size, *permutes,*nc,*nr,*gvalue;
120         
121         nc = &col;
122         nr = &row;
123         gvalue = &g;
124         
125         permutes = &B;
126         ptt_size = B*row;
127         
128         //changing ptt_size to row
129         double permuted_ttests[row], pvalues[row], tinitial[row];
130         
131         for(i=0;i<row;i++){
132                 permuted_ttests[i]=0;}
133         
134         for(i=0;i<row;i++){
135                 pvalues[i]=0;
136                 tinitial[i]=0; }
137         
138         // Find the initial values for the matrix.
139         start(pmatrix,gvalue,nr,nc,tinitial,storage);
140         
141         // Start the calculations.
142         
143         if ( (col==2) || ((g-1)<8) || ((col-g+1) < 8) ){  
144                 
145                 double fish[row], fish2[row];
146                 for(i=0;i<row;i++){
147                         fish[i]=0;
148                         fish2[i]=0;}
149                 
150                 for(i=0;i<row;i++){
151                         
152                         for(j=0;j<g-1;j++){
153                                 fish[i]=fish[i]+matrix[i][j];
154                         }
155                         
156                         for(j=g-1;j<col;j++){ 
157                                 fish2[i]=fish2[i]+matrix[i][j];
158                         }
159                         
160                         double  f11,f12,f21,f22;
161                         
162                         f11=fish[i];
163                         f12=fish2[i];
164                         
165                         f21=total1-f11;
166                         f22=total2-f12;
167                         
168                         double data[] = {f11, f12, f21, f22};
169                         
170                         // CONTINGENGCY TABLE:
171                         //   f11   f12
172                         //   f21   f22
173                         
174                         int *nr, *nc, *ldtabl, *work;
175                         int nrow=2, ncol=2, ldtable=2, workspace=100000;
176                         double *expect, *prc, *emin,*prt,*pre;
177                         double e=0, prc1=0, emin1=0, prt1=0, pre1=0;
178                         
179                         nr = &nrow;
180                         nc = &ncol;
181                         ldtabl=&ldtable;
182                         work = &workspace;
183                         
184                         expect = &e;
185                         prc = &prc1;
186                         emin=&emin1;
187                         prt=&prt1;
188                         pre=&pre1;
189                         
190                         //MothurFisher fishtere;
191                         //double mothurFex = fishtere.fexact(f11, f12, f21, f22);
192                         
193                         fexact(nr,nc,data, ldtabl,expect,prc,emin,prt,pre,work);
194                         
195                         if (*pre>.999999999){
196                                 *pre=1;
197                         }
198                         
199                         //printf("feaxt = %f\t%f\t%f\t%f\t%f\t%f\n", *expect, *pre, f11, f12, f21, f22);
200                         storage[i][8] = *pre;
201                         pvalues[i]=*pre;
202                 }
203         }
204         else{
205                 
206                 testp(permuted_ttests, permutes, permuted,pmatrix, nc, nr, gvalue,tinitial,pvalues);
207                 
208                 // Checks to make sure the matrix isn't sparse.
209                 double sparse[row], sparse2[row];
210                 for(i=0;i<row;i++){
211                         sparse[i]=0;
212                         sparse2[i]=0;}
213                 
214                 c=0;    
215                 for(i=0;i<row;i++){
216                         
217                         for(j=0;j<g-1;j++){
218                                 sparse[i]=sparse[i]+matrix[i][j];
219                         }
220                         
221                         if(sparse[i] < (double)(g-1)){
222                                 c++;
223                         }
224                         for(j=g-1;j<col;j++){ // ?<= for col
225                                 sparse2[i]=sparse2[i]+matrix[i][j];
226                         }
227                         
228                         if( (sparse2[i] <(double)(col-g+1))) {
229                                 c++;
230                         }
231                         
232                         if (c==2){
233                                 c=0;
234                                 
235                                 double  f11,f12,f21,f22;
236                                 
237                                 f11=sparse[i];
238                                 sparse[i]=0;
239                                 
240                                 f12=sparse2[i];
241                                 sparse2[i]=0;
242                                 
243                                 f21=total1-f11;
244                                 f22=total2-f12;
245                                 
246                                 double data[] = {f11, f12, f21, f22};
247                                 
248                                 int *nr, *nc, *ldtabl, *work;
249                                 int nrow=2, ncol=2, ldtable=2, workspace=10000000; // I added two zeros for larger data sets
250                                 double *expect, *prc, *emin,*prt,*pre;
251                                 double e=0, prc1=0, emin1=0, prt1=0, pre1=0;
252                                 
253                                 nr = &nrow;
254                                 nc = &ncol;
255                                 ldtabl=&ldtable;
256                                 work = &workspace;
257                                 
258                                 expect = &e;
259                                 prc = &prc1;
260                                 emin=&emin1;
261                                 prt=&prt1;
262                                 pre=&pre1;
263                                 
264                                 fexact(nr,nc,data, ldtabl,expect,prc,emin,prt,pre,work);
265                                 
266                                 if (*pre>.999999999){
267                                         *pre=1;
268                                 }
269                                 storage[i][8] = *pre;
270                                 pvalues[i]=*pre;
271                         }
272                 }  
273                 // End of else statement
274                 bflag = 1;
275         }
276         
277         // Calculates the mean of counts (not normalized)
278         double temp[row][2];
279         
280         for(j=0;j<row;j++){
281                 for(i=0;i<2;i++){
282                         temp[j][i]=0;
283                 }
284         }
285         
286         for (j=0;j<row;j++){
287                 for (i=1; i<=(g-1); i++){
288                         temp[j][0]=temp[j][0]+matrix[j][i-1];
289                 }
290                 temp[j][0]= (double) temp[j][0]/(g-1);
291                 for(i=g;i<=col;i++){
292                         temp[j][1]=temp[j][1]+matrix[j][i-1];
293                 }
294                 temp[j][1]= (double) temp[j][1]/(col-g+1);
295         }
296         
297         for(i=0;i<row;i++){
298                 storage[i][3]=temp[i][0];
299                 storage[i][7]=temp[i][1];
300                 storage[i][8]=pvalues[i];
301         }
302         
303         // BACKUP checks
304         
305         for (i=0;i<row;i++){
306                 if(pvalues[i]<thresh){
307                         printf("Feature %d is significant, p = %.10lf \n",i+1,pvalues[i]);
308                 }       
309         }
310         
311         // And now we write the files to a text file.
312         struct tm *local;
313         time_t t;
314         t = time(NULL);
315         local = localtime(&t);
316         
317         out = fopen(output,"w");
318         
319         fprintf(out,"Local time and date of test: %s\n", asctime(local));
320         fprintf(out,"# rows = %d, # col = %d, g = %d\n\n",row,col,g);
321         if (bflag == 1){
322                 fprintf(out,"%d permutations\n\n",B);   
323         }
324         
325         //output column headings - not really sure... documentation labels 9 columns, there are 10 in the output file
326         //storage 0 = meanGroup1 - line 529, 1 = varGroup1 - line 532, 2 = err rate1 - line 534, 3 = mean of counts group1?? - line 291, 4 = meanGroup2 - line 536, 5 = varGroup2 - line 539, 6 = err rate2 - line 541, 7 = mean of counts group2?? - line 292, 8 = pvalues - line 293
327         fprintf(out, "OTU\tmean(group1)\tvariance(group1)\tstderr(group1)\tmean_of_counts(group1)\tmean(group2)\tvariance(group2)\tstderr(group2)\tmean_of_counts(group1)\tp-value\n");
328     
329         for(i=0; i<row; i++){
330                 fprintf(out,"%d",(i+1));
331                 
332                 for(j=0; j<9;j++){
333                         fprintf(out,"\t%.12lf",storage[i][j]);
334                 }
335                 fprintf(out,"\n");
336         }  
337         
338         fprintf(out,"\n \n");
339         
340         // fclose(jobj);
341         fclose(out);
342         
343         return 0;
344 }
345
346 void testp(double *permuted_ttests,int *B,double *permuted,
347                    double *Imatrix,int *nc,int *nr,int *g,double *Tinitial,double 
348                    *ps) {
349         
350         double Tvalues[*nr];
351         int a, b, n, j;
352         
353         a = *B;
354         b = *nr;
355         n = a*b;
356         
357         double counter[b];
358         
359         for(j=0;j<b;j++){
360                 counter[j]=0;
361         }    
362         
363         for (j=1; j<=*B; j++){
364                 permute_matrix(Imatrix,nc,nr,permuted,g,Tvalues,Tinitial,counter);
365                 // for(i=0;i<*nr;i++){
366                 //   permuted_ttests[k]=fabs(Tvalues[i]);
367                 //    k++;
368     }
369         
370         
371         for(j=0;j<*nr;j++){
372                 ps[j]=((counter[j]+1)/(double)(a+1));
373         }
374 }       
375
376 void permute_matrix(double *Imatrix,int *nc,int *nr,double *permuted,
377                                         int *g,double *trial_ts,double *Tinitial,double *counter1){
378         
379         int i=0,j=0,n=0,a=0,b=0,f=0,c=0,k=0;
380         
381         a = *nr; // number of rows
382         b = *nc;
383         n = a*b;
384         
385         int y[b];
386         
387         for (i=1; i<=*nc; i++){
388                 y[i-1] = i;
389         }
390         
391         permute_array(y, b); 
392         
393         for (i=0; i<*nc; i++){
394                 f = y[i]; //column number
395                 c=1;
396                 c*=(f-1);
397                 c*=a;
398                 if (f == 1){
399                         c = 0;
400                 } // starting value position in the Imatrix
401                 for(j=1; j<=*nr; j++){
402                         permuted[k] = Imatrix[c];
403                         c++;
404                         k++;
405                 }
406         }
407         
408         calc_twosample_ts(permuted,g,nc,nr,trial_ts,Tinitial,counter1);
409 }
410
411 void permute_array(int *array, int n) {
412         static int seeded = 0;
413         int i;
414         
415         if (! seeded) {
416                 seeded = 1;
417                 srand(time(NULL));
418         }
419         
420         for (i = 0; i < n; i++) {
421                 int selection = rand() % (n - i);
422                 int tmp = array[i + selection];
423                 array[i + selection] = array[i];
424                 array[i] = tmp;
425         }
426 }
427
428 void calc_twosample_ts(double *Pmatrix,int *g,int *nc,int *nr,
429                                            double *Ts,double *Tinitial,double *counter) {
430         int i,a;
431         a = *nr;
432         a*=4;
433         
434         double C1[*nr][3], C2[*nr][3], storage[a],tool[a];
435         double nrows,ncols,gvalue, xbardiff=0, denom=0;
436         
437         nrows = (double) *nr;
438         ncols = (double) *nc;
439         gvalue= (double) *g;
440         
441     meanvar(Pmatrix,g,nr,nc,storage);
442     for(i=0;i<=a-1;i++){
443                 tool[i]=storage[i];
444     }
445     for (i=0; i<*nr;i++){
446                 C1[i][0]=tool[i];
447                 C1[i][1]=tool[i+*nr+*nr];
448                 C1[i][2]=C1[i][1]/(gvalue-1);
449                 
450                 C2[i][0]=tool[i+*nr];
451                 C2[i][1]=tool[i+*nr+*nr+*nr]; // var group 2 
452                 C2[i][2]=C2[i][1]/(ncols-gvalue+1);
453     }
454     
455     for (i=0; i<*nr; i++){
456                 xbardiff = C1[i][0]-C2[i][0];
457                 denom = sqrt(C1[i][2]+C2[i][2]);
458                 Ts[i]=fabs(xbardiff/denom);
459                 if (fabs(Ts[i])>(fabs(Tinitial[i])+.0000000000001)){ //13th place
460                         counter[i]++;
461                 }
462     }   
463 }
464
465 void meanvar(double *pmatrix,int *g,int *nr,int *nc,double *store){
466         double temp[*nr], temp2[*nr],var[*nr],var2[*nr],a,b;
467         
468         int i,m,k,l,n;
469         
470         a = (double) *g-1;          
471         b = (double) (*nc-a);
472         
473         for (i = 0; i<*nr; i++){
474                 temp[i]=0;
475                 temp2[i]=0;
476                 var[i]=0;
477                 var2[i]=0;
478         }
479         
480         k = *nr; // number of rows 
481         l = *nc;
482         n = k*l;        
483         
484     m=0;
485     m=*g-1;
486     k=*nr;
487     m*=k; // m = g * nr now
488     for (i=0;i<m;i++){
489                 temp[i%k]=temp[i%k]+pmatrix[i];
490     }
491     for (i=0;i<n;i++){
492                 temp2[i%k]=temp2[i%k]+pmatrix[i];
493     }
494     for (i=0;i<*nr;i++){
495                 temp2[i]=temp2[i]-temp[i];
496     }
497     for (i=0;i<=*nr-1;i++){
498                 store[i]=temp[i]/a;
499                 store[i+*nr]=temp2[i]/b;
500     }
501     
502     // That completes the mean calculations.
503     
504     for (i=0;i<m;i++){
505                 var[i%k]=var[i%k]+pow((pmatrix[i]-store[i%k]),2);
506     }
507     for (i=m;i<n;i++){
508                 var2[i%k]=var2[i%k]+pow((pmatrix[i]-store[(i%k)+*nr]),2);
509     }
510     
511     for (i=0;i<=*nr-1;i++){
512                 store[i+2*k]=var[i]/(a-1);
513                 store[i+3*k]=var2[i]/(b-1);
514     }
515     // That completes var calculations.
516 }
517
518 void start(double *Imatrix,int *g,int *nr,int *nc,double *initial,
519                    double storage[][9]){
520         int i, a = *nr;
521         a*=4;
522         
523         double store[a], tool[a], C1[*nr][3], C2[*nr][3];
524         double nrows,ncols,gvalue, xbardiff=0, denom=0;
525         
526         nrows = (double) *nr;
527         ncols = (double) *nc;
528         gvalue= (double) *g;
529         
530         meanvar(Imatrix,g,nr,nc,store);
531         
532         for(i=0;i<=a-1;i++){
533                 tool[i]=store[i];
534         }
535         for (i=0; i<*nr;i++){
536                 C1[i][0]=tool[i]; //mean group 1
537                 storage[i][0]=C1[i][0];
538                 C1[i][1]=tool[i+*nr+*nr]; // var group 1
539                 storage[i][1]=C1[i][1];
540                 C1[i][2]=C1[i][1]/(gvalue-1);
541                 storage[i][2]=sqrt(C1[i][2]);
542                 
543                 C2[i][0]=tool[i+*nr]; // mean group 2
544                 storage[i][4]=C2[i][0];    
545                 C2[i][1]=tool[i+*nr+*nr+*nr]; // var group 2 
546                 storage[i][5]=C2[i][1];        
547                 C2[i][2]=C2[i][1]/(ncols-gvalue+1);
548                 storage[i][6]=sqrt(C2[i][2]);   
549         }
550         for (i=0; i<*nr; i++){
551                 xbardiff = C1[i][0]-C2[i][0];
552                 denom = sqrt(C1[i][2]+C2[i][2]);
553                 initial[i]=fabs(xbardiff/denom);
554         }                                                                                       
555 }
556
557
558
559