]> git.donarmstrong.com Git - mothur.git/blob - seqnoise.cpp
fixes while testing 1.33.0
[mothur.git] / seqnoise.cpp
1 /*
2  *  mySeqNoise.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 8/31/11.
6  *  Copyright 2011 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "seqnoise.h"
11 #include "sequence.hpp"
12 #include "listvector.hpp"
13 #include "inputdata.h"
14
15 #define MIN_DELTA 1.0e-6
16 #define MIN_ITER 20
17 #define MAX_ITER 1000
18 #define MIN_COUNT 0.1
19 #define MIN_TAU   1.0e-4
20 #define MIN_WEIGHT 0.1
21
22
23 /**************************************************************************************************/
24 int seqNoise::getSequenceData(string sequenceFileName, vector<string>& sequences){
25         try {
26                 
27                 ifstream sequenceFile;
28                 m->openInputFile(sequenceFileName, sequenceFile);
29                 
30                 while(!sequenceFile.eof()){
31                         
32                         if (m->control_pressed) { break; }
33                         
34                         Sequence temp(sequenceFile); m->gobble(sequenceFile);
35                         
36                         if (temp.getName() != "") {
37                                 sequences.push_back(temp.getAligned());
38                         }
39                 }
40                 sequenceFile.close();
41                 
42                 return 0;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "seqNoise", "getSequenceData");
46                 exit(1);
47         }
48 }
49 /**************************************************************************************************/
50 int seqNoise::addSeq(string seq, vector<string>& sequences){ 
51         try {
52                 sequences.push_back(seq);
53                 return 0;
54         }
55         catch(exception& e) {
56                 m->errorOut(e, "seqNoise", "addSeq");
57                 exit(1);
58         }
59 }       
60 /**************************************************************************************************/
61 //no checks for file mismatches
62 int seqNoise::getRedundantNames(string namesFileName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){
63         try {
64                 string unique, redundant;
65                 ifstream namesFile;
66                 m->openInputFile(namesFileName, namesFile);
67                 
68                 for(int i=0;i<redundantNames.size();i++){
69                         
70                         if (m->control_pressed) { break; }
71                         
72                         namesFile >> uniqueNames[i]; m->gobble(namesFile);
73                         namesFile >> redundantNames[i]; m->gobble(namesFile);
74                         
75                         seqFreq[i] = m->getNumNames(redundantNames[i]);
76                 }
77                 namesFile.close();
78                 
79                 return 0;
80         }
81         catch(exception& e) {
82                 m->errorOut(e, "seqNoise", "getRedundantNames");
83                 exit(1);
84         }
85 }
86 /**************************************************************************************************/
87 int seqNoise::addRedundantName(string uniqueName, string redundantName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){ 
88         try {
89                 
90                 uniqueNames.push_back(uniqueName);
91                 redundantNames.push_back(redundantName);
92                 seqFreq.push_back(m->getNumNames(redundantName));
93                 
94                 return 0;
95         }
96         catch(exception& e) {
97                 m->errorOut(e, "seqNoise", "addRedundantName");
98                 exit(1);
99         }
100 }       
101 /**************************************************************************************************/
102 int seqNoise::getDistanceData(string distFileName, vector<double>& distances){
103         try {
104                 
105                 ifstream distFile;
106                 m->openInputFile(distFileName, distFile);
107                 
108                 int numSeqs = 0;
109                 string name = "";
110                 
111                 distFile >> numSeqs;
112                 
113                 for(int i=0;i<numSeqs;i++){
114                         
115                         if (m->control_pressed) {  break; }
116                         
117                         distances[i * numSeqs + i] = 0.0000;
118                         
119                         distFile >> name;
120                         
121                         for(int j=0;j<i;j++){
122                                 distFile >> distances[i * numSeqs + j];
123                                 distances[j * numSeqs + i] = distances[i * numSeqs + j];
124                         }
125                 }
126                 
127                 distFile.close();       
128                 
129                 return 0;
130         }
131         catch(exception& e) {
132                 m->errorOut(e, "seqNoise", "getDistanceData");
133                 exit(1);
134         }
135 }
136
137 /**************************************************************************************************/
138 int seqNoise::getListData(string listFileName, double cutOff, vector<int>& otuData, vector<int>& otuFreq, vector<vector<int> >& otuBySeqLookUp){
139         try {
140                 
141                 ifstream listFile;
142                 m->openInputFile(listFileName, listFile);
143                 
144                 bool adjustCutoff = true;
145         string lastLabel = "";
146         
147                 while(!listFile.eof()){
148             
149             ListVector list(listFile); m->gobble(listFile); //10/18/13 - change to reading with listvector to accomodate changes to the listfiel format. ie. adding header labels.
150             
151             string thisLabel = list.getLabel();
152             lastLabel = thisLabel;
153             
154             if (thisLabel == "unique") {} //skip to next label in listfile
155             else {
156                 double threshold;
157                 m->mothurConvert(thisLabel, threshold);
158                 
159                 if(threshold < cutOff){} //skip to next label in listfile
160                 else{
161                     adjustCutoff = false;
162                     int numOTUs = list.getNumBins();
163                     otuFreq.resize(numOTUs, 0);
164                     
165                     for(int i=0;i<numOTUs;i++){
166                         
167                         if (m->control_pressed) { return 0; }
168                         
169                         string otu = list.get(i);
170                         int count = 0;
171                         string number = "";
172                         
173                         for(int j=0;j<otu.size();j++){
174                             if(otu[j] != ','){
175                                 number += otu[j];
176                             }
177                             else{
178                                 int index = atoi(number.c_str());
179                                 otuData[index] = i;
180                                 count++;
181                                 number = "";
182                             }
183                         }
184                         
185                         int index = atoi(number.c_str());
186                         otuData[index] = i;
187                         count++;
188                         
189                         otuFreq[i] = count;
190                     }
191                     
192                     otuBySeqLookUp.resize(numOTUs);
193                     
194                     int numSeqs = otuData.size();
195                     
196                     for(int i=0;i<numSeqs;i++){
197                         if (m->control_pressed) { return 0; }
198                         otuBySeqLookUp[otuData[i]].push_back(i);
199                     }
200                     for(int i=0;i<numOTUs;i++){
201                         if (m->control_pressed) { return 0; }
202                         for(int j=otuBySeqLookUp[i].size();j<numSeqs;j++){
203                             otuBySeqLookUp[i].push_back(0);
204                         }
205                     }
206                     
207                     break;
208                 }
209             }
210                 }
211                 
212                 listFile.close();
213                 
214                 //the listfile does not contain a threshold greater than the cutoff so use highest value
215                 if (adjustCutoff) {
216             
217             InputData input(listFileName, "list");
218             ListVector* list = input.getListVector(lastLabel);
219             
220             int numOTUs = list->getNumBins();
221             otuFreq.resize(numOTUs, 0);
222                         
223                         for(int i=0;i<numOTUs;i++){
224                                 
225                                 if (m->control_pressed) { return 0; }
226                                 
227                                 string otu = list->get(i);
228                                 
229                                 int count = 0;
230                                 string number = "";
231                                 
232                                 for(int j=0;j<otu.size();j++){
233                                         if(otu[j] != ','){
234                                                 number += otu[j];
235                                         }
236                                         else{
237                                                 int index = atoi(number.c_str());
238                                                 otuData[index] = i;
239                                                 count++;
240                                                 number = "";
241                                         }
242                                 }
243                                 
244                                 int index = atoi(number.c_str());
245                                 otuData[index] = i;
246                                 count++;
247                                 
248                                 otuFreq[i] = count;
249                         }
250                         
251                         otuBySeqLookUp.resize(numOTUs);
252                         
253                         int numSeqs = otuData.size();
254                         
255                         for(int i=0;i<numSeqs;i++){
256                                 if (m->control_pressed) { return 0; }
257                                 otuBySeqLookUp[otuData[i]].push_back(i);
258                         }
259                         for(int i=0;i<numOTUs;i++){
260                                 if (m->control_pressed) { return 0; }
261                                 for(int j=otuBySeqLookUp[i].size();j<numSeqs;j++){
262                                         otuBySeqLookUp[i].push_back(0);
263                                 }
264                         }
265                         
266             delete list;
267                 }
268                 
269                 return 0;
270         }
271         catch(exception& e) {
272                 m->errorOut(e, "seqNoise", "getListData");
273                 exit(1);
274         }
275 }
276
277 /**************************************************************************************************/
278 int seqNoise::updateOTUCountData(vector<int> otuFreq,
279                                                                  vector<vector<int> > otuBySeqLookUp,
280                                                                  vector<vector<int> > aanI,
281                                                                  vector<int>& anP,
282                                                                  vector<int>& anI,
283                                                                  vector<int>& cumCount
284                                                                  ){
285         try {
286                 int numOTUs = otuFreq.size();
287                 
288                 int count = 0;
289                 
290                 for(int i=0;i<numOTUs;i++){
291                         cumCount[i] = count;
292                         
293                         if (m->control_pressed) { return 0; }
294                         
295                         for(int j=0;j<otuFreq[i];j++){
296                                 anP[count] = otuBySeqLookUp[i][j];
297                                 anI[count] = aanI[i][j];
298                                 
299                                 count++;
300                         }
301                 }
302                 
303                 return 0;
304         }
305         catch(exception& e) {
306                 m->errorOut(e, "seqNoise", "updateOTUCountData");
307                 exit(1);
308         }
309 }
310 /**************************************************************************************************/
311 double seqNoise::calcNewWeights(
312                                           vector<double>& weights,      //
313                                           vector<int> seqFreq,          //
314                                           vector<int> anI,                      //
315                                           vector<int> cumCount,         //
316                                           vector<int> anP,                      //
317                                           vector<int> otuFreq,          //
318                                           vector<double> tau            //
319                                           ){
320         try {
321                 
322                 int numOTUs = weights.size();
323                 double maxChange = -1;
324                 
325                 cout.flush();
326                 
327                 for(int i=0;i<numOTUs;i++){
328                         
329                         if (m->control_pressed) { return 0; }
330                         
331                         double change = weights[i];
332                         
333                         weights[i] = 0.0000;
334                         
335                         for(int j=0;j<otuFreq[i];j++){
336                                 
337                                 int index1 = cumCount[i] + j;
338                                 int index2 = anI[index1];
339                                 
340                                 double currentTau = tau[anP[index1]];
341                                 double freq = double(seqFreq[index2]);
342                                 
343                                 weights[i] += currentTau * freq;
344                         }
345                         change = fabs(weights[i] - change);
346                         
347                         if(change > maxChange){ maxChange = change;     }
348                         cout.flush();
349                 }
350                 return maxChange;
351                 
352         }
353         catch(exception& e) {
354                 m->errorOut(e, "seqNoise", "calcNewWeights");
355                 exit(1);
356         }
357 }
358
359 /**************************************************************************************************/
360
361 int seqNoise::calcCentroids(
362                                    vector<int> anI,
363                                    vector<int> anP,
364                                    vector<int>& change, 
365                                    vector<int>& centroids, 
366                                    vector<int> cumCount,
367                                    vector<double> distances,///
368                                    vector<int> seqFreq, 
369                                    vector<int> otuFreq, 
370                                    vector<double> tau 
371                                    ){
372         try {
373                 int numOTUs = change.size();
374                 int numSeqs = seqFreq.size();
375                 
376                 for(int i=0;i<numOTUs;i++){
377                         
378                         if (m->control_pressed) { return 0; }
379                         
380                         int minFIndex = -1;
381                         double minFValue = 1e10;
382                         
383                         change[i] = 0;
384                         double count = 0.00000;
385                         
386                         int freqOfOTU = otuFreq[i];
387                         
388                         for(int j=0;j<freqOfOTU;j++){
389                                 int index = cumCount[i] + j;
390                                 count += seqFreq[anI[index]]*tau[anP[index]];
391                         }
392                         
393                         if(freqOfOTU > 0 && count > MIN_COUNT){
394                                 
395                                 vector<double> adF(freqOfOTU);
396                                 vector<int> anL(freqOfOTU);
397                                 
398                                 for(int j=0;j<freqOfOTU;j++){
399                                         anL[j] = anI[cumCount[i] + j];
400                                         adF[j] = 0.0000;
401                                 }
402                                 
403                                 for(int j=0;j<freqOfOTU;j++){           
404                                         int index = cumCount[i] + j;
405                                         double curTau = tau[anP[index]];
406                                         
407                                         for(int k=0;k<freqOfOTU;k++){
408                                                 double dist = distances[anL[j]*numSeqs + anL[k]];
409                                                 
410                                                 adF[k] += dist * curTau * seqFreq[anL[j]];
411                                         }
412                                 }
413                                 
414                                 for(int j=0;j<freqOfOTU;j++){
415                                         if(adF[j] < minFValue){
416                                                 minFIndex = j;
417                                                 minFValue = adF[j];
418                                         }
419                                 }
420                                 
421                                 if(centroids[i] != anL[minFIndex]){
422                                         change[i] = 1;
423                                         centroids[i] = anL[minFIndex];
424                                 }
425                         }
426                         else if(centroids[i] != -1){
427                                 change[i] = 1;
428                                 centroids[i] = -1;                      
429                         }
430                 }
431                 
432                 return 0;
433         }
434         catch(exception& e) {
435                 m->errorOut(e, "seqNoise", "calcCentroids");
436                 exit(1);
437         }
438 }
439
440 /**************************************************************************************************/
441
442 int seqNoise::checkCentroids(vector<double>& weights, vector<int> centroids){
443         try {
444                 int numOTUs = centroids.size();
445                 vector<int> unique(numOTUs, 1);
446                 
447                 double minWeight = MIN_WEIGHT;
448                 for(int i=0;i<numOTUs;i++){
449                         if (m->control_pressed) { return 0; }
450                         if(weights[i] < minWeight){     unique[i] = -1; }
451                 }
452                 
453                 for(int i=0;i<numOTUs;i++){
454                         if (m->control_pressed) { return 0; }
455                         if(unique[i] == 1){
456                                 for(int j=i+1; j<numOTUs;j++){
457                                         if(unique[j] == 1){
458                                                 if(centroids[i] == centroids[j]){
459                                                         unique[j] = 0;
460                                                         weights[i] += weights[j];
461                                                         weights[j] = 0;
462                                                 }
463                                         }
464                                 }
465                         }               
466                 }
467                 
468                 return 0;
469         }
470         catch(exception& e) {
471                 m->errorOut(e, "seqNoise", "checkCentroids");
472                 exit(1);
473         }
474 }
475
476 /**************************************************************************************************/
477
478 int seqNoise::setUpOTUData(vector<int>& otuData, vector<double>& percentage, vector<int> cumCount, vector<double> tau, vector<int> otuFreq, vector<int> anP, vector<int> anI){
479         try {
480
481                 int numOTUs = cumCount.size();
482                 int numSeqs = otuData.size();
483                 
484                 vector<double> bestTau(numSeqs, 0);
485                 vector<double> bestIndex(numSeqs, -1);
486                 
487                 for(int i=0;i<numOTUs;i++){
488                         if (m->control_pressed) { return 0; }
489                         for(int j=0;j<otuFreq[i];j++){
490                                 
491                                 int index1 = cumCount[i] + j;
492                                 double thisTau = tau[anP[index1]];
493                                 int index2 = anI[index1];
494                                 
495                                 if(thisTau > bestTau[index2]){
496                                         bestTau[index2] = thisTau;
497                                         bestIndex[index2] = i;
498                                 }
499                         }               
500                 }
501                 
502                 for(int i=0;i<numSeqs;i++){
503                         if (m->control_pressed) { return 0; }
504                         otuData[i] = bestIndex[i];
505                         percentage[i] = 1 - bestTau[i];
506                 }
507                 return 0;
508         }
509         catch(exception& e) {
510                 m->errorOut(e, "seqNoise", "setUpOTUData");
511                 exit(1);
512         }
513 }
514
515 /**************************************************************************************************/
516
517 int seqNoise::finishOTUData(vector<int> otuData, vector<int>& otuFreq, vector<int>& anP, vector<int>& anI, vector<int>& cumCount, vector<vector<int> >& otuBySeqLookUp, vector<vector<int> >& aanI, vector<double>& tau){
518         try {
519                 int numSeqs = otuData.size();
520                 int numOTUs = otuFreq.size();
521                 int total = numSeqs;
522                 
523                 otuFreq.assign(numOTUs, 0);
524                 tau.assign(numSeqs, 1);
525                 anP.assign(numSeqs, 0);
526                 anI.assign(numSeqs, 0);
527                 
528                 for(int i=0;i<numSeqs;i++){
529                         if (m->control_pressed) { return 0; }
530                         int otu = otuData[i];
531                         total++;
532                         
533                         otuBySeqLookUp[otu][otuFreq[otu]] = i;
534                         aanI[otu][otuFreq[otu]] = i;
535                         otuFreq[otu]++;
536                 }
537                 updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
538                 return 0;
539         }
540         catch(exception& e) {
541                 m->errorOut(e, "seqNoise", "finishOTUData");
542                 exit(1);
543         }
544 }
545
546 /**************************************************************************************************/
547
548 int seqNoise::getLastMatch(char direction, vector<vector<char> >& alignMoves, int i, int j, vector<int>& seqA, vector<int>& seqB){
549         try{
550                 char nullReturn = -1;
551                 
552                 while(i>=1 && j>=1){
553                         if (m->control_pressed) { return nullReturn; }
554                         if(direction == 'd'){
555                                 if(seqA[i-1] == seqB[j-1])      {       return seqA[i-1];       }
556                                 else                                            {       return nullReturn;      }
557                         }
558                         
559                         else if(direction == 'l')               {       j--;                            }
560                         else                                                    {       i--;                            }
561                         
562                         direction = alignMoves[i][j];
563                 }
564                 
565                 return nullReturn;
566         }
567         catch(exception& e) {
568                 m->errorOut(e, "seqNoise", "getLastMatch");
569                 exit(1);
570         }
571 }
572
573 /**************************************************************************************************/
574
575 int seqNoise::countDiffs(vector<int> query, vector<int> ref){
576         try {
577                 //double MATCH = 5.0;
578                 //double MISMATCH = -2.0;
579                 //double GAP = -2.0;
580                 
581                 vector<vector<double> > correctMatrix(4);
582                 for(int i=0;i<4;i++){   correctMatrix[i].resize(4);     }
583                 
584                 correctMatrix[0][0] = 0.000000;         //AA
585                 correctMatrix[1][0] = 11.619259;        //CA
586                 correctMatrix[2][0] = 11.694004;        //TA
587                 correctMatrix[3][0] = 7.748623;         //GA
588                 
589                 correctMatrix[1][1] = 0.000000;         //CC
590                 correctMatrix[2][1] = 7.619657;         //TC
591                 correctMatrix[3][1] = 12.852562;        //GC
592                 
593                 correctMatrix[2][2] = 0.000000;         //TT
594                 correctMatrix[3][2] = 10.964048;        //TG
595                 
596                 correctMatrix[3][3] = 0.000000;         //GG
597                 
598                 for(int i=0;i<4;i++){
599                         for(int j=0;j<i;j++){
600                                 correctMatrix[j][i] = correctMatrix[i][j];
601                         }
602                 }
603                 
604                 int queryLength = query.size();
605                 int refLength = ref.size();
606                 
607                 vector<vector<double> > alignMatrix(queryLength + 1);
608                 vector<vector<char> > alignMoves(queryLength + 1);
609                 
610                 for(int i=0;i<=queryLength;i++){
611                         if (m->control_pressed) { return 0; }
612                         alignMatrix[i].resize(refLength + 1, 0);
613                         alignMoves[i].resize(refLength + 1, 'x');
614                 }
615                 
616                 for(int i=0;i<=queryLength;i++){
617                         if (m->control_pressed) { return 0; }
618                         alignMatrix[i][0] = 15.0 * i;
619                         alignMoves[i][0] = 'u';
620                 }
621                 
622                 for(int i=0;i<=refLength;i++){
623                         if (m->control_pressed) { return 0; }
624                         alignMatrix[0][i] = 15.0 * i;
625                         alignMoves[0][i] = 'l';
626                 }
627                 
628                 for(int i=1;i<=queryLength;i++){
629                         if (m->control_pressed) { return 0; }
630                         for(int j=1;j<=refLength;j++){
631                                 
632                                 double nogap;           
633                                 nogap = alignMatrix[i-1][j-1] + correctMatrix[query[i-1]][ref[j-1]];
634                                 
635                                 
636                                 double gap;
637                                 double left;
638                                 if(i == queryLength){ //terminal gap
639                                         left = alignMatrix[i][j-1];
640                                 }
641                                 else{
642                                         if(ref[j-1] == getLastMatch('l', alignMoves, i, j, query, ref)){
643                                                 gap = 4.0;
644                                         }
645                                         else{
646                                                 gap = 15.0;
647                                         }
648                                         
649                                         left = alignMatrix[i][j-1] + gap;
650                                 }
651                                 
652                                 
653                                 double up;
654                                 if(j == refLength){ //terminal gap
655                                         up = alignMatrix[i-1][j];
656                                 }
657                                 else{
658                                         
659                                         if(query[i-1] == getLastMatch('u', alignMoves, i, j, query, ref)){
660                                                 gap = 4.0;
661                                         }
662                                         else{
663                                                 gap = 15.0;
664                                         }
665                                         
666                                         up = alignMatrix[i-1][j] + gap;
667                                 }
668                                 
669                                 
670                                 
671                                 if(nogap < left){
672                                         if(nogap < up){
673                                                 alignMoves[i][j] = 'd';
674                                                 alignMatrix[i][j] = nogap;
675                                         }
676                                         else{
677                                                 alignMoves[i][j] = 'u';
678                                                 alignMatrix[i][j] = up;
679                                         }
680                                 }
681                                 else{
682                                         if(left < up){
683                                                 alignMoves[i][j] = 'l';
684                                                 alignMatrix[i][j] = left;
685                                         }
686                                         else{
687                                                 alignMoves[i][j] = 'u';
688                                                 alignMatrix[i][j] = up;
689                                         }
690                                 }
691                         }
692                 }
693                 
694                 int i = queryLength;
695                 int j = refLength;
696                 int diffs = 0;
697                 
698                 //      string alignA = "";
699                 //      string alignB = "";
700                 //      string bases = "ACTG";
701                 
702                 while(i > 0 && j > 0){
703                         if (m->control_pressed) { return 0; }
704                         if(alignMoves[i][j] == 'd'){
705                                 //                      alignA = bases[query[i-1]] + alignA;
706                                 //                      alignB = bases[ref[j-1]] + alignB;
707                                 
708                                 if(query[i-1] != ref[j-1])      {       diffs++;        }
709                                 
710                                 i--;
711                                 j--;
712                         }
713                         else if(alignMoves[i][j] == 'u'){
714                                 if(j != refLength){
715                                         //                              alignA = bases[query[i-1]] + alignA;
716                                         //                              alignB = '-' + alignB;
717                                         
718                                         diffs++;
719                                 }
720                                 
721                                 i--;
722                         }
723                         else if(alignMoves[i][j] == 'l'){
724                                 if(i != queryLength){
725                                         //                              alignA = '-' + alignA;
726                                         //                              alignB = bases[ref[j-1]] + alignB;
727                                         
728                                         diffs++;
729                                 }
730                                 
731                                 j--;
732                         }
733                 }
734                 
735                 //      cout << diffs << endl;
736                 //      cout << alignA << endl;
737                 //      cout << alignB << endl;
738                 //      cout << endl;
739                 
740                 return diffs;
741         }
742         catch(exception& e) {
743                 m->errorOut(e, "seqNoise", "countDiffs");
744                 exit(1);
745         }
746         
747 }
748
749 /**************************************************************************************************/
750
751 vector<int> seqNoise::convertSeq(string bases){
752         try {
753                 vector<int> numbers(bases.length(), -1);
754                 
755                 for(int i=0;i<bases.length();i++){
756                         if (m->control_pressed) { return numbers; }
757                         
758                         char b = bases[i];
759                         
760                         if(b == 'A')    {       numbers[i] = 0; }
761                         else if(b=='C') {       numbers[i] = 1; }
762                         else if(b=='T') {       numbers[i] = 2; }
763                         else if(b=='G') {       numbers[i] = 3; }
764                         else                    {       numbers[i] = 0; }
765                 }
766                 
767                 return numbers;
768         }
769         catch(exception& e) {
770                 m->errorOut(e, "seqNoise", "convertSeq");
771                 exit(1);
772         }
773 }
774
775 /**************************************************************************************************/
776
777 string seqNoise::degapSeq(string aligned){
778         try {
779                 string unaligned = "";
780                 
781                 for(int i=0;i<aligned.length();i++){
782                         
783                         if (m->control_pressed) { return ""; }
784                         
785                         if(aligned[i] != '-' && aligned[i] != '.'){
786                                 unaligned += aligned[i];
787                         }
788                 }
789                 
790                 return unaligned;
791         }
792         catch(exception& e) {
793                 m->errorOut(e, "seqNoise", "degapSeq");
794                 exit(1);
795         }
796 }
797
798 /**************************************************************************************************/
799
800 int seqNoise::writeOutput(string fastaFileName, string namesFileName, string uMapFileName, vector<int> finalTau, vector<int> centroids, vector<int> otuData, vector<string> sequences, vector<string> uniqueNames, vector<string> redundantNames, vector<int> seqFreq, vector<double>& distances){
801         try {
802                 int numOTUs = finalTau.size();
803                 int numSeqs = uniqueNames.size();
804                 
805                 ofstream fastaFile(fastaFileName.c_str());
806                 ofstream namesFile(namesFileName.c_str());
807                 ofstream uMapFile(uMapFileName.c_str());
808                 
809                 vector<int> maxSequenceAbund(numOTUs, 0);
810                 vector<int> maxSequenceIndex(numOTUs, 0);
811                 
812                 for(int i=0;i<numSeqs;i++){
813                         if (m->control_pressed) { return 0; }
814                         if(maxSequenceAbund[otuData[i]] < seqFreq[i]){
815                                 maxSequenceAbund[otuData[i]] = seqFreq[i];
816                                 maxSequenceIndex[otuData[i]] = i;
817                         }
818                 }
819                 
820                 int count = 1;
821                 
822                 for(int i=0;i<numOTUs;i++){
823                         if (m->control_pressed) { return 0; }
824                         
825                         if(finalTau[i] > 0){
826                                 
827                                 if(maxSequenceIndex[i] != centroids[i] && distances[maxSequenceIndex[i]*numSeqs + centroids[i]] == 0){
828                                         //                              cout << uniqueNames[centroids[i]] << '\t' << uniqueNames[maxSequenceIndex[i]] << '\t' << count << endl;
829                                         centroids[i] = maxSequenceIndex[i];
830                                 }
831                                 
832                                 int index = centroids[i];
833                                 
834                                 fastaFile << '>' << uniqueNames[index] << endl << sequences[index] << endl;
835                                 namesFile << uniqueNames[index] << '\t';
836                                 
837                                 string refSeq = sequences[index];
838                                 string redundantSeqs = redundantNames[index];;
839                                 
840                                 
841                                 vector<freqData> frequencyData;
842                                 
843                                 for(int j=0;j<numSeqs;j++){
844                                         if(otuData[j] == i && j != index){
845                                                 frequencyData.push_back(freqData(j, seqFreq[j]));
846                                         }
847                                 }
848                                 sort(frequencyData.rbegin(), frequencyData.rend());
849                                 
850                                 string refDegap = degapSeq(refSeq);
851                                 vector<int> rUnalign = convertSeq(refDegap);
852                                 
853                                 uMapFile << "ideal_seq_" << count << '\t' << finalTau[i] << endl;
854                                 uMapFile << uniqueNames[index] << '\t' << seqFreq[index] << "\t0\t" << refDegap << endl;
855                                 
856                                 
857                                 for(int j=0;j<frequencyData.size();j++){
858                                         if (m->control_pressed) { return 0; }
859                                         redundantSeqs += ',' + redundantNames[frequencyData[j].index];
860                                         
861                                         uMapFile << uniqueNames[frequencyData[j].index] << '\t' << seqFreq[frequencyData[j].index] << '\t';
862                                         
863                                         string querySeq = sequences[frequencyData[j].index];
864                                         
865                                         string queryDegap = degapSeq(querySeq);
866                                         vector<int> qUnalign = convertSeq(queryDegap);
867                                         
868                                         int udiffs = countDiffs(qUnalign, rUnalign);
869                                         uMapFile << udiffs << '\t' << queryDegap << endl;
870                                         
871                                 }                                       
872                                 
873                                 uMapFile << endl;
874                                 namesFile << redundantSeqs << endl;
875                                 count++;
876                                 
877                         }
878                 }
879                 fastaFile.close();
880                 namesFile.close();
881                 uMapFile.close();
882                 return 0;
883         }
884         catch(exception& e) {
885                 m->errorOut(e, "seqNoise", "writeOutput");
886                 exit(1);
887         }
888 }
889
890 /**************************************************************************************************
891
892 int main(int argc, char *argv[]){
893         
894         double sigma = 100;
895         sigma = atof(argv[5]);
896         
897         double cutOff = 0.08;
898         int minIter = 10;
899         int maxIter = 1000;
900         double minDelta = 1e-6;
901         
902         string sequenceFileName = argv[1];
903         string fileNameStub = sequenceFileName.substr(0,sequenceFileName.find_last_of('.')) + ".shhh";
904         
905         vector<string> sequences;
906         getSequenceData(sequenceFileName, sequences);
907         
908         int numSeqs = sequences.size();
909         
910         vector<string> uniqueNames(numSeqs);
911         vector<string> redundantNames(numSeqs);
912         vector<int> seqFreq(numSeqs);
913         
914         string namesFileName = argv[4];
915         getRedundantNames(namesFileName, uniqueNames, redundantNames, seqFreq);
916         
917         string distFileName = argv[2];
918         vector<double> distances(numSeqs * numSeqs);
919         getDistanceData(distFileName, distances);
920         
921         string listFileName = argv[3];
922         vector<int> otuData(numSeqs);
923         vector<int> otuFreq;
924         vector<vector<int> > otuBySeqLookUp;
925         
926         getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
927         
928         int numOTUs = otuFreq.size();
929         
930         vector<double> weights(numOTUs, 0);
931         vector<int> change(numOTUs, 1);
932         vector<int> centroids(numOTUs, -1);
933         vector<int> cumCount(numOTUs, 0);
934         
935         vector<double> tau(numSeqs, 1);
936         vector<int> anP(numSeqs, 0);
937         vector<int> anI(numSeqs, 0);
938         vector<int> anN(numSeqs, 0);
939         vector<vector<int> > aanI = otuBySeqLookUp;
940         
941         int numIters = 0;
942         double maxDelta = 1e6;
943         
944         while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
945                 
946                 updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
947                 maxDelta = calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau);
948                 
949                 calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
950                 checkCentroids(weights, centroids);
951                 
952                 otuFreq.assign(numOTUs, 0);
953                 
954                 int total = 0;
955                 
956                 for(int i=0;i<numSeqs;i++){
957                         double offset = 1e6;
958                         double norm = 0.0000;
959                         double minWeight = MIN_WEIGHT;
960                         vector<double> currentTau(numOTUs);
961                         
962                         for(int j=0;j<numOTUs;j++){
963                                 if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
964                                         offset = distances[i * numSeqs+centroids[j]];
965                                 }
966                         }
967                         
968                         for(int j=0;j<numOTUs;j++){
969                                 if(weights[j] > minWeight){
970                                         currentTau[j] = exp(sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
971                                         norm += currentTau[j];
972                                 }
973                                 else{
974                                         currentTau[j] = 0.0000;
975                                 }
976                         }                       
977                         
978                         for(int j=0;j<numOTUs;j++){
979                                 currentTau[j] /= norm;
980                         }
981                         
982                         for(int j=0;j<numOTUs;j++){
983                                 
984                                 if(currentTau[j] > MIN_TAU){
985                                         int oldTotal = total;
986                                         total++;
987                                         
988                                         tau.resize(oldTotal+1);
989                                         tau[oldTotal] = currentTau[j];
990                                         otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
991                                         aanI[j][otuFreq[j]] = i;
992                                         otuFreq[j]++;
993                                         
994                                 }
995                         }
996                         
997                         anP.resize(total);
998                         anI.resize(total);
999                 }
1000                 
1001                 numIters++;
1002         }
1003         
1004         updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
1005         
1006         vector<double> percentage(numSeqs);
1007         setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI);
1008         finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau);
1009         
1010         change.assign(numOTUs, 1);
1011         calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
1012         
1013         
1014         vector<int> finalTau(numOTUs, 0);
1015         for(int i=0;i<numSeqs;i++){
1016                 finalTau[otuData[i]] += int(seqFreq[i]);
1017         }
1018         
1019         writeOutput(fileNameStub, finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
1020         
1021         return 0;
1022 }
1023
1024 **************************************************************************************************/