]> git.donarmstrong.com Git - mothur.git/blob - seqnoise.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / seqnoise.cpp
1 /*
2  *  mySeqNoise.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 8/31/11.
6  *  Copyright 2011 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "seqnoise.h"
11 #include "sequence.hpp"
12
13 #define MIN_DELTA 1.0e-6
14 #define MIN_ITER 20
15 #define MAX_ITER 1000
16 #define MIN_COUNT 0.1
17 #define MIN_TAU   1.0e-4
18 #define MIN_WEIGHT 0.1
19
20
21 /**************************************************************************************************/
22 int seqNoise::getSequenceData(string sequenceFileName, vector<string>& sequences){
23         try {
24                 
25                 ifstream sequenceFile;
26                 m->openInputFile(sequenceFileName, sequenceFile);
27                 
28                 while(!sequenceFile.eof()){
29                         
30                         if (m->control_pressed) { break; }
31                         
32                         Sequence temp(sequenceFile); m->gobble(sequenceFile);
33                         
34                         if (temp.getName() != "") {
35                                 sequences.push_back(temp.getAligned());
36                         }
37                 }
38                 sequenceFile.close();
39                 
40                 return 0;
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "seqNoise", "getSequenceData");
44                 exit(1);
45         }
46 }
47 /**************************************************************************************************/
48 int seqNoise::addSeq(string seq, vector<string>& sequences){ 
49         try {
50                 sequences.push_back(seq);
51                 return 0;
52         }
53         catch(exception& e) {
54                 m->errorOut(e, "seqNoise", "addSeq");
55                 exit(1);
56         }
57 }       
58 /**************************************************************************************************/
59 //no checks for file mismatches
60 int seqNoise::getRedundantNames(string namesFileName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){
61         try {
62                 string unique, redundant;
63                 ifstream namesFile;
64                 m->openInputFile(namesFileName, namesFile);
65                 
66                 for(int i=0;i<redundantNames.size();i++){
67                         
68                         if (m->control_pressed) { break; }
69                         
70                         namesFile >> uniqueNames[i]; m->gobble(namesFile);
71                         namesFile >> redundantNames[i]; m->gobble(namesFile);
72                         
73                         seqFreq[i] = m->getNumNames(redundantNames[i]);
74                 }
75                 namesFile.close();
76                 
77                 return 0;
78         }
79         catch(exception& e) {
80                 m->errorOut(e, "seqNoise", "getRedundantNames");
81                 exit(1);
82         }
83 }
84 /**************************************************************************************************/
85 int seqNoise::addRedundantName(string uniqueName, string redundantName, vector<string>& uniqueNames, vector<string>& redundantNames, vector<int>& seqFreq){ 
86         try {
87                 
88                 uniqueNames.push_back(uniqueName);
89                 redundantNames.push_back(redundantName);
90                 seqFreq.push_back(m->getNumNames(redundantName));
91                 
92                 return 0;
93         }
94         catch(exception& e) {
95                 m->errorOut(e, "seqNoise", "addRedundantName");
96                 exit(1);
97         }
98 }       
99 /**************************************************************************************************/
100 int seqNoise::getDistanceData(string distFileName, vector<double>& distances){
101         try {
102                 
103                 ifstream distFile;
104                 m->openInputFile(distFileName, distFile);
105                 
106                 int numSeqs = 0;
107                 string name = "";
108                 
109                 distFile >> numSeqs;
110                 
111                 for(int i=0;i<numSeqs;i++){
112                         
113                         if (m->control_pressed) {  break; }
114                         
115                         distances[i * numSeqs + i] = 0.0000;
116                         
117                         distFile >> name;
118                         
119                         for(int j=0;j<i;j++){
120                                 distFile >> distances[i * numSeqs + j];
121                                 distances[j * numSeqs + i] = distances[i * numSeqs + j];
122                         }
123                 }
124                 
125                 distFile.close();       
126                 
127                 return 0;
128         }
129         catch(exception& e) {
130                 m->errorOut(e, "seqNoise", "getDistanceData");
131                 exit(1);
132         }
133 }
134
135 /**************************************************************************************************/
136 int seqNoise::getListData(string listFileName, double cutOff, vector<int>& otuData, vector<int>& otuFreq, vector<vector<int> >& otuBySeqLookUp){
137         try {
138                 
139                 ifstream listFile;
140                 m->openInputFile(listFileName, listFile);
141                 double threshold;
142                 int numOTUs;
143                 string line = "";
144                 bool adjustCutoff = true;
145                 
146                 if(listFile.peek() == 'u'){     m->getline(listFile);   }
147                 
148                 while(listFile){
149                         listFile >> threshold;
150                         
151                         if(threshold < cutOff){
152                                 line = m->getline(listFile); m->gobble(listFile);
153                         }
154                         else{
155                                 adjustCutoff = false;
156                                 listFile >> numOTUs;
157                                 otuFreq.resize(numOTUs, 0);
158                                 
159                                 for(int i=0;i<numOTUs;i++){
160                                         
161                                         if (m->control_pressed) { return 0; }
162                                         
163                                         string otu;
164                                         listFile >> otu;
165                                         
166                                         int count = 0;
167                                         
168                                         string number = "";
169                                         
170                                         for(int j=0;j<otu.size();j++){
171                                                 if(otu[j] != ','){
172                                                         number += otu[j];
173                                                 }
174                                                 else{
175                                                         int index = atoi(number.c_str());
176                                                         otuData[index] = i;
177                                                         count++;
178                                                         number = "";
179                                                 }
180                                         }
181                                         
182                                         int index = atoi(number.c_str());
183                                         otuData[index] = i;
184                                         count++;
185                                         
186                                         otuFreq[i] = count;
187                                 }
188                                 
189                                 otuBySeqLookUp.resize(numOTUs);
190                                 
191                                 int numSeqs = otuData.size();
192                                 
193                                 for(int i=0;i<numSeqs;i++){
194                                         if (m->control_pressed) { return 0; }
195                                         otuBySeqLookUp[otuData[i]].push_back(i);
196                                 }
197                                 for(int i=0;i<numOTUs;i++){
198                                         if (m->control_pressed) { return 0; }
199                                         for(int j=otuBySeqLookUp[i].size();j<numSeqs;j++){
200                                                 otuBySeqLookUp[i].push_back(0);
201                                         }
202                                 }
203                                 
204                                 break;
205                         }
206                 }
207                 
208                 listFile.close();
209                 
210                 //the listfile does not contain a threshold greater than the cutoff so use highest value
211                 if (adjustCutoff) {
212                         istringstream iss (line,istringstream::in);
213                         
214                         iss >> numOTUs;
215                         otuFreq.resize(numOTUs, 0);
216                         
217                         for(int i=0;i<numOTUs;i++){
218                                 
219                                 if (m->control_pressed) { return 0; }
220                                 
221                                 string otu;
222                                 iss >> otu;
223                                 
224                                 int count = 0;
225                                 
226                                 string number = "";
227                                 
228                                 for(int j=0;j<otu.size();j++){
229                                         if(otu[j] != ','){
230                                                 number += otu[j];
231                                         }
232                                         else{
233                                                 int index = atoi(number.c_str());
234                                                 otuData[index] = i;
235                                                 count++;
236                                                 number = "";
237                                         }
238                                 }
239                                 
240                                 int index = atoi(number.c_str());
241                                 otuData[index] = i;
242                                 count++;
243                                 
244                                 otuFreq[i] = count;
245                         }
246                         
247                         otuBySeqLookUp.resize(numOTUs);
248                         
249                         int numSeqs = otuData.size();
250                         
251                         for(int i=0;i<numSeqs;i++){
252                                 if (m->control_pressed) { return 0; }
253                                 otuBySeqLookUp[otuData[i]].push_back(i);
254                         }
255                         for(int i=0;i<numOTUs;i++){
256                                 if (m->control_pressed) { return 0; }
257                                 for(int j=otuBySeqLookUp[i].size();j<numSeqs;j++){
258                                         otuBySeqLookUp[i].push_back(0);
259                                 }
260                         }
261                         
262                 }
263                 
264                 return 0;
265         }
266         catch(exception& e) {
267                 m->errorOut(e, "seqNoise", "getListData");
268                 exit(1);
269         }
270 }
271
272 /**************************************************************************************************/
273 int seqNoise::updateOTUCountData(vector<int> otuFreq,
274                                                                  vector<vector<int> > otuBySeqLookUp,
275                                                                  vector<vector<int> > aanI,
276                                                                  vector<int>& anP,
277                                                                  vector<int>& anI,
278                                                                  vector<int>& cumCount
279                                                                  ){
280         try {
281                 int numOTUs = otuFreq.size();
282                 
283                 int count = 0;
284                 
285                 for(int i=0;i<numOTUs;i++){
286                         cumCount[i] = count;
287                         
288                         if (m->control_pressed) { return 0; }
289                         
290                         for(int j=0;j<otuFreq[i];j++){
291                                 anP[count] = otuBySeqLookUp[i][j];
292                                 anI[count] = aanI[i][j];
293                                 
294                                 count++;
295                         }
296                 }
297                 
298                 return 0;
299         }
300         catch(exception& e) {
301                 m->errorOut(e, "seqNoise", "updateOTUCountData");
302                 exit(1);
303         }
304 }
305 /**************************************************************************************************/
306 double seqNoise::calcNewWeights(
307                                           vector<double>& weights,      //
308                                           vector<int> seqFreq,          //
309                                           vector<int> anI,                      //
310                                           vector<int> cumCount,         //
311                                           vector<int> anP,                      //
312                                           vector<int> otuFreq,          //
313                                           vector<double> tau            //
314                                           ){
315         try {
316                 
317                 int numOTUs = weights.size();
318                 double maxChange = -1;
319                 
320                 cout.flush();
321                 
322                 for(int i=0;i<numOTUs;i++){
323                         
324                         if (m->control_pressed) { return 0; }
325                         
326                         double change = weights[i];
327                         
328                         weights[i] = 0.0000;
329                         
330                         for(int j=0;j<otuFreq[i];j++){
331                                 
332                                 int index1 = cumCount[i] + j;
333                                 int index2 = anI[index1];
334                                 
335                                 double currentTau = tau[anP[index1]];
336                                 double freq = double(seqFreq[index2]);
337                                 
338                                 weights[i] += currentTau * freq;
339                         }
340                         change = fabs(weights[i] - change);
341                         
342                         if(change > maxChange){ maxChange = change;     }
343                         cout.flush();
344                 }
345                 return maxChange;
346                 
347         }
348         catch(exception& e) {
349                 m->errorOut(e, "seqNoise", "calcNewWeights");
350                 exit(1);
351         }
352 }
353
354 /**************************************************************************************************/
355
356 int seqNoise::calcCentroids(
357                                    vector<int> anI,
358                                    vector<int> anP,
359                                    vector<int>& change, 
360                                    vector<int>& centroids, 
361                                    vector<int> cumCount,
362                                    vector<double> distances,///
363                                    vector<int> seqFreq, 
364                                    vector<int> otuFreq, 
365                                    vector<double> tau 
366                                    ){
367         try {
368                 int numOTUs = change.size();
369                 int numSeqs = seqFreq.size();
370                 
371                 for(int i=0;i<numOTUs;i++){
372                         
373                         if (m->control_pressed) { return 0; }
374                         
375                         int minFIndex = -1;
376                         double minFValue = 1e10;
377                         
378                         change[i] = 0;
379                         double count = 0.00000;
380                         
381                         int freqOfOTU = otuFreq[i];
382                         
383                         for(int j=0;j<freqOfOTU;j++){
384                                 int index = cumCount[i] + j;
385                                 count += seqFreq[anI[index]]*tau[anP[index]];
386                         }
387                         
388                         if(freqOfOTU > 0 && count > MIN_COUNT){
389                                 
390                                 vector<double> adF(freqOfOTU);
391                                 vector<int> anL(freqOfOTU);
392                                 
393                                 for(int j=0;j<freqOfOTU;j++){
394                                         anL[j] = anI[cumCount[i] + j];
395                                         adF[j] = 0.0000;
396                                 }
397                                 
398                                 for(int j=0;j<freqOfOTU;j++){           
399                                         int index = cumCount[i] + j;
400                                         double curTau = tau[anP[index]];
401                                         
402                                         for(int k=0;k<freqOfOTU;k++){
403                                                 double dist = distances[anL[j]*numSeqs + anL[k]];
404                                                 
405                                                 adF[k] += dist * curTau * seqFreq[anL[j]];
406                                         }
407                                 }
408                                 
409                                 for(int j=0;j<freqOfOTU;j++){
410                                         if(adF[j] < minFValue){
411                                                 minFIndex = j;
412                                                 minFValue = adF[j];
413                                         }
414                                 }
415                                 
416                                 if(centroids[i] != anL[minFIndex]){
417                                         change[i] = 1;
418                                         centroids[i] = anL[minFIndex];
419                                 }
420                         }
421                         else if(centroids[i] != -1){
422                                 change[i] = 1;
423                                 centroids[i] = -1;                      
424                         }
425                 }
426                 
427                 return 0;
428         }
429         catch(exception& e) {
430                 m->errorOut(e, "seqNoise", "calcCentroids");
431                 exit(1);
432         }
433 }
434
435 /**************************************************************************************************/
436
437 int seqNoise::checkCentroids(vector<double>& weights, vector<int> centroids){
438         try {
439                 int numOTUs = centroids.size();
440                 vector<int> unique(numOTUs, 1);
441                 
442                 double minWeight = MIN_WEIGHT;
443                 for(int i=0;i<numOTUs;i++){
444                         if (m->control_pressed) { return 0; }
445                         if(weights[i] < minWeight){     unique[i] = -1; }
446                 }
447                 
448                 for(int i=0;i<numOTUs;i++){
449                         if (m->control_pressed) { return 0; }
450                         if(unique[i] == 1){
451                                 for(int j=i+1; j<numOTUs;j++){
452                                         if(unique[j] == 1){
453                                                 if(centroids[i] == centroids[j]){
454                                                         unique[j] = 0;
455                                                         weights[i] += weights[j];
456                                                         weights[j] = 0;
457                                                 }
458                                         }
459                                 }
460                         }               
461                 }
462                 
463                 return 0;
464         }
465         catch(exception& e) {
466                 m->errorOut(e, "seqNoise", "checkCentroids");
467                 exit(1);
468         }
469 }
470
471 /**************************************************************************************************/
472
473 int seqNoise::setUpOTUData(vector<int>& otuData, vector<double>& percentage, vector<int> cumCount, vector<double> tau, vector<int> otuFreq, vector<int> anP, vector<int> anI){
474         try {
475
476                 int numOTUs = cumCount.size();
477                 int numSeqs = otuData.size();
478                 
479                 vector<double> bestTau(numSeqs, 0);
480                 vector<double> bestIndex(numSeqs, -1);
481                 
482                 for(int i=0;i<numOTUs;i++){
483                         if (m->control_pressed) { return 0; }
484                         for(int j=0;j<otuFreq[i];j++){
485                                 
486                                 int index1 = cumCount[i] + j;
487                                 double thisTau = tau[anP[index1]];
488                                 int index2 = anI[index1];
489                                 
490                                 if(thisTau > bestTau[index2]){
491                                         bestTau[index2] = thisTau;
492                                         bestIndex[index2] = i;
493                                 }
494                         }               
495                 }
496                 
497                 for(int i=0;i<numSeqs;i++){
498                         if (m->control_pressed) { return 0; }
499                         otuData[i] = bestIndex[i];
500                         percentage[i] = 1 - bestTau[i];
501                 }
502                 return 0;
503         }
504         catch(exception& e) {
505                 m->errorOut(e, "seqNoise", "setUpOTUData");
506                 exit(1);
507         }
508 }
509
510 /**************************************************************************************************/
511
512 int seqNoise::finishOTUData(vector<int> otuData, vector<int>& otuFreq, vector<int>& anP, vector<int>& anI, vector<int>& cumCount, vector<vector<int> >& otuBySeqLookUp, vector<vector<int> >& aanI, vector<double>& tau){
513         try {
514                 int numSeqs = otuData.size();
515                 int numOTUs = otuFreq.size();
516                 int total = numSeqs;
517                 
518                 otuFreq.assign(numOTUs, 0);
519                 tau.assign(numSeqs, 1);
520                 anP.assign(numSeqs, 0);
521                 anI.assign(numSeqs, 0);
522                 
523                 for(int i=0;i<numSeqs;i++){
524                         if (m->control_pressed) { return 0; }
525                         int otu = otuData[i];
526                         total++;
527                         
528                         otuBySeqLookUp[otu][otuFreq[otu]] = i;
529                         aanI[otu][otuFreq[otu]] = i;
530                         otuFreq[otu]++;
531                 }
532                 updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
533                 return 0;
534         }
535         catch(exception& e) {
536                 m->errorOut(e, "seqNoise", "finishOTUData");
537                 exit(1);
538         }
539 }
540
541 /**************************************************************************************************/
542
543 int seqNoise::getLastMatch(char direction, vector<vector<char> >& alignMoves, int i, int j, vector<int>& seqA, vector<int>& seqB){
544         try{
545                 char nullReturn = -1;
546                 
547                 while(i>=1 && j>=1){
548                         if (m->control_pressed) { return nullReturn; }
549                         if(direction == 'd'){
550                                 if(seqA[i-1] == seqB[j-1])      {       return seqA[i-1];       }
551                                 else                                            {       return nullReturn;      }
552                         }
553                         
554                         else if(direction == 'l')               {       j--;                            }
555                         else                                                    {       i--;                            }
556                         
557                         direction = alignMoves[i][j];
558                 }
559                 
560                 return nullReturn;
561         }
562         catch(exception& e) {
563                 m->errorOut(e, "seqNoise", "getLastMatch");
564                 exit(1);
565         }
566 }
567
568 /**************************************************************************************************/
569
570 int seqNoise::countDiffs(vector<int> query, vector<int> ref){
571         try {
572                 //double MATCH = 5.0;
573                 //double MISMATCH = -2.0;
574                 //double GAP = -2.0;
575                 
576                 vector<vector<double> > correctMatrix(4);
577                 for(int i=0;i<4;i++){   correctMatrix[i].resize(4);     }
578                 
579                 correctMatrix[0][0] = 0.000000;         //AA
580                 correctMatrix[1][0] = 11.619259;        //CA
581                 correctMatrix[2][0] = 11.694004;        //TA
582                 correctMatrix[3][0] = 7.748623;         //GA
583                 
584                 correctMatrix[1][1] = 0.000000;         //CC
585                 correctMatrix[2][1] = 7.619657;         //TC
586                 correctMatrix[3][1] = 12.852562;        //GC
587                 
588                 correctMatrix[2][2] = 0.000000;         //TT
589                 correctMatrix[3][2] = 10.964048;        //TG
590                 
591                 correctMatrix[3][3] = 0.000000;         //GG
592                 
593                 for(int i=0;i<4;i++){
594                         for(int j=0;j<i;j++){
595                                 correctMatrix[j][i] = correctMatrix[i][j];
596                         }
597                 }
598                 
599                 int queryLength = query.size();
600                 int refLength = ref.size();
601                 
602                 vector<vector<double> > alignMatrix(queryLength + 1);
603                 vector<vector<char> > alignMoves(queryLength + 1);
604                 
605                 for(int i=0;i<=queryLength;i++){
606                         if (m->control_pressed) { return 0; }
607                         alignMatrix[i].resize(refLength + 1, 0);
608                         alignMoves[i].resize(refLength + 1, 'x');
609                 }
610                 
611                 for(int i=0;i<=queryLength;i++){
612                         if (m->control_pressed) { return 0; }
613                         alignMatrix[i][0] = 15.0 * i;
614                         alignMoves[i][0] = 'u';
615                 }
616                 
617                 for(int i=0;i<=refLength;i++){
618                         if (m->control_pressed) { return 0; }
619                         alignMatrix[0][i] = 15.0 * i;
620                         alignMoves[0][i] = 'l';
621                 }
622                 
623                 for(int i=1;i<=queryLength;i++){
624                         if (m->control_pressed) { return 0; }
625                         for(int j=1;j<=refLength;j++){
626                                 
627                                 double nogap;           
628                                 nogap = alignMatrix[i-1][j-1] + correctMatrix[query[i-1]][ref[j-1]];
629                                 
630                                 
631                                 double gap;
632                                 double left;
633                                 if(i == queryLength){ //terminal gap
634                                         left = alignMatrix[i][j-1];
635                                 }
636                                 else{
637                                         if(ref[j-1] == getLastMatch('l', alignMoves, i, j, query, ref)){
638                                                 gap = 4.0;
639                                         }
640                                         else{
641                                                 gap = 15.0;
642                                         }
643                                         
644                                         left = alignMatrix[i][j-1] + gap;
645                                 }
646                                 
647                                 
648                                 double up;
649                                 if(j == refLength){ //terminal gap
650                                         up = alignMatrix[i-1][j];
651                                 }
652                                 else{
653                                         
654                                         if(query[i-1] == getLastMatch('u', alignMoves, i, j, query, ref)){
655                                                 gap = 4.0;
656                                         }
657                                         else{
658                                                 gap = 15.0;
659                                         }
660                                         
661                                         up = alignMatrix[i-1][j] + gap;
662                                 }
663                                 
664                                 
665                                 
666                                 if(nogap < left){
667                                         if(nogap < up){
668                                                 alignMoves[i][j] = 'd';
669                                                 alignMatrix[i][j] = nogap;
670                                         }
671                                         else{
672                                                 alignMoves[i][j] = 'u';
673                                                 alignMatrix[i][j] = up;
674                                         }
675                                 }
676                                 else{
677                                         if(left < up){
678                                                 alignMoves[i][j] = 'l';
679                                                 alignMatrix[i][j] = left;
680                                         }
681                                         else{
682                                                 alignMoves[i][j] = 'u';
683                                                 alignMatrix[i][j] = up;
684                                         }
685                                 }
686                         }
687                 }
688                 
689                 int i = queryLength;
690                 int j = refLength;
691                 int diffs = 0;
692                 
693                 //      string alignA = "";
694                 //      string alignB = "";
695                 //      string bases = "ACTG";
696                 
697                 while(i > 0 && j > 0){
698                         if (m->control_pressed) { return 0; }
699                         if(alignMoves[i][j] == 'd'){
700                                 //                      alignA = bases[query[i-1]] + alignA;
701                                 //                      alignB = bases[ref[j-1]] + alignB;
702                                 
703                                 if(query[i-1] != ref[j-1])      {       diffs++;        }
704                                 
705                                 i--;
706                                 j--;
707                         }
708                         else if(alignMoves[i][j] == 'u'){
709                                 if(j != refLength){
710                                         //                              alignA = bases[query[i-1]] + alignA;
711                                         //                              alignB = '-' + alignB;
712                                         
713                                         diffs++;
714                                 }
715                                 
716                                 i--;
717                         }
718                         else if(alignMoves[i][j] == 'l'){
719                                 if(i != queryLength){
720                                         //                              alignA = '-' + alignA;
721                                         //                              alignB = bases[ref[j-1]] + alignB;
722                                         
723                                         diffs++;
724                                 }
725                                 
726                                 j--;
727                         }
728                 }
729                 
730                 //      cout << diffs << endl;
731                 //      cout << alignA << endl;
732                 //      cout << alignB << endl;
733                 //      cout << endl;
734                 
735                 return diffs;
736         }
737         catch(exception& e) {
738                 m->errorOut(e, "seqNoise", "countDiffs");
739                 exit(1);
740         }
741         
742 }
743
744 /**************************************************************************************************/
745
746 vector<int> seqNoise::convertSeq(string bases){
747         try {
748                 vector<int> numbers(bases.length(), -1);
749                 
750                 for(int i=0;i<bases.length();i++){
751                         if (m->control_pressed) { return numbers; }
752                         
753                         char b = bases[i];
754                         
755                         if(b == 'A')    {       numbers[i] = 0; }
756                         else if(b=='C') {       numbers[i] = 1; }
757                         else if(b=='T') {       numbers[i] = 2; }
758                         else if(b=='G') {       numbers[i] = 3; }
759                         else                    {       numbers[i] = 0; }
760                 }
761                 
762                 return numbers;
763         }
764         catch(exception& e) {
765                 m->errorOut(e, "seqNoise", "convertSeq");
766                 exit(1);
767         }
768 }
769
770 /**************************************************************************************************/
771
772 string seqNoise::degapSeq(string aligned){
773         try {
774                 string unaligned = "";
775                 
776                 for(int i=0;i<aligned.length();i++){
777                         
778                         if (m->control_pressed) { return ""; }
779                         
780                         if(aligned[i] != '-' && aligned[i] != '.'){
781                                 unaligned += aligned[i];
782                         }
783                 }
784                 
785                 return unaligned;
786         }
787         catch(exception& e) {
788                 m->errorOut(e, "seqNoise", "degapSeq");
789                 exit(1);
790         }
791 }
792
793 /**************************************************************************************************/
794
795 int seqNoise::writeOutput(string fastaFileName, string namesFileName, string uMapFileName, vector<int> finalTau, vector<int> centroids, vector<int> otuData, vector<string> sequences, vector<string> uniqueNames, vector<string> redundantNames, vector<int> seqFreq, vector<double>& distances){
796         try {
797                 int numOTUs = finalTau.size();
798                 int numSeqs = uniqueNames.size();
799                 
800                 ofstream fastaFile(fastaFileName.c_str());
801                 ofstream namesFile(namesFileName.c_str());
802                 ofstream uMapFile(uMapFileName.c_str());
803                 
804                 vector<int> maxSequenceAbund(numOTUs, 0);
805                 vector<int> maxSequenceIndex(numOTUs, 0);
806                 
807                 for(int i=0;i<numSeqs;i++){
808                         if (m->control_pressed) { return 0; }
809                         if(maxSequenceAbund[otuData[i]] < seqFreq[i]){
810                                 maxSequenceAbund[otuData[i]] = seqFreq[i];
811                                 maxSequenceIndex[otuData[i]] = i;
812                         }
813                 }
814                 
815                 int count = 1;
816                 
817                 for(int i=0;i<numOTUs;i++){
818                         if (m->control_pressed) { return 0; }
819                         
820                         if(finalTau[i] > 0){
821                                 
822                                 if(maxSequenceIndex[i] != centroids[i] && distances[maxSequenceIndex[i]*numSeqs + centroids[i]] == 0){
823                                         //                              cout << uniqueNames[centroids[i]] << '\t' << uniqueNames[maxSequenceIndex[i]] << '\t' << count << endl;
824                                         centroids[i] = maxSequenceIndex[i];
825                                 }
826                                 
827                                 int index = centroids[i];
828                                 
829                                 fastaFile << '>' << uniqueNames[index] << endl << sequences[index] << endl;
830                                 namesFile << uniqueNames[index] << '\t';
831                                 
832                                 string refSeq = sequences[index];
833                                 string redundantSeqs = redundantNames[index];;
834                                 
835                                 
836                                 vector<freqData> frequencyData;
837                                 
838                                 for(int j=0;j<numSeqs;j++){
839                                         if(otuData[j] == i && j != index){
840                                                 frequencyData.push_back(freqData(j, seqFreq[j]));
841                                         }
842                                 }
843                                 sort(frequencyData.rbegin(), frequencyData.rend());
844                                 
845                                 string refDegap = degapSeq(refSeq);
846                                 vector<int> rUnalign = convertSeq(refDegap);
847                                 
848                                 uMapFile << "ideal_seq_" << count << '\t' << finalTau[i] << endl;
849                                 uMapFile << uniqueNames[index] << '\t' << seqFreq[index] << "\t0\t" << refDegap << endl;
850                                 
851                                 
852                                 for(int j=0;j<frequencyData.size();j++){
853                                         if (m->control_pressed) { return 0; }
854                                         redundantSeqs += ',' + redundantNames[frequencyData[j].index];
855                                         
856                                         uMapFile << uniqueNames[frequencyData[j].index] << '\t' << seqFreq[frequencyData[j].index] << '\t';
857                                         
858                                         string querySeq = sequences[frequencyData[j].index];
859                                         
860                                         string queryDegap = degapSeq(querySeq);
861                                         vector<int> qUnalign = convertSeq(queryDegap);
862                                         
863                                         int udiffs = countDiffs(qUnalign, rUnalign);
864                                         uMapFile << udiffs << '\t' << queryDegap << endl;
865                                         
866                                 }                                       
867                                 
868                                 uMapFile << endl;
869                                 namesFile << redundantSeqs << endl;
870                                 count++;
871                                 
872                         }
873                 }
874                 fastaFile.close();
875                 namesFile.close();
876                 uMapFile.close();
877                 return 0;
878         }
879         catch(exception& e) {
880                 m->errorOut(e, "seqNoise", "writeOutput");
881                 exit(1);
882         }
883 }
884
885 /**************************************************************************************************
886
887 int main(int argc, char *argv[]){
888         
889         double sigma = 100;
890         sigma = atof(argv[5]);
891         
892         double cutOff = 0.08;
893         int minIter = 10;
894         int maxIter = 1000;
895         double minDelta = 1e-6;
896         
897         string sequenceFileName = argv[1];
898         string fileNameStub = sequenceFileName.substr(0,sequenceFileName.find_last_of('.')) + ".shhh";
899         
900         vector<string> sequences;
901         getSequenceData(sequenceFileName, sequences);
902         
903         int numSeqs = sequences.size();
904         
905         vector<string> uniqueNames(numSeqs);
906         vector<string> redundantNames(numSeqs);
907         vector<int> seqFreq(numSeqs);
908         
909         string namesFileName = argv[4];
910         getRedundantNames(namesFileName, uniqueNames, redundantNames, seqFreq);
911         
912         string distFileName = argv[2];
913         vector<double> distances(numSeqs * numSeqs);
914         getDistanceData(distFileName, distances);
915         
916         string listFileName = argv[3];
917         vector<int> otuData(numSeqs);
918         vector<int> otuFreq;
919         vector<vector<int> > otuBySeqLookUp;
920         
921         getListData(listFileName, cutOff, otuData, otuFreq, otuBySeqLookUp);
922         
923         int numOTUs = otuFreq.size();
924         
925         vector<double> weights(numOTUs, 0);
926         vector<int> change(numOTUs, 1);
927         vector<int> centroids(numOTUs, -1);
928         vector<int> cumCount(numOTUs, 0);
929         
930         vector<double> tau(numSeqs, 1);
931         vector<int> anP(numSeqs, 0);
932         vector<int> anI(numSeqs, 0);
933         vector<int> anN(numSeqs, 0);
934         vector<vector<int> > aanI = otuBySeqLookUp;
935         
936         int numIters = 0;
937         double maxDelta = 1e6;
938         
939         while(numIters < minIter || ((maxDelta > minDelta) && (numIters < maxIter))){
940                 
941                 updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
942                 maxDelta = calcNewWeights(weights, seqFreq, anI, cumCount, anP, otuFreq, tau);
943                 
944                 calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
945                 checkCentroids(weights, centroids);
946                 
947                 otuFreq.assign(numOTUs, 0);
948                 
949                 int total = 0;
950                 
951                 for(int i=0;i<numSeqs;i++){
952                         double offset = 1e6;
953                         double norm = 0.0000;
954                         double minWeight = MIN_WEIGHT;
955                         vector<double> currentTau(numOTUs);
956                         
957                         for(int j=0;j<numOTUs;j++){
958                                 if(weights[j] > minWeight && distances[i * numSeqs+centroids[j]] < offset){
959                                         offset = distances[i * numSeqs+centroids[j]];
960                                 }
961                         }
962                         
963                         for(int j=0;j<numOTUs;j++){
964                                 if(weights[j] > minWeight){
965                                         currentTau[j] = exp(sigma * (-distances[(i * numSeqs + centroids[j])] + offset)) * weights[j];
966                                         norm += currentTau[j];
967                                 }
968                                 else{
969                                         currentTau[j] = 0.0000;
970                                 }
971                         }                       
972                         
973                         for(int j=0;j<numOTUs;j++){
974                                 currentTau[j] /= norm;
975                         }
976                         
977                         for(int j=0;j<numOTUs;j++){
978                                 
979                                 if(currentTau[j] > MIN_TAU){
980                                         int oldTotal = total;
981                                         total++;
982                                         
983                                         tau.resize(oldTotal+1);
984                                         tau[oldTotal] = currentTau[j];
985                                         otuBySeqLookUp[j][otuFreq[j]] = oldTotal;
986                                         aanI[j][otuFreq[j]] = i;
987                                         otuFreq[j]++;
988                                         
989                                 }
990                         }
991                         
992                         anP.resize(total);
993                         anI.resize(total);
994                 }
995                 
996                 numIters++;
997         }
998         
999         updateOTUCountData(otuFreq, otuBySeqLookUp, aanI, anP, anI, cumCount);
1000         
1001         vector<double> percentage(numSeqs);
1002         setUpOTUData(otuData, percentage, cumCount, tau, otuFreq, anP, anI);
1003         finishOTUData(otuData, otuFreq, anP, anI, cumCount, otuBySeqLookUp, aanI, tau);
1004         
1005         change.assign(numOTUs, 1);
1006         calcCentroids(anI, anP, change, centroids, cumCount, distances, seqFreq, otuFreq, tau);
1007         
1008         
1009         vector<int> finalTau(numOTUs, 0);
1010         for(int i=0;i<numSeqs;i++){
1011                 finalTau[otuData[i]] += int(seqFreq[i]);
1012         }
1013         
1014         writeOutput(fileNameStub, finalTau, centroids, otuData, sequences, uniqueNames, redundantNames, seqFreq, distances);
1015         
1016         return 0;
1017 }
1018
1019 **************************************************************************************************/