]> git.donarmstrong.com Git - mothur.git/blob - ccode.cpp
added more descriptive error messages to sharedlist
[mothur.git] / ccode.cpp
1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14
15 //***************************************************************************************************************
16 Ccode::Ccode(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
17 //***************************************************************************************************************
18
19 Ccode::~Ccode() {
20         try {
21                 for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
22                 for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
23                 delete distCalc;
24         }
25         catch(exception& e) {
26                 errorOut(e, "Ccode", "~Ccode");
27                 exit(1);
28         }
29 }       
30 //***************************************************************************************************************
31 void Ccode::print(ostream& out) {
32         try {
33                 
34                 mothurOutEndLine();
35                 
36                 for (int i = 0; i < querySeqs.size(); i++) {
37                         
38                 }
39         }
40         catch(exception& e) {
41                 errorOut(e, "Ccode", "print");
42                 exit(1);
43         }
44 }
45
46 //***************************************************************************************************************
47 void Ccode::getChimeras() {
48         try {
49                 
50                 //read in query sequences and subject sequences
51                 mothurOut("Reading sequences and template file... "); cout.flush();
52                 querySeqs = readSeqs(fastafile);
53                 templateSeqs = readSeqs(templateFile);
54                 mothurOut("Done."); mothurOutEndLine();
55                 
56                 int numSeqs = querySeqs.size();
57                 
58                 closest.resize(numSeqs);
59                 
60                 //break up file if needed
61                 int linesPerProcess = numSeqs / processors ;
62                 
63                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
64                         //find breakup of sequences for all times we will Parallelize
65                         if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
66                         else {
67                                 //fill line pairs
68                                 for (int i = 0; i < (processors-1); i++) {                      
69                                         lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
70                                 }
71                                 //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
72                                 int i = processors - 1;
73                                 lines.push_back(new linePair((i*linesPerProcess), numSeqs));
74                         }
75                         
76                         //find breakup of templatefile for quantiles
77                         if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
78                         else { 
79                                 for (int i = 0; i < processors; i++) {
80                                         templateLines.push_back(new linePair());
81                                         templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
82                                         templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
83                                 }
84                         }
85                 #else
86                         lines.push_back(new linePair(0, numSeqs));
87                         templateLines.push_back(new linePair(0, templateSeqs.size()));
88                 #endif
89         
90                 distCalc = new eachGapDist();
91                 decalc = new DeCalculator();
92                 
93                 //find closest
94                 if (processors == 1) { 
95                         mothurOut("Finding top matches for sequences... "); cout.flush();
96                         closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
97                         mothurOut("Done."); mothurOutEndLine();
98                 }else {         createProcessesClosest();               }
99
100                 
101 for (int i = 0; i < closest.size(); i++) {
102         cout << querySeqs[i]->getName() << ": ";
103         for (int j = 0; j < closest[i].size(); j++) {
104                         
105                 cout << closest[i][j]->getName() << '\t';
106         }
107         cout << endl;
108 }       
109
110                 //mask sequences if the user wants to 
111                 if (seqMask != "") {
112                         //mask querys
113                         for (int i = 0; i < querySeqs.size(); i++) {
114                                 decalc->runMask(querySeqs[i]);
115                         }
116                 
117                         //mask templates
118                         for (int i = 0; i < templateSeqs.size(); i++) {
119                                 decalc->runMask(templateSeqs[i]);
120                         }
121                 }
122                 
123                 if (filter) {
124                         vector<Sequence*> temp = templateSeqs;
125                         for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]);  }
126                         
127                         createFilter(temp);
128                         
129                         runFilter(querySeqs);
130                         runFilter(templateSeqs);
131                 }
132
133                 //trim sequences - this follows ccodes remove_extra_gaps 
134                 //just need to pass it query and template since closest points to template
135                 trimSequences();
136                 
137                 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().  
138                 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
139                 windows = findWindows();  
140                 
141                 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later
142                 for (int i = 0; i < closest.size(); i++) {
143                         removeBadReferenceSeqs(closest[i], i);
144                 }
145                         
146                                         
147                 //free memory
148                 for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
149                 for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
150                         
151         }
152         catch(exception& e) {
153                 errorOut(e, "Ccode", "getChimeras");
154                 exit(1);
155         }
156 }
157 /***************************************************************************************************************/
158 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
159 void Ccode::trimSequences() {
160         try {
161                 
162                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
163                 int rearPos = querySeqs[0]->getAligned().length();
164                 
165                 //********find first position in all seqs that is a non gap character***********//
166                 //find first position all query seqs that is a non gap character
167                 for (int i = 0; i < querySeqs.size(); i++) {
168                         
169                         string aligned = querySeqs[i]->getAligned();
170                         int pos = 0;
171                         
172                         //find first spot in this seq
173                         for (int j = 0; j < aligned.length(); j++) {
174                                 if (isalpha(aligned[j])) {
175                                         pos = j;
176                                         break;
177                                 }
178                         }
179                         
180                         //save this spot if it is the farthest
181                         if (pos > frontPos) { frontPos = pos; }
182                 }
183                 
184                 //find first position all template seqs that is a non gap character
185                 for (int i = 0; i < templateSeqs.size(); i++) {
186                         
187                         string aligned = templateSeqs[i]->getAligned();
188                         int pos = 0;
189                         
190                         //find first spot in this seq
191                         for (int j = 0; j < aligned.length(); j++) {
192                                 if (isalpha(aligned[j])) {
193                                         pos = j;
194                                         break;
195                                 }
196                         }
197                         
198                         //save this spot if it is the farthest
199                         if (pos > frontPos) { frontPos = pos; }
200                 }
201
202                 
203                 //********find last position in all seqs that is a non gap character***********//
204                 //find last position all query seqs that is a non gap character
205                 for (int i = 0; i < querySeqs.size(); i++) {
206                         
207                         string aligned = querySeqs[i]->getAligned();
208                         int pos = aligned.length();
209                         
210                         //find first spot in this seq
211                         for (int j = aligned.length()-1; j >= 0; j--) {
212                                 if (isalpha(aligned[j])) {
213                                         pos = j;
214                                         break;
215                                 }
216                         }
217                         
218                         //save this spot if it is the farthest
219                         if (pos < rearPos) { rearPos = pos; }
220                 }
221                 
222                 //find last position all template seqs that is a non gap character
223                 for (int i = 0; i < templateSeqs.size(); i++) {
224                         
225                         string aligned = templateSeqs[i]->getAligned();
226                         int pos = aligned.length();
227                         
228                         //find first spot in this seq
229                         for (int j = aligned.length()-1; j >= 0; j--) {
230                                 if (isalpha(aligned[j])) {
231                                         pos = j;
232                                         break;
233                                 }
234                         }
235                         
236                         //save this spot if it is the farthest
237                         if (pos < rearPos) { rearPos = pos; }
238                 }
239
240                 
241                 //check to make sure that is not whole seq
242                 if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
243                 
244                 //***********trim all seqs to that position***************//
245                 for (int i = 0; i < querySeqs.size(); i++) {
246                         
247                         string aligned = querySeqs[i]->getAligned();
248                         
249                         //between the two points
250                         aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
251                         
252                         querySeqs[i]->setAligned(aligned);
253                 }
254                 
255                 for (int i = 0; i < templateSeqs.size(); i++) {
256                         
257                         string aligned = templateSeqs[i]->getAligned();
258                         
259                         //between the two points
260                         aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
261                         
262                         templateSeqs[i]->setAligned(aligned);
263                 }
264         
265         }
266         catch(exception& e) {
267                 errorOut(e, "Ccode", "trimSequences");
268                 exit(1);
269         }
270
271 }
272 /***************************************************************************************************************/
273 vector<int> Ccode::findWindows() {
274         try {
275                 
276                 vector<int> win; 
277                 int length = querySeqs[0]->getAligned().length();
278                 
279                 //default is wanted = 10% of total length
280                 if (window > length) { 
281                         mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
282                         window = length / 10;
283                 }else if (window == 0) { window = length / 10;  }
284                 else if (window > (length / 20)) {
285                         mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
286                 }else if (window < (length / 5)) {
287                         mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway."); mothurOutEndLine();
288                 }
289                 
290                 //save starting points of each window
291                 for (int m = 0;  m < (length-window); m+=window) {  win.push_back(m);  }
292
293                 return win;
294         
295         }
296         catch(exception& e) {
297                 errorOut(e, "Ccode", "findWindows");
298                 exit(1);
299         }
300 }
301 //***************************************************************************************************************
302 int Ccode::getDiff(string seqA, string seqB) {
303         try {
304                 
305                 int numDiff = 0;
306                 
307                 for (int i = 0; i < seqA.length(); i++) {
308                         //if you are both not gaps
309                         //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
310                                 //are you different
311                                 if (seqA[i] != seqB[i]) { numDiff++; }
312                         //}
313                 }
314                 
315                 return numDiff;
316         
317         }
318         catch(exception& e) {
319                 errorOut(e, "Ccode", "getDiff");
320                 exit(1);
321         }
322 }
323 //***************************************************************************************************************
324 //tried to make this look most like ccode original implementation
325 void Ccode::removeBadReferenceSeqs(vector<Sequence*>& seqs, int query) {
326         try {
327                 
328                 vector< vector<int> > numDiffBases;
329                 numDiffBases.resize(seqs.size());
330                 //initialize to 0
331                 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
332                 
333                 int length = seqs[0]->getAligned().length();
334                 
335                 //calc differences from each sequence to everyother seq in the set
336                 for (int i = 0; i < seqs.size(); i++) {
337                         
338                         string seqA = seqs[i]->getAligned();
339                         
340                         //so you don't calc i to j and j to i since they are the same
341                         for (int j = 0; j < i; j++) {
342                                 
343                                 string seqB = seqs[j]->getAligned();
344                                 
345                                 //compare strings
346                                 int numDiff = getDiff(seqA, seqB);
347                                 
348                                 numDiffBases[i][j] = numDiff;
349                                 numDiffBases[j][i] = numDiff;
350                         }
351                 }
352                 
353                 //initailize remove to 0
354                 vector<int> remove;  remove.resize(seqs.size(), 0);
355                 
356                 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
357                 for (int i = 0; i < numDiffBases.size(); i++) {
358                         for (int j = 0; j < numDiffBases[i].size(); j++) {
359                                 
360                                 //are you more than 20% different
361                                 if (numDiffBases[i][j] > ((20*length) / 100))           {  remove[j] = 1;  }
362                                 //are you less than 0.5% different
363                                 if (numDiffBases[i][j] < ((0.5*length) / 100))  {  remove[j] = 1;  }
364                         }
365                 }
366                 
367                 int numSeqsLeft = 0;
368                 
369                 //count seqs that are not going to be removed
370                 for (int i = 0; i < remove.size(); i++) {  
371                         if (remove[i] == 0)  { numSeqsLeft++;  }
372                 }
373                 
374                 //if you have enough then remove bad ones
375                 if (numSeqsLeft >= 3) {
376                         vector<Sequence*> goodSeqs;
377                         //remove bad seqs
378                         for (int i = 0; i < remove.size(); i++) {
379                                 if (remove[i] == 0) { 
380                                         goodSeqs.push_back(seqs[i]);
381                                 }
382                         }
383                         
384                         seqs = goodSeqs;
385                         
386                 }else { //warn, but dont remove any
387                         mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check."); mothurOutEndLine();  
388                 }
389                         
390
391         }
392         catch(exception& e) {
393                 errorOut(e, "Ccode", "removeBadReferenceSeqs");
394                 exit(1);
395         }
396 }
397 //***************************************************************************************************************
398 vector< vector<Sequence*> > Ccode::findClosest(int start, int end, int numWanted) {
399         try{
400         
401                 vector< vector<Sequence*> > topMatches;  topMatches.resize(querySeqs.size());
402         
403                 float smallestOverall, smallestLeft, smallestRight;
404                 smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
405                 
406                 //for each sequence in querySeqs - find top matches to use as reference
407                 for(int j = start; j < end; j++){
408                         
409                         Sequence query = *(querySeqs[j]);
410                         
411                         vector<SeqDist> distances;
412                         
413                         //calc distance to each sequence in template seqs
414                         for (int i = 0; i < templateSeqs.size(); i++) {
415                         
416                                 Sequence ref = *(templateSeqs[i]); 
417                                         
418                                 //find overall dist
419                                 distCalc->calcDist(query, ref);
420                                 float dist = distCalc->getDist();       
421                                 
422                                 //save distance
423                                 SeqDist temp;
424                                 temp.seq = templateSeqs[i];
425                                 temp.dist = dist;
426
427                                 distances.push_back(temp);
428                         }
429                         
430                         sort(distances.begin(), distances.end(), compareSeqDist);
431                         
432                         //save the number of top matches wanted
433                         for (int h = 0; h < numWanted; h++) {
434                                 topMatches[j].push_back(distances[h].seq);
435                         }
436                 }
437                         
438                 return topMatches;
439
440         }
441         catch(exception& e) {
442                 errorOut(e, "Ccode", "findClosestSides");
443                 exit(1);
444         }
445 }
446 /**************************************************************************************************/
447 vector<float> Ccode::getAverageRef(vector<Sequence*> ref) {
448         try {
449         }
450         catch(exception& e) {
451                 errorOut(e, "Ccode", "getAverageRef");
452                 exit(1);
453         }
454 }
455 /**************************************************************************************************/
456 vector<float> Ccode::getAverageQuery (vector<Sequence*> ref, int query) {
457         try {
458         
459         
460         }
461         catch(exception& e) {
462                 errorOut(e, "Ccode", "getAverageQuery");
463                 exit(1);
464         }
465 }
466 /**************************************************************************************************/
467 void Ccode::createProcessesClosest() {
468         try {
469 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
470                 int process = 0;
471                 vector<int> processIDS;
472                 
473                 //loop through and create all the processes you want
474                 while (process != processors) {
475                         int pid = fork();
476                         
477                         if (pid > 0) {
478                                 processIDS.push_back(pid);  
479                                 process++;
480                         }else if (pid == 0){
481                                 
482                                 mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
483                                 closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
484                                 mothurOut("Done finding top matches for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
485                                 
486                                 //write out data to file so parent can read it
487                                 ofstream out;
488                                 string s = toString(getpid()) + ".temp";
489                                 openOutputFile(s, out);
490                                 
491                                 //output pairs
492                                 for (int i = lines[process]->start; i < lines[process]->end; i++) {
493                                          for (int j = 0; j < closest[i].size(); j++) {
494                                                 closest[i][j]->printSequence(out);
495                                          }
496                                 }
497                                 out.close();
498                                 
499                                 exit(0);
500                         }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
501                 }
502                 
503                 //force parent to wait until all the processes are done
504                 for (int i=0;i<processors;i++) { 
505                         int temp = processIDS[i];
506                         wait(&temp);
507                 }
508                 
509                 //get data created by processes
510                 for (int i=0;i<processors;i++) { 
511                         ifstream in;
512                         string s = toString(processIDS[i]) + ".temp";
513                         openInputFile(s, in);
514                         
515                         //get pairs
516                         for (int k = lines[i]->start; k < lines[i]->end; k++) {
517                                 
518                                 vector<Sequence*> tempVector;
519                                 
520                                 for (int j = 0; j < numWanted; j++) {
521                                 
522                                         Sequence* temp = new Sequence(in);
523                                         gobble(in);
524                                                 
525                                         tempVector.push_back(temp);
526                                 }
527                                 
528                                 closest[k] = tempVector;
529                         }
530                         
531                         in.close();
532                         remove(s.c_str());
533                 }
534                         
535         
536 #else
537                 closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
538 #endif  
539
540         }
541         catch(exception& e) {
542                 errorOut(e, "Ccode", "createProcessesClosest");
543                 exit(1);
544         }
545 }
546
547 //***************************************************************************************************************
548