]> git.donarmstrong.com Git - mothur.git/blob - chimeraslayer.cpp
1336876966f110dc25e0e18eee8bd13ec4e8d6fd
[mothur.git] / chimeraslayer.cpp
1 /*
2  *  chimeraslayer.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/25/09.
6  *  Copyright 2009 Pschloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeraslayer.h"
11 #include "chimerarealigner.h"
12 #include "kmerdb.hpp"
13 #include "blastdb.hpp"
14
15 //***************************************************************************************************************
16 ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div, 
17 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera()  {       
18         try {
19                 fastafile = file;
20                 templateFileName = temp; templateSeqs = readSeqs(temp);
21                 searchMethod = mode;
22                 kmerSize = k;
23                 match = ms;
24                 misMatch = mms;
25                 window = win;
26                 divR = div;
27                 minSim = minsim;
28                 minCov = mincov;
29                 minBS = minbs;
30                 minSNP = minsnp;
31                 parents = par;
32                 iters = it;
33                 increment = inc;
34                 numWanted = numw;
35                 realign = r; 
36         
37                 decalc = new DeCalculator();    
38                 
39                 doPrep();
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
43                 exit(1);
44         }
45 }
46 //***************************************************************************************************************
47 int ChimeraSlayer::doPrep() {
48         try {
49                 
50                 //read in all query seqs
51                 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
52                 
53                 vector<Sequence*> temp = templateSeqs;
54                 for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
55                 
56                 createFilter(temp, 0.0); //just removed columns where all seqs have a gap
57                 
58                 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
59                 
60                 if (m->control_pressed) {  return 0; } 
61                 
62                 //run filter on template
63                 for (int i = 0; i < templateSeqs.size(); i++) {  if (m->control_pressed) {  return 0; }  runFilter(templateSeqs[i]);  }
64                 
65                 string  kmerDBNameLeft;
66                 string  kmerDBNameRight;
67         
68                 //generate the kmerdb to pass to maligner
69                 if (searchMethod == "kmer") { 
70                         string templatePath = m->hasPath(templateFileName);
71                         string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
72                         databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
73                                 
74                         string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
75                         databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);      
76                 #ifdef USE_MPI
77                         for (int i = 0; i < templateSeqs.size(); i++) {
78                                         
79                                 if (m->control_pressed) { return 0; } 
80                                         
81                                 string leftFrag = templateSeqs[i]->getUnaligned();
82                                 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
83                                         
84                                 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
85                                 databaseLeft->addSequence(leftTemp);    
86                         }
87                         databaseLeft->generateDB();
88                         databaseLeft->setNumSeqs(templateSeqs.size());
89                         
90                         for (int i = 0; i < templateSeqs.size(); i++) {
91                                 if (m->control_pressed) { return 0; } 
92                                         
93                                 string rightFrag = templateSeqs[i]->getUnaligned();
94                                 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
95                                         
96                                 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
97                                 databaseRight->addSequence(rightTemp);  
98                         }
99                         databaseRight->generateDB();
100                         databaseRight->setNumSeqs(templateSeqs.size());
101
102                 #else   
103                         //leftside
104                         kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
105                         ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
106                         bool needToGenerateLeft = true;
107                         
108                         if(kmerFileTestLeft){   
109                                 bool GoodFile = m->checkReleaseVersion(kmerFileTestLeft, m->getVersion());
110                                 if (GoodFile) {  needToGenerateLeft = false;    }
111                         }
112                         
113                         if(needToGenerateLeft){ 
114                         
115                                 for (int i = 0; i < templateSeqs.size(); i++) {
116                                         
117                                         if (m->control_pressed) { return 0; } 
118                                         
119                                         string leftFrag = templateSeqs[i]->getUnaligned();
120                                         leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
121                                         
122                                         Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
123                                         databaseLeft->addSequence(leftTemp);    
124                                 }
125                                 databaseLeft->generateDB();
126                                 
127                         }else { 
128                                 databaseLeft->readKmerDB(kmerFileTestLeft);     
129                         }
130                         kmerFileTestLeft.close();
131                         
132                         databaseLeft->setNumSeqs(templateSeqs.size());
133                         
134                         //rightside
135                         kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
136                         ifstream kmerFileTestRight(kmerDBNameRight.c_str());
137                         bool needToGenerateRight = true;
138                         
139                         if(kmerFileTestRight){  
140                                 bool GoodFile = m->checkReleaseVersion(kmerFileTestRight, m->getVersion());
141                                 if (GoodFile) {  needToGenerateRight = false;   }
142                         }
143                         
144                         if(needToGenerateRight){        
145                         
146                                 for (int i = 0; i < templateSeqs.size(); i++) {
147                                         if (m->control_pressed) { return 0; } 
148                                         
149                                         string rightFrag = templateSeqs[i]->getUnaligned();
150                                         rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
151                                         
152                                         Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
153                                         databaseRight->addSequence(rightTemp);  
154                                 }
155                                 databaseRight->generateDB();
156                                 
157                         }else { 
158                                 databaseRight->readKmerDB(kmerFileTestRight);   
159                         }
160                         kmerFileTestRight.close();
161                         
162                         databaseRight->setNumSeqs(templateSeqs.size());
163                 #endif  
164                 }else if (searchMethod == "blast") {
165                 
166                         //generate blastdb
167                         databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
168                         for (int i = 0; i < templateSeqs.size(); i++) {         databaseLeft->addSequence(*templateSeqs[i]);    }
169                         databaseLeft->generateDB();
170                         databaseLeft->setNumSeqs(templateSeqs.size());
171                 }
172                 
173                 return 0;
174
175         }
176         catch(exception& e) {
177                 m->errorOut(e, "ChimeraSlayer", "doprep");
178                 exit(1);
179         }
180 }
181 //***************************************************************************************************************
182 ChimeraSlayer::~ChimeraSlayer() {       
183         delete decalc;  
184         if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }   
185         else if (searchMethod == "blast") {  delete databaseLeft; }
186 }
187 //***************************************************************************************************************
188 void ChimeraSlayer::printHeader(ostream& out) {
189         m->mothurOutEndLine();
190         m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
191         m->mothurOutEndLine();
192         
193         out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
194 }
195 //***************************************************************************************************************
196 int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
197         try {
198                 if (chimeraFlags == "yes") {
199                         string chimeraFlag = "no";
200                         if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
201                            ||
202                            (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
203                         
204                         
205                         if (chimeraFlag == "yes") {     
206                                 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
207                                         m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
208                                         outAcc << querySeq->getName() << endl;
209                                 }
210                         }
211                         
212                         printBlock(chimeraResults[0], chimeraFlag, out);
213                         out << endl;
214                 }else {  out << querySeq->getName() << "\tno" << endl;  }
215                 
216                 return 0;
217                 
218         }
219         catch(exception& e) {
220                 m->errorOut(e, "ChimeraSlayer", "print");
221                 exit(1);
222         }
223 }
224 #ifdef USE_MPI
225 //***************************************************************************************************************
226 int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
227         try {
228                 MPI_Status status;
229                 bool results = false;
230                 string outAccString = "";
231                 string outputString = "";
232                 
233                 if (chimeraFlags == "yes") {
234                         string chimeraFlag = "no";
235                         if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
236                            ||
237                            (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
238                         
239                         
240                         if (chimeraFlag == "yes") {     
241                                 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
242                                         cout << querySeq->getName() <<  "\tyes" << endl;
243                                         outAccString += querySeq->getName() + "\n";
244                                         results = true;
245                                         
246                                         //write to accnos file
247                                         int length = outAccString.length();
248                                         char* buf2 = new char[length];
249                                         memcpy(buf2, outAccString.c_str(), length);
250                                 
251                                         MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status);
252                                         delete buf2;
253                                 }
254                         }
255                         
256                         outputString = getBlock(chimeraResults[0], chimeraFlag);
257                         outputString += "\n";
258         //cout << outputString << endl;         
259                         //write to output file
260                         int length = outputString.length();
261                         char* buf = new char[length];
262                         memcpy(buf, outputString.c_str(), length);
263                                 
264                         MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
265                         delete buf;
266
267                 }else {  
268                         outputString += querySeq->getName() + "\tno\n";  
269         //cout << outputString << endl;
270                         //write to output file
271                         int length = outputString.length();
272                         char* buf = new char[length];
273                         memcpy(buf, outputString.c_str(), length);
274                                 
275                         MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
276                         delete buf;
277                 }
278                 
279                 
280                 return results;
281         }
282         catch(exception& e) {
283                 m->errorOut(e, "ChimeraSlayer", "print");
284                 exit(1);
285         }
286 }
287 #endif
288
289 //***************************************************************************************************************
290 int ChimeraSlayer::getChimeras(Sequence* query) {
291         try {
292                 chimeraFlags = "no";
293
294                 //filter query
295                 spotMap = runFilter(query);     
296                 
297                 querySeq = query;
298                 
299                 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
300                 Maligner maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
301                 Slayer slayer(window, increment, minSim, divR, iters, minSNP);
302         
303                 if (m->control_pressed) {  return 0;  }
304                 
305                 string chimeraFlag = maligner.getResults(query, decalc);
306                 if (m->control_pressed) {  return 0;  }
307                 vector<results> Results = maligner.getOutput();
308                         
309                 //found in testing realigning only made things worse
310                 if (realign) {
311                         ChimeraReAligner realigner(templateSeqs, match, misMatch);
312                         realigner.reAlign(query, Results);
313                 }
314
315                 if (chimeraFlag == "yes") {
316                         
317                         //get sequence that were given from maligner results
318                         vector<SeqDist> seqs;
319                         map<string, float> removeDups;
320                         map<string, float>::iterator itDup;
321                         map<string, string> parentNameSeq;
322                         map<string, string>::iterator itSeq;
323                         for (int j = 0; j < Results.size(); j++) {
324                                 float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
325                                 //only add if you are not a duplicate
326                                 itDup = removeDups.find(Results[j].parent);
327                                 if (itDup == removeDups.end()) { //this is not duplicate
328                                         removeDups[Results[j].parent] = dist;
329                                         parentNameSeq[Results[j].parent] = Results[j].parentAligned;
330                                 }else if (dist > itDup->second) { //is this a stronger number for this parent
331                                         removeDups[Results[j].parent] = dist;
332                                         parentNameSeq[Results[j].parent] = Results[j].parentAligned;
333                                 }
334                         }
335                         
336                         for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
337                                 //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
338                                 itSeq = parentNameSeq.find(itDup->first);
339 //cout << itDup->first << itSeq->second << endl;
340                                 Sequence* seq = new Sequence(itDup->first, itSeq->second);
341                                 
342                                 SeqDist member;
343                                 member.seq = seq;
344                                 member.dist = itDup->second;
345                                 
346                                 seqs.push_back(member);
347                         }
348                         
349                         //limit number of parents to explore - default 3
350                         if (Results.size() > parents) {
351                                 //sort by distance
352                                 sort(seqs.begin(), seqs.end(), compareSeqDist);
353                                 //prioritize larger more similiar sequence fragments
354                                 reverse(seqs.begin(), seqs.end());
355                                 
356                                 for (int k = seqs.size()-1; k > (parents-1); k--)  {  
357                                         delete seqs[k].seq;
358                                         seqs.pop_back();        
359                                 }
360                         }
361                         
362                         //put seqs into vector to send to slayer
363                         vector<Sequence*> seqsForSlayer;
364                         for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);  }
365                         
366                         //mask then send to slayer...
367                         if (seqMask != "") {
368                                 decalc->setMask(seqMask);
369                                 
370                                 //mask querys
371                                 decalc->runMask(query);
372                                 
373                                 //mask parents
374                                 for (int k = 0; k < seqsForSlayer.size(); k++) {
375                                         decalc->runMask(seqsForSlayer[k]);
376                                 }
377                                 
378                                 spotMap = decalc->getMaskMap();
379                         }
380                         
381                         if (m->control_pressed) {  for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }  return 0;  }
382                         
383                         //send to slayer
384                         chimeraFlags = slayer.getResults(query, seqsForSlayer);
385                         if (m->control_pressed) {  return 0;  }
386                         chimeraResults = slayer.getOutput();
387                         
388                         //free memory
389                         for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
390                 }
391                 
392                 //delete maligner;
393                 //delete slayer;
394                 
395                 return 0;
396         }
397         catch(exception& e) {
398                 m->errorOut(e, "ChimeraSlayer", "getChimeras");
399                 exit(1);
400         }
401 }
402 //***************************************************************************************************************
403 void ChimeraSlayer::printBlock(data_struct data, string flag, ostream& out){
404         try {
405         //out << ":)\n";
406                 
407                 out << querySeq->getName() << '\t';
408                 out << data.parentA.getName() << "\t" << data.parentB.getName()  << '\t';
409                 //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
410                 //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
411                 
412                 out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
413                 out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
414                 
415                 out << flag << '\t' << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
416                 
417                 //out << "Similarity of parents: " << data.ab << endl;
418                 //out << "Similarity of query to parentA: " << data.qa << endl;
419                 //out << "Similarity of query to parentB: " << data.qb << endl;
420                 
421                 
422                 //out << "Per_id(QL,A): " << data.qla << endl;
423                 //out << "Per_id(QL,B): " << data.qlb << endl;
424                 //out << "Per_id(QR,A): " << data.qra << endl;
425                 //out << "Per_id(QR,B): " << data.qrb << endl;
426
427                 
428                 //out << "DeltaL: " << (data.qla - data.qlb) << endl;
429                 //out << "DeltaR: " << (data.qra - data.qrb) << endl;
430
431         }
432         catch(exception& e) {
433                 m->errorOut(e, "ChimeraSlayer", "printBlock");
434                 exit(1);
435         }
436 }
437 //***************************************************************************************************************
438 string ChimeraSlayer::getBlock(data_struct data, string flag){
439         try {
440                 
441                 string outputString = "";
442                 
443                 outputString += querySeq->getName() + "\t";
444                 outputString += data.parentA.getName() + "\t" + data.parentB.getName()  + "\t";
445                         
446                 outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t";
447                 outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t";
448                 
449                 outputString += flag + "\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
450                 
451                 return outputString;
452         }
453         catch(exception& e) {
454                 m->errorOut(e, "ChimeraSlayer", "getBlock");
455                 exit(1);
456         }
457 }
458 //***************************************************************************************************************/
459