]> git.donarmstrong.com Git - mothur.git/blob - chimeraslayer.cpp
ff13590cd3a91cb8960e26af803ca7fa63918035
[mothur.git] / chimeraslayer.cpp
1 /*
2  *  chimeraslayer.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/25/09.
6  *  Copyright 2009 Pschloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeraslayer.h"
11 #include "chimerarealigner.h"
12 #include "kmerdb.hpp"
13 #include "blastdb.hpp"
14
15 //***************************************************************************************************************
16 ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div, 
17 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera()  {       
18         try {
19                 fastafile = file;
20                 templateFileName = temp; templateSeqs = readSeqs(temp);
21                 searchMethod = mode;
22                 kmerSize = k;
23                 match = ms;
24                 misMatch = mms;
25                 window = win;
26                 divR = div;
27                 minSim = minsim;
28                 minCov = mincov;
29                 minBS = minbs;
30                 minSNP = minsnp;
31                 parents = par;
32                 iters = it;
33                 increment = inc;
34                 numWanted = numw;
35                 realign = r; 
36         
37                 decalc = new DeCalculator();    
38                 
39                 doPrep();
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
43                 exit(1);
44         }
45 }
46 //***************************************************************************************************************
47 int ChimeraSlayer::doPrep() {
48         try {
49                 
50                 //read in all query seqs
51                 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
52                 
53                 vector<Sequence*> temp = templateSeqs;
54                 for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
55                 
56                 createFilter(temp, 0.0); //just removed columns where all seqs have a gap
57                 
58                 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
59                 
60                 if (m->control_pressed) {  return 0; } 
61                 
62                 //run filter on template
63                 for (int i = 0; i < templateSeqs.size(); i++) {  if (m->control_pressed) {  return 0; }  runFilter(templateSeqs[i]);  }
64                 
65                 string  kmerDBNameLeft;
66                 string  kmerDBNameRight;
67         
68                 //generate the kmerdb to pass to maligner
69                 if (searchMethod == "kmer") { 
70                         string rightTemplateFileName = "right." + templateFileName;
71                         databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
72                                 
73                         string leftTemplateFileName = "left." + templateFileName;
74                         databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);      
75                 #ifdef USE_MPI
76                         for (int i = 0; i < templateSeqs.size(); i++) {
77                                         
78                                 if (m->control_pressed) { return 0; } 
79                                         
80                                 string leftFrag = templateSeqs[i]->getUnaligned();
81                                 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
82                                         
83                                 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
84                                 databaseLeft->addSequence(leftTemp);    
85                         }
86                         databaseLeft->generateDB();
87                         databaseLeft->setNumSeqs(templateSeqs.size());
88                         
89                         for (int i = 0; i < templateSeqs.size(); i++) {
90                                 if (m->control_pressed) { return 0; } 
91                                         
92                                 string rightFrag = templateSeqs[i]->getUnaligned();
93                                 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
94                                         
95                                 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
96                                 databaseRight->addSequence(rightTemp);  
97                         }
98                         databaseRight->generateDB();
99                         databaseRight->setNumSeqs(templateSeqs.size());
100
101                 #else   
102                         //leftside
103                         kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
104                         ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
105                         bool needToGenerateLeft = true;
106                         
107                         if(kmerFileTestLeft){   
108                                 bool GoodFile = checkReleaseVersion(kmerFileTestLeft, m->getVersion());
109                                 if (GoodFile) {  needToGenerateLeft = false;    }
110                         }
111                         
112                         if(needToGenerateLeft){ 
113                         
114                                 for (int i = 0; i < templateSeqs.size(); i++) {
115                                         
116                                         if (m->control_pressed) { return 0; } 
117                                         
118                                         string leftFrag = templateSeqs[i]->getUnaligned();
119                                         leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
120                                         
121                                         Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
122                                         databaseLeft->addSequence(leftTemp);    
123                                 }
124                                 databaseLeft->generateDB();
125                                 
126                         }else { 
127                                 databaseLeft->readKmerDB(kmerFileTestLeft);     
128                         }
129                         kmerFileTestLeft.close();
130                         
131                         databaseLeft->setNumSeqs(templateSeqs.size());
132                         
133                         //rightside
134                         kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
135                         ifstream kmerFileTestRight(kmerDBNameRight.c_str());
136                         bool needToGenerateRight = true;
137                         
138                         if(kmerFileTestRight){  
139                                 bool GoodFile = checkReleaseVersion(kmerFileTestRight, m->getVersion());
140                                 if (GoodFile) {  needToGenerateRight = false;   }
141                         }
142                         
143                         if(needToGenerateRight){        
144                         
145                                 for (int i = 0; i < templateSeqs.size(); i++) {
146                                         if (m->control_pressed) { return 0; } 
147                                         
148                                         string rightFrag = templateSeqs[i]->getUnaligned();
149                                         rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
150                                         
151                                         Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
152                                         databaseRight->addSequence(rightTemp);  
153                                 }
154                                 databaseRight->generateDB();
155                                 
156                         }else { 
157                                 databaseRight->readKmerDB(kmerFileTestRight);   
158                         }
159                         kmerFileTestRight.close();
160                         
161                         databaseRight->setNumSeqs(templateSeqs.size());
162                 #endif  
163                 }else if (searchMethod == "blast") {
164                 
165                         //generate blastdb
166                         databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
167                         for (int i = 0; i < templateSeqs.size(); i++) {         databaseLeft->addSequence(*templateSeqs[i]);    }
168                         databaseLeft->generateDB();
169                         databaseLeft->setNumSeqs(templateSeqs.size());
170                 }
171                 
172                 return 0;
173
174         }
175         catch(exception& e) {
176                 m->errorOut(e, "ChimeraSlayer", "doprep");
177                 exit(1);
178         }
179 }
180 //***************************************************************************************************************
181 ChimeraSlayer::~ChimeraSlayer() {       
182         delete decalc;  
183         if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }   
184         else if (searchMethod == "blast") {  delete databaseLeft; }
185 }
186 //***************************************************************************************************************
187 void ChimeraSlayer::printHeader(ostream& out) {
188         m->mothurOutEndLine();
189         m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
190         m->mothurOutEndLine();
191         
192         out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
193 }
194 //***************************************************************************************************************
195 int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
196         try {
197                 if (chimeraFlags == "yes") {
198                         string chimeraFlag = "no";
199                         if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
200                            ||
201                            (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
202                         
203                         
204                         if (chimeraFlag == "yes") {     
205                                 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
206                                         m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
207                                         outAcc << querySeq->getName() << endl;
208                                 }
209                         }
210                         
211                         printBlock(chimeraResults[0], chimeraFlag, out);
212                         out << endl;
213                 }else {  out << querySeq->getName() << "\tno" << endl;  }
214                 
215                 return 0;
216                 
217         }
218         catch(exception& e) {
219                 m->errorOut(e, "ChimeraSlayer", "print");
220                 exit(1);
221         }
222 }
223 #ifdef USE_MPI
224 //***************************************************************************************************************
225 int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
226         try {
227                 MPI_Status status;
228                 bool results = false;
229                 string outAccString = "";
230                 string outputString = "";
231                 
232                 if (chimeraFlags == "yes") {
233                         string chimeraFlag = "no";
234                         if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
235                            ||
236                            (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
237                         
238                         
239                         if (chimeraFlag == "yes") {     
240                                 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
241                                         cout << querySeq->getName() <<  "\tyes" << endl;
242                                         outAccString += querySeq->getName() + "\n";
243                                         results = true;
244                                         
245                                         //write to accnos file
246                                         int length = outAccString.length();
247                                         char* buf2 = new char[length];
248                                         memcpy(buf2, outAccString.c_str(), length);
249                                 
250                                         MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status);
251                                         delete buf2;
252                                 }
253                         }
254                         
255                         outputString = getBlock(chimeraResults[0], chimeraFlag);
256                         outputString += "\n";
257         //cout << outputString << endl;         
258                         //write to output file
259                         int length = outputString.length();
260                         char* buf = new char[length];
261                         memcpy(buf, outputString.c_str(), length);
262                                 
263                         MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
264                         delete buf;
265
266                 }else {  
267                         outputString += querySeq->getName() + "\tno\n";  
268         //cout << outputString << endl;
269                         //write to output file
270                         int length = outputString.length();
271                         char* buf = new char[length];
272                         memcpy(buf, outputString.c_str(), length);
273                                 
274                         MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
275                         delete buf;
276                 }
277                 
278                 
279                 return results;
280         }
281         catch(exception& e) {
282                 m->errorOut(e, "ChimeraSlayer", "print");
283                 exit(1);
284         }
285 }
286 #endif
287
288 //***************************************************************************************************************
289 int ChimeraSlayer::getChimeras(Sequence* query) {
290         try {
291                 chimeraFlags = "no";
292
293                 //filter query
294                 spotMap = runFilter(query);     
295                 
296                 querySeq = query;
297                 
298                 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
299                 maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
300                 slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
301         
302                 if (m->control_pressed) {  return 0;  }
303                 
304                 string chimeraFlag = maligner->getResults(query, decalc);
305                 if (m->control_pressed) {  return 0;  }
306                 vector<results> Results = maligner->getOutput();
307                         
308                 //found in testing realigning only made things worse
309                 if (realign) {
310                         ChimeraReAligner realigner(templateSeqs, match, misMatch);
311                         realigner.reAlign(query, Results);
312                 }
313
314                 if (chimeraFlag == "yes") {
315                         
316                         //get sequence that were given from maligner results
317                         vector<SeqDist> seqs;
318                         map<string, float> removeDups;
319                         map<string, float>::iterator itDup;
320                         map<string, string> parentNameSeq;
321                         map<string, string>::iterator itSeq;
322                         for (int j = 0; j < Results.size(); j++) {
323                                 float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
324                                 //only add if you are not a duplicate
325                                 itDup = removeDups.find(Results[j].parent);
326                                 if (itDup == removeDups.end()) { //this is not duplicate
327                                         removeDups[Results[j].parent] = dist;
328                                         parentNameSeq[Results[j].parent] = Results[j].parentAligned;
329                                 }else if (dist > itDup->second) { //is this a stronger number for this parent
330                                         removeDups[Results[j].parent] = dist;
331                                         parentNameSeq[Results[j].parent] = Results[j].parentAligned;
332                                 }
333                         }
334                         
335                         for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
336                                 //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
337                                 itSeq = parentNameSeq.find(itDup->first);
338 //cout << itDup->first << itSeq->second << endl;
339                                 Sequence* seq = new Sequence(itDup->first, itSeq->second);
340                                 
341                                 SeqDist member;
342                                 member.seq = seq;
343                                 member.dist = itDup->second;
344                                 
345                                 seqs.push_back(member);
346                         }
347                         
348                         //limit number of parents to explore - default 3
349                         if (Results.size() > parents) {
350                                 //sort by distance
351                                 sort(seqs.begin(), seqs.end(), compareSeqDist);
352                                 //prioritize larger more similiar sequence fragments
353                                 reverse(seqs.begin(), seqs.end());
354                                 
355                                 for (int k = seqs.size()-1; k > (parents-1); k--)  {  
356                                         delete seqs[k].seq;
357                                         seqs.pop_back();        
358                                 }
359                         }
360                         
361                         //put seqs into vector to send to slayer
362                         vector<Sequence*> seqsForSlayer;
363                         for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);  }
364                         
365                         //mask then send to slayer...
366                         if (seqMask != "") {
367                                 decalc->setMask(seqMask);
368                                 
369                                 //mask querys
370                                 decalc->runMask(query);
371                                 
372                                 //mask parents
373                                 for (int k = 0; k < seqsForSlayer.size(); k++) {
374                                         decalc->runMask(seqsForSlayer[k]);
375                                 }
376                                 
377                                 spotMap = decalc->getMaskMap();
378                         }
379                         
380                         if (m->control_pressed) {  for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }  return 0;  }
381                         
382                         //send to slayer
383                         chimeraFlags = slayer->getResults(query, seqsForSlayer);
384                         if (m->control_pressed) {  return 0;  }
385                         chimeraResults = slayer->getOutput();
386                         
387                         //free memory
388                         for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
389                 }
390                 
391                 delete maligner;
392                 delete slayer;
393                 
394                 return 0;
395         }
396         catch(exception& e) {
397                 m->errorOut(e, "ChimeraSlayer", "getChimeras");
398                 exit(1);
399         }
400 }
401 //***************************************************************************************************************
402 void ChimeraSlayer::printBlock(data_struct data, string flag, ostream& out){
403         try {
404         //out << ":)\n";
405                 
406                 out << querySeq->getName() << '\t';
407                 out << data.parentA.getName() << "\t" << data.parentB.getName()  << '\t';
408                 //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
409                 //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
410                 
411                 out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
412                 out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
413                 
414                 out << flag << '\t' << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
415                 
416                 //out << "Similarity of parents: " << data.ab << endl;
417                 //out << "Similarity of query to parentA: " << data.qa << endl;
418                 //out << "Similarity of query to parentB: " << data.qb << endl;
419                 
420                 
421                 //out << "Per_id(QL,A): " << data.qla << endl;
422                 //out << "Per_id(QL,B): " << data.qlb << endl;
423                 //out << "Per_id(QR,A): " << data.qra << endl;
424                 //out << "Per_id(QR,B): " << data.qrb << endl;
425
426                 
427                 //out << "DeltaL: " << (data.qla - data.qlb) << endl;
428                 //out << "DeltaR: " << (data.qra - data.qrb) << endl;
429
430         }
431         catch(exception& e) {
432                 m->errorOut(e, "ChimeraSlayer", "printBlock");
433                 exit(1);
434         }
435 }
436 //***************************************************************************************************************
437 string ChimeraSlayer::getBlock(data_struct data, string flag){
438         try {
439                 
440                 string outputString = "";
441                 
442                 outputString += querySeq->getName() + "\t";
443                 outputString += data.parentA.getName() + "\t" + data.parentB.getName()  + "\t";
444                         
445                 outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t";
446                 outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t";
447                 
448                 outputString += flag + "\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
449                 
450                 return outputString;
451         }
452         catch(exception& e) {
453                 m->errorOut(e, "ChimeraSlayer", "getBlock");
454                 exit(1);
455         }
456 }
457 //***************************************************************************************************************/
458