]> git.donarmstrong.com Git - mothur.git/blob - chimeraslayer.cpp
5295eace3546e14db6488ceb41e144915531c830
[mothur.git] / chimeraslayer.cpp
1 /*
2  *  chimeraslayer.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/25/09.
6  *  Copyright 2009 Pschloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeraslayer.h"
11 #include "chimerarealigner.h"
12 #include "kmerdb.hpp"
13 #include "blastdb.hpp"
14
15 //***************************************************************************************************************
16 ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div, 
17 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera()  {       
18         try {
19                 fastafile = file;
20                 templateFileName = temp; templateSeqs = readSeqs(temp);
21                 searchMethod = mode;
22                 kmerSize = k;
23                 match = ms;
24                 misMatch = mms;
25                 window = win;
26                 divR = div;
27                 minSim = minsim;
28                 minCov = mincov;
29                 minBS = minbs;
30                 minSNP = minsnp;
31                 parents = par;
32                 iters = it;
33                 increment = inc;
34                 numWanted = numw;
35                 realign = r; 
36         
37                 decalc = new DeCalculator();    
38                 
39                 doPrep();
40         }
41         catch(exception& e) {
42                 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
43                 exit(1);
44         }
45 }
46 //***************************************************************************************************************
47 int ChimeraSlayer::doPrep() {
48         try {
49                 
50                 //read in all query seqs
51                 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
52                 
53                 vector<Sequence*> temp = templateSeqs;
54                 for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
55                 
56                 createFilter(temp, 0.0); //just removed columns where all seqs have a gap
57                 
58                 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
59                 
60                 if (m->control_pressed) {  return 0; } 
61                 
62                 //run filter on template
63                 for (int i = 0; i < templateSeqs.size(); i++) {  if (m->control_pressed) {  return 0; }  runFilter(templateSeqs[i]);  }
64                 
65                 string  kmerDBNameLeft;
66                 string  kmerDBNameRight;
67         
68                 //generate the kmerdb to pass to maligner
69                 if (searchMethod == "kmer") { 
70                         string rightTemplateFileName = "right." + templateFileName;
71                         databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
72                                 
73                         string leftTemplateFileName = "left." + templateFileName;
74                         databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);      
75                 #ifdef USE_MPI
76                         for (int i = 0; i < templateSeqs.size(); i++) {
77                                         
78                                 if (m->control_pressed) { return 0; } 
79                                         
80                                 string leftFrag = templateSeqs[i]->getUnaligned();
81                                 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
82                                         
83                                 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
84                                 databaseLeft->addSequence(leftTemp);    
85                         }
86                         databaseLeft->generateDB();
87                         databaseLeft->setNumSeqs(templateSeqs.size());
88                         
89                         for (int i = 0; i < templateSeqs.size(); i++) {
90                                 if (m->control_pressed) { return 0; } 
91                                         
92                                 string rightFrag = templateSeqs[i]->getUnaligned();
93                                 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
94                                         
95                                 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
96                                 databaseRight->addSequence(rightTemp);  
97                         }
98                         databaseRight->generateDB();
99                         databaseRight->setNumSeqs(templateSeqs.size());
100
101                 #else   
102                         //leftside
103                         kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
104                         ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
105                         
106                         if(!kmerFileTestLeft){  
107                         
108                                 for (int i = 0; i < templateSeqs.size(); i++) {
109                                         
110                                         if (m->control_pressed) { return 0; } 
111                                         
112                                         string leftFrag = templateSeqs[i]->getUnaligned();
113                                         leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
114                                         
115                                         Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
116                                         databaseLeft->addSequence(leftTemp);    
117                                 }
118                                 databaseLeft->generateDB();
119                                 
120                         }else { 
121                                 databaseLeft->readKmerDB(kmerFileTestLeft);     
122                         }
123                         kmerFileTestLeft.close();
124                         
125                         databaseLeft->setNumSeqs(templateSeqs.size());
126                         
127                         //rightside
128                         kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
129                         ifstream kmerFileTestRight(kmerDBNameRight.c_str());
130                         
131                         if(!kmerFileTestRight){ 
132                         
133                                 for (int i = 0; i < templateSeqs.size(); i++) {
134                                         if (m->control_pressed) { return 0; } 
135                                         
136                                         string rightFrag = templateSeqs[i]->getUnaligned();
137                                         rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
138                                         
139                                         Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
140                                         databaseRight->addSequence(rightTemp);  
141                                 }
142                                 databaseRight->generateDB();
143                                 
144                         }else { 
145                                 databaseRight->readKmerDB(kmerFileTestRight);   
146                         }
147                         kmerFileTestRight.close();
148                         
149                         databaseRight->setNumSeqs(templateSeqs.size());
150                 #endif  
151                 }else if (searchMethod == "blast") {
152                 
153                         //generate blastdb
154                         databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
155                         for (int i = 0; i < templateSeqs.size(); i++) {         databaseLeft->addSequence(*templateSeqs[i]);    }
156                         databaseLeft->generateDB();
157                         databaseLeft->setNumSeqs(templateSeqs.size());
158                 }
159                 
160                 return 0;
161
162         }
163         catch(exception& e) {
164                 m->errorOut(e, "ChimeraSlayer", "doprep");
165                 exit(1);
166         }
167 }
168 //***************************************************************************************************************
169 ChimeraSlayer::~ChimeraSlayer() {       
170         delete decalc;  
171         if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }   
172         else if (searchMethod == "blast") {  delete databaseLeft; }
173 }
174 //***************************************************************************************************************
175 void ChimeraSlayer::printHeader(ostream& out) {
176         m->mothurOutEndLine();
177         m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
178         m->mothurOutEndLine();
179         
180         out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
181 }
182 //***************************************************************************************************************
183 int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
184         try {
185                 if (chimeraFlags == "yes") {
186                         string chimeraFlag = "no";
187                         if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
188                            ||
189                            (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
190                         
191                         
192                         if (chimeraFlag == "yes") {     
193                                 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
194                                         m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
195                                         outAcc << querySeq->getName() << endl;
196                                 }
197                         }
198                         
199                         printBlock(chimeraResults[0], out);
200                         out << endl;
201                 }else {  out << querySeq->getName() << "\tno" << endl;  }
202                 
203                 return 0;
204                 
205         }
206         catch(exception& e) {
207                 m->errorOut(e, "ChimeraSlayer", "print");
208                 exit(1);
209         }
210 }
211 #ifdef USE_MPI
212 //***************************************************************************************************************
213 int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
214         try {
215                 MPI_Status status;
216                 bool results = false;
217                 string outAccString = "";
218                 string outputString = "";
219                 
220                 if (chimeraFlags == "yes") {
221                         string chimeraFlag = "no";
222                         if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
223                            ||
224                            (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
225                         
226                         
227                         if (chimeraFlag == "yes") {     
228                                 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
229                                         cout << querySeq->getName() <<  "\tyes" << endl;
230                                         outAccString += querySeq->getName() + "\n";
231                                         results = true;
232                                         
233                                         //write to accnos file
234                                         int length = outAccString.length();
235                                         char* buf2 = new char[length];
236                                         memcpy(buf2, outAccString.c_str(), length);
237                                 
238                                         MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status);
239                                         delete buf2;
240                                 }
241                         }
242                         
243                         outputString = getBlock(chimeraResults[0]);
244                         outputString += "\n";
245         //cout << outputString << endl;         
246                         //write to output file
247                         int length = outputString.length();
248                         char* buf = new char[length];
249                         memcpy(buf, outputString.c_str(), length);
250                                 
251                         MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
252                         delete buf;
253
254                 }else {  
255                         outputString += querySeq->getName() + "\tno\n";  
256         //cout << outputString << endl;
257                         //write to output file
258                         int length = outputString.length();
259                         char* buf = new char[length];
260                         memcpy(buf, outputString.c_str(), length);
261                                 
262                         MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
263                         delete buf;
264                 }
265                 
266                 
267                 return results;
268         }
269         catch(exception& e) {
270                 m->errorOut(e, "ChimeraSlayer", "print");
271                 exit(1);
272         }
273 }
274 #endif
275
276 //***************************************************************************************************************
277 int ChimeraSlayer::getChimeras(Sequence* query) {
278         try {
279                 chimeraFlags = "no";
280
281                 //filter query
282                 spotMap = runFilter(query);     
283                 
284                 querySeq = query;
285                 
286                 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
287                 maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
288                 slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
289         
290                 if (m->control_pressed) {  return 0;  }
291                 
292                 string chimeraFlag = maligner->getResults(query, decalc);
293                 if (m->control_pressed) {  return 0;  }
294                 vector<results> Results = maligner->getOutput();
295                         
296                 //found in testing realigning only made things worse
297                 if (realign) {
298                         ChimeraReAligner realigner(templateSeqs, match, misMatch);
299                         realigner.reAlign(query, Results);
300                 }
301
302                 if (chimeraFlag == "yes") {
303                         
304                         //get sequence that were given from maligner results
305                         vector<SeqDist> seqs;
306                         map<string, float> removeDups;
307                         map<string, float>::iterator itDup;
308                         map<string, string> parentNameSeq;
309                         map<string, string>::iterator itSeq;
310                         for (int j = 0; j < Results.size(); j++) {
311                                 float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
312                                 //only add if you are not a duplicate
313                                 itDup = removeDups.find(Results[j].parent);
314                                 if (itDup == removeDups.end()) { //this is not duplicate
315                                         removeDups[Results[j].parent] = dist;
316                                         parentNameSeq[Results[j].parent] = Results[j].parentAligned;
317                                 }else if (dist > itDup->second) { //is this a stronger number for this parent
318                                         removeDups[Results[j].parent] = dist;
319                                         parentNameSeq[Results[j].parent] = Results[j].parentAligned;
320                                 }
321                         }
322                         
323                         for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
324                                 //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
325                                 itSeq = parentNameSeq.find(itDup->first);
326 //cout << itDup->first << itSeq->second << endl;
327                                 Sequence* seq = new Sequence(itDup->first, itSeq->second);
328                                 
329                                 SeqDist member;
330                                 member.seq = seq;
331                                 member.dist = itDup->second;
332                                 
333                                 seqs.push_back(member);
334                         }
335                         
336                         //limit number of parents to explore - default 3
337                         if (Results.size() > parents) {
338                                 //sort by distance
339                                 sort(seqs.begin(), seqs.end(), compareSeqDist);
340                                 //prioritize larger more similiar sequence fragments
341                                 reverse(seqs.begin(), seqs.end());
342                                 
343                                 for (int k = seqs.size()-1; k > (parents-1); k--)  {  
344                                         delete seqs[k].seq;
345                                         seqs.pop_back();        
346                                 }
347                         }
348                         
349                         //put seqs into vector to send to slayer
350                         vector<Sequence*> seqsForSlayer;
351                         for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);  }
352                         
353                         //mask then send to slayer...
354                         if (seqMask != "") {
355                                 decalc->setMask(seqMask);
356                                 
357                                 //mask querys
358                                 decalc->runMask(query);
359                                 
360                                 //mask parents
361                                 for (int k = 0; k < seqsForSlayer.size(); k++) {
362                                         decalc->runMask(seqsForSlayer[k]);
363                                 }
364                                 
365                                 spotMap = decalc->getMaskMap();
366                         }
367                         
368                         if (m->control_pressed) {  for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }  return 0;  }
369                         
370                         //send to slayer
371                         chimeraFlags = slayer->getResults(query, seqsForSlayer);
372                         if (m->control_pressed) {  return 0;  }
373                         chimeraResults = slayer->getOutput();
374                         
375                         //free memory
376                         for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
377                 }
378                 
379                 delete maligner;
380                 delete slayer;
381                 
382                 return 0;
383         }
384         catch(exception& e) {
385                 m->errorOut(e, "ChimeraSlayer", "getChimeras");
386                 exit(1);
387         }
388 }
389 //***************************************************************************************************************
390 void ChimeraSlayer::printBlock(data_struct data, ostream& out){
391         try {
392         //out << ":)\n";
393                 
394                 out << querySeq->getName() << '\t';
395                 out << data.parentA.getName() << "\t" << data.parentB.getName()  << '\t';
396                 //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
397                 //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
398                 
399                 out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
400                 out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
401                 
402                 out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
403                 
404                 //out << "Similarity of parents: " << data.ab << endl;
405                 //out << "Similarity of query to parentA: " << data.qa << endl;
406                 //out << "Similarity of query to parentB: " << data.qb << endl;
407                 
408                 
409                 //out << "Per_id(QL,A): " << data.qla << endl;
410                 //out << "Per_id(QL,B): " << data.qlb << endl;
411                 //out << "Per_id(QR,A): " << data.qra << endl;
412                 //out << "Per_id(QR,B): " << data.qrb << endl;
413
414                 
415                 //out << "DeltaL: " << (data.qla - data.qlb) << endl;
416                 //out << "DeltaR: " << (data.qra - data.qrb) << endl;
417
418         }
419         catch(exception& e) {
420                 m->errorOut(e, "ChimeraSlayer", "printBlock");
421                 exit(1);
422         }
423 }
424 //***************************************************************************************************************
425 string ChimeraSlayer::getBlock(data_struct data){
426         try {
427                 
428                 string outputString = "";
429                 
430                 outputString += querySeq->getName() + "\t";
431                 outputString += data.parentA.getName() + "\t" + data.parentB.getName()  + "\t";
432                         
433                 outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t";
434                 outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t";
435                 
436                 outputString += "yes\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
437                 
438                 return outputString;
439         }
440         catch(exception& e) {
441                 m->errorOut(e, "ChimeraSlayer", "getBlock");
442                 exit(1);
443         }
444 }
445 //***************************************************************************************************************/
446