]> git.donarmstrong.com Git - mothur.git/blob - chimeraslayer.cpp
added MPI code, broke up chimera.seqs into 5 separated commands, added parse.sff...
[mothur.git] / chimeraslayer.cpp
1 /*
2  *  chimeraslayer.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/25/09.
6  *  Copyright 2009 Pschloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeraslayer.h"
11 #include "chimerarealigner.h"
12 #include "kmerdb.hpp"
13
14 //***************************************************************************************************************
15 ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div, 
16 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera()  {       
17         try {
18                 fastafile = file;
19                 templateFileName = temp; templateSeqs = readSeqs(temp);
20                 searchMethod = mode;
21                 kmerSize = k;
22                 match = ms;
23                 misMatch = mms;
24                 window = win;
25                 divR = div;
26                 minSim = minsim;
27                 minCov = mincov;
28                 minBS = minbs;
29                 minSNP = minsnp;
30                 parents = par;
31                 iters = it;
32                 increment = inc;
33                 numWanted = numw;
34                 realign = r; 
35         
36                 decalc = new DeCalculator();    
37                 
38                 doPrep();
39         }
40         catch(exception& e) {
41                 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
42                 exit(1);
43         }
44 }
45 //***************************************************************************************************************
46 int ChimeraSlayer::doPrep() {
47         try {
48                 
49                 
50                 //read in all query seqs
51                 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
52                 
53                 vector<Sequence*> temp = templateSeqs;
54                 for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
55                 
56                 createFilter(temp, 0.0); //just removed columns where all seqs have a gap
57                 
58                 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
59                 
60                 if (m->control_pressed) {  return 0; } 
61                 
62                 //run filter on template
63                 for (int i = 0; i < templateSeqs.size(); i++) {  if (m->control_pressed) {  return 0; }  runFilter(templateSeqs[i]);  }
64                 
65                 string  kmerDBNameLeft;
66                 string  kmerDBNameRight;
67         
68                 //generate the kmerdb to pass to maligner
69                 if (searchMethod == "kmer") { 
70                         string rightTemplateFileName = "right." + templateFileName;
71                         databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
72                                 
73                         string leftTemplateFileName = "left." + templateFileName;
74                         databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);      
75                 #ifdef USE_MPI
76                         for (int i = 0; i < templateSeqs.size(); i++) {
77                                         
78                                 if (m->control_pressed) { return 0; } 
79                                         
80                                 string leftFrag = templateSeqs[i]->getUnaligned();
81                                 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
82                                         
83                                 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
84                                 databaseLeft->addSequence(leftTemp);    
85                         }
86                         databaseLeft->generateDB();
87                         databaseLeft->setNumSeqs(templateSeqs.size());
88                         
89                         for (int i = 0; i < templateSeqs.size(); i++) {
90                                 if (m->control_pressed) { return 0; } 
91                                         
92                                 string rightFrag = templateSeqs[i]->getUnaligned();
93                                 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
94                                         
95                                 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
96                                 databaseRight->addSequence(rightTemp);  
97                         }
98                         databaseRight->generateDB();
99                         databaseRight->setNumSeqs(templateSeqs.size());
100
101                 #else   
102                         //leftside
103                         kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
104                         ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
105                         
106                         if(!kmerFileTestLeft){  
107                         
108                                 for (int i = 0; i < templateSeqs.size(); i++) {
109                                         
110                                         if (m->control_pressed) { return 0; } 
111                                         
112                                         string leftFrag = templateSeqs[i]->getUnaligned();
113                                         leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
114                                         
115                                         Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
116                                         databaseLeft->addSequence(leftTemp);    
117                                 }
118                                 databaseLeft->generateDB();
119                                 
120                         }else { 
121                                 databaseLeft->readKmerDB(kmerFileTestLeft);     
122                         }
123                         kmerFileTestLeft.close();
124                         
125                         databaseLeft->setNumSeqs(templateSeqs.size());
126                         
127                         //rightside
128                         kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
129                         ifstream kmerFileTestRight(kmerDBNameRight.c_str());
130                         
131                         if(!kmerFileTestRight){ 
132                         
133                                 for (int i = 0; i < templateSeqs.size(); i++) {
134                                         if (m->control_pressed) { return 0; } 
135                                         
136                                         string rightFrag = templateSeqs[i]->getUnaligned();
137                                         rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
138                                         
139                                         Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
140                                         databaseRight->addSequence(rightTemp);  
141                                 }
142                                 databaseRight->generateDB();
143                                 
144                         }else { 
145                                 databaseRight->readKmerDB(kmerFileTestRight);   
146                         }
147                         kmerFileTestRight.close();
148                         
149                         databaseRight->setNumSeqs(templateSeqs.size());
150                 #endif  
151                 }
152                 
153                 return 0;
154
155         }
156         catch(exception& e) {
157                 m->errorOut(e, "ChimeraSlayer", "doprep");
158                 exit(1);
159         }
160 }
161 //***************************************************************************************************************
162 ChimeraSlayer::~ChimeraSlayer() {       delete decalc;  if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }    }
163 //***************************************************************************************************************
164 void ChimeraSlayer::printHeader(ostream& out) {
165         m->mothurOutEndLine();
166         m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
167         m->mothurOutEndLine();
168         
169         out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
170 }
171 //***************************************************************************************************************
172 int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
173         try {
174                 if (chimeraFlags == "yes") {
175                         string chimeraFlag = "no";
176                         if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
177                            ||
178                            (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
179                         
180                         
181                         if (chimeraFlag == "yes") {     
182                                 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
183                                         m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
184                                         outAcc << querySeq->getName() << endl;
185                                 }
186                         }
187                         
188                         printBlock(chimeraResults[0], out);
189                         out << endl;
190                 }else {  out << querySeq->getName() << "\tno" << endl;  }
191                 
192                 return 0;
193                 
194         }
195         catch(exception& e) {
196                 m->errorOut(e, "ChimeraSlayer", "print");
197                 exit(1);
198         }
199 }
200 #ifdef USE_MPI
201 //***************************************************************************************************************
202 int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
203         try {
204                 MPI_Status status;
205                 bool results = false;
206                 string outAccString = "";
207                 string outputString = "";
208                 
209                 if (chimeraFlags == "yes") {
210                         string chimeraFlag = "no";
211                         if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
212                            ||
213                            (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
214                         
215                         
216                         if (chimeraFlag == "yes") {     
217                                 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
218                                         cout << querySeq->getName() <<  "\tyes" << endl;
219                                         outAccString += querySeq->getName() + "\n";
220                                         results = true;
221                                         
222                                         //write to accnos file
223                                         int length = outAccString.length();
224                                         char buf2[length];
225                                         strcpy(buf2, outAccString.c_str()); 
226                                 
227                                         MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status);
228                                 }
229                         }
230                         
231                         outputString = getBlock(chimeraResults[0]);
232                         outputString += "\n";
233                         
234                 }else {  outputString += querySeq->getName() + "\tno\n";  }
235                 
236                 //write to output file
237                 int length = outputString.length();
238                 char buf[length];
239                 strcpy(buf, outputString.c_str()); 
240                 
241                 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
242
243                 return results;
244         }
245         catch(exception& e) {
246                 m->errorOut(e, "ChimeraSlayer", "print");
247                 exit(1);
248         }
249 }
250 #endif
251
252 //***************************************************************************************************************
253 int ChimeraSlayer::getChimeras(Sequence* query) {
254         try {
255                 chimeraFlags = "no";
256                 
257                 //filter query
258                 spotMap = runFilter(query);     
259                 
260                 querySeq = query;
261                 
262                 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
263                 maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
264                 slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
265                 
266                 if (m->control_pressed) {  return 0;  }
267                 
268                 string chimeraFlag = maligner->getResults(query, decalc);
269                 if (m->control_pressed) {  return 0;  }
270                 vector<results> Results = maligner->getOutput();
271                                 
272                 //found in testing realigning only made things worse
273                 if (realign) {
274                         ChimeraReAligner realigner(templateSeqs, match, misMatch);
275                         realigner.reAlign(query, Results);
276                 }
277
278                 if (chimeraFlag == "yes") {
279                         
280                         //get sequence that were given from maligner results
281                         vector<SeqDist> seqs;
282                         map<string, float> removeDups;
283                         map<string, float>::iterator itDup;
284                         map<string, string> parentNameSeq;
285                         map<string, string>::iterator itSeq;
286                         for (int j = 0; j < Results.size(); j++) {
287                                 float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
288                                 //only add if you are not a duplicate
289                                 itDup = removeDups.find(Results[j].parent);
290                                 if (itDup == removeDups.end()) { //this is not duplicate
291                                         removeDups[Results[j].parent] = dist;
292                                         parentNameSeq[Results[j].parent] = Results[j].parentAligned;
293                                 }else if (dist > itDup->second) { //is this a stronger number for this parent
294                                         removeDups[Results[j].parent] = dist;
295                                         parentNameSeq[Results[j].parent] = Results[j].parentAligned;
296                                 }
297                         }
298                         
299                         for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
300                                 //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
301                                 itSeq = parentNameSeq.find(itDup->first);
302 //cout << itDup->first << itSeq->second << endl;
303                                 Sequence* seq = new Sequence(itDup->first, itSeq->second);
304                                 
305                                 SeqDist member;
306                                 member.seq = seq;
307                                 member.dist = itDup->second;
308                                 
309                                 seqs.push_back(member);
310                         }
311                         
312                         //limit number of parents to explore - default 3
313                         if (Results.size() > parents) {
314                                 //sort by distance
315                                 sort(seqs.begin(), seqs.end(), compareSeqDist);
316                                 //prioritize larger more similiar sequence fragments
317                                 reverse(seqs.begin(), seqs.end());
318                                 
319                                 for (int k = seqs.size()-1; k > (parents-1); k--)  {  
320                                         delete seqs[k].seq;
321                                         seqs.pop_back();        
322                                 }
323                         }
324                         
325                         //put seqs into vector to send to slayer
326                         vector<Sequence*> seqsForSlayer;
327                         for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);  }
328                         
329                         //mask then send to slayer...
330                         if (seqMask != "") {
331                                 decalc->setMask(seqMask);
332                                 
333                                 //mask querys
334                                 decalc->runMask(query);
335                                 
336                                 //mask parents
337                                 for (int k = 0; k < seqsForSlayer.size(); k++) {
338                                         decalc->runMask(seqsForSlayer[k]);
339                                 }
340                                 
341                                 spotMap = decalc->getMaskMap();
342                         }
343                         
344                         if (m->control_pressed) {  for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }  return 0;  }
345                         
346                         //send to slayer
347                         chimeraFlags = slayer->getResults(query, seqsForSlayer);
348                         if (m->control_pressed) {  return 0;  }
349                         chimeraResults = slayer->getOutput();
350                         
351                         //free memory
352                         for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
353                 }
354                 
355                 delete maligner;
356                 delete slayer;
357                 
358                 return 0;
359         }
360         catch(exception& e) {
361                 m->errorOut(e, "ChimeraSlayer", "getChimeras");
362                 exit(1);
363         }
364 }
365 //***************************************************************************************************************
366 void ChimeraSlayer::printBlock(data_struct data, ostream& out){
367         try {
368         //out << ":)\n";
369                 
370                 out << querySeq->getName() << '\t';
371                 out << data.parentA.getName() << "\t" << data.parentB.getName()  << '\t';
372                 //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
373                 //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
374                 
375                 out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
376                 out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
377                 
378                 out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
379                 
380                 //out << "Similarity of parents: " << data.ab << endl;
381                 //out << "Similarity of query to parentA: " << data.qa << endl;
382                 //out << "Similarity of query to parentB: " << data.qb << endl;
383                 
384                 
385                 //out << "Per_id(QL,A): " << data.qla << endl;
386                 //out << "Per_id(QL,B): " << data.qlb << endl;
387                 //out << "Per_id(QR,A): " << data.qra << endl;
388                 //out << "Per_id(QR,B): " << data.qrb << endl;
389
390                 
391                 //out << "DeltaL: " << (data.qla - data.qlb) << endl;
392                 //out << "DeltaR: " << (data.qra - data.qrb) << endl;
393
394         }
395         catch(exception& e) {
396                 m->errorOut(e, "ChimeraSlayer", "printBlock");
397                 exit(1);
398         }
399 }
400 //***************************************************************************************************************
401 string ChimeraSlayer::getBlock(data_struct data){
402         try {
403                 
404                 string outputString = "";
405                 
406                 outputString += querySeq->getName() + "\t";
407                 outputString += data.parentA.getName() + "\t" + data.parentB.getName()  + "\t";
408                         
409                 outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t";
410                 outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t";
411                 
412                 outputString += "yes\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
413                 
414                 return outputString;
415         }
416         catch(exception& e) {
417                 m->errorOut(e, "ChimeraSlayer", "getBlock");
418                 exit(1);
419         }
420 }
421 //***************************************************************************************************************/
422