]> git.donarmstrong.com Git - mothur.git/blob - chimeraslayer.cpp
1.9
[mothur.git] / chimeraslayer.cpp
1 /*
2  *  chimeraslayer.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/25/09.
6  *  Copyright 2009 Pschloss Lab. All rights reserved.
7  *
8  */
9
10 #include "chimeraslayer.h"
11 #include "chimerarealigner.h"
12 #include "kmerdb.hpp"
13
14 //***************************************************************************************************************
15 ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div, 
16 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera()  {       
17         try {
18                 fastafile = file;
19                 templateFileName = temp; templateSeqs = readSeqs(temp);
20                 searchMethod = mode;
21                 kmerSize = k;
22                 match = ms;
23                 misMatch = mms;
24                 window = win;
25                 divR = div;
26                 minSim = minsim;
27                 minCov = mincov;
28                 minBS = minbs;
29                 minSNP = minsnp;
30                 parents = par;
31                 iters = it;
32                 increment = inc;
33                 numWanted = numw;
34                 realign = r; 
35         
36                 decalc = new DeCalculator();    
37                 
38                 doPrep();
39         }
40         catch(exception& e) {
41                 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
42                 exit(1);
43         }
44 }
45 //***************************************************************************************************************
46 int ChimeraSlayer::doPrep() {
47         try {
48                 
49                 
50                 //read in all query seqs
51                 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
52                 
53                 vector<Sequence*> temp = templateSeqs;
54                 for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
55                 
56                 createFilter(temp, 0.0); //just removed columns where all seqs have a gap
57                 
58                 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
59                 
60                 if (m->control_pressed) {  return 0; } 
61                 
62                 //run filter on template
63                 for (int i = 0; i < templateSeqs.size(); i++) {  if (m->control_pressed) {  return 0; }  runFilter(templateSeqs[i]);  }
64                 
65                 string  kmerDBNameLeft;
66                 string  kmerDBNameRight;
67         
68                 //generate the kmerdb to pass to maligner
69                 if (searchMethod == "kmer") { 
70                         string rightTemplateFileName = "right." + templateFileName;
71                         databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
72                                 
73                         string leftTemplateFileName = "left." + templateFileName;
74                         databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);      
75                 #ifdef USE_MPI
76                         for (int i = 0; i < templateSeqs.size(); i++) {
77                                         
78                                 if (m->control_pressed) { return 0; } 
79                                         
80                                 string leftFrag = templateSeqs[i]->getUnaligned();
81                                 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
82                                         
83                                 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
84                                 databaseLeft->addSequence(leftTemp);    
85                         }
86                         databaseLeft->generateDB();
87                         databaseLeft->setNumSeqs(templateSeqs.size());
88                         
89                         for (int i = 0; i < templateSeqs.size(); i++) {
90                                 if (m->control_pressed) { return 0; } 
91                                         
92                                 string rightFrag = templateSeqs[i]->getUnaligned();
93                                 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
94                                         
95                                 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
96                                 databaseRight->addSequence(rightTemp);  
97                         }
98                         databaseRight->generateDB();
99                         databaseRight->setNumSeqs(templateSeqs.size());
100
101                 #else   
102                         //leftside
103                         kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
104                         ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
105                         
106                         if(!kmerFileTestLeft){  
107                         
108                                 for (int i = 0; i < templateSeqs.size(); i++) {
109                                         
110                                         if (m->control_pressed) { return 0; } 
111                                         
112                                         string leftFrag = templateSeqs[i]->getUnaligned();
113                                         leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
114                                         
115                                         Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
116                                         databaseLeft->addSequence(leftTemp);    
117                                 }
118                                 databaseLeft->generateDB();
119                                 
120                         }else { 
121                                 databaseLeft->readKmerDB(kmerFileTestLeft);     
122                         }
123                         kmerFileTestLeft.close();
124                         
125                         databaseLeft->setNumSeqs(templateSeqs.size());
126                         
127                         //rightside
128                         kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
129                         ifstream kmerFileTestRight(kmerDBNameRight.c_str());
130                         
131                         if(!kmerFileTestRight){ 
132                         
133                                 for (int i = 0; i < templateSeqs.size(); i++) {
134                                         if (m->control_pressed) { return 0; } 
135                                         
136                                         string rightFrag = templateSeqs[i]->getUnaligned();
137                                         rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
138                                         
139                                         Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
140                                         databaseRight->addSequence(rightTemp);  
141                                 }
142                                 databaseRight->generateDB();
143                                 
144                         }else { 
145                                 databaseRight->readKmerDB(kmerFileTestRight);   
146                         }
147                         kmerFileTestRight.close();
148                         
149                         databaseRight->setNumSeqs(templateSeqs.size());
150                 #endif  
151                 }
152                 
153                 return 0;
154
155         }
156         catch(exception& e) {
157                 m->errorOut(e, "ChimeraSlayer", "doprep");
158                 exit(1);
159         }
160 }
161 //***************************************************************************************************************
162 ChimeraSlayer::~ChimeraSlayer() {       delete decalc;  if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }    }
163 //***************************************************************************************************************
164 void ChimeraSlayer::printHeader(ostream& out) {
165         m->mothurOutEndLine();
166         m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
167         m->mothurOutEndLine();
168         
169         out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
170 }
171 //***************************************************************************************************************
172 int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
173         try {
174                 if (chimeraFlags == "yes") {
175                         string chimeraFlag = "no";
176                         if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
177                            ||
178                            (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
179                         
180                         
181                         if (chimeraFlag == "yes") {     
182                                 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
183                                         m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
184                                         outAcc << querySeq->getName() << endl;
185                                 }
186                         }
187                         
188                         printBlock(chimeraResults[0], out);
189                         out << endl;
190                 }else {  out << querySeq->getName() << "\tno" << endl;  }
191                 
192                 return 0;
193                 
194         }
195         catch(exception& e) {
196                 m->errorOut(e, "ChimeraSlayer", "print");
197                 exit(1);
198         }
199 }
200 #ifdef USE_MPI
201 //***************************************************************************************************************
202 int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
203         try {
204                 MPI_Status status;
205                 bool results = false;
206                 string outAccString = "";
207                 string outputString = "";
208                 
209                 if (chimeraFlags == "yes") {
210                         string chimeraFlag = "no";
211                         if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
212                            ||
213                            (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
214                         
215                         
216                         if (chimeraFlag == "yes") {     
217                                 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
218                                         cout << querySeq->getName() <<  "\tyes" << endl;
219                                         outAccString += querySeq->getName() + "\n";
220                                         results = true;
221                                         
222                                         //write to accnos file
223                                         int length = outAccString.length();
224                                         char* buf2 = new char[length];\r
225                                         memcpy(buf2, outAccString.c_str(), length);
226                                 
227                                         MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status);
228                                         delete buf2;
229                                 }
230                         }
231                         
232                         outputString = getBlock(chimeraResults[0]);
233                         outputString += "\n";
234                         
235                 }else {  outputString += querySeq->getName() + "\tno\n";  }
236                 
237                 //write to output file
238                 int length = outputString.length();
239                 char* buf = new char[length];\r
240                 memcpy(buf, outputString.c_str(), length);
241                                 
242                 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
243                 delete buf;
244
245                 return results;
246         }
247         catch(exception& e) {
248                 m->errorOut(e, "ChimeraSlayer", "print");
249                 exit(1);
250         }
251 }
252 #endif
253
254 //***************************************************************************************************************
255 int ChimeraSlayer::getChimeras(Sequence* query) {
256         try {
257                 chimeraFlags = "no";
258                 
259                 //filter query
260                 spotMap = runFilter(query);     
261                 
262                 querySeq = query;
263                 
264                 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
265                 maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
266                 slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
267                 
268                 if (m->control_pressed) {  return 0;  }
269                 
270                 string chimeraFlag = maligner->getResults(query, decalc);
271                 if (m->control_pressed) {  return 0;  }
272                 vector<results> Results = maligner->getOutput();
273                                 
274                 //found in testing realigning only made things worse
275                 if (realign) {
276                         ChimeraReAligner realigner(templateSeqs, match, misMatch);
277                         realigner.reAlign(query, Results);
278                 }
279
280                 if (chimeraFlag == "yes") {
281                         
282                         //get sequence that were given from maligner results
283                         vector<SeqDist> seqs;
284                         map<string, float> removeDups;
285                         map<string, float>::iterator itDup;
286                         map<string, string> parentNameSeq;
287                         map<string, string>::iterator itSeq;
288                         for (int j = 0; j < Results.size(); j++) {
289                                 float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
290                                 //only add if you are not a duplicate
291                                 itDup = removeDups.find(Results[j].parent);
292                                 if (itDup == removeDups.end()) { //this is not duplicate
293                                         removeDups[Results[j].parent] = dist;
294                                         parentNameSeq[Results[j].parent] = Results[j].parentAligned;
295                                 }else if (dist > itDup->second) { //is this a stronger number for this parent
296                                         removeDups[Results[j].parent] = dist;
297                                         parentNameSeq[Results[j].parent] = Results[j].parentAligned;
298                                 }
299                         }
300                         
301                         for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
302                                 //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
303                                 itSeq = parentNameSeq.find(itDup->first);
304 //cout << itDup->first << itSeq->second << endl;
305                                 Sequence* seq = new Sequence(itDup->first, itSeq->second);
306                                 
307                                 SeqDist member;
308                                 member.seq = seq;
309                                 member.dist = itDup->second;
310                                 
311                                 seqs.push_back(member);
312                         }
313                         
314                         //limit number of parents to explore - default 3
315                         if (Results.size() > parents) {
316                                 //sort by distance
317                                 sort(seqs.begin(), seqs.end(), compareSeqDist);
318                                 //prioritize larger more similiar sequence fragments
319                                 reverse(seqs.begin(), seqs.end());
320                                 
321                                 for (int k = seqs.size()-1; k > (parents-1); k--)  {  
322                                         delete seqs[k].seq;
323                                         seqs.pop_back();        
324                                 }
325                         }
326                         
327                         //put seqs into vector to send to slayer
328                         vector<Sequence*> seqsForSlayer;
329                         for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);  }
330                         
331                         //mask then send to slayer...
332                         if (seqMask != "") {
333                                 decalc->setMask(seqMask);
334                                 
335                                 //mask querys
336                                 decalc->runMask(query);
337                                 
338                                 //mask parents
339                                 for (int k = 0; k < seqsForSlayer.size(); k++) {
340                                         decalc->runMask(seqsForSlayer[k]);
341                                 }
342                                 
343                                 spotMap = decalc->getMaskMap();
344                         }
345                         
346                         if (m->control_pressed) {  for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }  return 0;  }
347                         
348                         //send to slayer
349                         chimeraFlags = slayer->getResults(query, seqsForSlayer);
350                         if (m->control_pressed) {  return 0;  }
351                         chimeraResults = slayer->getOutput();
352                         
353                         //free memory
354                         for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
355                 }
356                 
357                 delete maligner;
358                 delete slayer;
359                 
360                 return 0;
361         }
362         catch(exception& e) {
363                 m->errorOut(e, "ChimeraSlayer", "getChimeras");
364                 exit(1);
365         }
366 }
367 //***************************************************************************************************************
368 void ChimeraSlayer::printBlock(data_struct data, ostream& out){
369         try {
370         //out << ":)\n";
371                 
372                 out << querySeq->getName() << '\t';
373                 out << data.parentA.getName() << "\t" << data.parentB.getName()  << '\t';
374                 //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
375                 //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
376                 
377                 out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
378                 out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
379                 
380                 out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
381                 
382                 //out << "Similarity of parents: " << data.ab << endl;
383                 //out << "Similarity of query to parentA: " << data.qa << endl;
384                 //out << "Similarity of query to parentB: " << data.qb << endl;
385                 
386                 
387                 //out << "Per_id(QL,A): " << data.qla << endl;
388                 //out << "Per_id(QL,B): " << data.qlb << endl;
389                 //out << "Per_id(QR,A): " << data.qra << endl;
390                 //out << "Per_id(QR,B): " << data.qrb << endl;
391
392                 
393                 //out << "DeltaL: " << (data.qla - data.qlb) << endl;
394                 //out << "DeltaR: " << (data.qra - data.qrb) << endl;
395
396         }
397         catch(exception& e) {
398                 m->errorOut(e, "ChimeraSlayer", "printBlock");
399                 exit(1);
400         }
401 }
402 //***************************************************************************************************************
403 string ChimeraSlayer::getBlock(data_struct data){
404         try {
405                 
406                 string outputString = "";
407                 
408                 outputString += querySeq->getName() + "\t";
409                 outputString += data.parentA.getName() + "\t" + data.parentB.getName()  + "\t";
410                         
411                 outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t";
412                 outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t";
413                 
414                 outputString += "yes\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
415                 
416                 return outputString;
417         }
418         catch(exception& e) {
419                 m->errorOut(e, "ChimeraSlayer", "getBlock");
420                 exit(1);
421         }
422 }
423 //***************************************************************************************************************/
424