5 * Created by westcott on 9/25/09.
6 * Copyright 2009 Pschloss Lab. All rights reserved.
10 #include "chimeraslayer.h"
11 #include "chimerarealigner.h"
13 #include "blastdb.hpp"
15 //***************************************************************************************************************
16 ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div,
17 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() {
20 templateFileName = temp; templateSeqs = readSeqs(temp);
37 decalc = new DeCalculator();
42 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
46 //***************************************************************************************************************
47 int ChimeraSlayer::doPrep() {
50 //read in all query seqs
51 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
53 vector<Sequence*> temp = templateSeqs;
54 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
56 createFilter(temp, 0.0); //just removed columns where all seqs have a gap
58 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
60 if (m->control_pressed) { return 0; }
62 //run filter on template
63 for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); }
65 string kmerDBNameLeft;
66 string kmerDBNameRight;
68 //generate the kmerdb to pass to maligner
69 if (searchMethod == "kmer") {
70 string templatePath = m->hasPath(templateFileName);
71 string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
72 databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
74 string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
75 databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
77 for (int i = 0; i < templateSeqs.size(); i++) {
79 if (m->control_pressed) { return 0; }
81 string leftFrag = templateSeqs[i]->getUnaligned();
82 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
84 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
85 databaseLeft->addSequence(leftTemp);
87 databaseLeft->generateDB();
88 databaseLeft->setNumSeqs(templateSeqs.size());
90 for (int i = 0; i < templateSeqs.size(); i++) {
91 if (m->control_pressed) { return 0; }
93 string rightFrag = templateSeqs[i]->getUnaligned();
94 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
96 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
97 databaseRight->addSequence(rightTemp);
99 databaseRight->generateDB();
100 databaseRight->setNumSeqs(templateSeqs.size());
104 kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
105 ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
106 bool needToGenerateLeft = true;
108 if(kmerFileTestLeft){
109 bool GoodFile = m->checkReleaseVersion(kmerFileTestLeft, m->getVersion());
110 if (GoodFile) { needToGenerateLeft = false; }
113 if(needToGenerateLeft){
115 for (int i = 0; i < templateSeqs.size(); i++) {
117 if (m->control_pressed) { return 0; }
119 string leftFrag = templateSeqs[i]->getUnaligned();
120 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
122 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
123 databaseLeft->addSequence(leftTemp);
125 databaseLeft->generateDB();
128 databaseLeft->readKmerDB(kmerFileTestLeft);
130 kmerFileTestLeft.close();
132 databaseLeft->setNumSeqs(templateSeqs.size());
135 kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
136 ifstream kmerFileTestRight(kmerDBNameRight.c_str());
137 bool needToGenerateRight = true;
139 if(kmerFileTestRight){
140 bool GoodFile = m->checkReleaseVersion(kmerFileTestRight, m->getVersion());
141 if (GoodFile) { needToGenerateRight = false; }
144 if(needToGenerateRight){
146 for (int i = 0; i < templateSeqs.size(); i++) {
147 if (m->control_pressed) { return 0; }
149 string rightFrag = templateSeqs[i]->getUnaligned();
150 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
152 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
153 databaseRight->addSequence(rightTemp);
155 databaseRight->generateDB();
158 databaseRight->readKmerDB(kmerFileTestRight);
160 kmerFileTestRight.close();
162 databaseRight->setNumSeqs(templateSeqs.size());
164 }else if (searchMethod == "blast") {
167 databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
168 for (int i = 0; i < templateSeqs.size(); i++) { databaseLeft->addSequence(*templateSeqs[i]); }
169 databaseLeft->generateDB();
170 databaseLeft->setNumSeqs(templateSeqs.size());
176 catch(exception& e) {
177 m->errorOut(e, "ChimeraSlayer", "doprep");
181 //***************************************************************************************************************
182 ChimeraSlayer::~ChimeraSlayer() {
184 if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
185 else if (searchMethod == "blast") { delete databaseLeft; }
187 //***************************************************************************************************************
188 void ChimeraSlayer::printHeader(ostream& out) {
189 m->mothurOutEndLine();
190 m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
191 m->mothurOutEndLine();
193 out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
195 //***************************************************************************************************************
196 int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
198 if (chimeraFlags == "yes") {
199 string chimeraFlag = "no";
200 if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
202 (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
205 if (chimeraFlag == "yes") {
206 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
207 m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
208 outAcc << querySeq->getName() << endl;
212 printBlock(chimeraResults[0], chimeraFlag, out);
214 }else { out << querySeq->getName() << "\tno" << endl; }
219 catch(exception& e) {
220 m->errorOut(e, "ChimeraSlayer", "print");
225 //***************************************************************************************************************
226 int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
229 bool results = false;
230 string outAccString = "";
231 string outputString = "";
233 if (chimeraFlags == "yes") {
234 string chimeraFlag = "no";
235 if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
237 (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
240 if (chimeraFlag == "yes") {
241 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
242 cout << querySeq->getName() << "\tyes" << endl;
243 outAccString += querySeq->getName() + "\n";
246 //write to accnos file
247 int length = outAccString.length();
248 char* buf2 = new char[length];
249 memcpy(buf2, outAccString.c_str(), length);
251 MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status);
256 outputString = getBlock(chimeraResults[0], chimeraFlag);
257 outputString += "\n";
258 //cout << outputString << endl;
259 //write to output file
260 int length = outputString.length();
261 char* buf = new char[length];
262 memcpy(buf, outputString.c_str(), length);
264 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
268 outputString += querySeq->getName() + "\tno\n";
269 //cout << outputString << endl;
270 //write to output file
271 int length = outputString.length();
272 char* buf = new char[length];
273 memcpy(buf, outputString.c_str(), length);
275 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
282 catch(exception& e) {
283 m->errorOut(e, "ChimeraSlayer", "print");
289 //***************************************************************************************************************
290 int ChimeraSlayer::getChimeras(Sequence* query) {
295 spotMap = runFilter(query);
299 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
300 maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
301 slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
303 if (m->control_pressed) { return 0; }
305 string chimeraFlag = maligner->getResults(query, decalc);
306 if (m->control_pressed) { return 0; }
307 vector<results> Results = maligner->getOutput();
309 //found in testing realigning only made things worse
311 ChimeraReAligner realigner(templateSeqs, match, misMatch);
312 realigner.reAlign(query, Results);
315 if (chimeraFlag == "yes") {
317 //get sequence that were given from maligner results
318 vector<SeqDist> seqs;
319 map<string, float> removeDups;
320 map<string, float>::iterator itDup;
321 map<string, string> parentNameSeq;
322 map<string, string>::iterator itSeq;
323 for (int j = 0; j < Results.size(); j++) {
324 float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
325 //only add if you are not a duplicate
326 itDup = removeDups.find(Results[j].parent);
327 if (itDup == removeDups.end()) { //this is not duplicate
328 removeDups[Results[j].parent] = dist;
329 parentNameSeq[Results[j].parent] = Results[j].parentAligned;
330 }else if (dist > itDup->second) { //is this a stronger number for this parent
331 removeDups[Results[j].parent] = dist;
332 parentNameSeq[Results[j].parent] = Results[j].parentAligned;
336 for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
337 //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
338 itSeq = parentNameSeq.find(itDup->first);
339 //cout << itDup->first << itSeq->second << endl;
340 Sequence* seq = new Sequence(itDup->first, itSeq->second);
344 member.dist = itDup->second;
346 seqs.push_back(member);
349 //limit number of parents to explore - default 3
350 if (Results.size() > parents) {
352 sort(seqs.begin(), seqs.end(), compareSeqDist);
353 //prioritize larger more similiar sequence fragments
354 reverse(seqs.begin(), seqs.end());
356 for (int k = seqs.size()-1; k > (parents-1); k--) {
362 //put seqs into vector to send to slayer
363 vector<Sequence*> seqsForSlayer;
364 for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); }
366 //mask then send to slayer...
368 decalc->setMask(seqMask);
371 decalc->runMask(query);
374 for (int k = 0; k < seqsForSlayer.size(); k++) {
375 decalc->runMask(seqsForSlayer[k]);
378 spotMap = decalc->getMaskMap();
381 if (m->control_pressed) { for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } return 0; }
384 chimeraFlags = slayer->getResults(query, seqsForSlayer);
385 if (m->control_pressed) { return 0; }
386 chimeraResults = slayer->getOutput();
389 for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; }
397 catch(exception& e) {
398 m->errorOut(e, "ChimeraSlayer", "getChimeras");
402 //***************************************************************************************************************
403 void ChimeraSlayer::printBlock(data_struct data, string flag, ostream& out){
407 out << querySeq->getName() << '\t';
408 out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t';
409 //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
410 //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
412 out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
413 out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
415 out << flag << '\t' << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
417 //out << "Similarity of parents: " << data.ab << endl;
418 //out << "Similarity of query to parentA: " << data.qa << endl;
419 //out << "Similarity of query to parentB: " << data.qb << endl;
422 //out << "Per_id(QL,A): " << data.qla << endl;
423 //out << "Per_id(QL,B): " << data.qlb << endl;
424 //out << "Per_id(QR,A): " << data.qra << endl;
425 //out << "Per_id(QR,B): " << data.qrb << endl;
428 //out << "DeltaL: " << (data.qla - data.qlb) << endl;
429 //out << "DeltaR: " << (data.qra - data.qrb) << endl;
432 catch(exception& e) {
433 m->errorOut(e, "ChimeraSlayer", "printBlock");
437 //***************************************************************************************************************
438 string ChimeraSlayer::getBlock(data_struct data, string flag){
441 string outputString = "";
443 outputString += querySeq->getName() + "\t";
444 outputString += data.parentA.getName() + "\t" + data.parentB.getName() + "\t";
446 outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t";
447 outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t";
449 outputString += flag + "\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
453 catch(exception& e) {
454 m->errorOut(e, "ChimeraSlayer", "getBlock");
458 //***************************************************************************************************************/