5 * Created by westcott on 9/25/09.
6 * Copyright 2009 Pschloss Lab. All rights reserved.
10 #include "chimeraslayer.h"
11 #include "chimerarealigner.h"
13 #include "blastdb.hpp"
15 //***************************************************************************************************************
16 ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div,
17 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() {
20 templateFileName = temp; templateSeqs = readSeqs(temp);
37 decalc = new DeCalculator();
42 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
46 //***************************************************************************************************************
47 int ChimeraSlayer::doPrep() {
50 //read in all query seqs
51 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
53 vector<Sequence*> temp = templateSeqs;
54 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
56 createFilter(temp, 0.0); //just removed columns where all seqs have a gap
58 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
60 if (m->control_pressed) { return 0; }
62 //run filter on template
63 for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); }
65 string kmerDBNameLeft;
66 string kmerDBNameRight;
68 //generate the kmerdb to pass to maligner
69 if (searchMethod == "kmer") {
70 string rightTemplateFileName = "right." + templateFileName;
71 databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
73 string leftTemplateFileName = "left." + templateFileName;
74 databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
76 for (int i = 0; i < templateSeqs.size(); i++) {
78 if (m->control_pressed) { return 0; }
80 string leftFrag = templateSeqs[i]->getUnaligned();
81 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
83 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
84 databaseLeft->addSequence(leftTemp);
86 databaseLeft->generateDB();
87 databaseLeft->setNumSeqs(templateSeqs.size());
89 for (int i = 0; i < templateSeqs.size(); i++) {
90 if (m->control_pressed) { return 0; }
92 string rightFrag = templateSeqs[i]->getUnaligned();
93 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
95 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
96 databaseRight->addSequence(rightTemp);
98 databaseRight->generateDB();
99 databaseRight->setNumSeqs(templateSeqs.size());
103 kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
104 ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
105 bool needToGenerateLeft = true;
107 if(kmerFileTestLeft){
108 bool GoodFile = checkReleaseVersion(kmerFileTestLeft, m->getVersion());
109 if (GoodFile) { needToGenerateLeft = false; }
112 if(needToGenerateLeft){
114 for (int i = 0; i < templateSeqs.size(); i++) {
116 if (m->control_pressed) { return 0; }
118 string leftFrag = templateSeqs[i]->getUnaligned();
119 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
121 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
122 databaseLeft->addSequence(leftTemp);
124 databaseLeft->generateDB();
127 databaseLeft->readKmerDB(kmerFileTestLeft);
129 kmerFileTestLeft.close();
131 databaseLeft->setNumSeqs(templateSeqs.size());
134 kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
135 ifstream kmerFileTestRight(kmerDBNameRight.c_str());
136 bool needToGenerateRight = true;
138 if(kmerFileTestRight){
139 bool GoodFile = checkReleaseVersion(kmerFileTestRight, m->getVersion());
140 if (GoodFile) { needToGenerateRight = false; }
143 if(needToGenerateRight){
145 for (int i = 0; i < templateSeqs.size(); i++) {
146 if (m->control_pressed) { return 0; }
148 string rightFrag = templateSeqs[i]->getUnaligned();
149 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
151 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
152 databaseRight->addSequence(rightTemp);
154 databaseRight->generateDB();
157 databaseRight->readKmerDB(kmerFileTestRight);
159 kmerFileTestRight.close();
161 databaseRight->setNumSeqs(templateSeqs.size());
163 }else if (searchMethod == "blast") {
166 databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
167 for (int i = 0; i < templateSeqs.size(); i++) { databaseLeft->addSequence(*templateSeqs[i]); }
168 databaseLeft->generateDB();
169 databaseLeft->setNumSeqs(templateSeqs.size());
175 catch(exception& e) {
176 m->errorOut(e, "ChimeraSlayer", "doprep");
180 //***************************************************************************************************************
181 ChimeraSlayer::~ChimeraSlayer() {
183 if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
184 else if (searchMethod == "blast") { delete databaseLeft; }
186 //***************************************************************************************************************
187 void ChimeraSlayer::printHeader(ostream& out) {
188 m->mothurOutEndLine();
189 m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
190 m->mothurOutEndLine();
192 out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
194 //***************************************************************************************************************
195 int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
197 if (chimeraFlags == "yes") {
198 string chimeraFlag = "no";
199 if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
201 (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
204 if (chimeraFlag == "yes") {
205 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
206 m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
207 outAcc << querySeq->getName() << endl;
211 printBlock(chimeraResults[0], chimeraFlag, out);
213 }else { out << querySeq->getName() << "\tno" << endl; }
218 catch(exception& e) {
219 m->errorOut(e, "ChimeraSlayer", "print");
224 //***************************************************************************************************************
225 int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
228 bool results = false;
229 string outAccString = "";
230 string outputString = "";
232 if (chimeraFlags == "yes") {
233 string chimeraFlag = "no";
234 if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
236 (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
239 if (chimeraFlag == "yes") {
240 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
241 cout << querySeq->getName() << "\tyes" << endl;
242 outAccString += querySeq->getName() + "\n";
245 //write to accnos file
246 int length = outAccString.length();
247 char* buf2 = new char[length];
248 memcpy(buf2, outAccString.c_str(), length);
250 MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status);
255 outputString = getBlock(chimeraResults[0], chimeraFlag);
256 outputString += "\n";
257 //cout << outputString << endl;
258 //write to output file
259 int length = outputString.length();
260 char* buf = new char[length];
261 memcpy(buf, outputString.c_str(), length);
263 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
267 outputString += querySeq->getName() + "\tno\n";
268 //cout << outputString << endl;
269 //write to output file
270 int length = outputString.length();
271 char* buf = new char[length];
272 memcpy(buf, outputString.c_str(), length);
274 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
281 catch(exception& e) {
282 m->errorOut(e, "ChimeraSlayer", "print");
288 //***************************************************************************************************************
289 int ChimeraSlayer::getChimeras(Sequence* query) {
294 spotMap = runFilter(query);
298 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
299 maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
300 slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
302 if (m->control_pressed) { return 0; }
304 string chimeraFlag = maligner->getResults(query, decalc);
305 if (m->control_pressed) { return 0; }
306 vector<results> Results = maligner->getOutput();
308 //found in testing realigning only made things worse
310 ChimeraReAligner realigner(templateSeqs, match, misMatch);
311 realigner.reAlign(query, Results);
314 if (chimeraFlag == "yes") {
316 //get sequence that were given from maligner results
317 vector<SeqDist> seqs;
318 map<string, float> removeDups;
319 map<string, float>::iterator itDup;
320 map<string, string> parentNameSeq;
321 map<string, string>::iterator itSeq;
322 for (int j = 0; j < Results.size(); j++) {
323 float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
324 //only add if you are not a duplicate
325 itDup = removeDups.find(Results[j].parent);
326 if (itDup == removeDups.end()) { //this is not duplicate
327 removeDups[Results[j].parent] = dist;
328 parentNameSeq[Results[j].parent] = Results[j].parentAligned;
329 }else if (dist > itDup->second) { //is this a stronger number for this parent
330 removeDups[Results[j].parent] = dist;
331 parentNameSeq[Results[j].parent] = Results[j].parentAligned;
335 for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
336 //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
337 itSeq = parentNameSeq.find(itDup->first);
338 //cout << itDup->first << itSeq->second << endl;
339 Sequence* seq = new Sequence(itDup->first, itSeq->second);
343 member.dist = itDup->second;
345 seqs.push_back(member);
348 //limit number of parents to explore - default 3
349 if (Results.size() > parents) {
351 sort(seqs.begin(), seqs.end(), compareSeqDist);
352 //prioritize larger more similiar sequence fragments
353 reverse(seqs.begin(), seqs.end());
355 for (int k = seqs.size()-1; k > (parents-1); k--) {
361 //put seqs into vector to send to slayer
362 vector<Sequence*> seqsForSlayer;
363 for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); }
365 //mask then send to slayer...
367 decalc->setMask(seqMask);
370 decalc->runMask(query);
373 for (int k = 0; k < seqsForSlayer.size(); k++) {
374 decalc->runMask(seqsForSlayer[k]);
377 spotMap = decalc->getMaskMap();
380 if (m->control_pressed) { for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } return 0; }
383 chimeraFlags = slayer->getResults(query, seqsForSlayer);
384 if (m->control_pressed) { return 0; }
385 chimeraResults = slayer->getOutput();
388 for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; }
396 catch(exception& e) {
397 m->errorOut(e, "ChimeraSlayer", "getChimeras");
401 //***************************************************************************************************************
402 void ChimeraSlayer::printBlock(data_struct data, string flag, ostream& out){
406 out << querySeq->getName() << '\t';
407 out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t';
408 //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
409 //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
411 out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
412 out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
414 out << flag << '\t' << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
416 //out << "Similarity of parents: " << data.ab << endl;
417 //out << "Similarity of query to parentA: " << data.qa << endl;
418 //out << "Similarity of query to parentB: " << data.qb << endl;
421 //out << "Per_id(QL,A): " << data.qla << endl;
422 //out << "Per_id(QL,B): " << data.qlb << endl;
423 //out << "Per_id(QR,A): " << data.qra << endl;
424 //out << "Per_id(QR,B): " << data.qrb << endl;
427 //out << "DeltaL: " << (data.qla - data.qlb) << endl;
428 //out << "DeltaR: " << (data.qra - data.qrb) << endl;
431 catch(exception& e) {
432 m->errorOut(e, "ChimeraSlayer", "printBlock");
436 //***************************************************************************************************************
437 string ChimeraSlayer::getBlock(data_struct data, string flag){
440 string outputString = "";
442 outputString += querySeq->getName() + "\t";
443 outputString += data.parentA.getName() + "\t" + data.parentB.getName() + "\t";
445 outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t";
446 outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t";
448 outputString += flag + "\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
452 catch(exception& e) {
453 m->errorOut(e, "ChimeraSlayer", "getBlock");
457 //***************************************************************************************************************/