*/
#include "chimeraslayer.h"
+#include "chimerarealigner.h"
+#include "kmerdb.hpp"
//***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string filename, string temp) { fastafile = filename; templateFile = temp; }
+//Builds a ChimeraSlayer for one query fasta file.
+// mode - template search method; "kmer" is the value that makes doPrep() build the left/right kmer databases
+// r - whether getChimeras() re-aligns the query against the template
+// f - name of the fasta file that holds the query sequences
+ChimeraSlayer::ChimeraSlayer(string mode, bool r, string f) : searchMethod(mode), realign(r), fastafile(f) {
+ decalc = new DeCalculator();
+
+ //the destructor deletes both databases whenever searchMethod is "kmer", even if doPrep() never ran
+ //or returned early on control_pressed; starting from NULL keeps those deletes well defined
+ databaseLeft = NULL;
+ databaseRight = NULL;
+}
//***************************************************************************************************************
-
-ChimeraSlayer::~ChimeraSlayer() {
+//One-time preparation before any query is checked.
+// - when searchMethod is "kmer": builds, or reloads from an existing file, one kmer database over the
+//   first 33% of every unaligned template and one over the part starting at 66% of its length
+// - reads every query from fastafile, calls createFilter on templates plus queries, then runs the
+//   filter over the templates
+//Returns 0 on every path, including the early exits taken when m->control_pressed is set.
+int ChimeraSlayer::doPrep() {
try {
- for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
- for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
+
+ string kmerDBNameLeft;
+ string kmerDBNameRight;
+
+ //generate the kmerdb to pass to maligner
+ if (searchMethod == "kmer") {
+ //leftside
+ string leftTemplateFileName = "left." + templateFileName;
+ databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
+ //NOTE(review): char('0'+ kmerSize) is only a digit for kmerSize 0-9 - confirm it matches the file name KmerDB itself uses
+ kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+ ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
+
+ //the saved database file could not be opened, so build it from the templates
+ if(!kmerFileTestLeft){
+
+ for (int i = 0; i < templateSeqs.size(); i++) {
+
+ if (m->control_pressed) { return 0; }
+
+ string leftFrag = templateSeqs[i]->getUnaligned();
+ leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
+
+ Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
+ databaseLeft->addSequence(leftTemp);
+ }
+ databaseLeft->generateDB();
+
+ }else {
+ //a saved database file exists, load it instead of rebuilding
+ databaseLeft->readKmerDB(kmerFileTestLeft);
+ }
+ kmerFileTestLeft.close();
+
+ databaseLeft->setNumSeqs(templateSeqs.size());
+
+ //rightside
+ string rightTemplateFileName = "right." + templateFileName;
+ databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
+ //NOTE(review): same single-character kmerSize naming as the left side - confirm for kmerSize above 9
+ kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+ ifstream kmerFileTestRight(kmerDBNameRight.c_str());
+
+ //the saved database file could not be opened, so build it from the templates
+ if(!kmerFileTestRight){
+
+ for (int i = 0; i < templateSeqs.size(); i++) {
+ if (m->control_pressed) { return 0; }
+
+ string rightFrag = templateSeqs[i]->getUnaligned();
+ rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
+
+ Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
+ databaseRight->addSequence(rightTemp);
+ }
+ databaseRight->generateDB();
+
+ }else {
+ //a saved database file exists, load it instead of rebuilding
+ databaseRight->readKmerDB(kmerFileTestRight);
+ }
+ kmerFileTestRight.close();
+
+ databaseRight->setNumSeqs(templateSeqs.size());
+
+ }
+
+ int start = time(NULL);
+ //filter the sequences
+ //read in all query seqs
+ ifstream in;
+ openInputFile(fastafile, in);
+
+ vector<Sequence*> tempQuerySeqs;
+ while(!in.eof()){
+ //on cancel, free the queries read so far before leaving
+ if (m->control_pressed) { for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } return 0; }
+
+ Sequence* s = new Sequence(in);
+ gobble(in);
+
+ //NOTE(review): a Sequence with an empty name is not stored and is not deleted here - confirm whether that is intended
+ if (s->getName() != "") { tempQuerySeqs.push_back(s); }
+ }
+ in.close();
+
+ //createFilter is given the templates and the queries together
+ vector<Sequence*> temp = templateSeqs;
+ for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
+
+ createFilter(temp, 0.0); //just removed columns where all seqs have a gap
+
+ //the query copies were only needed to build the filter
+ for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
+
+ if (m->control_pressed) { return 0; }
+
+
+ //run filter on template
+ for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); }
+
+ m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to filter."); m->mothurOutEndLine();
+
+ return 0;
+
}
catch(exception& e) {
- errorOut(e, "ChimeraSlayer", "~ChimeraSlayer");
+ m->errorOut(e, "ChimeraSlayer", "doprep");
exit(1);
}
-}
+}
//***************************************************************************************************************
-void ChimeraSlayer::print(ostream& out) {
+//Releases the DeCalculator and, when the search method is "kmer", the right and left kmer databases.
+ChimeraSlayer::~ChimeraSlayer() {
+ delete decalc;
+
+ if (searchMethod == "kmer") {
+ delete databaseRight;
+ delete databaseLeft;
+ }
+}
+//***************************************************************************************************************
+//Logs the bootstrap cutoff (minBS) through m, framed by blank log lines, then writes the
+//tab-separated column header of the chimera report to out.
+void ChimeraSlayer::printHeader(ostream& out) {
+ m->mothurOutEndLine();
+
+ string cutoffNote = "Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.";
+ m->mothurOut(cutoffNote);
+
+ m->mothurOutEndLine();
+
+ out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
+}
+//***************************************************************************************************************
+//Writes the outcome stored for the current query (querySeq, chimeraFlags, chimeraResults).
+// out - report stream: gets the printBlock line when chimeraFlags is "yes", otherwise "<name>\tno"
+// outAcc - gets the query name when either orientation reaches both minBS and divR
+//Returns 0.
+int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
try {
- mothurOutEndLine();
- mothurOut("Only reporting sequence supported by 90% of bootstrapped results.");
- mothurOutEndLine();
-
- for (int i = 0; i < querySeqs.size(); i++) {
-
- if (chimeraFlags[i] == "yes") {
- cout << i << endl;
- if ((chimeraResults[i][0].bsa >= 90.0) || (chimeraResults[i][0].bsb >= 90.0)) {
- mothurOut(querySeqs[i]->getName() + "\tyes"); mothurOutEndLine();
- out << querySeqs[i]->getName() << "\tyes" << endl;
- }else {
- out << querySeqs[i]->getName() << "\tyes" << endl;
- //mothurOut(querySeqs[i]->getName() + "\tno"); mothurOutEndLine();
+ if (chimeraFlags == "yes") {
+ //the local flag turns "yes" only when one orientation meets both the bootstrap and the divergence cutoff
+ string chimeraFlag = "no";
+ if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
+ ||
+ (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
+
+
+ if (chimeraFlag == "yes") {
+ //NOTE(review): this bootstrap test is already implied by the condition that set chimeraFlag to "yes"
+ if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
+ m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
+ outAcc << querySeq->getName() << endl;
}
-
- printBlock(chimeraResults[i][0], out, i);
-
- out << endl;
- }else{
- out << querySeqs[i]->getName() << "\tno" << endl;
- //mothurOut(querySeqs[i]->getName() + "\tno"); mothurOutEndLine();
}
- }
-
+
+ //NOTE(review): printBlock runs even when the local chimeraFlag stayed "no", and it writes a literal "yes" in the Flag column - confirm that is intended
+ printBlock(chimeraResults[0], out);
+ out << endl;
+ }else { out << querySeq->getName() << "\tno" << endl; }
+
+ return 0;
+
}
catch(exception& e) {
- errorOut(e, "ChimeraSlayer", "print");
+ m->errorOut(e, "ChimeraSlayer", "print");
exit(1);
}
}
-
//***************************************************************************************************************
-int ChimeraSlayer::getChimeras() {
+//Runs the chimera check for one query sequence.
+// query - sequence to test; it is filtered in place and, when seqMask is set, masked in place
+//Steps: filter the query, ask Maligner for candidate parents, optionally re-align, and when Maligner
+//answers "yes" pass at most `parents` de-duplicated parent copies to Slayer.
+//Leaves the verdict in chimeraFlags ("no" unless Slayer says otherwise) and the details in chimeraResults.
+//Returns 0 on every path; every exit, including the control_pressed ones, frees maligner, slayer and
+//the parent copies made here.
+int ChimeraSlayer::getChimeras(Sequence* query) {
try {
+ chimeraFlags = "no";
- //read in query sequences and subject sequences
- mothurOut("Reading sequences and template file... "); cout.flush();
- querySeqs = readSeqs(fastafile);
- templateSeqs = readSeqs(templateFile);
- mothurOut("Done."); mothurOutEndLine();
-
- int numSeqs = querySeqs.size();
+ //filter query
+ spotMap = runFilter(query);
- if (unaligned) { mothurOut("Your sequences need to be aligned when you use the chimeraslayer method."); mothurOutEndLine(); return 1; }
+ querySeq = query;
- chimeraResults.resize(numSeqs);
- chimeraFlags.resize(numSeqs, "no");
- spotMap.resize(numSeqs);
+ //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
+ maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
+ slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
- //break up file if needed
- int linesPerProcess = numSeqs / processors ;
+ //maligner and slayer were just allocated, so a cancelled run has to release them too
+ if (m->control_pressed) { delete maligner; delete slayer; return 0; }
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- //find breakup of sequences for all times we will Parallelize
- if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
- else {
- //fill line pairs
- for (int i = 0; i < (processors-1); i++) {
- lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+ string chimeraFlag = maligner->getResults(query, decalc);
+ if (m->control_pressed) { delete maligner; delete slayer; return 0; }
+ vector<results> Results = maligner->getOutput();
+
+ //found in testing realigning only made things worse
+ if (realign) {
+ ChimeraReAligner realigner(templateSeqs, match, misMatch);
+ realigner.reAlign(query, Results);
+ }
+
+ if (chimeraFlag == "yes") {
+
+ //get sequence that were given from maligner results
+ vector<SeqDist> seqs;
+ map<string, float> removeDups;
+ map<string, float>::iterator itDup;
+ map<string, string> parentNameSeq;
+ map<string, string>::iterator itSeq;
+ for (int j = 0; j < Results.size(); j++) {
+ //score a region by its length times its local similarity to the query
+ float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
+ //only add if you are not a duplicate
+ itDup = removeDups.find(Results[j].parent);
+ if (itDup == removeDups.end()) { //this is not duplicate
+ removeDups[Results[j].parent] = dist;
+ parentNameSeq[Results[j].parent] = Results[j].parentAligned;
+ }else if (dist > itDup->second) { //is this a stronger number for this parent
+ removeDups[Results[j].parent] = dist;
+ parentNameSeq[Results[j].parent] = Results[j].parentAligned;
}
- //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
- int i = processors - 1;
- lines.push_back(new linePair((i*linesPerProcess), numSeqs));
- }
- #else
- lines.push_back(new linePair(0, numSeqs));
- #endif
-
- if (seqMask != "") { decalc = new DeCalculator(); } //to use below
-
- //initialize spotMap
- for (int j = 0; j < numSeqs; j++) {
- for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) {
- spotMap[j][i] = i;
}
- }
-
- //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
- maligner = new Maligner(templateSeqs, numWanted, match, misMatch, 1.01, minSim);
- slayer = new Slayer(window, increment, minSim, divR, iters);
-
- for (int i = 0; i < querySeqs.size(); i++) {
-
- string chimeraFlag = maligner->getResults(querySeqs[i]);
- //float percentIdentical = maligner->getPercentID();
- vector<results> Results = maligner->getOutput();
- cout << "Processing sequence: " << i+1 << endl;
+ //one heap copy per distinct parent, built from the aligned text Maligner returned
+ for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
+ //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
+ itSeq = parentNameSeq.find(itDup->first);
+//cout << itDup->first << itSeq->second << endl;
+ Sequence* seq = new Sequence(itDup->first, itSeq->second);
+
+ SeqDist member;
+ member.seq = seq;
+ member.dist = itDup->second;
+
+ seqs.push_back(member);
+ }
- //for (int j = 0; j < Results.size(); j++) {
- //cout << "regionStart = " << Results[j].regionStart << "\tRegionEnd = " << Results[j].regionEnd << "\tName = " << Results[j].parent << "\tPerQP = " << Results[j].queryToParent << "\tLocalPerQP = " << Results[j].queryToParentLocal << "\tdivR = " << Results[j].divR << endl;
- //}
+ //limit number of parents to explore - default 3
+ if (Results.size() > parents) {
+ //sort by distance
+ sort(seqs.begin(), seqs.end(), compareSeqDist);
+ //prioritize larger more similar sequence fragments
+ reverse(seqs.begin(), seqs.end());
+
+ //drop, and free, everything past the first `parents` entries
+ for (int k = seqs.size()-1; k > (parents-1); k--) {
+ delete seqs[k].seq;
+ seqs.pop_back();
+ }
+ }
- //if (chimeraFlag == "yes") {
+ //put seqs into vector to send to slayer
+ vector<Sequence*> seqsForSlayer;
+ for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); }
- //get sequence that were given from maligner results
- vector<SeqDist> seqs;
- map<string, float> removeDups;
- map<string, float>::iterator itDup;
- for (int j = 0; j < Results.size(); j++) {
- float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
- //only add if you are not a duplicate
- itDup = removeDups.find(Results[j].parent);
- if (itDup == removeDups.end()) { //this is not duplicate
- removeDups[Results[j].parent] = dist;
- }else if (dist > itDup->second) { //is this a stronger number for this parent
- removeDups[Results[j].parent] = dist;
- }
- }
+ //mask then send to slayer...
+ if (seqMask != "") {
+ decalc->setMask(seqMask);
- for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
- Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
-
- SeqDist member;
- member.seq = seq;
- member.dist = itDup->second;
-
- seqs.push_back(member);
- }
+ //mask querys
+ decalc->runMask(query);
- //limit number of parents to explore - default 5
- if (Results.size() > parents) {
- //sort by distance
- sort(seqs.begin(), seqs.end(), compareSeqDist);
- //prioritize larger more similiar sequence fragments
- reverse(seqs.begin(), seqs.end());
-
- for (int k = seqs.size()-1; k > (parents-1); k--) {
- delete seqs[k].seq;
- seqs.pop_back();
- }
- }
-
- //put seqs into vector to send to slayer
- vector<Sequence*> seqsForSlayer;
- for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); }
-//ofstream out;
-//string name = querySeqs[i]->getName();
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(0, name.find_last_of("|"));
-//cout << name << endl;
-//string filename = name + ".seqsforslayer";
-//openOutputFile(filename, out);
-//for (int u = 0; u < seqsForSlayer.size(); u++) { seqsForSlayer[u]->printSequence(out); }
-//out.close();
-//filename = name + ".fasta";
-//openOutputFile(filename, out);
-//q->printSequence(out);
-//out.close();
-
-
- //mask then send to slayer...
- if (seqMask != "") {
- decalc->setMask(seqMask);
-
- //mask querys
- decalc->runMask(querySeqs[i]);
-
- //mask parents
- for (int k = 0; k < seqsForSlayer.size(); k++) {
- decalc->runMask(seqsForSlayer[k]);
- }
-
- for (int i = 0; i < numSeqs; i++) {
- spotMap[i] = decalc->getMaskMap();
- }
-
+ //mask parents
+ for (int k = 0; k < seqsForSlayer.size(); k++) {
+ decalc->runMask(seqsForSlayer[k]);
}
- //send to slayer
- chimeraFlags[i] = slayer->getResults(querySeqs[i], seqsForSlayer);
- chimeraResults[i] = slayer->getOutput();
+ //once masked, positions are reported through the mask's map instead of the filter's
+ spotMap = decalc->getMaskMap();
+ }
- //free memory
- for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; }
- //}
+ //cancelled: free the parent copies and both helpers before leaving
+ if (m->control_pressed) { for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } delete maligner; delete slayer; return 0; }
- }
- //free memory
- for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
-
- if (seqMask != "") {
- delete decalc;
+ //send to slayer
+ chimeraFlags = slayer->getResults(query, seqsForSlayer);
+ //cancelled: the parent copies are still owned here, so free them along with both helpers
+ if (m->control_pressed) { for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } delete maligner; delete slayer; return 0; }
+ chimeraResults = slayer->getOutput();
+
+ //free memory
+ for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; }
}
+ delete maligner;
+ delete slayer;
+
return 0;
}
catch(exception& e) {
- errorOut(e, "ChimeraSlayer", "getChimeras");
+ m->errorOut(e, "ChimeraSlayer", "getChimeras");
exit(1);
}
}
//***************************************************************************************************************
-Sequence* ChimeraSlayer::getSequence(string name) {
- try{
- Sequence* temp;
-
- //look through templateSeqs til you find it
- int spot = -1;
- for (int i = 0; i < templateSeqs.size(); i++) {
- if (name == templateSeqs[i]->getName()) {
- spot = i;
- break;
- }
- }
-
- if(spot == -1) { mothurOut("Error: Could not find sequence in chimeraSlayer."); mothurOutEndLine(); return NULL; }
+//Writes one tab-separated report record for the current query (querySeq) to out, with no line ending.
+//Fields, in order: query name, parentA name, parentB name, divr_qla_qrb, qla_qrb, bsa, divr_qlb_qra,
+//qlb_qra, bsb, the literal "yes", left window "start-end", right window "start-end"; window positions
+//are looked up through spotMap.
+// data - result record to print
+// out - destination stream
+void ChimeraSlayer::printBlock(data_struct data, ostream& out){
+ try {
+ //out << "Name\tParentA\tParentB\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
- temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned());
+ out << querySeq->getName() << '\t';
+ out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t';
+ //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
+ //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
- return temp;
- }
- catch(exception& e) {
- errorOut(e, "ChimeraSlayer", "getSequence");
- exit(1);
- }
-}
-//***************************************************************************************************************
-void ChimeraSlayer::printBlock(data_struct data, ostream& out, int i){
- try {
+ out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
+ out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
- out << "parentA: " << data.parentA.getName() << " parentB: " << data.parentB.getName() << endl;
- out << "Left Window: " << spotMap[i][data.winLStart] << " " << spotMap[i][data.winLEnd] << endl;
- out << "Right Window: " << spotMap[i][data.winRStart] << " " << spotMap[i][data.winREnd] << endl;
+ //NOTE(review): the Flag field is always the literal "yes", whatever thresholds the caller applied - confirm that is intended
+ out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
- out << "Divergence of Query to Leftside ParentA and Rightside ParentB: " << data.divr_qla_qrb << '\t' << "Bootstrap: " << data.bsa << endl;
- out << "Divergence of Query to Rightside ParentA and Leftside ParentB: " << data.divr_qlb_qra << '\t' << "Bootstrap: " << data.bsb << endl;
+ //out << "Similarity of parents: " << data.ab << endl;
+ //out << "Similarity of query to parentA: " << data.qa << endl;
+ //out << "Similarity of query to parentB: " << data.qb << endl;
- out << "Similarity of parents: " << data.ab << endl;
- out << "Similarity of query to parentA: " << data.qa << endl;
- out << "Similarity of query to parentB: " << data.qb << endl;
//out << "Per_id(QL,A): " << data.qla << endl;
//out << "Per_id(QL,B): " << data.qlb << endl;
//out << "Per_id(QR,B): " << data.qrb << endl;
- out << "DeltaL: " << (data.qla - data.qlb) << endl;
- out << "DeltaR: " << (data.qra - data.qrb) << endl;
+ //out << "DeltaL: " << (data.qla - data.qlb) << endl;
+ //out << "DeltaR: " << (data.qra - data.qrb) << endl;
}
catch(exception& e) {
- errorOut(e, "ChimeraSlayer", "printBlock");
+ m->errorOut(e, "ChimeraSlayer", "printBlock");
exit(1);
}
}