}
}
//***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, string mode, int k, int ms, int mms, int win, float div,
+ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, map<string, int>& prior, string mode, int k, int ms, int mms, int win, float div,
int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() {
try {
fastafile = file; templateSeqs = readSeqs(fastafile);
numWanted = numw;
realign = r;
trimChimera = trim;
+ priority = prior;
decalc = new DeCalculator();
createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap
//run filter on template
- for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } templateSeqs.clear();
-
+ for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { break; } runFilter(templateSeqs[i]); }
+
+
}
catch(exception& e) {
m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
}
}
//***************************************************************************************************************
-int ChimeraSlayer::getTemplate(Sequence* q) {
+vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) {
try {
+ //when template=self, the query file is sorted from most abundance to least abundant
+ //userTemplate grows as the query file is processed by adding sequences that are not chimeric and more abundant
+ vector<Sequence*> userTemplate;
+
+ int myAbund = priority[q->getName()];
+
+ for (int i = 0; i < templateSeqs.size(); i++) {
+
+ if (m->control_pressed) { return userTemplate; }
+
+ //have I reached a sequence with the same abundance as myself?
+ if (!(priority[templateSeqs[i]->getName()] > myAbund)) { break; }
+
+ //if its am not chimeric add it
+ if (chimericSeqs.count(templateSeqs[i]->getName()) == 0) { userTemplate.push_back(templateSeqs[i]); }
+ }
+
string kmerDBNameLeft;
string kmerDBNameRight;
#ifdef USE_MPI
for (int i = 0; i < userTemplate.size(); i++) {
- if (m->control_pressed) { return 0; }
+ if (m->control_pressed) { return userTemplate; }
string leftFrag = userTemplate[i]->getUnaligned();
leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
databaseLeft->setNumSeqs(userTemplate.size());
for (int i = 0; i < userTemplate.size(); i++) {
- if (m->control_pressed) { return 0; }
+ if (m->control_pressed) { return userTemplate; }
string rightFrag = userTemplate[i]->getUnaligned();
rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
for (int i = 0; i < userTemplate.size(); i++) {
- if (m->control_pressed) { return 0; }
+ if (m->control_pressed) { return userTemplate; }
string leftFrag = userTemplate[i]->getUnaligned();
leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
databaseLeft->setNumSeqs(userTemplate.size());
for (int i = 0; i < userTemplate.size(); i++) {
- if (m->control_pressed) { return 0; }
+ if (m->control_pressed) { return userTemplate; }
string rightFrag = userTemplate[i]->getUnaligned();
rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
//generate blastdb
databaseLeft = new BlastDB(-1.0, -1.0, 1, -3);
- for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return 0; } databaseLeft->addSequence(*userTemplate[i]); }
+ for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return userTemplate; } databaseLeft->addSequence(*userTemplate[i]); }
databaseLeft->generateDB();
databaseLeft->setNumSeqs(userTemplate.size());
}
- return 0;
+ return userTemplate;
}
catch(exception& e) {
if (templateFileName != "self") {
if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
else if (searchMethod == "blast") { delete databaseLeft; }
- }else {
- //delete userTemplate
- for (int i = 0; i < userTemplate.size(); i++) {
- delete userTemplate[i];
- }
- userTemplate.clear();
}
}
//***************************************************************************************************************
m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
outAcc << querySeq->getName() << endl;
+ if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); }
+
if (trimChimera) {
int lengthLeft = spotMap[chimeraResults[0].winLEnd] - spotMap[chimeraResults[0].winLStart];
int lengthRight = spotMap[chimeraResults[0].winREnd] - spotMap[chimeraResults[0].winRStart];
out << endl;
}else {
out << querySeq->getName() << "\tno" << endl;
- if (templateFileName == "self") {
- Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
- runFilter(temp);
- userTemplate.push_back(temp);
- }
}
return trim;
m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
outAcc << querySeq->getName() << endl;
+ if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); }
+
if (trimChimera) {
string newAligned = trim->getAligned();
out << endl;
}else {
out << querySeq->getName() << "\tno" << endl;
- if (templateFileName == "self") {
- Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
- runFilter(temp);
- userTemplate.push_back(temp);
- }
}
return trim;
outAccString += querySeq->getName() + "\n";
results = true;
+ if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); }
+
//write to accnos file
int length = outAccString.length();
char* buf2 = new char[length];
MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
delete buf;
-
- if (template == "self") {
- Sequence temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
- runFilter(temp);
- userTemplate.push_back(temp);
- }
}
outAccString += querySeq->getName() + "\n";
results = true;
+ if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); }
+
//write to accnos file
int length = outAccString.length();
char* buf2 = new char[length];
MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
delete buf;
-
- if (template == "self") {
- Sequence temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
- runFilter(temp);
- userTemplate.push_back(temp);
- }
}
return trim;
//you must create a template
vector<Sequence*> thisTemplate;
if (templateFileName != "self") { thisTemplate = templateSeqs; }
- else { getTemplate(query); thisTemplate = userTemplate; } //fills this template and creates the databases
+ else { thisTemplate = getTemplate(query); } //fills this template and creates the databases
if (m->control_pressed) { return 0; }
public:
ChimeraSlayer(string, string, bool, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
- ChimeraSlayer(string, string, bool, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
+ ChimeraSlayer(string, string, bool, map<string, int>&, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
~ChimeraSlayer();
map<int, int> spotMap;
Database* databaseRight;
Database* databaseLeft;
- vector<Sequence*> userTemplate; //when template=self, the query file is sorted from most abundance to least abundant
- //userTemplate grows as the query file is processed by adding sequences that are not chimeric
- set<string> namesOfChimericSeqs; //only used when template=self
+ map<string, int> priority; //for template=self, seqname, seqAligned, abundance
+ set<string> chimericSeqs; //for template=self, so we don't add chimeric sequences to the userTemplate set
vector<data_struct> chimeraResults;
data_results printResults;
string getBlock(data_struct, string);
string getBlock(data_results, data_results, bool, bool, string);
//int readNameFile(string);
- int getTemplate(Sequence*);
+ vector<Sequence*> getTemplate(Sequence*);
};
//sort fastafile by abundance, returns new sorted fastafile name
m->mothurOut("Sorting fastafile according to abundance..."); cout.flush();
- fastaFileNames[s] = sortFastaFile(fastaFileNames[s], nameFile);
+ map<string, int> priority = sortFastaFile(fastaFileNames[s], nameFile);
m->mothurOut("Done."); m->mothurOutEndLine();
if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } return 0; }
- chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, nameFile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);
+ chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, priority, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);
}
if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it
MPIPos = m->setFilePosFasta(fastaFileNames[s], numSeqs); //fills MPIPos, returns numSeqs
- //send file positions to all processes
- for(int i = 1; i < processors; i++) {
- MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
- MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+ if (templatefile != "self") { //if template=self we can only use 1 processor
+ //send file positions to all processes
+ for(int i = 1; i < processors; i++) {
+ MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+ MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+ }
}
-
//figure out how many sequences you have to align
numSeqsPerProcessor = numSeqs / processors;
int startIndex = pid * numSeqsPerProcessor;
if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
-
+
+ if (templatefile == "self") { //if template=self we can only use 1 processor
+ startIndex = 0;
+ numSeqsPerProcessor = numSeqs;
+ }
+
//do your part
driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos);
if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } remove(outputFileName.c_str()); remove(accnosFileName.c_str()); delete chimera; return 0; }
}else{ //you are a child process
- MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
- MPIPos.resize(numSeqs+1);
- MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
+ if (templatefile != "self") { //if template=self we can only use 1 processor
+ MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+ MPIPos.resize(numSeqs+1);
+ MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
- //figure out how many sequences you have to align
- numSeqsPerProcessor = numSeqs / processors;
- int startIndex = pid * numSeqsPerProcessor;
- if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
+ //figure out how many sequences you have to align
+ numSeqsPerProcessor = numSeqs / processors;
+ int startIndex = pid * numSeqsPerProcessor;
+ if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
- //do your part
- driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos);
+ //do your part
+ driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos);
- if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
+ if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); } MPI_File_close(&outMPIAccnos); for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } delete chimera; return 0; }
+
+ }
}
//close files
}
}
/**************************************************************************************************/
-
-string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
+map<string, int> ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
try {
+ map<string, int> nameAbund;
//read through fastafile and store info
map<string, string> seqs;
while (!in.eof()) {
- if (m->control_pressed) { in.close(); return ""; }
+ if (m->control_pressed) { in.close(); return nameAbund; }
Sequence seq(in); m->gobble(in);
seqs[seq.getName()] = seq.getAligned();
vector<seqPriorityNode> nameMapCount;
int error = m->readNames(nameFile, nameMapCount, seqs);
- if (m->control_pressed) { return ""; }
+ if (m->control_pressed) { return nameAbund; }
- if (error == 1) { m->control_pressed = true; return ""; }
- if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return ""; }
+ if (error == 1) { m->control_pressed = true; return nameAbund; }
+ if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return nameAbund; }
sort(nameMapCount.begin(), nameMapCount.end(), compareSeqPriorityNodes);
//print new file in order of
for (int i = 0; i < nameMapCount.size(); i++) {
out << ">" << nameMapCount[i].name << endl << nameMapCount[i].seq << endl;
+ nameAbund[nameMapCount[i].name] = nameMapCount[i].numIdentical;
}
out.close();
rename(newFasta.c_str(), fastaFile.c_str());
- return fastaFile;
+ return nameAbund;
}
catch(exception& e) {
int driver(linePair*, string, string, string, string);
int createProcesses(string, string, string, string);
int divideInHalf(Sequence, string&, string&);
- string sortFastaFile(string, string);
+ map<string, int> sortFastaFile(string, string);
#ifdef USE_MPI
int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);