GCC_OPTIMIZATION_LEVEL = 3;
GCC_PREPROCESSOR_DEFINITIONS = (
"MOTHUR_FILES=\"\\\"../release\\\"\"",
- "VERSION=\"\\\"1.18.0\\\"\"",
- "RELEASE_DATE=\"\\\"4/11/2011\\\"\"",
+ "VERSION=\"\\\"1.18.1\\\"\"",
+ "RELEASE_DATE=\"\\\"4/15/2011\\\"\"",
);
GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
GCC_MODEL_TUNING = "";
GCC_OPTIMIZATION_LEVEL = 3;
GCC_PREPROCESSOR_DEFINITIONS = (
- "VERSION=\"\\\"1.18.0\\\"\"",
- "RELEASE_DATE=\"\\\"4/11/2011\\\"\"",
+ "VERSION=\"\\\"1.18.1\\\"\"",
+ "RELEASE_DATE=\"\\\"4/15/2011\\\"\"",
);
GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
"-lreadline",
);
PREBINDING = NO;
- SDKROOT = macosx10.5;
+ SDKROOT = macosx10.6;
};
name = Release;
};
}
}
//***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, string mode, string abunds, int k, int ms, int mms, int win, float div,
+ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, string mode, int k, int ms, int mms, int win, float div,
int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() {
try {
fastafile = file; templateSeqs = readSeqs(fastafile);
increment = inc;
numWanted = numw;
realign = r;
- includeAbunds = abunds;
trimChimera = trim;
- //read name file and create nameMapRank
- readNameFile(name);
-
decalc = new DeCalculator();
createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap
//run filter on template
- for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); }
-
+ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } templateSeqs.clear();
+
}
catch(exception& e) {
m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
exit(1);
}
}
-//***************************************************************************************************************
-int ChimeraSlayer::readNameFile(string name) {
- try {
- ifstream in;
- m->openInputFile(name, in);
-
- int maxRank = 0;
- int minRank = 10000000;
-
- while(!in.eof()){
-
- if (m->control_pressed) { in.close(); return 0; }
-
- string thisname, repnames;
-
- in >> thisname; m->gobble(in); //read from first column
- in >> repnames; //read from second column
-
- map<string, vector<string> >::iterator it = nameMapRank.find(thisname);
- if (it == nameMapRank.end()) {
-
- vector<string> splitRepNames;
- m->splitAtComma(repnames, splitRepNames);
-
- nameMapRank[thisname] = splitRepNames;
-
- if (splitRepNames.size() > maxRank) { maxRank = splitRepNames.size(); }
- if (splitRepNames.size() < minRank) { minRank = splitRepNames.size(); }
-
- }else{ m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine(); }
-
- m->gobble(in);
- }
- in.close();
-
- //sanity check to make sure files match
- for (int i = 0; i < templateSeqs.size(); i++) {
- map<string, vector<string> >::iterator it = nameMapRank.find(templateSeqs[i]->getName());
-
- if (it == nameMapRank.end()) { m->mothurOut("[ERROR]: " + templateSeqs[i]->getName() + " is not in namesfile, but is in fastafile. Every name in fasta file must be in first column of names file."); m->mothurOutEndLine(); m->control_pressed = true; }
- }
-
- if (maxRank == minRank) { m->mothurOut("[ERROR]: all sequences in namesfile have the same abundance, aborting."); m->mothurOutEndLine(); m->control_pressed = true; }
-
- return 0;
-
- }
- catch(exception& e) {
- m->errorOut(e, "ChimeraSlayer", "readNameFile");
- exit(1);
- }
-}
-
//***************************************************************************************************************
int ChimeraSlayer::doPrep() {
try {
}
}
//***************************************************************************************************************
-vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) {
+int ChimeraSlayer::getTemplate(Sequence* q) {
try {
- vector<Sequence*> thisTemplate;
-
- int thisRank;
- string thisName = q->getName();
- map<string, vector<string> >::iterator itRank = nameMapRank.find(thisName); // you will find it because we already sanity checked
- thisRank = (itRank->second).size();
-
- //create list of names we want to put into the template
- set<string> namesToAdd;
- for (itRank = nameMapRank.begin(); itRank != nameMapRank.end(); itRank++) {
- if (itRank->first != thisName) {
- if (includeAbunds == "greaterequal") {
- if ((itRank->second).size() >= thisRank) {
- //you are more abundant than me or equal to my abundance
- for (int i = 0; i < (itRank->second).size(); i++) {
- namesToAdd.insert((itRank->second)[i]);
- }
- }
- }else if (includeAbunds == "greater") {
- if ((itRank->second).size() > thisRank) {
- //you are more abundant than me
- for (int i = 0; i < (itRank->second).size(); i++) {
- namesToAdd.insert((itRank->second)[i]);
- }
- }
- }else if (includeAbunds == "all") {
- //add everyone
- for (int i = 0; i < (itRank->second).size(); i++) {
- namesToAdd.insert((itRank->second)[i]);
- }
- }
- }
- }
-
- for (int i = 0; i < templateSeqs.size(); i++) {
- if (namesToAdd.count(templateSeqs[i]->getName()) != 0) {
- thisTemplate.push_back(templateSeqs[i]);
- }
- }
-
string kmerDBNameLeft;
string kmerDBNameRight;
string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
#ifdef USE_MPI
- for (int i = 0; i < thisTemplate.size(); i++) {
+ for (int i = 0; i < userTemplate.size(); i++) {
- if (m->control_pressed) { return thisTemplate; }
+ if (m->control_pressed) { return 0; }
- string leftFrag = thisTemplate[i]->getUnaligned();
+ string leftFrag = userTemplate[i]->getUnaligned();
leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
- Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
+ Sequence leftTemp(userTemplate[i]->getName(), leftFrag);
databaseLeft->addSequence(leftTemp);
}
databaseLeft->generateDB();
- databaseLeft->setNumSeqs(thisTemplate.size());
+ databaseLeft->setNumSeqs(userTemplate.size());
- for (int i = 0; i < thisTemplate.size(); i++) {
- if (m->control_pressed) { return thisTemplate; }
+ for (int i = 0; i < userTemplate.size(); i++) {
+ if (m->control_pressed) { return 0; }
- string rightFrag = thisTemplate[i]->getUnaligned();
+ string rightFrag = userTemplate[i]->getUnaligned();
rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
- Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
+ Sequence rightTemp(userTemplate[i]->getName(), rightFrag);
databaseRight->addSequence(rightTemp);
}
databaseRight->generateDB();
- databaseRight->setNumSeqs(thisTemplate.size());
+ databaseRight->setNumSeqs(userTemplate.size());
#else
- for (int i = 0; i < thisTemplate.size(); i++) {
+ for (int i = 0; i < userTemplate.size(); i++) {
- if (m->control_pressed) { return thisTemplate; }
+ if (m->control_pressed) { return 0; }
- string leftFrag = thisTemplate[i]->getUnaligned();
+ string leftFrag = userTemplate[i]->getUnaligned();
leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
- Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
+ Sequence leftTemp(userTemplate[i]->getName(), leftFrag);
databaseLeft->addSequence(leftTemp);
}
databaseLeft->generateDB();
- databaseLeft->setNumSeqs(thisTemplate.size());
+ databaseLeft->setNumSeqs(userTemplate.size());
- for (int i = 0; i < thisTemplate.size(); i++) {
- if (m->control_pressed) { return thisTemplate; }
+ for (int i = 0; i < userTemplate.size(); i++) {
+ if (m->control_pressed) { return 0; }
- string rightFrag = thisTemplate[i]->getUnaligned();
+ string rightFrag = userTemplate[i]->getUnaligned();
rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
- Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
+ Sequence rightTemp(userTemplate[i]->getName(), rightFrag);
databaseRight->addSequence(rightTemp);
}
databaseRight->generateDB();
- databaseRight->setNumSeqs(thisTemplate.size());
+ databaseRight->setNumSeqs(userTemplate.size());
#endif
}else if (searchMethod == "blast") {
//generate blastdb
databaseLeft = new BlastDB(-1.0, -1.0, 1, -3);
- for (int i = 0; i < thisTemplate.size(); i++) { if (m->control_pressed) { return thisTemplate; } databaseLeft->addSequence(*thisTemplate[i]); }
+ for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return 0; } databaseLeft->addSequence(*userTemplate[i]); }
databaseLeft->generateDB();
- databaseLeft->setNumSeqs(thisTemplate.size());
+ databaseLeft->setNumSeqs(userTemplate.size());
}
- return thisTemplate;
+ return 0;
}
catch(exception& e) {
if (templateFileName != "self") {
if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
else if (searchMethod == "blast") { delete databaseLeft; }
+ }else {
+ //delete userTemplate
+ for (int i = 0; i < userTemplate.size(); i++) {
+ delete userTemplate[i];
+ }
+ userTemplate.clear();
}
}
//***************************************************************************************************************
Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) {
try {
Sequence* trim = NULL;
- if (trimChimera) { trim = trimQuery; }
+ if (trimChimera) { trim = new Sequence(trimQuery.getName(), trimQuery.getAligned()); }
if (chimeraFlags == "yes") {
string chimeraFlag = "no";
}
trim->setAligned(newAligned);
}
-
}
}
printBlock(chimeraResults[0], chimeraFlag, out);
out << endl;
- }else { out << querySeq->getName() << "\tno" << endl; }
+ }else {
+ out << querySeq->getName() << "\tno" << endl;
+ if (templateFileName == "self") {
+ Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
+ runFilter(temp);
+ userTemplate.push_back(temp);
+ }
+ }
return trim;
printBlock(leftPiece, rightPiece, leftChimeric, rightChimeric, chimeraFlag, out);
out << endl;
- }else { out << querySeq->getName() << "\tno" << endl; }
+ }else {
+ out << querySeq->getName() << "\tno" << endl;
+ if (templateFileName == "self") {
+ Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
+ runFilter(temp);
+ userTemplate.push_back(temp);
+ }
+ }
return trim;
MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
delete buf;
+
+				//when template=self, keep a filtered copy of this query in userTemplate.
+				//userTemplate stores Sequence*, so the copy is heap allocated, exactly as the non-MPI print() does.
+				if (templateFileName == "self") {
+					Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
+					runFilter(temp);
+					userTemplate.push_back(temp);
+				}
}
string outputString = "";
Sequence* trim = NULL;
- if (trimChimera) { trim = trimQuery; }
+ if (trimChimera) { trim = new Sequence(trimQuery.getName(), trimQuery.getAligned()); }
if (chimeraFlags == "yes") {
string chimeraFlag = "no";
MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
delete buf;
+
+				//when template=self, keep a filtered copy of this query in userTemplate.
+				//userTemplate stores Sequence*, so the copy is heap allocated, exactly as the non-MPI print() does.
+				if (templateFileName == "self") {
+					Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
+					runFilter(temp);
+					userTemplate.push_back(temp);
+				}
}
return trim;
//***************************************************************************************************************
int ChimeraSlayer::getChimeras(Sequence* query) {
try {
- if (trimChimera) {
- trimQuery = new Sequence(query->getName(), query->getAligned());
- printResults.trimQuery = *trimQuery;
- }
+
+ trimQuery.setName(query->getName()); trimQuery.setAligned(query->getAligned());
+ printResults.trimQuery = trimQuery;
chimeraFlags = "no";
printResults.flag = "no";
//you must create a template
vector<Sequence*> thisTemplate;
if (templateFileName != "self") { thisTemplate = templateSeqs; }
- else { thisTemplate = getTemplate(query); } //fills this template and creates the databases
+ else { getTemplate(query); thisTemplate = userTemplate; } //fills this template and creates the databases
if (m->control_pressed) { return 0; }
public:
ChimeraSlayer(string, string, bool, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
- ChimeraSlayer(string, string, bool, string, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
+ ChimeraSlayer(string, string, bool, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
~ChimeraSlayer();
private:
Sequence* querySeq;
- Sequence* trimQuery;
+ Sequence trimQuery;
DeCalculator* decalc;
map<int, int> spotMap;
Database* databaseRight;
Database* databaseLeft;
- map<string, vector<string> > nameMapRank; //sequence name to rank so you can construct a template of the abundant sequences if the user uses itself as template
+ vector<Sequence*> userTemplate; //when template=self, the query file is sorted from most abundance to least abundant
+ //userTemplate grows as the query file is processed by adding sequences that are not chimeric
+ set<string> namesOfChimericSeqs; //only used when template=self
vector<data_struct> chimeraResults;
data_results printResults;
- string chimeraFlags, searchMethod, fastafile, includeAbunds;
+ string chimeraFlags, searchMethod, fastafile;
bool realign, trimChimera;
int window, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters, increment;
float divR;
void printBlock(data_results, data_results, bool, bool, string, ostream&);
string getBlock(data_struct, string);
string getBlock(data_results, data_results, bool, bool, string);
- int readNameFile(string);
- vector<Sequence*> getTemplate(Sequence*);
+ //int readNameFile(string);
+ int getTemplate(Sequence*);
};
CommandParameter pminsnp("minsnp", "Number", "", "100", "", "", "",false,false); parameters.push_back(pminsnp);
CommandParameter pminbs("minbs", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminbs);
CommandParameter psearch("search", "Multiple", "kmer-blast-distance", "distance", "", "", "",false,false); parameters.push_back(psearch);
- CommandParameter pinclude("include", "Multiple", "greater-greaterequal-all", "greater", "", "", "",false,false); parameters.push_back(pinclude);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
CommandParameter prealign("realign", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prealign);
CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptrim);
helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n";
helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n";
- helpString += "The include parameter is used when template=self and allows you to choose which sequences will make up the \"template\". Options are greater, greaterequal and all, default=greater, meaning sequences with greater abundance than the query sequence. \n";
helpString += "The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n";
#ifdef USE_MPI
helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
m->setProcessors(temp);
convert(temp, processors);
- includeAbunds = validParameter.validFile(parameters, "include", false); if (includeAbunds == "not found") { includeAbunds = "greater"; }
- if ((includeAbunds != "greater") && (includeAbunds != "greaterequal") && (includeAbunds != "all")) { includeAbunds = "greater"; m->mothurOut("Invalid include setting. options are greater, greaterequal or all. using greater."); m->mothurOutEndLine(); }
-
temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found") { temp = "7"; }
convert(temp, ksize);
if (templatefile != "self") { //you want to run slayer with a refernce template
chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);
}else {
+ if (processors != 1) { m->mothurOut("When using template=self, mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
+ string nameFile = "";
if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one
- chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, nameFileNames[s], search, includeAbunds, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);
+ nameFile = nameFileNames[s];
}else {
-
m->mothurOutEndLine(); m->mothurOut("No namesfile given, running unique.seqs command to generate one."); m->mothurOutEndLine(); m->mothurOutEndLine();
//use unique.seqs to create new name and fastafile
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
- string nameFile = filenames["name"][0];
+ nameFile = filenames["name"][0];
fastaFileNames[s] = filenames["fasta"][0];
-
- chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, nameFile, search, includeAbunds, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);
}
+
+ //sort fastafile by abundance, returns new sorted fastafile name
+ m->mothurOut("Sorting fastafile according to abundance..."); cout.flush();
+ fastaFileNames[s] = sortFastaFile(fastaFileNames[s], nameFile);
+ m->mothurOut("Done."); m->mothurOutEndLine();
+
+ if (m->control_pressed) { for (int j = 0; j < outputNames.size(); j++) { remove(outputNames[j].c_str()); } return 0; }
+
+ chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, nameFile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);
}
if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it
exit(1);
}
}
+/**************************************************************************************************/
+
+//rewrites fastaFile so that its sequences appear from highest to lowest numIdentical, as read from nameFile.
+//the sorted sequences are first written to fastaFile + ".temp", which then replaces fastaFile.
+//returns fastaFile on success. returns "" if control_pressed was set, or if an error occurred (in which case control_pressed is set here).
+string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
+	try {
+		
+		//read through fastafile and store info, sequence name -> aligned sequence
+		map<string, string> seqs;
+		ifstream in;
+		m->openInputFile(fastaFile, in);
+		
+		while (!in.eof()) {
+			
+			if (m->control_pressed) { in.close(); return ""; }
+			
+			Sequence seq(in); m->gobble(in);
+			seqs[seq.getName()] = seq.getAligned();
+		}
+		
+		in.close();
+		
+		//read namefile, one node per name that is also in the fastafile
+		vector<seqPriorityNode> nameMapCount;
+		int error = m->readNames(nameFile, nameMapCount, seqs);
+		
+		if (m->control_pressed) { return ""; }
+		
+		if (error == 1) { m->control_pressed = true; return ""; }
+		if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return ""; }
+		
+		sort(nameMapCount.begin(), nameMapCount.end(), compareSeqPriorityNodes);
+		
+		string newFasta = fastaFile + ".temp";
+		ofstream out;
+		m->openOutputFile(newFasta, out);
+		
+		//print new file in sorted order, highest numIdentical first
+		for (int i = 0; i < nameMapCount.size(); i++) {
+			out << ">" << nameMapCount[i].name << endl << nameMapCount[i].seq << endl;
+		}
+		out.close();
+		
+		//replace the original fastafile with the sorted one.
+		//rename() signals failure with a nonzero return, and whether it may overwrite an existing file is implementation-defined,
+		//so on failure remove the target and try once more before giving up.
+		if (rename(newFasta.c_str(), fastaFile.c_str()) != 0) {
+			remove(fastaFile.c_str());
+			if (rename(newFasta.c_str(), fastaFile.c_str()) != 0) {
+				m->mothurOut("[ERROR]: unable to replace " + fastaFile + " with the sorted file " + newFasta + ", aborting."); m->mothurOutEndLine();
+				m->control_pressed = true; return "";
+			}
+		}
+		
+		return fastaFile;
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "ChimeraSlayerCommand", "sortFastaFile");
+		exit(1);
+	}
+}
/**************************************************************************************************/
#include "command.hpp"
#include "chimera.h"
-
/***********************************************************/
class ChimeraSlayerCommand : public Command {
int driver(linePair*, string, string, string, string);
int createProcesses(string, string, string, string);
int divideInHalf(Sequence, string&, string&);
+ string sortFastaFile(string, string);
#ifdef USE_MPI
int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
#endif
bool abort, realign, trim, trimera;
- string fastafile, templatefile, outputDir, search, namefile, includeAbunds;
+ string fastafile, templatefile, outputDir, search, namefile;
int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
float divR;
Chimera* chimera;
int end;
};
+/************************************************************/
+//one named sequence plus the numIdentical count that compareSeqPriorityNodes orders by
+struct seqPriorityNode {
+	int numIdentical;
+	string seq;
+	string name;
+	//start the count at 0 so a default-constructed node never exposes an indeterminate value
+	seqPriorityNode() : numIdentical(0) {}
+	seqPriorityNode(int n, string s, string nm) : numIdentical(n), seq(s), name(nm) {}
+	~seqPriorityNode() {}
+};
/***************************************************************/
struct spearmanRank {
string name;
return (left.score > right.score);
}
//********************************************************************************************************************
+//sorts highest to lowest by numIdentical
+//nodes are taken by const reference so a comparison does not copy their name and sequence strings
+inline bool compareSeqPriorityNodes(const seqPriorityNode& left, const seqPriorityNode& right){
+	return (left.numIdentical > right.numIdentical);
+}
+//********************************************************************************************************************
//sorts lowest to highest
inline bool compareSpearmanReverse(spearmanRank left, spearmanRank right){
return (left.score < right.score);
exit(1);
}
}
+/**********************************************************************************************************************/
+//reads namefile and appends to nameVector one seqPriorityNode for every line whose first column is a key of fastamap.
+//each node holds getNumNames(second column), the fastamap value for that name, and the name itself.
+//returns 1 if a line names a sequence that is not in fastamap or is missing its second column, otherwise 0.
+//reading stops early, without changing the return value, if control_pressed is set.
+int MothurOut::readNames(string namefile, vector<seqPriorityNode>& nameVector, map<string, string>& fastamap) {
+	try {
+		int error = 0;
+		
+		//open input file
+		ifstream in;
+		openInputFile(namefile, in);
+		
+		while (!in.eof()) {
+			if (control_pressed) { break; }
+			
+			string firstCol, secondCol;
+			in >> firstCol >> secondCol;
+			
+			//a failed extraction does not necessarily set eof, so stop here instead of looping on a stream that can no longer be read.
+			//firstCol is only non-empty when the line had a name but no second column.
+			if (in.fail()) {
+				if (firstCol != "") {
+					error = 1;
+					mothurOut("[ERROR]: " + firstCol + " is missing its second column in your namesfile, please correct."); mothurOutEndLine();
+				}
+				break;
+			}
+			gobble(in);
+			
+			int num = getNumNames(secondCol);
+			
+			map<string, string>::iterator it = fastamap.find(firstCol);
+			if (it == fastamap.end()) {
+				error = 1;
+				mothurOut("[ERROR]: " + firstCol + " is not in your fastafile, but is in your namesfile, please correct."); mothurOutEndLine();
+			}else {
+				seqPriorityNode temp(num, it->second, firstCol);
+				nameVector.push_back(temp);
+			}
+		}
+		in.close();
+		
+		return error;
+		
+	}
+	catch(exception& e) {
+		errorOut(e, "MothurOut", "readNames");
+		exit(1);
+	}
+}
/***********************************************************************/
void gobble(istream&);
void gobble(istringstream&);
map<string, int> readNames(string);
+ int readNames(string, vector<seqPriorityNode>&, map<string, string>&);
//searchs and checks
bool checkReleaseVersion(ifstream&, string);