A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */; };
A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; };
A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A823106A9AD700688F62 /* maligner.cpp */; };
+ A7E8A22F1125939F0011D39C /* chimerarealigner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E8A22E1125939F0011D39C /* chimerarealigner.cpp */; };
A7F5759710CEBC0600E20E47 /* libreadline.a in Frameworks */ = {isa = PBXBuildFile; fileRef = A7F5759610CEBC0600E20E47 /* libreadline.a */; };
EB1216880F619B83004A865F /* bergerparker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216870F619B83004A865F /* bergerparker.cpp */; };
EB1216E50F61ACFB004A865F /* bstick.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216E40F61ACFB004A865F /* bstick.cpp */; };
A7E4A782106913F900688F62 /* getsharedotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsharedotucommand.cpp; sourceTree = SOURCE_ROOT; };
A7E4A822106A9AD700688F62 /* maligner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = maligner.h; sourceTree = SOURCE_ROOT; };
A7E4A823106A9AD700688F62 /* maligner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = maligner.cpp; sourceTree = SOURCE_ROOT; };
+ A7E8A22D1125939F0011D39C /* chimerarealigner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerarealigner.h; sourceTree = SOURCE_ROOT; };
+ A7E8A22E1125939F0011D39C /* chimerarealigner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerarealigner.cpp; sourceTree = SOURCE_ROOT; };
A7F5759610CEBC0600E20E47 /* libreadline.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libreadline.a; path = "../readline-6.0/libreadline.a"; sourceTree = SOURCE_ROOT; };
C6859E8B029090EE04C91782 /* Mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = Mothur.1; sourceTree = "<group>"; };
EB1216860F619B83004A865F /* bergerparker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bergerparker.h; sourceTree = SOURCE_ROOT; };
A75B887B104C16860083C454 /* ccode.cpp */,
A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */,
A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */,
+ A7E8A22D1125939F0011D39C /* chimerarealigner.h */,
+ A7E8A22E1125939F0011D39C /* chimerarealigner.cpp */,
A7B04491106CC3E60046FC83 /* chimeraslayer.h */,
A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */,
372A55531017922B00C5194B /* decalc.h */,
A704E8A31106045D00870157 /* otuhierarchycommand.cpp in Sources */,
A70B00C8110885EB002F453A /* setdircommand.cpp in Sources */,
A794201111107897003AECCD /* distancedb.cpp in Sources */,
+ A7E8A22F1125939F0011D39C /* chimerarealigner.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
string alignFileName = outputDir + getRootName(getSimpleName(candidateFileNames[s])) + "align";
string reportFileName = outputDir + getRootName(getSimpleName(candidateFileNames[s])) + "align.report";
string accnosFileName = outputDir + getRootName(getSimpleName(candidateFileNames[s])) + "flip.accnos";
+ bool hasAccnos = true;
int numFastaSeqs = 0;
for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse compliment as well.");
}else{ mothurOut(" If the reverse compliment proved to be better it was reported."); }
mothurOutEndLine();
- }
+ }else{ hasAccnos = false; }
}
#else
ifstream inFASTA;
driver(lines[0], alignFileName, reportFileName, accnosFileName, candidateFileNames[s]);
//delete accnos file if its blank else report to user
- if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); }
+ if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); hasAccnos = false; }
else {
mothurOut("Some of you sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + ".");
if (!flip) {
#endif
-
+ mothurOut("Output File Names: " + alignFileName + ", " + reportFileName);
+ if (hasAccnos) { mothurOut(", " + accnosFileName + "."); }
+ else { mothurOut("."); }
+ mothurOutEndLine();
mothurOut("It took " + toString(time(NULL) - start) + " secs to align " + toString(numFastaSeqs) + " sequences.");
mothurOutEndLine();
sort(pref.begin(), pref.end(), comparePref);
return 0;
-
+
}
catch(exception& e) {
errorOut(e, "Bellerophon", "getChimeras");
}
/**************************************************************************************************/
+//assumes you have added all the template sequences using the addSequence function and run generateDB.
+map<int, float> BlastDB::findClosest(Sequence* seq, int n) {
+ try{
+ map<int, float> topMatches;
+
+ ofstream queryFile;
+ openOutputFile(queryFileName, queryFile);
+ queryFile << '>' << seq->getName() << endl;
+ queryFile << seq->getUnaligned() << endl;
+ queryFile.close();
+
+ // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
+ // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
+ // long. With this setting, it seems comparable in speed to the suffix tree approach.
+
+ string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -b " + toString(n) + " -v " + toString(n);
+ blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
+ system(blastCommand.c_str());
+
+ ifstream m8FileHandle;
+ openInputFile(blastFileName, m8FileHandle);
+
+ string dummy;
+ int templateAccession;
+ gobble(m8FileHandle);
+//string name = seq->getName();
+//ofstream out;
+//openOutputFileAppend(name, out);
+ while(!m8FileHandle.eof()){
+ m8FileHandle >> dummy >> templateAccession >> searchScore;
+//out << dummy << '\t' << templateAccession << '\t' << searchScore << endl;
+ //get rest of junk in line
+ while (!m8FileHandle.eof()) { char c = m8FileHandle.get();
+ //out << c;
+ if (c == 10 || c == 13){ break; } }
+
+ gobble(m8FileHandle);
+ topMatches[templateAccession] = searchScore;
+ }
+ m8FileHandle.close();
+//out.close();
+ return topMatches;
+ }
+ catch(exception& e) {
+ errorOut(e, "BlastDB", "findClosest");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
void BlastDB::addSequence(Sequence seq) {
try {
void BlastDB::generateDB() {
try {
- mothurOut("Generating the temporary BLAST database...\t"); cout.flush();
+ //mothurOut("Generating the temporary BLAST database...\t"); cout.flush();
path = globaldata->argv;
path = path.substr(0, (path.find_last_of('m')));
string formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F
// option tells formatdb that seqs are DNA, not prot
- mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
+ //mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
}
catch(exception& e) {
errorOut(e, "BlastDB", "generateDB");
void generateDB();
void addSequence(Sequence);
vector<int> findClosestSequences(Sequence*, int);
+ map<int, float> findClosest(Sequence*, int); //template index -> searchscore
private:
string dbFileName;
//***************************************************************************************************************
-Ccode::Ccode(string filename, string temp, string o) { fastafile = filename; templateFile = temp; outputDir = o; }
+Ccode::Ccode(string filename, string o) {
+ fastafile = filename; outputDir = o;
+ distCalc = new eachGapDist();
+ decalc = new DeCalculator();
+
+ mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo";
+ ofstream out2;
+ openOutputFile(mapInfo, out2);
+
+ out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
+ out2.close();
+}
//***************************************************************************************************************
-
Ccode::~Ccode() {
- try {
- for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
- for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
- }
- catch(exception& e) {
- errorOut(e, "Ccode", "~Ccode");
- exit(1);
- }
+ delete distCalc;
+ delete decalc;
}
//***************************************************************************************************************
+void Ccode::printHeader(ostream& out) {
+ out << "For full window mapping info refer to " << mapInfo << endl << endl;
+}
+//***************************************************************************************************************
void Ccode::print(ostream& out) {
try {
mothurOutEndLine();
- string mapInfo = outputDir + getRootName(getSimpleName(fastafile)) + "mapinfo";
ofstream out2;
- openOutputFile(mapInfo, out2);
-
- out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
+ openOutputFileAppend(mapInfo, out2);
- for (int j = 0; j < querySeqs.size(); j++) {
- out2 << querySeqs[j]->getName() << endl;
- for (it = spotMap[j].begin(); it!= spotMap[j].end(); it++) {
- out2 << it->first << '\t' << it->second << endl;
- }
+ out2 << querySeq->getName() << endl;
+ for (it = spotMap.begin(); it!= spotMap.end(); it++) {
+ out2 << it->first << '\t' << it->second << endl;
}
out2.close();
out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
- out << "For full window mapping info refer to " << mapInfo << endl << endl;
-
- for (int i = 0; i < querySeqs.size(); i++) {
-
- out << querySeqs[i]->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
+ out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
- for (int j = 0; j < closest[i].size(); j++) {
- out << setprecision(3) << closest[i][j].seq->getName() << '\t' << closest[i][j].dist << endl;
- }
- out << endl << endl;
+ for (int j = 0; j < closest.size(); j++) {
+ out << setprecision(3) << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
+ }
+ out << endl << endl;
- //for each window
- //window mapping info.
- out << "Mapping information: ";
- //you mask and did not filter
- if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
+ //for each window
+ //window mapping info.
+ out << "Mapping information: ";
+ //you mask and did not filter
+ if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
- //you filtered and did not mask
- if ((seqMask == "") && (filter)) { out << "filter and trim."; }
+ //you filtered and did not mask
+ if ((seqMask == "") && (filter)) { out << "filter and trim."; }
- //you masked and filtered
- if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
+ //you masked and filtered
+ if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
- out << endl << "Window\tStartPos\tEndPos" << endl;
- it = trim[i].begin();
+ out << endl << "Window\tStartPos\tEndPos" << endl;
+ it = trim.begin();
- for (int k = 0; k < windows[i].size()-1; k++) {
- out << k+1 << '\t' << spotMap[i][windows[i][k]-it->first] << '\t' << spotMap[i][windows[i][k]-it->first+windowSizes[i]] << endl;
- }
+ for (int k = 0; k < windows.size()-1; k++) {
+ out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
+ }
- out << windows[i].size() << '\t' << spotMap[i][windows[i][windows[i].size()-1]-it->first] << '\t' << spotMap[i][it->second-it->first-1] << endl;
- out << endl;
+ out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
+ out << endl;
- out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
- for (int k = 0; k < windows[i].size(); k++) {
- float ds = averageQuery[i][k] / averageRef[i][k];
- out << k+1 << '\t' << averageQuery[i][k] << '\t' << sdQuery[i][k] << '\t' << averageRef[i][k] << '\t'<< sdRef[i][k] << '\t' << ds << '\t' << anova[i][k] << endl;
- }
- out << endl;
+ out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
+ for (int k = 0; k < windows.size(); k++) {
+ float ds = averageQuery[k] / averageRef[k];
+ out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl;
+ }
+ out << endl;
- //varRef
- //varQuery
- /* F test for differences among variances.
- * varQuery is expected to be higher or similar than varRef */
- //float fs = varQuery[query][i] / varRef[query][i]; /* F-Snedecor, test for differences of variances */
+ //varRef
+ //varQuery
+ /* F test for differences among variances.
+ * varQuery is expected to be higher or similar than varRef */
+ //float fs = varQuery[query] / varRef[query]; /* F-Snedecor, test for differences of variances */
- bool results = false;
+ bool results = false;
- //confidence limit, t - Student, anova
- out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
+ //confidence limit, t - Student, anova
+ out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
- for (int k = 0; k < windows[i].size(); k++) {
- string temp = "";
- if (isChimericConfidence[i][k]) { temp += "*\t"; }
- else { temp += "\t"; }
+ for (int k = 0; k < windows.size(); k++) {
+ string temp = "";
+ if (isChimericConfidence[k]) { temp += "*\t"; }
+ else { temp += "\t"; }
- if (isChimericTStudent[i][k]) { temp += "*\t"; }
- else { temp += "\t"; }
+ if (isChimericTStudent[k]) { temp += "*\t"; }
+ else { temp += "\t"; }
- if (isChimericANOVA[i][k]) { temp += "*\t"; }
- else { temp += "\t"; }
+ if (isChimericANOVA[k]) { temp += "*\t"; }
+ else { temp += "\t"; }
- out << k+1 << '\t' << temp << endl;
+ out << k+1 << '\t' << temp << endl;
- if (temp == "*\t*\t*\t") { results = true; }
- }
- out << endl;
+ if (temp == "*\t*\t*\t") { results = true; }
+ }
+ out << endl;
- if (results) {
- mothurOut(querySeqs[i]->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
- }
+ if (results) {
+ mothurOut(querySeq->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
}
+
}
catch(exception& e) {
errorOut(e, "Ccode", "print");
exit(1);
}
}
-
//***************************************************************************************************************
-int Ccode::getChimeras() {
+int Ccode::getChimeras(Sequence* query) {
try {
-
- //read in query sequences and subject sequences
- mothurOut("Reading sequences and template file... "); cout.flush();
- querySeqs = readSeqs(fastafile);
- templateSeqs = readSeqs(templateFile);
- mothurOut("Done."); mothurOutEndLine();
-
- int numSeqs = querySeqs.size();
-
- if (unaligned) { mothurOut("Your sequences need to be aligned when you use the bellerophon ccode."); mothurOutEndLine(); return 1; }
-
- closest.resize(numSeqs);
-
- refCombo.resize(numSeqs, 0);
- sumRef.resize(numSeqs);
- varRef.resize(numSeqs);
- varQuery.resize(numSeqs);
- sdRef.resize(numSeqs);
- sdQuery.resize(numSeqs);
- sumQuery.resize(numSeqs);
- sumSquaredRef.resize(numSeqs);
- sumSquaredQuery.resize(numSeqs);
- averageRef.resize(numSeqs);
- averageQuery.resize(numSeqs);
- anova.resize(numSeqs);
- isChimericConfidence.resize(numSeqs);
- isChimericTStudent.resize(numSeqs);
- isChimericANOVA.resize(numSeqs);
- trim.resize(numSeqs);
- spotMap.resize(numSeqs);
- windowSizes.resize(numSeqs, window);
- windows.resize(numSeqs);
-
- //break up file if needed
- int linesPerProcess = numSeqs / processors ;
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- //find breakup of sequences for all times we will Parallelize
- if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
- else {
- //fill line pairs
- for (int i = 0; i < (processors-1); i++) {
- lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
- }
- //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
- int i = processors - 1;
- lines.push_back(new linePair((i*linesPerProcess), numSeqs));
- }
-
- #else
- lines.push_back(new linePair(0, numSeqs));
- #endif
- distCalc = new eachGapDist();
- decalc = new DeCalculator();
-
- //find closest
- if (processors == 1) {
- mothurOut("Finding top matches for sequences... "); cout.flush();
- closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
- mothurOut("Done."); mothurOutEndLine();
- }else { createProcessesClosest(); }
+ closest.clear();
+ refCombo = 0;
+ sumRef.clear();
+ varRef.clear();
+ varQuery.clear();
+ sdRef.clear();
+ sdQuery.clear();
+ sumQuery.clear();
+ sumSquaredRef.clear();
+ sumSquaredQuery.clear();
+ averageRef.clear();
+ averageQuery.clear();
+ anova.clear();
+ isChimericConfidence.clear();
+ isChimericTStudent.clear();
+ isChimericANOVA.clear();
+ trim.clear();
+ spotMap.clear();
+ windowSizes = window;
+ windows.clear();
- //initialize spotMap
- for (int j = 0; j < numSeqs; j++) {
- for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) {
- spotMap[j][i] = i;
- }
- }
+
+ querySeq = query;
+
+ //find closest matches to query
+ closest = findClosest(query, numWanted);
+ //initialize spotMap
+ for (int i = 0; i < query->getAligned().length(); i++) { spotMap[i] = i; }
+
//mask sequences if the user wants to
if (seqMask != "") {
decalc->setMask(seqMask);
- //mask querys
- for (int i = 0; i < querySeqs.size(); i++) {
- decalc->runMask(querySeqs[i]);
- }
-
- //mask templates
- for (int i = 0; i < templateSeqs.size(); i++) {
- decalc->runMask(templateSeqs[i]);
- }
+ decalc->runMask(query);
- for (int i = 0; i < numSeqs; i++) {
- spotMap[i] = decalc->getMaskMap();
- }
+ //mask closest
+ for (int i = 0; i < closest.size(); i++) { decalc->runMask(closest[i].seq); }
+
+ spotMap = decalc->getMaskMap();
}
if (filter) {
- vector<Sequence*> temp = templateSeqs;
- for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); }
+ vector<Sequence*> temp;
+ for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq); }
+ temp.push_back(query);
createFilter(temp);
- runFilter(querySeqs);
- runFilter(templateSeqs);
+ for (int i = 0; i < temp.size(); i++) { runFilter(temp[i]); }
//update spotMap
map<int, int> newMap;
int spot = 0;
- int j = 0;
for (int i = 0; i < filterString.length(); i++) {
if (filterString[i] == '1') {
//add to newMap
- newMap[spot] = spotMap[j][i];
+ newMap[spot] = spotMap[i];
spot++;
}
}
-
- for (int i = 0; i < numSeqs; i++) {
- spotMap[i] = newMap;
- }
+ spotMap = newMap;
}
//trim sequences - this follows ccodes remove_extra_gaps
- for (int i = 0; i < querySeqs.size(); i++) {
- trimSequences(i);
- }
+ trimSequences(query);
+
//windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().
//Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
- for (int i = 0; i < querySeqs.size(); i++) {
- windows[i] = findWindows(i);
- }
+ windows = findWindows();
+
//remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later
- if (processors == 1) {
- for (int i = 0; i < closest.size(); i++) {
- removeBadReferenceSeqs(closest[i], i);
- }
- }else { createProcessesRemoveBad(); }
-
+ removeBadReferenceSeqs(closest);
- if (processors == 1) {
- //find the averages for each querys references
- for (int i = 0; i < numSeqs; i++) {
- getAverageRef(closest[i], i); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
- }
-
- //find the averages for the query
- for (int i = 0; i < numSeqs; i++) {
- getAverageQuery(closest[i], i); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
- }
- }else { createProcessesAverages(); }
- if (processors == 1) {
- //find the averages for each querys references
- for (int i = 0; i < numSeqs; i++) {
- findVarianceRef(i); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
- }
-
- //find the averages for the query
- for (int i = 0; i < numSeqs; i++) {
- findVarianceQuery(i); //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
- }
- }else { createProcessesVariances(); }
+ //find the averages for each querys references
+ getAverageRef(closest); //fills sumRef, averageRef, sumSquaredRef and refCombo.
+ getAverageQuery(closest, query); //fills sumQuery, averageQuery, sumSquaredQuery.
+
- if (processors == 1) {
- for (int i = 0; i < numSeqs; i++) {
- determineChimeras(i); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i].
- }
- }else { createProcessesDetermine(); }
+ //find the averages for each querys references
+ findVarianceRef(); //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
+
+
+ //find the averages for the query
+ findVarianceQuery(); //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0.
+
+ determineChimeras(); //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA.
//free memory
- for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
- delete distCalc;
- delete decalc;
+ for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; }
return 0;
}
}
/***************************************************************************************************************/
//ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
-void Ccode::trimSequences(int query) {
+void Ccode::trimSequences(Sequence* query) {
try {
int frontPos = 0; //should contain first position in all seqs that is not a gap character
- int rearPos = querySeqs[query]->getAligned().length();
+ int rearPos = query->getAligned().length();
//********find first position in closest seqs that is a non gap character***********//
//find first position all query seqs that is a non gap character
- for (int i = 0; i < closest[query].size(); i++) {
+ for (int i = 0; i < closest.size(); i++) {
- string aligned = closest[query][i].seq->getAligned();
+ string aligned = closest[i].seq->getAligned();
int pos = 0;
//find first spot in this seq
}
//find first position all querySeq[query] that is a non gap character
- string aligned = querySeqs[query]->getAligned();
+ string aligned = query->getAligned();
int pos = 0;
//find first spot in this seq
//********find last position in closest seqs that is a non gap character***********//
- for (int i = 0; i < closest[query].size(); i++) {
+ for (int i = 0; i < closest.size(); i++) {
- string aligned = closest[query][i].seq->getAligned();
+ string aligned = closest[i].seq->getAligned();
int pos = aligned.length();
//find first spot in this seq
}
//find last position all querySeqs[query] that is a non gap character
- aligned = querySeqs[query]->getAligned();
+ aligned = query->getAligned();
pos = aligned.length();
//find first spot in this seq
tempTrim[frontPos] = rearPos;
//save trimmed locations
- trim[query] = tempTrim;
+ trim = tempTrim;
//update spotMask
map<int, int> newMap;
for (int i = frontPos; i < rearPos; i++) {
//add to newMap
-//cout << query << '\t' << i << '\t' << spotMap[query][i] << endl;
- newMap[spot] = spotMap[query][i];
+ newMap[spot] = spotMap[i];
spot++;
}
-
- //for (it = newMap.begin(); it!=newMap.end(); it++) {
- //cout << query << '\t' << it->first << '\t' << it->second << endl;
- //}
-
- spotMap[query] = newMap;
-
-
+ spotMap = newMap;
}
catch(exception& e) {
errorOut(e, "Ccode", "trimSequences");
exit(1);
}
-
}
/***************************************************************************************************************/
-vector<int> Ccode::findWindows(int query) {
+vector<int> Ccode::findWindows() {
try {
vector<int> win;
- it = trim[query].begin();
+ it = trim.begin();
int length = it->second - it->first;
//default is wanted = 10% of total length
- if (windowSizes[query] > length) {
+ if (windowSizes > length) {
mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
- windowSizes[query] = length / 10;
- }else if (windowSizes[query] == 0) { windowSizes[query] = length / 10; }
- else if (windowSizes[query] > (length * 0.20)) {
+ windowSizes = length / 10;
+ }else if (windowSizes == 0) { windowSizes = length / 10; }
+ else if (windowSizes > (length * 0.20)) {
mothurOut("You have selected a window that is larger than 20% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine();
- }else if (windowSizes[query] < (length * 0.05)) {
+ }else if (windowSizes < (length * 0.05)) {
mothurOut("You have selected a window that is smaller than 5% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine();
}
//save starting points of each window
- for (int m = it->first; m < (it->second-windowSizes[query]); m+=windowSizes[query]) { win.push_back(m); }
+ for (int m = it->first; m < (it->second-windowSizes); m+=windowSizes) { win.push_back(m); }
//save last window
if (win[win.size()-1] < (it->first+length)) {
- win.push_back(win[win.size()-1]+windowSizes[query]); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
+ win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
} //with this you would get 1,25,50,75,100
-
return win;
-
}
catch(exception& e) {
errorOut(e, "Ccode", "findWindows");
}
//***************************************************************************************************************
//tried to make this look most like ccode original implementation
-void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs, int query) {
+void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs) {
try {
vector< vector<int> > numDiffBases;
//initialize to 0
for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
- it = trim[query].begin();
+ it = trim.begin();
int length = it->second - it->first;
//calc differences from each sequence to everyother seq in the set
seqs = goodSeqs;
}else { //warn, but dont remove any
- mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); mothurOutEndLine();
+ mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); mothurOutEndLine();
}
-
}
catch(exception& e) {
}
}
//***************************************************************************************************************
-vector< vector<SeqDist> > Ccode::findClosest(int start, int end, int numWanted) {
+//makes copy of templateseq for filter
+vector<SeqDist> Ccode::findClosest(Sequence* q, int numWanted) {
try{
- vector< vector<SeqDist> > topMatches; topMatches.resize(querySeqs.size());
-
- float smallestOverall, smallestLeft, smallestRight;
- smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000;
+ vector<SeqDist> topMatches;
- //for each sequence in querySeqs - find top matches to use as reference
- for(int j = start; j < end; j++){
-
- Sequence query = *(querySeqs[j]);
+ Sequence query = *(q);
- vector<SeqDist> distances;
+ //calc distance to each sequence in template seqs
+ for (int i = 0; i < templateSeqs.size(); i++) {
- //calc distance to each sequence in template seqs
- for (int i = 0; i < templateSeqs.size(); i++) {
-
- Sequence ref = *(templateSeqs[i]);
+ Sequence ref = *(templateSeqs[i]);
- //find overall dist
- distCalc->calcDist(query, ref);
- float dist = distCalc->getDist();
+ //find overall dist
+ distCalc->calcDist(query, ref);
+ float dist = distCalc->getDist();
- //save distance
- SeqDist temp;
- temp.seq = templateSeqs[i];
- temp.dist = dist;
+ //save distance
+ SeqDist temp;
+ temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned());
+ temp.dist = dist;
- distances.push_back(temp);
- }
-
- sort(distances.begin(), distances.end(), compareSeqDist);
-
- //save the number of top matches wanted
- for (int h = 0; h < numWanted; h++) {
- topMatches[j].push_back(distances[h]);
- }
+ topMatches.push_back(temp);
}
+ sort(topMatches.begin(), topMatches.end(), compareSeqDist);
+
return topMatches;
}
}
/**************************************************************************************************/
//find the distances from each reference sequence to every other reference sequence for each window for this query
-void Ccode::getAverageRef(vector<SeqDist> ref, int query) {
+void Ccode::getAverageRef(vector<SeqDist> ref) {
try {
- vector< vector< vector<int> > > diffs; //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
+ vector< vector< vector<int> > > diffs; //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
//initialize diffs vector
diffs.resize(ref.size());
for (int i = 0; i < diffs.size(); i++) {
diffs[i].resize(ref.size());
for (int j = 0; j < diffs[i].size(); j++) {
- diffs[i][j].resize(windows[query].size(), 0);
+ diffs[i][j].resize(windows.size(), 0);
}
}
- it = trim[query].begin();
+ it = trim.begin();
//find the distances from each reference sequence to every other reference sequence for each window for this query
for (int i = 0; i < ref.size(); i++) {
string refJ = ref[j].seq->getAligned();
- for (int k = 0; k < windows[query].size(); k++) {
+ for (int k = 0; k < windows.size(); k++) {
string refIWindowk, refJWindowk;
- if (k < windows[query].size()-1) {
+ if (k < windows.size()-1) {
//get window strings
- refIWindowk = refI.substr(windows[query][k], windowSizes[query]);
- refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);
+ refIWindowk = refI.substr(windows[k], windowSizes);
+ refJWindowk = refJ.substr(windows[k], windowSizes);
}else { //last window may be smaller than rest - see findwindows
//get window strings
- refIWindowk = refI.substr(windows[query][k], (it->second-windows[query][k]));
- refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
+ refIWindowk = refI.substr(windows[k], (it->second-windows[k]));
+ refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
}
//find differences
int diff = getDiff(refIWindowk, refJWindowk);
-//cout << i << '\t' << j << '\t' << k << '\t' << diff << endl;
+
//save differences in [i][j][k] and [j][i][k] since they are the same
diffs[i][j][k] = diff;
diffs[j][i][k] = diff;
}//i
//initialize sumRef for this query
- sumRef[query].resize(windows[query].size(), 0);
- sumSquaredRef[query].resize(windows[query].size(), 0);
- averageRef[query].resize(windows[query].size(), 0);
+ sumRef.resize(windows.size(), 0);
+ sumSquaredRef.resize(windows.size(), 0);
+ averageRef.resize(windows.size(), 0);
//find the sum of the differences for hte reference sequences
for (int i = 0; i < diffs.size(); i++) {
for (int j = 0; j < i; j++) {
//increment this querys reference sequences combos
- refCombo[query]++;
+ refCombo++;
for (int k = 0; k < diffs[i][j].size(); k++) {
- sumRef[query][k] += diffs[i][j][k];
- sumSquaredRef[query][k] += (diffs[i][j][k]*diffs[i][j][k]);
+ sumRef[k] += diffs[i][j][k];
+ sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]);
}//k
}//j
//find the average of the differences for the references for each window
- for (int i = 0; i < windows[query].size(); i++) {
- averageRef[query][i] = sumRef[query][i] / (float) refCombo[query];
+ for (int i = 0; i < windows.size(); i++) {
+ averageRef[i] = sumRef[i] / (float) refCombo;
}
}
}
}
/**************************************************************************************************/
-void Ccode::getAverageQuery (vector<SeqDist> ref, int query) {
+void Ccode::getAverageQuery (vector<SeqDist> ref, Sequence* query) {
try {
vector< vector<int> > diffs; //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
//initialize diffs vector
diffs.resize(ref.size());
for (int j = 0; j < diffs.size(); j++) {
- diffs[j].resize(windows[query].size(), 0);
+ diffs[j].resize(windows.size(), 0);
}
- it = trim[query].begin();
+ it = trim.begin();
- string refQuery = querySeqs[query]->getAligned();
+ string refQuery = query->getAligned();
//j<i, so you don't find distances from i to j and then j to i.
for (int j = 0; j < ref.size(); j++) {
string refJ = ref[j].seq->getAligned();
- for (int k = 0; k < windows[query].size(); k++) {
+ for (int k = 0; k < windows.size(); k++) {
string QueryWindowk, refJWindowk;
- if (k < windows[query].size()-1) {
+ if (k < windows.size()-1) {
//get window strings
- QueryWindowk = refQuery.substr(windows[query][k], windowSizes[query]);
- refJWindowk = refJ.substr(windows[query][k], windowSizes[query]);
+ QueryWindowk = refQuery.substr(windows[k], windowSizes);
+ refJWindowk = refJ.substr(windows[k], windowSizes);
}else { //last window may be smaller than rest - see findwindows
//get window strings
- QueryWindowk = refQuery.substr(windows[query][k], (it->second-windows[query][k]));
- refJWindowk = refJ.substr(windows[query][k], (it->second-windows[query][k]));
+ QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k]));
+ refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
}
//find differences
int diff = getDiff(QueryWindowk, refJWindowk);
-//cout << j << '\t' << k << '\t' << diff << endl;
+
//save differences
diffs[j][k] = diff;
//initialize sumRef for this query
- sumQuery[query].resize(windows[query].size(), 0);
- sumSquaredQuery[query].resize(windows[query].size(), 0);
- averageQuery[query].resize(windows[query].size(), 0);
+ sumQuery.resize(windows.size(), 0);
+ sumSquaredQuery.resize(windows.size(), 0);
+ averageQuery.resize(windows.size(), 0);
//find the sum of the differences
for (int j = 0; j < diffs.size(); j++) {
for (int k = 0; k < diffs[j].size(); k++) {
- sumQuery[query][k] += diffs[j][k];
- sumSquaredQuery[query][k] += (diffs[j][k]*diffs[j][k]);
+ sumQuery[k] += diffs[j][k];
+ sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]);
}//k
}//j
//find the average of the differences for the references for each window
- for (int i = 0; i < windows[query].size(); i++) {
- averageQuery[query][i] = sumQuery[query][i] / (float) ref.size();
+ for (int i = 0; i < windows.size(); i++) {
+ averageQuery[i] = sumQuery[i] / (float) ref.size();
}
-
-
}
catch(exception& e) {
errorOut(e, "Ccode", "getAverageQuery");
}
}
/**************************************************************************************************/
-void Ccode::findVarianceRef (int query) {
+void Ccode::findVarianceRef() {
try {
- varRef[query].resize(windows[query].size(), 0);
- sdRef[query].resize(windows[query].size(), 0);
+ varRef.resize(windows.size(), 0);
+ sdRef.resize(windows.size(), 0);
//for each window
- for (int i = 0; i < windows[query].size(); i++) {
- varRef[query][i] = (sumSquaredRef[query][i] - ((sumRef[query][i]*sumRef[query][i])/(float)refCombo[query])) / (float)(refCombo[query]-1);
- sdRef[query][i] = sqrt(varRef[query][i]);
+ for (int i = 0; i < windows.size(); i++) {
+ varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1);
+ sdRef[i] = sqrt(varRef[i]);
//set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
- if (averageRef[query][i] < 0.001) { averageRef[query][i] = 0.001; }
- if (sumRef[query][i] < 0.001) { sumRef[query][i] = 0.001; }
- if (varRef[query][i] < 0.001) { varRef[query][i] = 0.001; }
- if (sumSquaredRef[query][i] < 0.001) { sumSquaredRef[query][i] = 0.001; }
- if (sdRef[query][i] < 0.001) { sdRef[query][i] = 0.001; }
+ if (averageRef[i] < 0.001) { averageRef[i] = 0.001; }
+ if (sumRef[i] < 0.001) { sumRef[i] = 0.001; }
+ if (varRef[i] < 0.001) { varRef[i] = 0.001; }
+ if (sumSquaredRef[i] < 0.001) { sumSquaredRef[i] = 0.001; }
+ if (sdRef[i] < 0.001) { sdRef[i] = 0.001; }
}
}
}
}
/**************************************************************************************************/
-void Ccode::findVarianceQuery (int query) {
+void Ccode::findVarianceQuery() {
try {
- varQuery[query].resize(windows[query].size(), 0);
- sdQuery[query].resize(windows[query].size(), 0);
+ varQuery.resize(windows.size(), 0);
+ sdQuery.resize(windows.size(), 0);
//for each window
- for (int i = 0; i < windows[query].size(); i++) {
- varQuery[query][i] = (sumSquaredQuery[query][i] - ((sumQuery[query][i]*sumQuery[query][i])/(float) closest[query].size())) / (float) (closest[query].size()-1);
- sdQuery[query][i] = sqrt(varQuery[query][i]);
+ for (int i = 0; i < windows.size(); i++) {
+ varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1);
+ sdQuery[i] = sqrt(varQuery[i]);
//set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
- if (averageQuery[query][i] < 0.001) { averageQuery[query][i] = 0.001; }
- if (sumQuery[query][i] < 0.001) { sumQuery[query][i] = 0.001; }
- if (varQuery[query][i] < 0.001) { varQuery[query][i] = 0.001; }
- if (sumSquaredQuery[query][i] < 0.001) { sumSquaredQuery[query][i] = 0.001; }
- if (sdQuery[query][i] < 0.001) { sdQuery[query][i] = 0.001; }
+ if (averageQuery[i] < 0.001) { averageQuery[i] = 0.001; }
+ if (sumQuery[i] < 0.001) { sumQuery[i] = 0.001; }
+ if (varQuery[i] < 0.001) { varQuery[i] = 0.001; }
+ if (sumSquaredQuery[i] < 0.001) { sumSquaredQuery[i] = 0.001; }
+ if (sdQuery[i] < 0.001) { sdQuery[i] = 0.001; }
}
}
}
}
/**************************************************************************************************/
-void Ccode::determineChimeras (int query) {
+void Ccode::determineChimeras() {
try {
- isChimericConfidence[query].resize(windows[query].size(), false);
- isChimericTStudent[query].resize(windows[query].size(), false);
- isChimericANOVA[query].resize(windows[query].size(), false);
- anova[query].resize(windows[query].size());
+ isChimericConfidence.resize(windows.size(), false);
+ isChimericTStudent.resize(windows.size(), false);
+ isChimericANOVA.resize(windows.size(), false);
+ anova.resize(windows.size());
//for each window
- for (int i = 0; i < windows[query].size(); i++) {
+ for (int i = 0; i < windows.size(); i++) {
//get confidence limits
- float t = getT(closest[query].size()-1); //how many seqs you are comparing to this querySeq
- float dsUpper = (averageQuery[query][i] + (t * sdQuery[query][i])) / averageRef[query][i];
- float dsLower = (averageQuery[query][i] - (t * sdQuery[query][i])) / averageRef[query][i];
-//cout << t << '\t' << "ds upper = " << dsUpper << " dsLower = " << dsLower << endl;
- if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[query][i] > averageRef[query][i])) { /* range does not include 1 */
- isChimericConfidence[query][i] = true; /* significantly higher at P<0.05 */
-//cout << i << " made it here" << endl;
+ float t = getT(closest.size()-1); //how many seqs you are comparing to this querySeq
+ float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i];
+ float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i];
+
+ if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) { /* range does not include 1 */
+ isChimericConfidence[i] = true; /* significantly higher at P<0.05 */
+
}
//student t test
- int degreeOfFreedom = refCombo[query] + closest[query].size() - 2;
- float denomForT = (((refCombo[query]-1) * varQuery[query][i] + (closest[query].size() - 1) * varRef[query][i]) / (float) degreeOfFreedom) * ((refCombo[query] + closest[query].size()) / (float) (refCombo[query] * closest[query].size())); /* denominator, without sqrt(), for ts calculations */
+ int degreeOfFreedom = refCombo + closest.size() - 2;
+ float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size())); /* denominator, without sqrt(), for ts calculations */
- float ts = fabs((averageQuery[query][i] - averageRef[query][i]) / (sqrt(denomForT))); /* value of ts for t-student test */
+ float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT))); /* value of ts for t-student test */
t = getT(degreeOfFreedom);
-//cout << i << '\t' << t << '\t' << ts << endl;
- if ((ts >= t) && (averageQuery[query][i] > averageRef[query][i])) {
- isChimericTStudent[query][i] = true; /* significantly higher at P<0.05 */
+
+ if ((ts >= t) && (averageQuery[i] > averageRef[i])) {
+ isChimericTStudent[i] = true; /* significantly higher at P<0.05 */
}
//anova test
- float value1 = sumQuery[query][i] + sumRef[query][i];
- float value2 = sumSquaredQuery[query][i] + sumSquaredRef[query][i];
- float value3 = ((sumQuery[query][i]*sumQuery[query][i]) / (float) (closest[query].size())) + ((sumRef[query][i] * sumRef[query][i]) / (float) refCombo[query]);
- float value4 = (value1 * value1) / ( (float) (closest[query].size() + refCombo[query]) );
+ float value1 = sumQuery[i] + sumRef[i];
+ float value2 = sumSquaredQuery[i] + sumSquaredRef[i];
+ float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo);
+ float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) );
float value5 = value2 - value4;
float value6 = value3 - value4;
float value7 = value5 - value6;
float f = getF(degreeOfFreedom);
- if ((anovaValue >= f) && (averageQuery[query][i] > averageRef[query][i])) {
- isChimericANOVA[query][i] = true; /* significant P<0.05 */
+ if ((anovaValue >= f) && (averageQuery[i] > averageRef[i])) {
+ isChimericANOVA[i] = true; /* significant P<0.05 */
}
if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
- anova[query][i] = anovaValue;
+ anova[i] = anovaValue;
}
}
exit(1);
}
}
-
-/**************************************************************************************************/
-void Ccode::createProcessesClosest() {
- try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
-
- mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
- closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
- mothurOut("Done finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
-
- //write out data to file so parent can read it
- ofstream out;
- string s = toString(getpid()) + ".temp";
- openOutputFile(s, out);
-
- //output pairs
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- for (int j = 0; j < closest[i].size(); j++) {
- closest[i][j].seq->printSequence(out);
- }
- }
- out << ">" << endl; //to stop sequence read
-
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- for (int j = 0; j < closest[i].size(); j++) {
- out << closest[i][j].dist << '\t';
- }
- out << endl;
- }
- out.close();
- exit(0);
- }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
- }
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- //get data created by processes
- for (int i=0;i<processors;i++) {
- ifstream in;
- string s = toString(processIDS[i]) + ".temp";
- openInputFile(s, in);
-
- vector< vector<Sequence*> > tempClosest; tempClosest.resize(querySeqs.size());
- //get pairs
- for (int k = lines[i]->start; k < lines[i]->end; k++) {
- vector<Sequence*> tempVector;
-
- for (int j = 0; j < numWanted; j++) {
-
- Sequence* temp = new Sequence(in);
- gobble(in);
-
- tempVector.push_back(temp);
- }
-
- tempClosest[k] = tempVector;
- }
-
- string junk;
- in >> junk; gobble(in); // to get ">"
-
- vector< vector<float> > dists; dists.resize(querySeqs.size());
-
- for (int k = lines[i]->start; k < lines[i]->end; k++) {
- dists[k].resize(numWanted);
- for (int j = 0; j < numWanted; j++) {
- in >> dists[k][j];
- }
- gobble(in);
-
- }
-
- for (int k = lines[i]->start; k < lines[i]->end; k++) {
- closest[k].resize(numWanted);
- for (int j = 0; j < closest[k].size(); j++) {
- closest[k][j].seq = tempClosest[k][j];
- closest[k][j].dist = dists[k][j];
- }
- }
-
- in.close();
- remove(s.c_str());
- }
-
-
-#else
- closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
-#endif
-
- }
- catch(exception& e) {
- errorOut(e, "Ccode", "createProcessesClosest");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
-void Ccode::createProcessesRemoveBad() {
- try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
-
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- removeBadReferenceSeqs(closest[i], i);
- }
-
- //write out data to file so parent can read it
- ofstream out;
- string s = toString(getpid()) + ".temp";
- openOutputFile(s, out);
-
- //output pairs
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- out << closest[i].size() << endl;
- for (int j = 0; j < closest[i].size(); j++) {
- closest[i][j].seq->printSequence(out);
- }
- out << ">" << endl; //to stop sequence read
- }
-
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- for (int j = 0; j < closest[i].size(); j++) {
- out << closest[i][j].dist << '\t';
- }
- out << endl;
- }
-
- out.close();
-
- exit(0);
- }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
- }
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- //get data created by processes
- for (int i=0;i<processors;i++) {
- ifstream in;
- string s = toString(processIDS[i]) + ".temp";
- openInputFile(s, in);
-
- vector< vector<Sequence*> > tempClosest; tempClosest.resize(querySeqs.size());
- vector<int> sizes;
- //get pairs
- for (int k = lines[i]->start; k < lines[i]->end; k++) {
-
- int num;
- in >> num;
- sizes.push_back(num);
- gobble(in);
-
- vector<Sequence*> tempVector;
-
- for (int j = 0; j < num; j++) {
-
- Sequence* temp = new Sequence(in);
- gobble(in);
-
- tempVector.push_back(temp);
- }
- string junk;
- in >> junk; gobble(in); // to get ">"
-
- tempClosest[k] = tempVector;
- }
-
- vector< vector<float> > dists; dists.resize(querySeqs.size());
- int count = 0;
- for (int k = lines[i]->start; k < lines[i]->end; k++) {
- dists[k].resize(sizes[count]);
- for (int j = 0; j < sizes[count]; j++) {
- in >> dists[k][j];
- }
- gobble(in);
- count++;
- }
-
- count = 0;
- for (int k = lines[i]->start; k < lines[i]->end; k++) {
- for (int j = 0; j < sizes[count]; j++) {
- closest[k][j].seq = tempClosest[k][j];
- closest[k][j].dist = dists[k][j];
- }
- count++;
- }
-
- in.close();
- remove(s.c_str());
- }
-#else
- for (int i = 0; i < closest.size(); i++) {
- removeBadReferenceSeqs(closest[i], i);
- }
-#endif
-
- }
- catch(exception& e) {
- errorOut(e, "Ccode", "createProcessesRemoveBad");
- exit(1);
- }
-}
-//***************************************************************************************************************
-void Ccode::createProcessesAverages() {
- try {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
-
- //find the averages for each querys references
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- getAverageRef(closest[i], i); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
- }
-
- //find the averages for the query
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- getAverageQuery(closest[i], i); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
- }
-
-
- //write out data to file so parent can read it
- ofstream out;
- string s = toString(getpid()) + ".temp";
- openOutputFile(s, out);
-
- //output pairs
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- for (int j = 0; j < windows[i].size(); j++) {
- out << sumRef[i][j] << '\t';
- }
- out << endl;
- for (int j = 0; j < windows[i].size(); j++) {
- out << averageRef[i][j] << '\t';
- }
- out << endl;
- for (int j = 0; j < windows[i].size(); j++) {
- out << sumSquaredRef[i][j] << '\t';
- }
- out << endl;
- for (int j = 0; j < windows[i].size(); j++) {
- out << sumQuery[i][j] << '\t';
- }
- out << endl;
- for (int j = 0; j < windows[i].size(); j++) {
- out << averageQuery[i][j] << '\t';
- }
- out << endl;
- for (int j = 0; j < windows[i].size(); j++) {
- out << sumSquaredQuery[i][j] << '\t';
- }
- out << endl;
- out << refCombo[i] << endl;
- }
-
- out.close();
-
- exit(0);
- }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
- }
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- //get data created by processes
- for (int i=0;i<processors;i++) {
- ifstream in;
- string s = toString(processIDS[i]) + ".temp";
- openInputFile(s, in);
-
- //get pairs
- for (int k = lines[i]->start; k < lines[i]->end; k++) {
- sumRef[k].resize(windows[k].size());
- averageRef[k].resize(windows[k].size());
- sumSquaredRef[k].resize(windows[k].size());
- averageQuery[k].resize(windows[k].size());
- sumQuery[k].resize(windows[k].size());
- sumSquaredQuery[k].resize(windows[k].size());
-
- for (int j = 0; j < windows[k].size(); j++) {
- in >> sumRef[k][j];
- }
- gobble(in);
- for (int j = 0; j < windows[k].size(); j++) {
- in >> averageRef[k][j];
- }
- gobble(in);
- for (int j = 0; j < windows[k].size(); j++) {
- in >> sumSquaredRef[k][j];
- }
- gobble(in);
- for (int j = 0; j < windows[k].size(); j++) {
- in >> sumQuery[k][j];
- }
- gobble(in);
- for (int j = 0; j < windows[k].size(); j++) {
- in >> averageQuery[k][j];
- }
- gobble(in);
- for (int j = 0; j < windows[k].size(); j++) {
- in >> sumSquaredQuery[k][j];
- }
- gobble(in);
- in >> refCombo[k];
- gobble(in);
- }
-
- in.close();
- remove(s.c_str());
- }
-#else
- //find the averages for each querys references
- for (int i = 0; i < querySeqs.size(); i++) {
- getAverageRef(closest[i], i); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
- }
-
- //find the averages for the query
- for (int i = 0; i < querySeqs.size(); i++) {
- getAverageQuery(closest[i], i); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
- }
-
-#endif
-
- }
- catch(exception& e) {
- errorOut(e, "Ccode", "createProcessesAverages");
- exit(1);
- }
-}
-//***************************************************************************************************************
-void Ccode::createProcessesVariances() {
- try {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
-
- //find the averages for each querys references
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- findVarianceRef(i); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
- }
-
- //find the averages for the query
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- findVarianceQuery(i); //fills varQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
- }
-
-
- //write out data to file so parent can read it
- ofstream out;
- string s = toString(getpid()) + ".temp";
- openOutputFile(s, out);
-
- //output pairs
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- for (int j = 0; j < windows[i].size(); j++) {
- out << varRef[i][j] << '\t';
- }
- out << endl;
- for (int j = 0; j < windows[i].size(); j++) {
- out << sdRef[i][j] << '\t';
- }
- out << endl;
- for (int j = 0; j < windows[i].size(); j++) {
- out << varQuery[i][j] << '\t';
- }
- out << endl;
- for (int j = 0; j < windows[i].size(); j++) {
- out << sdQuery[i][j] << '\t';
- }
- out << endl;
- }
-
- out.close();
-
- exit(0);
- }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
- }
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- //get data created by processes
- for (int i=0;i<processors;i++) {
- ifstream in;
- string s = toString(processIDS[i]) + ".temp";
- openInputFile(s, in);
-
- //get pairs
- for (int k = lines[i]->start; k < lines[i]->end; k++) {
- varRef[k].resize(windows[k].size());
- sdRef[k].resize(windows[k].size());
- varQuery[k].resize(windows[k].size());
- sdQuery[k].resize(windows[k].size());
-
- for (int j = 0; j < windows[k].size(); j++) {
- in >> varRef[k][j];
- }
- gobble(in);
- for (int j = 0; j < windows[k].size(); j++) {
- in >> sdRef[k][j];
- }
- gobble(in);
- for (int j = 0; j < windows[k].size(); j++) {
- in >> varQuery[k][j];
- }
- gobble(in);
- for (int j = 0; j < windows[k].size(); j++) {
- in >> sdQuery[k][j];
- }
- gobble(in);
- }
-
- in.close();
- remove(s.c_str());
- }
-#else
- //find the averages for each querys references
- for (int i = 0; i < querySeqs.size(); i++) {
- findVarianceRef(i); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
- }
-
- //find the averages for the query
- for (int i = 0; i < querySeqs.size(); i++) {
- findVarianceQuery(i); //fills v arQuery[i] and sdQuery[i] also sets minimum error rate to 0.001 to avoid divide by 0.
- }
-#endif
- }
- catch(exception& e) {
- errorOut(e, "Ccode", "createProcessesVariances");
- exit(1);
- }
-}
-//***************************************************************************************************************
-void Ccode::createProcessesDetermine() {
- try {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
-
- //find the averages for each querys references
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- determineChimeras(i); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i].
- }
-
- //write out data to file so parent can read it
- ofstream out;
- string s = toString(getpid()) + ".temp";
- openOutputFile(s, out);
-
- //output pairs
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- for (int j = 0; j < windows[i].size(); j++) {
- out << anova[i][j] << '\t';
- }
- out << endl;
- for (int j = 0; j < windows[i].size(); j++) {
- out << isChimericConfidence[i][j] << '\t';
- }
- out << endl;
- for (int j = 0; j < windows[i].size(); j++) {
- out << isChimericTStudent[i][j] << '\t';
- }
- out << endl;
- for (int j = 0; j < windows[i].size(); j++) {
- out << isChimericANOVA[i][j] << '\t';
- }
- out << endl;
- }
-
- out.close();
-
- exit(0);
- }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
- }
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- //get data created by processes
- for (int i=0;i<processors;i++) {
- ifstream in;
- string s = toString(processIDS[i]) + ".temp";
- openInputFile(s, in);
-
- //get pairs
- for (int k = lines[i]->start; k < lines[i]->end; k++) {
- anova[k].resize(windows[k].size());
- isChimericConfidence[k].resize(windows[k].size(), false);
- isChimericTStudent[k].resize(windows[k].size(), false);
- isChimericANOVA[k].resize(windows[k].size(), false);
- int tempBool;
-
- for (int j = 0; j < windows[k].size(); j++) {
- in >> anova[k][j];
- }
- gobble(in);
- for (int j = 0; j < windows[k].size(); j++) {
- in >> tempBool;
- if (tempBool == 1) { isChimericConfidence[k][j] = true; }
- }
- gobble(in);
- for (int j = 0; j < windows[k].size(); j++) {
- in >> tempBool;
- if (tempBool == 1) { isChimericTStudent[k][j] = true; }
- }
- gobble(in);
- for (int j = 0; j < windows[k].size(); j++) {
- in >> tempBool;
- if (tempBool == 1) { isChimericANOVA[k][j] = true; }
- }
- gobble(in);
- }
-
- in.close();
- remove(s.c_str());
- }
- #else
- for (int i = 0; i < querySeqs.size(); i++) {
- determineChimeras(i); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i].
- }
- #endif
-
- }
- catch(exception& e) {
- errorOut(e, "Ccode", "createProcessesDetermine");
- exit(1);
- }
-}
//***************************************************************************************************************
class Ccode : public Chimera {
public:
- Ccode(string, string, string);
+ Ccode(string, string);
~Ccode();
- int getChimeras();
+ int getChimeras(Sequence* query);
void print(ostream&);
-
- void setCons(string c) {}
- void setQuantiles(string q) {}
-
-
+ void printHeader(ostream&);
private:
Dist* distCalc;
DeCalculator* decalc;
int iters;
- string fastafile, templateFile;
+ string fastafile, mapInfo;
+ Sequence* querySeq;
- vector<linePair*> lines;
- vector<Sequence*> querySeqs;
- vector<Sequence*> templateSeqs;
- vector< map<int, int> > spotMap;
+ map<int, int> spotMap;
map<int, int>::iterator it;
- vector< vector<int> > windows; //windows[0] is the vector of window breaks for querySeqs[0]
- vector<int> windowSizes; //windowSizes[0] is the size of the windows for querySeqs[0]
- vector< map<int, int> > trim; //trim[0] is the map containing the starting and ending positions for querySeqs[0] set of seqs
- vector< vector<SeqDist> > closest; //closest[0] is a vector of sequence at are closest to queryseqs[0]...
- vector< vector<float> > averageRef; //averageRef[0] is the average distance at each window for the references for querySeqs[0]
- vector< vector<float> > averageQuery; //averageQuery[0] is the average distance at each winow for the query for querySeqs[0]
- vector< vector<float> > sumRef; //sumRef[0] is the sum of distances at each window for the references for querySeqs[0]
- vector< vector<float> > sumSquaredRef; //sumSquaredRef[0] is the sum of squared distances at each window for the references for querySeqs[0]
- vector< vector<float> > sumQuery; //sumQuery[0] is the sum of distances at each window for the comparison of query to references for querySeqs[0]
- vector< vector<float> > sumSquaredQuery; //sumSquaredQuery[0] is the sum of squared distances at each window for the comparison of query to references for querySeqs[0]
- vector< vector<float> > varRef; //varRef[0] is the variance among references seqs at each window for querySeqs[0]
- vector< vector<float> > varQuery; //varQuery[0] is the variance among references and querySeqs[0] at each window
- vector< vector<float> > sdRef; //sdRef[0] is the standard deviation of references seqs at each window for querySeqs[0]
- vector< vector<float> > sdQuery; //sdQuery[0] is the standard deviation of references and querySeqs[0] at each window
- vector< vector<float> > anova; //anova[0] is the vector of anova scores for each window for querySeqs[0]
- vector<int> refCombo; //refCombo[0] is the number of reference sequences combinations for querySeqs[0]
- vector< vector<bool> > isChimericConfidence; //isChimericConfidence[0] indicates whether querySeqs[0] is chimeric at a given window according to the confidence limits
- vector< vector<bool> > isChimericTStudent; //isChimericConfidence[0] indicates whether querySeqs[0] is chimeric at a given window according to the confidence limits
- vector< vector<bool> > isChimericANOVA; //isChimericConfidence[0] indicates whether querySeqs[0] is chimeric at a given window according to the confidence limits
+ vector<int> windows; //windows is the vector of window breaks for query
+ int windowSizes; //windowSizes is the size of the windows for query
+ map<int, int> trim; //trim is the map containing the starting and ending positions for query
+ vector<SeqDist> closest; //closest is a vector of sequence at are closest to query
+ vector<float> averageRef; //averageRef is the average distance at each window for the references for query
+ vector<float> averageQuery; //averageQuery is the average distance at each winow for the query for query
+ vector<float> sumRef; //sumRef is the sum of distances at each window for the references for query
+ vector<float> sumSquaredRef; //sumSquaredRef is the sum of squared distances at each window for the references for query
+ vector<float> sumQuery; //sumQuery is the sum of distances at each window for the comparison of query to references for query
+ vector<float> sumSquaredQuery; //sumSquaredQuery is the sum of squared distances at each window for the comparison of query to references for query
+ vector<float> varRef; //varRef is the variance among references seqs at each window for query
+ vector<float> varQuery; //varQuery is the variance among references and query at each window
+ vector<float> sdRef; //sdRef is the standard deviation of references seqs at each window for query
+ vector<float> sdQuery; //sdQuery is the standard deviation of references and query at each window
+ vector<float> anova; //anova is the vector of anova scores for each window for query
+ int refCombo; //refCombo is the number of reference sequences combinations for query
+ vector<bool> isChimericConfidence; //isChimericConfidence indicates whether query is chimeric at a given window according to the confidence limits
+ vector<bool> isChimericTStudent; //isChimericConfidence indicates whether query is chimeric at a given window according to the confidence limits
+ vector<bool> isChimericANOVA; //isChimericConfidence indicates whether query is chimeric at a given window according to the confidence limits
- vector< vector<SeqDist> > findClosest(int, int, int);
- void removeBadReferenceSeqs(vector<SeqDist>&, int); //removes sequences from closest that are to different of too similar to eachother.
- void trimSequences(int);
- vector<int> findWindows(int);
- void getAverageRef(vector<SeqDist>, int); //fills sumRef[i], averageRef[i], sumSquaredRef[i] and refCombo[i].
- void getAverageQuery (vector<SeqDist>, int); //fills sumQuery[i], averageQuery[i], sumSquaredQuery[i].
- void findVarianceRef (int); //fills varRef[i] and sdRef[i] also sets minimum error rate to 0.001 to avoid divide by 0.
- void findVarianceQuery (int); //fills varQuery[i] and sdQuery[i]
- void determineChimeras (int); //fills anova, isChimericConfidence[i], isChimericTStudent[i] and isChimericANOVA[i].
+ vector<SeqDist> findClosest(Sequence*, int);
+ void removeBadReferenceSeqs(vector<SeqDist>&); //removes sequences from closest that are to different of too similar to eachother.
+ void trimSequences(Sequence*);
+ vector<int> findWindows();
+ void getAverageRef(vector<SeqDist>); //fills sumRef, averageRef, sumSquaredRef and refCombo.
+ void getAverageQuery (vector<SeqDist>, Sequence*); //fills sumQuery, averageQuery, sumSquaredQuery.
+ void findVarianceRef (); //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
+ void findVarianceQuery (); //fills varQuery and sdQuery
+ void determineChimeras (); //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA.
int getDiff(string, string); //return number of mismatched bases, a gap to base is not counted as a mismatch
float getT(int);
float getF(int);
-
- void createProcessesClosest();
- void createProcessesRemoveBad();
- void createProcessesAverages();
- void createProcessesVariances();
- void createProcessesDetermine();
-
};
/***********************************************************/
//this is a vertical soft filter
void Chimera::createFilter(vector<Sequence*> seqs) {
try {
-
+ filterString = "";
int threshold = int (0.5 * seqs.size());
//cout << "threshhold = " << threshold << endl;
}
}
//***************************************************************************************************************
-void Chimera::runFilter(vector<Sequence*> seqs) {
+void Chimera::runFilter(Sequence* seq) {
try {
- //for each sequence
- for (int i = 0; i < seqs.size(); i++) {
-
- string seqAligned = seqs[i]->getAligned();
- string newAligned = "";
+ string seqAligned = seq->getAligned();
+ string newAligned = "";
- for (int j = 0; j < seqAligned.length(); j++) {
- //if this spot is a gap
- if (filterString[j] == '1') { newAligned += seqAligned[j]; }
- }
-
- seqs[i]->setAligned(newAligned);
+ for (int j = 0; j < seqAligned.length(); j++) {
+ //if this spot is a gap
+ if (filterString[j] == '1') { newAligned += seqAligned[j]; }
}
-
+
+ seq->setAligned(newAligned);
}
catch(exception& e) {
errorOut(e, "Chimera", "runFilter");
exit(1);
}
}
-
//***************************************************************************************************************
vector<Sequence*> Chimera::readSeqs(string file) {
try {
+
+ mothurOut("Reading sequences... "); cout.flush();
ifstream in;
openInputFile(file, in);
vector<Sequence*> container;
}
in.close();
+ mothurOut("Done."); mothurOutEndLine();
+
return container;
}
catch(exception& e) {
}
}
//***************************************************************************************************************
+Sequence* Chimera::getSequence(string name) {
+ try{
+ Sequence* temp;
+
+ //look through templateSeqs til you find it
+ int spot = -1;
+ for (int i = 0; i < templateSeqs.size(); i++) {
+ if (name == templateSeqs[i]->getName()) {
+ spot = i;
+ break;
+ }
+ }
+
+ if(spot == -1) { mothurOut("Error: Could not find sequence."); mothurOutEndLine(); return NULL; }
+
+ temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned());
+
+ return temp;
+ }
+ catch(exception& e) {
+ errorOut(e, "Chimera", "getSequence");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
#include "mothur.h"
#include "sequence.hpp"
-
+/***********************************************************************/
struct Preference {
string name;
vector<string> leftParent; //keep the name of closest left associated with the two scores
int midpoint;
};
+/***********************************************************************/
+struct score_struct {
+ int prev;
+ int score;
+ int row;
+ int col;
+};
+/***********************************************************************/
+struct trace_struct {
+ int col;
+ int oldCol;
+ int row;
+};
+/***********************************************************************/
+struct results {
+ int regionStart;
+ int regionEnd;
+ int nastRegionStart;
+ int nastRegionEnd;
+ string parent;
+ float queryToParent;
+ float queryToParentLocal;
+ float divR;
+};
+/***********************************************************************/
+struct rank {
+ int num;
+ float score;
+ rank(int n, float s) : num(n), score(s) {}
+};
+/***********************************************************************/
struct SeqDist {
Sequence* seq;
float dist;
};
-
+//********************************************************************************************************************
+//sorts highest to lowest
+inline bool compareMembers(rank left, rank right){
+ return (left.score > right.score);
+}
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareRegionStart(results left, results right){
+ return (left.nastRegionStart < right.nastRegionStart);
+}
//********************************************************************************************************************
//sorts lowest to highest
inline bool compareSeqDist(SeqDist left, SeqDist right){
public:
Chimera(){};
+ Chimera(string);
Chimera(string, string);
- Chimera(string, string, string);
virtual ~Chimera(){};
virtual void setFilter(bool f) { filter = f; }
virtual void setCorrection(bool c) { correction = c; }
virtual void setDivR(float d) { divR = d; }
virtual void setParents(int p) { parents = p; }
virtual void setMinSim(int s) { minSim = s; }
+ virtual void setMinCoverage(int c) { minCov = c; }
+ virtual void setMinBS(int b) { minBS = b; }
+ virtual void setMinSNP(int s) { minSNP = s; }
virtual void setIters(int i) { iters = i; }
-
+ virtual void setTemplateSeqs(vector<Sequence*> t) { templateSeqs = t; }
+ virtual bool getUnaligned() { return unaligned; }
+ virtual void setTemplateFile(string t) { templateFileName = t; }
virtual void setCons(string){};
virtual void setQuantiles(string){};
+ virtual void doPrep(){};
virtual vector<Sequence*> readSeqs(string);
virtual vector< vector<float> > readQuantiles();
virtual void setMask(string);
- virtual void runFilter(vector<Sequence*>);
+ virtual void runFilter(Sequence*);
virtual void createFilter(vector<Sequence*>);
+ virtual void printHeader(ostream&){};
+ virtual int getChimeras(Sequence*){ return 0; }
+ virtual int getChimeras(){ return 0; }
+ virtual void print(ostream&){};
- //pure functions
- virtual int getChimeras() = 0;
- virtual void print(ostream&) = 0;
protected:
+ vector<Sequence*> templateSeqs;
bool filter, correction, svg, unaligned;
- int processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, parents, iters;
+ int processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters;
float divR;
- string seqMask, quanfile, filterString, name, outputDir;
-
-
+ string seqMask, quanfile, filterString, name, outputDir, templateFileName;
+ Sequence* getSequence(string); //find sequence from name
};
/***********************************************************************/
#include "chimeracheckrdp.h"
//***************************************************************************************************************
-ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp, string o) { fastafile = filename; templateFile = temp; outputDir = o; }
+ChimeraCheckRDP::ChimeraCheckRDP(string filename, string o) { fastafile = filename; outputDir = o; }
//***************************************************************************************************************
ChimeraCheckRDP::~ChimeraCheckRDP() {
try {
- for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
delete templateDB;
delete kmer;
}
void ChimeraCheckRDP::print(ostream& out) {
try {
- mothurOutEndLine();
-
- //vector<bool> isChimeric; isChimeric.resize(querySeqs.size(), false);
+ mothurOut("Processing: " + querySeq->getName()); mothurOutEndLine();
- for (int i = 0; i < querySeqs.size(); i++) {
+ out << querySeq->getName() << endl;
+ out << "IS scores: " << '\t';
- out << querySeqs[i]->getName() << endl;
- out << "IS scores: " << '\t';
-
- //int lastChimericWindowFound = 0;
-
- for (int k = 0; k < IS[i].size(); k++) {
- out << IS[i][k].score << '\t';
- //if (IS[i][k].score > chimeraCutoff) { isChimeric[i] = true; lastChimericWindowFound = k; }
- }
- out << endl;
- //if (isChimeric[i]) {
- //mothurOut(querySeqs[i]->getName() + "\tIS: " + toString(IS[i][lastChimericWindowFound].score) + "\tbreakpoint: " + toString(IS[i][lastChimericWindowFound].midpoint) + "\tleft parent: " + IS[i][lastChimericWindowFound].leftParent + "\tright parent: " + IS[i][lastChimericWindowFound].rightParent); mothurOutEndLine();
- //out << endl << "chimera: YES" << endl;
- //}else{
- //out << endl << "chimera: NO" << endl;
- //}
+ for (int k = 0; k < IS.size(); k++) {
+ out << IS[k].score << '\t';
+ }
+ out << endl;
+
+ if (svg) {
+ if (name != "") { //if user has specific names
+ map<string, string>::iterator it = names.find(querySeq->getName());
- if (svg) {
-
- if (name != "") { //if user has specific names
- map<string, string>::iterator it = names.find(querySeqs[i]->getName());
-
- if (it != names.end()) { //user wants pic of this
- makeSVGpic(IS[i], i); //zeros out negative results
- }
- }else{//output them all
- makeSVGpic(IS[i], i); //zeros out negative results
- }
+ if (it != names.end()) { //user wants pic of this
+ makeSVGpic(IS); //zeros out negative results
}
+ }else{//output them all
+ makeSVGpic(IS); //zeros out negative results
+ }
}
-
- mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); mothurOutEndLine();
}
catch(exception& e) {
errorOut(e, "ChimeraCheckRDP", "print");
exit(1);
}
}
-
//***************************************************************************************************************
-int ChimeraCheckRDP::getChimeras() {
+void ChimeraCheckRDP::doPrep() {
try {
-
- //read in query sequences and subject sequences
- mothurOutEndLine();
- mothurOut("Reading query sequences... "); cout.flush();
- querySeqs = readSeqs(fastafile);
- mothurOut("Done.");
- //templateSeqs = readSeqs(templateFile);
- templateDB = new AlignmentDB(templateFile, "kmer", kmerSize, 0.0,0.0,0.0,0.0);
+ templateDB = new AlignmentDB(templateFileName, "kmer", kmerSize, 0.0,0.0,0.0,0.0);
mothurOutEndLine();
- int numSeqs = querySeqs.size();
-
- IS.resize(numSeqs);
- closest.resize(numSeqs);
-
- //break up file if needed
- int linesPerProcess = numSeqs / processors ;
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- //find breakup of sequences for all times we will Parallelize
- if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
- else {
- //fill line pairs
- for (int i = 0; i < (processors-1); i++) {
- lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
- }
- //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
- int i = processors - 1;
- lines.push_back(new linePair((i*linesPerProcess), numSeqs));
- }
-
- #else
- lines.push_back(new linePair(0, numSeqs));
- #endif
-
kmer = new Kmer(kmerSize);
if (name != "") {
readName(name); //fills name map with names of seqs the user wants to have .svg for.
}
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraCheckRDP", "doPrep");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+int ChimeraCheckRDP::getChimeras(Sequence* query) {
+ try {
- //find closest seq to each querySeq
- for (int i = 0; i < querySeqs.size(); i++) {
- closest[i] = templateDB->findClosestSequence(querySeqs[i]);
- }
-
- //for each query find IS value
- if (processors == 1) {
- for (int i = 0; i < querySeqs.size(); i++) {
- IS[i] = findIS(i);
- }
- }else { createProcessesIS(); }
-
+ IS.clear();
+
+ querySeq = query;
+
+ closest = templateDB->findClosestSequence(query);
+
+ IS = findIS();
+
//determine chimera report cutoff - window score above 95%
//getCutoff(); - not very acurate predictor
- //free memory
- for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
-
return 0;
}
catch(exception& e) {
}
}
//***************************************************************************************************************
-vector<sim> ChimeraCheckRDP::findIS(int query) {
+vector<sim> ChimeraCheckRDP::findIS() {
try {
vector< map<int, int> > subjectKmerInfo;
vector<sim> isValues;
- string queryName = querySeqs[query]->getName();
- string seq = querySeqs[query]->getUnaligned();
-
- mothurOut("Finding IS values for sequence " + toString(query+1)); mothurOutEndLine();
+ string queryName = querySeq->getName();
+ string seq = querySeq->getUnaligned();
queryKmerInfo = kmer->getKmerCounts(seq);
- subjectKmerInfo = kmer->getKmerCounts(closest[query].getUnaligned());
+ subjectKmerInfo = kmer->getKmerCounts(closest.getUnaligned());
//find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each
int nTotal = calcKmers(queryKmerInfo[(queryKmerInfo.size()-1)], subjectKmerInfo[(subjectKmerInfo.size()-1)]);
delete left;
delete right;
- }
+ }
return isValues;
}
//***************************************************************************************************************
-void ChimeraCheckRDP::getCutoff() {
- try{
-
- vector<float> temp;
-
- //store all is scores for all windows
- for (int i = 0; i < IS.size(); i++) {
- for (int j = 0; j < IS[i].size(); j++) {
- temp.push_back(IS[i][j].score);
- }
- }
-
- //sort them
- sort(temp.begin(), temp.end());
-
- //get 95%
- chimeraCutoff = temp[int(temp.size() * 0.95)];
-
- }
- catch(exception& e) {
- errorOut(e, "ChimeraCheckRDP", "getCutoff");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
-void ChimeraCheckRDP::makeSVGpic(vector<sim> info, int query) {
+void ChimeraCheckRDP::makeSVGpic(vector<sim> info) {
try{
- string file = outputDir + querySeqs[query]->getName() + ".chimeracheck.svg";
+ string file = outputDir + querySeq->getName() + ".chimeracheck.svg";
ofstream outsvg;
openOutputFile(file, outsvg);
outsvg << "<svg xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns=\"http://www.w3.org/2000/svg\" width=\"100%\" height=\"100%\" viewBox=\"0 0 700 " + toString(width) + "\">\n";
outsvg << "<g>\n";
- outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"25\">Plotted IS values for " + querySeqs[query]->getName() + "</text>\n";
+ outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"25\">Plotted IS values for " + querySeq->getName() + "</text>\n";
outsvg << "<line x1=\"75\" y1=\"600\" x2=\"" + toString((info.size()*5) + 75) + "\" y2=\"600\" stroke=\"black\" stroke-width=\"2\"/>\n";
outsvg << "<line x1=\"75\" y1=\"600\" x2=\"75\" y2=\"125\" stroke=\"black\" stroke-width=\"2\"/>\n";
}
}
//***************************************************************************************************************
-void ChimeraCheckRDP::createProcessesIS() {
- try {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
-
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- IS[i] = findIS(i);
- }
-
- //write out data to file so parent can read it
- ofstream out;
- string s = toString(getpid()) + ".temp";
- openOutputFile(s, out);
-
- //output pairs
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- out << IS[i].size() << endl;
- for (int j = 0; j < IS[i].size(); j++) {
- out << IS[i][j].leftParent << '\t'<< IS[i][j].rightParent << '\t' << IS[i][j].midpoint << '\t' << IS[i][j].score << endl;
- }
- }
-
- out.close();
-
- exit(0);
- }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
- }
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- //get data created by processes
- for (int i=0;i<processors;i++) {
- ifstream in;
- string s = toString(processIDS[i]) + ".temp";
- openInputFile(s, in);
-
- //get pairs
- for (int k = lines[i]->start; k < lines[i]->end; k++) {
-
- int size;
- in >> size; gobble(in);
-
- string left, right;
- int mid;
- float score;
-
- IS[k].clear();
-
- for (int j = 0; j < size; j++) {
- in >> left >> right >> mid >> score; gobble(in);
-
- sim temp;
- temp.leftParent = left;
- temp.rightParent = right;
- temp.midpoint = mid;
- temp.score = score;
-
- IS[k].push_back(temp);
- }
- }
-
- in.close();
- remove(s.c_str());
- }
-#else
- for (int i = 0; i < querySeqs.size(); i++) {
- IS[i] = findIS(i);
- }
-#endif
- }
- catch(exception& e) {
- errorOut(e, "ChimeraCheckRDP", "createProcessesIS");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
class ChimeraCheckRDP : public Chimera {
public:
- ChimeraCheckRDP(string, string, string);
+ ChimeraCheckRDP(string, string);
~ChimeraCheckRDP();
- int getChimeras();
+ int getChimeras(Sequence*);
void print(ostream&);
-
- void setCons(string){};
- void setQuantiles(string q) {};
-
+ void doPrep();
private:
- vector<linePair*> lines;
- vector<Sequence*> querySeqs;
+
+ Sequence* querySeq;
AlignmentDB* templateDB;
Kmer* kmer;
-
- vector< vector<sim> > IS; //IS[0] is the vector of IS values for each window for querySeqs[0]
- float chimeraCutoff;
-
-
- //map<string, vector< map<int, int> > >:: iterator it;
- //map<int, int>::iterator it2;
-
- vector<Sequence> closest; //closest[0] is the closest overall seq to querySeqs[0].
-
- string fastafile, templateFile;
+ Sequence closest; //closest is the closest overall seq to query
+
+ vector<sim> IS; //IS is the vector of IS values for each window for query
+ string fastafile;
map<string, string> names;
- vector<sim> findIS(int);
+ vector<sim> findIS();
int calcKmers(map<int, int>, map<int, int>);
- void getCutoff();
- void makeSVGpic(vector<sim>, int);
+ void makeSVGpic(vector<sim>);
void readName(string);
-
- void createProcessesIS();
-
};
-
/***********************************************************/
#endif
--- /dev/null
+/*
+ * chimerarealigner.cpp
+ * Mothur
+ *
+ * Created by westcott on 2/12/10.
+ * Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "chimerarealigner.h"
+#include "needlemanoverlap.hpp"
+#include "nast.hpp"
+
+//***************************************************************************************************************
+ChimeraReAligner::ChimeraReAligner(vector<Sequence*> t, int m, int mm) : match(m), misMatch(mm) { templateSeqs = t; }
+//***************************************************************************************************************
+ChimeraReAligner::~ChimeraReAligner() {}
+//***************************************************************************************************************
+void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
+ try {
+ if (parents.size() != 0) {
+ vector<Sequence*> queryParts;
+ vector<Sequence*> parentParts; //queryParts[0] relates to parentParts[0]
+
+ string qAligned = query->getAligned();
+ string newQuery = "";
+
+ //sort parents by region start
+ sort(parents.begin(), parents.end(), compareRegionStart);
+
+ //make sure you don't cutoff beginning of query
+ if (parents[0].nastRegionStart > 0) { newQuery += qAligned.substr(0, parents[0].nastRegionStart+1); }
+ int longest = 0;
+ //take query and break apart into pieces using breakpoints given by results of parents
+ for (int i = 0; i < parents.size(); i++) {
+ int length = parents[i].nastRegionEnd - parents[i].nastRegionStart+1;
+ string q = qAligned.substr(parents[i].nastRegionStart, length);
+ Sequence* queryFrag = new Sequence(query->getName(), q);
+
+ queryParts.push_back(queryFrag);
+
+ Sequence* parent = getSequence(parents[i].parent);
+ string p = parent->getAligned();
+ p = p.substr(parents[i].nastRegionStart, length);
+ parent->setAligned(p);
+
+ parentParts.push_back(parent);
+
+ if (q.length() > longest) { longest = q.length(); }
+ if (p.length() > longest) { longest = p.length(); }
+ }
+
+ //align each peice to correct parent from results
+ for (int i = 0; i < queryParts.size(); i++) {
+ alignment = new NeedlemanOverlap(-2.0, match, misMatch, longest+1); //default gapopen, match, mismatch, longestbase
+ Nast nast(alignment, queryParts[i], parentParts[i]);
+ delete alignment;
+ }
+
+ //recombine pieces to form new query sequence
+ for (int i = 0; i < queryParts.size(); i++) {
+ newQuery += queryParts[i]->getAligned();
+ }
+
+ //make sure you don't cutoff end of query
+ if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) { newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd-1); }
+
+ //set query to new aligned string
+ query->setAligned(newQuery);
+
+ //free memory
+ for (int i = 0; i < queryParts.size(); i++) { delete queryParts[i]; }
+ for (int i = 0; i < parentParts.size(); i++) { delete parentParts[i]; }
+
+ } //else leave query alone, you have bigger problems...
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraReAligner", "reAlign");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+Sequence* ChimeraReAligner::getSequence(string name) {
+ try{
+ Sequence* temp;
+
+ //look through templateSeqs til you find it
+ int spot = -1;
+ for (int i = 0; i < templateSeqs.size(); i++) {
+ if (name == templateSeqs[i]->getName()) {
+ spot = i;
+ break;
+ }
+ }
+
+ if(spot == -1) { mothurOut("Error: Could not find sequence."); mothurOutEndLine(); return NULL; }
+
+ temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned());
+
+ return temp;
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraReAligner", "getSequence");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
--- /dev/null
+#ifndef CHIMERAREALIGNER_H
+#define CHIMERAREALIGNER_H
+
+/*
+ * chimerarealigner.h
+ * Mothur
+ *
+ * Created by westcott on 2/12/10.
+ * Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "chimera.h"
+#include "alignment.hpp"
+
+/***********************************************************/
+
+class ChimeraReAligner {
+
+ public:
+ ChimeraReAligner(vector<Sequence*>, int, int);
+ ~ChimeraReAligner();
+
+ void reAlign(Sequence*, vector<results>);
+
+ private:
+ Sequence* querySeq;
+ Alignment* alignment;
+ vector<Sequence*> templateSeqs;
+ int match, misMatch;
+
+ Sequence* getSequence(string); //find sequence from name
+};
+/***********************************************************/
+
+#endif
+
else {
//valid paramters for this command
- string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim", "parents", "iters","outputdir","inputdir" };
+ string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask",
+ "numwanted", "ksize", "svg", "name", "match","mismatch", "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir" };
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
svg = isTrue(temp);
temp = validParameter.validFile(parameters, "window", false);
- if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; }
+ if ((temp == "not found") && (method == "chimeraslayer")) { temp = "50"; }
else if (temp == "not found") { temp = "0"; }
convert(temp, window);
temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found") { temp = "-4"; }
convert(temp, mismatch);
- temp = validParameter.validFile(parameters, "divergence", false); if (temp == "not found") { temp = "1.0"; }
+ temp = validParameter.validFile(parameters, "divergence", false); if (temp == "not found") { temp = "1.007"; }
convert(temp, divR);
temp = validParameter.validFile(parameters, "minsim", false); if (temp == "not found") { temp = "90"; }
convert(temp, minSimilarity);
- temp = validParameter.validFile(parameters, "parents", false); if (temp == "not found") { temp = "5"; }
+ temp = validParameter.validFile(parameters, "mincov", false); if (temp == "not found") { temp = "70"; }
+ convert(temp, minCoverage);
+
+ temp = validParameter.validFile(parameters, "minbs", false); if (temp == "not found") { temp = "90"; }
+ convert(temp, minBS);
+
+ temp = validParameter.validFile(parameters, "minsnp", false); if (temp == "not found") { temp = "10"; }
+ convert(temp, minSNP);
+
+ temp = validParameter.validFile(parameters, "parents", false); if (temp == "not found") { temp = "3"; }
convert(temp, parents);
- temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
+ temp = validParameter.validFile(parameters, "iters", false);
+ if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; }
+ else if (temp == "not found") { temp = "1000"; }
convert(temp, iters);
temp = validParameter.validFile(parameters, "increment", false);
- if ((temp == "not found") && ((method == "chimeracheck") || (method == "chimeraslayer"))) { temp = "10"; }
+ if ((temp == "not found") && (method == "chimeracheck")) { temp = "10"; }
+ else if ((temp == "not found") && (method == "chimeraslayer")) { temp = "5"; }
else if (temp == "not found") { temp = "25"; }
convert(temp, increment);
temp = validParameter.validFile(parameters, "numwanted", false);
- if ((temp == "not found") && (method == "chimeraslayer")) { temp = "10"; }
+ if ((temp == "not found") && (method == "chimeraslayer")) { temp = "15"; }
else if (temp == "not found") { temp = "20"; }
convert(temp, numwanted);
mothurOut("The ksize parameter allows you to input kmersize. \n");
mothurOut("The svg parameter allows you to specify whether or not you would like a svg file outputted for each query sequence.\n");
mothurOut("The name parameter allows you to enter a file containing names of sequences you would like .svg files for.\n");
- //mothurOut("The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method.\n");
+ mothurOut("The iters parameter allows you to specify the number of bootstrap iters to do with the chimeraslayer method.\n");
+ mothurOut("The minsim parameter allows you .... \n");
+ mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n");
+ mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n");
+ mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n");
mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n");
mothurOut("Details for each method: \n");
mothurOut("\tpintail: \n");
if (abort == true) { return 0; }
+ int start = time(NULL);
+
if (method == "bellerophon") { chimera = new Bellerophon(fastafile, outputDir); }
- else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile, outputDir); }
- else if (method == "ccode") { chimera = new Ccode(fastafile, templatefile, outputDir); }
- else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, templatefile, outputDir); }
- else if (method == "chimeraslayer") { chimera = new ChimeraSlayer(fastafile, templatefile); }
+ else if (method == "pintail") { chimera = new Pintail(fastafile, outputDir); }
+ else if (method == "ccode") { chimera = new Ccode(fastafile, outputDir); }
+ else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, outputDir); }
+ else if (method == "chimeraslayer") { chimera = new ChimeraSlayer("blast"); }
else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; }
//set user options
if (maskfile == "default") { mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); mothurOutEndLine(); }
- //saves time to avoid generating it
chimera->setCons(consfile);
-
- //saves time to avoid generating it
chimera->setQuantiles(quanfile);
-
chimera->setMask(maskfile);
chimera->setFilter(filter);
chimera->setCorrection(correction);
chimera->setDivR(divR);
chimera->setParents(parents);
chimera->setMinSim(minSimilarity);
+ chimera->setMinCoverage(minCoverage);
+ chimera->setMinBS(minBS);
+ chimera->setMinSNP(minSNP);
chimera->setIters(iters);
+ chimera->setTemplateFile(templatefile);
+
-
- //find chimeras
- int error = chimera->getChimeras();
- //there was a problem
- if (error == 1) { return 0; }
+ vector<Sequence*> templateSeqs;
+ if ((method != "bellerophon") && (method != "chimeracheck")) {
+ templateSeqs = chimera->readSeqs(templatefile);
+ if (chimera->getUnaligned()) {
+ mothurOut("Your sequences need to be aligned when you use the chimeraslayer method."); mothurOutEndLine();
+ //free memory
+ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
+ return 0;
+ }
+
+ //set options
+ chimera->setTemplateSeqs(templateSeqs);
+ }else if (method == "bellerophon") {//run bellerophon separately since you need to read entire fastafile to run it
+ chimera->getChimeras();
+
+ string outputFName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras";
+ ofstream out;
+ openOutputFile(outputFName, out);
+
+ chimera->print(out);
+ out.close();
+ return 0;
+ }
+
+ //some methods need to do prep work before processing the chimeras
+ chimera->doPrep();
+
+ ofstream outHeader;
+ string tempHeader = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras.tempHeader";
+ openOutputFile(tempHeader, outHeader);
+
+ chimera->printHeader(outHeader);
+ outHeader.close();
+
string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras";
- ofstream out;
- openOutputFile(outputFileName, out);
+
+ //break up file
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ if(processors == 1){
+ ifstream inFASTA;
+ openInputFile(fastafile, inFASTA);
+ numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+ inFASTA.close();
+
+ lines.push_back(new linePair(0, numSeqs));
+
+ driver(lines[0], outputFileName, fastafile);
+
+ }else{
+ vector<int> positions;
+ processIDS.resize(0);
+
+ ifstream inFASTA;
+ openInputFile(fastafile, inFASTA);
+
+ string input;
+ while(!inFASTA.eof()){
+ input = getline(inFASTA);
+ if (input.length() != 0) {
+ if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
+ }
+ }
+ inFASTA.close();
+
+ numSeqs = positions.size();
+
+ int numSeqsPerProcessor = numSeqs / processors;
+
+ for (int i = 0; i < processors; i++) {
+ long int startPos = positions[ i * numSeqsPerProcessor ];
+ if(i == processors - 1){
+ numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;
+ }
+ lines.push_back(new linePair(startPos, numSeqsPerProcessor));
+ }
+
+
+ createProcesses(outputFileName, fastafile);
+
+ rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
+
+ //append alignment and report files
+ for(int i=1;i<processors;i++){
+ appendOutputFiles((outputFileName + toString(processIDS[i]) + ".temp"), outputFileName);
+ remove((outputFileName + toString(processIDS[i]) + ".temp").c_str());
+ }
+ }
+
+ #else
+ ifstream inFASTA;
+ openInputFile(candidateFileNames[s], inFASTA);
+ numSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+ inFASTA.close();
+ lines.push_back(new linePair(0, numSeqs));
+
+ driver(lines[0], outputFileName, fastafile);
+ #endif
- //print results
- chimera->print(out);
+ //mothurOut("Output File Names: ");
+ //if ((filter) && (method == "bellerophon")) { mothurOut(
+ //if (outputDir == "") { fastafile = getRootName(fastafile) + "filter.fasta"; }
+ // else { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; }
- out.close();
+ appendOutputFiles(tempHeader, outputFileName);
+ remove(tempHeader.c_str());
+
+ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
+
+ if (method == "chimeracheck") { mothurOutEndLine(); mothurOut("This method does not determine if a sequence is chimeric, but allows you to make that determination based on the IS values."); mothurOutEndLine(); }
- delete chimera;
+ mothurOutEndLine(); mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(numSeqs) + " sequences."); mothurOutEndLine();
return 0;
errorOut(e, "ChimeraSeqsCommand", "execute");
exit(1);
}
+}//**********************************************************************************************************************
+
+int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filename){
+ try {
+ ofstream out;
+ openOutputFile(outputFName, out);
+
+ ifstream inFASTA;
+ openInputFile(filename, inFASTA);
+
+ inFASTA.seekg(line->start);
+
+ for(int i=0;i<line->numSeqs;i++){
+
+ Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
+
+ if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
+
+ //find chimeras
+ chimera->getChimeras(candidateSeq);
+
+ //print results
+ chimera->print(out);
+ }
+ delete candidateSeq;
+
+ //report progress
+ if((i+1) % 100 == 0){ mothurOut("Processing sequence: " + toString(i+1)); mothurOutEndLine(); }
+ }
+ //report progress
+ if((line->numSeqs) % 100 != 0){ mothurOut("Processing sequence: " + toString(line->numSeqs)); mothurOutEndLine(); }
+
+ out.close();
+ inFASTA.close();
+
+ return 1;
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraSeqsCommand", "driver");
+ exit(1);
+ }
}
+
/**************************************************************************************************/
+void ChimeraSeqsCommand::createProcesses(string outputFileName, string filename) {
+ try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ int process = 0;
+ // processIDS.resize(0);
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
+ process++;
+ }else if (pid == 0){
+ driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename);
+ exit(0);
+ }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+ }
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processors;i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+#endif
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraSeqsCommand", "createProcesses");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ChimeraSeqsCommand::appendOutputFiles(string temp, string filename) {
+ try{
+
+ ofstream output;
+ ifstream input;
+
+ openOutputFileAppend(temp, output);
+ openInputFile(filename, input);
+
+ while(char c = input.get()){
+ if(input.eof()) { break; }
+ else { output << c; }
+ }
+
+ input.close();
+ output.close();
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraSeqsCommand", "appendOuputFiles");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+
private:
+
+ struct linePair {
+ int start;
+ int numSeqs;
+ linePair(long int i, int j) : start(i), numSeqs(j) {}
+ };
+ vector<int> processIDS; //processid
+ vector<linePair*> lines;
+ int driver(linePair*, string, string);
+ void createProcesses(string, string);
+ void appendOutputFiles(string, string);
+
bool abort;
string method, fastafile, templatefile, consfile, quanfile, maskfile, namefile, outputDir;
bool filter, correction, svg, printAll;
- int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity;
+ int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs;
float divR;
Chimera* chimera;
*/
#include "chimeraslayer.h"
+#include "chimerarealigner.h"
//***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string filename, string temp) { fastafile = filename; templateFile = temp; }
+ChimeraSlayer::ChimeraSlayer(string mode) : searchMethod(mode) { decalc = new DeCalculator(); }
//***************************************************************************************************************
-
-ChimeraSlayer::~ChimeraSlayer() {
- try {
- for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
- for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
- }
- catch(exception& e) {
- errorOut(e, "ChimeraSlayer", "~ChimeraSlayer");
- exit(1);
- }
-}
+ChimeraSlayer::~ChimeraSlayer() { delete decalc; }
+//***************************************************************************************************************
+void ChimeraSlayer::printHeader(ostream& out) {
+ mothurOutEndLine();
+ mothurOut("Only reporting sequence supported by 90% of bootstrapped results.");
+ mothurOutEndLine();
+}
//***************************************************************************************************************
void ChimeraSlayer::print(ostream& out) {
try {
- mothurOutEndLine();
- mothurOut("Only reporting sequence supported by 90% of bootstrapped results.");
- mothurOutEndLine();
-
- for (int i = 0; i < querySeqs.size(); i++) {
-
- if (chimeraFlags[i] == "yes") {
-// cout << i << endl;
- if ((chimeraResults[i][0].bsa >= 90.0) || (chimeraResults[i][0].bsb >= 90.0)) {
- mothurOut(querySeqs[i]->getName() + "\tyes"); mothurOutEndLine();
- out << querySeqs[i]->getName() << "\tyes" << endl;
- }else {
- out << querySeqs[i]->getName() << "\tyes" << endl;
- //mothurOut(querySeqs[i]->getName() + "\tno"); mothurOutEndLine();
+ if (chimeraFlags == "yes") {
+ string chimeraFlag = "no";
+ if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
+ ||
+ (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
+
+
+ if (chimeraFlag == "yes") {
+ if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
+ mothurOut(querySeq->getName() + "\tyes"); mothurOutEndLine();
}
-
- printBlock(chimeraResults[i][0], out, i);
-
- out << endl;
- }else{
- out << querySeqs[i]->getName() << "\tno" << endl;
- //mothurOut(querySeqs[i]->getName() + "\tno"); mothurOutEndLine();
}
- }
-
+ out << querySeq->getName() << "\tyes" << endl;
+ printBlock(chimeraResults[0], out);
+ out << endl;
+ }else { out << querySeq->getName() << "\tno" << endl; }
+
}
catch(exception& e) {
errorOut(e, "ChimeraSlayer", "print");
exit(1);
}
}
-
//***************************************************************************************************************
-int ChimeraSlayer::getChimeras() {
+int ChimeraSlayer::getChimeras(Sequence* query) {
try {
+ chimeraFlags = "no";
+ querySeq = query;
- //read in query sequences and subject sequences
- mothurOut("Reading sequences and template file... "); cout.flush();
- querySeqs = readSeqs(fastafile);
- templateSeqs = readSeqs(templateFile);
- mothurOut("Done."); mothurOutEndLine();
-
- int numSeqs = querySeqs.size();
-
- if (unaligned) { mothurOut("Your sequences need to be aligned when you use the chimeraslayer method."); mothurOutEndLine(); return 1; }
-
- chimeraResults.resize(numSeqs);
- chimeraFlags.resize(numSeqs, "no");
- spotMap.resize(numSeqs);
-
- //break up file if needed
- int linesPerProcess = numSeqs / processors ;
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- //find breakup of sequences for all times we will Parallelize
- if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
- else {
- //fill line pairs
- for (int i = 0; i < (processors-1); i++) {
- lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
- }
- //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
- int i = processors - 1;
- lines.push_back(new linePair((i*linesPerProcess), numSeqs));
- }
- #else
- lines.push_back(new linePair(0, numSeqs));
- #endif
-
- if (seqMask != "") { decalc = new DeCalculator(); } //to use below
-
- //initialize spotMap
- for (int j = 0; j < numSeqs; j++) {
- for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) {
- spotMap[j][i] = i;
- }
+ for (int i = 0; i < query->getAligned().length(); i++) {
+ spotMap[i] = i;
}
//referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
- maligner = new Maligner(templateSeqs, numWanted, match, misMatch, 1.01, minSim);
- slayer = new Slayer(window, increment, minSim, divR, iters);
-
- for (int i = 0; i < querySeqs.size(); i++) {
+ maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod);
+ slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
- string chimeraFlag = maligner->getResults(querySeqs[i]);
- //float percentIdentical = maligner->getPercentID();
- vector<results> Results = maligner->getOutput();
+ string chimeraFlag = maligner->getResults(query, decalc);
+ vector<results> Results = maligner->getOutput();
+
+ //realign query to parents to improve slayers detection rate
+ //ChimeraReAligner realigner(templateSeqs, match, misMatch);
+ //realigner.reAlign(query, Results);
- cout << "Processing sequence: " << i+1 << endl;
+ //if (chimeraFlag == "yes") {
- //for (int j = 0; j < Results.size(); j++) {
- //cout << "regionStart = " << Results[j].regionStart << "\tRegionEnd = " << Results[j].regionEnd << "\tName = " << Results[j].parent << "\tPerQP = " << Results[j].queryToParent << "\tLocalPerQP = " << Results[j].queryToParentLocal << "\tdivR = " << Results[j].divR << endl;
- //}
+ //get sequence that were given from maligner results
+ vector<SeqDist> seqs;
+ map<string, float> removeDups;
+ map<string, float>::iterator itDup;
+ for (int j = 0; j < Results.size(); j++) {
+ float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
+ //only add if you are not a duplicate
+ itDup = removeDups.find(Results[j].parent);
+ if (itDup == removeDups.end()) { //this is not duplicate
+ removeDups[Results[j].parent] = dist;
+ }else if (dist > itDup->second) { //is this a stronger number for this parent
+ removeDups[Results[j].parent] = dist;
+ }
+ }
+
+ for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
+ Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
- //if (chimeraFlag == "yes") {
+ SeqDist member;
+ member.seq = seq;
+ member.dist = itDup->second;
- //get sequence that were given from maligner results
- vector<SeqDist> seqs;
- map<string, float> removeDups;
- map<string, float>::iterator itDup;
- for (int j = 0; j < Results.size(); j++) {
- float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
- //only add if you are not a duplicate
- itDup = removeDups.find(Results[j].parent);
- if (itDup == removeDups.end()) { //this is not duplicate
- removeDups[Results[j].parent] = dist;
- }else if (dist > itDup->second) { //is this a stronger number for this parent
- removeDups[Results[j].parent] = dist;
- }
- }
-
- for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
- Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
-
- SeqDist member;
- member.seq = seq;
- member.dist = itDup->second;
-
- seqs.push_back(member);
- }
-
- //limit number of parents to explore - default 5
- if (Results.size() > parents) {
- //sort by distance
- sort(seqs.begin(), seqs.end(), compareSeqDist);
- //prioritize larger more similiar sequence fragments
- reverse(seqs.begin(), seqs.end());
-
- for (int k = seqs.size()-1; k > (parents-1); k--) {
- delete seqs[k].seq;
- seqs.pop_back();
- }
- }
+ seqs.push_back(member);
+ }
+
+ //limit number of parents to explore - default 3
+ if (Results.size() > parents) {
+ //sort by distance
+ sort(seqs.begin(), seqs.end(), compareSeqDist);
+ //prioritize larger more similiar sequence fragments
+ reverse(seqs.begin(), seqs.end());
+
+ for (int k = seqs.size()-1; k > (parents-1); k--) {
+ delete seqs[k].seq;
+ seqs.pop_back();
+ }
+ }
- //put seqs into vector to send to slayer
- vector<Sequence*> seqsForSlayer;
- for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); }
+ //put seqs into vector to send to slayer
+ vector<Sequence*> seqsForSlayer;
+ for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); }
+ //cout << i+1 << "num parents = " << seqsForSlayer.size() << '\t' << chimeraFlag << endl;
//ofstream out;
//string name = querySeqs[i]->getName();
//cout << name << endl;
//cout << name << endl;
//name = name.substr(0, name.find_last_of("|"));
//cout << name << endl;
-//string filename = name + ".seqsforslayer";
+//string filename = toString(i+1) + ".seqsforslayer";
//openOutputFile(filename, out);
-//for (int u = 0; u < seqsForSlayer.size(); u++) { seqsForSlayer[u]->printSequence(out); }
+//cout << querySeqs[i]->getName() << endl;
+//for (int u = 0; u < seqsForSlayer.size(); u++) { cout << seqsForSlayer[u]->getName() << '\t'; seqsForSlayer[u]->printSequence(out); }
+//cout << endl;
//out.close();
-//filename = name + ".fasta";
+//filename = toString(i+1) + ".fasta";
//openOutputFile(filename, out);
-//q->printSequence(out);
+//querySeqs[i]->printSequence(out);
//out.close();
-
- //mask then send to slayer...
- if (seqMask != "") {
- decalc->setMask(seqMask);
-
- //mask querys
- decalc->runMask(querySeqs[i]);
-
- //mask parents
- for (int k = 0; k < seqsForSlayer.size(); k++) {
- decalc->runMask(seqsForSlayer[k]);
- }
-
- for (int i = 0; i < numSeqs; i++) {
- spotMap[i] = decalc->getMaskMap();
- }
- }
-
- //send to slayer
- chimeraFlags[i] = slayer->getResults(querySeqs[i], seqsForSlayer);
- chimeraResults[i] = slayer->getOutput();
-
- //free memory
- for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; }
- //}
- }
- //free memory
- for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
-
+ //mask then send to slayer...
if (seqMask != "") {
- delete decalc;
+ decalc->setMask(seqMask);
+
+ //mask querys
+ decalc->runMask(query);
+
+ //mask parents
+ for (int k = 0; k < seqsForSlayer.size(); k++) {
+ decalc->runMask(seqsForSlayer[k]);
+ }
+
+ spotMap = decalc->getMaskMap();
}
+ //send to slayer
+ chimeraFlags = slayer->getResults(query, seqsForSlayer);
+ chimeraResults = slayer->getOutput();
+
+ //free memory
+ for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; }
+ //}
+
return 0;
}
catch(exception& e) {
}
}
//***************************************************************************************************************
-Sequence* ChimeraSlayer::getSequence(string name) {
- try{
- Sequence* temp;
-
- //look through templateSeqs til you find it
- int spot = -1;
- for (int i = 0; i < templateSeqs.size(); i++) {
- if (name == templateSeqs[i]->getName()) {
- spot = i;
- break;
- }
- }
-
- if(spot == -1) { mothurOut("Error: Could not find sequence in chimeraSlayer."); mothurOutEndLine(); return NULL; }
-
- temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned());
-
- return temp;
- }
- catch(exception& e) {
- errorOut(e, "ChimeraSlayer", "getSequence");
- exit(1);
- }
-}
-//***************************************************************************************************************
-void ChimeraSlayer::printBlock(data_struct data, ostream& out, int i){
+void ChimeraSlayer::printBlock(data_struct data, ostream& out){
try {
out << "parentA: " << data.parentA.getName() << " parentB: " << data.parentB.getName() << endl;
- out << "Left Window: " << spotMap[i][data.winLStart] << " " << spotMap[i][data.winLEnd] << endl;
- out << "Right Window: " << spotMap[i][data.winRStart] << " " << spotMap[i][data.winREnd] << endl;
+ out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
+ out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
out << "Divergence of Query to Leftside ParentA and Rightside ParentB: " << data.divr_qla_qrb << '\t' << "Bootstrap: " << data.bsa << endl;
out << "Divergence of Query to Rightside ParentA and Leftside ParentB: " << data.divr_qlb_qra << '\t' << "Bootstrap: " << data.bsb << endl;
out << "Similarity of query to parentA: " << data.qa << endl;
out << "Similarity of query to parentB: " << data.qb << endl;
+ out << "Percent_ID QLA_QRB: " << data.qla_qrb << endl;
+ out << "Percent_ID QLB_QRA: " << data.qlb_qra << endl;
//out << "Per_id(QL,A): " << data.qla << endl;
//out << "Per_id(QL,B): " << data.qlb << endl;
//out << "Per_id(QR,A): " << data.qra << endl;
class ChimeraSlayer : public Chimera {
public:
- ChimeraSlayer(string, string);
+ ChimeraSlayer(string);
~ChimeraSlayer();
- int getChimeras();
+ int getChimeras(Sequence*);
void print(ostream&);
-
- void setCons(string){};
- void setQuantiles(string q) {};
-
+ void printHeader(ostream&);
private:
+ Sequence* querySeq;
DeCalculator* decalc;
Maligner* maligner;
Slayer* slayer;
- vector<linePair*> lines;
- vector<Sequence*> querySeqs;
- vector<Sequence*> templateSeqs;
- vector< map<int, int> > spotMap;
+ map<int, int> spotMap;
- vector< vector<data_struct> > chimeraResults;
- vector<string> chimeraFlags;
-
- string fastafile, templateFile;
-
- Sequence* getSequence(string); //find sequence from name
- void printBlock(data_struct, ostream&, int i);
+ vector<data_struct> chimeraResults;
+ string chimeraFlags, searchMethod;
+ string fastafile;
+
+ void printBlock(data_struct, ostream&);
};
/************************************************************************/
commands["pcoa"] = "pcoa";
commands["otu.hierarchy"] = "otu.hierarchy";
commands["set.dir"] = "set.dir";
+ commands["merge.files"] = "merge.files";
}
/***********************************************************/
virtual void generateDB() = 0;
virtual void addSequence(Sequence) = 0; //add sequence to search engine
virtual vector<int> findClosestSequences(Sequence*, int) = 0; // returns indexes of n closest sequences to query
+ virtual map<int, float> findClosest(Sequence*, int){ return results; } // returns of n closest sequences to query and their search scores
virtual float getSearchScore();
virtual int getLongestBase();
virtual void readKmerDB(ifstream&){};
protected:
int numSeqs, longest;
float searchScore;
+ map<int, float> results;
};
/**************************************************************************************************/
#endif
if (size == 0) { if (cutoff > 1200) { size = 300; }
else{ size = (cutoff / 4); } }
else if (size > (cutoff / 4)) {
- mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine();
+ mothurOut("You have selected too large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine();
size = (cutoff / 4);
}
}
}
//***************************************************************************************************************
+//gets closest matches to each end, since chimeras will most likely have different parents on each end
vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int numWanted) {
try {
Dist* distcalculator = new eachGapDist();
- Sequence query = *(querySeq);
+ string queryAligned = querySeq->getAligned();
+ string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence
+ string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence
+ Sequence queryLeft(querySeq->getName(), leftQuery);
+ Sequence queryRight(querySeq->getName(), rightQuery);
+
for(int j = 0; j < db.size(); j++){
- Sequence temp = *(db[j]);
+ string dbAligned = db[j]->getAligned();
+ string leftDB = dbAligned.substr(0, (dbAligned.length() / 3)); //first 1/3 of the sequence
+ string rightDB = dbAligned.substr(((dbAligned.length() / 3)*2)); //last 1/3 of the sequence
+
+ Sequence dbLeft(db[j]->getName(), leftDB);
+ Sequence dbRight(db[j]->getName(), rightDB);
+
+ distcalculator->calcDist(queryLeft, dbLeft);
+ float distLeft = distcalculator->getDist();
+
+ distcalculator->calcDist(queryRight, dbRight);
+ float distRight = distcalculator->getDist();
- distcalculator->calcDist(query, temp);
- float dist = distcalculator->getDist();
+ float dist = min(distLeft, distRight); //keep smallest distance
SeqDist subject;
subject.seq = db[j];
}
}
/***************************************************************************************************************/
-void DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatches) {
+map<int, int> DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatches) {
try {
int frontPos = 0; //should contain first position in all seqs that is not a gap character
topMatches[i]->setAligned(newAligned);
}
+ map<int, int> trimmedPos;
+
+ for (int i = 0; i < newAligned.length(); i++) {
+ trimmedPos[i] = i+frontPos;
+ }
+
+ return trimmedPos;
}
catch(exception& e) {
errorOut(e, "DeCalculator", "trimSequences");
void setAlignmentLength(int l) { alignLength = l; }
void runMask(Sequence*);
void trimSeqs(Sequence*, Sequence*, map<int, int>&);
- void trimSeqs(Sequence*, vector<Sequence*>);
+ map<int, int> trimSeqs(Sequence*, vector<Sequence*>);
void removeObviousOutliers(vector< vector<quanMember> >&, int);
vector<float> calcFreq(vector<Sequence*>, string);
vector<int> findWindows(Sequence*, int, int, int&, int);
#ifdef USE_READLINE
char* nextCommand = NULL;
nextCommand = readline("mothur > ");
- if(nextCommand != NULL) { add_history(nextCommand); }
+
+ if(nextCommand != NULL) { add_history(nextCommand); }
+ else{ //^D causes null string and we want it to quit mothur
+ nextCommand = "quit";
+ cout << nextCommand << endl;
+ }
+
mothurOutJustToLog("mothur > " + toString(nextCommand));
return nextCommand;
#else
string nextCommand = "";
mothurOut("mothur > ");
getline(cin, nextCommand);
+ mothurOutJustToLog("mothur > " + toString(nextCommand));
return nextCommand;
#endif
#else
string nextCommand = "";
mothurOut("mothur > ");
getline(cin, nextCommand);
+ mothurOutJustToLog("mothur > " + toString(nextCommand));
return nextCommand;
#endif
labels.clear(); Groups.clear();
allLines = 1;
runParse = true;
+ names.clear();
}
catch(exception& e) {
errorOut(e, "GlobalData", "newRead");
*/
#include "maligner.h"
+#include "database.hpp"
+#include "blastdb.hpp"
/***********************************************************************/
-Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int minCov) :
- db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minCoverage(minCov) {}
+Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode) :
+ db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode) {}
/***********************************************************************/
-string Maligner::getResults(Sequence* q) {
+string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
try {
outputResults.clear();
query = new Sequence(q->getName(), q->getAligned());
string chimera;
-
- decalc = new DeCalculator();
- //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
- refSeqs = decalc->findClosest(query, db, numWanted);
-
- refSeqs = minCoverageFilter(refSeqs);
+ if (searchMethod != "blast") {
+ //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
+ refSeqs = decalc->findClosest(query, db, numWanted);
+ }else{
+ refSeqs = getBlastSeqs(query, numWanted);
+ }
+//ofstream out;
+//string name = toString(numi+1);
+//cout << name << endl;
+//name = name.substr(name.find_first_of("|")+1);
+//cout << name << endl;
+//name = name.substr(name.find_first_of("|")+1);
+//cout << name << endl;
+//name = name.substr(0, name.find_last_of("|"));
+//cout << name << endl;
+//string filename = name + ".seqsformaligner";
+//openOutputFile(filename, out);
+//for (int u = 0; u < refSeqs.size(); u++) { refSeqs[u]->printSequence(out); }
+//out.close();
+//filename = name + ".fasta";
+//openOutputFile(filename, out);
+//query->printSequence(out);
+//out.close();
+
+//for (int i = 0; i < refSeqs.size(); i++) { cout << refSeqs[i]->getName() << endl; }
+//cout << "before = " << refSeqs.size() << endl;
+ refSeqs = minCoverageFilter(refSeqs);
+//cout << "after = " << refSeqs.size() << endl;
if (refSeqs.size() < 2) {
for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
percentIdenticalQueryChimera = 0.0;
}
int chimeraPenalty = computeChimeraPenalty();
-
+//cout << chimeraPenalty << endl;
+ //fills outputResults
+ chimera = chimeraMaligner(chimeraPenalty, decalc);
+
+
+ //free memory
+ delete query;
+ for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
+
+ return chimera;
+ }
+ catch(exception& e) {
+ errorOut(e, "Maligner", "getResults");
+ exit(1);
+ }
+}
+/***********************************************************************/
+string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
+ try {
+
+ string chimera;
+
//trims seqs to first non gap char in all seqs and last non gap char in all seqs
- decalc->trimSeqs(query, refSeqs);
+ spotMap = decalc->trimSeqs(query, refSeqs);
vector<Sequence*> temp = refSeqs;
temp.push_back(query);
percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
- delete decalc;
-
//save output results
for (int i = 0; i < trace.size(); i++) {
int regionStart = trace[i].col;
results temp;
temp.parent = refSeqs[seqIndex]->getName();
+ temp.nastRegionStart = spotMap[regionStart];
+ temp.nastRegionEnd = spotMap[regionEnd];
temp.regionStart = regionStart;
temp.regionEnd = regionEnd;
outputResults.push_back(temp);
}
-
- //free memory
- delete query;
- for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
-
+
return chimera;
}
catch(exception& e) {
- errorOut(e, "Maligner", "getResults");
+ errorOut(e, "Maligner", "chimeraMaligner");
exit(1);
}
}
if(gaps[i] == seqs.size()) { filterString[i] = '0'; numColRemoved++; }
}
+ map<int, int> newMap;
//for each sequence
for (int i = 0; i < seqs.size(); i++) {
string seqAligned = seqs[i]->getAligned();
string newAligned = "";
+ int count = 0;
for (int j = 0; j < seqAligned.length(); j++) {
//if this spot is not a gap
- if (filterString[j] == '1') { newAligned += seqAligned[j]; }
+ if (filterString[j] == '1') {
+ newAligned += seqAligned[j];
+ newMap[count] = spotMap[j];
+ count++;
+ }
}
seqs[i]->setAligned(newAligned);
}
-
+ spotMap = newMap;
}
catch(exception& e) {
errorOut(e, "Maligner", "verticalFilter");
}
}
//***************************************************************************************************************
+vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
+ try {
+cout << q->getName() << endl;
+ //generate blastdb
+ Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
+ for (int i = 0; i < db.size(); i++) { database->addSequence(*db[i]); }
+ database->generateDB();
+ database->setNumSeqs(db.size());
+
+ //get parts of query
+ string queryAligned = q->getAligned();
+ string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence
+ string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence
+
+ Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
+ Sequence* queryRight = new Sequence(q->getName(), rightQuery);
+
+ map<int, float> tempIndexesRight = database->findClosest(queryRight, num);
+ map<int, float> tempIndexesLeft = database->findClosest(queryLeft, num);
+
+ //merge results
+ vector<rank> mergedResults;
+
+ map<int, float>::iterator it;
+ map<int, float>::iterator it2;
+
+ //add in right guys merging common finds
+ for (it = tempIndexesRight.begin(); it != tempIndexesRight.end(); it++) {
+ it2 = tempIndexesLeft.find(it->first);
+
+ if (it2 == tempIndexesLeft.end()) { //result only present in right
+ rank temp(it->first, it->second);
+ mergedResults.push_back(temp);
+
+ }else { //result present in both save best score
+ float bestscore;
+ if (it->second > it2->second) { bestscore = it->second; }
+ else { bestscore = it2->second; }
+
+ rank temp(it->first, bestscore);
+ mergedResults.push_back(temp);
+
+ tempIndexesLeft.erase(it2);
+ }
+ }
+
+ //add in unique left guys
+ for (it = tempIndexesLeft.begin(); it != tempIndexesLeft.end(); it++) {
+ rank temp(it->first, it->second);
+ mergedResults.push_back(temp);
+ }
+
+ sort(mergedResults.begin(), mergedResults.end(), compareMembers);
+
+ vector<Sequence*> refResults;
+ for (int i = 0; i < numWanted; i++) {
+ Sequence* temp = new Sequence(db[mergedResults[i].num]->getName(), db[mergedResults[i].num]->getAligned());
+ refResults.push_back(temp);
+cout << db[mergedResults[i].num]->getName() << endl;
+ }
+
+ delete queryRight;
+ delete queryLeft;
+ delete database;
+
+ return refResults;
+ }
+ catch(exception& e) {
+ errorOut(e, "Maligner", "getBlastSeqs");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
*/
#include "decalc.h"
+#include "chimera.h"
/***********************************************************************/
//This class was modeled after the chimeraMaligner written by the Broad Institute
-/***********************************************************************/
-struct score_struct {
- int prev;
- int score;
- int row;
- int col;
-};
-/***********************************************************************/
-struct trace_struct {
- int col;
- int oldCol;
- int row;
-};
-/***********************************************************************/
-struct results {
- int regionStart;
- int regionEnd;
- string parent;
- float queryToParent;
- float queryToParentLocal;
- float divR;
-};
-
/**********************************************************************/
class Maligner {
public:
- Maligner(vector<Sequence*>, int, int, int, float, int);
+ Maligner(vector<Sequence*>, int, int, int, float, int, int, string);
~Maligner() {};
- string getResults(Sequence*);
+ string getResults(Sequence*, DeCalculator*);
float getPercentID() { return percentIdenticalQueryChimera; }
vector<results> getOutput() { return outputResults; }
private:
- DeCalculator* decalc;
Sequence* query;
vector<Sequence*> refSeqs;
vector<Sequence*> db;
- int numWanted, matchScore, misMatchPenalty, minCoverage;
+ int numWanted, matchScore, misMatchPenalty, minCoverage, minSimilarity;
+ string searchMethod;
float minDivR, percentIdenticalQueryChimera;
vector<results> outputResults;
+ map<int, int> spotMap;
vector<Sequence*> minCoverageFilter(vector<Sequence*>); //removes top matches that do not have minimum coverage with query.
int computeChimeraPenalty();
vector<trace_struct> mapTraceRegionsToAlignment(vector<score_struct>, vector<Sequence*>);
string constructChimericSeq(vector<trace_struct>, vector<Sequence*>);
float computePercentID(string, string);
+ string chimeraMaligner(int, DeCalculator*);
+ vector<Sequence*> getBlastSeqs(Sequence*, int);
};
openInputFile(fileNames[i], inputFile);
- while(!inputFile.eof()){ c = inputFile.get(); outputFile << c; }
+ while(!inputFile.eof()){
+ c = inputFile.get();
+ //-1 is eof char
+ if (int(c) != -1) { outputFile << c; }
+ }
inputFile.close();
}
mothurOutJustToLog("Windows version");
mothurOutEndLine(); mothurOutEndLine();
#endif
-
+
+ #ifdef USE_READLINE
+ mothurOutJustToLog("Using ReadLine");
+ mothurOutEndLine(); mothurOutEndLine();
+ #endif
//header
mothurOut("mothur v.1.8");
exit(1);
}
}
+/**************************************************************************************************/
+//returns true if any of the strings in first vector are in second vector
+inline bool inUsersGroups(vector<string> groupnames, vector<string> Groups) {
+ try {
+
+ for (int i = 0; i < groupnames.size(); i++) {
+ if (inUsersGroups(groupnames[i], Groups)) { return true; }
+ }
+ return false;
+ }
+ catch(exception& e) {
+ errorOut(e, "mothur", "inUsersGroups");
+ exit(1);
+ }
+}
/**************************************************************************************************/
maxInsertLength = 0;
pairwiseAlignSeqs(); // This is part A in Fig. 2 of DeSantis et al.
regapSequences(); // This is parts B-F in Fig. 2 of DeSantis et al.
-
}
catch(exception& e) {
errorOut(e, "Nast", "Nast");
try {
alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
-
+
string candAln = alignment->getSeqAAln();
string tempAln = alignment->getSeqBAln();
else {
//valid paramters for this command
- string Array[] = {"list","label","outputdir","inputdir"};
+ string Array[] = {"list","label","output","outputdir","inputdir"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
else {
splitAtDash(label, labels);
if (labels.size() != 2) { mothurOut("You must provide 2 labels."); mothurOutEndLine(); abort = true; }
- }
+ }
+
+ output = validParameter.validFile(parameters, "output", false); if (output == "not found") { output = "name"; }
+
+ if ((output != "name") && (output != "number")) { mothurOut("output options are name and number. I will use name."); mothurOutEndLine(); output = "name"; }
}
}
void OtuHierarchyCommand::help(){
try {
mothurOut("The otu.hierarchy command is used to see how otus relate at two distances. \n");
- mothurOut("The otu.hierarchy command parameters are list and label. Both parameters are required. \n");
+ mothurOut("The otu.hierarchy command parameters are list, label and output. list and label parameters are required. \n");
+ mothurOut("The output parameter allows you to output the names of the sequence in the OTUs or the OTU numbers. Options are name and number, default is name. \n");
mothurOut("The otu.hierarchy command should be in the following format: \n");
mothurOut("otu.hierarchy(list=yourListFile, label=yourLabels).\n");
mothurOut("Example otu.hierarchy(list=amazon.fn.list, label=0.01-0.03).\n");
string names = lists[1].get(i);
//output column 1
- out << names << '\t';
+ if (output == "name") { out << names << '\t'; }
+ else { out << i << '\t'; }
map<int, int> bins; //bin numbers in little that are in this bin in big
map<int, int>::iterator it;
string col2 = "";
for (it = bins.begin(); it != bins.end(); it++) {
- col2 += lists[0].get(it->first) + "\t";
+ if (output == "name") { col2 += lists[0].get(it->first) + "\t"; }
+ else { col2 += toString(it->first) + "\t"; }
}
//output column 2
private:
bool abort;
set<string> labels; //holds labels to be used
- string label, listFile, outputDir;
+ string label, listFile, outputDir, output;
vector<ListVector> getListVectors();
}
//***************************************************************************************************************
-Pintail::Pintail(string filename, string temp, string o) { fastafile = filename; templateFile = temp; outputDir = o; }
+Pintail::Pintail(string filename, string o) {
+ fastafile = filename; outputDir = o;
+ distcalculator = new eachGapDist();
+ decalc = new DeCalculator();
+}
//***************************************************************************************************************
Pintail::~Pintail() {
try {
- for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
- for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
- for (int i = 0; i < bestfit.size(); i++) { delete bestfit[i]; }
- }
- catch(exception& e) {
- errorOut(e, "Pintail", "~Pintail");
- exit(1);
- }
-}
-//***************************************************************************************************************
-void Pintail::print(ostream& out) {
- try {
-
- mothurOutEndLine();
- for (int i = 0; i < querySeqs.size(); i++) {
-
- int index = ceil(deviation[i]);
-
- //is your DE value higher than the 95%
- string chimera;
- if (quantiles[index][4] == 0.0) {
- chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
- }else {
- if (DE[i] > quantiles[index][4]) { chimera = "Yes"; }
- else { chimera = "No"; }
- }
-
- out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl;
- if (chimera == "Yes") {
- mothurOut(querySeqs[i]->getName() + "\tdiv: " + toString(deviation[i]) + "\tstDev: " + toString(DE[i]) + "\tchimera flag: " + chimera); mothurOutEndLine();
- }
- out << "Observed\t";
-
- for (int j = 0; j < obsDistance[i].size(); j++) { out << obsDistance[i][j] << '\t'; }
- out << endl;
-
- out << "Expected\t";
-
- for (int m = 0; m < expectedDistance[i].size(); m++) { out << expectedDistance[i][m] << '\t'; }
- out << endl;
-
- }
+ delete distcalculator;
+ delete decalc;
}
catch(exception& e) {
- errorOut(e, "Pintail", "print");
+ errorOut(e, "Pintail", "~Pintail");
exit(1);
}
}
-
//***************************************************************************************************************
-int Pintail::getChimeras() {
+void Pintail::doPrep() {
try {
-
- //read in query sequences and subject sequences
- mothurOut("Reading sequences and template file... "); cout.flush();
- querySeqs = readSeqs(fastafile);
- templateSeqs = readSeqs(templateFile);
- mothurOut("Done."); mothurOutEndLine();
-
- int numSeqs = querySeqs.size();
-
- if (unaligned) { mothurOut("Your sequences need to be aligned when you use the pintail method."); mothurOutEndLine(); return 1; }
-
- obsDistance.resize(numSeqs);
- expectedDistance.resize(numSeqs);
- seqCoef.resize(numSeqs);
- DE.resize(numSeqs);
- Qav.resize(numSeqs);
- bestfit.resize(numSeqs);
- deviation.resize(numSeqs);
- trimmed.resize(numSeqs);
- windowSizes.resize(numSeqs, window);
+
windowSizesTemplate.resize(templateSeqs.size(), window);
- windowsForeachQuery.resize(numSeqs);
- h.resize(numSeqs);
quantiles.resize(100); //one for every percent mismatch
quantilesMembers.resize(100); //one for every percent mismatch
- //break up file if needed
- int linesPerProcess = numSeqs / processors ;
+ //if the user does not enter a mask then you want to keep all the spots in the alignment
+ if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); }
+ else { decalc->setAlignmentLength(seqMask.length()); }
+
+ decalc->setMask(seqMask);
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- //find breakup of sequences for all times we will Parallelize
- if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
- else {
- //fill line pairs
- for (int i = 0; i < (processors-1); i++) {
- lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
- }
- //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
- int i = processors - 1;
- lines.push_back(new linePair((i*linesPerProcess), numSeqs));
- }
-
//find breakup of templatefile for quantiles
if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
else {
}
}
#else
- lines.push_back(new linePair(0, numSeqs));
templateLines.push_back(new linePair(0, templateSeqs.size()));
#endif
-
- distcalculator = new eachGapDist();
- decalc = new DeCalculator();
-
- //if the user does enter a mask then you want to keep all the spots in the alignment
- if (seqMask.length() == 0) { decalc->setAlignmentLength(querySeqs[0]->getAligned().length()); }
- else { decalc->setAlignmentLength(seqMask.length()); }
-
- decalc->setMask(seqMask);
-
- //find pairs
- if (processors == 1) {
- mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
- bestfit = findPairs(lines[0]->start, lines[0]->end);
- mothurOut("Done."); mothurOutEndLine();
- }else { createProcessesPairs(); }
-
-//string o = "closestmatch.eachgap.fasta";
-//ofstream out7;
-//openOutputFile(o, out7);
-//for (int i = 0; i < bestfit.size(); i++) {
- //out7 << ">" << querySeqs[i]->getName() << "-"<< bestfit[i]->getName() << endl;
- //out7 << bestfit[i]->getAligned() << endl;
-//}
-//out7.close();
- //find P
+
mothurOut("Getting conservation... "); cout.flush();
if (consfile == "") {
mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush();
- probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFile));
+ probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName));
mothurOut("Done."); mothurOutEndLine();
- }else { probabilityProfile = readFreq(); }
-
- //make P into Q
- for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
- mothurOut("Done."); mothurOutEndLine();
-
- //mask sequences if the user wants to
- if (seqMask != "") {
- //mask querys
- for (int i = 0; i < querySeqs.size(); i++) {
- decalc->runMask(querySeqs[i]);
- }
-
- //mask templates
- for (int i = 0; i < templateSeqs.size(); i++) {
- decalc->runMask(templateSeqs[i]);
- }
-
- for (int i = 0; i < bestfit.size(); i++) {
- decalc->runMask(bestfit[i]);
- }
-
- }
-
- if (filter) {
- vector<Sequence*> temp = templateSeqs;
- for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); }
-
- createFilter(temp);
-
- runFilter(querySeqs);
- runFilter(templateSeqs);
- runFilter(bestfit);
- }
-
-
- if (processors == 1) {
-
- for (int j = 0; j < bestfit.size(); j++) {
- decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);
- }
-
- mothurOut("Finding window breaks... "); cout.flush();
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- it = trimmed[i].begin();
- vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
- windowsForeachQuery[i] = win;
- }
- mothurOut("Done."); mothurOutEndLine();
-
- }else { createProcessesSpots(); }
-
- if (processors == 1) {
-
- mothurOut("Calculating observed distance... "); cout.flush();
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
+ }else { probabilityProfile = readFreq(); mothurOut("Done."); }
+ mothurOutEndLine();
- obsDistance[i] = obsi;
- }
- mothurOut("Done."); mothurOutEndLine();
-
-
- mothurOut("Finding variability... "); cout.flush();
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
-
- Qav[i] = q;
- }
- mothurOut("Done."); mothurOutEndLine();
-
-
- mothurOut("Calculating alpha... "); cout.flush();
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
- seqCoef[i] = alpha;
- }
- mothurOut("Done."); mothurOutEndLine();
-
-
- mothurOut("Calculating expected distance... "); cout.flush();
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
- expectedDistance[i] = exp;
- }
- mothurOut("Done."); mothurOutEndLine();
-
-
- mothurOut("Finding deviation... "); cout.flush();
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
- DE[i] = de;
-
- it = trimmed[i].begin();
- float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
- deviation[i] = dist;
- }
- mothurOut("Done."); mothurOutEndLine();
-
- }
- else { createProcesses(); }
-
//quantiles are used to determine whether the de values found indicate a chimera
//if you have to calculate them, its time intensive because you are finding the de and deviation values for each
//combination of sequences in the template
if (quanfile != "") { quantiles = readQuantiles(); }
else {
+ //if user has not provided the quantiles and wants the mask we need to mask, but then delete and reread or we will mess up the best match process in getChimeras
+ if (seqMask != "") {
+ //mask templates
+ for (int i = 0; i < templateSeqs.size(); i++) {
+ decalc->runMask(templateSeqs[i]);
+ }
+ }
+
+ if (filter) {
+ createFilter(templateSeqs);
+ for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); }
+ }
+
+
mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
if (processors == 1) {
quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
string noOutliers, outliers;
if ((!filter) && (seqMask == "")) {
- noOutliers = outputDir + getRootName(getSimpleName(templateFile)) + "pintail.quan";
+ noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan";
}else if ((filter) && (seqMask == "")) {
- noOutliers = outputDir + getRootName(getSimpleName(templateFile)) + "pintail.filtered.quan";
+ noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.quan";
}else if ((!filter) && (seqMask != "")) {
- noOutliers = outputDir + getRootName(getSimpleName(templateFile)) + "pintail.masked.quan";
+ noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan";
}else if ((filter) && (seqMask != "")) {
- noOutliers = outputDir + getRootName(getSimpleName(templateFile)) + "pintail.filtered.masked.quan";
+ noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.masked.quan";
}
- //outliers = getRootName(templateFile) + "pintail.quanYESOUTLIERS";
-
- /*openOutputFile(outliers, out4);
-
- //adjust quantiles
- for (int i = 0; i < quantilesMembers.size(); i++) {
- vector<float> temp;
-
- if (quantilesMembers[i].size() == 0) {
- //in case this is not a distance found in your template files
- for (int g = 0; g < 6; g++) {
- temp.push_back(0.0);
- }
- }else{
-
- sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
-
- //save 10%
- temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
- //save 25%
- temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
- //save 50%
- temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
- //save 75%
- temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
- //save 95%
- temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
- //save 99%
- temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
-
- }
-
- //output quan value
- out4 << i+1 << '\t';
- for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; }
- out4 << endl;
-
- quantiles[i] = temp;
-
- }
-
- out4.close();*/
-
+
decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
openOutputFile(noOutliers, out5);
mothurOut("Done."); mothurOutEndLine();
- }
+ //if mask reread template
+ if ((seqMask != "") || (filter)) {
+ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
+ templateSeqs = readSeqs(templateFileName);
+ }
+ }
+
//free memory
- for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
- for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
-
- delete distcalculator;
- delete decalc;
+ for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "doPrep");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void Pintail::print(ostream& out) {
+ try {
+ int index = ceil(deviation);
+
+ //is your DE value higher than the 95%
+ string chimera;
+ if (index != 0) { //if index is 0 then its an exact match to a template seq
+ if (quantiles[index][4] == 0.0) {
+ chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index);
+ }else {
+ if (DE > quantiles[index][4]) { chimera = "Yes"; }
+ else { chimera = "No"; }
+ }
+ }else{ chimera = "No"; }
+
+ out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl;
+ if (chimera == "Yes") {
+ mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); mothurOutEndLine();
+ }
+ out << "Observed\t";
+
+ for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; }
+ out << endl;
+
+ out << "Expected\t";
+
+ for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; }
+ out << endl;
+
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "print");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+int Pintail::getChimeras(Sequence* query) {
+ try {
+ querySeq = query;
+ trimmed.clear();
+ windowSizes = window;
+
+ //find pairs
+ bestfit = findPairs(query);
+
+ //make P into Q
+ for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
+
+ //mask sequences if the user wants to
+ if (seqMask != "") {
+ decalc->runMask(query);
+ decalc->runMask(bestfit);
+ }
+
+ if (filter) {
+ runFilter(query);
+ runFilter(bestfit);
+ }
+
+ //trim seq
+ decalc->trimSeqs(query, bestfit, trimmed);
+
+ //find windows
+ it = trimmed.begin();
+ windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment);
+
+ //find observed distance
+ obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes);
+
+ Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile);
+
+ //find alpha
+ seqCoef = decalc->getCoef(obsDistance, Qav);
+
+ //calculating expected distance
+ expectedDistance = decalc->calcExpected(Qav, seqCoef);
+
+ //finding de
+ DE = decalc->calcDE(obsDistance, expectedDistance);
+
+ //find distance between query and closest match
+ it = trimmed.begin();
+ deviation = decalc->calcDist(query, bestfit, it->first, it->second);
+ delete bestfit;
+
return 0;
}
catch(exception& e) {
//***************************************************************************************************************
//calculate the distances from each query sequence to all sequences in the template to find the closest sequence
-vector<Sequence*> Pintail::findPairs(int start, int end) {
+Sequence* Pintail::findPairs(Sequence* q) {
try {
- vector<Sequence*> seqsMatches;
+ Sequence* seqsMatches;
- for(int i = start; i < end; i++){
-
- vector<Sequence*> copy = decalc->findClosest(querySeqs[i], templateSeqs, 1);
- seqsMatches.push_back(copy[0]);
- }
+ vector<Sequence*> copy = decalc->findClosest(q, templateSeqs, 1);
+ seqsMatches = copy[0];
return seqsMatches;
exit(1);
}
}
-
-/**************************************************************************************************/
-
-void Pintail::createProcessesSpots() {
- try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
-
- for (int j = lines[process]->start; j < lines[process]->end; j++) {
-
- //chops off beginning and end of sequences so they both start and end with a base
- map<int, int> trim;
-
- decalc->trimSeqs(querySeqs[j], bestfit[j], trim);
- trimmed[j] = trim;
-
- }
-
- mothurOut("Finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- it = trimmed[i].begin();
- windowsForeachQuery[i] = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
- }
- mothurOut("Done finding window breaks for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
-
- //write out data to file so parent can read it
- ofstream out;
- string s = toString(getpid()) + ".temp";
- openOutputFile(s, out);
-
- //output windowsForeachQuery
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- out << windowsForeachQuery[i].size() << '\t';
- for (int j = 0; j < windowsForeachQuery[i].size(); j++) {
- out << windowsForeachQuery[i][j] << '\t';
- }
- out << endl;
- }
-
- //output windowSizes
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- out << windowSizes[i] << '\t';
- }
- out << endl;
-
- //output trimmed values
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- it = trimmed[i].begin();
- out << it->first << '\t' << it->second << endl;
- }
- out.close();
-
- exit(0);
- }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
- }
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- //get data created by processes
- for (int i=0;i<processors;i++) {
- ifstream in;
- string s = toString(processIDS[i]) + ".temp";
- openInputFile(s, in);
-
- int size = lines[i]->end - lines[i]->start;
-
- int count = lines[i]->start;
- for (int m = 0; m < size; m++) {
- int num;
- in >> num;
-
- vector<int> win; int w;
- for (int j = 0; j < num; j++) {
- in >> w;
- win.push_back(w);
- }
-
- windowsForeachQuery[count] = win;
- count++;
- gobble(in);
- }
-
- gobble(in);
- count = lines[i]->start;
- for (int m = 0; m < size; m++) {
- int num;
- in >> num;
-
- windowSizes[count] = num;
- count++;
- }
-
- gobble(in);
-
- count = lines[i]->start;
- for (int m = 0; m < size; m++) {
- int front, back;
- in >> front >> back;
-
- map<int, int> t;
-
- t[front] = back;
-
- trimmed[count] = t;
- count++;
-
- gobble(in);
- }
-
-
- in.close();
- remove(s.c_str());
- }
-
-
-#else
- for (int j = 0; j < bestfit.size(); j++) {
- //chops off beginning and end of sequences so they both start and end with a base
- decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);
- }
-
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- it = trimmed[i].begin();
- vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
- windowsForeachQuery[i] = win;
- }
-
-#endif
- }
- catch(exception& e) {
- errorOut(e, "Pintail", "createProcessesSpots");
- exit(1);
- }
-}
/**************************************************************************************************/
-
-void Pintail::createProcessesPairs() {
- try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
-
- mothurOut("Finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
- bestfit = findPairs(lines[process]->start, lines[process]->end);
- mothurOut("Done finding pairs for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
-
- //write out data to file so parent can read it
- ofstream out;
- string s = toString(getpid()) + ".temp";
- openOutputFile(s, out);
-
- //output range and size
- out << bestfit.size() << endl;
-
- //output pairs
- for (int i = 0; i < bestfit.size(); i++) {
- out << ">" << bestfit[i]->getName() << endl << bestfit[i]->getAligned() << endl;
- }
- out.close();
-
- exit(0);
- }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
- }
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- //get data created by processes
- for (int i=0;i<processors;i++) {
- ifstream in;
- string s = toString(processIDS[i]) + ".temp";
- openInputFile(s, in);
-
- int size;
- in >> size; gobble(in);
-
- //get pairs
- int count = lines[i]->start;
- for (int m = 0; m < size; m++) {
- Sequence* temp = new Sequence(in);
- bestfit[count] = temp;
-
- count++;
- gobble(in);
- }
-
- in.close();
- remove(s.c_str());
- }
-
-
-#else
- bestfit = findPairs(lines[0]->start, lines[0]->end);
-#endif
- }
- catch(exception& e) {
- errorOut(e, "Pintail", "createProcessesPairs");
- exit(1);
- }
-}
-/**************************************************************************************************/
-
-void Pintail::createProcesses() {
- try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
-
- mothurOut("Calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
-
- vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
- obsDistance[i] = obsi;
-
- //calc Qav
- vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
-
- //get alpha
- float alpha = decalc->getCoef(obsDistance[i], q);
-
- //find expected
- vector<float> exp = decalc->calcExpected(q, alpha);
- expectedDistance[i] = exp;
-
- //get de and deviation
- float dei = decalc->calcDE(obsi, exp);
- DE[i] = dei;
-
- it = trimmed[i].begin();
- float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
- deviation[i] = dist;
- }
- mothurOut("Done calculating observed, expected and de values for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
-
- //write out data to file so parent can read it
- ofstream out;
- string s = toString(getpid()) + ".temp";
- openOutputFile(s, out);
-
- int size = lines[process]->end - lines[process]->start;
- out << size << endl;
-
- //output observed distances
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- out << obsDistance[i].size() << '\t';
- for (int j = 0; j < obsDistance[i].size(); j++) {
- out << obsDistance[i][j] << '\t';
- }
- out << endl;
- }
-
-
- //output expected distances
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- out << expectedDistance[i].size() << '\t';
- for (int j = 0; j < expectedDistance[i].size(); j++) {
- out << expectedDistance[i][j] << '\t';
- }
- out << endl;
- }
-
-
- //output de values
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- out << DE[i] << '\t';
- }
- out << endl;
-
- //output de values
- for (int i = lines[process]->start; i < lines[process]->end; i++) {
- out << deviation[i] << '\t';
- }
- out << endl;
-
- out.close();
-
- exit(0);
- }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
- }
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- //get data created by processes
- for (int i=0;i<processors;i++) {
- ifstream in;
- string s = toString(processIDS[i]) + ".temp";
- openInputFile(s, in);
-
- int size;
- in >> size; gobble(in);
-
- //get observed distances
- int count = lines[i]->start;
- for (int m = 0; m < size; m++) {
- int num;
- in >> num;
-
- vector<float> obs; float w;
- for (int j = 0; j < num; j++) {
- in >> w;
- obs.push_back(w);
- }
-
- obsDistance[count] = obs;
- count++;
- gobble(in);
- }
-
- gobble(in);
-
- //get expected distances
- count = lines[i]->start;
- for (int m = 0; m < size; m++) {
- int num;
- in >> num;
-
- vector<float> exp; float w;
- for (int j = 0; j < num; j++) {
- in >> w;
- exp.push_back(w);
- }
-
- expectedDistance[count] = exp;
- count++;
- gobble(in);
- }
-
- gobble(in);
-
- count = lines[i]->start;
- for (int m = 0; m < size; m++) {
- float num;
- in >> num;
-
- DE[count] = num;
- count++;
- }
-
- gobble(in);
-
- count = lines[i]->start;
- for (int m = 0; m < size; m++) {
- float num;
- in >> num;
-
- deviation[count] = num;
- count++;
- }
-
- in.close();
- remove(s.c_str());
- }
-
-
-#else
- mothurOut("Calculating observed distance... "); cout.flush();
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
- obsDistance[i] = obsi;
- }
- mothurOut("Done."); mothurOutEndLine();
-
-
-
- mothurOut("Finding variability... "); cout.flush();
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
- Qav[i] = q;
- }
- mothurOut("Done."); mothurOutEndLine();
-
-
-
- mothurOut("Calculating alpha... "); cout.flush();
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
- seqCoef.push_back(alpha);
- }
- mothurOut("Done."); mothurOutEndLine();
-
-
-
- mothurOut("Calculating expected distance... "); cout.flush();
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- vector<float> exp = decalc->calcExpected(Qav[i], seqCoef[i]);
- expectedDistance[i] = exp;
- }
- mothurOut("Done."); mothurOutEndLine();
-
-
-
- mothurOut("Finding deviation... "); cout.flush();
- for (int i = lines[0]->start; i < lines[0]->end; i++) {
- float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
- DE[i] = de;
-
- it = trimmed[i].begin();
- float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
- deviation[i] = dist;
- }
- mothurOut("Done."); mothurOutEndLine();
-
-#endif
- }
- catch(exception& e) {
- errorOut(e, "Pintail", "createProcesses");
- exit(1);
- }
-}
-
-
-/**************************************************************************************************/
-
void Pintail::createProcessesQuan() {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
class Pintail : public Chimera {
public:
- Pintail(string, string, string);
+ Pintail(string, string);
~Pintail();
- int getChimeras();
+ int getChimeras(Sequence*);
void print(ostream&);
void setCons(string c) { consfile = c; }
Dist* distcalculator;
DeCalculator* decalc;
int iters;
- string fastafile, templateFile, consfile;
+ string fastafile, consfile;
-
- vector<linePair*> lines;
vector<linePair*> templateLines;
- vector<Sequence*> querySeqs;
- vector<Sequence*> templateSeqs;
-
- vector<Sequence*> bestfit; //bestfit[0] matches queryseqs[0]...
+ Sequence*querySeq;
+
+ Sequence* bestfit; //closest match to query in template
- vector< vector<float> > obsDistance; //obsDistance[0] is the vector of observed distances for queryseqs[0]...
- vector< vector<float> > expectedDistance; //expectedDistance[0] is the vector of expected distances for queryseqs[0]...
- vector<float> deviation; //deviation[0] is the percentage of mismatched pairs over the whole seq between querySeqs[0] and its best match.
- vector< vector<int> > windowsForeachQuery; // windowsForeachQuery[0] is a vector containing the starting spot in queryseqs[0] aligned sequence for each window.
+ vector<float> obsDistance; //obsDistance is the vector of observed distances for query
+ vector<float> expectedDistance; //expectedDistance is the vector of expected distances for query
+ float deviation; //deviation is the percentage of mismatched pairs over the whole seq between query and its best match.
+ vector<int> windowsForeachQuery; // windowsForeachQuery is a vector containing the starting spot in query aligned sequence for each window.
//this is needed so you can move by bases and not just spots in the alignment
- vector<int> windowSizes; //windowSizes[0] = window size of querySeqs[0]
+ int windowSizes; //windowSizes = window size of query
vector<int> windowSizesTemplate; //windowSizesTemplate[0] = window size of templateSeqs[0]
- vector< map<int, int> > trimmed; //trimmed[0] = start and stop of trimmed sequences for querySeqs[0]
+ map<int, int> trimmed; //trimmed = start and stop of trimmed sequences for query
map<int, int>::iterator it;
- vector< vector<float> > Qav; //Qav[0] is the vector of average variablility for queryseqs[0]...
- vector<float> seqCoef; //seqCoef[0] is the coeff for queryseqs[0]...
- vector<float> DE; //DE[0] is the deviaation for queryseqs[0]...
+ vector<float> Qav; //Qav is the vector of average variablility for query
+ float seqCoef; //seqCoef is the coeff for query
+ float DE; //DE is the deviaation for query
vector<float> probabilityProfile;
vector< vector<float> > quantiles; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
vector< vector<quanMember> > quantilesMembers; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
- vector< set<int> > h;
+ set<int> h;
vector<float> readFreq();
- vector<Sequence*> findPairs(int, int);
+ Sequence* findPairs(Sequence*);
- void createProcessesSpots();
- void createProcessesPairs();
- void createProcesses();
void createProcessesQuan();
+ void doPrep();
};
group = "xxx";
}
- T->tree[n1].setGroup(group);
+ vector<string> tempGroup; tempGroup.push_back(group);
+
+ T->tree[n1].setGroup(tempGroup);
T->tree[n1].setChildren(-1,-1);
if(blen == 1){
namefile = validParameter.validFile(parameters, "name", true);
if (namefile == "not open") { abort = true; }
- else if (namefile == "not found") { treefile = ""; }
+ else if (namefile == "not found") { namefile = ""; }
else { readNamesFile(); }
if (abort == false) {
if (it == nameMap.end()) {
mothurOut(treeMap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); mothurOutEndLine();
treeMap->removeSeq(treeMap->namesOfSeqs[i]);
+ i--; //need this because removeSeq removes name from namesOfSeqs
}
}
}
#include "slayer.h"
/***********************************************************************/
-Slayer::Slayer(int win, int increment, int parentThreshold, float div, int i) :
- windowSize(win), windowStep(increment), parentFragmentThreshold(parentThreshold), divRThreshold(div), iters(i){}
+Slayer::Slayer(int win, int increment, int parentThreshold, float div, int i, int snp) :
+ windowSize(win), windowStep(increment), parentFragmentThreshold(parentThreshold), divRThreshold(div), iters(i), percentSNPSample(snp){}
/***********************************************************************/
string Slayer::getResults(Sequence* query, vector<Sequence*> refSeqs) {
try {
for (int i = 0; i < refSeqs.size(); i++) {
for (int j = i+1; j < refSeqs.size(); j++) {
-
+
//make copies of query and each parent because runBellerophon removes gaps and messes them up
Sequence* q = new Sequence(query->getName(), query->getAligned());
Sequence* leftParent = new Sequence(refSeqs[i]->getName(), refSeqs[i]->getAligned());
Sequence* rightParent = new Sequence(refSeqs[j]->getName(), refSeqs[j]->getAligned());
- vector<data_struct> divs = runBellerophon(q, leftParent, rightParent);
+ map<int, int> spots; //map from spot in original sequence to spot in filtered sequence for query and both parents
+ vector<data_struct> divs = runBellerophon(q, leftParent, rightParent, spots);
vector<data_struct> selectedDivs;
for (int k = 0; k < divs.size(); k++) {
//require at least 3 SNPs on each side of the break
if ((numSNPSLeft >= 3) && (numSNPSRight >= 3)) {
+
+ //removed in 12/09 version of chimeraSlayer
+ //int winSizeLeft = divs[k].winLEnd - divs[k].winLStart + 1;
+ //int winSizeRight = divs[k].winREnd - divs[k].winRStart + 1;
- int winSizeLeft = divs[k].winLEnd - divs[k].winLStart + 1;
- int winSizeRight = divs[k].winREnd - divs[k].winRStart + 1;
-
- float snpRateLeft = numSNPSLeft / (float) winSizeLeft;
- float snpRateRight = numSNPSRight / (float) winSizeRight;
- float logR = log(snpRateLeft / snpRateRight) / log(2.0);
+ //float snpRateLeft = numSNPSLeft / (float) winSizeLeft;
+ //float snpRateRight = numSNPSRight / (float) winSizeRight;
+ //float logR = log(snpRateLeft / snpRateRight) / log(2.0);
// do not accept excess snp ratio on either side of the break
- if (abs(logR) < 1 ) {
+ //if (abs(logR) < 1 ) {
float BS_A, BS_B;
bootstrapSNPS(snpsLeft, snpsRight, BS_A, BS_B);
divs[k].bsMax = max(BS_A, BS_B);
divs[k].chimeraMax = max(divs[k].qla_qrb, divs[k].qlb_qra);
+
+ //so results reflect orignal alignment
+ divs[k].winLStart = spots[divs[k].winLStart];
+ divs[k].winLEnd = spots[divs[k].winLEnd];
+ divs[k].winRStart = spots[divs[k].winRStart];
+ divs[k].winREnd = spots[divs[k].winREnd];
selectedDivs.push_back(divs[k]);
- }
+ //}
}
}
}
}
-
+
// compute bootstrap support
if (all.size() > 0) {
//sort them
}
}
/***********************************************************************/
-vector<data_struct> Slayer::runBellerophon(Sequence* q, Sequence* pA, Sequence* pB) {
+vector<data_struct> Slayer::runBellerophon(Sequence* q, Sequence* pA, Sequence* pB, map<int, int>& spots) {
try{
vector<data_struct> data;
-
+
//vertical filter
vector<Sequence*> temp;
temp.push_back(q); temp.push_back(pA); temp.push_back(pB);
- map<int, int> spots = verticalFilter(temp);
+
+ //maps spot in new alignment to spot in alignment before filter
+ spots = verticalFilter(temp); //fills baseSpots
//get these to avoid numerous function calls
string query = q->getAligned();
string parentA = pA->getAligned();
string parentB = pB->getAligned();
int length = query.length();
-
+//cout << q->getName() << endl << q->getAligned() << endl << endl;
+//cout << pA->getName() << endl << pA->getAligned() << endl << endl;
+//cout << pB->getName() << endl << pB->getAligned() << endl << endl;
+//cout << " length = " << length << endl;
+//cout << q->getName() << endl;
+//cout << pA->getName() << '\t';
+//cout << pB->getName() << endl;
+
//check window size
if (length < (2*windowSize+windowStep)) {
mothurOut("Your window size is too large for " + q->getName() + ". I will make the window size " + toString(length/4) + " which is 1/4 the filtered length."); mothurOutEndLine();
//float avgQA_QB = ((QA*leftLength) + (QB*rightLength)) / (float) length;
float divR_QLA_QRB = min((QLA_QRB/QA), (QLA_QRB/QB));
-
float divR_QLB_QRA = min((QLB_QRA/QA), (QLB_QRA/QB));
+//cout << leftLength << '\t' << rightLength << '\t' << QLA << '\t' << QRB << '\t' << QLB << '\t' << QRA << '\t' << LAB << '\t' << RAB << '\t' << AB << '\t' << QA << '\t' << QB << '\t' << QLA_QRB << '\t' << QLB_QRA << endl;
+//cout << divRThreshold << endl;
+//cout << breakpoint << '\t' << divR_QLA_QRB << '\t' << divR_QLB_QRA << endl;
//is one of them above the
if (divR_QLA_QRB >= divRThreshold || divR_QLB_QRA >= divRThreshold) {
member.rab = RAB;
member.qra = QRA;
member.qlb = QLB;
- member.winLStart = spots[0];
- member.winLEnd = spots[breakpoint]; //so breakpoint reflects spot in alignment before filter
- member.winRStart = spots[breakpoint+1];
- member.winREnd = spots[length-1];
+ member.winLStart = 0;
+ member.winLEnd = breakpoint;
+ member.winRStart = breakpoint+1;
+ member.winREnd = length-1;
member.querySeq = *(q);
member.parentA = *(pA);
member.parentB = *(pB);
try {
vector<snps> data;
-
+//cout << left << '\t' << right << endl;
for (int i = left; i <= right; i++) {
char A = parentA[i];
char B = parentB[i];
if ((A != Q) || (B != Q)) {
- snps member;
- member.queryChar = Q;
- member.parentAChar = A;
- member.parentBChar = B;
+//cout << "not equal " << Q << '\t' << A << '\t' << B << endl;
+
+ //ensure not neighboring a gap. change to 12/09 release of chimeraSlayer - not sure what this adds, but it eliminates alot of SNPS
+ if (
+ //did query loose a base here during filter??
+ ( i == 0 || abs (baseSpots[0][i] - baseSpots[0][i-1]) == 1) &&
+ ( i == query.length() || abs (baseSpots[0][i] - baseSpots[0][i+1]) == 1)
+ &&
+ //did parentA loose a base here during filter??
+ ( i == 0 || abs (baseSpots[1][i] - baseSpots[1][i-1]) == 1) &&
+ ( i == parentA.length() || abs (baseSpots[1][i] - baseSpots[1][i+1]) == 1)
+ &&
+ //did parentB loose a base here during filter??
+ ( i == 0 || abs (baseSpots[2][i] - baseSpots[2][i-1]) == 1) &&
+ ( i == parentB.length() || abs (baseSpots[2][i] - baseSpots[2][i+1]) == 1)
+ )
+ {
- data.push_back(member);
+ snps member;
+ member.queryChar = Q;
+ member.parentAChar = A;
+ member.parentBChar = B;
+//cout << "not neighboring a gap " << Q << '\t' << A << '\t' << B << '\t' << baseSpots[0][i] << '\t' << baseSpots[0][i+1] << '\t' << baseSpots[0][i-1] << '\t' << baseSpots[1][i] << '\t' << baseSpots[1][i+1] << '\t' << baseSpots[1][i-1] << '\t' << baseSpots[2][i] << '\t' << baseSpots[2][i+1] << '\t' << baseSpots[2][i-1] << endl;
+ data.push_back(member);
+ }
}
}
int count_A = 0; // sceneario QLA,QRB supported
int count_B = 0; // sceneario QLB,QRA supported
- int numLeft = max(1, int(left.size()/10 +0.5));
- int numRight = max(1, int(right.size()/10 + 0.5));
-
+ int numLeft = max(1, int(left.size() * (percentSNPSample/(float)100) + 0.5));
+ int numRight = max(1, int(right.size() * (percentSNPSample/(float)100) + 0.5));
+
for (int i = 0; i < iters; i++) {
//random sampling with replacement.
if ((QLB > QLA) && (QRA > QRB)) {
count_B++;
}
-
+
+//cout << "selected left snp: \n";
+//for (int j = 0; j < selectedLeft.size(); j++) { cout << selectedLeft[j].parentAChar; }
+//cout << endl;
+//for (int j = 0; j < selectedLeft.size(); j++) { cout << selectedLeft[j].queryChar; }
+//cout << endl;
+//for (int j = 0; j < selectedLeft.size(); j++) { cout << selectedLeft[j].parentBChar; }
+//cout << endl;
+//cout << "selected right snp: \n";
+//for (int j = 0; j < selectedRight.size(); j++) { cout << selectedRight[j].parentAChar; }
+//cout << endl;
+//for (int i = 0; i < selectedRight.size(); i++) { cout << selectedRight[i].queryChar; }
+//cout << endl;
+//for (int i = 0; i < selectedRight.size(); i++) { cout << selectedRight[i].parentBChar; }
+//cout << endl;
}
+
+
+
BSA = ((float) count_A / (float) iters) * 100;
BSB = ((float) count_B / (float) iters) * 100;
+//cout << "bsa = " << BSA << " bsb = " << BSB << endl;
}
catch(exception& e) {
try {
int total = 0;
int matches = 0;
-
+
for (int i = left; i <= right; i++) {
total++;
if (queryFrag[i] == parent[i]) {
//remove columns that contain any gaps
map<int, int> Slayer::verticalFilter(vector<Sequence*> seqs) {
try {
+ //find baseSpots
+ baseSpots.clear();
+ baseSpots.resize(3); //query, parentA, parentB
+
vector<int> gaps; gaps.resize(seqs[0]->getAligned().length(), 0);
string filterString = (string(seqs[0]->getAligned().length(), '1'));
for (int j = 0; j < seqAligned.length(); j++) {
//if this spot is a gap
- if ((seqAligned[j] == '-') || (seqAligned[j] == '.') || (toupper(seqAligned[j]) == 'N')) { gaps[j]++; }
+ if ((seqAligned[j] == '-') || (seqAligned[j] == '.') || (toupper(seqAligned[j]) == 'N')) { gaps[j]++; }
}
}
- //zero out spot where all sequences have blanks
+ //zero out spot where any sequences have blanks
int numColRemoved = 0;
int count = 0;
map<int, int> maskMap; maskMap.clear();
count++;
}
}
-
+
//for each sequence
for (int i = 0; i < seqs.size(); i++) {
string seqAligned = seqs[i]->getAligned();
string newAligned = "";
+ int baseCount = 0;
+ int count = 0;
for (int j = 0; j < seqAligned.length(); j++) {
+ //are you a base
+ if ((seqAligned[j] != '-') && (seqAligned[j] != '.') && (toupper(seqAligned[j]) != 'N')) { baseCount++; }
+
//if this spot is not a gap
- if (filterString[j] == '1') { newAligned += seqAligned[j]; }
+ if (filterString[j] == '1') {
+ newAligned += seqAligned[j];
+ baseSpots[i][count] = baseCount;
+ count++;
+ }
}
seqs[i]->setAligned(newAligned);
public:
- Slayer(int, int, int, float, int);
+ Slayer(int, int, int, float, int, int);
~Slayer() {};
string getResults(Sequence*, vector<Sequence*>);
private:
- int windowSize, windowStep, parentFragmentThreshold, iters;
+ int windowSize, windowStep, parentFragmentThreshold, iters, percentSNPSample;
float divRThreshold;
vector<data_struct> outputResults;
+ vector< map<int, int> > baseSpots;
map<int, int> verticalFilter(vector<Sequence*>);
float computePercentID(string, string, int, int);
- vector<data_struct> runBellerophon(Sequence*, Sequence*, Sequence*);
+ vector<data_struct> runBellerophon(Sequence*, Sequence*, Sequence*, map<int, int>&);
vector<snps> getSNPS(string, string, string, int, int);
void bootstrapSNPS(vector<snps>, vector<snps>, float&, float&);
float snpQA(vector<snps>);
//initialize leaf nodes
if (i <= (numLeaves-1)) {
tree[i].setName(globaldata->Treenames[i]);
- tree[i].setGroup(globaldata->gTreemap->getGroup(globaldata->Treenames[i]));
+ vector<string> tempGroups; tempGroups.push_back(globaldata->gTreemap->getGroup(globaldata->Treenames[i]));
+ tree[i].setGroup(tempGroups);
//set pcount and pGroup for groupname to 1.
tree[i].pcount[globaldata->gTreemap->getGroup(globaldata->Treenames[i])] = 1;
tree[i].pGroups[globaldata->gTreemap->getGroup(globaldata->Treenames[i])] = 1;
//intialize non leaf nodes
}else if (i > (numLeaves-1)) {
tree[i].setName("");
- tree[i].setGroup("");
+ vector<string> tempGroups;
+ tree[i].setGroup(tempGroups);
}
}
}
}
}//end if
+ //update groups to reflect all the groups this node represents
+ vector<string> nodeGroups;
+ map<string, int>::iterator itGroups;
+ for (itGroups = tree[i].pcount.begin(); itGroups != tree[i].pcount.end(); itGroups++) {
+ nodeGroups.push_back(itGroups->first);
+ }
+ tree[i].setGroup(nodeGroups);
+
}//end else
}//end for
tree[z].pGroups = (tree[i].pGroups);
tree[i].pGroups = (lib_hold);
- string zgroup = tree[z].getGroup();
+ vector<string> zgroup = tree[z].getGroup();
tree[z].setGroup(tree[i].getGroup());
tree[i].setGroup(zgroup);
exit(1);
}
}
-/**************************************************************************************************/
+/**************************************************************************************************
void Tree::randomLabels(string groupA, string groupB) {
try {
}
/*************************************************************************************************/
void Tree::assembleRandomUnifracTree(string groupA, string groupB) {
- randomLabels(groupA, groupB);
+
+ vector<string> temp; temp.push_back(groupA); temp.push_back(groupB);
+ randomLabels(temp);
assembleTree();
}
}
}
}else { //you are a leaf
- out << tree[node].getGroup();
+ string leafGroup = globaldata->gTreemap->getGroup(tree[node].getName());
+
+ out << leafGroup;
if (mode == "branch") {
//if there is a branch length then print it
if (tree[node].getBranchLength() != -1) {
void randomTopology();
void randomBlengths();
void randomLabels(vector<string>);
- void randomLabels(string, string);
+ //void randomLabels(string, string);
void printBranch(int, ostream&, string); //recursively print out tree
void parseTreeFile(); //parses through tree file to find names of nodes and number of them
//this is required in case user has sequences in the names file that are
//erase name from namesOfSeqs
for (int i = 0; i < namesOfSeqs.size(); i++) {
if (namesOfSeqs[i] == seqName) {
- namesOfSeqs.erase (namesOfSeqs.begin()+i);
+ namesOfSeqs.erase(namesOfSeqs.begin()+i);
break;
}
}
//remove seq from treemap
it = treemap.find(seqName);
treemap.erase(it);
-
-
}
/************************************************************/
/****************************************************************/
void Node::setName(string Name) { name = Name; }
/****************************************************************/
-void Node::setGroup(string groups) { group =groups; }
+void Node::setGroup(vector<string> groups) { group =groups; }
/****************************************************************/
void Node::setBranchLength(float l) { branchLength = l; }
/****************************************************************/
/****************************************************************/
string Node::getName() { return name; }
/****************************************************************/
-string Node::getGroup() { return group; }
+vector<string> Node::getGroup() { return group; }
/****************************************************************/
float Node::getBranchLength() { return branchLength; }
/****************************************************************/
//to be used by printTree in the Tree class to print the leaf info
void Node::printNode() {
try{
- mothurOut(toString(parent) + " " + toString(lchild) + " " + toString(rchild) + " " + group);
+ mothurOut(toString(parent) + " " + toString(lchild) + " " + toString(rchild) + " ");
+
+ for (int i = 0; i < group.size(); i++) { mothurOut( group[i] + " "); }
+
//there is a branch length
if (branchLength != -1) {
mothurOut(" " + toString(branchLength));
}
mothurOut(" |");
+
map<string, int>::iterator it;
for(it=pGroups.begin();it!=pGroups.end();it++){
mothurOut(" " + it->first + ":" + toString(it->second));
~Node() { pGroups.clear(); pcount.clear(); };
void setName(string);
- void setGroup(string);
+ void setGroup(vector<string>);
void setBranchLength(float);
void setLabel(float);
void setParent(int);
void setLengthToLeaves(float);
string getName();
- string getGroup();
+ vector<string> getGroup();
float getBranchLength();
float getLengthToLeaves();
float getLabel();
private:
string name;
- string group;
+ vector<string> group;
float branchLength, length2leaf, label;
int parent;
int lchild;
//groups in this combo
groups.push_back(globaldata->Groups[a]); groups.push_back(globaldata->Groups[l]);
- for(int i=t->getNumLeaves();i<t->getNumNodes();i++){
-
- int lc = t->tree[i].getLChild(); //lc = vector index of left child
- int rc = t->tree[i].getRChild(); //rc = vector index of right child
-
- /**********************************************************************/
- //This section adds in all lengths that are non leaf
-
+ for(int i=0;i<t->getNumNodes();i++){
+
copyIpcount = t->tree[i].pcount;
for (it = copyIpcount.begin(); it != copyIpcount.end();) {
if (inUsersGroups(it->first, groups) != true) {
totalBL += abs(t->tree[i].getBranchLength());
}
- /**********************************************************************/
- //This section adds in all lengths that are leaf
-
- //if i's chidren are leaves
- if (t->tree[rc].getRChild() == -1) {
- //if rc is a valid group and rc has a BL
- if ((inUsersGroups(t->tree[rc].getGroup(), groups) == true) && (t->tree[rc].getBranchLength() != -1)) {
- UniqueBL += abs(t->tree[rc].getBranchLength());
- totalBL += abs(t->tree[rc].getBranchLength());
- }
- }
-
- if (t->tree[lc].getLChild() == -1) {
- //if lc is a valid group and lc has a BL
- if ((inUsersGroups(t->tree[lc].getGroup(), groups) == true) && (t->tree[lc].getBranchLength() != -1)) {
- UniqueBL += abs(t->tree[lc].getBranchLength());
- totalBL += abs(t->tree[lc].getBranchLength());
- }
- }
-
- /**********************************************************************/
}
UW = (UniqueBL / totalBL);
+ //cout << globaldata->Groups[a] << globaldata->Groups[l] << '\t' << UniqueBL << '\t' << totalBL << endl;
if (isnan(UW) || isinf(UW)) { UW = 0; }
UW = 0.00; //Unweighted Value = UniqueBL / totalBL;
copyIpcount.clear();
- for(int i=t->getNumLeaves();i<t->getNumNodes();i++){
-
- int lc = t->tree[i].getLChild(); //lc = vector index of left child
- int rc = t->tree[i].getRChild(); //rc = vector index of right child
-
- /**********************************************************************/
- //This section adds in all lengths that are non leaf
-
+ for(int i=0;i<t->getNumNodes();i++){
+
copyIpcount = t->tree[i].pcount;
for (it = copyIpcount.begin(); it != copyIpcount.end();) {
if (inUsersGroups(it->first, groups) != true) {
totalBL += abs(t->tree[i].getBranchLength());
}
- /**********************************************************************/
- //This section adds in all lengths that are leaf
-
- //if i's chidren are leaves
- if (t->tree[rc].getRChild() == -1) {
- //if rc is a valid group and rc has a BL
- if ((inUsersGroups(t->tree[rc].getGroup(), groups) == true) && (t->tree[rc].getBranchLength() != -1)) {
- UniqueBL += abs(t->tree[rc].getBranchLength());
- totalBL += abs(t->tree[rc].getBranchLength());
- }
- }
-
- if (t->tree[lc].getLChild() == -1) {
- //if lc is a valid group and lc has a BL
- if ((inUsersGroups(t->tree[lc].getGroup(), groups) == true) && (t->tree[lc].getBranchLength() != -1)) {
- UniqueBL += abs(t->tree[lc].getBranchLength());
- totalBL += abs(t->tree[lc].getBranchLength());
- }
- }
-
- /**********************************************************************/
}
UW = (UniqueBL / totalBL);
copyTree->assembleRandomUnifracTree(groups[0], groups[1]);
//copyTree->createNewickFile("random"+groupA+toString(count));
-
- for(int i=copyTree->getNumLeaves();i<copyTree->getNumNodes();i++){
-
- int lc = copyTree->tree[i].getLChild(); //lc = vector index of left child
- int rc = copyTree->tree[i].getRChild(); //rc = vector index of right child
-
+
+ for(int i=0;i<copyTree->getNumNodes();i++){
+
/**********************************************************************/
//This section adds in all lengths that are non leaf
copyIpcount = copyTree->tree[i].pcount;
totalBL += abs(copyTree->tree[i].getBranchLength());
}
- /**********************************************************************/
- //This section adds in all lengths that are leaf
-
- //if i's chidren are leaves
- if (copyTree->tree[rc].getRChild() == -1) {
- //if rc is a valid group and rc has a BL
- if ((inUsersGroups(copyTree->tree[rc].getGroup(), groups) == true) && (copyTree->tree[rc].getBranchLength() != -1)) {
- UniqueBL += abs(copyTree->tree[rc].getBranchLength());
- totalBL += abs(copyTree->tree[rc].getBranchLength());
- }
- }
-
- if (copyTree->tree[lc].getLChild() == -1) {
- //if lc is a valid group and lc has a BL
- if ((inUsersGroups(copyTree->tree[lc].getGroup(), groups) == true) && (copyTree->tree[lc].getBranchLength() != -1)) {
- UniqueBL += abs(copyTree->tree[lc].getBranchLength());
- totalBL += abs(copyTree->tree[lc].getBranchLength());
- }
- }
-
- /**********************************************************************/
}
UW = (UniqueBL / totalBL);
//swap labels in all the groups you want to compare
copyTree->assembleRandomUnifracTree(groups);
- for(int i=copyTree->getNumLeaves();i<copyTree->getNumNodes();i++){
-
- int lc = copyTree->tree[i].getLChild(); //lc = vector index of left child
- int rc = copyTree->tree[i].getRChild(); //rc = vector index of right child
-
- /**********************************************************************/
- //This section adds in all lengths that are non leaf
+ for(int i=0;i<copyTree->getNumNodes();i++){
copyIpcount = copyTree->tree[i].pcount;
for (it = copyIpcount.begin(); it != copyIpcount.end();) {
totalBL += abs(copyTree->tree[i].getBranchLength());
}
- /**********************************************************************/
- //This section adds in all lengths that are leaf
-
- //if i's chidren are leaves
- if (copyTree->tree[rc].getRChild() == -1) {
- //if rc is a valid group and rc has a BL
- if ((inUsersGroups(copyTree->tree[rc].getGroup(), groups) == true) && (copyTree->tree[rc].getBranchLength() != -1)) {
- UniqueBL += abs(copyTree->tree[rc].getBranchLength());
- totalBL += abs(copyTree->tree[rc].getBranchLength());
- }
- }
-
- if (copyTree->tree[lc].getLChild() == -1) {
- //if lc is a valid group and lc has a BL
- if ((inUsersGroups(copyTree->tree[lc].getGroup(), groups) == true) && (copyTree->tree[lc].getBranchLength() != -1)) {
- UniqueBL += abs(copyTree->tree[lc].getBranchLength());
- totalBL += abs(copyTree->tree[lc].getBranchLength());
- }
- }
-
- /**********************************************************************/
}
UW = (UniqueBL / totalBL);
//initialize weighted scores
WScore[globaldata->Groups[i]+globaldata->Groups[l]] = 0.0;
+ vector<string> groups; groups.push_back(globaldata->Groups[i]); groups.push_back(globaldata->Groups[l]);
+
D.push_back(0.0000); //initialize a spot in D for each combination
/********************************************************/
//is this sum from a sequence which is in one of the users groups
if (inUsersGroups(t->tree[v].getGroup(), globaldata->Groups) == true) {
//is this sum from a sequence which is in this groupCombo
- if ((t->tree[v].getGroup() == globaldata->Groups[i]) || (t->tree[v].getGroup() == globaldata->Groups[l])) {
- sum /= (double)tmap->seqsPerGroup[t->tree[v].getGroup()];
- D[count] += sum;
+ if (inUsersGroups(t->tree[v].getGroup(), groups)) {
+ int numSeqsInGroupI, numSeqsInGroupL;
+
+ map<string, int>::iterator it;
+ it = t->tree[v].pcount.find(groups[0]);
+ if (it != t->tree[v].pcount.end()) { //this leaf node contains seqs from group i
+ numSeqsInGroupI = it->second;
+ }else{ numSeqsInGroupI = 0; }
+
+ it = t->tree[v].pcount.find(groups[1]);
+ if (it != t->tree[v].pcount.end()) { //this leaf node contains seqs from group l
+ numSeqsInGroupL = it->second;
+ }else{ numSeqsInGroupL = 0; }
+
+ double weightedSum = ((numSeqsInGroupI * sum) / (double)tmap->seqsPerGroup[groups[0]]) + ((numSeqsInGroupL * sum) / (double)tmap->seqsPerGroup[groups[1]]);
+
+ //sum /= (double)tmap->seqsPerGroup[t->tree[v].getGroup()];
+
+ D[count] += weightedSum;
}
}
}
WScore[(groupA+groupB)] = 0.0;
float D = 0.0;
+ vector<string> groups; groups.push_back(groupA); groups.push_back(groupB);
/********************************************************/
//calculate a D value for the group combo
sum += abs(t->tree[index].getBranchLength());
}
- if ((t->tree[v].getGroup() == groupA) || (t->tree[v].getGroup() == groupB)) {
- sum /= (double)tmap->seqsPerGroup[t->tree[v].getGroup()];
- D += sum;
+ if (inUsersGroups(t->tree[v].getGroup(), groups)) {
+ int numSeqsInGroupI, numSeqsInGroupL;
+
+ map<string, int>::iterator it;
+ it = t->tree[v].pcount.find(groups[0]);
+ if (it != t->tree[v].pcount.end()) { //this leaf node contains seqs from group i
+ numSeqsInGroupI = it->second;
+ }else{ numSeqsInGroupI = 0; }
+
+ it = t->tree[v].pcount.find(groups[1]);
+ if (it != t->tree[v].pcount.end()) { //this leaf node contains seqs from group l
+ numSeqsInGroupL = it->second;
+ }else{ numSeqsInGroupL = 0; }
+
+ double weightedSum = ((numSeqsInGroupI * sum) / (double)tmap->seqsPerGroup[groups[0]]) + ((numSeqsInGroupL * sum) / (double)tmap->seqsPerGroup[groups[1]]);
+
+ //sum /= (double)tmap->seqsPerGroup[t->tree[v].getGroup()];
+
+ D += weightedSum;
}
}
/********************************************************/