From: pschloss Date: Tue, 19 Jan 2010 14:20:22 +0000 (+0000) Subject: pat's mods to morisitahorn and pre.cluster X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=5b9b3e01150055e3b4bb1a911ea4c61d0b755e89 pat's mods to morisitahorn and pre.cluster --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 7f8568e..8e21f64 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -581,8 +581,8 @@ A7D86C7C10FE09AB00865661 /* formatcolumn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatcolumn.cpp; sourceTree = SOURCE_ROOT; }; A7D86C8A10FE09FE00865661 /* formatphylip.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = formatphylip.h; sourceTree = SOURCE_ROOT; }; A7D86C8B10FE09FE00865661 /* formatphylip.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatphylip.cpp; sourceTree = SOURCE_ROOT; }; - A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = ""; }; - A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = ""; }; + A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = SOURCE_ROOT; }; + A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = SOURCE_ROOT; }; A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; }; A7E4A782106913F900688F62 /* getsharedotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsharedotucommand.cpp; sourceTree = SOURCE_ROOT; }; A7E4A822106A9AD700688F62 /* maligner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = maligner.h; sourceTree = SOURCE_ROOT; }; @@ -1370,6 +1370,8 @@ 1DEB923608733DC60010E9CD /* Debug */ = { isa = XCBuildConfiguration; buildSettings = { + ARCHS = "$(ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1)"; + ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1 = x86_64; GCC_OPTIMIZATION_LEVEL = 3; GCC_WARN_ABOUT_MISSING_NEWLINE = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES; @@ -1386,10 +1388,8 @@ 1DEB923708733DC60010E9CD /* Release */ = { isa = XCBuildConfiguration; buildSettings = { - ARCHS = ( - ppc, - i386, - ); + ARCHS = "$(ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1)"; + ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1 = x86_64; GCC_OPTIMIZATION_LEVEL = 3; GCC_WARN_ABOUT_MISSING_NEWLINE = YES; GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES; diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 39dda9f..3e4f6ad 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -82,6 +82,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){ namefile = validParameter.validFile(parameters, "name", false); if (namefile == "not found") { namefile = ""; } + else { splitAtDash(namefile, namefileNames); diff --git a/fileoutput.cpp b/fileoutput.cpp index 004ed83..cd3a1c7 100644 --- a/fileoutput.cpp +++ b/fileoutput.cpp @@ -306,7 +306,7 @@ void OneColumnFile::initFile(string label){ } else{ openOutputFile(outName, outFile); - outFile << "numsequences\t" << label << endl; + outFile << "numsampled\t" << label << endl; } outFile.setf(ios::fixed, ios::floatfield); diff --git a/heatmapsim.cpp b/heatmapsim.cpp index 88a49ed..c68d0cc 100644 --- a/heatmapsim.cpp +++ b/heatmapsim.cpp @@ -50,7 +50,8 @@ void HeatMapSim::getPic(vector lookup, vector } sims.clear(); - double biggest = -1; +// double biggest = -1; + double biggest = 1; float scaler; //get sim for each comparison and save them so you can find the relative similairity @@ -65,7 +66,7 @@ void HeatMapSim::getPic(vector lookup, vector sims.push_back(data[0]); //save biggest similairity to set relative sim - if (data[0] > biggest) { biggest = data[0]; } +// if (data[0] > biggest) { biggest = data[0]; } } } @@ -171,6 +172,7 @@ void HeatMapSim::getPic(vector< vector > dists, vector groups) { void HeatMapSim::printLegend(int y, float maxSim) { try { + maxSim = 1; //output legend and color labels //go through map and give each score a color value @@ -180,22 +182,22 @@ void HeatMapSim::printLegend(int y, float maxSim) { //prints legend for (int i = 1; i < 255; i++) { color = toHex(int((float)(i))); - outsvg << "\n"; - x += 1; + outsvg << "\n"; + x += 3; } float scaler = maxSim / 5.0; //prints legend labels - x = 10; - for (int i = 1; i<=5; i++) { + x = 0; + for (int i = 0; i<=5; i++) { float label = scaler*i; label = int(label * 1000 + 0.5); label /= 1000.0; - string text = toString(label, 3); + string text = toString(label, 1); outsvg << "" + text + "\n"; - x += 60; + x += 153; } } diff --git a/preclustercommand.cpp b/preclustercommand.cpp index 479a862..66fd850 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -43,9 +43,10 @@ PreClusterCommand::PreClusterCommand(string option){ //check for optional parameter and set defaults // ...at some point should added some additional type checking... namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not found") { namefile = ""; } else if (namefile == "not open") { abort = true; } - else { readNameFile(); } +// else { readNameFile(); } string temp = validParameter.validFile(parameters, "diffs", false); if(temp == "not found"){ temp = "1"; } convert(temp, diffs); @@ -87,68 +88,62 @@ int PreClusterCommand::execute(){ if (abort == true) { return 0; } //reads fasta file and return number of seqs - int numSeqs = readSeqs(); //fills alignSeqs and makes all seqs active + int numSeqs = readNamesFASTA(); //fills alignSeqs and makes all seqs active if (numSeqs == 0) { mothurOut("Error reading fasta file...please correct."); mothurOutEndLine(); return 0; } if (diffs > length) { mothurOut("Error: diffs is greater than your sequence length."); mothurOutEndLine(); return 0; } //clear sizes since you only needed this info to build the alignSeqs seqPNode structs - sizes.clear(); +// sizes.clear(); //sort seqs by number of identical seqs sort(alignSeqs.begin(), alignSeqs.end(), comparePriority); //go through active list and cluster everthing you can, until all nodes are inactive //taking advantage of the fact that maps are already sorted - map::iterator itActive; - map::iterator it2Active; +// map::iterator itActive; +// map::iterator it2Active; int count = 0; - for (int i = 0; i < alignSeqs.size(); i++) { - - //are you active - itActive = active.find(alignSeqs[i].seq.getName()); + for (int i = 0; i < numSeqs; i++) { - if (itActive != active.end()) { //this sequence has not been merged yet + //are you active + // itActive = active.find(alignSeqs[i].seq.getName()); - //try to merge it with all smaller seqs - for (int j = i; j < alignSeqs.size(); j++) { - - if (i != j) { - //are you active - it2Active = active.find(alignSeqs[j].seq.getName()); - if (it2Active != active.end()) { //this sequence has not been merged yet - //are you within "diff" bases - int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned()); - - if (mismatch <= diffs) { - //merge - names[alignSeqs[i].seq.getName()] += "," + names[alignSeqs[j].seq.getName()]; - - //remove from active list - active.erase(it2Active); - - //set numIdentical to 0, so you only print the representative seqs in the fasta file - alignSeqs[j].numIdentical = 0; - count++; - } - }//end if j active - }//end if i != j - }//end for loop + if (alignSeqs[i].active) { //this sequence has not been merged yet - //remove from active list - active.erase(itActive); + //try to merge it with all smaller seqs + for (int j = i+1; j < numSeqs; j++) { + if (alignSeqs[j].active) { //this sequence has not been merged yet + //are you within "diff" bases + int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned()); + + if (mismatch <= diffs) { + //merge + alignSeqs[i].names += ',' + alignSeqs[j].names; + alignSeqs[i].numIdentical += alignSeqs[j].numIdentical; + + alignSeqs[j].active = 0; + alignSeqs[j].numIdentical = 0; + count++; + } + }//end if j active + }//end if i != j + + //remove from active list + alignSeqs[i].active = 0; }//end if active i + if(i % 100 == 0) { cout << i << '\t' << numSeqs - count << '\t' << count << endl; } } string newFastaFile = getRootName(fastafile) + "precluster" + getExtension(fastafile); string newNamesFile = getRootName(fastafile) + "precluster.names"; - printData(newFastaFile, newNamesFile); mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); mothurOutEndLine(); mothurOut("pre.cluster removed " + toString(count) + " sequences."); mothurOutEndLine(); - + printData(newFastaFile, newNamesFile); + return 0; } @@ -158,65 +153,85 @@ int PreClusterCommand::execute(){ } } /**************************************************************************************************/ -int PreClusterCommand::readSeqs(){ +int PreClusterCommand::readFASTA(){ try { - ifstream inFasta; - openInputFile(fastafile, inFasta); - length = 0; - map::iterator it; - - while (!inFasta.eof()) { - Sequence temp(inFasta); //read seq - - if (temp.getName() != "") { - if (namefile != "") { - //make sure fasta and name files match - it = names.find(temp.getName()); - if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); } - }else { sizes[temp.getName()] = 1; } - - seqPNode tempNode(sizes[temp.getName()], temp); - alignSeqs.push_back(tempNode); - active[temp.getName()] = true; - } - gobble(inFasta); - } - inFasta.close(); - - if (alignSeqs.size() != 0) { length = alignSeqs[0].seq.getAligned().length(); } - +// ifstream inFasta; +// openInputFile(fastafile, inFasta); +// length = 0; +//// map::iterator it; +// +// while (!inFasta.eof()) { +// Sequence temp(inFasta); //read seq +// +// if (temp.getName() != "") { +// if (namefile != "") { +// //make sure fasta and name files match +// it = names.find(temp.getName()); +// if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); } +// }else { sizes[temp.getName()] = 1; } +// +// seqPNode tempNode(sizes[temp.getName()], temp); +// alignSeqs.push_back(tempNode); +// active[temp.getName()] = true; +// } +// gobble(inFasta); +// } +// inFasta.close(); +// +// if (alignSeqs.size() != 0) { length = alignSeqs[0].seq.getAligned().length(); } +// return alignSeqs.size(); } catch(exception& e) { - errorOut(e, "PreClusterCommand", "readSeqs"); + errorOut(e, "PreClusterCommand", "readFASTA"); exit(1); } } /**************************************************************************************************/ -void PreClusterCommand::readNameFile(){ + +int PreClusterCommand::readNamesFASTA(){ try { - ifstream in; - openInputFile(namefile, in); - string firstCol, secondCol; - - while (!in.eof()) { - in >> firstCol >> secondCol; gobble(in); - names[firstCol] = secondCol; + ifstream inNames; + ifstream inFasta; + + openInputFile(namefile, inNames); + openInputFile(fastafile, inFasta); + + string firstCol, secondCol, nameString; + length = 0; + + while (inFasta && inNames) { + + inNames >> firstCol >> secondCol; + nameString = secondCol; + + gobble(inNames); int size = 1; while (secondCol.find_first_of(',') != -1) { size++; secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length()); } - sizes[firstCol] = size; + Sequence seq(inFasta); + if (seq.getName() != firstCol) { mothurOut(seq.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); } + else{ + seqPNode tempNode(size, seq, nameString); + alignSeqs.push_back(tempNode); + if (seq.getAligned().length() > length) { length = alignSeqs[0].seq.getAligned().length(); } + } } - in.close(); + inFasta.close(); + inNames.close(); + return alignSeqs.size(); } + catch(exception& e) { - errorOut(e, "PreClusterCommand", "readNameFile"); + errorOut(e, "PreClusterCommand", "readNamesFASTA"); exit(1); } } + /**************************************************************************************************/ + int PreClusterCommand::calcMisMatches(string seq1, string seq2){ try { int numBad = 0; @@ -234,23 +249,22 @@ int PreClusterCommand::calcMisMatches(string seq1, string seq2){ exit(1); } } + /**************************************************************************************************/ + void PreClusterCommand::printData(string newfasta, string newname){ try { ofstream outFasta; ofstream outNames; + openOutputFile(newfasta, outFasta); openOutputFile(newname, outNames); - map::iterator itNames; for (int i = 0; i < alignSeqs.size(); i++) { if (alignSeqs[i].numIdentical != 0) { alignSeqs[i].seq.printSequence(outFasta); - - itNames = names.find(alignSeqs[i].seq.getName()); - - outNames << itNames->first << '\t' << itNames->second << endl; + outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; } } diff --git a/preclustercommand.h b/preclustercommand.h index bb666cd..5113fa4 100644 --- a/preclustercommand.h +++ b/preclustercommand.h @@ -19,8 +19,10 @@ struct seqPNode { int numIdentical; Sequence seq; + string names; + bool active; seqPNode() {} - seqPNode(int s, Sequence q) : numIdentical(s), seq(q) {} + seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm), active(1) {} ~seqPNode() {} }; /************************************************************/ @@ -38,13 +40,13 @@ private: bool abort; string fastafile, namefile; vector alignSeqs; //maps the number of identical seqs to a sequence - map names; //represents the names file first column maps to second column - map sizes; //this map a seq name to the number of identical seqs in the names file - map active; //maps sequence name to whether it has already been merged or not. +// map names; //represents the names file first column maps to second column +// map sizes; //this map a seq name to the number of identical seqs in the names file +// map active; //maps sequence name to whether it has already been merged or not. - int readSeqs(); + int readFASTA(); + int readNamesFASTA(); int calcMisMatches(string, string); - void readNameFile(); void printData(string, string); //fasta filename, names file name }; diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index 9225e99..32426f3 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -151,6 +151,7 @@ int ScreenSeqsCommand::execute(){ gobble(inFASTA); } if(namefile != "" && groupfile != "") { screenNameGroupFile(badSeqNames); } // this screens both names and groups + else if(namefile != "") { screenNameGroupFile(badSeqNames); } else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the groups if(alignreport != "") { screenAlignReport(badSeqNames); } diff --git a/sharedmorisitahorn.cpp b/sharedmorisitahorn.cpp index f9c1b6b..d5cdeec 100644 --- a/sharedmorisitahorn.cpp +++ b/sharedmorisitahorn.cpp @@ -14,7 +14,7 @@ EstOutput MorHorn::getValues(vector shared) { try { data.resize(1,0); - int Atotal, Btotal, tempA, tempB; + float Atotal, Btotal, tempA, tempB; Atotal = 0; Btotal = 0; float morhorn, sumSharedA, sumSharedB, a, b, d; morhorn = 0.0; sumSharedA = 0.0; sumSharedB = 0.0; a = 0.0; b = 0.0; d = 0.0; @@ -26,23 +26,21 @@ EstOutput MorHorn::getValues(vector shared) { Btotal += shared[1]->getAbundance(i); } - //calculate the theta denominator sums + //calculate the denominator sums for (int j = 0; j < shared[0]->size(); j++) { //store in temps to avoid multiple repetitive function calls tempA = shared[0]->getAbundance(j); tempB = shared[1]->getAbundance(j); + float relA = tempA / Atotal; + float relB = tempB / Btotal; - a += tempA * tempA; - b += tempB * tempB; - d += tempA * tempB; + a += relA * relA; + b += relB * relB; + d += relA * relB; } - a /= double(Atotal * Atotal); - b /= double(Btotal * Btotal); - d /= double(Atotal * Btotal); - morhorn = (2 * d) / (a + b); - + if (isnan(morhorn) || isinf(morhorn)) { morhorn = 0; } data[0] = morhorn; diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index f777324..a6950ac 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -185,15 +185,17 @@ int TrimSeqsCommand::execute(){ } if(!success) { trashCode += 'q'; } } + if(barcodes.size() != 0){ - success = stripBarcode(currSeq, group); if(!success){ trashCode += 'b'; } } + if(numFPrimers != 0){ success = stripForward(currSeq); if(!success){ trashCode += 'f'; } } + if(numRPrimers != 0){ success = stripReverse(currSeq); if(!success){ trashCode += 'r'; }