37AD4DCA0F28F3DD00AA2D49 /* readtree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AD4DC90F28F3DD00AA2D49 /* readtree.cpp */; };
37AFC71F0F445386005F492D /* sharedsobscollectsummary.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AFC71E0F445386005F492D /* sharedsobscollectsummary.cpp */; };
37B28F680F27590100808A62 /* deconvolutecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37B28F670F27590100808A62 /* deconvolutecommand.cpp */; };
+ 37B73C761004BEFD008C4B41 /* listseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37B73C751004BEFD008C4B41 /* listseqscommand.cpp */; };
37C1D9730F86506E0059E3F0 /* binsequencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37C1D9720F86506E0059E3F0 /* binsequencecommand.cpp */; };
37C753CE0FB3415200DBD02E /* distancecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37C753CD0FB3415200DBD02E /* distancecommand.cpp */; };
37D928550F21331F001D4494 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37D927B80F21331F001D4494 /* ace.cpp */; };
37AFC71E0F445386005F492D /* sharedsobscollectsummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedsobscollectsummary.cpp; sourceTree = SOURCE_ROOT; };
37B28F660F27590100808A62 /* deconvolutecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deconvolutecommand.h; sourceTree = SOURCE_ROOT; };
37B28F670F27590100808A62 /* deconvolutecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deconvolutecommand.cpp; sourceTree = SOURCE_ROOT; };
+ 37B73C741004BEFD008C4B41 /* listseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = listseqscommand.h; sourceTree = "<group>"; };
+ 37B73C751004BEFD008C4B41 /* listseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = listseqscommand.cpp; sourceTree = "<group>"; };
37C1D9710F86506E0059E3F0 /* binsequencecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = binsequencecommand.h; sourceTree = SOURCE_ROOT; };
37C1D9720F86506E0059E3F0 /* binsequencecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = binsequencecommand.cpp; sourceTree = SOURCE_ROOT; };
37C753CC0FB3415200DBD02E /* distancecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = distancecommand.h; sourceTree = SOURCE_ROOT; };
37D927E30F21331F001D4494 /* helpcommand.cpp */,
375873F40F7D648F0040F377 /* libshuffcommand.h */,
375873F30F7D648F0040F377 /* libshuffcommand.cpp */,
+ 37B73C741004BEFD008C4B41 /* listseqscommand.h */,
+ 37B73C751004BEFD008C4B41 /* listseqscommand.cpp */,
21E859D60FC4632E005E1A48 /* matrixoutputcommand.h */,
21E859D70FC4632E005E1A48 /* matrixoutputcommand.cpp */,
7E5A17AD0FE57292003C6A03 /* mergefilecommand.h */,
378599100FDD7E8E00EF9D03 /* optionparser.cpp in Sources */,
7E5A17AF0FE57292003C6A03 /* mergefilecommand.cpp in Sources */,
379D3D510FF90E090068C1C0 /* chimeraseqscommand.cpp in Sources */,
+ 37B73C761004BEFD008C4B41 /* listseqscommand.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
ifstream inFASTA;
openInputFile(candidateFileName, inFASTA);
+ string input;
while(!inFASTA.eof()){
- char c = inFASTA.get();
- if(c == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos-1); }
- while (!inFASTA.eof()) { c = inFASTA.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
+ getline(inFASTA, input);
+ if (input.length() != 0) {
+ if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
+ }
}
inFASTA.close();
*/
#include "chimeraseqscommand.h"
+#include "eachgapdist.h"
//***************************************************************************************************************
else {
//valid paramters for this command
- string Array[] = {"fasta", "filter", "correction", "processors", "method" };
+ string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment" };
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true; }
string temp;
- temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
+ temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; }
filter = isTrue(temp);
temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; }
correction = isTrue(temp);
- temp = validParameter.validFile(parameters, "processors", true); if (temp == "not found") { temp = "1"; }
+ temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
convert(temp, processors);
+ temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
+ convert(temp, window);
+
+ temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "10"; }
+ convert(temp, increment);
+
method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "bellerophon"; }
+
+ if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine(); abort = true; }
}
}
exit(1);
}
}
+//********************************************************************************************************************
+//sorts highest score to lowest
+inline bool comparePref(Preference left, Preference right){
+ return (left.score[0] > right.score[0]);
+}
//***************************************************************************************************************
if (abort == true) { return 0; }
+
//do soft filter
if (filter) {
- string optionString = "fasta=" + fastafile + ", soft=50.0, vertical=F";
+ string optionString = "fasta=" + fastafile + ", soft=50, vertical=F";
filterSeqs = new FilterSeqsCommand(optionString);
filterSeqs->execute();
delete filterSeqs;
fastafile = getRootName(fastafile) + "filter.fasta";
}
+ distCalculator = new eachGapDist();
+
//read in sequences
readSeqs();
- //int numSeqs = seqs.size();
+ int numSeqs = seqs.size();
- //find average midpoint of seqs
- midpoint = findAverageMidPoint();
+ if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; }
- //create 2 vectors of sequences, 1 for left side and one for right side
- vector<Sequence> left; vector<Sequence> right;
+ //set default window to 25% of sequence length
+ string seq0 = seqs[0].getAligned();
+ if (window == 0) { window = seq0.length() / 4; }
+ else if (window > (seq0.length() / 2)) {
+ mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
+ window = (seq0.length() / 2);
+ }
- for (int i = 0; i < seqs.size(); i++) {
- //save left side
- string seqLeft = seqs[i].getAligned();
- seqLeft = seqLeft.substr(0, midpoint);
- Sequence tempLeft(seqs[i].getName(), seqLeft);
- left.push_back(tempLeft);
+ if (increment > (seqs[0].getAlignLength() - (2*window))) {
+ if (increment != 10) {
- //save right side
- string seqRight = seqs[i].getAligned();
- seqRight = seqRight.substr(midpoint+1, (seqRight.length()-midpoint-1));
- Sequence tempRight(seqs[i].getName(), seqRight);
- right.push_back(tempRight);
+ mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
+ increment = 10;
+ if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; }
+
+ }else{ increment = 0; }
+ }
+cout << "increment = " << increment << endl;
+ if (increment == 0) { iters = 1; }
+ else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); }
+
+ //initialize pref
+ pref.resize(numSeqs);
+
+ for (int i = 0; i < numSeqs; i++ ) {
+ pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
+ pref[i].name = seqs[i].getName();
+ pref[i].score[0] = 0.0; pref[i].score[1] = 0.0;
+ pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0;
+ pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0;
}
+
+ int midpoint = window;
+ int count = 0;
+ while (count < iters) {
+
+ //create 2 vectors of sequences, 1 for left side and one for right side
+ vector<Sequence> left; vector<Sequence> right;
+
+ for (int i = 0; i < seqs.size(); i++) {
+//cout << "whole = " << seqs[i].getAligned() << endl;
+ //save left side
+ string seqLeft = seqs[i].getAligned().substr(midpoint-window, window);
+ Sequence tempLeft;
+ tempLeft.setName(seqs[i].getName());
+ tempLeft.setAligned(seqLeft);
+ left.push_back(tempLeft);
+//cout << "left = " << tempLeft.getAligned() << endl;
+ //save right side
+ string seqRight = seqs[i].getAligned().substr(midpoint, window);
+ Sequence tempRight;
+ tempRight.setName(seqs[i].getName());
+ tempRight.setAligned(seqRight);
+ right.push_back(tempRight);
+//cout << "right = " << seqRight << endl;
+ }
+
+ //adjust midpoint by increment
+ midpoint += increment;
+
+
+ //this should be parallelized
+ //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
+ //create a matrix containing the distance from left to left and right to right
+ //calculate distances
+ SparseMatrix* SparseLeft = new SparseMatrix();
+ SparseMatrix* SparseRight = new SparseMatrix();
+
+ createSparseMatrix(0, left.size(), SparseLeft, left);
+ createSparseMatrix(0, right.size(), SparseRight, right);
+
+ vector<SeqMap> distMapRight;
+ vector<SeqMap> distMapLeft;
+
+ // Create a data structure to quickly access the distance information.
+ // It consists of a vector of distance maps, where each map contains
+ // all distances of a certain sequence. Vector and maps are accessed
+ // via the index of a sequence in the distance matrix
+ distMapRight = vector<SeqMap>(numSeqs);
+ distMapLeft = vector<SeqMap>(numSeqs);
+ //cout << "left" << endl << endl;
+ for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
+ distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
+ //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
+ }
+ //cout << "right" << endl << endl;
+ for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
+ distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
+ //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
+ }
+
+ delete SparseLeft;
+ delete SparseRight;
- //this should be parallelized
- //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
- //create a matrix containing the distance from left to left and right to right
- //calculate distances
- SparseMatrix* SparseLeft = new SparseMatrix();
- SparseMatrix* SparseRight = new SparseMatrix();
+
+ //fill preference structure
+ generatePreferences(distMapLeft, distMapRight, midpoint);
+
+ count++;
+
+ }
- createSparseMatrix(0, left.size(), SparseLeft, left);
- createSparseMatrix(0, right.size(), SparseRight, right);
+ delete distCalculator;
+ //find average pref score across windows
+ //if (increment != 0) {
+
+ //for (int i = 0; i < pref.size(); i++) {
+ //pref[i].score[0] = pref[i].score[0] / iters;
+ //}
+ //}
- //vector<SeqMap> distMapRight;
- //vector<SeqMap> distMapLeft;
+ //sort Preferences highest to lowest
+ sort(pref.begin(), pref.end(), comparePref);
- // Create a data structure to quickly access the distance information.
- // It consists of a vector of distance maps, where each map contains
- // all distances of a certain sequence. Vector and maps are accessed
- // via the index of a sequence in the distance matrix
- //distMapRight = vector<SeqMap>(globaldata->gListVector->size());
- //distMapLeft = vector<SeqMap>(globaldata->gListVector->size());
- for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
- //distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
- }
- for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
- //distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
- }
-
+ string outputFileName = getRootName(fastafile) + "chimeras";
+ ofstream out;
+ openOutputFile(outputFileName, out);
- //fill preference structure
- //generatePreferences(distMapLeft, distMapRight);
+ int above1 = 0;
+ out << "Name\tScore\tLeft\tRight\t" << endl;
+ //output prefenence structure to .chimeras file
+ for (int i = 0; i < pref.size(); i++) {
+ out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
+
+ //calc # of seqs with preference above 1.0
+ if (pref[i].score[0] > 1.0) {
+ above1++;
+ mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
+ mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
+ }
+
+
+ }
-
- //output results to screen
+ //output results to screen
mothurOutEndLine();
- mothurOut("\t\t"); mothurOutEndLine();
- //mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); mothurOutEndLine();
- //mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25])); mothurOutEndLine();
- //mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25])); mothurOutEndLine();
- //mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50])); mothurOutEndLine();
- //mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75])); mothurOutEndLine();
- //mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5])); mothurOutEndLine();
- //mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100])); mothurOutEndLine();
- //mothurOut("# of Seqs:\t" + toString(numSeqs)); mothurOutEndLine();
+ mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
+ int spot;
+ spot = pref.size()-1;
+ mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = pref.size() * 0.975;
+ mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = pref.size() * 0.75;
+ mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = pref.size() * 0.50;
+ mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = pref.size() * 0.25;
+ mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = pref.size() * 0.025;
+ mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = 0;
+ mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
- //outSummary.close();
return 0;
}
catch(exception& e) {
while(!inFASTA.eof()){
Sequence current(inFASTA);
+ if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
+
seqs.push_back(current);
gobble(inFASTA);
}
}
-
-//***************************************************************************************************************
-int ChimeraSeqsCommand::findAverageMidPoint(){
- try {
- int totalMids = 0;
- int averageMid = 0;
-
- //loop through the seqs and find midpoint
- for (int i = 0; i < seqs.size(); i++) {
-
- //get unaligned sequence
- seqs[i].setUnaligned(seqs[i].getUnaligned()); //if you read an aligned file the unaligned is really aligned, so we need to make sure its unaligned
-
- string unaligned = seqs[i].getUnaligned();
- string aligned = seqs[i].getAligned();
-
- //find midpoint of this seq
- int count = 0;
- int thismid = 0;
- for (int j = 0; j < aligned.length(); j++) {
-
- thismid++;
-
- //if you are part of the unaligned sequence increment
- if (isalpha(aligned[j])) { count++; }
-
- //if you have reached the halfway point stop
- if (count >= (unaligned.length() / 2)) { break; }
- }
-
- //add this mid to total
- totalMids += thismid;
-
- }
-
- averageMid = (totalMids / seqs.size());
-
- return averageMid;
-
-
- }
- catch(exception& e) {
- errorOut(e, "ChimeraSeqsCommand", "findAverageMidPoint");
- exit(1);
- }
-}
-
/***************************************************************************************************************/
int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
try {
for(int j=0;j<i;j++){
- //distCalculator->calcDist(s.get(i), s.get(j));
+ distCalculator->calcDist(s[i], s[j]);
float dist = distCalculator->getDist();
-
+
PCell temp(i, j, dist);
sparse->addCell(temp);
exit(1);
}
}
-/***************************************************************************************************************
-void ChimeraSeqsCommand::generatePreferences(vector<SeqMap> left, vector<SeqMap> right){
+/***************************************************************************************************************/
+void ChimeraSeqsCommand::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
try {
+
+ float dme = 0.0;
+ SeqMap::iterator itR;
+ SeqMap::iterator itL;
+
+ //initialize pref[i]
+ for (int i = 0; i < pref.size(); i++) {
+ pref[i].score[1] = 0.0;
+ pref[i].closestLeft[1] = 100000.0;
+ pref[i].closestRight[1] = 100000.0;
+ pref[i].leftParent[1] = "";
+ pref[i].rightParent[1] = "";
+ }
+//cout << "in generate left.size() = " << left.size() << endl;
for (int i = 0; i < left.size(); i++) {
- int iscore = 0;
- float closestLeft = 100000.0;
- float closestRight = 100000.0;
+ SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
+ // 1 to the distance of sequence 3 to sequence 1,
+ // 2 to the distance of sequence 3 to sequence 2.
+ SeqMap currentRight = right[i]; // same as left but with distances on the right side.
- for (int j = 0; j < left.size(); j++) {
+ for (int j = 0; j < i; j++) {
- //iscore += abs(left
-
+ itL = currentLeft.find(j);
+ itR = currentRight.find(j);
+cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
+cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
+
+ //if you can find this entry update the preferences
+ if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
+
+ if (!correction) {
+ pref[i].score[1] += abs((itL->second - itR->second));
+ pref[j].score[1] += abs((itL->second - itR->second));
+cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
+cout << "abs = " << abs((itL->second - itR->second)) << endl;
+cout << i << " score = " << pref[i].score[1] << endl;
+cout << j << " score = " << pref[j].score[1] << endl;
+ }else {
+ pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
+ pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
+cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
+cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
+cout << i << " score = " << pref[i].score[1] << endl;
+cout << j << " score = " << pref[j].score[1] << endl;
+ }
+cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;
+ //are you the closest left sequence
+ if (itL->second < pref[i].closestLeft[1]) {
+
+ pref[i].closestLeft[1] = itL->second;
+ pref[i].leftParent[1] = seqs[j].getName();
+cout << "updating closest left to " << pref[i].leftParent[1] << endl;
+ }
+cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;
+ if (itL->second < pref[j].closestLeft[1]) {
+ pref[j].closestLeft[1] = itL->second;
+ pref[j].leftParent[1] = seqs[i].getName();
+cout << "updating closest left to " << pref[j].leftParent[1] << endl;
+ }
+
+ //are you the closest right sequence
+ if (itR->second < pref[i].closestRight[1]) {
+ pref[i].closestRight[1] = itR->second;
+ pref[i].rightParent[1] = seqs[j].getName();
+ }
+ if (itR->second < pref[j].closestRight[1]) {
+ pref[j].closestRight[1] = itR->second;
+ pref[j].rightParent[1] = seqs[i].getName();
+ }
+
+ }
}
}
+
+
+
+ //calculate the dme
+ int count0 = 0;
+ for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } }
+
+ float expectedPercent = 1 / (float) (pref.size() - count0);
+cout << endl << "dme = " << dme << endl;
+ //recalculate prefernences based on dme
+ for (int i = 0; i < pref.size(); i++) {
+cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;
+ // gives the actual percentage of the dme this seq adds
+ pref[i].score[1] = pref[i].score[1] / dme;
+
+ //how much higher or lower is this than expected
+ pref[i].score[1] = pref[i].score[1] / expectedPercent;
+
+ //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
+cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;
+ }
+
+ //is this score bigger then the last score
+ for (int i = 0; i < pref.size(); i++) {
+
+ //update biggest score
+ if (pref[i].score[1] > pref[i].score[0]) {
+ pref[i].score[0] = pref[i].score[1];
+ pref[i].leftParent[0] = pref[i].leftParent[1];
+ pref[i].rightParent[0] = pref[i].rightParent[1];
+ pref[i].closestLeft[0] = pref[i].closestLeft[1];
+ pref[i].closestRight[0] = pref[i].closestRight[1];
+ pref[i].midpoint = mid;
+ }
+
+ //total of preference scores across windows
+ //pref[i].score[0] += pref[i].score[1];
+ }
}
catch(exception& e) {
typedef list<PCell>::iterator MatData;
typedef map<int, float> SeqMap; //maps sequence to all distance for that seqeunce
+struct Preference {
+ string name;
+ vector<string> leftParent; //keep the name of closest left associated with the two scores
+ vector<string> rightParent; //keep the name of closest right associated with the two scores
+ vector<float> score; //so you can keep last score and calc this score and keep whichever is bigger.
+ vector<float> closestLeft; //keep the closest left associated with the two scores
+ vector<float> closestRight; //keep the closest right associated with the two scores
+ int midpoint;
+
+};
+
+
+
/***********************************************************/
class ChimeraSeqsCommand : public Command {
int execute();
void help();
+
private:
- //Dist* distCalculator;
- struct Preference {
- string leftParent;
- string rightParent;
- float score;
-
- };
-
Dist* distCalculator;
bool abort;
string method, fastafile;
bool filter, correction;
- int processors, midpoint;
+ int processors, midpoint, averageLeft, averageRight, window, iters, increment;
FilterSeqsCommand* filterSeqs;
+ ListVector* list;
vector<Sequence> seqs;
vector<Preference> pref;
- int findAverageMidPoint();
void readSeqs();
- void generatePreferences(SparseMatrix*, SparseMatrix*);
+ void generatePreferences(vector<SeqMap>, vector<SeqMap>, int);
int createSparseMatrix(int, int, SparseMatrix*, vector<Sequence>);
#include "reversecommand.h"
#include "trimseqscommand.h"
#include "mergefilecommand.h"
+#include "chimeraseqscommand.h"
+#include "listseqscommand.h"
/***********************************************************/
commands["screen.seqs"] = "screen.seqs";
commands["reverse.seqs"] = "reverse.seqs";
commands["trim.seqs"] = "trim.seqs";
+ commands["chimera.seqs"] = "chimera.seqs";
+ commands["list.seqs"] = "list.seqs";
commands["quit"] = "quit";
}
else if(commandName == "tree.shared") { command = new TreeGroupCommand(optionString); }
else if(commandName == "dist.shared") { command = new MatrixOutputCommand(optionString); }
else if(commandName == "bootstrap.shared") { command = new BootSharedCommand(optionString); }
- //else if(commandName == "consensus") { command = new ConcensusCommand(optionString); }
+ //else if(commandName == "consensus") { command = new ConcensusCommand(optionString); }
else if(commandName == "dist.seqs") { command = new DistanceCommand(optionString); }
else if(commandName == "align.seqs") { command = new AlignCommand(optionString); }
else if(commandName == "summary.seqs") { command = new SeqSummaryCommand(optionString); }
else if(commandName == "screen.seqs") { command = new ScreenSeqsCommand(optionString); }
else if(commandName == "reverse.seqs") { command = new ReverseSeqsCommand(optionString); }
else if(commandName == "trim.seqs") { command = new TrimSeqsCommand(optionString); }
+ else if(commandName == "chimera.seqs") { command = new ChimeraSeqsCommand(optionString); }
+ else if(commandName == "list.seqs") { command = new ListSeqsCommand(optionString); }
else if(commandName == "merge.files") { command = new MergeFileCommand(optionString); }
else { command = new NoCommand(optionString); }
string seqA = A.getAligned();
string seqB = B.getAligned();
+
int alignLength = seqA.length();
for(int i=0; i<alignLength; i++){
if(length == 0) { dist = 1.0000; }
else { dist = ((double)diff / (double)length); }
+
}
};
TreeMap* gTreemap;
SequenceDB* gSequenceDB;
string inputFileName, argv;
- bool allLines, runParse;
+ bool allLines, runParse, jumble;
vector<string> Estimators, Groups; //holds estimators to be used
set<int> lines; //hold lines to be used
set<string> labels; //holds labels to be used
string getNameFile(); //do we need this?
string getGroupFile(); //do we need this?
string getOrderFile();
-// string getFastaFile();
string getTreeFile();
string getSharedFile();
string getFormat(); //do we need this?
-// string getCandidateFile();
-// string getTemplateFile();
+
void setListFile(string);
-// void setFastaFile(string);
void setTreeFile(string);
-// void setCandidateFile(string);
-// void setTemplateFile(string);
void setGroupFile(string); //do we need this?
void setPhylipFile(string);
void setColumnFile(string);
--- /dev/null
+/*
+ * listseqscommand.cpp
+ * Mothur
+ *
+ * Created by Sarah Westcott on 7/8/09.
+ * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "listseqscommand.h"
+#include "sequence.hpp"
+
+//**********************************************************************************************************************
+
+ListSeqsCommand::ListSeqsCommand(string option){
+ try {
+ abort = false;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; }
+
+ else {
+ //valid paramters for this command
+ string Array[] = {"fasta","name", "group", "align" };
+ vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+
+ OptionParser parser(option);
+ map<string,string> parameters = parser.getParameters();
+
+ ValidParameters validParameter;
+
+ //check to make sure all parameters are valid for command
+ for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+ }
+
+ //check for required parameters
+ fastafile = validParameter.validFile(parameters, "fasta", true);
+ if (fastafile == "not open") { abort = true; }
+ else if (fastafile == "not found") { fastafile = ""; }
+
+ namefile = validParameter.validFile(parameters, "name", true);
+ if (namefile == "not open") { abort = true; }
+ else if (namefile == "not found") { namefile = ""; }
+
+ groupfile = validParameter.validFile(parameters, "group", true);
+ if (groupfile == "not open") { abort = true; }
+ else if (groupfile == "not found") { groupfile = ""; }
+
+ alignfile = validParameter.validFile(parameters, "align", true);
+ if (alignfile == "not open") { abort = true; }
+ else if (alignfile == "not found") { alignfile = ""; }
+
+ if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "")) { mothurOut("You must provide a file."); mothurOutEndLine(); abort = true; }
+
+ if (parameters.size() > 1) { mothurOut("You may only enter one file."); mothurOutEndLine(); abort = true; }
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ListSeqsCommand", "ListSeqsCommand");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+void ListSeqsCommand::help(){
+ try {
+ mothurOut("The list.seqs command reads a fasta, name, group or alignreport file and outputs a .accnos file containing sequence names.\n");
+ mothurOut("The list.seqs command parameters are fasta, name, group and align. You must provide one of these parameters.\n");
+ mothurOut("The list.seqs command should be in the following format: list.seqs(fasta=yourFasta).\n");
+ mothurOut("Example list.seqs(fasta=amazon.fasta).\n");
+ mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
+ }
+ catch(exception& e) {
+ errorOut(e, "ListSeqsCommand", "help");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+
+int ListSeqsCommand::execute(){
+ try {
+
+ if (abort == true) { return 0; }
+
+ //read functions fill names vector
+ if (fastafile != "") { inputFileName = fastafile; readFasta(); }
+ else if (namefile != "") { inputFileName = namefile; readName(); }
+ else if (groupfile != "") { inputFileName = groupfile; readGroup(); }
+ else if (alignfile != "") { inputFileName = alignfile; readAlign(); }
+
+ //sort in alphabetical order
+ sort(names.begin(), names.end());
+
+ string outputFileName = getRootName(inputFileName) + "accnos";
+ ofstream out;
+ openOutputFile(outputFileName, out);
+
+ //output to .accnos file
+ for (int i = 0; i < names.size(); i++) {
+ out << names[i] << endl;
+ }
+ out.close();
+
+ return 0;
+ }
+
+ catch(exception& e) {
+ errorOut(e, "ListSeqsCommand", "execute");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+void ListSeqsCommand::readFasta(){
+ try {
+
+ ifstream in;
+ openInputFile(fastafile, in);
+ string name;
+
+ while(!in.eof()){
+ Sequence currSeq(in);
+ name = currSeq.getName();
+
+ names.push_back(name);
+
+ gobble(in);
+ }
+ in.close();
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ListSeqsCommand", "readFasta");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+void ListSeqsCommand::readName(){
+ try {
+
+ ifstream in;
+ openInputFile(namefile, in);
+ string name, firstCol, secondCol;
+
+ while(!in.eof()){
+
+ in >> firstCol;
+ in >> secondCol;
+
+ //parse second column saving each name
+ while (secondCol.find_first_of(',') != -1) {
+ name = secondCol.substr(0,secondCol.find_first_of(','));
+ secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length());
+ names.push_back(name);
+ }
+
+ //get name after last ,
+ names.push_back(secondCol);
+
+ gobble(in);
+ }
+ in.close();
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ListSeqsCommand", "readName");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+void ListSeqsCommand::readGroup(){
+ try {
+
+ ifstream in;
+ openInputFile(groupfile, in);
+ string name, group;
+
+ while(!in.eof()){
+
+ in >> name; //read from first column
+ in >> group; //read from second column
+
+ names.push_back(name);
+
+ gobble(in);
+ }
+ in.close();
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ListSeqsCommand", "readGroup");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+//alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name
+void ListSeqsCommand::readAlign(){
+ try {
+
+ ifstream in;
+ openInputFile(alignfile, in);
+ string name, junk;
+
+ //read column headers
+ for (int i = 0; i < 16; i++) {
+ if (!in.eof()) { in >> junk; }
+ else { break; }
+ }
+
+
+ while(!in.eof()){
+
+ in >> name; //read from first column
+
+ //read rest
+ for (int i = 0; i < 15; i++) {
+ if (!in.eof()) { in >> junk; }
+ else { break; }
+ }
+
+ names.push_back(name);
+
+ gobble(in);
+ }
+ in.close();
+
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ListSeqsCommand", "readAlign");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
--- /dev/null
+#ifndef LISTSEQSCOMMAND_H
+#define LISTSEQSCOMMAND_H
+
+/*
+ * listseqscommand.h
+ * Mothur
+ *
+ * Created by Sarah Westcott on 7/8/09.
+ * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "command.hpp"
+
+class ListSeqsCommand : public Command {
+
+ public:
+
+ ListSeqsCommand(string);
+ ~ListSeqsCommand(){};
+ int execute();
+ void help();
+
+ private:
+ vector<string> names;
+ string fastafile, namefile, groupfile, alignfile, inputFileName;
+ bool abort;
+
+ void readFasta();
+ void readName();
+ void readGroup();
+ void readAlign();
+
+};
+
+#endif
+
mothurOut("Error: ");
mothurOut(toString(e.what()));
- mothurOut(" has occurred in the " + object + " class function " + function + "Please contact Pat Schloss at pschloss@microbio.umass.edu, and be sure to include the mothur.logFile with your inquiry.");
+ mothurOut(" has occurred in the " + object + " class function " + function + ". Please contact Pat Schloss at pschloss@microbio.umass.edu, and be sure to include the mothur.logFile with your inquiry.");
mothurOutEndLine();
}
candidateReportFile << alignmentMethod << '\t' << candidateStartPosition << "\t" << candidateEndPosition << '\t';
candidateReportFile << templateStartPosition << "\t" << templateEndPosition << '\t';
candidateReportFile << pairwiseAlignmentLength << '\t' << totalGapsInQuery << '\t' << totalGapsInTemplate << '\t';
- candidateReportFile << longestInsert << '\t';;
+ candidateReportFile << longestInsert << '\t';
candidateReportFile << setprecision(2) << similarityToTemplate;
candidateReportFile << endl;
rcd->registerDisplay(displays[i]);
}
+ //if jumble is false all iters will be the same
+ if (globaldata->jumble == false) { nIters = 1; }
+
for(int iter=0;iter<nIters;iter++){
for(int i=0;i<displays.size();i++){
displays[i]->init(label);
}
-
- //randomize the groups
- random_shuffle(lookup.begin(), lookup.end());
+
+ if (globaldata->jumble == true) {
+ //randomize the groups
+ random_shuffle(lookup.begin(), lookup.end());
+ }
//make merge the size of lookup[0]
SharedRAbundVector* merge = new SharedRAbundVector(lookup[0]->size());
#include "raredisplay.h"
#include "ordervector.hpp"
#include "mothur.h"
+#include "globaldata.hpp"
class Rarefact {
Rarefact(OrderVector* o, vector<Display*> disp) :
numSeqs(o->getNumSeqs()), order(o), displays(disp), label(o->getLabel()) {};
Rarefact(vector<SharedRAbundVector*> shared, vector<Display*> disp) :
- lookup(shared), displays(disp) {};
+ lookup(shared), displays(disp) { globaldata = GlobalData::getInstance(); };
~Rarefact(){};
void getCurve(int, int);
void getSharedCurve(int, int);
private:
+ GlobalData* globaldata;
OrderVector* order;
vector<Display*> displays;
int numSeqs, numGroupComb;
else {
//valid paramters for this command
- string Array[] = {"iters","line","label","calc","groups"};
+ string Array[] = {"iters","line","label","calc","groups", "jumble"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
convert(temp, nIters);
+ temp = validParameter.validFile(parameters, "jumble", false); if (temp == "not found") { temp = "T"; }
+ if (isTrue(temp)) { jumble = true; }
+ else { jumble = false; }
+ globaldata->jumble = jumble;
+
if (abort == false) {
string fileNameRoot = getRootName(globaldata->inputFileName);
void RareFactSharedCommand::help(){
try {
mothurOut("The rarefaction.shared command can only be executed after a successful read.otu command.\n");
- mothurOut("The rarefaction.shared command parameters are label, line, iters, groups and calc. No parameters are required, but you may not use \n");
+ mothurOut("The rarefaction.shared command parameters are label, line, iters, groups, jumble and calc. No parameters are required, but you may not use \n");
mothurOut("both the line and label parameters at the same time. The rarefaction command should be in the following format: \n");
- mothurOut("rarefaction.shared(label=yourLabel, line=yourLines, iters=yourIters, calc=yourEstimators, groups=yourGroups).\n");
- mothurOut("Example rarefaction.shared(label=unique-.01-.03, line=0-5-10, iters=10000, groups=B-C, calc=sharedobserved).\n");
+ mothurOut("rarefaction.shared(label=yourLabel, line=yourLines, iters=yourIters, calc=yourEstimators, jumble=yourJumble, groups=yourGroups).\n");
+ mothurOut("Example rarefaction.shared(label=unique-.01-.03, line=0-5-10, iters=10000, groups=B-C, jumble=T, calc=sharedobserved).\n");
mothurOut("The default values for iters is 1000, freq is 100, and calc is sharedobserved which calculates the shared rarefaction curve for the observed richness.\n");
- mothurOut("The default value for groups is all the groups in your groupfile.\n");
+ mothurOut("The default value for groups is all the groups in your groupfile, and jumble is true.\n");
validCalculator->printCalc("sharedrarefaction", cout);
mothurOut("The label and line parameters are used to analyze specific lines in your input.\n");
mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n");
int freq, nIters;
string format;
- bool abort, allLines;
+ bool abort, allLines, jumble;
set<int> lines; //hold lines to be used
set<string> labels; //holds labels to be used
string line, label, calc, groups;
}
}
- if(sequence.find_first_of('-') != string::npos){ // if there are any gaps in the sequence, assume that it is
+ if((sequence.find_first_of('-') != string::npos) || (sequence.find_first_of('.') != string::npos)) { // if there are any gaps in the sequence, assume that it is
setAligned(sequence); // an alignment file
}
setUnaligned(sequence); // also set the unaligned sequence file