From: westcott Date: Mon, 14 Jun 2010 17:06:03 +0000 (+0000) Subject: fixed read.tree so that it can read trees generated by fasttree X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=28d6f2bddc5c0718ae7f63648be3130a35fb0f02 fixed read.tree so that it can read trees generated by fasttree --- diff --git a/getseqscommand.cpp b/getseqscommand.cpp index a9463a0..8bf93ce 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -130,9 +130,15 @@ GetSeqsCommand::GetSeqsCommand(string option) { taxfile = validParameter.validFile(parameters, "taxonomy", true); if (taxfile == "not open") { abort = true; } else if (taxfile == "not found") { taxfile = ""; } - + + string usedDups = "true"; + string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "false"; usedDups = ""; } + dups = isTrue(temp); if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; } + + if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; } + } } @@ -147,7 +153,8 @@ void GetSeqsCommand::help(){ try { m->mothurOut("The get.seqs command reads an .accnos file and any of the following file types: fasta, name, group, list, taxonomy or alignreport file.\n"); m->mothurOut("It outputs a file containing only the sequences in the .accnos file.\n"); - m->mothurOut("The get.seqs command parameters are accnos, fasta, name, group, list, taxonomy and alignreport. You must provide accnos and at least one of the other parameters.\n"); + m->mothurOut("The get.seqs command parameters are accnos, fasta, name, group, list, taxonomy, alignreport and dups. You must provide accnos and at least one of the other parameters.\n"); + m->mothurOut("The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n"); m->mothurOut("The get.seqs command should be in the following format: get.seqs(accnos=yourAccnos, fasta=yourFasta).\n"); m->mothurOut("Example get.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n"); m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n"); @@ -171,8 +178,8 @@ int GetSeqsCommand::execute(){ if (m->control_pressed) { return 0; } //read through the correct file and output lines you want to keep - if (fastafile != "") { readFasta(); } if (namefile != "") { readName(); } + if (fastafile != "") { readFasta(); } if (groupfile != "") { readGroup(); } if (alignfile != "") { readAlign(); } if (listfile != "") { readList(); } @@ -220,7 +227,7 @@ int GetSeqsCommand::readFasta(){ if (name != "") { //if this name is in the accnos file - if (names.count(name) == 1) { + if (names.count(name) != 0) { wroteSomething = true; currSeq.printSequence(out); @@ -280,11 +287,11 @@ int GetSeqsCommand::readList(){ binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length()); //if that name is in the .accnos file, add it - if (names.count(name) == 1) { newNames += name + ","; } + if (names.count(name) != 0) { newNames += name + ","; } } //get last name - if (names.count(binnames) == 1) { newNames += binnames + ","; } + if (names.count(binnames) != 0) { newNames += binnames + ","; } //if there are names in this bin add to new list if (newNames != "") { @@ -338,7 +345,10 @@ int GetSeqsCommand::readName(){ if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } in >> firstCol; - in >> secondCol; + in >> secondCol; + + string hold = ""; + if (dups) { hold = secondCol; } vector parsedNames; //parse second column saving each name @@ -353,39 +363,43 @@ int GetSeqsCommand::readName(){ vector validSecond; for (int i = 0; i < parsedNames.size(); i++) { - if (names.count(parsedNames[i]) == 1) { + if (names.count(parsedNames[i]) != 0) { validSecond.push_back(parsedNames[i]); } } - - //if the name in the first column is in the set then print it and any other names in second column also in set - if (names.count(firstCol) == 1) { - + if ((dups) && (validSecond.size() != 0)) { //dups = true and we want to add someone, then add everyone + for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); } + out << firstCol << '\t' << hold << endl; wroteSomething = true; - - out << firstCol << '\t'; - - //you know you have at least one valid second since first column is valid - for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } - out << validSecond[validSecond.size()-1] << endl; - - - //make first name in set you come to first column and then add the remaining names to second column }else { - //you want part of this row - if (validSecond.size() != 0) { + //if the name in the first column is in the set then print it and any other names in second column also in set + if (names.count(firstCol) != 0) { wroteSomething = true; - out << validSecond[0] << '\t'; - + out << firstCol << '\t'; + //you know you have at least one valid second since first column is valid for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } out << validSecond[validSecond.size()-1] << endl; + + + //make first name in set you come to first column and then add the remaining names to second column + }else { + //you want part of this row + if (validSecond.size() != 0) { + + wroteSomething = true; + + out << validSecond[0] << '\t'; + + //you know you have at least one valid second since first column is valid + for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; } + out << validSecond[validSecond.size()-1] << endl; + } } } - gobble(in); } in.close(); @@ -429,7 +443,7 @@ int GetSeqsCommand::readGroup(){ in >> group; //read from second column //if this name is in the accnos file - if (names.count(name) == 1) { + if (names.count(name) != 0) { wroteSomething = true; out << name << '\t' << group << endl; @@ -475,7 +489,7 @@ int GetSeqsCommand::readTax(){ in >> tax; //read from second column //if this name is in the accnos file - if (names.count(name) == 1) { + if (names.count(name) != 0) { wroteSomething = true; out << name << '\t' << tax << endl; @@ -530,7 +544,7 @@ int GetSeqsCommand::readAlign(){ in >> name; //read from first column //if this name is in the accnos file - if (names.count(name) == 1) { + if (names.count(name) != 0) { wroteSomething = true; out << name << '\t'; diff --git a/getseqscommand.h b/getseqscommand.h index 9d8e796..4f4c809 100644 --- a/getseqscommand.h +++ b/getseqscommand.h @@ -25,7 +25,7 @@ class GetSeqsCommand : public Command { set names; vector outputNames; string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, taxfile, outputDir; - bool abort; + bool abort, dups; int readFasta(); int readName(); diff --git a/readtree.cpp b/readtree.cpp index c805edc..1dd77f9 100644 --- a/readtree.cpp +++ b/readtree.cpp @@ -165,6 +165,7 @@ int ReadNewickTree::read() { if (error != 0) { readOk = error; } filehandle.close(); + return readOk; } catch(exception& e) { @@ -237,14 +238,15 @@ int ReadNewickTree::readTreeString() { // ';' means end of tree. else if((ch=filehandle.peek())==';' || ch=='['){ rooted = 1; - } + } + if(rooted != 1){ rc = readNewickInt(filehandle, n, T); if (rc == -1) { m->mothurOut("error with rc"); m->mothurOutEndLine(); return -1; } //reports an error in reading if(filehandle.peek() == ')'){ readSpecialChar(filehandle,')',"right parenthesis"); } - } + } } //note: treeclimber had the code below added - not sure why? else{ @@ -263,7 +265,8 @@ int ReadNewickTree::readTreeString() { lc = rc = -1; } - while(((ch=filehandle.get())!=';') && (filehandle.eof() != true)){;} + while(((ch=filehandle.get())!=';') && (filehandle.eof() != true)){;} + if(rooted != 1){ T->tree[n].setChildren(lc,rc); T->tree[n].setBranchLength(0); @@ -271,6 +274,8 @@ int ReadNewickTree::readTreeString() { if(lc!=-1){ T->tree[lc].setParent(n); } if(rc!=-1){ T->tree[rc].setParent(n); } } + + //T->printTree(); cout << endl; return 0; } @@ -287,9 +292,7 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T) { if (m->control_pressed) { return -1; } int c = readNodeChar(f); - string k; -k = c; - cout << "at beginning = " << k <mothurOutEndLine(); return -1; } //then force into 2 node structure @@ -317,31 +317,24 @@ k = c; int lc, rc; if (i == 1) { lc = childrenNodes[i-1]; rc = childrenNodes[i]; } - else { lc = n; rc = childrenNodes[i]; } - cout << i << '\t' << lc << '\t' << rc << endl; + else { lc = n-1; rc = childrenNodes[i]; } + //cout << i << '\t' << lc << '\t' << rc << endl; T->tree[n].setChildren(lc,rc); T->tree[lc].setParent(n); T->tree[rc].setParent(n); - T->printTree(); cout << endl; + //T->printTree(); cout << endl; n++; } //to account for extra ++ in looping n--; - //int lc = readNewickInt(f, n, T); - //if (lc == -1) { return -1; } //reports an error in reading - - //readSpecialChar(f,',',"comma"); - - //int rc = readNewickInt(f, n, T); - //if (rc == -1) { return -1; } //reports an error in reading if(f.peek()==')'){ readSpecialChar(f,')',"right parenthesis"); //to pass over labels in trees c=filehandle.get(); - while((c!=',') && (c != -1) && (c!= ':') && (c!=';')){ c=filehandle.get(); } + while((c!=',') && (c != -1) && (c!= ':') && (c!=';')&& (c!=')')){ c=filehandle.get(); } filehandle.putback(c); } @@ -355,47 +348,6 @@ k = c; T->tree[n].setBranchLength(0.0); } - //to account for multifurcating trees generated by fasttree, we are forcing them to be bifurcating - /*while(f.peek() == ','){ - string k; - k = f.peek(); - cout << "in here k = " << k << '\t' << f.tellg() <tree[n].setChildren(lc,rc); - T->tree[lc].setParent(n); - T->tree[rc].setParent(n); - - T->printTree(); cout << endl; - lc = n; - n++; - - readSpecialChar(f,',',"comma"); - - rc = readNewickInt(f, n, T); - - if (rc == -1) { return -1; } //reports an error in reading - - if(f.peek()==')'){ - readSpecialChar(f,')',"right parenthesis"); - //to pass over labels in trees - c=filehandle.get(); - while((c!=',') && (c != -1) && (c!= ':') && (c!=';')){ c=filehandle.get(); } - filehandle.putback(c); - - if(f.peek() == ':'){ - readSpecialChar(f,':',"colon"); - - if(n >= numNodes){ m->mothurOut("Error: Too many nodes in input tree\n"); readOk = -1; return -1; } - - T->tree[n].setBranchLength(readBranchLength(f)); - }else{ - T->tree[n].setBranchLength(0.0); - } - - break; - } - }*/ - //T->tree[n].setChildren(lc,rc); //T->tree[lc].setParent(n); //T->tree[rc].setParent(n); @@ -411,7 +363,7 @@ k = c; name += d; d=f.get(); } - cout << name << endl; +//cout << name << endl; int blen = 0; if(d == ':') { blen = 1; } @@ -455,6 +407,7 @@ k = c; } while((c=f.get())!=0 && (c != ':' && c != ',' && c!=')') ) {;} + f.putback(c); return n1; diff --git a/readtreecommand.cpp b/readtreecommand.cpp index 1762983..8e292af 100644 --- a/readtreecommand.cpp +++ b/readtreecommand.cpp @@ -147,7 +147,7 @@ int ReadTreeCommand::execute(){ delete globaldata->gTreemap; return 0; } - + T[i]->assembleTree(); } diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp index da6bc44..5f5fe65 100644 --- a/removeseqscommand.cpp +++ b/removeseqscommand.cpp @@ -154,7 +154,7 @@ void RemoveSeqsCommand::help(){ m->mothurOut("The remove.seqs command reads an .accnos file and at least one of the following file types: fasta, name, group, list, taxonomy or alignreport file.\n"); m->mothurOut("It outputs a file containing the sequences NOT in the .accnos file.\n"); m->mothurOut("The remove.seqs command parameters are accnos, fasta, name, group, list, taxonomy, alignreport and dups. You must provide accnos and at least one of the file parameters.\n"); - m->mothurOut("The dups parameter allows you to remove the entire line from a name file if you remove any name from the line. default=false. If dups=true, then remove.seqs outputs a new .accnos file containing all the sequences removed. \n"); + m->mothurOut("The dups parameter allows you to remove the entire line from a name file if you remove any name from the line. default=false. \n"); m->mothurOut("The remove.seqs command should be in the following format: remove.seqs(accnos=yourAccnos, fasta=yourFasta).\n"); m->mothurOut("Example remove.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n"); m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n"); @@ -178,8 +178,8 @@ int RemoveSeqsCommand::execute(){ if (m->control_pressed) { return 0; } //read through the correct file and output lines you want to keep - if (fastafile != "") { readFasta(); } if (namefile != "") { readName(); } + if (fastafile != "") { readFasta(); } if (groupfile != "") { readGroup(); } if (alignfile != "") { readAlign(); } if (listfile != "") { readList(); } @@ -325,12 +325,7 @@ int RemoveSeqsCommand::readName(){ try { if (outputDir == "") { outputDir += hasPath(namefile); } string outputFileName = outputDir + getRootName(getSimpleName(namefile)) + "pick" + getExtension(namefile); - string outputFileName2 = outputDir + getRootName(getSimpleName(namefile)) + "dups.accnos"; - ofstream out2; - if (dups) { openOutputFile(outputFileName2, out2); } - bool wroteDups = false; - ofstream out; openOutputFile(outputFileName, out); @@ -341,7 +336,7 @@ int RemoveSeqsCommand::readName(){ bool wroteSomething = false; while(!in.eof()){ - if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); if (dups) { out2.close(); remove(outputFileName2.c_str()); } return 0; } + if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; } in >> firstCol; in >> secondCol; @@ -365,9 +360,8 @@ int RemoveSeqsCommand::readName(){ } } - if ((dups) && (validSecond.size() != parsedNames.size())) { - wroteDups = true; - for (int i = 0; i < parsedNames.size(); i++) { out2 << parsedNames[i] << endl; } + if ((dups) && (validSecond.size() != parsedNames.size())) { //if dups is true and we want to get rid of anyone, get rid of everyone + for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); } }else { //if the name in the first column is in the set then print it and any other names in second column also in set if (names.count(firstCol) == 0) { @@ -401,11 +395,6 @@ int RemoveSeqsCommand::readName(){ in.close(); out.close(); - if (dups) { out2.close(); } - if (wroteDups == false) { - remove(outputFileName2.c_str()); - }else { outputNames.push_back(outputFileName2); } - if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); remove(outputFileName.c_str()); diff --git a/tree.cpp b/tree.cpp index 400a72a..8e45981 100644 --- a/tree.cpp +++ b/tree.cpp @@ -736,20 +736,39 @@ int Tree::readTreeString(ifstream& filehandle) { //cout << " at beginning of while " << k << endl; if(c == ')') { //to pass over labels in trees - string label = readLabel(filehandle); + c=filehandle.get(); + while((c!=',') && (c != -1) && (c!= ':') && (c!=';')){ c=filehandle.get(); } + filehandle.putback(c); } - if(c == ';') { return 0; } if(c == -1) { return 0; } - //if you are a name if((c != '(') && (c != ')') && (c != ',') && (c != ':') && (c != '\n') && (c != '\t') && (c != 32)) { //32 is space - name = readName(filehandle); + name = ""; + c = filehandle.get(); + //k = c; +//cout << k << endl; + while ((c != '(') && (c != ')') && (c != ',') && (c != ':') && (c != '\n') && (c != 32) && (c != '\t')) { + name += c; + c = filehandle.get(); + //k = c; +//cout << " in name while " << k << endl; + } + +//cout << "name = " << name << endl; globaldata->Treenames.push_back(name); + filehandle.putback(c); +//k = c; +//cout << " after putback" << k << endl; } if(c == ':') { //read until you reach the end of the branch length - string bl = readBranchLength(filehandle); + while ((c != '(') && (c != ')') && (c != ',') && (c != ';') && (c != '\n') && (c != '\t') && (c != 32)) { + c = filehandle.get(); + //k = c; + //cout << " in branch while " << k << endl; + } + filehandle.putback(c); } c = filehandle.get(); @@ -770,68 +789,6 @@ int Tree::readTreeString(ifstream& filehandle) { } /*******************************************************/ -string Tree::readLabel(ifstream& filehandle) { - try { - - string label = ""; - - //to pass over labels in trees - int c=filehandle.get(); - while((c!=',') && (c != -1) && (c!= ':') && (c!=';')){ label += c; c=filehandle.get(); } - filehandle.putback(c); - - return label; - - } - catch(exception& e) { - m->errorOut(e, "Tree", "readLabel"); - exit(1); - } -} -/*******************************************************/ -string Tree::readName(ifstream& filehandle) { - try { - - string name = ""; - int c = filehandle.get(); - - while ((c != '(') && (c != ')') && (c != ',') && (c != ':') && (c != '\n') && (c != 32) && (c != '\t')) { - name += c; - c = filehandle.get(); - } - -//cout << "name = " << name << endl; - filehandle.putback(c); - - return name; - - } - catch(exception& e) { - m->errorOut(e, "Tree", "readName"); - exit(1); - } -} -/*******************************************************/ -string Tree::readBranchLength(ifstream& filehandle) { - try { - - string br = ""; - int c; - while ((c != '(') && (c != ')') && (c != ',') && (c != ';') && (c != '\n') && (c != '\t') && (c != 32)) { - br += c; - c = filehandle.get(); - } - filehandle.putback(c); - - return br; - - } - catch(exception& e) { - m->errorOut(e, "Tree", "readBranchLength"); - exit(1); - } -} /*******************************************************/ -/*******************************************************/ diff --git a/tree.h b/tree.h index 99cd68f..603f1c3 100644 --- a/tree.h +++ b/tree.h @@ -62,10 +62,7 @@ private: //not included in the tree. //only takes names from the first tree in the tree file and assumes that all trees use the same names. int readTreeString(ifstream&); - string readLabel(ifstream&); - string readName(ifstream&); - string readBranchLength(ifstream&); - + MothurOut* m; };