vector<int> topMatches;
ofstream queryFile;
- openOutputFile(queryFileName, queryFile);
+ openOutputFile((queryFileName+seq->getName()), queryFile);
queryFile << '>' << seq->getName() << endl;
queryFile << seq->getUnaligned() << endl;
queryFile.close();
+
// the goal here is to quickly survey the database to find the closest match. To do this we are using the default
// wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
// long. With this setting, it seems comparable in speed to the suffix tree approach.
string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);;
- blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
+ blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName());
system(blastCommand.c_str());
ifstream m8FileHandle;
- openInputFile(blastFileName, m8FileHandle);
+ openInputFile(blastFileName+seq->getName(), m8FileHandle, "no error");
string dummy;
int templateAccession;
topMatches.push_back(templateAccession);
}
m8FileHandle.close();
-
+ remove((queryFileName+seq->getName()).c_str());
+ remove((blastFileName+seq->getName()).c_str());
+
return topMatches;
}
catch(exception& e) {
vector<int> topMatches;
ofstream queryFile;
- openOutputFile(queryFileName, queryFile);
+ openOutputFile((queryFileName+seq->getName()), queryFile);
queryFile << '>' << seq->getName() << endl;
queryFile << seq->getUnaligned() << endl;
queryFile.close();
// long. With this setting, it seems comparable in speed to the suffix tree approach.
string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
- blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
+ blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName());
system(blastCommand.c_str());
-
+
ifstream m8FileHandle;
- openInputFile(blastFileName, m8FileHandle, "no error");
+ openInputFile(blastFileName+seq->getName(), m8FileHandle, "no error");
string dummy;
int templateAccession;
//cout << templateAccession << endl;
}
m8FileHandle.close();
+ remove((queryFileName+seq->getName()).c_str());
+ remove((blastFileName+seq->getName()).c_str());
//cout << "\n\n" ;
return topMatches;
}
string qAligned = query->getAligned();
string newQuery = "";
-
+
//sort parents by region start
sort(parents.begin(), parents.end(), compareRegionStart);
//make sure you don't cutoff beginning of query
if (parents[0].nastRegionStart > 0) { newQuery += qAligned.substr(0, parents[0].nastRegionStart); }
int longest = 0;
+
//take query and break apart into pieces using breakpoints given by results of parents
for (int i = 0; i < parents.size(); i++) {
+
int length = parents[i].nastRegionEnd - parents[i].nastRegionStart+1;
string q = qAligned.substr(parents[i].nastRegionStart, length);
Sequence* queryFrag = new Sequence(query->getName(), q);
-
+
queryParts.push_back(queryFrag);
Sequence* parent = getSequence(parents[i].parent);
parent->setAligned(p);
parentParts.push_back(parent);
-
+
if (q.length() > longest) { longest = q.length(); }
if (p.length() > longest) { longest = p.length(); }
}
-
+
//align each peice to correct parent from results
for (int i = 0; i < queryParts.size(); i++) {
alignment = new NeedlemanOverlap(-2.0, match, misMatch, longest+1); //default gapopen, match, mismatch, longestbase
Nast nast(alignment, queryParts[i], parentParts[i]);
delete alignment;
}
-
+
//recombine pieces to form new query sequence
for (int i = 0; i < queryParts.size(); i++) {
+ //sometimes the parent regions do not meet, for example region 1 may end at 1000 and region 2 starts at 1100.
+ //we don't want to loose length so in this case we will leave query alone
+ if (i != 0) {
+ int space = parents[i].nastRegionStart - parents[i-1].nastRegionEnd - 1;
+ if (space > 0) { //they don't meet and we need to add query piece
+ string q = qAligned.substr(parents[i-1].nastRegionEnd+1, space);
+ newQuery += q;
+ }
+ }
+
newQuery += queryParts[i]->getAligned();
}
-
//make sure you don't cutoff end of query
- if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) { newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd); }
+ if (parents[parents.size()-1].nastRegionEnd < (qAligned.length()-1)) { newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd+1); }
//set query to new aligned string
query->setAligned(newQuery);
int startIndex = pid * numSeqsPerProcessor;
if(pid == (processors - 1)){ numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; }
-
//align your part
driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
lines.push_back(new linePair(startPos, numSeqsPerProcessor));
}
-
createProcesses(outputFileName, fastafile, accnosFileName);
rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
else {
//valid paramters for this command
- string Array[] = {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","showabund","timing","hard","processors","outputdir","inputdir"};
+ string Array[] = {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
hard = isTrue(temp);
+ temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
+ large = isTrue(temp);
+
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
convert(temp, processors);
void ClusterSplitCommand::help(){
try {
- m->mothurOut("The cluster.split command parameter options are phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, processors. Phylip or column and name are required.\n");
+ m->mothurOut("The cluster.split command parameter options are phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Phylip or column and name are required.\n");
m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance or classify. \n");
m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
+ m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
m->mothurOut("The cluster.split command should be in the following format: \n");
m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
if (m->control_pressed) { return 0; }
time_t estart = time(NULL);
+ m->mothurOut("Splitting the distance file..."); m->mothurOutEndLine();
//split matrix into non-overlapping groups
SplitMatrix* split;
- if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod); }
- else { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod); }
+ if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
+ else { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
split->split();
ifstream in;
openInputFile(filename, in);
+ in >> tag; gobble(in);
+
while(!in.eof()) {
string tempName;
in >> tempName; gobble(in);
while(!in2.eof()) {
string tempName;
- in2 >> tempName; gobble(in);
+ in2 >> tempName; gobble(in2);
if (labels.count(tempName) == 0) { labels.insert(tempName); }
}
in2.close();
if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
+ m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
+
//****************** merge list file and create rabund and sabund files ******************************//
+ estart = time(NULL);
+ m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
+
ListVector* listSingle;
map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
- m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
+ m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
m->mothurOutEndLine();
m->mothurOut("Output File Names: "); m->mothurOutEndLine();
}
}
//**********************************************************************************************************************
-map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string> userLabels, ListVector*& listSingle){
+map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
try {
map<float, int> labelBin;
if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
else if (*it == "unique") { temp = -1.0; }
-
+
orderFloat.push_back(temp);
labelBin[temp] = numSingleBins; //initialize numbins
}
string filename = toString(getpid()) + ".temp";
ofstream out;
openOutputFile(filename, out);
+ out << tag << endl;
for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl; }
out.close();
string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, distfile, format, showabund, timing, splitmethod, taxFile;
double cutoff, splitcutoff;
int precision, length, processors, taxLevelCutoff;
- bool print_start, abort, hard;
+ bool print_start, abort, hard, large;
time_t start;
ofstream outList, outRabund, outSabund;
int createProcesses(vector < vector < map<string, string> > >);
vector<string> cluster(vector< map<string, string> >, set<string>&);
int mergeLists(vector<string>, map<float, int>, ListVector*);
- map<float, int> completeListFile(vector<string>, string, set<string>, ListVector*&);
+ map<float, int> completeListFile(vector<string>, string, set<string>&, ListVector*&);
};
#endif
./splitmatrix.o : splitmatrix.cpp\r
$(CC) $(CC_OPTIONS) splitmatrix.cpp -c $(INCLUDE) -o ./splitmatrix.o\r
\r
-# Item # 207 -- splitmatrix --\r
+# Item # 207 -- clustersplit --\r
./clustersplitcommand.o : clustersplitcommand.cpp\r
$(CC) $(CC_OPTIONS) clustersplitcommand.cpp -c $(INCLUDE) -o ./clustersplitcommand.o\r
\r
-# Item # 207 -- splitmatrix --\r
+# Item # 208 -- classifyotu --\r
./classifyotucommand.o : classifyotucommand.cpp\r
$(CC) $(CC_OPTIONS) classifyotucommand.cpp -c $(INCLUDE) -o ./classifyotucommand.o\r
\r
string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
- Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
- Sequence* queryRight = new Sequence(q->getName(), rightQuery);
+ Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery);
+ Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery);
vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1);
vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1);
-
- //if ((tempIndexesRight.size() != (num+1)) || (tempIndexesLeft.size() != (num+1))) { m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to porcess sequence " + q->getName()); m->mothurOutEndLine(); return refResults; }
+
+ //if ((tempIndexesRight.size() == 0) && (tempIndexesLeft.size() == 0)) { m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to process sequence " + q->getName()); m->mothurOutEndLine(); return refResults; }
vector<int> smaller;
vector<int> larger;
}else{
T->tree[n].setBranchLength(0.0);
}
-
+
+ //to account for multifurcating trees generated by fasttree, we are forcing them to be bifurcating
+ /* if(f.peek() == ','){
+
+ //force this node to be left child and read new rc
+ T->tree[n].setChildren(lc,rc);
+ T->tree[lc].setParent(n);
+ T->tree[rc].setParent(n);
+
+ n++;
+
+
+
+ }*/
+
T->tree[n].setChildren(lc,rc);
T->tree[lc].setParent(n);
T->tree[rc].setParent(n);
SplitAbundCommand::SplitAbundCommand(string option) {
try {
abort = false;
- wroteRareList = false;
- wroteAbundList = false;
allLines = 1;
//allow user to run help
else {
//valid paramters for this command
- string Array[] = {"list","name","group","label","accnos","cutoff","outputdir","inputdir"};
+ string Array[] = {"name","group","label","accnos","groups","fasta","cutoff","outputdir","inputdir"}; //"list",
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
if (path == "") { parameters["group"] = inputDir + it->second; }
}
+ it = parameters.find("fasta");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["fasta"] = inputDir + it->second; }
+ }
+
it = parameters.find("name");
//user has given a template file
if(it != parameters.end()){
//check for required parameters
listfile = validParameter.validFile(parameters, "list", true);
if (listfile == "not open") { abort = true; }
- else if (listfile == "not found") { listfile = ""; }
+ else if (listfile == "not found") { listfile = ""; }
+ else{ inputFile = listfile; }
- //check for required parameters
namefile = validParameter.validFile(parameters, "name", true);
if (namefile == "not open") { abort = true; }
else if (namefile == "not found") { namefile = ""; }
+ else{ inputFile = namefile; }
+
+ fastafile = validParameter.validFile(parameters, "fasta", true);
+ if (fastafile == "not open") { abort = true; }
+ else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the split.abund command. "); m->mothurOutEndLine(); abort = true; }
groupfile = validParameter.validFile(parameters, "group", true);
- if (groupfile == "not open") { abort = true; }
+ if (groupfile == "not open") { groupfile = ""; abort = true; }
else if (groupfile == "not found") { groupfile = ""; }
else {
groupMap = new GroupMap(groupfile);
int error = groupMap->readMap();
if (error == 1) { abort = true; }
+
}
+ groups = validParameter.validFile(parameters, "groups", false);
+ if (groups == "not found") { groups = ""; }
+ else if (groups == "all") {
+ if (groupfile != "") { Groups = groupMap->namesOfGroups; }
+ else { m->mothurOut("You cannot select groups without a valid groupfile, I will disregard your groups selection. "); m->mothurOutEndLine(); groups = ""; }
+ }else {
+ splitAtDash(groups, Groups);
+ }
+
+ if ((groupfile == "") && (groups != "")) { m->mothurOut("You cannot select groups without a valid groupfile, I will disregard your groups selection. "); m->mothurOutEndLine(); groups = ""; Groups.clear(); }
+
//do you have all files needed
if ((listfile == "") && (namefile == "")) { m->mothurOut("You must either a listfile or a namefile for the split.abund command. "); m->mothurOutEndLine(); abort = true; }
- if ((listfile != "") && (namefile != "")) { m->mothurOut("You must either a listfile or a namefile for the split.abund command, but NOT BOTH. "); m->mothurOutEndLine(); abort = true; }
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
//**********************************************************************************************************************
void SplitAbundCommand::help(){
try {
- m->mothurOut("The split.abund command reads a list or a names file splits the sequences into rare and abundant groups.. \n");
- m->mothurOut("The split.abund command parameters are list, name, cutoff, group, label and accnos.\n");
- m->mothurOut("The list or name parameter is required, and you must provide a cutoff value.\n");
+ m->mothurOut("The split.abund command reads a fasta file and a list or a names file splits the sequences into rare and abundant groups. \n");
+ m->mothurOut("The split.abund command parameters are list, name, cutoff, group, label, groups and accnos.\n");
+ m->mothurOut("The fasta and a list or name parameter are required, and you must provide a cutoff value.\n");
m->mothurOut("The cutoff parameter is used to qualify what is abundant and rare.\n");
m->mothurOut("The group parameter allows you to parse a group file into rare and abundant groups.\n");
m->mothurOut("The label parameter is used to read specific labels in your listfile you want to use.\n");
m->mothurOut("The accnos parameter allows you to output a .rare.accnos and .abund.accnos files to use with the get.seqs and remove.seqs commands.\n");
+ m->mothurOut("The groups parameter allows you to parse the files into rare and abundant files by group. \n");
+ m->mothurOut("For example if you set groups=A-B-C, you will get a .A.abund, .A.rare, .B.abund, .B.rare, .C.abund, .C.rare files. \n");
+ m->mothurOut("If you want .abund and .rare files for all groups, set groups=all. \n");
m->mothurOut("The split.abund command should be used in the following format: split.abund(list=yourListFile, group=yourGroupFile, label=yourLabels, cutoff=yourCutoff).\n");
m->mothurOut("Example: split.abundt(list=abrecovery.fn.list, group=abrecovery.groups, label=0.03, cutoff=2).\n");
m->mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListfile).\n\n");
if (abort == true) { return 0; }
- if (namefile != "") { split(); }
- else {
+ if (listfile != "") { //you are using a listfile to determine abundance
//remove old files so you can append later....
string fileroot = outputDir + getRootName(getSimpleName(listfile));
- remove((fileroot + "rare.list").c_str());
- remove((fileroot + "abund.list").c_str());
+ if (Groups.size() == 0) {
+ remove((fileroot + "rare.list").c_str());
+ remove((fileroot + "abund.list").c_str());
+
+ wroteListFile["rare"] = false;
+ wroteListFile["abund"] = false;
+ }else{
+ for (int i=0; i<Groups.size(); i++) {
+ remove((fileroot + Groups[i] + ".rare.list").c_str());
+ remove((fileroot + Groups[i] + ".abund.list").c_str());
+
+ wroteListFile[(Groups[i] + ".rare")] = false;
+ wroteListFile[(Groups[i] + ".abund")] = false;
+ }
+ }
//if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
set<string> processedLabels;
list = input->getListVector();
string lastLabel = list->getLabel();
+ //do you have a namefile or do we need to similate one?
+ if (namefile != "") { readNamesFile(); }
+ else { createNameMap(list); }
+
if (m->control_pressed) { delete input; delete list; for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
if(allLines == 1 || labels.count(list->getLabel()) == 1){
m->mothurOut(list->getLabel()); m->mothurOutEndLine();
- split(list);
+ splitList(list);
processedLabels.insert(list->getLabel());
userLabels.erase(list->getLabel());
list = input->getListVector(lastLabel); //get new list vector to process
m->mothurOut(list->getLabel()); m->mothurOutEndLine();
- split(list);
+ splitList(list);
processedLabels.insert(list->getLabel());
userLabels.erase(list->getLabel());
list = input->getListVector(lastLabel); //get new list vector to process
m->mothurOut(list->getLabel()); m->mothurOutEndLine();
- split(list);
+ splitList(list);
delete list;
}
delete input;
+ for (map<string, bool>::iterator itBool = wroteListFile.begin(); itBool != wroteListFile.end(); itBool++) {
+ string filename = fileroot + itBool->first;
+ if (itBool->second) { //we wrote to this file
+ outputNames.push_back(filename);
+ }else{
+ remove(filename.c_str());
+ }
+ }
+
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+
+
+ }else { //you are using the namefile to determine abundance
+
+ splitNames();
+ writeNames();
- if (wroteAbundList) { outputNames.push_back(fileroot + "abund.list"); }
- if (wroteRareList) { outputNames.push_back(fileroot + "rare.list"); }
+ string tag = "";
+ if (groupfile != "") { parseGroup(tag); }
+ if (accnos) { writeAccnos(tag); }
+ if (fastafile != "") { parseFasta(tag); }
}
-
-
+
m->mothurOutEndLine();
m->mothurOut("Output File Names: "); m->mothurOutEndLine();
for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
}
}
/**********************************************************************************************************************/
-int SplitAbundCommand::split(ListVector* thisList) {
+int SplitAbundCommand::splitList(ListVector* thisList) {
try {
-
- SAbundVector* sabund = new SAbundVector();
- *sabund = thisList->getSAbundVector();
+ rareNames.clear();
+ abundNames.clear();
- //find out how many bins are rare and how many are abundant so you can process the list vector one bin at a time
- // and don't have to store the bins until you are done with the whole vector, this save alot of space.
- int numRareBins = 0;
- for (int i = 0; i <= sabund->getMaxRank(); i++) {
- if (i > cutoff) { break; }
- numRareBins += sabund->get(i);
- }
- int numAbundBins = thisList->getNumBins() - numRareBins;
- delete sabund;
+ //get rareNames and abundNames
+ for (int i = 0; i < thisList->getNumBins(); i++) {
+ if (m->control_pressed) { return 0; }
+
+ string bin = thisList->get(i);
+
+ vector<string> names;
+ splitAtComma(bin, names); //parses bin into individual sequence names
+ int size = names.size();
+
+ if (size <= cutoff) {
+ for (int j = 0; j < names.size(); j++) { rareNames.insert(names[j]); }
+ }else{
+ for (int j = 0; j < names.size(); j++) { abundNames.insert(names[j]); }
+ }
+ }//end for
+
+ writeList(thisList);
+
+ string tag = thisList->getLabel() + ".";
+ if (groupfile != "") { parseGroup(tag); }
+ if (accnos) { writeAccnos(tag); }
+ if (fastafile != "") { parseFasta(tag); }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitAbundCommand", "splitList");
+ exit(1);
+ }
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::writeList(ListVector* thisList) {
+ try {
+
+ map<string, ofstream*> filehandles;
- //setup output files
- ofstream outListAbund;
- ofstream outListRare;
- ofstream outGroupRare;
- ofstream outGroupAbund;
- ofstream outAccnosRare;
- ofstream outAccnosAbund;
+ if (Groups.size() == 0) {
+ SAbundVector* sabund = new SAbundVector();
+ *sabund = thisList->getSAbundVector();
- string fileroot = outputDir + getRootName(getSimpleName(listfile));
- if (numRareBins > 0) {
- wroteRareList = true;
- string listRareName = fileroot + "rare.list";
- openOutputFileAppend(listRareName, outListRare);
- outListRare << thisList->getLabel() << '\t' << numRareBins << '\t';
+ //find out how many bins are rare and how many are abundant so you can process the list vector one bin at a time
+ // and don't have to store the bins until you are done with the whole vector, this save alot of space.
+ int numRareBins = 0;
+ for (int i = 0; i <= sabund->getMaxRank(); i++) {
+ if (i > cutoff) { break; }
+ numRareBins += sabund->get(i);
+ }
+ int numAbundBins = thisList->getNumBins() - numRareBins;
+ delete sabund;
+
+ ofstream aout;
+ ofstream rout;
+
+ if (rareNames.size() != 0) {
+ string rare = outputDir + getRootName(getSimpleName(listfile)) + ".rare.list";
+ wroteListFile["rare"] = true;
+ openOutputFileAppend(rare, rout);
+ rout << thisList->getLabel() << '\t' << numRareBins << '\t';
+ }
- if (accnos) {
- string accnosName = fileroot + thisList->getLabel() + ".rare.accnos";
- openOutputFile(accnosName, outAccnosRare);
- outputNames.push_back(accnosName);
+ if (abundNames.size() != 0) {
+ string abund = outputDir + getRootName(getSimpleName(listfile)) + ".abund.list";
+ wroteListFile["abund"] = true;
+ openOutputFileAppend(abund, aout);
+ rout << thisList->getLabel() << '\t' << numAbundBins << '\t';
}
+
+ for (int i = 0; i < thisList->getNumBins(); i++) {
+ if (m->control_pressed) { break; }
+
+ string bin = list->get(i);
- if (groupfile != "") {
- string groupFileName = outputDir + getRootName(getSimpleName(groupfile)) + thisList->getLabel() + ".rare.group";
- openOutputFile(groupFileName, outGroupRare);
- outputNames.push_back(groupFileName);
+ int size = getNumNames(bin);
+
+ if (size <= cutoff) { rout << bin << '\t'; }
+ else { aout << bin << '\t'; }
}
- }
-
- if (numAbundBins > 0) {
- wroteAbundList = true;
- string listAbundName = fileroot + "abund.list";
- openOutputFileAppend(listAbundName, outListAbund);
- outListAbund << thisList->getLabel() << '\t' << numAbundBins << '\t';
- if (accnos) {
- string accnosName = fileroot + thisList->getLabel() + ".abund.accnos";
- openOutputFile(accnosName, outAccnosAbund);
- outputNames.push_back(accnosName);
+ if (rareNames.size() != 0) { rout << endl; rout.close(); }
+ if (abundNames.size() != 0) { aout << endl; aout.close(); }
+
+ }else{ //parse names by abundance and group
+ string fileroot = outputDir + getRootName(getSimpleName(listfile));
+ ofstream* temp;
+ ofstream* temp2;
+ map<string, bool> wroteFile;
+ map<string, ofstream*> filehandles;
+ map<string, ofstream*>::iterator it3;
+
+ for (int i=0; i<Groups.size(); i++) {
+ temp = new ofstream;
+ filehandles[Groups[i]+".rare"] = temp;
+ temp2 = new ofstream;
+ filehandles[Groups[i]+".abund"] = temp2;
+
+ openOutputFileAppend(fileroot + Groups[i] + ".rare.list", *(filehandles[Groups[i]+".rare"]));
+ openOutputFileAppend(fileroot + Groups[i] + ".abund.list", *(filehandles[Groups[i]+".abund"]));
}
- if (groupfile != "") {
- string groupFileName = outputDir + getRootName(getSimpleName(groupfile)) + thisList->getLabel() + ".abund.group";
- openOutputFile(groupFileName, outGroupAbund);
- outputNames.push_back(groupFileName);
+ map<string, string> groupVector;
+ map<string, string>::iterator itGroup;
+ map<string, int> groupNumBins;
+
+ for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+ groupNumBins[it3->first] = 0;
+ groupVector[it3->first] = "";
}
- }
- for (int i = 0; i < thisList->getNumBins(); i++) {
- if (m->control_pressed) { break; }
-
- string bin = list->get(i);
+ for (int i = 0; i < thisList->getNumBins(); i++) {
+ if (m->control_pressed) { break; }
- int size = getNumNames(bin);
+ map<string, string> groupBins;
+ string bin = list->get(i);
- if (size <= cutoff) { outListRare << bin << '\t'; }
- else { outListAbund << bin << '\t'; }
-
- if ((groupfile != "") || (accnos)) { //you need to parse the bin...
vector<string> names;
splitAtComma(bin, names); //parses bin into individual sequence names
-
+
//parse bin into list of sequences in each group
for (int j = 0; j < names.size(); j++) {
-
- //write to accnos file
- if (accnos) {
- if (size <= cutoff) { outAccnosRare << names[j] << endl; }
- else { outAccnosAbund << names[j] << endl; }
+ string rareAbund;
+ if (rareNames.count(names[j]) != 0) { //you are a rare name
+ rareAbund = ".rare";
+ }else{ //you are a abund name
+ rareAbund = ".abund";
}
- //write to groupfile
- if (groupfile != "") {
- string group = groupMap->getGroup(names[j]);
-
- if (group == "not found") { //error in groupfile so close and remove output file and disregard groupfile
- m->mothurOut(names[j] + " is not in your groupfile. disregarding groupfile."); m->mothurOutEndLine();
- delete groupMap;
- if (numAbundBins > 0) {
- outGroupAbund.close();
- remove((outputDir + getRootName(getSimpleName(groupfile)) + thisList->getLabel() + ".abund.group").c_str());
- }
- if (numRareBins > 0) {
- outGroupRare.close();
- remove((outputDir + getRootName(getSimpleName(groupfile)) + thisList->getLabel() + ".rare.group").c_str());
- }
- groupfile = "";
- }else {
- if (size <= cutoff) { outGroupRare << names[j] << '\t' << group << endl; }
- else { outGroupAbund << names[j] << '\t' << group << endl; }
+ string group = groupMap->getGroup(names[j]);
+
+ if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+ itGroup = groupBins.find(group+rareAbund);
+ if(itGroup == groupBins.end()) {
+ groupBins[group+rareAbund] = names[j]; //add first name
+ groupNumBins[group+rareAbund]++;
+ }else{ //add another name
+ groupBins[group+rareAbund] += "," + names[j];
}
+ }else if(group == "not found") {
+ m->mothurOut(names[j] + " is not in your groupfile. Ignoring."); m->mothurOutEndLine();
}
-
- }//end for names
- }//end if parse
- }//end for list
-
-
- //close files
- if (numRareBins > 0) {
- outListRare << endl;
- outListRare.close();
- if (accnos) { outAccnosRare.close(); }
- if (groupfile != "") { outGroupRare.close(); }
+ }
+
+
+ for (itGroup = groupBins.begin(); itGroup != groupBins.end(); itGroup++) {
+ groupVector[itGroup->first] += itGroup->second + '\t';
+ }
+ }
+
+ //end list vector
+ for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+ (*(filehandles[it3->first])) << thisList->getLabel() << '\t' << groupNumBins[it3->first] << '\t' << groupVector[it3->first] << endl; // label numBins listvector for that group
+ wroteListFile[it3->first] = true;
+ (*(filehandles[it3->first])).close();
+ delete it3->second;
+ }
}
- if (numAbundBins > 0) {
- outListAbund << endl;
- outListAbund.close();
- if (accnos) { outAccnosAbund.close(); }
- if (groupfile != "") { outGroupAbund.close(); }
- }
-
return 0;
}
catch(exception& e) {
- m->errorOut(e, "SplitAbundCommand", "split");
+ m->errorOut(e, "SplitAbundCommand", "writeList");
exit(1);
}
}
/**********************************************************************************************************************/
-int SplitAbundCommand::split() { //namefile
+int SplitAbundCommand::splitNames() { //namefile
try {
- //setup output files
- ofstream outNameAbund;
- ofstream outNameRare;
- ofstream outGroupRare;
- ofstream outGroupAbund;
- ofstream outAccnosRare;
- ofstream outAccnosAbund;
- bool wroteNameAbund = false;
- bool wroteNameRare = false;
- bool wroteGroupRare = false;
- bool wroteGroupAbund = false;
- bool wroteAccnosRare = false;
- bool wroteAccnosAbund = false;
-
- //prepare output files
- string fileroot = outputDir + getRootName(getSimpleName(namefile));
-
- string nameRareName = fileroot + "rare.names";
- openOutputFile(nameRareName, outNameRare);
- string nameAbundName = fileroot + "abund.names";
- openOutputFile(nameAbundName, outNameAbund);
+ rareNames.clear();
+ abundNames.clear();
- if (accnos) {
- string accnosName = fileroot + "rare.accnos";
- openOutputFile(accnosName, outAccnosRare);
+ //open input file
+ ifstream in;
+ openInputFile(namefile, in);
+
+ while (!in.eof()) {
+ if (m->control_pressed) { break; }
- accnosName = fileroot + "abund.accnos";
- openOutputFile(accnosName, outAccnosAbund);
- }
+ string firstCol, secondCol;
+ in >> firstCol >> secondCol; gobble(in);
- if (groupfile != "") {
- string groupFileName = outputDir + getRootName(getSimpleName(groupfile)) + ".rare.group";
- openOutputFile(groupFileName, outGroupRare);
+ nameMap[firstCol] = secondCol;
- groupFileName = outputDir + getRootName(getSimpleName(groupfile)) + ".abund.group";
- openOutputFile(groupFileName, outGroupAbund);
+ int size = getNumNames(secondCol);
+
+ if (size <= cutoff) {
+ rareNames.insert(firstCol);
+ }else{
+ abundNames.insert(firstCol);
+ }
}
-
-
+ in.close();
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitAbundCommand", "splitNames");
+ exit(1);
+ }
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::readNamesFile() {
+ try {
//open input file
ifstream in;
openInputFile(namefile, in);
string firstCol, secondCol;
in >> firstCol >> secondCol; gobble(in);
- int size = getNumNames(secondCol);
+ nameMap[firstCol] = secondCol;
+ }
+ in.close();
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitAbundCommand", "readNamesFile");
+ exit(1);
+ }
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::createNameMap(ListVector* thisList) {
+ try {
+
+ if (thisList != NULL) {
+ for (int i = 0; i < thisList->getNumBins(); i++) {
+ if (m->control_pressed) { return 0; }
+
+ string bin = thisList->get(i);
+
+ vector<string> names;
+ splitAtComma(bin, names); //parses bin into individual sequence names
- if (size <= cutoff) { outNameRare << firstCol << '\t' << secondCol << endl; wroteNameRare = true; }
- else { outNameAbund << firstCol << '\t' << secondCol << endl; wroteNameAbund = true; }
+ for (int j = 0; j < names.size(); j++) { nameMap[names[j]] = names[j]; }
+ }//end for
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitAbundCommand", "createNameMap");
+ exit(1);
+ }
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::writeNames() { //namefile
+ try {
+
+ map<string, ofstream*> filehandles;
+
+ if (Groups.size() == 0) {
+ ofstream aout;
+ ofstream rout;
+
+ if (rareNames.size() != 0) {
+ string rare = outputDir + getRootName(getSimpleName(namefile)) + "rare.names";
+ openOutputFile(rare, rout);
+ outputNames.push_back(rare);
+
+ for (set<string>::iterator itRare = rareNames.begin(); itRare != rareNames.end(); itRare++) {
+ rout << (*itRare) << '\t' << nameMap[(*itRare)] << endl;
+ }
+ rout.close();
+ }
+
+ if (abundNames.size() != 0) {
+ string abund = outputDir + getRootName(getSimpleName(namefile)) + "abund.names";
+ openOutputFile(abund, aout);
+ outputNames.push_back(abund);
+
+ for (set<string>::iterator itAbund = abundNames.begin(); itAbund != abundNames.end(); itAbund++) {
+ aout << (*itAbund) << '\t' << nameMap[(*itAbund)] << endl;
+ }
+ aout.close();
+ }
+
+ }else{ //parse names by abundance and group
+ string fileroot = outputDir + getRootName(getSimpleName(namefile));
+ ofstream* temp;
+ ofstream* temp2;
+ map<string, bool> wroteFile;
+ map<string, ofstream*> filehandles;
+ map<string, ofstream*>::iterator it3;
+ for (int i=0; i<Groups.size(); i++) {
+ temp = new ofstream;
+ filehandles[Groups[i]+".rare"] = temp;
+ temp2 = new ofstream;
+ filehandles[Groups[i]+".abund"] = temp2;
+
+ openOutputFile(fileroot + Groups[i] + ".rare.names", *(filehandles[Groups[i]+".rare"]));
+ openOutputFile(fileroot + Groups[i] + ".abund.names", *(filehandles[Groups[i]+".abund"]));
+
+ wroteFile[Groups[i] + ".rare"] = false;
+ wroteFile[Groups[i] + ".abund"] = false;
+ }
- if ((groupfile != "") || (accnos)) { //you need to parse the bin...
+ for (map<string, string>::iterator itName = nameMap.begin(); itName != nameMap.end(); itName++) {
vector<string> names;
- splitAtComma(secondCol, names); //parses bin into individual sequence names
+ splitAtComma(itName->second, names); //parses bin into individual sequence names
- //parse bin into list of sequences in each group
- for (int j = 0; j < names.size(); j++) {
+ string rareAbund;
+ if (rareNames.count(itName->first) != 0) { //you are a rare name
+ rareAbund = ".rare";
+ }else{ //you are a abund name
+ rareAbund = ".abund";
+ }
+
+ map<string, string> outputStrings;
+ map<string, string>::iterator itout;
+ for (int i = 0; i < names.size(); i++) {
- //write to accnos file
- if (accnos) {
- if (size <= cutoff) { outAccnosRare << names[j] << endl; wroteAccnosRare = true; }
- else { outAccnosAbund << names[j] << endl; wroteAccnosAbund = true; }
+ string group = groupMap->getGroup(names[i]);
+
+ if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+ itout = outputStrings.find(group+rareAbund);
+ if (itout == outputStrings.end()) {
+ outputStrings[group+rareAbund] = names[i] + '\t' + names[i];
+ }else { outputStrings[group+rareAbund] += "," + names[i]; }
+ }else if(group == "not found") {
+ m->mothurOut(names[i] + " is not in your groupfile. Ignoring."); m->mothurOutEndLine();
}
+ }
+
+ for (itout = outputStrings.begin(); itout != outputStrings.end(); itout++) {
+ *(filehandles[itout->first]) << itout->second << endl;
+ wroteFile[itout->first] = true;
+ }
+ }
+
+
+ for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+ (*(filehandles[it3->first])).close();
+ if (wroteFile[it3->first] == true) { outputNames.push_back(fileroot + it3->first + ".names"); }
+ else { remove((it3->first).c_str()); }
+ delete it3->second;
+ }
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitAbundCommand", "writeNames");
+ exit(1);
+ }
+}
+/**********************************************************************************************************************/
+//just write the unique names - if a namesfile is given
+int SplitAbundCommand::writeAccnos(string tag) {
+ try {
+
+ map<string, ofstream*> filehandles;
+
+ if (Groups.size() == 0) {
+ ofstream aout;
+ ofstream rout;
+
+ if (rareNames.size() != 0) {
+ string rare = outputDir + getRootName(getSimpleName(inputFile)) + tag + "rare.accnos";
+ openOutputFile(rare, rout);
+ outputNames.push_back(rare);
+
+ for (set<string>::iterator itRare = rareNames.begin(); itRare != rareNames.end(); itRare++) {
+ rout << (*itRare) << endl;
+ }
+ rout.close();
+ }
+
+ if (abundNames.size() != 0) {
+ string abund = outputDir + getRootName(getSimpleName(inputFile)) + tag + "abund.accnos";
+ openOutputFile(abund, aout);
+ outputNames.push_back(abund);
+
+ for (set<string>::iterator itAbund = abundNames.begin(); itAbund != abundNames.end(); itAbund++) {
+ aout << (*itAbund) << endl;
+ }
+ aout.close();
+ }
+ }else{ //parse names by abundance and group
+ string fileroot = outputDir + getRootName(getSimpleName(inputFile));
+ ofstream* temp;
+ ofstream* temp2;
+ map<string, bool> wroteFile;
+ map<string, ofstream*> filehandles;
+ map<string, ofstream*>::iterator it3;
+
+ for (int i=0; i<Groups.size(); i++) {
+ temp = new ofstream;
+ filehandles[Groups[i]+".rare"] = temp;
+ temp2 = new ofstream;
+ filehandles[Groups[i]+".abund"] = temp2;
+
+ openOutputFile(fileroot + tag + Groups[i] + ".rare.accnos", *(filehandles[Groups[i]+".rare"]));
+ openOutputFile(fileroot + tag + Groups[i] + ".abund.accnos", *(filehandles[Groups[i]+".abund"]));
+
+ wroteFile[Groups[i] + ".rare"] = false;
+ wroteFile[Groups[i] + ".abund"] = false;
+ }
+
+ //write rare
+ for (set<string>::iterator itRare = rareNames.begin(); itRare != rareNames.end(); itRare++) {
+ string group = groupMap->getGroup(*itRare);
- //write to groupfile
- if (groupfile != "") {
- string group = groupMap->getGroup(names[j]);
+ if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+ *(filehandles[group+".rare"]) << *itRare << endl;
+ wroteFile[group+".rare"] = true;
+ }
+ }
+
+ //write abund
+ for (set<string>::iterator itAbund = abundNames.begin(); itAbund != abundNames.end(); itAbund++) {
+ string group = groupMap->getGroup(*itAbund);
- if (group == "not found") { //error in groupfile so close and remove output file and disregard groupfile
- m->mothurOut(names[j] + " is not in your groupfile. disregarding groupfile."); m->mothurOutEndLine();
- delete groupMap;
-
- outGroupAbund.close();
- remove((outputDir + getRootName(getSimpleName(groupfile)) + ".abund.group").c_str());
- outGroupRare.close();
- remove((outputDir + getRootName(getSimpleName(groupfile)) + ".rare.group").c_str());
-
- groupfile = "";
- wroteGroupRare = false;
- wroteGroupAbund = false;
- }else {
- if (size <= cutoff) { outGroupRare << names[j] << '\t' << group << endl; wroteGroupRare = true; }
- else { outGroupAbund << names[j] << '\t' << group << endl; wroteGroupAbund = true; }
+ if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+ *(filehandles[group+".abund"]) << *itAbund << endl;
+ wroteFile[group+".abund"] = true;
+ }
+ }
+
+ //close files
+ for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+ (*(filehandles[it3->first])).close();
+ if (wroteFile[it3->first] == true) { outputNames.push_back(fileroot + tag + it3->first + ".accnos"); }
+ else { remove((fileroot + tag + it3->first + ".accnos").c_str()); }
+ delete it3->second;
+ }
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitAbundCommand", "writeAccnos");
+ exit(1);
+ }
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::parseGroup(string tag) { //namefile
+ try {
+
+ map<string, ofstream*> filehandles;
+
+ if (Groups.size() == 0) {
+ ofstream aout;
+ ofstream rout;
+
+ if (rareNames.size() != 0) {
+ string rare = outputDir + getRootName(getSimpleName(groupfile)) + tag + "rare.groups";
+ openOutputFile(rare, rout);
+ outputNames.push_back(rare);
+ }
+
+ if (abundNames.size() != 0) {
+ string abund = outputDir + getRootName(getSimpleName(groupfile)) + tag + "abund.groups";
+ openOutputFile(abund, aout);
+ outputNames.push_back(abund);
+ }
+
+
+ for (map<string, string>::iterator itName = nameMap.begin(); itName != nameMap.end(); itName++) {
+ vector<string> names;
+ splitAtComma(itName->second, names); //parses bin into individual sequence names
+
+ for (int i = 0; i < names.size(); i++) {
+
+ string group = groupMap->getGroup(names[i]);
+
+ if (group == "not found") {
+ m->mothurOut(names[i] + " is not in your groupfile, ignoring, please correct."); m->mothurOutEndLine();
+ }else {
+ if (rareNames.count(itName->first) != 0) { //you are a rare name
+ rout << names[i] << '\t' << group << endl;
+ }else{ //you are a abund name
+ rout << names[i] << '\t' << group << endl;
}
}
-
- }//end for names
- }//end if parse
- }//end while
+ }
+ }
+
+ if (rareNames.size() != 0) { rout.close(); }
+ if (abundNames.size() != 0) { aout.close(); }
+
+ }else{ //parse names by abundance and group
+ string fileroot = outputDir + getRootName(getSimpleName(groupfile));
+ ofstream* temp;
+ ofstream* temp2;
+ map<string, bool> wroteFile;
+ map<string, ofstream*> filehandles;
+ map<string, ofstream*>::iterator it3;
+
+ for (int i=0; i<Groups.size(); i++) {
+ temp = new ofstream;
+ filehandles[Groups[i]+".rare"] = temp;
+ temp2 = new ofstream;
+ filehandles[Groups[i]+".abund"] = temp2;
+
+ openOutputFile(fileroot + tag + Groups[i] + ".rare.groups", *(filehandles[Groups[i]+".rare"]));
+ openOutputFile(fileroot + tag + Groups[i] + ".abund.groups", *(filehandles[Groups[i]+".abund"]));
+
+ wroteFile[Groups[i] + ".rare"] = false;
+ wroteFile[Groups[i] + ".abund"] = false;
+ }
+
+ for (map<string, string>::iterator itName = nameMap.begin(); itName != nameMap.end(); itName++) {
+ vector<string> names;
+ splitAtComma(itName->second, names); //parses bin into individual sequence names
+
+ string rareAbund;
+ if (rareNames.count(itName->first) != 0) { //you are a rare name
+ rareAbund = ".rare";
+ }else{ //you are a abund name
+ rareAbund = ".abund";
+ }
+
+ for (int i = 0; i < names.size(); i++) {
+
+ string group = groupMap->getGroup(names[i]);
+
+ if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+ *(filehandles[group+rareAbund]) << names[i] << '\t' << group << endl;
+ wroteFile[group+rareAbund] = true;
+ }
+ }
+ }
+
+ for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+ (*(filehandles[it3->first])).close();
+ if (wroteFile[it3->first] == true) { outputNames.push_back(fileroot + tag + it3->first + ".groups"); }
+ else { remove((fileroot + tag + it3->first + ".groups").c_str()); }
+ delete it3->second;
+ }
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitAbundCommand", "parseGroups");
+ exit(1);
+ }
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::parseFasta(string tag) { //namefile
+ try {
+ map<string, ofstream*> filehandles;
- //close files
- in.close();
- outNameRare.close();
- outNameAbund.close();
- if (!wroteNameRare) { remove((fileroot + "rare.names").c_str()); }
- else { outputNames.push_back((fileroot + "rare.names")); }
- if (!wroteNameAbund) { remove((fileroot + "abund.names").c_str()); }
- else { outputNames.push_back((fileroot + "abund.names")); }
+ if (Groups.size() == 0) {
+ ofstream aout;
+ ofstream rout;
+
+ if (rareNames.size() != 0) {
+ string rare = outputDir + getRootName(getSimpleName(fastafile)) + tag + "rare.fasta";
+ openOutputFile(rare, rout);
+ outputNames.push_back(rare);
+ }
+
+ if (abundNames.size() != 0) {
+ string abund = outputDir + getRootName(getSimpleName(fastafile)) + tag + "abund.fasta";
+ openOutputFile(abund, aout);
+ outputNames.push_back(abund);
+ }
+
+
+ //open input file
+ ifstream in;
+ openInputFile(fastafile, in);
+
+ while (!in.eof()) {
+ if (m->control_pressed) { break; }
- if (groupfile != "") {
- outGroupRare.close(); outGroupAbund.close();
- if (!wroteGroupRare) { remove((outputDir + getRootName(getSimpleName(groupfile)) + ".rare.group").c_str()); }
- else { outputNames.push_back((outputDir + getRootName(getSimpleName(groupfile)) + ".rare.group")); }
- if (!wroteGroupAbund) { remove((outputDir + getRootName(getSimpleName(groupfile)) + ".abund.group").c_str()); }
- else { outputNames.push_back((outputDir + getRootName(getSimpleName(groupfile)) + ".abund.group")); }
- }
+ Sequence seq(in); gobble(in);
+
+ if (seq.getName() != "") {
+
+ map<string, string>::iterator itNames;
+
+ itNames = nameMap.find(seq.getName());
+
+ if (itNames == nameMap.end()) {
+ m->mothurOut(seq.getName() + " is not in your namesfile, ignoring."); m->mothurOutEndLine();
+ }else{
+ if (rareNames.count(seq.getName()) != 0) { //you are a rare name
+ seq.printSequence(rout);
+ }else{ //you are a abund name
+ seq.printSequence(aout);
+ }
+ }
+ }
+ }
+ in.close();
+ if (rareNames.size() != 0) { rout.close(); }
+ if (abundNames.size() != 0) { aout.close(); }
+
+ }else{ //parse names by abundance and group
+ string fileroot = outputDir + getRootName(getSimpleName(fastafile));
+ ofstream* temp;
+ ofstream* temp2;
+ map<string, bool> wroteFile;
+ map<string, ofstream*> filehandles;
+ map<string, ofstream*>::iterator it3;
+
+ for (int i=0; i<Groups.size(); i++) {
+ temp = new ofstream;
+ filehandles[Groups[i]+".rare"] = temp;
+ temp2 = new ofstream;
+ filehandles[Groups[i]+".abund"] = temp2;
+
+ openOutputFile(fileroot + tag + Groups[i] + ".rare.fasta", *(filehandles[Groups[i]+".rare"]));
+ openOutputFile(fileroot + tag + Groups[i] + ".abund.fasta", *(filehandles[Groups[i]+".abund"]));
+
+ wroteFile[Groups[i] + ".rare"] = false;
+ wroteFile[Groups[i] + ".abund"] = false;
+ }
+
+ //open input file
+ ifstream in;
+ openInputFile(fastafile, in);
- if (accnos) {
- outAccnosAbund.close(); outAccnosRare.close();
- if (!wroteAccnosRare) { remove((fileroot + "rare.accnos").c_str()); }
- else { outputNames.push_back((fileroot + "rare.accnos")); }
- if (!wroteAccnosAbund) { remove((fileroot + "abund.accnos").c_str()); }
- else { outputNames.push_back((fileroot + "abund.accnos")); }
+ while (!in.eof()) {
+ if (m->control_pressed) { break; }
+
+ Sequence seq(in); gobble(in);
+
+ if (seq.getName() != "") {
+ map<string, string>::iterator itNames = nameMap.find(seq.getName());
+
+ if (itNames == nameMap.end()) {
+ m->mothurOut(seq.getName() + " is not in your namesfile, ignoring."); m->mothurOutEndLine();
+ }else{
+ vector<string> names;
+ splitAtComma(itNames->second, names); //parses bin into individual sequence names
+
+ string rareAbund;
+ if (rareNames.count(itNames->first) != 0) { //you are a rare name
+ rareAbund = ".rare";
+ }else{ //you are a abund name
+ rareAbund = ".abund";
+ }
+
+ for (int i = 0; i < names.size(); i++) {
+
+ string group = groupMap->getGroup(seq.getName());
+
+ if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+ seq.printSequence(*(filehandles[group+rareAbund]));
+ wroteFile[group+rareAbund] = true;
+ }else if(group == "not found") {
+ m->mothurOut(seq.getName() + " is not in your groupfile. Ignoring."); m->mothurOutEndLine();
+ }
+ }
+ }
+ }
+ }
+ in.close();
+
+ for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+ (*(filehandles[it3->first])).close();
+ if (wroteFile[it3->first] == true) { outputNames.push_back(fileroot + tag + it3->first + ".fasta"); }
+ else { remove((fileroot + tag + it3->first + ".fasta").c_str()); }
+ delete it3->second;
+ }
}
-
+
return 0;
}
catch(exception& e) {
- m->errorOut(e, "SplitAbundCommand", "split");
+ m->errorOut(e, "SplitAbundCommand", "parseFasta");
exit(1);
}
}
-
/**********************************************************************************************************************/
-
private:
- int split(ListVector*);
- int split(); //namefile
+ int splitList(ListVector*);
+ int splitNames(); //namefile
+ int writeNames();
+ int writeList(ListVector*);
+ int writeAccnos(string);
+ int parseGroup(string);
+ int parseFasta(string);
+ int readNamesFile(); //namefile
+ int createNameMap(ListVector*);
vector<string> outputNames;
ListVector* list;
GroupMap* groupMap;
InputData* input;
- string outputDir, listfile, namefile, groupfile, label;
- set<string> labels;
- bool abort, allLines, accnos, wroteRareList, wroteAbundList;
+ string outputDir, listfile, namefile, groupfile, label, groups, fastafile, inputFile;
+ set<string> labels, rareNames, abundNames;
+ vector<string> Groups;
+ bool abort, allLines, accnos;
int cutoff;
+ map<string, bool> wroteListFile;
+ map<string, string> nameMap;
/***********************************************************************/
-SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t){
+SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
m = MothurOut::getInstance();
distFile = distfile;
cutoff = c;
namefile = name;
method = t;
taxFile = tax;
+ large = l;
}
/***********************************************************************/
int SplitMatrix::splitDistance(){
try {
- vector<set<string> > groups;
+ if (large) { splitDistanceLarge(); }
+ else { splitDistanceRAM(); }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitMatrix", "splitDistance");
+ exit(1);
+ }
+}
+
+/***********************************************************************/
+int SplitMatrix::splitClassify(){
+ try {
+ cutoff = int(cutoff);
+
+ map<string, int> seqGroup;
+ map<string, int>::iterator it;
+ map<string, int>::iterator it2;
+
+ int numGroups = 0;
+
+ //build tree from users taxonomy file
+ PhyloTree* phylo = new PhyloTree();
+
+ ifstream in;
+ openInputFile(taxFile, in);
+
+ //read in users taxonomy file and add sequences to tree
+ string seqname, tax;
+ while(!in.eof()){
+ in >> seqname >> tax; gobble(in);
+
+ phylo->addSeqToTree(seqname, tax);
+ }
+ in.close();
+
+ phylo->assignHeirarchyIDs(0);
+
+ //make sure the cutoff is not greater than maxlevel
+ if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
+
+ //for each node in tree
+ for (int i = 0; i < phylo->getNumNodes(); i++) {
+
+ //is this node within the cutoff
+ TaxNode taxon = phylo->get(i);
+
+ if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
+ if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
+ for (int j = 0; j < taxon.accessions.size(); j++) {
+ seqGroup[taxon.accessions[j]] = numGroups;
+ }
+ numGroups++;
+ }
+ }
+ }
+
+ ifstream dFile;
+ openInputFile(distFile, dFile);
+ ofstream outFile;
+
+ for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+ remove((distFile + "." + toString(i) + ".temp").c_str());
+ }
+
//for buffering the io to improve speed
//allow for 10 dists to be stored, then output.
+ vector<string> outputs; outputs.resize(numGroups, "");
+ vector<int> numOutputs; numOutputs.resize(numGroups, 0);
+
+ //for each distance
+ while(dFile){
+ string seqA, seqB;
+ float dist;
+
+ if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
+
+ dFile >> seqA >> seqB >> dist; gobble(dFile);
+
+ //if both sequences are in the same group then they are within the cutoff
+ it = seqGroup.find(seqA);
+ it2 = seqGroup.find(seqB);
+
+ if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
+ if (it->second == it2->second) { //they are from the same group so add the distance
+ if (numOutputs[it->second] > 10) {
+ openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
+ outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
+ outFile.close();
+ outputs[it->second] = "";
+ numOutputs[it->second] = 0;
+ }else{
+ outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
+ numOutputs[it->second]++;
+ }
+ }
+ }
+ }
+ dFile.close();
+
+ for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+ remove((namefile + "." + toString(i) + ".temp").c_str());
+
+ //write out any remaining buffers
+ if (numOutputs[it->second] > 0) {
+ openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
+ outFile << outputs[i];
+ outFile.close();
+ outputs[i] = "";
+ numOutputs[i] = 0;
+ }
+ }
+
+ ifstream bigNameFile;
+ openInputFile(namefile, bigNameFile);
+
+ singleton = namefile + ".extra.temp";
+ ofstream remainingNames;
+ openOutputFile(singleton, remainingNames);
+
+ bool wroteExtra = false;
+
+ string name, nameList;
+ while(!bigNameFile.eof()){
+ bigNameFile >> name >> nameList; gobble(bigNameFile);
+
+ //did this sequence get assigned a group
+ it = seqGroup.find(name);
+
+ if (it != seqGroup.end()) {
+ openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
+ outFile << name << '\t' << nameList << endl;
+ outFile.close();
+ }else{
+ wroteExtra = true;
+ remainingNames << name << '\t' << nameList << endl;
+ }
+ }
+ bigNameFile.close();
+ remainingNames.close();
+
+ if (!wroteExtra) {
+ remove(singleton.c_str());
+ singleton = "none";
+ }
+
+ for(int i=0;i<numGroups;i++){
+ string tempNameFile = namefile + "." + toString(i) + ".temp";
+ string tempDistFile = distFile + "." + toString(i) + ".temp";
+
+ map<string, string> temp;
+ temp[tempDistFile] = tempNameFile;
+ dists.push_back(temp);
+ }
+
+ if (m->control_pressed) {
+ for (int i = 0; i < dists.size(); i++) {
+ remove((dists[i].begin()->first).c_str());
+ remove((dists[i].begin()->second).c_str());
+ }
+ dists.clear();
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitMatrix", "splitClassify");
+ exit(1);
+ }
+}
+/***********************************************************************/
+int SplitMatrix::splitDistanceLarge(){
+ try {
+ vector<set<string> > groups;
+
+ //for buffering the io to improve speed
+ //allow for 30 dists to be stored, then output.
vector<string> outputs;
vector<int> numOutputs;
vector<bool> wroteOutPut;
dFile >> seqA >> seqB >> dist;
- if (m->control_pressed) { outFile.close(); dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
+ if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
if(dist < cutoff){
//cout << "in cutoff: " << dist << endl;
int groupIDA = -1;
int groupIDB = -1;
int groupID = -1;
- int prevGroupID = -1;
for(int i=0;i<numGroups;i++){
set<string>::iterator aIt = groups[i].find(seqA);
newGroup.insert(seqB);
groups.push_back(newGroup);
- outFile.close();
- string fileName = distFile + "." + toString(numGroups) + ".temp";
- outFile.open(fileName.c_str(), ios::ate);
-
string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
outputs.push_back(tempOut);
numOutputs.push_back(1);
}
else{
string fileName = distFile + "." + toString(groupID) + ".temp";
-
- if(groupID != prevGroupID){
- outFile.close();
- outFile.open(fileName.c_str(), ios::app);
- prevGroupID = groupID;
- }
-
+
//have we reached the max buffer size
- if (numOutputs[groupID] > 10) { //write out sequence
+ if (numOutputs[groupID] > 60) { //write out sequence
+ outFile.open(fileName.c_str(), ios::app);
outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
+ outFile.close();
+
outputs[groupID] = "";
numOutputs[groupID] = 0;
wroteOutPut[groupID] = true;
if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
string row, column, distance;
if(groupIDA<groupIDB){
-
+
+ //merge memory
numOutputs[groupID] += numOutputs[groupIDB];
outputs[groupID] += outputs[groupIDB];
+ outputs[groupIDB] = "";
+ numOutputs[groupIDB] = 0;
+
+ //if groupB is written to file it is above buffer size so read and write to new merged file
if (wroteOutPut[groupIDB]) {
- string fileName = distFile + "." + toString(groupIDB) + ".temp";
- ifstream fileB(fileName.c_str(), ios::ate);
+ string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
+ ifstream fileB(fileName2.c_str(), ios::ate);
+
+ outFile.open(fileName.c_str(), ios::app);
long size;
char* memblock;
delete memblock;
fileB.close();
- remove(fileName.c_str());
+ remove(fileName2.c_str());
+
+ //write out the merged memory
+ if (numOutputs[groupID] > 60) {
+ outFile << outputs[groupID];
+ outputs[groupID] = "";
+ numOutputs[groupID] = 0;
+ }
+
+ outFile.close();
wroteOutPut[groupID] = true;
wroteOutPut[groupIDB] = false;
- }
-
- if (numOutputs[groupID] != 0) {
- outFile << outputs[groupID];
- wroteOutPut[groupID] = true;
- outputs[groupID] = "";
- numOutputs[groupID] = 0;
-
- outputs[groupIDB] = "";
- numOutputs[groupIDB] = 0;
- }
-
+ }else{ } //just merge b's memory with a's memory
}
else{
numOutputs[groupID] += numOutputs[groupIDA];
outputs[groupID] += outputs[groupIDA];
+ outputs[groupIDA] = "";
+ numOutputs[groupIDA] = 0;
+
if (wroteOutPut[groupIDA]) {
- string fileName = distFile + "." + toString(groupIDA) + ".temp";
- ifstream fileB(fileName.c_str(), ios::ate);
+ string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
+ ifstream fileB(fileName2.c_str(), ios::ate);
+
+ outFile.open(fileName.c_str(), ios::app);
long size;
char* memblock;
delete memblock;
fileB.close();
- remove(fileName.c_str());
+ remove(fileName2.c_str());
+
+ //write out the merged memory
+ if (numOutputs[groupID] > 60) {
+ outFile << outputs[groupID];
+ outputs[groupID] = "";
+ numOutputs[groupID] = 0;
+ }
+
+ outFile.close();
wroteOutPut[groupID] = true;
wroteOutPut[groupIDA] = false;
- }
-
- if (numOutputs[groupID] != 0) {
- outFile << outputs[groupID];
- wroteOutPut[groupID] = true;
- outputs[groupID] = "";
- numOutputs[groupID] = 0;
-
- outputs[groupIDA] = "";
- numOutputs[groupIDA] = 0;
- }
-
+ }else { } //just merge memory
}
}
}
}
gobble(dFile);
}
- outFile.close();
dFile.close();
for (int i = 0; i < numGroups; i++) {
outFile.close();
}
}
+
+ splitNames(groups);
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
+ exit(1);
+ }
+}
+//********************************************************************************************************************
+int SplitMatrix::splitNames(vector<set<string> >& groups){
+ try {
+ int numGroups = groups.size();
ifstream bigNameFile(namefile.c_str());
if(!bigNameFile){
}
return 0;
-
}
catch(exception& e) {
- m->errorOut(e, "SplitMatrix", "splitDistance");
+ m->errorOut(e, "SplitMatrix", "splitNames");
exit(1);
}
}
-
-/***********************************************************************/
-int SplitMatrix::splitClassify(){
+//********************************************************************************************************************
+int SplitMatrix::splitDistanceRAM(){
try {
- cutoff = int(cutoff);
-
- map<string, int> seqGroup;
- map<string, int>::iterator it;
- map<string, int>::iterator it2;
+ vector<set<string> > groups;
+ vector<string> outputs;
int numGroups = 0;
-
- //build tree from users taxonomy file
- PhyloTree* phylo = new PhyloTree();
-
- ifstream in;
- openInputFile(taxFile, in);
-
- //read in users taxonomy file and add sequences to tree
- string seqname, tax;
- while(!in.eof()){
- in >> seqname >> tax; gobble(in);
-
- phylo->addSeqToTree(seqname, tax);
- }
- in.close();
-
- phylo->assignHeirarchyIDs(0);
-
- //make sure the cutoff is not greater than maxlevel
- if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
-
- //for each node in tree
- for (int i = 0; i < phylo->getNumNodes(); i++) {
-
- //is this node within the cutoff
- TaxNode taxon = phylo->get(i);
-
- if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
- if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
- for (int j = 0; j < taxon.accessions.size(); j++) {
- seqGroup[taxon.accessions[j]] = numGroups;
- }
- numGroups++;
- }
- }
- }
ifstream dFile;
openInputFile(distFile, dFile);
- ofstream outFile;
-
- for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
- remove((distFile + "." + toString(i) + ".temp").c_str());
- }
-
-
- //for buffering the io to improve speed
- //allow for 10 dists to be stored, then output.
- vector<string> outputs; outputs.resize(numGroups, "");
- vector<int> numOutputs; numOutputs.resize(numGroups, 0);
-
- //for each distance
+
while(dFile){
string seqA, seqB;
float dist;
+
+ dFile >> seqA >> seqB >> dist;
- if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
-
- dFile >> seqA >> seqB >> dist; gobble(dFile);
-
- //if both sequences are in the same group then they are within the cutoff
- it = seqGroup.find(seqA);
- it2 = seqGroup.find(seqB);
-
- if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
- if (it->second == it2->second) { //they are from the same group so add the distance
- if (numOutputs[it->second] > 10) {
- openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
- outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
- outFile.close();
- outputs[it->second] = "";
- numOutputs[it->second] = 0;
- }else{
- outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
- numOutputs[it->second]++;
+ if (m->control_pressed) { dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
+
+ if(dist < cutoff){
+ //cout << "in cutoff: " << dist << endl;
+ int groupIDA = -1;
+ int groupIDB = -1;
+ int groupID = -1;
+
+ for(int i=0;i<numGroups;i++){
+ set<string>::iterator aIt = groups[i].find(seqA);
+ set<string>::iterator bIt = groups[i].find(seqB);
+
+ if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
+ groups[i].insert(seqB);
+ groupIDA = i;
+ groupID = groupIDA;
+
+ //cout << "in aIt: " << groupID << endl;
+ // break;
+ }
+ else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
+ groups[i].insert(seqA);
+ groupIDB = i;
+ groupID = groupIDB;
+
+ // cout << "in bIt: " << groupID << endl;
+ // break;
+ }
+
+ if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
+ if(groupIDA < groupIDB){
+ // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
+ groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
+ groups[groupIDB].clear();
+ groupID = groupIDA;
+ }
+ else{
+ // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
+ groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
+ groups[groupIDA].clear();
+ groupID = groupIDB;
+ }
+ break;
+ }
+ }
+
+ //windows is gonna gag on the reuse of outFile, will need to make it local...
+
+ if(groupIDA == -1 && groupIDB == -1){ //we need a new group
+ set<string> newGroup;
+ newGroup.insert(seqA);
+ newGroup.insert(seqB);
+ groups.push_back(newGroup);
+
+ string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
+ outputs.push_back(tempOut);
+ numGroups++;
+ }
+ else{
+
+ outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
+
+ if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
+ string row, column, distance;
+ if(groupIDA<groupIDB){
+ //merge memory
+ outputs[groupID] += outputs[groupIDB];
+ outputs[groupIDB] = "";
+ }else{
+ outputs[groupID] += outputs[groupIDA];
+ outputs[groupIDA] = "";
+ }
}
}
}
+ gobble(dFile);
}
dFile.close();
-
- for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
- remove((namefile + "." + toString(i) + ".temp").c_str());
-
- //write out any remaining buffers
- if (numOutputs[it->second] > 0) {
- openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
- outFile << outputs[i];
- outFile.close();
- outputs[i] = "";
- numOutputs[i] = 0;
- }
- }
- ifstream bigNameFile;
- openInputFile(namefile, bigNameFile);
-
- singleton = namefile + ".extra.temp";
- ofstream remainingNames;
- openOutputFile(singleton, remainingNames);
-
- bool wroteExtra = false;
-
- string name, nameList;
- while(!bigNameFile.eof()){
- bigNameFile >> name >> nameList; gobble(bigNameFile);
-
- //did this sequence get assigned a group
- it = seqGroup.find(name);
-
- if (it != seqGroup.end()) {
- openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
- outFile << name << '\t' << nameList << endl;
+ for (int i = 0; i < numGroups; i++) {
+ if (outputs[i] != "") {
+ ofstream outFile;
+ string fileName = distFile + "." + toString(i) + ".temp";
+ outFile.open(fileName.c_str(), ios::ate);
+ outFile << outputs[i];
outFile.close();
- }else{
- wroteExtra = true;
- remainingNames << name << '\t' << nameList << endl;
}
}
- bigNameFile.close();
- remainingNames.close();
-
- if (!wroteExtra) {
- remove(singleton.c_str());
- singleton = "none";
- }
-
- for(int i=0;i<numGroups;i++){
- string tempNameFile = namefile + "." + toString(i) + ".temp";
- string tempDistFile = distFile + "." + toString(i) + ".temp";
+
+ splitNames(groups);
- map<string, string> temp;
- temp[tempDistFile] = tempNameFile;
- dists.push_back(temp);
- }
-
- if (m->control_pressed) {
- for (int i = 0; i < dists.size(); i++) {
- remove((dists[i].begin()->first).c_str());
- remove((dists[i].begin()->second).c_str());
- }
- dists.clear();
- }
-
- return 0;
-
+ return 0;
}
catch(exception& e) {
- m->errorOut(e, "SplitMatrix", "splitClassify");
+ m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
exit(1);
}
}
public:
- SplitMatrix(string, string, string, float, string); //column formatted distance file, namesfile, cutoff, method
+ SplitMatrix(string, string, string, float, string, bool); //column formatted distance file, namesfile, cutoff, method
~SplitMatrix();
int split();
vector< map<string, string> > getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
string distFile, namefile, singleton, method, taxFile;
vector< map< string, string> > dists;
float cutoff;
+ bool large;
int splitDistance();
int splitClassify();
+ int splitDistanceLarge();
+ int splitDistanceRAM();
+ int splitNames(vector<set<string> >& groups);
};
/******************************************************/
//cout << " at beginning of while " << k << endl;
if(c == ')') {
//to pass over labels in trees
- c=filehandle.get();
- while((c!=',') && (c != -1) && (c!= ':') && (c!=';')){ c=filehandle.get(); }
- filehandle.putback(c);
+ string label = readLabel(filehandle);
}
+
if(c == ';') { return 0; }
if(c == -1) { return 0; }
+
//if you are a name
if((c != '(') && (c != ')') && (c != ',') && (c != ':') && (c != '\n') && (c != '\t') && (c != 32)) { //32 is space
- name = "";
- c = filehandle.get();
- //k = c;
-//cout << k << endl;
- while ((c != '(') && (c != ')') && (c != ',') && (c != ':') && (c != '\n') && (c != 32) && (c != '\t')) {
- name += c;
- c = filehandle.get();
- //k = c;
-//cout << " in name while " << k << endl;
- }
-
-//cout << "name = " << name << endl;
+ name = readName(filehandle);
globaldata->Treenames.push_back(name);
- filehandle.putback(c);
-//k = c;
-//cout << " after putback" << k << endl;
}
if(c == ':') { //read until you reach the end of the branch length
- while ((c != '(') && (c != ')') && (c != ',') && (c != ';') && (c != '\n') && (c != '\t') && (c != 32)) {
- c = filehandle.get();
- //k = c;
- //cout << " in branch while " << k << endl;
- }
- filehandle.putback(c);
+ string bl = readBranchLength(filehandle);
}
c = filehandle.get();
}
/*******************************************************/
+string Tree::readLabel(ifstream& filehandle) {
+ try {
+
+ string label = "";
+
+ //to pass over labels in trees
+ int c=filehandle.get();
+ while((c!=',') && (c != -1) && (c!= ':') && (c!=';')){ label += c; c=filehandle.get(); }
+ filehandle.putback(c);
+
+ return label;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Tree", "readLabel");
+ exit(1);
+ }
+}
+/*******************************************************/
+string Tree::readName(ifstream& filehandle) {
+ try {
+
+ string name = "";
+ int c = filehandle.get();
+
+ while ((c != '(') && (c != ')') && (c != ',') && (c != ':') && (c != '\n') && (c != 32) && (c != '\t')) {
+ name += c;
+ c = filehandle.get();
+ }
+
+//cout << "name = " << name << endl;
+ filehandle.putback(c);
+
+ return name;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Tree", "readName");
+ exit(1);
+ }
+}
+/*******************************************************/
+string Tree::readBranchLength(ifstream& filehandle) {
+ try {
+
+ string br = "";
+ int c;
+ while ((c != '(') && (c != ')') && (c != ',') && (c != ';') && (c != '\n') && (c != '\t') && (c != 32)) {
+ br += c;
+ c = filehandle.get();
+ }
+ filehandle.putback(c);
+
+ return br;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Tree", "readBranchLength");
+ exit(1);
+ }
+}
/*******************************************************/
+/*******************************************************/
//not included in the tree.
//only takes names from the first tree in the tree file and assumes that all trees use the same names.
int readTreeString(ifstream&);
+ string readLabel(ifstream&);
+ string readName(ifstream&);
+ string readBranchLength(ifstream&);
+
MothurOut* m;
};