5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
13 /***********************************************************************/
/* Constructor. Receives five arguments: distfile, name, tax, a float c and a string t.
 * NOTE(review): presumably these are stored in the members the other methods read
 * (distFile, namefile, taxFile, cutoff, method) -- those assignments are not visible
 * in this listing; confirm against the full file. */
15 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t){
// Grab the MothurOut instance; the other methods use it for messages, error reporting and the control_pressed flag.
16 m = MothurOut::getInstance();
24 /***********************************************************************/
/* Entry point for splitting: dispatches on the `method` member.
 * Recognised values are "distance" and "classify"; any other value is reported
 * as unknown.
 * NOTE(review): the bodies of the two recognised branches are not visible in this
 * listing -- presumably they call splitDistance() and splitClassify(); confirm. */
26 int SplitMatrix::split(){
29 if (method == "distance") {
31 }else if (method == "classify") {
// Unknown method: report it, then record the original, unsplit distFile -> namefile pair in dists.
34 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
35 map<string, string> temp;
36 temp[distFile] = namefile;
37 dists.push_back(temp);
// Exceptions are reported through MothurOut, tagged with this class and method name.
43 m->errorOut(e, "SplitMatrix", "split");
47 /***********************************************************************/
/* Splits distFile into per-group distance files and namefile into matching
 * per-group name files.
 *
 * Reads (seqA, seqB, dist) records from distFile. Sequences that appear together
 * in a record are placed in the same group; when one record links two existing
 * groups, the higher-numbered group is merged into the lower-numbered one. The
 * distances of group i are written to distFile + "." + i + ".temp", buffered in
 * memory per group. Afterwards the names of each non-empty group are written to
 * namefile + "." + i + ".temp", any names left in nameMap go to
 * namefile + ".extra.temp" (the `singleton` member, or "none"), and every
 * (temp dist file -> temp name file) pair is appended to `dists`.
 *
 * If m->control_pressed is set, the temp files created so far are removed.
 * A sequence present in the distance file but absent from the name file is
 * reported and the process exits.
 *
 * NOTE(review): several lines of this function (declarations, braces, the cutoff
 * test, buffer deallocation) are not visible in this listing; comments below only
 * describe what the visible lines do.
 */
48 int SplitMatrix::splitDistance(){
// groups[i] is the set of sequence names currently assigned to group i.
51 vector<set<string> > groups;
53 //for buffering the io to improve speed
54 //allow for 10 dists to be stored, then output.
// All three vectors are indexed by group id: pending text, number of buffered
// records, and whether the group's temp file has already received output.
55 vector<string> outputs;
56 vector<int> numOutputs;
57 vector<bool> wroteOutPut;
63 openInputFile(distFile, dFile);
// One record: first sequence name, second sequence name, distance.
69 dFile >> seqA >> seqB >> dist;
// User interrupt: close both streams, delete the temp file of every non-empty group, and stop.
71 if (m->control_pressed) { outFile.close(); dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
74 //cout << "in cutoff: " << dist << endl;
// Scan the existing groups to find which ones already contain seqA and/or seqB.
80 for(int i=0;i<numGroups;i++){
81 set<string>::iterator aIt = groups[i].find(seqA);
82 set<string>::iterator bIt = groups[i].find(seqB);
84 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
85 groups[i].insert(seqB);
89 //cout << "in aIt: " << groupID << endl;
92 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
93 groups[i].insert(seqA);
97 // cout << "in bIt: " << groupID << endl;
101 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
// The group with the smaller id absorbs the other one, which is then emptied.
102 if(groupIDA < groupIDB){
103 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
104 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
105 groups[groupIDB].clear();
109 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
110 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
111 groups[groupIDA].clear();
118 //windows is gonna gag on the reuse of outFile, will need to make it local...
120 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
121 set<string> newGroup;
122 newGroup.insert(seqA);
123 newGroup.insert(seqB);
124 groups.push_back(newGroup);
// Open the new group's temp file and start its buffer with this first record.
127 string fileName = distFile + "." + toString(numGroups) + ".temp";
128 outFile.open(fileName.c_str(), ios::ate);
130 string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
131 outputs.push_back(tempOut);
132 numOutputs.push_back(1);
133 wroteOutPut.push_back(false);
// Record belongs to an existing group: point outFile at that group's temp file.
138 string fileName = distFile + "." + toString(groupID) + ".temp";
// Only reopen (in append mode) when the target group differs from the previous record's group.
140 if(groupID != prevGroupID){
142 outFile.open(fileName.c_str(), ios::app);
143 prevGroupID = groupID;
146 //have we reached the max buffer size
147 if (numOutputs[groupID] > 10) { //write out sequence
148 outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
149 outputs[groupID] = "";
150 numOutputs[groupID] = 0;
151 wroteOutPut[groupID] = true;
// Buffer not full yet: just append this record to the group's pending text.
153 outputs[groupID] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
154 numOutputs[groupID]++;
157 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
158 string row, column, distance;
159 if(groupIDA<groupIDB){
// Fold group B's pending buffer into the surviving group's buffer.
161 numOutputs[groupID] += numOutputs[groupIDB];
162 outputs[groupID] += outputs[groupIDB];
// If group B already has data on disk, copy its temp file into outFile in 1024-byte chunks.
164 if (wroteOutPut[groupIDB]) {
165 string fileName = distFile + "." + toString(groupIDB) + ".temp";
// Opened at end-of-file so tellg() yields the file size, then rewound.
166 ifstream fileB(fileName.c_str(), ios::ate);
171 size = fileB.tellg();
173 fileB.seekg (0, ios::beg);
175 int numRead = size / 1024;
176 int lastRead = size % 1024;
178 for (int i = 0; i < numRead; i++) {
180 memblock = new char [1024];
182 fileB.read (memblock, 1024);
// read() does not null-terminate memblock, so the string must be built from an explicit length.
184 string temp(memblock, static_cast<size_t>(fileB.gcount()));
185 outFile << temp.substr(0, 1024);
// Trailing partial chunk (size % 1024 bytes).
190 memblock = new char [lastRead];
192 fileB.read (memblock, lastRead);
194 //memblock is not null-terminated; taking the length from gcount() stops the copy at the bytes actually read
195 string temp(memblock, static_cast<size_t>(fileB.gcount()));
196 outFile << temp.substr(0, lastRead);
200 remove(fileName.c_str());
202 wroteOutPut[groupID] = true;
203 wroteOutPut[groupIDB] = false;
// Flush whatever is buffered for the surviving group.
206 if (numOutputs[groupID] != 0) {
207 outFile << outputs[groupID];
208 wroteOutPut[groupID] = true;
209 outputs[groupID] = "";
210 numOutputs[groupID] = 0;
// Group B is now empty: reset its buffer.
212 outputs[groupIDB] = "";
213 numOutputs[groupIDB] = 0;
// Mirror case: group A is the one being absorbed.
218 numOutputs[groupID] += numOutputs[groupIDA];
219 outputs[groupID] += outputs[groupIDA];
221 if (wroteOutPut[groupIDA]) {
222 string fileName = distFile + "." + toString(groupIDA) + ".temp";
223 ifstream fileB(fileName.c_str(), ios::ate);
228 size = fileB.tellg();
230 fileB.seekg (0, ios::beg);
232 int numRead = size / 1024;
233 int lastRead = size % 1024;
235 for (int i = 0; i < numRead; i++) {
237 memblock = new char [1024];
239 fileB.read (memblock, 1024);
// Same fix as above: explicit length instead of relying on a terminator that is not there.
240 string temp(memblock, static_cast<size_t>(fileB.gcount()));
241 outFile << temp.substr(0, 1024);
246 memblock = new char [lastRead];
248 fileB.read (memblock, lastRead);
250 //memblock is not null-terminated; taking the length from gcount() stops the copy at the bytes actually read
251 string temp(memblock, static_cast<size_t>(fileB.gcount()));
252 outFile << temp.substr(0, lastRead);
257 remove(fileName.c_str());
259 wroteOutPut[groupID] = true;
260 wroteOutPut[groupIDA] = false;
263 if (numOutputs[groupID] != 0) {
264 outFile << outputs[groupID];
265 wroteOutPut[groupID] = true;
266 outputs[groupID] = "";
267 numOutputs[groupID] = 0;
269 outputs[groupIDA] = "";
270 numOutputs[groupIDA] = 0;
// After all records are read: append every group's remaining buffered text to its temp file.
282 for (int i = 0; i < numGroups; i++) {
283 if (numOutputs[i] > 0) {
284 string fileName = distFile + "." + toString(i) + ".temp";
285 outFile.open(fileName.c_str(), ios::app);
286 outFile << outputs[i];
// Load the whole names file into nameMap (name -> name list).
291 ifstream bigNameFile(namefile.c_str());
293 cerr << "Error: We can't open the name file\n";
297 map<string, string> nameMap;
298 string name, nameList;
300 bigNameFile >> name >> nameList;
301 nameMap[name] = nameList;
306 for(int i=0;i<numGroups;i++){ //parse names file to match distance files
307 int numSeqsInGroup = groups[i].size();
// Groups emptied by a merge are skipped.
309 if(numSeqsInGroup > 0){
310 string fileName = namefile + "." + toString(i) + ".temp";
311 ofstream smallNameFile(fileName.c_str(), ios::ate);
313 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
314 map<string,string>::iterator nIt = nameMap.find(*gIt);
315 if (nIt != nameMap.end()) {
316 smallNameFile << nIt->first << '\t' << nIt->second << endl;
// A sequence in the distance file with no entry in the name file is fatal: report and exit.
319 m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1);
322 smallNameFile.close();
326 //names of singletons
// NOTE(review): presumably entries written to a group file are erased from nameMap on a line not visible here, leaving only unassigned names -- confirm.
327 if (nameMap.size() != 0) {
328 singleton = namefile + ".extra.temp";
329 ofstream remainingNames(singleton.c_str(), ios::ate);
330 for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
331 remainingNames << nIt->first << '\t' << nIt->second << endl;
333 remainingNames.close();
334 }else { singleton = "none"; }
// Publish one (temp dist file -> temp name file) entry per non-empty group.
336 for(int i=0;i<numGroups;i++){
337 if(groups[i].size() > 0){
338 string tempNameFile = namefile + "." + toString(i) + ".temp";
339 string tempDistFile = distFile + "." + toString(i) + ".temp";
341 map<string, string> temp;
342 temp[tempDistFile] = tempNameFile;
343 dists.push_back(temp);
// User interrupt at the end: delete every dist/name temp file that was recorded.
347 if (m->control_pressed) {
348 for (int i = 0; i < dists.size(); i++) {
349 remove((dists[i].begin()->first).c_str());
350 remove((dists[i].begin()->second).c_str());
358 catch(exception& e) {
359 m->errorOut(e, "SplitMatrix", "splitDistance");
364 /***********************************************************************/
/* Splits distFile and namefile into per-group temp files using a taxonomy.
 *
 * Builds a PhyloTree from taxFile, then for every tree node whose level equals
 * `cutoff` and that holds more than one accession, assigns those accessions to a
 * group in seqGroup. A distance record is kept only when both of its sequences
 * are in the same group; it is appended (buffered) to
 * distFile + "." + group + ".temp". Names found in seqGroup are appended to
 * namefile + "." + group + ".temp"; all other names go to
 * namefile + ".extra.temp" (the `singleton` member). One
 * (temp dist file -> temp name file) entry per group is appended to `dists`.
 *
 * If m->control_pressed is set at the end, the recorded temp files are removed.
 *
 * NOTE(review): several lines of this function (declarations, braces, loop
 * headers, the numGroups increment) are not visible in this listing; comments
 * below only describe what the visible lines do.
 */
365 int SplitMatrix::splitClassify(){
// The cutoff is used as a taxonomy level, so drop any fractional part.
367 cutoff = int(cutoff);
// seqGroup maps a sequence name to the id of the group it was assigned to.
369 map<string, int> seqGroup;
370 map<string, int>::iterator it;
371 map<string, int>::iterator it2;
375 //build tree from users taxonomy file
376 PhyloTree* phylo = new PhyloTree();
379 openInputFile(taxFile, in);
381 //read in users taxonomy file and add sequences to tree
384 in >> seqname >> tax; gobble(in);
386 phylo->addSeqToTree(seqname, tax);
390 phylo->assignHeirarchyIDs(0);
392 //make sure the cutoff is not greater than maxlevel
393 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
395 //for each node in tree
396 for (int i = 0; i < phylo->getNumNodes(); i++) {
398 //is this node within the cutoff
399 TaxNode taxon = phylo->get(i);
401 if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
402 if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
403 for (int j = 0; j < taxon.accessions.size(); j++) {
404 seqGroup[taxon.accessions[j]] = numGroups;
412 openInputFile(distFile, dFile);
415 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
416 remove((distFile + "." + toString(i) + ".temp").c_str());
420 //for buffering the io to improve speed
421 //allow for 10 dists to be stored, then output.
// Both vectors are indexed by group id: pending text and number of buffered records.
422 vector<string> outputs; outputs.resize(numGroups, "");
423 vector<int> numOutputs; numOutputs.resize(numGroups, 0);
// User interrupt: close the distance file and delete the per-group dist temp files.
// NOTE(review): unlike splitDistance, no return is visible on this line -- confirm that is intended.
430 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
432 dFile >> seqA >> seqB >> dist; gobble(dFile);
434 //if both sequences are in the same group then they are within the cutoff
435 it = seqGroup.find(seqA);
436 it2 = seqGroup.find(seqB);
438 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
439 if (it->second == it2->second) { //they are from the same group so add the distance
// Buffer full: append the buffered text plus this record to the group's temp file and reset the buffer.
440 if (numOutputs[it->second] > 10) {
441 openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
442 outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
444 outputs[it->second] = "";
445 numOutputs[it->second] = 0;
// Otherwise just add the record to the group's pending text.
447 outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
448 numOutputs[it->second]++;
455 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
456 remove((namefile + "." + toString(i) + ".temp").c_str());
458 //write out any remaining buffers
// Index by the loop variable: `it` is left over from the last distance record and may be seqGroup.end().
459 if (numOutputs[i] > 0) {
460 openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
461 outFile << outputs[i];
468 ifstream bigNameFile;
469 openInputFile(namefile, bigNameFile);
// Names that belong to no group are collected in the "extra" file.
471 singleton = namefile + ".extra.temp";
472 ofstream remainingNames;
473 openOutputFile(singleton, remainingNames);
475 bool wroteExtra = false;
477 string name, nameList;
478 while(!bigNameFile.eof()){
479 bigNameFile >> name >> nameList; gobble(bigNameFile);
481 //did this sequence get assigned a group
482 it = seqGroup.find(name);
484 if (it != seqGroup.end()) {
485 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
486 outFile << name << '\t' << nameList << endl;
490 remainingNames << name << '\t' << nameList << endl;
494 remainingNames.close();
// NOTE(review): presumably only executed when nothing was written to the extra file (wroteExtra is false) -- the guarding condition is not visible; confirm.
497 remove(singleton.c_str());
// Publish one (temp dist file -> temp name file) entry per group.
501 for(int i=0;i<numGroups;i++){
502 string tempNameFile = namefile + "." + toString(i) + ".temp";
503 string tempDistFile = distFile + "." + toString(i) + ".temp";
505 map<string, string> temp;
506 temp[tempDistFile] = tempNameFile;
507 dists.push_back(temp);
// User interrupt at the end: delete every dist/name temp file that was recorded.
510 if (m->control_pressed) {
511 for (int i = 0; i < dists.size(); i++) {
512 remove((dists[i].begin()->first).c_str());
513 remove((dists[i].begin()->second).c_str());
521 catch(exception& e) {
522 m->errorOut(e, "SplitMatrix", "splitClassify");
526 //********************************************************************************************************************
527 //sorts biggest to smallest
/* Ordering predicate for sorting (dist file -> name file) maps by file size.
 * Looks at the first key of each map, opens that file in binary mode, seeks to
 * the end and uses ftell() as its size in bytes. Returns true when left's file
 * is larger than right's, so a sort with this predicate puts the biggest file
 * first. A file that cannot be opened is reported with perror().
 *
 * The maps are taken by const reference so that each comparison made by the
 * sort does not copy two maps.
 *
 * NOTE(review): the declarations of pFile/pFile2/leftsize/rightsize and the
 * branch structure around the NULL checks are not visible in this listing --
 * confirm that fseek/ftell are skipped when fopen fails.
 */
528 inline bool compareFileSizes(const map<string, string>& left, const map<string, string>& right){
533 //get num bytes in file
534 string filename = left.begin()->first;
535 pFile = fopen (filename.c_str(),"rb");
536 string error = "Error opening " + filename;
537 if (pFile==NULL) perror (error.c_str());
// Seek to the end; the resulting offset is the file length.
539 fseek (pFile, 0, SEEK_END);
540 leftsize=ftell (pFile);
547 //get num bytes in file
548 filename = right.begin()->first;
549 pFile2 = fopen (filename.c_str(),"rb");
550 error = "Error opening " + filename;
551 if (pFile2==NULL) perror (error.c_str());
553 fseek (pFile2, 0, SEEK_END);
554 rightsize=ftell (pFile2);
// "Greater than" gives a descending (largest-first) order.
558 return (leftsize > rightsize);
560 /***********************************************************************/
561 //returns map of distance files -> namefile sorted by distance file size
/* Sorts the `dists` member in place with compareFileSizes (largest distance file first).
 * Each element of dists is a map from a distance file name to its name file.
 * NOTE(review): presumably returns dists after sorting -- the return statement is
 * not visible in this listing; confirm. */
562 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
// The comparator opens the files on disk to obtain their sizes.
565 sort(dists.begin(), dists.end(), compareFileSizes);
// Exceptions are reported through MothurOut, tagged with this class and method name.
569 catch(exception& e) {
570 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
574 /***********************************************************************/
// Destructor: empty body, performs no explicit cleanup.
575 SplitMatrix::~SplitMatrix(){}
576 /***********************************************************************/