5 * Created by westcott on 5/19/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "splitmatrix.h"
11 #include "phylotree.h"
13 /***********************************************************************/
// Constructs a SplitMatrix from five arguments: three strings (distfile, name, tax),
// a float (c) and a string (t).
// NOTE(review): judging by the members the methods below read (distFile, namefile,
// taxFile, cutoff, method), these are presumably the distance file, name file,
// taxonomy file, split cutoff and splitting method — confirm against the full body.
15 SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t){
// Cache the MothurOut instance; the methods below use it for messages, error
// reporting and the control_pressed interrupt flag.
16 m = MothurOut::getInstance();
24 /***********************************************************************/
// Entry point for splitting: chooses the strategy from the `method` member.
// Recognised values are "distance" and "classify".
// NOTE(review): presumably these branches call splitDistance() and splitClassify()
// respectively — confirm.
// Any other value is reported to the user, and the unsplit distFile -> namefile pair is
// appended to `dists`.
// Exceptions are handed to m->errorOut() tagged with the class and method name.
26 int SplitMatrix::split(){
29 if (method == "distance") {
31 }else if (method == "classify") {
// Unknown method: tell the user, then fall back to registering the original,
// unsplit files as a single distance-file -> name-file entry.
34 m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
35 map<string, string> temp;
36 temp[distFile] = namefile;
37 dists.push_back(temp);
43 m->errorOut(e, "SplitMatrix", "split");
47 /***********************************************************************/
// Splits the distance file into one temp distance file per group of linked sequences,
// writes a matching temp name file per group, and records each
// (temp distance file -> temp name file) pair in `dists`.
//
// Temp files are named distFile + "." + <group index> + ".temp" and
// namefile + "." + <group index> + ".temp". Names left over after the groups are written
// go to namefile + ".extra.temp", and `singleton` is set to that path (or to "none").
//
// Returns 0 when the user interrupts the run (m->control_pressed).
// NOTE(review): the leftover debug output below ("in cutoff") suggests only pairs within
// `cutoff` are grouped — confirm against the full loop.
48 int SplitMatrix::splitDistance(){
// groups[i] holds the sequence names currently assigned to group i. A group that is
// merged into another is cleared but keeps its slot, so indices stay stable and
// empty groups are skipped later via size() checks.
51 vector<set<string> > groups;
56 openInputFile(distFile, dFile);
// Each record of the distance file is: seqA seqB dist.
62 dFile >> seqA >> seqB >> dist;
// User interrupt: close both streams, delete the temp distance file of every
// non-empty group, and bail out with 0.
64 if (m->control_pressed) { outFile.close(); dFile.close(); for(int i=0;i<numGroups;i++){ if(groups[i].size() > 0){ remove((distFile + "." + toString(i) + ".temp").c_str()); } } return 0; }
67 //cout << "in cutoff: " << dist << endl;
// Look for either sequence of the pair in the existing groups. A group ID of -1
// means "not assigned yet".
73 for(int i=0;i<numGroups;i++){
74 set<string>::iterator aIt = groups[i].find(seqA);
75 set<string>::iterator bIt = groups[i].find(seqB);
77 if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
78 groups[i].insert(seqB);
82 //cout << "in aIt: " << groupID << endl;
85 else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
86 groups[i].insert(seqA);
90 // cout << "in bIt: " << groupID << endl;
// The pair links two different existing groups: fold the higher-numbered group into
// the lower-numbered one and empty the group that was absorbed.
94 if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
95 if(groupIDA < groupIDB){
96 // cout << "A: " << groupIDA << "\t" << groupIDB << endl;
97 groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
98 groups[groupIDB].clear();
102 // cout << "B: " << groupIDA << "\t" << groupIDB << endl;
103 groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
104 groups[groupIDA].clear();
111 //windows is gonna gag on the reuse of outFile, will need to make it local...
// Neither sequence belongs to a group yet: start a new group holding both and begin
// its temp distance file with this record.
113 if(groupIDA == -1 && groupIDB == -1){ //we need a new group
114 set<string> newGroup;
115 newGroup.insert(seqA);
116 newGroup.insert(seqB);
117 groups.push_back(newGroup);
120 string fileName = distFile + "." + toString(numGroups) + ".temp";
// NOTE(review): an ofstream opened with ios::ate (without ios::in or ios::app) still
// truncates an existing file, so this starts the group's file from scratch.
121 outFile.open(fileName.c_str(), ios::ate);
123 outFile << seqA << '\t' << seqB << '\t' << dist << endl;
// Otherwise append the record to the temp file of the group the pair belongs to.
// The shared outFile stream is only re-opened when the target group differs from the
// one written last (prevGroupID).
127 string fileName = distFile + "." + toString(groupID) + ".temp";
128 if(groupID != prevGroupID){
130 outFile.open(fileName.c_str(), ios::app);
131 prevGroupID = groupID;
133 outFile << seqA << '\t' << seqB << '\t' << dist << endl;
// Two groups were merged above: copy the records of the absorbed group's temp file
// into outFile, then delete the absorbed group's file.
135 if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
136 string row, column, distance;
137 if(groupIDA<groupIDB){
138 string fileName = distFile + "." + toString(groupIDB) + ".temp";
139 ifstream fileB(fileName.c_str());
141 fileB >> row >> column >> distance;
142 outFile << row << '\t' << column << '\t' << distance << endl;
146 remove(fileName.c_str());
149 string fileName = distFile + "." + toString(groupIDA) + ".temp";
150 ifstream fileA(fileName.c_str());
152 fileA >> row >> column >> distance;
153 outFile << row << '\t' << column << '\t' << distance << endl;
157 remove(fileName.c_str());
// Second phase: load the whole name file into memory (name -> name list).
167 ifstream bigNameFile(namefile.c_str());
// NOTE(review): this error goes to cerr, whereas the rest of the class reports through
// m->mothurOut — confirm that is intended.
169 cerr << "Error: We can't open the name file\n";
173 map<string, string> nameMap;
174 string name, nameList;
176 bigNameFile >> name >> nameList;
177 nameMap[name] = nameList;
182 for(int i=0;i<numGroups;i++){ //parse names file to match distance files
183 int numSeqsInGroup = groups[i].size();
// Only groups that still have members get a temp name file.
185 if(numSeqsInGroup > 0){
186 string fileName = namefile + "." + toString(i) + ".temp";
187 ofstream smallNameFile(fileName.c_str(), ios::ate);
189 for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
190 map<string,string>::iterator nIt = nameMap.find(*gIt);
192 if (nIt != nameMap.end()) {
193 smallNameFile << nIt->first << '\t' << nIt->second << endl;
// A sequence present in the distance file but absent from the name file is fatal.
// NOTE(review): exit(1) terminates the whole process from inside this method.
196 m->mothurOut((*gIt) + " is in your distance file and not in your namefile. Please correct."); m->mothurOutEndLine(); exit(1);
199 smallNameFile.close();
// Whatever is still in nameMap at this point is written to namefile + ".extra.temp".
// NOTE(review): this only yields the singletons if matched names are erased from
// nameMap in the loop above — confirm.
203 //names of singletons
204 if (nameMap.size() != 0) {
205 singleton = namefile + ".extra.temp";
206 ofstream remainingNames(singleton.c_str(), ios::ate);
207 for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
208 remainingNames << nIt->first << '\t' << nIt->second << endl;
210 remainingNames.close();
211 }else { singleton = "none"; }
// Register one (temp distance file -> temp name file) entry per non-empty group.
213 for(int i=0;i<numGroups;i++){
214 if(groups[i].size() > 0){
215 string tempNameFile = namefile + "." + toString(i) + ".temp";
216 string tempDistFile = distFile + "." + toString(i) + ".temp";
218 map<string, string> temp;
219 temp[tempDistFile] = tempNameFile;
220 dists.push_back(temp);
// User interrupt after the files were produced: delete every registered distance and
// name temp file.
224 if (m->control_pressed) {
225 for (int i = 0; i < dists.size(); i++) {
226 remove((dists[i].begin()->first).c_str());
227 remove((dists[i].begin()->second).c_str());
235 catch(exception& e) {
236 m->errorOut(e, "SplitMatrix", "splitDistance");
241 /***********************************************************************/
// Splits the distance and name files by taxonomy. Sequences that share a taxon at
// tree level `cutoff` form one group. A distance record is kept only when both of its
// sequences are in the same group.
//
// Temp files use the same naming as splitDistance():
// distFile + "." + <group index> + ".temp" and namefile + "." + <group index> + ".temp".
// Names that were not assigned to a group go to namefile + ".extra.temp" (`singleton`).
// Each (temp distance file -> temp name file) pair is appended to `dists`.
242 int SplitMatrix::splitClassify(){
// Drop any fractional part: the cutoff is compared against integer tree levels below.
244 cutoff = int(cutoff);
// seqGroup maps a sequence name to the index of the group it was assigned to.
// Sequences that are absent from the map are treated as singletons.
246 map<string, int> seqGroup;
247 map<string, int>::iterator it;
248 map<string, int>::iterator it2;
252 //build tree from user's taxonomy file
// NOTE(review): allocated with new; no matching delete is visible in this listing —
// confirm the tree is released.
253 PhyloTree* phylo = new PhyloTree();
256 openInputFile(taxFile, in);
258 //read in user's taxonomy file and add sequences to tree
261 in >> seqname >> tax; gobble(in);
263 phylo->addSeqToTree(seqname, tax);
267 phylo->assignHeirarchyIDs(0);
269 //make sure the cutoff is not greater than maxlevel
270 if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
272 //for each node in tree
273 for (int i = 0; i < phylo->getNumNodes(); i++) {
275 //is this node within the cutoff
276 TaxNode taxon = phylo->get(i);
278 if (taxon.level == cutoff) {//if yes, then create group containing this node's sequences
279 if (taxon.accessions.size() > 1) { //if this taxon just has one seq it's a singleton
280 for (int j = 0; j < taxon.accessions.size(); j++) {
// Every accession of this taxon gets the current group index.
// NOTE(review): presumably numGroups is advanced once per taxon after this loop — confirm.
281 seqGroup[taxon.accessions[j]] = numGroups;
289 openInputFile(distFile, dFile);
292 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
293 remove((distFile + "." + toString(i) + ".temp").c_str());
// User interrupt: close the input and delete the temp distance files.
// NOTE(review): unlike the matching check in splitDistance(), this statement does not
// return, so processing continues after the cleanup — confirm that is intended.
301 if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str()); } }
303 dFile >> seqA >> seqB >> dist; gobble(dFile);
305 //if both sequences are in the same group then they are within the cutoff
306 it = seqGroup.find(seqA);
307 it2 = seqGroup.find(seqB);
309 if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons
310 if (it->second == it2->second) { //they are from the same group so add the distance
// The group's temp file is opened in append mode for each record that is kept.
311 openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
312 outFile << seqA << '\t' << seqB << '\t' << dist << endl;
320 for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
321 remove((namefile + "." + toString(i) + ".temp").c_str());
324 ifstream bigNameFile;
325 openInputFile(namefile, bigNameFile);
// Names that belong to no group are collected in this extra file.
327 singleton = namefile + ".extra.temp";
328 ofstream remainingNames;
329 openOutputFile(singleton, remainingNames);
331 bool wroteExtra = false;
333 string name, nameList;
// NOTE(review): the loop tests eof() before reading; the gobble() call after each read
// presumably consumes trailing whitespace so eof is reached after the last record — confirm.
334 while(!bigNameFile.eof()){
335 bigNameFile >> name >> nameList; gobble(bigNameFile);
337 //did this sequence get assigned a group
338 it = seqGroup.find(name);
340 if (it != seqGroup.end()) {
341 openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
342 outFile << name << '\t' << nameList << endl;
346 remainingNames << name << '\t' << nameList << endl;
350 remainingNames.close();
// NOTE(review): presumably the extra file is only deleted when nothing was written to
// it (see wroteExtra above) — confirm the guarding condition.
353 remove(singleton.c_str());
// Register one (temp distance file -> temp name file) entry for every group index.
357 for(int i=0;i<numGroups;i++){
358 string tempNameFile = namefile + "." + toString(i) + ".temp";
359 string tempDistFile = distFile + "." + toString(i) + ".temp";
361 map<string, string> temp;
362 temp[tempDistFile] = tempNameFile;
363 dists.push_back(temp);
// User interrupt after the files were produced: delete every registered distance and
// name temp file.
366 if (m->control_pressed) {
367 for (int i = 0; i < dists.size(); i++) {
368 remove((dists[i].begin()->first).c_str());
369 remove((dists[i].begin()->second).c_str());
377 catch(exception& e) {
378 m->errorOut(e, "SplitMatrix", "splitClassify");
382 //********************************************************************************************************************
//returns the size in bytes of the named file
//a file that cannot be opened is reported with perror() and counts as 0 bytes, as does
//a file whose size cannot be determined; the handle is always closed before returning
static long splitMatrixFileSize(const std::string& filename){
	FILE* pFile = fopen(filename.c_str(), "rb");
	if (pFile == NULL) {
		const std::string error = "Error opening " + filename;
		perror(error.c_str());
		return 0;
	}

	long numBytes = 0;
	if (fseek(pFile, 0, SEEK_END) == 0) {
		numBytes = ftell(pFile);
		if (numBytes < 0) { numBytes = 0; } //ftell signals failure with -1
	}
	fclose(pFile);

	return numBytes;
}
//********************************************************************************************************************
//sorts biggest to smallest
//comparator for sort(): each argument is a one-entry map of distance file -> name file, and
//only the first key (the distance file) is measured
//returns true when left's distance file is strictly larger than right's
//an empty map counts as 0 bytes instead of dereferencing begin() of an empty container
inline bool compareFileSizes(std::map<std::string, std::string> left, std::map<std::string, std::string> right){
	const long leftsize = left.empty() ? 0 : splitMatrixFileSize(left.begin()->first);
	const long rightsize = right.empty() ? 0 : splitMatrixFileSize(right.begin()->first);

	return (leftsize > rightsize);
}
416 /***********************************************************************/
417 //returns map of distance files -> namefile sorted by distance file size
// Sorts the `dists` member in place with compareFileSizes, so the entry whose distance
// file is largest comes first.
// NOTE(review): presumably `dists` itself is what gets returned — confirm.
// Exceptions are handed to m->errorOut() tagged with the class and method name.
418 vector< map< string, string> > SplitMatrix::getDistanceFiles(){
421 sort(dists.begin(), dists.end(), compareFileSizes);
425 catch(exception& e) {
426 m->errorOut(e, "SplitMatrix", "getDistanceFiles");
430 /***********************************************************************/
// Destructor with an empty body: it performs no explicit cleanup.
431 SplitMatrix::~SplitMatrix(){}
432 /***********************************************************************/