X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=readblast.cpp;h=84fddcf263cfdeb572f72ff3bd658325f31271e0;hp=7a092f96cc8b92cd245decd37d388034737a73ed;hb=a8e2df1b96a57f5f29576b08361b86a96a8eff4f;hpb=c82900be3ceed3d9bc491bdc98b1819ef85c1af7 diff --git a/readblast.cpp b/readblast.cpp index 7a092f9..84fddcf 100644 --- a/readblast.cpp +++ b/readblast.cpp @@ -12,16 +12,17 @@ //******************************************************************************************************************** //sorts lowest to highest -inline bool compareOverlap(DistNode left, DistNode right){ +inline bool compareOverlap(seqDist left, seqDist right){ return (left.dist < right.dist); } /*********************************************************************************************/ -ReadBlast::ReadBlast(string file, float c, float p, int l, bool m, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(m), hclusterWanted(h) { +ReadBlast::ReadBlast(string file, float c, float p, int l, bool ms, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(ms), hclusterWanted(h) { try { + m = MothurOut::getInstance(); matrix = NULL; } catch(exception& e) { - errorOut(e, "ReadBlast", "ReadBlast"); + m->errorOut(e, "ReadBlast", "ReadBlast"); exit(1); } } @@ -29,15 +30,17 @@ ReadBlast::ReadBlast(string file, float c, float p, int l, bool m, bool h) : bla //assumptions about the blast file: //1. if duplicate lines occur the first line is always best and is chosen //2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score... -void ReadBlast::read(NameAssignment* nameMap) { +int ReadBlast::read(NameAssignment* nameMap) { try { //if the user has not given a names file read names from blastfile if (nameMap->size() == 0) { readNames(nameMap); } int nseqs = nameMap->size(); + + if (m->control_pressed) { return 0; } ifstream fileHandle; - openInputFile(blastfile, fileHandle); + m->openInputFile(blastfile, fileHandle); string firstName, secondName, eScore, currentRow; string repeatName = ""; @@ -51,15 +54,23 @@ void ReadBlast::read(NameAssignment* nameMap) { //create objects needed for read if (!hclusterWanted) { - matrix = new SparseMatrix(); + matrix = new SparseDistanceMatrix(); + matrix->resize(nseqs); }else{ - overlapFile = getRootName(blastfile) + "overlap.dist"; - distFile = getRootName(blastfile) + "hclusterDists.dist"; + overlapFile = m->getRootName(blastfile) + "overlap.dist"; + distFile = m->getRootName(blastfile) + "hclusterDists.dist"; - openOutputFile(overlapFile, outOverlap); - openOutputFile(distFile, outDist); + m->openOutputFile(overlapFile, outOverlap); + m->openOutputFile(distFile, outDist); } - + + if (m->control_pressed) { + fileHandle.close(); + if (!hclusterWanted) { delete matrix; } + else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile); } + return 0; + } + Progress* reading = new Progress("Reading blast: ", nseqs * nseqs); //this is used to quickly find if we already have a distance for this combo @@ -69,7 +80,7 @@ void ReadBlast::read(NameAssignment* nameMap) { if (!fileHandle.eof()) { //read in line from file fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score; - gobble(fileHandle); + m->gobble(fileHandle); currentRow = firstName; lengthThisSeq = numBases; @@ -80,8 +91,8 @@ void ReadBlast::read(NameAssignment* nameMap) { //convert name to number map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } thisRowsBlastScores[itB->second] = score; @@ -91,23 +102,31 @@ void ReadBlast::read(NameAssignment* nameMap) { //if there is a valid overlap, add it if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { if (!hclusterWanted) { - DistNode overlapValue(itA->second, itB->second, thisoverlap); + seqDist overlapValue(itA->second, itB->second, thisoverlap); overlap.push_back(overlapValue); }else { outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl; } } } - }else { mothurOut("Error in your blast file, cannot read."); mothurOutEndLine(); exit(1); } + }else { m->mothurOut("Error in your blast file, cannot read."); m->mothurOutEndLine(); exit(1); } - + //read file while(!fileHandle.eof()){ + + if (m->control_pressed) { + fileHandle.close(); + if (!hclusterWanted) { delete matrix; } + else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile); } + delete reading; + return 0; + } //read in line from file fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score; //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl; - gobble(fileHandle); + m->gobble(fileHandle); string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file @@ -125,8 +144,8 @@ void ReadBlast::read(NameAssignment* nameMap) { //convert name to number map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } //save score thisRowsBlastScores[itB->second] = score; @@ -137,7 +156,7 @@ void ReadBlast::read(NameAssignment* nameMap) { //if there is a valid overlap, add it if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { if (!hclusterWanted) { - DistNode overlapValue(itA->second, itB->second, thisoverlap); + seqDist overlapValue(itA->second, itB->second, thisoverlap); //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl; overlap.push_back(overlapValue); }else { @@ -151,6 +170,7 @@ void ReadBlast::read(NameAssignment* nameMap) { map::iterator itDist; for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) { distance = 1.0 - (it->second / refScore); + //do we already have the distance calculated for b->a map::iterator itA = nameMap->find(currentRow); @@ -158,16 +178,22 @@ void ReadBlast::read(NameAssignment* nameMap) { //if we have it then compare if (itDist != dists[it->first].end()) { + //if you want the minimum blast score ratio, then pick max distance if(minWanted) { distance = max(itDist->second, distance); } else{ distance = min(itDist->second, distance); } - + //is this distance below cutoff if (distance < cutoff) { if (!hclusterWanted) { - PCell value(itA->second, it->first, distance); - matrix->addCell(value); - }else{ + if (itA->second < it->first) { + PDistCell value(it->first, distance); + matrix->addCell(itA->second, value); + }else { + PDistCell value(itA->second, distance); + matrix->addCell(it->first, value); + } + }else{ outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl; } } @@ -190,8 +216,8 @@ void ReadBlast::read(NameAssignment* nameMap) { //convert name to number map::iterator itA = nameMap->find(firstName); map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + if(itA == nameMap->end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); } + if(itB == nameMap->end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); } thisRowsBlastScores[itB->second] = score; @@ -201,7 +227,7 @@ void ReadBlast::read(NameAssignment* nameMap) { //if there is a valid overlap, add it if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { if (!hclusterWanted) { - DistNode overlapValue(itA->second, itB->second, thisoverlap); + seqDist overlapValue(itA->second, itB->second, thisoverlap); overlap.push_back(overlapValue); }else { outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl; @@ -232,8 +258,13 @@ void ReadBlast::read(NameAssignment* nameMap) { //is this distance below cutoff if (distance < cutoff) { if (!hclusterWanted) { - PCell value(itA->second, it->first, distance); - matrix->addCell(value); + if (itA->second < it->first) { + PDistCell value(it->first, distance); + matrix->addCell(itA->second, value); + }else { + PDistCell value(itA->second, distance); + matrix->addCell(it->first, value); + } }else{ outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl; } @@ -248,6 +279,14 @@ void ReadBlast::read(NameAssignment* nameMap) { thisRowsBlastScores.clear(); dists.clear(); + if (m->control_pressed) { + fileHandle.close(); + if (!hclusterWanted) { delete matrix; } + else { outOverlap.close(); m->mothurRemove(overlapFile); outDist.close(); m->mothurRemove(distFile); } + delete reading; + return 0; + } + if (!hclusterWanted) { sort(overlap.begin(), overlap.end(), compareOverlap); }else { @@ -255,45 +294,66 @@ void ReadBlast::read(NameAssignment* nameMap) { outOverlap.close(); } + if (m->control_pressed) { + fileHandle.close(); + if (!hclusterWanted) { delete matrix; } + else { m->mothurRemove(overlapFile); m->mothurRemove(distFile); } + delete reading; + return 0; + } + reading->finish(); delete reading; fileHandle.close(); + + return 0; } catch(exception& e) { - errorOut(e, "ReadBlast", "read"); + m->errorOut(e, "ReadBlast", "read"); exit(1); } } /*********************************************************************************************/ -void ReadBlast::readNames(NameAssignment* nameMap) { +int ReadBlast::readNames(NameAssignment* nameMap) { try { - mothurOut("Reading names... "); cout.flush(); + m->mothurOut("Reading names... "); cout.flush(); string name, hold, prevName; int num = 1; ifstream in; - openInputFile(blastfile, in); + m->openInputFile(blastfile, in); + + //ofstream outName; + //m->openOutputFile((blastfile + ".tempOutNames"), outName); //read first line in >> prevName; + for (int i = 0; i < 11; i++) { in >> hold; } - gobble(in); - + m->gobble(in); + //save name in nameMap nameMap->push_back(prevName); while (!in.eof()) { + if (m->control_pressed) { in.close(); return 0; } //read line in >> name; + for (int i = 0; i < 11; i++) { in >> hold; } - gobble(in); + m->gobble(in); //is this a new name? if (name != prevName) { prevName = name; - nameMap->push_back(name); + + if (nameMap->get(name) != -1) { m->mothurOut("[ERROR]: trying to exact names from blast file, and I found dups. Are you sequence names unique? quitting.\n"); m->control_pressed = true; } + else { + nameMap->push_back(name); + } + //outName << name << '\t' << name << endl; num++; } } @@ -301,17 +361,21 @@ void ReadBlast::readNames(NameAssignment* nameMap) { in.close(); //write out names file - //string outNames = getRootName(blastfile) + "names"; + //string outNames = m->getRootName(blastfile) + "names"; //ofstream out; - //openOutputFile(outNames, out); + //m->openOutputFile(outNames, out); //nameMap->print(out); //out.close(); - mothurOut(toString(num) + " names read."); mothurOutEndLine(); + if (m->control_pressed) { return 0; } + + m->mothurOut(toString(num) + " names read."); m->mothurOutEndLine(); + + return 0; } catch(exception& e) { - errorOut(e, "ReadBlast", "readNames"); + m->errorOut(e, "ReadBlast", "readNames"); exit(1); } }