X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=sequenceparser.cpp;h=37891eb44a0b42c2b9f0879dfd084da1893c289b;hp=44012d8ba8321fea520fbe613e0dac8f8911c760;hb=050a3ff02473a3d4c0980964e1a9ebe52e55d6b8;hpb=ae57e166b2ed7b475ec3f466106bd76fabadd063 diff --git a/sequenceparser.cpp b/sequenceparser.cpp index 44012d8..37891eb 100644 --- a/sequenceparser.cpp +++ b/sequenceparser.cpp @@ -7,7 +7,7 @@ * */ -#include "sequenceParser.h" +#include "sequenceparser.h" /************************************************************/ @@ -37,13 +37,16 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi m->openInputFile(fastaFile, in); map seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file + int fastaCount = 0; while (!in.eof()) { if (m->control_pressed) { break; } Sequence seq(in); m->gobble(in); + fastaCount++; + if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } } - if (seq.getName() != "") { + if (seq.getName() != "") { string group = groupMap->getGroup(seq.getName()); if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); } @@ -56,79 +59,180 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi in.close(); if (error == 1) { m->control_pressed = true; } - + //read name file ifstream inName; m->openInputFile(nameFile, inName); - string first, second; + //string first, second; int countName = 0; - while(!inName.eof()) { - + set thisnames1; + + string rest = ""; + char buffer[4096]; + bool pairDone = false; + bool columnOne = true; + string firstCol, secondCol; + + while (!inName.eof()) { if (m->control_pressed) { break; } - inName >> first; m->gobble(inName); - inName >> second; m->gobble(inName); - - vector names; - m->splitAtChar(second, names, ','); - - //get aligned string for these seqs from the fasta file - string alignedString = ""; - map::iterator itAligned = seqName.find(names[0]); - if (itAligned == seqName.end()) { - error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine(); - }else { - alignedString = itAligned->second; - } - - //separate by group - parse one line in name file - map splitMap; //group -> name1,name2,... - map::iterator it; - for (int i = 0; i < names.size(); i++) { - - string group = groupMap->getGroup(names[i]); - if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine(); } - else { - - it = splitMap.find(group); - if (it != splitMap.end()) { //adding seqs to this group - (it->second) += "," + names[i]; - countName++; - }else { //first sighting of this group - splitMap[group] = names[i]; - countName++; - - //is this seq in the fasta file? - if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match - Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file. - seqs[group].push_back(tempSeq); - } - } - } - } - - - //fill nameMapPerGroup - holds all lines in namefile separated by group - for (it = splitMap.begin(); it != splitMap.end(); it++) { - //grab first name - string firstName = ""; - for(int i = 0; i < (it->second).length(); i++) { - if (((it->second)[i]) != ',') { - firstName += ((it->second)[i]); - }else { break; } - } - - //group1 -> seq1 -> seq1,seq2,seq3 - nameMapPerGroup[it->first][firstName] = it->second; - } + inName.read(buffer, 4096); + vector pieces = m->splitWhiteSpace(rest, buffer, inName.gcount()); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { //save one line + if (m->debug) { m->mothurOut("[DEBUG]: reading names: " + firstCol + '\t' + secondCol + ".\n"); } + vector names; + m->splitAtChar(secondCol, names, ','); + + //get aligned string for these seqs from the fasta file + string alignedString = ""; + map::iterator itAligned = seqName.find(names[0]); + if (itAligned == seqName.end()) { + error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine(); + }else { + alignedString = itAligned->second; + } + + //separate by group - parse one line in name file + map splitMap; //group -> name1,name2,... + map::iterator it; + for (int i = 0; i < names.size(); i++) { + + string group = groupMap->getGroup(names[i]); + if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine(); } + else { + + it = splitMap.find(group); + if (it != splitMap.end()) { //adding seqs to this group + (it->second) += "," + names[i]; + thisnames1.insert(names[i]); + countName++; + }else { //first sighting of this group + splitMap[group] = names[i]; + countName++; + thisnames1.insert(names[i]); + + //is this seq in the fasta file? + if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match + Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file. + seqs[group].push_back(tempSeq); + } + } + } + + allSeqsMap[names[i]] = names[0]; + } + + + //fill nameMapPerGroup - holds all lines in namefile separated by group + for (it = splitMap.begin(); it != splitMap.end(); it++) { + //grab first name + string firstName = ""; + for(int i = 0; i < (it->second).length(); i++) { + if (((it->second)[i]) != ',') { + firstName += ((it->second)[i]); + }else { break; } + } + + //group1 -> seq1 -> seq1,seq2,seq3 + nameMapPerGroup[it->first][firstName] = it->second; + } + + pairDone = false; + } + } } - inName.close(); + + //in case file does not end in white space + if (rest != "") { + vector pieces = m->splitWhiteSpace(rest); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { //save one line + if (m->debug) { m->mothurOut("[DEBUG]: reading names: " + firstCol + '\t' + secondCol + ".\n"); } + vector names; + m->splitAtChar(secondCol, names, ','); + + //get aligned string for these seqs from the fasta file + string alignedString = ""; + map::iterator itAligned = seqName.find(names[0]); + if (itAligned == seqName.end()) { + error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine(); + }else { + alignedString = itAligned->second; + } + + //separate by group - parse one line in name file + map splitMap; //group -> name1,name2,... + map::iterator it; + for (int i = 0; i < names.size(); i++) { + + string group = groupMap->getGroup(names[i]); + if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine(); } + else { + + it = splitMap.find(group); + if (it != splitMap.end()) { //adding seqs to this group + (it->second) += "," + names[i]; + thisnames1.insert(names[i]); + countName++; + }else { //first sighting of this group + splitMap[group] = names[i]; + countName++; + thisnames1.insert(names[i]); + + //is this seq in the fasta file? + if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match + Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file. + seqs[group].push_back(tempSeq); + } + } + } + + allSeqsMap[names[i]] = names[0]; + } + + + //fill nameMapPerGroup - holds all lines in namefile separated by group + for (it = splitMap.begin(); it != splitMap.end(); it++) { + //grab first name + string firstName = ""; + for(int i = 0; i < (it->second).length(); i++) { + if (((it->second)[i]) != ',') { + firstName += ((it->second)[i]); + }else { break; } + } + + //group1 -> seq1 -> seq1,seq2,seq3 + nameMapPerGroup[it->first][firstName] = it->second; + } + + pairDone = false; + } + } + } if (error == 1) { m->control_pressed = true; } - + if (countName != (groupMap->getNumSeqs())) { + vector groupseqsnames = groupMap->getNamesSeqs(); + + for (int i = 0; i < groupseqsnames.size(); i++) { + set::iterator itnamesfile = thisnames1.find(groupseqsnames[i]); + if (itnamesfile == thisnames1.end()){ + cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl; + } + } + m->mothurOutEndLine(); m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); m->mothurOutEndLine(); @@ -206,8 +310,6 @@ vector SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGr /************************************************************/ bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); } /************************************************************/ -string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); } -/************************************************************/ int SequenceParser::getNumSeqs(string g){ try { map >::iterator it; @@ -238,6 +340,7 @@ vector SequenceParser::getSeqs(string g){ m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine(); }else { seqForThisGroup = it->second; + if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences."); } } return seqForThisGroup; @@ -247,6 +350,79 @@ vector SequenceParser::getSeqs(string g){ exit(1); } } +/************************************************************/ +int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){ + try { + map >::iterator it; + vector seqForThisGroup; + vector nameVector; + + it = seqs.find(g); + if(it == seqs.end()) { + m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + + ofstream out; + m->openOutputFile(filename, out); + + seqForThisGroup = it->second; + + if (uchimeFormat) { + // format should look like + //>seqName /ab=numRedundantSeqs/ + //sequence + + map nameMapForThisGroup = getNameMap(g); + map::iterator itNameMap; + int error = 0; + + for (int i = 0; i < seqForThisGroup.size(); i++) { + itNameMap = nameMapForThisGroup.find(seqForThisGroup[i].getName()); + + if (itNameMap == nameMapForThisGroup.end()){ + error = 1; + m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your namesfile, please correct."); m->mothurOutEndLine(); + }else { + int num = m->getNumNames(itNameMap->second); + + seqPriorityNode temp(num, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName()); + nameVector.push_back(temp); + } + } + + if (error == 1) { out.close(); m->mothurRemove(filename); return 1; } + + //sort by num represented + sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes); + + //print new file in order of + for (int i = 0; i < nameVector.size(); i++) { + + if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } + + out << ">" << nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl; // + } + + }else { + //m->mothurOut("Group " + g + " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n"); + for (int i = 0; i < seqForThisGroup.size(); i++) { + + if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } + + seqForThisGroup[i].printSequence(out); + } + } + out.close(); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "getSeqs"); + exit(1); + } +} + /************************************************************/ map SequenceParser::getNameMap(string g){ try { @@ -258,6 +434,7 @@ map SequenceParser::getNameMap(string g){ m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine(); }else { nameMapForThisGroup = it->second; + if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " name file has " + toString(nameMapForThisGroup.size()) + " unique sequences."); } } return nameMapForThisGroup; @@ -268,6 +445,38 @@ map SequenceParser::getNameMap(string g){ } } /************************************************************/ +int SequenceParser::getNameMap(string g, string filename){ + try { + map >::iterator it; + map nameMapForThisGroup; + + it = nameMapPerGroup.find(g); + if(it == nameMapPerGroup.end()) { + m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + nameMapForThisGroup = it->second; + + ofstream out; + m->openOutputFile(filename, out); + + for (map::iterator itFile = nameMapForThisGroup.begin(); itFile != nameMapForThisGroup.end(); itFile++) { + + if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } + + out << itFile->first << '\t' << itFile->second << endl; + } + + out.close(); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "getNameMap"); + exit(1); + } +} +/************************************************************/