X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=sequenceparser.cpp;h=3eb508dd737e15d4f06cb04c202608b5d5e1fe7c;hb=2c97dd48b8e27ee0a6a86c7a082f4c504c3357c6;hp=e60f19b0df0034de0c1a693d7c5a55c59ef0ef82;hpb=c47e480b743d1c242b8c527b6d12f992c68b8c2c;p=mothur.git diff --git a/sequenceparser.cpp b/sequenceparser.cpp index e60f19b..3eb508d 100644 --- a/sequenceparser.cpp +++ b/sequenceparser.cpp @@ -7,7 +7,7 @@ * */ -#include "sequenceParser.h" +#include "sequenceparser.h" /************************************************************/ @@ -37,13 +37,16 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi m->openInputFile(fastaFile, in); map seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file + int fastaCount = 0; while (!in.eof()) { if (m->control_pressed) { break; } Sequence seq(in); m->gobble(in); + fastaCount++; + if (m->debug) { if((fastaCount) % 1000 == 0){ m->mothurOut("[DEBUG]: reading seq " + toString(fastaCount) + "\n."); } } - if (seq.getName() != "") { + if (seq.getName() != "") { string group = groupMap->getGroup(seq.getName()); if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); } @@ -63,6 +66,8 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi string first, second; int countName = 0; + set thisnames1; + while(!inName.eof()) { if (m->control_pressed) { break; } @@ -94,10 +99,12 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi it = splitMap.find(group); if (it != splitMap.end()) { //adding seqs to this group (it->second) += "," + names[i]; + thisnames1.insert(names[i]); countName++; }else { //first sighting of this group splitMap[group] = names[i]; countName++; + thisnames1.insert(names[i]); //is this seq in the fasta file? if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match @@ -129,8 +136,17 @@ SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFi inName.close(); if (error == 1) { m->control_pressed = true; } - + if (countName != (groupMap->getNumSeqs())) { + vector groupseqsnames = groupMap->getNamesSeqs(); + + for (int i = 0; i < groupseqsnames.size(); i++) { + set::iterator itnamesfile = thisnames1.find(groupseqsnames[i]); + if (itnamesfile == thisnames1.end()){ + cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl; + } + } + m->mothurOutEndLine(); m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); m->mothurOutEndLine(); @@ -240,6 +256,7 @@ vector SequenceParser::getSeqs(string g){ m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine(); }else { seqForThisGroup = it->second; + if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " fasta file has " + toString(seqForThisGroup.size()) + " sequences."); } } return seqForThisGroup; @@ -303,6 +320,7 @@ int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){ } }else { + //m->mothurOut("Group " + g + " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n"); for (int i = 0; i < seqForThisGroup.size(); i++) { if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } @@ -332,6 +350,7 @@ map SequenceParser::getNameMap(string g){ m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine(); }else { nameMapForThisGroup = it->second; + if (m->debug) { m->mothurOut("[DEBUG]: group " + g + " name file has " + toString(nameMapForThisGroup.size()) + " unique sequences."); } } return nameMapForThisGroup;