X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=sequenceparser.cpp;fp=sequenceparser.cpp;h=0000000000000000000000000000000000000000;hb=4a877efa127e56e81a21f53cfdbbfd3bfbe8c4ff;hp=fd94b246a43217cc9cbfa131bfe4cd672955e001;hpb=a6cf29fa4dac0909c7582cb1094151d34093ee76;p=mothur.git diff --git a/sequenceparser.cpp b/sequenceparser.cpp deleted file mode 100644 index fd94b24..0000000 --- a/sequenceparser.cpp +++ /dev/null @@ -1,393 +0,0 @@ -/* - * sequenceParser.cpp - * Mothur - * - * Created by westcott on 9/9/11. - * Copyright 2011 Schloss Lab. All rights reserved. - * - */ - -#include "sequenceparser.h" - - -/************************************************************/ -SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) { - try { - - m = MothurOut::getInstance(); - int error; - - //read group file - groupMap = new GroupMap(groupFile); - error = groupMap->readMap(); - - if (error == 1) { m->control_pressed = true; } - - //initialize maps - vector namesOfGroups = groupMap->getNamesOfGroups(); - for (int i = 0; i < namesOfGroups.size(); i++) { - vector temp; - map tempMap; - seqs[namesOfGroups[i]] = temp; - nameMapPerGroup[namesOfGroups[i]] = tempMap; - } - - //read fasta file making sure each sequence is in the group file - ifstream in; - m->openInputFile(fastaFile, in); - - map seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file - while (!in.eof()) { - - if (m->control_pressed) { break; } - - Sequence seq(in); m->gobble(in); - - if (seq.getName() != "") { - - string group = groupMap->getGroup(seq.getName()); - if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); } - else { - seqs[group].push_back(seq); - seqName[seq.getName()] = seq.getAligned(); - } - } - } - in.close(); - - if (error == 1) { m->control_pressed = true; } - - //read name file - ifstream inName; - m->openInputFile(nameFile, inName); - - string first, second; - int countName = 0; - set thisnames1; - - while(!inName.eof()) { - - if (m->control_pressed) { break; } - - inName >> first; m->gobble(inName); - inName >> second; m->gobble(inName); - - vector names; - m->splitAtChar(second, names, ','); - - //get aligned string for these seqs from the fasta file - string alignedString = ""; - map::iterator itAligned = seqName.find(names[0]); - if (itAligned == seqName.end()) { - error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine(); - }else { - alignedString = itAligned->second; - } - - //separate by group - parse one line in name file - map splitMap; //group -> name1,name2,... - map::iterator it; - for (int i = 0; i < names.size(); i++) { - - string group = groupMap->getGroup(names[i]); - if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine(); } - else { - - it = splitMap.find(group); - if (it != splitMap.end()) { //adding seqs to this group - (it->second) += "," + names[i]; - thisnames1.insert(names[i]); - countName++; - }else { //first sighting of this group - splitMap[group] = names[i]; - countName++; - thisnames1.insert(names[i]); - - //is this seq in the fasta file? - if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match - Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file. - seqs[group].push_back(tempSeq); - } - } - } - - allSeqsMap[names[i]] = names[0]; - } - - - //fill nameMapPerGroup - holds all lines in namefile separated by group - for (it = splitMap.begin(); it != splitMap.end(); it++) { - //grab first name - string firstName = ""; - for(int i = 0; i < (it->second).length(); i++) { - if (((it->second)[i]) != ',') { - firstName += ((it->second)[i]); - }else { break; } - } - - //group1 -> seq1 -> seq1,seq2,seq3 - nameMapPerGroup[it->first][firstName] = it->second; - } - } - - inName.close(); - - if (error == 1) { m->control_pressed = true; } - - if (countName != (groupMap->getNumSeqs())) { - vector groupseqsnames = groupMap->getNamesSeqs(); - - for (int i = 0; i < groupseqsnames.size(); i++) { - set::iterator itnamesfile = thisnames1.find(groupseqsnames[i]); - if (itnamesfile == thisnames1.end()){ - cout << "missing name " + groupseqsnames[i] << '\t' << allSeqsMap[groupseqsnames[i]] << endl; - } - } - - m->mothurOutEndLine(); - m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); - m->mothurOutEndLine(); - m->control_pressed = true; - } - - } - catch(exception& e) { - m->errorOut(e, "SequenceParser", "SequenceParser"); - exit(1); - } -} -/************************************************************/ -SequenceParser::SequenceParser(string groupFile, string fastaFile) { - try { - - m = MothurOut::getInstance(); - int error; - - //read group file - groupMap = new GroupMap(groupFile); - error = groupMap->readMap(); - - if (error == 1) { m->control_pressed = true; } - - //initialize maps - vector namesOfGroups = groupMap->getNamesOfGroups(); - for (int i = 0; i < namesOfGroups.size(); i++) { - vector temp; - seqs[namesOfGroups[i]] = temp; - } - - //read fasta file making sure each sequence is in the group file - ifstream in; - m->openInputFile(fastaFile, in); - int count = 0; - - while (!in.eof()) { - - if (m->control_pressed) { break; } - - Sequence seq(in); m->gobble(in); - - if (seq.getName() != "") { - - string group = groupMap->getGroup(seq.getName()); - if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); } - else { seqs[group].push_back(seq); count++; } - } - } - in.close(); - - if (error == 1) { m->control_pressed = true; } - - if (count != (groupMap->getNumSeqs())) { - m->mothurOutEndLine(); - m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); - if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); } - m->mothurOutEndLine(); - m->control_pressed = true; - } - - } - catch(exception& e) { - m->errorOut(e, "SequenceParser", "SequenceParser"); - exit(1); - } -} -/************************************************************/ -SequenceParser::~SequenceParser(){ delete groupMap; } -/************************************************************/ -int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); } -/************************************************************/ -vector SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); } -/************************************************************/ -bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); } -/************************************************************/ -string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); } -/************************************************************/ -int SequenceParser::getNumSeqs(string g){ - try { - map >::iterator it; - int num = 0; - - it = seqs.find(g); - if(it == seqs.end()) { - m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine(); - }else { - num = (it->second).size(); - } - - return num; - } - catch(exception& e) { - m->errorOut(e, "SequenceParser", "getNumSeqs"); - exit(1); - } -} -/************************************************************/ -vector SequenceParser::getSeqs(string g){ - try { - map >::iterator it; - vector seqForThisGroup; - - it = seqs.find(g); - if(it == seqs.end()) { - m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine(); - }else { - seqForThisGroup = it->second; - } - - return seqForThisGroup; - } - catch(exception& e) { - m->errorOut(e, "SequenceParser", "getSeqs"); - exit(1); - } -} -/************************************************************/ -int SequenceParser::getSeqs(string g, string filename, bool uchimeFormat=false){ - try { - map >::iterator it; - vector seqForThisGroup; - vector nameVector; - - it = seqs.find(g); - if(it == seqs.end()) { - m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine(); - }else { - - ofstream out; - m->openOutputFile(filename, out); - - seqForThisGroup = it->second; - - if (uchimeFormat) { - // format should look like - //>seqName /ab=numRedundantSeqs/ - //sequence - - map nameMapForThisGroup = getNameMap(g); - map::iterator itNameMap; - int error = 0; - - for (int i = 0; i < seqForThisGroup.size(); i++) { - itNameMap = nameMapForThisGroup.find(seqForThisGroup[i].getName()); - - if (itNameMap == nameMapForThisGroup.end()){ - error = 1; - m->mothurOut("[ERROR]: " + seqForThisGroup[i].getName() + " is in your fastafile, but is not in your namesfile, please correct."); m->mothurOutEndLine(); - }else { - int num = m->getNumNames(itNameMap->second); - - seqPriorityNode temp(num, seqForThisGroup[i].getAligned(), seqForThisGroup[i].getName()); - nameVector.push_back(temp); - } - } - - if (error == 1) { out.close(); m->mothurRemove(filename); return 1; } - - //sort by num represented - sort(nameVector.begin(), nameVector.end(), compareSeqPriorityNodes); - - //print new file in order of - for (int i = 0; i < nameVector.size(); i++) { - - if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } - - out << ">" << nameVector[i].name << "/ab=" << nameVector[i].numIdentical << "/" << endl << nameVector[i].seq << endl; - } - - }else { - //m->mothurOut("Group " + g + " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n"); - for (int i = 0; i < seqForThisGroup.size(); i++) { - - if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } - - seqForThisGroup[i].printSequence(out); - } - } - out.close(); - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SequenceParser", "getSeqs"); - exit(1); - } -} - -/************************************************************/ -map SequenceParser::getNameMap(string g){ - try { - map >::iterator it; - map nameMapForThisGroup; - - it = nameMapPerGroup.find(g); - if(it == nameMapPerGroup.end()) { - m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine(); - }else { - nameMapForThisGroup = it->second; - } - - return nameMapForThisGroup; - } - catch(exception& e) { - m->errorOut(e, "SequenceParser", "getNameMap"); - exit(1); - } -} -/************************************************************/ -int SequenceParser::getNameMap(string g, string filename){ - try { - map >::iterator it; - map nameMapForThisGroup; - - it = nameMapPerGroup.find(g); - if(it == nameMapPerGroup.end()) { - m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine(); - }else { - nameMapForThisGroup = it->second; - - ofstream out; - m->openOutputFile(filename, out); - - for (map::iterator itFile = nameMapForThisGroup.begin(); itFile != nameMapForThisGroup.end(); itFile++) { - - if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; } - - out << itFile->first << '\t' << itFile->second << endl; - } - - out.close(); - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SequenceParser", "getNameMap"); - exit(1); - } -} -/************************************************************/ - - -