X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=sequenceparser.cpp;fp=sequenceparser.cpp;h=44012d8ba8321fea520fbe613e0dac8f8911c760;hb=ae57e166b2ed7b475ec3f466106bd76fabadd063;hp=0000000000000000000000000000000000000000;hpb=55386dddad84cc1140d736cabaf4dd0ae16f2e01;p=mothur.git diff --git a/sequenceparser.cpp b/sequenceparser.cpp new file mode 100644 index 0000000..44012d8 --- /dev/null +++ b/sequenceparser.cpp @@ -0,0 +1,273 @@ +/* + * sequenceParser.cpp + * Mothur + * + * Created by westcott on 9/9/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + +#include "sequenceParser.h" + + +/************************************************************/ +SequenceParser::SequenceParser(string groupFile, string fastaFile, string nameFile) { + try { + + m = MothurOut::getInstance(); + int error; + + //read group file + groupMap = new GroupMap(groupFile); + error = groupMap->readMap(); + + if (error == 1) { m->control_pressed = true; } + + //initialize maps + vector namesOfGroups = groupMap->getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { + vector temp; + map tempMap; + seqs[namesOfGroups[i]] = temp; + nameMapPerGroup[namesOfGroups[i]] = tempMap; + } + + //read fasta file making sure each sequence is in the group file + ifstream in; + m->openInputFile(fastaFile, in); + + map seqName; //stores name -> sequence string so we can make new "unique" sequences when we parse the name file + while (!in.eof()) { + + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + + if (seq.getName() != "") { + + string group = groupMap->getGroup(seq.getName()); + if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); } + else { + seqs[group].push_back(seq); + seqName[seq.getName()] = seq.getAligned(); + } + } + } + in.close(); + + if (error == 1) { m->control_pressed = true; } + + //read name file + ifstream inName; + m->openInputFile(nameFile, inName); + + string first, second; + int countName = 0; + while(!inName.eof()) { + + if (m->control_pressed) { break; } + + inName >> first; m->gobble(inName); + inName >> second; m->gobble(inName); + + vector names; + m->splitAtChar(second, names, ','); + + //get aligned string for these seqs from the fasta file + string alignedString = ""; + map::iterator itAligned = seqName.find(names[0]); + if (itAligned == seqName.end()) { + error = 1; m->mothurOut("[ERROR]: " + names[0] + " is in your name file and not in your fasta file, please correct."); m->mothurOutEndLine(); + }else { + alignedString = itAligned->second; + } + + //separate by group - parse one line in name file + map splitMap; //group -> name1,name2,... + map::iterator it; + for (int i = 0; i < names.size(); i++) { + + string group = groupMap->getGroup(names[i]); + if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + names[i] + " is in your name file and not in your groupfile, please correct."); m->mothurOutEndLine(); } + else { + + it = splitMap.find(group); + if (it != splitMap.end()) { //adding seqs to this group + (it->second) += "," + names[i]; + countName++; + }else { //first sighting of this group + splitMap[group] = names[i]; + countName++; + + //is this seq in the fasta file? + if (i != 0) { //if not then we need to add a duplicate sequence to the seqs for this group so the new "fasta" and "name" files will match + Sequence tempSeq(names[i], alignedString); //get the first guys sequence string since he's in the fasta file. + seqs[group].push_back(tempSeq); + } + } + } + } + + + //fill nameMapPerGroup - holds all lines in namefile separated by group + for (it = splitMap.begin(); it != splitMap.end(); it++) { + //grab first name + string firstName = ""; + for(int i = 0; i < (it->second).length(); i++) { + if (((it->second)[i]) != ',') { + firstName += ((it->second)[i]); + }else { break; } + } + + //group1 -> seq1 -> seq1,seq2,seq3 + nameMapPerGroup[it->first][firstName] = it->second; + } + } + + inName.close(); + + if (error == 1) { m->control_pressed = true; } + + if (countName != (groupMap->getNumSeqs())) { + m->mothurOutEndLine(); + m->mothurOut("[ERROR]: Your name file contains " + toString(countName) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); + m->mothurOutEndLine(); + m->control_pressed = true; + } + + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "SequenceParser"); + exit(1); + } +} +/************************************************************/ +SequenceParser::SequenceParser(string groupFile, string fastaFile) { + try { + + m = MothurOut::getInstance(); + int error; + + //read group file + groupMap = new GroupMap(groupFile); + error = groupMap->readMap(); + + if (error == 1) { m->control_pressed = true; } + + //initialize maps + vector namesOfGroups = groupMap->getNamesOfGroups(); + for (int i = 0; i < namesOfGroups.size(); i++) { + vector temp; + seqs[namesOfGroups[i]] = temp; + } + + //read fasta file making sure each sequence is in the group file + ifstream in; + m->openInputFile(fastaFile, in); + int count = 0; + + while (!in.eof()) { + + if (m->control_pressed) { break; } + + Sequence seq(in); m->gobble(in); + + if (seq.getName() != "") { + + string group = groupMap->getGroup(seq.getName()); + if (group == "not found") { error = 1; m->mothurOut("[ERROR]: " + seq.getName() + " is in your fasta file and not in your groupfile, please correct."); m->mothurOutEndLine(); } + else { seqs[group].push_back(seq); count++; } + } + } + in.close(); + + if (error == 1) { m->control_pressed = true; } + + if (count != (groupMap->getNumSeqs())) { + m->mothurOutEndLine(); + m->mothurOut("[ERROR]: Your fasta file contains " + toString(count) + " valid sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); + if (count < (groupMap->getNumSeqs())) { m->mothurOut(" Did you forget to include the name file?"); } + m->mothurOutEndLine(); + m->control_pressed = true; + } + + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "SequenceParser"); + exit(1); + } +} +/************************************************************/ +SequenceParser::~SequenceParser(){ delete groupMap; } +/************************************************************/ +int SequenceParser::getNumGroups(){ return groupMap->getNumGroups(); } +/************************************************************/ +vector SequenceParser::getNamesOfGroups(){ return groupMap->getNamesOfGroups(); } +/************************************************************/ +bool SequenceParser::isValidGroup(string g){ return groupMap->isValidGroup(g); } +/************************************************************/ +string SequenceParser::getGroup(string g){ return groupMap->getGroup(g); } +/************************************************************/ +int SequenceParser::getNumSeqs(string g){ + try { + map >::iterator it; + int num = 0; + + it = seqs.find(g); + if(it == seqs.end()) { + m->mothurOut("[ERROR]: " + g + " is not a valid group, please correct."); m->mothurOutEndLine(); + }else { + num = (it->second).size(); + } + + return num; + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "getNumSeqs"); + exit(1); + } +} +/************************************************************/ +vector SequenceParser::getSeqs(string g){ + try { + map >::iterator it; + vector seqForThisGroup; + + it = seqs.find(g); + if(it == seqs.end()) { + m->mothurOut("[ERROR]: No sequences available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + seqForThisGroup = it->second; + } + + return seqForThisGroup; + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "getSeqs"); + exit(1); + } +} +/************************************************************/ +map SequenceParser::getNameMap(string g){ + try { + map >::iterator it; + map nameMapForThisGroup; + + it = nameMapPerGroup.find(g); + if(it == nameMapPerGroup.end()) { + m->mothurOut("[ERROR]: No nameMap available for group " + g + ", please correct."); m->mothurOutEndLine(); + }else { + nameMapForThisGroup = it->second; + } + + return nameMapForThisGroup; + } + catch(exception& e) { + m->errorOut(e, "SequenceParser", "getNameMap"); + exit(1); + } +} +/************************************************************/ + + +