From: Sarah Westcott Date: Mon, 11 Feb 2013 12:14:38 +0000 (-0500) Subject: added ':' name check to seq. error and fastq.info. sffinfo now ignores clipQualRight... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=567e4bca5d62bd8ea316ce5def320d070d7507b8 added ':' name check to seq. error and fastq.info. sffinfo now ignores clipQualRight values of 0. fixed bug in pcr.seqs if file was aligned and both forward and reverse primers were given, reverse primer was not trimmed properly. added pdiffs to pcr.seqs. --- diff --git a/collectcommand.cpp b/collectcommand.cpp index de92e49..b892a91 100644 --- a/collectcommand.cpp +++ b/collectcommand.cpp @@ -67,8 +67,8 @@ string CollectCommand::getHelpString(){ helpString += "The collect.single command parameters are list, sabund, rabund, shared, label, freq, calc and abund. list, sabund, rabund or shared is required unless you have a valid current file. \n"; helpString += "The collect.single command should be in the following format: \n"; helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n"; - helpString += "collect.single(label=yourLabel, iters=yourIters, freq=yourFreq, calc=yourEstimators).\n"; - helpString += "Example collect(label=unique-.01-.03, iters=10000, freq=10, calc=sobs-chao-ace-jack).\n"; + helpString += "collect.single(label=yourLabel, freq=yourFreq, calc=yourEstimators).\n"; + helpString += "Example collect(label=unique-.01-.03, freq=10, calc=sobs-chao-ace-jack).\n"; helpString += "The default values for freq is 100, and calc are sobs-chao-ace-jack-shannon-npshannon-simpson.\n"; helpString += validCalculator.printCalc("single"); helpString += "The label parameter is used to analyze specific labels in your input.\n"; diff --git a/mothur.h b/mothur.h index cd14056..98b34c3 100644 --- a/mothur.h +++ b/mothur.h @@ -170,7 +170,6 @@ inline bool compareIndexes(PDistCell left, PDistCell right){ return (left.index > right.index); } //******************************************************************************************************************** -//sorts highest to lowest inline bool compareSpearman(spearmanRank left, spearmanRank right){ return (left.score < right.score); } @@ -185,11 +184,7 @@ inline bool compareSeqPriorityNodes(seqPriorityNode left, seqPriorityNode right) } return false; } -//******************************************************************************************************************** -//sorts lowest to highest -inline bool compareSpearmanReverse(spearmanRank left, spearmanRank right){ - return (left.score < right.score); -} + /************************************************************/ //sorts lowest to highest inline bool compareDistLinePairs(distlinePair left, distlinePair right){ diff --git a/mothurout.cpp b/mothurout.cpp index dc77490..9f678fe 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -453,7 +453,7 @@ void MothurOut::errorOut(exception& e, string object, string function) { if (object == "cluster"){ mothurOut(" has occurred in the " + object + " class function " + function + ". This error indicates your computer is running out of memory. There are two common causes for this, file size and format.\n\nFile Size:\nThe cluster command loads your distance matrix into RAM, and your distance file is most likely too large to fit in RAM. There are two options to help with this. The first is to use a cutoff. By using a cutoff mothur will only load distances that are below the cutoff. If that is still not enough, there is a command called cluster.split, http://www.mothur.org/wiki/cluster.split which divides the distance matrix, and clusters the smaller pieces separately. You may also be able to reduce the size of the original distance matrix by using the commands outlined in the Schloss SOP, http://www.mothur.org/wiki/Schloss_SOP. \n\nWrong Format:\nThis error can be caused by trying to read a column formatted distance matrix using the phylip parameter. By default, the dist.seqs command generates a column formatted distance matrix. To make a phylip formatted matrix set the dist.seqs command parameter output to lt. \n\nIf you are uable to resolve the issue, please contact Pat Schloss at mothur.bugs@gmail.com, and be sure to include the mothur.logFile with your inquiry."); }else { - mothurOut(" has occurred in the " + object + " class function " + function + ". This error indicates your computer is running out of memory. This is most commonly caused by trying to process a dataset too large, or a file format issue. If you are running our 32bit version, your memory usage is limited to 4G. If you have more than 4G of RAM and are running a 64bit OS, using our 64bit version may resolve your issue. Also, you may be able to reduce the size of your dataset by using the commands outlined in the Schloss SOP, http://www.mothur.org/wiki/Schloss_SOP. If you are uable to resolve the issue, please contact Pat Schloss at mothur.bugs@gmail.com, and be sure to include the mothur.logFile with your inquiry."); + mothurOut(" has occurred in the " + object + " class function " + function + ". This error indicates your computer is running out of memory. This is most commonly caused by trying to process a dataset too large, using multiple processors, or a file format issue. If you are running our 32bit version, your memory usage is limited to 4G. If you have more than 4G of RAM and are running a 64bit OS, using our 64bit version may resolve your issue. If you are using multiple processors, try running the command with processors=1, the more processors you use the more memory is required. Also, you may be able to reduce the size of your dataset by using the commands outlined in the Schloss SOP, http://www.mothur.org/wiki/Schloss_SOP. If you are uable to resolve the issue, please contact Pat Schloss at mothur.bugs@gmail.com, and be sure to include the mothur.logFile with your inquiry."); } } } @@ -951,7 +951,7 @@ string MothurOut::getFullPathName(string fileName){ } for (int i = index; i >= 0; i--) { - newFileName = dirs[i] + "\\\\" + newFileName; + newFileName = dirs[i] + "\\" + newFileName; } return newFileName; @@ -2776,6 +2776,7 @@ void MothurOut::splitAtDash(string& estim, set& container) { string individual = ""; int estimLength = estim.size(); bool prevEscape = false; + /* for(int i=0;i& container) { } } } - container.insert(individual); + */ + + for(int i=0;i& container) { int lineNum; int estimLength = estim.size(); bool prevEscape = false; + /* for(int i=0;i& container) { prevEscape = false; } } - } + }*/ + + for(int i=0;i OptionParser::getParameters() { it->second = m->getDesignFile(); }else if (it->first == "sff") { it->second = m->getSFFFile(); + }else if (it->first == "flow") { + it->second = m->getFlowFile(); }else if (it->first == "oligos") { it->second = m->getOligosFile(); }else if (it->first == "accnos") { diff --git a/parsefastaqcommand.cpp b/parsefastaqcommand.cpp index 89f97ac..704e752 100644 --- a/parsefastaqcommand.cpp +++ b/parsefastaqcommand.cpp @@ -183,7 +183,10 @@ int ParseFastaQCommand::execute(){ string name = m->getline(in); m->gobble(in); if (name == "") { m->mothurOut("[ERROR]: Blank fasta name."); m->mothurOutEndLine(); m->control_pressed = true; break; } else if (name[0] != '@') { m->mothurOut("[ERROR]: reading " + name + " expected a name with @ as a leading character."); m->mothurOutEndLine(); m->control_pressed = true; break; } - else { name = name.substr(1); } + else { + name = name.substr(1); + for (int i = 0; i < name.length(); i++) { if (name[i] == ':') { name[i] = '_'; m->changedSeqNames = true; } } + } //read sequence string sequence = m->getline(in); m->gobble(in); @@ -193,7 +196,10 @@ int ParseFastaQCommand::execute(){ string name2 = m->getline(in); m->gobble(in); if (name2 == "") { m->mothurOut("[ERROR]: Blank quality name."); m->mothurOutEndLine(); m->control_pressed = true; break; } else if (name2[0] != '+') { m->mothurOut("[ERROR]: reading " + name2 + " expected a name with + as a leading character."); m->mothurOutEndLine(); m->control_pressed = true; break; } - else { name2 = name2.substr(1); } + else { + name2 = name2.substr(1); + for (int i = 0; i < name2.length(); i++) { if (name2[i] == ':') { name2[i] = '_'; m->changedSeqNames = true; } } + } //read quality scores string quality = m->getline(in); m->gobble(in); diff --git a/pcrseqscommand.h b/pcrseqscommand.h index c6f7ff9..5ac6437 100644 --- a/pcrseqscommand.h +++ b/pcrseqscommand.h @@ -48,10 +48,10 @@ private: bool getOligos(vector >&, vector >&, vector >&); bool abort, keepprimer, keepdots; string fastafile, oligosfile, taxfile, groupfile, namefile, countfile, ecolifile, outputDir, nomatch; - int start, end, processors, length; + int start, end, processors, length, pdiffs; vector revPrimer, outputNames; - vector primers; + map primers; int writeAccnos(set); int readName(set&); @@ -62,10 +62,7 @@ private: bool readEcoli(); int driverPcr(string, string, string, set&, linePair); int createProcesses(string, string, string, set&); - bool findForward(Sequence&, int&, int&); - bool findReverse(Sequence&, int&, int&); bool isAligned(string, map&); - bool compareDNASeq(string, string); string reverseOligo(string); }; @@ -78,7 +75,7 @@ struct pcrData { string goodFasta, badFasta, oligosfile, ecolifile, nomatch; unsigned long long fstart; unsigned long long fend; - int count, start, end, length; + int count, start, end, length, pdiffs; MothurOut* m; vector primers; vector revPrimer; @@ -87,7 +84,7 @@ struct pcrData { pcrData(){} - pcrData(string f, string gf, string bfn, MothurOut* mout, string ol, string ec, vector pr, vector rpr, string nm, bool kp, bool kd, int st, int en, int l, unsigned long long fst, unsigned long long fen) { + pcrData(string f, string gf, string bfn, MothurOut* mout, string ol, string ec, vector pr, vector rpr, string nm, bool kp, bool kd, int st, int en, int l, int pd, unsigned long long fst, unsigned long long fen) { filename = f; goodFasta = gf; badFasta = bfn; @@ -104,6 +101,7 @@ struct pcrData { length = l; fstart = fst; fend = fen; + pdiffs = pd; count = 0; } }; @@ -132,6 +130,9 @@ static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ } set lengths; + //pdiffs, bdiffs, primers, barcodes, revPrimers + map faked; + TrimOligos trim(pdiffs, 0, primers, faked, revPrimer); for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process pDataArray->count++; @@ -160,62 +161,7 @@ static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ //process primers if (pDataArray->primers.size() != 0) { int primerStart = 0; int primerEnd = 0; - //bool good = findForward(currSeq, primerStart, primerEnd); - /////////////////////////////////////////////////////////////// - bool good = false; - string rawSequence = currSeq.getUnaligned(); - - for(int j=0;jprimers.size();j++){ - string oligo = pDataArray->primers[j]; - - if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; } - - if(rawSequence.length() < oligo.length()) { break; } - - //search for primer - int olength = oligo.length(); - for (int l = 0; l < rawSequence.length()-olength; l++){ - if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; } - string rawChunk = rawSequence.substr(l, olength); - //compareDNASeq(oligo, rawChunk) - //////////////////////////////////////////////////////// - bool success = 1; - for(int k=0;knomatch == "reject") { goodSeq = false; } trashCode += "f"; } else{ @@ -229,6 +175,15 @@ static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ if (pDataArray->keepdots) { currSeq.filterToPos(mapAligned[primerStart]); } else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); } } + /////////////////////////////////////////////////////////////// + mapAligned.clear(); + string seq = currSeq.getAligned(); + int countBases = 0; + for (int k = 0; k < seq.length(); k++) { + if (!isalpha(seq[k])) { ; } + else { mapAligned[countBases] = k; countBases++; } + } + /////////////////////////////////////////////////////////////// }else { if (!pDataArray->keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } @@ -239,60 +194,8 @@ static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ //process reverse primers if (pDataArray->revPrimer.size() != 0) { int primerStart = 0; int primerEnd = 0; - bool good = false; - //findReverse(currSeq, primerStart, primerEnd); - /////////////////////////////////////////////////////////////// - string rawSequence = currSeq.getUnaligned(); - - for(int j=0;jrevPrimer.size();j++){ - string oligo = pDataArray->revPrimer[j]; - if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; } - if(rawSequence.length() < oligo.length()) { break; } - - //search for primer - int olength = oligo.length(); - for (int l = rawSequence.length()-olength; l >= 0; l--){ - - string rawChunk = rawSequence.substr(l, olength); - //compareDNASeq(oligo, rawChunk) - //////////////////////////////////////////////////////// - bool success = 1; - for(int k=0;knomatch == "reject") { goodSeq = false; } trashCode += "r"; } else{ //are you aligned diff --git a/prcseqscommand.cpp b/prcseqscommand.cpp index 4d5b6d9..5fa71be 100644 --- a/prcseqscommand.cpp +++ b/prcseqscommand.cpp @@ -21,6 +21,8 @@ vector PcrSeqsCommand::setParameters(){ CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pstart); CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pend); CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "","",false,false); parameters.push_back(pnomatch); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepprimer); CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pkeepdots); @@ -41,7 +43,7 @@ string PcrSeqsCommand::getHelpString(){ try { string helpString = ""; helpString += "The pcr.seqs command reads a fasta file.\n"; - helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n"; + helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, pdiffs, processors, keepprimer and keepdots.\n"; helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n"; helpString += "The start parameter allows you to provide a starting position to trim to.\n"; helpString += "The end parameter allows you to provide a ending position to trim from.\n"; @@ -49,6 +51,7 @@ string PcrSeqsCommand::getHelpString(){ helpString += "The processors parameter allows you to use multiple processors.\n"; helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n"; helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n"; return helpString; @@ -262,6 +265,9 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, pdiffs); nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; } @@ -319,7 +325,7 @@ int PcrSeqsCommand::execute(){ length = 0; - if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; } + if(oligosfile != ""){ readOligos(); if (m->debug) { m->mothurOut("[DEBUG]: read oligos file. numprimers = " + toString(primers.size()) + ", revprimers = " + toString(revPrimer.size()) + ".\n"); } } if (m->control_pressed) { return 0; } if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; } vector positions; @@ -499,7 +505,7 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); } // Allocate memory for thread data. - pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, lines[i].start, lines[i].end); + pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, pdiffs, lines[i].start, lines[i].end); pDataArray.push_back(tempPcr); //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier @@ -561,6 +567,10 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta int count = 0; set lengths; + //pdiffs, bdiffs, primers, barcodes, revPrimers + map faked; + TrimOligos trim(pdiffs, 0, primers, faked, revPrimer); + while (!done) { if (m->control_pressed) { break; } @@ -570,6 +580,8 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta string trashCode = ""; if (currSeq.getName() != "") { + if (m->debug) { m->mothurOut("[DEBUG]: seq name = " + currSeq.getName() + ".\n"); } + bool goodSeq = true; if (oligosfile != "") { map mapAligned; @@ -578,7 +590,7 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta //process primers if (primers.size() != 0) { int primerStart = 0; int primerEnd = 0; - bool good = findForward(currSeq, primerStart, primerEnd); + bool good = trim.findForward(currSeq, primerStart, primerEnd); if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; } else{ @@ -590,8 +602,9 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta } else { if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); } - else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); } + else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); } } + isAligned(currSeq.getAligned(), mapAligned); }else { if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } @@ -602,10 +615,10 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta //process reverse primers if (revPrimer.size() != 0) { int primerStart = 0; int primerEnd = 0; - bool good = findReverse(currSeq, primerStart, primerEnd); + bool good = trim.findReverse(currSeq, primerStart, primerEnd); if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; } else{ - //are you aligned + //are you aligned if (aligned) { if (!keepprimer) { if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); } @@ -700,75 +713,9 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta } } //********************************************************************/ -bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){ - try { - - string rawSequence = seq.getUnaligned(); - - for(int j=0;jcontrol_pressed) { primerStart = 0; primerEnd = 0; return false; } - string rawChunk = rawSequence.substr(j, olength); - if(compareDNASeq(oligo, rawChunk)) { - primerStart = j; - primerEnd = primerStart + olength; - return true; - } - - } - } - - primerStart = 0; primerEnd = 0; - return false; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripForward"); - exit(1); - } -} -//******************************************************************/ -bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){ - try { - - string rawSequence = seq.getUnaligned(); - - for(int i=0;i= 0; j--){ - if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } - string rawChunk = rawSequence.substr(j, olength); - - if(compareDNASeq(oligo, rawChunk)) { - primerStart = j; - primerEnd = primerStart + olength; - return true; - } - - } - } - - primerStart = 0; primerEnd = 0; - return false; - } - catch(exception& e) { - m->errorOut(e, "PcrSeqsCommand", "findReverse"); - exit(1); - } -} -//********************************************************************/ bool PcrSeqsCommand::isAligned(string seq, map& aligned){ try { + aligned.clear(); bool isAligned = false; int countBases = 0; @@ -832,6 +779,7 @@ bool PcrSeqsCommand::readOligos(){ m->openInputFile(oligosfile, inOligos); string type, oligo, group; + int primerCount = 0; while(!inOligos.eof()){ @@ -859,7 +807,7 @@ bool PcrSeqsCommand::readOligos(){ if (c == 10 || c == 13){ break; } else if (c == 32 || c == 9){;} //space or tab } - primers.push_back(oligo); + primers[oligo] = primerCount; primerCount++; }else if(type == "REVERSE"){ string oligoRC = reverseOligo(oligo); revPrimer.push_back(oligoRC); @@ -935,43 +883,6 @@ int PcrSeqsCommand::writeAccnos(set badNames){ exit(1); } -} -//******************************************************************/ -bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){ - try { - bool success = 1; - int length = oligo.length(); - - for(int i=0;ierrorOut(e, "PcrSeqsCommand", "compareDNASeq"); - exit(1); - } - } //*************************************************************************************************************** int PcrSeqsCommand::readName(set& names){ diff --git a/qualityscores.cpp b/qualityscores.cpp index 4998b3d..33ca172 100644 --- a/qualityscores.cpp +++ b/qualityscores.cpp @@ -34,13 +34,17 @@ QualityScores::QualityScores(ifstream& qFile){ int score; seqName = getSequenceName(qFile); + if (m->debug) { m->mothurOut("[DEBUG]: name = '" + seqName + "'\n."); } + if (!m->control_pressed) { string qScoreString = m->getline(qFile); - //cout << qScoreString << endl; + + if (m->debug) { m->mothurOut("[DEBUG]: scores = '" + qScoreString + "'\n."); } + while(qFile.peek() != '>' && qFile.peek() != EOF){ if (m->control_pressed) { break; } string temp = m->getline(qFile); - //cout << temp << endl; + if (m->debug) { m->mothurOut("[DEBUG]: scores = '" + temp + "'\n."); } qScoreString += ' ' + temp; } //cout << "done reading " << endl; @@ -51,6 +55,8 @@ QualityScores::QualityScores(ifstream& qFile){ string temp; qScoreStringStream >> temp; m->gobble(qScoreStringStream); + if (m->debug) { m->mothurOut("[DEBUG]: score " + toString(qScores.size()) + " = '" + temp + "'\n."); } + //check temp to make sure its a number if (!m->isContainingOnlyDigits(temp)) { m->mothurOut("[ERROR]: In sequence " + seqName + "'s quality scores, expected a number and got " + temp + ", setting score to 0."); m->mothurOutEndLine(); temp = "0"; } convert(temp, score); diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp index 67e43aa..0a6eae9 100644 --- a/seqerrorcommand.cpp +++ b/seqerrorcommand.cpp @@ -1381,6 +1381,10 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam string sname = ""; nameStream >> sname; sname = sname.substr(1); + + for (int i = 0; i < sname.length(); i++) { + if (sname[i] == ':') { sname[i] = '_'; m->changedSeqNames = true; } + } map::iterator it = firstSeqNames.find(sname); @@ -1441,6 +1445,10 @@ int SeqErrorCommand::setLines(string filename, string qfilename, string rfilenam istringstream nameStream(input); string sname = ""; nameStream >> sname; + for (int i = 0; i < sname.length(); i++) { + if (sname[i] == ':') { sname[i] = '_'; m->changedSeqNames = true; } + } + map::iterator it = firstSeqNamesReport.find(sname); if(it != firstSeqNamesReport.end()) { //this is the start of a new chunk diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp index a1b7066..0148ab1 100644 --- a/sffinfocommand.cpp +++ b/sffinfocommand.cpp @@ -1039,7 +1039,11 @@ int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& pr if (trim) { if(header.clipQualRight < header.clipQualLeft){ - seq = "NNNN"; + if (header.clipQualRight == 0) { //don't trim right + seq = seq.substr(header.clipQualLeft-1); + }else { + seq = "NNNN"; + } } else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){ seq = seq.substr((header.clipQualLeft-1), (header.clipQualRight-header.clipQualLeft)); @@ -1260,7 +1264,11 @@ int SffInfoCommand::printFastaSeqData(ofstream& out, seqRead& read, Header& head if (trim) { if(header.clipQualRight < header.clipQualLeft){ - seq = "NNNN"; + if (header.clipQualRight == 0) { //don't trim right + seq = seq.substr(header.clipQualLeft-1); + }else { + seq = "NNNN"; + } } else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){ seq = seq.substr((header.clipQualLeft-1), (header.clipQualRight-header.clipQualLeft)); @@ -1296,8 +1304,13 @@ int SffInfoCommand::printQualSeqData(ofstream& out, seqRead& read, Header& heade if (trim) { if(header.clipQualRight < header.clipQualLeft){ - out << ">" << header.name << " xy=" << header.xy << endl; - out << "0\t0\t0\t0"; + if (header.clipQualRight == 0) { //don't trim right + out << ">" << header.name << " xy=" << header.xy << " length=" << (read.qualScores.size()-header.clipQualLeft) << endl; + for (int i = (header.clipQualLeft-1); i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } + }else { + out << ">" << header.name << " xy=" << header.xy << endl; + out << "0\t0\t0\t0"; + } } else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){ out << ">" << header.name << " xy=" << header.xy << " length=" << (header.clipQualRight-header.clipQualLeft) << endl; @@ -1325,6 +1338,8 @@ int SffInfoCommand::printQualSeqData(ofstream& out, seqRead& read, Header& heade //********************************************************************************************************************** int SffInfoCommand::printFlowSeqData(ofstream& out, seqRead& read, Header& header) { try { + if (header.clipQualRight == 0) { header.clipQualRight = read.flowgram.size(); } + if(header.clipQualRight > header.clipQualLeft){ int rightIndex = 0; diff --git a/trimoligos.cpp b/trimoligos.cpp index ecdca9b..a53f4e8 100644 --- a/trimoligos.cpp +++ b/trimoligos.cpp @@ -144,6 +144,24 @@ TrimOligos::TrimOligos(int p, int b, map pr, map br, v barcodes = br; primers = pr; revPrimer = r; + + maxFBarcodeLength = 0; + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(oligo.length() > maxFBarcodeLength){ + maxFBarcodeLength = oligo.length(); + } + } + maxRBarcodeLength = maxFBarcodeLength; + + maxFPrimerLength = 0; + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + if(oligo.length() > maxFPrimerLength){ + maxFPrimerLength = oligo.length(); + } + } + maxRPrimerLength = maxFPrimerLength; } catch(exception& e) { m->errorOut(e, "TrimOligos", "TrimOligos"); @@ -152,7 +170,135 @@ TrimOligos::TrimOligos(int p, int b, map pr, map br, v } /********************************************************************/ TrimOligos::~TrimOligos() {} +//********************************************************************/ +bool TrimOligos::findForward(Sequence& seq, int& primerStart, int& primerEnd){ + try { + + string rawSequence = seq.getUnaligned(); + + for(map::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + + if(rawSequence.length() < oligo.length()) { break; } + + //search for primer + int olength = oligo.length(); + for (int j = 0; j < rawSequence.length()-olength; j++){ + if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } + string rawChunk = rawSequence.substr(j, olength); + if(compareDNASeq(oligo, rawChunk)) { + primerStart = j; + primerEnd = primerStart + olength; + return true; + } + + } + } + + primerStart = 0; primerEnd = 0; + //if you don't want to allow for diffs + if (pdiffs == 0) { return false; } + else { //try aligning and see if you can find it + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + + Alignment* alignment; + if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } + + for(map::iterator it=primers.begin();it!=primers.end();it++){ + + string prim = it->first; + //search for primer + int olength = prim.length(); + if (rawSequence.length() < olength) {} //ignore primers too long for this seq + else{ + for (int j = 0; j < rawSequence.length()-olength; j++){ + + string oligo = it->first; + + if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } + + string rawChunk = rawSequence.substr(j, olength+pdiffs); + + //use needleman to align first primer.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawChunk); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + primerStart = j; + primerEnd = primerStart + alnLength; + }else if(numDiff == minDiff){ minCount++; } + } + } + } + + if (alignment != NULL) { delete alignment; } + + if(minDiff > pdiffs) { primerStart = 0; primerEnd = 0; return false; } //no good matches + else if(minCount > 1) { primerStart = 0; primerEnd = 0; return false; } //can't tell the difference between multiple primers + else{ return true; } + } + + primerStart = 0; primerEnd = 0; + return false; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripForward"); + exit(1); + } +} +//******************************************************************/ +bool TrimOligos::findReverse(Sequence& seq, int& primerStart, int& primerEnd){ + try { + + string rawSequence = seq.getUnaligned(); + + for(int i=0;i= 0; j--){ + if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } + string rawChunk = rawSequence.substr(j, olength); + + if(compareDNASeq(oligo, rawChunk)) { + primerStart = j; + primerEnd = primerStart + olength; + return true; + } + + } + } + + primerStart = 0; primerEnd = 0; + return false; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "findReverse"); + exit(1); + } +} //*******************************************************************/ + int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ try { diff --git a/trimoligos.h b/trimoligos.h index 8aa44bd..d72b814 100644 --- a/trimoligos.h +++ b/trimoligos.h @@ -50,6 +50,11 @@ class TrimOligos { bool stripSpacer(Sequence&); bool stripSpacer(Sequence&, QualityScores&); + + //seq, primerStart, primerEnd + bool findForward(Sequence&, int&, int&); + bool findReverse(Sequence&, int&, int&); + private: