From 1244c4907c07baea86b0f0676d098a29d2e95a39 Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 9 Sep 2009 20:20:23 +0000 Subject: [PATCH] continued work on chimeras and fixed bug in trim.seqs and reverse.seqs that was due to the fact that we changed the sequence class so that all seqs now have an aligned version. that change was made to fix another bug. --- Mothur.xcodeproj/project.pbxproj | 12 +- aligncommand.cpp | 5 + alignedsimilarity.cpp | 543 ------------------------------- alignedsimilarity.h | 67 ---- alignment.cpp | 15 + alignment.hpp | 2 + ccode.cpp | 11 - ccode.h | 1 - chimera.h | 7 +- chimeracheckrdp.cpp | 295 +++++++++++++++++ chimeracheckrdp.h | 72 ++++ chimeraseqscommand.cpp | 28 +- chimeraseqscommand.h | 2 +- database.cpp | 1 + kmer.cpp | 26 ++ kmer.hpp | 2 +- kmerdb.hpp | 1 + nast.cpp | 16 +- needlemanoverlap.cpp | 7 +- needlemanoverlap.hpp | 1 + sequence.cpp | 1 + trimseqscommand.cpp | 7 +- 22 files changed, 465 insertions(+), 657 deletions(-) delete mode 100644 alignedsimilarity.cpp delete mode 100644 alignedsimilarity.h create mode 100644 chimeracheckrdp.cpp create mode 100644 chimeracheckrdp.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 75ef0d5..6aa6b27 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -162,7 +162,7 @@ A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; }; A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; }; A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; }; - A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B8879104C16860083C454 /* alignedsimilarity.cpp */; }; + A7283FF81056CAE100D0CC69 /* chimeracheckrdp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */; }; A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; }; EB1216880F619B83004A865F /* bergerparker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216870F619B83004A865F /* bergerparker.cpp */; }; EB1216E50F61ACFB004A865F /* bstick.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216E40F61ACFB004A865F /* bstick.cpp */; }; @@ -515,8 +515,8 @@ A70B53A70F4CD7AD0064797E /* getlabelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlabelcommand.h; sourceTree = SOURCE_ROOT; }; A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlinecommand.cpp; sourceTree = SOURCE_ROOT; }; A70B53A90F4CD7AD0064797E /* getlinecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlinecommand.h; sourceTree = SOURCE_ROOT; }; - A75B8879104C16860083C454 /* alignedsimilarity.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignedsimilarity.cpp; sourceTree = SOURCE_ROOT; }; - A75B887A104C16860083C454 /* alignedsimilarity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignedsimilarity.h; sourceTree = SOURCE_ROOT; }; + A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckrdp.h; sourceTree = ""; }; + A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckrdp.cpp; sourceTree = ""; }; A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = SOURCE_ROOT; }; A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = SOURCE_ROOT; }; C6859E8B029090EE04C91782 /* Mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = Mothur.1; sourceTree = ""; }; @@ -649,12 +649,12 @@ children = ( 374F2FD51006320200E97C66 /* chimera.h */, 372095C1103196D70004D347 /* chimera.cpp */, - A75B887A104C16860083C454 /* alignedsimilarity.h */, - A75B8879104C16860083C454 /* alignedsimilarity.cpp */, 374F2FE9100634B600E97C66 /* bellerophon.h */, 374F2FEA100634B600E97C66 /* bellerophon.cpp */, A75B887C104C16860083C454 /* ccode.h */, A75B887B104C16860083C454 /* ccode.cpp */, + A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */, + A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */, 372A55531017922B00C5194B /* decalc.h */, 372A55541017922B00C5194B /* decalc.cpp */, 374F301110063B6F00E97C66 /* pintail.h */, @@ -1177,8 +1177,8 @@ 374F301310063B6F00E97C66 /* pintail.cpp in Sources */, 372A55551017922B00C5194B /* decalc.cpp in Sources */, 372095C2103196D70004D347 /* chimera.cpp in Sources */, - A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */, A75B887E104C16860083C454 /* ccode.cpp in Sources */, + A7283FF81056CAE100D0CC69 /* chimeracheckrdp.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp index 958f29b..da807bd 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -272,6 +272,11 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName){ for(int i=0;inumSeqs;i++){ Sequence* candidateSeq = new Sequence(inFASTA); + + if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { + alignment->resize(candidateSeq->getUnaligned().length()+1); + } + report.setCandidate(candidateSeq); Sequence temp = templateDB->findClosestSequence(candidateSeq); diff --git a/alignedsimilarity.cpp b/alignedsimilarity.cpp deleted file mode 100644 index 0ed9ce3..0000000 --- a/alignedsimilarity.cpp +++ /dev/null @@ -1,543 +0,0 @@ -/* - * alignedsimilarity.cpp - * Mothur - * - * Created by westcott on 8/18/09. - * Copyright 2009 Schloss Lab. All rights reserved. - * - */ - -#include "alignedsimilarity.h" -#include "ignoregaps.h" - - -//*************************************************************************************************************** -AlignSim::AlignSim(string filename, string temp) { fastafile = filename; templateFile = temp; } -//*************************************************************************************************************** - -AlignSim::~AlignSim() { - try { - for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } - for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } - delete distCalc; - } - catch(exception& e) { - errorOut(e, "AlignSim", "~AlignSim"); - exit(1); - } -} -//*************************************************************************************************************** -void AlignSim::print(ostream& out) { - try { - - mothurOutEndLine(); - - for (int i = 0; i < querySeqs.size(); i++) { - - int j = 0; float largest = -10; - //find largest sim value - for (int k = 0; k < IS[i].size(); k++) { - //is this score larger - if (IS[i][k].score > largest) { - j = k; - largest = IS[i][k].score; - } - } - - //find parental similarity - distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent)); - float dist = distCalc->getDist(); - - //convert to similarity - dist = (1 - dist) * 100; - - //warn about parental similarity - if its above 82% may not detect a chimera - if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly."); mothurOutEndLine(); } - - int index = ceil(dist); - - if (index == 0) { index=1; } - - //is your DE value higher than the 95% - string chimera; - if (IS[i][j].score > quantile[index-1][4]) { chimera = "Yes"; } - else { chimera = "No"; } - - out << querySeqs[i]->getName() << "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl; - - if (chimera == "Yes") { - mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine(); - } - out << "Improvement Score\t"; - - for (int r = 0; r < IS[i].size(); r++) { out << IS[i][r].score << '\t'; } - out << endl; - } - } - catch(exception& e) { - errorOut(e, "AlignSim", "print"); - exit(1); - } -} - -//*************************************************************************************************************** -void AlignSim::getChimeras() { - try { - - //read in query sequences and subject sequences - mothurOut("Reading sequences and template file... "); cout.flush(); - querySeqs = readSeqs(fastafile); - templateSeqs = readSeqs(templateFile); - mothurOut("Done."); mothurOutEndLine(); - - int numSeqs = querySeqs.size(); - - IS.resize(numSeqs); - - //break up file if needed - int linesPerProcess = numSeqs / processors ; - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - //find breakup of sequences for all times we will Parallelize - if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } - else { - //fill line pairs - for (int i = 0; i < (processors-1); i++) { - lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); - } - //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end - int i = processors - 1; - lines.push_back(new linePair((i*linesPerProcess), numSeqs)); - } - - //find breakup of templatefile for quantiles - if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); } - else { - for (int i = 0; i < processors; i++) { - templateLines.push_back(new linePair()); - templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size()); - templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size()); - } - } - #else - lines.push_back(new linePair(0, numSeqs)); - templateLines.push_back(new linePair(0, templateSeqs.size())); - #endif - - - distCalc = new ignoreGaps(); - - //find window breaks - windowBreak = findWindows(); -//for (int i = 0; i < windowBreak.size(); i++) { cout << windowBreak[i] << '\t'; } -//cout << endl; - - mothurOut("Finding the IS values for your sequences..."); cout.flush(); - if (processors == 1) { - IS = findIS(lines[0]->start, lines[0]->end, querySeqs); - }else { IS = createProcessesIS(querySeqs, lines); } - mothurOut("Done."); mothurOutEndLine(); - - //quantiles are used to determine whether the de values found indicate a chimera - if (quanfile != "") { quantile = readQuantiles(); } - else { - - mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .alignedsimilarity.quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); - //get IS quantiles as a reference to these IS scores - //you are assuming the template is free of chimeras and therefore will give you a baseline as to the scores you would like to see - if (processors == 1) { - templateIS = findIS(templateLines[0]->start, templateLines[0]->end, templateSeqs); - }else { templateIS = createProcessesIS(templateSeqs, templateLines); } -//cout << "here" << endl; - ofstream out4; - string o; - - o = getRootName(templateFile) + "alignedsimilarity.quan"; - - openOutputFile(o, out4); - - //adjust quantiles - //construct table - quantile.resize(100); - - for (int i = 0; i < templateIS.size(); i++) { - - if (templateIS[i].size() != 0) { - int j = 0; float score = -1000; - //find highest IS score - //cout << templateIS[i].size() << endl; - for (int k = 0; k < templateIS[i].size(); k++) { - if (templateIS[i][k].score > score) { - score = templateIS[i][k].score; - j = k; - } - } - //cout << j << endl; - //find similarity of parents - distCalc->calcDist(*(templateIS[i][j].leftParent), *(templateIS[i][j].rightParent)); - float dist = distCalc->getDist(); - - //convert to similarity - dist = (1 - dist) * 100; - // cout << dist << endl; - int index = ceil(dist); - // cout << "index = " << index << endl; - if (index == 0) { index = 1; } - quantile[index-1].push_back(templateIS[i][j].score); - } - } - - - for (int i = 0; i < quantile.size(); i++) { - vector temp; - - if (quantile[i].size() == 0) { - //in case this is not a distance found in your template files - for (int g = 0; g < 6; g++) { - temp.push_back(0.0); - } - }else{ - - sort(quantile[i].begin(), quantile[i].end()); - - //save 10% - temp.push_back(quantile[i][int(quantile[i].size() * 0.10)]); - //save 25% - temp.push_back(quantile[i][int(quantile[i].size() * 0.25)]); - //save 50% - temp.push_back(quantile[i][int(quantile[i].size() * 0.5)]); - //save 75% - temp.push_back(quantile[i][int(quantile[i].size() * 0.75)]); - //save 95% - temp.push_back(quantile[i][int(quantile[i].size() * 0.95)]); - //save 99% - temp.push_back(quantile[i][int(quantile[i].size() * 0.99)]); - - } - - //output quan value - out4 << i+1 << '\t'; - for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; } - out4 << endl; - - quantile[i] = temp; - - } - - out4.close(); - - mothurOut("Done."); mothurOutEndLine(); - - } - - //free memory - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } - for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } - - } - catch(exception& e) { - errorOut(e, "AlignSim", "getChimeras"); - exit(1); - } -} -//*************************************************************************************************************** -vector AlignSim::findWindows() { - try { - - vector win; - - if (increment > querySeqs[0]->getAligned().length()) { mothurOut("You have selected an increment larger than the length of your sequences. I will use the default of 25."); increment = 25; } - - for (int m = increment; m < (querySeqs[0]->getAligned().length() - increment); m+=increment) { win.push_back(m); } - - return win; - - } - catch(exception& e) { - errorOut(e, "AlignSim", "findWindows"); - exit(1); - } -} - -//*************************************************************************************************************** -vector< vector > AlignSim::findIS(int start, int end, vector seqs) { - try { - - vector< vector > isValues; - isValues.resize(seqs.size()); - - //for each sequence - for(int i = start; i < end; i++){ - - vector temp; temp.resize(windowBreak.size()); - - mothurOut("Finding IS value for sequence " + toString(i)); mothurOutEndLine(); - - //for each window - for (int m = 0; m < windowBreak.size(); m++) { - - vector closest; //left, right, overall - vector similarity; //left+right and overall - - //find closest left, closest right and closest overall - closest = findClosestSides(seqs[i], windowBreak[m], similarity, i); - - int totalNumBases = seqs[i]->getUnaligned().length(); - - //IS = left+right-overall - float is = ((similarity[0]+similarity[1]) / (float) totalNumBases) - (similarity[2] / (float) totalNumBases); - - ///save IS, leftparent, rightparent, breakpoint - temp[m].leftParent = closest[0]; - temp[m].rightParent = closest[1]; - temp[m].score = is; - temp[m].midpoint = windowBreak[m]; -//cout << is << '\t'; - } -//cout << endl; - isValues[i] = temp; - - } - - return isValues; - - } - catch(exception& e) { - errorOut(e, "AlignSim", "findIS"); - exit(1); - } -} -//*************************************************************************************************************** -vector AlignSim::findClosestSides(Sequence* seq, int breakpoint, vector& sim, int i) { - try{ - - vector closest; - - Sequence query, queryLeft, queryRight; - string frag = seq->getAligned(); - string fragLeft = frag.substr(0, breakpoint); - string fragRight = frag.substr(breakpoint, frag.length()); - - //get pieces - query = *(seq); - queryLeft = *(seq); - queryRight = *(seq); - queryLeft.setAligned(fragLeft); - queryRight.setAligned(fragRight); - - //initialize - Sequence* overall = templateSeqs[0]; - Sequence* left = templateSeqs[0]; - Sequence* right = templateSeqs[0]; - - float smallestOverall, smallestLeft, smallestRight; - smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000; - - //go through the templateSeqs and search for the closest - for(int j = 0; j < templateSeqs.size(); j++){ - - //so you don't pick yourself - if (j != i) { - Sequence temp, tempLeft, tempRight; - string fragTemp = templateSeqs[j]->getAligned(); - string fragTempLeft = fragTemp.substr(0, breakpoint); - string fragTempRight = fragTemp.substr(breakpoint, fragTemp.length()); - - //get pieces - temp = *(templateSeqs[j]); - tempLeft = *(templateSeqs[j]); - tempRight = *(templateSeqs[j]); - tempLeft.setAligned(fragTempLeft); - tempRight.setAligned(fragTempRight); - - //find overall dist - distCalc->calcDist(query, temp); - float dist = distCalc->getDist(); - - if (dist < smallestOverall) { - overall = templateSeqs[j]; - smallestOverall = dist; - } - - //find left dist - distCalc->calcDist(queryLeft, tempLeft); - dist = distCalc->getDist(); - - if (dist < smallestLeft) { - left = templateSeqs[j]; - smallestLeft = dist; - } - - //find left dist - distCalc->calcDist(queryRight, tempRight); - dist = distCalc->getDist(); - - if (dist < smallestRight) { - right = templateSeqs[j]; - smallestRight = dist; - } - } - - } - - closest.push_back(left); - closest.push_back(right); - closest.push_back(overall); - - //fill sim with number of matched bases - Sequence tempL = *(left); - string tempLFrag = tempL.getAligned(); - tempL.setAligned(tempLFrag.substr(0, breakpoint)); - - int bothMatches = findNumMatchedBases(queryLeft, tempL); - sim.push_back(bothMatches); - - Sequence tempR = *(right); - string tempRFrag = tempR.getAligned(); - tempR.setAligned(tempRFrag.substr(breakpoint, tempRFrag.length())); - - bothMatches = findNumMatchedBases(queryRight, tempR); - sim.push_back(bothMatches); - - bothMatches = findNumMatchedBases(query, *(overall)); - sim.push_back(bothMatches); - - return closest; - - - } - catch(exception& e) { - errorOut(e, "AlignSim", "findClosestSides"); - exit(1); - } -} -//*************************************************************************************************************** - -int AlignSim::findNumMatchedBases(Sequence seq, Sequence temp) { - try{ - - int num = 0; - - string query = seq.getAligned(); - string subject = temp.getAligned(); -//cout << seq.getName() << endl << endl; -//cout << temp.getName() << endl << endl; - - for (int i = 0; i < query.length(); i++) { - //count matches if query[i] is a base and subject is same bases or gap - if (isalpha(query[i])) { - if(!isalpha(subject[i])) { num++; } - else if (query[i] == subject[i]) { num++; } - else { } - } - } -//cout << "num = " << num << endl; - return num; - } - catch(exception& e) { - errorOut(e, "AlignSim", "findNumMatchedBases"); - exit(1); - } -} - -/**************************************************************************************************/ -vector< vector > AlignSim::createProcessesIS(vector seqs, vector linesToProcess) { - try { - vector< vector > localIs; localIs.resize(seqs.size()); - -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - mothurOut("Finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine(); - localIs = findIS(linesToProcess[process]->start, linesToProcess[process]->end, seqs); - mothurOut("Done finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine(); - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - //output pairs - for (int i = linesToProcess[process]->start; i < linesToProcess[process]->end; i++) { - out << localIs[i].size() << endl; - for (int j = 0; j < localIs[i].size(); j++) { - localIs[i][j].leftParent->printSequence(out); - localIs[i][j].rightParent->printSequence(out); - out << ">" << '\t' << localIs[i][j].score << '\t' << localIs[i][j].midpoint << endl; - } - } - out.close(); - - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;istart; k < linesToProcess[i]->end; k++) { - int size; - in >> size; - gobble(in); - - vector tempVector; - - for (int j = 0; j < size; j++) { - - sim temp; - - temp.leftParent = new Sequence(in); - gobble(in); - //temp.leftParent->printSequence(cout); - temp.rightParent = new Sequence(in); - gobble(in); -//temp.rightParent->printSequence(cout); - string throwaway; - in >> throwaway >> temp.score >> temp.midpoint; - //cout << temp.score << '\t' << temp.midpoint << endl; - gobble(in); - - tempVector.push_back(temp); - } - - localIs[k] = tempVector; - } - - in.close(); - remove(s.c_str()); - } - - -#else - localIs = findIS(linesToProcess[0]->start, linesToProcess[0]->end, seqs); -#endif - - return localIs; - } - catch(exception& e) { - errorOut(e, "AlignSim", "createProcessesIS"); - exit(1); - } -} - -//*************************************************************************************************************** diff --git a/alignedsimilarity.h b/alignedsimilarity.h deleted file mode 100644 index 716b8b7..0000000 --- a/alignedsimilarity.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef ALIGNSIM_H -#define ALIGNSIM_H - -/* - * alignedsimilarity.h - * Mothur - * - * Created by westcott on 8/18/09. - * Copyright 2009 Schloss Lab. All rights reserved. - * - */ - - -#include "chimera.h" -#include "dist.h" - -//This class was created using the algorythms described in the -//"Evaluation of Nearest Neighbor Methods for Detection of Chimeric Small-Subunit rRna Sequences" paper -//by J.F. Robison-Cox 1, M.M. Bateson 2 and D.M. Ward 2 - - -/***********************************************************/ - -class AlignSim : public Chimera { - - public: - AlignSim(string, string); - ~AlignSim(); - - void getChimeras(); - void print(ostream&); - - void setCons(string){}; - void setQuantiles(string q) { quanfile = q; }; - - - private: - - Dist* distCalc; - vector lines; - vector templateLines; - - vector querySeqs; - vector templateSeqs; - - vector< vector > IS; //IS[0] is the vector os sim values for each window for querySeqs[0] - vector< vector > templateIS; //templateIS[0] is the vector os sim values for each window for templateSeqs[0] - vector windowBreak; - - string fastafile, templateFile; - int iters; - - vector< vector > findIS(int, int, vector); - vector findClosestSides(Sequence*, int, vector&, int); - int findNumMatchedBases(Sequence, Sequence); - vector findWindows(); - - vector< vector > quantile; - - vector< vector > createProcessesIS(vector, vector); - -}; - -/***********************************************************/ - -#endif - diff --git a/alignment.cpp b/alignment.cpp index 25b2760..2734c38 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -31,7 +31,22 @@ Alignment::Alignment(int A) : nCols(A), nRows(A) { exit(1); } } +/**************************************************************************************************/ +void Alignment::resize(int A) { + try { + nCols = A; + nRows = A; + alignment.resize(nRows); + for(int i=0;istart = int (sqrt(float(i)/float(processors)) * templateSeqs.size()); - templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size()); - } - } #else lines.push_back(new linePair(0, numSeqs)); - templateLines.push_back(new linePair(0, templateSeqs.size())); #endif distCalc = new eachGapDist(); @@ -305,7 +295,6 @@ void Ccode::getChimeras() { //free memory for (int i = 0; i < lines.size(); i++) { delete lines[i]; } - for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } delete distCalc; delete decalc; diff --git a/ccode.h b/ccode.h index 2888aea..bc1aad8 100644 --- a/ccode.h +++ b/ccode.h @@ -42,7 +42,6 @@ class Ccode : public Chimera { vector lines; - vector templateLines; vector querySeqs; vector templateSeqs; vector< map > spotMap; diff --git a/chimera.h b/chimera.h index 366068b..8d7e08d 100644 --- a/chimera.h +++ b/chimera.h @@ -39,8 +39,8 @@ inline bool compareSeqDist(SeqDist left, SeqDist right){ //******************************************************************************************************************** struct sim { - Sequence* leftParent; - Sequence* rightParent; + string leftParent; + string rightParent; float score; int midpoint; }; @@ -68,6 +68,7 @@ class Chimera { virtual void setWindow(int w) { window = w; } virtual void setIncrement(int i) { increment = i; } virtual void setNumWanted(int n) { numWanted = n; } + virtual void setKmerSize(int k) { kmerSize = k; } virtual void setCons(string){}; virtual void setQuantiles(string){}; @@ -85,7 +86,7 @@ class Chimera { protected: bool filter, correction; - int processors, window, increment, numWanted; + int processors, window, increment, numWanted, kmerSize; string seqMask, quanfile, filterString; diff --git a/chimeracheckrdp.cpp b/chimeracheckrdp.cpp new file mode 100644 index 0000000..73bc1a0 --- /dev/null +++ b/chimeracheckrdp.cpp @@ -0,0 +1,295 @@ +/* + * chimeracheckrdp.cpp + * Mothur + * + * Created by westcott on 9/8/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "chimeracheckrdp.h" + +//*************************************************************************************************************** +ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp) { fastafile = filename; templateFile = temp; } +//*************************************************************************************************************** + +ChimeraCheckRDP::~ChimeraCheckRDP() { + try { + for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } + delete templateDB; + delete kmer; + } + catch(exception& e) { + errorOut(e, "ChimeraCheckRDP", "~AlignSim"); + exit(1); + } +} +//*************************************************************************************************************** +void ChimeraCheckRDP::print(ostream& out) { + try { + + mothurOutEndLine(); + /* + for (int i = 0; i < querySeqs.size(); i++) { + + int j = 0; float largest = -10; + //find largest sim value + for (int k = 0; k < IS[i].size(); k++) { + //is this score larger + if (IS[i][k].score > largest) { + j = k; + largest = IS[i][k].score; + } + } + + //find parental similarity + distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent)); + float dist = distCalc->getDist(); + + //convert to similarity + dist = (1 - dist) * 100; + + //warn about parental similarity - if its above 82% may not detect a chimera + if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly."); mothurOutEndLine(); } + + int index = ceil(dist); + + if (index == 0) { index=1; } + + //is your DE value higher than the 95% + string chimera; + if (IS[i][j].score > quantile[index-1][4]) { chimera = "Yes"; } + else { chimera = "No"; } + + out << querySeqs[i]->getName() << "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl; + + if (chimera == "Yes") { + mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine(); + } + out << "Improvement Score\t"; + + for (int r = 0; r < IS[i].size(); r++) { out << IS[i][r].score << '\t'; } + out << endl; + }*/ + } + catch(exception& e) { + errorOut(e, "ChimeraCheckRDP", "print"); + exit(1); + } +} + +//*************************************************************************************************************** +void ChimeraCheckRDP::getChimeras() { + try { + + //read in query sequences and subject sequences + mothurOut("Reading sequences and template file... "); cout.flush(); + querySeqs = readSeqs(fastafile); + //templateSeqs = readSeqs(templateFile); + templateDB = new KmerDB(templateFile, kmerSize); + mothurOut("Done."); mothurOutEndLine(); + + int numSeqs = querySeqs.size(); + + IS.resize(numSeqs); + closest.resize(numSeqs); + + //break up file if needed + int linesPerProcess = numSeqs / processors ; + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //find breakup of sequences for all times we will Parallelize + if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } + else { + //fill line pairs + for (int i = 0; i < (processors-1); i++) { + lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); + } + //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end + int i = processors - 1; + lines.push_back(new linePair((i*linesPerProcess), numSeqs)); + } + + #else + lines.push_back(new linePair(0, numSeqs)); + #endif + + kmer = new Kmer(kmerSize); + + //find closest seq to each querySeq + for (int i = 0; i < querySeqs.size(); i++) { + closest[i] = templateDB->findClosestSequence(querySeqs[i]); + } + + //fill seqKmerInfo for query seqs + for (int i = 0; i < querySeqs.size(); i++) { + seqKmerInfo[querySeqs[i]->getName()] = kmer->getKmerCounts(querySeqs[i]->getUnaligned()); + } + + //fill seqKmerInfo for closest + for (int i = 0; i < closest.size(); i++) { + seqKmerInfo[closest[i].getName()] = kmer->getKmerCounts(closest[i].getUnaligned()); + } + + + //for each query find IS value - this should be paralellized, + //but paralellizing may cause you to have to recalculate some seqKmerInfo since the separate processes don't share memory after they split + for (int i = 0; i < querySeqs.size(); i++) { + IS[i] = findIS(i); //fills seqKmerInfo + } + + + //free memory + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } + + + } + catch(exception& e) { + errorOut(e, "ChimeraCheckRDP", "getChimeras"); + exit(1); + } +} +//*************************************************************************************************************** +vector ChimeraCheckRDP::findIS(int query) { + try { + + vector isValues; + string queryName = querySeqs[query]->getName(); + string seq = querySeqs[query]->getUnaligned(); + + mothurOut("Finding IS values for sequence " + query); mothurOutEndLine(); + + //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each + int nTotal = calcKmers(seqKmerInfo[queryName][(seqKmerInfo[queryName].size()-1)], seqKmerInfo[closest[query].getName()][(seqKmerInfo[closest[query].getName()].size()-1)]); + + //you don't want the starting point to be virtually at hte end so move it in 10% + int start = seq.length() / 10; + + //for each window + for (int m = start; m < (seq.length() - start); m+=increment) { + + sim temp; + + string fragLeft = seq.substr(0, m); //left side of breakpoint + string fragRight = seq.substr(m, seq.length()); //right side of breakpoint + + //make a sequence of the left side and right side + Sequence* left = new Sequence(queryName, fragLeft); + Sequence* right = new Sequence(queryName, fragRight); + + //find seqs closest to each fragment + Sequence closestLeft = templateDB->findClosestSequence(left); + Sequence closestRight = templateDB->findClosestSequence(right); + + map::iterator itleft; + map::iterator itleftclose; + + //get kmer in the closest seqs + //if it's not found calc kmer info and save, otherwise use already calculated data + //left + it = seqKmerInfo.find(closestLeft.getName()); + if (it == seqKmerInfo.end()) { //you have to calc it + seqKmerInfo[closestLeft.getName()] = kmer->getKmerCounts(closestLeft.getUnaligned()); + } + + //right + it = seqKmerInfo.find(closestRight.getName()); + if (it == seqKmerInfo.end()) { //you have to calc it + seqKmerInfo[closestRight.getName()] = kmer->getKmerCounts(closestRight.getUnaligned()); + } + + //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side + //iterate through left sides map to subtract the number of times you saw things before you got the the right side + map rightside; + for (itleft = seqKmerInfo[queryName][m-kmerSize].begin(); itleft != seqKmerInfo[queryName][m-kmerSize].end(); itleft++) { + int howManyTotal = seqKmerInfo[queryName][seqKmerInfo[queryName].size()-1][itleft->first]; //times that kmer was seen in total + + //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side + int howmanyright = howManyTotal - itleft->second; + + //if any were seen just on the right add that ammount to map + if (howmanyright > 0) { + rightside[itleft->first] = howmanyright; + } + } + + //iterate through left side of the seq closest to the right fragment of query to subtract the number you saw before you reached the right side of the closest right + //this way you can get the map for just the fragment you want to compare and not hte whole sequence + map rightsideclose; + for (itleftclose = seqKmerInfo[closestRight.getName()][m-kmerSize].begin(); itleftclose != seqKmerInfo[closestRight.getName()][m-kmerSize].end(); itleftclose++) { + int howManyTotal = seqKmerInfo[closestRight.getName()][seqKmerInfo[closestRight.getName()].size()-1][itleftclose->first]; //times that kmer was seen in total + + //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side + int howmanyright = howManyTotal - itleftclose->second; + + //if any were seen just on the right add that ammount to map + if (howmanyright > 0) { + rightsideclose[itleftclose->first] = howmanyright; + } + } + + int nLeft = calcKmers(seqKmerInfo[closestLeft.getName()][m-kmerSize], seqKmerInfo[queryName][m-kmerSize]); + int nRight = calcKmers(rightsideclose, rightside); + + int is = nLeft + nRight - nTotal; + + //save IS, leftparent, rightparent, breakpoint + temp.leftParent = closestLeft.getName(); + temp.rightParent = closestRight.getName(); + temp.score = is; + temp.midpoint = m; + + isValues.push_back(temp); + + delete left; + delete right; + } + + return isValues; + + } + catch(exception& e) { + errorOut(e, "ChimeraCheckRDP", "findIS"); + exit(1); + } +} + +//*************************************************************************************************************** +//find the smaller map and iterate through it and count kmers in common +int ChimeraCheckRDP::calcKmers(map query, map subject) { + try{ + + int common = 0; + map::iterator small; + map::iterator large; + + if (query.size() < subject.size()) { + + for (small = query.begin(); small != query.end(); small++) { + large = subject.find(small->first); + + //if you found it they have that kmer in common + if (large != subject.end()) { common++; } + } + + }else { + + for (small = subject.begin(); small != subject.end(); small++) { + large = query.find(small->first); + + //if you found it they have that kmer in common + if (large != query.end()) { common++; } + } + } + + return common; + + } + catch(exception& e) { + errorOut(e, "ChimeraCheckRDP", "calcKmers"); + exit(1); + } +} + +//*************************************************************************************************************** + diff --git a/chimeracheckrdp.h b/chimeracheckrdp.h new file mode 100644 index 0000000..e31cae7 --- /dev/null +++ b/chimeracheckrdp.h @@ -0,0 +1,72 @@ +#ifndef CHIMERACHECK_H +#define CHIMERACHECK_H + +/* + * chimeracheckrdp.h + * Mothur + * + * Created by westcott on 9/8/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + + +#include "chimera.h" +#include "kmer.hpp" +#include "kmerdb.hpp" +#include "database.hpp" + +//This class was created using the algorythms described in +//CHIMERA_CHECK version 2.7 written by Niels Larsen. + +/***********************************************************/ + +class ChimeraCheckRDP : public Chimera { + + public: + ChimeraCheckRDP(string, string); + ~ChimeraCheckRDP(); + + void getChimeras(); + void print(ostream&); + + void setCons(string){}; + void setQuantiles(string q) {}; + + + private: + + vector lines; + vector querySeqs; + Database* templateDB; + Kmer* kmer; + + vector< vector > IS; //IS[0] is the vector of IS values for each window for querySeqs[0] + + //map of vector of maps- I know its a little convaluted but I am trying to save time + //I think that since the window is only sliding 10 bases there is a good probability that the closest seq to each fragment + //will be the same for several windows so I want to save the vector of maps containing its kmer info rather than regenerating it. + //So... + map > > seqKmerInfo; // outer map - sequence name -> kmer info + // kmer info: inner vector of maps - each entry in the vector is a map of the kmers up to that spot in the unaligned seq + //example: seqKmerInfo["ecoli"][50] = map containing the kmers found in the first 50 + kmersize characters of ecoli. + //i chose to store the kmers numbers in a map so you wouldn't have to check for dupilcate entries and could easily find the + //kmers 2 seqs had in common. There may be a better way to do this thats why I am leaving so many comments... + map > >:: iterator it; + map::iterator it2; + + vector closest; //closest[0] is the closest overall seq to querySeqs[0]. + + string fastafile, templateFile; + + + vector findIS(int); + int calcKmers(map, map); + vector< vector > createProcessesIS(vector, vector); + +}; + +/***********************************************************/ + +#endif + diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index 6af8aa3..e8fa34d 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -10,8 +10,8 @@ #include "chimeraseqscommand.h" #include "bellerophon.h" #include "pintail.h" -#include "alignedsimilarity.h" #include "ccode.h" +#include "chimeracheckrdp.h" //*************************************************************************************************************** @@ -25,7 +25,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted" }; + string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -63,7 +63,9 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ if (ableToOpen == 1) { abort = true; } in.close(); } - + + method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; } + string temp; temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; } filter = isTrue(temp); @@ -74,16 +76,21 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } convert(temp, processors); + temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found") { temp = "7"; } + convert(temp, ksize); + temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; } convert(temp, window); - temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; } + temp = validParameter.validFile(parameters, "increment", false); + if ((temp == "not found") && (method == "chimeracheck")) { temp = "10"; } + else if (temp == "not found") { temp = "25"; } convert(temp, increment); - temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "20"; } + temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "20"; } convert(temp, numwanted); - method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; } + if (((method == "pintail") || (method == "alignsim")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail and alignsim methods."); mothurOutEndLine(); abort = true; } @@ -136,10 +143,10 @@ int ChimeraSeqsCommand::execute(){ if (abort == true) { return 0; } - if (method == "bellerophon") { chimera = new Bellerophon(fastafile); } - else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); } - //else if (method == "alignsim") { chimera = new AlignSim(fastafile, templatefile); } - else if (method == "ccode") { chimera = new Ccode(fastafile, templatefile); } + if (method == "bellerophon") { chimera = new Bellerophon(fastafile); } + else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); } + else if (method == "ccode") { chimera = new Ccode(fastafile, templatefile); } + else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, templatefile); } else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; } //set user options @@ -160,6 +167,7 @@ int ChimeraSeqsCommand::execute(){ chimera->setWindow(window); chimera->setIncrement(increment); chimera->setNumWanted(numwanted); + chimera->setKmerSize(ksize); //find chimeras chimera->getChimeras(); diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h index 6274c48..b2bce81 100644 --- a/chimeraseqscommand.h +++ b/chimeraseqscommand.h @@ -30,7 +30,7 @@ private: bool abort; string method, fastafile, templatefile, consfile, quanfile, maskfile; bool filter, correction; - int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted; + int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize; Chimera* chimera; diff --git a/database.cpp b/database.cpp index 9784ef9..f84c9e1 100644 --- a/database.cpp +++ b/database.cpp @@ -92,3 +92,4 @@ int Database::getLongestBase() { return longest+1; } /**************************************************************************************************/ + diff --git a/kmer.cpp b/kmer.cpp index b3bf022..6ee84d3 100644 --- a/kmer.cpp +++ b/kmer.cpp @@ -38,6 +38,32 @@ string Kmer::getKmerString(string sequence){ // Calculate kmer for each position return kmerString; } +/**************************************************************************************************/ + +vector< map > Kmer::getKmerCounts(string sequence){ // Calculate kmer for each position in the sequence, save info in a map + int length = sequence.length(); // so you know at each spot in the sequence what kmers were found + int nKmers = length - kmerSize + 1; // + vector< map > counts; counts.resize(nKmers); // a map kmer counts for each spot + map::iterator it; + + for(int i=0;i > getKmerCounts(string sequence); //for use in chimeraCheck private: char getASCII(int); diff --git a/kmerdb.hpp b/kmerdb.hpp index 5019858..d50c8b2 100644 --- a/kmerdb.hpp +++ b/kmerdb.hpp @@ -20,6 +20,7 @@ */ #include "mothur.h" +#include "database.hpp" class KmerDB : public Database { diff --git a/nast.cpp b/nast.cpp index 804e3c1..294940c 100644 --- a/nast.cpp +++ b/nast.cpp @@ -94,7 +94,7 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl try { int longAlignmentLength = newTemplateAlign.length(); - + for(int i=0; i maxInsertLength){ maxInsertLength = insertLength; } - + if((leftRoom + rightRoom) >= insertLength){ // Parts D & E from Fig. 2 of DeSantis et al. @@ -168,6 +168,7 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl } else{ // not enough room to the right, have to steal some + // space to the left lets move left and then right... string leftTemplateString = newTemplateAlign.substr(0,i); string rightTemplateString = newTemplateAlign.substr(i+insertLength); @@ -183,7 +184,7 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl } } else{ - // there could be a case where there isn't enough room in + // there could be a case where there isn't enough room in string leftTemplateString = newTemplateAlign.substr(0,i); // either direction to move stuff string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom); newTemplateAlign = leftTemplateString + rightTemplateString; @@ -193,9 +194,9 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1); string rightCandidateString = candAln.substr(rightIndex+rightRoom); candAln = leftCandidateString + insertString + rightCandidateString; - + } - + i -= insertLength; } } @@ -337,12 +338,11 @@ void Nast::regapSequences(){ //This is essentially part B in Fig 2. of DeSantis candAln[i] = toupper(candAln[i]); // everything is upper case } - + if(candAln.length() != tempAln.length()){ // if the regapped candidate sequence is longer than the official removeExtraGaps(candAln, tempAln, newTemplateAlign);// template alignment then we need to do steps C-F in Fig. } // 2 of Desantis et al. - - + candidateSeq->setAligned(candAln); } catch(exception& e) { diff --git a/needlemanoverlap.cpp b/needlemanoverlap.cpp index b14d8e3..80ba4ca 100644 --- a/needlemanoverlap.cpp +++ b/needlemanoverlap.cpp @@ -42,8 +42,6 @@ gap(gO), match(m), mismatch(mm), Alignment(r) { // the gap openning penalt errorOut(e, "NeedlemanOverlap", "NeedlemanOverlap"); exit(1); } - - } /**************************************************************************************************/ @@ -56,7 +54,7 @@ void NeedlemanOverlap::align(string A, string B){ seqA = ' ' + A; lA = seqA.length(); // algorithm requires a dummy space at the beginning of each string seqB = ' ' + B; lB = seqB.length(); // algorithm requires a dummy space at the beginning of each string - if (lA > nRows) { mothurOut("Your one of your candidate sequences is longer than you longest template sequence."); mothurOutEndLine(); } + if (lA > nRows) { mothurOut("One of your candidate sequences is longer than you longest template sequence. Your longest template sequence is " + toString(nRows) + ". Your candidate is " + toString(lA) + "."); mothurOutEndLine(); } for(int i=1;i 0 || maxLength > 0){ success = cullLength(currSeq); - if(!success){ trashCode += 'l'; } + if ((currSeq.getUnaligned().length() > 300) && (success)) { cout << "too long " << currSeq.getUnaligned().length() << endl; } + if(!success){ trashCode += 'l'; } } if(maxHomoP > 0){ success = cullHomoP(currSeq); @@ -200,6 +202,7 @@ int TrimSeqsCommand::execute(){ if(flip){ currSeq.reverseComplement(); } // should go last if(trashCode.length() == 0){ + currSeq.setAligned(currSeq.getUnaligned()); //this is because of a modification we made to the sequence class to fix a bug. all seqs have an aligned version, which is the version that gets printed. currSeq.printSequence(outFASTA); if(barcodes.size() != 0){ outGroups << currSeq.getName() << '\t' << groupVector[group] << endl; -- 2.39.2