A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; };
A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; };
A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; };
- A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B8879104C16860083C454 /* alignedsimilarity.cpp */; };
+ A7283FF81056CAE100D0CC69 /* chimeracheckrdp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */; };
A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; };
EB1216880F619B83004A865F /* bergerparker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216870F619B83004A865F /* bergerparker.cpp */; };
EB1216E50F61ACFB004A865F /* bstick.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216E40F61ACFB004A865F /* bstick.cpp */; };
A70B53A70F4CD7AD0064797E /* getlabelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlabelcommand.h; sourceTree = SOURCE_ROOT; };
A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlinecommand.cpp; sourceTree = SOURCE_ROOT; };
A70B53A90F4CD7AD0064797E /* getlinecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlinecommand.h; sourceTree = SOURCE_ROOT; };
- A75B8879104C16860083C454 /* alignedsimilarity.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignedsimilarity.cpp; sourceTree = SOURCE_ROOT; };
- A75B887A104C16860083C454 /* alignedsimilarity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignedsimilarity.h; sourceTree = SOURCE_ROOT; };
+ A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckrdp.h; sourceTree = "<group>"; };
+ A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckrdp.cpp; sourceTree = "<group>"; };
A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = SOURCE_ROOT; };
A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = SOURCE_ROOT; };
C6859E8B029090EE04C91782 /* Mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = Mothur.1; sourceTree = "<group>"; };
children = (
374F2FD51006320200E97C66 /* chimera.h */,
372095C1103196D70004D347 /* chimera.cpp */,
- A75B887A104C16860083C454 /* alignedsimilarity.h */,
- A75B8879104C16860083C454 /* alignedsimilarity.cpp */,
374F2FE9100634B600E97C66 /* bellerophon.h */,
374F2FEA100634B600E97C66 /* bellerophon.cpp */,
A75B887C104C16860083C454 /* ccode.h */,
A75B887B104C16860083C454 /* ccode.cpp */,
+ A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */,
+ A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */,
372A55531017922B00C5194B /* decalc.h */,
372A55541017922B00C5194B /* decalc.cpp */,
374F301110063B6F00E97C66 /* pintail.h */,
374F301310063B6F00E97C66 /* pintail.cpp in Sources */,
372A55551017922B00C5194B /* decalc.cpp in Sources */,
372095C2103196D70004D347 /* chimera.cpp in Sources */,
- A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */,
A75B887E104C16860083C454 /* ccode.cpp in Sources */,
+ A7283FF81056CAE100D0CC69 /* chimeracheckrdp.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
for(int i=0;i<line->numSeqs;i++){
Sequence* candidateSeq = new Sequence(inFASTA);
+
+ if (candidateSeq->getUnaligned().length() > alignment->getnRows()) {
+ alignment->resize(candidateSeq->getUnaligned().length()+1);
+ }
+
report.setCandidate(candidateSeq);
Sequence temp = templateDB->findClosestSequence(candidateSeq);
+++ /dev/null
-/*
- * alignedsimilarity.cpp
- * Mothur
- *
- * Created by westcott on 8/18/09.
- * Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "alignedsimilarity.h"
-#include "ignoregaps.h"
-
-
-//***************************************************************************************************************
-AlignSim::AlignSim(string filename, string temp) { fastafile = filename; templateFile = temp; }
-//***************************************************************************************************************
-
-AlignSim::~AlignSim() {
- try {
- for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
- for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
- delete distCalc;
- }
- catch(exception& e) {
- errorOut(e, "AlignSim", "~AlignSim");
- exit(1);
- }
-}
-//***************************************************************************************************************
-void AlignSim::print(ostream& out) {
- try {
-
- mothurOutEndLine();
-
- for (int i = 0; i < querySeqs.size(); i++) {
-
- int j = 0; float largest = -10;
- //find largest sim value
- for (int k = 0; k < IS[i].size(); k++) {
- //is this score larger
- if (IS[i][k].score > largest) {
- j = k;
- largest = IS[i][k].score;
- }
- }
-
- //find parental similarity
- distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
- float dist = distCalc->getDist();
-
- //convert to similarity
- dist = (1 - dist) * 100;
-
- //warn about parental similarity - if its above 82% may not detect a chimera
- if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly."); mothurOutEndLine(); }
-
- int index = ceil(dist);
-
- if (index == 0) { index=1; }
-
- //is your DE value higher than the 95%
- string chimera;
- if (IS[i][j].score > quantile[index-1][4]) { chimera = "Yes"; }
- else { chimera = "No"; }
-
- out << querySeqs[i]->getName() << "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
-
- if (chimera == "Yes") {
- mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
- }
- out << "Improvement Score\t";
-
- for (int r = 0; r < IS[i].size(); r++) { out << IS[i][r].score << '\t'; }
- out << endl;
- }
- }
- catch(exception& e) {
- errorOut(e, "AlignSim", "print");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
-void AlignSim::getChimeras() {
- try {
-
- //read in query sequences and subject sequences
- mothurOut("Reading sequences and template file... "); cout.flush();
- querySeqs = readSeqs(fastafile);
- templateSeqs = readSeqs(templateFile);
- mothurOut("Done."); mothurOutEndLine();
-
- int numSeqs = querySeqs.size();
-
- IS.resize(numSeqs);
-
- //break up file if needed
- int linesPerProcess = numSeqs / processors ;
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- //find breakup of sequences for all times we will Parallelize
- if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
- else {
- //fill line pairs
- for (int i = 0; i < (processors-1); i++) {
- lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
- }
- //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
- int i = processors - 1;
- lines.push_back(new linePair((i*linesPerProcess), numSeqs));
- }
-
- //find breakup of templatefile for quantiles
- if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
- else {
- for (int i = 0; i < processors; i++) {
- templateLines.push_back(new linePair());
- templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
- templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
- }
- }
- #else
- lines.push_back(new linePair(0, numSeqs));
- templateLines.push_back(new linePair(0, templateSeqs.size()));
- #endif
-
-
- distCalc = new ignoreGaps();
-
- //find window breaks
- windowBreak = findWindows();
-//for (int i = 0; i < windowBreak.size(); i++) { cout << windowBreak[i] << '\t'; }
-//cout << endl;
-
- mothurOut("Finding the IS values for your sequences..."); cout.flush();
- if (processors == 1) {
- IS = findIS(lines[0]->start, lines[0]->end, querySeqs);
- }else { IS = createProcessesIS(querySeqs, lines); }
- mothurOut("Done."); mothurOutEndLine();
-
- //quantiles are used to determine whether the de values found indicate a chimera
- if (quanfile != "") { quantile = readQuantiles(); }
- else {
-
- mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .alignedsimilarity.quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
- //get IS quantiles as a reference to these IS scores
- //you are assuming the template is free of chimeras and therefore will give you a baseline as to the scores you would like to see
- if (processors == 1) {
- templateIS = findIS(templateLines[0]->start, templateLines[0]->end, templateSeqs);
- }else { templateIS = createProcessesIS(templateSeqs, templateLines); }
-//cout << "here" << endl;
- ofstream out4;
- string o;
-
- o = getRootName(templateFile) + "alignedsimilarity.quan";
-
- openOutputFile(o, out4);
-
- //adjust quantiles
- //construct table
- quantile.resize(100);
-
- for (int i = 0; i < templateIS.size(); i++) {
-
- if (templateIS[i].size() != 0) {
- int j = 0; float score = -1000;
- //find highest IS score
- //cout << templateIS[i].size() << endl;
- for (int k = 0; k < templateIS[i].size(); k++) {
- if (templateIS[i][k].score > score) {
- score = templateIS[i][k].score;
- j = k;
- }
- }
- //cout << j << endl;
- //find similarity of parents
- distCalc->calcDist(*(templateIS[i][j].leftParent), *(templateIS[i][j].rightParent));
- float dist = distCalc->getDist();
-
- //convert to similarity
- dist = (1 - dist) * 100;
- // cout << dist << endl;
- int index = ceil(dist);
- // cout << "index = " << index << endl;
- if (index == 0) { index = 1; }
- quantile[index-1].push_back(templateIS[i][j].score);
- }
- }
-
-
- for (int i = 0; i < quantile.size(); i++) {
- vector<float> temp;
-
- if (quantile[i].size() == 0) {
- //in case this is not a distance found in your template files
- for (int g = 0; g < 6; g++) {
- temp.push_back(0.0);
- }
- }else{
-
- sort(quantile[i].begin(), quantile[i].end());
-
- //save 10%
- temp.push_back(quantile[i][int(quantile[i].size() * 0.10)]);
- //save 25%
- temp.push_back(quantile[i][int(quantile[i].size() * 0.25)]);
- //save 50%
- temp.push_back(quantile[i][int(quantile[i].size() * 0.5)]);
- //save 75%
- temp.push_back(quantile[i][int(quantile[i].size() * 0.75)]);
- //save 95%
- temp.push_back(quantile[i][int(quantile[i].size() * 0.95)]);
- //save 99%
- temp.push_back(quantile[i][int(quantile[i].size() * 0.99)]);
-
- }
-
- //output quan value
- out4 << i+1 << '\t';
- for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; }
- out4 << endl;
-
- quantile[i] = temp;
-
- }
-
- out4.close();
-
- mothurOut("Done."); mothurOutEndLine();
-
- }
-
- //free memory
- for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
- for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
-
- }
- catch(exception& e) {
- errorOut(e, "AlignSim", "getChimeras");
- exit(1);
- }
-}
-//***************************************************************************************************************
-vector<int> AlignSim::findWindows() {
- try {
-
- vector<int> win;
-
- if (increment > querySeqs[0]->getAligned().length()) { mothurOut("You have selected an increment larger than the length of your sequences. I will use the default of 25."); increment = 25; }
-
- for (int m = increment; m < (querySeqs[0]->getAligned().length() - increment); m+=increment) { win.push_back(m); }
-
- return win;
-
- }
- catch(exception& e) {
- errorOut(e, "AlignSim", "findWindows");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
-vector< vector<sim> > AlignSim::findIS(int start, int end, vector<Sequence*> seqs) {
- try {
-
- vector< vector<sim> > isValues;
- isValues.resize(seqs.size());
-
- //for each sequence
- for(int i = start; i < end; i++){
-
- vector<sim> temp; temp.resize(windowBreak.size());
-
- mothurOut("Finding IS value for sequence " + toString(i)); mothurOutEndLine();
-
- //for each window
- for (int m = 0; m < windowBreak.size(); m++) {
-
- vector<Sequence*> closest; //left, right, overall
- vector<int> similarity; //left+right and overall
-
- //find closest left, closest right and closest overall
- closest = findClosestSides(seqs[i], windowBreak[m], similarity, i);
-
- int totalNumBases = seqs[i]->getUnaligned().length();
-
- //IS = left+right-overall
- float is = ((similarity[0]+similarity[1]) / (float) totalNumBases) - (similarity[2] / (float) totalNumBases);
-
- ///save IS, leftparent, rightparent, breakpoint
- temp[m].leftParent = closest[0];
- temp[m].rightParent = closest[1];
- temp[m].score = is;
- temp[m].midpoint = windowBreak[m];
-//cout << is << '\t';
- }
-//cout << endl;
- isValues[i] = temp;
-
- }
-
- return isValues;
-
- }
- catch(exception& e) {
- errorOut(e, "AlignSim", "findIS");
- exit(1);
- }
-}
-//***************************************************************************************************************
-vector<Sequence*> AlignSim::findClosestSides(Sequence* seq, int breakpoint, vector<int>& sim, int i) {
- try{
-
- vector<Sequence*> closest;
-
- Sequence query, queryLeft, queryRight;
- string frag = seq->getAligned();
- string fragLeft = frag.substr(0, breakpoint);
- string fragRight = frag.substr(breakpoint, frag.length());
-
- //get pieces
- query = *(seq);
- queryLeft = *(seq);
- queryRight = *(seq);
- queryLeft.setAligned(fragLeft);
- queryRight.setAligned(fragRight);
-
- //initialize
- Sequence* overall = templateSeqs[0];
- Sequence* left = templateSeqs[0];
- Sequence* right = templateSeqs[0];
-
- float smallestOverall, smallestLeft, smallestRight;
- smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000;
-
- //go through the templateSeqs and search for the closest
- for(int j = 0; j < templateSeqs.size(); j++){
-
- //so you don't pick yourself
- if (j != i) {
- Sequence temp, tempLeft, tempRight;
- string fragTemp = templateSeqs[j]->getAligned();
- string fragTempLeft = fragTemp.substr(0, breakpoint);
- string fragTempRight = fragTemp.substr(breakpoint, fragTemp.length());
-
- //get pieces
- temp = *(templateSeqs[j]);
- tempLeft = *(templateSeqs[j]);
- tempRight = *(templateSeqs[j]);
- tempLeft.setAligned(fragTempLeft);
- tempRight.setAligned(fragTempRight);
-
- //find overall dist
- distCalc->calcDist(query, temp);
- float dist = distCalc->getDist();
-
- if (dist < smallestOverall) {
- overall = templateSeqs[j];
- smallestOverall = dist;
- }
-
- //find left dist
- distCalc->calcDist(queryLeft, tempLeft);
- dist = distCalc->getDist();
-
- if (dist < smallestLeft) {
- left = templateSeqs[j];
- smallestLeft = dist;
- }
-
- //find left dist
- distCalc->calcDist(queryRight, tempRight);
- dist = distCalc->getDist();
-
- if (dist < smallestRight) {
- right = templateSeqs[j];
- smallestRight = dist;
- }
- }
-
- }
-
- closest.push_back(left);
- closest.push_back(right);
- closest.push_back(overall);
-
- //fill sim with number of matched bases
- Sequence tempL = *(left);
- string tempLFrag = tempL.getAligned();
- tempL.setAligned(tempLFrag.substr(0, breakpoint));
-
- int bothMatches = findNumMatchedBases(queryLeft, tempL);
- sim.push_back(bothMatches);
-
- Sequence tempR = *(right);
- string tempRFrag = tempR.getAligned();
- tempR.setAligned(tempRFrag.substr(breakpoint, tempRFrag.length()));
-
- bothMatches = findNumMatchedBases(queryRight, tempR);
- sim.push_back(bothMatches);
-
- bothMatches = findNumMatchedBases(query, *(overall));
- sim.push_back(bothMatches);
-
- return closest;
-
-
- }
- catch(exception& e) {
- errorOut(e, "AlignSim", "findClosestSides");
- exit(1);
- }
-}
-//***************************************************************************************************************
-
-int AlignSim::findNumMatchedBases(Sequence seq, Sequence temp) {
- try{
-
- int num = 0;
-
- string query = seq.getAligned();
- string subject = temp.getAligned();
-//cout << seq.getName() << endl << endl;
-//cout << temp.getName() << endl << endl;
-
- for (int i = 0; i < query.length(); i++) {
- //count matches if query[i] is a base and subject is same bases or gap
- if (isalpha(query[i])) {
- if(!isalpha(subject[i])) { num++; }
- else if (query[i] == subject[i]) { num++; }
- else { }
- }
- }
-//cout << "num = " << num << endl;
- return num;
- }
- catch(exception& e) {
- errorOut(e, "AlignSim", "findNumMatchedBases");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-vector< vector<sim> > AlignSim::createProcessesIS(vector<Sequence*> seqs, vector<linePair*> linesToProcess) {
- try {
- vector< vector<sim> > localIs; localIs.resize(seqs.size());
-
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
-
- mothurOut("Finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
- localIs = findIS(linesToProcess[process]->start, linesToProcess[process]->end, seqs);
- mothurOut("Done finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
-
- //write out data to file so parent can read it
- ofstream out;
- string s = toString(getpid()) + ".temp";
- openOutputFile(s, out);
-
- //output pairs
- for (int i = linesToProcess[process]->start; i < linesToProcess[process]->end; i++) {
- out << localIs[i].size() << endl;
- for (int j = 0; j < localIs[i].size(); j++) {
- localIs[i][j].leftParent->printSequence(out);
- localIs[i][j].rightParent->printSequence(out);
- out << ">" << '\t' << localIs[i][j].score << '\t' << localIs[i][j].midpoint << endl;
- }
- }
- out.close();
-
- exit(0);
- }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
- }
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- //get data created by processes
- for (int i=0;i<processors;i++) {
- ifstream in;
- string s = toString(processIDS[i]) + ".temp";
- openInputFile(s, in);
-
- //get pairs
- for (int k = linesToProcess[i]->start; k < linesToProcess[i]->end; k++) {
- int size;
- in >> size;
- gobble(in);
-
- vector<sim> tempVector;
-
- for (int j = 0; j < size; j++) {
-
- sim temp;
-
- temp.leftParent = new Sequence(in);
- gobble(in);
- //temp.leftParent->printSequence(cout);
- temp.rightParent = new Sequence(in);
- gobble(in);
-//temp.rightParent->printSequence(cout);
- string throwaway;
- in >> throwaway >> temp.score >> temp.midpoint;
- //cout << temp.score << '\t' << temp.midpoint << endl;
- gobble(in);
-
- tempVector.push_back(temp);
- }
-
- localIs[k] = tempVector;
- }
-
- in.close();
- remove(s.c_str());
- }
-
-
-#else
- localIs = findIS(linesToProcess[0]->start, linesToProcess[0]->end, seqs);
-#endif
-
- return localIs;
- }
- catch(exception& e) {
- errorOut(e, "AlignSim", "createProcessesIS");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
+++ /dev/null
-#ifndef ALIGNSIM_H
-#define ALIGNSIM_H
-
-/*
- * alignedsimilarity.h
- * Mothur
- *
- * Created by westcott on 8/18/09.
- * Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-
-#include "chimera.h"
-#include "dist.h"
-
-//This class was created using the algorythms described in the
-//"Evaluation of Nearest Neighbor Methods for Detection of Chimeric Small-Subunit rRna Sequences" paper
-//by J.F. Robison-Cox 1, M.M. Bateson 2 and D.M. Ward 2
-
-
-/***********************************************************/
-
-class AlignSim : public Chimera {
-
- public:
- AlignSim(string, string);
- ~AlignSim();
-
- void getChimeras();
- void print(ostream&);
-
- void setCons(string){};
- void setQuantiles(string q) { quanfile = q; };
-
-
- private:
-
- Dist* distCalc;
- vector<linePair*> lines;
- vector<linePair*> templateLines;
-
- vector<Sequence*> querySeqs;
- vector<Sequence*> templateSeqs;
-
- vector< vector<sim> > IS; //IS[0] is the vector os sim values for each window for querySeqs[0]
- vector< vector<sim> > templateIS; //templateIS[0] is the vector os sim values for each window for templateSeqs[0]
- vector<int> windowBreak;
-
- string fastafile, templateFile;
- int iters;
-
- vector< vector<sim> > findIS(int, int, vector<Sequence*>);
- vector<Sequence*> findClosestSides(Sequence*, int, vector<int>&, int);
- int findNumMatchedBases(Sequence, Sequence);
- vector<int> findWindows();
-
- vector< vector<float> > quantile;
-
- vector< vector<sim> > createProcessesIS(vector<Sequence*>, vector<linePair*>);
-
-};
-
-/***********************************************************/
-
-#endif
-
exit(1);
}
}
+/**************************************************************************************************/
+void Alignment::resize(int A) {
+ try {
+ nCols = A;
+ nRows = A;
+ alignment.resize(nRows);
+ for(int i=0;i<nRows;i++){
+ alignment[i].resize(nCols);
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "Alignment", "resize");
+ exit(1);
+ }
+}
/**************************************************************************************************/
void Alignment::traceBack(){ // This traceback routine is used by the dynamic programming algorithms
int getTemplateEndPos();
int getPairwiseLength();
+ void resize(int);
+ int getnRows() { return nRows; }
// int getLongestTemplateGap();
protected:
lines.push_back(new linePair((i*linesPerProcess), numSeqs));
}
- //find breakup of templatefile for quantiles
- if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
- else {
- for (int i = 0; i < processors; i++) {
- templateLines.push_back(new linePair());
- templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
- templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
- }
- }
#else
lines.push_back(new linePair(0, numSeqs));
- templateLines.push_back(new linePair(0, templateSeqs.size()));
#endif
distCalc = new eachGapDist();
//free memory
for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
- for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
delete distCalc;
delete decalc;
vector<linePair*> lines;
- vector<linePair*> templateLines;
vector<Sequence*> querySeqs;
vector<Sequence*> templateSeqs;
vector< map<int, int> > spotMap;
//********************************************************************************************************************
struct sim {
- Sequence* leftParent;
- Sequence* rightParent;
+ string leftParent;
+ string rightParent;
float score;
int midpoint;
};
virtual void setWindow(int w) { window = w; }
virtual void setIncrement(int i) { increment = i; }
virtual void setNumWanted(int n) { numWanted = n; }
+ virtual void setKmerSize(int k) { kmerSize = k; }
virtual void setCons(string){};
virtual void setQuantiles(string){};
protected:
bool filter, correction;
- int processors, window, increment, numWanted;
+ int processors, window, increment, numWanted, kmerSize;
string seqMask, quanfile, filterString;
--- /dev/null
+/*
+ * chimeracheckrdp.cpp
+ * Mothur
+ *
+ * Created by westcott on 9/8/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "chimeracheckrdp.h"
+
+//***************************************************************************************************************
+ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp) { fastafile = filename; templateFile = temp; }
+//***************************************************************************************************************
+
+ChimeraCheckRDP::~ChimeraCheckRDP() {
+ try {
+ for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
+ delete templateDB;
+ delete kmer;
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraCheckRDP", "~AlignSim");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void ChimeraCheckRDP::print(ostream& out) {
+ try {
+
+ mothurOutEndLine();
+ /*
+ for (int i = 0; i < querySeqs.size(); i++) {
+
+ int j = 0; float largest = -10;
+ //find largest sim value
+ for (int k = 0; k < IS[i].size(); k++) {
+ //is this score larger
+ if (IS[i][k].score > largest) {
+ j = k;
+ largest = IS[i][k].score;
+ }
+ }
+
+ //find parental similarity
+ distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
+ float dist = distCalc->getDist();
+
+ //convert to similarity
+ dist = (1 - dist) * 100;
+
+ //warn about parental similarity - if its above 82% may not detect a chimera
+ if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly."); mothurOutEndLine(); }
+
+ int index = ceil(dist);
+
+ if (index == 0) { index=1; }
+
+ //is your DE value higher than the 95%
+ string chimera;
+ if (IS[i][j].score > quantile[index-1][4]) { chimera = "Yes"; }
+ else { chimera = "No"; }
+
+ out << querySeqs[i]->getName() << "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
+
+ if (chimera == "Yes") {
+ mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
+ }
+ out << "Improvement Score\t";
+
+ for (int r = 0; r < IS[i].size(); r++) { out << IS[i][r].score << '\t'; }
+ out << endl;
+ }*/
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraCheckRDP", "print");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+void ChimeraCheckRDP::getChimeras() {
+ try {
+
+ //read in query sequences and subject sequences
+ mothurOut("Reading sequences and template file... "); cout.flush();
+ querySeqs = readSeqs(fastafile);
+ //templateSeqs = readSeqs(templateFile);
+ templateDB = new KmerDB(templateFile, kmerSize);
+ mothurOut("Done."); mothurOutEndLine();
+
+ int numSeqs = querySeqs.size();
+
+ IS.resize(numSeqs);
+ closest.resize(numSeqs);
+
+ //break up file if needed
+ int linesPerProcess = numSeqs / processors ;
+
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ //find breakup of sequences for all times we will Parallelize
+ if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
+ else {
+ //fill line pairs
+ for (int i = 0; i < (processors-1); i++) {
+ lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+ }
+ //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+ int i = processors - 1;
+ lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+ }
+
+ #else
+ lines.push_back(new linePair(0, numSeqs));
+ #endif
+
+ kmer = new Kmer(kmerSize);
+
+ //find closest seq to each querySeq
+ for (int i = 0; i < querySeqs.size(); i++) {
+ closest[i] = templateDB->findClosestSequence(querySeqs[i]);
+ }
+
+ //fill seqKmerInfo for query seqs
+ for (int i = 0; i < querySeqs.size(); i++) {
+ seqKmerInfo[querySeqs[i]->getName()] = kmer->getKmerCounts(querySeqs[i]->getUnaligned());
+ }
+
+ //fill seqKmerInfo for closest
+ for (int i = 0; i < closest.size(); i++) {
+ seqKmerInfo[closest[i].getName()] = kmer->getKmerCounts(closest[i].getUnaligned());
+ }
+
+
+ //for each query find IS value - this should be paralellized,
+ //but paralellizing may cause you to have to recalculate some seqKmerInfo since the separate processes don't share memory after they split
+ for (int i = 0; i < querySeqs.size(); i++) {
+ IS[i] = findIS(i); //fills seqKmerInfo
+ }
+
+
+ //free memory
+ for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
+
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraCheckRDP", "getChimeras");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+vector<sim> ChimeraCheckRDP::findIS(int query) {
+ try {
+
+ vector<sim> isValues;
+ string queryName = querySeqs[query]->getName();
+ string seq = querySeqs[query]->getUnaligned();
+
+ mothurOut("Finding IS values for sequence " + query); mothurOutEndLine();
+
+ //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each
+ int nTotal = calcKmers(seqKmerInfo[queryName][(seqKmerInfo[queryName].size()-1)], seqKmerInfo[closest[query].getName()][(seqKmerInfo[closest[query].getName()].size()-1)]);
+
+ //you don't want the starting point to be virtually at hte end so move it in 10%
+ int start = seq.length() / 10;
+
+ //for each window
+ for (int m = start; m < (seq.length() - start); m+=increment) {
+
+ sim temp;
+
+ string fragLeft = seq.substr(0, m); //left side of breakpoint
+ string fragRight = seq.substr(m, seq.length()); //right side of breakpoint
+
+ //make a sequence of the left side and right side
+ Sequence* left = new Sequence(queryName, fragLeft);
+ Sequence* right = new Sequence(queryName, fragRight);
+
+ //find seqs closest to each fragment
+ Sequence closestLeft = templateDB->findClosestSequence(left);
+ Sequence closestRight = templateDB->findClosestSequence(right);
+
+ map<int, int>::iterator itleft;
+ map<int, int>::iterator itleftclose;
+
+ //get kmer in the closest seqs
+ //if it's not found calc kmer info and save, otherwise use already calculated data
+ //left
+ it = seqKmerInfo.find(closestLeft.getName());
+ if (it == seqKmerInfo.end()) { //you have to calc it
+ seqKmerInfo[closestLeft.getName()] = kmer->getKmerCounts(closestLeft.getUnaligned());
+ }
+
+ //right
+ it = seqKmerInfo.find(closestRight.getName());
+ if (it == seqKmerInfo.end()) { //you have to calc it
+ seqKmerInfo[closestRight.getName()] = kmer->getKmerCounts(closestRight.getUnaligned());
+ }
+
+ //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side
+ //iterate through left sides map to subtract the number of times you saw things before you got the the right side
+ map<int, int> rightside;
+ for (itleft = seqKmerInfo[queryName][m-kmerSize].begin(); itleft != seqKmerInfo[queryName][m-kmerSize].end(); itleft++) {
+ int howManyTotal = seqKmerInfo[queryName][seqKmerInfo[queryName].size()-1][itleft->first]; //times that kmer was seen in total
+
+ //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
+ int howmanyright = howManyTotal - itleft->second;
+
+ //if any were seen just on the right add that ammount to map
+ if (howmanyright > 0) {
+ rightside[itleft->first] = howmanyright;
+ }
+ }
+
+ //iterate through left side of the seq closest to the right fragment of query to subtract the number you saw before you reached the right side of the closest right
+ //this way you can get the map for just the fragment you want to compare and not hte whole sequence
+ map<int, int> rightsideclose;
+ for (itleftclose = seqKmerInfo[closestRight.getName()][m-kmerSize].begin(); itleftclose != seqKmerInfo[closestRight.getName()][m-kmerSize].end(); itleftclose++) {
+ int howManyTotal = seqKmerInfo[closestRight.getName()][seqKmerInfo[closestRight.getName()].size()-1][itleftclose->first]; //times that kmer was seen in total
+
+ //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
+ int howmanyright = howManyTotal - itleftclose->second;
+
+ //if any were seen just on the right add that ammount to map
+ if (howmanyright > 0) {
+ rightsideclose[itleftclose->first] = howmanyright;
+ }
+ }
+
+ int nLeft = calcKmers(seqKmerInfo[closestLeft.getName()][m-kmerSize], seqKmerInfo[queryName][m-kmerSize]);
+ int nRight = calcKmers(rightsideclose, rightside);
+
+ int is = nLeft + nRight - nTotal;
+
+ //save IS, leftparent, rightparent, breakpoint
+ temp.leftParent = closestLeft.getName();
+ temp.rightParent = closestRight.getName();
+ temp.score = is;
+ temp.midpoint = m;
+
+ isValues.push_back(temp);
+
+ delete left;
+ delete right;
+ }
+
+ return isValues;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraCheckRDP", "findIS");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+//find the smaller map and iterate through it and count kmers in common
+int ChimeraCheckRDP::calcKmers(map<int, int> query, map<int, int> subject) {
+ try{
+
+ int common = 0;
+ map<int, int>::iterator small;
+ map<int, int>::iterator large;
+
+ if (query.size() < subject.size()) {
+
+ for (small = query.begin(); small != query.end(); small++) {
+ large = subject.find(small->first);
+
+ //if you found it they have that kmer in common
+ if (large != subject.end()) { common++; }
+ }
+
+ }else {
+
+ for (small = subject.begin(); small != subject.end(); small++) {
+ large = query.find(small->first);
+
+ //if you found it they have that kmer in common
+ if (large != query.end()) { common++; }
+ }
+ }
+
+ return common;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ChimeraCheckRDP", "calcKmers");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+
--- /dev/null
+#ifndef CHIMERACHECK_H
+#define CHIMERACHECK_H
+
+/*
+ * chimeracheckrdp.h
+ * Mothur
+ *
+ * Created by westcott on 9/8/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "chimera.h"
+#include "kmer.hpp"
+#include "kmerdb.hpp"
+#include "database.hpp"
+
+//This class was created using the algorythms described in
+//CHIMERA_CHECK version 2.7 written by Niels Larsen.
+
+/***********************************************************/
+
+class ChimeraCheckRDP : public Chimera {
+
+ public:
+ ChimeraCheckRDP(string, string);
+ ~ChimeraCheckRDP();
+
+ void getChimeras();
+ void print(ostream&);
+
+ void setCons(string){};
+ void setQuantiles(string q) {};
+
+
+ private:
+
+ vector<linePair*> lines;
+ vector<Sequence*> querySeqs;
+ Database* templateDB;
+ Kmer* kmer;
+
+ vector< vector<sim> > IS; //IS[0] is the vector of IS values for each window for querySeqs[0]
+
+ //map of vector of maps- I know its a little convaluted but I am trying to save time
+ //I think that since the window is only sliding 10 bases there is a good probability that the closest seq to each fragment
+ //will be the same for several windows so I want to save the vector of maps containing its kmer info rather than regenerating it.
+ //So...
+ map<string, vector< map<int, int> > > seqKmerInfo; // outer map - sequence name -> kmer info
+ // kmer info: inner vector of maps - each entry in the vector is a map of the kmers up to that spot in the unaligned seq
+ //example: seqKmerInfo["ecoli"][50] = map containing the kmers found in the first 50 + kmersize characters of ecoli.
+ //i chose to store the kmers numbers in a map so you wouldn't have to check for dupilcate entries and could easily find the
+ //kmers 2 seqs had in common. There may be a better way to do this thats why I am leaving so many comments...
+ map<string, vector< map<int, int> > >:: iterator it;
+ map<int, int>::iterator it2;
+
+ vector<Sequence> closest; //closest[0] is the closest overall seq to querySeqs[0].
+
+ string fastafile, templateFile;
+
+
+ vector<sim> findIS(int);
+ int calcKmers(map<int, int>, map<int, int>);
+ vector< vector<sim> > createProcessesIS(vector<Sequence*>, vector<linePair*>);
+
+};
+
+/***********************************************************/
+
+#endif
+
#include "chimeraseqscommand.h"
#include "bellerophon.h"
#include "pintail.h"
-#include "alignedsimilarity.h"
#include "ccode.h"
+#include "chimeracheckrdp.h"
//***************************************************************************************************************
else {
//valid paramters for this command
- string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted" };
+ string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize" };
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
if (ableToOpen == 1) { abort = true; }
in.close();
}
-
+
+ method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; }
+
string temp;
temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
filter = isTrue(temp);
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
convert(temp, processors);
+ temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found") { temp = "7"; }
+ convert(temp, ksize);
+
temp = validParameter.validFile(parameters, "window", false); if (temp == "not found") { temp = "0"; }
convert(temp, window);
- temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; }
+ temp = validParameter.validFile(parameters, "increment", false);
+ if ((temp == "not found") && (method == "chimeracheck")) { temp = "10"; }
+ else if (temp == "not found") { temp = "25"; }
convert(temp, increment);
- temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "20"; }
+ temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "20"; }
convert(temp, numwanted);
- method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; }
+
if (((method == "pintail") || (method == "alignsim")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail and alignsim methods."); mothurOutEndLine(); abort = true; }
if (abort == true) { return 0; }
- if (method == "bellerophon") { chimera = new Bellerophon(fastafile); }
- else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); }
- //else if (method == "alignsim") { chimera = new AlignSim(fastafile, templatefile); }
- else if (method == "ccode") { chimera = new Ccode(fastafile, templatefile); }
+ if (method == "bellerophon") { chimera = new Bellerophon(fastafile); }
+ else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); }
+ else if (method == "ccode") { chimera = new Ccode(fastafile, templatefile); }
+ else if (method == "chimeracheck") { chimera = new ChimeraCheckRDP(fastafile, templatefile); }
else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; }
//set user options
chimera->setWindow(window);
chimera->setIncrement(increment);
chimera->setNumWanted(numwanted);
+ chimera->setKmerSize(ksize);
//find chimeras
chimera->getChimeras();
bool abort;
string method, fastafile, templatefile, consfile, quanfile, maskfile;
bool filter, correction;
- int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted;
+ int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize;
Chimera* chimera;
/**************************************************************************************************/
+
return kmerString;
}
+/**************************************************************************************************/
+
+vector< map<int, int> > Kmer::getKmerCounts(string sequence){ // Calculate kmer for each position in the sequence, save info in a map
+ int length = sequence.length(); // so you know at each spot in the sequence what kmers were found
+ int nKmers = length - kmerSize + 1; //
+ vector< map<int, int> > counts; counts.resize(nKmers); // a map kmer counts for each spot
+ map<int, int>::iterator it;
+
+ for(int i=0;i<nKmers;i++){ // Go though sequence and get the number between 0 and maxKmer for that
+ if (i == 0) {
+ int kmerNumber = getKmerNumber(sequence, i);// kmer.
+ counts[i][kmerNumber] = 1; // add this kmer if not already there
+ }else {
+ //your count is everything that came before and whatever you find now
+ counts[i] = counts[i-1];
+ int kmerNumber = getKmerNumber(sequence, i);// kmer.
+
+ it = counts[i].find(kmerNumber);
+ if (it!= counts[i].end()) { counts[i][kmerNumber]++; } //increment number of times you have seen this kmer
+ else { counts[i][kmerNumber] = 1; } // add this kmer since not already there
+ }
+ }
+
+ return counts;
+}
+
/**************************************************************************************************/
string getKmerString(string);
int getKmerNumber(string, int);
string getKmerBases(int);
-
+ vector< map<int, int> > getKmerCounts(string sequence); //for use in chimeraCheck
private:
char getASCII(int);
*/
#include "mothur.h"
+#include "database.hpp"
class KmerDB : public Database {
try {
int longAlignmentLength = newTemplateAlign.length();
-
+
for(int i=0; i<longAlignmentLength; i++){ // use the long alignment as the standard
int rightIndex, rightRoom, leftIndex, leftRoom;
int insertLength = 0; // figure out how long the anomaly is
while(!isalpha(newTemplateAlign[i + insertLength])) { insertLength++; }
if(insertLength > maxInsertLength){ maxInsertLength = insertLength; }
-
+
if((leftRoom + rightRoom) >= insertLength){
// Parts D & E from Fig. 2 of DeSantis et al.
}
else{ // not enough room to the right, have to steal some
+
// space to the left lets move left and then right...
string leftTemplateString = newTemplateAlign.substr(0,i);
string rightTemplateString = newTemplateAlign.substr(i+insertLength);
}
}
else{
- // there could be a case where there isn't enough room in
+ // there could be a case where there isn't enough room in
string leftTemplateString = newTemplateAlign.substr(0,i); // either direction to move stuff
string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom);
newTemplateAlign = leftTemplateString + rightTemplateString;
string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
string rightCandidateString = candAln.substr(rightIndex+rightRoom);
candAln = leftCandidateString + insertString + rightCandidateString;
-
+
}
-
+
i -= insertLength;
}
}
candAln[i] = toupper(candAln[i]); // everything is upper case
}
-
+
if(candAln.length() != tempAln.length()){ // if the regapped candidate sequence is longer than the official
removeExtraGaps(candAln, tempAln, newTemplateAlign);// template alignment then we need to do steps C-F in Fig.
} // 2 of Desantis et al.
-
-
+
candidateSeq->setAligned(candAln);
}
catch(exception& e) {
errorOut(e, "NeedlemanOverlap", "NeedlemanOverlap");
exit(1);
}
-
-
}
/**************************************************************************************************/
seqA = ' ' + A; lA = seqA.length(); // algorithm requires a dummy space at the beginning of each string
seqB = ' ' + B; lB = seqB.length(); // algorithm requires a dummy space at the beginning of each string
- if (lA > nRows) { mothurOut("Your one of your candidate sequences is longer than you longest template sequence."); mothurOutEndLine(); }
+ if (lA > nRows) { mothurOut("One of your candidate sequences is longer than you longest template sequence. Your longest template sequence is " + toString(nRows) + ". Your candidate is " + toString(lA) + "."); mothurOutEndLine(); }
for(int i=1;i<lB;i++){ // This code was largely translated from Perl code provided in Ex 3.1
for(int j=1;j<lA;j++){ // of the O'Reilly BLAST book. I found that the example output had a
}
}
}
+
Overlap over;
over.setOverlap(alignment, lA, lB, 0); // Fix gaps at the beginning and end of the sequences
traceBack(); // Traceback the alignment to populate seqAaln and seqBaln
-
+
}
catch(exception& e) {
errorOut(e, "NeedlemanOverlap", "align");
NeedlemanOverlap(float, float, float, int);
~NeedlemanOverlap();
void align(string, string);
+
private:
float gap;
else { temp += 'N'; }
}
unaligned = temp;
+ aligned = temp;
}
if(qFileName != "") { openInputFile(qFileName, qFile); }
bool success;
-
+
while(!inFASTA.eof()){
Sequence currSeq(inFASTA);
string origSeq = currSeq.getUnaligned();
if(!success) { trashCode += 'q'; }
}
if(barcodes.size() != 0){
+
success = stripBarcode(currSeq, group);
if(!success){ trashCode += 'b'; }
}
}
if(minLength > 0 || maxLength > 0){
success = cullLength(currSeq);
- if(!success){ trashCode += 'l'; }
+ if ((currSeq.getUnaligned().length() > 300) && (success)) { cout << "too long " << currSeq.getUnaligned().length() << endl; }
+ if(!success){ trashCode += 'l'; }
}
if(maxHomoP > 0){
success = cullHomoP(currSeq);
if(flip){ currSeq.reverseComplement(); } // should go last
if(trashCode.length() == 0){
+ currSeq.setAligned(currSeq.getUnaligned()); //this is because of a modification we made to the sequence class to fix a bug. all seqs have an aligned version, which is the version that gets printed.
currSeq.printSequence(outFASTA);
if(barcodes.size() != 0){
outGroups << currSeq.getName() << '\t' << groupVector[group] << endl;