#include "ccode.h"
#include "ignoregaps.h"
+#include "eachgapdist.h"
//***************************************************************************************************************
templateLines.push_back(new linePair(0, templateSeqs.size()));
#endif
- distCalc = new ignoreGaps();
+ distCalc = new eachGapDist();
+ decalc = new DeCalculator();
//find closest
if (processors == 1) {
}else { createProcessesClosest(); }
- for (int i = 0; i < closest.size(); i++) {
- cout << querySeqs[i]->getName() << ": ";
- for (int j = 0; j < closest[i].size(); j++) {
+for (int i = 0; i < closest.size(); i++) {
+ cout << querySeqs[i]->getName() << ": ";
+ for (int j = 0; j < closest[i].size(); j++) {
- cout << closest[i][j]->getName() << '\t';
+ cout << closest[i][j]->getName() << '\t';
+ }
+ cout << endl;
+}
+
+ //mask sequences if the user wants to
+ if (seqMask != "") {
+ //mask querys
+ for (int i = 0; i < querySeqs.size(); i++) {
+ decalc->runMask(querySeqs[i]);
+ }
+
+ //mask templates
+ for (int i = 0; i < templateSeqs.size(); i++) {
+ decalc->runMask(templateSeqs[i]);
}
- cout << endl;
}
+
+ if (filter) {
+ vector<Sequence*> temp = templateSeqs;
+ for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); }
+
+ createFilter(temp);
+ runFilter(querySeqs);
+ runFilter(templateSeqs);
+ }
+
+ //trim sequences - this follows ccodes remove_extra_gaps
+ //just need to pass it query and template since closest points to template
+ trimSequences();
+
+ //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().
+ //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
+ windows = findWindows();
+
+ //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later
+ for (int i = 0; i < closest.size(); i++) {
+ removeBadReferenceSeqs(closest[i], i);
+ }
+
+
//free memory
for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
exit(1);
}
}
-/***************************************************************************************************************
+/***************************************************************************************************************/
+//ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
+void Ccode::trimSequences() {
+ try {
+
+ int frontPos = 0; //should contain first position in all seqs that is not a gap character
+ int rearPos = querySeqs[0]->getAligned().length();
+
+ //********find first position in all seqs that is a non gap character***********//
+ //find first position all query seqs that is a non gap character
+ for (int i = 0; i < querySeqs.size(); i++) {
+
+ string aligned = querySeqs[i]->getAligned();
+ int pos = 0;
+
+ //find first spot in this seq
+ for (int j = 0; j < aligned.length(); j++) {
+ if (isalpha(aligned[j])) {
+ pos = j;
+ break;
+ }
+ }
+
+ //save this spot if it is the farthest
+ if (pos > frontPos) { frontPos = pos; }
+ }
+
+ //find first position all template seqs that is a non gap character
+ for (int i = 0; i < templateSeqs.size(); i++) {
+
+ string aligned = templateSeqs[i]->getAligned();
+ int pos = 0;
+
+ //find first spot in this seq
+ for (int j = 0; j < aligned.length(); j++) {
+ if (isalpha(aligned[j])) {
+ pos = j;
+ break;
+ }
+ }
+
+ //save this spot if it is the farthest
+ if (pos > frontPos) { frontPos = pos; }
+ }
+
+
+ //********find last position in all seqs that is a non gap character***********//
+ //find last position all query seqs that is a non gap character
+ for (int i = 0; i < querySeqs.size(); i++) {
+
+ string aligned = querySeqs[i]->getAligned();
+ int pos = aligned.length();
+
+ //find first spot in this seq
+ for (int j = aligned.length()-1; j >= 0; j--) {
+ if (isalpha(aligned[j])) {
+ pos = j;
+ break;
+ }
+ }
+
+ //save this spot if it is the farthest
+ if (pos < rearPos) { rearPos = pos; }
+ }
+
+ //find last position all template seqs that is a non gap character
+ for (int i = 0; i < templateSeqs.size(); i++) {
+
+ string aligned = templateSeqs[i]->getAligned();
+ int pos = aligned.length();
+
+ //find first spot in this seq
+ for (int j = aligned.length()-1; j >= 0; j--) {
+ if (isalpha(aligned[j])) {
+ pos = j;
+ break;
+ }
+ }
+
+ //save this spot if it is the farthest
+ if (pos < rearPos) { rearPos = pos; }
+ }
+
+
+ //check to make sure that is not whole seq
+ if ((rearPos - frontPos - 1) <= 0) { mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1); }
+
+ //***********trim all seqs to that position***************//
+ for (int i = 0; i < querySeqs.size(); i++) {
+
+ string aligned = querySeqs[i]->getAligned();
+
+ //between the two points
+ aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
+
+ querySeqs[i]->setAligned(aligned);
+ }
+
+ for (int i = 0; i < templateSeqs.size(); i++) {
+
+ string aligned = templateSeqs[i]->getAligned();
+
+ //between the two points
+ aligned = aligned.substr(frontPos, (rearPos-frontPos-1));
+
+ templateSeqs[i]->setAligned(aligned);
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Ccode", "trimSequences");
+ exit(1);
+ }
+
+}
+/***************************************************************************************************************/
vector<int> Ccode::findWindows() {
try {
vector<int> win;
+ int length = querySeqs[0]->getAligned().length();
- if (increment > querySeqs[0]->getAligned().length()) { mothurOut("You have selected an increment larger than the length of your sequences. I will use the default of 25."); increment = 25; }
+ //default is wanted = 10% of total length
+ if (window > length) {
+ mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
+ window = length / 10;
+ }else if (window == 0) { window = length / 10; }
+ else if (window > (length / 20)) {
+ mothurOut("You have selected a window that is larger than 20% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine();
+ }else if (window < (length / 5)) {
+ mothurOut("You have selected a window that is smaller than 5% of your sequence length. This is not recommended, but I will continue anyway."); mothurOutEndLine();
+ }
- for (int m = increment; m < (querySeqs[0]->getAligned().length() - increment); m+=increment) { win.push_back(m); }
+ //save starting points of each window
+ for (int m = 0; m < (length-window); m+=window) { win.push_back(m); }
return win;
exit(1);
}
}
-*/
+//***************************************************************************************************************
+int Ccode::getDiff(string seqA, string seqB) {
+ try {
+
+ int numDiff = 0;
+
+ for (int i = 0; i < seqA.length(); i++) {
+ //if you are both not gaps
+ //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
+ //are you different
+ if (seqA[i] != seqB[i]) { numDiff++; }
+ //}
+ }
+
+ return numDiff;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Ccode", "getDiff");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+//tried to make this look most like ccode original implementation
+void Ccode::removeBadReferenceSeqs(vector<Sequence*>& seqs, int query) {
+ try {
+
+ vector< vector<int> > numDiffBases;
+ numDiffBases.resize(seqs.size());
+ //initialize to 0
+ for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
+
+ int length = seqs[0]->getAligned().length();
+
+ //calc differences from each sequence to everyother seq in the set
+ for (int i = 0; i < seqs.size(); i++) {
+
+ string seqA = seqs[i]->getAligned();
+
+ //so you don't calc i to j and j to i since they are the same
+ for (int j = 0; j < i; j++) {
+
+ string seqB = seqs[j]->getAligned();
+
+ //compare strings
+ int numDiff = getDiff(seqA, seqB);
+
+ numDiffBases[i][j] = numDiff;
+ numDiffBases[j][i] = numDiff;
+ }
+ }
+
+ //initailize remove to 0
+ vector<int> remove; remove.resize(seqs.size(), 0);
+
+ //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
+ for (int i = 0; i < numDiffBases.size(); i++) {
+ for (int j = 0; j < numDiffBases[i].size(); j++) {
+
+ //are you more than 20% different
+ if (numDiffBases[i][j] > ((20*length) / 100)) { remove[j] = 1; }
+ //are you less than 0.5% different
+ if (numDiffBases[i][j] < ((0.5*length) / 100)) { remove[j] = 1; }
+ }
+ }
+
+ int numSeqsLeft = 0;
+
+ //count seqs that are not going to be removed
+ for (int i = 0; i < remove.size(); i++) {
+ if (remove[i] == 0) { numSeqsLeft++; }
+ }
+
+ //if you have enough then remove bad ones
+ if (numSeqsLeft >= 3) {
+ vector<Sequence*> goodSeqs;
+ //remove bad seqs
+ for (int i = 0; i < remove.size(); i++) {
+ if (remove[i] == 0) {
+ goodSeqs.push_back(seqs[i]);
+ }
+ }
+
+ seqs = goodSeqs;
+
+ }else { //warn, but dont remove any
+ mothurOut(querySeqs[query]->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check."); mothurOutEndLine();
+ }
+
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Ccode", "removeBadReferenceSeqs");
+ exit(1);
+ }
+}
//***************************************************************************************************************
vector< vector<Sequence*> > Ccode::findClosest(int start, int end, int numWanted) {
try{
}
}
/**************************************************************************************************/
+vector<float> Ccode::getAverageRef(vector<Sequence*> ref) {
+ try {
+ }
+ catch(exception& e) {
+ errorOut(e, "Ccode", "getAverageRef");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+vector<float> Ccode::getAverageQuery (vector<Sequence*> ref, int query) {
+ try {
+
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Ccode", "getAverageQuery");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
void Ccode::createProcessesClosest() {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
//output pairs
for (int i = lines[process]->start; i < lines[process]->end; i++) {
- out << closest[i].size() << endl;
for (int j = 0; j < closest[i].size(); j++) {
closest[i][j]->printSequence(out);
}
//get pairs
for (int k = lines[i]->start; k < lines[i]->end; k++) {
- int size;
- in >> size;
- gobble(in);
vector<Sequence*> tempVector;
- for (int j = 0; j < size; j++) {
+ for (int j = 0; j < numWanted; j++) {
Sequence* temp = new Sequence(in);
gobble(in);