X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=clusterfragmentscommand.cpp;h=a8277ffb029fdd6e5f3e110614836e37292ce9a3;hb=0bcfddf7bc721a334bdae42d86a580019303537d;hp=b5681db57833ae10e9bf665b53e77bea363473c2;hpb=8bc3e5b38c2317a1715f53be22fa96455868c281;p=mothur.git diff --git a/clusterfragmentscommand.cpp b/clusterfragmentscommand.cpp index b5681db..a8277ff 100644 --- a/clusterfragmentscommand.cpp +++ b/clusterfragmentscommand.cpp @@ -8,6 +8,7 @@ */ #include "clusterfragmentscommand.h" +#include "needlemanoverlap.hpp" //********************************************************************************************************************** //sort by unaligned @@ -27,7 +28,7 @@ inline bool comparePriority(seqRNode first, seqRNode second) { //********************************************************************************************************************** vector ClusterFragmentsCommand::getValidParameters(){ try { - string AlignArray[] = {"fasta","name","outputdir","inputdir"}; + string AlignArray[] = {"fasta","name","diffs","percent","outputdir","inputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); return myArray; } @@ -39,7 +40,7 @@ vector ClusterFragmentsCommand::getValidParameters(){ //********************************************************************************************************************** ClusterFragmentsCommand::ClusterFragmentsCommand(){ try { - //initialize outputTypes + abort = true; calledHelp = true; vector tempOutNames; outputTypes["fasta"] = tempOutNames; outputTypes["name"] = tempOutNames; @@ -75,14 +76,14 @@ vector ClusterFragmentsCommand::getRequiredFiles(){ //********************************************************************************************************************** ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { try { - abort = false; + abort = false; calledHelp = false; //allow user to run help - if(option == "help") { help(); abort = true; } + if(option == "help") { help(); abort = true; calledHelp = true; } else { //valid paramters for this command - string Array[] = {"fasta","name","outputdir","inputdir"}; + string Array[] = {"fasta","name","diffs","percent","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -137,6 +138,14 @@ ClusterFragmentsCommand::ClusterFragmentsCommand(string option) { if (namefile == "not found") { namefile = ""; } else if (namefile == "not open") { abort = true; } else { readNameFile(); } + + string temp; + temp = validParameter.validFile(parameters, "diffs", false); if (temp == "not found"){ temp = "0"; } + convert(temp, diffs); + + temp = validParameter.validFile(parameters, "percent", false); if (temp == "not found"){ temp = "0"; } + convert(temp, percent); + } } @@ -153,8 +162,11 @@ void ClusterFragmentsCommand::help(){ try { m->mothurOut("The cluster.fragments command groups sequences that are part of a larger sequence.\n"); m->mothurOut("The cluster.fragments command outputs a new fasta and name file.\n"); - m->mothurOut("The cluster.fragments command parameters are fasta and name. The fasta parameter is required. \n"); + m->mothurOut("The cluster.fragments command parameters are fasta, name, diffs and percent. The fasta parameter is required. \n"); m->mothurOut("The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n"); + m->mothurOut("The diffs parameter allows you to set the number of differences allowed, default=0. \n"); + m->mothurOut("The percent parameter allows you to set percentage of differences allowed, default=0. percent=2 means if the number of difference is less than or equal to two percent of the length of the fragment, then cluster.\n"); + m->mothurOut("You may use diffs and percent at the same time to say something like: If the number or differences is greater than 1 or more than 2% of the fragment length, don't merge. \n"); m->mothurOut("The cluster.fragments command should be in the following format: \n"); m->mothurOut("cluster.fragments(fasta=yourFastaFile, names=yourNamesFile) \n"); m->mothurOut("Example cluster.fragments(fasta=amazon.fasta).\n"); @@ -169,7 +181,7 @@ void ClusterFragmentsCommand::help(){ int ClusterFragmentsCommand::execute(){ try { - if (abort == true) { return 0; } + if (abort == true) { if (calledHelp) { return 0; } return 2; } int start = time(NULL); @@ -200,10 +212,8 @@ int ClusterFragmentsCommand::execute(){ if (alignSeqs[j].active) { //this sequence has not been merged yet string jBases = alignSeqs[j].seq.getUnaligned(); - - int pos = iBases.find(jBases); - - if (pos != string::npos) { + + if (isFragment(iBases, jBases)) { //merge alignSeqs[i].names += ',' + alignSeqs[j].names; alignSeqs[i].numIdentical += alignSeqs[j].numIdentical; @@ -257,7 +267,63 @@ int ClusterFragmentsCommand::execute(){ exit(1); } } - +//*************************************************************************************************************** +bool ClusterFragmentsCommand::isFragment(string seq1, string seq2){ + try { + bool fragment = false; + + //exact match + int pos = seq1.find(seq2); + if (pos != string::npos) { return true; } + //no match, no diffs wanted + else if ((diffs == 0) && (percent == 0)) { return false; } + else { //try aligning and see if you can find it + + //find number of acceptable differences for this sequence fragment + int totalDiffs; + if (diffs == 0) { //you didnt set diffs you want a percentage + totalDiffs = floor((seq2.length() * (percent / 100.0))); + }else if (percent == 0) { //you didn't set percent you want diffs + totalDiffs = diffs; + }else if ((percent != 0) && (diffs != 0)) { //you want both, set total diffs to smaller of 2 + totalDiffs = diffs; + int percentDiff = floor((seq2.length() * (percent / 100.0))); + if (percentDiff < totalDiffs) { totalDiffs = percentDiff; } + } + + Alignment* alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (seq1.length()+totalDiffs+1)); + + //use needleman to align + alignment->align(seq2, seq1); + string tempSeq2 = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + delete alignment; + + //chop gap ends + int startPos = 0; + int endPos = tempSeq2.length()-1; + for (int i = 0; i < tempSeq2.length(); i++) { if (isalpha(tempSeq2[i])) { startPos = i; break; } } + for (int i = tempSeq2.length()-1; i >= 0; i--) { if (isalpha(tempSeq2[i])) { endPos = i; break; } } + + //count number of diffs + int numDiffs = 0; + for (int i = startPos; i <= endPos; i++) { + if (tempSeq2[i] != temp[i]) { numDiffs++; } + } + + if (numDiffs <= totalDiffs) { fragment = true; } + + } + + return fragment; + + } + catch(exception& e) { + m->errorOut(e, "ClusterFragmentsCommand", "isFragment"); + exit(1); + } +} /**************************************************************************************************/ int ClusterFragmentsCommand::readFASTA(){ try {