]> git.donarmstrong.com Git - mothur.git/commitdiff
working on chimeras
authorwestcott <westcott>
Thu, 30 Jul 2009 19:08:54 +0000 (19:08 +0000)
committerwestcott <westcott>
Thu, 30 Jul 2009 19:08:54 +0000 (19:08 +0000)
chimera.h
chimeraseqscommand.cpp
decalc.cpp
decalc.h
getseqscommand.cpp
pintail.cpp

index 106d24c5523310fa602e7d9b68dd753699a4bee7..9142bacaa09a016d1cabd25560f1e428a699a15c 100644 (file)
--- a/chimera.h
+++ b/chimera.h
@@ -65,7 +65,7 @@ class Chimera {
                virtual void setMask(string filename) {
                        try {
                                
-                               if (filename == "") {
+                               if (filename == "default") {
                                        //default is from wigeon  236627 EU009184.1 Shigella dysenteriae str. FBD013
                                        seqMask = ".....................................................................................................AAATTGAAGAGTTT-GA--T-CA-T-G-GCTC-AG-AT-TGAA-C-GC--TGG-C--G-GC-A-GG--C----C-T--AACACA-T-GC-A-AGT-CGA-A-CG----------G-TAA-CA-G----------------------------GAAG-A-AG----------------------------------------------------CTT-G----------------------------------------------------------------------------------CT-TCTTT----------------G-CT--G--AC--G--AG-T-GG-C-GG-A--C-------------GGG-TGAGT-A--AT-GT-C-T-G-GG---A-A--A-CT-G--C-C-TGA--TG-G------------------------------------------------------------------A-GG----GGG-AT-AA-CTA-------------------------C-T-G-----------------------GAA-A---CGG-TAG-CTAA-TA---CC-G--C-AT-A----------A--------------------C-------------------------------------GT-C-----------------------------------------------------------------------------------------------------------------------G-CA-A--------------------------------------------------------------------------------------------------------------------------------------G-A-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CAAA--G-A-G-GG-----G--GA-C-CT--------------------------------------------------------------------------------------------------------------------TCG-G----------------------------------------------------------------------------------------------------------------------G----CC-TC--T---T-G--------------C----C-A---T-CG-G---AT---G-T-----G-CCC-AGA--T-GGG--A------TT--A--G-CT-A----G---TAGG-T-G-GG-G-T----AAC-GG-C-T-C-ACCT--A-GG-C-G--A-CG-A------------TCC-C-T------AG-CT-G-G-TCT-G-AG----A--GG-AT--G-AC-C-AG-CCAC-A-CTGGA--A-C-TG-A-GA-C-AC-G-G-TCCAGA-CTCC-TAC-G--G-G-A-G-GC-A-GC-A-G-TG---GG-G-A-ATA-TTGCA-C-AA-T-GG--GC-GC-A----A-G-CC-T-GA-TG-CA-GCCA-TGCC-G-CG-T---G-T-A--T--GA-A-G--A--A-G-G-CC-----TT-CG---------G-G-T-T-G-T--A---AA-G-TAC--------TT-TC-A-G--C-GGG----GA-G--G---AA-GGGA---GTAA-AG----T--T--AA-T---A----C-----CT-T-TGC-TCA-TT-GA-CG-TT-A-C-CC-G-CA-G---------AA-----------GAAGC-ACC-GG-C-TAA---C--T-CCGT--GCCA--G-C---A--GCCG---C-GG--TA-AT--AC---GG-AG-GGT-GCA-A-G-CG-TTAA-T-CGG-AA-TT-A--C-T--GGGC-GTA----AA-GCGC-AC--G-CA-G-G-C-G------------G--T-TT-G-T-T-AA----G-T-C-A---G-ATG-TG-A-AA-TC--CC-CGG-G--------------------------------------------------------------------CT-C-AA-------------------------------------------------------------------------CC-T-G-GG-AA-C----T-G-C-A-T-C--------T--GA-T-A-C-T-G-GCA--A-G-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------T-T-G-A-G-T-C-----T-CG--TA-G-A------------G-GG-G-GG-T----AG--AATT-CCA-G-GT--GT-A-GCG-GTGAAA-TG-CGT-AGAG-A-TC-T-GGA--GG-A-AT-A-CC-GG--T--G--GC-GAA-G--G-C---G----G--C-C-CCCTG------G-AC-GA--------------------------------------------------------------AG-A-C-T--GA--CG-----CT-CA-GG--T-G-CGA--AA-G-C--------------G-TGGG-GAG-C-A-AACA--GG-ATTA-G-ATA-C-----CC-T-G-GTA-G-T----C-CA--C-G-CCG-T-AAA--C-GATG-TC--GA-CT---------T-GG--A--G-G-TT-G-TG-C--C--------------------------------------------------------------------------------------CTT-GA--------------------------------------------------------------------------------------------------------------------------------------------------G-G-C-GT--G-G-C-T-TC-C------GG--A----GC-TAA--CG-C-G-T--T--AA-GT--C----G-ACC-GCC-T-G-GG-GAG-TA---CGG-----C-C--G-C-A-A-GGT-T--AAA-ACTC-AAA---------TGAA-TTG-ACGGG-G-G-CCCG----C-A--C-A-A-GCG-GT-G--G--AG-CA-T--GT-GGT-TT-AATT-C-G-ATG-CAAC-G-CG-A-AG-A-A-CC-TT-A-CC-TGGTC-TT-G-AC-A-T-C--------------CAC-G-G-------------A-AG-T-T-T--TC--A-GA-G-A-T--G-A-G--A-A-T-G--T-G-----CC-------------------------------------T--TC-G------------------------------------------GG----A----A---CC-GTG---A--GA---------------------------------------------------C-A-G-G-T-GCTG-CA-TGG-CT--GTC-GTC-A-GC-TC---G-TG-TT-G--TGA-AA-TGT-T-GG-G-TT-AA-GT-CCCGC-AA--------C-GAG-CGC-A-ACC-C-T-TA--TC--C-TTTG--T-T-G-C-C---AG-C-G-----G-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TCC------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GG---C----C-G------------G----G---A-A--CT---------------C-A-A-A-G-GA-G--AC-T-G-CCA--G-T------------------------------------G-A---TAA----------------------------------A-C-T-G--G-A-GG-A--AGG-T--GGGG-A-TGAC-GTC--AAGT-C---ATC-A-T-G-G-C-C-CTT----AC-G--AC-C-A-GG-GC-TA-CAC-ACGTG-C--TA--CAATG---G-CGCA-T-A--C-AAA-GA-GA--------------------------------------------------------------------------------------------------A-G-C-G-A--C-CTCG-C--G---------------------------------------A-GA-G-C-----------A--A-G-CG---G----------A--CCT-C------A-T-AAAGT-GC-G-T-C-G-TAG-TCC--------GGA-T-TGGAG-TC--T-GCAA-CT-C-------------------------------------------------------------------------------------------------G-ACTCC-A-T-G-AA-G-TC-GGAAT-CG-C-TA--G-TA-AT-C-G-T----GGA-TC-A-G--A------AT--GCC-AC-G-GT-G-AAT-ACGT-T-CCCGGGCCT-TGTA----CACACCG-CCC-GTC-----A---CA--CCA-TG-GG-A--G---TGG-G-TT-GC-AAA--A-GAA------G--T-AGG-TA-G-C-T-T-AA-C-C--------------------------------------------------------------TT----C-------------------------------------------------------------------------------------------------G--GG-A--GG-G--C---GC-TTA--CC--ACT-T----T-GTG-AT-TCA------------------------TG--ACT-GGGG-TG-AAG-TCGTAACAA-GGTAA-CCGT-AGGGGAA-CCTG-CGGT-TGGATCACCTCCTTA................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................";
                                }else{
index ba980a50e46405dfc3087f5ba7c02d0b25cab1ff..17f3a71a5033f1d28835bbd562af5d5f9a6b5a18 100644 (file)
@@ -52,9 +52,15 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                        if (quanfile == "not open") { abort = true; }
                        else if (quanfile == "not found") { quanfile = "";  }
                                
-                       maskfile = validParameter.validFile(parameters, "mask", true);
-                       if (maskfile == "not open") { abort = true; }
-                       else if (maskfile == "not found") { maskfile = "";  }   
+                       maskfile = validParameter.validFile(parameters, "mask", false);
+                       if (maskfile == "not found") { maskfile = "";  }        
+                       else if (maskfile != "default")  { 
+                               ifstream in;
+                               int     ableToOpen = openInputFile(maskfile, in);
+                               if (ableToOpen == 1) { abort = true; }
+                               in.close();
+                       }
+               
 
                        
 
@@ -129,7 +135,7 @@ int ChimeraSeqsCommand::execute(){
                        if (quanfile != "")                     {               chimera->setQuantiles(quanfile);                                }
                        else                                            {               chimera->setQuantiles("");                                              }
                        
-                       if (maskfile == "") { mothurOut("You have not provided a mask, so I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); mothurOutEndLine();  }
+                       if (maskfile == "default") { mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); mothurOutEndLine();  }
                        chimera->setMask(maskfile);
                                                
                }else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0;         }
index 203482f0819ca5658656d582a4412ed4e0b48632..18377c2d77b61d4c92d3628c7a5e91adbd15257e 100644 (file)
@@ -14,13 +14,16 @@ void DeCalculator::setMask(string m) {
        try {
                seqMask = m; 
                
-               //whereever there is a base in the mask, save that value is query and subject
-               for (int i = 0; i < seqMask.length(); i++) {
-                       if (isalpha(seqMask[i])) {
-                               h.insert(i);
+               if (seqMask.length() != 0) {
+                       //whereever there is a base in the mask, save that value is query and subject
+                       for (int i = 0; i < seqMask.length(); i++) {
+                               if (isalpha(seqMask[i])) {
+                                       h.insert(i);
+                               }
                        }
+               }else {
+                       for (int i = 0; i < alignLength; i++) {   h.insert(i);  }
                }
-
        }
        catch(exception& e) {
                errorOut(e, "DeCalculator", "setMask");
@@ -94,7 +97,7 @@ vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int
                                size = (cutoff / 4); 
                }
        
-               string seq = query->getAligned().substr(front, cutoff);
+       /*      string seq = query->getAligned().substr(front, cutoff);
                        
                //count bases
                int numBases = 0;
@@ -129,9 +132,15 @@ vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int
        
 
                //get last window if needed
-               if (totalBases < numBases) {   win.push_back(back-size);  cout << back-size << endl;}
+               if (totalBases < numBases) {   win.push_back(back-size);  }
 //cout << endl << endl;
+*/     
                
+               //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
+               for (int m = front; m < (back - size) ; m+=increment) {  win.push_back(m);  }
+
+
+       
                return win;
        
        }
@@ -154,7 +163,7 @@ vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vec
        //cout << "start point = " << window[m] << " end point = " << window[m]+size << endl;                                           
                        int diff = 0;
                        for (int b = 0; b < seqFrag.length(); b++) {
-               
+
                                if (seqFrag[b] != seqFragsub[b]) { diff++; }
                        }
                
@@ -178,6 +187,7 @@ float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int
                
                //so you only look at the trimmed part of the sequence
                int cutoff = back - front;  
+               int gaps = 0;
                        
                //from first startpoint with length back-front
                string seqFrag = query->getAligned().substr(front, cutoff);
@@ -185,11 +195,13 @@ float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int
                                                                                                                
                int diff = 0;
                for (int b = 0; b < seqFrag.length(); b++) {
+                       //ignore gaps
+                       if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
                        if (seqFrag[b] != seqFragsub[b]) { diff++; }
                }
                
                //percentage of mismatched bases
-               float dist = diff / (float) seqFrag.length() * 100;       
+               float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
                                
                return dist;
        }
@@ -317,12 +329,8 @@ vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float>
                        //while you are in the window for this sequence
                        int count = 0;
                        for (int j = window[m]; j < (window[m]+size); j++) {   
-                               
-                               //is this a spot that is included in the mask
-                               if (h.count(j) > 0) {
-                                       average += probabilityProfile[j];
-                                       count++;
-                               }
+                               average += probabilityProfile[j];
+                               count++;
                        }
                                
                        average = average / count;
@@ -408,17 +416,31 @@ void DeCalculator::removeObviousOutliers(vector< vector<float> >& quantiles) {
                for (int i = 0; i < quantiles.size(); i++) {
                
                        //find mean of this quantile score
-                       float average = findAverage(quantiles[i]);
+                       sort(quantiles[i].begin(), quantiles[i].end());
                        
+                       float average = quantiles[i][int(quantiles[i].size() * 0.5)];
+cout << i << "\taverage = " << average << "\tquantiles[i].size = " << quantiles[i].size() << endl;                     
                        vector<float> newQuanI;
                        //look at each value in quantiles to see if it is an outlier
                        for (int j = 0; j < quantiles[i].size(); j++) {
                                
-                               float highCutoff, lowCutOff;
+                               float highCutOff, lowCutOff;
                                
+                               //99%
+                               highCutOff = sqrt(((quantiles[i][j] - average + 3) * (quantiles[i][j] - average + 3)) / (float)(quantiles[i].size() - 1));
                                
-                       
+                               //1%
+                               lowCutOff = sqrt(((quantiles[i][j] - average - 3) * (quantiles[i][j] - average + 3)) / (float)(quantiles[i].size() - 1));
+//cout << "high = " << highCutOff << " low = " << lowCutOff << " de = " << quantiles[i][j] << endl;                            
+                               //if this is below the highcutff and above the lowcutoff
+                               if ((quantiles[i][j] < highCutOff) && (quantiles[i][j] > lowCutOff)) {
+                                       
+                                       newQuanI.push_back(quantiles[i][j]);
+                                       
+                               }else { cout << "removed outlier:  high = " << highCutOff << " low = " << lowCutOff << " de = " << quantiles[i][j] << endl;      }
                        }
+                       
+                       quantiles[i] = newQuanI;
                
                }
 
@@ -452,17 +474,11 @@ float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
        try {
        
                //find average prob for this seqs windows
-               float probAverage = 0.0;
-               for (int j = 0; j < qav.size(); j++) {   probAverage += qav[j]; }
-               probAverage = probAverage / (float) qav.size();
-               
+               float probAverage = findAverage(qav);
+                               
                //find observed average 
-               float obsAverage = 0.0;
-               for (int j = 0; j < obs.size(); j++) {   obsAverage += obs[j];  }
-               obsAverage = obsAverage / (float) obs.size();
-//cout << "sum ai / m = " << probAverage << endl;              
-//cout << "sum oi / m = " << obsAverage << endl;       
-       
+               float obsAverage = findAverage(obs);
+                       
                float coef = obsAverage / probAverage;
                                                
                return coef;
index 1579d86cb8950c2cbbc857407233e1014283344f..4ce45bf751915dd9e077ce478ae912358e789900 100644 (file)
--- a/decalc.h
+++ b/decalc.h
@@ -28,7 +28,8 @@ class DeCalculator {
                ~DeCalculator() {};
                
                set<int> getPos() {  return h;  }
-               void setMask(string m); 
+               void setMask(string); 
+               void setAlignmentLength(int l) {  alignLength = l;  }
                void runMask(Sequence*);
                void trimSeqs(Sequence*, Sequence*, map<int, int>&);
                void removeObviousOutliers(vector< vector<float> >&);
@@ -46,6 +47,7 @@ class DeCalculator {
                float findAverage(vector<float> myVector);
                string seqMask;
                set<int> h;
+               int alignLength;
 
 };
 
index 8406b813711aeb655ba786ac47eb6df16020327b..509241d87fb2207c30ce1c7db0133a2a59f357da 100644 (file)
@@ -285,7 +285,7 @@ void GetSeqsCommand::readGroup(){
 //alignreport file has a column header line then all other lines contain 16 columns.  we just want the first column since that contains the name
 void GetSeqsCommand::readAlign(){
        try {
-               string outputFileName = getRootName(alignfile) + "pick" +  getExtension(alignfile);;
+               string outputFileName = getRootName(getRootName(alignfile)) + "pick.align.report";
                ofstream out;
                openOutputFile(outputFileName, out);
 
index bca7df96deabdd28a8f9e83423b6c1608073215e..39ac13f786ad3c6e94d4b1a8252f126a23200522 100644 (file)
@@ -42,8 +42,8 @@ cout << "my quantile = " << quantiles[index][4] << endl;
                        
                        //is your DE value higher than the 95%
                        string chimera;
-                       if (DE[i] > quantiles[index][4])        {       chimera = "Yes";        }
-                       else                                                            {       chimera = "No";         }
+                       if (DE[i] > quantiles[index][4])                {       chimera = "Yes";        }
+                       else                                                                    {       chimera = "No";         }
                        
                        out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl;
                        if (chimera == "Yes") {
@@ -126,6 +126,10 @@ void Pintail::getChimeras() {
                distcalculator = new ignoreGaps();
                decalc = new DeCalculator();
                
+               //if the user does enter a mask then you want to keep all the spots in the alignment
+               if (seqMask.length() == 0)      {       decalc->setAlignmentLength(querySeqs[0]->getAligned().length());        }
+               else                                            {       decalc->setAlignmentLength(seqMask.length());                                           }
+               
                decalc->setMask(seqMask);
                
                //find pairs
@@ -163,7 +167,7 @@ void Pintail::getChimeras() {
                }else                           {   probabilityProfile = readFreq();                      }
 
                //make P into Q
-               for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];   }  //cout << i << '\t' << probabilityProfile[i] << endl;
+               for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
                mothurOut("Done."); mothurOutEndLine();
                
                //mask querys
@@ -183,15 +187,15 @@ void Pintail::getChimeras() {
                if (processors == 1) { 
        
                        for (int j = 0; j < bestfit.size(); j++) { 
-                       cout << querySeqs[j]->getName() << " after mask = " << querySeqs[j]->getAligned() << endl << endl;
-                       cout << bestfit[j]->getName() << " after mask = " << bestfit[j]->getAligned() << endl << endl;
+                       //cout << querySeqs[j]->getName() << " after mask = " << querySeqs[j]->getAligned() << endl << endl;
+                       ///cout << bestfit[j]->getName() << " after mask = " << bestfit[j]->getAligned() << endl << endl;
                                decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);  
                        }
                        
                        mothurOut("Finding window breaks... "); cout.flush();
                        for (int i = lines[0]->start; i < lines[0]->end; i++) {
                                it = trimmed[i].begin();
-cout << i << '\t' << "trimmed = " << it->first << '\t' << it->second << endl;
+//cout << querySeqs[i]->getName() << '\t' << "trimmed = " << it->first << '\t' << it->second << endl;
                                vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
                                windowsForeachQuery[i] = win;
                        }
@@ -204,13 +208,13 @@ cout << i << '\t' << "trimmed = " << it->first << '\t' << it->second << endl;
                                                
                        mothurOut("Calculating observed distance... "); cout.flush();
                        for (int i = lines[0]->start; i < lines[0]->end; i++) {
-       cout << querySeqs[i]->getName() << '\t' << bestfit[i]->getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl;
+       //cout << querySeqs[i]->getName() << '\t' << bestfit[i]->getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl;
                                vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
        
        for (int j = 0; j < obsi.size(); j++) {
-               cout << obsi[j] << '\t';
+               //cout << obsi[j] << '\t';
        }
-       cout << endl;
+       //cout << endl;
                                obsDistance[i] = obsi;
                        }
                        mothurOut("Done."); mothurOutEndLine();
@@ -221,11 +225,11 @@ cout << i << '\t' << "trimmed = " << it->first << '\t' << it->second << endl;
                                vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
 
                                Qav[i] = q;
-cout << i+1 << endl;
+//cout << querySeqs[i]->getName() << endl;
 for (int j = 0; j < Qav[i].size(); j++) {
-       cout << Qav[i][j] << '\t';
+       //cout << Qav[i][j] << '\t';
 }
-cout << endl << endl;
+//cout << endl << endl;
 
                        }
                        mothurOut("Done."); mothurOutEndLine();
@@ -234,7 +238,7 @@ cout << endl << endl;
                        mothurOut("Calculating alpha... "); cout.flush();
                        for (int i = lines[0]->start; i < lines[0]->end; i++) {
                                float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
-cout << i+1 << "\tcoef = " << alpha << endl;
+//cout << querySeqs[i]->getName() << "\tcoef = " << alpha << endl;
                                seqCoef[i] = alpha;
                        }
                        mothurOut("Done."); mothurOutEndLine();
@@ -252,10 +256,10 @@ cout << i+1 << "\tcoef = " << alpha << endl;
                        for (int i = lines[0]->start; i < lines[0]->end; i++) {
                                float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
                                DE[i] = de;
-cout << querySeqs[i]->getName() << '\t' << "de value = " << de << endl;                                
+//cout << querySeqs[i]->getName() << '\t' << "de value = " << de << endl;                              
                                it = trimmed[i].begin();
                                float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); 
-cout << querySeqs[i]->getName() << '\t' << "dist value = " << dist << endl;
+//cout << querySeqs[i]->getName() << '\t' << "dist value = " << dist << endl;
                                deviation[i] = dist;
                        }
                        mothurOut("Done."); mothurOutEndLine();