]> git.donarmstrong.com Git - mothur.git/commitdiff
chimera.seqs pintail is working.
authorwestcott <westcott>
Tue, 11 Aug 2009 18:08:37 +0000 (18:08 +0000)
committerwestcott <westcott>
Tue, 11 Aug 2009 18:08:37 +0000 (18:08 +0000)
12 files changed:
Mothur.xcodeproj/project.pbxproj
bellerophon.cpp
bellerophon.h
chimera.cpp [new file with mode: 0644]
chimera.h
chimeraseqscommand.cpp
decalc.cpp
decalc.h
mallard.cpp [new file with mode: 0644]
mallard.h [new file with mode: 0644]
pintail.cpp
pintail.h

index 0935abb8d5081151e2ff884652a399a2ef86c18e..a57000321f8cec786bad83e8a2c70d919525b018 100644 (file)
@@ -13,6 +13,8 @@
                21E859D80FC4632E005E1A48 /* matrixoutputcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 21E859D70FC4632E005E1A48 /* matrixoutputcommand.cpp */; };
                370B88070F8A4EE4005AB382 /* getoturepcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 370B88060F8A4EE4005AB382 /* getoturepcommand.cpp */; };
                371B30B40FD7EE67000414CA /* screenseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 371B30B20FD7EE67000414CA /* screenseqscommand.cpp */; };
+               372095C2103196D70004D347 /* chimera.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372095C1103196D70004D347 /* chimera.cpp */; };
+               372095CA1031A1D50004D347 /* mallard.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372095C91031A1D50004D347 /* mallard.cpp */; };
                372A55551017922B00C5194B /* decalc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372A55541017922B00C5194B /* decalc.cpp */; };
                372E12700F26365B0095CF7E /* readotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E126F0F26365B0095CF7E /* readotucommand.cpp */; };
                372E12960F263D5A0095CF7E /* readdistcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12950F263D5A0095CF7E /* readdistcommand.cpp */; };
                370B88060F8A4EE4005AB382 /* getoturepcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getoturepcommand.cpp; sourceTree = SOURCE_ROOT; };
                371B30B20FD7EE67000414CA /* screenseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = screenseqscommand.cpp; sourceTree = SOURCE_ROOT; };
                371B30B30FD7EE67000414CA /* screenseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = screenseqscommand.h; sourceTree = SOURCE_ROOT; };
-               372A55531017922B00C5194B /* decalc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decalc.h; sourceTree = SOURCE_ROOT; };
-               372A55541017922B00C5194B /* decalc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decalc.cpp; sourceTree = SOURCE_ROOT; };
+               372095C1103196D70004D347 /* chimera.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimera.cpp; sourceTree = "<group>"; };
+               372095C81031A1D50004D347 /* mallard.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mallard.h; sourceTree = "<group>"; };
+               372095C91031A1D50004D347 /* mallard.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mallard.cpp; sourceTree = "<group>"; };
+               372A55531017922B00C5194B /* decalc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decalc.h; sourceTree = "<group>"; };
+               372A55541017922B00C5194B /* decalc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decalc.cpp; sourceTree = "<group>"; };
                372E126E0F26365B0095CF7E /* readotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readotucommand.h; sourceTree = SOURCE_ROOT; };
                372E126F0F26365B0095CF7E /* readotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readotucommand.cpp; sourceTree = SOURCE_ROOT; };
                372E12940F263D5A0095CF7E /* readdistcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readdistcommand.h; sourceTree = SOURCE_ROOT; };
                3749271C0FD58C840031C06B /* getsabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsabundcommand.cpp; sourceTree = SOURCE_ROOT; };
                3749273D0FD5956B0031C06B /* getrabundcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getrabundcommand.h; sourceTree = SOURCE_ROOT; };
                3749273E0FD5956B0031C06B /* getrabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getrabundcommand.cpp; sourceTree = SOURCE_ROOT; };
-               374F2FD51006320200E97C66 /* chimera.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimera.h; sourceTree = SOURCE_ROOT; };
-               374F2FE9100634B600E97C66 /* bellerophon.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bellerophon.h; sourceTree = SOURCE_ROOT; };
-               374F2FEA100634B600E97C66 /* bellerophon.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bellerophon.cpp; sourceTree = SOURCE_ROOT; };
-               374F301110063B6F00E97C66 /* pintail.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pintail.h; sourceTree = SOURCE_ROOT; };
-               374F301210063B6F00E97C66 /* pintail.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pintail.cpp; sourceTree = SOURCE_ROOT; };
+               374F2FD51006320200E97C66 /* chimera.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimera.h; sourceTree = "<group>"; };
+               374F2FE9100634B600E97C66 /* bellerophon.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bellerophon.h; sourceTree = "<group>"; };
+               374F2FEA100634B600E97C66 /* bellerophon.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bellerophon.cpp; sourceTree = "<group>"; };
+               374F301110063B6F00E97C66 /* pintail.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pintail.h; sourceTree = "<group>"; };
+               374F301210063B6F00E97C66 /* pintail.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pintail.cpp; sourceTree = "<group>"; };
                37519A690F80E6EB00FED5E8 /* sharedanderbergs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedanderbergs.h; sourceTree = SOURCE_ROOT; };
                37519A6A0F80E6EB00FED5E8 /* sharedanderbergs.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedanderbergs.cpp; sourceTree = SOURCE_ROOT; };
                37519A9F0F810D0200FED5E8 /* venncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = venncommand.h; sourceTree = SOURCE_ROOT; };
                3799A94D0FD6A58C00E33EDE /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = SOURCE_ROOT; };
                3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqsummarycommand.cpp; sourceTree = SOURCE_ROOT; };
                3799A94F0FD6A58C00E33EDE /* seqsummarycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqsummarycommand.h; sourceTree = SOURCE_ROOT; };
-               379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraseqscommand.h; sourceTree = SOURCE_ROOT; };
-               379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraseqscommand.cpp; sourceTree = SOURCE_ROOT; };
-               379D3D6B0FF917F40068C1C0 /* filters.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filters.h; sourceTree = SOURCE_ROOT; };
+               379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraseqscommand.h; sourceTree = "<group>"; };
+               379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraseqscommand.cpp; sourceTree = "<group>"; };
+               379D3D6B0FF917F40068C1C0 /* filters.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filters.h; sourceTree = "<group>"; };
                37AD4CE20F28AEA300AA2D49 /* sharedlistvector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedlistvector.h; sourceTree = SOURCE_ROOT; };
                37AD4CE30F28AEA300AA2D49 /* sharedlistvector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedlistvector.cpp; sourceTree = SOURCE_ROOT; };
                37AD4DB90F28E2FE00AA2D49 /* tree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tree.h; sourceTree = SOURCE_ROOT; };
                37AFC71E0F445386005F492D /* sharedsobscollectsummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedsobscollectsummary.cpp; sourceTree = SOURCE_ROOT; };
                37B28F660F27590100808A62 /* deconvolutecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deconvolutecommand.h; sourceTree = SOURCE_ROOT; };
                37B28F670F27590100808A62 /* deconvolutecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deconvolutecommand.cpp; sourceTree = SOURCE_ROOT; };
-               37B73C741004BEFD008C4B41 /* listseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = listseqscommand.h; sourceTree = SOURCE_ROOT; };
-               37B73C751004BEFD008C4B41 /* listseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = listseqscommand.cpp; sourceTree = SOURCE_ROOT; };
-               37B73CA41004D89A008C4B41 /* getseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getseqscommand.h; sourceTree = SOURCE_ROOT; };
-               37B73CA51004D89A008C4B41 /* getseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getseqscommand.cpp; sourceTree = SOURCE_ROOT; };
-               37B73CBE1004EB38008C4B41 /* removeseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removeseqscommand.h; sourceTree = SOURCE_ROOT; };
-               37B73CBF1004EB38008C4B41 /* removeseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removeseqscommand.cpp; sourceTree = SOURCE_ROOT; };
-               37B73CCD1004F5E0008C4B41 /* systemcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = systemcommand.h; sourceTree = SOURCE_ROOT; };
-               37B73CCE1004F5E0008C4B41 /* systemcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = systemcommand.cpp; sourceTree = SOURCE_ROOT; };
+               37B73C741004BEFD008C4B41 /* listseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = listseqscommand.h; sourceTree = "<group>"; };
+               37B73C751004BEFD008C4B41 /* listseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = listseqscommand.cpp; sourceTree = "<group>"; };
+               37B73CA41004D89A008C4B41 /* getseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getseqscommand.h; sourceTree = "<group>"; };
+               37B73CA51004D89A008C4B41 /* getseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getseqscommand.cpp; sourceTree = "<group>"; };
+               37B73CBE1004EB38008C4B41 /* removeseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removeseqscommand.h; sourceTree = "<group>"; };
+               37B73CBF1004EB38008C4B41 /* removeseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removeseqscommand.cpp; sourceTree = "<group>"; };
+               37B73CCD1004F5E0008C4B41 /* systemcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = systemcommand.h; sourceTree = "<group>"; };
+               37B73CCE1004F5E0008C4B41 /* systemcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = systemcommand.cpp; sourceTree = "<group>"; };
                37C1D9710F86506E0059E3F0 /* binsequencecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = binsequencecommand.h; sourceTree = SOURCE_ROOT; };
                37C1D9720F86506E0059E3F0 /* binsequencecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = binsequencecommand.cpp; sourceTree = SOURCE_ROOT; };
                37C753CC0FB3415200DBD02E /* distancecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = distancecommand.h; sourceTree = SOURCE_ROOT; };
                        isa = PBXGroup;
                        children = (
                                374F2FD51006320200E97C66 /* chimera.h */,
+                               372095C1103196D70004D347 /* chimera.cpp */,
                                374F2FE9100634B600E97C66 /* bellerophon.h */,
                                374F2FEA100634B600E97C66 /* bellerophon.cpp */,
                                372A55531017922B00C5194B /* decalc.h */,
                                372A55541017922B00C5194B /* decalc.cpp */,
+                               372095C81031A1D50004D347 /* mallard.h */,
+                               372095C91031A1D50004D347 /* mallard.cpp */,
                                374F301110063B6F00E97C66 /* pintail.h */,
                                374F301210063B6F00E97C66 /* pintail.cpp */,
                        );
                                374F2FEB100634B600E97C66 /* bellerophon.cpp in Sources */,
                                374F301310063B6F00E97C66 /* pintail.cpp in Sources */,
                                372A55551017922B00C5194B /* decalc.cpp in Sources */,
+                               372095C2103196D70004D347 /* chimera.cpp in Sources */,
+                               372095CA1031A1D50004D347 /* mallard.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
index f2dd821acf896302bc4b265cdb3bd8a03e89428d..86bc7527b9ae1458ed1a9fb81be99faf1728343e 100644 (file)
@@ -92,7 +92,7 @@ void Bellerophon::getChimeras() {
                distCalculator = new eachGapDist();
                
                //read in sequences
-               readSeqs();
+               //seqs = readSeqs(fastafile);
                
                int numSeqs = seqs.size();
                
@@ -173,6 +173,7 @@ cout << "increment = " << increment << endl;
                                vector<SeqMap> distMapLeft;
                                
                                // Create a data structure to quickly access the distance information.
+                               //this is from thallingers reimplementation on get.oturep
                                // It consists of a vector of distance maps, where each map contains
                                // all distances of a certain sequence. Vector and maps are accessed
                                // via the index of a sequence in the distance matrix
@@ -231,31 +232,6 @@ cout << "increment = " << increment << endl;
        }
 }
 
-//***************************************************************************************************************
-void Bellerophon::readSeqs(){
-       try {
-               ifstream inFASTA;
-               openInputFile(fastafile, inFASTA);
-               
-               //read in seqs and store in vector
-               while(!inFASTA.eof()){
-                       Sequence current(inFASTA);
-                       
-                       if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
-                       
-                       seqs.push_back(current);
-                       
-                       gobble(inFASTA);
-               }
-               inFASTA.close();
-
-       }
-       catch(exception& e) {
-               errorOut(e, "Bellerophon", "readSeqs");
-               exit(1);
-       }
-}
-
 /***************************************************************************************************************/
 int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
        try {
@@ -272,8 +248,7 @@ int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* spar
                                
                        }
                }
-                       
-       
+               
                return 1;
        }
        catch(exception& e) {
index 32b38e40328ecccec31359cc112e38f3eeed2924..b79089c97c87e13e8f93288707ef9b374595d34a 100644 (file)
@@ -51,13 +51,8 @@ class Bellerophon : public Chimera {
                string fastafile;
                int iters;
                
-               void readSeqs();
                void generatePreferences(vector<SeqMap>, vector<SeqMap>, int);
                int createSparseMatrix(int, int, SparseMatrix*, vector<Sequence>);
-       
-
-               
-
 };
 
 /***********************************************************/
diff --git a/chimera.cpp b/chimera.cpp
new file mode 100644 (file)
index 0000000..48e5763
--- /dev/null
@@ -0,0 +1,63 @@
+/*
+ *  chimera.cpp
+ *  Mothur
+ *
+ *  Created by Sarah Westcott on 8/11/09.
+ *  Copyright 2009 Schloss Lab Umass Amherst. All rights reserved.
+ *
+ */
+
+#include "chimera.h"
+
+//***************************************************************************************************************
+vector<Sequence*> Chimera::readSeqs(string file) {
+       try {
+               ifstream in;
+               openInputFile(file, in);
+               vector<Sequence*> container;
+               
+               //read in seqs and store in vector
+               while(!in.eof()){
+                       
+                       Sequence* current = new Sequence(in);
+                       container.push_back(current);
+                       gobble(in);
+               }
+               
+               in.close();
+               return container;
+       }
+       catch(exception& e) {
+               errorOut(e, "Chimera", "readSeqs");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+void Chimera::setMask(string filename) {
+       try {
+               
+               if (filename == "default") {
+                       //default is from wigeon  236627 EU009184.1 Shigella dysenteriae str. FBD013
+                       seqMask = ".....................................................................................................AAATTGAAGAGTTT-GA--T-CA-T-G-GCTC-AG-AT-TGAA-C-GC--TGG-C--G-GC-A-GG--C----C-T--AACACA-T-GC-A-AGT-CGA-A-CG----------G-TAA-CA-G----------------------------GAAG-A-AG----------------------------------------------------CTT-G----------------------------------------------------------------------------------CT-TCTTT----------------G-CT--G--AC--G--AG-T-GG-C-GG-A--C-------------GGG-TGAGT-A--AT-GT-C-T-G-GG---A-A--A-CT-G--C-C-TGA--TG-G------------------------------------------------------------------A-GG----GGG-AT-AA-CTA-------------------------C-T-G-----------------------GAA-A---CGG-TAG-CTAA-TA---CC-G--C-AT-A----------A--------------------C-------------------------------------GT-C-----------------------------------------------------------------------------------------------------------------------G-CA-A--------------------------------------------------------------------------------------------------------------------------------------G-A-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CAAA--G-A-G-GG-----G--GA-C-CT--------------------------------------------------------------------------------------------------------------------TCG-G----------------------------------------------------------------------------------------------------------------------G----CC-TC--T---T-G--------------C----C-A---T-CG-G---AT---G-T-----G-CCC-AGA--T-GGG--A------TT--A--G-CT-A----G---TAGG-T-G-GG-G-T----AAC-GG-C-T-C-ACCT--A-GG-C-G--A-CG-A------------TCC-C-T------AG-CT-G-G-TCT-G-AG----A--GG-AT--G-AC-C-AG-CCAC-A-CTGGA--A-C-TG-A-GA-C-AC-G-G-TCCAGA-CTCC-TAC-G--G-G-A-G-GC-A-GC-A-G-TG---GG-G-A-ATA-TTGCA-C-AA-T-GG--GC-GC-A----A-G-CC-T-GA-TG-CA-GCCA-TGCC-G-CG-T---G-T-A--T--GA-A-G--A--A-G-G-CC-----TT-CG---------G-G-T-T-G-T--A---AA-G-TAC--------TT-TC-A-G--C-GGG----GA-G--G---AA-GGGA---GTAA-AG----T--T--AA-T---A----C-----CT-T-TGC-TCA-TT-GA-CG-TT-A-C-CC-G-CA-G---------AA-----------GAAGC-ACC-GG-C-TAA---C--T-CCGT--GCCA--G-C---A--GCCG---C-GG--TA-AT--AC---GG-AG-GGT-GCA-A-G-CG-TTAA-T-CGG-AA-TT-A--C-T--GGGC-GTA----AA-GCGC-AC--G-CA-G-G-C-G------------G--T-TT-G-T-T-AA----G-T-C-A---G-ATG-TG-A-AA-TC--CC-CGG-G--------------------------------------------------------------------CT-C-AA-------------------------------------------------------------------------CC-T-G-GG-AA-C----T-G-C-A-T-C--------T--GA-T-A-C-T-G-GCA--A-G-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------T-T-G-A-G-T-C-----T-CG--TA-G-A------------G-GG-G-GG-T----AG--AATT-CCA-G-GT--GT-A-GCG-GTGAAA-TG-CGT-AGAG-A-TC-T-GGA--GG-A-AT-A-CC-GG--T--G--GC-GAA-G--G-C---G----G--C-C-CCCTG------G-AC-GA--------------------------------------------------------------AG-A-C-T--GA--CG-----CT-CA-GG--T-G-CGA--AA-G-C--------------G-TGGG-GAG-C-A-AACA--GG-ATTA-G-ATA-C-----CC-T-G-GTA-G-T----C-CA--C-G-CCG-T-AAA--C-GATG-TC--GA-CT---------T-GG--A--G-G-TT-G-TG-C--C--------------------------------------------------------------------------------------CTT-GA--------------------------------------------------------------------------------------------------------------------------------------------------G-G-C-GT--G-G-C-T-TC-C------GG--A----GC-TAA--CG-C-G-T--T--AA-GT--C----G-ACC-GCC-T-G-GG-GAG-TA---CGG-----C-C--G-C-A-A-GGT-T--AAA-ACTC-AAA---------TGAA-TTG-ACGGG-G-G-CCCG----C-A--C-A-A-GCG-GT-G--G--AG-CA-T--GT-GGT-TT-AATT-C-G-ATG-CAAC-G-CG-A-AG-A-A-CC-TT-A-CC-TGGTC-TT-G-AC-A-T-C--------------CAC-G-G-------------A-AG-T-T-T--TC--A-GA-G-A-T--G-A-G--A-A-T-G--T-G-----CC-------------------------------------T--TC-G------------------------------------------GG----A----A---CC-GTG---A--GA---------------------------------------------------C-A-G-G-T-GCTG-CA-TGG-CT--GTC-GTC-A-GC-TC---G-TG-TT-G--TGA-AA-TGT-T-GG-G-TT-AA-GT-CCCGC-AA--------C-GAG-CGC-A-ACC-C-T-TA--TC--C-TTTG--T-T-G-C-C---AG-C-G-----G-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TCC------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GG---C----C-G------------G----G---A-A--CT---------------C-A-A-A-G-GA-G--AC-T-G-CCA--G-T------------------------------------G-A---TAA----------------------------------A-C-T-G--G-A-GG-A--AGG-T--GGGG-A-TGAC-GTC--AAGT-C---ATC-A-T-G-G-C-C-CTT----AC-G--AC-C-A-GG-GC-TA-CAC-ACGTG-C--TA--CAATG---G-CGCA-T-A--C-AAA-GA-GA--------------------------------------------------------------------------------------------------A-G-C-G-A--C-CTCG-C--G---------------------------------------A-GA-G-C-----------A--A-G-CG---G----------A--CCT-C------A-T-AAAGT-GC-G-T-C-G-TAG-TCC--------GGA-T-TGGAG-TC--T-GCAA-CT-C-------------------------------------------------------------------------------------------------G-ACTCC-A-T-G-AA-G-TC-GGAAT-CG-C-TA--G-TA-AT-C-G-T----GGA-TC-A-G--A------AT--GCC-AC-G-GT-G-AAT-ACGT-T-CCCGGGCCT-TGTA----CACACCG-CCC-GTC-----A---CA--CCA-TG-GG-A--G---TGG-G-TT-GC-AAA--A-GAA------G--T-AGG-TA-G-C-T-T-AA-C-C--------------------------------------------------------------TT----C-------------------------------------------------------------------------------------------------G--GG-A--GG-G--C---GC-TTA--CC--ACT-T----T-GTG-AT-TCA------------------------TG--ACT-GGGG-TG-AAG-TCGTAACAA-GGTAA-CCGT-AGGGGAA-CCTG-CGGT-TGGATCACCTCCTTA................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................";
+               }else if (filename == "") {  //do nothing 
+                       seqMask = "";
+               }else{
+                       ifstream infile;
+                       openInputFile(filename, infile);
+                       
+                       while (!infile.eof()) {
+                               Sequence temp(infile);
+                               seqMask = temp.getAligned();
+                               
+                               gobble(infile);
+                       }
+                       
+                       infile.close();
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "Chimera", "setMask");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
index 9142bacaa09a016d1cabd25560f1e428a699a15c..fd6b53a26778ff247ba43ee5f5a5bc6b6fa6c5c8 100644 (file)
--- a/chimera.h
+++ b/chimera.h
@@ -37,56 +37,8 @@ class Chimera {
                
                virtual void setCons(string) {};
                virtual void setQuantiles(string) {};
-               
-               virtual vector<Sequence*> readSeqs(string file) {
-                       try {
-                               ifstream in;
-                               openInputFile(file, in);
-                               vector<Sequence*> container;
-                               
-                               //read in seqs and store in vector
-                               while(!in.eof()){
-
-                                       Sequence* current = new Sequence(in);
-                                       container.push_back(current);
-                                       gobble(in);
-                               }
-                               
-                               in.close();
-                               return container;
-                       }
-                       catch(exception& e) {
-                               errorOut(e, "Chimera", "readSeqs");
-                               exit(1);
-                       }
-               }
-               
-               
-               virtual void setMask(string filename) {
-                       try {
-                               
-                               if (filename == "default") {
-                                       //default is from wigeon  236627 EU009184.1 Shigella dysenteriae str. FBD013
-                                       seqMask = ".....................................................................................................AAATTGAAGAGTTT-GA--T-CA-T-G-GCTC-AG-AT-TGAA-C-GC--TGG-C--G-GC-A-GG--C----C-T--AACACA-T-GC-A-AGT-CGA-A-CG----------G-TAA-CA-G----------------------------GAAG-A-AG----------------------------------------------------CTT-G----------------------------------------------------------------------------------CT-TCTTT----------------G-CT--G--AC--G--AG-T-GG-C-GG-A--C-------------GGG-TGAGT-A--AT-GT-C-T-G-GG---A-A--A-CT-G--C-C-TGA--TG-G------------------------------------------------------------------A-GG----GGG-AT-AA-CTA-------------------------C-T-G-----------------------GAA-A---CGG-TAG-CTAA-TA---CC-G--C-AT-A----------A--------------------C-------------------------------------GT-C-----------------------------------------------------------------------------------------------------------------------G-CA-A--------------------------------------------------------------------------------------------------------------------------------------G-A-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CAAA--G-A-G-GG-----G--GA-C-CT--------------------------------------------------------------------------------------------------------------------TCG-G----------------------------------------------------------------------------------------------------------------------G----CC-TC--T---T-G--------------C----C-A---T-CG-G---AT---G-T-----G-CCC-AGA--T-GGG--A------TT--A--G-CT-A----G---TAGG-T-G-GG-G-T----AAC-GG-C-T-C-ACCT--A-GG-C-G--A-CG-A------------TCC-C-T------AG-CT-G-G-TCT-G-AG----A--GG-AT--G-AC-C-AG-CCAC-A-CTGGA--A-C-TG-A-GA-C-AC-G-G-TCCAGA-CTCC-TAC-G--G-G-A-G-GC-A-GC-A-G-TG---GG-G-A-ATA-TTGCA-C-AA-T-GG--GC-GC-A----A-G-CC-T-GA-TG-CA-GCCA-TGCC-G-CG-T---G-T-A--T--GA-A-G--A--A-G-G-CC-----TT-CG---------G-G-T-T-G-T--A---AA-G-TAC--------TT-TC-A-G--C-GGG----GA-G--G---AA-GGGA---GTAA-AG----T--T--AA-T---A----C-----CT-T-TGC-TCA-TT-GA-CG-TT-A-C-CC-G-CA-G---------AA-----------GAAGC-ACC-GG-C-TAA---C--T-CCGT--GCCA--G-C---A--GCCG---C-GG--TA-AT--AC---GG-AG-GGT-GCA-A-G-CG-TTAA-T-CGG-AA-TT-A--C-T--GGGC-GTA----AA-GCGC-AC--G-CA-G-G-C-G------------G--T-TT-G-T-T-AA----G-T-C-A---G-ATG-TG-A-AA-TC--CC-CGG-G--------------------------------------------------------------------CT-C-AA-------------------------------------------------------------------------CC-T-G-GG-AA-C----T-G-C-A-T-C--------T--GA-T-A-C-T-G-GCA--A-G-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------T-T-G-A-G-T-C-----T-CG--TA-G-A------------G-GG-G-GG-T----AG--AATT-CCA-G-GT--GT-A-GCG-GTGAAA-TG-CGT-AGAG-A-TC-T-GGA--GG-A-AT-A-CC-GG--T--G--GC-GAA-G--G-C---G----G--C-C-CCCTG------G-AC-GA--------------------------------------------------------------AG-A-C-T--GA--CG-----CT-CA-GG--T-G-CGA--AA-G-C--------------G-TGGG-GAG-C-A-AACA--GG-ATTA-G-ATA-C-----CC-T-G-GTA-G-T----C-CA--C-G-CCG-T-AAA--C-GATG-TC--GA-CT---------T-GG--A--G-G-TT-G-TG-C--C--------------------------------------------------------------------------------------CTT-GA--------------------------------------------------------------------------------------------------------------------------------------------------G-G-C-GT--G-G-C-T-TC-C------GG--A----GC-TAA--CG-C-G-T--T--AA-GT--C----G-ACC-GCC-T-G-GG-GAG-TA---CGG-----C-C--G-C-A-A-GGT-T--AAA-ACTC-AAA---------TGAA-TTG-ACGGG-G-G-CCCG----C-A--C-A-A-GCG-GT-G--G--AG-CA-T--GT-GGT-TT-AATT-C-G-ATG-CAAC-G-CG-A-AG-A-A-CC-TT-A-CC-TGGTC-TT-G-AC-A-T-C--------------CAC-G-G-------------A-AG-T-T-T--TC--A-GA-G-A-T--G-A-G--A-A-T-G--T-G-----CC-------------------------------------T--TC-G------------------------------------------GG----A----A---CC-GTG---A--GA---------------------------------------------------C-A-G-G-T-GCTG-CA-TGG-CT--GTC-GTC-A-GC-TC---G-TG-TT-G--TGA-AA-TGT-T-GG-G-TT-AA-GT-CCCGC-AA--------C-GAG-CGC-A-ACC-C-T-TA--TC--C-TTTG--T-T-G-C-C---AG-C-G-----G-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TCC------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GG---C----C-G------------G----G---A-A--CT---------------C-A-A-A-G-GA-G--AC-T-G-CCA--G-T------------------------------------G-A---TAA----------------------------------A-C-T-G--G-A-GG-A--AGG-T--GGGG-A-TGAC-GTC--AAGT-C---ATC-A-T-G-G-C-C-CTT----AC-G--AC-C-A-GG-GC-TA-CAC-ACGTG-C--TA--CAATG---G-CGCA-T-A--C-AAA-GA-GA--------------------------------------------------------------------------------------------------A-G-C-G-A--C-CTCG-C--G---------------------------------------A-GA-G-C-----------A--A-G-CG---G----------A--CCT-C------A-T-AAAGT-GC-G-T-C-G-TAG-TCC--------GGA-T-TGGAG-TC--T-GCAA-CT-C-------------------------------------------------------------------------------------------------G-ACTCC-A-T-G-AA-G-TC-GGAAT-CG-C-TA--G-TA-AT-C-G-T----GGA-TC-A-G--A------AT--GCC-AC-G-GT-G-AAT-ACGT-T-CCCGGGCCT-TGTA----CACACCG-CCC-GTC-----A---CA--CCA-TG-GG-A--G---TGG-G-TT-GC-AAA--A-GAA------G--T-AGG-TA-G-C-T-T-AA-C-C--------------------------------------------------------------TT----C-------------------------------------------------------------------------------------------------G--GG-A--GG-G--C---GC-TTA--CC--ACT-T----T-GTG-AT-TCA------------------------TG--ACT-GGGG-TG-AAG-TCGTAACAA-GGTAA-CCGT-AGGGGAA-CCTG-CGGT-TGGATCACCTCCTTA................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................";
-                               }else{
-                                       ifstream infile;
-                                       openInputFile(filename, infile);
-                                       
-                                       while (!infile.eof()) {
-                                               Sequence temp(infile);
-                                               seqMask = temp.getAligned();
-                                               
-                                               gobble(infile);
-                                       }
-                                       
-                                       infile.close();
-                               }
-                       }
-                       catch(exception& e) {
-                               errorOut(e, "Chimera", "setMask");
-                               exit(1);
-                       }
-               }
+               virtual vector<Sequence*> readSeqs(string);
+               virtual void setMask(string);
                
                //pure functions
                virtual void getChimeras() = 0; 
index 17f3a71a5033f1d28835bbd562af5d5f9a6b5a18..899d6ef33a5ec05ffe12ea24891c9680b33d0733 100644 (file)
@@ -65,7 +65,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                        
 
                        string temp;
-                       temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "T"; }
+                       temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "F"; }
                        filter = isTrue(temp);
                        
                        temp = validParameter.validFile(parameters, "correction", false);               if (temp == "not found") { temp = "T"; }
@@ -96,13 +96,20 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
 
 void ChimeraSeqsCommand::help(){
        try {
+               mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED.\n");
                mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n");
-               mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask and method.  fasta is required.\n");
-               mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter.  The default is false. \n");
-               mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.   The default is true. \n");
+               mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation and quantile.\n");
+               mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter this only applies when the method is bellerphon.  The default is false. \n");
+               mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.   The default is true. This only applies when the method is bellerphon.\n");
                mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
-               mothurOut("The method parameter allows you to specify the method for finding chimeric sequences.  The default is pintail. \n");
-               mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the pintail and mallard method.  The default is 236627 EU009184.1 Shigella dysenteriae str. FBD013. \n");
+               mothurOut("The method parameter allows you to specify the method for finding chimeric sequences.  The default is pintail. Options include..... \n");
+               mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the pintail and mallard method.  The default is no mask.  If you enter mask=default, then the mask is 236627 EU009184.1 Shigella dysenteriae str. FBD013. \n");
+               mothurOut("The window parameter allows you to specify the window size for searching for chimeras.  The default is 300 is method is pintail unless the sequence length is less than 300, and 1/4 sequence length for bellerphon.\n");
+               mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences.  The default is 25.\n");
+               mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences for use by the pintail algorythm. It is a required parameter if using pintail.\n");
+               mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment for use by the pintail algorythm. It is not required, but will speed up the pintail method.\n");
+               mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences for use by the pintail algorythm. It is not required, but will speed up the pintail method.\n");
+               mothurOut("If you have run chimera.seqs using pintail a .quan and .freq file will be created if you have not provided them for use in future command executions.");
                mothurOut("The chimera.seqs command should be in the following format: \n");
                mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
                mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n");
@@ -150,7 +157,7 @@ int ChimeraSeqsCommand::execute(){
                //find chimeras
                chimera->getChimeras();
                
-               string outputFileName = getRootName(fastafile) + method + ".chimeras";
+               string outputFileName = getRootName(fastafile) + method + maskfile + ".chimeras";
                ofstream out;
                openOutputFile(outputFileName, out);
                
index 18377c2d77b61d4c92d3628c7a5e91adbd15257e..5aed358b5fe35d5ff20047fa4d857f72f5986ea3 100644 (file)
@@ -81,7 +81,6 @@ void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& t
        }
 }
 //***************************************************************************************************************
-//find the window breaks for each sequence - this is so you can move ahead by bases.
 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
        try {
                
@@ -154,22 +153,25 @@ vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int
 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
        try {
                
-               vector<float> temp;
-//cout << "query length = " << query->getAligned().length() << '\t' << " subject length = " << subject.getAligned().length() << endl;                          
+               vector<float> temp;     
+                               
                for (int m = 0; m < window.size(); m++) {
                                                
                        string seqFrag = query->getAligned().substr(window[m], size);
                        string seqFragsub = subject->getAligned().substr(window[m], size);
-       //cout << "start point = " << window[m] << " end point = " << window[m]+size << endl;                                           
+                               
                        int diff = 0;
                        for (int b = 0; b < seqFrag.length(); b++) {
-
-                               if (seqFrag[b] != seqFragsub[b]) { diff++; }
+                               //if at least one is a base and they are not equal
+                               if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
                        }
                
                        //percentage of mismatched bases
                        float dist;
-                       dist = diff / (float) seqFrag.length() * 100;       
+                       
+                       //if the whole fragment is 0 distance = 0
+                       
+                       dist = diff / (float) (seqFrag.length()) * 100;      
                                
                        temp.push_back(dist);
                }
@@ -199,6 +201,9 @@ float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int
                        if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
                        if (seqFrag[b] != seqFragsub[b]) { diff++; }
                }
+               
+               //if the whole fragment is 0 distance = 0
+               if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
                
                //percentage of mismatched bases
                float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
@@ -346,26 +351,30 @@ vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float>
                exit(1);
        }
 }
-
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareQuanMembers(quanMember left, quanMember right){
+       return (left.score < right.score);      
+} 
 //***************************************************************************************************************
-vector< vector<float> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
+//seqs have already been masked
+vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end, vector<float>& highestDE) {
        try {
-               vector< vector<float> > quan; 
+               vector< vector<quanMember> > quan; 
                
                //percentage of mismatched pairs 1 to 100
                quan.resize(100);
                
-               
                //for each sequence
                for(int i = start; i < end; i++){
                
                        mothurOut("Processing template sequence " + toString(i)); mothurOutEndLine();
-                       Sequence* query = seqs[i];
+                       Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
                        
                        //compare to every other sequence in template
                        for(int j = 0; j < i; j++){
                                
-                               Sequence* subject = seqs[j];
+                               Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
                                
                                map<int, int> trim;
                                map<int, int>::iterator it;
@@ -394,12 +403,23 @@ vector< vector<float> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vecto
                                
                                dist = ceil(dist);
                                
+                               quanMember newScore(de, i, j);
+                               
                                //dist-1 because vector indexes start at 0.
-                               quan[dist-1].push_back(de);
+                               quan[dist-1].push_back(newScore);
+                               
+                               //save highestDE
+                               if(de > highestDE[i]) { highestDE[i] = de;  }
+                               if(de > highestDE[j]) { highestDE[j] = de;  }
+                               
+                               delete subject;
                                
                        }
+                       
+                       delete query;
                }
 
+               
                return quan;
                                                
        }
@@ -408,45 +428,259 @@ vector< vector<float> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vecto
                exit(1);
        }
 }
+
 //***************************************************************************************************************
-void DeCalculator::removeObviousOutliers(vector< vector<float> >& quantiles) {
+vector< vector<float> > DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
        try {
-               
+               vector< vector<float> > quan; 
+               quan.resize(100);
        
+               /*vector<quanMember> contributions;  
+               vector<int> seen;  //seen[0] is the number of outliers that template seqs[0] was part of.
+               seen.resize(num,0);
+                               
+               //find contributions
                for (int i = 0; i < quantiles.size(); i++) {
                
                        //find mean of this quantile score
-                       sort(quantiles[i].begin(), quantiles[i].end());
+                       sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
                        
-                       float average = quantiles[i][int(quantiles[i].size() * 0.5)];
-cout << i << "\taverage = " << average << "\tquantiles[i].size = " << quantiles[i].size() << endl;                     
-                       vector<float> newQuanI;
+                       float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
+                       float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
+               
                        //look at each value in quantiles to see if it is an outlier
                        for (int j = 0; j < quantiles[i].size(); j++) {
                                
-                               float highCutOff, lowCutOff;
+                               //is this score between 1 and 99%
+                               if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
+                                       
+                               }else {
+                                       //add to contributions
+                                       contributions.push_back(quantiles[i][j]);
+                                       seen[quantiles[i][j].member1]++;
+                                       seen[quantiles[i][j].member2]++;
+                               }
+                       }
+               }
+
+               //find contributer with most offending score related to it
+               int largestContrib = findLargestContrib(seen);
+       
+               //while you still have guys to eliminate
+               while (contributions.size() > 0) {
+               
+                       mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
+                       
+                       //remove from quantiles all scores that were made using this largestContrib
+                       for (int i = 0; i < quantiles.size(); i++) {
+//cout << "before remove " << quantiles[i].size() << '\t';
+                               removeContrib(largestContrib, quantiles[i]);
+//cout << "after remove " << quantiles[i].size() << endl;
+                       }
+//cout << count << " largest contrib = " << largestContrib << endl;  count++;
+                       //remove from contributions all scores that were made using this largestContrib
+                       removeContrib(largestContrib, contributions);
+                       
+                       //"erase" largestContrib
+                       seen[largestContrib] = -1;
+                       
+                       //get next largestContrib
+                       largestContrib = findLargestContrib(seen);
+               }
+               */
+               
+               vector<int> marked = returnObviousOutliers(quantiles, num);
+               vector<int> copyMarked = marked;
+               
+               //find the 99% of marked
+               sort(copyMarked.begin(), copyMarked.end());
+               int high = copyMarked[int(copyMarked.size() * 0.99)];
+cout << "high = " << high << endl;             
+               
+               for(int i = 0; i < marked.size(); i++) {
+                       if (marked[i] > high) { 
+                               mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
+                               for (int i = 0; i < quantiles.size(); i++) {
+                                       removeContrib(marked[i], quantiles[i]);
+                               }
+                       }
+
+               }
+               
+               
+               //adjust quantiles
+               for (int i = 0; i < quantiles.size(); i++) {
+                       vector<float> temp;
+                       
+                       if (quantiles[i].size() == 0) {
+                               //in case this is not a distance found in your template files
+                               for (int g = 0; g < 6; g++) {
+                                       temp.push_back(0.0);
+                               }
+                       }else{
+                               
+                               sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
+                               
+                               //save 10%
+                               temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
+                               //save 25%
+                               temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
+                               //save 50%
+                               temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
+                               //save 75%
+                               temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
+                               //save 95%
+                               temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
+                               //save 99%
+                               temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
                                
-                               //99%
-                               highCutOff = sqrt(((quantiles[i][j] - average + 3) * (quantiles[i][j] - average + 3)) / (float)(quantiles[i].size() - 1));
+                       }
+                       
+                       quan[i] = temp;
+                       
+               }
+
+               return quan;
+       }
+       catch(exception& e) {
+               errorOut(e, "DeCalculator", "removeObviousOutliers");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+//follows Mallard algorythn in paper referenced from mallard class
+vector<int> DeCalculator::returnObviousOutliers(vector< vector<quanMember> > quantiles, int num) {
+       try {
+               vector< vector<float> > quan; 
+               quan.resize(100);
+       
+               map<quanMember*, float> contributions;  //map of quanMember to distance from high or low - how bad is it.
+               vector<int> marked;  //marked[0] is the penalty of template seqs[0]. the higher the penalty the more likely the sequence is chimeric
+               marked.resize(num,0);
                                
-                               //1%
-                               lowCutOff = sqrt(((quantiles[i][j] - average - 3) * (quantiles[i][j] - average + 3)) / (float)(quantiles[i].size() - 1));
-//cout << "high = " << highCutOff << " low = " << lowCutOff << " de = " << quantiles[i][j] << endl;                            
-                               //if this is below the highcutff and above the lowcutoff
-                               if ((quantiles[i][j] < highCutOff) && (quantiles[i][j] > lowCutOff)) {
-                                       
-                                       newQuanI.push_back(quantiles[i][j]);
+               //find contributions
+               for (int i = 0; i < quantiles.size(); i++) {
+               
+                       //find mean of this quantile score
+                       sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
+                       
+                       float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
+                       float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
+               
+                       //look at each value in quantiles to see if it is an outlier
+                       for (int j = 0; j < quantiles[i].size(); j++) {
+                               
+                               //is this score between 1 and 99%
+                               if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
                                        
-                               }else { cout << "removed outlier:  high = " << highCutOff << " low = " << lowCutOff << " de = " << quantiles[i][j] << endl;      }
+                               }else {
+                                       float dist;
+                                       //add to contributions
+                                       if (quantiles[i][j].score < low) {
+                                               dist = low - quantiles[i][j].score;
+                                               contributions[&(quantiles[i][j])] = dist;
+                                       }else { //you are higher than high
+                                               dist = quantiles[i][j].score - high;
+                                               contributions[&(quantiles[i][j])] = dist;
+                                       }
+                               }
+                       }
+               }
+
+               //find contributer with most offending score related to it
+               vector<quanMember> outliers = sortContrib(contributions);
+               
+               //go through the outliers marking the potential chimeras
+               for (int i = 0; i < outliers.size(); i++) {
+                       
+                       //who is responsible for this outlying score?  
+                       //if member1 has greater score mark him
+                       //if member2 has greater score mark her
+                       //if they are the same mark both
+                       if (marked[outliers[i].member1] > marked[outliers[i].member2])                  {       marked[outliers[i].member1]++;  }
+                       else if (marked[outliers[i].member2] > marked[outliers[i].member1])             {       marked[outliers[i].member2]++;  }
+                       else if (marked[outliers[i].member2] == marked[outliers[i].member1])    {       marked[outliers[i].member2]++;  marked[outliers[i].member1]++;  }
+               }
+               
+               return marked;
+       }
+       catch(exception& e) {
+               errorOut(e, "DeCalculator", "removeObviousOutliers");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
+       try{
+               
+               vector<quanMember> newQuan;
+               map<quanMember*, float>::iterator it;
+               
+               while (quan.size() > 0) {
+                       
+                        map<quanMember*, float>::iterator largest = quan.begin(); 
+                         
+                       //find biggest member
+                       for (it = quan.begin(); it != quan.end(); it++) {
+                               if (it->second > largest->second) {  largest = it;  }
                        }
                        
-                       quantiles[i] = newQuanI;
+                       //save it 
+                       newQuan.push_back(*(largest->first));
                
+                       //erase from quan
+                       quan.erase(largest);
                }
+               
+               return newQuan;
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "DeCalculator", "sortContrib");
+               exit(1);
+       }
+}
 
+//***************************************************************************************************************
+int DeCalculator::findLargestContrib(vector<int> seen) {
+       try{
+               
+               int largest = 0;
+               
+               int largestContribs;
+               
+               for (int i = 0; i < seen.size(); i++)  {  
+                       
+                       if (seen[i] > largest) {
+                               largestContribs = i;
+                               largest = seen[i];
+                       }
+               }
+               
+               return largestContribs;
+               
        }
        catch(exception& e) {
-               errorOut(e, "DeCalculator", "removeObviousOutliers");
+               errorOut(e, "DeCalculator", "findLargestContribs");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
+       try{
+       
+               vector<quanMember> newQuan;
+               for (int i = 0; i < quan.size(); i++)  {  
+                       if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
+                               newQuan.push_back(quan[i]);  
+                       }
+               }
+               
+               quan = newQuan;
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "DeCalculator", "removeContrib");
                exit(1);
        }
 }
index 4ce45bf751915dd9e077ce478ae912358e789900..65a8f46038e53bc79fc24085e9c6a995f85fda6f 100644 (file)
--- a/decalc.h
+++ b/decalc.h
 
 /***********************************************************************/
 
+//this structure is necessary to determine the sequence that contributed to the outliers when we remove them
+//this way we can remove all scores that are contributed by outlier sequences.
+struct quanMember {
+       float score;
+       int member1;
+       int member2;
+       quanMember (float s, int m, int n) : score(s), member1(m),  member2(n) {}
+       quanMember() {}
+       
+};
+               
+//********************************************************************************************************************
 class DeCalculator {
 
        public:
@@ -32,7 +44,7 @@ class DeCalculator {
                void setAlignmentLength(int l) {  alignLength = l;  }
                void runMask(Sequence*);
                void trimSeqs(Sequence*, Sequence*, map<int, int>&);
-               void removeObviousOutliers(vector< vector<float> >&);
+               vector< vector<float> > removeObviousOutliers(vector< vector<quanMember> >&, int);
                vector<float> calcFreq(vector<Sequence*>, string);
                vector<int> findWindows(Sequence*, int, int, int&, int);
                vector<float> calcObserved(Sequence*, Sequence*, vector<int>, int);
@@ -41,10 +53,15 @@ class DeCalculator {
                float calcDE(vector<float>, vector<float>);
                float calcDist(Sequence*, Sequence*, int, int);
                float getCoef(vector<float>, vector<float>);
-               vector< vector<float> > getQuantiles(vector<Sequence*>, vector<int>, int, vector<float>, int, int, int);
+               vector< vector<quanMember> > getQuantiles(vector<Sequence*>, vector<int>, int, vector<float>, int, int, int, vector<float>&);
+               
+               vector<int> returnObviousOutliers(vector< vector<quanMember> >, int);
                
        private:
-               float findAverage(vector<float> myVector);
+               vector<quanMember> sortContrib(map<quanMember*, float>);  //used by mallard
+               float findAverage(vector<float>);
+               int findLargestContrib(vector<int>);
+               void removeContrib(int, vector<quanMember>&);
                string seqMask;
                set<int> h;
                int alignLength;
diff --git a/mallard.cpp b/mallard.cpp
new file mode 100644 (file)
index 0000000..7f387c2
--- /dev/null
@@ -0,0 +1,238 @@
+/*
+ *  mallard.cpp
+ *  Mothur
+ *
+ *  Created by Sarah Westcott on 8/11/09.
+ *  Copyright 2009 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+#include "mallard.h"
+
+//***************************************************************************************************************
+
+Mallard::Mallard(string filename) {  fastafile = filename;  }
+//***************************************************************************************************************
+
+Mallard::~Mallard() {
+       try {
+               for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
+       }
+       catch(exception& e) {
+               errorOut(e, "Mallard", "~Mallard");
+               exit(1);
+       }
+}      
+//***************************************************************************************************************
+void Mallard::print(ostream& out) {
+       try {
+               
+               for (int i = 0; i < querySeqs.size(); i++) {
+                       
+                       out << querySeqs[i]->getName() << "\thighest de value = " << highestDE[i] << "\tpenalty score = " << marked[i] << endl;
+                       cout << querySeqs[i]->getName() << "\tpenalty score = " << marked[i] << endl;
+                                               
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "Mallard", "print");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+void Mallard::getChimeras() {
+       try {
+               
+               //read in query sequences and subject sequences
+               mothurOut("Reading sequences and template file... "); cout.flush();
+               querySeqs = readSeqs(fastafile);
+               mothurOut("Done."); mothurOutEndLine();
+               
+               int numSeqs = querySeqs.size();
+               
+               windowSizes.resize(numSeqs, window);
+               quantilesMembers.resize(100);  //one for every percent mismatch
+               highestDE.resize(numSeqs, 0.0);  //contains the highest de value for each seq
+               
+               //break up file if needed
+               int linesPerProcess = numSeqs / processors ;
+               
+               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       //find breakup of sequences for all times we will Parallelize
+                       if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
+                       else {
+                               //fill line pairs
+                               for (int i = 0; i < (processors-1); i++) {                      
+                                       lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+                               }
+                               //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+                               int i = processors - 1;
+                               lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+                       }
+                       
+               #else
+                       lines.push_back(new linePair(0, numSeqs));
+               #endif
+               
+               decalc = new DeCalculator();
+               
+               //if the user does enter a mask then you want to keep all the spots in the alignment
+               if (seqMask.length() == 0)      {       decalc->setAlignmentLength(querySeqs[0]->getAligned().length());        }
+               else                                            {       decalc->setAlignmentLength(seqMask.length());                                           }
+               
+               decalc->setMask(seqMask);
+                               
+               //find P
+               mothurOut("Getting conservation... "); cout.flush();
+               probabilityProfile = decalc->calcFreq(querySeqs, fastafile); 
+       
+               //make P into Q
+               for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];  } 
+               mothurOut("Done."); mothurOutEndLine();
+               
+               //mask sequences if the user wants to 
+               if (seqMask != "") {
+                       //mask querys
+                       for (int i = 0; i < querySeqs.size(); i++) {
+                               decalc->runMask(querySeqs[i]);
+                       }
+               }
+                                               
+               mothurOut("Calculating DE values..."); cout.flush();
+               if (processors == 1) { 
+                       quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, 0, querySeqs.size(), highestDE);
+               }else {         createProcessesQuan();          }
+               mothurOut("Done."); mothurOutEndLine();
+               
+               mothurOut("Ranking outliers..."); cout.flush();
+               marked = decalc->returnObviousOutliers(quantilesMembers, querySeqs.size());
+               mothurOut("Done."); mothurOutEndLine();
+
+       
+               //free memory
+               for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                }               
+               delete decalc;
+       }
+       catch(exception& e) {
+               errorOut(e, "Mallard", "getChimeras");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+void Mallard::createProcessesQuan() {
+       try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+               int process = 0;
+               vector<int> processIDS;
+                               
+               //loop through and create all the processes you want
+               while (process != processors) {
+                       int pid = fork();
+                       
+                       if (pid > 0) {
+                               processIDS.push_back(pid);  
+                               process++;
+                       }else if (pid == 0){
+                               
+                               quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, lines[process]->start, lines[process]->end, highestDE);
+                               
+                               //write out data to file so parent can read it
+                               ofstream out;
+                               string s = toString(getpid()) + ".temp";
+                               openOutputFile(s, out);
+                               
+                                                               
+                               //output observed distances
+                               for (int i = 0; i < quantilesMembers.size(); i++) {
+                                       out << quantilesMembers[i].size() << '\t';
+                                       for (int j = 0; j < quantilesMembers[i].size(); j++) {
+                                               out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
+                                       }
+                                       out << endl;
+                               }
+                               
+                               out << highestDE.size() << endl;
+                               for (int i = 0; i < highestDE.size(); i++) {
+                                       out << highestDE[i] << '\t';
+                               }
+                               out << endl;
+                               
+                               out.close();
+                               
+                               exit(0);
+                       }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+               }
+               
+               //force parent to wait until all the processes are done
+               for (int i=0;i<processors;i++) { 
+                       int temp = processIDS[i];
+                       wait(&temp);
+               }
+
+               //get data created by processes
+               for (int i=0;i<processors;i++) { 
+                       ifstream in;
+                       string s = toString(processIDS[i]) + ".temp";
+                       openInputFile(s, in);
+                       
+                       vector< vector<quanMember> > quan; 
+                       quan.resize(100);
+                       
+                       //get quantiles
+                       for (int m = 0; m < quan.size(); m++) {
+                               int num;
+                               in >> num; 
+                               
+                               gobble(in);
+
+                               vector<quanMember> q;  float w; int b, n;
+                               for (int j = 0; j < num; j++) {
+                                       in >> w >> b >> n;
+       //cout << w << '\t' << b << '\t' n << endl;
+                                       quanMember newMember(w, b, n);
+                                       q.push_back(newMember);
+                               }
+//cout << "here" << endl;
+                               quan[m] = q;
+//cout << "now here" << endl;
+                               gobble(in);
+                       }
+                       
+       
+                       //save quan in quantiles
+                       for (int j = 0; j < quan.size(); j++) {
+                               //put all values of q[i] into quan[i]
+                               for (int l = 0; l < quan[j].size(); l++) {  quantilesMembers[j].push_back(quan[j][l]);   }
+                               //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
+                       }
+                       
+                       int num;
+                       in >> num;  gobble(in);
+                       
+                       int count = lines[process]->start;
+                       for (int s = 0; s < num; s++) {
+                               float high;
+                               in >> high;
+                               
+                               highestDE[count] = high;
+                               count++;
+                       }
+                       
+                       in.close();
+                       remove(s.c_str());
+               }
+               
+#else
+               quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, 0, querySeqs.size(), highestDE);
+#endif         
+       }
+       catch(exception& e) {
+               errorOut(e, "Mallard", "createProcessesQuan");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+
+
diff --git a/mallard.h b/mallard.h
new file mode 100644 (file)
index 0000000..d356f17
--- /dev/null
+++ b/mallard.h
@@ -0,0 +1,64 @@
+#ifndef MALLARD_H
+#define MALLARD_H
+
+/*
+ *  mallard.h
+ *  Mothur
+ *
+ *  Created by Sarah Westcott on 8/11/09.
+ *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+
+#include "chimera.h"
+#include "decalc.h"
+
+//This class was created using the algorythms described in the 
+// "New Screening Software Shows that Most Recent Large 16SrRNA Gene Clone Libraries Contain Chimeras" paper 
+//by Kevin E. Ashelford 1, Nadia A. Chuzhanova 2, John C. Fry 1, Antonia J. Jones 3 and Andrew J. Weightman 1.
+
+/***********************************************************/
+
+class Mallard : public Chimera {
+       
+       public:
+               Mallard(string);        
+               ~Mallard();
+               
+               void getChimeras();
+               void print(ostream&);
+               
+               void setCons(string c)  {};
+               void setQuantiles(string q) {};
+               
+       private:
+       
+               struct linePair {
+                       int start;
+                       int end;
+                       linePair(int i, int j) : start(i), end(j) {}
+                       linePair(){}
+               };
+
+               DeCalculator* decalc;
+               int iters;
+               string fastafile;
+               
+               vector<linePair*> lines;
+               vector<Sequence*> querySeqs;
+               vector<int> windowSizes;                        //windowSizes[0] = window size of querySeqs[0]
+               vector<int> marked;
+               vector<float> highestDE;
+       
+               vector<float> probabilityProfile;
+               vector< vector<quanMember> > quantilesMembers;  //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
+               
+               void createProcessesQuan();
+               
+};
+
+/***********************************************************/
+
+#endif
+
index 74c34e18be897ecb2c7966e5a481a5fff81d76df..28984c6bd6f275bf40e9002d673e6a0dd360de90 100644 (file)
@@ -19,8 +19,7 @@ Pintail::~Pintail() {
        try {
                for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
                for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
-               
-               if (processors != 1) {   for (int i = 0; i < bestfit.size(); i++)       {  delete bestfit[i];   }  }
+               for (int i = 0; i < bestfit.size(); i++)                {  delete bestfit[i];           }  
        }
        catch(exception& e) {
                errorOut(e, "Pintail", "~Pintail");
@@ -31,14 +30,11 @@ Pintail::~Pintail() {
 void Pintail::print(ostream& out) {
        try {
                
+               mothurOutEndLine();
+               
                for (int i = 0; i < querySeqs.size(); i++) {
                        
                        int index = ceil(deviation[i]);
-float quan = 2.64 * log10(deviation[i]) + 1.46;
-cout << "dist = " << index << endl;
-cout << "de = "  << DE[i] << endl;
-cout << "mallard quantile = " << quan << endl;
-cout << "my quantile = " << quantiles[index][4] << endl;
                        
                        //is your DE value higher than the 95%
                        string chimera;
@@ -92,6 +88,7 @@ void Pintail::getChimeras() {
                windowsForeachQuery.resize(numSeqs);
                h.resize(numSeqs);
                quantiles.resize(100);  //one for every percent mismatch
+               quantilesMembers.resize(100);  //one for every percent mismatch
                
                //break up file if needed
                int linesPerProcess = numSeqs / processors ;
@@ -141,10 +138,11 @@ void Pintail::getChimeras() {
                
                
                for (int j = 0; j < bestfit.size(); j++) { 
+                               
                                //chops off beginning and end of sequences so they both start and end with a base
                                ofstream out;
                                string s = querySeqs[j]->getName();
-                               
+                       
                                openOutputFile(s, out);
                                out << ">" << querySeqs[j]->getName() << endl;
                                out << querySeqs[j]->getAligned() << endl;
@@ -154,7 +152,7 @@ void Pintail::getChimeras() {
                                openOutputFile(t, out);
                                out << ">" << bestfit[j]->getName() << endl;
                                out << bestfit[j]->getAligned() << endl;
-                               out.close();
+                               out.close();    
                }
 
                
@@ -170,51 +168,46 @@ void Pintail::getChimeras() {
                for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
                mothurOut("Done."); mothurOutEndLine();
                
-               //mask querys
-               for (int i = 0; i < querySeqs.size(); i++) {
-               //cout << querySeqs[i]->getName() << " before mask = " << querySeqs[i]->getAligned() << endl << endl;
-                       decalc->runMask(querySeqs[i]);
-               //cout << querySeqs[i]->getName() << " after mask = " << querySeqs[i]->getAligned() << endl << endl;
-               }
+               //mask sequences if the user wants to 
+               if (seqMask != "") {
+                       //mask querys
+                       for (int i = 0; i < querySeqs.size(); i++) {
+                               decalc->runMask(querySeqs[i]);
+                       }
                
-               //mask templates
-               for (int i = 0; i < templateSeqs.size(); i++) {
-                       decalc->runMask(templateSeqs[i]);
+                       //mask templates
+                       for (int i = 0; i < templateSeqs.size(); i++) {
+                               decalc->runMask(templateSeqs[i]);
+                       }
+                       
+                       for (int i = 0; i < bestfit.size(); i++) { 
+                               decalc->runMask(bestfit[i]);
+                       }
+
                }
-               
-//for (int i = 0; i < lines.size(); i++) { cout << "line pair " << i << " = " << lines[i]->start << '\t' << lines[i]->end << endl;  }
-                               
+                                               
                if (processors == 1) { 
        
                        for (int j = 0; j < bestfit.size(); j++) { 
-                       //cout << querySeqs[j]->getName() << " after mask = " << querySeqs[j]->getAligned() << endl << endl;
-                       ///cout << bestfit[j]->getName() << " after mask = " << bestfit[j]->getAligned() << endl << endl;
                                decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);  
                        }
                        
                        mothurOut("Finding window breaks... "); cout.flush();
                        for (int i = lines[0]->start; i < lines[0]->end; i++) {
                                it = trimmed[i].begin();
-//cout << querySeqs[i]->getName() << '\t' << "trimmed = " << it->first << '\t' << it->second << endl;
                                vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
                                windowsForeachQuery[i] = win;
                        }
                        mothurOut("Done."); mothurOutEndLine();
                
                }else {         createProcessesSpots();         }
-
                
                if (processors == 1) { 
                                                
                        mothurOut("Calculating observed distance... "); cout.flush();
                        for (int i = lines[0]->start; i < lines[0]->end; i++) {
-       //cout << querySeqs[i]->getName() << '\t' << bestfit[i]->getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl;
                                vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
        
-       for (int j = 0; j < obsi.size(); j++) {
-               //cout << obsi[j] << '\t';
-       }
-       //cout << endl;
                                obsDistance[i] = obsi;
                        }
                        mothurOut("Done."); mothurOutEndLine();
@@ -225,12 +218,6 @@ void Pintail::getChimeras() {
                                vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
 
                                Qav[i] = q;
-//cout << querySeqs[i]->getName() << endl;
-for (int j = 0; j < Qav[i].size(); j++) {
-       //cout << Qav[i][j] << '\t';
-}
-//cout << endl << endl;
-
                        }
                        mothurOut("Done."); mothurOutEndLine();
                        
@@ -238,7 +225,6 @@ for (int j = 0; j < Qav[i].size(); j++) {
                        mothurOut("Calculating alpha... "); cout.flush();
                        for (int i = lines[0]->start; i < lines[0]->end; i++) {
                                float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
-//cout << querySeqs[i]->getName() << "\tcoef = " << alpha << endl;
                                seqCoef[i] = alpha;
                        }
                        mothurOut("Done."); mothurOutEndLine();
@@ -256,10 +242,9 @@ for (int j = 0; j < Qav[i].size(); j++) {
                        for (int i = lines[0]->start; i < lines[0]->end; i++) {
                                float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
                                DE[i] = de;
-//cout << querySeqs[i]->getName() << '\t' << "de value = " << de << endl;                              
+                       
                                it = trimmed[i].begin();
                                float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); 
-//cout << querySeqs[i]->getName() << '\t' << "dist value = " << dist << endl;
                                deviation[i] = dist;
                        }
                        mothurOut("Done."); mothurOutEndLine();
@@ -267,7 +252,6 @@ for (int j = 0; j < Qav[i].size(); j++) {
                } 
                else {          createProcesses();              }
                
-               
                //quantiles are used to determine whether the de values found indicate a chimera
                //if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
                //combination of sequences in the template
@@ -276,29 +260,33 @@ for (int j = 0; j < Qav[i].size(); j++) {
                        
                        mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
                        if (processors == 1) { 
-                               quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
+                               quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant);
                        }else {         createProcessesQuan();          }
                        
-                       
-                       decalc->removeObviousOutliers(quantiles);
+                                               
+                       //decided against this because we were having trouble setting the sensitivity... may want to revisit this...
+                       //quantiles = decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
                        
                        ofstream out4;
-                       string o = getRootName(templateFile) + "quan";
+                       string o;
+                       
+                       o = getRootName(templateFile) + "quan";
                        
                        openOutputFile(o, out4);
                        
                        //adjust quantiles
                        for (int i = 0; i < quantiles.size(); i++) {
+                               vector<float> temp;
+                               
                                if (quantiles[i].size() == 0) {
                                        //in case this is not a distance found in your template files
                                        for (int g = 0; g < 6; g++) {
-                                               quantiles[i].push_back(0.0);
+                                               temp.push_back(0.0);
                                        }
                                }else{
                                        
                                        sort(quantiles[i].begin(), quantiles[i].end());
                                        
-                                       vector<float> temp;
                                        //save 10%
                                        temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)]);
                                        //save 25%
@@ -312,19 +300,23 @@ for (int j = 0; j < Qav[i].size(); j++) {
                                        //save 99%
                                        temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)]);
                                        
-                                       quantiles[i] = temp;
                                }
                                
                                //output quan value
                                out4 << i+1 << '\t';                            
-                               for (int u = 0; u < quantiles[i].size(); u++) {   out4 << quantiles[i][u] << '\t'; }
+                               for (int u = 0; u < temp.size(); u++) {   out4 << temp[u] << '\t'; }
                                out4 << endl;
-
+                               
+                               quantiles[i] = temp;
+                               
                        }
                        
+                       out4.close();
+                       
                        mothurOut("Done."); mothurOutEndLine();
+                       
                }
-       
+                       
                //free memory
                for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
                for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
@@ -445,7 +437,10 @@ vector<Sequence*> Pintail::findPairs(int start, int end) {
                                }
                        }
                        
-                       seqsMatches.push_back(match);
+                       //make copy so trimSeqs doesn't overtrim
+                       Sequence* copy = new Sequence(match->getName(), match->getAligned());
+                       
+                       seqsMatches.push_back(copy);
                }
                
                return seqsMatches;
@@ -478,6 +473,7 @@ void Pintail::createProcessesSpots() {
                                
                                        //chops off beginning and end of sequences so they both start and end with a base
                                        map<int, int> trim;
+
                                        decalc->trimSeqs(querySeqs[j], bestfit[j], trim); 
                                        trimmed[j] = trim;
                                        
@@ -512,8 +508,7 @@ void Pintail::createProcessesSpots() {
                                
                                //output trimmed values
                                for (int i = lines[process]->start; i < lines[process]->end; i++) {
-                                       it = trimmed[i].begin();
-                                       
+                                       it = trimmed[i].begin();                                        
                                        out << it->first << '\t' << it->second << endl;
                                }
                                out.close();
@@ -568,7 +563,7 @@ void Pintail::createProcessesSpots() {
                        for (int m = 0; m < size; m++) {
                                int front, back;
                                in >> front >> back;
-                               
+                       
                                map<int, int> t;
                                
                                t[front] = back;
@@ -923,7 +918,7 @@ void Pintail::createProcessesQuan() {
                                process++;
                        }else if (pid == 0){
                                
-                               quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
+                               quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end, makeCompliant);
                                
                                //write out data to file so parent can read it
                                ofstream out;
@@ -932,10 +927,10 @@ void Pintail::createProcessesQuan() {
                                
                                                                
                                //output observed distances
-                               for (int i = 0; i < quantiles.size(); i++) {
-                                       out << quantiles[i].size() << '\t';
-                                       for (int j = 0; j < quantiles[i].size(); j++) {
-                                               out << quantiles[i][j] << '\t';
+                               for (int i = 0; i < quantilesMembers.size(); i++) {
+                                       out << quantilesMembers[i].size() << '\t';
+                                       for (int j = 0; j < quantilesMembers[i].size(); j++) {
+                                               out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
                                        }
                                        out << endl;
                                }
@@ -951,35 +946,42 @@ void Pintail::createProcessesQuan() {
                        int temp = processIDS[i];
                        wait(&temp);
                }
-               
+
                //get data created by processes
                for (int i=0;i<processors;i++) { 
                        ifstream in;
                        string s = toString(processIDS[i]) + ".temp";
                        openInputFile(s, in);
                        
-                       vector< vector<float> > quan; quan.resize(100);
+                       vector< vector<quanMember> > quan; 
+                       quan.resize(100);
                        
                        //get quantiles
                        for (int m = 0; m < quan.size(); m++) {
                                int num;
-                               in >> num;
-                       
-                               vector<float> q;  float w;
+                               in >> num; 
+                               
+                               gobble(in);
+
+                               vector<quanMember> q;  float w; int b, n;
                                for (int j = 0; j < num; j++) {
-                                       in >> w;
-                                       q.push_back(w);
+                                       in >> w >> b >> n;
+       //cout << w << '\t' << b << '\t' n << endl;
+                                       quanMember newMember(w, b, n);
+                                       q.push_back(newMember);
                                }
-                       
+//cout << "here" << endl;
                                quan[m] = q;
+//cout << "now here" << endl;
                                gobble(in);
                        }
                        
-                       
+       
                        //save quan in quantiles
-                       for (int i = 0; i < quan.size(); i++) {
+                       for (int j = 0; j < quan.size(); j++) {
                                //put all values of q[i] into quan[i]
-                               quantiles[i].insert(quantiles[i].begin(), quan[i].begin(), quan[i].end());
+                               for (int l = 0; l < quan[j].size(); l++) {  quantilesMembers[j].push_back(quan[j][l]);   }
+                               //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
                        }
                                        
                        in.close();
@@ -987,7 +989,7 @@ void Pintail::createProcessesQuan() {
                }
                
 #else
-               quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
+               quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant);
 #endif         
        }
        catch(exception& e) {
index 2e841475c79f9382fff32314a3a46577696ddfc3..181be2ca4410c32a4ab0222adbe8e5e2c138e1de 100644 (file)
--- a/pintail.h
+++ b/pintail.h
@@ -72,6 +72,7 @@ class Pintail : public Chimera {
                vector<float> DE;                                       //DE[0] is the deviaation for queryseqs[0]...
                vector<float> probabilityProfile;
                vector< vector<float> > quantiles;  //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
+               vector< vector<quanMember> > quantilesMembers;  //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
                vector< set<int> > h;
                
                
@@ -84,7 +85,7 @@ class Pintail : public Chimera {
                void createProcesses();
                void createProcessesQuan();
                
-               
+               vector<float> makeCompliant;  //used by decalc->getQuantiles so pintail and mallard can use same function, it contains the highest de value for each seq in the template
                
 };