From 8c3489da5da7fb7274f34bfa091c54aa496e75bd Mon Sep 17 00:00:00 2001 From: westcott Date: Tue, 11 Aug 2009 18:08:37 +0000 Subject: [PATCH] chimera.seqs pintail is working. --- Mothur.xcodeproj/project.pbxproj | 46 +++-- bellerophon.cpp | 31 +--- bellerophon.h | 5 - chimera.cpp | 63 +++++++ chimera.h | 52 +----- chimeraseqscommand.cpp | 21 ++- decalc.cpp | 300 +++++++++++++++++++++++++++---- decalc.h | 23 ++- mallard.cpp | 238 ++++++++++++++++++++++++ mallard.h | 64 +++++++ pintail.cpp | 144 +++++++-------- pintail.h | 3 +- 12 files changed, 774 insertions(+), 216 deletions(-) create mode 100644 chimera.cpp create mode 100644 mallard.cpp create mode 100644 mallard.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 0935abb..a570003 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -13,6 +13,8 @@ 21E859D80FC4632E005E1A48 /* matrixoutputcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 21E859D70FC4632E005E1A48 /* matrixoutputcommand.cpp */; }; 370B88070F8A4EE4005AB382 /* getoturepcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 370B88060F8A4EE4005AB382 /* getoturepcommand.cpp */; }; 371B30B40FD7EE67000414CA /* screenseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 371B30B20FD7EE67000414CA /* screenseqscommand.cpp */; }; + 372095C2103196D70004D347 /* chimera.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372095C1103196D70004D347 /* chimera.cpp */; }; + 372095CA1031A1D50004D347 /* mallard.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372095C91031A1D50004D347 /* mallard.cpp */; }; 372A55551017922B00C5194B /* decalc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372A55541017922B00C5194B /* decalc.cpp */; }; 372E12700F26365B0095CF7E /* readotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E126F0F26365B0095CF7E /* readotucommand.cpp */; }; 372E12960F263D5A0095CF7E /* readdistcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12950F263D5A0095CF7E /* readdistcommand.cpp */; }; @@ -196,8 +198,11 @@ 370B88060F8A4EE4005AB382 /* getoturepcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getoturepcommand.cpp; sourceTree = SOURCE_ROOT; }; 371B30B20FD7EE67000414CA /* screenseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = screenseqscommand.cpp; sourceTree = SOURCE_ROOT; }; 371B30B30FD7EE67000414CA /* screenseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = screenseqscommand.h; sourceTree = SOURCE_ROOT; }; - 372A55531017922B00C5194B /* decalc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decalc.h; sourceTree = SOURCE_ROOT; }; - 372A55541017922B00C5194B /* decalc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decalc.cpp; sourceTree = SOURCE_ROOT; }; + 372095C1103196D70004D347 /* chimera.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimera.cpp; sourceTree = ""; }; + 372095C81031A1D50004D347 /* mallard.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mallard.h; sourceTree = ""; }; + 372095C91031A1D50004D347 /* mallard.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mallard.cpp; sourceTree = ""; }; + 372A55531017922B00C5194B /* decalc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decalc.h; sourceTree = ""; }; + 372A55541017922B00C5194B /* decalc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decalc.cpp; sourceTree = ""; }; 372E126E0F26365B0095CF7E /* readotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readotucommand.h; sourceTree = SOURCE_ROOT; }; 372E126F0F26365B0095CF7E /* readotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readotucommand.cpp; sourceTree = SOURCE_ROOT; }; 372E12940F263D5A0095CF7E /* readdistcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readdistcommand.h; sourceTree = SOURCE_ROOT; }; @@ -247,11 +252,11 @@ 3749271C0FD58C840031C06B /* getsabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsabundcommand.cpp; sourceTree = SOURCE_ROOT; }; 3749273D0FD5956B0031C06B /* getrabundcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getrabundcommand.h; sourceTree = SOURCE_ROOT; }; 3749273E0FD5956B0031C06B /* getrabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getrabundcommand.cpp; sourceTree = SOURCE_ROOT; }; - 374F2FD51006320200E97C66 /* chimera.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimera.h; sourceTree = SOURCE_ROOT; }; - 374F2FE9100634B600E97C66 /* bellerophon.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bellerophon.h; sourceTree = SOURCE_ROOT; }; - 374F2FEA100634B600E97C66 /* bellerophon.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bellerophon.cpp; sourceTree = SOURCE_ROOT; }; - 374F301110063B6F00E97C66 /* pintail.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pintail.h; sourceTree = SOURCE_ROOT; }; - 374F301210063B6F00E97C66 /* pintail.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pintail.cpp; sourceTree = SOURCE_ROOT; }; + 374F2FD51006320200E97C66 /* chimera.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimera.h; sourceTree = ""; }; + 374F2FE9100634B600E97C66 /* bellerophon.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bellerophon.h; sourceTree = ""; }; + 374F2FEA100634B600E97C66 /* bellerophon.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bellerophon.cpp; sourceTree = ""; }; + 374F301110063B6F00E97C66 /* pintail.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pintail.h; sourceTree = ""; }; + 374F301210063B6F00E97C66 /* pintail.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pintail.cpp; sourceTree = ""; }; 37519A690F80E6EB00FED5E8 /* sharedanderbergs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedanderbergs.h; sourceTree = SOURCE_ROOT; }; 37519A6A0F80E6EB00FED5E8 /* sharedanderbergs.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedanderbergs.cpp; sourceTree = SOURCE_ROOT; }; 37519A9F0F810D0200FED5E8 /* venncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = venncommand.h; sourceTree = SOURCE_ROOT; }; @@ -319,9 +324,9 @@ 3799A94D0FD6A58C00E33EDE /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = SOURCE_ROOT; }; 3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqsummarycommand.cpp; sourceTree = SOURCE_ROOT; }; 3799A94F0FD6A58C00E33EDE /* seqsummarycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqsummarycommand.h; sourceTree = SOURCE_ROOT; }; - 379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraseqscommand.h; sourceTree = SOURCE_ROOT; }; - 379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraseqscommand.cpp; sourceTree = SOURCE_ROOT; }; - 379D3D6B0FF917F40068C1C0 /* filters.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filters.h; sourceTree = SOURCE_ROOT; }; + 379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraseqscommand.h; sourceTree = ""; }; + 379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraseqscommand.cpp; sourceTree = ""; }; + 379D3D6B0FF917F40068C1C0 /* filters.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filters.h; sourceTree = ""; }; 37AD4CE20F28AEA300AA2D49 /* sharedlistvector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedlistvector.h; sourceTree = SOURCE_ROOT; }; 37AD4CE30F28AEA300AA2D49 /* sharedlistvector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedlistvector.cpp; sourceTree = SOURCE_ROOT; }; 37AD4DB90F28E2FE00AA2D49 /* tree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tree.h; sourceTree = SOURCE_ROOT; }; @@ -332,14 +337,14 @@ 37AFC71E0F445386005F492D /* sharedsobscollectsummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedsobscollectsummary.cpp; sourceTree = SOURCE_ROOT; }; 37B28F660F27590100808A62 /* deconvolutecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deconvolutecommand.h; sourceTree = SOURCE_ROOT; }; 37B28F670F27590100808A62 /* deconvolutecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deconvolutecommand.cpp; sourceTree = SOURCE_ROOT; }; - 37B73C741004BEFD008C4B41 /* listseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = listseqscommand.h; sourceTree = SOURCE_ROOT; }; - 37B73C751004BEFD008C4B41 /* listseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = listseqscommand.cpp; sourceTree = SOURCE_ROOT; }; - 37B73CA41004D89A008C4B41 /* getseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getseqscommand.h; sourceTree = SOURCE_ROOT; }; - 37B73CA51004D89A008C4B41 /* getseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getseqscommand.cpp; sourceTree = SOURCE_ROOT; }; - 37B73CBE1004EB38008C4B41 /* removeseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removeseqscommand.h; sourceTree = SOURCE_ROOT; }; - 37B73CBF1004EB38008C4B41 /* removeseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removeseqscommand.cpp; sourceTree = SOURCE_ROOT; }; - 37B73CCD1004F5E0008C4B41 /* systemcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = systemcommand.h; sourceTree = SOURCE_ROOT; }; - 37B73CCE1004F5E0008C4B41 /* systemcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = systemcommand.cpp; sourceTree = SOURCE_ROOT; }; + 37B73C741004BEFD008C4B41 /* listseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = listseqscommand.h; sourceTree = ""; }; + 37B73C751004BEFD008C4B41 /* listseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = listseqscommand.cpp; sourceTree = ""; }; + 37B73CA41004D89A008C4B41 /* getseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getseqscommand.h; sourceTree = ""; }; + 37B73CA51004D89A008C4B41 /* getseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getseqscommand.cpp; sourceTree = ""; }; + 37B73CBE1004EB38008C4B41 /* removeseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removeseqscommand.h; sourceTree = ""; }; + 37B73CBF1004EB38008C4B41 /* removeseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removeseqscommand.cpp; sourceTree = ""; }; + 37B73CCD1004F5E0008C4B41 /* systemcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = systemcommand.h; sourceTree = ""; }; + 37B73CCE1004F5E0008C4B41 /* systemcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = systemcommand.cpp; sourceTree = ""; }; 37C1D9710F86506E0059E3F0 /* binsequencecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = binsequencecommand.h; sourceTree = SOURCE_ROOT; }; 37C1D9720F86506E0059E3F0 /* binsequencecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = binsequencecommand.cpp; sourceTree = SOURCE_ROOT; }; 37C753CC0FB3415200DBD02E /* distancecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = distancecommand.h; sourceTree = SOURCE_ROOT; }; @@ -640,10 +645,13 @@ isa = PBXGroup; children = ( 374F2FD51006320200E97C66 /* chimera.h */, + 372095C1103196D70004D347 /* chimera.cpp */, 374F2FE9100634B600E97C66 /* bellerophon.h */, 374F2FEA100634B600E97C66 /* bellerophon.cpp */, 372A55531017922B00C5194B /* decalc.h */, 372A55541017922B00C5194B /* decalc.cpp */, + 372095C81031A1D50004D347 /* mallard.h */, + 372095C91031A1D50004D347 /* mallard.cpp */, 374F301110063B6F00E97C66 /* pintail.h */, 374F301210063B6F00E97C66 /* pintail.cpp */, ); @@ -1163,6 +1171,8 @@ 374F2FEB100634B600E97C66 /* bellerophon.cpp in Sources */, 374F301310063B6F00E97C66 /* pintail.cpp in Sources */, 372A55551017922B00C5194B /* decalc.cpp in Sources */, + 372095C2103196D70004D347 /* chimera.cpp in Sources */, + 372095CA1031A1D50004D347 /* mallard.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/bellerophon.cpp b/bellerophon.cpp index f2dd821..86bc752 100644 --- a/bellerophon.cpp +++ b/bellerophon.cpp @@ -92,7 +92,7 @@ void Bellerophon::getChimeras() { distCalculator = new eachGapDist(); //read in sequences - readSeqs(); + //seqs = readSeqs(fastafile); int numSeqs = seqs.size(); @@ -173,6 +173,7 @@ cout << "increment = " << increment << endl; vector distMapLeft; // Create a data structure to quickly access the distance information. + //this is from thallingers reimplementation on get.oturep // It consists of a vector of distance maps, where each map contains // all distances of a certain sequence. Vector and maps are accessed // via the index of a sequence in the distance matrix @@ -231,31 +232,6 @@ cout << "increment = " << increment << endl; } } -//*************************************************************************************************************** -void Bellerophon::readSeqs(){ - try { - ifstream inFASTA; - openInputFile(fastafile, inFASTA); - - //read in seqs and store in vector - while(!inFASTA.eof()){ - Sequence current(inFASTA); - - if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); } - - seqs.push_back(current); - - gobble(inFASTA); - } - inFASTA.close(); - - } - catch(exception& e) { - errorOut(e, "Bellerophon", "readSeqs"); - exit(1); - } -} - /***************************************************************************************************************/ int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector s){ try { @@ -272,8 +248,7 @@ int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* spar } } - - + return 1; } catch(exception& e) { diff --git a/bellerophon.h b/bellerophon.h index 32b38e4..b79089c 100644 --- a/bellerophon.h +++ b/bellerophon.h @@ -51,13 +51,8 @@ class Bellerophon : public Chimera { string fastafile; int iters; - void readSeqs(); void generatePreferences(vector, vector, int); int createSparseMatrix(int, int, SparseMatrix*, vector); - - - - }; /***********************************************************/ diff --git a/chimera.cpp b/chimera.cpp new file mode 100644 index 0000000..48e5763 --- /dev/null +++ b/chimera.cpp @@ -0,0 +1,63 @@ +/* + * chimera.cpp + * Mothur + * + * Created by Sarah Westcott on 8/11/09. + * Copyright 2009 Schloss Lab Umass Amherst. All rights reserved. + * + */ + +#include "chimera.h" + +//*************************************************************************************************************** +vector Chimera::readSeqs(string file) { + try { + ifstream in; + openInputFile(file, in); + vector container; + + //read in seqs and store in vector + while(!in.eof()){ + + Sequence* current = new Sequence(in); + container.push_back(current); + gobble(in); + } + + in.close(); + return container; + } + catch(exception& e) { + errorOut(e, "Chimera", "readSeqs"); + exit(1); + } +} +//*************************************************************************************************************** +void Chimera::setMask(string filename) { + try { + + if (filename == "default") { + //default is from wigeon 236627 EU009184.1 Shigella dysenteriae str. FBD013 + seqMask = ".....................................................................................................AAATTGAAGAGTTT-GA--T-CA-T-G-GCTC-AG-AT-TGAA-C-GC--TGG-C--G-GC-A-GG--C----C-T--AACACA-T-GC-A-AGT-CGA-A-CG----------G-TAA-CA-G----------------------------GAAG-A-AG----------------------------------------------------CTT-G----------------------------------------------------------------------------------CT-TCTTT----------------G-CT--G--AC--G--AG-T-GG-C-GG-A--C-------------GGG-TGAGT-A--AT-GT-C-T-G-GG---A-A--A-CT-G--C-C-TGA--TG-G------------------------------------------------------------------A-GG----GGG-AT-AA-CTA-------------------------C-T-G-----------------------GAA-A---CGG-TAG-CTAA-TA---CC-G--C-AT-A----------A--------------------C-------------------------------------GT-C-----------------------------------------------------------------------------------------------------------------------G-CA-A--------------------------------------------------------------------------------------------------------------------------------------G-A-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CAAA--G-A-G-GG-----G--GA-C-CT--------------------------------------------------------------------------------------------------------------------TCG-G----------------------------------------------------------------------------------------------------------------------G----CC-TC--T---T-G--------------C----C-A---T-CG-G---AT---G-T-----G-CCC-AGA--T-GGG--A------TT--A--G-CT-A----G---TAGG-T-G-GG-G-T----AAC-GG-C-T-C-ACCT--A-GG-C-G--A-CG-A------------TCC-C-T------AG-CT-G-G-TCT-G-AG----A--GG-AT--G-AC-C-AG-CCAC-A-CTGGA--A-C-TG-A-GA-C-AC-G-G-TCCAGA-CTCC-TAC-G--G-G-A-G-GC-A-GC-A-G-TG---GG-G-A-ATA-TTGCA-C-AA-T-GG--GC-GC-A----A-G-CC-T-GA-TG-CA-GCCA-TGCC-G-CG-T---G-T-A--T--GA-A-G--A--A-G-G-CC-----TT-CG---------G-G-T-T-G-T--A---AA-G-TAC--------TT-TC-A-G--C-GGG----GA-G--G---AA-GGGA---GTAA-AG----T--T--AA-T---A----C-----CT-T-TGC-TCA-TT-GA-CG-TT-A-C-CC-G-CA-G---------AA-----------GAAGC-ACC-GG-C-TAA---C--T-CCGT--GCCA--G-C---A--GCCG---C-GG--TA-AT--AC---GG-AG-GGT-GCA-A-G-CG-TTAA-T-CGG-AA-TT-A--C-T--GGGC-GTA----AA-GCGC-AC--G-CA-G-G-C-G------------G--T-TT-G-T-T-AA----G-T-C-A---G-ATG-TG-A-AA-TC--CC-CGG-G--------------------------------------------------------------------CT-C-AA-------------------------------------------------------------------------CC-T-G-GG-AA-C----T-G-C-A-T-C--------T--GA-T-A-C-T-G-GCA--A-G-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------T-T-G-A-G-T-C-----T-CG--TA-G-A------------G-GG-G-GG-T----AG--AATT-CCA-G-GT--GT-A-GCG-GTGAAA-TG-CGT-AGAG-A-TC-T-GGA--GG-A-AT-A-CC-GG--T--G--GC-GAA-G--G-C---G----G--C-C-CCCTG------G-AC-GA--------------------------------------------------------------AG-A-C-T--GA--CG-----CT-CA-GG--T-G-CGA--AA-G-C--------------G-TGGG-GAG-C-A-AACA--GG-ATTA-G-ATA-C-----CC-T-G-GTA-G-T----C-CA--C-G-CCG-T-AAA--C-GATG-TC--GA-CT---------T-GG--A--G-G-TT-G-TG-C--C--------------------------------------------------------------------------------------CTT-GA--------------------------------------------------------------------------------------------------------------------------------------------------G-G-C-GT--G-G-C-T-TC-C------GG--A----GC-TAA--CG-C-G-T--T--AA-GT--C----G-ACC-GCC-T-G-GG-GAG-TA---CGG-----C-C--G-C-A-A-GGT-T--AAA-ACTC-AAA---------TGAA-TTG-ACGGG-G-G-CCCG----C-A--C-A-A-GCG-GT-G--G--AG-CA-T--GT-GGT-TT-AATT-C-G-ATG-CAAC-G-CG-A-AG-A-A-CC-TT-A-CC-TGGTC-TT-G-AC-A-T-C--------------CAC-G-G-------------A-AG-T-T-T--TC--A-GA-G-A-T--G-A-G--A-A-T-G--T-G-----CC-------------------------------------T--TC-G------------------------------------------GG----A----A---CC-GTG---A--GA---------------------------------------------------C-A-G-G-T-GCTG-CA-TGG-CT--GTC-GTC-A-GC-TC---G-TG-TT-G--TGA-AA-TGT-T-GG-G-TT-AA-GT-CCCGC-AA--------C-GAG-CGC-A-ACC-C-T-TA--TC--C-TTTG--T-T-G-C-C---AG-C-G-----G-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TCC------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GG---C----C-G------------G----G---A-A--CT---------------C-A-A-A-G-GA-G--AC-T-G-CCA--G-T------------------------------------G-A---TAA----------------------------------A-C-T-G--G-A-GG-A--AGG-T--GGGG-A-TGAC-GTC--AAGT-C---ATC-A-T-G-G-C-C-CTT----AC-G--AC-C-A-GG-GC-TA-CAC-ACGTG-C--TA--CAATG---G-CGCA-T-A--C-AAA-GA-GA--------------------------------------------------------------------------------------------------A-G-C-G-A--C-CTCG-C--G---------------------------------------A-GA-G-C-----------A--A-G-CG---G----------A--CCT-C------A-T-AAAGT-GC-G-T-C-G-TAG-TCC--------GGA-T-TGGAG-TC--T-GCAA-CT-C-------------------------------------------------------------------------------------------------G-ACTCC-A-T-G-AA-G-TC-GGAAT-CG-C-TA--G-TA-AT-C-G-T----GGA-TC-A-G--A------AT--GCC-AC-G-GT-G-AAT-ACGT-T-CCCGGGCCT-TGTA----CACACCG-CCC-GTC-----A---CA--CCA-TG-GG-A--G---TGG-G-TT-GC-AAA--A-GAA------G--T-AGG-TA-G-C-T-T-AA-C-C--------------------------------------------------------------TT----C-------------------------------------------------------------------------------------------------G--GG-A--GG-G--C---GC-TTA--CC--ACT-T----T-GTG-AT-TCA------------------------TG--ACT-GGGG-TG-AAG-TCGTAACAA-GGTAA-CCGT-AGGGGAA-CCTG-CGGT-TGGATCACCTCCTTA................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................"; + }else if (filename == "") { //do nothing + seqMask = ""; + }else{ + ifstream infile; + openInputFile(filename, infile); + + while (!infile.eof()) { + Sequence temp(infile); + seqMask = temp.getAligned(); + + gobble(infile); + } + + infile.close(); + } + } + catch(exception& e) { + errorOut(e, "Chimera", "setMask"); + exit(1); + } +} +//*************************************************************************************************************** diff --git a/chimera.h b/chimera.h index 9142bac..fd6b53a 100644 --- a/chimera.h +++ b/chimera.h @@ -37,56 +37,8 @@ class Chimera { virtual void setCons(string) {}; virtual void setQuantiles(string) {}; - - virtual vector readSeqs(string file) { - try { - ifstream in; - openInputFile(file, in); - vector container; - - //read in seqs and store in vector - while(!in.eof()){ - - Sequence* current = new Sequence(in); - container.push_back(current); - gobble(in); - } - - in.close(); - return container; - } - catch(exception& e) { - errorOut(e, "Chimera", "readSeqs"); - exit(1); - } - } - - - virtual void setMask(string filename) { - try { - - if (filename == "default") { - //default is from wigeon 236627 EU009184.1 Shigella dysenteriae str. FBD013 - seqMask = ".....................................................................................................AAATTGAAGAGTTT-GA--T-CA-T-G-GCTC-AG-AT-TGAA-C-GC--TGG-C--G-GC-A-GG--C----C-T--AACACA-T-GC-A-AGT-CGA-A-CG----------G-TAA-CA-G----------------------------GAAG-A-AG----------------------------------------------------CTT-G----------------------------------------------------------------------------------CT-TCTTT----------------G-CT--G--AC--G--AG-T-GG-C-GG-A--C-------------GGG-TGAGT-A--AT-GT-C-T-G-GG---A-A--A-CT-G--C-C-TGA--TG-G------------------------------------------------------------------A-GG----GGG-AT-AA-CTA-------------------------C-T-G-----------------------GAA-A---CGG-TAG-CTAA-TA---CC-G--C-AT-A----------A--------------------C-------------------------------------GT-C-----------------------------------------------------------------------------------------------------------------------G-CA-A--------------------------------------------------------------------------------------------------------------------------------------G-A-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CAAA--G-A-G-GG-----G--GA-C-CT--------------------------------------------------------------------------------------------------------------------TCG-G----------------------------------------------------------------------------------------------------------------------G----CC-TC--T---T-G--------------C----C-A---T-CG-G---AT---G-T-----G-CCC-AGA--T-GGG--A------TT--A--G-CT-A----G---TAGG-T-G-GG-G-T----AAC-GG-C-T-C-ACCT--A-GG-C-G--A-CG-A------------TCC-C-T------AG-CT-G-G-TCT-G-AG----A--GG-AT--G-AC-C-AG-CCAC-A-CTGGA--A-C-TG-A-GA-C-AC-G-G-TCCAGA-CTCC-TAC-G--G-G-A-G-GC-A-GC-A-G-TG---GG-G-A-ATA-TTGCA-C-AA-T-GG--GC-GC-A----A-G-CC-T-GA-TG-CA-GCCA-TGCC-G-CG-T---G-T-A--T--GA-A-G--A--A-G-G-CC-----TT-CG---------G-G-T-T-G-T--A---AA-G-TAC--------TT-TC-A-G--C-GGG----GA-G--G---AA-GGGA---GTAA-AG----T--T--AA-T---A----C-----CT-T-TGC-TCA-TT-GA-CG-TT-A-C-CC-G-CA-G---------AA-----------GAAGC-ACC-GG-C-TAA---C--T-CCGT--GCCA--G-C---A--GCCG---C-GG--TA-AT--AC---GG-AG-GGT-GCA-A-G-CG-TTAA-T-CGG-AA-TT-A--C-T--GGGC-GTA----AA-GCGC-AC--G-CA-G-G-C-G------------G--T-TT-G-T-T-AA----G-T-C-A---G-ATG-TG-A-AA-TC--CC-CGG-G--------------------------------------------------------------------CT-C-AA-------------------------------------------------------------------------CC-T-G-GG-AA-C----T-G-C-A-T-C--------T--GA-T-A-C-T-G-GCA--A-G-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------T-T-G-A-G-T-C-----T-CG--TA-G-A------------G-GG-G-GG-T----AG--AATT-CCA-G-GT--GT-A-GCG-GTGAAA-TG-CGT-AGAG-A-TC-T-GGA--GG-A-AT-A-CC-GG--T--G--GC-GAA-G--G-C---G----G--C-C-CCCTG------G-AC-GA--------------------------------------------------------------AG-A-C-T--GA--CG-----CT-CA-GG--T-G-CGA--AA-G-C--------------G-TGGG-GAG-C-A-AACA--GG-ATTA-G-ATA-C-----CC-T-G-GTA-G-T----C-CA--C-G-CCG-T-AAA--C-GATG-TC--GA-CT---------T-GG--A--G-G-TT-G-TG-C--C--------------------------------------------------------------------------------------CTT-GA--------------------------------------------------------------------------------------------------------------------------------------------------G-G-C-GT--G-G-C-T-TC-C------GG--A----GC-TAA--CG-C-G-T--T--AA-GT--C----G-ACC-GCC-T-G-GG-GAG-TA---CGG-----C-C--G-C-A-A-GGT-T--AAA-ACTC-AAA---------TGAA-TTG-ACGGG-G-G-CCCG----C-A--C-A-A-GCG-GT-G--G--AG-CA-T--GT-GGT-TT-AATT-C-G-ATG-CAAC-G-CG-A-AG-A-A-CC-TT-A-CC-TGGTC-TT-G-AC-A-T-C--------------CAC-G-G-------------A-AG-T-T-T--TC--A-GA-G-A-T--G-A-G--A-A-T-G--T-G-----CC-------------------------------------T--TC-G------------------------------------------GG----A----A---CC-GTG---A--GA---------------------------------------------------C-A-G-G-T-GCTG-CA-TGG-CT--GTC-GTC-A-GC-TC---G-TG-TT-G--TGA-AA-TGT-T-GG-G-TT-AA-GT-CCCGC-AA--------C-GAG-CGC-A-ACC-C-T-TA--TC--C-TTTG--T-T-G-C-C---AG-C-G-----G-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TCC------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GG---C----C-G------------G----G---A-A--CT---------------C-A-A-A-G-GA-G--AC-T-G-CCA--G-T------------------------------------G-A---TAA----------------------------------A-C-T-G--G-A-GG-A--AGG-T--GGGG-A-TGAC-GTC--AAGT-C---ATC-A-T-G-G-C-C-CTT----AC-G--AC-C-A-GG-GC-TA-CAC-ACGTG-C--TA--CAATG---G-CGCA-T-A--C-AAA-GA-GA--------------------------------------------------------------------------------------------------A-G-C-G-A--C-CTCG-C--G---------------------------------------A-GA-G-C-----------A--A-G-CG---G----------A--CCT-C------A-T-AAAGT-GC-G-T-C-G-TAG-TCC--------GGA-T-TGGAG-TC--T-GCAA-CT-C-------------------------------------------------------------------------------------------------G-ACTCC-A-T-G-AA-G-TC-GGAAT-CG-C-TA--G-TA-AT-C-G-T----GGA-TC-A-G--A------AT--GCC-AC-G-GT-G-AAT-ACGT-T-CCCGGGCCT-TGTA----CACACCG-CCC-GTC-----A---CA--CCA-TG-GG-A--G---TGG-G-TT-GC-AAA--A-GAA------G--T-AGG-TA-G-C-T-T-AA-C-C--------------------------------------------------------------TT----C-------------------------------------------------------------------------------------------------G--GG-A--GG-G--C---GC-TTA--CC--ACT-T----T-GTG-AT-TCA------------------------TG--ACT-GGGG-TG-AAG-TCGTAACAA-GGTAA-CCGT-AGGGGAA-CCTG-CGGT-TGGATCACCTCCTTA................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................"; - }else{ - ifstream infile; - openInputFile(filename, infile); - - while (!infile.eof()) { - Sequence temp(infile); - seqMask = temp.getAligned(); - - gobble(infile); - } - - infile.close(); - } - } - catch(exception& e) { - errorOut(e, "Chimera", "setMask"); - exit(1); - } - } + virtual vector readSeqs(string); + virtual void setMask(string); //pure functions virtual void getChimeras() = 0; diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index 17f3a71..899d6ef 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -65,7 +65,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ string temp; - temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; } + temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; } filter = isTrue(temp); temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; } @@ -96,13 +96,20 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ void ChimeraSeqsCommand::help(){ try { + mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED.\n"); mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n"); - mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask and method. fasta is required.\n"); - mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter. The default is false. \n"); - mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs. The default is true. \n"); + mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation and quantile.\n"); + mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter this only applies when the method is bellerphon. The default is false. \n"); + mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs. The default is true. This only applies when the method is bellerphon.\n"); mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); - mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. \n"); - mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the pintail and mallard method. The default is 236627 EU009184.1 Shigella dysenteriae str. FBD013. \n"); + mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. Options include..... \n"); + mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the pintail and mallard method. The default is no mask. If you enter mask=default, then the mask is 236627 EU009184.1 Shigella dysenteriae str. FBD013. \n"); + mothurOut("The window parameter allows you to specify the window size for searching for chimeras. The default is 300 is method is pintail unless the sequence length is less than 300, and 1/4 sequence length for bellerphon.\n"); + mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences. The default is 25.\n"); + mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences for use by the pintail algorythm. It is a required parameter if using pintail.\n"); + mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment for use by the pintail algorythm. It is not required, but will speed up the pintail method.\n"); + mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences for use by the pintail algorythm. It is not required, but will speed up the pintail method.\n"); + mothurOut("If you have run chimera.seqs using pintail a .quan and .freq file will be created if you have not provided them for use in future command executions."); mothurOut("The chimera.seqs command should be in the following format: \n"); mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n"); mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n"); @@ -150,7 +157,7 @@ int ChimeraSeqsCommand::execute(){ //find chimeras chimera->getChimeras(); - string outputFileName = getRootName(fastafile) + method + ".chimeras"; + string outputFileName = getRootName(fastafile) + method + maskfile + ".chimeras"; ofstream out; openOutputFile(outputFileName, out); diff --git a/decalc.cpp b/decalc.cpp index 18377c2..5aed358 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -81,7 +81,6 @@ void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map& t } } //*************************************************************************************************************** -//find the window breaks for each sequence - this is so you can move ahead by bases. vector DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) { try { @@ -154,22 +153,25 @@ vector DeCalculator::findWindows(Sequence* query, int front, int back, int vector DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector window, int size) { try { - vector temp; -//cout << "query length = " << query->getAligned().length() << '\t' << " subject length = " << subject.getAligned().length() << endl; + vector temp; + for (int m = 0; m < window.size(); m++) { string seqFrag = query->getAligned().substr(window[m], size); string seqFragsub = subject->getAligned().substr(window[m], size); - //cout << "start point = " << window[m] << " end point = " << window[m]+size << endl; + int diff = 0; for (int b = 0; b < seqFrag.length(); b++) { - - if (seqFrag[b] != seqFragsub[b]) { diff++; } + //if at least one is a base and they are not equal + if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; } } //percentage of mismatched bases float dist; - dist = diff / (float) seqFrag.length() * 100; + + //if the whole fragment is 0 distance = 0 + + dist = diff / (float) (seqFrag.length()) * 100; temp.push_back(dist); } @@ -199,6 +201,9 @@ float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; } if (seqFrag[b] != seqFragsub[b]) { diff++; } } + + //if the whole fragment is 0 distance = 0 + if ((seqFrag.length()-gaps) == 0) { return 0.0; } //percentage of mismatched bases float dist = diff / (float) (seqFrag.length()-gaps) * 100; @@ -346,26 +351,30 @@ vector DeCalculator::findQav(vector window, int size, vector exit(1); } } - +//******************************************************************************************************************** +//sorts lowest to highest +inline bool compareQuanMembers(quanMember left, quanMember right){ + return (left.score < right.score); +} //*************************************************************************************************************** -vector< vector > DeCalculator::getQuantiles(vector seqs, vector windowSizesTemplate, int window, vector probProfile, int increment, int start, int end) { +//seqs have already been masked +vector< vector > DeCalculator::getQuantiles(vector seqs, vector windowSizesTemplate, int window, vector probProfile, int increment, int start, int end, vector& highestDE) { try { - vector< vector > quan; + vector< vector > quan; //percentage of mismatched pairs 1 to 100 quan.resize(100); - //for each sequence for(int i = start; i < end; i++){ mothurOut("Processing template sequence " + toString(i)); mothurOutEndLine(); - Sequence* query = seqs[i]; + Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned()); //compare to every other sequence in template for(int j = 0; j < i; j++){ - Sequence* subject = seqs[j]; + Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned()); map trim; map::iterator it; @@ -394,12 +403,23 @@ vector< vector > DeCalculator::getQuantiles(vector seqs, vecto dist = ceil(dist); + quanMember newScore(de, i, j); + //dist-1 because vector indexes start at 0. - quan[dist-1].push_back(de); + quan[dist-1].push_back(newScore); + + //save highestDE + if(de > highestDE[i]) { highestDE[i] = de; } + if(de > highestDE[j]) { highestDE[j] = de; } + + delete subject; } + + delete query; } + return quan; } @@ -408,45 +428,259 @@ vector< vector > DeCalculator::getQuantiles(vector seqs, vecto exit(1); } } + //*************************************************************************************************************** -void DeCalculator::removeObviousOutliers(vector< vector >& quantiles) { +vector< vector > DeCalculator::removeObviousOutliers(vector< vector >& quantiles, int num) { try { - + vector< vector > quan; + quan.resize(100); + /*vector contributions; + vector seen; //seen[0] is the number of outliers that template seqs[0] was part of. + seen.resize(num,0); + + //find contributions for (int i = 0; i < quantiles.size(); i++) { //find mean of this quantile score - sort(quantiles[i].begin(), quantiles[i].end()); + sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers); - float average = quantiles[i][int(quantiles[i].size() * 0.5)]; -cout << i << "\taverage = " << average << "\tquantiles[i].size = " << quantiles[i].size() << endl; - vector newQuanI; + float high = quantiles[i][int(quantiles[i].size() * 0.99)].score; + float low = quantiles[i][int(quantiles[i].size() * 0.01)].score; + //look at each value in quantiles to see if it is an outlier for (int j = 0; j < quantiles[i].size(); j++) { - float highCutOff, lowCutOff; + //is this score between 1 and 99% + if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) { + + }else { + //add to contributions + contributions.push_back(quantiles[i][j]); + seen[quantiles[i][j].member1]++; + seen[quantiles[i][j].member2]++; + } + } + } + + //find contributer with most offending score related to it + int largestContrib = findLargestContrib(seen); + + //while you still have guys to eliminate + while (contributions.size() > 0) { + + mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine(); + + //remove from quantiles all scores that were made using this largestContrib + for (int i = 0; i < quantiles.size(); i++) { +//cout << "before remove " << quantiles[i].size() << '\t'; + removeContrib(largestContrib, quantiles[i]); +//cout << "after remove " << quantiles[i].size() << endl; + } +//cout << count << " largest contrib = " << largestContrib << endl; count++; + //remove from contributions all scores that were made using this largestContrib + removeContrib(largestContrib, contributions); + + //"erase" largestContrib + seen[largestContrib] = -1; + + //get next largestContrib + largestContrib = findLargestContrib(seen); + } + */ + + vector marked = returnObviousOutliers(quantiles, num); + vector copyMarked = marked; + + //find the 99% of marked + sort(copyMarked.begin(), copyMarked.end()); + int high = copyMarked[int(copyMarked.size() * 0.99)]; +cout << "high = " << high << endl; + + for(int i = 0; i < marked.size(); i++) { + if (marked[i] > high) { + mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine(); + for (int i = 0; i < quantiles.size(); i++) { + removeContrib(marked[i], quantiles[i]); + } + } + + } + + + //adjust quantiles + for (int i = 0; i < quantiles.size(); i++) { + vector temp; + + if (quantiles[i].size() == 0) { + //in case this is not a distance found in your template files + for (int g = 0; g < 6; g++) { + temp.push_back(0.0); + } + }else{ + + sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers); + + //save 10% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score); + //save 25% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score); + //save 50% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score); + //save 75% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score); + //save 95% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score); + //save 99% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score); - //99% - highCutOff = sqrt(((quantiles[i][j] - average + 3) * (quantiles[i][j] - average + 3)) / (float)(quantiles[i].size() - 1)); + } + + quan[i] = temp; + + } + + return quan; + } + catch(exception& e) { + errorOut(e, "DeCalculator", "removeObviousOutliers"); + exit(1); + } +} +//*************************************************************************************************************** +//follows Mallard algorythn in paper referenced from mallard class +vector DeCalculator::returnObviousOutliers(vector< vector > quantiles, int num) { + try { + vector< vector > quan; + quan.resize(100); + + map contributions; //map of quanMember to distance from high or low - how bad is it. + vector marked; //marked[0] is the penalty of template seqs[0]. the higher the penalty the more likely the sequence is chimeric + marked.resize(num,0); - //1% - lowCutOff = sqrt(((quantiles[i][j] - average - 3) * (quantiles[i][j] - average + 3)) / (float)(quantiles[i].size() - 1)); -//cout << "high = " << highCutOff << " low = " << lowCutOff << " de = " << quantiles[i][j] << endl; - //if this is below the highcutff and above the lowcutoff - if ((quantiles[i][j] < highCutOff) && (quantiles[i][j] > lowCutOff)) { - - newQuanI.push_back(quantiles[i][j]); + //find contributions + for (int i = 0; i < quantiles.size(); i++) { + + //find mean of this quantile score + sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers); + + float high = quantiles[i][int(quantiles[i].size() * 0.99)].score; + float low = quantiles[i][int(quantiles[i].size() * 0.01)].score; + + //look at each value in quantiles to see if it is an outlier + for (int j = 0; j < quantiles[i].size(); j++) { + + //is this score between 1 and 99% + if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) { - }else { cout << "removed outlier: high = " << highCutOff << " low = " << lowCutOff << " de = " << quantiles[i][j] << endl; } + }else { + float dist; + //add to contributions + if (quantiles[i][j].score < low) { + dist = low - quantiles[i][j].score; + contributions[&(quantiles[i][j])] = dist; + }else { //you are higher than high + dist = quantiles[i][j].score - high; + contributions[&(quantiles[i][j])] = dist; + } + } + } + } + + //find contributer with most offending score related to it + vector outliers = sortContrib(contributions); + + //go through the outliers marking the potential chimeras + for (int i = 0; i < outliers.size(); i++) { + + //who is responsible for this outlying score? + //if member1 has greater score mark him + //if member2 has greater score mark her + //if they are the same mark both + if (marked[outliers[i].member1] > marked[outliers[i].member2]) { marked[outliers[i].member1]++; } + else if (marked[outliers[i].member2] > marked[outliers[i].member1]) { marked[outliers[i].member2]++; } + else if (marked[outliers[i].member2] == marked[outliers[i].member1]) { marked[outliers[i].member2]++; marked[outliers[i].member1]++; } + } + + return marked; + } + catch(exception& e) { + errorOut(e, "DeCalculator", "removeObviousOutliers"); + exit(1); + } +} +//*************************************************************************************************************** +vector DeCalculator::sortContrib(map quan) { + try{ + + vector newQuan; + map::iterator it; + + while (quan.size() > 0) { + + map::iterator largest = quan.begin(); + + //find biggest member + for (it = quan.begin(); it != quan.end(); it++) { + if (it->second > largest->second) { largest = it; } } - quantiles[i] = newQuanI; + //save it + newQuan.push_back(*(largest->first)); + //erase from quan + quan.erase(largest); } + + return newQuan; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "sortContrib"); + exit(1); + } +} +//*************************************************************************************************************** +int DeCalculator::findLargestContrib(vector seen) { + try{ + + int largest = 0; + + int largestContribs; + + for (int i = 0; i < seen.size(); i++) { + + if (seen[i] > largest) { + largestContribs = i; + largest = seen[i]; + } + } + + return largestContribs; + } catch(exception& e) { - errorOut(e, "DeCalculator", "removeObviousOutliers"); + errorOut(e, "DeCalculator", "findLargestContribs"); + exit(1); + } +} +//*************************************************************************************************************** +void DeCalculator::removeContrib(int bad, vector& quan) { + try{ + + vector newQuan; + for (int i = 0; i < quan.size(); i++) { + if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) { + newQuan.push_back(quan[i]); + } + } + + quan = newQuan; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "removeContrib"); exit(1); } } diff --git a/decalc.h b/decalc.h index 4ce45bf..65a8f46 100644 --- a/decalc.h +++ b/decalc.h @@ -20,6 +20,18 @@ /***********************************************************************/ +//this structure is necessary to determine the sequence that contributed to the outliers when we remove them +//this way we can remove all scores that are contributed by outlier sequences. +struct quanMember { + float score; + int member1; + int member2; + quanMember (float s, int m, int n) : score(s), member1(m), member2(n) {} + quanMember() {} + +}; + +//******************************************************************************************************************** class DeCalculator { public: @@ -32,7 +44,7 @@ class DeCalculator { void setAlignmentLength(int l) { alignLength = l; } void runMask(Sequence*); void trimSeqs(Sequence*, Sequence*, map&); - void removeObviousOutliers(vector< vector >&); + vector< vector > removeObviousOutliers(vector< vector >&, int); vector calcFreq(vector, string); vector findWindows(Sequence*, int, int, int&, int); vector calcObserved(Sequence*, Sequence*, vector, int); @@ -41,10 +53,15 @@ class DeCalculator { float calcDE(vector, vector); float calcDist(Sequence*, Sequence*, int, int); float getCoef(vector, vector); - vector< vector > getQuantiles(vector, vector, int, vector, int, int, int); + vector< vector > getQuantiles(vector, vector, int, vector, int, int, int, vector&); + + vector returnObviousOutliers(vector< vector >, int); private: - float findAverage(vector myVector); + vector sortContrib(map); //used by mallard + float findAverage(vector); + int findLargestContrib(vector); + void removeContrib(int, vector&); string seqMask; set h; int alignLength; diff --git a/mallard.cpp b/mallard.cpp new file mode 100644 index 0000000..7f387c2 --- /dev/null +++ b/mallard.cpp @@ -0,0 +1,238 @@ +/* + * mallard.cpp + * Mothur + * + * Created by Sarah Westcott on 8/11/09. + * Copyright 2009 __MyCompanyName__. All rights reserved. + * + */ + +#include "mallard.h" + +//*************************************************************************************************************** + +Mallard::Mallard(string filename) { fastafile = filename; } +//*************************************************************************************************************** + +Mallard::~Mallard() { + try { + for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } + } + catch(exception& e) { + errorOut(e, "Mallard", "~Mallard"); + exit(1); + } +} +//*************************************************************************************************************** +void Mallard::print(ostream& out) { + try { + + for (int i = 0; i < querySeqs.size(); i++) { + + out << querySeqs[i]->getName() << "\thighest de value = " << highestDE[i] << "\tpenalty score = " << marked[i] << endl; + cout << querySeqs[i]->getName() << "\tpenalty score = " << marked[i] << endl; + + } + } + catch(exception& e) { + errorOut(e, "Mallard", "print"); + exit(1); + } +} + +//*************************************************************************************************************** +void Mallard::getChimeras() { + try { + + //read in query sequences and subject sequences + mothurOut("Reading sequences and template file... "); cout.flush(); + querySeqs = readSeqs(fastafile); + mothurOut("Done."); mothurOutEndLine(); + + int numSeqs = querySeqs.size(); + + windowSizes.resize(numSeqs, window); + quantilesMembers.resize(100); //one for every percent mismatch + highestDE.resize(numSeqs, 0.0); //contains the highest de value for each seq + + //break up file if needed + int linesPerProcess = numSeqs / processors ; + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //find breakup of sequences for all times we will Parallelize + if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } + else { + //fill line pairs + for (int i = 0; i < (processors-1); i++) { + lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); + } + //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end + int i = processors - 1; + lines.push_back(new linePair((i*linesPerProcess), numSeqs)); + } + + #else + lines.push_back(new linePair(0, numSeqs)); + #endif + + decalc = new DeCalculator(); + + //if the user does enter a mask then you want to keep all the spots in the alignment + if (seqMask.length() == 0) { decalc->setAlignmentLength(querySeqs[0]->getAligned().length()); } + else { decalc->setAlignmentLength(seqMask.length()); } + + decalc->setMask(seqMask); + + //find P + mothurOut("Getting conservation... "); cout.flush(); + probabilityProfile = decalc->calcFreq(querySeqs, fastafile); + + //make P into Q + for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } + mothurOut("Done."); mothurOutEndLine(); + + //mask sequences if the user wants to + if (seqMask != "") { + //mask querys + for (int i = 0; i < querySeqs.size(); i++) { + decalc->runMask(querySeqs[i]); + } + } + + mothurOut("Calculating DE values..."); cout.flush(); + if (processors == 1) { + quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, 0, querySeqs.size(), highestDE); + }else { createProcessesQuan(); } + mothurOut("Done."); mothurOutEndLine(); + + mothurOut("Ranking outliers..."); cout.flush(); + marked = decalc->returnObviousOutliers(quantilesMembers, querySeqs.size()); + mothurOut("Done."); mothurOutEndLine(); + + + //free memory + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } + delete decalc; + } + catch(exception& e) { + errorOut(e, "Mallard", "getChimeras"); + exit(1); + } +} +/**************************************************************************************************/ + +void Mallard::createProcessesQuan() { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + vector processIDS; + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); + process++; + }else if (pid == 0){ + + quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, lines[process]->start, lines[process]->end, highestDE); + + //write out data to file so parent can read it + ofstream out; + string s = toString(getpid()) + ".temp"; + openOutputFile(s, out); + + + //output observed distances + for (int i = 0; i < quantilesMembers.size(); i++) { + out << quantilesMembers[i].size() << '\t'; + for (int j = 0; j < quantilesMembers[i].size(); j++) { + out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t'; + } + out << endl; + } + + out << highestDE.size() << endl; + for (int i = 0; i < highestDE.size(); i++) { + out << highestDE[i] << '\t'; + } + out << endl; + + out.close(); + + exit(0); + }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;i > quan; + quan.resize(100); + + //get quantiles + for (int m = 0; m < quan.size(); m++) { + int num; + in >> num; + + gobble(in); + + vector q; float w; int b, n; + for (int j = 0; j < num; j++) { + in >> w >> b >> n; + //cout << w << '\t' << b << '\t' n << endl; + quanMember newMember(w, b, n); + q.push_back(newMember); + } +//cout << "here" << endl; + quan[m] = q; +//cout << "now here" << endl; + gobble(in); + } + + + //save quan in quantiles + for (int j = 0; j < quan.size(); j++) { + //put all values of q[i] into quan[i] + for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); } + //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end()); + } + + int num; + in >> num; gobble(in); + + int count = lines[process]->start; + for (int s = 0; s < num; s++) { + float high; + in >> high; + + highestDE[count] = high; + count++; + } + + in.close(); + remove(s.c_str()); + } + +#else + quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, 0, querySeqs.size(), highestDE); +#endif + } + catch(exception& e) { + errorOut(e, "Mallard", "createProcessesQuan"); + exit(1); + } +} +//*************************************************************************************************************** + + diff --git a/mallard.h b/mallard.h new file mode 100644 index 0000000..d356f17 --- /dev/null +++ b/mallard.h @@ -0,0 +1,64 @@ +#ifndef MALLARD_H +#define MALLARD_H + +/* + * mallard.h + * Mothur + * + * Created by Sarah Westcott on 8/11/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + + +#include "chimera.h" +#include "decalc.h" + +//This class was created using the algorythms described in the +// "New Screening Software Shows that Most Recent Large 16SrRNA Gene Clone Libraries Contain Chimeras" paper +//by Kevin E. Ashelford 1, Nadia A. Chuzhanova 2, John C. Fry 1, Antonia J. Jones 3 and Andrew J. Weightman 1. + +/***********************************************************/ + +class Mallard : public Chimera { + + public: + Mallard(string); + ~Mallard(); + + void getChimeras(); + void print(ostream&); + + void setCons(string c) {}; + void setQuantiles(string q) {}; + + private: + + struct linePair { + int start; + int end; + linePair(int i, int j) : start(i), end(j) {} + linePair(){} + }; + + DeCalculator* decalc; + int iters; + string fastafile; + + vector lines; + vector querySeqs; + vector windowSizes; //windowSizes[0] = window size of querySeqs[0] + vector marked; + vector highestDE; + + vector probabilityProfile; + vector< vector > quantilesMembers; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2... + + void createProcessesQuan(); + +}; + +/***********************************************************/ + +#endif + diff --git a/pintail.cpp b/pintail.cpp index 74c34e1..28984c6 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -19,8 +19,7 @@ Pintail::~Pintail() { try { for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } - - if (processors != 1) { for (int i = 0; i < bestfit.size(); i++) { delete bestfit[i]; } } + for (int i = 0; i < bestfit.size(); i++) { delete bestfit[i]; } } catch(exception& e) { errorOut(e, "Pintail", "~Pintail"); @@ -31,14 +30,11 @@ Pintail::~Pintail() { void Pintail::print(ostream& out) { try { + mothurOutEndLine(); + for (int i = 0; i < querySeqs.size(); i++) { int index = ceil(deviation[i]); -float quan = 2.64 * log10(deviation[i]) + 1.46; -cout << "dist = " << index << endl; -cout << "de = " << DE[i] << endl; -cout << "mallard quantile = " << quan << endl; -cout << "my quantile = " << quantiles[index][4] << endl; //is your DE value higher than the 95% string chimera; @@ -92,6 +88,7 @@ void Pintail::getChimeras() { windowsForeachQuery.resize(numSeqs); h.resize(numSeqs); quantiles.resize(100); //one for every percent mismatch + quantilesMembers.resize(100); //one for every percent mismatch //break up file if needed int linesPerProcess = numSeqs / processors ; @@ -141,10 +138,11 @@ void Pintail::getChimeras() { for (int j = 0; j < bestfit.size(); j++) { + //chops off beginning and end of sequences so they both start and end with a base ofstream out; string s = querySeqs[j]->getName(); - + openOutputFile(s, out); out << ">" << querySeqs[j]->getName() << endl; out << querySeqs[j]->getAligned() << endl; @@ -154,7 +152,7 @@ void Pintail::getChimeras() { openOutputFile(t, out); out << ">" << bestfit[j]->getName() << endl; out << bestfit[j]->getAligned() << endl; - out.close(); + out.close(); } @@ -170,51 +168,46 @@ void Pintail::getChimeras() { for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl; mothurOut("Done."); mothurOutEndLine(); - //mask querys - for (int i = 0; i < querySeqs.size(); i++) { - //cout << querySeqs[i]->getName() << " before mask = " << querySeqs[i]->getAligned() << endl << endl; - decalc->runMask(querySeqs[i]); - //cout << querySeqs[i]->getName() << " after mask = " << querySeqs[i]->getAligned() << endl << endl; - } + //mask sequences if the user wants to + if (seqMask != "") { + //mask querys + for (int i = 0; i < querySeqs.size(); i++) { + decalc->runMask(querySeqs[i]); + } - //mask templates - for (int i = 0; i < templateSeqs.size(); i++) { - decalc->runMask(templateSeqs[i]); + //mask templates + for (int i = 0; i < templateSeqs.size(); i++) { + decalc->runMask(templateSeqs[i]); + } + + for (int i = 0; i < bestfit.size(); i++) { + decalc->runMask(bestfit[i]); + } + } - -//for (int i = 0; i < lines.size(); i++) { cout << "line pair " << i << " = " << lines[i]->start << '\t' << lines[i]->end << endl; } - + if (processors == 1) { for (int j = 0; j < bestfit.size(); j++) { - //cout << querySeqs[j]->getName() << " after mask = " << querySeqs[j]->getAligned() << endl << endl; - ///cout << bestfit[j]->getName() << " after mask = " << bestfit[j]->getAligned() << endl << endl; decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]); } mothurOut("Finding window breaks... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { it = trimmed[i].begin(); -//cout << querySeqs[i]->getName() << '\t' << "trimmed = " << it->first << '\t' << it->second << endl; vector win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment); windowsForeachQuery[i] = win; } mothurOut("Done."); mothurOutEndLine(); }else { createProcessesSpots(); } - if (processors == 1) { mothurOut("Calculating observed distance... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - //cout << querySeqs[i]->getName() << '\t' << bestfit[i]->getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl; vector obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]); - for (int j = 0; j < obsi.size(); j++) { - //cout << obsi[j] << '\t'; - } - //cout << endl; obsDistance[i] = obsi; } mothurOut("Done."); mothurOutEndLine(); @@ -225,12 +218,6 @@ void Pintail::getChimeras() { vector q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile); Qav[i] = q; -//cout << querySeqs[i]->getName() << endl; -for (int j = 0; j < Qav[i].size(); j++) { - //cout << Qav[i][j] << '\t'; -} -//cout << endl << endl; - } mothurOut("Done."); mothurOutEndLine(); @@ -238,7 +225,6 @@ for (int j = 0; j < Qav[i].size(); j++) { mothurOut("Calculating alpha... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { float alpha = decalc->getCoef(obsDistance[i], Qav[i]); -//cout << querySeqs[i]->getName() << "\tcoef = " << alpha << endl; seqCoef[i] = alpha; } mothurOut("Done."); mothurOutEndLine(); @@ -256,10 +242,9 @@ for (int j = 0; j < Qav[i].size(); j++) { for (int i = lines[0]->start; i < lines[0]->end; i++) { float de = decalc->calcDE(obsDistance[i], expectedDistance[i]); DE[i] = de; -//cout << querySeqs[i]->getName() << '\t' << "de value = " << de << endl; + it = trimmed[i].begin(); float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); -//cout << querySeqs[i]->getName() << '\t' << "dist value = " << dist << endl; deviation[i] = dist; } mothurOut("Done."); mothurOutEndLine(); @@ -267,7 +252,6 @@ for (int j = 0; j < Qav[i].size(); j++) { } else { createProcesses(); } - //quantiles are used to determine whether the de values found indicate a chimera //if you have to calculate them, its time intensive because you are finding the de and deviation values for each //combination of sequences in the template @@ -276,29 +260,33 @@ for (int j = 0; j < Qav[i].size(); j++) { mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); if (processors == 1) { - quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); + quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant); }else { createProcessesQuan(); } - - decalc->removeObviousOutliers(quantiles); + + //decided against this because we were having trouble setting the sensitivity... may want to revisit this... + //quantiles = decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size()); ofstream out4; - string o = getRootName(templateFile) + "quan"; + string o; + + o = getRootName(templateFile) + "quan"; openOutputFile(o, out4); //adjust quantiles for (int i = 0; i < quantiles.size(); i++) { + vector temp; + if (quantiles[i].size() == 0) { //in case this is not a distance found in your template files for (int g = 0; g < 6; g++) { - quantiles[i].push_back(0.0); + temp.push_back(0.0); } }else{ sort(quantiles[i].begin(), quantiles[i].end()); - vector temp; //save 10% temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)]); //save 25% @@ -312,19 +300,23 @@ for (int j = 0; j < Qav[i].size(); j++) { //save 99% temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)]); - quantiles[i] = temp; } //output quan value out4 << i+1 << '\t'; - for (int u = 0; u < quantiles[i].size(); u++) { out4 << quantiles[i][u] << '\t'; } + for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; } out4 << endl; - + + quantiles[i] = temp; + } + out4.close(); + mothurOut("Done."); mothurOutEndLine(); + } - + //free memory for (int i = 0; i < lines.size(); i++) { delete lines[i]; } for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } @@ -445,7 +437,10 @@ vector Pintail::findPairs(int start, int end) { } } - seqsMatches.push_back(match); + //make copy so trimSeqs doesn't overtrim + Sequence* copy = new Sequence(match->getName(), match->getAligned()); + + seqsMatches.push_back(copy); } return seqsMatches; @@ -478,6 +473,7 @@ void Pintail::createProcessesSpots() { //chops off beginning and end of sequences so they both start and end with a base map trim; + decalc->trimSeqs(querySeqs[j], bestfit[j], trim); trimmed[j] = trim; @@ -512,8 +508,7 @@ void Pintail::createProcessesSpots() { //output trimmed values for (int i = lines[process]->start; i < lines[process]->end; i++) { - it = trimmed[i].begin(); - + it = trimmed[i].begin(); out << it->first << '\t' << it->second << endl; } out.close(); @@ -568,7 +563,7 @@ void Pintail::createProcessesSpots() { for (int m = 0; m < size; m++) { int front, back; in >> front >> back; - + map t; t[front] = back; @@ -923,7 +918,7 @@ void Pintail::createProcessesQuan() { process++; }else if (pid == 0){ - quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end); + quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end, makeCompliant); //write out data to file so parent can read it ofstream out; @@ -932,10 +927,10 @@ void Pintail::createProcessesQuan() { //output observed distances - for (int i = 0; i < quantiles.size(); i++) { - out << quantiles[i].size() << '\t'; - for (int j = 0; j < quantiles[i].size(); j++) { - out << quantiles[i][j] << '\t'; + for (int i = 0; i < quantilesMembers.size(); i++) { + out << quantilesMembers[i].size() << '\t'; + for (int j = 0; j < quantilesMembers[i].size(); j++) { + out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t'; } out << endl; } @@ -951,35 +946,42 @@ void Pintail::createProcessesQuan() { int temp = processIDS[i]; wait(&temp); } - + //get data created by processes for (int i=0;i > quan; quan.resize(100); + vector< vector > quan; + quan.resize(100); //get quantiles for (int m = 0; m < quan.size(); m++) { int num; - in >> num; - - vector q; float w; + in >> num; + + gobble(in); + + vector q; float w; int b, n; for (int j = 0; j < num; j++) { - in >> w; - q.push_back(w); + in >> w >> b >> n; + //cout << w << '\t' << b << '\t' n << endl; + quanMember newMember(w, b, n); + q.push_back(newMember); } - +//cout << "here" << endl; quan[m] = q; +//cout << "now here" << endl; gobble(in); } - + //save quan in quantiles - for (int i = 0; i < quan.size(); i++) { + for (int j = 0; j < quan.size(); j++) { //put all values of q[i] into quan[i] - quantiles[i].insert(quantiles[i].begin(), quan[i].begin(), quan[i].end()); + for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); } + //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end()); } in.close(); @@ -987,7 +989,7 @@ void Pintail::createProcessesQuan() { } #else - quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); + quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant); #endif } catch(exception& e) { diff --git a/pintail.h b/pintail.h index 2e84147..181be2c 100644 --- a/pintail.h +++ b/pintail.h @@ -72,6 +72,7 @@ class Pintail : public Chimera { vector DE; //DE[0] is the deviaation for queryseqs[0]... vector probabilityProfile; vector< vector > quantiles; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2... + vector< vector > quantilesMembers; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2... vector< set > h; @@ -84,7 +85,7 @@ class Pintail : public Chimera { void createProcesses(); void createProcessesQuan(); - + vector makeCompliant; //used by decalc->getQuantiles so pintail and mallard can use same function, it contains the highest de value for each seq in the template }; -- 2.39.2