21E859D80FC4632E005E1A48 /* matrixoutputcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 21E859D70FC4632E005E1A48 /* matrixoutputcommand.cpp */; };
370B88070F8A4EE4005AB382 /* getoturepcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 370B88060F8A4EE4005AB382 /* getoturepcommand.cpp */; };
371B30B40FD7EE67000414CA /* screenseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 371B30B20FD7EE67000414CA /* screenseqscommand.cpp */; };
+ 372095C2103196D70004D347 /* chimera.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372095C1103196D70004D347 /* chimera.cpp */; };
+ 372095CA1031A1D50004D347 /* mallard.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372095C91031A1D50004D347 /* mallard.cpp */; };
372A55551017922B00C5194B /* decalc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372A55541017922B00C5194B /* decalc.cpp */; };
372E12700F26365B0095CF7E /* readotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E126F0F26365B0095CF7E /* readotucommand.cpp */; };
372E12960F263D5A0095CF7E /* readdistcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12950F263D5A0095CF7E /* readdistcommand.cpp */; };
370B88060F8A4EE4005AB382 /* getoturepcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getoturepcommand.cpp; sourceTree = SOURCE_ROOT; };
371B30B20FD7EE67000414CA /* screenseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = screenseqscommand.cpp; sourceTree = SOURCE_ROOT; };
371B30B30FD7EE67000414CA /* screenseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = screenseqscommand.h; sourceTree = SOURCE_ROOT; };
- 372A55531017922B00C5194B /* decalc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decalc.h; sourceTree = SOURCE_ROOT; };
- 372A55541017922B00C5194B /* decalc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decalc.cpp; sourceTree = SOURCE_ROOT; };
+ 372095C1103196D70004D347 /* chimera.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimera.cpp; sourceTree = "<group>"; };
+ 372095C81031A1D50004D347 /* mallard.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mallard.h; sourceTree = "<group>"; };
+ 372095C91031A1D50004D347 /* mallard.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mallard.cpp; sourceTree = "<group>"; };
+ 372A55531017922B00C5194B /* decalc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decalc.h; sourceTree = "<group>"; };
+ 372A55541017922B00C5194B /* decalc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decalc.cpp; sourceTree = "<group>"; };
372E126E0F26365B0095CF7E /* readotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readotucommand.h; sourceTree = SOURCE_ROOT; };
372E126F0F26365B0095CF7E /* readotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readotucommand.cpp; sourceTree = SOURCE_ROOT; };
372E12940F263D5A0095CF7E /* readdistcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readdistcommand.h; sourceTree = SOURCE_ROOT; };
3749271C0FD58C840031C06B /* getsabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsabundcommand.cpp; sourceTree = SOURCE_ROOT; };
3749273D0FD5956B0031C06B /* getrabundcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getrabundcommand.h; sourceTree = SOURCE_ROOT; };
3749273E0FD5956B0031C06B /* getrabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getrabundcommand.cpp; sourceTree = SOURCE_ROOT; };
- 374F2FD51006320200E97C66 /* chimera.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimera.h; sourceTree = SOURCE_ROOT; };
- 374F2FE9100634B600E97C66 /* bellerophon.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bellerophon.h; sourceTree = SOURCE_ROOT; };
- 374F2FEA100634B600E97C66 /* bellerophon.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bellerophon.cpp; sourceTree = SOURCE_ROOT; };
- 374F301110063B6F00E97C66 /* pintail.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pintail.h; sourceTree = SOURCE_ROOT; };
- 374F301210063B6F00E97C66 /* pintail.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pintail.cpp; sourceTree = SOURCE_ROOT; };
+ 374F2FD51006320200E97C66 /* chimera.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimera.h; sourceTree = "<group>"; };
+ 374F2FE9100634B600E97C66 /* bellerophon.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bellerophon.h; sourceTree = "<group>"; };
+ 374F2FEA100634B600E97C66 /* bellerophon.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bellerophon.cpp; sourceTree = "<group>"; };
+ 374F301110063B6F00E97C66 /* pintail.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pintail.h; sourceTree = "<group>"; };
+ 374F301210063B6F00E97C66 /* pintail.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pintail.cpp; sourceTree = "<group>"; };
37519A690F80E6EB00FED5E8 /* sharedanderbergs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedanderbergs.h; sourceTree = SOURCE_ROOT; };
37519A6A0F80E6EB00FED5E8 /* sharedanderbergs.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedanderbergs.cpp; sourceTree = SOURCE_ROOT; };
37519A9F0F810D0200FED5E8 /* venncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = venncommand.h; sourceTree = SOURCE_ROOT; };
3799A94D0FD6A58C00E33EDE /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = SOURCE_ROOT; };
3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqsummarycommand.cpp; sourceTree = SOURCE_ROOT; };
3799A94F0FD6A58C00E33EDE /* seqsummarycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqsummarycommand.h; sourceTree = SOURCE_ROOT; };
- 379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraseqscommand.h; sourceTree = SOURCE_ROOT; };
- 379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraseqscommand.cpp; sourceTree = SOURCE_ROOT; };
- 379D3D6B0FF917F40068C1C0 /* filters.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filters.h; sourceTree = SOURCE_ROOT; };
+ 379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraseqscommand.h; sourceTree = "<group>"; };
+ 379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraseqscommand.cpp; sourceTree = "<group>"; };
+ 379D3D6B0FF917F40068C1C0 /* filters.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filters.h; sourceTree = "<group>"; };
37AD4CE20F28AEA300AA2D49 /* sharedlistvector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedlistvector.h; sourceTree = SOURCE_ROOT; };
37AD4CE30F28AEA300AA2D49 /* sharedlistvector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedlistvector.cpp; sourceTree = SOURCE_ROOT; };
37AD4DB90F28E2FE00AA2D49 /* tree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tree.h; sourceTree = SOURCE_ROOT; };
37AFC71E0F445386005F492D /* sharedsobscollectsummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedsobscollectsummary.cpp; sourceTree = SOURCE_ROOT; };
37B28F660F27590100808A62 /* deconvolutecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deconvolutecommand.h; sourceTree = SOURCE_ROOT; };
37B28F670F27590100808A62 /* deconvolutecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deconvolutecommand.cpp; sourceTree = SOURCE_ROOT; };
- 37B73C741004BEFD008C4B41 /* listseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = listseqscommand.h; sourceTree = SOURCE_ROOT; };
- 37B73C751004BEFD008C4B41 /* listseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = listseqscommand.cpp; sourceTree = SOURCE_ROOT; };
- 37B73CA41004D89A008C4B41 /* getseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getseqscommand.h; sourceTree = SOURCE_ROOT; };
- 37B73CA51004D89A008C4B41 /* getseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getseqscommand.cpp; sourceTree = SOURCE_ROOT; };
- 37B73CBE1004EB38008C4B41 /* removeseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removeseqscommand.h; sourceTree = SOURCE_ROOT; };
- 37B73CBF1004EB38008C4B41 /* removeseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removeseqscommand.cpp; sourceTree = SOURCE_ROOT; };
- 37B73CCD1004F5E0008C4B41 /* systemcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = systemcommand.h; sourceTree = SOURCE_ROOT; };
- 37B73CCE1004F5E0008C4B41 /* systemcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = systemcommand.cpp; sourceTree = SOURCE_ROOT; };
+ 37B73C741004BEFD008C4B41 /* listseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = listseqscommand.h; sourceTree = "<group>"; };
+ 37B73C751004BEFD008C4B41 /* listseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = listseqscommand.cpp; sourceTree = "<group>"; };
+ 37B73CA41004D89A008C4B41 /* getseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getseqscommand.h; sourceTree = "<group>"; };
+ 37B73CA51004D89A008C4B41 /* getseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getseqscommand.cpp; sourceTree = "<group>"; };
+ 37B73CBE1004EB38008C4B41 /* removeseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removeseqscommand.h; sourceTree = "<group>"; };
+ 37B73CBF1004EB38008C4B41 /* removeseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removeseqscommand.cpp; sourceTree = "<group>"; };
+ 37B73CCD1004F5E0008C4B41 /* systemcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = systemcommand.h; sourceTree = "<group>"; };
+ 37B73CCE1004F5E0008C4B41 /* systemcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = systemcommand.cpp; sourceTree = "<group>"; };
37C1D9710F86506E0059E3F0 /* binsequencecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = binsequencecommand.h; sourceTree = SOURCE_ROOT; };
37C1D9720F86506E0059E3F0 /* binsequencecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = binsequencecommand.cpp; sourceTree = SOURCE_ROOT; };
37C753CC0FB3415200DBD02E /* distancecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = distancecommand.h; sourceTree = SOURCE_ROOT; };
isa = PBXGroup;
children = (
374F2FD51006320200E97C66 /* chimera.h */,
+ 372095C1103196D70004D347 /* chimera.cpp */,
374F2FE9100634B600E97C66 /* bellerophon.h */,
374F2FEA100634B600E97C66 /* bellerophon.cpp */,
372A55531017922B00C5194B /* decalc.h */,
372A55541017922B00C5194B /* decalc.cpp */,
+ 372095C81031A1D50004D347 /* mallard.h */,
+ 372095C91031A1D50004D347 /* mallard.cpp */,
374F301110063B6F00E97C66 /* pintail.h */,
374F301210063B6F00E97C66 /* pintail.cpp */,
);
374F2FEB100634B600E97C66 /* bellerophon.cpp in Sources */,
374F301310063B6F00E97C66 /* pintail.cpp in Sources */,
372A55551017922B00C5194B /* decalc.cpp in Sources */,
+ 372095C2103196D70004D347 /* chimera.cpp in Sources */,
+ 372095CA1031A1D50004D347 /* mallard.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
distCalculator = new eachGapDist();
//read in sequences
- readSeqs();
+ //seqs = readSeqs(fastafile);
int numSeqs = seqs.size();
vector<SeqMap> distMapLeft;
// Create a data structure to quickly access the distance information.
+ //this is from thallingers reimplementation on get.oturep
// It consists of a vector of distance maps, where each map contains
// all distances of a certain sequence. Vector and maps are accessed
// via the index of a sequence in the distance matrix
}
}
-//***************************************************************************************************************
-void Bellerophon::readSeqs(){
- try {
- ifstream inFASTA;
- openInputFile(fastafile, inFASTA);
-
- //read in seqs and store in vector
- while(!inFASTA.eof()){
- Sequence current(inFASTA);
-
- if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
-
- seqs.push_back(current);
-
- gobble(inFASTA);
- }
- inFASTA.close();
-
- }
- catch(exception& e) {
- errorOut(e, "Bellerophon", "readSeqs");
- exit(1);
- }
-}
-
/***************************************************************************************************************/
int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
try {
}
}
-
-
+
return 1;
}
catch(exception& e) {
string fastafile;
int iters;
- void readSeqs();
void generatePreferences(vector<SeqMap>, vector<SeqMap>, int);
int createSparseMatrix(int, int, SparseMatrix*, vector<Sequence>);
-
-
-
-
};
/***********************************************************/
--- /dev/null
+/*
+ * chimera.cpp
+ * Mothur
+ *
+ * Created by Sarah Westcott on 8/11/09.
+ * Copyright 2009 Schloss Lab Umass Amherst. All rights reserved.
+ *
+ */
+
+#include "chimera.h"
+
+//***************************************************************************************************************
+vector<Sequence*> Chimera::readSeqs(string file) {
+ try {
+ ifstream in;
+ openInputFile(file, in);
+ vector<Sequence*> container;
+
+ //read in seqs and store in vector
+ while(!in.eof()){
+
+ Sequence* current = new Sequence(in);
+ container.push_back(current);
+ gobble(in);
+ }
+
+ in.close();
+ return container;
+ }
+ catch(exception& e) {
+ errorOut(e, "Chimera", "readSeqs");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void Chimera::setMask(string filename) {
+ try {
+
+ if (filename == "default") {
+ //default is from wigeon 236627 EU009184.1 Shigella dysenteriae str. FBD013
+ seqMask = ".....................................................................................................AAATTGAAGAGTTT-GA--T-CA-T-G-GCTC-AG-AT-TGAA-C-GC--TGG-C--G-GC-A-GG--C----C-T--AACACA-T-GC-A-AGT-CGA-A-CG----------G-TAA-CA-G----------------------------GAAG-A-AG----------------------------------------------------CTT-G----------------------------------------------------------------------------------CT-TCTTT----------------G-CT--G--AC--G--AG-T-GG-C-GG-A--C-------------GGG-TGAGT-A--AT-GT-C-T-G-GG---A-A--A-CT-G--C-C-TGA--TG-G------------------------------------------------------------------A-GG----GGG-AT-AA-CTA-------------------------C-T-G-----------------------GAA-A---CGG-TAG-CTAA-TA---CC-G--C-AT-A----------A--------------------C-------------------------------------GT-C-----------------------------------------------------------------------------------------------------------------------G-CA-A--------------------------------------------------------------------------------------------------------------------------------------G-A-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CAAA--G-A-G-GG-----G--GA-C-CT--------------------------------------------------------------------------------------------------------------------TCG-G----------------------------------------------------------------------------------------------------------------------G----CC-TC--T---T-G--------------C----C-A---T-CG-G---AT---G-T-----G-CCC-AGA--T-GGG--A------TT--A--G-CT-A----G---TAGG-T-G-GG-G-T----AAC-GG-C-T-C-ACCT--A-GG-C-G--A-CG-A------------TCC-C-T------AG-CT-G-G-TCT-G-AG----A--GG-AT--G-AC-C-AG-CCAC-A-CTGGA--A-C-TG-A-GA-C-AC-G-G-TCCAGA-CTCC-TAC-G--G-G-A-G-GC-A-GC-A-G-TG---GG-G-A-ATA-TTGCA-C-AA-T-GG--GC-GC-A----A-G-CC-T-GA-TG-CA-GCCA-TGCC-G-CG-T---G-T-A--T--GA-A-G--A--A-G-G-CC-----TT-CG---------G-G-T-T-G-T--A---AA-G-TAC--------TT-TC-A-G--C-GGG----GA-G--G---AA-GGGA---GTAA-AG----T--T--AA-T---A----C-----CT-T-TGC-TCA-TT-GA-CG-TT-A-C-CC-G-CA-G---------AA-----------GAAGC-ACC-GG-C-TAA---C--T-CCGT--GCCA--G-C---A--GCCG---C-GG--TA-AT--AC---GG-AG-GGT-GCA-A-G-CG-TTAA-T-CGG-AA-TT-A--C-T--GGGC-GTA----AA-GCGC-AC--G-CA-G-G-C-G------------G--T-TT-G-T-T-AA----G-T-C-A---G-ATG-TG-A-AA-TC--CC-CGG-G--------------------------------------------------------------------CT-C-AA-------------------------------------------------------------------------CC-T-G-GG-AA-C----T-G-C-A-T-C--------T--GA-T-A-C-T-G-GCA--A-G-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------T-T-G-A-G-T-C-----T-CG--TA-G-A------------G-GG-G-GG-T----AG--AATT-CCA-G-GT--GT-A-GCG-GTGAAA-TG-CGT-AGAG-A-TC-T-GGA--GG-A-AT-A-CC-GG--T--G--GC-GAA-G--G-C---G----G--C-C-CCCTG------G-AC-GA--------------------------------------------------------------AG-A-C-T--GA--CG-----CT-CA-GG--T-G-CGA--AA-G-C--------------G-TGGG-GAG-C-A-AACA--GG-ATTA-G-ATA-C-----CC-T-G-GTA-G-T----C-CA--C-G-CCG-T-AAA--C-GATG-TC--GA-CT---------T-GG--A--G-G-TT-G-TG-C--C--------------------------------------------------------------------------------------CTT-GA--------------------------------------------------------------------------------------------------------------------------------------------------G-G-C-GT--G-G-C-T-TC-C------GG--A----GC-TAA--CG-C-G-T--T--AA-GT--C----G-ACC-GCC-T-G-GG-GAG-TA---CGG-----C-C--G-C-A-A-GGT-T--AAA-ACTC-AAA---------TGAA-TTG-ACGGG-G-G-CCCG----C-A--C-A-A-GCG-GT-G--G--AG-CA-T--GT-GGT-TT-AATT-C-G-ATG-CAAC-G-CG-A-AG-A-A-CC-TT-A-CC-TGGTC-TT-G-AC-A-T-C--------------CAC-G-G-------------A-AG-T-T-T--TC--A-GA-G-A-T--G-A-G--A-A-T-G--T-G-----CC-------------------------------------T--TC-G------------------------------------------GG----A----A---CC-GTG---A--GA---------------------------------------------------C-A-G-G-T-GCTG-CA-TGG-CT--GTC-GTC-A-GC-TC---G-TG-TT-G--TGA-AA-TGT-T-GG-G-TT-AA-GT-CCCGC-AA--------C-GAG-CGC-A-ACC-C-T-TA--TC--C-TTTG--T-T-G-C-C---AG-C-G-----G-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TCC------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GG---C----C-G------------G----G---A-A--CT---------------C-A-A-A-G-GA-G--AC-T-G-CCA--G-T------------------------------------G-A---TAA----------------------------------A-C-T-G--G-A-GG-A--AGG-T--GGGG-A-TGAC-GTC--AAGT-C---ATC-A-T-G-G-C-C-CTT----AC-G--AC-C-A-GG-GC-TA-CAC-ACGTG-C--TA--CAATG---G-CGCA-T-A--C-AAA-GA-GA--------------------------------------------------------------------------------------------------A-G-C-G-A--C-CTCG-C--G---------------------------------------A-GA-G-C-----------A--A-G-CG---G----------A--CCT-C------A-T-AAAGT-GC-G-T-C-G-TAG-TCC--------GGA-T-TGGAG-TC--T-GCAA-CT-C-------------------------------------------------------------------------------------------------G-ACTCC-A-T-G-AA-G-TC-GGAAT-CG-C-TA--G-TA-AT-C-G-T----GGA-TC-A-G--A------AT--GCC-AC-G-GT-G-AAT-ACGT-T-CCCGGGCCT-TGTA----CACACCG-CCC-GTC-----A---CA--CCA-TG-GG-A--G---TGG-G-TT-GC-AAA--A-GAA------G--T-AGG-TA-G-C-T-T-AA-C-C--------------------------------------------------------------TT----C-------------------------------------------------------------------------------------------------G--GG-A--GG-G--C---GC-TTA--CC--ACT-T----T-GTG-AT-TCA------------------------TG--ACT-GGGG-TG-AAG-TCGTAACAA-GGTAA-CCGT-AGGGGAA-CCTG-CGGT-TGGATCACCTCCTTA................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................";
+ }else if (filename == "") { //do nothing
+ seqMask = "";
+ }else{
+ ifstream infile;
+ openInputFile(filename, infile);
+
+ while (!infile.eof()) {
+ Sequence temp(infile);
+ seqMask = temp.getAligned();
+
+ gobble(infile);
+ }
+
+ infile.close();
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "Chimera", "setMask");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
virtual void setCons(string) {};
virtual void setQuantiles(string) {};
-
- virtual vector<Sequence*> readSeqs(string file) {
- try {
- ifstream in;
- openInputFile(file, in);
- vector<Sequence*> container;
-
- //read in seqs and store in vector
- while(!in.eof()){
-
- Sequence* current = new Sequence(in);
- container.push_back(current);
- gobble(in);
- }
-
- in.close();
- return container;
- }
- catch(exception& e) {
- errorOut(e, "Chimera", "readSeqs");
- exit(1);
- }
- }
-
-
- virtual void setMask(string filename) {
- try {
-
- if (filename == "default") {
- //default is from wigeon 236627 EU009184.1 Shigella dysenteriae str. FBD013
- seqMask = ".....................................................................................................AAATTGAAGAGTTT-GA--T-CA-T-G-GCTC-AG-AT-TGAA-C-GC--TGG-C--G-GC-A-GG--C----C-T--AACACA-T-GC-A-AGT-CGA-A-CG----------G-TAA-CA-G----------------------------GAAG-A-AG----------------------------------------------------CTT-G----------------------------------------------------------------------------------CT-TCTTT----------------G-CT--G--AC--G--AG-T-GG-C-GG-A--C-------------GGG-TGAGT-A--AT-GT-C-T-G-GG---A-A--A-CT-G--C-C-TGA--TG-G------------------------------------------------------------------A-GG----GGG-AT-AA-CTA-------------------------C-T-G-----------------------GAA-A---CGG-TAG-CTAA-TA---CC-G--C-AT-A----------A--------------------C-------------------------------------GT-C-----------------------------------------------------------------------------------------------------------------------G-CA-A--------------------------------------------------------------------------------------------------------------------------------------G-A-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CAAA--G-A-G-GG-----G--GA-C-CT--------------------------------------------------------------------------------------------------------------------TCG-G----------------------------------------------------------------------------------------------------------------------G----CC-TC--T---T-G--------------C----C-A---T-CG-G---AT---G-T-----G-CCC-AGA--T-GGG--A------TT--A--G-CT-A----G---TAGG-T-G-GG-G-T----AAC-GG-C-T-C-ACCT--A-GG-C-G--A-CG-A------------TCC-C-T------AG-CT-G-G-TCT-G-AG----A--GG-AT--G-AC-C-AG-CCAC-A-CTGGA--A-C-TG-A-GA-C-AC-G-G-TCCAGA-CTCC-TAC-G--G-G-A-G-GC-A-GC-A-G-TG---GG-G-A-ATA-TTGCA-C-AA-T-GG--GC-GC-A----A-G-CC-T-GA-TG-CA-GCCA-TGCC-G-CG-T---G-T-A--T--GA-A-G--A--A-G-G-CC-----TT-CG---------G-G-T-T-G-T--A---AA-G-TAC--------TT-TC-A-G--C-GGG----GA-G--G---AA-GGGA---GTAA-AG----T--T--AA-T---A----C-----CT-T-TGC-TCA-TT-GA-CG-TT-A-C-CC-G-CA-G---------AA-----------GAAGC-ACC-GG-C-TAA---C--T-CCGT--GCCA--G-C---A--GCCG---C-GG--TA-AT--AC---GG-AG-GGT-GCA-A-G-CG-TTAA-T-CGG-AA-TT-A--C-T--GGGC-GTA----AA-GCGC-AC--G-CA-G-G-C-G------------G--T-TT-G-T-T-AA----G-T-C-A---G-ATG-TG-A-AA-TC--CC-CGG-G--------------------------------------------------------------------CT-C-AA-------------------------------------------------------------------------CC-T-G-GG-AA-C----T-G-C-A-T-C--------T--GA-T-A-C-T-G-GCA--A-G-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------T-T-G-A-G-T-C-----T-CG--TA-G-A------------G-GG-G-GG-T----AG--AATT-CCA-G-GT--GT-A-GCG-GTGAAA-TG-CGT-AGAG-A-TC-T-GGA--GG-A-AT-A-CC-GG--T--G--GC-GAA-G--G-C---G----G--C-C-CCCTG------G-AC-GA--------------------------------------------------------------AG-A-C-T--GA--CG-----CT-CA-GG--T-G-CGA--AA-G-C--------------G-TGGG-GAG-C-A-AACA--GG-ATTA-G-ATA-C-----CC-T-G-GTA-G-T----C-CA--C-G-CCG-T-AAA--C-GATG-TC--GA-CT---------T-GG--A--G-G-TT-G-TG-C--C--------------------------------------------------------------------------------------CTT-GA--------------------------------------------------------------------------------------------------------------------------------------------------G-G-C-GT--G-G-C-T-TC-C------GG--A----GC-TAA--CG-C-G-T--T--AA-GT--C----G-ACC-GCC-T-G-GG-GAG-TA---CGG-----C-C--G-C-A-A-GGT-T--AAA-ACTC-AAA---------TGAA-TTG-ACGGG-G-G-CCCG----C-A--C-A-A-GCG-GT-G--G--AG-CA-T--GT-GGT-TT-AATT-C-G-ATG-CAAC-G-CG-A-AG-A-A-CC-TT-A-CC-TGGTC-TT-G-AC-A-T-C--------------CAC-G-G-------------A-AG-T-T-T--TC--A-GA-G-A-T--G-A-G--A-A-T-G--T-G-----CC-------------------------------------T--TC-G------------------------------------------GG----A----A---CC-GTG---A--GA---------------------------------------------------C-A-G-G-T-GCTG-CA-TGG-CT--GTC-GTC-A-GC-TC---G-TG-TT-G--TGA-AA-TGT-T-GG-G-TT-AA-GT-CCCGC-AA--------C-GAG-CGC-A-ACC-C-T-TA--TC--C-TTTG--T-T-G-C-C---AG-C-G-----G-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TCC------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GG---C----C-G------------G----G---A-A--CT---------------C-A-A-A-G-GA-G--AC-T-G-CCA--G-T------------------------------------G-A---TAA----------------------------------A-C-T-G--G-A-GG-A--AGG-T--GGGG-A-TGAC-GTC--AAGT-C---ATC-A-T-G-G-C-C-CTT----AC-G--AC-C-A-GG-GC-TA-CAC-ACGTG-C--TA--CAATG---G-CGCA-T-A--C-AAA-GA-GA--------------------------------------------------------------------------------------------------A-G-C-G-A--C-CTCG-C--G---------------------------------------A-GA-G-C-----------A--A-G-CG---G----------A--CCT-C------A-T-AAAGT-GC-G-T-C-G-TAG-TCC--------GGA-T-TGGAG-TC--T-GCAA-CT-C-------------------------------------------------------------------------------------------------G-ACTCC-A-T-G-AA-G-TC-GGAAT-CG-C-TA--G-TA-AT-C-G-T----GGA-TC-A-G--A------AT--GCC-AC-G-GT-G-AAT-ACGT-T-CCCGGGCCT-TGTA----CACACCG-CCC-GTC-----A---CA--CCA-TG-GG-A--G---TGG-G-TT-GC-AAA--A-GAA------G--T-AGG-TA-G-C-T-T-AA-C-C--------------------------------------------------------------TT----C-------------------------------------------------------------------------------------------------G--GG-A--GG-G--C---GC-TTA--CC--ACT-T----T-GTG-AT-TCA------------------------TG--ACT-GGGG-TG-AAG-TCGTAACAA-GGTAA-CCGT-AGGGGAA-CCTG-CGGT-TGGATCACCTCCTTA................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................";
- }else{
- ifstream infile;
- openInputFile(filename, infile);
-
- while (!infile.eof()) {
- Sequence temp(infile);
- seqMask = temp.getAligned();
-
- gobble(infile);
- }
-
- infile.close();
- }
- }
- catch(exception& e) {
- errorOut(e, "Chimera", "setMask");
- exit(1);
- }
- }
+ virtual vector<Sequence*> readSeqs(string);
+ virtual void setMask(string);
//pure functions
virtual void getChimeras() = 0;
string temp;
- temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; }
+ temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
filter = isTrue(temp);
temp = validParameter.validFile(parameters, "correction", false); if (temp == "not found") { temp = "T"; }
void ChimeraSeqsCommand::help(){
try {
+ mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED.\n");
mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n");
- mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask and method. fasta is required.\n");
- mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter. The default is false. \n");
- mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs. The default is true. \n");
+ mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation and quantile.\n");
+ mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter this only applies when the method is bellerphon. The default is false. \n");
+ mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs. The default is true. This only applies when the method is bellerphon.\n");
mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
- mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. \n");
- mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the pintail and mallard method. The default is 236627 EU009184.1 Shigella dysenteriae str. FBD013. \n");
+ mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. Options include..... \n");
+ mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the pintail and mallard method. The default is no mask. If you enter mask=default, then the mask is 236627 EU009184.1 Shigella dysenteriae str. FBD013. \n");
+ mothurOut("The window parameter allows you to specify the window size for searching for chimeras. The default is 300 is method is pintail unless the sequence length is less than 300, and 1/4 sequence length for bellerphon.\n");
+ mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences. The default is 25.\n");
+ mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences for use by the pintail algorythm. It is a required parameter if using pintail.\n");
+ mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment for use by the pintail algorythm. It is not required, but will speed up the pintail method.\n");
+ mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences for use by the pintail algorythm. It is not required, but will speed up the pintail method.\n");
+ mothurOut("If you have run chimera.seqs using pintail a .quan and .freq file will be created if you have not provided them for use in future command executions.");
mothurOut("The chimera.seqs command should be in the following format: \n");
mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n");
//find chimeras
chimera->getChimeras();
- string outputFileName = getRootName(fastafile) + method + ".chimeras";
+ string outputFileName = getRootName(fastafile) + method + maskfile + ".chimeras";
ofstream out;
openOutputFile(outputFileName, out);
}
}
//***************************************************************************************************************
-//find the window breaks for each sequence - this is so you can move ahead by bases.
vector<int> DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
try {
vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
try {
- vector<float> temp;
-//cout << "query length = " << query->getAligned().length() << '\t' << " subject length = " << subject.getAligned().length() << endl;
+ vector<float> temp;
+
for (int m = 0; m < window.size(); m++) {
string seqFrag = query->getAligned().substr(window[m], size);
string seqFragsub = subject->getAligned().substr(window[m], size);
- //cout << "start point = " << window[m] << " end point = " << window[m]+size << endl;
+
int diff = 0;
for (int b = 0; b < seqFrag.length(); b++) {
-
- if (seqFrag[b] != seqFragsub[b]) { diff++; }
+ //if at least one is a base and they are not equal
+ if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
}
//percentage of mismatched bases
float dist;
- dist = diff / (float) seqFrag.length() * 100;
+
+ //if the whole fragment is 0 distance = 0
+
+ dist = diff / (float) (seqFrag.length()) * 100;
temp.push_back(dist);
}
if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
if (seqFrag[b] != seqFragsub[b]) { diff++; }
}
+
+ //if the whole fragment is 0 distance = 0
+ if ((seqFrag.length()-gaps) == 0) { return 0.0; }
//percentage of mismatched bases
float dist = diff / (float) (seqFrag.length()-gaps) * 100;
exit(1);
}
}
-
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareQuanMembers(quanMember left, quanMember right){
+ return (left.score < right.score);
+}
//***************************************************************************************************************
-vector< vector<float> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
+//seqs have already been masked
+vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end, vector<float>& highestDE) {
try {
- vector< vector<float> > quan;
+ vector< vector<quanMember> > quan;
//percentage of mismatched pairs 1 to 100
quan.resize(100);
-
//for each sequence
for(int i = start; i < end; i++){
mothurOut("Processing template sequence " + toString(i)); mothurOutEndLine();
- Sequence* query = seqs[i];
+ Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
//compare to every other sequence in template
for(int j = 0; j < i; j++){
- Sequence* subject = seqs[j];
+ Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
map<int, int> trim;
map<int, int>::iterator it;
dist = ceil(dist);
+ quanMember newScore(de, i, j);
+
//dist-1 because vector indexes start at 0.
- quan[dist-1].push_back(de);
+ quan[dist-1].push_back(newScore);
+
+ //save highestDE
+ if(de > highestDE[i]) { highestDE[i] = de; }
+ if(de > highestDE[j]) { highestDE[j] = de; }
+
+ delete subject;
}
+
+ delete query;
}
+
return quan;
}
exit(1);
}
}
+
//***************************************************************************************************************
-void DeCalculator::removeObviousOutliers(vector< vector<float> >& quantiles) {
+vector< vector<float> > DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
try {
-
+ vector< vector<float> > quan;
+ quan.resize(100);
+ /*vector<quanMember> contributions;
+ vector<int> seen; //seen[0] is the number of outliers that template seqs[0] was part of.
+ seen.resize(num,0);
+
+ //find contributions
for (int i = 0; i < quantiles.size(); i++) {
//find mean of this quantile score
- sort(quantiles[i].begin(), quantiles[i].end());
+ sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
- float average = quantiles[i][int(quantiles[i].size() * 0.5)];
-cout << i << "\taverage = " << average << "\tquantiles[i].size = " << quantiles[i].size() << endl;
- vector<float> newQuanI;
+ float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
+ float low = quantiles[i][int(quantiles[i].size() * 0.01)].score;
+
//look at each value in quantiles to see if it is an outlier
for (int j = 0; j < quantiles[i].size(); j++) {
- float highCutOff, lowCutOff;
+ //is this score between 1 and 99%
+ if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
+
+ }else {
+ //add to contributions
+ contributions.push_back(quantiles[i][j]);
+ seen[quantiles[i][j].member1]++;
+ seen[quantiles[i][j].member2]++;
+ }
+ }
+ }
+
+ //find contributer with most offending score related to it
+ int largestContrib = findLargestContrib(seen);
+
+ //while you still have guys to eliminate
+ while (contributions.size() > 0) {
+
+ mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
+
+ //remove from quantiles all scores that were made using this largestContrib
+ for (int i = 0; i < quantiles.size(); i++) {
+//cout << "before remove " << quantiles[i].size() << '\t';
+ removeContrib(largestContrib, quantiles[i]);
+//cout << "after remove " << quantiles[i].size() << endl;
+ }
+//cout << count << " largest contrib = " << largestContrib << endl; count++;
+ //remove from contributions all scores that were made using this largestContrib
+ removeContrib(largestContrib, contributions);
+
+ //"erase" largestContrib
+ seen[largestContrib] = -1;
+
+ //get next largestContrib
+ largestContrib = findLargestContrib(seen);
+ }
+ */
+
+ vector<int> marked = returnObviousOutliers(quantiles, num);
+ vector<int> copyMarked = marked;
+
+ //find the 99% of marked
+ sort(copyMarked.begin(), copyMarked.end());
+ int high = copyMarked[int(copyMarked.size() * 0.99)];
+cout << "high = " << high << endl;
+
+ for(int i = 0; i < marked.size(); i++) {
+ if (marked[i] > high) {
+ mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
+ for (int i = 0; i < quantiles.size(); i++) {
+ removeContrib(marked[i], quantiles[i]);
+ }
+ }
+
+ }
+
+
+ //adjust quantiles
+ for (int i = 0; i < quantiles.size(); i++) {
+ vector<float> temp;
+
+ if (quantiles[i].size() == 0) {
+ //in case this is not a distance found in your template files
+ for (int g = 0; g < 6; g++) {
+ temp.push_back(0.0);
+ }
+ }else{
+
+ sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
+
+ //save 10%
+ temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
+ //save 25%
+ temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
+ //save 50%
+ temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
+ //save 75%
+ temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
+ //save 95%
+ temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
+ //save 99%
+ temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
- //99%
- highCutOff = sqrt(((quantiles[i][j] - average + 3) * (quantiles[i][j] - average + 3)) / (float)(quantiles[i].size() - 1));
+ }
+
+ quan[i] = temp;
+
+ }
+
+ return quan;
+ }
+ catch(exception& e) {
+ errorOut(e, "DeCalculator", "removeObviousOutliers");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+//follows Mallard algorythn in paper referenced from mallard class
+vector<int> DeCalculator::returnObviousOutliers(vector< vector<quanMember> > quantiles, int num) {
+ try {
+ vector< vector<float> > quan;
+ quan.resize(100);
+
+ map<quanMember*, float> contributions; //map of quanMember to distance from high or low - how bad is it.
+ vector<int> marked; //marked[0] is the penalty of template seqs[0]. the higher the penalty the more likely the sequence is chimeric
+ marked.resize(num,0);
- //1%
- lowCutOff = sqrt(((quantiles[i][j] - average - 3) * (quantiles[i][j] - average + 3)) / (float)(quantiles[i].size() - 1));
-//cout << "high = " << highCutOff << " low = " << lowCutOff << " de = " << quantiles[i][j] << endl;
- //if this is below the highcutff and above the lowcutoff
- if ((quantiles[i][j] < highCutOff) && (quantiles[i][j] > lowCutOff)) {
-
- newQuanI.push_back(quantiles[i][j]);
+ //find contributions
+ for (int i = 0; i < quantiles.size(); i++) {
+
+ //find mean of this quantile score
+ sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
+
+ float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
+ float low = quantiles[i][int(quantiles[i].size() * 0.01)].score;
+
+ //look at each value in quantiles to see if it is an outlier
+ for (int j = 0; j < quantiles[i].size(); j++) {
+
+ //is this score between 1 and 99%
+ if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
- }else { cout << "removed outlier: high = " << highCutOff << " low = " << lowCutOff << " de = " << quantiles[i][j] << endl; }
+ }else {
+ float dist;
+ //add to contributions
+ if (quantiles[i][j].score < low) {
+ dist = low - quantiles[i][j].score;
+ contributions[&(quantiles[i][j])] = dist;
+ }else { //you are higher than high
+ dist = quantiles[i][j].score - high;
+ contributions[&(quantiles[i][j])] = dist;
+ }
+ }
+ }
+ }
+
+ //find contributer with most offending score related to it
+ vector<quanMember> outliers = sortContrib(contributions);
+
+ //go through the outliers marking the potential chimeras
+ for (int i = 0; i < outliers.size(); i++) {
+
+ //who is responsible for this outlying score?
+ //if member1 has greater score mark him
+ //if member2 has greater score mark her
+ //if they are the same mark both
+ if (marked[outliers[i].member1] > marked[outliers[i].member2]) { marked[outliers[i].member1]++; }
+ else if (marked[outliers[i].member2] > marked[outliers[i].member1]) { marked[outliers[i].member2]++; }
+ else if (marked[outliers[i].member2] == marked[outliers[i].member1]) { marked[outliers[i].member2]++; marked[outliers[i].member1]++; }
+ }
+
+ return marked;
+ }
+ catch(exception& e) {
+ errorOut(e, "DeCalculator", "removeObviousOutliers");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
+ try{
+
+ vector<quanMember> newQuan;
+ map<quanMember*, float>::iterator it;
+
+ while (quan.size() > 0) {
+
+ map<quanMember*, float>::iterator largest = quan.begin();
+
+ //find biggest member
+ for (it = quan.begin(); it != quan.end(); it++) {
+ if (it->second > largest->second) { largest = it; }
}
- quantiles[i] = newQuanI;
+ //save it
+ newQuan.push_back(*(largest->first));
+ //erase from quan
+ quan.erase(largest);
}
+
+ return newQuan;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "DeCalculator", "sortContrib");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+int DeCalculator::findLargestContrib(vector<int> seen) {
+ try{
+
+ int largest = 0;
+
+ int largestContribs;
+
+ for (int i = 0; i < seen.size(); i++) {
+
+ if (seen[i] > largest) {
+ largestContribs = i;
+ largest = seen[i];
+ }
+ }
+
+ return largestContribs;
+
}
catch(exception& e) {
- errorOut(e, "DeCalculator", "removeObviousOutliers");
+ errorOut(e, "DeCalculator", "findLargestContribs");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
+ try{
+
+ vector<quanMember> newQuan;
+ for (int i = 0; i < quan.size(); i++) {
+ if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {
+ newQuan.push_back(quan[i]);
+ }
+ }
+
+ quan = newQuan;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "DeCalculator", "removeContrib");
exit(1);
}
}
/***********************************************************************/
+//this structure is necessary to determine the sequence that contributed to the outliers when we remove them
+//this way we can remove all scores that are contributed by outlier sequences.
+struct quanMember {
+ float score;
+ int member1;
+ int member2;
+ quanMember (float s, int m, int n) : score(s), member1(m), member2(n) {}
+ quanMember() {}
+
+};
+
+//********************************************************************************************************************
class DeCalculator {
public:
void setAlignmentLength(int l) { alignLength = l; }
void runMask(Sequence*);
void trimSeqs(Sequence*, Sequence*, map<int, int>&);
- void removeObviousOutliers(vector< vector<float> >&);
+ vector< vector<float> > removeObviousOutliers(vector< vector<quanMember> >&, int);
vector<float> calcFreq(vector<Sequence*>, string);
vector<int> findWindows(Sequence*, int, int, int&, int);
vector<float> calcObserved(Sequence*, Sequence*, vector<int>, int);
float calcDE(vector<float>, vector<float>);
float calcDist(Sequence*, Sequence*, int, int);
float getCoef(vector<float>, vector<float>);
- vector< vector<float> > getQuantiles(vector<Sequence*>, vector<int>, int, vector<float>, int, int, int);
+ vector< vector<quanMember> > getQuantiles(vector<Sequence*>, vector<int>, int, vector<float>, int, int, int, vector<float>&);
+
+ vector<int> returnObviousOutliers(vector< vector<quanMember> >, int);
private:
- float findAverage(vector<float> myVector);
+ vector<quanMember> sortContrib(map<quanMember*, float>); //used by mallard
+ float findAverage(vector<float>);
+ int findLargestContrib(vector<int>);
+ void removeContrib(int, vector<quanMember>&);
string seqMask;
set<int> h;
int alignLength;
--- /dev/null
+/*
+ * mallard.cpp
+ * Mothur
+ *
+ * Created by Sarah Westcott on 8/11/09.
+ * Copyright 2009 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+#include "mallard.h"
+
+//***************************************************************************************************************
+
+Mallard::Mallard(string filename) { fastafile = filename; }
+//***************************************************************************************************************
+
+Mallard::~Mallard() {
+ try {
+ for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
+ }
+ catch(exception& e) {
+ errorOut(e, "Mallard", "~Mallard");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void Mallard::print(ostream& out) {
+ try {
+
+ for (int i = 0; i < querySeqs.size(); i++) {
+
+ out << querySeqs[i]->getName() << "\thighest de value = " << highestDE[i] << "\tpenalty score = " << marked[i] << endl;
+ cout << querySeqs[i]->getName() << "\tpenalty score = " << marked[i] << endl;
+
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "Mallard", "print");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+void Mallard::getChimeras() {
+ try {
+
+ //read in query sequences and subject sequences
+ mothurOut("Reading sequences and template file... "); cout.flush();
+ querySeqs = readSeqs(fastafile);
+ mothurOut("Done."); mothurOutEndLine();
+
+ int numSeqs = querySeqs.size();
+
+ windowSizes.resize(numSeqs, window);
+ quantilesMembers.resize(100); //one for every percent mismatch
+ highestDE.resize(numSeqs, 0.0); //contains the highest de value for each seq
+
+ //break up file if needed
+ int linesPerProcess = numSeqs / processors ;
+
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ //find breakup of sequences for all times we will Parallelize
+ if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
+ else {
+ //fill line pairs
+ for (int i = 0; i < (processors-1); i++) {
+ lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+ }
+ //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+ int i = processors - 1;
+ lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+ }
+
+ #else
+ lines.push_back(new linePair(0, numSeqs));
+ #endif
+
+ decalc = new DeCalculator();
+
+ //if the user does enter a mask then you want to keep all the spots in the alignment
+ if (seqMask.length() == 0) { decalc->setAlignmentLength(querySeqs[0]->getAligned().length()); }
+ else { decalc->setAlignmentLength(seqMask.length()); }
+
+ decalc->setMask(seqMask);
+
+ //find P
+ mothurOut("Getting conservation... "); cout.flush();
+ probabilityProfile = decalc->calcFreq(querySeqs, fastafile);
+
+ //make P into Q
+ for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; }
+ mothurOut("Done."); mothurOutEndLine();
+
+ //mask sequences if the user wants to
+ if (seqMask != "") {
+ //mask querys
+ for (int i = 0; i < querySeqs.size(); i++) {
+ decalc->runMask(querySeqs[i]);
+ }
+ }
+
+ mothurOut("Calculating DE values..."); cout.flush();
+ if (processors == 1) {
+ quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, 0, querySeqs.size(), highestDE);
+ }else { createProcessesQuan(); }
+ mothurOut("Done."); mothurOutEndLine();
+
+ mothurOut("Ranking outliers..."); cout.flush();
+ marked = decalc->returnObviousOutliers(quantilesMembers, querySeqs.size());
+ mothurOut("Done."); mothurOutEndLine();
+
+
+ //free memory
+ for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
+ delete decalc;
+ }
+ catch(exception& e) {
+ errorOut(e, "Mallard", "getChimeras");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+void Mallard::createProcessesQuan() {
+ try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ int process = 0;
+ vector<int> processIDS;
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid);
+ process++;
+ }else if (pid == 0){
+
+ quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, lines[process]->start, lines[process]->end, highestDE);
+
+ //write out data to file so parent can read it
+ ofstream out;
+ string s = toString(getpid()) + ".temp";
+ openOutputFile(s, out);
+
+
+ //output observed distances
+ for (int i = 0; i < quantilesMembers.size(); i++) {
+ out << quantilesMembers[i].size() << '\t';
+ for (int j = 0; j < quantilesMembers[i].size(); j++) {
+ out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
+ }
+ out << endl;
+ }
+
+ out << highestDE.size() << endl;
+ for (int i = 0; i < highestDE.size(); i++) {
+ out << highestDE[i] << '\t';
+ }
+ out << endl;
+
+ out.close();
+
+ exit(0);
+ }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+ }
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processors;i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ //get data created by processes
+ for (int i=0;i<processors;i++) {
+ ifstream in;
+ string s = toString(processIDS[i]) + ".temp";
+ openInputFile(s, in);
+
+ vector< vector<quanMember> > quan;
+ quan.resize(100);
+
+ //get quantiles
+ for (int m = 0; m < quan.size(); m++) {
+ int num;
+ in >> num;
+
+ gobble(in);
+
+ vector<quanMember> q; float w; int b, n;
+ for (int j = 0; j < num; j++) {
+ in >> w >> b >> n;
+ //cout << w << '\t' << b << '\t' n << endl;
+ quanMember newMember(w, b, n);
+ q.push_back(newMember);
+ }
+//cout << "here" << endl;
+ quan[m] = q;
+//cout << "now here" << endl;
+ gobble(in);
+ }
+
+
+ //save quan in quantiles
+ for (int j = 0; j < quan.size(); j++) {
+ //put all values of q[i] into quan[i]
+ for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
+ //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
+ }
+
+ int num;
+ in >> num; gobble(in);
+
+ int count = lines[process]->start;
+ for (int s = 0; s < num; s++) {
+ float high;
+ in >> high;
+
+ highestDE[count] = high;
+ count++;
+ }
+
+ in.close();
+ remove(s.c_str());
+ }
+
+#else
+ quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, 0, querySeqs.size(), highestDE);
+#endif
+ }
+ catch(exception& e) {
+ errorOut(e, "Mallard", "createProcessesQuan");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+
+
--- /dev/null
+#ifndef MALLARD_H
+#define MALLARD_H
+
+/*
+ * mallard.h
+ * Mothur
+ *
+ * Created by Sarah Westcott on 8/11/09.
+ * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+
+#include "chimera.h"
+#include "decalc.h"
+
+//This class was created using the algorythms described in the
+// "New Screening Software Shows that Most Recent Large 16SrRNA Gene Clone Libraries Contain Chimeras" paper
+//by Kevin E. Ashelford 1, Nadia A. Chuzhanova 2, John C. Fry 1, Antonia J. Jones 3 and Andrew J. Weightman 1.
+
+/***********************************************************/
+
+class Mallard : public Chimera {
+
+ public:
+ Mallard(string);
+ ~Mallard();
+
+ void getChimeras();
+ void print(ostream&);
+
+ void setCons(string c) {};
+ void setQuantiles(string q) {};
+
+ private:
+
+ struct linePair {
+ int start;
+ int end;
+ linePair(int i, int j) : start(i), end(j) {}
+ linePair(){}
+ };
+
+ DeCalculator* decalc;
+ int iters;
+ string fastafile;
+
+ vector<linePair*> lines;
+ vector<Sequence*> querySeqs;
+ vector<int> windowSizes; //windowSizes[0] = window size of querySeqs[0]
+ vector<int> marked;
+ vector<float> highestDE;
+
+ vector<float> probabilityProfile;
+ vector< vector<quanMember> > quantilesMembers; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
+
+ void createProcessesQuan();
+
+};
+
+/***********************************************************/
+
+#endif
+
try {
for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
-
- if (processors != 1) { for (int i = 0; i < bestfit.size(); i++) { delete bestfit[i]; } }
+ for (int i = 0; i < bestfit.size(); i++) { delete bestfit[i]; }
}
catch(exception& e) {
errorOut(e, "Pintail", "~Pintail");
void Pintail::print(ostream& out) {
try {
+ mothurOutEndLine();
+
for (int i = 0; i < querySeqs.size(); i++) {
int index = ceil(deviation[i]);
-float quan = 2.64 * log10(deviation[i]) + 1.46;
-cout << "dist = " << index << endl;
-cout << "de = " << DE[i] << endl;
-cout << "mallard quantile = " << quan << endl;
-cout << "my quantile = " << quantiles[index][4] << endl;
//is your DE value higher than the 95%
string chimera;
windowsForeachQuery.resize(numSeqs);
h.resize(numSeqs);
quantiles.resize(100); //one for every percent mismatch
+ quantilesMembers.resize(100); //one for every percent mismatch
//break up file if needed
int linesPerProcess = numSeqs / processors ;
for (int j = 0; j < bestfit.size(); j++) {
+
//chops off beginning and end of sequences so they both start and end with a base
ofstream out;
string s = querySeqs[j]->getName();
-
+
openOutputFile(s, out);
out << ">" << querySeqs[j]->getName() << endl;
out << querySeqs[j]->getAligned() << endl;
openOutputFile(t, out);
out << ">" << bestfit[j]->getName() << endl;
out << bestfit[j]->getAligned() << endl;
- out.close();
+ out.close();
}
for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl;
mothurOut("Done."); mothurOutEndLine();
- //mask querys
- for (int i = 0; i < querySeqs.size(); i++) {
- //cout << querySeqs[i]->getName() << " before mask = " << querySeqs[i]->getAligned() << endl << endl;
- decalc->runMask(querySeqs[i]);
- //cout << querySeqs[i]->getName() << " after mask = " << querySeqs[i]->getAligned() << endl << endl;
- }
+ //mask sequences if the user wants to
+ if (seqMask != "") {
+ //mask querys
+ for (int i = 0; i < querySeqs.size(); i++) {
+ decalc->runMask(querySeqs[i]);
+ }
- //mask templates
- for (int i = 0; i < templateSeqs.size(); i++) {
- decalc->runMask(templateSeqs[i]);
+ //mask templates
+ for (int i = 0; i < templateSeqs.size(); i++) {
+ decalc->runMask(templateSeqs[i]);
+ }
+
+ for (int i = 0; i < bestfit.size(); i++) {
+ decalc->runMask(bestfit[i]);
+ }
+
}
-
-//for (int i = 0; i < lines.size(); i++) { cout << "line pair " << i << " = " << lines[i]->start << '\t' << lines[i]->end << endl; }
-
+
if (processors == 1) {
for (int j = 0; j < bestfit.size(); j++) {
- //cout << querySeqs[j]->getName() << " after mask = " << querySeqs[j]->getAligned() << endl << endl;
- ///cout << bestfit[j]->getName() << " after mask = " << bestfit[j]->getAligned() << endl << endl;
decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]);
}
mothurOut("Finding window breaks... "); cout.flush();
for (int i = lines[0]->start; i < lines[0]->end; i++) {
it = trimmed[i].begin();
-//cout << querySeqs[i]->getName() << '\t' << "trimmed = " << it->first << '\t' << it->second << endl;
vector<int> win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment);
windowsForeachQuery[i] = win;
}
mothurOut("Done."); mothurOutEndLine();
}else { createProcessesSpots(); }
-
if (processors == 1) {
mothurOut("Calculating observed distance... "); cout.flush();
for (int i = lines[0]->start; i < lines[0]->end; i++) {
- //cout << querySeqs[i]->getName() << '\t' << bestfit[i]->getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl;
vector<float> obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]);
- for (int j = 0; j < obsi.size(); j++) {
- //cout << obsi[j] << '\t';
- }
- //cout << endl;
obsDistance[i] = obsi;
}
mothurOut("Done."); mothurOutEndLine();
vector<float> q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile);
Qav[i] = q;
-//cout << querySeqs[i]->getName() << endl;
-for (int j = 0; j < Qav[i].size(); j++) {
- //cout << Qav[i][j] << '\t';
-}
-//cout << endl << endl;
-
}
mothurOut("Done."); mothurOutEndLine();
mothurOut("Calculating alpha... "); cout.flush();
for (int i = lines[0]->start; i < lines[0]->end; i++) {
float alpha = decalc->getCoef(obsDistance[i], Qav[i]);
-//cout << querySeqs[i]->getName() << "\tcoef = " << alpha << endl;
seqCoef[i] = alpha;
}
mothurOut("Done."); mothurOutEndLine();
for (int i = lines[0]->start; i < lines[0]->end; i++) {
float de = decalc->calcDE(obsDistance[i], expectedDistance[i]);
DE[i] = de;
-//cout << querySeqs[i]->getName() << '\t' << "de value = " << de << endl;
+
it = trimmed[i].begin();
float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second);
-//cout << querySeqs[i]->getName() << '\t' << "dist value = " << dist << endl;
deviation[i] = dist;
}
mothurOut("Done."); mothurOutEndLine();
}
else { createProcesses(); }
-
//quantiles are used to determine whether the de values found indicate a chimera
//if you have to calculate them, its time intensive because you are finding the de and deviation values for each
//combination of sequences in the template
mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
if (processors == 1) {
- quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
+ quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant);
}else { createProcessesQuan(); }
-
- decalc->removeObviousOutliers(quantiles);
+
+ //decided against this because we were having trouble setting the sensitivity... may want to revisit this...
+ //quantiles = decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
ofstream out4;
- string o = getRootName(templateFile) + "quan";
+ string o;
+
+ o = getRootName(templateFile) + "quan";
openOutputFile(o, out4);
//adjust quantiles
for (int i = 0; i < quantiles.size(); i++) {
+ vector<float> temp;
+
if (quantiles[i].size() == 0) {
//in case this is not a distance found in your template files
for (int g = 0; g < 6; g++) {
- quantiles[i].push_back(0.0);
+ temp.push_back(0.0);
}
}else{
sort(quantiles[i].begin(), quantiles[i].end());
- vector<float> temp;
//save 10%
temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)]);
//save 25%
//save 99%
temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)]);
- quantiles[i] = temp;
}
//output quan value
out4 << i+1 << '\t';
- for (int u = 0; u < quantiles[i].size(); u++) { out4 << quantiles[i][u] << '\t'; }
+ for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; }
out4 << endl;
-
+
+ quantiles[i] = temp;
+
}
+ out4.close();
+
mothurOut("Done."); mothurOutEndLine();
+
}
-
+
//free memory
for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
}
}
- seqsMatches.push_back(match);
+ //make copy so trimSeqs doesn't overtrim
+ Sequence* copy = new Sequence(match->getName(), match->getAligned());
+
+ seqsMatches.push_back(copy);
}
return seqsMatches;
//chops off beginning and end of sequences so they both start and end with a base
map<int, int> trim;
+
decalc->trimSeqs(querySeqs[j], bestfit[j], trim);
trimmed[j] = trim;
//output trimmed values
for (int i = lines[process]->start; i < lines[process]->end; i++) {
- it = trimmed[i].begin();
-
+ it = trimmed[i].begin();
out << it->first << '\t' << it->second << endl;
}
out.close();
for (int m = 0; m < size; m++) {
int front, back;
in >> front >> back;
-
+
map<int, int> t;
t[front] = back;
process++;
}else if (pid == 0){
- quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
+ quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end, makeCompliant);
//write out data to file so parent can read it
ofstream out;
//output observed distances
- for (int i = 0; i < quantiles.size(); i++) {
- out << quantiles[i].size() << '\t';
- for (int j = 0; j < quantiles[i].size(); j++) {
- out << quantiles[i][j] << '\t';
+ for (int i = 0; i < quantilesMembers.size(); i++) {
+ out << quantilesMembers[i].size() << '\t';
+ for (int j = 0; j < quantilesMembers[i].size(); j++) {
+ out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t';
}
out << endl;
}
int temp = processIDS[i];
wait(&temp);
}
-
+
//get data created by processes
for (int i=0;i<processors;i++) {
ifstream in;
string s = toString(processIDS[i]) + ".temp";
openInputFile(s, in);
- vector< vector<float> > quan; quan.resize(100);
+ vector< vector<quanMember> > quan;
+ quan.resize(100);
//get quantiles
for (int m = 0; m < quan.size(); m++) {
int num;
- in >> num;
-
- vector<float> q; float w;
+ in >> num;
+
+ gobble(in);
+
+ vector<quanMember> q; float w; int b, n;
for (int j = 0; j < num; j++) {
- in >> w;
- q.push_back(w);
+ in >> w >> b >> n;
+ //cout << w << '\t' << b << '\t' n << endl;
+ quanMember newMember(w, b, n);
+ q.push_back(newMember);
}
-
+//cout << "here" << endl;
quan[m] = q;
+//cout << "now here" << endl;
gobble(in);
}
-
+
//save quan in quantiles
- for (int i = 0; i < quan.size(); i++) {
+ for (int j = 0; j < quan.size(); j++) {
//put all values of q[i] into quan[i]
- quantiles[i].insert(quantiles[i].begin(), quan[i].begin(), quan[i].end());
+ for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); }
+ //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end());
}
in.close();
}
#else
- quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
+ quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant);
#endif
}
catch(exception& e) {
vector<float> DE; //DE[0] is the deviaation for queryseqs[0]...
vector<float> probabilityProfile;
vector< vector<float> > quantiles; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
+ vector< vector<quanMember> > quantilesMembers; //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
vector< set<int> > h;
void createProcesses();
void createProcessesQuan();
-
+ vector<float> makeCompliant; //used by decalc->getQuantiles so pintail and mallard can use same function, it contains the highest de value for each seq in the template
};