8 * Created by westcott on 8/24/09.
9 * Copyright 2009 Schloss LAB. All rights reserved.
17 /***********************************************************/
18 //This class was created using the algorithms described in the
19 // "Evaluating putative chimeric sequences from PCR-amplified products" paper
20 //by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez.
22 /***********************************************************/
24 class Ccode : public Chimera { //Chimera subclass; the members below hold the per-query working state (windows, distance sums, variances, test results)
27 Ccode(string, string, bool, string, int, int, string); //arguments in order: fasta, template, filter, mask, window, numWanted, outputDir
30 int getChimeras(Sequence* query); //NOTE(review): presumably runs the chimera check for query; the meaning of the int return is not visible in this header - confirm in the implementation
31 Sequence* print(ostream&, ostream&); //NOTE(review): presumably writes results for the current query to the two streams; ownership of the returned Sequence* is not visible here - confirm
34 Sequence* print(MPI_File&, MPI_File&); //overload of print taking MPI_File handles instead of ostreams; NOTE(review): presumably only compiled in MPI builds - confirm
40 int iters, window, numWanted; //window and numWanted match the constructor arguments by name; NOTE(review): iters is not described in this header - confirm its use
41 string fastafile, mapInfo; //NOTE(review): fastafile presumably holds the fasta constructor argument and mapInfo presumably relates to printMapping - confirm
45 map<int, int> spotMap; //NOTE(review): int-to-int map; what the keys and values denote is not visible in this header - confirm
46 map<int, int>::iterator it; //member iterator over a map<int, int>; NOTE(review): presumably scratch for walking spotMap or trim - confirm
48 vector<int> windows; //windows is the vector of window breaks for query
49 int windowSizes; //windowSizes is the size of the windows for query
50 map<int, int> trim; //trim is the map containing the starting and ending positions for query
51 vector<SeqDist> closest; //closest is the vector of sequences that are closest to query
52 vector<float> averageRef; //averageRef is the average distance at each window among the references for query
53 vector<float> averageQuery; //averageQuery is the average distance at each window for the query
54 vector<float> sumRef; //sumRef is the sum of distances at each window among the references for query
55 vector<float> sumSquaredRef; //sumSquaredRef is the sum of squared distances at each window among the references for query
56 vector<float> sumQuery; //sumQuery is the sum of distances at each window for the comparison of query to the references
57 vector<float> sumSquaredQuery; //sumSquaredQuery is the sum of squared distances at each window for the comparison of query to the references
58 vector<float> varRef; //varRef is the variance among reference seqs at each window for query
59 vector<float> varQuery; //varQuery is the variance among references and query at each window
60 vector<float> sdRef; //sdRef is the standard deviation of reference seqs at each window for query
61 vector<float> sdQuery; //sdQuery is the standard deviation of references and query at each window
62 vector<float> anova; //anova is the vector of anova scores for each window for query
63 int refCombo; //refCombo is the number of reference sequence combinations for query
64 vector<bool> isChimericConfidence; //isChimericConfidence indicates whether query is chimeric at a given window according to the confidence limits
65 vector<bool> isChimericTStudent; //isChimericTStudent indicates whether query is chimeric at a given window; NOTE(review): presumably according to a Student's t test (going by the name) - confirm in determineChimeras
66 vector<bool> isChimericANOVA; //isChimericANOVA indicates whether query is chimeric at a given window; NOTE(review): presumably according to the anova scores (going by the name) - confirm in determineChimeras
68 vector<SeqDist> findClosest(Sequence*, int); //NOTE(review): presumably returns the reference sequences closest to the given query; the meaning of the int argument is not visible here (possibly numWanted) - confirm
69 void removeBadReferenceSeqs(vector<SeqDist>&); //removes sequences from closest that are too different or too similar to each other.
70 void trimSequences(Sequence*); //NOTE(review): presumably fills trim for the given query - confirm
71 vector<int> findWindows(); //NOTE(review): presumably returns the window breaks that are stored in windows - confirm
72 void getAverageRef(vector<SeqDist>); //fills sumRef, averageRef, sumSquaredRef and refCombo.
73 void getAverageQuery (vector<SeqDist>, Sequence*); //fills sumQuery, averageQuery, sumSquaredQuery.
74 void findVarianceRef (); //fills varRef and sdRef; also sets a minimum error rate of 0.001 to avoid dividing by 0.
75 void findVarianceQuery (); //fills varQuery and sdQuery
76 void determineChimeras (); //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA.
78 int getDiff(string, string); //returns the number of mismatched bases; a gap against a base is not counted as a mismatch
83 int printMapping(string&); //takes a string by non-const reference; NOTE(review): what it writes and what the int return means are not visible in this header - confirm
89 /***********************************************************/