]> git.donarmstrong.com Git - mothur.git/blob - chimeraseqscommand.cpp
182c25e4b909ba2dd49fc1971beb7abee3b99188
[mothur.git] / chimeraseqscommand.cpp
1 /*
2  *  chimeraseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 6/29/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "chimeraseqscommand.h"
11
12 //***************************************************************************************************************
13
14 ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
15         try {
16                 abort = false;
17                 
18                 //allow user to run help
19                 if(option == "help") { help(); abort = true; }
20                 
21                 else {
22                         //valid paramters for this command
23                         string Array[] =  {"fasta", "filter", "correction", "processors", "method" };
24                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25                         
26                         OptionParser parser(option);
27                         map<string,string> parameters = parser.getParameters();
28                         
29                         ValidParameters validParameter;
30                         
31                         //check to make sure all parameters are valid for command
32                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
33                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
34                         }
35                         
36                         //check for required parameters
37                         fastafile = validParameter.validFile(parameters, "fasta", true);
38                         if (fastafile == "not open") { abort = true; }
39                         else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true;  }     
40                         
41                         string temp;
42                         temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "F"; }
43                         filter = isTrue(temp);
44                         
45                         temp = validParameter.validFile(parameters, "correction", false);               if (temp == "not found") { temp = "T"; }
46                         correction = isTrue(temp);
47                         
48                         temp = validParameter.validFile(parameters, "processors", true);                if (temp == "not found") { temp = "1"; }
49                         convert(temp, processors);
50                         
51                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "bellerophon"; }
52
53                 }
54         }
55         catch(exception& e) {
56                 errorOut(e, "ChimeraSeqsCommand", "ChimeraSeqsCommand");
57                 exit(1);
58         }
59 }
60 //**********************************************************************************************************************
61
62 void ChimeraSeqsCommand::help(){
63         try {
64                 mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n");
65                 mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method.  fasta is required.\n");
66                 mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter.  The default is false. \n");
67                 mothurOut("The correction parameter allows you to .....  The default is true. \n");
68                 mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
69                 mothurOut("The method parameter allows you to specify the method for finding chimeric sequences.  The default is bellerophon. \n");
70                 mothurOut("The chimera.seqs command should be in the following format: \n");
71                 mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
72                 mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n");
73                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");        
74         }
75         catch(exception& e) {
76                 errorOut(e, "ChimeraSeqsCommand", "help");
77                 exit(1);
78         }
79 }
80
81 //***************************************************************************************************************
82
83 ChimeraSeqsCommand::~ChimeraSeqsCommand(){      /*      do nothing      */      }
84
85 //***************************************************************************************************************
86
87 int ChimeraSeqsCommand::execute(){
88         try{
89                 
90                 if (abort == true) { return 0; }
91                 
92                 //do soft filter
93                 if (filter)  {
94                         string optionString = "fasta=" + fastafile + ", soft=50.0, vertical=F";
95                         filterSeqs = new FilterSeqsCommand(optionString);
96                         filterSeqs->execute();
97                         delete filterSeqs;
98                         
99                         //reset fastafile to filtered file
100                         fastafile = getRootName(fastafile) + "filter.fasta";
101                 }
102                 
103                 //read in sequences
104                 readSeqs();
105                 
106                 //int numSeqs = seqs.size();
107                 
108                 //find average midpoint of seqs
109                 midpoint = findAverageMidPoint();
110                 
111                 //create 2 vectors of sequences, 1 for left side and one for right side
112                 vector<Sequence> left;  vector<Sequence> right;
113                 
114                 for (int i = 0; i < seqs.size(); i++) {
115                         //save left side
116                         string seqLeft = seqs[i].getAligned();
117                         seqLeft = seqLeft.substr(0, midpoint);
118                         Sequence tempLeft(seqs[i].getName(), seqLeft);
119                         left.push_back(tempLeft);
120                         
121                         //save right side
122                         string seqRight = seqs[i].getAligned();
123                         seqRight = seqRight.substr(midpoint+1, (seqRight.length()-midpoint-1));
124                         Sequence tempRight(seqs[i].getName(), seqRight);
125                         right.push_back(tempRight);
126                 }
127                                 
128                 //this should be parallelized
129                 //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
130                 //create a matrix containing the distance from left to left and right to right
131                 //calculate distances
132                 SparseMatrix* SparseLeft = new SparseMatrix();
133                 SparseMatrix* SparseRight = new SparseMatrix();
134                 
135                 createSparseMatrix(0, left.size(), SparseLeft, left);
136                 createSparseMatrix(0, right.size(), SparseRight, right);
137                 
138                 
139                 //vector<SeqMap> distMapRight;
140                 //vector<SeqMap> distMapLeft;
141                 
142                 // Create a data structure to quickly access the distance information.
143                 // It consists of a vector of distance maps, where each map contains
144                 // all distances of a certain sequence. Vector and maps are accessed
145                 // via the index of a sequence in the distance matrix
146                 //distMapRight = vector<SeqMap>(globaldata->gListVector->size()); 
147                 //distMapLeft = vector<SeqMap>(globaldata->gListVector->size()); 
148                 for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
149                         //distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
150                 }
151                 for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
152                         //distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
153                 }
154
155                 
156                 //fill preference structure
157                 //generatePreferences(distMapLeft, distMapRight);
158                 
159                                 
160                 //output results to screen                                              
161                 mothurOutEndLine();
162                 mothurOut("\t\t"); mothurOutEndLine();
163                 //mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); mothurOutEndLine();
164                 //mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25])); mothurOutEndLine();
165                 //mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25])); mothurOutEndLine();
166                 //mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50])); mothurOutEndLine();
167                 //mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75])); mothurOutEndLine();
168                 //mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5])); mothurOutEndLine();
169                 //mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100])); mothurOutEndLine();
170                 //mothurOut("# of Seqs:\t" + toString(numSeqs)); mothurOutEndLine();
171                 
172                 //outSummary.close();
173                 return 0;
174         }
175         catch(exception& e) {
176                 errorOut(e, "ChimeraSeqsCommand", "execute");
177                 exit(1);
178         }
179 }
180
181 //***************************************************************************************************************
182 void ChimeraSeqsCommand::readSeqs(){
183         try {
184                 ifstream inFASTA;
185                 openInputFile(fastafile, inFASTA);
186                 
187                 //read in seqs and store in vector
188                 while(!inFASTA.eof()){
189                         Sequence current(inFASTA);
190                         
191                         seqs.push_back(current);
192                         
193                         gobble(inFASTA);
194                 }
195                 inFASTA.close();
196
197         }
198         catch(exception& e) {
199                 errorOut(e, "ChimeraSeqsCommand", "readSeqs");
200                 exit(1);
201         }
202 }
203
204
205 //***************************************************************************************************************
206 int ChimeraSeqsCommand::findAverageMidPoint(){
207         try {
208                 int totalMids = 0;
209                 int averageMid = 0;
210                 
211                 //loop through the seqs and find midpoint
212                 for (int i = 0; i < seqs.size(); i++) {
213                         
214                         //get unaligned sequence
215                         seqs[i].setUnaligned(seqs[i].getUnaligned());  //if you read an aligned file the unaligned is really aligned, so we need to make sure its unaligned
216                         
217                         string unaligned = seqs[i].getUnaligned();
218                         string aligned = seqs[i].getAligned();
219                         
220                         //find midpoint of this seq
221                         int count = 0;
222                         int thismid = 0;
223                         for (int j = 0; j < aligned.length(); j++) {
224                                 
225                                 thismid++;
226                                 
227                                 //if you are part of the unaligned sequence increment
228                                 if (isalpha(aligned[j])) {  count++;  }
229                                 
230                                 //if you have reached the halfway point stop
231                                 if (count >= (unaligned.length() / 2)) { break; }
232                         }
233                         
234                         //add this mid to total
235                         totalMids += thismid;
236                 
237                 }
238                 
239                 averageMid = (totalMids / seqs.size());
240                 
241                 return averageMid; 
242         
243         
244         }
245         catch(exception& e) {
246                 errorOut(e, "ChimeraSeqsCommand", "findAverageMidPoint");
247                 exit(1);
248         }
249 }
250
251 /***************************************************************************************************************/
252 int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
253         try {
254
255                 for(int i=startSeq; i<endSeq; i++){
256                         
257                         for(int j=0;j<i;j++){
258                         
259                                 //distCalculator->calcDist(s.get(i), s.get(j));
260                                 float dist = distCalculator->getDist();
261                                 
262                                 PCell temp(i, j, dist);
263                                 sparse->addCell(temp);
264                                 
265                         }
266                 }
267                         
268         
269                 return 1;
270         }
271         catch(exception& e) {
272                 errorOut(e, "ChimeraSeqsCommand", "createSparseMatrix");
273                 exit(1);
274         }
275 }
276 /***************************************************************************************************************
277 void ChimeraSeqsCommand::generatePreferences(vector<SeqMap> left, vector<SeqMap> right){
278         try {
279
280                 for (int i = 0; i < left.size(); i++) {
281                         
282                         int iscore = 0;
283                         float closestLeft = 100000.0;
284                         float closestRight = 100000.0;
285                         
286                         for (int j = 0; j < left.size(); j++) {
287                                 
288                                 //iscore += abs(left
289                         
290                         }
291                 
292                 }
293
294         }
295         catch(exception& e) {
296                 errorOut(e, "ChimeraSeqsCommand", "generatePreferences");
297                 exit(1);
298         }
299 }
300 /**************************************************************************************************/
301
302 /**************************************************************************************************/
303