]> git.donarmstrong.com Git - mothur.git/blob - preclustercommand.cpp
precluster command finished
[mothur.git] / preclustercommand.cpp
1 /*
2  *  preclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/21/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "preclustercommand.h"
11
12 //**********************************************************************************************************************
13 inline bool comparePriority(seqPNode first, seqPNode second) {  return (first.numIdentical > second.numIdentical); }
14 //**********************************************************************************************************************
15
16 PreClusterCommand::PreClusterCommand(string option){
17         try {
18                 abort = false;
19                 
20                 //allow user to run help
21                 if(option == "help") { help(); abort = true; }
22                 
23                 else {
24                         //valid paramters for this command
25                         string Array[] =  {"fasta", "name", "diffs"};
26                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27                         
28                         OptionParser parser(option);
29                         map<string, string> parameters = parser.getParameters();
30                         
31                         ValidParameters validParameter;
32                 
33                         //check to make sure all parameters are valid for command
34                         for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
35                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
36                         }
37                         
38                         //check for required parameters
39                         fastafile = validParameter.validFile(parameters, "fasta", true);
40                         if (fastafile == "not found") { mothurOut("fasta is a required parameter for the pre.cluster command."); mothurOutEndLine(); abort = true; }
41                         else if (fastafile == "not open") { abort = true; }     
42                         
43                         //check for optional parameter and set defaults
44                         // ...at some point should added some additional type checking...
45                         namefile = validParameter.validFile(parameters, "name", true);
46                         if (namefile == "not found") { namefile =  "";  }
47                         else if (namefile == "not open") { abort = true; }      
48                         else {  readNameFile();  }
49                         
50                         string temp     = validParameter.validFile(parameters, "diffs", false);                         if(temp == "not found"){        temp = "1"; }
51                         convert(temp, diffs); 
52                 }
53                                 
54         }
55         catch(exception& e) {
56                 errorOut(e, "PreClusterCommand", "PreClusterCommand");
57                 exit(1);
58         }
59 }
60
61 //**********************************************************************************************************************
62 PreClusterCommand::~PreClusterCommand(){}       
63 //**********************************************************************************************************************
64
65 void PreClusterCommand::help(){
66         try {
67                 mothurOut("The pre.cluster command groups sequences that are within a given number of base mismatches.\n");
68                 mothurOut("The pre.cluster command outputs a new fasta and name file.\n");
69                 mothurOut("The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n");
70                 mothurOut("The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n");
71                 mothurOut("The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n");
72                 mothurOut("The pre.cluster command should be in the following format: \n");
73                 mothurOut("pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n");
74                 mothurOut("Example pre.cluster(fasta=amazon.fasta, diffs=2).\n");
75                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
76         }
77         catch(exception& e) {
78                 errorOut(e, "PreClusterCommand", "help");
79                 exit(1);
80         }
81 }
82 //**********************************************************************************************************************
83
84 int PreClusterCommand::execute(){
85         try {
86                 
87                 if (abort == true) { return 0; }
88                 
89                 //reads fasta file and return number of seqs
90                 int numSeqs = readSeqs(); //fills alignSeqs and makes all seqs active
91         
92                 if (numSeqs == 0) { mothurOut("Error reading fasta file...please correct."); mothurOutEndLine(); return 0;  }
93                 if (diffs > length) { mothurOut("Error: diffs is greater than your sequence length."); mothurOutEndLine(); return 0;  }
94                 
95                 //clear sizes since you only needed this info to build the alignSeqs seqPNode structs
96                 sizes.clear();
97         
98                 //sort seqs by number of identical seqs
99                 sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
100         
101                 //go through active list and cluster everthing you can, until all nodes are inactive
102                 //taking advantage of the fact that maps are already sorted
103                 map<string, bool>::iterator itActive;
104                 map<string, bool>::iterator it2Active;
105                 int count = 0;
106                 
107                 for (int i = 0; i < alignSeqs.size(); i++) {
108                 
109                         //are you active
110                         itActive = active.find(alignSeqs[i].seq.getName());
111                         
112                         if (itActive != active.end()) {  //this sequence has not been merged yet
113                         
114                                 //try to merge it with all smaller seqs
115                                 for (int j = i; j < alignSeqs.size(); j++) {
116                                         
117                                         if (i != j) {
118                                                 //are you active
119                                                 it2Active = active.find(alignSeqs[j].seq.getName());
120                                                 if (it2Active != active.end()) {  //this sequence has not been merged yet
121                                                         //are you within "diff" bases
122                                                         int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
123                                                         
124                                                         if (mismatch <= diffs) {
125                                                                 //merge
126                                                                 names[alignSeqs[i].seq.getName()] += "," + names[alignSeqs[j].seq.getName()];
127                                                         
128                                                                 //remove from active list
129                                                                 active.erase(it2Active);
130                                                                 
131                                                                 //set numIdentical to 0, so you only print the representative seqs in the fasta file
132                                                                 alignSeqs[j].numIdentical = 0;
133                                                                 count++;
134                                                         }
135                                                 }//end if j active
136                                         }//end if i != j
137                                 }//end for loop
138                                 
139                                 //remove from active list 
140                                 active.erase(itActive);
141                         }//end if active i
142                 }
143         
144                 string newFastaFile = getRootName(fastafile) + "precluster" + getExtension(fastafile);
145                 string newNamesFile = getRootName(fastafile) + "precluster.names";
146                 
147                 printData(newFastaFile, newNamesFile);
148                 
149                 mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); mothurOutEndLine();
150                 mothurOut("pre.cluster removed " + toString(count) + " sequences."); mothurOutEndLine(); 
151                 
152                 return 0;
153                 
154         }
155         catch(exception& e) {
156                 errorOut(e, "PreClusterCommand", "execute");
157                 exit(1);
158         }
159 }
160 /**************************************************************************************************/
161 int PreClusterCommand::readSeqs(){
162         try {
163                 ifstream inFasta;
164                 openInputFile(fastafile, inFasta);
165                 length = 0;
166                 map<string, string>::iterator it;
167                                 
168                 while (!inFasta.eof()) {
169                         Sequence temp(inFasta);  //read seq
170                         
171                         if (temp.getName() != "") {  
172                                 if (namefile != "") {
173                                         //make sure fasta and name files match
174                                         it = names.find(temp.getName());
175                                         if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); }
176                                 }else { sizes[temp.getName()] = 1; }
177                                 
178                                 seqPNode tempNode(sizes[temp.getName()], temp);
179                                 alignSeqs.push_back(tempNode); 
180                                 active[temp.getName()] = true;
181                         }
182                         gobble(inFasta);
183                 }
184                 inFasta.close();
185                 
186                 if (alignSeqs.size() != 0) {  length = alignSeqs[0].seq.getAligned().length();  }
187                 
188                 return alignSeqs.size();
189         }
190         catch(exception& e) {
191                 errorOut(e, "PreClusterCommand", "readSeqs");
192                 exit(1);
193         }
194 }
195 /**************************************************************************************************/
196 void PreClusterCommand::readNameFile(){
197         try {
198                 ifstream in;
199                 openInputFile(namefile, in);
200                 string firstCol, secondCol;
201                                 
202                 while (!in.eof()) {
203                         in >> firstCol >> secondCol; gobble(in);
204                         names[firstCol] = secondCol;
205                         int size = 1;
206                         while (secondCol.find_first_of(',') != -1) { 
207                                 size++;
208                                 secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length());
209                         }
210                         sizes[firstCol] = size;
211                 }
212                 in.close();
213         }
214         catch(exception& e) {
215                 errorOut(e, "PreClusterCommand", "readNameFile");
216                 exit(1);
217         }
218 }
219 /**************************************************************************************************/
220 int PreClusterCommand::calcMisMatches(string seq1, string seq2){
221         try {
222                 int numBad = 0;
223                 
224                 for (int i = 0; i < seq1.length(); i++) {
225                         //do they match
226                         if (seq1[i] != seq2[i]) { numBad++; }
227                         if (numBad > diffs) { return length;  } //to far to cluster
228                 }
229                 
230                 return numBad;
231         }
232         catch(exception& e) {
233                 errorOut(e, "PreClusterCommand", "calcMisMatches");
234                 exit(1);
235         }
236 }
237 /**************************************************************************************************/
238 void PreClusterCommand::printData(string newfasta, string newname){
239         try {
240                 ofstream outFasta;
241                 ofstream outNames;
242                 openOutputFile(newfasta, outFasta);
243                 openOutputFile(newname, outNames);
244                                 
245                 map<string, string>::iterator itNames;
246                 
247                 for (int i = 0; i < alignSeqs.size(); i++) {
248                         if (alignSeqs[i].numIdentical != 0) {
249                                 alignSeqs[i].seq.printSequence(outFasta); 
250                                 
251                                 itNames = names.find(alignSeqs[i].seq.getName());
252                                 
253                                 outNames << itNames->first << '\t' << itNames->second << endl;
254                         }
255                 }
256                 
257                 outFasta.close();
258                 outNames.close();
259                 
260         }
261         catch(exception& e) {
262                 errorOut(e, "PreClusterCommand", "printData");
263                 exit(1);
264         }
265 }
266
267 /**************************************************************************************************/
268
269