2 * preclustercommand.cpp
5 * Created by westcott on 12/21/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "preclustercommand.h"
11 #include "deconvolutecommand.h"
13 //**********************************************************************************************************************
14 vector<string> PreClusterCommand::setParameters(){
16 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta-name",false,true,true); parameters.push_back(pfasta);
17 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
18 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
19 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
20 CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pdiffs);
21 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
22 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
23 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
25 vector<string> myArray;
26 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
30 m->errorOut(e, "PreClusterCommand", "setParameters");
34 //**********************************************************************************************************************
35 string PreClusterCommand::getHelpString(){
37 string helpString = "";
38 helpString += "The pre.cluster command groups sequences that are within a given number of base mismatches.\n";
39 helpString += "The pre.cluster command outputs a new fasta and name file.\n";
40 helpString += "The pre.cluster command parameters are fasta, name, group, count, processors and diffs. The fasta parameter is required. \n";
41 helpString += "The name parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
42 helpString += "The group parameter allows you to provide a group file so you can cluster by group. \n";
43 helpString += "The count parameter allows you to provide a count file so you can cluster by group. \n";
44 helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n";
45 helpString += "The pre.cluster command should be in the following format: \n";
46 helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n";
47 helpString += "Example pre.cluster(fasta=amazon.fasta, diffs=2).\n";
48 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
52 m->errorOut(e, "PreClusterCommand", "getHelpString");
56 //**********************************************************************************************************************
57 string PreClusterCommand::getOutputPattern(string type) {
61 if (type == "fasta") { pattern = "[filename],precluster,[extension]"; }
62 else if (type == "name") { pattern = "[filename],precluster.names"; }
63 else if (type == "count") { pattern = "[filename],precluster.count_table"; }
64 else if (type == "map") { pattern = "[filename],precluster.map"; }
65 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
70 m->errorOut(e, "PreClusterCommand", "getOutputPattern");
74 //**********************************************************************************************************************
75 PreClusterCommand::PreClusterCommand(){
77 abort = true; calledHelp = true;
79 vector<string> tempOutNames;
80 outputTypes["fasta"] = tempOutNames;
81 outputTypes["name"] = tempOutNames;
82 outputTypes["count"] = tempOutNames;
83 outputTypes["map"] = tempOutNames;
86 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
90 //**********************************************************************************************************************
92 PreClusterCommand::PreClusterCommand(string option) {
94 abort = false; calledHelp = false;
96 //allow user to run help
97 if(option == "help") { help(); abort = true; calledHelp = true; }
98 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
101 vector<string> myArray = setParameters();
103 OptionParser parser(option);
104 map<string, string> parameters = parser.getParameters();
106 ValidParameters validParameter;
107 map<string, string>::iterator it;
109 //check to make sure all parameters are valid for command
110 for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) {
111 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) { abort = true; }
114 //initialize outputTypes
115 vector<string> tempOutNames;
116 outputTypes["fasta"] = tempOutNames;
117 outputTypes["name"] = tempOutNames;
118 outputTypes["map"] = tempOutNames;
119 outputTypes["count"] = tempOutNames;
121 //if the user changes the input directory command factory will send this info to us in the output parameter
122 string inputDir = validParameter.validFile(parameters, "inputdir", false);
123 if (inputDir == "not found"){ inputDir = ""; }
126 it = parameters.find("fasta");
127 //user has given a template file
128 if(it != parameters.end()){
129 path = m->hasPath(it->second);
130 //if the user has not given a path then, add inputdir. else leave path alone.
131 if (path == "") { parameters["fasta"] = inputDir + it->second; }
134 it = parameters.find("name");
135 //user has given a template file
136 if(it != parameters.end()){
137 path = m->hasPath(it->second);
138 //if the user has not given a path then, add inputdir. else leave path alone.
139 if (path == "") { parameters["name"] = inputDir + it->second; }
142 it = parameters.find("group");
143 //user has given a template file
144 if(it != parameters.end()){
145 path = m->hasPath(it->second);
146 //if the user has not given a path then, add inputdir. else leave path alone.
147 if (path == "") { parameters["group"] = inputDir + it->second; }
150 it = parameters.find("count");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["count"] = inputDir + it->second; }
159 //check for required parameters
160 fastafile = validParameter.validFile(parameters, "fasta", true);
161 if (fastafile == "not found") {
162 fastafile = m->getFastaFile();
163 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
164 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
166 else if (fastafile == "not open") { abort = true; }
167 else { m->setFastaFile(fastafile); }
169 //if the user changes the output directory command factory will send this info to us in the output parameter
170 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
172 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it
175 //check for optional parameter and set defaults
176 // ...at some point should added some additional type checking...
177 namefile = validParameter.validFile(parameters, "name", true);
178 if (namefile == "not found") { namefile = ""; }
179 else if (namefile == "not open") { namefile = ""; abort = true; }
180 else { m->setNameFile(namefile); }
182 groupfile = validParameter.validFile(parameters, "group", true);
183 if (groupfile == "not found") { groupfile = ""; bygroup = false; }
184 else if (groupfile == "not open") { abort = true; groupfile = ""; }
185 else { m->setGroupFile(groupfile); bygroup = true; }
187 countfile = validParameter.validFile(parameters, "count", true);
188 if (countfile == "not found") { countfile = ""; }
189 else if (countfile == "not open") { abort = true; countfile = ""; }
191 m->setCountTableFile(countfile);
192 ct.readTable(countfile);
193 if (ct.hasGroupInfo()) { bygroup = true; }
194 else { bygroup = false; }
197 if ((namefile != "") && (countfile != "")) {
198 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
201 if ((groupfile != "") && (countfile != "")) {
202 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
206 string temp = validParameter.validFile(parameters, "diffs", false); if(temp == "not found"){ temp = "1"; }
207 m->mothurConvert(temp, diffs);
209 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
210 m->setProcessors(temp);
211 m->mothurConvert(temp, processors);
213 if (countfile == "") {
214 if (namefile == "") {
215 vector<string> files; files.push_back(fastafile);
216 parser.getNameFile(files);
222 catch(exception& e) {
223 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
227 //**********************************************************************************************************************
229 int PreClusterCommand::execute(){
232 if (abort == true) { if (calledHelp) { return 0; } return 2; }
234 int start = time(NULL);
236 string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
237 map<string, string> variables;
238 variables["[filename]"] = fileroot;
239 string newNamesFile = getOutputFileName("name",variables);
240 string newCountFile = getOutputFileName("count",variables);
241 string newMapFile = getOutputFileName("map",variables); //add group name if by group
242 variables["[extension]"] = m->getExtension(fastafile);
243 string newFastaFile = getOutputFileName("fasta", variables);
244 outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
245 if (countfile == "") { outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); }
246 else { outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); }
249 //clear out old files
250 ofstream outFasta; m->openOutputFile(newFastaFile, outFasta); outFasta.close();
251 ofstream outNames; m->openOutputFile(newNamesFile, outNames); outNames.close();
252 newMapFile = fileroot + "precluster.";
254 //parse fasta and name file by group
255 vector<string> groups;
256 if (countfile != "") {
257 cparser = new SequenceCountParser(countfile, fastafile);
258 groups = cparser->getNamesOfGroups();
260 if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile); }
261 else { parser = new SequenceParser(groupfile, fastafile); }
262 groups = parser->getNamesOfGroups();
265 if(processors == 1) { driverGroups(newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); }
266 else { createProcessesGroups(newFastaFile, newNamesFile, newMapFile, groups); }
268 if (countfile != "") {
269 mergeGroupCounts(newCountFile, newNamesFile, newFastaFile);
273 //run unique.seqs for deconvolute results
274 string inputString = "fasta=" + newFastaFile;
275 if (namefile != "") { inputString += ", name=" + newNamesFile; }
276 m->mothurOutEndLine();
277 m->mothurOut("/******************************************/"); m->mothurOutEndLine();
278 m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
279 m->mothurCalling = true;
281 Command* uniqueCommand = new DeconvoluteCommand(inputString);
282 uniqueCommand->execute();
284 map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
286 delete uniqueCommand;
287 m->mothurCalling = false;
288 m->mothurOut("/******************************************/"); m->mothurOutEndLine();
290 m->renameFile(filenames["fasta"][0], newFastaFile);
291 m->renameFile(filenames["name"][0], newNamesFile);
293 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
294 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run pre.cluster."); m->mothurOutEndLine();
297 if (namefile != "") { readNameFile(); }
299 //reads fasta file and return number of seqs
300 int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
302 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
304 if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0; }
305 if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0; }
307 int count = process(newMapFile);
308 outputNames.push_back(newMapFile); outputTypes["map"].push_back(newMapFile);
310 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
312 m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
313 m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
314 if (countfile != "") { newNamesFile = newCountFile; }
315 printData(newFastaFile, newNamesFile, "");
317 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
320 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
322 m->mothurOutEndLine();
323 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
324 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
325 m->mothurOutEndLine();
327 //set fasta file as new current fastafile
329 itTypes = outputTypes.find("fasta");
330 if (itTypes != outputTypes.end()) {
331 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
334 itTypes = outputTypes.find("name");
335 if (itTypes != outputTypes.end()) {
336 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
339 itTypes = outputTypes.find("count");
340 if (itTypes != outputTypes.end()) {
341 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
347 catch(exception& e) {
348 m->errorOut(e, "PreClusterCommand", "execute");
352 /**************************************************************************************************/
353 int PreClusterCommand::createProcessesGroups(string newFName, string newNName, string newMFile, vector<string> groups) {
356 vector<int> processIDS;
361 if (groups.size() < processors) { processors = groups.size(); }
363 //divide the groups between the processors
364 vector<linePair> lines;
365 int numGroupsPerProcessor = groups.size() / processors;
366 for (int i = 0; i < processors; i++) {
367 int startIndex = i * numGroupsPerProcessor;
368 int endIndex = (i+1) * numGroupsPerProcessor;
369 if(i == (processors - 1)){ endIndex = groups.size(); }
370 lines.push_back(linePair(startIndex, endIndex));
373 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
375 //loop through and create all the processes you want
376 while (process != processors) {
380 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
384 num = driverGroups(newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
386 string tempFile = toString(getpid()) + ".outputNames.temp";
388 m->openOutputFile(tempFile, outTemp);
390 outTemp << outputNames.size();
391 for (int i = 0; i < outputNames.size(); i++) { outTemp << outputNames[i] << endl; }
396 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
397 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
403 num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
405 //force parent to wait until all the processes are done
406 for (int i=0;i<processIDS.size();i++) {
407 int temp = processIDS[i];
411 for (int i = 0; i < processIDS.size(); i++) {
412 string tempFile = toString(processIDS[i]) + ".outputNames.temp";
414 m->openInputFile(tempFile, intemp);
418 for (int k = 0; k < num; k++) {
420 intemp >> name; m->gobble(intemp);
422 outputNames.push_back(name); outputTypes["map"].push_back(name);
425 m->mothurRemove(tempFile);
429 //////////////////////////////////////////////////////////////////////////////////////////////////////
430 //Windows version shared memory, so be careful when passing variables through the preClusterData struct.
431 //Above fork() will clone, so memory is separate, but that's not the case with windows,
432 //////////////////////////////////////////////////////////////////////////////////////////////////////
434 vector<preClusterData*> pDataArray;
435 DWORD dwThreadIdArray[processors-1];
436 HANDLE hThreadArray[processors-1];
438 //Create processor worker threads.
439 for( int i=1; i<processors; i++ ){
440 // Allocate memory for thread data.
441 string extension = toString(i) + ".temp";
443 preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, countfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, i);
444 pDataArray.push_back(tempPreCluster);
445 processIDS.push_back(i);
447 //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
448 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
449 hThreadArray[i-1] = CreateThread(NULL, 0, MyPreclusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
453 //using the main process as a worker saves time and memory
454 num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
456 //Wait until all threads have terminated.
457 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
459 //Close all thread handles and free memory allocations.
460 for(int i=0; i < pDataArray.size(); i++){
461 for (int j = 0; j < pDataArray[i]->mapFileNames.size(); j++) {
462 outputNames.push_back(pDataArray[i]->mapFileNames[j]); outputTypes["map"].push_back(pDataArray[i]->mapFileNames[j]);
464 CloseHandle(hThreadArray[i]);
465 delete pDataArray[i];
470 //append output files
471 for(int i=0;i<processIDS.size();i++){
472 //newFName = m->getFullPathName(".\\" + newFName);
473 //newNName = m->getFullPathName(".\\" + newNName);
475 m->appendFiles((newFName + toString(processIDS[i]) + ".temp"), newFName);
476 m->mothurRemove((newFName + toString(processIDS[i]) + ".temp"));
478 m->appendFiles((newNName + toString(processIDS[i]) + ".temp"), newNName);
479 m->mothurRemove((newNName + toString(processIDS[i]) + ".temp"));
485 catch(exception& e) {
486 m->errorOut(e, "PreClusterCommand", "createProcessesGroups");
490 /**************************************************************************************************/
491 int PreClusterCommand::driverGroups(string newFFile, string newNFile, string newMFile, int start, int end, vector<string> groups){
496 //precluster each group
497 for (int i = start; i < end; i++) {
501 if (m->control_pressed) { return 0; }
503 m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
505 map<string, string> thisNameMap;
506 vector<Sequence> thisSeqs;
507 if (groupfile != "") {
508 thisSeqs = parser->getSeqs(groups[i]);
509 }else if (countfile != "") {
510 thisSeqs = cparser->getSeqs(groups[i]);
512 if (namefile != "") { thisNameMap = parser->getNameMap(groups[i]); }
514 //fill alignSeqs with this groups info.
515 numSeqs = loadSeqs(thisNameMap, thisSeqs, groups[i]);
517 if (m->control_pressed) { return 0; }
519 if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); m->control_pressed = true; return 0; }
521 int count= process(newMFile+groups[i]+".map");
522 outputNames.push_back(newMFile+groups[i]+".map"); outputTypes["map"].push_back(newMFile+groups[i]+".map");
524 if (m->control_pressed) { return 0; }
526 m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
527 m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
528 printData(newFFile, newNFile, groups[i]);
530 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
536 catch(exception& e) {
537 m->errorOut(e, "PreClusterCommand", "driverGroups");
541 /**************************************************************************************************/
542 int PreClusterCommand::process(string newMapFile){
545 m->openOutputFile(newMapFile, out);
547 //sort seqs by number of identical seqs
548 sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
551 int numSeqs = alignSeqs.size();
553 //think about running through twice...
554 for (int i = 0; i < numSeqs; i++) {
557 // itActive = active.find(alignSeqs[i].seq.getName());
559 if (alignSeqs[i].active) { //this sequence has not been merged yet
561 string chunk = alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(0) + "\t" + alignSeqs[i].seq.getAligned() + "\n";
563 //try to merge it with all smaller seqs
564 for (int j = i+1; j < numSeqs; j++) {
566 if (m->control_pressed) { out.close(); return 0; }
568 if (alignSeqs[j].active) { //this sequence has not been merged yet
569 //are you within "diff" bases
570 int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
572 if (mismatch <= diffs) {
574 alignSeqs[i].names += ',' + alignSeqs[j].names;
575 alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
577 chunk += alignSeqs[j].seq.getName() + "\t" + toString(alignSeqs[j].numIdentical) + "\t" + toString(mismatch) + "\t" + alignSeqs[j].seq.getAligned() + "\n";
579 alignSeqs[j].active = 0;
580 alignSeqs[j].numIdentical = 0;
586 //remove from active list
587 alignSeqs[i].active = 0;
589 out << "ideal_seq_" << (i+1) << '\t' << alignSeqs[i].numIdentical << endl << chunk << endl;;
592 if(i % 100 == 0) { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
596 if(numSeqs % 100 != 0) { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
601 catch(exception& e) {
602 m->errorOut(e, "PreClusterCommand", "process");
606 /**************************************************************************************************/
607 int PreClusterCommand::readFASTA(){
612 m->openInputFile(fastafile, inFasta);
615 while (!inFasta.eof()) {
617 if (m->control_pressed) { inFasta.close(); return 0; }
619 Sequence seq(inFasta); m->gobble(inFasta);
621 if (seq.getName() != "") { //can get "" if commented line is at end of fasta file
622 if (namefile != "") {
623 itSize = sizes.find(seq.getName());
625 if (itSize == sizes.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); exit(1); }
627 seqPNode tempNode(itSize->second, seq, names[seq.getName()]);
628 alignSeqs.push_back(tempNode);
629 lengths.insert(seq.getAligned().length());
631 }else { //no names file, you are identical to yourself
633 if (countfile != "") { numRep = ct.getNumSeqs(seq.getName()); }
634 seqPNode tempNode(numRep, seq, seq.getName());
635 alignSeqs.push_back(tempNode);
636 lengths.insert(seq.getAligned().length());
642 if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
643 else if (lengths.size() == 1) { length = *(lengths.begin()); }
645 return alignSeqs.size();
648 catch(exception& e) {
649 m->errorOut(e, "PreClusterCommand", "readFASTA");
653 /**************************************************************************************************/
654 int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs, string group){
658 map<string, string>::iterator it;
660 map<string, int> thisCount;
661 if (countfile != "") { thisCount = cparser->getCountTable(group); }
663 for (int i = 0; i < thisSeqs.size(); i++) {
665 if (m->control_pressed) { return 0; }
667 if (namefile != "") {
668 it = thisName.find(thisSeqs[i].getName());
670 //should never be true since parser checks for this
671 if (it == thisName.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); error = true; }
675 for(int j=0;j<(it->second).length();j++){
676 if((it->second)[j] == ','){ numReps++; }
679 seqPNode tempNode(numReps, thisSeqs[i], it->second);
680 alignSeqs.push_back(tempNode);
681 lengths.insert(thisSeqs[i].getAligned().length());
683 }else { //no names file, you are identical to yourself
685 if (countfile != "") {
686 map<string, int>::iterator it2 = thisCount.find(thisSeqs[i].getName());
688 //should never be true since parser checks for this
689 if (it2 == thisCount.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); error = true; }
690 else { numRep = it2->second; }
692 seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName());
693 alignSeqs.push_back(tempNode);
694 lengths.insert(thisSeqs[i].getAligned().length());
698 if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
699 else if (lengths.size() == 1) { length = *(lengths.begin()); }
702 if (error) { m->control_pressed = true; }
706 return alignSeqs.size();
709 catch(exception& e) {
710 m->errorOut(e, "PreClusterCommand", "loadSeqs");
715 /**************************************************************************************************/
717 int PreClusterCommand::calcMisMatches(string seq1, string seq2){
721 for (int i = 0; i < seq1.length(); i++) {
723 if (seq1[i] != seq2[i]) { numBad++; }
724 if (numBad > diffs) { return length; } //to far to cluster
729 catch(exception& e) {
730 m->errorOut(e, "PreClusterCommand", "calcMisMatches");
734 /**************************************************************************************************/
736 int PreClusterCommand::mergeGroupCounts(string newcount, string newname, string newfasta){
739 m->openInputFile(newname, inNames);
741 string group, first, second;
742 set<string> uniqueNames;
743 while (!inNames.eof()) {
744 if (m->control_pressed) { break; }
745 inNames >> group; m->gobble(inNames);
746 inNames >> first; m->gobble(inNames);
747 inNames >> second; m->gobble(inNames);
749 vector<string> names;
750 m->splitAtComma(second, names);
752 uniqueNames.insert(first);
754 int total = ct.getGroupCount(first, group);
755 for (int i = 1; i < names.size(); i++) {
756 total += ct.getGroupCount(names[i], group);
757 ct.setAbund(names[i], group, 0);
759 ct.setAbund(first, group, total);
763 vector<string> namesOfSeqs = ct.getNamesOfSeqs();
764 for (int i = 0; i < namesOfSeqs.size(); i++) {
765 if (ct.getNumSeqs(namesOfSeqs[i]) == 0) {
766 ct.remove(namesOfSeqs[i]);
770 ct.printTable(newcount);
771 m->mothurRemove(newname);
773 if (bygroup) { //if by group, must remove the duplicate seqs that are named the same
775 m->openInputFile(newfasta, in);
778 m->openOutputFile(newfasta+"temp", out);
783 if (m->control_pressed) { break; }
785 Sequence seq(in); m->gobble(in);
787 if (seq.getName() != "") {
789 if (already.count(seq.getName()) == 0) {
790 seq.printSequence(out);
791 already.insert(seq.getName());
797 m->mothurRemove(newfasta);
798 m->renameFile(newfasta+"temp", newfasta);
803 catch(exception& e) {
804 m->errorOut(e, "PreClusterCommand", "mergeGroupCounts");
809 /**************************************************************************************************/
811 void PreClusterCommand::printData(string newfasta, string newname, string group){
817 m->openOutputFileAppend(newfasta, outFasta);
818 m->openOutputFileAppend(newname, outNames);
820 m->openOutputFile(newfasta, outFasta);
821 m->openOutputFile(newname, outNames);
824 if ((countfile != "") && (group == "")) { outNames << "Representative_Sequence\ttotal\n"; }
825 for (int i = 0; i < alignSeqs.size(); i++) {
826 if (alignSeqs[i].numIdentical != 0) {
827 alignSeqs[i].seq.printSequence(outFasta);
828 if (countfile != "") {
829 if (group != "") { outNames << group << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; }
830 else { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].numIdentical << endl; }
831 }else { outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; }
839 catch(exception& e) {
840 m->errorOut(e, "PreClusterCommand", "printData");
844 /**************************************************************************************************/
846 void PreClusterCommand::readNameFile(){
849 m->openInputFile(namefile, in);
850 string firstCol, secondCol;
853 in >> firstCol >> secondCol; m->gobble(in);
854 names[firstCol] = secondCol;
857 for(int i=0;i<secondCol.size();i++){
858 if(secondCol[i] == ','){ size++; }
860 sizes[firstCol] = size;
864 catch(exception& e) {
865 m->errorOut(e, "PreClusterCommand", "readNameFile");
870 /**************************************************************************************************/