]> git.donarmstrong.com Git - mothur.git/blob - seqsummarycommand.cpp
added count file to trim.seqs, get.groups, get.lineage, get.seqs, heatmap.sim, list...
[mothur.git] / seqsummarycommand.cpp
1 /*
2  *  seqcoordcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 5/30/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "seqsummarycommand.h"
11 #include "counttable.h"
12
13 //**********************************************************************************************************************
14 vector<string> SeqSummaryCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17                 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pname);
18         CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none",false,false); parameters.push_back(pcount);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
21                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
22                 
23                 vector<string> myArray;
24                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
25                 return myArray;
26         }
27         catch(exception& e) {
28                 m->errorOut(e, "SeqSummaryCommand", "setParameters");
29                 exit(1);
30         }
31 }
32 //**********************************************************************************************************************
33 string SeqSummaryCommand::getHelpString(){      
34         try {
35                 string helpString = "";
36                 helpString += "The summary.seqs command reads a fastafile and summarizes the sequences.\n";
37                 helpString += "The summary.seqs command parameters are fasta, name, count and processors, fasta is required, unless you have a valid current fasta file.\n";
38                 helpString += "The name parameter allows you to enter a name file associated with your fasta file. \n";
39         helpString += "The count parameter allows you to enter a count file associated with your fasta file. \n";
40                 helpString += "The summary.seqs command should be in the following format: \n";
41                 helpString += "summary.seqs(fasta=yourFastaFile, processors=2) \n";
42                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";       
43                 return helpString;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "SeqSummaryCommand", "getHelpString");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string SeqSummaryCommand::getOutputFileNameTag(string type, string inputName=""){       
52         try {
53         string outputFileName = "";
54                 map<string, vector<string> >::iterator it;
55         
56         //is this a type this command creates
57         it = outputTypes.find(type);
58         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
59         else {
60             if (type == "summary")            {   outputFileName =  "summary";   }
61             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
62         }
63         return outputFileName;
64         }
65         catch(exception& e) {
66                 m->errorOut(e, "SeqSummaryCommand", "getOutputFileNameTag");
67                 exit(1);
68         }
69 }
70
71 //**********************************************************************************************************************
72 SeqSummaryCommand::SeqSummaryCommand(){ 
73         try {
74                 abort = true; calledHelp = true; 
75                 setParameters();
76                 vector<string> tempOutNames;
77                 outputTypes["summary"] = tempOutNames;
78         }
79         catch(exception& e) {
80                 m->errorOut(e, "SeqSummaryCommand", "SeqSummaryCommand");
81                 exit(1);
82         }
83 }
84 //***************************************************************************************************************
85
86 SeqSummaryCommand::SeqSummaryCommand(string option)  {
87         try {
88                 abort = false; calledHelp = false;   
89                 
90                 //allow user to run help
91                 if(option == "help") { help(); abort = true; calledHelp = true; }
92                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
93                 
94                 else {
95                         vector<string> myArray = setParameters();
96                         
97                         OptionParser parser(option);
98                         map<string,string> parameters = parser.getParameters();
99                         
100                         ValidParameters validParameter("summary.seqs");
101                         map<string,string>::iterator it;
102                         
103                         //check to make sure all parameters are valid for command
104                         for (it = parameters.begin(); it != parameters.end(); it++) { 
105                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
106                         }
107                         
108                         //if the user changes the input directory command factory will send this info to us in the output parameter 
109                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
110                         if (inputDir == "not found"){   inputDir = "";          }
111                         else {
112                                 string path;
113                                 it = parameters.find("fasta");
114                                 //user has given a template file
115                                 if(it != parameters.end()){ 
116                                         path = m->hasPath(it->second);
117                                         //if the user has not given a path then, add inputdir. else leave path alone.
118                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
119                                 }
120                                 
121                                 it = parameters.find("name");
122                                 //user has given a template file
123                                 if(it != parameters.end()){ 
124                                         path = m->hasPath(it->second);
125                                         //if the user has not given a path then, add inputdir. else leave path alone.
126                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
127                                 }
128                 
129                 it = parameters.find("count");
130                                 //user has given a template file
131                                 if(it != parameters.end()){ 
132                                         path = m->hasPath(it->second);
133                                         //if the user has not given a path then, add inputdir. else leave path alone.
134                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
135                                 }
136                         }
137                         
138                         //initialize outputTypes
139                         vector<string> tempOutNames;
140                         outputTypes["summary"] = tempOutNames;
141                         
142                         //check for required parameters
143                         fastafile = validParameter.validFile(parameters, "fasta", true);
144                         if (fastafile == "not open") { abort = true; }
145                         else if (fastafile == "not found") {                            
146                                 fastafile = m->getFastaFile(); 
147                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
148                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
149                         }else { m->setFastaFile(fastafile); }   
150                         
151                         namefile = validParameter.validFile(parameters, "name", true);
152                         if (namefile == "not open") { namefile = ""; abort = true; }
153                         else if (namefile == "not found") { namefile = "";  }   
154                         else { m->setNameFile(namefile); }
155             
156             countfile = validParameter.validFile(parameters, "count", true);
157                         if (countfile == "not open") { abort = true; countfile = ""; }  
158                         else if (countfile == "not found") { countfile = ""; }
159                         else { m->setCountTableFile(countfile); }
160                         
161             if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
162                         
163                         //if the user changes the output directory command factory will send this info to us in the output parameter 
164                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
165                                 outputDir = ""; 
166                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
167                         }
168                         
169                         string temp = validParameter.validFile(parameters, "processors", false);        if (temp == "not found"){       temp = m->getProcessors();      }
170                         m->setProcessors(temp);
171                         m->mothurConvert(temp, processors);
172                         
173             if (countfile == "") {
174                 if (namefile == "") {
175                     vector<string> files; files.push_back(fastafile);
176                     parser.getNameFile(files);
177                 }
178             }
179                 }
180         }
181         catch(exception& e) {
182                 m->errorOut(e, "SeqSummaryCommand", "SeqSummaryCommand");
183                 exit(1);
184         }
185 }
186 //***************************************************************************************************************
187
188 int SeqSummaryCommand::execute(){
189         try{
190                 
191                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
192                 
193                 //set current fasta to fastafile
194                 m->setFastaFile(fastafile);
195                 
196                 string summaryFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("summary");
197                                 
198                 int numSeqs = 0;
199                 
200                 vector<int> startPosition;
201                 vector<int> endPosition;
202                 vector<int> seqLength;
203                 vector<int> ambigBases;
204                 vector<int> longHomoPolymer;
205                 
206                 if (namefile != "") { nameMap = m->readNames(namefile); }
207         else if (countfile != "") {
208             CountTable ct;
209             ct.readTable(countfile);
210             nameMap = ct.getNameMap();
211         }
212                 
213                 if (m->control_pressed) { return 0; }
214                         
215 #ifdef USE_MPI  
216                                 int pid, numSeqsPerProcessor; 
217                                 int tag = 2001;
218                                 int startTag = 1; int endTag = 2; int lengthTag = 3; int baseTag = 4; int lhomoTag = 5;
219                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
220                                 vector<unsigned long long> MPIPos;
221                                 
222                                 MPI_Status status; 
223                                 MPI_Status statusOut;
224                                 MPI_File inMPI; 
225                                 MPI_File outMPI; 
226                                 MPI_Comm_size(MPI_COMM_WORLD, &processors);
227                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
228                                                         
229                                 char tempFileName[1024];
230                                 strcpy(tempFileName, fastafile.c_str());
231                                 
232                                 char sumFileName[1024];
233                                 strcpy(sumFileName, summaryFile.c_str());
234                 
235                                 MPI_File_open(MPI_COMM_WORLD, tempFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
236                                 MPI_File_open(MPI_COMM_WORLD, sumFileName, outMode, MPI_INFO_NULL, &outMPI);
237                                 
238                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI); return 0;  }
239                                 
240                                 if (pid == 0) { //you are the root process
241                                                 //print header
242                                                 string outputString = "seqname\tstart\tend\tnbases\tambigs\tpolymer\tnumSeqs\n";        
243                                                 int length = outputString.length();
244                                                 char* buf2 = new char[length];
245                                                 memcpy(buf2, outputString.c_str(), length);
246                                         
247                                                 MPI_File_write_shared(outMPI, buf2, length, MPI_CHAR, &statusOut);
248                                                 delete buf2;
249                                                 
250                                                 MPIPos = m->setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs
251                                         
252                                                 for(int i = 1; i < processors; i++) { 
253                                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
254                                                         MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
255                                                 }
256                                                 
257                                                 //figure out how many sequences you have to do
258                                                 numSeqsPerProcessor = numSeqs / processors;
259                                                 int startIndex =  pid * numSeqsPerProcessor;
260                                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
261                                                 
262                                                 //do your part
263                                                 MPICreateSummary(startIndex, numSeqsPerProcessor, startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, inMPI, outMPI, MPIPos);
264                                                 
265                                 }else { //i am the child process
266                         
267                                         MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
268                                         MPIPos.resize(numSeqs+1);
269                                         MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
270                                 
271                                         //figure out how many sequences you have to align
272                                         numSeqsPerProcessor = numSeqs / processors;
273                                         int startIndex =  pid * numSeqsPerProcessor;
274                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
275                                 
276                                         //do your part
277                                         MPICreateSummary(startIndex, numSeqsPerProcessor, startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, inMPI, outMPI, MPIPos);
278                                 }
279                                 
280                                 MPI_File_close(&inMPI);
281                                 MPI_File_close(&outMPI);
282                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
283                                 
284                                 if (pid == 0) {
285                                         //get the info from the child processes
286                                         for(int i = 1; i < processors; i++) { 
287                                                 int size;
288                                                 MPI_Recv(&size, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
289
290                                                 vector<int> temp; temp.resize(size+1);
291                                                 
292                                                 for(int j = 0; j < 5; j++) { 
293                                                 
294                                                         MPI_Recv(&temp[0], (size+1), MPI_INT, i, 2001, MPI_COMM_WORLD, &status); 
295                                                         int receiveTag = temp[temp.size()-1];  //child process added a int to the end to indicate what count this is for
296                                                         
297                                                         if (receiveTag == startTag) { 
298                                                                 for (int k = 0; k < size; k++) {                startPosition.push_back(temp[k]);       }
299                                                         }else if (receiveTag == endTag) { 
300                                                                 for (int k = 0; k < size; k++) {                endPosition.push_back(temp[k]); }
301                                                         }else if (receiveTag == lengthTag) { 
302                                                                 for (int k = 0; k < size; k++) {                seqLength.push_back(temp[k]);   }
303                                                         }else if (receiveTag == baseTag) { 
304                                                                 for (int k = 0; k < size; k++) {                ambigBases.push_back(temp[k]);  }
305                                                         }else if (receiveTag == lhomoTag) { 
306                                                                 for (int k = 0; k < size; k++) {                longHomoPolymer.push_back(temp[k]);     }
307                                                         }
308                                                 } 
309                                         }
310
311                                 }else{
312                                 
313                                         //send my counts
314                                         int size = startPosition.size();
315                                         MPI_Send(&size, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
316                                         
317                                         startPosition.push_back(startTag);
318                                         int ierr = MPI_Send(&(startPosition[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
319                                         endPosition.push_back(endTag);
320                                         ierr = MPI_Send (&(endPosition[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
321                                         seqLength.push_back(lengthTag);
322                                         ierr = MPI_Send(&(seqLength[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
323                                         ambigBases.push_back(baseTag);
324                                         ierr = MPI_Send(&(ambigBases[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
325                                         longHomoPolymer.push_back(lhomoTag);
326                                         ierr = MPI_Send(&(longHomoPolymer[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
327                                 }
328                                 
329                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
330 #else
331                         vector<unsigned long long> positions; 
332                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
333                                 positions = m->divideFile(fastafile, processors);
334                                 for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(new linePair(positions[i], positions[(i+1)]));  }
335                         #else
336                                 positions = m->setFilePosFasta(fastafile, numSeqs); 
337                 if (positions.size() < processors) { processors = positions.size(); }
338                 
339                                 //figure out how many sequences you have to process
340                                 int numSeqsPerProcessor = numSeqs / processors;
341                                 for (int i = 0; i < processors; i++) {
342                                         int startIndex =  i * numSeqsPerProcessor;
343                                         if(i == (processors - 1)){      numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;        }
344                                         lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor));
345                                 }
346                         #endif
347                         
348
349                         if(processors == 1){
350                                 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile, lines[0]);
351                         }else{
352                                 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile); 
353                         }
354                         
355                         if (m->control_pressed) {  return 0; }
356 #endif
357                         
358                 #ifdef USE_MPI
359                         if (pid == 0) { 
360                 #endif
361                 
362                 sort(startPosition.begin(), startPosition.end());
363                 sort(endPosition.begin(), endPosition.end());
364                 sort(seqLength.begin(), seqLength.end());
365                 sort(ambigBases.begin(), ambigBases.end());
366                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
367                 int size = startPosition.size();
368                 
369                 //find means
370                 double meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer;
371                 meanStartPosition = 0; meanEndPosition = 0; meanSeqLength = 0; meanAmbigBases = 0; meanLongHomoPolymer = 0;
372                 for (int i = 0; i < size; i++) {
373                         meanStartPosition += startPosition[i];
374                         meanEndPosition += endPosition[i];
375                         meanSeqLength += seqLength[i];
376                         meanAmbigBases += ambigBases[i];
377                         meanLongHomoPolymer += longHomoPolymer[i];
378                 }
379                 
380                 //this is an int divide so the remainder is lost
381                 meanStartPosition /= (float) size; meanEndPosition /= (float) size; meanLongHomoPolymer /= (float) size; meanSeqLength /= (float) size; meanAmbigBases /= (float) size;
382                                 
383                 int ptile0_25   = int(size * 0.025);
384                 int ptile25             = int(size * 0.250);
385                 int ptile50             = int(size * 0.500);
386                 int ptile75             = int(size * 0.750);
387                 int ptile97_5   = int(size * 0.975);
388                 int ptile100    = size - 1;
389                 
390                 //to compensate for blank sequences that would result in startPosition and endPostion equalling -1
391                 if (startPosition[0] == -1) {  startPosition[0] = 0;    }
392                 if (endPosition[0] == -1)       {  endPosition[0] = 0;          }
393                 
394                 if (m->control_pressed) {  m->mothurRemove(summaryFile); return 0; }
395                 
396                 m->mothurOutEndLine();
397                 m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer\tNumSeqs"); m->mothurOutEndLine();
398                 m->mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0]) + "\t" + toString(1)); m->mothurOutEndLine();
399                 m->mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25]) + "\t" + toString(ptile0_25+1)); m->mothurOutEndLine();
400                 m->mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25]) + "\t" + toString(ptile25+1)); m->mothurOutEndLine();
401                 m->mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50]) + "\t" + toString(ptile50+1)); m->mothurOutEndLine();
402                 m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75]) + "\t" + toString(ptile75+1)); m->mothurOutEndLine();
403                 m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5]) + "\t" + toString(ptile97_5+1)); m->mothurOutEndLine();
404                 m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine();
405                 m->mothurOut("Mean:\t" + toString(meanStartPosition) + "\t" + toString(meanEndPosition) + "\t" + toString(meanSeqLength) + "\t" + toString(meanAmbigBases) + "\t" + toString(meanLongHomoPolymer)); m->mothurOutEndLine();
406
407                 if ((namefile == "") && (countfile == "")) {  m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); }
408                 else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); }
409                 
410                 if (m->control_pressed) {  m->mothurRemove(summaryFile); return 0; }
411                 
412                 m->mothurOutEndLine();
413                 m->mothurOut("Output File Name: "); m->mothurOutEndLine();
414                 m->mothurOut(summaryFile); m->mothurOutEndLine();       outputNames.push_back(summaryFile); outputTypes["summary"].push_back(summaryFile);
415                 m->mothurOutEndLine();
416                 
417                 #ifdef USE_MPI
418                         }
419                 #endif
420
421                 return 0;
422         }
423         catch(exception& e) {
424                 m->errorOut(e, "SeqSummaryCommand", "execute");
425                 exit(1);
426         }
427 }
428 /**************************************************************************************/
429 int SeqSummaryCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, string sumFile, linePair* filePos) {   
430         try {
431                 
432                 ofstream outSummary;
433                 m->openOutputFile(sumFile, outSummary);
434                 
435                 //print header if you are process 0
436                 if (filePos->start == 0) {
437                         outSummary << "seqname\tstart\tend\tnbases\tambigs\tpolymer\tnumSeqs" << endl;  
438                 }
439                                 
440                 ifstream in;
441                 m->openInputFile(filename, in);
442                                 
443                 in.seekg(filePos->start);
444
445                 bool done = false;
446                 int count = 0;
447        
448                 while (!done) {
449                                 
450                         if (m->control_pressed) { in.close(); outSummary.close(); return 1; }
451                                         
452                         Sequence current(in); m->gobble(in);
453         
454                         if (current.getName() != "") {
455                                 
456                                 int num = 1;
457                                 if ((namefile != "") || (countfile != "")) {
458                                         //make sure this sequence is in the namefile, else error 
459                                         map<string, int>::iterator it = nameMap.find(current.getName());
460                                         
461                                         if (it == nameMap.end()) { m->mothurOut("[ERROR]: '" + current.getName() + "' is not in your name or count file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
462                                         else { num = it->second; }
463                                 }
464                                 
465                                 //for each sequence this sequence represents
466                                 for (int i = 0; i < num; i++) {
467                                         startPosition.push_back(current.getStartPos());
468                                         endPosition.push_back(current.getEndPos());
469                                         seqLength.push_back(current.getNumBases());
470                                         ambigBases.push_back(current.getAmbigBases());
471                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
472                                 }
473                                 
474                                 count++;
475                                 outSummary << current.getName() << '\t';
476                                 outSummary << current.getStartPos() << '\t' << current.getEndPos() << '\t';
477                                 outSummary << current.getNumBases() << '\t' << current.getAmbigBases() << '\t';
478                                 outSummary << current.getLongHomoPolymer() << '\t' << num << endl;
479                         }
480                         
481                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
482                                 unsigned long long pos = in.tellg();
483                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
484                         #else
485                                 if (in.eof()) { break; }
486                         #endif
487                         
488                         //report progress
489                         //if((count) % 100 == 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
490                 }
491                 //report progress
492                 //if((count) % 100 != 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
493                 
494                 in.close();
495                 
496                 return count;
497         }
498         catch(exception& e) {
499                 m->errorOut(e, "SeqSummaryCommand", "driverCreateSummary");
500                 exit(1);
501         }
502 }
503 #ifdef USE_MPI
504 /**************************************************************************************/
505 int SeqSummaryCommand::MPICreateSummary(int start, int num, vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long long>& MPIPos) {       
506         try {
507                 
508                 int pid;
509                 MPI_Status status; 
510                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
511
512                 for(int i=0;i<num;i++){
513                         
514                         if (m->control_pressed) { return 0; }
515                         
516                         //read next sequence
517                         int length = MPIPos[start+i+1] - MPIPos[start+i];
518         
519                         char* buf4 = new char[length];
520                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
521                         
522                         string tempBuf = buf4;
523                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
524                         istringstream iss (tempBuf,istringstream::in);
525                         delete buf4;
526
527                         Sequence current(iss);  
528
529                         if (current.getName() != "") {
530                                 
531                                 int num = 1;
532                                 if ((namefile != "") || (countfile != "")) {
533                                         //make sure this sequence is in the namefile, else error 
534                                         map<string, int>::iterator it = nameMap.find(current.getName());
535                                         
536                                         if (it == nameMap.end()) { cout << "[ERROR]: " << current.getName() << " is not in your name or count file, please correct." << endl; m->control_pressed = true; }
537                                         else { num = it->second; }
538                                 }
539                                 
540                                 //for each sequence this sequence represents
541                                 for (int i = 0; i < num; i++) {
542                                         startPosition.push_back(current.getStartPos());
543                                         endPosition.push_back(current.getEndPos());
544                                         seqLength.push_back(current.getNumBases());
545                                         ambigBases.push_back(current.getAmbigBases());
546                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
547                                 }
548                                 
549                                 string outputString = current.getName() + "\t" + toString(current.getStartPos()) + "\t" + toString(current.getEndPos()) + "\t";
550                                 outputString += toString(current.getNumBases()) + "\t" + toString(current.getAmbigBases()) + "\t" + toString(current.getLongHomoPolymer()) + "\t" + toString(num) + "\n";
551                                 
552                                 //output to file
553                                 length = outputString.length();
554                                 char* buf3 = new char[length];
555                                 memcpy(buf3, outputString.c_str(), length);
556                                         
557                                 MPI_File_write_shared(outMPI, buf3, length, MPI_CHAR, &status);
558                                 delete buf3;
559                         }       
560                 }
561                 
562                 return 0;
563         }
564         catch(exception& e) {
565                 m->errorOut(e, "SeqSummaryCommand", "MPICreateSummary");
566                 exit(1);
567         }
568 }
569 #endif
570 /**************************************************************************************************/
571 int SeqSummaryCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, string sumFile) {
572         try {
573                 int process = 1;
574                 int num = 0;
575                 processIDS.clear();
576                 
577 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
578                 
579                 //loop through and create all the processes you want
580                 while (process != processors) {
581                         int pid = fork();
582                         
583                         if (pid > 0) {
584                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
585                                 process++;
586                         }else if (pid == 0){
587                                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, sumFile + toString(getpid()) + ".temp", lines[process]);
588                                 
589                                 //pass numSeqs to parent
590                                 ofstream out;
591                                 string tempFile = fastafile + toString(getpid()) + ".num.temp";
592                                 m->openOutputFile(tempFile, out);
593                                 
594                                 out << num << endl;
595                                 out << startPosition.size() << endl;
596                                 for (int k = 0; k < startPosition.size(); k++)          {               out << startPosition[k] << '\t'; }  out << endl;
597                                 for (int k = 0; k < endPosition.size(); k++)            {               out << endPosition[k] << '\t'; }  out << endl;
598                                 for (int k = 0; k < seqLength.size(); k++)                      {               out << seqLength[k] << '\t'; }  out << endl;
599                                 for (int k = 0; k < ambigBases.size(); k++)                     {               out << ambigBases[k] << '\t'; }  out << endl;
600                                 for (int k = 0; k < longHomoPolymer.size(); k++)        {               out << longHomoPolymer[k] << '\t'; }  out << endl;
601                                 
602                                 out.close();
603                                 
604                                 exit(0);
605                         }else { 
606                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
607                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
608                                 exit(0);
609                         }
610                 }
611                 
612                 //do your part
613                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, sumFile, lines[0]);
614
615                 //force parent to wait until all the processes are done
616                 for (int i=0;i<processIDS.size();i++) { 
617                         int temp = processIDS[i];
618                         wait(&temp);
619                 }
620                 
621                 //parent reads in and combine Filter info
622                 for (int i = 0; i < processIDS.size(); i++) {
623                         string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
624                         ifstream in;
625                         m->openInputFile(tempFilename, in);
626                         
627                         int temp, tempNum;
628                         in >> tempNum; m->gobble(in); num += tempNum;
629                         in >> tempNum; m->gobble(in);
630                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; startPosition.push_back(temp);              }               m->gobble(in);
631                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; endPosition.push_back(temp);                }               m->gobble(in);
632                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; seqLength.push_back(temp);                  }               m->gobble(in);
633                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; ambigBases.push_back(temp);                 }               m->gobble(in);
634                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; longHomoPolymer.push_back(temp);    }               m->gobble(in);
635                                 
636                         in.close();
637                         m->mothurRemove(tempFilename);
638                         
639                         m->appendFiles((sumFile + toString(processIDS[i]) + ".temp"), sumFile);
640                         m->mothurRemove((sumFile + toString(processIDS[i]) + ".temp"));
641                 }
642                 
643 #else
644                 //////////////////////////////////////////////////////////////////////////////////////////////////////
645                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
646                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
647                 //Taking advantage of shared memory to allow both threads to add info to vectors.
648                 //////////////////////////////////////////////////////////////////////////////////////////////////////
649                 
650                 vector<seqSumData*> pDataArray; 
651                 DWORD   dwThreadIdArray[processors-1];
652                 HANDLE  hThreadArray[processors-1]; 
653         
654                 bool hasNameMap = false;
655         if ((namefile !="") || (countfile != "")) { hasNameMap = true; }
656         
657                 //Create processor worker threads.
658                 for( int i=0; i<processors-1; i++ ){
659             
660             string extension = "";
661             if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
662                         // Allocate memory for thread data.
663                         seqSumData* tempSum = new seqSumData(filename, (sumFile+extension), m, lines[i]->start, lines[i]->end, hasNameMap, nameMap);
664                         pDataArray.push_back(tempSum);
665                         
666                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
667                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
668                         hThreadArray[i] = CreateThread(NULL, 0, MySeqSumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
669                 }
670                 
671         //do your part
672                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, (sumFile+toString(processors-1)+".temp"), lines[processors-1]);
673         processIDS.push_back(processors-1);
674
675                 //Wait until all threads have terminated.
676                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
677                 
678                 //Close all thread handles and free memory allocations.
679                 for(int i=0; i < pDataArray.size(); i++){
680                         num += pDataArray[i]->count;
681             for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) {     startPosition.push_back(pDataArray[i]->startPosition[k]);       }
682                         for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) {   endPosition.push_back(pDataArray[i]->endPosition[k]);       }
683             for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]);       }
684             for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) {        ambigBases.push_back(pDataArray[i]->ambigBases[k]);       }
685             for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) {   longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]);       }
686                         CloseHandle(hThreadArray[i]);
687                         delete pDataArray[i];
688                 }
689     
690                 //append files
691                 for(int i=0;i<processIDS.size();i++){
692                         m->appendFiles((sumFile + toString(processIDS[i]) + ".temp"), sumFile);
693                         m->mothurRemove((sumFile + toString(processIDS[i]) + ".temp"));
694                 }
695 #endif          
696                 return num;
697         }
698         catch(exception& e) {
699                 m->errorOut(e, "SeqSummaryCommand", "createProcessesCreateSummary");
700                 exit(1);
701         }
702 }
703 /**********************************************************************************************************************/
704
705
706