]> git.donarmstrong.com Git - mothur.git/blob - seqsummarycommand.cpp
sffinfo bug with flow grams right index when clipQualRight=0
[mothur.git] / seqsummarycommand.cpp
1 /*
2  *  seqcoordcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 5/30/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "seqsummarycommand.h"
11 #include "counttable.h"
12
13 //**********************************************************************************************************************
14 vector<string> SeqSummaryCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","summary",false,true,true); parameters.push_back(pfasta);
17                 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","",false,false,true); parameters.push_back(pname);
18         CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","",false,false,true); parameters.push_back(pcount);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
20                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
21                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
22                 
23                 vector<string> myArray;
24                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
25                 return myArray;
26         }
27         catch(exception& e) {
28                 m->errorOut(e, "SeqSummaryCommand", "setParameters");
29                 exit(1);
30         }
31 }
32 //**********************************************************************************************************************
33 string SeqSummaryCommand::getHelpString(){      
34         try {
35                 string helpString = "";
36                 helpString += "The summary.seqs command reads a fastafile and summarizes the sequences.\n";
37                 helpString += "The summary.seqs command parameters are fasta, name, count and processors, fasta is required, unless you have a valid current fasta file.\n";
38                 helpString += "The name parameter allows you to enter a name file associated with your fasta file. \n";
39         helpString += "The count parameter allows you to enter a count file associated with your fasta file. \n";
40                 helpString += "The summary.seqs command should be in the following format: \n";
41                 helpString += "summary.seqs(fasta=yourFastaFile, processors=2) \n";
42                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";       
43                 return helpString;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "SeqSummaryCommand", "getHelpString");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string SeqSummaryCommand::getOutputPattern(string type) {
52     try {
53         string pattern = "";
54         
55         if (type == "summary") {  pattern = "[filename],summary"; } 
56         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
57         
58         return pattern;
59     }
60     catch(exception& e) {
61         m->errorOut(e, "SeqSummaryCommand", "getOutputPattern");
62         exit(1);
63     }
64 }
65
66 //**********************************************************************************************************************
67 SeqSummaryCommand::SeqSummaryCommand(){ 
68         try {
69                 abort = true; calledHelp = true; 
70                 setParameters();
71                 vector<string> tempOutNames;
72                 outputTypes["summary"] = tempOutNames;
73         }
74         catch(exception& e) {
75                 m->errorOut(e, "SeqSummaryCommand", "SeqSummaryCommand");
76                 exit(1);
77         }
78 }
79 //***************************************************************************************************************
80
81 SeqSummaryCommand::SeqSummaryCommand(string option)  {
82         try {
83                 abort = false; calledHelp = false;   
84                 
85                 //allow user to run help
86                 if(option == "help") { help(); abort = true; calledHelp = true; }
87                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
88                 
89                 else {
90                         vector<string> myArray = setParameters();
91                         
92                         OptionParser parser(option);
93                         map<string,string> parameters = parser.getParameters();
94                         
95                         ValidParameters validParameter("summary.seqs");
96                         map<string,string>::iterator it;
97                         
98                         //check to make sure all parameters are valid for command
99                         for (it = parameters.begin(); it != parameters.end(); it++) { 
100                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
101                         }
102                         
103                         //if the user changes the input directory command factory will send this info to us in the output parameter 
104                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
105                         if (inputDir == "not found"){   inputDir = "";          }
106                         else {
107                                 string path;
108                                 it = parameters.find("fasta");
109                                 //user has given a template file
110                                 if(it != parameters.end()){ 
111                                         path = m->hasPath(it->second);
112                                         //if the user has not given a path then, add inputdir. else leave path alone.
113                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
114                                 }
115                                 
116                                 it = parameters.find("name");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
122                                 }
123                 
124                 it = parameters.find("count");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
130                                 }
131                         }
132                         
133                         //initialize outputTypes
134                         vector<string> tempOutNames;
135                         outputTypes["summary"] = tempOutNames;
136                         
137                         //check for required parameters
138                         fastafile = validParameter.validFile(parameters, "fasta", true);
139                         if (fastafile == "not open") { abort = true; }
140                         else if (fastafile == "not found") {                            
141                                 fastafile = m->getFastaFile(); 
142                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
143                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
144                         }else { m->setFastaFile(fastafile); }   
145                         
146                         namefile = validParameter.validFile(parameters, "name", true);
147                         if (namefile == "not open") { namefile = ""; abort = true; }
148                         else if (namefile == "not found") { namefile = "";  }   
149                         else { m->setNameFile(namefile); }
150             
151             countfile = validParameter.validFile(parameters, "count", true);
152                         if (countfile == "not open") { abort = true; countfile = ""; }  
153                         else if (countfile == "not found") { countfile = ""; }
154                         else { m->setCountTableFile(countfile); }
155                         
156             if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
157                         
158                         //if the user changes the output directory command factory will send this info to us in the output parameter 
159                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
160                                 outputDir = ""; 
161                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
162                         }
163                         
164                         string temp = validParameter.validFile(parameters, "processors", false);        if (temp == "not found"){       temp = m->getProcessors();      }
165                         m->setProcessors(temp);
166                         m->mothurConvert(temp, processors);
167                         
168             if (countfile == "") {
169                 if (namefile == "") {
170                     vector<string> files; files.push_back(fastafile);
171                     parser.getNameFile(files);
172                 }
173             }
174                 }
175         }
176         catch(exception& e) {
177                 m->errorOut(e, "SeqSummaryCommand", "SeqSummaryCommand");
178                 exit(1);
179         }
180 }
181 //***************************************************************************************************************
182
183 int SeqSummaryCommand::execute(){
184         try{
185                 
186                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
187                 
188                 //set current fasta to fastafile
189                 m->setFastaFile(fastafile);
190                 
191         map<string, string> variables; 
192                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile));
193                 string summaryFile = getOutputFileName("summary",variables);
194                                 
195                 int numSeqs = 0;
196                 
197                 vector<int> startPosition;
198                 vector<int> endPosition;
199                 vector<int> seqLength;
200                 vector<int> ambigBases;
201                 vector<int> longHomoPolymer;
202                 
203                 if (namefile != "") { nameMap = m->readNames(namefile); }
204         else if (countfile != "") {
205             CountTable ct;
206             ct.readTable(countfile);
207             nameMap = ct.getNameMap();
208         }
209                 
210                 if (m->control_pressed) { return 0; }
211                         
212 #ifdef USE_MPI  
213                                 int pid, numSeqsPerProcessor; 
214                                 int tag = 2001;
215                                 int startTag = 1; int endTag = 2; int lengthTag = 3; int baseTag = 4; int lhomoTag = 5;
216                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
217                                 vector<unsigned long long> MPIPos;
218                                 
219                                 MPI_Status status; 
220                                 MPI_Status statusOut;
221                                 MPI_File inMPI; 
222                                 MPI_File outMPI; 
223                                 MPI_Comm_size(MPI_COMM_WORLD, &processors);
224                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
225                                                         
226                                 char tempFileName[1024];
227                                 strcpy(tempFileName, fastafile.c_str());
228                                 
229                                 char sumFileName[1024];
230                                 strcpy(sumFileName, summaryFile.c_str());
231                 
232                                 MPI_File_open(MPI_COMM_WORLD, tempFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
233                                 MPI_File_open(MPI_COMM_WORLD, sumFileName, outMode, MPI_INFO_NULL, &outMPI);
234                                 
235                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI); return 0;  }
236                                 
237                                 if (pid == 0) { //you are the root process
238                                                 //print header
239                                                 string outputString = "seqname\tstart\tend\tnbases\tambigs\tpolymer\tnumSeqs\n";        
240                                                 int length = outputString.length();
241                                                 char* buf2 = new char[length];
242                                                 memcpy(buf2, outputString.c_str(), length);
243                                         
244                                                 MPI_File_write_shared(outMPI, buf2, length, MPI_CHAR, &statusOut);
245                                                 delete buf2;
246                                                 
247                                                 MPIPos = m->setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs
248                                         
249                                                 for(int i = 1; i < processors; i++) { 
250                                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
251                                                         MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
252                                                 }
253                                                 
254                                                 //figure out how many sequences you have to do
255                                                 numSeqsPerProcessor = numSeqs / processors;
256                                                 int startIndex =  pid * numSeqsPerProcessor;
257                                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
258                                                 
259                                                 //do your part
260                                                 MPICreateSummary(startIndex, numSeqsPerProcessor, startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, inMPI, outMPI, MPIPos);
261                                                 
262                                 }else { //i am the child process
263                         
264                                         MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
265                                         MPIPos.resize(numSeqs+1);
266                                         MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
267                                 
268                                         //figure out how many sequences you have to align
269                                         numSeqsPerProcessor = numSeqs / processors;
270                                         int startIndex =  pid * numSeqsPerProcessor;
271                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
272                                 
273                                         //do your part
274                                         MPICreateSummary(startIndex, numSeqsPerProcessor, startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, inMPI, outMPI, MPIPos);
275                                 }
276                                 
277                                 MPI_File_close(&inMPI);
278                                 MPI_File_close(&outMPI);
279                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
280                                 
281                                 if (pid == 0) {
282                                         //get the info from the child processes
283                                         for(int i = 1; i < processors; i++) { 
284                                                 int size;
285                                                 MPI_Recv(&size, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
286
287                                                 vector<int> temp; temp.resize(size+1);
288                                                 
289                                                 for(int j = 0; j < 5; j++) { 
290                                                 
291                                                         MPI_Recv(&temp[0], (size+1), MPI_INT, i, 2001, MPI_COMM_WORLD, &status); 
292                                                         int receiveTag = temp[temp.size()-1];  //child process added a int to the end to indicate what count this is for
293                                                         
294                                                         if (receiveTag == startTag) { 
295                                                                 for (int k = 0; k < size; k++) {                startPosition.push_back(temp[k]);       }
296                                                         }else if (receiveTag == endTag) { 
297                                                                 for (int k = 0; k < size; k++) {                endPosition.push_back(temp[k]); }
298                                                         }else if (receiveTag == lengthTag) { 
299                                                                 for (int k = 0; k < size; k++) {                seqLength.push_back(temp[k]);   }
300                                                         }else if (receiveTag == baseTag) { 
301                                                                 for (int k = 0; k < size; k++) {                ambigBases.push_back(temp[k]);  }
302                                                         }else if (receiveTag == lhomoTag) { 
303                                                                 for (int k = 0; k < size; k++) {                longHomoPolymer.push_back(temp[k]);     }
304                                                         }
305                                                 } 
306                                         }
307
308                                 }else{
309                                 
310                                         //send my counts
311                                         int size = startPosition.size();
312                                         MPI_Send(&size, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
313                                         
314                                         startPosition.push_back(startTag);
315                                         int ierr = MPI_Send(&(startPosition[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
316                                         endPosition.push_back(endTag);
317                                         ierr = MPI_Send (&(endPosition[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
318                                         seqLength.push_back(lengthTag);
319                                         ierr = MPI_Send(&(seqLength[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
320                                         ambigBases.push_back(baseTag);
321                                         ierr = MPI_Send(&(ambigBases[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
322                                         longHomoPolymer.push_back(lhomoTag);
323                                         ierr = MPI_Send(&(longHomoPolymer[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
324                                 }
325                                 
326                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
327 #else
328                         vector<unsigned long long> positions; 
329                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
330                                 positions = m->divideFile(fastafile, processors);
331                                 for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(new linePair(positions[i], positions[(i+1)]));  }
332                         #else
333                                 positions = m->setFilePosFasta(fastafile, numSeqs); 
334                 if (positions.size() < processors) { processors = positions.size(); }
335                 
336                                 //figure out how many sequences you have to process
337                                 int numSeqsPerProcessor = numSeqs / processors;
338                                 for (int i = 0; i < processors; i++) {
339                                         int startIndex =  i * numSeqsPerProcessor;
340                                         if(i == (processors - 1)){      numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;        }
341                                         lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor));
342                                 }
343                         #endif
344                         
345
346                         if(processors == 1){
347                                 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile, lines[0]);
348                         }else{
349                                 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile); 
350                         }
351                         
352                         if (m->control_pressed) {  return 0; }
353 #endif
354                         
355                 #ifdef USE_MPI
356                         if (pid == 0) { 
357                 #endif
358                 
359                 sort(startPosition.begin(), startPosition.end());
360                 sort(endPosition.begin(), endPosition.end());
361                 sort(seqLength.begin(), seqLength.end());
362                 sort(ambigBases.begin(), ambigBases.end());
363                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
364                 int size = startPosition.size();
365                 
366                 //find means
367                 double meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer;
368                 meanStartPosition = 0; meanEndPosition = 0; meanSeqLength = 0; meanAmbigBases = 0; meanLongHomoPolymer = 0;
369                 for (int i = 0; i < size; i++) {
370                         meanStartPosition += startPosition[i];
371                         meanEndPosition += endPosition[i];
372                         meanSeqLength += seqLength[i];
373                         meanAmbigBases += ambigBases[i];
374                         meanLongHomoPolymer += longHomoPolymer[i];
375                 }
376                 
377                 //this is an int divide so the remainder is lost
378                 meanStartPosition /= (float) size; meanEndPosition /= (float) size; meanLongHomoPolymer /= (float) size; meanSeqLength /= (float) size; meanAmbigBases /= (float) size;
379                                 
380                 int ptile0_25   = int(size * 0.025);
381                 int ptile25             = int(size * 0.250);
382                 int ptile50             = int(size * 0.500);
383                 int ptile75             = int(size * 0.750);
384                 int ptile97_5   = int(size * 0.975);
385                 int ptile100    = size - 1;
386                 
387                 //to compensate for blank sequences that would result in startPosition and endPostion equalling -1
388                 if (startPosition[0] == -1) {  startPosition[0] = 0;    }
389                 if (endPosition[0] == -1)       {  endPosition[0] = 0;          }
390                 
391                 if (m->control_pressed) {  m->mothurRemove(summaryFile); return 0; }
392                 
393                 m->mothurOutEndLine();
394                 m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer\tNumSeqs"); m->mothurOutEndLine();
395                 m->mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0]) + "\t" + toString(1)); m->mothurOutEndLine();
396                 m->mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25]) + "\t" + toString(ptile0_25+1)); m->mothurOutEndLine();
397                 m->mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25]) + "\t" + toString(ptile25+1)); m->mothurOutEndLine();
398                 m->mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50]) + "\t" + toString(ptile50+1)); m->mothurOutEndLine();
399                 m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75]) + "\t" + toString(ptile75+1)); m->mothurOutEndLine();
400                 m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5]) + "\t" + toString(ptile97_5+1)); m->mothurOutEndLine();
401                 m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine();
402                 m->mothurOut("Mean:\t" + toString(meanStartPosition) + "\t" + toString(meanEndPosition) + "\t" + toString(meanSeqLength) + "\t" + toString(meanAmbigBases) + "\t" + toString(meanLongHomoPolymer)); m->mothurOutEndLine();
403
404                 if ((namefile == "") && (countfile == "")) {  m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); }
405                 else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); }
406                 
407                 if (m->control_pressed) {  m->mothurRemove(summaryFile); return 0; }
408                 
409                 m->mothurOutEndLine();
410                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
411                 m->mothurOut(summaryFile); m->mothurOutEndLine();       outputNames.push_back(summaryFile); outputTypes["summary"].push_back(summaryFile);
412                 m->mothurOutEndLine();
413                 
414                 #ifdef USE_MPI
415                         }
416                 #endif
417
418                 return 0;
419         }
420         catch(exception& e) {
421                 m->errorOut(e, "SeqSummaryCommand", "execute");
422                 exit(1);
423         }
424 }
425 /**************************************************************************************/
426 int SeqSummaryCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, string sumFile, linePair* filePos) {   
427         try {
428                 
429                 ofstream outSummary;
430                 m->openOutputFile(sumFile, outSummary);
431                 
432                 //print header if you are process 0
433                 if (filePos->start == 0) {
434                         outSummary << "seqname\tstart\tend\tnbases\tambigs\tpolymer\tnumSeqs" << endl;  
435                 }
436                                 
437                 ifstream in;
438                 m->openInputFile(filename, in);
439                                 
440                 in.seekg(filePos->start);
441
442                 bool done = false;
443                 int count = 0;
444        
445                 while (!done) {
446                                 
447                         if (m->control_pressed) { in.close(); outSummary.close(); return 1; }
448                                         
449                         Sequence current(in); m->gobble(in);
450            
451                         if (current.getName() != "") {
452                                 
453                                 int num = 1;
454                                 if ((namefile != "") || (countfile != "")) {
455                                         //make sure this sequence is in the namefile, else error 
456                                         map<string, int>::iterator it = nameMap.find(current.getName());
457                                         
458                                         if (it == nameMap.end()) { m->mothurOut("[ERROR]: '" + current.getName() + "' is not in your name or count file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
459                                         else { num = it->second; }
460                                 }
461                                 
462                                 //for each sequence this sequence represents
463                                 for (int i = 0; i < num; i++) {
464                                         startPosition.push_back(current.getStartPos());
465                                         endPosition.push_back(current.getEndPos());
466                                         seqLength.push_back(current.getNumBases());
467                                         ambigBases.push_back(current.getAmbigBases());
468                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
469                                 }
470                                 
471                                 count++;
472                                 outSummary << current.getName() << '\t';
473                                 outSummary << current.getStartPos() << '\t' << current.getEndPos() << '\t';
474                                 outSummary << current.getNumBases() << '\t' << current.getAmbigBases() << '\t';
475                                 outSummary << current.getLongHomoPolymer() << '\t' << num << endl;
476                         }
477                         
478                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
479                                 unsigned long long pos = in.tellg();
480                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
481                         #else
482                                 if (in.eof()) { break; }
483                         #endif
484                         
485                         //report progress
486                         //if((count) % 100 == 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
487                 }
488                 //report progress
489                 //if((count) % 100 != 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
490                 
491                 in.close();
492                 
493                 return count;
494         }
495         catch(exception& e) {
496                 m->errorOut(e, "SeqSummaryCommand", "driverCreateSummary");
497                 exit(1);
498         }
499 }
500 #ifdef USE_MPI
501 /**************************************************************************************/
502 int SeqSummaryCommand::MPICreateSummary(int start, int num, vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long long>& MPIPos) {       
503         try {
504                 
505                 int pid;
506                 MPI_Status status; 
507                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
508
509                 for(int i=0;i<num;i++){
510                         
511                         if (m->control_pressed) { return 0; }
512                         
513                         //read next sequence
514                         int length = MPIPos[start+i+1] - MPIPos[start+i];
515         
516                         char* buf4 = new char[length];
517                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
518                         
519                         string tempBuf = buf4;
520                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
521                         istringstream iss (tempBuf,istringstream::in);
522                         delete buf4;
523
524                         Sequence current(iss);  
525
526                         if (current.getName() != "") {
527                                 
528                                 int num = 1;
529                                 if ((namefile != "") || (countfile != "")) {
530                                         //make sure this sequence is in the namefile, else error 
531                                         map<string, int>::iterator it = nameMap.find(current.getName());
532                                         
533                                         if (it == nameMap.end()) { cout << "[ERROR]: " << current.getName() << " is not in your name or count file, please correct." << endl; m->control_pressed = true; }
534                                         else { num = it->second; }
535                                 }
536                                 
537                                 //for each sequence this sequence represents
538                                 for (int i = 0; i < num; i++) {
539                                         startPosition.push_back(current.getStartPos());
540                                         endPosition.push_back(current.getEndPos());
541                                         seqLength.push_back(current.getNumBases());
542                                         ambigBases.push_back(current.getAmbigBases());
543                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
544                                 }
545                                 
546                                 string outputString = current.getName() + "\t" + toString(current.getStartPos()) + "\t" + toString(current.getEndPos()) + "\t";
547                                 outputString += toString(current.getNumBases()) + "\t" + toString(current.getAmbigBases()) + "\t" + toString(current.getLongHomoPolymer()) + "\t" + toString(num) + "\n";
548                                 
549                                 //output to file
550                                 length = outputString.length();
551                                 char* buf3 = new char[length];
552                                 memcpy(buf3, outputString.c_str(), length);
553                                         
554                                 MPI_File_write_shared(outMPI, buf3, length, MPI_CHAR, &status);
555                                 delete buf3;
556                         }       
557                 }
558                 
559                 return 0;
560         }
561         catch(exception& e) {
562                 m->errorOut(e, "SeqSummaryCommand", "MPICreateSummary");
563                 exit(1);
564         }
565 }
566 #endif
567 /**************************************************************************************************/
568 int SeqSummaryCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, string sumFile) {
569         try {
570                 int process = 1;
571                 int num = 0;
572                 processIDS.clear();
573                 
574 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
575                 
576                 //loop through and create all the processes you want
577                 while (process != processors) {
578                         int pid = fork();
579                         
580                         if (pid > 0) {
581                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
582                                 process++;
583                         }else if (pid == 0){
584                                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, sumFile + toString(getpid()) + ".temp", lines[process]);
585                                 
586                                 //pass numSeqs to parent
587                                 ofstream out;
588                                 string tempFile = fastafile + toString(getpid()) + ".num.temp";
589                                 m->openOutputFile(tempFile, out);
590                                 
591                                 out << num << endl;
592                                 out << startPosition.size() << endl;
593                                 for (int k = 0; k < startPosition.size(); k++)          {               out << startPosition[k] << '\t'; }  out << endl;
594                                 for (int k = 0; k < endPosition.size(); k++)            {               out << endPosition[k] << '\t'; }  out << endl;
595                                 for (int k = 0; k < seqLength.size(); k++)                      {               out << seqLength[k] << '\t'; }  out << endl;
596                                 for (int k = 0; k < ambigBases.size(); k++)                     {               out << ambigBases[k] << '\t'; }  out << endl;
597                                 for (int k = 0; k < longHomoPolymer.size(); k++)        {               out << longHomoPolymer[k] << '\t'; }  out << endl;
598                                 
599                                 out.close();
600                                 
601                                 exit(0);
602                         }else { 
603                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
604                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
605                                 exit(0);
606                         }
607                 }
608                 
609                 //do your part
610                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, sumFile, lines[0]);
611
612                 //force parent to wait until all the processes are done
613                 for (int i=0;i<processIDS.size();i++) { 
614                         int temp = processIDS[i];
615                         wait(&temp);
616                 }
617                 
618                 //parent reads in and combine Filter info
619                 for (int i = 0; i < processIDS.size(); i++) {
620                         string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
621                         ifstream in;
622                         m->openInputFile(tempFilename, in);
623                         
624                         int temp, tempNum;
625                         in >> tempNum; m->gobble(in); num += tempNum;
626                         in >> tempNum; m->gobble(in);
627                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; startPosition.push_back(temp);              }               m->gobble(in);
628                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; endPosition.push_back(temp);                }               m->gobble(in);
629                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; seqLength.push_back(temp);                  }               m->gobble(in);
630                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; ambigBases.push_back(temp);                 }               m->gobble(in);
631                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; longHomoPolymer.push_back(temp);    }               m->gobble(in);
632                                 
633                         in.close();
634                         m->mothurRemove(tempFilename);
635                         
636                         m->appendFiles((sumFile + toString(processIDS[i]) + ".temp"), sumFile);
637                         m->mothurRemove((sumFile + toString(processIDS[i]) + ".temp"));
638                 }
639                 
640 #else
641                 //////////////////////////////////////////////////////////////////////////////////////////////////////
642                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
643                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
644                 //Taking advantage of shared memory to allow both threads to add info to vectors.
645                 //////////////////////////////////////////////////////////////////////////////////////////////////////
646                 
647                 vector<seqSumData*> pDataArray; 
648                 DWORD   dwThreadIdArray[processors-1];
649                 HANDLE  hThreadArray[processors-1]; 
650         
651                 bool hasNameMap = false;
652         if ((namefile !="") || (countfile != "")) { hasNameMap = true; }
653         
654                 //Create processor worker threads.
655                 for( int i=0; i<processors-1; i++ ){
656             
657             string extension = "";
658             if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
659                         // Allocate memory for thread data.
660                         seqSumData* tempSum = new seqSumData(filename, (sumFile+extension), m, lines[i]->start, lines[i]->end, hasNameMap, nameMap);
661                         pDataArray.push_back(tempSum);
662                         
663                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
664                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
665                         hThreadArray[i] = CreateThread(NULL, 0, MySeqSumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
666                 }
667                 
668         //do your part
669                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, (sumFile+toString(processors-1)+".temp"), lines[processors-1]);
670         processIDS.push_back(processors-1);
671
672                 //Wait until all threads have terminated.
673                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
674                 
675                 //Close all thread handles and free memory allocations.
676                 for(int i=0; i < pDataArray.size(); i++){
677                         num += pDataArray[i]->count;
678             if (pDataArray[i]->count != pDataArray[i]->end) {
679                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
680             }
681             for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) {     startPosition.push_back(pDataArray[i]->startPosition[k]);       }
682                         for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) {   endPosition.push_back(pDataArray[i]->endPosition[k]);       }
683             for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]);       }
684             for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) {        ambigBases.push_back(pDataArray[i]->ambigBases[k]);       }
685             for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) {   longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]);       }
686                         CloseHandle(hThreadArray[i]);
687                         delete pDataArray[i];
688                 }
689     
690                 //append files
691                 for(int i=0;i<processIDS.size();i++){
692                         m->appendFiles((sumFile + toString(processIDS[i]) + ".temp"), sumFile);
693                         m->mothurRemove((sumFile + toString(processIDS[i]) + ".temp"));
694                 }
695 #endif          
696                 return num;
697         }
698         catch(exception& e) {
699                 m->errorOut(e, "SeqSummaryCommand", "createProcessesCreateSummary");
700                 exit(1);
701         }
702 }
703 /**********************************************************************************************************************/
704
705
706