]> git.donarmstrong.com Git - mothur.git/blob - seqsummarycommand.cpp
added Jensen-Shannon calc. working on get.communitytype command. fixed bug in get...
[mothur.git] / seqsummarycommand.cpp
1 /*
2  *  seqcoordcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 5/30/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "seqsummarycommand.h"
11 #include "counttable.h"
12
13 //**********************************************************************************************************************
14 vector<string> SeqSummaryCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","summary",false,true,true); parameters.push_back(pfasta);
17                 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","",false,false,true); parameters.push_back(pname);
18         CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","",false,false,true); parameters.push_back(pcount);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
20                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
21                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
22                 
23                 vector<string> myArray;
24                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
25                 return myArray;
26         }
27         catch(exception& e) {
28                 m->errorOut(e, "SeqSummaryCommand", "setParameters");
29                 exit(1);
30         }
31 }
32 //**********************************************************************************************************************
33 string SeqSummaryCommand::getHelpString(){      
34         try {
35                 string helpString = "";
36                 helpString += "The summary.seqs command reads a fastafile and summarizes the sequences.\n";
37                 helpString += "The summary.seqs command parameters are fasta, name, count and processors, fasta is required, unless you have a valid current fasta file.\n";
38                 helpString += "The name parameter allows you to enter a name file associated with your fasta file. \n";
39         helpString += "The count parameter allows you to enter a count file associated with your fasta file. \n";
40                 helpString += "The summary.seqs command should be in the following format: \n";
41                 helpString += "summary.seqs(fasta=yourFastaFile, processors=2) \n";
42                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";       
43                 return helpString;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "SeqSummaryCommand", "getHelpString");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string SeqSummaryCommand::getOutputPattern(string type) {
52     try {
53         string pattern = "";
54         
55         if (type == "summary") {  pattern = "[filename],summary"; } 
56         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
57         
58         return pattern;
59     }
60     catch(exception& e) {
61         m->errorOut(e, "SeqSummaryCommand", "getOutputPattern");
62         exit(1);
63     }
64 }
65
66 //**********************************************************************************************************************
67 SeqSummaryCommand::SeqSummaryCommand(){ 
68         try {
69                 abort = true; calledHelp = true; 
70                 setParameters();
71                 vector<string> tempOutNames;
72                 outputTypes["summary"] = tempOutNames;
73         }
74         catch(exception& e) {
75                 m->errorOut(e, "SeqSummaryCommand", "SeqSummaryCommand");
76                 exit(1);
77         }
78 }
79 //***************************************************************************************************************
80
81 SeqSummaryCommand::SeqSummaryCommand(string option)  {
82         try {
83                 abort = false; calledHelp = false;   
84                 
85                 //allow user to run help
86                 if(option == "help") { help(); abort = true; calledHelp = true; }
87                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
88                 
89                 else {
90                         vector<string> myArray = setParameters();
91                         
92                         OptionParser parser(option);
93                         map<string,string> parameters = parser.getParameters();
94                         
95                         ValidParameters validParameter("summary.seqs");
96                         map<string,string>::iterator it;
97                         
98                         //check to make sure all parameters are valid for command
99                         for (it = parameters.begin(); it != parameters.end(); it++) { 
100                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
101                         }
102                         
103                         //if the user changes the input directory command factory will send this info to us in the output parameter 
104                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
105                         if (inputDir == "not found"){   inputDir = "";          }
106                         else {
107                                 string path;
108                                 it = parameters.find("fasta");
109                                 //user has given a template file
110                                 if(it != parameters.end()){ 
111                                         path = m->hasPath(it->second);
112                                         //if the user has not given a path then, add inputdir. else leave path alone.
113                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
114                                 }
115                                 
116                                 it = parameters.find("name");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
122                                 }
123                 
124                 it = parameters.find("count");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
130                                 }
131                         }
132                         
133                         //initialize outputTypes
134                         vector<string> tempOutNames;
135                         outputTypes["summary"] = tempOutNames;
136                         
137                         //check for required parameters
138                         fastafile = validParameter.validFile(parameters, "fasta", true);
139                         if (fastafile == "not open") { abort = true; }
140                         else if (fastafile == "not found") {                            
141                                 fastafile = m->getFastaFile(); 
142                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
143                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
144                         }else { m->setFastaFile(fastafile); }   
145                         
146                         namefile = validParameter.validFile(parameters, "name", true);
147                         if (namefile == "not open") { namefile = ""; abort = true; }
148                         else if (namefile == "not found") { namefile = "";  }   
149                         else { m->setNameFile(namefile); }
150             
151             countfile = validParameter.validFile(parameters, "count", true);
152                         if (countfile == "not open") { abort = true; countfile = ""; }  
153                         else if (countfile == "not found") { countfile = ""; }
154                         else { m->setCountTableFile(countfile); }
155                         
156             if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
157                         
158                         //if the user changes the output directory command factory will send this info to us in the output parameter 
159                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
160                                 outputDir = ""; 
161                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
162                         }
163                         
164                         string temp = validParameter.validFile(parameters, "processors", false);        if (temp == "not found"){       temp = m->getProcessors();      }
165                         m->setProcessors(temp);
166                         m->mothurConvert(temp, processors);
167                         
168             if (countfile == "") {
169                 if (namefile == "") {
170                     vector<string> files; files.push_back(fastafile);
171                     parser.getNameFile(files);
172                 }
173             }
174                 }
175         }
176         catch(exception& e) {
177                 m->errorOut(e, "SeqSummaryCommand", "SeqSummaryCommand");
178                 exit(1);
179         }
180 }
181 //***************************************************************************************************************
182
183 int SeqSummaryCommand::execute(){
184         try{
185                 
186                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
187                 
188                 //set current fasta to fastafile
189                 m->setFastaFile(fastafile);
190                 
191         map<string, string> variables; 
192                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile));
193                 string summaryFile = getOutputFileName("summary",variables);
194                                 
195                 int numSeqs = 0;
196                 
197                 vector<int> startPosition;
198                 vector<int> endPosition;
199                 vector<int> seqLength;
200                 vector<int> ambigBases;
201                 vector<int> longHomoPolymer;
202                 
203                 if (namefile != "") { nameMap = m->readNames(namefile); }
204         else if (countfile != "") {
205             CountTable ct;
206             ct.readTable(countfile, false, false);
207             nameMap = ct.getNameMap();
208         }
209                 
210                 if (m->control_pressed) { return 0; }
211                         
212 #ifdef USE_MPI  
213                                 int pid, numSeqsPerProcessor; 
214                                 int tag = 2001;
215                                 int startTag = 1; int endTag = 2; int lengthTag = 3; int baseTag = 4; int lhomoTag = 5;
216                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
217                                 vector<unsigned long long> MPIPos;
218                                 
219                                 MPI_Status status; 
220                                 MPI_Status statusOut;
221                                 MPI_File inMPI; 
222                                 MPI_File outMPI; 
223                                 MPI_Comm_size(MPI_COMM_WORLD, &processors);
224                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
225                                                         
226                                 char tempFileName[1024];
227                                 strcpy(tempFileName, fastafile.c_str());
228                                 
229                                 char sumFileName[1024];
230                                 strcpy(sumFileName, summaryFile.c_str());
231                 
232                                 MPI_File_open(MPI_COMM_WORLD, tempFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
233                                 MPI_File_open(MPI_COMM_WORLD, sumFileName, outMode, MPI_INFO_NULL, &outMPI);
234                                 
235                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI); return 0;  }
236                                 
237                                 if (pid == 0) { //you are the root process
238                                                 //print header
239                                                 string outputString = "seqname\tstart\tend\tnbases\tambigs\tpolymer\tnumSeqs\n";        
240                                                 int length = outputString.length();
241                                                 char* buf2 = new char[length];
242                                                 memcpy(buf2, outputString.c_str(), length);
243                                         
244                                                 MPI_File_write_shared(outMPI, buf2, length, MPI_CHAR, &statusOut);
245                                                 delete buf2;
246                                                 
247                                                 MPIPos = m->setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs
248                                         
249                                                 for(int i = 1; i < processors; i++) { 
250                                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
251                                                         MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
252                                                 }
253                                                 
254                                                 //figure out how many sequences you have to do
255                                                 numSeqsPerProcessor = numSeqs / processors;
256                                                 int startIndex =  pid * numSeqsPerProcessor;
257                                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
258                                                 
259                                                 //do your part
260                                                 MPICreateSummary(startIndex, numSeqsPerProcessor, startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, inMPI, outMPI, MPIPos);
261                                                 
262                                 }else { //i am the child process
263                         
264                                         MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
265                                         MPIPos.resize(numSeqs+1);
266                                         MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
267                                 
268                                         //figure out how many sequences you have to align
269                                         numSeqsPerProcessor = numSeqs / processors;
270                                         int startIndex =  pid * numSeqsPerProcessor;
271                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
272                                 
273                                         //do your part
274                                         MPICreateSummary(startIndex, numSeqsPerProcessor, startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, inMPI, outMPI, MPIPos);
275                                 }
276                                 
277                                 MPI_File_close(&inMPI);
278                                 MPI_File_close(&outMPI);
279                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
280                                 
281                                 if (pid == 0) {
282                                         //get the info from the child processes
283                                         for(int i = 1; i < processors; i++) { 
284                                                 int size;
285                                                 MPI_Recv(&size, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
286
287                                                 vector<int> temp; temp.resize(size+1);
288                                                 
289                                                 for(int j = 0; j < 5; j++) { 
290                                                 
291                                                         MPI_Recv(&temp[0], (size+1), MPI_INT, i, 2001, MPI_COMM_WORLD, &status); 
292                                                         int receiveTag = temp[temp.size()-1];  //child process added a int to the end to indicate what count this is for
293                                                         
294                                                         if (receiveTag == startTag) { 
295                                                                 for (int k = 0; k < size; k++) {                startPosition.push_back(temp[k]);       }
296                                                         }else if (receiveTag == endTag) { 
297                                                                 for (int k = 0; k < size; k++) {                endPosition.push_back(temp[k]); }
298                                                         }else if (receiveTag == lengthTag) { 
299                                                                 for (int k = 0; k < size; k++) {                seqLength.push_back(temp[k]);   }
300                                                         }else if (receiveTag == baseTag) { 
301                                                                 for (int k = 0; k < size; k++) {                ambigBases.push_back(temp[k]);  }
302                                                         }else if (receiveTag == lhomoTag) { 
303                                                                 for (int k = 0; k < size; k++) {                longHomoPolymer.push_back(temp[k]);     }
304                                                         }
305                                                 } 
306                                         }
307
308                                 }else{
309                                 
310                                         //send my counts
311                                         int size = startPosition.size();
312                                         MPI_Send(&size, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
313                                         
314                                         startPosition.push_back(startTag);
315                                         int ierr = MPI_Send(&(startPosition[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
316                                         endPosition.push_back(endTag);
317                                         ierr = MPI_Send (&(endPosition[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
318                                         seqLength.push_back(lengthTag);
319                                         ierr = MPI_Send(&(seqLength[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
320                                         ambigBases.push_back(baseTag);
321                                         ierr = MPI_Send(&(ambigBases[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
322                                         longHomoPolymer.push_back(lhomoTag);
323                                         ierr = MPI_Send(&(longHomoPolymer[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
324                                 }
325                                 
326                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
327 #else
328                         vector<unsigned long long> positions; 
329                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
330                                 positions = m->divideFile(fastafile, processors);
331                                 for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(new linePair(positions[i], positions[(i+1)]));  }
332                         #else
333                                 positions = m->setFilePosFasta(fastafile, numSeqs); 
334                 if (positions.size() < processors) { processors = positions.size(); }
335                 
336                                 //figure out how many sequences you have to process
337                                 int numSeqsPerProcessor = numSeqs / processors;
338                                 for (int i = 0; i < processors; i++) {
339                                         int startIndex =  i * numSeqsPerProcessor;
340                                         if(i == (processors - 1)){      numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;        }
341                                         lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor));
342                                 }
343                         #endif
344                         
345
346                         if(processors == 1){
347                                 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile, lines[0]);
348                         }else{
349                                 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile); 
350                         }
351                         
352                         if (m->control_pressed) {  return 0; }
353 #endif
354                         
355                 #ifdef USE_MPI
356                         if (pid == 0) { 
357                 #endif
358                 
359                 sort(startPosition.begin(), startPosition.end());
360                 sort(endPosition.begin(), endPosition.end());
361                 sort(seqLength.begin(), seqLength.end());
362                 sort(ambigBases.begin(), ambigBases.end());
363                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
364                 int size = startPosition.size();
365                 
366                 //find means
367                 unsigned long long meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer;
368                 meanStartPosition = 0; meanEndPosition = 0; meanSeqLength = 0; meanAmbigBases = 0; meanLongHomoPolymer = 0;
369                 for (int i = 0; i < size; i++) {
370                         meanStartPosition += startPosition[i];
371                         meanEndPosition += endPosition[i];
372                         meanSeqLength += seqLength[i];
373                         meanAmbigBases += ambigBases[i];
374                         meanLongHomoPolymer += longHomoPolymer[i];
375                 }
376                 
377         double meanstartPosition, meanendPosition, meanseqLength, meanambigBases, meanlongHomoPolymer;
378                 
379                 meanstartPosition = meanStartPosition / (double) size; meanendPosition = meanEndPosition /(double) size; meanlongHomoPolymer = meanLongHomoPolymer / (double) size; meanseqLength = meanSeqLength / (double) size; meanambigBases = meanAmbigBases /(double) size;
380                                 
381                 int ptile0_25   = int(size * 0.025);
382                 int ptile25             = int(size * 0.250);
383                 int ptile50             = int(size * 0.500);
384                 int ptile75             = int(size * 0.750);
385                 int ptile97_5   = int(size * 0.975);
386                 int ptile100    = size - 1;
387                 
388                 //to compensate for blank sequences that would result in startPosition and endPostion equalling -1
389                 if (startPosition[0] == -1) {  startPosition[0] = 0;    }
390                 if (endPosition[0] == -1)       {  endPosition[0] = 0;          }
391                 
392                 if (m->control_pressed) {  m->mothurRemove(summaryFile); return 0; }
393                 
394                 m->mothurOutEndLine();
395                 m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer\tNumSeqs"); m->mothurOutEndLine();
396                 m->mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0]) + "\t" + toString(1)); m->mothurOutEndLine();
397                 m->mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25]) + "\t" + toString(ptile0_25+1)); m->mothurOutEndLine();
398                 m->mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25]) + "\t" + toString(ptile25+1)); m->mothurOutEndLine();
399                 m->mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50]) + "\t" + toString(ptile50+1)); m->mothurOutEndLine();
400                 m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75]) + "\t" + toString(ptile75+1)); m->mothurOutEndLine();
401                 m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5]) + "\t" + toString(ptile97_5+1)); m->mothurOutEndLine();
402                 m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine();
403                 m->mothurOut("Mean:\t" + toString(meanstartPosition) + "\t" + toString(meanendPosition) + "\t" + toString(meanseqLength) + "\t" + toString(meanambigBases) + "\t" + toString(meanlongHomoPolymer)); m->mothurOutEndLine();
404
405                 if ((namefile == "") && (countfile == "")) {  m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); }
406                 else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); }
407                 
408                 if (m->control_pressed) {  m->mothurRemove(summaryFile); return 0; }
409                 
410                 m->mothurOutEndLine();
411                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
412                 m->mothurOut(summaryFile); m->mothurOutEndLine();       outputNames.push_back(summaryFile); outputTypes["summary"].push_back(summaryFile);
413                 m->mothurOutEndLine();
414                 
415                 #ifdef USE_MPI
416                         }
417                 #endif
418
419         //set fasta file as new current fastafile
420                 string current = "";
421                 itTypes = outputTypes.find("summary");
422                 if (itTypes != outputTypes.end()) {
423                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSummaryFile(current); }
424                 }
425         
426                 return 0;
427         }
428         catch(exception& e) {
429                 m->errorOut(e, "SeqSummaryCommand", "execute");
430                 exit(1);
431         }
432 }
433 /**************************************************************************************/
434 int SeqSummaryCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, string sumFile, linePair* filePos) {   
435         try {
436                 
437                 ofstream outSummary;
438                 m->openOutputFile(sumFile, outSummary);
439                 
440                 //print header if you are process 0
441                 if (filePos->start == 0) {
442                         outSummary << "seqname\tstart\tend\tnbases\tambigs\tpolymer\tnumSeqs" << endl;  
443                 }
444                                 
445                 ifstream in;
446                 m->openInputFile(filename, in);
447                                 
448                 in.seekg(filePos->start);
449
450                 bool done = false;
451                 int count = 0;
452         
453         
454                 while (!done) {
455                                 
456                         if (m->control_pressed) { in.close(); outSummary.close(); return 1; }
457             
458             if (m->debug) { m->mothurOut("[DEBUG]: count = " + toString(count) + "\n");  }
459             
460                         Sequence current(in); m->gobble(in);
461            
462                         if (current.getName() != "") {
463                                 
464                 if (m->debug) { m->mothurOut("[DEBUG]: " + current.getName() + '\t' + toString(current.getNumBases()) + "\n");  }
465                 
466                                 int num = 1;
467                                 if ((namefile != "") || (countfile != "")) {
468                                         //make sure this sequence is in the namefile, else error 
469                                         map<string, int>::iterator it = nameMap.find(current.getName());
470                                         
471                                         if (it == nameMap.end()) { m->mothurOut("[ERROR]: '" + current.getName() + "' is not in your name or count file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
472                                         else { num = it->second; }
473                                 }
474                                 
475                                 //for each sequence this sequence represents
476                                 for (int i = 0; i < num; i++) {
477                                         startPosition.push_back(current.getStartPos());
478                                         endPosition.push_back(current.getEndPos());
479                                         seqLength.push_back(current.getNumBases());
480                                         ambigBases.push_back(current.getAmbigBases());
481                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
482                                 }
483                                 count++;
484                                 outSummary << current.getName() << '\t';
485                                 outSummary << current.getStartPos() << '\t' << current.getEndPos() << '\t';
486                                 outSummary << current.getNumBases() << '\t' << current.getAmbigBases() << '\t';
487                                 outSummary << current.getLongHomoPolymer() << '\t' << num << endl;
488                 
489                 if (m->debug) { m->mothurOut("[DEBUG]: " + current.getName() + '\t' + toString(current.getNumBases()) + "\n");  }
490                         }
491                         
492                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
493                                 unsigned long long pos = in.tellg();
494                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
495                         #else
496                                 if (in.eof()) { break; }
497                         #endif
498                 }
499                                 
500                 in.close();
501                 
502                 return count;
503         }
504         catch(exception& e) {
505                 m->errorOut(e, "SeqSummaryCommand", "driverCreateSummary");
506                 exit(1);
507         }
508 }
509 #ifdef USE_MPI
510 /**************************************************************************************/
511 int SeqSummaryCommand::MPICreateSummary(int start, int num, vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long long>& MPIPos) {       
512         try {
513                 
514                 int pid;
515                 MPI_Status status; 
516                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
517
518                 for(int i=0;i<num;i++){
519                         
520                         if (m->control_pressed) { return 0; }
521                         
522                         //read next sequence
523                         int length = MPIPos[start+i+1] - MPIPos[start+i];
524         
525                         char* buf4 = new char[length];
526                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
527                         
528                         string tempBuf = buf4;
529                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
530                         istringstream iss (tempBuf,istringstream::in);
531                         delete buf4;
532
533                         Sequence current(iss);  
534
535                         if (current.getName() != "") {
536                                 
537                                 int num = 1;
538                                 if ((namefile != "") || (countfile != "")) {
539                                         //make sure this sequence is in the namefile, else error 
540                                         map<string, int>::iterator it = nameMap.find(current.getName());
541                                         
542                                         if (it == nameMap.end()) { cout << "[ERROR]: " << current.getName() << " is not in your name or count file, please correct." << endl; m->control_pressed = true; }
543                                         else { num = it->second; }
544                                 }
545                                 
546                                 //for each sequence this sequence represents
547                                 for (int j = 0; j < num; j++) {
548                                         startPosition.push_back(current.getStartPos());
549                                         endPosition.push_back(current.getEndPos());
550                                         seqLength.push_back(current.getNumBases());
551                                         ambigBases.push_back(current.getAmbigBases());
552                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
553                                 }
554                                 
555                                 string outputString = current.getName() + "\t" + toString(current.getStartPos()) + "\t" + toString(current.getEndPos()) + "\t";
556                                 outputString += toString(current.getNumBases()) + "\t" + toString(current.getAmbigBases()) + "\t" + toString(current.getLongHomoPolymer()) + "\t" + toString(num) + "\n";
557                                 
558                                 //output to file
559                                 length = outputString.length();
560                                 char* buf3 = new char[length];
561                                 memcpy(buf3, outputString.c_str(), length);
562                                         
563                                 MPI_File_write_shared(outMPI, buf3, length, MPI_CHAR, &status);
564                                 delete buf3;
565                         }       
566                 }
567                 
568                 return 0;
569         }
570         catch(exception& e) {
571                 m->errorOut(e, "SeqSummaryCommand", "MPICreateSummary");
572                 exit(1);
573         }
574 }
575 #endif
576 /**************************************************************************************************/
577 int SeqSummaryCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, string sumFile) {
578         try {
579                 int process = 1;
580                 int num = 0;
581                 processIDS.clear();
582                 
583 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
584                 
585                 //loop through and create all the processes you want
586                 while (process != processors) {
587                         int pid = fork();
588                         
589                         if (pid > 0) {
590                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
591                                 process++;
592                         }else if (pid == 0){
593                                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, sumFile + toString(getpid()) + ".temp", lines[process]);
594                                 
595                                 //pass numSeqs to parent
596                                 ofstream out;
597                                 string tempFile = fastafile + toString(getpid()) + ".num.temp";
598                                 m->openOutputFile(tempFile, out);
599                                 
600                                 out << num << endl;
601                                 out << startPosition.size() << endl;
602                                 for (int k = 0; k < startPosition.size(); k++)          {               out << startPosition[k] << '\t'; }  out << endl;
603                                 for (int k = 0; k < endPosition.size(); k++)            {               out << endPosition[k] << '\t'; }  out << endl;
604                                 for (int k = 0; k < seqLength.size(); k++)                      {               out << seqLength[k] << '\t'; }  out << endl;
605                                 for (int k = 0; k < ambigBases.size(); k++)                     {               out << ambigBases[k] << '\t'; }  out << endl;
606                                 for (int k = 0; k < longHomoPolymer.size(); k++)        {               out << longHomoPolymer[k] << '\t'; }  out << endl;
607                                 
608                                 out.close();
609                                 
610                                 exit(0);
611                         }else { 
612                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
613                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
614                                 exit(0);
615                         }
616                 }
617                 
618                 //do your part
619                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, sumFile, lines[0]);
620
621                 //force parent to wait until all the processes are done
622                 for (int i=0;i<processIDS.size();i++) { 
623                         int temp = processIDS[i];
624                         wait(&temp);
625                 }
626                 
627                 //parent reads in and combine Filter info
628                 for (int i = 0; i < processIDS.size(); i++) {
629                         string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
630                         ifstream in;
631                         m->openInputFile(tempFilename, in);
632                         
633                         int temp, tempNum;
634                         in >> tempNum; m->gobble(in); num += tempNum;
635                         in >> tempNum; m->gobble(in);
636                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; startPosition.push_back(temp);              }               m->gobble(in);
637                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; endPosition.push_back(temp);                }               m->gobble(in);
638                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; seqLength.push_back(temp);                  }               m->gobble(in);
639                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; ambigBases.push_back(temp);                 }               m->gobble(in);
640                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; longHomoPolymer.push_back(temp);    }               m->gobble(in);
641                                 
642                         in.close();
643                         m->mothurRemove(tempFilename);
644                         
645                         m->appendFiles((sumFile + toString(processIDS[i]) + ".temp"), sumFile);
646                         m->mothurRemove((sumFile + toString(processIDS[i]) + ".temp"));
647                 }
648                 
649 #else
650                 //////////////////////////////////////////////////////////////////////////////////////////////////////
651                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
652                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
653                 //Taking advantage of shared memory to allow both threads to add info to vectors.
654                 //////////////////////////////////////////////////////////////////////////////////////////////////////
655                 
656                 vector<seqSumData*> pDataArray; 
657                 DWORD   dwThreadIdArray[processors-1];
658                 HANDLE  hThreadArray[processors-1]; 
659         
660                 bool hasNameMap = false;
661         if ((namefile !="") || (countfile != "")) { hasNameMap = true; }
662         
663                 //Create processor worker threads.
664                 for( int i=0; i<processors-1; i++ ){
665             
666             string extension = "";
667             if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
668                         // Allocate memory for thread data.
669                         seqSumData* tempSum = new seqSumData(filename, (sumFile+extension), m, lines[i]->start, lines[i]->end, hasNameMap, nameMap);
670                         pDataArray.push_back(tempSum);
671                         
672                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
673                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
674                         hThreadArray[i] = CreateThread(NULL, 0, MySeqSumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
675                 }
676                 
677         //do your part
678                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, (sumFile+toString(processors-1)+".temp"), lines[processors-1]);
679         processIDS.push_back(processors-1);
680
681                 //Wait until all threads have terminated.
682                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
683                 
684                 //Close all thread handles and free memory allocations.
685                 for(int i=0; i < pDataArray.size(); i++){
686                         num += pDataArray[i]->count;
687             if (pDataArray[i]->count != pDataArray[i]->end) {
688                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
689             }
690             for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) {     startPosition.push_back(pDataArray[i]->startPosition[k]);       }
691                         for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) {   endPosition.push_back(pDataArray[i]->endPosition[k]);       }
692             for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]);       }
693             for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) {        ambigBases.push_back(pDataArray[i]->ambigBases[k]);       }
694             for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) {   longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]);       }
695                         CloseHandle(hThreadArray[i]);
696                         delete pDataArray[i];
697                 }
698     
699                 //append files
700                 for(int i=0;i<processIDS.size();i++){
701                         m->appendFiles((sumFile + toString(processIDS[i]) + ".temp"), sumFile);
702                         m->mothurRemove((sumFile + toString(processIDS[i]) + ".temp"));
703                 }
704 #endif          
705                 return num;
706         }
707         catch(exception& e) {
708                 m->errorOut(e, "SeqSummaryCommand", "createProcessesCreateSummary");
709                 exit(1);
710         }
711 }
712 /**********************************************************************************************************************/
713
714
715