]> git.donarmstrong.com Git - mothur.git/blob - seqsummarycommand.cpp
added method parameter to get.oturep. options method=abundance or distance. default...
[mothur.git] / seqsummarycommand.cpp
1 /*
2  *  seqcoordcommand.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 5/30/09.
6  *  Copyright 2009 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 #include "seqsummarycommand.h"
11 #include "counttable.h"
12
13 //**********************************************************************************************************************
14 vector<string> SeqSummaryCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","summary",false,true,true); parameters.push_back(pfasta);
17                 CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","",false,false,true); parameters.push_back(pname);
18         CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","",false,false,true); parameters.push_back(pcount);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
20                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
21                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
22                 
23                 vector<string> myArray;
24                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
25                 return myArray;
26         }
27         catch(exception& e) {
28                 m->errorOut(e, "SeqSummaryCommand", "setParameters");
29                 exit(1);
30         }
31 }
32 //**********************************************************************************************************************
33 string SeqSummaryCommand::getHelpString(){      
34         try {
35                 string helpString = "";
36                 helpString += "The summary.seqs command reads a fastafile and summarizes the sequences.\n";
37                 helpString += "The summary.seqs command parameters are fasta, name, count and processors, fasta is required, unless you have a valid current fasta file.\n";
38                 helpString += "The name parameter allows you to enter a name file associated with your fasta file. \n";
39         helpString += "The count parameter allows you to enter a count file associated with your fasta file. \n";
40                 helpString += "The summary.seqs command should be in the following format: \n";
41                 helpString += "summary.seqs(fasta=yourFastaFile, processors=2) \n";
42                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";       
43                 return helpString;
44         }
45         catch(exception& e) {
46                 m->errorOut(e, "SeqSummaryCommand", "getHelpString");
47                 exit(1);
48         }
49 }
50 //**********************************************************************************************************************
51 string SeqSummaryCommand::getOutputPattern(string type) {
52     try {
53         string pattern = "";
54         
55         if (type == "summary") {  pattern = "[filename],summary"; } 
56         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
57         
58         return pattern;
59     }
60     catch(exception& e) {
61         m->errorOut(e, "SeqSummaryCommand", "getOutputPattern");
62         exit(1);
63     }
64 }
65
66 //**********************************************************************************************************************
67 SeqSummaryCommand::SeqSummaryCommand(){ 
68         try {
69                 abort = true; calledHelp = true; 
70                 setParameters();
71                 vector<string> tempOutNames;
72                 outputTypes["summary"] = tempOutNames;
73         }
74         catch(exception& e) {
75                 m->errorOut(e, "SeqSummaryCommand", "SeqSummaryCommand");
76                 exit(1);
77         }
78 }
79 //***************************************************************************************************************
80
81 SeqSummaryCommand::SeqSummaryCommand(string option)  {
82         try {
83                 abort = false; calledHelp = false;   
84                 
85                 //allow user to run help
86                 if(option == "help") { help(); abort = true; calledHelp = true; }
87                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
88                 
89                 else {
90                         vector<string> myArray = setParameters();
91                         
92                         OptionParser parser(option);
93                         map<string,string> parameters = parser.getParameters();
94                         
95                         ValidParameters validParameter("summary.seqs");
96                         map<string,string>::iterator it;
97                         
98                         //check to make sure all parameters are valid for command
99                         for (it = parameters.begin(); it != parameters.end(); it++) { 
100                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
101                         }
102                         
103                         //if the user changes the input directory command factory will send this info to us in the output parameter 
104                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
105                         if (inputDir == "not found"){   inputDir = "";          }
106                         else {
107                                 string path;
108                                 it = parameters.find("fasta");
109                                 //user has given a template file
110                                 if(it != parameters.end()){ 
111                                         path = m->hasPath(it->second);
112                                         //if the user has not given a path then, add inputdir. else leave path alone.
113                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
114                                 }
115                                 
116                                 it = parameters.find("name");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
122                                 }
123                 
124                 it = parameters.find("count");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
130                                 }
131                         }
132                         
133                         //initialize outputTypes
134                         vector<string> tempOutNames;
135                         outputTypes["summary"] = tempOutNames;
136                         
137                         //check for required parameters
138                         fastafile = validParameter.validFile(parameters, "fasta", true);
139                         if (fastafile == "not open") { abort = true; }
140                         else if (fastafile == "not found") {                            
141                                 fastafile = m->getFastaFile(); 
142                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
143                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
144                         }else { m->setFastaFile(fastafile); }   
145                         
146                         namefile = validParameter.validFile(parameters, "name", true);
147                         if (namefile == "not open") { namefile = ""; abort = true; }
148                         else if (namefile == "not found") { namefile = "";  }   
149                         else { m->setNameFile(namefile); }
150             
151             countfile = validParameter.validFile(parameters, "count", true);
152                         if (countfile == "not open") { abort = true; countfile = ""; }  
153                         else if (countfile == "not found") { countfile = ""; }
154                         else { m->setCountTableFile(countfile); }
155                         
156             if ((countfile != "") && (namefile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; }
157                         
158                         //if the user changes the output directory command factory will send this info to us in the output parameter 
159                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
160                                 outputDir = ""; 
161                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
162                         }
163                         
164                         string temp = validParameter.validFile(parameters, "processors", false);        if (temp == "not found"){       temp = m->getProcessors();      }
165                         m->setProcessors(temp);
166                         m->mothurConvert(temp, processors);
167                         
168             if (countfile == "") {
169                 if (namefile == "") {
170                     vector<string> files; files.push_back(fastafile);
171                     parser.getNameFile(files);
172                 }
173             }
174                 }
175         }
176         catch(exception& e) {
177                 m->errorOut(e, "SeqSummaryCommand", "SeqSummaryCommand");
178                 exit(1);
179         }
180 }
181 //***************************************************************************************************************
182
183 int SeqSummaryCommand::execute(){
184         try{
185                 
186                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
187                 
188                 //set current fasta to fastafile
189                 m->setFastaFile(fastafile);
190                 
191         map<string, string> variables; 
192                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastafile));
193                 string summaryFile = getOutputFileName("summary",variables);
194                                 
195                 int numSeqs = 0;
196                 
197                 vector<int> startPosition;
198                 vector<int> endPosition;
199                 vector<int> seqLength;
200                 vector<int> ambigBases;
201                 vector<int> longHomoPolymer;
202                 
203                 if (namefile != "") { nameMap = m->readNames(namefile); }
204         else if (countfile != "") {
205             CountTable ct;
206             ct.readTable(countfile);
207             nameMap = ct.getNameMap();
208         }
209                 
210                 if (m->control_pressed) { return 0; }
211                         
212 #ifdef USE_MPI  
213                                 int pid, numSeqsPerProcessor; 
214                                 int tag = 2001;
215                                 int startTag = 1; int endTag = 2; int lengthTag = 3; int baseTag = 4; int lhomoTag = 5;
216                                 int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
217                                 vector<unsigned long long> MPIPos;
218                                 
219                                 MPI_Status status; 
220                                 MPI_Status statusOut;
221                                 MPI_File inMPI; 
222                                 MPI_File outMPI; 
223                                 MPI_Comm_size(MPI_COMM_WORLD, &processors);
224                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
225                                                         
226                                 char tempFileName[1024];
227                                 strcpy(tempFileName, fastafile.c_str());
228                                 
229                                 char sumFileName[1024];
230                                 strcpy(sumFileName, summaryFile.c_str());
231                 
232                                 MPI_File_open(MPI_COMM_WORLD, tempFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
233                                 MPI_File_open(MPI_COMM_WORLD, sumFileName, outMode, MPI_INFO_NULL, &outMPI);
234                                 
235                                 if (m->control_pressed) {  MPI_File_close(&inMPI);  MPI_File_close(&outMPI); return 0;  }
236                                 
237                                 if (pid == 0) { //you are the root process
238                                                 //print header
239                                                 string outputString = "seqname\tstart\tend\tnbases\tambigs\tpolymer\tnumSeqs\n";        
240                                                 int length = outputString.length();
241                                                 char* buf2 = new char[length];
242                                                 memcpy(buf2, outputString.c_str(), length);
243                                         
244                                                 MPI_File_write_shared(outMPI, buf2, length, MPI_CHAR, &statusOut);
245                                                 delete buf2;
246                                                 
247                                                 MPIPos = m->setFilePosFasta(fastafile, numSeqs); //fills MPIPos, returns numSeqs
248                                         
249                                                 for(int i = 1; i < processors; i++) { 
250                                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
251                                                         MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
252                                                 }
253                                                 
254                                                 //figure out how many sequences you have to do
255                                                 numSeqsPerProcessor = numSeqs / processors;
256                                                 int startIndex =  pid * numSeqsPerProcessor;
257                                                 if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
258                                                 
259                                                 //do your part
260                                                 MPICreateSummary(startIndex, numSeqsPerProcessor, startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, inMPI, outMPI, MPIPos);
261                                                 
262                                 }else { //i am the child process
263                         
264                                         MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
265                                         MPIPos.resize(numSeqs+1);
266                                         MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
267                                 
268                                         //figure out how many sequences you have to align
269                                         numSeqsPerProcessor = numSeqs / processors;
270                                         int startIndex =  pid * numSeqsPerProcessor;
271                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
272                                 
273                                         //do your part
274                                         MPICreateSummary(startIndex, numSeqsPerProcessor, startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, inMPI, outMPI, MPIPos);
275                                 }
276                                 
277                                 MPI_File_close(&inMPI);
278                                 MPI_File_close(&outMPI);
279                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
280                                 
281                                 if (pid == 0) {
282                                         //get the info from the child processes
283                                         for(int i = 1; i < processors; i++) { 
284                                                 int size;
285                                                 MPI_Recv(&size, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
286
287                                                 vector<int> temp; temp.resize(size+1);
288                                                 
289                                                 for(int j = 0; j < 5; j++) { 
290                                                 
291                                                         MPI_Recv(&temp[0], (size+1), MPI_INT, i, 2001, MPI_COMM_WORLD, &status); 
292                                                         int receiveTag = temp[temp.size()-1];  //child process added a int to the end to indicate what count this is for
293                                                         
294                                                         if (receiveTag == startTag) { 
295                                                                 for (int k = 0; k < size; k++) {                startPosition.push_back(temp[k]);       }
296                                                         }else if (receiveTag == endTag) { 
297                                                                 for (int k = 0; k < size; k++) {                endPosition.push_back(temp[k]); }
298                                                         }else if (receiveTag == lengthTag) { 
299                                                                 for (int k = 0; k < size; k++) {                seqLength.push_back(temp[k]);   }
300                                                         }else if (receiveTag == baseTag) { 
301                                                                 for (int k = 0; k < size; k++) {                ambigBases.push_back(temp[k]);  }
302                                                         }else if (receiveTag == lhomoTag) { 
303                                                                 for (int k = 0; k < size; k++) {                longHomoPolymer.push_back(temp[k]);     }
304                                                         }
305                                                 } 
306                                         }
307
308                                 }else{
309                                 
310                                         //send my counts
311                                         int size = startPosition.size();
312                                         MPI_Send(&size, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
313                                         
314                                         startPosition.push_back(startTag);
315                                         int ierr = MPI_Send(&(startPosition[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
316                                         endPosition.push_back(endTag);
317                                         ierr = MPI_Send (&(endPosition[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
318                                         seqLength.push_back(lengthTag);
319                                         ierr = MPI_Send(&(seqLength[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
320                                         ambigBases.push_back(baseTag);
321                                         ierr = MPI_Send(&(ambigBases[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
322                                         longHomoPolymer.push_back(lhomoTag);
323                                         ierr = MPI_Send(&(longHomoPolymer[0]), (size+1), MPI_INT, 0, 2001, MPI_COMM_WORLD);
324                                 }
325                                 
326                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
327 #else
328                         vector<unsigned long long> positions; 
329                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
330                                 positions = m->divideFile(fastafile, processors);
331                                 for (int i = 0; i < (positions.size()-1); i++) {        lines.push_back(new linePair(positions[i], positions[(i+1)]));  }
332                         #else
333                                 positions = m->setFilePosFasta(fastafile, numSeqs); 
334                 if (positions.size() < processors) { processors = positions.size(); }
335                 
336                                 //figure out how many sequences you have to process
337                                 int numSeqsPerProcessor = numSeqs / processors;
338                                 for (int i = 0; i < processors; i++) {
339                                         int startIndex =  i * numSeqsPerProcessor;
340                                         if(i == (processors - 1)){      numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;        }
341                                         lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor));
342                                 }
343                         #endif
344                         
345
346                         if(processors == 1){
347                                 numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile, lines[0]);
348                         }else{
349                                 numSeqs = createProcessesCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, summaryFile); 
350                         }
351                         
352                         if (m->control_pressed) {  return 0; }
353 #endif
354                         
355                 #ifdef USE_MPI
356                         if (pid == 0) { 
357                 #endif
358                 
359                 sort(startPosition.begin(), startPosition.end());
360                 sort(endPosition.begin(), endPosition.end());
361                 sort(seqLength.begin(), seqLength.end());
362                 sort(ambigBases.begin(), ambigBases.end());
363                 sort(longHomoPolymer.begin(), longHomoPolymer.end());
364                 int size = startPosition.size();
365                 
366                 //find means
367                 double meanStartPosition, meanEndPosition, meanSeqLength, meanAmbigBases, meanLongHomoPolymer;
368                 meanStartPosition = 0; meanEndPosition = 0; meanSeqLength = 0; meanAmbigBases = 0; meanLongHomoPolymer = 0;
369                 for (int i = 0; i < size; i++) {
370                         meanStartPosition += startPosition[i];
371                         meanEndPosition += endPosition[i];
372                         meanSeqLength += seqLength[i];
373                         meanAmbigBases += ambigBases[i];
374                         meanLongHomoPolymer += longHomoPolymer[i];
375                 }
376                 
377                 //this is an int divide so the remainder is lost
378                 meanStartPosition /= (float) size; meanEndPosition /= (float) size; meanLongHomoPolymer /= (float) size; meanSeqLength /= (float) size; meanAmbigBases /= (float) size;
379                                 
380                 int ptile0_25   = int(size * 0.025);
381                 int ptile25             = int(size * 0.250);
382                 int ptile50             = int(size * 0.500);
383                 int ptile75             = int(size * 0.750);
384                 int ptile97_5   = int(size * 0.975);
385                 int ptile100    = size - 1;
386                 
387                 //to compensate for blank sequences that would result in startPosition and endPostion equalling -1
388                 if (startPosition[0] == -1) {  startPosition[0] = 0;    }
389                 if (endPosition[0] == -1)       {  endPosition[0] = 0;          }
390                 
391                 if (m->control_pressed) {  m->mothurRemove(summaryFile); return 0; }
392                 
393                 m->mothurOutEndLine();
394                 m->mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer\tNumSeqs"); m->mothurOutEndLine();
395                 m->mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0]) + "\t" + toString(1)); m->mothurOutEndLine();
396                 m->mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25]) + "\t" + toString(ptile0_25+1)); m->mothurOutEndLine();
397                 m->mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25]) + "\t" + toString(ptile25+1)); m->mothurOutEndLine();
398                 m->mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50]) + "\t" + toString(ptile50+1)); m->mothurOutEndLine();
399                 m->mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75]) + "\t" + toString(ptile75+1)); m->mothurOutEndLine();
400                 m->mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5]) + "\t" + toString(ptile97_5+1)); m->mothurOutEndLine();
401                 m->mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100]) + "\t" + toString(ptile100+1)); m->mothurOutEndLine();
402                 m->mothurOut("Mean:\t" + toString(meanStartPosition) + "\t" + toString(meanEndPosition) + "\t" + toString(meanSeqLength) + "\t" + toString(meanAmbigBases) + "\t" + toString(meanLongHomoPolymer)); m->mothurOutEndLine();
403
404                 if ((namefile == "") && (countfile == "")) {  m->mothurOut("# of Seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); }
405                 else { m->mothurOut("# of unique seqs:\t" + toString(numSeqs)); m->mothurOutEndLine(); m->mothurOut("total # of seqs:\t" + toString(startPosition.size())); m->mothurOutEndLine(); }
406                 
407                 if (m->control_pressed) {  m->mothurRemove(summaryFile); return 0; }
408                 
409                 m->mothurOutEndLine();
410                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
411                 m->mothurOut(summaryFile); m->mothurOutEndLine();       outputNames.push_back(summaryFile); outputTypes["summary"].push_back(summaryFile);
412                 m->mothurOutEndLine();
413                 
414                 #ifdef USE_MPI
415                         }
416                 #endif
417
418         //set fasta file as new current fastafile
419                 string current = "";
420                 itTypes = outputTypes.find("summary");
421                 if (itTypes != outputTypes.end()) {
422                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSummaryFile(current); }
423                 }
424         
425                 return 0;
426         }
427         catch(exception& e) {
428                 m->errorOut(e, "SeqSummaryCommand", "execute");
429                 exit(1);
430         }
431 }
432 /**************************************************************************************/
433 int SeqSummaryCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, string sumFile, linePair* filePos) {   
434         try {
435                 
436                 ofstream outSummary;
437                 m->openOutputFile(sumFile, outSummary);
438                 
439                 //print header if you are process 0
440                 if (filePos->start == 0) {
441                         outSummary << "seqname\tstart\tend\tnbases\tambigs\tpolymer\tnumSeqs" << endl;  
442                 }
443                                 
444                 ifstream in;
445                 m->openInputFile(filename, in);
446                                 
447                 in.seekg(filePos->start);
448
449                 bool done = false;
450                 int count = 0;
451        
452                 while (!done) {
453                                 
454                         if (m->control_pressed) { in.close(); outSummary.close(); return 1; }
455                                         
456                         Sequence current(in); m->gobble(in);
457            
458                         if (current.getName() != "") {
459                                 
460                                 int num = 1;
461                                 if ((namefile != "") || (countfile != "")) {
462                                         //make sure this sequence is in the namefile, else error 
463                                         map<string, int>::iterator it = nameMap.find(current.getName());
464                                         
465                                         if (it == nameMap.end()) { m->mothurOut("[ERROR]: '" + current.getName() + "' is not in your name or count file, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
466                                         else { num = it->second; }
467                                 }
468                                 
469                                 //for each sequence this sequence represents
470                                 for (int i = 0; i < num; i++) {
471                                         startPosition.push_back(current.getStartPos());
472                                         endPosition.push_back(current.getEndPos());
473                                         seqLength.push_back(current.getNumBases());
474                                         ambigBases.push_back(current.getAmbigBases());
475                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
476                                 }
477                                 
478                                 count++;
479                                 outSummary << current.getName() << '\t';
480                                 outSummary << current.getStartPos() << '\t' << current.getEndPos() << '\t';
481                                 outSummary << current.getNumBases() << '\t' << current.getAmbigBases() << '\t';
482                                 outSummary << current.getLongHomoPolymer() << '\t' << num << endl;
483                         }
484                         
485                         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
486                                 unsigned long long pos = in.tellg();
487                                 if ((pos == -1) || (pos >= filePos->end)) { break; }
488                         #else
489                                 if (in.eof()) { break; }
490                         #endif
491                         
492                         //report progress
493                         //if((count) % 100 == 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
494                 }
495                 //report progress
496                 //if((count) % 100 != 0){       m->mothurOut(toString(count)); m->mothurOutEndLine();           }
497                 
498                 in.close();
499                 
500                 return count;
501         }
502         catch(exception& e) {
503                 m->errorOut(e, "SeqSummaryCommand", "driverCreateSummary");
504                 exit(1);
505         }
506 }
507 #ifdef USE_MPI
508 /**************************************************************************************/
509 int SeqSummaryCommand::MPICreateSummary(int start, int num, vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, MPI_File& inMPI, MPI_File& outMPI, vector<unsigned long long>& MPIPos) {       
510         try {
511                 
512                 int pid;
513                 MPI_Status status; 
514                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
515
516                 for(int i=0;i<num;i++){
517                         
518                         if (m->control_pressed) { return 0; }
519                         
520                         //read next sequence
521                         int length = MPIPos[start+i+1] - MPIPos[start+i];
522         
523                         char* buf4 = new char[length];
524                         MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
525                         
526                         string tempBuf = buf4;
527                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
528                         istringstream iss (tempBuf,istringstream::in);
529                         delete buf4;
530
531                         Sequence current(iss);  
532
533                         if (current.getName() != "") {
534                                 
535                                 int num = 1;
536                                 if ((namefile != "") || (countfile != "")) {
537                                         //make sure this sequence is in the namefile, else error 
538                                         map<string, int>::iterator it = nameMap.find(current.getName());
539                                         
540                                         if (it == nameMap.end()) { cout << "[ERROR]: " << current.getName() << " is not in your name or count file, please correct." << endl; m->control_pressed = true; }
541                                         else { num = it->second; }
542                                 }
543                                 
544                                 //for each sequence this sequence represents
545                                 for (int i = 0; i < num; i++) {
546                                         startPosition.push_back(current.getStartPos());
547                                         endPosition.push_back(current.getEndPos());
548                                         seqLength.push_back(current.getNumBases());
549                                         ambigBases.push_back(current.getAmbigBases());
550                                         longHomoPolymer.push_back(current.getLongHomoPolymer());
551                                 }
552                                 
553                                 string outputString = current.getName() + "\t" + toString(current.getStartPos()) + "\t" + toString(current.getEndPos()) + "\t";
554                                 outputString += toString(current.getNumBases()) + "\t" + toString(current.getAmbigBases()) + "\t" + toString(current.getLongHomoPolymer()) + "\t" + toString(num) + "\n";
555                                 
556                                 //output to file
557                                 length = outputString.length();
558                                 char* buf3 = new char[length];
559                                 memcpy(buf3, outputString.c_str(), length);
560                                         
561                                 MPI_File_write_shared(outMPI, buf3, length, MPI_CHAR, &status);
562                                 delete buf3;
563                         }       
564                 }
565                 
566                 return 0;
567         }
568         catch(exception& e) {
569                 m->errorOut(e, "SeqSummaryCommand", "MPICreateSummary");
570                 exit(1);
571         }
572 }
573 #endif
574 /**************************************************************************************************/
575 int SeqSummaryCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, string sumFile) {
576         try {
577                 int process = 1;
578                 int num = 0;
579                 processIDS.clear();
580                 
581 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
582                 
583                 //loop through and create all the processes you want
584                 while (process != processors) {
585                         int pid = fork();
586                         
587                         if (pid > 0) {
588                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
589                                 process++;
590                         }else if (pid == 0){
591                                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, sumFile + toString(getpid()) + ".temp", lines[process]);
592                                 
593                                 //pass numSeqs to parent
594                                 ofstream out;
595                                 string tempFile = fastafile + toString(getpid()) + ".num.temp";
596                                 m->openOutputFile(tempFile, out);
597                                 
598                                 out << num << endl;
599                                 out << startPosition.size() << endl;
600                                 for (int k = 0; k < startPosition.size(); k++)          {               out << startPosition[k] << '\t'; }  out << endl;
601                                 for (int k = 0; k < endPosition.size(); k++)            {               out << endPosition[k] << '\t'; }  out << endl;
602                                 for (int k = 0; k < seqLength.size(); k++)                      {               out << seqLength[k] << '\t'; }  out << endl;
603                                 for (int k = 0; k < ambigBases.size(); k++)                     {               out << ambigBases[k] << '\t'; }  out << endl;
604                                 for (int k = 0; k < longHomoPolymer.size(); k++)        {               out << longHomoPolymer[k] << '\t'; }  out << endl;
605                                 
606                                 out.close();
607                                 
608                                 exit(0);
609                         }else { 
610                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
611                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
612                                 exit(0);
613                         }
614                 }
615                 
616                 //do your part
617                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, sumFile, lines[0]);
618
619                 //force parent to wait until all the processes are done
620                 for (int i=0;i<processIDS.size();i++) { 
621                         int temp = processIDS[i];
622                         wait(&temp);
623                 }
624                 
625                 //parent reads in and combine Filter info
626                 for (int i = 0; i < processIDS.size(); i++) {
627                         string tempFilename = fastafile + toString(processIDS[i]) + ".num.temp";
628                         ifstream in;
629                         m->openInputFile(tempFilename, in);
630                         
631                         int temp, tempNum;
632                         in >> tempNum; m->gobble(in); num += tempNum;
633                         in >> tempNum; m->gobble(in);
634                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; startPosition.push_back(temp);              }               m->gobble(in);
635                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; endPosition.push_back(temp);                }               m->gobble(in);
636                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; seqLength.push_back(temp);                  }               m->gobble(in);
637                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; ambigBases.push_back(temp);                 }               m->gobble(in);
638                         for (int k = 0; k < tempNum; k++)                       {               in >> temp; longHomoPolymer.push_back(temp);    }               m->gobble(in);
639                                 
640                         in.close();
641                         m->mothurRemove(tempFilename);
642                         
643                         m->appendFiles((sumFile + toString(processIDS[i]) + ".temp"), sumFile);
644                         m->mothurRemove((sumFile + toString(processIDS[i]) + ".temp"));
645                 }
646                 
647 #else
648                 //////////////////////////////////////////////////////////////////////////////////////////////////////
649                 //Windows version shared memory, so be careful when passing variables through the seqSumData struct. 
650                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
651                 //Taking advantage of shared memory to allow both threads to add info to vectors.
652                 //////////////////////////////////////////////////////////////////////////////////////////////////////
653                 
654                 vector<seqSumData*> pDataArray; 
655                 DWORD   dwThreadIdArray[processors-1];
656                 HANDLE  hThreadArray[processors-1]; 
657         
658                 bool hasNameMap = false;
659         if ((namefile !="") || (countfile != "")) { hasNameMap = true; }
660         
661                 //Create processor worker threads.
662                 for( int i=0; i<processors-1; i++ ){
663             
664             string extension = "";
665             if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
666                         // Allocate memory for thread data.
667                         seqSumData* tempSum = new seqSumData(filename, (sumFile+extension), m, lines[i]->start, lines[i]->end, hasNameMap, nameMap);
668                         pDataArray.push_back(tempSum);
669                         
670                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
671                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
672                         hThreadArray[i] = CreateThread(NULL, 0, MySeqSumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
673                 }
674                 
675         //do your part
676                 num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, (sumFile+toString(processors-1)+".temp"), lines[processors-1]);
677         processIDS.push_back(processors-1);
678
679                 //Wait until all threads have terminated.
680                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
681                 
682                 //Close all thread handles and free memory allocations.
683                 for(int i=0; i < pDataArray.size(); i++){
684                         num += pDataArray[i]->count;
685             if (pDataArray[i]->count != pDataArray[i]->end) {
686                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; 
687             }
688             for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) {     startPosition.push_back(pDataArray[i]->startPosition[k]);       }
689                         for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) {   endPosition.push_back(pDataArray[i]->endPosition[k]);       }
690             for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]);       }
691             for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) {        ambigBases.push_back(pDataArray[i]->ambigBases[k]);       }
692             for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) {   longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]);       }
693                         CloseHandle(hThreadArray[i]);
694                         delete pDataArray[i];
695                 }
696     
697                 //append files
698                 for(int i=0;i<processIDS.size();i++){
699                         m->appendFiles((sumFile + toString(processIDS[i]) + ".temp"), sumFile);
700                         m->mothurRemove((sumFile + toString(processIDS[i]) + ".temp"));
701                 }
702 #endif          
703                 return num;
704         }
705         catch(exception& e) {
706                 m->errorOut(e, "SeqSummaryCommand", "createProcessesCreateSummary");
707                 exit(1);
708         }
709 }
710 /**********************************************************************************************************************/
711
712
713