]> git.donarmstrong.com Git - mothur.git/blobdiff - aligncommand.cpp
added MPI code, broke up chimera.seqs into 5 separated commands, added parse.sff...
[mothur.git] / aligncommand.cpp
index 257587fa048ac3b494fa4f0ed4481f2cd505e513..a4b3a79012aa5ac0d53c6ff78f541864984b42ec 100644 (file)
@@ -32,7 +32,7 @@ AlignCommand::AlignCommand(string option)  {
        try {
                
                abort = false;
-               
+       
                //allow user to run help
                if(option == "help") { help(); abort = true; }
                
@@ -95,23 +95,45 @@ AlignCommand::AlignCommand(string option)  {
                                                //if the user has not given a path then, add inputdir. else leave path alone.
                                                if (path == "") {       candidateFileNames[i] = inputDir + candidateFileNames[i];               }
                                        }
-
+       
                                        int ableToOpen;
                                        ifstream in;
+                                       
+                                       #ifdef USE_MPI  
+                                               int pid;
+                                               MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
+                                               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+                               
+                                               if (pid == 0) {
+                                       #endif
+
                                        ableToOpen = openInputFile(candidateFileNames[i], in);
+                                       in.close();
+                                       
+                                       #ifdef USE_MPI  
+                                                       for (int j = 1; j < processors; j++) {
+                                                               MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); 
+                                                       }
+                                               }else{
+                                                       MPI_Status status;
+                                                       MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+                                               }
+                                               
+                                       #endif
+
                                        if (ableToOpen == 1) { 
                                                m->mothurOut(candidateFileNames[i] + " will be disregarded."); m->mothurOutEndLine(); 
                                                //erase from file list
                                                candidateFileNames.erase(candidateFileNames.begin()+i);
                                                i--;
                                        }
-                                       in.close();
+                                       
                                }
                                
                                //make sure there is at least one valid file left
                                if (candidateFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
                        }
-                       
+               
                        //check for optional parameter and set defaults
                        // ...at some point should added some additional type checking...
                        string temp;
@@ -197,10 +219,10 @@ void AlignCommand::help(){
 int AlignCommand::execute(){
        try {
                if (abort == true) {    return 0;       }
-               
+
                templateDB = new AlignmentDB(templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);
                int longestBase = templateDB->getLongestBase();
-       
+               
                if(align == "gotoh")                    {       alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase);                 }
                else if(align == "needleman")   {       alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase);                                }
                else if(align == "blast")               {       alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch);            }
@@ -226,8 +248,111 @@ int AlignCommand::execute(){
                        int numFastaSeqs = 0;
                        for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
                        int start = time(NULL);
+               
+#ifdef USE_MPI 
+                               int pid, end, numSeqsPerProcessor; 
+                               int tag = 2001;
+                               vector<long> MPIPos;
+                               MPIWroteAccnos = false;
+                               
+                               MPI_Status status; 
+                               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+                               MPI_Comm_size(MPI_COMM_WORLD, &processors); 
+
+                               MPI_File inMPI;
+                               MPI_File outMPIAlign;
+                               MPI_File outMPIReport;
+                               MPI_File outMPIAccnos;
+                               
+                               int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; 
+                               int inMode=MPI_MODE_RDONLY; 
+                                                               
+                               char outAlignFilename[alignFileName.length()];
+                               strcpy(outAlignFilename, alignFileName.c_str());
+                               
+                               char outReportFilename[reportFileName.length()];
+                               strcpy(outReportFilename, reportFileName.c_str());
+                               
+                               char outAccnosFilename[accnosFileName.length()];
+                               strcpy(outAccnosFilename, accnosFileName.c_str());
+                               
+                               char inFileName[candidateFileNames[s].length()];
+                               strcpy(inFileName, candidateFileNames[s].c_str());
+
+                               MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+                               MPI_File_open(MPI_COMM_WORLD, outAlignFilename, outMode, MPI_INFO_NULL, &outMPIAlign);
+                               MPI_File_open(MPI_COMM_WORLD, outReportFilename, outMode, MPI_INFO_NULL, &outMPIReport);
+                               MPI_File_open(MPI_COMM_WORLD, outAccnosFilename, outMode, MPI_INFO_NULL, &outMPIAccnos);
+                               
+                               if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIAlign);  MPI_File_close(&outMPIReport);  MPI_File_close(&outMPIAccnos); return 0; }
+                               
+                               if (pid == 0) { //you are the root process 
+                                       
+                                       MPIPos = setFilePosFasta(candidateFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs
+                                       
+                                       //send file positions to all processes
+                                       MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+                                       MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos   
+                                       
+                                       //figure out how many sequences you have to align
+                                       numSeqsPerProcessor = numFastaSeqs / processors;
+                                       if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
+                                       int startIndex =  pid * numSeqsPerProcessor;
+                               
+                                       //align your part
+                                       driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIAlign, outMPIReport, outMPIAccnos, MPIPos);
+                                       
+                                       if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIAlign);  MPI_File_close(&outMPIReport);  MPI_File_close(&outMPIAccnos); return 0; }
+
+                                       for (int i = 1; i < processors; i++) {
+                                               bool tempResult;
+                                               MPI_Recv(&tempResult, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
+                                               if (tempResult != 0) { MPIWroteAccnos = true; }
+                                       }
+                               }else{ //you are a child process
+                                       MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+                                       MPIPos.resize(numFastaSeqs+1);
+                                       MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+                                       
+                                       //figure out how many sequences you have to align
+                                       numSeqsPerProcessor = numFastaSeqs / processors;
+                                       if(pid == (processors - 1)){    numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor;         }
+                                       int startIndex =  pid * numSeqsPerProcessor;
+                                       
+                                       //align your part
+                                       driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIAlign, outMPIReport, outMPIAccnos, MPIPos);
+                                       
+                                       if (m->control_pressed) { MPI_File_close(&inMPI);  MPI_File_close(&outMPIAlign);  MPI_File_close(&outMPIReport);  MPI_File_close(&outMPIAccnos); return 0; }
+
+                                       MPI_Send(&MPIWroteAccnos, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); 
+                               }
+                               
+                               //close files 
+                               MPI_File_close(&inMPI);
+                               MPI_File_close(&outMPIAlign);
+                               MPI_File_close(&outMPIReport);
+                               MPI_File_close(&outMPIAccnos);
+                               
+                               //delete accnos file if blank
+                               if (pid == 0) {
+                                       //delete accnos file if its blank else report to user
+                                       if (MPIWroteAccnos) { 
+                                               m->mothurOut("Some of you sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + ".");
+                                               if (!flip) {
+                                                       m->mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse compliment as well."); 
+                                               }else{  m->mothurOut(" If the reverse compliment proved to be better it was reported.");  }
+                                               m->mothurOutEndLine();
+                                       }else { 
+                                               //MPI_Info info;
+                                               //MPI_File_delete(outAccnosFilename, info);
+                                               hasAccnos = false;      
+                                               remove(accnosFileName.c_str()); 
+                                       }
+                               }
+                               
+#else
                        
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+       #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                        if(processors == 1){
                                ifstream inFASTA;
                                openInputFile(candidateFileNames[s], inFASTA);
@@ -327,7 +452,7 @@ int AlignCommand::execute(){
                                        return 0; 
                                }
                        }
-#else
+       #else
                        ifstream inFASTA;
                        openInputFile(candidateFileNames[s], inFASTA);
                        numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
@@ -354,12 +479,25 @@ int AlignCommand::execute(){
                                m->mothurOutEndLine();
                        }
                        
-#endif
-                       
+       #endif
+
+#endif         
+
+
+               #ifdef USE_MPI
+                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); 
+                                       
+                       if (pid == 0) { //only one process should output to screen
+               #endif
+
                        outputNames.push_back(alignFileName);
                        outputNames.push_back(reportFileName);
                        if (hasAccnos)  {       outputNames.push_back(accnosFileName);          }
-                                               
+                       
+               #ifdef USE_MPI
+                       }
+               #endif
+
                        m->mothurOut("It took " + toString(time(NULL) - start) + " secs to align " + toString(numFastaSeqs) + " sequences.");
                        m->mothurOutEndLine();
                        m->mothurOutEndLine();
@@ -395,12 +533,13 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName,
                openInputFile(filename, inFASTA);
 
                inFASTA.seekg(line->start);
-               
+       
                for(int i=0;i<line->numSeqs;i++){
                        
                        if (m->control_pressed) {  return 0; }
                        
                        Sequence* candidateSeq = new Sequence(inFASTA);  gobble(inFASTA);
+       
                        int origNumBases = candidateSeq->getNumBases();
                        string originalUnaligned = candidateSeq->getUnaligned();
                        int numBasesNeeded = origNumBases * threshold;
@@ -491,7 +630,153 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName,
                exit(1);
        }
 }
+//**********************************************************************************************************************
+#ifdef USE_MPI
+int AlignCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& alignFile, MPI_File& reportFile, MPI_File& accnosFile, vector<long>& MPIPos){
+       try {
+               string outputString = "";
+               MPI_Status statusReport; 
+               MPI_Status statusAlign; 
+               MPI_Status statusAccnos; 
+               MPI_Status status; 
+               int pid;
+               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+       
+               NastReport report;
+               
+               if (pid == 0) {
+                       outputString = report.getHeaders();
+                       int length = outputString.length();
+                       char buf[length];
+                       strcpy(buf, outputString.c_str()); 
+               
+                       MPI_File_write_shared(reportFile, buf, length, MPI_CHAR, &statusReport);
+               }
+               
+               for(int i=0;i<num;i++){
+               
+                       if (m->control_pressed) {  return 0; }
 
+                       //read next sequence
+                       int length = MPIPos[start+i+1] - MPIPos[start+i];
+                       char buf4[length];
+                       MPI_File_read_at(inMPI, MPIPos[start+i], buf4, length, MPI_CHAR, &status);
+                       
+                       string tempBuf = buf4;
+                       if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length);  }
+                       istringstream iss (tempBuf,istringstream::in);
+       
+                       Sequence* candidateSeq = new Sequence(iss);  
+                       int origNumBases = candidateSeq->getNumBases();
+                       string originalUnaligned = candidateSeq->getUnaligned();
+                       int numBasesNeeded = origNumBases * threshold;
+       
+                       if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file
+                               if (candidateSeq->getUnaligned().length() > alignment->getnRows()) {
+                                       alignment->resize(candidateSeq->getUnaligned().length()+1);
+                               }
+                                                               
+                               Sequence temp = templateDB->findClosestSequence(candidateSeq);
+                               Sequence* templateSeq = &temp;
+                               
+                               float searchScore = templateDB->getSearchScore();
+                                                               
+                               Nast* nast = new Nast(alignment, candidateSeq, templateSeq);
+                               Sequence* copy;
+                               
+                               Nast* nast2;
+                               bool needToDeleteCopy = false;  //this is needed in case you have you enter the ifs below
+                                                                                               //since nast does not make a copy of hte sequence passed, and it is used by the reporter below
+                                                                                               //you can't delete the copy sequence til after you report, but you may choose not to create it in the first place
+                                                                                               //so this bool tells you if you need to delete it
+                                                                                               
+                               //if there is a possibility that this sequence should be reversed
+                               if (candidateSeq->getNumBases() < numBasesNeeded) {
+                                       
+                                       string wasBetter = "";
+                                       //if the user wants you to try the reverse
+                                       if (flip) {
+                                               //get reverse compliment
+                                               copy = new Sequence(candidateSeq->getName(), originalUnaligned);
+                                               copy->reverseComplement();
+                                               
+                                               //rerun alignment
+                                               Sequence temp2 = templateDB->findClosestSequence(copy);
+                                               Sequence* templateSeq2 = &temp2;
+                                               
+                                               searchScore = templateDB->getSearchScore();
+                                               
+                                               nast2 = new Nast(alignment, copy, templateSeq2);
+                       
+                                               //check if any better
+                                               if (copy->getNumBases() > candidateSeq->getNumBases()) {
+                                                       candidateSeq->setAligned(copy->getAligned());  //use reverse compliments alignment since its better
+                                                       templateSeq = templateSeq2; 
+                                                       delete nast;
+                                                       nast = nast2;
+                                                       needToDeleteCopy = true;
+                                               }else{  
+                                                       wasBetter = "\treverse complement did NOT produce a better alignment, please check sequence.";
+                                                       delete nast2;
+                                                       delete copy;    
+                                               }
+                                       }
+                                       
+                                       //create accnos file with names
+                                       outputString = candidateSeq->getName() + wasBetter + "\n";
+                                       
+                                       //send results to parent
+                                       int length = outputString.length();
+                                       char buf[length];
+                                       strcpy(buf, outputString.c_str()); 
+                               
+                                       MPI_File_write_shared(accnosFile, buf, length, MPI_CHAR, &statusAccnos);
+                                       MPIWroteAccnos = true;
+                               }
+                               
+                               report.setCandidate(candidateSeq);
+                               report.setTemplate(templateSeq);
+                               report.setSearchParameters(search, searchScore);
+                               report.setAlignmentParameters(align, alignment);
+                               report.setNastParameters(*nast);
+       
+                               outputString =  ">" + candidateSeq->getName() + "\n" + candidateSeq->getAligned() + "\n";
+                               
+                               //send results to parent
+                               int length = outputString.length();
+                               char buf2[length];
+                               strcpy(buf2, outputString.c_str()); 
+                               
+                               MPI_File_write_shared(alignFile, buf2, length, MPI_CHAR, &statusAlign);
+                               
+                               outputString = report.getReport();
+                               
+                               //send results to parent
+                               length = outputString.length();
+                               char buf3[length];
+                               strcpy(buf3, outputString.c_str()); 
+                               
+                               MPI_File_write_shared(reportFile, buf3, length, MPI_CHAR, &statusReport);
+
+                               delete nast;
+                               if (needToDeleteCopy) {   delete copy;   }
+                       }
+                       delete candidateSeq;
+                       
+                       //report progress
+                       if((i+1) % 100 == 0){   cout << (toString(i+1)) << endl;                }
+               }
+               //report progress
+               if((num) % 100 != 0){   cout << (toString(num)) << endl;                }
+               
+               return 1;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "AlignCommand", "driverMPI");
+               exit(1);
+       }
+}
+#endif
 /**************************************************************************************************/
 
 int AlignCommand::createProcesses(string alignFileName, string reportFileName, string accnosFName, string filename) {
@@ -577,5 +862,4 @@ void AlignCommand::appendReportFiles(string temp, string filename) {
                exit(1);
        }
 }
-
 //**********************************************************************************************************************