]> git.donarmstrong.com Git - mothur.git/commitdiff
changed file divide for trim.seqs
authorwestcott <westcott>
Fri, 20 Aug 2010 12:40:46 +0000 (12:40 +0000)
committerwestcott <westcott>
Fri, 20 Aug 2010 12:40:46 +0000 (12:40 +0000)
qualityscores.cpp
trimseqscommand.cpp
trimseqscommand.h

index 29c81e295c883bc42d48830b59243f8961a273fb..fd9459c0d49d005437464463e421bdd61c7db3d7 100644 (file)
@@ -34,12 +34,17 @@ QualityScores::QualityScores(ifstream& qFile, int l){
                seqLength = l;
                int score;
                
-               string line;
-               getline(qFile, line); gobble(qFile);
-               istringstream nameStream(line);
+               //string line;
+               //getline(qFile, line); 
+               //istringstream nameStream(line);
        
-               nameStream >> seqName;
-               seqName = seqName.substr(1); 
+               qFile >> seqName; 
+               while (!qFile.eof())    {       char c = qFile.get(); if (c == 10 || c == 13 || c == -1){       break;  }       } // get rest of line 
+               gobble(qFile);
+               if (seqName == "") { m->mothurOut("Error reading quality file, name blank at position, " + toString(qFile.tellg())); m->mothurOutEndLine(); }
+               else {
+                       seqName = seqName.substr(1); 
+               }
 
                //getline(qFile, line);
                //istringstream qualStream(line);
index b430624838594e4185233f7e946733a94469303b..4b2ee4eee99d3d38c2e5cd4f77ad2c0b90af4b81 100644 (file)
@@ -235,21 +235,22 @@ int TrimSeqsCommand::execute(){
                        outputNames.push_back(groupFile);
                        getOligos(fastaFileNames, qualFileNames);
                }
-               
-               if(qFileName != "")     {       setLines(qFileName, qLines);    }
 
+               vector<unsigned long int> fastaFilePos;
+               vector<unsigned long int> qFilePos;
+               
+               setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
+               
+               for (int i = 0; i < (fastaFilePos.size()-1); i++) {
+                       lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
+                       if (qFileName != "") {  qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)]));  }
+               }       
+               if(qFileName == "")     {       qLines = lines; } //files with duds
                
                #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                                if(processors == 1){
-                                       ifstream inFASTA;
-                                       int numSeqs;
-                                       openInputFile(fastaFile, inFASTA);
-                                       getNumSeqs(inFASTA, numSeqs);
-                                       inFASTA.close();
-                                       
-                                       lines.push_back(new linePair(0, numSeqs));
-                                       
-                                       driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], lines[0]);
+                                                                               
+                                       driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]);
                                        
                                        for (int j = 0; j < fastaFileNames.size(); j++) {
                                                rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str());
@@ -261,9 +262,6 @@ int TrimSeqsCommand::execute(){
                                        }                                               
 
                                }else{
-                                       setLines(fastaFile, lines);     
-                                       if(qFileName == "")     {       qLines = lines; }       
-                                                               
                                        createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames); 
                                        
                                        rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str());
@@ -317,15 +315,7 @@ int TrimSeqsCommand::execute(){
                                
                                if (m->control_pressed) {  return 0; }
                #else
-                               ifstream inFASTA;
-                               int numSeqs;
-                               openInputFile(fastaFile, inFASTA);
-                               getNumSeqs(inFASTA, numSeqs);
-                               inFASTA.close();
-                               
-                               lines.push_back(new linePair(0, numSeqs));
-                               
-                               driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], lines[0]);
+                               driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qlines[0]);
                                
                                for (int j = 0; j < fastaFileNames.size(); j++) {
                                        rename((fastaFileNames[j] + toString(j) + ".temp").c_str(), fastaFileNames[j].c_str());
@@ -459,14 +449,15 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                
                ifstream inFASTA;
                openInputFile(filename, inFASTA);
+               inFASTA.seekg(line->start);
                
                ifstream qFile;
-               if(qFileName != "")     {       openInputFile(qFileName, qFile);        }
+               if(qFileName != "")     {       openInputFile(qFileName, qFile);        qFile.seekg(qline->start);  }
                
-               qFile.seekg(qline->start);
-               inFASTA.seekg(line->start);
-               
-               for(int i=0;i<line->num;i++){
+               bool done = false;
+               int count = 0;
+       
+               while (!done) {
                                
                        if (m->control_pressed) { 
                                inFASTA.close(); outFASTA.close(); scrapFASTA.close();
@@ -485,10 +476,10 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                        
                        int success = 1;
                        
-                       Sequence currSeq(inFASTA);
+                       Sequence currSeq(inFASTA); gobble(inFASTA);
                        QualityScores currQual;
                        if(qFileName != ""){
-                               currQual = QualityScores(qFile, currSeq.getNumBases());
+                               currQual = QualityScores(qFile, currSeq.getNumBases());  gobble(qFile);
                        }
                        
                        string origSeq = currSeq.getUnaligned();
@@ -576,10 +567,19 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                        currSeq.printSequence(scrapFASTA);
                                        currQual.printQScores(scrapQual);
                                }
+                               count++;
                        }
-                       gobble(inFASTA);
-                       gobble(qFile);
+                       
+                       unsigned long int pos = inFASTA.tellg();
+                       if ((pos == -1) || (pos >= line->end)) { break; }
+                       
+                       //report progress
+                       if((count) % 1000 == 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
+                       
                }
+               //report progress
+               if((count) % 1000 != 0){        m->mothurOut(toString(count)); m->mothurOutEndLine();           }
+
                
                inFASTA.close();
                outFASTA.close();
@@ -599,7 +599,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                        }               
                }                       
                
-               return 0;
+               return count;
        }
        catch(exception& e) {
                m->errorOut(e, "TrimSeqsCommand", "driverCreateTrim");
@@ -646,33 +646,70 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
 
 /**************************************************************************************************/
 
-int TrimSeqsCommand::setLines(string filename, vector<linePair*>& lines) {
+int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
        try {
                
-               lines.clear();
+               //set file positions for fasta file
+               fastaFilePos = divideFile(filename, processors);
                
-               vector<unsigned long int> positions;
+               if (qfilename == "") { return processors; }
                
-               ifstream inFASTA;
-               openInputFile(filename, inFASTA);
+               //get name of first sequence in each chunk
+               map<string, int> firstSeqNames;
+               for (int i = 0; i < (fastaFilePos.size()-1); i++) {
+                       ifstream in;
+                       openInputFile(filename, in);
+                       in.seekg(fastaFilePos[i]);
+               
+                       Sequence temp(in); 
+                       firstSeqNames[temp.getName()] = i;
+               
+                       in.close();
+               }
+                               
+               //seach for filePos of each first name in the qfile and save in qfileFilePos
+               ifstream inQual;
+               openInputFile(qfilename, inQual);
                        
                string input;
-               while(!inFASTA.eof()){  
-                       input = getline(inFASTA);
+               while(!inQual.eof()){   
+                       input = getline(inQual);
 
                        if (input.length() != 0) {
-                               if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);    }
+                               if(input[0] == '>'){ //this is a sequence name line
+                                       istringstream nameStream(input);
+                                       
+                                       string sname = "";  nameStream >> sname;
+                                       sname = sname.substr(1);
+                                       
+                                       map<string, int>::iterator it = firstSeqNames.find(sname);
+                                       
+                                       if(it != firstSeqNames.end()) { //this is the start of a new chunk
+                                               unsigned long int pos = inQual.tellg(); 
+                                               qfileFilePos.push_back(pos - input.length() - 1);       
+                                               firstSeqNames.erase(it);
+                                       }
+                               }
                        }
+                       
+                       if (firstSeqNames.size() == 0) { break; }
                }
-               inFASTA.close();
+               inQual.close();
                
-               int numFastaSeqs = positions.size();
-       
+               if (firstSeqNames.size() != 0) { 
+                       for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
+                               m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
+                       }
+                       qFileName = "";
+                       return processors;
+               }
+
+               //get last file position of qfile
                FILE * pFile;
                unsigned long int size;
                
                //get num bytes in file
-               pFile = fopen (filename.c_str(),"rb");
+               pFile = fopen (qfilename.c_str(),"rb");
                if (pFile==NULL) perror ("Error opening file");
                else{
                        fseek (pFile, 0, SEEK_END);
@@ -680,20 +717,9 @@ int TrimSeqsCommand::setLines(string filename, vector<linePair*>& lines) {
                        fclose (pFile);
                }
                
-               int numSeqsPerProcessor = numFastaSeqs / processors;
-               
-               for (int i = 0; i < processors; i++) {
-
-                       unsigned long int startPos = positions[ i * numSeqsPerProcessor ];
-                       if(i == processors - 1){
-                               numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
-                       }else{  
-                               unsigned long int myEnd = positions[ (i+1) * numSeqsPerProcessor ];
-                       }
-                       lines.push_back(new linePair(startPos, numSeqsPerProcessor));
-               }
+               qfileFilePos.push_back(size);
                
-               return numFastaSeqs;
+               return processors;
        }
        catch(exception& e) {
                m->errorOut(e, "TrimSeqsCommand", "setLines");
index a904a10b683ce3fd04784e9ac1a60d02e293c368..93e8c776918ec9fa2d6f661d7c178380a4f47d66 100644 (file)
@@ -26,8 +26,8 @@ private:
 
        struct linePair {
                unsigned long int start;
-               int num;
-               linePair(unsigned long int i, int j) : start(i), num(j) {}
+               unsigned long int end;
+               linePair(unsigned long int i, unsigned long int j) : start(i), end(j) {}
        };
 
        void getOligos(vector<string>&, vector<string>&);
@@ -60,7 +60,7 @@ private:
        
        int driverCreateTrim(string, string, string, string, string, string, string, vector<string>, vector<string>, linePair*, linePair*);     
        int createProcessesCreateTrim(string, string, string, string, string, string, string, vector<string>, vector<string>);
-       int setLines(string, vector<linePair*>&);
+       int setLines(string, string, vector<unsigned long int>&, vector<unsigned long int>&);
        
 };