The order of @SQ tags in SAM/BAM files can be arbitrary now

author Bo Li <bli@cs.wisc.edu>

Thu, 9 Feb 2012 00:07:26 +0000 (18:07 -0600)

committer Bo Li <bli@cs.wisc.edu>

Thu, 9 Feb 2012 00:07:26 +0000 (18:07 -0600)
author Bo Li <bli@cs.wisc.edu>
Thu, 9 Feb 2012 00:07:26 +0000 (18:07 -0600)
committer Bo Li <bli@cs.wisc.edu>
Thu, 9 Feb 2012 00:07:26 +0000 (18:07 -0600)
diff --git a/BamConverter.h b/BamConverter.h

index e7253ba9415b566442f3efbfe266ffe052bb2753..af984ac4968c24a6a0c297154374bdaa32dbcd9e 100644 (file)
--- a/BamConverter.h
+++ b/BamConverter.h
@@ -50,6 +50,8 @@ BamConverter::BamConverter(const char* inpF, const char* outF, const char* chr_l
         in = samopen(inpF, "rb", NULL);
         assert(in != 0);
  
         in = samopen(inpF, "rb", NULL);
         assert(in != 0);
  
+       transcripts.buildMappings(in->header->n_targets, in->header->target_name);
+
         bam_header_t *out_header = sam_header_read2(chr_list);
         refmap.clear();
         for (int i = 0; i < out_header->n_targets; i++) {
         bam_header_t *out_header = sam_header_read2(chr_list);
         refmap.clear();
         for (int i = 0; i < out_header->n_targets; i++) {
@@ -93,7 +95,7 @@ void BamConverter::process() {
                 // at least one segment is not properly mapped
                 if ((b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004))) continue;
  
                 // at least one segment is not properly mapped
                 if ((b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004))) continue;
  
-               const Transcript& transcript = transcripts.getTranscriptAt(b->core.tid + 1);
+               const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
  
                 convert(b, transcript);
                 if (isPaired) {
  
                 convert(b, transcript);
                 if (isPaired) {
diff --git a/BamWriter.h b/BamWriter.h

index ff11397e3539d8662f31e85d6a9e8d09309e51f8..e014f8d5915445622841b8ca6f680cdf6b9e413d 100644 (file)
--- a/BamWriter.h
+++ b/BamWriter.h
@@ -47,23 +47,13 @@ BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const
         }
         assert(in != 0);
  
         }
         assert(in != 0);
  
+       //build mappings from external sid to internal sid
+       transcripts.buildMappings(in->header->n_targets, in->header->target_name);
+
         //generate output's header
         bam_header_t *out_header = bam_header_dwt(in->header);
         //generate output's header
         bam_header_t *out_header = bam_header_dwt(in->header);
-
-       if (out_header->n_targets != transcripts.getM()) {
-               fprintf(stderr, "Number of reference sequences recorded in the header is not correct! The header contains %d sequences while there should be %d sequences\n", out_header->n_targets, transcripts.getM());
-               exit(-1);
-       }
-
         for (int i = 0; i < out_header->n_targets; i++) {
         for (int i = 0; i < out_header->n_targets; i++) {
-               const Transcript& transcript = transcripts.getTranscriptAt(i + 1);
-               if (out_header->target_name[i] != transcript.getTranscriptID()) {
-                       fprintf(stderr, "Reference sequence %d's name recorded in the header is not correct! \n", i);
-                       fprintf(stderr, "Name in the header: %s\n", out_header->target_name[i]);
-                       fprintf(stderr, "Should be: %s\n", transcript.getTranscriptID().c_str());
-                       exit(-1);
-               }
-               out_header->target_len[i] = transcript.getLength();  // transcript length without poly(A) tail
+               out_header->target_len[i] = transcripts.getTranscriptViaEid(i + 1).getLength();  // transcript length without poly(A) tail
         }
  
         std::ostringstream strout;
         }
  
         std::ostringstream strout;
@@ -99,7 +89,7 @@ void BamWriter::work(HitWrapper<SingleHit> wrapper) {
                 hit = wrapper.getNextHit();
                 assert(hit != NULL);
  
                 hit = wrapper.getNextHit();
                 assert(hit != NULL);
  
-               assert(b->core.tid + 1 == hit->getSid());
+               assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
                 convert(b, hit->getConPrb());
                 if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
         }
                 convert(b, hit->getConPrb());
                 if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
         }
@@ -135,8 +125,8 @@ void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
                 hit = wrapper.getNextHit();
                 assert(hit != NULL);
  
                 hit = wrapper.getNextHit();
                 assert(hit != NULL);
  
-               assert(b->core.tid + 1 == hit->getSid());
-               assert(b2->core.tid + 1 == hit->getSid());
+               assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
+               assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid());
  
                 convert(b, hit->getConPrb());
                 convert(b2, hit->getConPrb());
  
                 convert(b, hit->getConPrb());
                 convert(b2, hit->getConPrb());
@@ -159,8 +149,7 @@ void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
  }
  
  void BamWriter::convert(bam1_t *b, double prb) {
  }
  
  void BamWriter::convert(bam1_t *b, double prb) {
-       int sid = b->core.tid + 1;
-       const Transcript& transcript = transcripts.getTranscriptAt(sid);
+       const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
  
         int pos = b->core.pos;
         int readlen = b->core.l_qseq;
  
         int pos = b->core.pos;
         int readlen = b->core.l_qseq;
diff --git a/Buffer.h b/Buffer.h

index 1ce28bca18d4e21193965886cd92e0d64e5a927e..50177960a94a2508b0df9a5c2c590502b23954f2 100644 (file)
--- a/Buffer.h
+++ b/Buffer.h
@@ -62,15 +62,15 @@ private:
                 std::streampos gap2 = std::streampos(nSamples - to) * FLOATSIZE;
                 float *p = NULL;
  
                 std::streampos gap2 = std::streampos(nSamples - to) * FLOATSIZE;
                 float *p = NULL;
  
-               ftmpOut.seekp(0, std::ios_base::beg);
+               ftmpOut.seekp(0, std::ios::beg);
                 for (int i = 0; i < cvlen; i++) {
                         p = buffer + i;
                 for (int i = 0; i < cvlen; i++) {
                         p = buffer + i;
-                       ftmpOut.seekp(gap1, std::ios_base::cur);
+                       ftmpOut.seekp(gap1, std::ios::cur);
                         for (int j = fr; j < to; j++) {
                                 ftmpOut.write((char*)p, FLOATSIZE);
                                 p += cvlen;
                         }
                         for (int j = fr; j < to; j++) {
                                 ftmpOut.write((char*)p, FLOATSIZE);
                                 p += cvlen;
                         }
-                       ftmpOut.seekp(gap2, std::ios_base::cur);
+                       ftmpOut.seekp(gap2, std::ios::cur);
                 }
  
                 cpos = 0;
                 }
  
                 cpos = 0;
diff --git a/EM.cpp b/EM.cpp

index b2493b2fa21982125b0cba196325ca86487848c6..dfdd0e4256f43debf0243bb61301ded8acf2749e 100644 (file)
--- a/EM.cpp
+++ b/EM.cpp
@@ -748,7 +748,7 @@ int main(int argc, char* argv[]) {
  
         time_t b = time(NULL);
  
  
         time_t b = time(NULL);
  
-       printTimeUsed(a, b);
+       printTimeUsed(a, b, "EM.cpp");
  
         return 0;
  }
  
         return 0;
  }
diff --git a/README.md b/README.md

index bb66d5498be3014ccab93ef382d6d05536e6e047..77c0693af045ab7daa551055dff244905bca111f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -109,8 +109,10 @@ and provide the SAM or BAM file as an argument.  When using an
  alternative aligner, you may also want to provide the '--no-bowtie' option
  to 'rsem-prepare-reference' so that the Bowtie indices are not built.
  
  alternative aligner, you may also want to provide the '--no-bowtie' option
  to 'rsem-prepare-reference' so that the Bowtie indices are not built.
  
-Some aligners' (other than Bowtie) output might need to be converted
-so that RSEM can use. For conversion, please run
+RSEM requires all alignments of the same read group together. For
+paired-end reads, RSEM also requires the two mates of any alignment be
+adjacent. If the alternative aligner does not satisfy the first
+requirement, you can use 'convert-sam-for-rsem' for conversion. Please run
   
     convert-sam-for-rsem --help
  
   
     convert-sam-for-rsem --help
  
diff --git a/SamParser.h b/SamParser.h

index 425593a91d8c48f72f28783abafc090e0f5c6920..c5a503554b8c9cf4716b722d47da5f8c5db8540c 100644 (file)
--- a/SamParser.h
+++ b/SamParser.h
@@ -12,9 +12,7 @@
  #include "sam/sam.h"
  
  #include "utils.h"
  #include "sam/sam.h"
  
  #include "utils.h"
-
-#include "RefSeq.h"
-#include "Refs.h"
+#include "my_assert.h"
  
  #include "SingleRead.h"
  #include "SingleReadQ.h"
  
  #include "SingleRead.h"
  #include "SingleReadQ.h"
@@ -23,9 +21,11 @@
  #include "SingleHit.h"
  #include "PairedEndHit.h"
  
  #include "SingleHit.h"
  #include "PairedEndHit.h"
  
+#include "Transcripts.h"
+
  class SamParser {
  public:
  class SamParser {
  public:
-       SamParser(char, const char*, Refs&, const char* = 0);
+       SamParser(char, const char*, Transcripts&, const char* = 0);
         ~SamParser();
  
         /**
         ~SamParser();
  
         /**
@@ -51,6 +51,7 @@ private:
         bam_header_t *header;
         bam1_t *b, *b2;
  
         bam_header_t *header;
         bam1_t *b, *b2;
  
+       Transcripts& transcripts;
  
         //tag used by aligner
         static char rtTag[STRLEN];
  
         //tag used by aligner
         static char rtTag[STRLEN];
@@ -79,33 +80,23 @@ private:
  char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
  
  // aux, if not 0, points to the file name of fn_list
  char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
  
  // aux, if not 0, points to the file name of fn_list
-SamParser::SamParser(char inpType, const char* inpF, Refs& refs, const char* aux) {
+SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux)
+       : transcripts(transcripts)
+{
         switch(inpType) {
         case 'b': sam_in = samopen(inpF, "rb", aux); break;
         case 's': sam_in = samopen(inpF, "r", aux); break;
         default: assert(false);
         }
  
         switch(inpType) {
         case 'b': sam_in = samopen(inpF, "rb", aux); break;
         case 's': sam_in = samopen(inpF, "r", aux); break;
         default: assert(false);
         }
  
-       if (sam_in == 0) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
-    header = sam_in->header;
-    if (header == 0) { fprintf(stderr, "Fail to parse sam header!\n"); exit(-1); }
-
-    // Check if the reference used for aligner is the transcript set RSEM generated
-    if (refs.getM() != header->n_targets) {
-       fprintf(stderr, "Number of transcripts does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
-       exit(-1);
-    }
-    for (int i = 0; i < header->n_targets; i++) {
-       const RefSeq& refseq = refs.getRef(i + 1);
-       // If update int to long, chance the (int) conversion
-       if (refseq.getName().compare(header->target_name[i]) != 0 || refseq.getTotLen() != (int)header->target_len[i]) {
-               fprintf(stderr, "Transcript information does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
-               exit(-1);
-       }
-    }
-
-    b = bam_init1();
-    b2 = bam_init1();
+       general_assert(sam_in != 0, "Cannot open " + cstrtos(inpF) + "! It may not exist.");
+       header = sam_in->header;
+       general_assert(header != 0, "Fail to parse sam header!");
+
+       transcripts.buildMappings(header->n_targets, header->target_name);
+
+       b = bam_init1();
+       b2 = bam_init1();
  }
  
  SamParser::~SamParser() {
  }
  
  SamParser::~SamParser() {
@@ -137,10 +128,10 @@ int SamParser::parseNext(SingleRead& read, SingleHit& hit) {
                 if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
  
                 if (getDir(b) > 0) {
                 if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
  
                 if (getDir(b) > 0) {
-                       hit = SingleHit(b->core.tid + 1, b->core.pos);
+                       hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos);
                 }
                 else {
                 }
                 else {
-                       hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
+                       hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
                 }
         }
  
                 }
         }
  
@@ -168,10 +159,10 @@ int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) {
                 if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
  
                 if (getDir(b) > 0) {
                 if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
  
                 if (getDir(b) > 0) {
-                       hit = SingleHit(b->core.tid + 1, b->core.pos);
+                       hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos);
                 }
                 else {
                 }
                 else {
-                       hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
+                       hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
                 }
         }
  
                 }
         }
  
@@ -220,10 +211,10 @@ int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) {
                 }
                 //assert(mp1->core.tid == mp2->core.tid);
                 if (getDir(mp1) > 0) {
                 }
                 //assert(mp1->core.tid == mp2->core.tid);
                 if (getDir(mp1) > 0) {
-                       hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
+                       hit = PairedEndHit(transcripts.getInternalSid(mp1->core.tid + 1), mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
                 }
                 else {
                 }
                 else {
-                       hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
+                       hit = PairedEndHit(-transcripts.getInternalSid(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
                 }
         }
  
                 }
         }
  
@@ -271,10 +262,10 @@ int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) {
                 }
                 //assert(mp1->core.tid == mp2->core.tid);
                 if (getDir(mp1) > 0) {
                 }
                 //assert(mp1->core.tid == mp2->core.tid);
                 if (getDir(mp1) > 0) {
-                       hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
+                       hit = PairedEndHit(transcripts.getInternalSid(mp1->core.tid + 1), mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
                 }
                 else {
                 }
                 else {
-                       hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
+                       hit = PairedEndHit(-transcripts.getInternalSid(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
                 }
         }
  
                 }
         }
  
diff --git a/Transcripts.h b/Transcripts.h

index 2750e0c36394d3c3be5cb7d63dfb74f7ae5687a3..2637d35d197b0070e636a16a0404c669190b6e6d 100644 (file)
--- a/Transcripts.h
+++ b/Transcripts.h
@@ -10,7 +10,10 @@
  #include<fstream>
  #include<vector>
  #include<algorithm>
  #include<fstream>
  #include<vector>
  #include<algorithm>
+#include<map>
+#include<string>
  
  
+#include "my_assert.h"
  #include "Transcript.h"
  
  
  #include "Transcript.h"
  
  
@@ -20,6 +23,8 @@ public:
                 M = 0; this->type = type;
                 transcripts.clear();
                 transcripts.push_back(Transcript());
                 M = 0; this->type = type;
                 transcripts.clear();
                 transcripts.push_back(Transcript());
+
+               e2i.clear(); i2e.clear();
         }
  
         int getM() { return M; }
         }
  
         int getM() { return M; }
@@ -42,9 +47,23 @@ public:
         void readFrom(const char*);
         void writeTo(const char*);
  
         void readFrom(const char*);
         void writeTo(const char*);
  
+       //Eid: external sid
+       int getInternalSid(int eid) {
+               assert(eid > 0 && eid <= M);
+               return e2i[eid];
+       }
+
+       const Transcript& getTranscriptViaEid(int eid) {
+               return transcripts[getInternalSid(eid)];
+       }
+
+       void buildMappings(int, char**);
+
  private:
         int M, type; // type 0 from genome , 1 standalone transcriptome
         std::vector<Transcript> transcripts;
  private:
         int M, type; // type 0 from genome , 1 standalone transcriptome
         std::vector<Transcript> transcripts;
+
+       std::vector<int> e2i, i2e; // external sid to internal sid, internal sid to external sid
  };
  
  void Transcripts::readFrom(const char* inpF) {
  };
  
  void Transcripts::readFrom(const char* inpF) {
@@ -55,8 +74,7 @@ void Transcripts::readFrom(const char* inpF) {
  
         fin>>M>>type;
         getline(fin, line);
  
         fin>>M>>type;
         getline(fin, line);
-       transcripts.clear();
-       transcripts.resize(M + 1);
+       transcripts.assign(M + 1, Transcript());
         for (int i = 1; i <= M; i++) {
                 transcripts[i].read(fin);
         }
         for (int i = 1; i <= M; i++) {
                 transcripts[i].read(fin);
         }
@@ -72,4 +90,30 @@ void Transcripts::writeTo(const char* outF) {
         fout.close();
  }
  
         fout.close();
  }
  
+void Transcripts::buildMappings(int n_targets, char** target_name) {
+       std::map<std::string, int> dict;
+       std::map<std::string, int>::iterator iter;
+
+       general_assert(n_targets == M, "Number of transcripts does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
+
+       dict.clear();
+       for (int i = 1; i <= M; i++) {
+               const std::string& tid = transcripts[i].getTranscriptID();
+               iter = dict.find(tid);
+               assert(iter == dict.end());
+               dict[tid] = i;
+       }
+
+       e2i.assign(M + 1, 0);
+       i2e.assign(M + 1, 0);
+       for (int i = 0; i < n_targets; i++) {
+               iter = dict.find(std::string(target_name[i]));
+               general_assert(iter != dict.end(), "RSEM can not recognize transcript " + cstrtos(target_name[i]) + "!");
+               general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!");
+               e2i[i + 1] = iter->second;
+               i2e[iter->second] = i + 1;
+               iter->second = -1;
+       }
+}
+
  #endif /* TRANSCRIPTS_H_ */
  #endif /* TRANSCRIPTS_H_ */
diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW

index 08f978679d9a0c965359b8994d1141fb4daaa34c..9ec79a742d346b4e47a482808a9d561e8c3928fb 100644 (file)
--- a/WHAT_IS_NEW
+++ b/WHAT_IS_NEW
@@ -1,3 +1,11 @@
+RSEM v1.1.17
+
+- Fixed a bug related to parallezation of credibility intervals calculation
+- Added --no-bam-output option to rsem-calculate-expression
+- The order of @SQ tags in SAM/BAM files can be arbitrary now 
+
+--------------------------------------------------------------------------------------------
+  
  RSEM v1.1.16
  
  - Added --time option to show time consumed by each phase
  RSEM v1.1.16
  
  - Added --time option to show time consumed by each phase
diff --git a/convert-sam-for-rsem b/convert-sam-for-rsem

index 68a42d7ea3cc468a9f96093da31c3d784e103695..348354b29a2f91e707575b33b374d2752b7c563a 100755 (executable)
--- a/convert-sam-for-rsem
+++ b/convert-sam-for-rsem
@@ -4,9 +4,7 @@ use Getopt::Long;
  use Pod::Usage;
  use strict;
  
  use Pod::Usage;
  use strict;
  
-my $standard_output = "&STDOUT";
-
-my $out_file = $standard_output;
+my $out_file = "";
  my @tmp_dirs = ();
  my $help = 0;
  
  my @tmp_dirs = ();
  my $help = 0;
  
@@ -16,86 +14,19 @@ GetOptions("o=s" => \$out_file,
  
            
  pod2usage(-verbose => 2) if ($help == 1);
  
            
  pod2usage(-verbose => 2) if ($help == 1);
-pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
+pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 1);
  
  my $command;
  
  my $command;
-my (@fields, @header) = ();
-my $M;
-
-# Load fields
-my @lines = ();
-my $type;
-
-open(INPUT, "$ARGV[0].ti");
-@lines = <INPUT>;
-chomp(@lines);
-close(INPUT);
-
-@fields = ();
-($M, $type) = split(/ /, $lines[0]);
-for (my $i = 0; $i < $M; $i++) {
-    push(@fields, "SN:$lines[$i * 6 + 1]");
-}
-
-
-# Reorder header
-my $line;
-
-open(INPUT, $ARGV[1]);
-@header = ();
-while (($line = <INPUT>) && substr($line, 0, 1) eq '@') {
-    chomp($line);
-    push(@header, $line);
-}
-close(INPUT);
-
-my $n = scalar(@header);
-if ($n > 0) {
-    my %hash = ();
-    my @ktable = ();
-
-    my $tid = 0;
-
-    for (my $i = 0; $i < $n; $i++) {
-       my @arr = split(/\t/, $header[$i]);
-       if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; }
-       my $hasSN = 0;
-       foreach my $key (@arr) {
-           if (substr($key, 0, 3) eq "SN:") {
-               $hash{$key} = $i;
-               $hasSN = 1;
-               last;
-           }
-       }
-       if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); }
-       push(@ktable, $fields[$tid++]);
-    }
-
-    if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); }
  
  
-    open(OUTPUT, ">$out_file");
-    for (my $i = 0; $i < $n; $i++) {
-       if ($ktable[$i] eq "") { print OUTPUT $header[$i]; }
-       else { print OUTPUT $header[$hash{$ktable[$i]}]; }
-       print OUTPUT "\n";
-    }
-    close(OUTPUT);
-}
-
-
-# extract alignment section
-$command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp";
+# grep header section
+$command = "grep ^@ $ARGV[0]";
+if ($out_file ne "") { $command .= " > $out_file"; }
  &runCommand($command);
  
  &runCommand($command);
  
-# sort and output the alignment section
-$command = "sort -k 1,1 -s";
+# sort alignment section
+$command = "grep ^[^@] $ARGV[0] | sort -k 1,1 -s";
  if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; }
  if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; }
-$command .= " $ARGV[1].__temp";
-if ($out_file ne $standard_output) { $command .= " >> $out_file"; }
-&runCommand($command);
-
-# delete temporary files
-$command = "rm -f $ARGV[1].__temp";
+if ($out_file ne "") { $command .= " >> $out_file"; }
  &runCommand($command);
  
  # finish
  &runCommand($command);
  
  # finish
@@ -125,7 +56,7 @@ convert-sam-for-rsem
  
  =over
  
  
  =over
  
- convert-sam-for-rsem [options] reference_name input_sam
+ convert-sam-for-rsem [options] input_sam
  
  =back
  
  
  =back
  
@@ -133,10 +64,6 @@ convert-sam-for-rsem
  
  =over
  
  
  =over
  
-=item B<reference_name>
-
-The name of the reference used. This should be the same name used by 'rsem-prepare-reference'.
-
  =item B<input_sam>
  
  The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information). 
  =item B<input_sam>
  
  The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information). 
@@ -165,20 +92,20 @@ Show help information.
  
  This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk.
  
  
  This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk.
  
-Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the order of @SQ lines is the same as 'reference_name.idx.fa' and the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads.
+Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads.
  
  Note: This program can only recognize SAM files. See ARGUMENTS section.
  
  =head1 EXAMPLES
  
  
  Note: This program can only recognize SAM files. See ARGUMENTS section.
  
  =head1 EXAMPLES
  
-Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'. 
+Suppose input_sam is set to 'input.sam'. 
  
  1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz':
  
  
  1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz':
  
- convert-sam-for-rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz
+ convert-sam-for-rsem input.sam | gzip > input_for_rsem.sam.gz
  
  2) Output to 'input_for_rsem.sam' directly:
  
  
  2) Output to 'input_for_rsem.sam' directly:
  
- convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam
+ convert-sam-for-rsem input.sam -o input_for_rsem.sam
  
  =cut
  
  =cut
diff --git a/makefile b/makefile

index 45ab35d11cee55d44654949830bf4c0b43869d66..cca54bebacefe9630accd2c7039b356d0dea07c4 100644 (file)
--- a/makefile
+++ b/makefile
@@ -11,7 +11,7 @@ build-sam :
  
  Transcript.h : utils.h
  
  
  Transcript.h : utils.h
  
-Transcripts.h : Transcript.h
+Transcripts.h : my_assert.h Transcript.h
  
  rsem-extract-reference-transcripts : utils.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp
         $(CC) -Wall -O3 extractRef.cpp -o rsem-extract-reference-transcripts
  
  rsem-extract-reference-transcripts : utils.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp
         $(CC) -Wall -O3 extractRef.cpp -o rsem-extract-reference-transcripts
@@ -47,13 +47,13 @@ PairedEndHit.h : SingleHit.h
  HitContainer.h : GroupInfo.h
  
  
  HitContainer.h : GroupInfo.h
  
  
-SamParser.h : sam/sam.h sam/bam.h utils.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h RefSeq.h Refs.h
+SamParser.h : sam/sam.h sam/bam.h utils.h my_assert.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Transcripts.h
  
  
  rsem-parse-alignments : parseIt.o sam/libbam.a
         $(CC) -o rsem-parse-alignments parseIt.o sam/libbam.a -lz 
  
  
  
  rsem-parse-alignments : parseIt.o sam/libbam.a
         $(CC) -o rsem-parse-alignments parseIt.o sam/libbam.a -lz 
  
-parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h RefSeq.h Refs.h sam/sam.h sam/bam.h parseIt.cpp
+parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h Transcripts.h sam/sam.h sam/bam.h parseIt.cpp
         $(CC) $(COFLAGS) parseIt.cpp
  
  
         $(CC) $(COFLAGS) parseIt.cpp
  
  
diff --git a/parseIt.cpp b/parseIt.cpp

index e05585d43f5a6a897f874f90506a8add3bc3e571..8620432c2840140561451c211208b560b7e48363 100644 (file)
--- a/parseIt.cpp
+++ b/parseIt.cpp
@@ -13,9 +13,7 @@
  #include "utils.h"
  
  #include "GroupInfo.h"
  #include "utils.h"
  
  #include "GroupInfo.h"
-
-#include "RefSeq.h"
-#include "Refs.h"
+#include "Transcripts.h"
  
  #include "SingleRead.h"
  #include "SingleReadQ.h"
  
  #include "SingleRead.h"
  #include "SingleReadQ.h"
@@ -34,12 +32,12 @@ int N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number o
  int nHits; // # of hits
  int nUnique, nMulti, nIsoMulti;
  char fn_list[STRLEN];
  int nHits; // # of hits
  int nUnique, nMulti, nIsoMulti;
  char fn_list[STRLEN];
-char refF[STRLEN], groupF[STRLEN];
+char groupF[STRLEN], tiF[STRLEN];
  char imdName[STRLEN];
  char datF[STRLEN], cntF[STRLEN];
  
  char imdName[STRLEN];
  char datF[STRLEN], cntF[STRLEN];
  
-Refs refs;
  GroupInfo gi;
  GroupInfo gi;
+Transcripts transcripts;
  
  SamParser *parser;
  ofstream hit_out;
  
  SamParser *parser;
  ofstream hit_out;
@@ -55,7 +53,7 @@ void init(const char* imdName, char alignFType, const char* alignF) {
  
         char* aux = 0;
         if (strcmp(fn_list, "")) aux = fn_list;
  
         char* aux = 0;
         if (strcmp(fn_list, "")) aux = fn_list;
-       parser = new SamParser(alignFType, alignF, refs, aux);
+       parser = new SamParser(alignFType, alignF, transcripts, aux);
  
         memset(cat, 0, sizeof(cat));
         memset(readOutFs, 0, sizeof(readOutFs));
  
         memset(cat, 0, sizeof(cat));
         memset(readOutFs, 0, sizeof(readOutFs));
@@ -192,10 +190,10 @@ int main(int argc, char* argv[]) {
  
         verbose = !quiet;
  
  
         verbose = !quiet;
  
-       sprintf(refF, "%s.seq", argv[1]);
-       refs.loadRefs(refF, 1);
         sprintf(groupF, "%s.grp", argv[1]);
         gi.load(groupF);
         sprintf(groupF, "%s.grp", argv[1]);
         gi.load(groupF);
+       sprintf(tiF, "%s.ti", argv[1]);
+       transcripts.readFrom(tiF);
  
         sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
         sprintf(datF, "%s.dat", imdName);
  
         sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
         sprintf(datF, "%s.dat", imdName);
diff --git a/utils.h b/utils.h

index 278e95e9272ca67f36de745dc924f40fd7085ca6..e3f8bcba1306fc7116550a292767895b6c2d9544 100644 (file)
--- a/utils.h
+++ b/utils.h
@@ -120,20 +120,6 @@ inline std::string cleanStr(const std::string& str) {
    return (fr <= to ? str.substr(fr, to - fr + 1) : "");
  }
  
    return (fr <= to ? str.substr(fr, to - fr + 1) : "");
  }
  
-void printTimeUsed(const time_t& a, const time_t& b, const char* filename = "") {
-       int hh = (b - a) / 3600;
-       int mm = (b - a) % 3600 / 60;
-       int ss = (b - a) % 60;
-
-       printf("Time Used : %d h %02d m %02d s\n", hh, mm, ss);
-
-       if (strcmp(filename, "")) {
-               FILE *fo = fopen(filename, "w");
-               fprintf(fo, "Time Used : %d h %02d m %02d s\n", hh, mm, ss);
-               fclose(fo);
-       }
-}
-
  void genReadFileNames(const char* readFN, int tagType, int read_type, int& s, char readFs[][STRLEN]){
         const char tags[3][STRLEN] = {"un", "alignable", "max"};
         char suffix[STRLEN];
  void genReadFileNames(const char* readFN, int tagType, int read_type, int& s, char readFs[][STRLEN]){
         const char tags[3][STRLEN] = {"un", "alignable", "max"};
         char suffix[STRLEN];
@@ -156,4 +142,12 @@ void genReadFileNames(const char* readFN, int tagType, int read_type, int& s, ch
         }
  }
  
         }
  }
  
+void printTimeUsed(const time_t& a, const time_t& b, const char* program_name) {
+       int hh = (b - a) / 3600;
+       int mm = (b - a) % 3600 / 60;
+       int ss = (b - a) % 60;
+
+       printf("Time Used for %s : %d h %02d m %02d s\n", program_name, hh, mm, ss);
+}
+
  #endif
  #endif
author	Bo Li <bli@cs.wisc.edu>
	Thu, 9 Feb 2012 00:07:26 +0000 (18:07 -0600)
committer	Bo Li <bli@cs.wisc.edu>
	Thu, 9 Feb 2012 00:07:26 +0000 (18:07 -0600)
BamConverter.h		patch \| blob \| history
BamWriter.h		patch \| blob \| history
Buffer.h		patch \| blob \| history
EM.cpp		patch \| blob \| history
README.md		patch \| blob \| history
SamParser.h		patch \| blob \| history
Transcripts.h		patch \| blob \| history
WHAT_IS_NEW		patch \| blob \| history
convert-sam-for-rsem		patch \| blob \| history
makefile		patch \| blob \| history
parseIt.cpp		patch \| blob \| history
utils.h		patch \| blob \| history