From: Bo Li Date: Sun, 11 Mar 2012 10:22:22 +0000 (-0500) Subject: Modified 'convert-sam-for-rsem' so that this program will convert users' SAM/BAM... X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=commitdiff_plain;h=d5639baddcd4bdf09dd31237e914afb987954472 Modified 'convert-sam-for-rsem' so that this program will convert users' SAM/BAM files into acceptable BAM files for RSEM --- diff --git a/.gitignore b/.gitignore index 0f2be17..0479fb5 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,8 @@ rsem-simulate-reads rsem-synthesis-reference-transcripts rsem-tbam2gbam rsem-sam-validator +rsem-scan-for-paired-end-reads sam/samtools .project .cproject +update_doc.sh diff --git a/README.md b/README.md index 77c0693..836a367 100644 --- a/README.md +++ b/README.md @@ -102,17 +102,24 @@ consideration. By default, RSEM automates the alignment of reads to reference transcripts using the Bowtie alignment program. To use an alternative alignment program, align the input reads against the file -'reference_name.idx.fa' generated by 'rsem-prepare-reference', and format -the alignment output in SAM or BAM format. Then, instead of providing -reads to 'rsem-calculate-expression', specify the '--sam' or '--bam' option -and provide the SAM or BAM file as an argument. When using an -alternative aligner, you may also want to provide the '--no-bowtie' option -to 'rsem-prepare-reference' so that the Bowtie indices are not built. +'reference_name.idx.fa' generated by 'rsem-prepare-reference', and +format the alignment output in SAM or BAM format. Then, instead of +providing reads to 'rsem-calculate-expression', specify the '--sam' or +'--bam' option and provide the SAM or BAM file as an argument. When +using an alternative aligner, you may also want to provide the +'--no-bowtie' option to 'rsem-prepare-reference' so that the Bowtie +indices are not built. RSEM requires all alignments of the same read group together. For paired-end reads, RSEM also requires the two mates of any alignment be -adjacent. If the alternative aligner does not satisfy the first -requirement, you can use 'convert-sam-for-rsem' for conversion. Please run +adjacent. To check if your SAM/BAM file satisfy the requirements, +please run + + rsem-sam-validator + +If your file does not satisfy the requirements, you can use +'convert-sam-for-rsem' to convert it into a BAM file which RSEM can +process. Please run convert-sam-for-rsem --help diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW index 9ec79a7..81c595c 100644 --- a/WHAT_IS_NEW +++ b/WHAT_IS_NEW @@ -1,3 +1,11 @@ +RSEM v1.1.18 + +- Added some user-friendly error messages +- Added program 'rsem-sam-validator', users can use this program to check if RSEM can process their SAM/BAM files +- Modified 'convert-sam-for-rsem' so that this program will convert users' SAM/BAM files into acceptable BAM files for RSEM + +-------------------------------------------------------------------------------------------- + RSEM v1.1.17 - Fixed a bug related to parallezation of credibility intervals calculation diff --git a/convert-sam-for-rsem b/convert-sam-for-rsem index 348354b..13b9b13 100755 --- a/convert-sam-for-rsem +++ b/convert-sam-for-rsem @@ -2,35 +2,83 @@ use Getopt::Long; use Pod::Usage; +use File::Basename; +use File::Path 'rmtree'; use strict; -my $out_file = ""; +my ($in_file, $out_file) = (); + my @tmp_dirs = (); my $help = 0; -GetOptions("o=s" => \$out_file, - "T|temporary-directory=s" => \@tmp_dirs, +GetOptions("T|temporary-directory=s" => \@tmp_dirs, "h|help" => \$help) or pd2usage(-exitval => 2, -verbose => 2); pod2usage(-verbose => 2) if ($help == 1); -pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 1); +pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2); my $command; +$in_file = $ARGV[0]; +$out_file = "$ARGV[1].bam"; + +my ($fn, $dir, $suf) = fileparse($in_file, qr/\.[^.]*/); + +$suf = lc(substr($suf, 1)); +pod2usage(-msg => "Input file's suffix is neither sam nor bam!", -exitval => 2, -verbose => 2) if (($suf ne "sam") && ($suf ne "bam")); +my $isSam = ($suf eq "sam"); + +($fn, $dir, $suf) = fileparse($0); + +my $temp_dir = "$out_file.temp"; +if (-d $temp_dir) { print "Warning: $temp_dir exists, convert-sam-for-rsem will write temporary files into this folder and delete it after it finishes!\n"; } +else { + if (!mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); } +} + +# Convert to SAM format if input is a BAM file + +my $sam_file; + +if (!$isSam) { + $sam_file = "$temp_dir/input.sam"; + $command = $dir."sam/samtools view -h -o $sam_file $in_file"; + &runCommand($command); +} +else { + $sam_file = $in_file; +} + +# Phase I, sort entries so that all entries of a same read groups together + +my $tmp_sam = "$temp_dir/tmp.sam"; + # grep header section -$command = "grep ^@ $ARGV[0]"; -if ($out_file ne "") { $command .= " > $out_file"; } +$command = "grep ^@ $sam_file > $tmp_sam"; &runCommand($command); # sort alignment section -$command = "grep ^[^@] $ARGV[0] | sort -k 1,1 -s"; +$command = "grep ^[^@] $sam_file | sort -k 1,1 -s"; if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; } -if ($out_file ne "") { $command .= " >> $out_file"; } +$command .= " >> $tmp_sam"; +&runCommand($command); + +# Phase II, parse the temporary SAM file to make paired-end alignments' two mates adjacent to each other + +$command = $dir."rsem-scan-for-paired-end-reads $tmp_sam $out_file"; +&runCommand($command); + +# delete temporary directory +rmtree($temp_dir); + +print STDERR "Conversion is completed. $out_file will be checked by 'rsem-sam-validator'.\n"; + +# Phase III, validate if the resulting bam file is correct + +$command = $dir."rsem-sam-validator $out_file"; &runCommand($command); -# finish -print STDERR "Conversion is completed successfully!\n"; # command, {err_msg} sub runCommand { @@ -56,7 +104,7 @@ convert-sam-for-rsem =over - convert-sam-for-rsem [options] input_sam + convert-sam-for-rsem [options] output_file_name =back @@ -64,9 +112,13 @@ convert-sam-for-rsem =over -=item B +=item B -The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information). +The SAM or BAM file generated by user's aligner. We require this file contains the header section. If input is a SAM file, it must end with suffix 'sam' (case insensitive). If input is a BAM file, it must end with suffix 'bam' (case insensitive). + +=item B + +The output name for the converted file. 'convert-sam-for-rsem' will output a BAM with the name 'output_file_name.bam'. =back @@ -74,10 +126,6 @@ The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM =over -=item B<-o> - -Output the converted SAM file into . (Default: STDOUT) - =item B<-T/--temporary-directory> 'convert-sam-for-rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories". @@ -90,22 +138,18 @@ Show help information. =head1 DESCRIPTION -This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk. +This program converts the SAM/BAM file generated by user's aligner into a BAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references and output header sections. This program will create a temporary directory called 'output_file_name.bam.temp' to store the intermediate files. The directory will be deleted automatically after the conversion. After the conversion, this program will call 'rsem-sam-validator' to validate the resulting BAM file. -Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads. +Note: You do not need to run this script if `rsem-sam-validator' reports that your SAM/BAM file is valid. -Note: This program can only recognize SAM files. See ARGUMENTS section. +Note: This program does not check the correctness of input file. You should make sure the input is a valid SAM/BAM format file. =head1 EXAMPLES -Suppose input_sam is set to 'input.sam'. - -1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz': - - convert-sam-for-rsem input.sam | gzip > input_for_rsem.sam.gz +Suppose input is set to 'input.sam' and output file name is "output" -2) Output to 'input_for_rsem.sam' directly: + convert-sam-for-rsem input.sam output - convert-sam-for-rsem input.sam -o input_for_rsem.sam +We will get a file called 'output.bam' as output. =cut diff --git a/makefile b/makefile index 44d7369..900a8b3 100644 --- a/makefile +++ b/makefile @@ -1,7 +1,7 @@ CC = g++ CFLAGS = -Wall -c -I. COFLAGS = -Wall -O3 -ffast-math -c -I. -PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth rsem-sam-validator +PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth rsem-sam-validator rsem-scan-for-paired-end-reads all : $(PROGRAMS) @@ -133,6 +133,9 @@ rsem-get-unique : sam/bam.h sam/sam.h getUnique.cpp sam/libbam.a rsem-sam-validator : sam/bam.h sam/sam.h my_assert.h samValidator.cpp sam/libbam.a $(CC) -O3 -Wall samValidator.cpp sam/libbam.a -lz -o $@ +rsem-scan-for-paired-end-reads : sam/bam.h sam/sam.h my_assert.h scanForPairedEndReads.cpp sam/libbam.a + $(CC) -O3 -Wall scanForPairedEndReads.cpp sam/libbam.a -lz -o $@ + clean: rm -f *.o *~ $(PROGRAMS) cd sam ; ${MAKE} clean diff --git a/my_assert.h b/my_assert.h index 74f0299..02844e1 100644 --- a/my_assert.h +++ b/my_assert.h @@ -31,9 +31,10 @@ std::string cstrtos(const char* s) { } -void general_assert(int expr, const std::string& errmsg) { +void general_assert(int expr, const std::string& errmsg, bool putEnter = false) { if (expr) return; + if (putEnter) printf("\n"); fprintf(stderr, "%s\n", errmsg.c_str()); exit(-1); } diff --git a/samValidator.cpp b/samValidator.cpp index 06e91d4..e1a2cc8 100644 --- a/samValidator.cpp +++ b/samValidator.cpp @@ -50,7 +50,7 @@ int main(int argc, char* argv[]) { uint64_t creadlen = 0, readlen; bool cispaired = false, ispaired; - printf("."); + printf("."); fflush(stdout); do { if (samread(in, b) < 0) break; assert(b->core.l_qseq > 0); @@ -60,11 +60,16 @@ int main(int argc, char* argv[]) { // if this is a paired-end read ispaired = b->core.flag & 0x0001; if (ispaired) { + isValid = (samread(in, b2) >= 0) && (qname == bam1_qname(b2)) && (b2->core.flag & 0x0001); if (!isValid) { printf("\nOnly find one mate for paired-end read %s!\n", qname.c_str()); continue; } assert(b2->core.l_qseq > 0); + isValid = !((b->core.flag & 0x0040) && (b->core.flag & 0x0080)) && !((b2->core.flag & 0x0040) & (b2->core.flag & 0x0080)); + if (!isValid) { printf("\nRead %s has more than 2 segments (e.g. tripled or more ended reads)!\n", qname.c_str()); continue; } isValid = !(((b->core.flag & 0x0040) && (b2->core.flag & 0x0040)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0080))); if (!isValid) { printf("\nThe two mates of paired-end read %s are not adjacent!\n", qname.c_str()); continue; } + + // both mates are mapped if (!(b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) { isValid = (b->core.tid == b2->core.tid) && (b->core.pos == b2->core.mpos) && (b2->core.pos == b->core.mpos); @@ -92,7 +97,7 @@ int main(int argc, char* argv[]) { } ++cnt; - if (cnt % 1000000 == 0) printf("."); + if (cnt % 1000000 == 0) { printf("."); fflush(stdout); } } while(isValid); diff --git a/scanForPairedEndReads.cpp b/scanForPairedEndReads.cpp new file mode 100644 index 0000000..4d1fbfb --- /dev/null +++ b/scanForPairedEndReads.cpp @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include "sam/bam.h" +#include "sam/sam.h" + +#include "my_assert.h" + +using namespace std; + +samfile_t *in, *out; +bam1_t *b, *b2; +vector arr_both, arr_partial_1, arr_partial_2, arr_partial_unknown; + +inline void add_to_appropriate_arr(bam1_t *b) { + if (!(b->core.flag & 0x0004) && (b->core.flag & 0x0002)) { + arr_both.push_back(bam_dup1(b)); return; + } + + if (b->core.flag & 0x0040) arr_partial_1.push_back(bam_dup1(b)); + else if (b->core.flag & 0x0080) arr_partial_2.push_back(bam_dup1(b)); + else arr_partial_unknown.push_back(bam_dup1(b)); +} + +bool less_than(bam1_t *a, bam1_t *b) { + int32_t ap1 = min(a->core.pos, a->core.mpos); + int32_t ap2 = max(a->core.pos, a->core.mpos); + int32_t bp1 = min(b->core.pos, b->core.mpos); + int32_t bp2 = max(b->core.pos, b->core.mpos); + + if (a->core.tid != b->core.tid) return a->core.tid < b->core.tid; + if (ap1 != bp1) return ap1 < bp1; + return ap2 < bp2; +} + +int main(int argc, char* argv[]) { + if (argc != 3) { + printf("Usage: rsem-scan-for-paired-end-reads input.sam output.bam\n"); + exit(-1); + } + + in = samopen(argv[1], "r", NULL); + general_assert(in != 0, "Cannot open " + cstrtos(argv[1]) + " !"); + out = samopen(argv[2], "wb", in->header); + general_assert(out != 0, "Cannot open " + cstrtos(argv[2]) + " !"); + + b = bam_init1(); b2 = bam_init1(); + + string qname; + bool go_on = (samread(in, b) >= 0); + bool isPaired; + long cnt = 0; + + printf("."); fflush(stdout); + + while (go_on) { + qname.assign(bam1_qname(b)); + isPaired = (b->core.flag & 0x0001); + + if (isPaired) { + add_to_appropriate_arr(b); + while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) { + general_assert(b->core.flag & 0x0001, "Read " + qname + " is detected as both single-end and paired-end read!", true); + add_to_appropriate_arr(b); + } + + general_assert(arr_both.size() % 2 == 0, "Number of first and second mates in read " + qname + "'s full alignments (both mates are aligned) are not matched!", true); + general_assert((arr_partial_1.size() + arr_partial_2.size() + arr_partial_unknown.size()) % 2 == 0, "Number of first and second mates in read " + qname + "'s partial alignments (at most one mate is aligned) are not matched!", true); + + if (!arr_both.empty()) { + sort(arr_both.begin(), arr_both.end(), less_than); + for (size_t i = 0; i < arr_both.size(); i++) { samwrite(out, arr_both[i]); bam_destroy1(arr_both[i]); } + arr_both.clear(); + } + + while (!arr_partial_1.empty() || !arr_partial_2.empty()) { + if (!arr_partial_1.empty() && !arr_partial_2.empty()) { + samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back(); + samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back(); + } + else if (!arr_partial_1.empty()) { + samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back(); + samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back(); + } + else { + samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back(); + samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back(); + } + } + + while (!arr_partial_unknown.empty()) { + samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back(); + } + } + else { + samwrite(out, b); + while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) { + samwrite(out, b); + } + } + + ++cnt; + if (cnt % 1000000 == 0) { printf("."); fflush(stdout); } + } + + printf("\nFinished!\n"); + + bam_destroy1(b); bam_destroy1(b2); + + samclose(in); + samclose(out); + + return 0; +}