Provided a more detailed description for how to simulate RNA-Seq data using 'rsem...

[rsem.git] / simulation.cpp
diff --git a/simulation.cpp b/simulation.cpp

index 1288c65178afa360f89f4cff3ee5a2f78b7de5cf..0073c3ec1eee70249f087e4edeb1d769c48b61a5 100644 (file)
--- a/simulation.cpp
+++ b/simulation.cpp
@@ -260,7 +260,29 @@ int main(int argc, char* argv[]) {
         FILE *fi = NULL;
  
         if (argc != 7 && argc != 8) {
-               printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]\n");
+               printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]\n\n");
+               printf("Parameters:\n\n");
+               printf("reference_name: The name of RSEM references, which should be already generated by 'rsem-prepare-reference'\n");
+               printf("estimated_model_file: This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, with/without quality scores) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'\n");
+               printf("estimated_isoform_results: This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required.\n");
+               printf("theta0: This parameter determines the fraction of reads that are coming from background \"noise\" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.\n");
+               printf("N: The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'.\n");
+               printf("output_name: Prefix for all output files.\n");
+               printf("-q: Setting it stops the output of intermediate information.\n\n");
+               printf("Outputs:\n\n");
+               printf("output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.\n\n");
+               printf("output_name.fa if single-end without quality score;\noutput_name.fq if single-end with quality score;\noutput_name_1.fa & output_name_2.fa if paired-end without quality score;\noutput_name_1.fq & output_name_2.fq if paired-end with quality score.\n\n");
+               printf("Format of the header line: Each simulated read's header line encodes where it comes from. The header line has the format:\n\n");
+               printf("\t{>/@}_rid_dir_sid_pos[_insertL]\n\n");
+               printf("{>/@}: Either '>' or '@' must appear. '>' appears if FASTA files are generated and '@' appears if FASTQ files are generated\n");
+               printf("rid: Simulated read's index, numbered from 0\n");
+               printf("dir: The direction of the simulated read. 0 refers to forward strand ('+') and 1 refers to reverse strand ('-')\n");
+               printf("sid: Represents which transcript this read is simulated from. It ranges between 0 and M, where M is the total number of transcripts. If sid=0, the read is simulated from the background noise. Otherwise, the read is simulated from a transcript with index sid. Transcript sid's transcript name can be found in the 'transcript_id' column of the 'sample_name.isoforms.results' file (at line sid + 1, line 1 is for column names)\n");
+               printf("pos: The start position of the simulated read in strand dir of transcript sid. It is numbered from 0\n");
+               printf("insertL: Only appears for paired-end reads. It gives the insert length of the simulated read.\n\n");
+               printf("Example:\n\n");
+               printf("Suppose we want to simulate 50 million single-end reads with quality scores and use the parameters learned from [Example](#example). In addition, we set theta0 as 0.2 and output_name as 'simulated_reads'. The command is:\n\n");
+               printf("\trsem-simulate-reads /ref/mouse_125 mmliver_single_quals.stat/mmliver_single_quals.model mmliver_single_quals.isoforms.results 0.2 50000000 simulated_reads\n");
                 exit(-1);
         }