simulation.cpp

   1 #include<cmath>
   2 #include<cstdio>
   3 #include<cstring>
   4 #include<cstdlib>
   5 #include<cassert>
   6 #include<string>
   7 #include<iostream>
   8 #include<fstream>
   9 #include<vector>
  10
  11 #include "utils.h"
  12 #include "my_assert.h"
  13 #include "Read.h"
  14 #include "SingleRead.h"
  15 #include "SingleReadQ.h"
  16 #include "PairedEndRead.h"
  17 #include "PairedEndReadQ.h"
  18
  19 #include "Model.h"
  20 #include "SingleModel.h"
  21 #include "SingleQModel.h"
  22 #include "PairedEndModel.h"
  23 #include "PairedEndQModel.h"
  24
  25 #include "Refs.h"
  26 #include "GroupInfo.h"
  27 #include "Transcript.h"
  28 #include "Transcripts.h"
  29
  30 #include "simul.h"
  31
  32 using namespace std;
  33
  34 const int OFFSITE = 5;
  35
  36 READ_INT_TYPE N;
  37 int model_type, M, m;
  38
  39 Refs refs;
  40 GroupInfo gi;
  41 Transcripts transcripts;
  42
  43 vector<double> eel;
  44 vector<double> theta, counts;
  45
  46 int n_os;
  47 ostream *os[2];
  48 char outReadF[2][STRLEN];
  49
  50 char refF[STRLEN], groupF[STRLEN], tiF[STRLEN];
  51 char geneResF[STRLEN], isoResF[STRLEN];
  52
  53 simul sampler;
  54
  55 void genOutReadStreams(int type, char *outFN) {
  56         switch(type) {
  57         case 0 :
  58                 n_os = 1;
  59                 sprintf(outReadF[0], "%s.fa", outFN);
  60                 break;
  61         case 1 :
  62                 n_os = 1;
  63                 sprintf(outReadF[0], "%s.fq", outFN);
  64                 break;
  65         case 2 :
  66                 n_os = 2;
  67                 for (int i = 0; i < n_os; i++)
  68                         sprintf(outReadF[i], "%s_%d.fa", outFN, i + 1);
  69                 break;
  70         case 3 :
  71                 n_os = 2;
  72                 for (int i = 0; i < n_os; i++)
  73                         sprintf(outReadF[i], "%s_%d.fq", outFN, i + 1);
  74                 break;
  75         }
  76
  77         for (int i = 0; i < n_os; i++)
  78                 os[i] = new ofstream(outReadF[i]);
  79 }
  80
  81 template<class ModelType>
  82 void calcExpectedEffectiveLengths(ModelType& model) {
  83         int lb, ub, span;
  84         double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
  85
  86         model.getGLD().copyTo(pdf, cdf, lb, ub, span);
  87         clen = new double[span + 1];
  88         clen[0] = 0.0;
  89         for (int i = 1; i <= span; i++) {
  90                 clen[i] = clen[i - 1] + pdf[i] * (lb + i);
  91         }
  92
  93         eel.assign(M + 1, 0.0);
  94         for (int i = 1; i <= M; i++) {
  95                 int totLen = refs.getRef(i).getTotLen();
  96                 int fullLen = refs.getRef(i).getFullLen();
  97                 int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
  98                 int pos2 = max(min(totLen, ub) - lb, 0);
  99
 100                 if (pos2 == 0) { eel[i] = 0.0; continue; }
 101
 102                 eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
 103                 assert(eel[i] >= 0);
 104                 if (eel[i] < MINEEL) { eel[i] = 0.0; }
 105         }
 106
 107         delete[] pdf;
 108         delete[] cdf;
 109         delete[] clen;
 110 }
 111
 112 template<class ReadType, class ModelType>
 113 void simulate(char* modelF, char* resultsF) {
 114         ModelType model(&refs);
 115         ReadType read;
 116         int sid;
 117
 118         model.read(modelF);
 119
 120         //calculate eel
 121         calcExpectedEffectiveLengths<ModelType>(model);
 122
 123         //generate theta vector
 124         ifstream fin(resultsF);
 125         string line;
 126         double tpm;
 127         double denom = 0.0;
 128         getline(fin, line); // read the first line, which is just column names
 129         for (int i = 1; i <= M; i++) {
 130           getline(fin, line);
 131           size_t pos = 0;
 132           for (int j = 0; j < OFFSITE; j++) pos = line.find_first_of('\t', pos) + 1;
 133           size_t pos2 = line.find_first_of('\t', pos);
 134           if (pos2 == string::npos) pos2 = line.length();
 135           tpm = atof(line.substr(pos, pos2 - pos).c_str());
 136           theta[i] = tpm * eel[i]; // during simulation, there is no check for effL < 0. The reason is for that case, eel[i] here = 0 and therefore no chance to sample from it
 137           denom += theta[i];
 138         }
 139         assert(denom > EPSILON);
 140         fin.close();
 141         for (int i = 1; i <= M; i++) theta[i] = theta[i] / denom * (1.0 - theta[0]);
 142
 143         READ_INT_TYPE resimulation_count = 0;
 144
 145         //simulating...
 146         model.startSimulation(&sampler, theta);
 147         for (READ_INT_TYPE i = 0; i < N; i++) {
 148                 while (!model.simulate(i, read, sid)) { ++resimulation_count; }
 149                 read.write(n_os, os);
 150                 ++counts[sid];
 151                 if ((i + 1) % 1000000 == 0 && verbose) cout<<"GEN "<< i + 1<< endl;
 152         }
 153         model.finishSimulation();
 154
 155         cout<< "Total number of resimulation is "<< resimulation_count<< endl;
 156 }
 157
 158 void calcExpressionValues(const vector<double>& theta, const vector<double>& eel, vector<double>& tpm, vector<double>& fpkm) {
 159         double denom;
 160         vector<double> frac;
 161
 162         //calculate fraction of count over all mappabile reads
 163         denom = 0.0;
 164         frac.assign(M + 1, 0.0);
 165         for (int i = 1; i <= M; i++)
 166           if (eel[i] >= EPSILON) {
 167             frac[i] = theta[i];
 168             denom += frac[i];
 169           }
 170         general_assert(denom > 0, "No alignable reads?!");
 171         for (int i = 1; i <= M; i++) frac[i] /= denom;
 172
 173         //calculate FPKM
 174         fpkm.assign(M + 1, 0.0);
 175         for (int i = 1; i <= M; i++)
 176                 if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
 177
 178         //calculate TPM
 179         tpm.assign(M + 1, 0.0);
 180         denom = 0.0;
 181         for (int i = 1; i <= M; i++) denom += fpkm[i];
 182         for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;
 183 }
 184
 185 void writeResFiles(char* outFN) {
 186         FILE *fo;
 187         vector<int> tlens;
 188         vector<double> fpkm, tpm, isopct;
 189         vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
 190
 191         for (int i = 1; i <= M; i++)
 192                 general_assert(eel[i] > EPSILON || counts[i] <= EPSILON, "An isoform whose effecitve length < " + ftos(MINEEL, 6) + " got sampled!");
 193
 194         calcExpressionValues(counts, eel, tpm, fpkm);
 195
 196         //calculate IsoPct, etc.
 197         isopct.assign(M + 1, 0.0);
 198         tlens.assign(M + 1, 0);
 199
 200         glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
 201         gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
 202
 203         for (int i = 0; i < m; i++) {
 204                 int b = gi.spAt(i), e = gi.spAt(i + 1);
 205                 for (int j = b; j < e; j++) {
 206                         const Transcript& transcript = transcripts.getTranscriptAt(j);
 207                         tlens[j] = transcript.getLength();
 208
 209                         glens[i] += tlens[j] * tpm[j];
 210                         gene_eels[i] += eel[j] * tpm[j];
 211                         gene_counts[i] += counts[j];
 212                         gene_tpm[i] += tpm[j];
 213                         gene_fpkm[i] += fpkm[j];
 214                 }
 215
 216                 if (gene_tpm[i] < EPSILON) continue;
 217
 218                 for (int j = b; j < e; j++)
 219                         isopct[j] = tpm[j] / gene_tpm[i];
 220                 glens[i] /= gene_tpm[i];
 221                 gene_eels[i] /= gene_tpm[i];
 222         }
 223
 224         //isoform level
 225         sprintf(isoResF, "%s.sim.isoforms.results", outFN);
 226         fo = fopen(isoResF, "w");
 227         fprintf(fo, "transcript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tIsoPct\n");
 228         for (int i = 1; i <= M; i++) {
 229                 const Transcript& transcript = transcripts.getTranscriptAt(i);
 230                 fprintf(fo, "%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i],
 231                                 eel[i], counts[i], tpm[i], fpkm[i], isopct[i] * 1e2);
 232         }
 233         fclose(fo);
 234
 235         //gene level
 236         sprintf(geneResF, "%s.sim.genes.results", outFN);
 237         fo = fopen(geneResF, "w");
 238         fprintf(fo, "gene_id\ttranscript_id(s)\tlength\teffective_length\tcount\tTPM\tFPKM\n");
 239         for (int i = 0; i < m; i++) {
 240                 int b = gi.spAt(i), e = gi.spAt(i + 1);
 241                 const string& gene_id = transcripts.getTranscriptAt(b).getGeneID();
 242                 fprintf(fo, "%s\t", gene_id.c_str());
 243                 for (int j = b; j < e; j++) {
 244                         fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : '\t'));
 245                 }
 246                 fprintf(fo, "%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", glens[i], gene_eels[i], gene_counts[i], gene_tpm[i], gene_fpkm[i]);
 247         }
 248         fclose(fo);
 249 }
 250
 251 void releaseOutReadStreams() {
 252         for (int i = 0; i < n_os; i++) {
 253                 ((ofstream*)os[i])->close();
 254                 delete os[i];
 255         }
 256 }
 257
 258 int main(int argc, char* argv[]) {
 259         bool quiet = false;
 260         FILE *fi = NULL;
 261
 262         if (argc != 7 && argc != 8) {
 263                 printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]\n\n");
 264                 printf("Parameters:\n\n");
 265                 printf("reference_name: The name of RSEM references, which should be already generated by 'rsem-prepare-reference'\n");
 266                 printf("estimated_model_file: This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'\n");
 267                 printf("estimated_isoform_results: This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required.\n");
 268                 printf("theta0: This parameter determines the fraction of reads that are coming from background \"noise\" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.\n");
 269                 printf("N: The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'.\n");
 270                 printf("output_name: Prefix for all output files.\n");
 271                 printf("-q: Set it will stop outputting intermediate information.\n\n");
 272                 printf("Outputs:\n\n");
 273                 printf("output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.\n\n");
 274                 printf("output_name.fa if single-end without quality score;\noutput_name.fq if single-end with quality score;\noutput_name_1.fa & output_name_2.fa if paired-end without quality score;\noutput_name_1.fq & output_name_2.fq if paired-end with quality score.\n\n");
 275                 printf("Format of the header line: Each simulated read's header line encodes where it comes from. The header line has the format:\n\n");
 276                 printf("\t{>/@}_rid_dir_sid_pos[_insertL]\n\n");
 277                 printf("{>/@}: Either '>' or '@' must appear. '>' appears if FASTA files are generated and '@' appears if FASTQ files are generated\n");
 278                 printf("rid: Simulated read's index, numbered from 0\n");
 279                 printf("dir: The direction of the simulated read. 0 refers to forward strand ('+') and 1 refers to reverse strand ('-')\n");
 280                 printf("sid: Represent which transcript this read is simulated from. It ranges between 0 and M, where M is the total number of transcripts. If sid=0, the read is simulated from the background noise. Otherwise, the read is simulated from a transcript with index sid. Transcript sid's transcript name can be found in the 'transcript_id' column of the 'sample_name.isoforms.results' file (at line sid + 1, line 1 is for column names)\n");
 281                 printf("pos: The start position of the simulated read in strand dir of transcript sid. It is numbered from 0\n");
 282                 printf("insertL: Only appear for paired-end reads. It gives the insert length of the simulated read.\n\n");
 283                 printf("Example:\n\n");
 284                 printf("Suppose we want to simulate 50 millon single-end reads with quality scores and use the parameters learned from [Example](#example). In addition, we set theta0 as 0.2 and output_name as 'simulated_reads'. The command is:\n\n");
 285                 printf("\trsem-simulate-reads /ref/mouse_125 mmliver_single_quals.stat/mmliver_single_quals.model mmliver_single_quals.isoforms.results 0.2 50000000 simulated_reads\n");
 286                 exit(-1);
 287         }
 288
 289         if (argc == 8 && !strcmp(argv[7], "-q")) quiet = true;
 290         verbose = !quiet;
 291
 292         //load basic files
 293         sprintf(refF, "%s.seq", argv[1]);
 294         refs.loadRefs(refF);
 295         M = refs.getM();
 296         sprintf(groupF, "%s.grp", argv[1]);
 297         gi.load(groupF);
 298         m = gi.getm();
 299         sprintf(tiF, "%s.ti", argv[1]);
 300         transcripts.readFrom(tiF);
 301
 302         //read model type from modelF
 303         fi = fopen(argv[2], "r");
 304         if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[2]); exit(-1); }
 305         assert(fscanf(fi, "%d", &model_type) == 1);
 306         fclose(fi);
 307
 308         theta.assign(M + 1, 0.0);
 309         theta[0] = atof(argv[4]);
 310         N = atoi(argv[5]);
 311
 312         genOutReadStreams(model_type, argv[6]);
 313
 314         counts.assign(M + 1, 0.0);
 315
 316         switch(model_type) {
 317         case 0: simulate<SingleRead, SingleModel>(argv[2], argv[3]); break;
 318         case 1: simulate<SingleReadQ, SingleQModel>(argv[2], argv[3]); break;
 319         case 2: simulate<PairedEndRead, PairedEndModel>(argv[2], argv[3]); break;
 320         case 3: simulate<PairedEndReadQ, PairedEndQModel>(argv[2], argv[3]); break;
 321         }
 322
 323         writeResFiles(argv[6]);
 324         releaseOutReadStreams();
 325
 326         return 0;
 327 }