vector<double> eel;
double *mw;
+vector<int> pseudo_counts;
vector<double> pme_c, pve_c; //global posterior mean and variance vectors on counts
vector<double> pme_tpm, pme_fpkm;
GroupInfo gt, ta;
vector<double> pve_c_genes, pve_c_trans;
-void load_group_info(char* refName) {
- // Load group info
- sprintf(groupF, "%s.grp", refName);
- gi.load(groupF);
- m = gi.getm();
-
- alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific
- m_trans = (alleleS ? ta.getm() : 0);
-
- if (verbose) { printf("Loading group information is finished!\n"); }
-}
-
void load_data(char* refName, char* statName, char* imdName) {
ifstream fin;
string line;
if (verbose) { printf("Loading data is finished!\n"); }
}
+void load_group_info(char* refName) {
+ // Load group info
+ sprintf(groupF, "%s.grp", refName);
+ gi.load(groupF);
+ m = gi.getm();
+
+ alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific
+ m_trans = (alleleS ? ta.getm() : 0);
+
+ if (verbose) { printf("Loading group information is finished!\n"); }
+}
+
+// Load imdName.omit and initialize the pseudo count vector.
+void load_omit_info(const char* imdName) {
+ char omitF[STRLEN];
+ sprintf(omitF, "%s.omit", imdName);
+ FILE *fi = fopen(omitF, "r");
+ pseudo_counts.assign(M + 1, 1);
+ int tid;
+ while (fscanf(fi, "%d", &tid) == 1) pseudo_counts[tid] = 0;
+ fclose(fi);
+}
+
template<class ModelType>
void init_model_related(char* modelF) {
ModelType model;
theta.assign(M + 1, 0);
denom = 0.0;
for (int i = 0; i <= M; i++) {
- theta[i] = gmg();
+ theta[i] = (pseudo_counts[i] > 0 ? gmg() : 0.0);
denom += theta[i];
}
assert(denom > EPSILON);
Params *params = (Params*)arg;
vector<double> theta, tpm, fpkm;
- vector<int> z, counts;
+ vector<int> z, counts(pseudo_counts);
vector<double> arr;
uniform_01_generator rg(*params->engine, uniform_01_dist());
sampleTheta(*params->engine, theta);
z.assign(N1, 0);
-
- counts.assign(M + 1, 1); // 1 pseudo count
counts[0] += N0;
for (READ_INT_TYPE i = 0; i < N1; i++) {
polishTheta(M, theta, eel, mw);
calcExpressionValues(M, theta, eel, tpm, fpkm);
for (int i = 0; i <= M; i++) {
- params->pme_c[i] += counts[i] - 1;
- params->pve_c[i] += double(counts[i] - 1) * (counts[i] - 1);
+ params->pme_c[i] += counts[i] - pseudo_counts[i];
+ params->pve_c[i] += double(counts[i] - pseudo_counts[i]) * (counts[i] - pseudo_counts[i]);
params->pme_tpm[i] += tpm[i];
params->pme_fpkm[i] += fpkm[i];
}
for (int i = 0; i < m; i++) {
int b = gi.spAt(i), e = gi.spAt(i + 1);
double count = 0.0;
- for (int j = b; j < e; j++) count += counts[j] - 1;
+ for (int j = b; j < e; j++) count += counts[j] - pseudo_counts[j];
params->pve_c_genes[i] += count * count;
}
for (int i = 0; i < m_trans; i++) {
int b = ta.spAt(i), e = ta.spAt(i + 1);
double count = 0.0;
- for (int j = b; j < e; j++) count += counts[j] - 1;
+ for (int j = b; j < e; j++) count += counts[j] - pseudo_counts[j];
params->pve_c_trans[i] += count * count;
}
}
printf("Warning: Number of samples is less than number of threads! Change the number of threads to %d!\n", nThreads);
}
- load_group_info(refName);
load_data(refName, statName, imdName);
+ load_group_info(refName);
+ load_omit_info(imdName);
sprintf(modelF, "%s.model", statName);
FILE *fi = fopen(modelF, "r");
Transcript.h : utils.h
-Transcripts.h : my_assert.h Transcript.h
+Transcripts.h : utils.h my_assert.h Transcript.h
rsem-extract-reference-transcripts : utils.h my_assert.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp
$(CC) -Wall -O3 extractRef.cpp -o rsem-extract-reference-transcripts
class SamParser {
public:
- SamParser(char, const char*, Transcripts&, const char* = 0);
+ SamParser(char, const char*, const char*, Transcripts&, const char*);
~SamParser();
/**
char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
// aux, if not 0, points to the file name of fn_list
-SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux)
+SamParser::SamParser(char inpType, const char* inpF, const char* aux, Transcripts& transcripts, const char* imdName)
: transcripts(transcripts)
{
switch(inpType) {
header = sam_in->header;
general_assert(header != 0, "Fail to parse sam header!");
- transcripts.buildMappings(header->n_targets, header->target_name);
+ transcripts.buildMappings(header->n_targets, header->target_name, imdName);
b = bam_init1();
b2 = bam_init1();
#include<map>
#include<string>
+#include "utils.h"
#include "my_assert.h"
#include "Transcript.h"
return transcripts[getInternalSid(eid)];
}
- void buildMappings(int, char**);
+ void buildMappings(int, char**, const char* = NULL);
private:
int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific
fout.close();
}
-void Transcripts::buildMappings(int n_targets, char** target_name) {
+void Transcripts::buildMappings(int n_targets, char** target_name, const char* imdName) {
std::map<std::string, int> dict;
std::map<std::string, int>::iterator iter;
+ std::vector<bool> appeared;
- general_assert(n_targets == M, "Number of reference sequences does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
+ general_assert(n_targets > 0, "The SAM/BAM file declares less than one reference sequence!");
+ general_assert(n_targets <= M, "The SAM/BAM file declares more reference sequences (" + itos(n_targets) + ") than RSEM knows (" + itos(M) + ")!");
+ if (n_targets < M) printf("Warning: The SAM/BAM file declares less reference sequences (%d) than RSEM knows (%d)!\n", n_targets, M);
dict.clear();
for (int i = 1; i <= M; i++) {
const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
iter = dict.find(tid);
- general_assert(iter == dict.end(), tid + " appears more than once!");
+ general_assert(iter == dict.end(), "RSEM's indices might be corrupted, " + tid + " appears more than once!");
dict[tid] = i;
}
e2i.assign(M + 1, 0);
i2e.assign(M + 1, 0);
+ appeared.assign(M + 1, false);
for (int i = 0; i < n_targets; i++) {
iter = dict.find(std::string(target_name[i]));
general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
- general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!");
+ general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " appears more than once in the SAM/BAM file!");
e2i[i + 1] = iter->second;
i2e[iter->second] = i + 1;
iter->second = -1;
+ appeared[e2i[i + 1]] = true;
+ }
+
+ if (imdName != NULL) {
+ char omitF[STRLEN];
+ sprintf(omitF, "%s.omit", imdName);
+ FILE *fo = fopen(omitF, "w");
+ for (int i = 1; i <= M; i++)
+ if (!appeared[i]) fprintf(fo, "%d\n", i);
+ fclose(fo);
}
}
int cnt = 0;
while (fscanf(fi, "%d", &cvec[0]) == 1) {
for (int j = 1; j <= M; j++) assert(fscanf(fi, "%d", &cvec[j]) == 1);
+ assert(cvec[0] > 0);
++cnt;
for (int j = 0; j <= M; j++) {
- gammas[j] = new gamma_dist(cvec[j]);
- rgs[j] = new gamma_generator(*(params->engine), *gammas[j]);
+ gammas[j] = NULL; rgs[j] = NULL;
+ if (cvec[j] > 0) {
+ gammas[j] = new gamma_dist(cvec[j]);
+ rgs[j] = new gamma_generator(*(params->engine), *gammas[j]);
+ }
}
-
+
for (int i = 0; i < nSpC; i++) {
double sum = 0.0;
for (int j = 0; j <= M; j++) {
- theta[j] = ((j == 0 || (eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
+ theta[j] = ((j == 0 || (cvec[j] > 0 && eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
sum += theta[j];
}
assert(sum >= EPSILON);
}
for (int j = 0; j <= M; j++) {
- delete gammas[j];
- delete rgs[j];
+ if (gammas[j] != NULL) delete gammas[j];
+ if (rgs[j] != NULL) delete rgs[j];
}
if (verbose && cnt % 100 == 0) { printf("Thread %d, %d count vectors are processed!\n", params->no, cnt); }
char* aux = 0;
if (strcmp(fn_list, "")) aux = fn_list;
- parser = new SamParser(alignFType, alignF, transcripts, aux);
+ parser = new SamParser(alignFType, alignF, aux, transcripts, imdName);
memset(cat, 0, sizeof(cat));
memset(readOutFs, 0, sizeof(readOutFs));
=item B<--ci-number-of-samples-per-count-vector> <int>
-The number of read generating probability vectors sampled per sampled count vector. The crebility intervals are calculated by first sampling P(C | D) and then sampling (\Theta | C) for each sampled count vector. This option controls how many \Theta vectors are sampled per sampled count vector. (Default: 50)
+The number of read generating probability vectors sampled per sampled count vector. The crebility intervals are calculated by first sampling P(C | D) and then sampling P(Theta | C) for each sampled count vector. This option controls how many Theta vectors are sampled per sampled count vector. (Default: 50)
=item B<--samtools-sort-mem> <string>