X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=synthesisRef.cpp;h=bde2d5b76045b7d125ca2761e3add289784c857a;hp=3bb28086b3fee8755d81b88ad8f053b1ffd5a0af;hb=da57529b92adbb7ae74a89861cb39fb35ac7c62d;hpb=019648be71e0b8ea5772530b5496720fcb841bba diff --git a/synthesisRef.cpp b/synthesisRef.cpp index 3bb2808..bde2d5b 100644 --- a/synthesisRef.cpp +++ b/synthesisRef.cpp @@ -5,8 +5,10 @@ #include #include #include +#include #include "utils.h" +#include "my_assert.h" #include "Transcript.h" #include "Transcripts.h" @@ -17,31 +19,49 @@ int M; map name2seq; map::iterator iter; -Transcripts transcripts; +Transcripts transcripts(1); // no genome, just transcript set char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN], chromListF[STRLEN]; +char gtF[STRLEN], taF[STRLEN]; // group info between gene and transcript, transcript and allele -bool hasMappingFile; +int hasMappingFile; char mappingFile[STRLEN]; map mi_table; // mapping info table map::iterator mi_iter; //mapping info table's iterator -void loadMappingInfo(char* mappingF) { - ifstream fin(mappingF); - string line, key, value; - - if (!fin.is_open()) { - fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF); - exit(-1); - } +map mi_table2; // allele_id to transcript_id +map::iterator mi_iter2; // corresponding iterator - mi_table.clear(); - while (getline(fin, line)) { - line = cleanStr(line); - if (line[0] == '#') continue; - istringstream strin(line); - strin>>value>>key; - mi_table[key] = value; +void loadMappingInfo(int file_type, char* mappingF) { + ifstream fin(mappingF); + string line, key, value, value2; + + general_assert(fin.is_open(), "Cannot open " + cstrtos(mappingF) + "! It may not exist."); + + switch(file_type) { + case 1: + mi_table.clear(); + while (getline(fin, line)) { + line = cleanStr(line); + if (line[0] == '#') continue; + istringstream strin(line); + strin>>value>>key; + mi_table[key] = value; + } + break; + case 2: + mi_table.clear(); + mi_table2.clear(); + while (getline(fin, line)) { + line = cleanStr(line); + if (line[0] == '#') continue; + istringstream strin(line); + strin>> value>> value2>> key; + mi_table[key] = value; + mi_table2[key] = value2; + } + break; + default: assert(false); } fin.close(); @@ -55,40 +75,58 @@ char check(char c) { return c; } -void writeResults(char* refName) { +void writeResults(int option, char* refName) { ofstream fout, fout2; - string cur_gene_id, name; + string cur_gene_id, cur_transcript_id, name; + vector gi, gt, ta; - sprintf(groupF, "%s.grp", refName); sprintf(tiF, "%s.ti", refName); - sprintf(refFastaF, "%s.transcripts.fa", refName); - sprintf(chromListF, "%s.chrlist", refName); - transcripts.writeTo(tiF); if (verbose) { printf("Transcript Information File is generated!\n"); } - fout.open(groupF); - cur_gene_id = ""; + cur_gene_id = ""; gi.clear(); + if (option == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); } for (int i = 1; i <= M; i++) { const Transcript& transcript = transcripts.getTranscriptAt(i); if (cur_gene_id != transcript.getGeneID()) { - fout<"<second< [mappingFile] reference_file_1 [reference_file_2 ...]\n"); exit(-1); } verbose = !atoi(argv[2]); - if (hasMappingFile) { loadMappingInfo(argv[4]); } + if (hasMappingFile) { loadMappingInfo(hasMappingFile, argv[4]); } + + // allele-specific + if (hasMappingFile == 2) { transcripts.setType(2); } int start = hasMappingFile ? 5 : 4; ifstream fin; string line, gseq; - string seqname, gene_id; - void* pt; + string seqname, gene_id, transcript_id; vector vec; @@ -126,14 +166,14 @@ int main(int argc, char* argv[]) { name2seq.clear(); for (int i = start; i < argc; i++) { fin.open(argv[i]); - if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); } - pt = getline(fin, line); - while (pt != 0 && line[0] == '>') { + general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist."); + getline(fin, line); + while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); strin>>seqname; gseq = ""; - while((pt = getline(fin, line)) && line[0] != '>') { + while((getline(fin, line)) && (line[0] != '>')) { gseq += line; } @@ -143,20 +183,23 @@ int main(int argc, char* argv[]) { name2seq[seqname] = gseq; + transcript_id = seqname; + gene_id = seqname; + if (hasMappingFile) { mi_iter = mi_table.find(seqname); - if (mi_iter == mi_table.end()) { - fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", seqname.c_str()); - exit(-1); - } - //assert(iter != table.end()); + general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + seqname + "'s gene_id!"); gene_id = mi_iter->second; + if (hasMappingFile == 2) { + mi_iter2 = mi_table2.find(seqname); + general_assert(mi_iter2 != mi_table2.end(), "Mapping Info is not correct, cannot find allele " + seqname + "'s transcript_id!"); + transcript_id = mi_iter2->second; + } } - else gene_id = seqname; - + vec.clear(); vec.push_back(Interval(1, len)); - transcripts.add(Transcript(seqname, gene_id, seqname, '+', vec, "")); + transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, "")); ++M; if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); } @@ -172,7 +215,7 @@ int main(int argc, char* argv[]) { assert(M == transcripts.getM()); transcripts.sort(); - writeResults(argv[1]); + writeResults(hasMappingFile, argv[1]); return 0; }