12 #include "my_assert.h"
14 #include "Transcript.h"
15 #include "Transcripts.h"
23 ChrInfo(const string& name, size_t len) {
28 bool operator< (const ChrInfo& o) const {
35 vector<GTFItem> items;
37 vector<int> starts; // used to generate .grp
38 map<string, vector<int> > sn2tr; // map from seqname to transcripts
39 map<string, vector<int> >::iterator iter;
40 vector<ChrInfo> chrvec;
42 Transcripts transcripts;
44 char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN];
45 char chromListF[STRLEN];
48 char mappingFile[STRLEN];
50 map<string, string> mi_table; // mapping info table
51 map<string, string>::iterator mi_iter; //mapping info table's iterator
53 void loadMappingInfo(char* mappingF) {
54 ifstream fin(mappingF);
55 string line, key, value;
57 general_assert(fin.is_open(), "Cannot open " + cstrtos(mappingF) + "! It may not exist.");
60 while (getline(fin, line)) {
61 line = cleanStr(line);
62 if (line[0] == '#') continue;
63 istringstream strin(line);
65 mi_table[key] = value;
71 bool buildTranscript(int sp, int ep) {
72 int cur_s, cur_e; // current_start, current_end
75 string transcript_id = items[sp].getTranscriptID();
76 string gene_id = items[sp].getGeneID();
77 char strand = items[sp].getStrand();
78 string seqname = items[sp].getSeqName();
79 string left = items[sp].getLeft();
83 for (int i = sp; i <= ep; i++) {
84 int start = items[i].getStart();
85 int end = items[i].getEnd();
87 general_assert(strand == items[i].getStrand(), "According to the GTF file given, a transcript has exons from different orientations!");
88 general_assert(seqname == items[i].getSeqName(), "According to the GTF file given, a transcript has exons on multiple chromosomes!");
90 if (cur_e + 1 < start) {
91 if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e));
94 cur_e = (cur_e < end ? end : cur_e);
96 if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e));
98 transcripts.add(Transcript(transcript_id, gene_id, seqname, strand, vec, left));
103 void parse_gtf_file(char* gtfF) {
105 string line, tid, gid;
108 general_assert(fin.is_open(), "Cannot open " + cstrtos(gtfF) + "! It may not exist.");
113 while (getline(fin, line)) {
114 if (line[0] == '#') continue; // if this line is comment, jump it
116 string feature = item.getFeature();
117 if (feature == "exon") {
118 if (item.getStart() > item.getEnd()) {
119 printf("Warning: exon's start position is larger than its end position! This exon is discarded.\n");
120 printf("\t%s\n\n", line.c_str());
122 else if (item.getStart() < 1) {
123 printf("Warning: exon's start position is less than 1! This exon is discarded.\n");
124 printf("\t%s\n\n", line.c_str());
127 if (hasMappingFile) {
128 tid = item.getTranscriptID();
129 mi_iter = mi_table.find(tid);
130 general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + tid + "'s gene_id!");
131 gid = mi_iter->second;
134 items.push_back(item);
139 if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); }
143 sort(items.begin(), items.end());
145 int sp = 0, ep; // start pointer, end pointer
146 int nItems = items.size();
149 while (sp < nItems) {
150 tid = items[sp].getTranscriptID();
153 while (ep < nItems && items[ep].getTranscriptID() == tid) ep++;
156 buildTranscript(sp, ep);
158 int sid = transcripts.getM();
159 const Transcript& transcript = transcripts.getTranscriptAt(sid);
161 iter = sn2tr.find(transcript.getSeqName());
162 if (iter == sn2tr.end()) {
163 vector<int> vec(1, sid);
164 sn2tr[transcript.getSeqName()] = vec;
167 iter->second.push_back(sid);
175 M = transcripts.getM();
176 general_assert(M > 0, "The reference contains no transcripts!");
178 if (verbose) { printf("Parsing gtf File is done!\n"); }
182 general_assert(isalpha(c), "Sequence contains unknown letter '" + ctos(c) + "'!");
183 if (isupper(c) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N';
184 if (islower(c) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n';
191 for (int i = 1; i <= M; i++)
193 const Transcript& transcript = transcripts.getTranscriptAt(i);
194 printf("Warning: Cannot extract transcript %s's sequence since the chromosome it locates, %s, is absent!\n", transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str());
198 transcripts.move(i, curp);
199 if (i > curp) seqs[curp] = seqs[i];
202 printf("%d transcripts are extracted and %d transcripts are omitted.\n", curp, M - curp);
204 transcripts.setM(curp);
205 M = transcripts.getM();
206 general_assert(M > 0, "The reference contains no transcripts!");
209 string curgid = "", gid;
211 for (int i = 1; i <= M; i++) {
212 gid = transcripts.getTranscriptAt(i).getGeneID();
218 starts.push_back(M + 1);
221 void writeResults(char* refName) {
225 sprintf(groupF, "%s.grp", refName);
226 sprintf(tiF, "%s.ti", refName);
227 sprintf(refFastaF, "%s.transcripts.fa", refName);
228 sprintf(chromListF, "%s.chrlist", refName);
233 for (int i = 0; i < s; i++) fout<<starts[i]<<endl;
235 if (verbose) { printf("Group File is generated!\n"); }
237 transcripts.writeTo(tiF);
238 if (verbose) { printf("Transcript Information File is generated!\n"); }
240 fout.open(chromListF);
242 for (int i = 0; i < s; i++) {
243 fout<<chrvec[i].name<<'\t'<<chrvec[i].len<<endl;
246 if (verbose) { printf("Chromosome List File is generated!\n"); }
248 fout.open(refFastaF);
249 for (int i = 1; i <= M; i++) {
250 fout<<">"<<transcripts.getTranscriptAt(i).getTranscriptID()<<endl;
254 if (verbose) { printf("Extracted Sequences File is generated!\n"); }
257 int main(int argc, char* argv[]) {
258 if (argc < 6 || ((hasMappingFile = atoi(argv[4])) && argc < 7)) {
259 printf("Usage: rsem-extract-reference-transcripts refName quiet gtfF hasMappingFile [mappingFile] chromosome_file_1 [chromosome_file_2 ...]\n");
263 verbose = !atoi(argv[2]);
264 if (hasMappingFile) {
265 loadMappingInfo(argv[5]);
267 parse_gtf_file(argv[3]);
270 string line, gseq, seqname;
275 seqs.resize(M + 1, "");
276 int start = hasMappingFile ? 6 : 5;
277 for (int i = start; i < argc; i++) {
279 general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist.");
281 while ((fin) && (line[0] == '>')) {
282 istringstream strin(line.substr(1));
286 while((getline(fin, line)) && (line[0] != '>')) {
290 size_t len = gseq.length();
292 for (size_t j = 0; j < len; j++) gseq[j] = check(gseq[j]);
294 iter = sn2tr.find(seqname);
295 if (iter == sn2tr.end()) continue;
297 chrvec.push_back(ChrInfo(seqname, len));
299 vector<int>& vec = iter->second;
301 for (int j = 0; j < s; j++) {
302 assert(vec[j] > 0 && vec[j] <= M);
303 transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]);
308 if (verbose) { printf("%s is processed!\n", argv[i]); }
310 sort(chrvec.begin(), chrvec.end());
313 if (verbose) { printf("Extracting sequences is done!\n"); }
315 writeResults(argv[1]);