13 #include "Transcript.h"
14 #include "Transcripts.h"
22 ChrInfo(const string& name, size_t len) {
27 bool operator< (const ChrInfo& o) const {
34 vector<GTFItem> items;
36 vector<int> starts; // used to generate .grp
37 map<string, vector<int> > sn2tr; // map from seqname to transcripts
38 map<string, vector<int> >::iterator iter;
39 vector<ChrInfo> chrvec;
41 Transcripts transcripts;
43 char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN];
44 char chromListF[STRLEN];
47 char mappingFile[STRLEN];
49 map<string, string> mi_table; // mapping info table
50 map<string, string>::iterator mi_iter; //mapping info table's iterator
52 void loadMappingInfo(char* mappingF) {
53 ifstream fin(mappingF);
54 string line, key, value;
57 fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF);
62 while (getline(fin, line)) {
63 line = cleanStr(line);
64 if (line[0] == '#') continue;
65 istringstream strin(line);
67 mi_table[key] = value;
73 bool buildTranscript(int sp, int ep) {
74 int cur_s, cur_e; // current_start, current_end
77 string transcript_id = items[sp].getTranscriptID();
78 string gene_id = items[sp].getGeneID();
79 char strand = items[sp].getStrand();
80 string seqname = items[sp].getSeqName();
81 string left = items[sp].getLeft();
85 for (int i = sp; i <= ep; i++) {
86 int start = items[i].getStart();
87 int end = items[i].getEnd();
89 if (strand != items[i].getStrand()) {
90 fprintf(stderr, "According to the GTF file given, a transcript has exons from different orientations!\n");
93 if (seqname != items[i].getSeqName()) {
94 fprintf(stderr, "According to the GTF file given, a transcript has exons on multiple chromosomes!\n");
98 if (cur_e + 1 < start) {
99 if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e));
102 cur_e = (cur_e < end ? end : cur_e);
104 if (cur_s > 0) vec.push_back(Interval(cur_s, cur_e));
106 transcripts.add(Transcript(transcript_id, gene_id, seqname, strand, vec, left));
111 void parse_gtf_file(char* gtfF) {
113 string line, curgid, tid, gid; // curgid: current gene id;
116 if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", gtfF); exit(-1); }
121 while (getline(fin, line)) {
122 if (line[0] == '#') continue; // if this line is comment, jump it
124 string feature = item.getFeature();
125 if (feature == "exon") {
126 if (item.getStart() > item.getEnd()) {
127 fprintf(stderr, "Warning: exon's start position is larger than its end position! This exon is discarded.\n");
128 fprintf(stderr, "\t%s\n\n", line.c_str());
130 else if (item.getStart() < 1) {
131 fprintf(stderr, "Warning: exon's start position is less than 1! This exon is discarded.\n");
132 fprintf(stderr, "\t%s\n\n", line.c_str());
135 if (hasMappingFile) {
136 tid = item.getTranscriptID();
137 mi_iter = mi_table.find(tid);
138 if (mi_iter == mi_table.end()) {
139 fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str());
142 //assert(iter != table.end());
143 gid = mi_iter->second;
146 items.push_back(item);
151 if (verbose && cnt % 200000 == 0) { printf("Parsed %d lines\n", cnt); }
155 sort(items.begin(), items.end());
161 int sp = 0, ep; // start pointer, end pointer
162 int nItems = items.size();
164 while (sp < nItems) {
165 tid = items[sp].getTranscriptID();
166 gid = items[sp].getGeneID();
169 while (ep < nItems && items[ep].getTranscriptID() == tid) ep++;
172 buildTranscript(sp, ep);
174 int sid = transcripts.getM();
175 const Transcript& transcript = transcripts.getTranscriptAt(sid);
178 starts.push_back(sid);
181 iter = sn2tr.find(transcript.getSeqName());
182 if (iter == sn2tr.end()) {
183 vector<int> vec(1, sid);
184 sn2tr[transcript.getSeqName()] = vec;
187 iter->second.push_back(sid);
193 M = transcripts.getM();
194 starts.push_back(M + 1);
198 fprintf(stderr, "Number of transcripts in the reference is less than 1!\n");
202 if (verbose) { printf("Parsing gtf File is done!\n"); }
206 if (!isalpha(c)) { fprintf(stderr, "Sequence contains unknown letter '%c'!\n", c); exit(-1); }
207 //assert(isalpha(c));
208 if (isupper(c) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N';
209 if (islower(c) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n';
213 void writeResults(char* refName) {
217 sprintf(groupF, "%s.grp", refName);
218 sprintf(tiF, "%s.ti", refName);
219 sprintf(refFastaF, "%s.transcripts.fa", refName);
220 sprintf(chromListF, "%s.chrlist", refName);
225 for (int i = 0; i < s; i++) fout<<starts[i]<<endl;
227 if (verbose) { printf("Group File is generated!\n"); }
229 transcripts.writeTo(tiF);
230 if (verbose) { printf("Transcript Information File is generated!\n"); }
232 fout.open(chromListF);
234 for (int i = 0; i < s; i++) {
235 fout<<chrvec[i].name<<'\t'<<chrvec[i].len<<endl;
238 if (verbose) { printf("Chromosome List File is generated!\n"); }
240 fout.open(refFastaF);
241 for (int i = 1; i <= M; i++) {
242 fout<<">"<<transcripts.getTranscriptAt(i).getTranscriptID()<<endl;
246 if (verbose) { printf("Extracted Sequences File is generated!\n"); }
249 int main(int argc, char* argv[]) {
250 if (argc < 6 || ((hasMappingFile = atoi(argv[4])) && argc < 7)) {
251 printf("Usage: rsem-extract-reference-transcripts refName quiet gtfF hasMappingFile [mappingFile] chromosome_file_1 [chromosome_file_2 ...]\n");
255 verbose = !atoi(argv[2]);
256 if (hasMappingFile) {
257 loadMappingInfo(argv[5]);
259 parse_gtf_file(argv[3]);
262 string line, gseq, seqname;
268 seqs.resize(M + 1, "");
269 int start = hasMappingFile ? 6 : 5;
270 for (int i = start; i < argc; i++) {
272 if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); }
273 pt = getline(fin, line);
274 while (pt != 0 && line[0] == '>') {
275 istringstream strin(line.substr(1));
279 while((pt = getline(fin, line)) && line[0] != '>') {
283 size_t len = gseq.length();
285 for (size_t j = 0; j < len; j++) gseq[j] = check(gseq[j]);
287 iter = sn2tr.find(seqname);
288 if (iter == sn2tr.end()) continue;
290 chrvec.push_back(ChrInfo(seqname, len));
292 vector<int>& vec = iter->second;
294 for (int j = 0; j < s; j++) {
295 assert(vec[j] > 0 && vec[j] <= M);
296 transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]);
301 if (verbose) { printf("%s is processed!\n", argv[i]); }
304 for (int i = 1; i <= M; i++) {
306 const Transcript& transcript = transcripts.getTranscriptAt(i);
307 fprintf(stderr, "Cannot extract transcript %s's sequence from chromosome %s, whose information might not be provided! Please check if the chromosome directory is set correctly or the list of chromosome files is complete.\n", \
308 transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str());
313 sort(chrvec.begin(), chrvec.end());
315 if (verbose) { printf("Extracting sequences is done!\n"); }
317 writeResults(argv[1]);