10 #include "Transcript.h"
11 #include "Transcripts.h"
17 map<string, string> name2seq;
18 map<string, string>::iterator iter;
20 Transcripts transcripts;
21 char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN];
24 char mappingFile[STRLEN];
26 map<string, string> mi_table; // mapping info table
27 map<string, string>::iterator mi_iter; //mapping info table's iterator
29 void loadMappingInfo(char* mappingF) {
30 ifstream fin(mappingF);
31 string line, key, value;
34 fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF);
39 while (getline(fin, line)) {
40 line = cleanStr(line);
41 if (line[0] == '#') continue;
42 istringstream strin(line);
44 mi_table[key] = value;
51 if (!isalpha(c)) { fprintf(stderr, "Sequence contains unknown letter '%c'!\n", c); exit(-1); }
53 if (isupper(c) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N';
54 if (islower(c) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n';
58 void writeResults(char* refName) {
60 string cur_gene_id, name;
62 sprintf(groupF, "%s.grp", refName);
63 sprintf(tiF, "%s.ti", refName);
64 sprintf(refFastaF, "%s.transcripts.fa", refName);
66 transcripts.writeTo(tiF);
67 if (verbose) { printf("Transcript Information File is generated!\n"); }
71 for (int i = 1; i <= M; i++) {
72 const Transcript& transcript = transcripts.getTranscriptAt(i);
73 if (cur_gene_id != transcript.getGeneID()) {
75 cur_gene_id = transcript.getGeneID();
80 if (verbose) { printf("Group File is generated!\n"); }
82 // We have to generate this .transcripts.fa, even reference_file is only one. Reason : polyA choice 2, need ">transcript_id"
84 for (int i = 1; i <= M; i++) {
85 name = transcripts.getTranscriptAt(i).getTranscriptID();
86 iter = name2seq.find(name);
87 if (iter == name2seq.end()) {
88 fprintf(stderr, "Cannot recognize transcript ID %s!\n", name.c_str());
91 fout<<">"<<name<<endl;
92 fout<<iter->second<<endl;
95 if (verbose) { printf("Extracted Sequences File is generated!\n"); }
98 int main(int argc, char* argv[]) {
99 if (argc < 5 || (hasMappingFile = atoi(argv[3])) && argc < 6) {
100 printf("Usage: synthesisRef refName quiet hasMappingFile [mappingFile] reference_file_1 [reference_file_2 ...]\n");
104 verbose = !atoi(argv[2]);
106 if (hasMappingFile) { loadMappingInfo(argv[4]); }
108 int start = hasMappingFile ? 5 : 4;
112 string seqname, gene_id;
115 vector<Interval> vec;
119 for (int i = start; i < argc; i++) {
121 if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); }
122 pt = getline(fin, line);
123 while (pt != 0 && line[0] == '>') {
124 istringstream strin(line.substr(1));
128 while((pt = getline(fin, line)) && line[0] != '>') {
132 int len = gseq.length();
134 for (int j = 0; j < len; j++) gseq[j] = check(gseq[j]);
136 name2seq[seqname] = gseq;
138 if (hasMappingFile) {
139 mi_iter = mi_table.find(seqname);
140 if (mi_iter == mi_table.end()) {
141 fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", seqname.c_str());
144 //assert(iter != table.end());
145 gene_id = mi_iter->second;
147 else gene_id = seqname;
150 vec.push_back(Interval(1, len));
151 transcripts.add(Transcript(seqname, gene_id, seqname, '+', vec, ""));
154 if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); }
160 fprintf(stderr, "Number of transcripts in the reference is less than 1!\n");
164 assert(M == transcripts.getM());
167 writeResults(argv[1]);