11 #include "my_assert.h"
12 #include "Transcript.h"
13 #include "Transcripts.h"
19 map<string, string> name2seq;
20 map<string, string>::iterator iter;
22 Transcripts transcripts(1); // no genome, just transcript set
23 char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN], chromListF[STRLEN];
24 char gtF[STRLEN], taF[STRLEN]; // group info between gene and transcript, transcript and allele
27 char mappingFile[STRLEN];
29 map<string, string> mi_table; // mapping info table
30 map<string, string>::iterator mi_iter; //mapping info table's iterator
32 map<string, string> mi_table2; // allele_id to transcript_id
33 map<string, string>::iterator mi_iter2; // corresponding iterator
35 void loadMappingInfo(int file_type, char* mappingF) {
36 ifstream fin(mappingF);
37 string line, key, value, value2;
39 general_assert(fin.is_open(), "Cannot open " + cstrtos(mappingF) + "! It may not exist.");
44 while (getline(fin, line)) {
45 line = cleanStr(line);
46 if (line[0] == '#') continue;
47 istringstream strin(line);
49 mi_table[key] = value;
55 while (getline(fin, line)) {
56 line = cleanStr(line);
57 if (line[0] == '#') continue;
58 istringstream strin(line);
59 strin>> value>> value2>> key;
60 mi_table[key] = value;
61 mi_table2[key] = value2;
64 default: assert(false);
71 if (!isalpha(c)) { fprintf(stderr, "Sequence contains unknown letter '%c'!\n", c); exit(-1); }
73 if (isupper(c) && c != 'A' && c != 'C' && c != 'G' && c != 'T') c = 'N';
74 if (islower(c) && c != 'a' && c != 'c' && c != 'g' && c != 't') c = 'n';
78 void writeResults(int option, char* refName) {
80 string cur_gene_id, cur_transcript_id, name;
81 vector<int> gi, gt, ta;
83 sprintf(tiF, "%s.ti", refName);
84 transcripts.writeTo(tiF);
85 if (verbose) { printf("Transcript Information File is generated!\n"); }
87 cur_gene_id = ""; gi.clear();
88 if (option == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); }
89 for (int i = 1; i <= M; i++) {
90 const Transcript& transcript = transcripts.getTranscriptAt(i);
91 if (cur_gene_id != transcript.getGeneID()) {
93 if (option == 2) gt.push_back((int)ta.size());
94 cur_gene_id = transcript.getGeneID();
96 if ((option == 2) && (cur_transcript_id != transcript.getTranscriptID())) {
98 cur_transcript_id = transcript.getTranscriptID();
102 if (option == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); }
104 sprintf(groupF, "%s.grp", refName);
106 for (int i = 0; i < (int)gi.size(); i++) fout<< gi[i]<< endl;
108 if (verbose) { printf("Group File is generated!\n"); }
111 sprintf(gtF, "%s.gt", refName);
113 for (int i = 0; i < (int)gt.size(); i++) fout<< gt[i]<< endl;
115 sprintf(taF, "%s.ta", refName);
117 for (int i = 0; i < (int)ta.size(); i++) fout<< ta[i]<< endl;
119 if (verbose) { printf("Allele-specific group files are generated!\n"); }
122 sprintf(refFastaF, "%s.transcripts.fa", refName);
123 sprintf(chromListF, "%s.chrlist", refName);
124 fout2.open(chromListF);
125 fout.open(refFastaF);
126 for (int i = 1; i <= M; i++) {
127 name = transcripts.getTranscriptAt(i).getSeqName();
128 iter = name2seq.find(name);
129 general_assert(iter != name2seq.end(), "Cannot recognize sequence ID" + name + "!");
130 fout<<">"<<name<<endl;
131 fout<<iter->second<<endl;
133 fout2<<name<<'\t'<<iter->second.length()<<endl;
139 printf("Chromosome List File is generated!\n");
140 printf("Extracted Sequences File is generated!\n");
144 int main(int argc, char* argv[]) {
145 if (argc < 5 || ((hasMappingFile = atoi(argv[3])) && argc < 6)) {
146 printf("Usage: synthesisRef refName quiet hasMappingFile<0,no;1,yes;2,allele-specific> [mappingFile] reference_file_1 [reference_file_2 ...]\n");
150 verbose = !atoi(argv[2]);
152 if (hasMappingFile) { loadMappingInfo(hasMappingFile, argv[4]); }
155 if (hasMappingFile == 2) { transcripts.setType(2); }
157 int start = hasMappingFile ? 5 : 4;
161 string seqname, gene_id, transcript_id;
163 vector<Interval> vec;
167 for (int i = start; i < argc; i++) {
169 general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist.");
171 while ((fin) && (line[0] == '>')) {
172 istringstream strin(line.substr(1));
176 while((getline(fin, line)) && (line[0] != '>')) {
180 int len = gseq.length();
182 for (int j = 0; j < len; j++) gseq[j] = check(gseq[j]);
184 name2seq[seqname] = gseq;
186 transcript_id = seqname;
189 if (hasMappingFile) {
190 mi_iter = mi_table.find(seqname);
191 general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + seqname + "'s gene_id!");
192 gene_id = mi_iter->second;
193 if (hasMappingFile == 2) {
194 mi_iter2 = mi_table2.find(seqname);
195 general_assert(mi_iter2 != mi_table2.end(), "Mapping Info is not correct, cannot find allele " + seqname + "'s transcript_id!");
196 transcript_id = mi_iter2->second;
201 vec.push_back(Interval(1, len));
202 transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, ""));
205 if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); }
211 fprintf(stderr, "Number of transcripts in the reference is less than 1!\n");
215 assert(M == transcripts.getM());
218 writeResults(hasMappingFile, argv[1]);