14 #include "RefSeqPolicy.h"
15 #include "PolyARules.h"
28 void makeRefs(char*, RefSeqPolicy&, PolyARules&);
29 void loadRefs(char*, int = 0);
32 int getM() { return M; } // get number of isoforms
34 //int getNS() { return M + 1; } // get number of parameters, I do not think we need it here.
36 RefSeq& getRef(int sid) { return seqs[sid]; } // get a particular reference
38 std::vector<RefSeq>& getRefs() { return seqs; } // may be slow, for copying the whole thing
40 bool hasPolyA() { return has_polyA; } // if any of sequence has poly(A) tail added
42 //lim : >=0 If mismatch > lim , return; -1 find all mismatches
43 int countMismatch(const std::string& seq, int pos, const std::string& readseq, int LEN, int lim = -1) {
44 int nMis = 0; // number of mismatches
46 for (int i = 0; i < LEN; i++) {
47 char rc = toupper(readseq[i]);
48 if (seq[i + pos] == 'N' || rc == 'N' || seq[i + pos] != rc) nMis++;
51 if (lim >= 0 && nMis > lim) return nMis;
57 bool isValid(int sid, int dir, int pos, const std::string& readseq, int LEN, int C) {
58 if (sid <= 0 || sid > M || (dir != 0 && dir != 1) || pos < 0 || pos + LEN > seqs[sid].getTotLen() || LEN > (int)readseq.length()) return false;
59 const std::string& seq = seqs[sid].getSeq(dir);
60 return countMismatch(seq, pos, readseq, LEN, C) <= C;
63 // get segment from refs
64 std::string getSegment(int sid, int dir, int pos, int LEN) {
65 if (pos < 0 || pos + LEN > seqs[sid].getTotLen()) return "fail";
67 const std::string& seq = seqs[sid].getSeq(dir);
70 for (int i = 0; i < LEN; i++)
71 seg.append(1, seq[pos + i]);
77 int M; // # of isoforms, id starts from 1
78 std::vector<RefSeq> seqs; // reference sequences, starts from 1; 0 is for noise gene
79 bool has_polyA; // if at least one sequence has polyA added, the value is true; otherwise, the value is false
82 //inpF in fasta format
83 void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) {
84 //read standard fasta format here
86 std::string tag, line, rawseq;
87 void* pt; // istream& is indeed a pointer, that's why I can use void* here
90 seqs.push_back(RefSeq()); // noise isoform
96 if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
97 pt = getline(fin, line);
98 while (pt != 0 && line[0] == '>') {
101 while((pt = getline(fin, line)) && line[0] != '>') {
104 if (rawseq.size() <= 0) {
105 printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str());
109 seqs.push_back(RefSeq(tag, policy.convert(rawseq), rules.getLenAt(tag)));
110 has_polyA = has_polyA || seqs[M].getFullLen() < seqs[M].getTotLen();
114 if (verbose) { printf("Refs.makeRefs finished!\n"); }
117 //inpF in fasta format, with sequence all in one line together
118 //option 0 read all, 1 do not read sequences
119 void Refs::loadRefs(char *inpF, int option) {
124 if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
126 seqs.push_back(RefSeq());
133 success = seq.read(fin, option);
137 has_polyA = has_polyA || seq.getFullLen() < seq.getTotLen();
143 assert(M + 1 == (int)seqs.size());
145 if (verbose) { printf("Refs.loadRefs finished!\n"); }
148 void Refs::saveRefs(char* outF) {
152 for (int i = 1; i <= M; i++) {
157 if (verbose) { printf("Refs.saveRefs finished!\n"); }