14 #include "RefSeqPolicy.h"
15 #include "PolyARules.h"
28 void makeRefs(char*, RefSeqPolicy&, PolyARules&);
29 void loadRefs(char*, int = 0);
32 int getM() { return M; } // get number of isoforms
34 //int getNS() { return M + 1; } // get number of parameters, I do not think we need it here.
36 RefSeq& getRef(int sid) { return seqs[sid]; } // get a particular reference
38 std::vector<RefSeq>& getRefs() { return seqs; } // may be slow, for copying the whole thing
40 bool hasPolyA() { return has_polyA; } // if any of sequence has poly(A) tail added
42 //lim : >=0 If mismatch > lim , return; -1 find all mismatches
43 int countMismatch(const std::string& seq, int pos, const std::string& readseq, int LEN, int lim = -1) {
44 int nMis = 0; // number of mismatches
46 for (int i = 0; i < LEN; i++) {
47 char rc = toupper(readseq[i]);
48 if (seq[i + pos] == 'N' || rc == 'N' || seq[i + pos] != rc) nMis++;
51 if (lim >= 0 && nMis > lim) return nMis;
57 bool isValid(int sid, int dir, int pos, const std::string& readseq, int LEN, int C) {
58 if (sid <= 0 || sid > M || (dir != 0 && dir != 1) || pos < 0 || pos + LEN > seqs[sid].getTotLen() || LEN > (int)readseq.length()) return false;
59 const std::string& seq = seqs[sid].getSeq(dir);
60 return countMismatch(seq, pos, readseq, LEN, C) <= C;
63 // get segment from refs
64 std::string getSegment(int sid, int dir, int pos, int LEN) {
65 if (pos < 0 || pos + LEN > seqs[sid].getTotLen()) return "fail";
67 const std::string& seq = seqs[sid].getSeq(dir);
70 for (int i = 0; i < LEN; i++)
71 seg.append(1, seq[pos + i]);
77 int M; // # of isoforms, id starts from 1
78 std::vector<RefSeq> seqs; // reference sequences, starts from 1; 0 is for noise gene
79 bool has_polyA; // if at least one sequence has polyA added, the value is true; otherwise, the value is false
82 //inpF in fasta format
83 void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) {
84 //read standard fasta format here
86 std::string tag, line, rawseq;
89 seqs.push_back(RefSeq()); // noise isoform
95 if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
97 while ((fin) && (line[0] == '>')) {
100 while((getline(fin, line)) && (line[0] != '>')) {
103 if (rawseq.size() <= 0) {
104 printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str());
108 seqs.push_back(RefSeq(tag, policy.convert(rawseq), rules.getLenAt(tag)));
109 has_polyA = has_polyA || seqs[M].getFullLen() < seqs[M].getTotLen();
113 if (verbose) { printf("Refs.makeRefs finished!\n"); }
116 //inpF in fasta format, with sequence all in one line together
117 //option 0 read all, 1 do not read sequences
118 void Refs::loadRefs(char *inpF, int option) {
123 if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
125 seqs.push_back(RefSeq());
132 success = seq.read(fin, option);
136 has_polyA = has_polyA || seq.getFullLen() < seq.getTotLen();
142 assert(M + 1 == (int)seqs.size());
144 if (verbose) { printf("Refs.loadRefs finished!\n"); }
147 void Refs::saveRefs(char* outF) {
151 for (int i = 1; i <= M; i++) {
156 if (verbose) { printf("Refs.saveRefs finished!\n"); }