]> git.donarmstrong.com Git - mothur.git/blob - chimera.cpp
changed random forest output filename
[mothur.git] / chimera.cpp
1 /*
2  *  chimera.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 8/11/09.
6  *  Copyright 2009 Schloss Lab Umass Amherst. All rights reserved.
7  *
8  */
9
10 #include "chimera.h"
11 #include "referencedb.h"
12
13 //***************************************************************************************************************
14 //this is a vertical soft filter
15 string Chimera::createFilter(vector<Sequence*> seqs, float t) {
16         try {
17                 filterString = "";
18                 int threshold = int (t * seqs.size());
19 //cout << "threshhold = " << threshold << endl;
20                 
21                 vector<int> gaps;       gaps.resize(seqs[0]->getAligned().length(), 0);
22                 vector<int> a;          a.resize(seqs[0]->getAligned().length(), 0);
23                 vector<int> t;          t.resize(seqs[0]->getAligned().length(), 0);
24                 vector<int> g;          g.resize(seqs[0]->getAligned().length(), 0);
25                 vector<int> c;          c.resize(seqs[0]->getAligned().length(), 0);
26         
27                 filterString = (string(seqs[0]->getAligned().length(), '1'));
28                 
29                 //for each sequence
30                 for (int i = 0; i < seqs.size(); i++) {
31                 
32                         if (m->control_pressed) { return filterString; }
33                 
34                         string seqAligned = seqs[i]->getAligned();
35                         
36                         if (seqAligned.length() != filterString.length()) {  m->mothurOut(seqs[i]->getName() + " is not the same length as the template sequences. Aborting!\n");  exit(1); }
37                 
38                         for (int j = 0; j < seqAligned.length(); j++) {
39                                 //if this spot is a gap
40                                 if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))   {       gaps[j]++;      }
41                                 else if (toupper(seqAligned[j]) == 'A')                                 {       a[j]++;         }
42                                 else if (toupper(seqAligned[j]) == 'T')                                 {       t[j]++;         }
43                                 else if (toupper(seqAligned[j]) == 'G')                                 {       g[j]++;         }
44                                 else if (toupper(seqAligned[j]) == 'C')                                 {       c[j]++;         }
45                         }
46                 }
47                 
48                 //zero out spot where all sequences have blanks
49                 int numColRemoved = 0;
50                 for(int i = 0;i < seqs[0]->getAligned().length(); i++){
51                 
52                         if (m->control_pressed) { return filterString; }
53                         
54                         if(gaps[i] == seqs.size())      {       filterString[i] = '0';  numColRemoved++;  }
55                         
56                         else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) {      filterString[i] = '0';  numColRemoved++;  }
57                         //cout << "a = " << a[i] <<  " t = " << t[i] <<  " g = " << g[i] <<  " c = " << c[i] << endl;
58                 }
59
60                 if (threshold != 0.0) {  m->mothurOut("Filter removed " + toString(numColRemoved) + " columns.");  m->mothurOutEndLine();  }
61                 
62                 return filterString;
63         }
64         catch(exception& e) {
65                 m->errorOut(e, "Chimera", "createFilter");
66                 exit(1);
67         }
68 }
69 //***************************************************************************************************************
70 map<int, int> Chimera::runFilter(Sequence* seq) {
71         try {
72                 map<int, int> maskMap;
73                 string seqAligned = seq->getAligned();
74                 string newAligned = "";
75                 int count = 0;
76                         
77                 for (int j = 0; j < seqAligned.length(); j++) {
78                         //if this spot is a gap
79                         if (filterString[j] == '1') { 
80                                 newAligned += seqAligned[j]; 
81                                 maskMap[count] = j;
82                                 count++;
83                         }
84                 }
85                         
86                 seq->setAligned(newAligned);
87                 
88                 return maskMap;
89         }
90         catch(exception& e) {
91                 m->errorOut(e, "Chimera", "runFilter");
92                 exit(1);
93         }
94 }
95 //***************************************************************************************************************
96 vector<Sequence*> Chimera::readSeqs(string file) {
97         try {
98                 
99                 vector<Sequence*> container;
100                 int count = 0;
101                 length = 0;
102                 unaligned = false;
103                 ReferenceDB* rdb = ReferenceDB::getInstance();
104                 
105                 if (file == "saved") {
106                         
107                         
108                         m->mothurOutEndLine();  m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory.");        m->mothurOutEndLine();
109                         
110                         for (int i = 0; i < rdb->referenceSeqs.size(); i++) {
111                                 Sequence* temp = new Sequence(rdb->referenceSeqs[i].getName(), rdb->referenceSeqs[i].getAligned());
112                                 
113                                 if (count == 0) {  length = temp->getAligned().length();  count++;  } //gets first seqs length
114                                 else if (length != temp->getAligned().length()) {       unaligned = true;       }
115                                 
116                                 if (temp->getName() != "") {  container.push_back(temp);  }
117                         }
118                         
119                         templateFileName = rdb->getSavedReference();
120                         
121                 }else {
122                         
123                         m->mothurOut("Reading sequences from " + file + "..."); cout.flush();
124                         
125                         #ifdef USE_MPI
126                                 int pid, processors;
127                                 vector<unsigned long long> positions;
128                                 int numSeqs;
129                                 int tag = 2001;
130                                 MPI_File inMPI;
131                                 MPI_Status status; 
132                         
133                                 if (byGroup) {
134                                         char inFileName[1024];
135                                         strcpy(inFileName, file.c_str());
136                                         
137                                         MPI_File_open(MPI_COMM_SELF, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
138                                         
139                                         positions = m->setFilePosFasta(file, numSeqs); //fills MPIPos, returns numSeqs
140                                         
141                                         //read file 
142                                         for(int i=0;i<numSeqs;i++){
143                                                 
144                                                 if (m->control_pressed) { MPI_File_close(&inMPI); return container; }
145                                                 
146                                                 //read next sequence
147                                                 int seqlength = positions[i+1] - positions[i];
148                                                 char* buf4 = new char[seqlength];
149                                                 
150                                                 MPI_File_read_at(inMPI, positions[i], buf4, seqlength, MPI_CHAR, &status);
151                                                 
152                                                 string tempBuf = buf4;
153                                                 if (tempBuf.length() > seqlength) { tempBuf = tempBuf.substr(0, seqlength); }
154                                                 delete buf4;
155                                                 
156                                                 istringstream iss (tempBuf,istringstream::in);
157                                                 
158                                                 Sequence* current = new Sequence(iss);   
159                                                 if (current->getName() != "") {
160                                                         if (count == 0) {  length = current->getAligned().length();  count++;  } //gets first seqs length
161                                                         else if (length != current->getAligned().length()) {    unaligned = true;       }
162                                                         
163                                                         container.push_back(current);  
164                                                         if (rdb->save) { rdb->referenceSeqs.push_back(*current); }
165                                                 }
166                                         }
167                                         
168                                         MPI_File_close(&inMPI);
169                                         
170                                 }else {                                 
171                                         
172                                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
173                                         MPI_Comm_size(MPI_COMM_WORLD, &processors);
174                                         
175                                         //char* inFileName = new char[file.length()];
176                                         //memcpy(inFileName, file.c_str(), file.length());
177                                         
178                                         char inFileName[1024];
179                                         strcpy(inFileName, file.c_str());
180                                         
181                                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
182                                         //delete inFileName;
183                                         
184                                         if (pid == 0) {
185                                                 positions = m->setFilePosFasta(file, numSeqs); //fills MPIPos, returns numSeqs
186                                                 
187                                                 //send file positions to all processes
188                                                 for(int i = 1; i < processors; i++) { 
189                                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
190                                                         MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
191                                                 }
192                                         }else{
193                                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
194                                                 positions.resize(numSeqs+1);
195                                                 MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
196                                         }
197                                         
198                                         //read file 
199                                         for(int i=0;i<numSeqs;i++){
200                                                 
201                                                 if (m->control_pressed) { MPI_File_close(&inMPI); return container; }
202                                                 
203                                                 //read next sequence
204                                                 int seqlength = positions[i+1] - positions[i];
205                                                 char* buf4 = new char[seqlength];
206                                                 
207                                                 MPI_File_read_at(inMPI, positions[i], buf4, seqlength, MPI_CHAR, &status);
208                                                 
209                                                 string tempBuf = buf4;
210                                                 if (tempBuf.length() > seqlength) { tempBuf = tempBuf.substr(0, seqlength); }
211                                                 delete buf4;
212                                                 
213                                                 istringstream iss (tempBuf,istringstream::in);
214                                                 
215                                                 Sequence* current = new Sequence(iss);   
216                                                 if (current->getName() != "") {
217                                                         if (count == 0) {  length = current->getAligned().length();  count++;  } //gets first seqs length
218                                                         else if (length != current->getAligned().length()) {    unaligned = true;       }
219                                                         
220                                                         container.push_back(current);  
221                                                         if (rdb->save) { rdb->referenceSeqs.push_back(*current); }
222                                                 }
223                                         }
224                                         
225                                         MPI_File_close(&inMPI);
226                                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
227                                 }
228                 #else
229
230                         ifstream in;
231                         m->openInputFile(file, in);
232                         
233                         //read in seqs and store in vector
234                         while(!in.eof()){
235                                 
236                                 if (m->control_pressed) { return container; }
237                                 
238                                 Sequence* current = new Sequence(in);  m->gobble(in);
239                                 
240                                 if (count == 0) {  length = current->getAligned().length();  count++;  } //gets first seqs length
241                                 else if (length != current->getAligned().length()) {   unaligned = true;        }
242                                                         
243                                 if (current->getName() != "") {  
244                                         container.push_back(current);  
245                                         if (rdb->save) { rdb->referenceSeqs.push_back(*current); }
246                                 }
247                         }
248                         in.close();
249                 #endif
250                 
251                         m->mothurOut("Done."); m->mothurOutEndLine();
252                         
253                         filterString = (string(container[0]->getAligned().length(), '1'));
254                 }
255                 
256                 return container;
257         }
258         catch(exception& e) {
259                 m->errorOut(e, "Chimera", "readSeqs");
260                 exit(1);
261         }
262 }
263 //***************************************************************************************************************
264 void Chimera::setMask(string filename) {
265         try {
266                 
267                 if (filename == "default") {
268                         //default is from wigeon  236627 EU009184.1 Shigella dysenteriae str. FBD013
269                         seqMask = ".....................................................................................................AAATTGAAGAGTTT-GA--T-CA-T-G-GCTC-AG-AT-TGAA-C-GC--TGG-C--G-GC-A-GG--C----C-T--AACACA-T-GC-A-AGT-CGA-A-CG----------G-TAA-CA-G----------------------------GAAG-A-AG----------------------------------------------------CTT-G----------------------------------------------------------------------------------CT-TCTTT----------------G-CT--G--AC--G--AG-T-GG-C-GG-A--C-------------GGG-TGAGT-A--AT-GT-C-T-G-GG---A-A--A-CT-G--C-C-TGA--TG-G------------------------------------------------------------------A-GG----GGG-AT-AA-CTA-------------------------C-T-G-----------------------GAA-A---CGG-TAG-CTAA-TA---CC-G--C-AT-A----------A--------------------C-------------------------------------GT-C-----------------------------------------------------------------------------------------------------------------------G-CA-A--------------------------------------------------------------------------------------------------------------------------------------G-A-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CAAA--G-A-G-GG-----G--GA-C-CT--------------------------------------------------------------------------------------------------------------------TCG-G----------------------------------------------------------------------------------------------------------------------G----CC-TC--T---T-G--------------C----C-A---T-CG-G---AT---G-T-----G-CCC-AGA--T-GGG--A------TT--A--G-CT-A----G---TAGG-T-G-GG-G-T----AAC-GG-C-T-C-ACCT--A-GG-C-G--A-CG-A------------TCC-C-T------AG-CT-G-G-TCT-G-AG----A--GG-AT--G-AC-C-AG-CCAC-A-CTGGA--A-C-TG-A-GA-C-AC-G-G-TCCAGA-CTCC-TAC-G--G-G-A-G-GC-A-GC-A-G-TG---GG-G-A-ATA-TTGCA-C-AA-T-GG--GC-GC-A----A-G-CC-T-GA-TG-CA-GCCA-TGCC-G-CG-T---G-T-A--T--GA-A-G--A--A-G-G-CC-----TT-CG---------G-G-T-T-G-T--A---AA-G-TAC--------TT-TC-A-G--C-GGG----GA-G--G---AA-GGGA---GTAA-AG----T--T--AA-T---A----C-----CT-T-TGC-TCA-TT-GA-CG-TT-A-C-CC-G-CA-G---------AA-----------GAAGC-ACC-GG-C-TAA---C--T-CCGT--GCCA--G-C---A--GCCG---C-GG--TA-AT--AC---GG-AG-GGT-GCA-A-G-CG-TTAA-T-CGG-AA-TT-A--C-T--GGGC-GTA----AA-GCGC-AC--G-CA-G-G-C-G------------G--T-TT-G-T-T-AA----G-T-C-A---G-ATG-TG-A-AA-TC--CC-CGG-G--------------------------------------------------------------------CT-C-AA-------------------------------------------------------------------------CC-T-G-GG-AA-C----T-G-C-A-T-C--------T--GA-T-A-C-T-G-GCA--A-G-C---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------T-T-G-A-G-T-C-----T-CG--TA-G-A------------G-GG-G-GG-T----AG--AATT-CCA-G-GT--GT-A-GCG-GTGAAA-TG-CGT-AGAG-A-TC-T-GGA--GG-A-AT-A-CC-GG--T--G--GC-GAA-G--G-C---G----G--C-C-CCCTG------G-AC-GA--------------------------------------------------------------AG-A-C-T--GA--CG-----CT-CA-GG--T-G-CGA--AA-G-C--------------G-TGGG-GAG-C-A-AACA--GG-ATTA-G-ATA-C-----CC-T-G-GTA-G-T----C-CA--C-G-CCG-T-AAA--C-GATG-TC--GA-CT---------T-GG--A--G-G-TT-G-TG-C--C--------------------------------------------------------------------------------------CTT-GA--------------------------------------------------------------------------------------------------------------------------------------------------G-G-C-GT--G-G-C-T-TC-C------GG--A----GC-TAA--CG-C-G-T--T--AA-GT--C----G-ACC-GCC-T-G-GG-GAG-TA---CGG-----C-C--G-C-A-A-GGT-T--AAA-ACTC-AAA---------TGAA-TTG-ACGGG-G-G-CCCG----C-A--C-A-A-GCG-GT-G--G--AG-CA-T--GT-GGT-TT-AATT-C-G-ATG-CAAC-G-CG-A-AG-A-A-CC-TT-A-CC-TGGTC-TT-G-AC-A-T-C--------------CAC-G-G-------------A-AG-T-T-T--TC--A-GA-G-A-T--G-A-G--A-A-T-G--T-G-----CC-------------------------------------T--TC-G------------------------------------------GG----A----A---CC-GTG---A--GA---------------------------------------------------C-A-G-G-T-GCTG-CA-TGG-CT--GTC-GTC-A-GC-TC---G-TG-TT-G--TGA-AA-TGT-T-GG-G-TT-AA-GT-CCCGC-AA--------C-GAG-CGC-A-ACC-C-T-TA--TC--C-TTTG--T-T-G-C-C---AG-C-G-----G-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------TCC------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GG---C----C-G------------G----G---A-A--CT---------------C-A-A-A-G-GA-G--AC-T-G-CCA--G-T------------------------------------G-A---TAA----------------------------------A-C-T-G--G-A-GG-A--AGG-T--GGGG-A-TGAC-GTC--AAGT-C---ATC-A-T-G-G-C-C-CTT----AC-G--AC-C-A-GG-GC-TA-CAC-ACGTG-C--TA--CAATG---G-CGCA-T-A--C-AAA-GA-GA--------------------------------------------------------------------------------------------------A-G-C-G-A--C-CTCG-C--G---------------------------------------A-GA-G-C-----------A--A-G-CG---G----------A--CCT-C------A-T-AAAGT-GC-G-T-C-G-TAG-TCC--------GGA-T-TGGAG-TC--T-GCAA-CT-C-------------------------------------------------------------------------------------------------G-ACTCC-A-T-G-AA-G-TC-GGAAT-CG-C-TA--G-TA-AT-C-G-T----GGA-TC-A-G--A------AT--GCC-AC-G-GT-G-AAT-ACGT-T-CCCGGGCCT-TGTA----CACACCG-CCC-GTC-----A---CA--CCA-TG-GG-A--G---TGG-G-TT-GC-AAA--A-GAA------G--T-AGG-TA-G-C-T-T-AA-C-C--------------------------------------------------------------TT----C-------------------------------------------------------------------------------------------------G--GG-A--GG-G--C---GC-TTA--CC--ACT-T----T-GTG-AT-TCA------------------------TG--ACT-GGGG-TG-AAG-TCGTAACAA-GGTAA-CCGT-AGGGGAA-CCTG-CGGT-TGGATCACCTCCTTA................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................";
270                 }else if (filename == "") {  //do nothing 
271                         seqMask = "";
272                 }else{
273                 
274         #ifdef USE_MPI  
275                         MPI_File inMPI;
276                         MPI_Offset size;
277                         MPI_Status status;
278                         
279                         //char* inFileName = new char[filename.length()];
280                         //memcpy(inFileName, filename.c_str(), filename.length());
281                         
282                         char inFileName[1024];
283                         strcpy(inFileName, filename.c_str());
284         
285                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
286                         MPI_File_get_size(inMPI, &size);
287
288                         //delete inFileName;
289                         
290                         char* buffer = new char[size];
291                         MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
292                         
293                         string tempBuf = buffer;
294                         if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
295                         istringstream iss (tempBuf,istringstream::in);
296
297                         delete buffer;
298                         
299                         if (!iss.eof()) {
300                                 Sequence temp(iss);
301                                 seqMask = temp.getAligned();
302                         }else {
303                                 m->mothurOut("Problem with mask."); m->mothurOutEndLine(); 
304                                 seqMask = "";
305                         }
306                         
307                         MPI_File_close(&inMPI);
308         #else
309         
310                         ifstream infile;
311                         m->openInputFile(filename, infile);
312                         
313                         if (!infile.eof()) {
314                                 Sequence temp(infile);
315                                 seqMask = temp.getAligned();
316                         }else {
317                                 m->mothurOut("Problem with mask."); m->mothurOutEndLine(); 
318                                 seqMask = "";
319                         }
320                         infile.close();
321         #endif
322         
323                 }
324         }
325         catch(exception& e) {
326                 m->errorOut(e, "Chimera", "setMask");
327                 exit(1);
328         }
329 }
330 //***************************************************************************************************************
331 Sequence* Chimera::getSequence(string name) {
332         try{
333                 Sequence* temp;
334                 
335                 //look through templateSeqs til you find it
336                 int spot = -1;
337                 for (int i = 0; i < templateSeqs.size(); i++) {
338                         if (name == templateSeqs[i]->getName()) {  
339                                 spot = i;
340                                 break;
341                         }
342                 }
343                 
344                 if(spot == -1) { m->mothurOut("Error: Could not find sequence."); m->mothurOutEndLine(); return NULL; }
345                 
346                 temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned());
347                 
348                 return temp;
349         }
350         catch(exception& e) {
351                 m->errorOut(e, "Chimera", "getSequence");
352                 exit(1);
353         }
354 }
355 //***************************************************************************************************************
356
357
358
359