archiveVersion = 1;
classes = {
};
- objectVersion = 45;
+ objectVersion = 46;
objects = {
/* Begin PBXBuildFile section */
A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; };
A754149714840CF7005850D1 /* summaryqualcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A754149614840CF7005850D1 /* summaryqualcommand.cpp */; };
A75790591301749D00A30DAB /* homovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75790581301749D00A30DAB /* homovacommand.cpp */; };
+ A76CDD821510F143004C8458 /* prcseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A76CDD811510F143004C8458 /* prcseqscommand.cpp */; };
A7730EFF13967241007433A3 /* countseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7730EFE13967241007433A3 /* countseqscommand.cpp */; };
A774101414695AF60098E6AC /* shhhseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A774101314695AF60098E6AC /* shhhseqscommand.cpp */; };
A774104814696F320098E6AC /* myseqdist.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A774104614696F320098E6AC /* myseqdist.cpp */; };
A77410F614697C300098E6AC /* seqnoise.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77410F414697C300098E6AC /* seqnoise.cpp */; };
A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */; };
A77A221F139001B600B0BE70 /* deuniquetreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */; };
+ A77EBD2F1523709100ED407C /* createdatabasecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A77EBD2E1523709100ED407C /* createdatabasecommand.cpp */; };
+ A7876A26152A017C00A0AE86 /* subsample.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7876A25152A017C00A0AE86 /* subsample.cpp */; };
A79234D713C74BF6002B08E2 /* mothurfisher.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A79234D613C74BF6002B08E2 /* mothurfisher.cpp */; };
A795840D13F13CD900F201D5 /* countgroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A795840C13F13CD900F201D5 /* countgroupscommand.cpp */; };
A799F5B91309A3E000AEEFA0 /* makefastqcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */; };
+ A7A32DAA14DC43B00001D2E5 /* sortseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A32DA914DC43B00001D2E5 /* sortseqscommand.cpp */; };
A7A3C8C914D041AD00B1BFBE /* otuassociationcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A3C8C714D041AD00B1BFBE /* otuassociationcommand.cpp */; };
A7A61F2D130062E000E05B6B /* amovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A61F2C130062E000E05B6B /* amovacommand.cpp */; };
A7BF221414587886000AD524 /* myPerseus.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF221214587886000AD524 /* myPerseus.cpp */; };
A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */; };
+ A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */; };
+ A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */; };
A7E9B88112D37EC400DA6239 /* ace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B64F12D37EC300DA6239 /* ace.cpp */; };
A7E9B88212D37EC400DA6239 /* aligncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65112D37EC300DA6239 /* aligncommand.cpp */; };
A7E9B88312D37EC400DA6239 /* alignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B65312D37EC300DA6239 /* alignment.cpp */; };
A7E9B8C412D37EC400DA6239 /* fastamap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6DE12D37EC400DA6239 /* fastamap.cpp */; };
A7E9B8C512D37EC400DA6239 /* fileoutput.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6E012D37EC400DA6239 /* fileoutput.cpp */; };
A7E9B8C612D37EC400DA6239 /* filterseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6E312D37EC400DA6239 /* filterseqscommand.cpp */; };
- A7E9B8C712D37EC400DA6239 /* fisher2.c in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6E512D37EC400DA6239 /* fisher2.c */; };
A7E9B8C812D37EC400DA6239 /* flowdata.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6E712D37EC400DA6239 /* flowdata.cpp */; };
A7E9B8C912D37EC400DA6239 /* formatcolumn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6E912D37EC400DA6239 /* formatcolumn.cpp */; };
A7E9B8CA12D37EC400DA6239 /* formatphylip.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B6EC12D37EC400DA6239 /* formatphylip.cpp */; };
A7E9B8FB12D37EC400DA6239 /* memeuclidean.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B74F12D37EC400DA6239 /* memeuclidean.cpp */; };
A7E9B8FC12D37EC400DA6239 /* mempearson.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75112D37EC400DA6239 /* mempearson.cpp */; };
A7E9B8FD12D37EC400DA6239 /* mergefilecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75312D37EC400DA6239 /* mergefilecommand.cpp */; };
- A7E9B8FE12D37EC400DA6239 /* metastats2.c in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75612D37EC400DA6239 /* metastats2.c */; };
A7E9B8FF12D37EC400DA6239 /* metastatscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75712D37EC400DA6239 /* metastatscommand.cpp */; };
A7E9B90012D37EC400DA6239 /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75912D37EC400DA6239 /* mgclustercommand.cpp */; };
A7E9B90112D37EC400DA6239 /* mothur.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B75B12D37EC400DA6239 /* mothur.cpp */; };
A7E9B98D12D37EC400DA6239 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87C12D37EC400DA6239 /* weighted.cpp */; };
A7E9B98E12D37EC400DA6239 /* weightedlinkage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */; };
A7E9B98F12D37EC400DA6239 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E9B87F12D37EC400DA6239 /* whittaker.cpp */; };
+ A7EEB0F514F29BFE00344B83 /* classifytreecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7EEB0F414F29BFD00344B83 /* classifytreecommand.cpp */; };
A7F9F5CF141A5E500032F693 /* sequenceparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */; };
A7FA10021302E097003860FE /* mantelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7FA10011302E096003860FE /* mantelcommand.cpp */; };
A7FA2AC714A0E881007C09A6 /* bsplvb.f in Sources */ = {isa = PBXBuildFile; fileRef = A7FA2ABC14A0E881007C09A6 /* bsplvb.f */; };
A754149614840CF7005850D1 /* summaryqualcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = summaryqualcommand.cpp; sourceTree = "<group>"; };
A75790571301749D00A30DAB /* homovacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = homovacommand.h; sourceTree = "<group>"; };
A75790581301749D00A30DAB /* homovacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = homovacommand.cpp; sourceTree = "<group>"; };
+ A76CDD7F1510F09A004C8458 /* pcrseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pcrseqscommand.h; sourceTree = "<group>"; };
+ A76CDD811510F143004C8458 /* prcseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = prcseqscommand.cpp; sourceTree = "<group>"; };
A7730EFD13967241007433A3 /* countseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = countseqscommand.h; sourceTree = "<group>"; };
A7730EFE13967241007433A3 /* countseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = countseqscommand.cpp; sourceTree = "<group>"; };
A774101214695AF60098E6AC /* shhhseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = shhhseqscommand.h; sourceTree = "<group>"; };
A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getcommandinfocommand.cpp; sourceTree = "<group>"; };
A77A221D139001B600B0BE70 /* deuniquetreecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = deuniquetreecommand.h; sourceTree = "<group>"; };
A77A221E139001B600B0BE70 /* deuniquetreecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deuniquetreecommand.cpp; sourceTree = "<group>"; };
+ A77EBD2C1523707F00ED407C /* createdatabasecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = createdatabasecommand.h; sourceTree = "<group>"; };
+ A77EBD2E1523709100ED407C /* createdatabasecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = createdatabasecommand.cpp; sourceTree = "<group>"; };
+ A7876A25152A017C00A0AE86 /* subsample.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = subsample.cpp; sourceTree = "<group>"; };
+ A7876A28152A018B00A0AE86 /* subsample.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = subsample.h; sourceTree = "<group>"; };
A79234D513C74BF6002B08E2 /* mothurfisher.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mothurfisher.h; sourceTree = "<group>"; };
A79234D613C74BF6002B08E2 /* mothurfisher.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mothurfisher.cpp; sourceTree = "<group>"; };
A795840B13F13CD900F201D5 /* countgroupscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = countgroupscommand.h; sourceTree = "<group>"; };
A795840C13F13CD900F201D5 /* countgroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = countgroupscommand.cpp; sourceTree = "<group>"; };
A799F5B71309A3E000AEEFA0 /* makefastqcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makefastqcommand.h; sourceTree = "<group>"; };
A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makefastqcommand.cpp; sourceTree = "<group>"; };
+ A7A32DA914DC43B00001D2E5 /* sortseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sortseqscommand.cpp; sourceTree = "<group>"; };
+ A7A32DAC14DC43D10001D2E5 /* sortseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sortseqscommand.h; sourceTree = "<group>"; };
A7A3C8C714D041AD00B1BFBE /* otuassociationcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = otuassociationcommand.cpp; sourceTree = "<group>"; };
A7A3C8C814D041AD00B1BFBE /* otuassociationcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = otuassociationcommand.h; sourceTree = "<group>"; };
A7A61F1A130035C800E05B6B /* LICENSE */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = LICENSE; sourceTree = "<group>"; };
A7BF221314587886000AD524 /* myPerseus.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = myPerseus.h; sourceTree = "<group>"; };
A7BF2230145879B2000AD524 /* chimeraperseuscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraperseuscommand.h; sourceTree = "<group>"; };
A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraperseuscommand.cpp; sourceTree = "<group>"; };
+ A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = cooccurrencecommand.cpp; sourceTree = "<group>"; };
+ A7C3DC0A14FE457500FE1924 /* cooccurrencecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = cooccurrencecommand.h; sourceTree = "<group>"; };
+ A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = trialSwap2.cpp; sourceTree = "<group>"; };
+ A7C3DC0E14FE469500FE1924 /* trialswap2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = trialswap2.h; sourceTree = "<group>"; };
A7DAAFA3133A254E003956EB /* commandparameter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = commandparameter.h; sourceTree = "<group>"; };
A7E9B64F12D37EC300DA6239 /* ace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ace.cpp; sourceTree = "<group>"; };
A7E9B65012D37EC300DA6239 /* ace.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ace.h; sourceTree = "<group>"; };
A7E9B6E212D37EC400DA6239 /* filters.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filters.h; sourceTree = "<group>"; };
A7E9B6E312D37EC400DA6239 /* filterseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = filterseqscommand.cpp; sourceTree = "<group>"; };
A7E9B6E412D37EC400DA6239 /* filterseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filterseqscommand.h; sourceTree = "<group>"; };
- A7E9B6E512D37EC400DA6239 /* fisher2.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fisher2.c; sourceTree = "<group>"; };
- A7E9B6E612D37EC400DA6239 /* fisher2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fisher2.h; sourceTree = "<group>"; };
A7E9B6E712D37EC400DA6239 /* flowdata.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = flowdata.cpp; sourceTree = "<group>"; };
A7E9B6E812D37EC400DA6239 /* flowdata.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = flowdata.h; sourceTree = "<group>"; };
A7E9B6E912D37EC400DA6239 /* formatcolumn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatcolumn.cpp; sourceTree = "<group>"; };
A7E9B75212D37EC400DA6239 /* mempearson.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mempearson.h; sourceTree = "<group>"; };
A7E9B75312D37EC400DA6239 /* mergefilecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergefilecommand.cpp; sourceTree = "<group>"; };
A7E9B75412D37EC400DA6239 /* mergefilecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mergefilecommand.h; sourceTree = "<group>"; };
- A7E9B75512D37EC400DA6239 /* metastats.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = metastats.h; sourceTree = "<group>"; };
- A7E9B75612D37EC400DA6239 /* metastats2.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = metastats2.c; sourceTree = "<group>"; };
A7E9B75712D37EC400DA6239 /* metastatscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = metastatscommand.cpp; sourceTree = "<group>"; };
A7E9B75812D37EC400DA6239 /* metastatscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = metastatscommand.h; sourceTree = "<group>"; };
A7E9B75912D37EC400DA6239 /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = "<group>"; };
A7E9B87E12D37EC400DA6239 /* weightedlinkage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = weightedlinkage.cpp; sourceTree = SOURCE_ROOT; };
A7E9B87F12D37EC400DA6239 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = "<group>"; };
A7E9B88012D37EC400DA6239 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = "<group>"; };
+ A7EEB0F414F29BFD00344B83 /* classifytreecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifytreecommand.cpp; sourceTree = "<group>"; };
+ A7EEB0F714F29C1B00344B83 /* classifytreecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classifytreecommand.h; sourceTree = "<group>"; };
A7F9F5CD141A5E500032F693 /* sequenceparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequenceparser.h; sourceTree = "<group>"; };
A7F9F5CE141A5E500032F693 /* sequenceparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequenceparser.cpp; sourceTree = "<group>"; };
A7FA10001302E096003860FE /* mantelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mantelcommand.h; sourceTree = "<group>"; };
A7E9B82D12D37EC400DA6239 /* singlelinkage.cpp */,
A7E9B83012D37EC400DA6239 /* slibshuff.cpp */,
A7E9B83112D37EC400DA6239 /* slibshuff.h */,
+ A7876A28152A018B00A0AE86 /* subsample.h */,
+ A7876A25152A017C00A0AE86 /* subsample.cpp */,
+ A7C3DC0E14FE469500FE1924 /* trialswap2.h */,
+ A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */,
A7FF19F0140FFDA500AD216D /* trimoligos.h */,
A7FF19F1140FFDA500AD216D /* trimoligos.cpp */,
A7E9B87412D37EC400DA6239 /* validcalculator.cpp */,
A7E9B69012D37EC400DA6239 /* classifyotucommand.cpp */,
A7E9B69312D37EC400DA6239 /* classifyseqscommand.h */,
A7E9B69212D37EC400DA6239 /* classifyseqscommand.cpp */,
+ A7EEB0F714F29C1B00344B83 /* classifytreecommand.h */,
+ A7EEB0F414F29BFD00344B83 /* classifytreecommand.cpp */,
A7E9B69712D37EC400DA6239 /* clearcutcommand.h */,
A7E9B69612D37EC400DA6239 /* clearcutcommand.cpp */,
A73DDBB813C4A0D1006AAE38 /* clearmemorycommand.h */,
A7E9B6B512D37EC400DA6239 /* consensuscommand.cpp */,
A7E9B6B812D37EC400DA6239 /* consensusseqscommand.h */,
A7E9B6B712D37EC400DA6239 /* consensusseqscommand.cpp */,
+ A7C3DC0A14FE457500FE1924 /* cooccurrencecommand.h */,
+ A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */,
A7E9B6BA12D37EC400DA6239 /* corraxescommand.h */,
A7E9B6B912D37EC400DA6239 /* corraxescommand.cpp */,
A795840B13F13CD900F201D5 /* countgroupscommand.h */,
A795840C13F13CD900F201D5 /* countgroupscommand.cpp */,
A7730EFD13967241007433A3 /* countseqscommand.h */,
A7730EFE13967241007433A3 /* countseqscommand.cpp */,
+ A77EBD2C1523707F00ED407C /* createdatabasecommand.h */,
+ A77EBD2E1523709100ED407C /* createdatabasecommand.cpp */,
A7E9B6C412D37EC400DA6239 /* deconvolutecommand.h */,
A7E9B6C312D37EC400DA6239 /* deconvolutecommand.cpp */,
A7E9B6C612D37EC400DA6239 /* degapseqscommand.h */,
A7FC486612D795D60055BC5C /* pcacommand.cpp */,
A7E9B78812D37EC400DA6239 /* pcoacommand.h */,
A7E9B78712D37EC400DA6239 /* pcoacommand.cpp */,
+ A76CDD7F1510F09A004C8458 /* pcrseqscommand.h */,
+ A76CDD811510F143004C8458 /* prcseqscommand.cpp */,
A7E9B78C12D37EC400DA6239 /* phylodiversitycommand.h */,
A7E9B78B12D37EC400DA6239 /* phylodiversitycommand.cpp */,
A7E9B79212D37EC400DA6239 /* phylotypecommand.h */,
A7E9B82712D37EC400DA6239 /* shhhercommand.cpp */,
A774101214695AF60098E6AC /* shhhseqscommand.h */,
A774101314695AF60098E6AC /* shhhseqscommand.cpp */,
+ A7A32DAC14DC43D10001D2E5 /* sortseqscommand.h */,
+ A7A32DA914DC43B00001D2E5 /* sortseqscommand.cpp */,
A7E9B84012D37EC400DA6239 /* splitabundcommand.h */,
A7E9B83F12D37EC400DA6239 /* splitabundcommand.cpp */,
A7E9B84212D37EC400DA6239 /* splitgroupscommand.h */,
isa = PBXGroup;
children = (
A7D161E7149F7F50000523E8 /* fortran */,
- A7E9B6E512D37EC400DA6239 /* fisher2.c */,
- A7E9B6E612D37EC400DA6239 /* fisher2.h */,
- A7E9B75512D37EC400DA6239 /* metastats.h */,
- A7E9B75612D37EC400DA6239 /* metastats2.c */,
A79234D513C74BF6002B08E2 /* mothurfisher.h */,
A79234D613C74BF6002B08E2 /* mothurfisher.cpp */,
A73DDC3613C4BF64006AAE38 /* mothurmetastats.h */,
08FB7793FE84155DC02AAC07 /* Project object */ = {
isa = PBXProject;
attributes = {
+ LastUpgradeCheck = 0420;
ORGANIZATIONNAME = "Schloss Lab";
};
buildConfigurationList = 1DEB928908733DD80010E9CD /* Build configuration list for PBXProject "Mothur" */;
- compatibilityVersion = "Xcode 3.1";
+ compatibilityVersion = "Xcode 3.2";
developmentRegion = English;
hasScannedForEncodings = 1;
knownRegions = (
A7E9B8C412D37EC400DA6239 /* fastamap.cpp in Sources */,
A7E9B8C512D37EC400DA6239 /* fileoutput.cpp in Sources */,
A7E9B8C612D37EC400DA6239 /* filterseqscommand.cpp in Sources */,
- A7E9B8C712D37EC400DA6239 /* fisher2.c in Sources */,
A7E9B8C812D37EC400DA6239 /* flowdata.cpp in Sources */,
A7E9B8C912D37EC400DA6239 /* formatcolumn.cpp in Sources */,
A7E9B8CA12D37EC400DA6239 /* formatphylip.cpp in Sources */,
A7E9B8FB12D37EC400DA6239 /* memeuclidean.cpp in Sources */,
A7E9B8FC12D37EC400DA6239 /* mempearson.cpp in Sources */,
A7E9B8FD12D37EC400DA6239 /* mergefilecommand.cpp in Sources */,
- A7E9B8FE12D37EC400DA6239 /* metastats2.c in Sources */,
A7E9B8FF12D37EC400DA6239 /* metastatscommand.cpp in Sources */,
A7E9B90012D37EC400DA6239 /* mgclustercommand.cpp in Sources */,
A7E9B90112D37EC400DA6239 /* mothur.cpp in Sources */,
A7FA2B1614A0EBEA007C09A6 /* sslvrg.f in Sources */,
A7FA2B5B14A0F0C2007C09A6 /* intrv.f in Sources */,
A7A3C8C914D041AD00B1BFBE /* otuassociationcommand.cpp in Sources */,
+ A7A32DAA14DC43B00001D2E5 /* sortseqscommand.cpp in Sources */,
+ A7EEB0F514F29BFE00344B83 /* classifytreecommand.cpp in Sources */,
+ A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */,
+ A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */,
+ A76CDD821510F143004C8458 /* prcseqscommand.cpp in Sources */,
+ A77EBD2F1523709100ED407C /* createdatabasecommand.cpp in Sources */,
+ A7876A26152A017C00A0AE86 /* subsample.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
ALWAYS_SEARCH_USER_PATHS = NO;
COPY_PHASE_STRIP = NO;
GCC_DYNAMIC_NO_PIC = NO;
- GCC_ENABLE_FIX_AND_CONTINUE = YES;
GCC_MODEL_TUNING = G5;
- GCC_OPTIMIZATION_LEVEL = 3;
+ GCC_OPTIMIZATION_LEVEL = 0;
INSTALL_PATH = /usr/local/bin;
PRODUCT_NAME = Mothur;
SDKROOT = macosx10.6;
GCC_ENABLE_SSE3_EXTENSIONS = NO;
GCC_ENABLE_SSE41_EXTENSIONS = NO;
GCC_ENABLE_SSE42_EXTENSIONS = NO;
- GCC_OPTIMIZATION_LEVEL = 3;
+ GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"MOTHUR_FILES=\"\\\"../release\\\"\"",
- "VERSION=\"\\\"1.23.0\\\"\"",
- "RELEASE_DATE=\"\\\"1/9/2012\\\"\"",
+ "VERSION=\"\\\"1.24.0\\\"\"",
+ "RELEASE_DATE=\"\\\"3/12/2012\\\"\"",
);
"GCC_VERSION[arch=*]" = "";
GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
"-lncurses",
"-lreadline",
);
- PREBINDING = NO;
SDKROOT = macosx10.6;
USER_HEADER_SEARCH_PATHS = "";
};
GCC_C_LANGUAGE_STANDARD = gnu99;
GCC_GENERATE_DEBUGGING_SYMBOLS = NO;
GCC_MODEL_TUNING = "";
- GCC_OPTIMIZATION_LEVEL = 3;
+ GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
- "VERSION=\"\\\"1.19.0\\\"\"",
- "RELEASE_DATE=\"\\\"5/9/2011\\\"\"",
+ "VERSION=\"\\\"1.24.0\\\"\"",
+ "RELEASE_DATE=\"\\\"3/12/2012\\\"\"",
);
GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES;
"-lncurses",
"-lreadline",
);
- PREBINDING = NO;
SDKROOT = macosx10.6;
};
name = Release;
#else
vector<unsigned long long> positions;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
positions = m->divideFile(candidateFileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); }
#else
lines.push_back(new linePair(0, 1000));
}else {
positions = m->setFilePosFasta(candidateFileNames[s], numFastaSeqs);
-
+ if (positions.size() < processors) { processors = positions.size(); }
+
//figure out how many sequences you have to process
int numSeqsPerProcessor = numFastaSeqs / processors;
for (int i = 0; i < processors; i++) {
}
delete candidateSeq;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
try {
int num = 0;
processIDS.resize(0);
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
//loop through and create all the processes you want
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
static DWORD WINAPI MyAlignThreadFunction(LPVOID lpParam){
alignData* pDataArray;
#include "blastdb.hpp"
#include "referencedb.h"
-/**************************************************************************************************/
-//deep copy
-AlignmentDB::AlignmentDB(const AlignmentDB& adb) : numSeqs(adb.numSeqs), longest(adb.longest), method(adb.method), emptySequence(adb.emptySequence), threadID(adb.threadID) {
- try {
-
- m = MothurOut::getInstance();
- if (adb.method == "blast") {
- search = new BlastDB(*((BlastDB*)adb.search));
- }else if(adb.method == "kmer") {
- search = new KmerDB(*((KmerDB*)adb.search));
- }else if(adb.method == "suffix") {
- search = new SuffixDB(*((SuffixDB*)adb.search));
- }else {
- m->mothurOut("[ERROR]: cannot create copy of alignment database, unrecognized method - " + adb.method); m->mothurOutEndLine();
- }
-
- for (int i = 0; i < adb.templateSequences.size(); i++) {
- Sequence temp(adb.templateSequences[i]);
- templateSequences.push_back(temp);
- }
- }
- catch(exception& e) {
- m->errorOut(e, "AlignmentDB", "AlignmentDB");
- exit(1);
- }
-
-}
/**************************************************************************************************/
AlignmentDB::AlignmentDB(string fastaFileName, string s, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int tid){ // This assumes that the template database is in fasta format, may
try { // need to alter this in the future?
AlignmentDB(string, string, int, float, float, float, float, int); //reads fastafile passed in and stores sequences
AlignmentDB(string);
- AlignmentDB(const AlignmentDB& adb);
~AlignmentDB();
Sequence findClosestSequence(Sequence*);
//initialze probabilities
wordGenusProb.resize(numKmers);
WordPairDiffArr.resize(numKmers);
- //cout << numKmers << '\t' << genusNodes.size() << endl;
+
for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
- //cout << numKmers << '\t' << genusNodes.size() << endl;
- ofstream out;
+ ofstream out;
ofstream out2;
#ifdef USE_MPI
exit(1);
}
}
-/**************************************************************************************************/
+**************************************************************************************************/
void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string inNumName) {
try{
istringstream iss (tempBuf,istringstream::in);
float probTemp;
iss >> zeroCountProb[i] >> numbers[i] >> probTemp;
- WordPairDiffArr[i].prob = tempProb;
+ WordPairDiffArr[i].prob = probTemp;
}
#else
//divide breakpoints between processors
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
lines.push_back(linePair(0, iters));
int Bellerophon::createProcesses(vector<int> mid) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 0;
int exitCommand = 1;
vector<int> processIDS;
+#ifndef BlastAlignment_H
+#define BlastAlignment_H
+
+
/*
* blastalign.hpp
*
float gapExtend;
};
+#endif
+
+
+
int randNumber = rand();
//int randNumber = 12345;
string pid = "";
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
pid += getpid();
#else
pid += toString(threadID);
for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
path = path.substr(0, (tempPath.find_last_of('m')));
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
path += "blast/bin/";
#else
path += "blast\\bin\\";
string formatdbCommand;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
formatdbCommand = path + "formatdb"; // format the database, -o option gives us the ability
#else
formatdbCommand = path + "formatdb.exe";
if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
string blastCommand;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
blastCommand = path + "blastall"; // format the database, -o option gives us the ability
#else
blastCommand = path + "blastall.exe";
string megablastCommand;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
megablastCommand = path + "megablast"; // format the database, -o option gives us the ability
#else
megablastCommand = path + "megablast.exe";
for (int i = 0; i < path.length(); i++) { tempPath[i] = tolower(path[i]); }
path = path.substr(0, (tempPath.find_last_of('m')));
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
path += "blast/bin/";
#else
path += "blast\\bin\\";
int randNumber = rand();
string pid = "";
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
pid += getpid();
#else
pid += toString(threadID);
blastFileName = pid + toString(randNumber) + ".blast";
string formatdbCommand;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
formatdbCommand = path + "formatdb"; // format the database, -o option gives us the ability
#else
formatdbCommand = path + "formatdb.exe";
if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe."); m->mothurOutEndLine(); m->control_pressed = true; }
string blastCommand;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
blastCommand = path + "blastall"; // format the database, -o option gives us the ability
#else
blastCommand = path + "blastall.exe";
string megablastCommand;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
megablastCommand = path + "megablast"; // format the database, -o option gives us the ability
#else
megablastCommand = path + "megablast.exe";
// long. With this setting, it seems comparable in speed to the suffix tree approach.
string blastCommand;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
blastCommand = path + "blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);
blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
// long. With this setting, it seems comparable in speed to the suffix tree approach.
//7000004128189528left 0 100 66 0 0 1 66 61 126 1e-31 131
string blastCommand;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
blastCommand = path + "megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
blastCommand += (" -i " + (queryFileName+pid+toString(randNumber)) + " -o " + blastFileName+pid+toString(randNumber));
#else
string formatdbCommand;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
formatdbCommand = path + "formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
#else
//formatdbCommand = path + "blast\\bin\\formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
public:
BlastDB(string, float, float, float, float, string, int);
BlastDB(string, int);
- BlastDB(const BlastDB& bdb) : dbFileName(bdb.dbFileName), queryFileName(bdb.queryFileName), blastFileName(bdb.blastFileName), path(bdb.path),
- count(bdb.count), gapOpen(bdb.gapOpen), gapExtend(bdb.gapExtend), match(bdb.match), misMatch(bdb.misMatch), Database(bdb) {}
~BlastDB();
void generateDB();
savedOutputDir = outputDir;
string catchAllCommandExe = "";
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
catchAllCommandExe += "mono " + path + "CatchAllcmdL.exe ";
if (outputDir == "") { outputDir = "./"; } //force full pathname to be created for catchall, this is necessary because if catchall is in the path it will look for input file whereever the exe is and not the cwd.
#else
//create system command
string catchAllCommand = "";
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
catchAllCommand += catchAllCommandExe + filename + " " + outputPath + " 1";
#else
if (outputPath.length() > 0) { outputPath = outputPath.substr(0, outputPath.length()-1); }
//create system command
string catchAllCommand = "";
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
catchAllCommand += catchAllCommandExe + filename + " " + outputPath + " 1";
#else
if (outputPath.length() > 0) { outputPath = outputPath.substr(0, outputPath.length()-1); }
//create system command
string catchAllCommand = "";
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
catchAllCommand += catchAllCommandExe + filename + " " + outputPath + " 1";
#else
if (outputPath.length() > 0) { outputPath = outputPath.substr(0, outputPath.length()-1); }
//break up file
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
vector<unsigned long long> positions = m->divideFile(fastaFileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) {
}
delete candidateSeq;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
int ChimeraCcodeCommand::createProcesses(string outputFileName, string filename, string accnos) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 0;
int num = 0;
//break up file
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
vector<unsigned long long> positions = m->divideFile(fastaFileNames[i], processors);
for (int s = 0; s < (positions.size()-1); s++) {
}
delete candidateSeq;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
int ChimeraCheckCommand::createProcesses(string outputFileName, string filename) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 0;
int num = 0;
string inputString = "fasta=" + inputFile;
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
-
+ m->mothurCalling = true;
+
Command* uniqueCommand = new DeconvoluteCommand(inputString);
uniqueCommand->execute();
map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
delete uniqueCommand;
-
+ m->mothurCalling = false;
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
nameFile = filenames["name"][0];
vector<seqData> sequences;
bool error = false;
+ alignLength = 0;
for (int i = 0; i < thisGroupsSeqs.size(); i++) {
else {
int num = m->getNumNames(it->second);
sequences.push_back(seqData(thisGroupsSeqs[i].getName(), thisGroupsSeqs[i].getUnaligned(), num));
+ if (thisGroupsSeqs[i].getUnaligned().length() > alignLength) { alignLength = thisGroupsSeqs[i].getUnaligned().length(); }
}
}
bool error = false;
ifstream in;
m->openInputFile(inputFile, in);
-
+ alignLength = 0;
+
while (!in.eof()) {
if (m->control_pressed) { in.close(); return sequences; }
if (it == nameMap.end()) { error = true; m->mothurOut("[ERROR]: " + temp.getName() + " is in your fasta file and not in your namefile, please correct."); m->mothurOutEndLine(); }
else {
sequences.push_back(seqData(temp.getName(), temp.getUnaligned(), it->second));
+ if (temp.getUnaligned().length() > alignLength) { alignLength = temp.getUnaligned().length(); }
}
}
in.close();
}
int numSeqs = sequences.size();
- int alignLength = sequences[0].sequence.size();
+ //int alignLength = sequences[0].sequence.size();
ofstream chimeraFile;
ofstream accnosFile;
for(int i=0;i<numSeqs;i++){
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
-
+
vector<bool> restricted = chimeras;
vector<vector<int> > leftDiffs(numSeqs);
string dummyA, dummyB;
- if(comparisons >= 2){
+ if (sequences[i].sequence.size() < 3) {
+ chimeraFile << i << '\t' << sequences[i].seqName << "\t0\t0\tNull\t0\t0\t0\tNull\tNull\t0.0\t0.0\t0.0\t0\t0\t0\t0.0\t0.0\tgood" << endl;
+ }else if(comparisons >= 2){
minMismatchToChimera = myPerseus.getChimera(sequences, leftDiffs, rightDiffs, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight, restricted);
if (m->control_pressed) { chimeraFile.close(); m->mothurRemove(chimeraFileName); accnosFile.close(); m->mothurRemove(accnosFileName); return 0; }
lines.push_back(linePair(startIndex, endIndex));
}
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
bool abort;
string fastafile, groupfile, outputDir, namefile;
- int processors;
+ int processors, alignLength;
double cutoff, alpha, beta;
vector<string> outputNames;
}
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
static DWORD WINAPI MyPerseusThreadFunction(LPVOID lpParam){
perseusData* pDataArray;
#else
//break up file
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
vector<unsigned long long> positions = m->divideFile(fastaFileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) {
}
delete candidateSeq;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
int ChimeraPintailCommand::createProcesses(string outputFileName, string filename, string accnos) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 0;
int num = 0;
else {
//add / to name if needed
string lastChar = blastlocation.substr(blastlocation.length()-1);
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if (lastChar != "/") { blastlocation += "/"; }
#else
if (lastChar != "\\") { blastlocation += "\\"; }
#endif
blastlocation = m->getFullPathName(blastlocation);
string formatdbCommand = "";
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
formatdbCommand = blastlocation + "formatdb";
#else
formatdbCommand = blastlocation + "formatdb.exe";
if(ableToOpen == 1) { m->mothurOut("[ERROR]: " + formatdbCommand + " file does not exist. mothur requires formatdb.exe to run chimera.slayer."); m->mothurOutEndLine(); abort = true; }
string blastCommand = "";
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
blastCommand = blastlocation + "megablast";
#else
blastCommand = blastlocation + "megablast.exe";
if (hasGroup && (templatefile != "self")) { m->mothurOut("You have provided a group file and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; }
//until we resolve the issue 10-18-11
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
//processors=1;
#endif
#else
//break up file
vector<unsigned long long> positions;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
positions = m->divideFile(thisFastaName, processors);
for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
#else
if (processors == 1) { lines.push_back(linePair(0, 1000)); }
else {
positions = m->setFilePosFasta(thisFastaName, numSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
//figure out how many sequences you have to process
int numSeqsPerProcessor = numSeqs / processors;
map<string, string> uniqueNames = parser->getAllSeqsMap();
map<string, string>::iterator itUnique;
int total = 0;
+
+ if (trimera) { //add in more potential uniqueNames
+ map<string, string> newUniqueNames = uniqueNames;
+ for (map<string, string>::iterator it = uniqueNames.begin(); it != uniqueNames.end(); it++) {
+ newUniqueNames[(it->first)+"_LEFT"] = (it->first)+"_LEFT";
+ newUniqueNames[(it->first)+"_RIGHT"] = (it->first)+"_RIGHT";
+ }
+ uniqueNames = newUniqueNames;
+ newUniqueNames.clear();
+ }
//edit accnos file
ifstream in2;
string inputString = "fasta=" + inputFile;
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
-
+ m->mothurCalling = true;
+
Command* uniqueCommand = new DeconvoluteCommand(inputString);
uniqueCommand->execute();
map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
delete uniqueCommand;
-
+ m->mothurCalling = false;
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
nameFile = filenames["name"][0];
m->mothurOutEndLine(); m->mothurOut("Checking sequences from group: " + fileGroup[thisFastaName] + "."); m->mothurOutEndLine();
lines.clear();
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int proc = 1;
vector<unsigned long long> positions = m->divideFile(thisFastaName, proc);
lines.push_back(linePair(positions[0], positions[1]));
breakUp.push_back(thisFileToPriority);
}
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
int pid = fork();
count++;
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos.end)) { break; }
#else
int num = 0;
processIDS.clear();
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
int pid = fork();
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
static DWORD WINAPI MySlayerThreadFunction(LPVOID lpParam){
slayerData* pDataArray;
path = path.substr(0, (tempPath.find_last_of('m')));
string uchimeCommand;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
uchimeCommand = path + "uchime"; // format the database, -o option gives us the ability
#else
uchimeCommand = path + "uchime.exe";
string inputString = "fasta=" + inputFile;
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
-
+ m->mothurCalling = true;
+
Command* uniqueCommand = new DeconvoluteCommand(inputString);
uniqueCommand->execute();
map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
delete uniqueCommand;
-
+ m->mothurCalling = false;
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
nameFile = filenames["name"][0];
path = path.substr(0, (tempPath.find_last_of('m')));
string uchimeCommand = path;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
uchimeCommand += "uchime ";
#else
uchimeCommand += "uchime";
//uchime_main(numArgs, uchimeParameters);
//cout << "commandString = " << commandString << endl;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
commandString = "\"" + commandString + "\"";
#endif
int num = 0;
vector<string> files;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//break up file into multiple files
m->divideFile(filename, processors, files);
lines.push_back(linePair(startIndex, endIndex));
}
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
uchimeData* pDataArray;
path = path.substr(0, (tempPath.find_last_of('m')));
string uchimeCommand = path;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
uchimeCommand += "uchime ";
#else
uchimeCommand += "uchime";
//uchime_main(numArgs, uchimeParameters);
//cout << "commandString = " << commandString << endl;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
commandString = "\"" + commandString + "\"";
#endif
for (int i = 0; i < temp.length(); i++) {
//eliminate N's
- if (toupper(temp[i]) == 'N') { temp[i] == '.'; }
+ if (toupper(temp[i]) == 'N') { temp[i] = '.'; }
numBasesCounted++;
for (int i = (temp.length()-1); i >= 0; i--) {
//eliminate N's
- if (toupper(temp[i]) == 'N') { temp[i] == '.'; }
+ if (toupper(temp[i]) == 'N') { temp[i] = '.'; }
numBasesCounted++;
for (int i = 0; i < temp.length(); i++) {
//eliminate N's
if (toupper(temp[i]) == 'N') {
- temp[i] == '.';
+ temp[i] = '.';
tempLength--;
if (tempLength < numbases) { stopSpot = 0; break; }
}
for (int i = (temp.length()-1); i >= 0; i--) {
//eliminate N's
if (toupper(temp[i]) == 'N') {
- temp[i] == '.';
+ temp[i] = '.';
tempLength--;
if (tempLength < numbases) { stopSpot = 0; break; }
}
search = "kmer";
}
- if (namefileNames.size() == 0){
- vector<string> files; files.push_back(fastaFileNames[fastaFileNames.size()-1]);
- parser.getNameFile(files);
- }
+ if (!abort) {
+ if (namefileNames.size() == 0){
+ if (fastaFileNames.size() != 0) {
+ vector<string> files; files.push_back(fastaFileNames[fastaFileNames.size()-1]);
+ parser.getNameFile(files);
+ }
+ }
+ }
}
#else
vector<unsigned long long> positions;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
positions = m->divideFile(fastaFileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); }
#else
lines.push_back(new linePair(0, 1000));
}else {
positions = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
//figure out how many sequences you have to process
int numSeqsPerProcessor = numFastaSeqs / processors;
int num = 0;
processIDS.clear();
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
//loop through and create all the processes you want
string extension = "";
if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
- classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flipThreshold);
+ classifyData* tempclass = new classifyData((accnos + extension), probs, method, templateFileName, taxonomyFileName, (taxFileName + extension), (tempTaxFile + extension), filename, search, kmerSize, iters, numWanted, m, lines[i]->start, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flip);
pDataArray.push_back(tempclass);
//MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
}
#endif
-
+ vector<string> nonBlankAccnosFiles;
+ if (!(m->isBlank(accnos))) { nonBlankAccnosFiles.push_back(accnos); }
+ else { m->mothurRemove(accnos); } //remove so other files can be renamed to it
+
for(int i=0;i<processIDS.size();i++){
appendTaxFiles((taxFileName + toString(processIDS[i]) + ".temp"), taxFileName);
appendTaxFiles((tempTaxFile + toString(processIDS[i]) + ".temp"), tempTaxFile);
- appendTaxFiles((accnos + toString(processIDS[i]) + ".temp"), accnos);
+ if (!(m->isBlank(accnos + toString(processIDS[i]) + ".temp"))) {
+ nonBlankAccnosFiles.push_back(accnos + toString(processIDS[i]) + ".temp");
+ }else { m->mothurRemove((accnos + toString(processIDS[i]) + ".temp")); }
+
m->mothurRemove((m->getFullPathName(taxFileName) + toString(processIDS[i]) + ".temp"));
m->mothurRemove((m->getFullPathName(tempTaxFile) + toString(processIDS[i]) + ".temp"));
- m->mothurRemove((m->getFullPathName(accnos) + toString(processIDS[i]) + ".temp"));
}
+ //append accnos files
+ if (nonBlankAccnosFiles.size() != 0) {
+ rename(nonBlankAccnosFiles[0].c_str(), accnos.c_str());
+
+ for (int h=1; h < nonBlankAccnosFiles.size(); h++) {
+ appendTaxFiles(nonBlankAccnosFiles[h], accnos);
+ m->mothurRemove(nonBlankAccnosFiles[h]);
+ }
+ }else { //recreate the accnosfile if needed
+ ofstream out;
+ m->openOutputFile(accnos, out);
+ out.close();
+ }
+
return num;
}
}
delete candidateSeq;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = inFASTA.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
*
*/
-#include "mothur.h"
+
#include "command.hpp"
#include "classify.h"
#include "referencedb.h"
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){
classifyData* pDataArray;
//make classify
Classify* myclassify;
if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip); }
- else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID, pDataArray->flipThreshold); }
+ else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); }
else {
pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian.");
pDataArray->m->mothurOutEndLine();
--- /dev/null
+//
+// classifytreecommand.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 2/20/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "classifytreecommand.h"
+#include "phylotree.h"
+
+//**********************************************************************************************************************
+//Registers every parameter classify.tree accepts and returns the list of their names.
+vector<string> ClassifyTreeCommand::setParameters(){
+    try {
+        //input files
+        parameters.push_back(CommandParameter("tree", "InputTypes", "", "", "", "", "none",false,true));
+        parameters.push_back(CommandParameter("taxonomy", "InputTypes", "", "", "", "", "none",false,true));
+        parameters.push_back(CommandParameter("name", "InputTypes", "", "", "", "", "none",false,false));
+        parameters.push_back(CommandParameter("group", "InputTypes", "", "", "", "", "none",false,false));
+        //consensus confidence threshold
+        parameters.push_back(CommandParameter("cutoff", "Number", "", "51", "", "", "",false,true));
+        //directories
+        parameters.push_back(CommandParameter("inputdir", "String", "", "", "", "", "",false,false));
+        parameters.push_back(CommandParameter("outputdir", "String", "", "", "", "", "",false,false));
+
+        //collect the names of everything registered so far
+        vector<string> paramNames;
+        int numParams = parameters.size();
+        int index = 0;
+        while (index < numParams) {
+            paramNames.push_back(parameters[index].name);
+            index++;
+        }
+        return paramNames;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "setParameters");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Returns the help text shown for classify.tree.
+string ClassifyTreeCommand::getHelpString(){
+    try {
+        string helpString = "";
+        helpString += "The classify.tree command reads a tree and taxonomy file and outputs the consensus taxonomy for each node on the tree. \n";
+        helpString += "If you provide a group file, the consensus for each group will also be provided. \n";
+        helpString += "The new tree contains labels at each internal node.  The label is the node number so you can relate the tree to the summary file.\n";
+        helpString += "The summary file lists the consensus taxonomy for the descendants of each node.\n";
+        //cutoff is a registered parameter too (see setParameters), so list it with the others
+        helpString += "The classify.tree command parameters are tree, group, name, taxonomy and cutoff. The tree and taxonomy files are required.\n";
+        helpString += "The cutoff parameter allows you to specify a consensus confidence threshold for your taxonomy.  The default is 51, meaning 51%. Cutoff cannot be below 51.\n";
+        helpString += "The classify.tree command should be used in the following format: classify.tree(tree=test.tre, group=test.group, taxonomy=test.taxonomy)\n";
+        helpString += "Note: No spaces between parameter labels (i.e. tree), '=' and parameters (i.e. yourTreefile).\n";
+        return helpString;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "getHelpString");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+//Default constructor: no option string is parsed. The command is flagged as an aborted
+//help call, so execute() returns 0 without doing any work.
+ClassifyTreeCommand::ClassifyTreeCommand(){
+    try {
+        calledHelp = true;
+        abort = true;
+        setParameters();
+
+        //register the output types this command can produce, each starting out empty
+        const vector<string> noFilesYet;
+        outputTypes["summary"] = noFilesYet;
+        outputTypes["tree"] = noFilesYet;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Parses the option string for classify.tree.
+// option - "help", "citation", or the comma separated parameter list typed by the user.
+//Sets abort (and calledHelp for help/citation) instead of throwing when something is invalid;
+//execute() checks those flags before doing any work.
+ClassifyTreeCommand::ClassifyTreeCommand(string option) {
+ try {
+ abort = false; calledHelp = false;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; calledHelp = true; }
+ else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+
+ else {
+ vector<string> myArray = setParameters();
+
+ OptionParser parser(option);
+ //note: this local map shadows the inherited parameters member for the rest of this block
+ map<string, string> parameters = parser.getParameters();
+
+ ValidParameters validParameter;
+ map<string, string>::iterator it;
+
+ //check to make sure all parameters are valid for command
+ for (it = parameters.begin(); it != parameters.end(); it++) {
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+ }
+
+ //reset the shared group and tree-name state before this command runs
+ m->runParse = true;
+ m->clearGroups();
+ m->clearAllGroups();
+ m->Treenames.clear();
+ m->names.clear();
+
+ //register the output types this command can produce, each starting out empty
+ vector<string> tempOutNames;
+ outputTypes["tree"] = tempOutNames;
+ outputTypes["summary"] = tempOutNames;
+
+ //if the user changes the input directory command factory will send this info to us in the output parameter
+ string inputDir = validParameter.validFile(parameters, "inputdir", false);
+ if (inputDir == "not found"){ inputDir = ""; }
+ else {
+ string path;
+ it = parameters.find("tree");
+ //user has given a tree file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["tree"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("name");
+ //user has given a name file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["name"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("group");
+ //user has given a group file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["group"] = inputDir + it->second; }
+ }
+
+ it = parameters.find("taxonomy");
+ //user has given a taxonomy file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
+ }
+ }
+
+ outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+
+ //check for required parameters
+ //tree: fall back to the current tree file when the parameter is not given
+ treefile = validParameter.validFile(parameters, "tree", true);
+ if (treefile == "not open") { treefile = ""; abort = true; }
+ else if (treefile == "not found") { treefile = "";
+ treefile = m->getTreeFile();
+ if (treefile != "") { m->mothurOut("Using " + treefile + " as input file for the tree parameter."); m->mothurOutEndLine(); }
+ else { m->mothurOut("No valid current files. You must provide a tree file."); m->mothurOutEndLine(); abort = true; }
+ }else { m->setTreeFile(treefile); }
+
+ //taxonomy: fall back to the current taxonomy file when the parameter is not given
+ taxonomyfile = validParameter.validFile(parameters, "taxonomy", true);
+ if (taxonomyfile == "not open") { taxonomyfile = ""; abort = true; }
+ else if (taxonomyfile == "not found") { taxonomyfile = "";
+ taxonomyfile = m->getTaxonomyFile();
+ if (taxonomyfile != "") { m->mothurOut("Using " + taxonomyfile + " as input file for the taxonomy parameter."); m->mothurOutEndLine(); }
+ else { m->mothurOut("No valid current files. You must provide a taxonomy file."); m->mothurOutEndLine(); abort = true; }
+ }else { m->setTaxonomyFile(taxonomyfile); }
+
+ //name and group files are optional: "not found" simply leaves them empty
+ namefile = validParameter.validFile(parameters, "name", true);
+ if (namefile == "not open") { namefile = ""; abort = true; }
+ else if (namefile == "not found") { namefile = ""; }
+ else { m->setNameFile(namefile); }
+
+ groupfile = validParameter.validFile(parameters, "group", true);
+ if (groupfile == "not open") { groupfile = ""; abort = true; }
+ else if (groupfile == "not found") { groupfile = ""; }
+ else { m->setGroupFile(groupfile); }
+
+ //cutoff defaults to 51 and must lie in [51, 100]
+ string temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "51"; }
+ m->mothurConvert(temp, cutoff);
+
+ if ((cutoff < 51) || (cutoff > 100)) { m->mothurOut("cutoff must be above 50, and no greater than 100."); m->mothurOutEndLine(); abort = true; }
+
+ //no name file given: hand the tree file to the parser's name-file lookup
+ //NOTE(review): presumably this picks up a current name file matching the tree - confirm against OptionParser::getNameFile
+ if (namefile == "") {
+ vector<string> files; files.push_back(treefile);
+ parser.getNameFile(files);
+ }
+
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ClassifyTreeCommand", "ClassifyTreeCommand");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+//Runs classify.tree: reads the tree (plus the optional group and name files) and the taxonomy
+//file, then writes the consensus summary and the node-labelled tree via getClassifications().
+//Returns 0 on success, after help/citation, or when the run is interrupted; returns 2 when the
+//constructor flagged the command as aborted.
+int ClassifyTreeCommand::execute(){
+    try {
+
+        if (abort == true) { if (calledHelp) { return 0; }  return 2; }
+
+        cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint);
+
+        int start = time(NULL);
+
+        /***************************************************/
+        //    reading tree info                            //
+        /***************************************************/
+        m->setTreeFile(treefile);
+        if (groupfile != "") {
+            //read in group map info.
+            tmap = new TreeMap(groupfile);
+            tmap->readMap();
+        }else{ //fake out by putting everyone in one group
+            Tree* tree = new Tree(treefile); delete tree;  //extracts names from tree to make faked out groupmap
+            tmap = new TreeMap();
+
+            for (int i = 0; i < m->Treenames.size(); i++) { tmap->addSeq(m->Treenames[i], "Group1"); }
+        }
+
+        if (namefile != "") { readNamesFile(); }
+
+        read = new ReadNewickTree(treefile);
+        int readOk = read->read(tmap);
+
+        if (readOk != 0) { m->mothurOut("Read Terminated."); m->mothurOutEndLine(); delete tmap; delete read; return 0; }
+
+        read->AssembleTrees();
+        vector<Tree*> T = read->getTrees();
+        Tree* outputTree = T[0];   //only the first tree is classified; the whole vector is freed on every exit below
+        delete read;
+
+        //make sure all files match
+        //if you provide a namefile we will use the numNames in the namefile as long as the number of unique match the tree names size.
+        int numNamesInTree;
+        if (namefile != "") {
+            if (numUniquesInName == m->Treenames.size()) { numNamesInTree = nameMap.size(); }
+            else { numNamesInTree = m->Treenames.size(); }
+        }else { numNamesInTree = m->Treenames.size(); }
+
+
+        //output any names that are in group file but not in tree
+        if (numNamesInTree < tmap->getNumSeqs()) {
+            for (int i = 0; i < tmap->namesOfSeqs.size(); i++) {
+                //is that name in the tree?
+                int count = 0;
+                for (int j = 0; j < m->Treenames.size(); j++) {
+                    if (tmap->namesOfSeqs[i] == m->Treenames[j]) { break; } //found it
+                    count++;
+                }
+
+                if (m->control_pressed) {
+                    delete tmap; for (int k = 0; k < T.size(); k++) { delete T[k]; }
+                    for (int k = 0; k < outputNames.size(); k++) { m->mothurRemove(outputNames[k]); } outputTypes.clear();
+                    m->clearGroups();
+                    return 0;
+                }
+
+                //then you did not find it so report it
+                if (count == m->Treenames.size()) {
+                    //if it is in your namefile then don't remove
+                    map<string, string>::iterator it = nameMap.find(tmap->namesOfSeqs[i]);
+
+                    if (it == nameMap.end()) {
+                        m->mothurOut(tmap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); m->mothurOutEndLine();
+                        tmap->removeSeq(tmap->namesOfSeqs[i]);
+                        i--; //need this because removeSeq removes name from namesOfSeqs
+                    }
+                }
+            }
+        }
+
+        //free every tree that was read, not just T[0], so extra trees in the file are not leaked
+        if (m->control_pressed) { for (int k = 0; k < T.size(); k++) { delete T[k]; } delete tmap; return 0; }
+
+        readTaxonomyFile();
+
+
+        /***************************************************/
+        //    get consensus taxonomies                     //
+        /***************************************************/
+        getClassifications(outputTree);
+
+        //outputTree is T[0], so this loop releases it together with any additional trees
+        for (int k = 0; k < T.size(); k++) { delete T[k]; }
+        delete tmap;
+
+        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+        //set tree file as new current treefile
+        if (treefile != "") {
+            string current = "";
+            itTypes = outputTypes.find("tree");
+            if (itTypes != outputTypes.end()) {
+                if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTreeFile(current); }
+            }
+        }
+
+        m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to find the concensus taxonomies."); m->mothurOutEndLine();
+        m->mothurOutEndLine();
+        m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+        for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+        m->mothurOutEndLine();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "execute");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//traverse tree finding consensus taxonomy at each node
+//label node with a number to relate to output summary file
+//report all consensus taxonomies to file
+// T - tree to classify; its internal nodes are relabelled with (node index + 1).
+//Writes <root>taxonomy.summary and <root>taxonomy.tre and records both in outputNames/outputTypes.
+//Always returns 0; returns early, without writing the tree file, if control_pressed is set.
+int ClassifyTreeCommand::getClassifications(Tree*& T){
+    try {
+
+        //both output files live in the same directory and share the tree file's root name
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(treefile); }
+        string outputRoot = thisOutputDir + m->getRootName(m->getSimpleName(treefile));
+        string outputFileName = outputRoot + "taxonomy.summary";
+        string outputTreeFileName = outputRoot + "taxonomy.tre";
+        outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+        out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
+
+        //print headings
+        out << "TreeNode\t";
+        if (groupfile != "") { out << "Group\t"; }
+        out << "NumRep\tTaxonomy" << endl;
+
+        //create a map from tree node index to names of descendants, save time later
+        //nodes are visited in index order; getDescendantList reads the entries of a node's children
+        map<int, map<string, set<string> > > nodeToDescendants; //node# -> (groupName -> groupMembers)
+        for (int i = 0; i < T->getNumNodes(); i++) {
+            if (m->control_pressed) { out.close(); return 0; }
+
+            nodeToDescendants[i] = getDescendantList(T, i, nodeToDescendants);
+        }
+
+        //for each internal node (indices below getNumLeaves() are skipped)
+        for (int i = T->getNumLeaves(); i < T->getNumNodes(); i++) {
+
+            if (m->control_pressed) { out.close(); return 0; }
+
+            string tax = "not classified";
+            int size;
+            if (groupfile != "") {
+                //one summary row per real group; the synthetic "AllGroups" entry is skipped
+                for (map<string, set<string> >::iterator itGroups = nodeToDescendants[i].begin(); itGroups != nodeToDescendants[i].end(); itGroups++) {
+                    if (itGroups->first != "AllGroups") {
+                        tax = getTaxonomy(itGroups->second, size);
+                        out << (i+1) << '\t' << itGroups->first << '\t' << size << '\t' << tax << endl;
+                    }
+                }
+            }else {
+                string group = "AllGroups";
+                tax = getTaxonomy(nodeToDescendants[i][group], size);
+                out << (i+1) << '\t' << size << '\t' << tax << endl;
+            }
+
+            //label matches the TreeNode column written above
+            T->tree[i].setLabel((i+1));
+        }
+        out.close();
+
+        ofstream outTree;
+        m->openOutputFile(outputTreeFileName, outTree);
+        outputNames.push_back(outputTreeFileName); outputTypes["tree"].push_back(outputTreeFileName);
+        T->print(outTree, "both");
+        outTree.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        //report the real function name so the error trace points here
+        m->errorOut(e, "ClassifyTreeCommand", "getClassifications");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Builds the consensus taxonomy for a set of sequence names.
+// names - names of the tree leaves to summarise
+// size  - out: number of sequences that contributed (name file counts included)
+//Returns "taxon(confidence);" for each level whose best child meets cutoff, padded with
+//"unclassified;" up to the deepest level in the temporary tree, or "no_consensus;" when nothing
+//was added. Returns an empty string if control_pressed is seen while sequences are being added.
+string ClassifyTreeCommand::getTaxonomy(set<string> names, int& size) {
+    try{
+        string conTax = "";
+        size = 0;
+
+        //create a tree containing sequences from this bin
+        PhyloTree* phylo = new PhyloTree();
+
+        for (set<string>::iterator it = names.begin(); it != names.end(); it++) {
+
+
+            //if namesfile include the names
+            if (namefile != "") {
+
+                //is this sequence in the name file - namemap maps seqName -> repSeqName
+                map<string, string>::iterator it2 = nameMap.find(*it);
+
+                if (it2 == nameMap.end()) { //this name is not in name file, skip it
+                    m->mothurOut((*it) + " is not in your name file.  I will not include it in the consensus."); m->mothurOutEndLine();
+                }else{
+
+                    //is this sequence in the taxonomy file - look for repSeqName since we are assuming the taxonomy file is unique
+                    map<string, string>::iterator itTax = taxMap.find((it2->second));
+
+                    if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
+
+                        if ((*it) != (it2->second)) { m->mothurOut((*it) + " is represented by " +  it2->second + " and is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
+                        else {  m->mothurOut((*it) + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine(); }
+                    }else{
+                        //add seq to tree once per sequence it represents.
+                        //the second argument must be the taxonomy (itTax->second), as in the branch below,
+                        //not the representative sequence name.
+                        //NOTE(review): nameCount is filled per representative name in readNamesFile; a leaf
+                        //that is not itself a representative gets a count of 0 here - confirm intended.
+                        int num = nameCount[(*it)];
+                        for (int i = 0; i < num; i++) {  phylo->addSeqToTree((*it)+toString(i), itTax->second);  }
+                        size += num;
+                    }
+                }
+
+            }else{
+                //is this sequence in the taxonomy file
+                map<string, string>::iterator itTax = taxMap.find((*it));
+
+                if (itTax == taxMap.end()) { //this name is not in taxonomy file, skip it
+                    m->mothurOut((*it) + " is not in your taxonomy file.  I will not include it in the consensus."); m->mothurOutEndLine();
+                }else{
+                    //add seq to tree
+                    phylo->addSeqToTree((*it), itTax->second);
+                    size++;
+                }
+            }
+
+            if (m->control_pressed) { delete phylo; return conTax; }
+
+        }
+
+        //build tree
+        phylo->assignHeirarchyIDs(0);
+
+        TaxNode currentNode = phylo->get(0);
+        int myLevel = 0;
+        //at each level
+        while (currentNode.children.size() != 0) { //you still have more to explore
+
+            TaxNode bestChild;
+            int bestChildSize = 0;
+
+            //go through children
+            for (map<string, int>::iterator itChild = currentNode.children.begin(); itChild != currentNode.children.end(); itChild++) {
+
+                TaxNode temp = phylo->get(itChild->second);
+
+                //select child with largest accessions - most seqs assigned to it
+                if (temp.accessions.size() > bestChildSize) {
+                    bestChild = phylo->get(itChild->second);
+                    bestChildSize = temp.accessions.size();
+                }
+
+            }
+
+            //is this taxonomy above cutoff
+            int consensusConfidence = ceil((bestChildSize / (float) size) * 100);
+
+            if (consensusConfidence >= cutoff) { //if yes, add it
+                conTax += bestChild.name + "(" + toString(consensusConfidence) + ");";
+                myLevel++;
+            }else{ //if no, quit
+                break;
+            }
+
+            //move down a level
+            currentNode = bestChild;
+        }
+
+        //pad the remaining levels so every consensus string has the same depth
+        while (myLevel != phylo->getMaxLevel()) {
+            conTax += "unclassified;";
+            myLevel++;
+        }
+        if (conTax == "") {  conTax = "no_consensus;";  }
+
+        delete phylo;
+
+        return conTax;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "getTaxonomy");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+//Returns, for node i of T, a map of group name -> names of the leaves below that node.
+//A leaf is listed under its own group (looked up in tmap) and under "AllGroups".
+//An internal node gets the union of the entries already stored in descendants for its children.
+map<string, set<string> > ClassifyTreeCommand::getDescendantList(Tree*& T, int i, map<int, map<string, set<string> > > descendants){
+    try {
+        map<string, set<string> > result;
+
+        const int leftChild = T->tree[i].getLChild();
+        const int rightChild = T->tree[i].getRChild();
+
+        if (leftChild != -1) {
+            //internal node: start from the left child's groups, then fold in the right child's
+            result = descendants[leftChild];
+
+            map<string, set<string> >& rightGroups = descendants[rightChild];
+            for (map<string, set<string> >::iterator itGroup = rightGroups.begin(); itGroup != rightGroups.end(); ++itGroup) {
+                //operator[] creates the group if it is new; the range insert merges otherwise
+                result[itGroup->first].insert((itGroup->second).begin(), (itGroup->second).end());
+            }
+        }else {
+            //leaf: its only descendant is itself
+            const string leafName = T->tree[i].getName();
+            const string leafGroup = tmap->getGroup(leafName);
+
+            set<string> self;
+            self.insert(leafName);
+
+            result[leafGroup] = self;       //mygroup -> me
+            result["AllGroups"] = self;
+        }
+
+        return result;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "getDescendantList");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//Loads taxonomyfile into taxMap (sequence name -> taxonomy string).
+//Confidence scores in parentheses are stripped from each taxonomy before it is stored.
+//Always returns 0; taxMap is emptied if control_pressed is seen while reading.
+int ClassifyTreeCommand::readTaxonomyFile() {
+    try {
+
+        ifstream taxStream;
+        m->openInputFile(taxonomyfile, taxStream);
+
+        string seqName;
+        string seqTax;
+
+        while (!taxStream.eof()) {
+            taxStream >> seqName;
+            taxStream >> seqTax;
+            m->gobble(taxStream);
+
+            //a '(' means the taxonomy carries confidence scores
+            const bool hasConfidences = (seqTax.find_first_of('(') != string::npos);
+            if (hasConfidences) { m->removeConfidences(seqTax); }
+
+            taxMap[seqName] = seqTax;
+
+            if (m->control_pressed) {
+                taxStream.close();
+                taxMap.clear();
+                return 0;
+            }
+        }
+        taxStream.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "readTaxonomyFile");
+        exit(1);
+    }
+}
+
+/*****************************************************************/
+//Loads namefile into nameMap (every sequence name -> representative name) and
+//nameCount (representative name -> number of sequences it stands for), and counts the
+//representatives in numUniquesInName.
+//Always returns 0; nameMap is emptied if control_pressed is seen while reading.
+int ClassifyTreeCommand::readNamesFile() {
+    try {
+        ifstream inNames;
+        m->openInputFile(namefile, inNames);
+
+        string name, names;
+
+        //execute() compares this count with the number of names in the tree,
+        //so it has to start from a known value and be counted here
+        numUniquesInName = 0;
+
+        while(!inNames.eof()){
+            inNames >> name;    //read from first column  A
+            inNames >> names;   //read from second column  A,B,C,D
+            m->gobble(inNames);
+
+            numUniquesInName++;  //one representative per line of the name file
+
+            //parse names into vector
+            vector<string> theseNames;
+            m->splitAtComma(names, theseNames);
+
+            for (int i = 0; i < theseNames.size(); i++) {  nameMap[theseNames[i]] = name;  }
+            nameCount[name] = theseNames.size();
+
+            if (m->control_pressed) { inNames.close(); nameMap.clear(); return 0; }
+        }
+        inNames.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClassifyTreeCommand", "readNamesFile");
+        exit(1);
+    }
+}
+
+/*****************************************************************/
+
+
--- /dev/null
+#ifndef Mothur_classifytreecommand_h
+#define Mothur_classifytreecommand_h
+
+//
+// classifytreecommand.h
+// Mothur
+//
+// Created by Sarah Westcott on 2/20/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "command.hpp"
+#include "readtree.h"
+#include "treemap.h"
+
+//classify.tree command: finds the consensus taxonomy for the descendants of each node of a tree
+//and writes a summary file plus a copy of the tree with numbered internal nodes.
+class ClassifyTreeCommand : public Command {
+public:
+ //parses the option string ("help", "citation" or the user's parameter list)
+ ClassifyTreeCommand(string);
+ //default constructor: marks the command as an aborted help call
+ ClassifyTreeCommand();
+ ~ClassifyTreeCommand(){}
+
+ //registers the accepted parameters and returns their names
+ vector<string> setParameters();
+ string getCommandName() { return "classify.tree"; }
+ string getCommandCategory() { return "Phylotype Analysis"; }
+ string getHelpString();
+ string getCitation() { return "http://www.mothur.org/wiki/Classify.tree"; }
+ string getDescription() { return "Find the consensus taxonomy for the descendant of each tree node"; }
+
+ //runs the command; returns 0 normally and 2 when the constructor flagged an abort
+ int execute();
+ void help() { m->mothurOut(getHelpString()); }
+
+private:
+ ReadTree* read; //tree reader created in execute()
+ TreeMap* tmap; //sequence name -> group; built from the group file, or a single fake group
+ string treefile, taxonomyfile, groupfile, namefile, outputDir;
+ bool abort; //set by the constructor when parameters are invalid or help was requested
+ vector<string> outputNames; //files written by this command
+ int numUniquesInName, cutoff; //cutoff: consensus confidence threshold, validated to 51..100
+ map<string, string> nameMap; //sequence name -> representative name (from the name file)
+ map<string, int> nameCount; //representative name -> number of sequences it stands for
+ map<string, string> taxMap; //sequence name -> taxonomy with confidence scores removed
+
+ //writes the summary and labelled tree for the given tree
+ int getClassifications(Tree*&);
+ //group name -> leaf names below node i, using the already computed entries of its children
+ map<string, set<string> > getDescendantList(Tree*&, int, map<int, map<string, set<string> > >);
+ //consensus taxonomy for a set of names; the int& receives the number of sequences used
+ string getTaxonomy(set<string>, int&);
+ int readNamesFile();
+ int readTaxonomyFile();
+
+};
+
+
+
+#endif
try {
//update location of seqs in smallRow since they move to smallCol now
for (int i = 0; i < dMatrix.size(); i++) {
- cout << "row = " << i << '\t';
+ m->mothurOut("row = " + toString(i) + "\t");
for (int j = 0; j < dMatrix[i].size(); j++) {
- cout << dMatrix[i][j] << '\t';
+ m->mothurOut(toString(dMatrix[i][j]) + "\t");
}
- cout << endl;
+ m->mothurOutEndLine();
}
}
catch(exception& e) {
*/
#include "clustersplitcommand.h"
-#include "readcluster.h"
-#include "splitmatrix.h"
-#include "readphylip.h"
-#include "readcolumn.h"
-#include "readmatrix.hpp"
-#include "inputdata.h"
+
//**********************************************************************************************************************
MPI_Barrier(MPI_COMM_WORLD);
#else
-
+ ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
//sanity check
if (processors > distName.size()) { processors = distName.size(); }
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
}else{
-
- //cout << processors << '\t' << distName.size() << endl;
- vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
- dividedNames.resize(processors);
-
- //for each file group figure out which process will complete it
- //want to divide the load intelligently so the big files are spread between processes
- for (int i = 0; i < distName.size(); i++) {
- //cout << i << endl;
- int processToAssign = (i+1) % processors;
- if (processToAssign == 0) { processToAssign = processors; }
-
- dividedNames[(processToAssign-1)].push_back(distName[i]);
- }
-
- //not lets reverse the order of ever other process, so we balance big files running with little ones
- for (int i = 0; i < processors; i++) {
- //cout << i << endl;
- int remainder = ((i+1) % processors);
- if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
- }
-
- createProcesses(dividedNames);
-
- if (m->control_pressed) { return 0; }
-
- //get list of list file names from each process
- for(int i=0;i<processors;i++){
- string filename = toString(processIDS[i]) + ".temp";
- ifstream in;
- m->openInputFile(filename, in);
-
- in >> tag; m->gobble(in);
-
- while(!in.eof()) {
- string tempName;
- in >> tempName; m->gobble(in);
- listFileNames.push_back(tempName);
- }
- in.close();
- m->mothurRemove((toString(processIDS[i]) + ".temp"));
-
- //get labels
- filename = toString(processIDS[i]) + ".temp.labels";
- ifstream in2;
- m->openInputFile(filename, in2);
-
- float tempCutoff;
- in2 >> tempCutoff; m->gobble(in2);
- if (tempCutoff < cutoff) { cutoff = tempCutoff; }
-
- while(!in2.eof()) {
- string tempName;
- in2 >> tempName; m->gobble(in2);
- if (labels.count(tempName) == 0) { labels.insert(tempName); }
- }
- in2.close();
- m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
- }
- }
+ listFileNames = createProcesses(distName, labels);
+ }
#else
listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
#endif
}
}
//**********************************************************************************************************************
-int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
+vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
try {
+
+ vector<string> listFiles;
+ vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
+ dividedNames.resize(processors);
+
+ //for each file group figure out which process will complete it
+ //want to divide the load intelligently so the big files are spread between processes
+ for (int i = 0; i < distName.size(); i++) {
+ //cout << i << endl;
+ int processToAssign = (i+1) % processors;
+ if (processToAssign == 0) { processToAssign = processors; }
+
+ dividedNames[(processToAssign-1)].push_back(distName[i]);
+ if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
+ }
+
+ //not lets reverse the order of ever other process, so we balance big files running with little ones
+ for (int i = 0; i < processors; i++) {
+ //cout << i << endl;
+ int remainder = ((i+1) % processors);
+ if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
+ }
+
+ if (m->control_pressed) { return listFiles; }
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- int exitCommand = 1;
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ int process = 1;
processIDS.clear();
//loop through and create all the processes you want
}
}
+ //do your part
+ listFiles = cluster(dividedNames[0], labels);
+
//force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
+ for (int i=0;i< processIDS.size();i++) {
int temp = processIDS[i];
wait(&temp);
}
+
+ //get list of list file names from each process
+ for(int i=0;i<processIDS.size();i++){
+ string filename = toString(processIDS[i]) + ".temp";
+ ifstream in;
+ m->openInputFile(filename, in);
+
+ in >> tag; m->gobble(in);
+
+ while(!in.eof()) {
+ string tempName;
+ in >> tempName; m->gobble(in);
+ listFiles.push_back(tempName);
+ }
+ in.close();
+ m->mothurRemove((toString(processIDS[i]) + ".temp"));
+
+ //get labels
+ filename = toString(processIDS[i]) + ".temp.labels";
+ ifstream in2;
+ m->openInputFile(filename, in2);
+
+ float tempCutoff;
+ in2 >> tempCutoff; m->gobble(in2);
+ if (tempCutoff < cutoff) { cutoff = tempCutoff; }
+
+ while(!in2.eof()) {
+ string tempName;
+ in2 >> tempName; m->gobble(in2);
+ if (labels.count(tempName) == 0) { labels.insert(tempName); }
+ }
+ in2.close();
+ m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
+ }
+
+
+ #else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the clusterData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to allow both threads to add labels.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
- return exitCommand;
+ vector<clusterData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=1; i<processors; i++ ){
+ // Allocate memory for thread data.
+ clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
+ pDataArray.push_back(tempCluster);
+ processIDS.push_back(i);
+
+ //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
+
+ }
+
+ //do your part
+ listFiles = cluster(dividedNames[0], labels);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ //get tag
+ tag = pDataArray[i]->tag;
+ //get listfiles created
+ for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
+ //get labels
+ set<string>::iterator it;
+ for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
+ //check cutoff
+ if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
#endif
+
+ return listFiles;
}
catch(exception& e) {
vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
try {
- Cluster* cluster;
- SparseMatrix* matrix;
- ListVector* list;
- ListVector oldList;
- RAbundVector* rabund;
vector<string> listFileNames;
-
double smallestCutoff = cutoff;
//cluster each distance file
for (int i = 0; i < distNames.size(); i++) {
+
+ Cluster* cluster = NULL;
+ SparseMatrix* matrix = NULL;
+ ListVector* list = NULL;
+ ListVector oldList;
+ RAbundVector* rabund = NULL;
+
if (m->control_pressed) { return listFileNames; }
string thisNamefile = distNames[i].begin()->second;
oldList = *list;
matrix = read->getMatrix();
- delete read;
- delete nameMap;
+ delete read; read = NULL;
+ delete nameMap; nameMap = NULL;
#ifdef USE_MPI
}
delete matrix; delete list; delete cluster; delete rabund;
+ matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
listFile.close();
if (m->control_pressed) { //clean up
#include "listvector.hpp"
#include "cluster.hpp"
#include "sparsematrix.hpp"
-
+#include "readcluster.h"
+#include "splitmatrix.h"
+#include "readphylip.h"
+#include "readcolumn.h"
+#include "readmatrix.hpp"
+#include "inputdata.h"
+#include "clustercommand.h"
class ClusterSplitCommand : public Command {
ofstream outList, outRabund, outSabund;
void printData(ListVector*);
- int createProcesses(vector < vector < map<string, string> > >);
+ vector<string> createProcesses(vector< map<string, string> >, set<string>&);
vector<string> cluster(vector< map<string, string> >, set<string>&);
int mergeLists(vector<string>, map<float, int>, ListVector*);
map<float, int> completeListFile(vector<string>, string, set<string>&, ListVector*&);
int createMergedDistanceFile(vector< map<string, string> >);
};
+/////////////////not working for Windows////////////////////////////////////////////////////////////
+// Getting an access violation error. This is most likely caused by the
+// threads stepping on each other's structures, as I can run the thread function and the cluster function
+// separately without errors occurring. I suspect it may be in the use of the
+// static class mothurOut, but I can't pinpoint the problem. All other objects are made new
+// within the thread. MothurOut is used by almost all the classes in mothur, so if this was
+// really the cause I would expect to see all the windows threaded commands have issues, but not
+// all do. So far, shhh.flows and trim.flows have similar problems. Another thought: could it have
+// anything to do with mothur's use of copy constructors in many of our data structures? i.e. listvector
+// is copied by nameassignment and passed to read, which passes it to the thread. -westcott 2-8-12
+////////////////////////////////////////////////////////////////////////////////////////////////////
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct clusterData {
+	set<string> labels;							//filled by the thread: every label it printed to a list file
+	vector < map<string, string> > distNames;	//one map per distance file; begin() holds (distance file -> name file)
+	string method;								//"furthest", "nearest" or "average"
+	MothurOut* m;
+	double cutoff, precision;					//cutoff is overwritten by the thread with the smallest cutoff it used
+	string tag, outputDir;						//tag is set by the thread from the cluster object's getTag()
+	vector<string> listFiles;					//filled by the thread: names of the list files it created
+	bool hard;									//true -> use ceilDist, false -> use roundDist
+	int length, threadID;
+	
+	
+	//give every plain-old-data member a defined value so a default-built object is safe to read
+	clusterData() : m(NULL), cutoff(0.0), precision(0.0), hard(false), length(0), threadID(0) {}
+	clusterData(vector < map<string, string> > dv, MothurOut* mout, double cu, string me, string ou, bool hd, double pre, int len, int th)
+		: distNames(dv), method(me), m(mout), cutoff(cu), precision(pre), outputDir(ou), hard(hd), length(len), threadID(th) {}
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+//Thread entry point: clusters every distance file listed in the clusterData passed through lpParam.
+//For each file it reads the column-formatted distances and the name file, clusters with the requested
+//method and writes a <root><tag>.list file. The list file names, the labels printed and the smallest
+//cutoff used are handed back through the same clusterData. Returns 0; exits the process on an exception.
+static DWORD WINAPI MyClusterThreadFunction(LPVOID lpParam){
+	clusterData* pDataArray;
+	pDataArray = (clusterData*)lpParam;
+	
+	try {
+		cout << "starting " << endl;
+		
+		double smallestCutoff = pDataArray->cutoff;
+		
+		//cluster each distance file
+		for (int i = 0; i < pDataArray->distNames.size(); i++) {
+			
+			Cluster* mycluster = NULL;
+			SparseMatrix* mymatrix = NULL;
+			ListVector* mylist = NULL;
+			ListVector myoldList;
+			RAbundVector* myrabund = NULL;
+			
+			if (pDataArray->m->control_pressed) { break; }
+			
+			string thisNamefile = pDataArray->distNames[i].begin()->second;
+			string thisDistFile = pDataArray->distNames[i].begin()->first;
+			cout << thisNamefile << '\t' << thisDistFile << endl;
+			pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Reading " + thisDistFile); pDataArray->m->mothurOutEndLine();
+			
+			ReadMatrix* myread = new ReadColumnMatrix(thisDistFile);
+			myread->setCutoff(pDataArray->cutoff);
+			NameAssignment* mynameMap = new NameAssignment(thisNamefile);
+			mynameMap->readMap();
+			cout << "done reading " << thisNamefile << endl;
+			myread->read(mynameMap);
+			cout << "done reading " << thisDistFile << endl;
+			if (pDataArray->m->control_pressed) { delete myread; delete mynameMap; break; }
+			
+			mylist = myread->getListVector();
+			myoldList = *mylist;
+			mymatrix = myread->getMatrix();
+			cout << "here" << endl;
+			delete myread; myread = NULL;
+			delete mynameMap; mynameMap = NULL;
+			
+			pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Clustering " + thisDistFile); pDataArray->m->mothurOutEndLine();
+			
+			myrabund = new RAbundVector(mylist->getRAbundVector());
+			cout << "here" << endl;
+			//create cluster
+			if (pDataArray->method == "furthest")		{ mycluster = new CompleteLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); }
+			else if(pDataArray->method == "nearest")	{ mycluster = new SingleLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); }
+			else if(pDataArray->method == "average")	{ mycluster = new AverageLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); }
+			else {
+				//no cluster object was built, so the getTag() call below would dereference NULL
+				pDataArray->m->mothurOut("[ERROR]: " + pDataArray->method + " is not a valid clustering method."); pDataArray->m->mothurOutEndLine();
+				delete mymatrix; delete mylist; delete myrabund;
+				pDataArray->m->control_pressed = true;
+				break;
+			}
+			pDataArray->tag = mycluster->getTag();
+			cout << "here" << endl;
+			if (pDataArray->outputDir == "") { pDataArray->outputDir += pDataArray->m->hasPath(thisDistFile); }
+			string fileroot = pDataArray->outputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisDistFile));
+			cout << "here" << endl;
+			ofstream listFile;
+			pDataArray->m->openOutputFile(fileroot+ pDataArray->tag + ".list", listFile);
+			cout << "here" << endl;
+			pDataArray->listFiles.push_back(fileroot+ pDataArray->tag + ".list");
+			
+			float previousDist = 0.00000;
+			float rndPreviousDist = 0.00000;
+			
+			myoldList = *mylist;
+			
+			double saveCutoff = pDataArray->cutoff;
+			
+			//set when the user interrupts in the middle of clustering this file
+			bool interrupted = false;
+			
+			while (mymatrix->getSmallDist() < pDataArray->cutoff && mymatrix->getNNodes() > 0){
+				
+				//only record the interruption here; the single cleanup path after the loop frees the
+				//objects exactly once (freeing them here as well would delete them twice)
+				if (pDataArray->m->control_pressed) { interrupted = true; break; }
+				
+				mycluster->update(saveCutoff);
+				
+				float dist = mymatrix->getSmallDist();
+				float rndDist;
+				if (pDataArray->hard) {
+					rndDist = pDataArray->m->ceilDist(dist, pDataArray->precision);
+				}else{
+					rndDist = pDataArray->m->roundDist(dist, pDataArray->precision);
+				}
+				
+				if(previousDist <= 0.0000 && dist != previousDist){
+					myoldList.setLabel("unique");
+					myoldList.print(listFile);
+					if (pDataArray->labels.count("unique") == 0) { pDataArray->labels.insert("unique"); }
+				}
+				else if(rndDist != rndPreviousDist){
+					myoldList.setLabel(toString(rndPreviousDist, pDataArray->length-1));
+					myoldList.print(listFile);
+					if (pDataArray->labels.count(toString(rndPreviousDist, pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist, pDataArray->length-1)); }
+				}
+				
+				previousDist = dist;
+				rndPreviousDist = rndDist;
+				myoldList = *mylist;
+			}
+			
+			cout << "here2" << endl;
+			//print the last label, unless we were interrupted and the file is about to be removed anyway
+			if (!interrupted) {
+				if(previousDist <= 0.0000){
+					myoldList.setLabel("unique");
+					myoldList.print(listFile);
+					if (pDataArray->labels.count("unique") == 0) { pDataArray->labels.insert("unique"); }
+				}
+				else if(rndPreviousDist<pDataArray->cutoff){
+					myoldList.setLabel(toString(rndPreviousDist, pDataArray->length-1));
+					myoldList.print(listFile);
+					if (pDataArray->labels.count(toString(rndPreviousDist, pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist, pDataArray->length-1)); }
+				}
+			}
+			
+			delete mymatrix; delete mylist; delete mycluster; delete myrabund;
+			mymatrix = NULL; mylist = NULL; mycluster = NULL; myrabund = NULL;
+			listFile.close();
+			
+			if (interrupted || pDataArray->m->control_pressed) { //clean up
+				for (int j = 0; j < pDataArray->listFiles.size(); j++) { pDataArray->m->mothurRemove(pDataArray->listFiles[j]); }
+				pDataArray->listFiles.clear(); break;
+			}
+			cout << "here3" << endl;
+			pDataArray->m->mothurRemove(thisDistFile);
+			pDataArray->m->mothurRemove(thisNamefile);
+			cout << "here4" << endl;
+			if (saveCutoff != pDataArray->cutoff) {
+				if (pDataArray->hard)	{ saveCutoff = pDataArray->m->ceilDist(saveCutoff, pDataArray->precision); }
+				else					{ saveCutoff = pDataArray->m->roundDist(saveCutoff, pDataArray->precision); }
+				
+				pDataArray->m->mothurOut("Cutoff was " + toString(pDataArray->cutoff) + " changed cutoff to " + toString(saveCutoff)); pDataArray->m->mothurOutEndLine();
+			}
+			cout << "here5" << endl;
+			if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
+		}
+		
+		//report the smallest cutoff used back to the caller
+		pDataArray->cutoff = smallestCutoff;
+		
+		return 0;
+		
+	}
+	catch(exception& e) {
+		pDataArray->m->errorOut(e, "ClusterSplitCommand", "MyClusterThreadFunction");
+		exit(1);
+	}
+}
+#endif
+
+
+
+
#endif
#include "shhhseqscommand.h"
#include "summaryqualcommand.h"
#include "otuassociationcommand.h"
+#include "sortseqscommand.h"
+#include "classifytreecommand.h"
+#include "cooccurrencecommand.h"
+#include "pcrseqscommand.h"
+#include "createdatabasecommand.h"
/*******************************************************/
commands["summary.qual"] = "summary.qual";
commands["shhh.seqs"] = "shhh.seqs";
commands["otu.association"] = "otu.association";
+ commands["sort.seqs"] = "sort.seqs";
+ commands["classify.tree"] = "classify.tree";
+ commands["cooccurrence"] = "cooccurrence";
+ commands["pcr.seqs"] = "pcr.seqs";
+ commands["create.database"] = "create.database";
commands["quit"] = "MPIEnabled";
}
//This function calls the appropriate command fucntions based on user input.
Command* CommandFactory::getCommand(string commandName, string optionString){
try {
+
delete command; //delete the old command
//user has opted to redirect output from dir where input files are located to some other place
else if(commandName == "chimera.perseus") { command = new ChimeraPerseusCommand(optionString); }
else if(commandName == "shhh.seqs") { command = new ShhhSeqsCommand(optionString); }
else if(commandName == "otu.association") { command = new OTUAssociationCommand(optionString); }
+ else if(commandName == "sort.seqs") { command = new SortSeqsCommand(optionString); }
+ else if(commandName == "classify.tree") { command = new ClassifyTreeCommand(optionString); }
+ else if(commandName == "cooccurrence") { command = new CooccurrenceCommand(optionString); }
+ else if(commandName == "pcr.seqs") { command = new PcrSeqsCommand(optionString); }
+ else if(commandName == "create.database") { command = new CreateDatabaseCommand(optionString); }
else { command = new NoCommand(optionString); }
return command;
else if(commandName == "chimera.perseus") { pipecommand = new ChimeraPerseusCommand(optionString); }
else if(commandName == "shhh.seqs") { pipecommand = new ShhhSeqsCommand(optionString); }
else if(commandName == "otu.association") { pipecommand = new OTUAssociationCommand(optionString); }
+ else if(commandName == "sort.seqs") { pipecommand = new SortSeqsCommand(optionString); }
+ else if(commandName == "classify.tree") { pipecommand = new ClassifyTreeCommand(optionString); }
+ else if(commandName == "cooccurrence") { pipecommand = new CooccurrenceCommand(optionString); }
+ else if(commandName == "pcr.seqs") { pipecommand = new PcrSeqsCommand(optionString); }
+ else if(commandName == "create.database") { pipecommand = new CreateDatabaseCommand(optionString); }
else { pipecommand = new NoCommand(optionString); }
return pipecommand;
else if(commandName == "chimera.perseus") { shellcommand = new ChimeraPerseusCommand(); }
else if(commandName == "shhh.seqs") { shellcommand = new ShhhSeqsCommand(); }
else if(commandName == "otu.association") { shellcommand = new OTUAssociationCommand(); }
+ else if(commandName == "sort.seqs") { shellcommand = new SortSeqsCommand(); }
+ else if(commandName == "classify.tree") { shellcommand = new ClassifyTreeCommand(); }
+ else if(commandName == "cooccurrence") { shellcommand = new CooccurrenceCommand(); }
+ else if(commandName == "pcr.seqs") { shellcommand = new PcrSeqsCommand(); }
+ else if(commandName == "create.database") { shellcommand = new CreateDatabaseCommand(); }
else { shellcommand = new NoCommand(); }
return shellcommand;
exit(1);
}
}
-/***********************************************************************/
+***********************************************************************/
bool CommandFactory::isValidCommand(string command) {
try {
--- /dev/null
+/*
+ * cooccurrencecommand.cpp
+ * Mothur
+ *
+ * Created by kiverson on 1/2/12.
+ * Copyright 2012 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "cooccurrencecommand.h"
+
+//**********************************************************************************************************************
+//registers every parameter the cooccurrence command accepts and returns their names in registration order
+vector<string> CooccurrenceCommand::setParameters() {
+	try {
+		//input file
+		parameters.push_back(CommandParameter("shared", "InputTypes", "", "", "none", "none", "none",false,true));
+		
+		//analysis options
+		parameters.push_back(CommandParameter("metric", "Multiple", "cscore-checker-combo-vratio", "cscore", "", "", "",false,false));
+		parameters.push_back(CommandParameter("matrixmodel", "Multiple", "sim1-sim2-sim3-sim4-sim5-sim6-sim7-sim8-sim9", "sim2", "", "", "",false,false));
+		parameters.push_back(CommandParameter("iters", "Number", "", "1000", "", "", "",false,false));
+		
+		//directories and filters
+		parameters.push_back(CommandParameter("inputdir", "String", "", "", "", "", "",false,false));
+		parameters.push_back(CommandParameter("outputdir", "String", "", "", "", "", "",false,false));
+		parameters.push_back(CommandParameter("label", "String", "", "", "", "", "",false,false));
+		parameters.push_back(CommandParameter("groups", "String", "", "", "", "", "",false,false));
+		
+		//collect the names of everything registered so far
+		vector<string> names;
+		int numParameters = parameters.size();
+		for (int p = 0; p < numParameters; p++) {
+			names.push_back(parameters[p].name);
+		}
+		return names;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "CooccurrenceCommand", "setParameters");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+//builds the text shown for cooccurrence(help); every sentence ends with a newline so they do not run together
+string CooccurrenceCommand::getHelpString(){
+	try {
+		string helpString = "The cooccurrence command calculates four metrics and tests their significance to assess whether presence-absence patterns are different than what one would expect by chance.\n";
+		helpString += "The cooccurrence command parameters are shared, metric, matrixmodel, iters, label and groups.\n";
+		helpString += "The matrixmodel parameter options are sim1, sim2, sim3, sim4, sim5, sim6, sim7, sim8 and sim9. Default=sim2\n";
+		helpString += "The metric parameter options are cscore, checker, combo and vratio. Default=cscore\n";
+		helpString += "The label parameter is used to analyze specific labels in your input.\n";
+		helpString += "The groups parameter allows you to specify which of the groups you would like analyzed.\n";
+		helpString += "The cooccurrence command should be in the following format: \n";
+		helpString += "cooccurrence(shared=yourSharedFile) \n";
+		helpString += "Example cooccurrence(shared=final.an.shared).\n";
+		helpString += "Note: No spaces between parameter labels (i.e. shared), '=' and parameters (i.e.yourShared).\n";
+		return helpString;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "CooccurrenceCommand", "getHelpString");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+//option-less constructor: marks the command as aborted/help-only and registers its parameters and output type
+CooccurrenceCommand::CooccurrenceCommand(){
+	try {
+		abort = true;
+		calledHelp = true;
+		
+		setParameters();
+		
+		//the command produces a single output type, which starts out empty
+		outputTypes["summary"] = vector<string>();
+	}
+	catch(exception& e) {
+		m->errorOut(e, "CooccurrenceCommand", "CooccurrenceCommand");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+//parses the user's option string, validates every parameter and fills in the defaults
+//(metric=cscore, matrixmodel=sim2, iters=1000). Sets abort when anything is invalid or when
+//help/citation was requested.
+CooccurrenceCommand::CooccurrenceCommand(string option) {
+	try {
+		abort = false; calledHelp = false;
+		allLines = 1;
+		
+		//allow user to run help
+		if(option == "help") { help(); abort = true; calledHelp = true; }
+		else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+		
+		else {
+			vector<string> myArray = setParameters();
+			
+			OptionParser parser(option);
+			map<string,string> parameters = parser.getParameters();
+			map<string,string>::iterator it;
+			
+			ValidParameters validParameter;
+			
+			//check to make sure all parameters are valid for command
+			for (it = parameters.begin(); it != parameters.end(); it++) {
+				if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+			}
+			
+			
+			//if the user changes the input directory command factory will send this info to us in the output parameter
+			string inputDir = validParameter.validFile(parameters, "inputdir", false);
+			if (inputDir == "not found"){ inputDir = ""; }
+			else {
+				string path;
+				it = parameters.find("shared");
+				//user has given a template file
+				if(it != parameters.end()){
+					path = m->hasPath(it->second);
+					//if the user has not given a path then, add inputdir. else leave path alone.
+					if (path == "") { parameters["shared"] = inputDir + it->second; }
+				}
+			}
+			
+			vector<string> tempOutNames;
+			outputTypes["summary"] = tempOutNames;
+			
+			//check for optional parameter and set defaults
+			// ...at some point should add some additional type checking...
+			label = validParameter.validFile(parameters, "label", false);
+			if (label == "not found") { label = ""; }
+			else {
+				if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
+				else { allLines = 1; }
+			}
+			
+			//get shared file
+			sharedfile = validParameter.validFile(parameters, "shared", true);
+			if (sharedfile == "not open") { sharedfile = ""; abort = true; }
+			else if (sharedfile == "not found") {
+				//if there is a current shared file, use it
+				sharedfile = m->getSharedFile();
+				if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
+				else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
+			}else { m->setSharedFile(sharedfile); }
+			
+			
+			//if the user changes the output directory command factory will send this info to us in the output parameter
+			outputDir = validParameter.validFile(parameters, "outputdir", false);		if (outputDir == "not found"){ outputDir = m->hasPath(sharedfile); }
+			
+			
+			metric = validParameter.validFile(parameters, "metric", false);				if (metric == "not found") { metric = "cscore"; }
+			
+			if ((metric != "cscore") && (metric != "checker") && (metric != "combo") && (metric != "vratio")) {
+				m->mothurOut("[ERROR]: " + metric + " is not a valid metric option for the cooccurrence command. Choices are cscore, checker, combo, vratio."); m->mothurOutEndLine(); abort = true;
+			}
+			
+			//the parameter is registered as "matrixmodel" in setParameters(); looking it up under any
+			//other name never finds the user's value and silently falls back to sim2
+			matrix = validParameter.validFile(parameters, "matrixmodel", false);		if (matrix == "not found") { matrix = "sim2"; }
+			
+			if ((matrix != "sim1") && (matrix != "sim2") && (matrix != "sim3") && (matrix != "sim4") && (matrix != "sim5" ) && (matrix != "sim6" ) && (matrix != "sim7" ) && (matrix != "sim8" ) && (matrix != "sim9" )) {
+				m->mothurOut("[ERROR]: " + matrix + " is not a valid matrix option for the cooccurrence command. Choices are sim1, sim2, sim3, sim4, sim5, sim6, sim7, sim8, sim9."); m->mothurOutEndLine(); abort = true;
+			}
+			
+			groups = validParameter.validFile(parameters, "groups", false);
+			if (groups == "not found") { groups = ""; }
+			else {
+				m->splitAtDash(groups, Groups);
+			}
+			m->setGroups(Groups);
+			
+			string temp = validParameter.validFile(parameters, "iters", false);			if (temp == "not found") { temp = "1000"; }
+			m->mothurConvert(temp, runs);
+			
+		}
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "CooccurrenceCommand", "CooccurrenceCommand");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+
+// Reads the shared file label by label, runs getCooccurrence() on every requested label and
+// writes one line per processed label to <root>cooccurence.summary.
+// Returns 2 if the command was aborted, 0 otherwise (including help requests and user interrupts).
+int CooccurrenceCommand::execute(){
+ try {
+ 
+ // nothing to do if the constructor flagged a problem or only help/citation was requested
+ if (abort == true) { if (calledHelp) { return 0; } return 2; }
+ 
+ InputData* input = new InputData(sharedfile, "sharedfile");
+ vector<SharedRAbundVector*> lookup = input->getSharedRAbundVectors();
+ string lastLabel = lookup[0]->getLabel();
+ 
+ //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+ set<string> processedLabels;
+ set<string> userLabels = labels;
+
+ // open the summary file and write its header row
+ ofstream out;
+ string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "cooccurence.summary";
+ m->openOutputFile(outputFileName, out);
+ outputNames.push_back(outputFileName); outputTypes["summary"].push_back(outputFileName);
+ out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
+ out << "metric\tlabel\tScore\tpValue\n";
+
+ //as long as you are not at the end of the file or done with the lines you want
+ while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+ 
+ // user interrupt: free what we hold and remove the partial output
+ if (m->control_pressed) { for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } delete input; out.close(); m->mothurRemove(outputFileName); return 0; }
+ 
+ // this label was requested (or all labels were), so process it
+ if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){ 
+
+ m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+ 
+ getCooccurrence(lookup, out);
+ 
+ processedLabels.insert(lookup[0]->getLabel());
+ userLabels.erase(lookup[0]->getLabel());
+ }
+ 
+ // a requested label was skipped over: go back and process the previous label in its place
+ if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = lookup[0]->getLabel();
+ 
+ for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } 
+ lookup = input->getSharedRAbundVectors(lastLabel);
+ m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+ getCooccurrence(lookup, out);
+ 
+ processedLabels.insert(lookup[0]->getLabel());
+ userLabels.erase(lookup[0]->getLabel());
+ 
+ //restore real lastlabel to save below
+ lookup[0]->setLabel(saveLabel);
+ }
+ 
+ lastLabel = lookup[0]->getLabel();
+ //prevent memory leak
+ for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
+ 
+ if (m->control_pressed) { outputTypes.clear(); delete input; out.close(); m->mothurRemove(outputFileName); return 0; }
+
+ //get next line to process
+ lookup = input->getSharedRAbundVectors();
+ }
+ 
+ if (m->control_pressed) { delete input; out.close(); m->mothurRemove(outputFileName); return 0; }
+
+ //output error messages about any remaining user labels
+ set<string>::iterator it;
+ bool needToRun = false;
+ for (it = userLabels.begin(); it != userLabels.end(); it++) { 
+ m->mothurOut("Your file does not include the label " + *it); 
+ if (processedLabels.count(lastLabel) != 1) {
+ m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+ needToRun = true;
+ }else {
+ m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+ }
+ }
+ 
+ //run last label if you need to
+ if (needToRun == true) {
+ for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } } 
+ lookup = input->getSharedRAbundVectors(lastLabel);
+ 
+ m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
+ 
+ getCooccurrence(lookup, out);
+ 
+ for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+ }
+ 
+ out.close(); 
+ 
+ //reset groups parameter
+ delete input; 
+ m->clearGroups(); 
+
+ // tell the user where the results went
+ m->mothurOutEndLine();
+ m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+ m->mothurOut(outputFileName); m->mothurOutEndLine(); 
+ m->mothurOutEndLine();
+ 
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "CooccurrenceCommand", "execute");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+//applies one randomization step of the requested null model to the working matrices.
+//returns false, without touching anything, when the model name is not one of sim1-sim9.
+static bool runNullModel(TrialSwap2& trial, const string& model, vector<int>& columntotal, vector<int>& rowtotal, vector<int>& initcolumntotal, vector<int>& initrowtotal, vector< vector<int> >& initmatrix, vector< vector<int> >& co_matrix){
+	if (model == "sim1") {
+		trial.sim1(co_matrix);
+	}else if (model == "sim2") {
+		trial.sim2(co_matrix);
+	}else if (model == "sim3") {
+		trial.sim3(initmatrix);
+		co_matrix = initmatrix;
+	}else if (model == "sim4") {
+		trial.sim4(columntotal, rowtotal, co_matrix);
+	}else if (model == "sim5") {
+		trial.sim5(initcolumntotal, initrowtotal, initmatrix);
+		trial.transpose_matrix(initmatrix,co_matrix);
+	}else if (model == "sim6") {
+		trial.sim6(columntotal, co_matrix);
+	}else if (model == "sim7") {
+		trial.sim7(initcolumntotal, initmatrix);
+		co_matrix = initmatrix;
+	}else if (model == "sim8") {
+		trial.sim8(columntotal, rowtotal, co_matrix);
+	}else if (model == "sim9") {
+		trial.swap_checkerboards (co_matrix);
+	}else{
+		return false;
+	}
+	return true;
+}
+//**********************************************************************************************************************
+//Builds the presence/absence matrix for one label, scores it with the chosen metric, then scores
+//'runs' randomized matrices (after a 10000 step burn in) and writes
+//metric, label, mean null score and p-value to out.
+//Returns 0 normally or when interrupted, 1 when the metric name is not recognized.
+int CooccurrenceCommand::getCooccurrence(vector<SharedRAbundVector*>& thisLookUp, ofstream& out){
+	try {
+		int numOTUS = thisLookUp[0]->getNumBins();
+		
+		//initmatrix is indexed [group][otu]; co_matrix is its transpose, indexed [otu][group]
+		vector< vector<int> > initmatrix; initmatrix.resize(thisLookUp.size());
+		vector< vector<int> > co_matrix; co_matrix.resize(numOTUS);
+		for (int i = 0; i < numOTUS; i++) { co_matrix[i].resize((thisLookUp.size()), 0); }
+		for (int i = 0; i < thisLookUp.size(); i++) { initmatrix[i].resize((thisLookUp[i]->getNumBins()), 0); }
+		
+		//totals for co_matrix: one row per otu, one column per group
+		vector<int> columntotal; columntotal.resize(thisLookUp.size(), 0);
+		vector<int> rowtotal; rowtotal.resize(numOTUS, 0);
+		
+		for (int i = 0; i < thisLookUp.size(); i++) {
+			for (int j = 0; j < thisLookUp[i]->getNumBins(); j++) {
+				if (m->control_pressed) { return 0; }
+				int abund = thisLookUp[i]->getAbundance(j);
+				
+				if(abund > 0) {
+					initmatrix[i][j] = 1;
+					co_matrix[j][i] = 1;
+					//index each total by the dimension it was sized for: otu j is a row of co_matrix and
+					//group i is a column. Swapping these writes past the end of whichever vector is shorter.
+					rowtotal[j]++;
+					columntotal[i]++;
+				}
+			}
+		}
+		
+		//All the functions assume the transposition has already taken place, so their rows are otus and their columns are groups.
+		//comatrix and initmatrix are still vectors of vectors of ints as in the original script. The abundancevector is only what was read in ie not a co-occurrence matrix!
+		double initscore = 0.0;
+		
+		//a row of co_matrix is a column of initmatrix and vice versa
+		vector<int> initcolumntotal = rowtotal;
+		vector<int> initrowtotal = columntotal;
+		vector<double> stats;
+		
+		TrialSwap2 trial;
+		
+		trial.update_row_col_totals(co_matrix, rowtotal, columntotal);
+		
+		if (metric == "cscore")			{ initscore = trial.calc_c_score(co_matrix, rowtotal); }
+		else if (metric == "checker")	{ initscore = trial.calc_checker(co_matrix, rowtotal); }
+		else if (metric == "vratio")	{ initscore = trial.calc_vratio(rowtotal, columntotal); }
+		else if (metric == "combo")		{ initscore = trial.calc_combo(co_matrix); }
+		else { m->mothurOut("[ERROR]: No metric selected!\n"); m->control_pressed = true; return 1; }
+		
+		m->mothurOut("Initial c score: " + toString(initscore)); m->mothurOutEndLine();
+		
+		//nullmatrix burn in
+		for(int i=0;i<10000;i++) {
+			if (m->control_pressed) { return 0; }
+			if (!runNullModel(trial, matrix, columntotal, rowtotal, initcolumntotal, initrowtotal, initmatrix, co_matrix)) {
+				m->mothurOut("[ERROR]: No model selected! \n");
+				m->control_pressed = true;
+			}
+		}
+		
+		//run
+		for(int i=0;i<runs;i++) {
+			if (m->control_pressed) { return 0; }
+			//calc metric of nullmatrix
+			if (!runNullModel(trial, matrix, columntotal, rowtotal, initcolumntotal, initrowtotal, initmatrix, co_matrix)) {
+				m->mothurOut("[ERROR]: No model selected! \n");
+				m->control_pressed = true;
+			}
+			
+			trial.update_row_col_totals(co_matrix, rowtotal, columntotal);
+			
+			if (metric == "cscore") {
+				stats.push_back(trial.calc_c_score(co_matrix, rowtotal));
+			}else if (metric == "checker") {
+				stats.push_back(trial.calc_checker(co_matrix, rowtotal));
+			}else if (metric == "vratio") {
+				stats.push_back(trial.calc_vratio(rowtotal, columntotal));
+			}else if (metric == "combo") {
+				stats.push_back(trial.calc_combo(co_matrix));
+			}else {
+				m->mothurOut("[ERROR]: No metric selected!\n");
+				m->control_pressed = true;
+				return 1;
+			}
+			
+		}
+		
+		//mean score of the randomized matrices
+		double total = 0.0;
+		for (int i=0; i<stats.size();i++) { total+=stats[i]; }
+		
+		double nullMean = double (total/(double)stats.size());
+		
+		m->mothurOutEndLine(); m->mothurOut("average metric score: " + toString(nullMean)); m->mothurOutEndLine();
+		
+		double pvalue = 0.0;
+		if (metric == "cscore" || metric == "checker") { pvalue = trial.calc_pvalue_greaterthan (stats, initscore); }
+		else{ pvalue = trial.calc_pvalue_lessthan (stats, initscore); }
+		
+		m->mothurOut("pvalue: " + toString(pvalue)); m->mothurOutEndLine();
+		out << metric << '\t' << thisLookUp[0]->getLabel() << '\t' << nullMean << '\t' << pvalue << endl;
+		
+		return 0;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "CooccurrenceCommand", "getCooccurrence");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+
+
--- /dev/null
+#ifndef COOCCURRENCECOMMAND_H
+#define COOCCURRENCECOMMAND_H
+
+/*
+ * COOCCURRENCE.h
+ * Mothur
+ *
+ * Created by westcott on 11/10/10.
+ * Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "command.hpp"
+#include "trialswap2.h"
+#include "inputdata.h"
+#include "sharedrabundvector.h"
+
+
+//Command that tests whether the presence-absence patterns in a shared file differ from
+//what randomized (null model) matrices produce.
+class CooccurrenceCommand : public Command {
+	
+public:
+	
+	CooccurrenceCommand(string);	//parses and validates the user's option string
+	CooccurrenceCommand();			//help-only instance; execute() does nothing useful on it
+	~CooccurrenceCommand(){}
+	
+	vector<string> setParameters();
+	//matches the name the command is registered and dispatched under ("cooccurrence")
+	string getCommandName()			{ return "cooccurrence";			}
+	string getCommandCategory()		{ return "Hypothesis Testing";		}
+	string getHelpString();
+	string getCitation() { return "http://www.mothur.org/wiki/Cooccurrence"; }
+	string getDescription()		{ return "calculates four metrics and tests their significance to assess whether presence-absence patterns are different than what one would expect by chance."; }
+	
+	int execute();
+	void help() { m->mothurOut(getHelpString()); }
+	
+	
+private:
+	string metric, matrix, outputDir;		//matrix holds the chosen null model (sim1-sim9)
+	string label, sharedfile, groups;
+	bool abort, allLines;
+	set<string> labels;						//labels requested by the user; unused when allLines is set
+	vector<string> outputNames, Groups;
+	int runs;								//number of randomized matrices to score (iters parameter)
+	
+	//scores one label and writes its summary line to the stream
+	int getCooccurrence(vector<SharedRAbundVector*>&, ofstream&);
+	
+};
+
+#endif
+
+
#include "corraxescommand.h"
#include "sharedutilities.h"
+#include "linearalgebra.h"
//**********************************************************************************************************************
vector<string> CorrAxesCommand::setParameters(){
int CorrAxesCommand::calcPearson(map<string, vector<float> >& axes, ofstream& out) {
try {
+ LinearAlgebra linear;
+
//find average of each axis - X
vector<float> averageAxes; averageAxes.resize(numaxes, 0.0);
for (map<string, vector<float> >::iterator it = axes.begin(); it != axes.end(); it++) {
//for each otu
for (int i = 0; i < lookupFloat[0]->getNumBins(); i++) {
- if (metadatafile == "") { out << i+1; }
+ if (metadatafile == "") { out << m->currentBinLabels[i]; }
else { out << metadataLabels[i]; }
//find the averages this otu - Y
rValues[k] = r;
out << '\t' << r;
- //signifigance calc - http://faculty.vassar.edu/lowry/ch4apx.html
- double temp = (1- (r*r)) / (double) (lookupFloat.size()-2);
- temp = sqrt(temp);
- double sig = r / temp;
- if (isnan(sig) || isinf(sig)) { sig = 0.0; }
+ double sig = linear.calcPearsonSig(lookupFloat.size(), r);
out << '\t' << sig;
}
int CorrAxesCommand::calcSpearman(map<string, vector<float> >& axes, ofstream& out) {
try {
+ LinearAlgebra linear;
+ vector<double> sf;
+
//format data
vector< map<float, int> > tableX; tableX.resize(numaxes);
map<float, int>::iterator itTable;
vector<spearmanRank> ties;
int rankTotal = 0;
+ double sfTemp = 0.0;
for (int j = 0; j < scores[i].size(); j++) {
rankTotal += (j+1);
ties.push_back(scores[i][j]);
float thisrank = rankTotal / (float) ties.size();
rankAxes[ties[k].name].push_back(thisrank);
}
+ int t = ties.size();
+ sfTemp += (t*t*t-t);
ties.clear();
rankTotal = 0;
}
}
}
}
+ sf.push_back(sfTemp);
}
//for each otu
for (int i = 0; i < lookupFloat[0]->getNumBins(); i++) {
- if (metadatafile == "") { out << i+1; }
+ if (metadatafile == "") { out << m->currentBinLabels[i]; }
else { out << metadataLabels[i]; }
//find the ranks of this otu - Y
sort(otuScores.begin(), otuScores.end(), compareSpearman);
+ double sg = 0.0;
map<string, float> rankOtus;
vector<spearmanRank> ties;
int rankTotal = 0;
float thisrank = rankTotal / (float) ties.size();
rankOtus[ties[k].name] = thisrank;
}
+ int t = ties.size();
+ sg += (t*t*t-t);
ties.clear();
rankTotal = 0;
}
pValues[j] = p;
- //signifigance calc - http://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient
- double temp = (lookupFloat.size()-2) / (double) (1- (p*p));
- temp = sqrt(temp);
- double sig = p*temp;
- if (isnan(sig) || isinf(sig)) { sig = 0.0; }
-
+ double sig = linear.calcSpearmanSig(n, sf[j], sg, di);
out << '\t' << sig;
}
int CorrAxesCommand::calcKendall(map<string, vector<float> >& axes, ofstream& out) {
try {
+ LinearAlgebra linear;
+
//format data
vector< vector<spearmanRank> > scores; scores.resize(numaxes);
for (map<string, vector<float> >::iterator it = axes.begin(); it != axes.end(); it++) {
//for each otu
for (int i = 0; i < lookupFloat[0]->getNumBins(); i++) {
- if (metadatafile == "") { out << i+1; }
+ if (metadatafile == "") { out << m->currentBinLabels[i]; }
else { out << metadataLabels[i]; }
//find the ranks of this otu - Y
out << '\t' << p;
pValues[j] = p;
- //calc signif - zA - http://en.wikipedia.org/wiki/Kendall_tau_rank_correlation_coefficient#Significance_tests
- double numer = 3.0 * (numCoor - numDisCoor);
- int n = scores[j].size();
- double denom = n * (n-1) * (2*n + 5) / (double) 2.0;
- denom = sqrt(denom);
- double sig = numer / denom;
-
- if (isnan(sig) || isinf(sig)) { sig = 0.0; }
+ double sig = linear.calcKendallSig(scores[j].size(), p);
out << '\t' << sig;
}
//open input file
ifstream in;
m->openInputFile(namefile, in);
-
+
+ int total = 0;
while (!in.eof()) {
if (m->control_pressed) { break; }
out << firstCol << '\t' << names.size() << endl;
}
-
+ total += names.size();
}
in.close();
if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
+ m->mothurOutEndLine();
+ m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine();
m->mothurOutEndLine();
m->mothurOut("Output File Name: "); m->mothurOutEndLine();
m->mothurOut(outputFileName); m->mothurOutEndLine();
--- /dev/null
+//
+// createdatabasecommand.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 3/28/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "createdatabasecommand.h"
+#include "inputdata.h"
+
+//**********************************************************************************************************************
+// Registers every parameter accepted by create.database and returns the
+// parameter names in registration order.
+vector<string> CreateDatabaseCommand::setParameters(){
+    try {
+        //input files
+        parameters.push_back(CommandParameter("repfasta", "InputTypes", "", "", "none", "none", "none",false,true));
+        parameters.push_back(CommandParameter("repname", "InputTypes", "", "", "none", "none", "none",false,true));
+        parameters.push_back(CommandParameter("contaxonomy", "InputTypes", "", "", "none", "none", "none",false,true));
+        parameters.push_back(CommandParameter("list", "InputTypes", "", "", "none", "none", "none",false,true));
+        parameters.push_back(CommandParameter("group", "InputTypes", "", "", "none", "none", "none",false,false));
+
+        //plain string options
+        parameters.push_back(CommandParameter("label", "String", "", "", "", "", "",false,false));
+        parameters.push_back(CommandParameter("inputdir", "String", "", "", "", "", "",false,false));
+        parameters.push_back(CommandParameter("outputdir", "String", "", "", "", "", "",false,false));
+
+        //collect the names of everything registered so far
+        vector<string> paramNames;
+        const int numParams = parameters.size();
+        for (int p = 0; p < numParams; p++) {
+            paramNames.push_back(parameters[p].name);
+        }
+        return paramNames;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CreateDatabaseCommand", "setParameters");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Builds the text shown by help for create.database.
+// The parameter names used here must match the ones registered in
+// setParameters() (repfasta, repname, contaxonomy, list, group, label),
+// because any other name is rejected when the option string is validated.
+string CreateDatabaseCommand::getHelpString(){
+    try {
+        string helpString = "";
+        helpString += "The create.database command reads a listfile, *.cons.taxonomy, *.rep.fasta, *.rep.names and optional groupfile, and creates a database file.\n";
+        helpString += "The create.database command parameters are repfasta, list, repname, contaxonomy, group and label. List, repfasta, repname, and contaxonomy are required.\n";
+        helpString += "The repfasta file is the fasta file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n";
+        helpString += "The repname file is the name file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n";
+        helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile).\n";
+        helpString += "The group file is optional and will just give you the abundance breakdown by group.\n";
+        helpString += "The label parameter allows you to specify a label to be used from your listfile.\n";
+        helpString += "NOTE: Make SURE the repfasta, repname and contaxonomy are for the same label as the listfile.\n";
+        helpString += "The create.database command should be in the following format: \n";
+        helpString += "create.database(repfasta=yourFastaFileFromGetOTURep, repname=yourNameFileFromGetOTURep, contaxonomy=yourConTaxFileFromClassifyOTU, list=yourListFile) \n";
+        //the example must use repname=, not name=, since name is not a parameter of this command
+        helpString += "Example: create.database(repfasta=final.an.0.03.rep.fasta, repname=final.an.0.03.rep.names, list=final.an.list, label=0.03, contaxonomy=final.an.0.03.cons.taxonomy) \n";
+        helpString += "Note: No spaces between parameter labels (i.e. repfasta), '=' and parameters (i.e.yourFastaFileFromGetOTURep).\n";
+        return helpString;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CreateDatabaseCommand", "getHelpString");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Option-less constructor: registers the parameters and output types, and
+// leaves the command flagged as aborted/help so execute() returns at once.
+CreateDatabaseCommand::CreateDatabaseCommand(){
+    try {
+        calledHelp = true;
+        abort = true;
+
+        setParameters();
+
+        //this command produces a single output type
+        outputTypes["database"] = vector<string>();
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CreateDatabaseCommand", "CreateDatabaseCommand");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+// Parses the option string for create.database.
+// "help" and "citation" print their text and mark the command as aborted.
+// Otherwise every parameter is validated, the input directory is prepended to
+// file parameters given without a path, and the file name members are filled
+// in. abort is set when a required file is missing or cannot be opened.
+CreateDatabaseCommand::CreateDatabaseCommand(string option)  {
+    try{
+        abort = false; calledHelp = false;
+
+        //allow user to run help
+        if (option == "help") {
+            help(); abort = true; calledHelp = true;
+        }else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+        else {
+            vector<string> myArray = setParameters();
+
+            OptionParser parser(option);
+            map<string, string> parameters = parser.getParameters();
+
+            ValidParameters validParameter;
+            map<string, string>::iterator it;
+
+            //check to make sure all parameters are valid for command
+            for (it = parameters.begin(); it != parameters.end(); it++) {
+                if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+            }
+
+            //initialize outputTypes
+            vector<string> tempOutNames;
+            outputTypes["database"] = tempOutNames;
+
+            //if the user changes the input directory command factory will send this info to us in the output parameter
+            string inputDir = validParameter.validFile(parameters, "inputdir", false);
+            if (inputDir == "not found"){ inputDir = ""; }
+            else {
+                //every file parameter gets the same treatment:
+                //if the user has not given a path then, add inputdir. else leave path alone.
+                const char* fileParameters[] = { "list", "repname", "contaxonomy", "repfasta", "group" };
+                const int numFileParameters = sizeof(fileParameters) / sizeof(fileParameters[0]);
+                for (int i = 0; i < numFileParameters; i++) {
+                    it = parameters.find(fileParameters[i]);
+                    if (it != parameters.end()) { //user has given this file
+                        string path = m->hasPath(it->second);
+                        if (path == "") { it->second = inputDir + it->second; }
+                    }
+                }
+            }
+
+            //if the user changes the output directory command factory will send this info to us in the output parameter
+            outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+
+            //check for required parameters
+            listfile = validParameter.validFile(parameters, "list", true);
+            if (listfile == "not found") {
+                //if there is a current list file, use it
+                listfile = m->getListFile();
+                if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
+                else { m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
+            }
+            else if (listfile == "not open") { abort = true; }
+            else { m->setListFile(listfile); }
+
+            //contaxonomy, repfasta and repname have no "current file" fallback, they must be given
+            contaxonomyfile = validParameter.validFile(parameters, "contaxonomy", true);
+            if (contaxonomyfile == "not found") {
+                contaxonomyfile = ""; m->mothurOut("The contaxonomy parameter is required, aborting."); m->mothurOutEndLine(); abort = true;
+            }
+            else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
+
+            repfastafile = validParameter.validFile(parameters, "repfasta", true);
+            if (repfastafile == "not found") {
+                repfastafile = ""; m->mothurOut("The repfasta parameter is required, aborting."); m->mothurOutEndLine(); abort = true;
+            }
+            else if (repfastafile == "not open") { repfastafile = ""; abort = true; }
+
+            repnamesfile = validParameter.validFile(parameters, "repname", true);
+            if (repnamesfile == "not found") {
+                //name the parameter exactly as the user has to type it
+                repnamesfile = ""; m->mothurOut("The repname parameter is required, aborting."); m->mothurOutEndLine(); abort = true;
+            }
+            else if (repnamesfile == "not open") { repnamesfile = ""; abort = true; }
+
+            //the group file is optional
+            groupfile = validParameter.validFile(parameters, "group", true);
+            if (groupfile == "not open") { groupfile = ""; abort = true; }
+            else if (groupfile == "not found") { groupfile = ""; }
+            else { m->setGroupFile(groupfile); }
+
+            //check for optional parameter and set defaults
+            // ...at some point should added some additional type checking...
+            label = validParameter.validFile(parameters, "label", false);
+            if (label == "not found") { label = ""; m->mothurOut("You did not provide a label, I will use the first label in your listfile.\n");}
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CreateDatabaseCommand", "CreateDatabaseCommand");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Runs create.database.
+// Reads the contaxonomy, repfasta and repname files, cross-checks that they
+// describe the same number of OTUs with the same sizes, then walks the bins of
+// the list file and writes one row per OTU to <listfile root>database:
+// OTU number, abundance (total, or one column per group when a group file was
+// given), representative sequence name, representative sequence, taxonomy.
+// Returns 2 when the command was aborted for a reason other than help, and 0
+// otherwise. Inconsistent input is reported with mothurOut and the partial
+// output file is removed.
+int CreateDatabaseCommand::execute(){
+    try {
+
+        if (abort == true) { if (calledHelp) { return 0; }  return 2; }
+
+        //taxonomies holds the taxonomy info for each Otu
+        //classifyOtuSizes holds the size info of each Otu to help with error checking
+        vector<string> taxonomies;
+        vector<int> classifyOtuSizes = readTax(taxonomies);
+
+        if (m->control_pressed) { return 0; }
+
+        vector<Sequence> seqs;
+        vector<int> repOtusSizes = readFasta(seqs);
+
+        if (m->control_pressed) { return 0; }
+
+        //maps the redundant names of an OTU to its representative. This is backwards to how we normally do it,
+        //but each bin in the list file will be a key entry in the map.
+        map<string, string> repNames;
+        int numUniqueNamesFile = readNames(repNames);
+
+        //are there the same number of otus in the fasta and name files
+        if (repOtusSizes.size() != numUniqueNamesFile) { m->mothurOut("[ERROR]: you have " + toString(numUniqueNamesFile) + " unique seqs in your repname file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file. These should match.\n"); m->control_pressed = true; }
+
+        if (m->control_pressed) { return 0; }
+
+        //are there the same number of OTUs in the tax and fasta file
+        if (classifyOtuSizes.size() != repOtusSizes.size()) { m->mothurOut("[ERROR]: you have " + toString(classifyOtuSizes.size()) + " taxonomies in your contaxonomy file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file. These should match.\n"); m->control_pressed = true; }
+
+        if (m->control_pressed) { return 0; }
+
+        //at this point we have the same number of OTUs. Are the sizes we have found so far accurate?
+        //keep going after a mismatch so that every bad bin gets reported
+        for (int i = 0; i < classifyOtuSizes.size(); i++) {
+            if (classifyOtuSizes[i] != repOtusSizes[i]) {
+                m->mothurOut("[ERROR]: OTU size info does not match for bin " + toString(i+1) + ". The contaxonomy file indicated the OTU represented " + toString(classifyOtuSizes[i]) + " sequences, but the repfasta file had " + toString(repOtusSizes[i]) + ". These should match. Make sure you are using files for the same distance.\n"); m->control_pressed = true;
+            }
+        }
+
+        if (m->control_pressed) { return 0; }
+
+        //at this point we are fairly sure the repfasta, repnames and contaxonomy files match so lets proceed with the listfile
+        ListVector* list = getList();
+
+        if (m->control_pressed) { delete list; return 0; }
+
+        //the loop below indexes classifyOtuSizes, seqs and taxonomies by bin number,
+        //so the list must not contain a different number of bins than the other files
+        if (list->getNumBins() != (int) classifyOtuSizes.size()) {
+            m->mothurOut("[ERROR]: you have " + toString(list->getNumBins()) + " OTUs in your list file, but " + toString(classifyOtuSizes.size()) + " taxonomies in your contaxonomy file. These should match. Make sure you are using files for the same distance.\n");
+            delete list;
+            return 0;
+        }
+
+        GroupMap* groupmap = NULL;
+        vector<string> namesOfGroups;
+        if (groupfile != "") {
+            groupmap = new GroupMap(groupfile);
+            groupmap->readMap();
+            //fetch the group names once instead of on every use
+            namesOfGroups = groupmap->getNamesOfGroups();
+        }
+
+        if (m->control_pressed) { delete list; if (groupfile != "") { delete groupmap; } return 0; }
+
+        if (outputDir == "") { outputDir += m->hasPath(listfile); }
+        string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + "database";
+        outputNames.push_back(outputFileName); outputTypes["database"].push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        //with a group file the single Abundance column is replaced by one column per group
+        string header = "OTUNumber\tAbundance\t";
+        if (groupfile != "") {
+            header = "OTUNumber\t";
+            for (int i = 0; i < namesOfGroups.size(); i++) { header += namesOfGroups[i] + '\t'; }
+        }
+        header += "repSeqName\trepSeq\tOTUConTaxonomy";
+        out << header << endl;
+
+        for (int i = 0; i < list->getNumBins(); i++) {
+
+            if (m->control_pressed) { break; }
+
+            out << (i+1) << '\t';
+
+            vector<string> binNames;
+            string bin = list->get(i);
+
+            //the whole comma separated bin is the key into the repnames map
+            map<string, string>::iterator it = repNames.find(bin);
+            if (it == repNames.end()) {
+                m->mothurOut("[ERROR]: OTU " + toString(i+1) + " is not in the repnames file. Make sure you are using files for the same distance.\n"); m->control_pressed = true; break;
+            }
+
+            m->splitAtComma(bin, binNames);
+
+            //sanity check
+            if (binNames.size() != classifyOtuSizes[i]) {
+                m->mothurOut("[ERROR]: OTU " + toString(i+1) + " contains " + toString(binNames.size()) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true; break;
+            }
+
+            //output abundances
+            if (groupfile != "") {
+                map<string, int> counts;
+                //initialize counts to 0
+                for (int j = 0; j < namesOfGroups.size(); j++) { counts[namesOfGroups[j]] = 0; }
+
+                //find abundances by group
+                bool error = false;
+                for (int j = 0; j < binNames.size(); j++) {
+                    string group = groupmap->getGroup(binNames[j]);
+                    if (group == "not found") {
+                        m->mothurOut("[ERROR]: " + binNames[j] + " is not in your groupfile, please correct.\n");
+                        error = true;
+                    }else { counts[group]++; }
+                }
+
+                //output counts
+                for (int j = 0; j < namesOfGroups.size(); j++) { out << counts[namesOfGroups[j]] << '\t'; }
+
+                //the next pass through the loop stops and the output file is removed below
+                if (error) { m->control_pressed = true; }
+            }else { out << binNames.size() << '\t'; }
+
+            //output repSeq
+            out << it->second << '\t' << seqs[i].getAligned() << '\t' << taxonomies[i] << endl;
+        }
+        out.close();
+
+        delete list;
+        if (groupfile != "") { delete groupmap; }
+
+        if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
+
+        m->mothurOutEndLine();
+        m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+        m->mothurOut(outputFileName); m->mothurOutEndLine();
+        m->mothurOutEndLine();
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CreateDatabaseCommand", "execute");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Reads the contaxonomy file. The first line is skipped as a header; every
+// following record is read as "otu size taxonomy". The taxonomy strings are
+// appended to taxonomies and the matching sizes are returned in the same order.
+vector<int> CreateDatabaseCommand::readTax(vector<string>& taxonomies){
+    try {
+
+        ifstream taxFile;
+        m->openInputFile(contaxonomyfile, taxFile);
+
+        //discard the header line
+        m->getline(taxFile);
+
+        vector<int> otuSizes;
+        while (!taxFile.eof()) {
+
+            if (m->control_pressed) { break; }
+
+            string otuLabel = "";
+            int otuSize = 0;
+            string consensusTax = "unknown";
+
+            taxFile >> otuLabel;
+            taxFile >> otuSize;
+            taxFile >> consensusTax;
+            m->gobble(taxFile);
+
+            otuSizes.push_back(otuSize);
+            taxonomies.push_back(consensusTax);
+        }
+        taxFile.close();
+
+        return otuSizes;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CreateDatabaseCommand", "readTax");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Reads the repfasta file. Each sequence is appended to seqs, and the size
+// taken from the second '|' separated field of its bin info is returned in
+// the same order. A record whose bin info does not have exactly two fields
+// is reported as an error and stops the read.
+vector<int> CreateDatabaseCommand::readFasta(vector<Sequence>& seqs){
+    try {
+
+        ifstream fastaFile;
+        m->openInputFile(repfastafile, fastaFile);
+
+        vector<int> otuSizes;
+        while (!fastaFile.eof()) {
+
+            if (m->control_pressed) { break; }
+
+            string binInfo;
+            Sequence repSeq(fastaFile, binInfo, true);
+            m->gobble(fastaFile);
+
+            //the binInfo should look like - binNumber|size ie. 1|200 if it is binNumber|size|group then the user gave us the wrong repfasta file
+            vector<string> fields;
+            m->splitAtChar(binInfo, fields, '|');
+            if (fields.size() != 2) {
+                m->mothurOut("[ERROR]: your repfasta file is not the right format. The create database command is designed to be used with the output from get.oturep. When running get.oturep you can not use a group file, because mothur is only expecting one representative sequence per OTU and when you use a group file with get.oturep a representative is found for each group.\n");
+                m->control_pressed = true;
+                break;
+            }
+
+            int otuSize = 0;
+            m->mothurConvert(fields[1], otuSize);
+
+            seqs.push_back(repSeq);
+            otuSizes.push_back(otuSize);
+        }
+        fastaFile.close();
+
+        return otuSizes;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CreateDatabaseCommand", "readFasta");
+        exit(1);
+    }
+}
+/**********************************************************************************************************************/
+// Reads the repname file as two-column records and stores them keyed by the
+// SECOND column, with the first column as the value.
+// Returns the number of entries in nameMap after reading.
+int CreateDatabaseCommand::readNames(map<string, string>& nameMap) {
+    try {
+
+        ifstream namesFile;
+        m->openInputFile(repnamesfile, namesFile);
+
+        while (!namesFile.eof()) {
+            if (m->control_pressed) { break; }
+
+            string repName;
+            string memberNames;
+            namesFile >> repName;
+            namesFile >> memberNames;
+            m->gobble(namesFile);
+
+            //keyed by the second column, the reverse of the column order in the file
+            nameMap[memberNames] = repName;
+        }
+        namesFile.close();
+
+        return nameMap.size();
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "CreateDatabaseCommand", "readNames");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Reads the list file and returns the ListVector to build the database from.
+// With no label requested, the first ListVector in the file is returned and
+// its label is stored in the label member. Otherwise the file is scanned for
+// the requested label; when it is not matched exactly, the code falls back to
+// the ListVector of a previously read label (lastLabel).
+// The returned pointer is heap allocated; execute() deletes it.
+ListVector* CreateDatabaseCommand::getList(){
+ try {
+ InputData* input = new InputData(listfile, "list");
+ ListVector* list = input->getListVector();
+ // NOTE(review): list is dereferenced here without a NULL check, although the loop below tests list != NULL - confirm getListVector() cannot return NULL for the first read
+ string lastLabel = list->getLabel();
+ 
+ // no label requested: use the first one in the file
+ if (label == "") { label = lastLabel; delete input; return list; }
+ 
+ //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+ set<string> labels; labels.insert(label);
+ set<string> processedLabels;
+ set<string> userLabels = labels;
+ 
+ //as long as you are not at the end of the file or done wih the lines you want
+ while((list != NULL) && (userLabels.size() != 0)) {
+ if (m->control_pressed) { delete input; return list; }
+ 
+ // exact match: stop here and return this list
+ if(labels.count(list->getLabel()) == 1){
+ processedLabels.insert(list->getLabel());
+ userLabels.erase(list->getLabel());
+ break;
+ }
+ 
+ // anyLabelsToProcess() decides whether to fall back; if so, re-read the list for the previous label and stop
+ if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = list->getLabel();
+ 
+ delete list;
+ list = input->getListVector(lastLabel);
+ 
+ processedLabels.insert(list->getLabel());
+ userLabels.erase(list->getLabel());
+ 
+ //restore real lastlabel to save below
+ // NOTE(review): this relabels the list that is returned with the label of the list read after it - confirm that is intended
+ list->setLabel(saveLabel);
+ break;
+ }
+ 
+ lastLabel = list->getLabel(); 
+ 
+ //get next line to process
+ //prevent memory leak
+ delete list; 
+ list = input->getListVector();
+ }
+ 
+ 
+ if (m->control_pressed) { delete input; return list; }
+ 
+ //output error messages about any remaining user labels
+ set<string>::iterator it;
+ bool needToRun = false;
+ for (it = userLabels.begin(); it != userLabels.end(); it++) { 
+ m->mothurOut("Your file does not include the label " + *it); 
+ if (processedLabels.count(lastLabel) != 1) {
+ m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+ needToRun = true;
+ }else {
+ m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+ }
+ }
+ 
+ //run last label if you need to
+ if (needToRun == true) {
+ delete list; 
+ list = input->getListVector(lastLabel);
+ } 
+ 
+ delete input;
+ 
+ return list;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "CreateDatabaseCommand", "getList");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+
--- /dev/null
+#ifndef Mothur_createdatabasecommand_h
+#define Mothur_createdatabasecommand_h
+
+//
+// createdatabasecommand.h
+// Mothur
+//
+// Created by Sarah Westcott on 3/28/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "command.hpp"
+#include "listvector.hpp"
+#include "sequence.hpp"
+
+// Command class behind create.database: combines a list file with the
+// representative fasta, representative names and consensus taxonomy files
+// (and an optional group file) into one tab-delimited "database" file.
+class CreateDatabaseCommand : public Command {
+public:
+ // parses the option string; "help" and "citation" are handled here as well
+ CreateDatabaseCommand(string);
+ // option-less constructor; leaves the command flagged as aborted
+ CreateDatabaseCommand();
+ ~CreateDatabaseCommand(){}
+ 
+ // registers the accepted parameters and returns their names
+ vector<string> setParameters();
+ string getCommandName() { return "create.database"; }
+ string getCommandCategory() { return "OTU-Based Approaches"; }
+ string getHelpString(); 
+ string getCitation() { return "http://www.mothur.org/wiki/Create.database"; }
+ string getDescription() { return "creates database file that includes, abundances across groups, representative sequences, and taxonomy for each OTU"; }
+ 
+ 
+ // reads the inputs, cross-checks them and writes the database file
+ int execute(); 
+ void help() { m->mothurOut(getHelpString()); } 
+ 
+private:
+ 
+ // true when execute() must not run (help/citation requested or invalid parameters)
+ bool abort;
+ // input file names, the requested list label and the output directory
+ string listfile, groupfile, repfastafile, repnamesfile, contaxonomyfile, label, outputDir;
+ 
+ // names of the files written by execute()
+ vector<string> outputNames;
+ 
+ // fills the vector with the sequences of the repfasta file, returns the OTU sizes parsed from their headers
+ vector<int> readFasta(vector<Sequence>&);
+ // fills the vector with the taxonomies of the contaxonomy file, returns the OTU sizes listed with them
+ vector<int> readTax(vector<string>&);
+ // fills the map from the repname file (second column -> first column), returns the map size
+ int readNames(map<string, string>&);
+ // returns the ListVector for the requested label; execute() deletes it
+ ListVector* getList();
+ 
+};
+
+
+
+
+#endif
public:
Database();
- Database(const Database& db) : numSeqs(db.numSeqs), longest(db.longest), searchScore(db.searchScore), results(db.results), Scores(db.Scores) { m = MothurOut::getInstance(); }
virtual ~Database();
virtual void generateDB() = 0;
virtual void addSequence(Sequence) = 0; //add sequence to search engine
}
}
-//***************************************************************************************************************
+***************************************************************************************************************
//used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
int DeCalculator::findLargestContrib(vector<int> seen) {
try{
exit(1);
}
}
-//***************************************************************************************************************
+***************************************************************************************************************
void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
try{
map<string, string> nameMap;
map<string, string>::iterator itNames;
- if (oldNameMapFName != "") { m->readNames(oldNameMapFName, nameMap); }
+ if (oldNameMapFName != "") {
+ m->readNames(oldNameMapFName, nameMap);
+ if (oldNameMapFName == outNameFile){ outNameFile = outputDir + m->getRootName(m->getSimpleName(inFastaName)) + "unique.names"; }
+ }
if (m->control_pressed) { return 0; }
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
- //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//if you don't need to fork anything
if(processors == 1){
if (output != "square") { driver(0, numSeqs, outputFile, cutoff); }
/**************************************************************************************************/
void DistanceCommand::createProcesses(string filename) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
processIDS.clear();
string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
//use the unix sort
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
string command = "sort -n " + outputFile + " -o " + outfile;
system(command.c_str());
#else //sort using windows sort
exit(1);
}
}
-/**************************************************************************************************
+**************************************************************************************************
int DistanceCommand::convertToLowerTriangle(string outputFile) {
try{
string outfile = m->getRootName(outputFile) + "sorted.dist.temp";
//use the unix sort
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
string command = "sort -n " + outputFile + " -o " + outfile;
system(command.c_str());
#else //sort using windows sort
exit(1);
}
}
-/**************************************************************************************************/
+**************************************************************************************************/
//its okay if the column file does not contain all the names in the fasta file, since some distance may have been above a cutoff,
//but no sequences can be in the column file that are not in oldfasta. also, if a distance is above the cutoff given then remove it.
//also check to make sure the 2 files have the same alignment length.
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
static DWORD WINAPI MyDistThreadFunction(LPVOID lpParam){
distanceData* pDataArray;
#include "onegapignore.h"
-/**************************************************************************************************/
-DistanceDB::DistanceDB(const DistanceDB& ddb) : data(ddb.data), templateSeqsLength(ddb.templateSeqsLength), templateAligned(ddb.templateAligned), Database(ddb) {
- distCalculator = new oneGapIgnoreTermGapDist();
-}
/**************************************************************************************************/
DistanceDB::DistanceDB() : Database() {
try {
public:
DistanceDB();
- DistanceDB(const DistanceDB& ddb);
~DistanceDB() { delete distCalculator; }
void generateDB() {} //doesn't generate a search db
public:
eachGapDist() {}
- eachGapDist(const eachGapDist& ddb) {}
void calcDist(Sequence A, Sequence B){
int diff = 0;
//delimiting path char
char delim;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
delim = ':';
#else
delim = ';';
if (mothurPath != "") {
//add mothur so it looks like what argv would look like
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
mothurPath += "/mothur";
#else
mothurPath += "\\mothur";
//is this mothurs path?
ifstream in;
string tempIn = dirs[i];
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
tempIn += "/mothur";
#else
tempIn += "\\mothur";
string Engine::getCommand() {
try {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#ifdef USE_READLINE
char* nextCommand = NULL;
nextCommand = readline("mothur > ");
fasta = validParameter.validFile(parameters, "fasta", false);
if (fasta == "not found") {
fasta = m->getFastaFile();
- if (fasta != "") { fastafileNames.push_back(fasta); m->mothurOut("Using " + fasta + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
+ if (fasta != "") {
+ fastafileNames.push_back(fasta);
+ m->mothurOut("Using " + fasta + " as input file for the fasta parameter."); m->mothurOutEndLine();
+ string simpleName = m->getSimpleName(fasta);
+ filterFileName += simpleName.substr(0, simpleName.find_first_of('.'));
+ }
else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
}
else {
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- vector<unsigned long long> positions = m->divideFile(fastafileNames[s], processors);
+
+ vector<unsigned long long> positions;
+ if (savedPositions.size() != 0) { positions = savedPositions[s]; }
+ else {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ positions = m->divideFile(fastafileNames[s], processors);
+#else
+ if(processors != 1){
+ int numFastaSeqs = 0;
+ positions = m->setFilePosFasta(fastafileNames[s], numFastaSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
+ }
+#endif
+ }
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ //vector<unsigned long long> positions = m->divideFile(fastafileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) {
lines.push_back(new linePair(positions[i], positions[(i+1)]));
int numFastaSeqs = driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]);
numSeqs += numFastaSeqs;
}else{
- int numFastaSeqs = createProcessesRunFilter(filter, fastafileNames[s]);
+ int numFastaSeqs = createProcessesRunFilter(filter, fastafileNames[s], filteredFasta);
numSeqs += numFastaSeqs;
-
- rename((fastafileNames[s] + toString(processIDS[0]) + ".temp").c_str(), filteredFasta.c_str());
-
- //append fasta files
- for(int i=1;i<processors;i++){
- m->appendFiles((fastafileNames[s] + toString(processIDS[i]) + ".temp"), filteredFasta);
- m->mothurRemove((fastafileNames[s] + toString(processIDS[i]) + ".temp"));
- }
}
if (m->control_pressed) { return 1; }
#else
- lines.push_back(new linePair(0, 1000));
+ if(processors == 1){
+ lines.push_back(new linePair(0, 1000));
int numFastaSeqs = driverRunFilter(filter, filteredFasta, fastafileNames[s], lines[0]);
numSeqs += numFastaSeqs;
+ }else {
+ int numFastaSeqs = positions.size()-1;
+ //positions = m->setFilePosFasta(fastafileNames[s], numFastaSeqs);
+
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
+ lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor));
+ }
+
+ numFastaSeqs = createProcessesRunFilter(filter, fastafileNames[s], filteredFasta);
+ numSeqs += numFastaSeqs;
+ }
if (m->control_pressed) { return 1; }
#endif
count++;
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = in.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
}
/**************************************************************************************************/
-int FilterSeqsCommand::createProcessesRunFilter(string F, string filename) {
+int FilterSeqsCommand::createProcessesRunFilter(string F, string filename, string filteredFastaName) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
+
+ int process = 1;
int num = 0;
processIDS.clear();
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+
//loop through and create all the processes you want
while (process != processors) {
}
}
+ num = driverRunFilter(F, filteredFastaName, filename, lines[0]);
+
//force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
+ for (int i=0;i<processIDS.size();i++) {
int temp = processIDS[i];
wait(&temp);
}
m->openInputFile(tempFile, in);
if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
in.close(); m->mothurRemove(tempFile);
+
+ m->appendFiles((filename + toString(processIDS[i]) + ".temp"), filteredFastaName);
+ m->mothurRemove((filename + toString(processIDS[i]) + ".temp"));
}
-
-
- return num;
-#endif
+
+#else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the filterData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to allow both threads to add info to F.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<filterRunData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=0; i<processors-1; i++){
+
+ string extension = "";
+ if (i != 0) { extension = toString(i) + ".temp"; }
+
+ filterRunData* tempFilter = new filterRunData(filter, filename, (filteredFastaName + extension), m, lines[i]->start, lines[i]->end, alignmentLength, i);
+ pDataArray.push_back(tempFilter);
+ processIDS.push_back(i);
+
+ hThreadArray[i] = CreateThread(NULL, 0, MyRunFilterThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ }
+
+ num = driverRunFilter(F, (filteredFastaName + toString(processors-1) + ".temp"), filename, lines[processors-1]);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ num += pDataArray[i]->count;
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+ for (int i = 1; i < processors; i++) {
+ m->appendFiles((filteredFastaName + toString(i) + ".temp"), filteredFastaName);
+ m->mothurRemove((filteredFastaName + toString(i) + ".temp"));
+ }
+#endif
+
+ return num;
+
}
catch(exception& e) {
m->errorOut(e, "FilterSeqsCommand", "createProcessesRunFilter");
#else
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- vector<unsigned long long> positions = m->divideFile(fastafileNames[s], processors);
+ vector<unsigned long long> positions;
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ positions = m->divideFile(fastafileNames[s], processors);
for (int i = 0; i < (positions.size()-1); i++) {
lines.push_back(new linePair(positions[i], positions[(i+1)]));
}
int numFastaSeqs = createProcessesCreateFilter(F, fastafileNames[s]);
numSeqs += numFastaSeqs;
}
-
- if (m->control_pressed) { return filterString; }
#else
- lines.push_back(new linePair(0, 1000));
- int numFastaSeqs = driverCreateFilter(F, fastafileNames[s], lines[0]);
- numSeqs += numFastaSeqs;
- if (m->control_pressed) { return filterString; }
+ if(processors == 1){
+ lines.push_back(new linePair(0, 1000));
+ int numFastaSeqs = driverCreateFilter(F, fastafileNames[s], lines[0]);
+ numSeqs += numFastaSeqs;
+ }else {
+ int numFastaSeqs = 0;
+ positions = m->setFilePosFasta(fastafileNames[s], numFastaSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
+
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
+ lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor));
+ }
+
+ numFastaSeqs = createProcessesCreateFilter(F, fastafileNames[s]);
+ numSeqs += numFastaSeqs;
+ }
#endif
+ //save the file positions so we can reuse them in the runFilter function
+ savedPositions[s] = positions;
+
+ if (m->control_pressed) { return filterString; }
#endif
}
MPI_Barrier(MPI_COMM_WORLD);
#endif
-
+
return filterString;
}
catch(exception& e) {
count++;
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = in.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
int FilterSeqsCommand::createProcessesCreateFilter(Filters& F, string filename) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 1;
+ int process = 1;
int num = 0;
processIDS.clear();
-
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+
//loop through and create all the processes you want
while (process != processors) {
int pid = fork();
m->mothurRemove(tempFilename);
}
- return num;
-#endif
+
+#else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the filterData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to allow both threads to add info to F.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<filterData*> pDataArray;
+ DWORD dwThreadIdArray[processors];
+ HANDLE hThreadArray[processors];
+
+ //Create processor worker threads.
+ for( int i=0; i<processors; i++ ){
+
+ filterData* tempFilter = new filterData(filename, m, lines[i]->start, lines[i]->end, alignmentLength, trump, vertical, soft, hard, i);
+ pDataArray.push_back(tempFilter);
+ processIDS.push_back(i);
+
+ hThreadArray[i] = CreateThread(NULL, 0, MyCreateFilterThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ }
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ num += pDataArray[i]->count;
+ F.mergeFilter(pDataArray[i]->F.getFilter());
+
+ for (int k = 0; k < alignmentLength; k++) { F.a[k] += pDataArray[i]->F.a[k]; }
+ for (int k = 0; k < alignmentLength; k++) { F.t[k] += pDataArray[i]->F.t[k]; }
+ for (int k = 0; k < alignmentLength; k++) { F.g[k] += pDataArray[i]->F.g[k]; }
+ for (int k = 0; k < alignmentLength; k++) { F.c[k] += pDataArray[i]->F.c[k]; }
+ for (int k = 0; k < alignmentLength; k++) { F.gap[k] += pDataArray[i]->F.gap[k]; }
+
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+#endif
+ return num;
+
}
catch(exception& e) {
m->errorOut(e, "FilterSeqsCommand", "createProcessesCreateFilter");
\r
vector<linePair*> lines;\r
vector<int> processIDS;\r
+ map<int, vector<unsigned long long> > savedPositions;\r
\r
string vertical, filter, fasta, hard, outputDir, filterFileName;\r
vector<string> fastafileNames; \r
string createFilter();\r
int filterSequences();\r
int createProcessesCreateFilter(Filters&, string);\r
- int createProcessesRunFilter(string, string);\r
+ int createProcessesRunFilter(string, string, string);\r
int driverRunFilter(string, string, string, linePair*);\r
int driverCreateFilter(Filters& F, string filename, linePair* line);\r
#ifdef USE_MPI\r
\r
};\r
\r
+\r
+/**************************************************************************************************/\r
+//custom data structure for threads to use.\r
+// This is passed by void pointer so it can be any data type\r
+// that can be passed using a single void pointer (LPVOID).\r
+struct filterData {\r
+	Filters F;							// filter state built up by the worker; default-constructed here\r
+	int count, tid, alignmentLength;	// count: result written by the worker; tid: id supplied by the creator\r
+	unsigned long long start, end;		// start: byte offset to seek to; end: number of sequences to process\r
+	MothurOut* m;						// shared logger / control_pressed flag holder (not owned)\r
+	string filename, vertical, hard;	// input fasta; "vertical" option text; hard-filter argument ("" = none)\r
+	char trump;							// trump character; '*' means no trump filtering\r
+	float soft;							// soft threshold; 0 means no soft filtering\r
+	\r
+	// Default construction now yields a fully defined, inert job: zero counts/offsets,\r
+	// no MothurOut, and the "disabled" sentinels for trump ('*') and soft (0).\r
+	// Previously every scalar and pointer member was left indeterminate.\r
+	filterData() : count(0), tid(0), alignmentLength(0), start(0), end(0), m(NULL), trump('*'), soft(0) {}\r
+	\r
+	// Builds the job description handed to one worker thread.\r
+	//   fn      - fasta file to read\r
+	//   mout    - MothurOut used for output and the control_pressed flag\r
+	//   st, en  - starting byte offset and number of sequences to read\r
+	//   aLength - expected aligned length of every sequence\r
+	//   tr, vert, so, ha - trump / vertical / soft / hard filter settings\r
+	//   t       - thread id\r
+	// Initializers are listed in member declaration order; count always starts at 0.\r
+	filterData(string fn, MothurOut* mout, unsigned long long st, unsigned long long en, int aLength, char tr, string vert, float so, string ha, int t)\r
+		: count(0), tid(t), alignmentLength(aLength), start(st), end(en), m(mout),\r
+		  filename(fn), vertical(vert), hard(ha), trump(tr), soft(so) {}\r
+};\r
+/**************************************************************************************************/\r
+//custom data structure for threads to use.\r
+// This is passed by void pointer so it can be any data type\r
+// that can be passed using a single void pointer (LPVOID).\r
+struct filterRunData {\r
+	int count, tid, alignmentLength;	// count: result written by the worker; tid: id supplied by the creator\r
+	unsigned long long start, end;		// start: byte offset to seek to; end: number of sequences to process\r
+	MothurOut* m;						// shared logger / control_pressed flag holder (not owned)\r
+	string filename;					// input fasta file\r
+	string filter, outputFilename;		// column mask ('1' = keep column) and file the worker writes\r
+	\r
+	// Default construction now yields a fully defined, empty job instead of leaving\r
+	// the counters, offsets and the MothurOut pointer indeterminate.\r
+	filterRunData() : count(0), tid(0), alignmentLength(0), start(0), end(0), m(NULL) {}\r
+	\r
+	// Builds the job description handed to one worker thread.\r
+	//   f       - filter mask to apply\r
+	//   fn      - fasta file to read\r
+	//   ofn     - file to write the filtered sequences to\r
+	//   mout    - MothurOut used for output and the control_pressed flag\r
+	//   st, en  - starting byte offset and number of sequences to read\r
+	//   aLength - number of alignment columns to examine\r
+	//   t       - thread id\r
+	// Initializers are listed in member declaration order; count always starts at 0.\r
+	filterRunData(string f, string fn, string ofn, MothurOut* mout, unsigned long long st, unsigned long long en, int aLength, int t)\r
+		: count(0), tid(t), alignmentLength(aLength), start(st), end(en), m(mout),\r
+		  filename(fn), filter(f), outputFilename(ofn) {}\r
+};\r
+\r
+/**************************************************************************************************/\r
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)\r
+#else\r
+static DWORD WINAPI MyCreateFilterThreadFunction(LPVOID lpParam){ \r
+	// Thread entry point: lpParam is the filterData job for this worker.\r
+	// Returns 0 on completion, 1 if control_pressed was seen (count is then set to 1).\r
+	filterData* job = static_cast<filterData*>(lpParam);\r
+	\r
+	try {\r
+		MothurOut* const mout = job->m;\r
+		Filters& colFilter = job->F;\r
+		const bool useTrump = (job->trump != '*');\r
+		const bool useSoft = (job->soft != 0);\r
+		\r
+		//configure this worker's filter from the job settings\r
+		if (useSoft)  { colFilter.setSoft(job->soft); }\r
+		if (useTrump) { colFilter.setTrump(job->trump); }\r
+		\r
+		colFilter.setLength(job->alignmentLength);\r
+		\r
+		if (useTrump || mout->isTrue(job->vertical) || useSoft) {\r
+			colFilter.initialize();\r
+		}\r
+		\r
+		//no hard filter given: start from a mask that keeps every column\r
+		if (job->hard.empty())	{ colFilter.setFilter(string(job->alignmentLength, '1')); }\r
+		else					{ colFilter.doHard(job->hard); }\r
+		\r
+		ifstream in;\r
+		mout->openInputFile(job->filename, in);\r
+		\r
+		//offsets 0 and 1 both mean "start of file"; otherwise back up one byte and\r
+		//gobble, which accounts for the difference in line endings\r
+		if (job->start > 1) {\r
+			in.seekg(job->start-1); mout->gobble(in);\r
+		}else {\r
+			in.seekg(0);\r
+		}\r
+		\r
+		//end is the number of sequences to process\r
+		const unsigned long long numToProcess = job->end;\r
+		job->count = numToProcess;\r
+		for (int seqIndex = 0; seqIndex < numToProcess; seqIndex++) {\r
+			\r
+			if (mout->control_pressed) { in.close(); job->count = 1; return 1; }\r
+			\r
+			Sequence current(in); mout->gobble(in);\r
+			\r
+			if (current.getName() != "") {\r
+				if (current.getAligned().length() != job->alignmentLength) {\r
+					mout->mothurOut("Sequences are not all the same length, please correct.");\r
+					mout->mothurOutEndLine();\r
+					mout->control_pressed = true;\r
+				}\r
+				\r
+				if (useTrump) { colFilter.doTrump(current); }\r
+				if (mout->isTrue(job->vertical) || useSoft) { colFilter.getFreqs(current); }\r
+			}\r
+			\r
+			//report progress\r
+			if (seqIndex % 100 == 0) { mout->mothurOut(toString(seqIndex)); mout->mothurOutEndLine(); }\r
+		}\r
+		\r
+		//final progress line when the total is not a multiple of 100\r
+		if (job->count % 100 != 0) { mout->mothurOut(toString(job->count)); mout->mothurOutEndLine(); }\r
+		\r
+		in.close();\r
+		\r
+		return 0;\r
+		\r
+	}\r
+	catch(exception& e) {\r
+		job->m->errorOut(e, "FilterSeqsCommand", "MyCreateFilterThreadFunction");\r
+		exit(1);\r
+	}\r
+} \r
+/**************************************************************************************************/\r
+static DWORD WINAPI MyRunFilterThreadFunction(LPVOID lpParam){ \r
+	// Thread entry point: lpParam is the filterRunData job for this worker.\r
+	// Returns 0 on completion, 1 if control_pressed was seen (count is then set to 1).\r
+	filterRunData* job = static_cast<filterRunData*>(lpParam);\r
+	\r
+	try {\r
+		MothurOut* const mout = job->m;\r
+		\r
+		ofstream out;\r
+		mout->openOutputFile(job->outputFilename, out);\r
+		\r
+		ifstream in;\r
+		mout->openInputFile(job->filename, in);\r
+		\r
+		//offsets 0 and 1 both mean "start of file"; otherwise back up one byte and\r
+		//gobble, which accounts for the difference in line endings\r
+		if (job->start > 1) {\r
+			in.seekg(job->start-1); mout->gobble(in);\r
+		}else {\r
+			in.seekg(0);\r
+		}\r
+		\r
+		//end is the number of sequences to process\r
+		const unsigned long long numToProcess = job->end;\r
+		job->count = numToProcess;\r
+		for (int seqIndex = 0; seqIndex < numToProcess; seqIndex++) {\r
+			\r
+			if (mout->control_pressed) { in.close(); out.close(); job->count = 1; return 1; }\r
+			\r
+			Sequence seq(in); mout->gobble(in);\r
+			if (seq.getName() != "") {\r
+				const string aligned = seq.getAligned();\r
+				string kept = "";\r
+				\r
+				//copy only the columns the mask marks with '1'\r
+				for (int col = 0; col < job->alignmentLength; col++) {\r
+					if (job->filter[col] != '1') { continue; }\r
+					kept += aligned[col];\r
+				}\r
+				\r
+				out << '>' << seq.getName() << endl << kept << endl;\r
+			}\r
+			\r
+			//report progress\r
+			if (seqIndex % 100 == 0) { mout->mothurOut(toString(seqIndex)); mout->mothurOutEndLine(); }\r
+		}\r
+		\r
+		//final progress line when the total is not a multiple of 100\r
+		if (job->count % 100 != 0) { mout->mothurOut(toString(job->count)); mout->mothurOutEndLine(); }\r
+		\r
+		in.close();\r
+		out.close();\r
+		\r
+		return 0;\r
+		\r
+	}\r
+	catch(exception& e) {\r
+		job->m->errorOut(e, "FilterSeqsCommand", "MyRunFilterThreadFunction");\r
+		exit(1);\r
+	}\r
+} \r
+/**************************************************************************************************/\r
+#endif\r
+\r
+\r
#endif\r
+++ /dev/null
-#include "fisher2.h"
-
-
-static void f2xact(int *nrow, int *ncol, double *table, int *ldtabl,
- double *expect, double *percnt, double *emin, double
- *prt, double *pre, double *fact, int *ico, int
- *iro, int *kyy, int *idif, int *irn, int *key,
- int *ldkey, int *ipoin, double *stp, int *ldstp,
- int *ifrq, double *dlp, double *dsp, double *tm,
- int *key2, int *iwk, double *rwk);
-static void f3xact(int *nrow, int *irow, int *ncol, int *icol,
- double *dlp, int *mm, double *fact, int *ico, int
- *iro, int *it, int *lb, int *nr, int *nt, int
- *nu, int *itc, int *ist, double *stv, double *alen,
- const double *tol);
-static void f4xact(int *nrow, int *irow, int *ncol, int *icol,
- double *dsp, double *fact, int *icstk, int *ncstk,
- int *lstk, int *mstk, int *nstk, int *nrstk, int
- *irstk, double *ystk, const double *tol);
-static void f5xact(double *pastp, const double *tol, int *kval, int *key,
- int *ldkey, int *ipoin, double *stp, int *ldstp,
- int *ifrq, int *npoin, int *nr, int *nl, int
- *ifreq, int *itop, int *ipsh);
-static void f6xact(int *nrow, int *irow, int *iflag, int *kyy,
- int *key, int *ldkey, int *last, int *ipn);
-static void f7xact(int *nrow, int *imax, int *idif, int *k, int *ks,
- int *iflag);
-static void f8xact(int *irow, int *is, int *i1, int *izero, int *myNew);
-static double f9xact(int *n, int *mm, int *ir, double *fact);
-static void f10act(int *nrow, int *irow, int *ncol, int *icol,
- double *val, int *xmin, double *fact, int *nd,
- int *ne, int *m);
-static void f11act(int *irow, int *i1, int *i2, int *myNew);
-static void prterr(int icode, char *mes);
-static int iwork(int iwkmax, int *iwkpt, int number, int itype);
-// void fexact(int *nrow, int *ncol, double *table, int *ldtabl,
-// double *expect, double *percnt, double *emin, double *prt,
-// double *pre, /* myNew in C : */ int *workspace);
- static void isort(int *n, int *ix);
- static double gammds(double *y, double *p, int *ifault);
- static double alogam(double *x, int *ifault);
-
-
-/* The only public function : */
-void fexact(int *nrow, int *ncol, double *table, int *ldtabl,
- double *expect, double *percnt, double *emin, double *prt,
- double *pre, /* myNew in C : */ int *workspace) {
-
-/*
- ALGORITHM 643, COLLECTED ALGORITHMS FROM ACM.
- THIS WORK PUBLISHED IN TRANSACTIONS ON MATHEMATICAL SOFTWARE,
- VOL. 19, NO. 4, DECEMBER, 1993, PP. 484-488.
- -----------------------------------------------------------------------
- Name: FEXACT
- Purpose: Computes Fisher's exact test probabilities and a hybrid
- approximation to Fisher exact test probabilities for a
- contingency table using the network algorithm.
- Usage: CALL FEXACT (NROW, NCOL, TABLE, LDTABL, EXPECT, PERCNT,
- EMIN, PRT, PRE)
- Arguments:
- NROW - The number of rows in the table. (Input)
- NCOL - The number of columns in the table. (Input)
- TABLE - NROW by NCOL matrix containing the contingency
- table. (Input)
- LDTABL - Leading dimension of TABLE exactly as specified
- in the dimension statement in the calling
- program. (Input)
- EXPECT - Expected value used in the hybrid algorithm for
- deciding when to use asymptotic theory
- probabilities. (Input)
- If EXPECT <= 0.0 then asymptotic theory probabilities
- are not used and Fisher exact test probabilities are
- computed. Otherwise, if PERCNT or more of the cells in
- the remaining table have estimated expected values of
- EXPECT or more, with no remaining cell having expected
- value less than EMIN, then asymptotic chi-squared
- probabilities are used. See the algorithm section of the
- manual document for details.
- Use EXPECT = 5.0 to obtain the 'Cochran' condition.
- PERCNT - Percentage of remaining cells that must have
- estimated expected values greater than EXPECT
- before asymptotic probabilities can be used. (Input)
- See argument EXPECT for details.
- Use PERCNT = 80.0 to obtain the 'Cochran' condition.
- EMIN - Minimum cell estimated expected value allowed for
- asymptotic chi-squared probabilities to be used. (Input)
- See argument EXPECT for details.
- Use EMIN = 1.0 to obtain the 'Cochran' condition.
- PRT - Probability of the observed table for fixed
- marginal totals. (Output)
- PRE - Table p-value. (Output)
- PRE is the probability of a more extreme table,
- where `extreme' is in a probabilistic sense.
- If EXPECT < 0 then the Fisher exact probability
- is returned. Otherwise, an approximation to the
- Fisher exact probability is computed based upon
- asymptotic chi-squared probabilities for ``large''
- table expected values. The user defines ``large''
- through the arguments EXPECT, PERCNT, and EMIN.
-
- Remarks:
- 1. For many problems one megabyte or more of workspace can be
- required. If the environment supports it, the user should begin
- by increasing the workspace used to 200,000 units.
- 2. In FEXACT, LDSTP = 30*LDKEY. The proportion of table space used
- by STP may be changed by changing the line MULT = 30 below to
- another value.
- 3. FEXACT may be converted to single precision by setting IREAL = 3,
- and converting all DOUBLE PRECISION specifications (except the
- specifications for RWRK, IWRK, and DWRK) to REAL. This will
- require changing the names and specifications of the intrinsic
- functions ALOG, AMAX1, AMIN1, EXP, and REAL. In addition, the
- machine specific constants will need to be changed, and the name
- DWRK will need to be changed to RWRK in the call to F2XACT.
- 4. Machine specific constants are specified and documented in F2XACT.
- A missing value code is specified in both FEXACT and F2XACT.
- 5. Although not a restriction, is is not generally practical to call
- this routine with large tables which are not sparse and in
- which the 'hybrid' algorithm has little effect. For example,
- although it is feasible to compute exact probabilities for the
- table
- 1 8 5 4 4 2 2
- 5 3 3 4 3 1 0
- 10 1 4 0 0 0 0,
- computing exact probabilities for a similar table which has been
- enlarged by the addition of an extra row (or column) may not be
- feasible.
- -----------------------------------------------------------------------
- */
-
- /* CONSTANT Parameters : */
-
- /* To increase the length of the table of paste path lengths relative
- to the length of the hash table, increase MULT.
- */
- const int mult = 30;
- /* AMISS is a missing value indicator which is returned when the
- probability is not defined.
- */
- const double amiss = -12345.;
- /*
- Set IREAL = 4 for DOUBLE PRECISION
- Set IREAL = 3 for SINGLE PRECISION
- */
-#define i_real 4
-#define i_int 2
-
- /* System generated locals */
- int ikh;
- /* Local variables */
- int nco, nro, ntot, numb, iiwk, irwk;
- int i, j, k, kk, ldkey, ldstp, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10;
- int i3a, i3b, i3c, i9a, iwkmax, iwkpt;
-
- /* Workspace Allocation (freed at end) */
- double *equiv;
- iwkmax = 2 * (int) (*workspace / 2);
-// equiv = (double *) R_alloc(iwkmax / 2, sizeof(double));
- equiv = (double *) calloc(iwkmax / 2, sizeof(double));
-
- /* The check could never happen with Calloc!
- equiv = Calloc(iwkmax / 2, double);
- if (!equiv) {
- prterr(0, "Can not allocate specified workspace");
- } */
-
-#define dwrk (equiv)
-#define iwrk ((int *)equiv)
-#define rwrk ((float *)equiv)
-
- /* Parameter adjustments */
- table -= *ldtabl + 1;
-
- /* Function Body */
- iwkpt = 0;
-
- if (*nrow > *ldtabl)
- prterr(1, "NROW must be less than or equal to LDTABL.");
-
- ntot = 0;
- for (i = 1; i <= *nrow; ++i) {
- for (j = 1; j <= *ncol; ++j) {
- if (table[i + j * *ldtabl] < 0.)
- prterr(2, "All elements of TABLE must be positive.");
- ntot = (int) (ntot + table[i + j * *ldtabl]);
- }
- }
- if (ntot == 0) {
- prterr(3, "All elements of TABLE are zero.\n"
- "PRT and PRE are set to missing values.");
- *prt = amiss;
- *pre = amiss;
- goto L_End;
- }
-
- nco = max(*nrow, *ncol);
- nro = *nrow + *ncol - nco;/* = min(*nrow, *ncol) */
- k = *nrow + *ncol + 1;
- kk = k * nco;
-
- ikh = ntot + 1;
- i1 = iwork(iwkmax, &iwkpt, ikh, i_real);
- i2 = iwork(iwkmax, &iwkpt, nco, i_int);
- i3 = iwork(iwkmax, &iwkpt, nco, i_int);
- i3a = iwork(iwkmax, &iwkpt, nco, i_int);
- i3b = iwork(iwkmax, &iwkpt, nro, i_int);
- i3c = iwork(iwkmax, &iwkpt, nro, i_int);
- ikh = max(k * 5 + (kk << 1), nco * 7 + 800);
- iiwk= iwork(iwkmax, &iwkpt, ikh, i_int);
- ikh = max(nco + 401, k);
- irwk= iwork(iwkmax, &iwkpt, ikh, i_real);
-
- /* NOTE:
- What follows below splits the remaining amount iwkmax - iwkpt of
- (int) workspace into hash tables as follows.
- type size index
- INT 2 * ldkey i4 i5 i11
- REAL 2 * ldkey i8 i9 i10
- REAL 2 * ldstp i6
- INT 6 * ldstp i7
- Hence, we need ldkey times
- 3 * 2 + 3 * 2 * s + 2 * mult * s + 6 * mult
- chunks of integer memory, where s = sizeof(REAL) / sizeof(INT).
- If doubles are used and are twice as long as ints, this gives
- 18 + 10 * mult
- so that the value of ldkey can be obtained by dividing available
- (int) workspace by this number.
-
- In fact, because iwork() can actually s * n + s - 1 int chunks
- when allocating a REAL, we use ldkey = available / numb - 1.
-
- FIXME:
- Can we always assume that sizeof(double) / sizeof(int) is 2?
- */
-
- if (i_real == 4) { /* Double precision reals */
- numb = 18 + 10 * mult;
- } else { /* Single precision reals */
- numb = (mult << 3) + 12;
- }
- ldkey = (iwkmax - iwkpt) / numb - 1;
- ldstp = mult * ldkey;
- ikh = ldkey << 1; i4 = iwork(iwkmax, &iwkpt, ikh, i_int);
- ikh = ldkey << 1; i5 = iwork(iwkmax, &iwkpt, ikh, i_int);
- ikh = ldstp << 1; i6 = iwork(iwkmax, &iwkpt, ikh, i_real);
- ikh = ldstp * 6; i7 = iwork(iwkmax, &iwkpt, ikh, i_int);
- ikh = ldkey << 1; i8 = iwork(iwkmax, &iwkpt, ikh, i_real);
- ikh = ldkey << 1; i9 = iwork(iwkmax, &iwkpt, ikh, i_real);
- ikh = ldkey << 1; i9a = iwork(iwkmax, &iwkpt, ikh, i_real);
- ikh = ldkey << 1; i10 = iwork(iwkmax, &iwkpt, ikh, i_int);
-
- /* To convert to double precision, change RWRK to DWRK in the next CALL.
- */
- f2xact(nrow,
- ncol,
- &table[*ldtabl + 1],
- ldtabl,
- expect,
- percnt,
- emin,
- prt,
- pre,
- dwrk + i1,
- iwrk + i2,
- iwrk + i3,
- iwrk + i3a,
- iwrk + i3b,
- iwrk + i3c,
- iwrk + i4,
- &ldkey,
- iwrk + i5,
- dwrk + i6,
- &ldstp,
- iwrk + i7,
- dwrk + i8,
- dwrk + i9,
- dwrk + i9a,
- iwrk + i10,
- iwrk + iiwk,
- dwrk + irwk);
-
-L_End:
- /* Free(equiv); */
- free(equiv);
- return;
-}
-
-#undef rwrk
-#undef iwrk
-#undef dwrk
-
-
-/*
- -----------------------------------------------------------------------
- Name: F2XACT
- Purpose: Computes Fisher's exact test for a contingency table,
- routine with workspace variables specified.
- Usage: F2XACT (NROW, NCOL, TABLE, LDTABL, EXPECT, PERCNT,
- EMIN, PRT, PRE, FACT, ICO, IRO, KYY, IDIF,
- IRN, KEY, LDKEY, IPOIN, STP, LDSTP, IFRQ,
- DLP, DSP, TM, KEY2, IWK, RWK)
- -----------------------------------------------------------------------
- */
-void
-f2xact(int *nrow, int *ncol, double *table, int *ldtabl,
- double *expect, double *percnt, double *emin, double *prt,
- double *pre, double *fact, int *ico, int *iro, int *kyy,
- int *idif, int *irn, int *key, int *ldkey, int *ipoin,
- double *stp, int *ldstp, int *ifrq, double *dlp, double *dsp,
- double *tm, int *key2, int *iwk, double *rwk)
-{
- /* IMAX is the largest representable int on the machine. */
- const int imax = SINT_MAX;
-// const int imax = 2147483647; //xx: I DONÂ¥T like this, and
-// thanks to the hint from Jason Turner I don't do it anymore. (R.D-U).
-
- /* AMISS is a missing value indicator which is returned when the
- probability is not defined. */
- const double amiss = -12345.;
-
- /* TOL is chosen as the square root of the smallest relative spacing. */
-#ifndef Macintosh
- const static double tol = 3.45254e-7;
-#else
- static double tol = 3.45254e-7;
-#endif
- /* EMX is a large positive value used in comparing expected values. */
- const static double emx = 1e30;
-
- /* Local variables {{any really need to be static ???}} */
- static int kval, kmax, jkey, last, ipsh, itmp, itop, jstp, ntot,
- jstp2, jstp3, jstp4, i, ii, j, k, n, iflag, ncell, ifreq, chisq,
- ikkey, ikstp, ikstp2, k1, kb, kd, ks,
- i31, i32, i33, i34, i35, i36, i37, i38, i39,
- i41, i42, i43, i44, i45, i46, i47, i48, i310, i311,
- nco, nrb, ipn, ipo, itp, nro, nro2;
- static double dspt, dd, df,ddf, drn,dro, emn, obs, obs2, obs3,
- pastp, pv, tmp;
- double d1;
-#ifndef USING_R
- double d2;
- static int ifault;
-#endif
- int nr_gt_nc=0;
-
- /* Parameter adjustments */
- table -= *ldtabl + 1;
- --ico;
- --iro;
- --kyy;
- --idif;
- --irn;
- --key;
- --ipoin;
- --stp;
- --ifrq;
- --dlp;
- --dsp;
- --tm;
- --key2;
- --iwk;
- --rwk;
-
-
- /* Check table dimensions */
- if (*nrow > *ldtabl)
- prterr(1, "NROW must be less than or equal to LDTABL.");
- if (*ncol <= 1)
- prterr(4, "NCOL must be at least 2");
-
- /* Initialize KEY array */
- for (i = 1; i <= *ldkey << 1; ++i) {
- key[i] = -9999;
- key2[i] = -9999;
- }
- /* Initialize parameters */
- *pre = 0.;
- itop = 0;
- if (*expect > 0.)
- emn = *emin;
- else
- emn = emx;
- if (*nrow > *ncol){
- nr_gt_nc = 1;
-}
-else{
- nr_gt_nc = 0;
-}
- /* nco := max(nrow, ncol) : */
- if(nr_gt_nc)
- nco = *nrow;
- else
- nco = *ncol;
- /* Initialize pointers for workspace */
- /* f3xact */
- i31 = 1;
- i32 = i31 + nco;
- i33 = i32 + nco;
- i34 = i33 + nco;
- i35 = i34 + nco;
- i36 = i35 + nco;
- i37 = i36 + nco;
- i38 = i37 + nco;
- i39 = i38 + 400;
- i310 = 1;
- i311 = 401;
- /* f4xact */
- k = *nrow + *ncol + 1;
- i41 = 1;
- i42 = i41 + k;
- i43 = i42 + k;
- i44 = i43 + k;
- i45 = i44 + k;
- i46 = i45 + k;
- i47 = i46 + k * nco;
- i48 = 1;
-
- /* Compute row marginals and total */
- ntot = 0;
- for (i = 1; i <= *nrow; ++i) {
- iro[i] = 0;
- for (j = 1; j <= *ncol; ++j) {
- if (table[i + j * *ldtabl] < -1e-4)
- prterr(2, "All elements of TABLE must be positive.");
- iro[i] += (int) table[i + j * *ldtabl];
- }
- ntot += iro[i];
- }
-
- if (ntot == 0) {
- prterr(3, "All elements of TABLE are zero.\n"
- "PRT and PRE are set to missing values.");
- *pre = *prt = amiss;
- return;
- }
-
- /* Column marginals */
- for (i = 1; i <= *ncol; ++i) {
- ico[i] = 0;
- for (j = 1; j <= *nrow; ++j)
- ico[i] += (int) table[j + i * *ldtabl];
- }
-
- /* sort marginals */
- isort(nrow, &iro[1]);
- isort(ncol, &ico[1]);
-
- /* Determine row and column marginals.
- Define max(nrow,ncol) =: nco >= nro := min(nrow,ncol)
- nco is defined above
-
- Swap marginals if necessary to ico[1:nco] & iro[1:nro]
- */
- if (nr_gt_nc) {
- nro = *ncol;
- /* Swap marginals */
- for (i = 1; i <= nco; ++i) {
- itmp = iro[i];
- if (i <= nro)
- iro[i] = ico[i];
- ico[i] = itmp;
- }
- } else
- nro = *nrow;
-
-
- /* Get multiplers for stack */
- kyy[1] = 1;
- for (i = 2; i <= nro; ++i) {
- /* Hash table multipliers */
- if (iro[i - 1] + 1 <= imax / kyy[i - 1]) {
- kyy[i] = kyy[i - 1] * (iro[i - 1] + 1);
- j /= kyy[i - 1];
- } else
- goto L_ERR_5;
- }
- /* Maximum product */
- if (iro[nro - 1] + 1 <= imax / kyy[nro - 1]) {
- kmax = (iro[nro] + 1) * kyy[nro - 1];
- } else {
- L_ERR_5:
- prterr(5, "The hash table key cannot be computed because "
- "the largest key\n"
- "is larger than the largest representable int.\n"
- "The algorithm cannot proceed.\n"
- "Reduce the workspace size, or use `exact = FALSE'.");
- return;
- }
-
- /* Compute log factorials */
- fact[0] = 0.;
- fact[1] = 0.;
- if(ntot >= 2) fact[2] = log(2.);
-/* MM: old code assuming log() to be SLOW */
- for (i = 3; i <= ntot; i += 2) {
- fact[i] = fact[i - 1] + log((double) i);
- j = i + 1;
- if (j <= ntot)
- fact[j] = fact[i] + fact[2] + fact[j / 2] - fact[j / 2 - 1];
- }
- /* Compute obs := observed path length */
- obs = tol;
- ntot = 0;
- for (j = 1; j <= nco; ++j) {
- dd = 0.;
- for (i = 1; i <= nro; ++i) {
- if (nr_gt_nc) {
- dd += fact[(int) table[j + i * *ldtabl]];
- ntot += (int) table[j + i * *ldtabl];
- } else {
- dd += fact[(int) table[i + j * *ldtabl]];
- ntot += (int) table[i + j * *ldtabl];
- }
- }
- obs += fact[ico[j]] - dd;
- }
- /* Denominator of observed table: DRO */
- dro = f9xact(&nro, &ntot, &iro[1], fact);
- *prt = exp(obs - dro);
- /* Initialize pointers */
- k = nco;
- last = *ldkey + 1;
- jkey = *ldkey + 1;
- jstp = *ldstp + 1;
- jstp2 = *ldstp * 3 + 1;
- jstp3 = (*ldstp << 2) + 1;
- jstp4 = *ldstp * 5 + 1;
- ikkey = 0;
- ikstp = 0;
- ikstp2 = *ldstp << 1;
- ipo = 1;
- ipoin[1] = 1;
- stp[1] = 0.;
- ifrq[1] = 1;
- ifrq[ikstp2 + 1] = -1;
-
-Outer_Loop:
- kb = nco - k + 1;
- ks = 0;
- n = ico[kb];
- kd = nro + 1;
- kmax = nro;
- /* IDIF is the difference in going to the daughter */
- for (i = 1; i <= nro; ++i)
- idif[i] = 0;
-
- /* Generate the first daughter */
- do {
- --kd;
- ntot = min(n, iro[kd]);
- idif[kd] = ntot;
- if (idif[kmax] == 0)
- --kmax;
- n -= ntot;
- }
- while (n > 0 && kd != 1);
-
- if (n != 0) {
- goto L310;
- }
-
- k1 = k - 1;
- n = ico[kb];
- ntot = 0;
- for (i = kb + 1; i <= nco; ++i)
- ntot += ico[i];
-
-
-L150:
- /* Arc to daughter length=ICO(KB) */
- for (i = 1; i <= nro; ++i)
- irn[i] = iro[i] - idif[i];
-
- /* Sort irn */
- if (k1 > 1) {
- if (nro == 2) {
- if (irn[1] > irn[2]) {
- ii = irn[1];
- irn[1] = irn[2];
- irn[2] = ii;
- }
- } else if (nro == 3) {
- ii = irn[1];
- if (ii > irn[3]) {
- if (ii > irn[2]) {
- if (irn[2] > irn[3]) {
- irn[1] = irn[3];
- irn[3] = ii;
- } else {
- irn[1] = irn[2];
- irn[2] = irn[3];
- irn[3] = ii;
- }
- } else {
- irn[1] = irn[3];
- irn[3] = irn[2];
- irn[2] = ii;
- }
- } else if (ii > irn[2]) {
- irn[1] = irn[2];
- irn[2] = ii;
- } else if (irn[2] > irn[3]) {
- ii = irn[2];
- irn[2] = irn[3];
- irn[3] = ii;
- }
- } else {
- for (j = 2; j <= nro; ++j) {
- i = j - 1;
- ii = irn[j];
-
- while (ii < irn[i]) {
- irn[i + 1] = irn[i];
- --i;
- if (i == 0)
- break;
- }
- irn[i + 1] = ii;
- }
- }
- /* Adjust start for zero */
- for (i = 1; i <= nro; ++i) {
- if (irn[i] != 0)
- break;
- }
-
- nrb = i;
- nro2 = nro - i + 1;
- } else {
- nrb = 1;
- nro2 = nro;
- }
- /* Some table values */
- ddf = f9xact(&nro, &n, &idif[1], fact);
- drn = f9xact(&nro2, &ntot, &irn[nrb], fact) - dro + ddf;
- /* Get hash value */
- if (k1 > 1) {
- kval = irn[1] + irn[2] * kyy[2];
- for (i = 3; i <= nro; ++i) {
- kval += irn[i] * kyy[i];
- }
- /* Get hash table entry */
- i = kval % (*ldkey << 1) + 1;
- /* Search for unused location */
- for (itp = i; itp <= *ldkey << 1; ++itp) {
- ii = key2[itp];
- if (ii == kval) {
- goto L240;
- } else if (ii < 0) {
- key2[itp] = kval;
- dlp[itp] = 1.;
- dsp[itp] = 1.;
- goto L240;
- }
- }
-
- for (itp = 1; itp <= i - 1; ++itp) {
- ii = key2[itp];
- if (ii == kval) {
- goto L240;
- } else if (ii < 0) {
- key2[itp] = kval;
- dlp[itp] = 1.;
- goto L240;
- }
- }
-
- /* KH
- prterr(6, "LDKEY is too small.\n"
- "It is not possible to give the value of LDKEY required,\n"
- "but you could try doubling LDKEY (and possibly LDSTP).");
- */
- prterr(6, "LDKEY is too small for this problem.\n"
- "Try increasing the size of the workspace.");
- }
-
-L240:
- ipsh = (1);
- /* Recover pastp */
- ipn = ipoin[ipo + ikkey];
- pastp = stp[ipn + ikstp];
- ifreq = ifrq[ipn + ikstp];
- /* Compute shortest and longest path */
- if (k1 > 1) {
- obs2 = obs - fact[ico[kb + 1]] - fact[ico[kb + 2]] - ddf;
- for (i = 3; i <= k1; ++i) {
- obs2 -= fact[ico[kb + i]];
- }
- if (dlp[itp] > 0.) {
- dspt = obs - obs2 - ddf;
- /* Compute longest path */
- dlp[itp] = 0.;
- f3xact(&nro2, &irn[nrb], &k1, &ico[kb + 1], &dlp[itp],
- &ntot, fact, &iwk[i31], &iwk[i32], &iwk[i33],
- &iwk[i34], &iwk[i35], &iwk[i36], &iwk[i37],
- &iwk[i38], &iwk[i39], &rwk[i310], &rwk[i311], &tol);
- dlp[itp] = min(0., dlp[itp]);
- /* Compute shortest path */
- dsp[itp] = dspt;
- f4xact(&nro2, &irn[nrb], &k1, &ico[kb + 1], &dsp[itp], fact,
- &iwk[i47], &iwk[i41], &iwk[i42], &iwk[i43],
- &iwk[i44], &iwk[i45], &iwk[i46], &rwk[i48], &tol);
- dsp[itp] = min(0., dsp[itp] - dspt);
- /* Use chi-squared approximation? */
- if ((irn[nrb] * ico[kb + 1]) > ntot * emn) {
- ncell = 0.;
- for (i = 0; i < nro2; ++i)
- for (j = 1; j <= k1; ++j)
- if (irn[nrb + i] * ico[kb + j] >= ntot * *expect)
- ncell++;
-
- if (ncell * 100 >= k1 * nro2 * *percnt) {
- tmp = 0.;
- for (i = 0; i < nro2; ++i)
- tmp += (fact[irn[nrb + i]] -
- fact[irn[nrb + i] - 1]);
- tmp *= k1 - 1;
- for (j = 1; j <= k1; ++j)
- tmp += (nro2 - 1) * (fact[ico[kb + j]] -
- fact[ico[kb + j] - 1]);
- df = (double) ((nro2 - 1) * (k1 - 1));
- tmp += df * 1.83787706640934548356065947281;
- tmp -= (nro2 * k1 - 1) * (fact[ntot] - fact[ntot - 1]);
- tm[itp] = (obs - dro) * -2. - tmp;
- } else {
- /* tm(itp) set to a flag value */
- tm[itp] = -9876.;
- }
- } else {
- tm[itp] = -9876.;
- }
- }
- obs3 = obs2 - dlp[itp];
- obs2 -= dsp[itp];
- if (tm[itp] == -9876.) {
- chisq = (0);
- } else {
- chisq = (1);
- tmp = tm[itp];
- }
- } else {
- obs2 = obs - drn - dro;
- obs3 = obs2;
- }
-
-L300:
- /* Process node with new PASTP */
- if (pastp <= obs3) {
- /* Update pre */
- *pre += (double) ifreq * exp(pastp + drn);
- } else if (pastp < obs2) {
- if (chisq) {
- df = (double) ((nro2 - 1) * (k1 - 1));
-#ifdef USING_R
- pv = pgamma(fmax2(0., tmp + (pastp + drn) * 2.) / 2.,
- df / 2., /*scale = */ 1.,
- /*lower_tail = */FALSE, /*log_p = */ FALSE);
-#else
- d1 = max(0., tmp + (pastp + drn) * 2.) / 2.;
- d2 = df / 2.;
- pv = 1. - gammds(&d1, &d2, &ifault);
-#endif
- *pre += (double) ifreq * exp(pastp + drn) * pv;
- } else {
- /* Put daughter on queue */
- d1 = pastp + ddf;
- f5xact(&d1, &tol, &kval, &key[jkey], ldkey, &ipoin[jkey],
- &stp[jstp], ldstp, &ifrq[jstp], &ifrq[jstp2],
- &ifrq[jstp3], &ifrq[jstp4], &ifreq, &itop, &ipsh);
- ipsh = (0);
- }
- }
- /* Get next PASTP on chain */
- ipn = ifrq[ipn + ikstp2];
- if (ipn > 0) {
- pastp = stp[ipn + ikstp];
- ifreq = ifrq[ipn + ikstp];
- goto L300;
- }
- /* Generate a new daughter node */
- f7xact(&kmax, &iro[1], &idif[1], &kd, &ks, &iflag);
- if (iflag != 1) {
- goto L150;
- }
-
-L310:
- /* Go get a new mother from stage K */
- do {
- iflag = 1;
- f6xact(&nro, &iro[1], &iflag, &kyy[1], &key[ikkey + 1], ldkey,
- &last, &ipo);
- /* Update pointers */
- if (iflag != 3)
- goto Outer_Loop;
- /* else iflag == 3 : no additional nodes to process */
- --k;
- itop = 0;
- ikkey = jkey - 1;
- ikstp = jstp - 1;
- ikstp2 = jstp2 - 1;
- jkey = *ldkey - jkey + 2;
- jstp = *ldstp - jstp + 2;
- jstp2 = (*ldstp << 1) + jstp;
- for (i = 1; i <= *ldkey << 1; ++i)
- key2[i] = -9999;
-
- } while (k >= 2);
-}
-
-/*
- -----------------------------------------------------------------------
- Name: F3XACT
- Purpose: Computes the shortest path length for a given table.
- Usage: F3XACT (NROW, IROW, NCOL, ICOL, DLP, MM, FACT, ICO, IRO,
- IT, LB, NR, NT, NU, ITC, IST, STV, ALEN, TOL)
- Arguments:
- NROW - The number of rows in the table. (Input)
- IROW - Vector of length NROW containing the row sums
- for the table. (Input)
- NCOL - The number of columns in the table. (Input)
- ICOL - Vector of length K containing the column sums
- for the table. (Input)
- DLP - The longest path for the table. (Output)
- MM - The total count in the table. (Output)
- FACT - Vector containing the logarithms of factorials. (Input)
- ICO - Work vector of length MAX(NROW,NCOL).
- IRO - Work vector of length MAX(NROW,NCOL).
- IT - Work vector of length MAX(NROW,NCOL).
- LB - Work vector of length MAX(NROW,NCOL).
- NR - Work vector of length MAX(NROW,NCOL).
- NT - Work vector of length MAX(NROW,NCOL).
- NU - Work vector of length MAX(NROW,NCOL).
- ITC - Work vector of length 400.
- IST - Work vector of length 400.
- STV - Work vector of length 400.
- ALEN - Work vector of length MAX(NROW,NCOL).
- TOL - Tolerance. (Input)
- -----------------------------------------------------------------------
- */
-
-void
-f3xact(int *nrow, int *irow, int *ncol, int *icol, double *dlp,
- int *mm, double *fact, int *ico, int *iro, int *it,
- int *lb, int *nr, int *nt, int *nu, int *itc, int *ist,
- double *stv, double *alen, const double *tol)
-{
- /* Initialized data */
- static int ldst = 200;
- static int nst = 0;
- static int nitc = 0;
-
- /* Local variables */
- static int xmin;
- static int i, k;
- static double v;
- static int n11, n12, ii, nn, ks, ic1, ic2, nc1, nn1;
- static int nr1, nco;
- static double val;
- static int nct, ipn, irl, key, lev, itp, nro;
- static double vmn;
- static int nrt, kyy, nc1s;
-
- /* Parameter adjustments */
- --stv;
- --ist;
- --itc;
- --nu;
- --nt;
- --nr;
- --lb;
- --it;
- --iro;
- --ico;
- --icol;
- --irow;
-
- /* Function Body */
- for (i = 0; i <= *ncol; ++i) {
- alen[i] = 0.;
- }
- for (i = 1; i <= 400; ++i) {
- ist[i] = -1;
- }
- /* nrow is 1 */
- if (*nrow <= 1) {
- if (*nrow > 0) {
- *dlp -= fact[icol[1]];
- for (i = 2; i <= *ncol; ++i) {
- *dlp -= fact[icol[i]];
- }
- }
- return;
- }
- /* ncol is 1 */
- if (*ncol <= 1) {
- if (*ncol > 0) {
- *dlp = *dlp - fact[irow[1]] - fact[irow[2]];
- for (i = 3; i <= *nrow; ++i) {
- *dlp -= fact[irow[i]];
- }
- }
- return;
- }
- /* 2 by 2 table */
- if (*nrow * *ncol == 4) {
- n11 = (irow[1] + 1) * (icol[1] + 1) / (*mm + 2);
- n12 = irow[1] - n11;
- *dlp = *dlp - fact[n11] - fact[n12] - fact[icol[1] - n11]
- - fact[icol[2] - n12];
- return;
- }
- /* Test for optimal table */
- val = 0.;
- xmin = (0);
- if (irow[*nrow] <= irow[1] + *ncol) {
- f10act(nrow, &irow[1], ncol, &icol[1], &val, &xmin, fact,
- &lb[1], &nu[1], &nr[1]);
- }
- if (! xmin) {
- if (icol[*ncol] <= icol[1] + *nrow) {
- f10act(ncol, &icol[1], nrow, &irow[1], &val, &xmin, fact,
- &lb[1], &nu[1], &nr[1]);
- }
- }
-
- if (xmin) {
- *dlp -= val;
- return;
- }
- /* Setup for dynamic programming */
- nn = *mm;
- /* Minimize ncol */
- if (*nrow >= *ncol) {
- nro = *nrow;
- nco = *ncol;
- for (i = 1; i <= *nrow; ++i) {
- iro[i] = irow[i];
- }
- ico[1] = icol[1];
- nt[1] = nn - ico[1];
- for (i = 2; i <= *ncol; ++i) {
- ico[i] = icol[i];
- nt[i] = nt[i - 1] - ico[i];
- }
- } else {
- nro = *ncol;
- nco = *nrow;
- ico[1] = irow[1];
- nt[1] = nn - ico[1];
- for (i = 2; i <= *nrow; ++i) {
- ico[i] = irow[i];
- nt[i] = nt[i - 1] - ico[i];
- }
- for (i = 1; i <= *ncol; ++i)
- iro[i] = icol[i];
- }
- /* Initialize pointers */
- vmn = 1e10;
- nc1s = nco - 1;
- irl = 1;
- ks = 0;
- k = ldst;
- kyy = ico[nco] + 1;
-
-LnewNode: /* Setup to generate new node */
-
- lev = 1;
- nr1 = nro - 1;
- nrt = iro[irl];
- nct = ico[1];
- lb[1] = (int) ((double) ((nrt + 1) * (nct + 1)) /
- (double) (nn + nr1 * nc1s + 1) - *tol) - 1;
- nu[1] = (int) ((double) ((nrt + nc1s) * (nct + nr1)) /
- (double) (nn + nr1 + nc1s)) - lb[1] + 1;
- nr[1] = nrt - lb[1];
-
-LoopNode: /* Generate a node */
- --nu[lev];
- if (nu[lev] == 0) {
- if (lev == 1)
- goto L200;
-
- --lev;
- goto LoopNode;
- }
- ++lb[lev];
- --nr[lev];
-L120:
- alen[lev] = alen[lev - 1] + fact[lb[lev]];
- if (lev < nc1s) {
- nn1 = nt[lev];
- nrt = nr[lev];
- ++lev;
- nc1 = nco - lev;
- nct = ico[lev];
- lb[lev] = (int) ((double) ((nrt + 1) * (nct + 1)) /
- (double) (nn1 + nr1 * nc1 + 1) - *tol);
- nu[lev] = (int) ((double) ((nrt + nc1) * (nct + nr1)) /
- (double) (nn1 + nr1 + nc1) - lb[lev] + 1);
- nr[lev] = nrt - lb[lev];
- goto L120;
- }
- alen[nco] = alen[lev] + fact[nr[lev]];
- lb[nco] = nr[lev];
-
- v = val + alen[nco];
- if (nro == 2) {
- /* Only 1 row left */
- v = v + fact[ico[1] - lb[1]] + fact[ico[2] - lb[2]];
- for (i = 3; i <= nco; ++i) {
- v += fact[ico[i] - lb[i]];
- }
- if (v < vmn) {
- vmn = v;
- }
- } else if (nro == 3 && nco == 2) {
- /* 3 rows and 2 columns */
- nn1 = nn - iro[irl] + 2;
- ic1 = ico[1] - lb[1];
- ic2 = ico[2] - lb[2];
- n11 = (iro[irl + 1] + 1) * (ic1 + 1) / nn1;
- n12 = iro[irl + 1] - n11;
- v = v + fact[n11] + fact[n12] + fact[ic1 - n11]
- + fact[ic2 - n12];
- if (v < vmn) {
- vmn = v;
- }
- } else {
- /* Column marginals are new node */
- for (i = 1; i <= nco; ++i) {
- it[i] = ico[i] - lb[i];
- }
- /* Sort column marginals */
- if (nco == 2) {
- if (it[1] > it[2]) {
- ii = it[1];
- it[1] = it[2];
- it[2] = ii;
- }
- } else if (nco == 3) {
- ii = it[1];
- if (ii > it[3]) {
- if (ii > it[2]) {
- if (it[2] > it[3]) {
- it[1] = it[3];
- it[3] = ii;
- } else {
- it[1] = it[2];
- it[2] = it[3];
- it[3] = ii;
- }
- } else {
- it[1] = it[3];
- it[3] = it[2];
- it[2] = ii;
- }
- } else if (ii > it[2]) {
- it[1] = it[2];
- it[2] = ii;
- } else if (it[2] > it[3]) {
- ii = it[2];
- it[2] = it[3];
- it[3] = ii;
- }
- } else {
- isort(&nco, &it[1]);
- }
- /* Compute hash value */
- key = it[1] * kyy + it[2];
- for (i = 3; i <= nco; ++i) {
- key = it[i] + key * kyy;
- }
- if(key < 0)
- //PROBLEM "Bug in FEXACT: gave negative key" RECOVER(NULL_ENTRY);
- printf("Bug in FEXACT: gave negative key \n"); //xx:another one of my ugly kluges (R.D-U)
-
- /* Table index */
- ipn = key % ldst + 1;
-
- /* Find empty position */
- for (itp = ipn, ii = ks + ipn; itp <= ldst; ++itp, ++ii) {
- if (ist[ii] < 0) {
- goto L180;
- } else if (ist[ii] == key) {
- goto L190;
- }
- }
-
- for (itp = 1, ii = ks + 1; itp <= ipn - 1; ++itp, ++ii) {
- if (ist[ii] < 0) {
- goto L180;
- } else if (ist[ii] == key) {
- goto L190;
- }
- }
-
- prterr(30, "Stack length exceeded in f3xact.\n"
- "This problem should not occur.");
-
-L180: /* Push onto stack */
- ist[ii] = key;
- stv[ii] = v;
- ++nst;
- ii = nst + ks;
- itc[ii] = itp;
- goto LoopNode;
-
-L190: /* Marginals already on stack */
- stv[ii] = min(v, stv[ii]);
- }
- goto LoopNode;
-
-
-L200: /* Pop item from stack */
- if (nitc > 0) {
- /* Stack index */
- itp = itc[nitc + k] + k;
- --nitc;
- val = stv[itp];
- key = ist[itp];
- ist[itp] = -1;
- /* Compute marginals */
- for (i = nco; i >= 2; --i) {
- ico[i] = key % kyy;
- key /= kyy;
- }
- ico[1] = key;
- /* Set up nt array */
- nt[1] = nn - ico[1];
- for (i = 2; i <= nco; ++i)
- nt[i] = nt[i - 1] - ico[i];
-
- /* Test for optimality (L90) */
- xmin = (0);
- if (iro[nro] <= iro[irl] + nco) {
- f10act(&nro, &iro[irl], &nco, &ico[1], &val, &xmin, fact,
- &lb[1], &nu[1], &nr[1]);
- }
- if (!xmin && ico[nco] <= ico[1] + nro)
- f10act(&nco, &ico[1], &nro, &iro[irl], &val, &xmin, fact,
- &lb[1], &nu[1], &nr[1]);
- if (xmin) {
- if (vmn > val)
- vmn = val;
- goto L200;
- }
- else goto LnewNode;
-
- } else if (nro > 2 && nst > 0) {
- /* Go to next level */
- nitc = nst;
- nst = 0;
- k = ks;
- ks = ldst - ks;
- nn -= iro[irl];
- ++irl;
- --nro;
- goto L200;
- }
-
- *dlp -= vmn;
-}
-
-/*
- -----------------------------------------------------------------------
- Name: F4XACT
- Purpose: Computes the longest path length for a given table.
- Usage: CALL F4XACT (NROW, IROW, NCOL, ICOL, DSP, FACT, ICSTK,
- NCSTK, LSTK, MSTK, NSTK, NRSTK, IRSTK, YSTK,
- TOL)
- Arguments:
- NROW - The number of rows in the table. (Input)
- IROW - Vector of length NROW containing the row sums for the
- table. (Input)
- NCOL - The number of columns in the table. (Input)
- ICOL - Vector of length K containing the column sums for the
- table. (Input)
- DSP - The shortest path for the table. (Output)
- FACT - Vector containing the logarithms of factorials. (Input)
- ICSTK - NCOL by NROW+NCOL+1 work array.
- NCSTK - Work vector of length NROW+NCOL+1.
- LSTK - Work vector of length NROW+NCOL+1.
- MSTK - Work vector of length NROW+NCOL+1.
- NSTK - Work vector of length NROW+NCOL+1.
- NRSTK - Work vector of length NROW+NCOL+1.
- IRSTK - NROW by MAX(NROW,NCOL) work array.
- YSTK - Work vector of length NROW+NCOL+1.
- TOL - Tolerance. (Input)
- -----------------------------------------------------------------------
- */
-
-void
-f4xact(int *nrow, int *irow, int *ncol, int *icol, double *dsp,
- double *fact, int *icstk, int *ncstk, int *lstk, int *mstk,
- int *nstk, int *nrstk, int *irstk, double *ystk, const double *tol)
-{
- /* System generated locals */
- int ikh;
-
- /* Local variables */
- int i, j, k, l, m, n, mn, ic1, ir1, ict, irt, istk, nco, nro;
- double y, amx;
-
- /* Parameter adjustments */
- irstk -= *nrow + 1;
- --irow;
- icstk -= *ncol + 1;
- --icol;
- --ncstk;
- --lstk;
- --mstk;
- --nstk;
- --nrstk;
- --ystk;
-
- /* Function Body */
- /* Take care of the easy cases first */
- if (*nrow == 1) {
- for (i = 1; i <= *ncol; ++i) {
- *dsp -= fact[icol[i]];
- }
- return;
- }
- if (*ncol == 1) {
- for (i = 1; i <= *nrow; ++i) {
- *dsp -= fact[irow[i]];
- }
- return;
- }
- if (*nrow * *ncol == 4) {
- if (irow[2] <= icol[2]) {
- *dsp = *dsp - fact[irow[2]] - fact[icol[1]]
- - fact[icol[2] - irow[2]];
- } else {
- *dsp = *dsp - fact[icol[2]] - fact[irow[1]]
- - fact[irow[2] - icol[2]];
- }
- return;
- }
- /* initialization before loop */
- for (i = 1; i <= *nrow; ++i) {
- irstk[i + *nrow] = irow[*nrow - i + 1];
- }
- for (j = 1; j <= *ncol; ++j) {
- icstk[j + *ncol] = icol[*ncol - j + 1];
- }
-
- nro = *nrow;
- nco = *ncol;
- nrstk[1] = nro;
- ncstk[1] = nco;
- ystk[1] = 0.;
- y = 0.;
- istk = 1;
- l = 1;
- amx = 0.;
-
- /* First LOOP */
- do {
- ir1 = irstk[istk * *nrow + 1];
- ic1 = icstk[istk * *ncol + 1];
- if (ir1 > ic1) {
- if (nro >= nco) {
- m = nco - 1; n = 2;
- } else {
- m = nro; n = 1;
- }
- } else if (ir1 < ic1) {
- if (nro <= nco) {
- m = nro - 1; n = 1;
- } else {
- m = nco; n = 2;
- }
- } else {
- if (nro <= nco) {
- m = nro - 1; n = 1;
- } else {
- m = nco - 1; n = 2;
- }
- }
-
- L60:
- if (n == 1) {
- i = l; j = 1;
- } else {
- i = 1; j = l;
- }
-
- irt = irstk[i + istk * *nrow];
- ict = icstk[j + istk * *ncol];
- mn = irt;
- if (mn > ict) {
- mn = ict;
- }
- y += fact[mn];
- if (irt == ict) {
- --nro;
- --nco;
- f11act(&irstk[istk * *nrow + 1], &i, &nro,
- &irstk[(istk + 1) * *nrow + 1]);
- f11act(&icstk[istk * *ncol + 1], &j, &nco,
- &icstk[(istk + 1) * *ncol + 1]);
- } else if (irt > ict) {
- --nco;
- f11act(&icstk[istk * *ncol + 1], &j, &nco,
- &icstk[(istk + 1) * *ncol + 1]);
- ikh = irt - ict;
- f8xact(&irstk[istk * *nrow + 1], &ikh, &i,
- &nro, &irstk[(istk + 1) * *nrow + 1]);
- } else {
- --nro;
- f11act(&irstk[istk * *nrow + 1], &i, &nro,
- &irstk[(istk + 1) * *nrow + 1]);
- ikh = ict - irt;
- f8xact(&icstk[istk * *ncol + 1], &ikh, &j,
- &nco, &icstk[(istk + 1) * *ncol + 1]);
- }
-
- if (nro == 1) {
- for (k = 1; k <= nco; ++k) {
- y += fact[icstk[k + (istk + 1) * *ncol]];
- }
- break;
- }
- if (nco == 1) {
- for (k = 1; k <= nro; ++k) {
- y += fact[irstk[k + (istk + 1) * *nrow]];
- }
- break;
- }
-
- lstk[istk] = l;
- mstk[istk] = m;
- nstk[istk] = n;
- ++istk;
- nrstk[istk] = nro;
- ncstk[istk] = nco;
- ystk[istk] = y;
- l = 1;
- } while(1);/* end do */
-
-/* L90:*/
- if (y > amx) {
- amx = y;
- if (*dsp - amx <= *tol) {
- *dsp = 0.;
- return;
- }
- }
-
-L100:
- --istk;
- if (istk == 0) {
- *dsp -= amx;
- if (*dsp - amx <= *tol) {
- *dsp = 0.;
- }
- return;
- }
- l = lstk[istk] + 1;
-
-/* L110: */
- for(;; ++l) {
- if (l > mstk[istk]) goto L100;
-
- n = nstk[istk];
- nro = nrstk[istk];
- nco = ncstk[istk];
- y = ystk[istk];
- if (n == 1) {
- if (irstk[l + istk * *nrow] <
- irstk[l - 1 + istk * *nrow]) goto L60;
- }
- else if (n == 2) {
- if (icstk[l + istk * *ncol] <
- icstk[l - 1 + istk * *ncol]) goto L60;
- }
- }
-}
-
-/*
- -----------------------------------------------------------------------
- Name: F5XACT
- Purpose: Put node on stack in network algorithm.
- Usage: CALL F5XACT (PASTP, TOL, KVAL, KEY, LDKEY, IPOIN, STP,
- LDSTP, IFRQ, NPOIN, NR, NL, IFREQ, ITOP,
- IPSH)
- Arguments:
- PASTP - The past path length. (Input)
- TOL - Tolerance for equivalence of past path lengths. (Input)
- KVAL - Key value. (Input)
- KEY - Vector of length LDKEY containing the key values. (in/out)
- LDKEY - Length of vector KEY. (Input)
- IPOIN - Vector of length LDKEY pointing to the
- linked list of past path lengths. (in/out)
- STP - Vector of length LSDTP containing the
- linked lists of past path lengths. (in/out)
- LDSTP - Length of vector STP. (Input)
- IFRQ - Vector of length LDSTP containing the past path
- frequencies. (in/out)
- NPOIN - Vector of length LDSTP containing the pointers to
- the next past path length. (in/out)
- NR - Vector of length LDSTP containing the right object
- pointers in the tree of past path lengths. (in/out)
- NL - Vector of length LDSTP containing the left object
- pointers in the tree of past path lengths. (in/out)
- IFREQ - Frequency of the current path length. (Input)
- ITOP - Pointer to the top of STP. (Input)
- IPSH - Option parameter. (Input)
- If IPSH is true, the past path length is found in the
- table KEY. Otherwise the location of the past path
- length is assumed known and to have been found in
- a previous call. ==>>>>> USING "static" variables
- -----------------------------------------------------------------------
- */
-
-void
-f5xact(double *pastp, const double *tol, int *kval, int *key, int *ldkey,
- int *ipoin, double *stp, int *ldstp, int *ifrq, int *npoin,
- int *nr, int *nl, int *ifreq, int *itop, int *ipsh)
-{
- /* Local variables */
- static int itmp, ird, ipn, itp;
- double test1, test2;
-
- /* Parameter adjustments */
- --nl;
- --nr;
- --npoin;
- --ifrq;
- --stp;
- --ipoin;
- --key;
-
- /* Function Body */
- if (*ipsh) {
- /* Convert KVAL to int in range 1, ..., LDKEY. */
- ird = *kval % *ldkey + 1;
- /* Search for an unused location */
- for (itp = ird; itp <= *ldkey; ++itp) {
- if (key[itp] == *kval) {
- goto L40;
- }
- if (key[itp] < 0) {
- goto L30;
- }
- }
- for (itp = 1; itp <= ird - 1; ++itp) {
- if (key[itp] == *kval) {
- goto L40;
- }
- if (key[itp] < 0) {
- goto L30;
- }
- }
- /* Return if KEY array is full */
- /* KH
- prterr(6, "LDKEY is too small for this problem.\n"
- "It is not possible to estimate the value of LDKEY "
- "required,\n"
- "but twice the current value may be sufficient.");
- */
- prterr(6, "LDKEY is too small for this problem.\n"
- "Try increasing the size of the workspace.");
-
- /* Update KEY */
-L30:
- key[itp] = *kval;
- ++(*itop);
- ipoin[itp] = *itop;
- /* Return if STP array full */
- if (*itop > *ldstp) {
- /* KH
- prterr(7, "LDSTP is too small for this problem.\n"
- "It is not possible to estimate the value of LDSTP "
- "required,\n"
- "but twice the current value may be sufficient.");
- */
- prterr(7, "LDSTP is too small for this problem.\n"
- "Try increasing the size of the workspace.");
- }
- /* Update STP, etc. */
- npoin[*itop] = -1;
- nr[*itop] = -1;
- nl[*itop] = -1;
- stp[*itop] = *pastp;
- ifrq[*itop] = *ifreq;
- return;
- }
-
- /* Find location, if any, of pastp */
-L40:
- ipn = ipoin[itp];
- test1 = *pastp - *tol;
- test2 = *pastp + *tol;
-
-L50:
- if (stp[ipn] < test1) {
- ipn = nl[ipn];
- if (ipn > 0) {
- goto L50;
- }
- } else if (stp[ipn] > test2) {
- ipn = nr[ipn];
- if (ipn > 0) {
- goto L50;
- }
- } else {
- ifrq[ipn] += *ifreq;
- return;
- }
- /* Return if STP array full */
- ++(*itop);
- if (*itop > *ldstp) {
- /*
- prterr(7, "LDSTP is too small for this problem.\n"
- "It is not possible to estimate the value of LDSTP "
- "required,\n"
- "but twice the current value may be sufficient.");
- */
- prterr(7, "LDSTP is too small for this problem.\n"
- "Try increasing the size of the workspace.");
- return;
- }
- /* Find location to add value */
- ipn = ipoin[itp];
- itmp = ipn;
-
-L60:
- if (stp[ipn] < test1) {
- itmp = ipn;
- ipn = nl[ipn];
- if (ipn > 0) {
- goto L60;
- } else {
- nl[itmp] = *itop;
- }
- } else if (stp[ipn] > test2) {
- itmp = ipn;
- ipn = nr[ipn];
- if (ipn > 0) {
- goto L60;
- } else {
- nr[itmp] = *itop;
- }
- }
- /* Update STP, etc. */
- npoin[*itop] = npoin[itmp];
- npoin[itmp] = *itop;
- stp[*itop] = *pastp;
- ifrq[*itop] = *ifreq;
- nl[*itop] = -1;
- nr[*itop] = -1;
-}
-
-/*
- -----------------------------------------------------------------------
- Name: F6XACT
- Purpose: Pop a node off the stack.
- Usage: CALL F6XACT (NROW, IROW, IFLAG, KYY, KEY, LDKEY, LAST, IPN)
- Arguments:
- NROW - The number of rows in the table. (Input)
- IROW - Vector of length nrow containing the row sums on
- output. (Output)
- IFLAG - Set to 3 if there are no additional nodes to process.
- (Output)
- KYY - Constant mutlipliers used in forming the hash
- table key. (Input)
- KEY - Vector of length LDKEY containing the hash table
- keys. (In/out)
- LDKEY - Length of vector KEY. (Input)
- LAST - Index of the last key popped off the stack. (In/out)
- IPN - Pointer to the linked list of past path lengths. (Output)
- -----------------------------------------------------------------------
- */
-void
-f6xact(int *nrow, int *irow, int *iflag, int *kyy, int *key, int
- *ldkey, int *last, int *ipn)
-{
- int kval, j;
-
- /* Parameter adjustments */
- --key;
- --kyy;
- --irow;
-
- /* Function Body */
-L10:
- ++(*last);
- if (*last <= *ldkey) {
- if (key[*last] < 0) {
- goto L10;
- }
- /* Get KVAL from the stack */
- kval = key[*last];
- key[*last] = -9999;
- for (j = *nrow; j >= 2; --j) {
- irow[j] = kval / kyy[j];
- kval -= irow[j] * kyy[j];
- }
- irow[1] = kval;
- *ipn = *last;
- } else {
- *last = 0;
- *iflag = 3;
- }
- return;
-}
-
-/*
- -----------------------------------------------------------------------
- Name: F7XACT
- Purpose: Generate the new nodes for given marinal totals.
- Usage: CALL F7XACT (NROW, IMAX, IDIF, K, KS, IFLAG)
- Arguments:
- NROW - The number of rows in the table. (Input)
- IMAX - The row marginal totals. (Input)
- IDIF - The column counts for the new column. (in/out)
- K - Indicator for the row to decrement. (in/out)
- KS - Indicator for the row to increment. (in/out)
- IFLAG - Status indicator. (Output)
- If IFLAG is zero, a new table was generated. For
- IFLAG = 1, no additional tables could be generated.
- -----------------------------------------------------------------------
- */
-
-void
-f7xact(int *nrow, int *imax, int *idif, int *k, int *ks,
- int *iflag)
-
-{
- int i, m, k1, mm;
-
- /* Parameter adjustments */
- --idif;
- --imax;
-
- /* Function Body */
- *iflag = 0;
- /* Find node which can be incremented, ks */
- if (*ks == 0)
- do {
- ++(*ks);
- } while (idif[*ks] == imax[*ks]);
-
- /* Find node to decrement (>ks) */
- if (idif[*k] > 0 && *k > *ks) {
- --idif[*k];
- do {
- --(*k);
- } while(imax[*k] == 0);
-
- m = *k;
-
- /* Find node to increment (>=ks) */
- while (idif[m] >= imax[m]) {
- --m;
- }
- ++idif[m];
- /* Change ks */
- if (m == *ks) {
- if (idif[m] == imax[m]) {
- *ks = *k;
- }
- }
- }
- else {
- Loop:
- /* Check for finish */
- for (k1 = *k + 1; k1 <= *nrow; ++k1) {
- if (idif[k1] > 0) {
- goto L70;
- }
- }
- *iflag = 1;
- return;
-
- L70:
- /* Reallocate counts */
- mm = 1;
- for (i = 1; i <= *k; ++i) {
- mm += idif[i];
- idif[i] = 0;
- }
- *k = k1;
-
- do {
- --(*k);
- m = min(mm, imax[*k]);
- idif[*k] = m;
- mm -= m;
- } while (mm > 0 && *k != 1);
-
- /* Check that all counts reallocated */
- if (mm > 0) {
- if (k1 != *nrow) {
- *k = k1;
- goto Loop;
- }
- *iflag = 1;
- return;
- }
- /* Get ks */
- --idif[k1];
- *ks = 0;
- do {
- ++(*ks);
- if (*ks > *k) {
- return;
- }
- } while (idif[*ks] >= imax[*ks]);
- }
-}
-
-/*
- -----------------------------------------------------------------------
- Name: F8XACT
- Purpose: Routine for reducing a vector when there is a zero
- element.
- Usage: CALL F8XACT (IROW, IS, I1, IZERO, NEW)
- Arguments:
- IROW - Vector containing the row counts. (Input)
- IS - Indicator. (Input)
- I1 - Indicator. (Input)
- IZERO - Position of the zero. (Input)
- NEW - Vector of new row counts. (Output)
- -----------------------------------------------------------------------
- */
-
-void
-f8xact(int *irow, int *is, int *i1, int *izero, int *myNew)
-{
- int i;
-
- /* Parameter adjustments */
- --myNew;
- --irow;
-
- /* Function Body */
- for (i = 1; i < *i1; ++i)
- myNew[i] = irow[i];
-
- for (i = *i1; i <= *izero - 1; ++i) {
- if (*is >= irow[i + 1])
- break;
- myNew[i] = irow[i + 1];
- }
-
- myNew[i] = *is;
-
- for(;;) {
- ++i;
- if (i > *izero)
- return;
- myNew[i] = irow[i];
- }
-}
-
-/*
- -----------------------------------------------------------------------
- Name: F9XACT
- Purpose: Computes the log of a multinomial coefficient.
- Usage: F9XACT(N, MM, IR, FACT)
- Arguments:
- N - Length of IR. (Input)
- MM - Number for factorial in numerator. (Input)
- IR - Vector of length N containing the numbers for
- the denominator of the factorial. (Input)
- FACT - Table of log factorials. (Input)
- F9XACT - The log of the multinomal coefficient. (Output)
- -----------------------------------------------------------------------
- */
-
-double
-f9xact(int *n, int *mm, int *ir, double *fact)
-{
- double d;
- int k;
-
- d = fact[*mm];
- for (k = 0; k < *n; k++)
- d -= fact[ir[k]];
- return d;
-}
-
-/*
- -----------------------------------------------------------------------
- Name: F10ACT
- Purpose: Computes the shortest path length for special tables.
- Usage: F10ACT (NROW, IROW, NCOL, ICOL, VAL, XMIN, FACT, ND, NE, M)
- Arguments:
- NROW - The number of rows in the table. (Input)
- IROW - Vector of length NROW containing the row totals. (Input)
- NCOL - The number of columns in the table. (Input)
- ICO - Vector of length NCOL containing the column totals.(Input)
- VAL - The shortest path. (Output)
- XMIN - Set to true if shortest path obtained. (Output)
- FACT - Vector containing the logarithms of factorials. (Input)
- ND - Workspace vector of length NROW. (Input)
- NE - Workspace vector of length NCOL. (Input)
- M - Workspace vector of length NCOL. (Input)
-
- Chapter: STAT/LIBRARY Categorical and Discrete Data Analysis
- -----------------------------------------------------------------------
- */
-
-void
-f10act(int *nrow, int *irow, int *ncol, int *icol, double *val,
- int *xmin, double *fact, int *nd, int *ne, int *m)
-{
- /* Local variables */
- int i, is, ix, nrw1;
-
- /* Parameter adjustments */
- --m;
- --ne;
- --nd;
- --icol;
- --irow;
-
- /* Function Body */
- for (i = 1; i <= *nrow - 1; ++i)
- nd[i] = 0;
-
- is = icol[1] / *nrow;
- ix = icol[1] - *nrow * is;
- ne[1] = is;
- m[1] = ix;
- if (ix != 0)
- ++nd[ix];
-
- for (i = 2; i <= *ncol; ++i) {
- ix = icol[i] / *nrow;
- ne[i] = ix;
- is += ix;
- ix = icol[i] - *nrow * ix;
- m[i] = ix;
- if (ix != 0)
- ++nd[ix];
- }
-
- for (i = *nrow - 2; i >= 1; --i)
- nd[i] += nd[i + 1];
-
- ix = 0;
- nrw1 = *nrow + 1;
- for (i = *nrow; i >= 2; --i) {
- ix = ix + is + nd[nrw1 - i] - irow[i];
- if (ix < 0)
- return;
- }
-
- for (i = 1; i <= *ncol; ++i) {
- ix = ne[i];
- is = m[i];
- *val = *val + is * fact[ix + 1] + (*nrow - is) * fact[ix];
- }
- *xmin = (1);
-
- return;
-}
-
-/*
- -----------------------------------------------------------------------
- Name: F11ACT
- Purpose: Routine for revising row totals.
- Usage: CALL F11ACT (IROW, I1, I2, NEW)
- Arguments:
- IROW - Vector containing the row totals. (Input)
- I1 - Indicator. (Input)
- I2 - Indicator. (Input)
- NEW - Vector containing the row totals. (Output)
- -----------------------------------------------------------------------
- */
-void
-f11act(int *irow, int *i1, int *i2, int *myNew)
-{
- int i;
-
- /* Parameter adjustments */
- --myNew;
- --irow;
-
- for (i = 1; i <= (*i1 - 1); ++i) myNew[i] = irow[i];
- for (i = *i1; i <= *i2; ++i) myNew[i] = irow[i + 1];
-
- return;
-}
-
-/*
- -----------------------------------------------------------------------
- Name: prterr
- Purpose: Print an error message and stop.
- Usage: prterr(icode, mes)
- Arguments:
- icode - Integer code for the error message. (Input)
- mes - Character string containing the error message. (Input)
- -----------------------------------------------------------------------
- */
-void
-prterr(int icode, char *mes)
-{
-// PROBLEM "FEXACT error %d.\n%s", icode, mes RECOVER(NULL_ENTRY);
-// printf("FEXACT error %d.\n%s", icode, mes RECOVER(NULL_ENTRY));
- printf("FEXACT error %d.\n", icode); //xx:another one of my ugly kluges
- return;
-}
-
-/*
- -----------------------------------------------------------------------
- Name: iwork
- Purpose: Routine for allocating workspace.
- Usage: iwork (iwkmax, iwkpt, number, itype)
- Arguments:
- iwkmax - Maximum (int) amount of workspace. (Input)
- iwkpt - Amount of (int) workspace currently allocated. (in/out)
- number - Number of elements of workspace desired. (Input)
- itype - Workspace type. (Input)
- ITYPE TYPE
- 2 integer
- 3 float
- 4 double
- iwork(): Index in rwrk, dwrk, or iwrk of the beginning of
- the first free element in the workspace array. (Output)
- -----------------------------------------------------------------------
- */
-int
-iwork(int iwkmax, int *iwkpt, int number, int itype)
-{
- int i;
-
- i = *iwkpt;
- if (itype == 2 || itype == 3)
- *iwkpt += number;
- else { /* double */
- if (i % 2 != 0)
- ++i;
- *iwkpt += (number << 1);
- i /= 2;
- }
- if (*iwkpt > iwkmax)
- prterr(40, "Out of workspace.");
-
- return i;
-}
-
-#ifndef USING_R
-
-void isort(int *n, int *ix)
-{
-/*
- -----------------------------------------------------------------------
- Name: ISORT
- Purpose: Shell sort for an int vector.
- Usage: CALL ISORT (N, IX)
- Arguments:
- N - Lenth of vector IX. (Input)
- IX - Vector to be sorted. (in/out)
- -----------------------------------------------------------------------
- */
- static int ikey, i, j, m, il[10], kl, it, iu[10], ku;
-
- /* Parameter adjustments */
- --ix;
-
- /* Function Body */
- m = 1;
- i = 1;
- j = *n;
-
-L10:
- if (i >= j) {
- goto L40;
- }
- kl = i;
- ku = j;
- ikey = i;
- ++j;
- /* Find element in first half */
-L20:
- ++i;
- if (i < j) {
- if (ix[ikey] > ix[i]) {
- goto L20;
- }
- }
- /* Find element in second half */
-L30:
- --j;
- if (ix[j] > ix[ikey]) {
- goto L30;
- }
- /* Interchange */
- if (i < j) {
- it = ix[i];
- ix[i] = ix[j];
- ix[j] = it;
- goto L20;
- }
- it = ix[ikey];
- ix[ikey] = ix[j];
- ix[j] = it;
- /* Save upper and lower subscripts of the array yet to be sorted */
- if (m < 11) {
- if (j - kl < ku - j) {
- il[m - 1] = j + 1;
- iu[m - 1] = ku;
- i = kl;
- --j;
- } else {
- il[m - 1] = kl;
- iu[m - 1] = j - 1;
- i = j + 1;
- j = ku;
- }
- ++m;
- goto L10;
- } else {
- prterr(20, "This should never occur.");
- }
- /* Use another segment */
-L40:
- --m;
- if (m == 0) {
- return;
- }
- i = il[m - 1];
- j = iu[m - 1];
- goto L10;
-}
-
-double gammds(double *y, double *p, int *ifault)
-{
-/*
- -----------------------------------------------------------------------
- Name: GAMMDS
- Purpose: Cumulative distribution for the gamma distribution.
- Usage: PGAMMA (Q, ALPHA,IFAULT)
- Arguments:
- Q - Value at which the distribution is desired. (Input)
- ALPHA - Parameter in the gamma distribution. (Input)
- IFAULT - Error indicator. (Output)
- IFAULT DEFINITION
- 0 No error
- 1 An argument is misspecified.
- 2 A numerical error has occurred.
- PGAMMA - The cdf for the gamma distribution with parameter alpha
- evaluated at Q. (Output)
- -----------------------------------------------------------------------
-
- Algorithm AS 147 APPL. Statist. (1980) VOL. 29, P. 113
-
- Computes the incomplete gamma integral for positive parameters Y, P
- using and infinite series.
- */
-
- static double a, c, f, g;
- static int ifail;
-
- /* Checks for the admissibility of arguments and value of F */
- *ifault = 1;
- g = 0.;
- if (*y <= 0. || *p <= 0.) {
- return g;
- }
- *ifault = 2;
-
- /*
- ALOGAM is natural log of gamma function no need to test ifail as
- an error is impossible
- */
-
- a = *p + 1.;
- f = exp(*p * log(*y) - alogam(&a, &ifail) - *y);
- if (f == 0.) {
- return g;
- }
- *ifault = 0;
-
- /* Series begins */
- c = 1.;
- g = 1.;
- a = *p;
-L10:
- a += 1.;
- c = c * *y / a;
- g += c;
- if (c / g > 1e-6) {
- goto L10;
- }
- g *= f;
- return g;
-}
-
-/*
- -----------------------------------------------------------------------
- Name: ALOGAM
- Purpose: Value of the log-gamma function.
- Usage: ALOGAM (X, IFAULT)
- Arguments:
- X - Value at which the log-gamma function is to be evaluated.
- (Input)
- IFAULT - Error indicator. (Output)
- IFAULT DEFINITION
- 0 No error
- 1 X < 0
- ALGAMA - The value of the log-gamma function at XX. (Output)
- -----------------------------------------------------------------------
-
- Algorithm ACM 291, Comm. ACM. (1966) Vol. 9, P. 684
-
- Evaluates natural logarithm of gamma(x) for X greater than zero.
- */
-
-double alogam(double *x, int *ifault)
-{
- /* Initialized data */
- //printf("alogam x = %f\t%d\n",*x,*ifault);
- static double a1 = .918938533204673;
- static double a2 = 5.95238095238e-4;
- static double a3 = 7.93650793651e-4;
- static double a4 = .002777777777778;
- static double a5 = .083333333333333;
-
- /* Local variables */
- static double f, y, z;
-
- *ifault = 1;
- if (*x < 0.) {
- return(0.);
- }
- *ifault = 0;
- y = *x;
- f = 0.;
- if (y >= 7.) {
- goto L30;
- }
- f = y;
-L10:
- y += 1.;
- if (y >= 7.) {
- goto L20;
- }
- f *= y;
- goto L10;
-L20:
- f = -log(f);
-L30:
- z = 1. / (y * y);
-
- //printf("returning %f\n",(f + (y - .5) * log(y) - y + a1 + (((-a2 * z + a3) * z - a4) * z + a5) / y));
- return(f + (y - .5) * log(y) - y + a1 +
- (((-a2 * z + a3) * z - a4) * z + a5) / y);
-}
-
-
-#endif /* not USING_R */
-
+++ /dev/null
-#ifndef GUARD_fisher2
-#define GUARD_fisher2
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define SINT_MAX INT_MAX
-
-#define max(a, b) ((a) < (b) ? (b) : (a))
-#define min(a, b) ((a) > (b) ? (b) : (a))
-
-
-void fexact(int *, int *, double *, int *,
- double *, double *, double *, double *,
- double *, int *);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
-
try{
maxFlows = mF;
- if(endFlow > maxFlows){ endFlow = maxFlows; }
+ if(endFlow > maxFlows){ endFlow = maxFlows; }
+ translateFlow();
}
catch(exception& e) {
string outfile = m->getRootName(squareFile) + "sorted.dist.temp";
//use the unix sort
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
string command = "sort -n " + squareFile + " -o " + outfile;
system(command.c_str());
#else //sort using windows sort
string outfile = m->getRootName(tempFile) + "sorted.dist.temp";
//use the unix sort
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
string command = "sort -n " + tempFile + " -o " + outfile;
system(command.c_str());
#else //sort using windows sort
helpString += "The output parameter allows you to output the list of names without the group and bin number added. \n";
helpString += "With this option you can use the names file as an input in get.seqs and remove.seqs commands. To do this enter output=accnos. \n";
helpString += "The get.sharedseqs command outputs a .names file for each distance level containing a list of sequences in the OTUs shared by the groups specified.\n";
- helpString += "The get.sharedseqs command should be in the following format: get.sharedseqs(label=yourLabels, groups=yourGroups, fasta=yourFastafile, output=yourOutput).\n";
- helpString += "Example get.sharedseqs(list=amazon.fn.list, label=unique-0.01, group=forest-pasture, fasta=amazon.fasta, output=accnos).\n";
+ helpString += "The get.sharedseqs command should be in the following format: get.sharedseqs(list=yourListFile, group=yourGroupFile, label=yourLabels, unique=yourGroups, fasta=yourFastafile, output=yourOutput).\n";
+ helpString += "Example get.sharedseqs(list=amazon.fn.list, label=unique-0.01, group= amazon.groups, unique=forest-pasture, fasta=amazon.fasta, output=accnos).\n";
helpString += "The output to the screen is the distance and the number of otus at that distance for the groups you specified.\n";
helpString += "The default value for label is all labels in your inputfile. The default for groups is all groups in your file.\n";
helpString += "Note: No spaces between parameter labels (i.e. label), '=' and parameters (i.e.yourLabel).\n";
exit(1);
}
}
-/***********************************************************************/
+***********************************************************************/
int HCluster::processFile() {
try {
string firstName, secondName;
public:
ignoreGaps() {}
- ignoreGaps(const ignoreGaps& ddb) {}
void calcDist(Sequence A, Sequence B){
int diff = 0;
try {
vector<float> pvalues;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
pvalues = driver(groupings, groupingsMap, num, indicatorValues, iters);
for (int i = 0; i < pvalues.size(); i++) { pvalues[i] /= (double)iters; }
try {
vector<float> pvalues;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
pvalues = driver(groupings, groupingsMap, num, indicatorValues, iters);
for (int i = 0; i < pvalues.size(); i++) { pvalues[i] /= (double)iters; }
public:
KmerDB(string, int);
- KmerDB(const KmerDB& kdb) : kmerSize(kdb.kmerSize), maxKmer(kdb.maxKmer), count(kdb.count), kmerDBName(kdb.kmerDBName), kmerLocations(kdb.kmerLocations), Database(kdb) {}
KmerDB();
~KmerDB();
#include "linearalgebra.h"
+// This class references functions used from "Numerical Recipes in C++" //
+
+/*********************************************************************************************************************************/
+// Squares its argument (a multiplied by itself).
+inline double SQR(const double a)
+{
+    const double squared = a * a;
+    return squared;
+}
/*********************************************************************************************************************************/
inline double SIGN(const double a, const double b)
return b>=0 ? (a>=0 ? a:-a) : (a>=0 ? -a:a);
}
/*********************************************************************************************************************************/
+//Numerical Recipes pg. 245 - Returns the complementary error function erfc(x) with fractional error everywhere less than 1.2 x 10^-7.
+//The approximation is evaluated for |x|; a negative x is mapped back with 2.0 - ans.
+double LinearAlgebra::erfcc(double x){
+    try {
+        double t,z,ans;
+        z=fabs(x);              //work with the magnitude, the sign is restored at the end
+        t=1.0/(1.0+0.5*z);
+
+        //nested polynomial in t inside the exponential
+        ans=t*exp(-z*z-1.26551223+t*(1.00002368+t*(0.37409196+t*(0.09678418+
+                t*(-0.18628806+t*(0.27886807+t*(-1.13520398+t*(1.48851587+
+                t*(-0.82215223+t*0.17087277)))))))));
+
+        return (x >= 0.0 ? ans : 2.0 - ans);
+    }
+    catch(exception& e) {
+        //report the failure under this function's own name (was mislabelled "betai")
+        m->errorOut(e, "LinearAlgebra", "erfcc");
+        exit(1);
+    }
+}
+/*********************************************************************************************************************************/
+//Numerical Recipes pg. 232
+double LinearAlgebra::betai(const double a, const double b, const double x) {
+    try {
+        //x outside [0,1] is rejected: an error is printed, control_pressed is raised and 0.0 is returned
+        const bool outOfRange = (x < 0.0) || (x > 1.0);
+        if (outOfRange) { m->mothurOut("[ERROR]: bad x in betai.\n"); m->control_pressed = true; return 0.0; }
+
+        //factor in front of the continued fraction; it stays 0 when x is exactly 0 or 1
+        double prefactor = 0.0;
+        if (x != 0.0 && x != 1.0) {
+            prefactor = exp(gammln(a+b)-gammln(a)-gammln(b)+a*log(x)+b*log(1.0-x));
+        }
+
+        //below the threshold evaluate betacf directly, otherwise evaluate it with a/b swapped at 1-x
+        if (x < (a+1.0) / (a+b+2.0)) {
+            return prefactor*betacf(a,b,x)/a;
+        }
+        return 1.0-prefactor*betacf(b,a,1.0-x)/b;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LinearAlgebra", "betai");
+        exit(1);
+    }
+}
+/*********************************************************************************************************************************/
+//Numerical Recipes pg. 219
+double LinearAlgebra::gammln(const double xx) {
+    try {
+        //coefficients of the six-term series
+        static const double cof[6]={76.18009172947146,-86.50532032941677,24.01409824083091,
+            -1.231739572450155,0.120858003e-2,-0.536382e-5};
+
+        const double x = xx;
+        double denom = xx;          //incremented before each series term
+
+        double tmp = x+5.5;
+        tmp -= (x+0.5)*log(tmp);
+
+        //accumulate 1 + sum(cof[j] / (xx + j + 1))
+        double ser = 1.0;
+        int j = 0;
+        while (j < 6) {
+            ++denom;
+            ser += cof[j]/denom;
+            j++;
+        }
+
+        return -tmp+log(2.5066282746310005*ser/x);
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LinearAlgebra", "gammln");
+        exit(1);
+    }
+}
+/*********************************************************************************************************************************/
+//Numerical Recipes pg. 223
+double LinearAlgebra::gammp(const double a, const double x) {
+    try {
+        //reject x < 0 or a <= 0: print the error, raise control_pressed and return 0.0
+        if (x < 0.0 || a <= 0.0) { m->mothurOut("[ERROR]: Invalid arguments in routine GAMMP\n"); m->control_pressed = true; return 0.0;}
+
+        double gln;
+        const bool useSeries = x < (a+1.0);
+        if (!useSeries) {
+            //continued-fraction routine; the result is one minus the value it hands back
+            double gammcf;
+            gcf(gammcf,a,x,gln);
+            return 1.0-gammcf;
+        }
+
+        //series routine hands back the result directly
+        double gamser;
+        gser(gamser,a,x,gln);
+        return gamser;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LinearAlgebra", "gammp");
+        exit(1);
+    }
+}
+/*********************************************************************************************************************************/
+//Numerical Recipes pg. 223
+double LinearAlgebra::gammq(const double a, const double x) {
+    try {
+        double gamser,gammcf,gln;
+
+        //reject x < 0 or a <= 0: print the error, raise control_pressed and return 0.0
+        if (x < 0.0 || a <= 0.0) { m->mothurOut("[ERROR]: Invalid arguments in routine GAMMQ\n"); m->control_pressed = true; return 0.0; }
+
+        if (x < (a+1.0)) {
+            //series routine: the result is one minus the value gser hands back
+            gser(gamser,a,x,gln);
+            return 1.0-gamser;
+        }
+
+        //continued-fraction routine hands back the result directly
+        gcf(gammcf,a,x,gln);
+        return gammcf;
+    }
+    catch(exception& e) {
+        //report the failure under this function's own name (was mislabelled "gammp")
+        m->errorOut(e, "LinearAlgebra", "gammq");
+        exit(1);
+    }
+}
+/*********************************************************************************************************************************/
+//Numerical Recipes pg. 224
+double LinearAlgebra::gcf(double& gammcf, const double a, const double x, double& gln){ //continued-fraction evaluation; results are delivered through gammcf and gln, the return value is always 0.0
+ try {
+ const int ITMAX=100; //maximum number of iterations
+ const double EPS=numeric_limits<double>::epsilon(); //convergence tolerance
+ const double FPMIN=numeric_limits<double>::min()/EPS; //floor used to keep c and d away from zero
+ int i;
+ double an,b,c,d,del,h;
+
+ gln=gammln(a); //gln is an output: gammln(a)
+ b=x+1.0-a;
+ c=1.0/FPMIN;
+ d=1.0/b;
+ h=d;
+ for (i=1;i<=ITMAX;i++) {
+ an = -i*(i-a);
+ b += 2.0;
+ d=an*d+b;
+ if (fabs(d) < FPMIN) { d=FPMIN; } //d is inverted below, so clamp it first
+ c=b+an/c;
+ if (fabs(c) < FPMIN) { c=FPMIN; } //c is a divisor on the next pass
+ d=1.0/d;
+ del=d*c;
+ h *= del;
+ if (fabs(del-1.0) <= EPS) break; //converged: the multiplicative update is within EPS of 1
+ }
+ if (i > ITMAX) { m->mothurOut("[ERROR]: a too large, ITMAX too small in gcf\n"); m->control_pressed = true; } //loop ended without the break
+ gammcf=exp(-x+a*log(x)-gln)*h; //assigned even when the loop did not converge
+
+ return 0.0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LinearAlgebra", "gcf");
+ exit(1);
+ }
+
+}
+/*********************************************************************************************************************************/
+//Numerical Recipes pg. 223
+double LinearAlgebra::gser(double& gamser, const double a, const double x, double& gln) {
+    try {
+        //Series evaluation. Results are delivered through gamser and gln; the return value is always 0.0.
+        const int ITMAX = 100;   //maximum number of series terms
+        const double EPS = numeric_limits<double>::epsilon();
+
+        gln=gammln(a);
+        if (x <= 0.0) {
+            if (x < 0.0) { m->mothurOut("[ERROR]: x less than 0 in routine GSER\n"); m->control_pressed = true; }
+            gamser=0.0;
+            return 0.0;
+        }
+
+        double ap=a;
+        double sum=1.0/a;
+        double del=sum;
+        for (int n=0;n<ITMAX;n++) {
+            ++ap;
+            del *= x/ap;
+            sum += del;
+            if (fabs(del) < fabs(sum)*EPS) {    //latest term is negligible relative to the sum
+                gamser=sum*exp(-x+a*log(x)-gln);
+                return 0.0;
+            }
+        }
+
+        //No convergence within ITMAX terms. Previously gamser was left unassigned here, so callers
+        //read an indeterminate value; now the error is flagged like in gcf/betacf and the partial
+        //sum is handed back so the output is always defined.
+        m->mothurOut("[ERROR]: a too large, ITMAX too small in routine GSER\n"); m->control_pressed = true;
+        gamser=sum*exp(-x+a*log(x)-gln);
+        return 0.0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LinearAlgebra", "gser");
+        exit(1);
+    }
+}
+/*********************************************************************************************************************************/
+//Numerical Recipes pg. 233
+double LinearAlgebra::betacf(const double a, const double b, const double x) { //continued-fraction evaluation used by betai; returns the accumulated product h
+ try {
+ const int MAXIT = 100; //maximum number of iterations
+ const double EPS = numeric_limits<double>::epsilon(); //convergence tolerance
+ const double FPMIN = numeric_limits<double>::min() / EPS; //floor used to keep c and d away from zero
+ int m1, m2;
+ double aa, c, d, del, h, qab, qam, qap;
+
+ qab=a+b;
+ qap=a+1.0;
+ qam=a-1.0;
+ c=1.0;
+ d=1.0-qab*x/qap;
+ if (fabs(d) < FPMIN) d=FPMIN; //d is inverted on the next line
+ d=1.0/d;
+ h=d;
+ for (m1=1;m1<=MAXIT;m1++) { //each pass applies two update steps, one per aa value
+ m2=2*m1;
+ aa=m1*(b-m1)*x/((qam+m2)*(a+m2)); //coefficient for the first step
+ d=1.0+aa*d;
+ if (fabs(d) < FPMIN) d=FPMIN;
+ c=1.0+aa/c;
+ if (fabs(c) < FPMIN) c=FPMIN;
+ d=1.0/d;
+ h *= d*c;
+ aa = -(a+m1)*(qab+m1)*x/((a+m2)*(qap+m2)); //coefficient for the second step
+ d=1.0+aa*d;
+ if (fabs(d) < FPMIN) d=FPMIN;
+ c=1.0+aa/c;
+ if (fabs(c) < FPMIN) c=FPMIN;
+ d=1.0/d;
+ del=d*c;
+ h *= del;
+ if (fabs(del-1.0) < EPS) break; //converged: the second-step update is within EPS of 1
+ }
+
+ if (m1 > MAXIT) { m->mothurOut("[ERROR]: a or b too big or MAXIT too small in betacf."); m->mothurOutEndLine(); m->control_pressed = true; } //loop ended without the break
+ return h; //returned even when the loop did not converge
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "LinearAlgebra", "betacf");
+ exit(1);
+ }
+}
+/*********************************************************************************************************************************/
vector<vector<double> > LinearAlgebra::matrix_mult(vector<vector<double> > first, vector<vector<double> > second){
try {
double p = (numCoor - numDisCoor) / (float) count;
- //calc signif - zA - http://en.wikipedia.org/wiki/Kendall_tau_rank_correlation_coefficient#Significance_tests
- double numer = 3.0 * (numCoor - numDisCoor);
- int n = xscores.size();
- double denom = n * (n-1) * (2*n + 5) / (double) 2.0;
- denom = sqrt(denom);
- sig = numer / denom;
-
- if (isnan(sig) || isinf(sig)) { sig = 0.0; }
+ sig = calcKendallSig(x.size(), p);
return p;
}
exit(1);
}
}
+double LinearAlgebra::ran0(int& idum) //one step of a multiplicative congruential generator; idum is the state, updated in place, and the scaled state is returned
+{
+ const int IA=16807,IM=2147483647,IQ=127773; //IQ equals IM/IA
+ const int IR=2836,MASK=123459876; //IR equals IM%IA
+ const double AM=1.0/double(IM); //scales the integer state to a double
+ int k;
+ double ans;
+
+ idum ^= MASK; //XOR with MASK before the update (undone below)
+ k=idum/IQ;
+ idum=IA*(idum-k*IQ)-IR*k; //update written with the IQ/IR split rather than a direct IA*idum product
+ if (idum < 0) idum += IM;
+ ans=AM*idum;
+ idum ^= MASK; //XOR again so the caller's state is stored unmasked
+ return ans;
+}
+
+double LinearAlgebra::ran1(int &idum) //congruential generator with a shuffle table; idum is updated in place, the result is capped at RNMX
+{
+ const int IA=16807,IM=2147483647,IQ=127773,IR=2836,NTAB=32;
+ const int NDIV=(1+(IM-1)/NTAB); //maps iy to a table index in 0..NTAB-1
+ const double EPS=3.0e-16,AM=1.0/IM,RNMX=(1.0-EPS); //RNMX is the largest value ever returned
+ static int iy=0; //static: kept between calls
+ static vector<int> iv(NTAB); //static shuffle table, kept between calls
+ int j,k;
+ double temp;
+
+ if (idum <= 0 || !iy) { //(re)initialise when idum is non-positive or iy is still 0
+ if (-idum < 1) idum=1; //avoid a zero seed
+ else idum = -idum;
+ for (j=NTAB+7;j>=0;j--) { //the first 8 values are discarded, the last NTAB fill the table
+ k=idum/IQ;
+ idum=IA*(idum-k*IQ)-IR*k;
+ if (idum < 0) idum += IM;
+ if (j < NTAB) iv[j] = idum;
+ }
+ iy=iv[0];
+ }
+ k=idum/IQ;
+ idum=IA*(idum-k*IQ)-IR*k; //advance the state
+ if (idum < 0) idum += IM;
+ j=iy/NDIV; //table slot chosen from the previous output
+ iy=iv[j]; //output comes from the table...
+ iv[j] = idum; //...and the slot is refilled with the new state
+ if ((temp=AM*iy) > RNMX) return RNMX;
+ else return temp;
+}
+
+double LinearAlgebra::ran2(int &idum) //combines two congruential sequences (idum and the static idum2) through a shuffle table; result is capped at RNMX
+{
+ const int IM1=2147483563,IM2=2147483399;
+ const int IA1=40014,IA2=40692,IQ1=53668,IQ2=52774;
+ const int IR1=12211,IR2=3791,NTAB=32,IMM1=IM1-1;
+ const int NDIV=1+IMM1/NTAB; //maps iy to a table index
+ const double EPS=3.0e-16,RNMX=1.0-EPS,AM=1.0/double(IM1); //RNMX is the largest value ever returned
+ static int idum2=123456789,iy=0; //static: kept between calls
+ static vector<int> iv(NTAB); //static shuffle table, kept between calls
+ int j,k;
+ double temp;
+
+ if (idum <= 0) { //a non-positive idum requests (re)initialisation
+ idum=(idum==0 ? 1 : -idum); //avoid a zero seed
+ idum2=idum;
+ for (j=NTAB+7;j>=0;j--) { //the first 8 values are discarded, the last NTAB fill the table
+ k=idum/IQ1;
+ idum=IA1*(idum-k*IQ1)-k*IR1;
+ if (idum < 0) idum += IM1;
+ if (j < NTAB) iv[j] = idum;
+ }
+ iy=iv[0];
+ }
+ k=idum/IQ1;
+ idum=IA1*(idum-k*IQ1)-k*IR1; //advance the first sequence
+ if (idum < 0) idum += IM1;
+ k=idum2/IQ2;
+ idum2=IA2*(idum2-k*IQ2)-k*IR2; //advance the second sequence
+ if (idum2 < 0) idum2 += IM2;
+ j=iy/NDIV; //table slot chosen from the previous output
+ iy=iv[j]-idum2; //combine the table entry with the second sequence
+ iv[j] = idum; //refill the slot with the first sequence's new state
+ if (iy < 1) iy += IMM1;
+ if ((temp=AM*iy) > RNMX) return RNMX;
+ else return temp;
+}
+
+double LinearAlgebra::ran3(int &idum) //subtractive generator over the static table ma; returns mj*FAC
+{
+ static int inext,inextp; //static cursors into ma, kept between calls
+ static int iff=0; //0 until the table has been initialised once
+ const int MBIG=1000000000,MSEED=161803398,MZ=0;
+ const double FAC=(1.0/MBIG); //scales the integer result to a double
+ static vector<int> ma(56); //static table, kept between calls
+ int i,ii,k,mj,mk;
+
+ if (idum < 0 || iff == 0) { //initialise on the first call or whenever idum is negative
+ iff=1;
+ mj=labs(MSEED-labs(idum));
+ mj %= MBIG;
+ ma[55]=mj;
+ mk=1;
+ for (i=1;i<=54;i++) { //fill the remaining entries in a scattered order
+ ii=(21*i) % 55;
+ ma[ii]=mk;
+ mk=mj-mk;
+ if (mk < int(MZ)) mk += MBIG;
+ mj=ma[ii];
+ }
+ for (k=0;k<4;k++) //four mixing passes over the table
+ for (i=1;i<=55;i++) {
+ ma[i] -= ma[1+(i+30) % 55];
+ if (ma[i] < int(MZ)) ma[i] += MBIG;
+ }
+ inext=0;
+ inextp=31;
+ idum=1; //idum is overwritten so the next call does not re-initialise
+ }
+ if (++inext == 56) inext=1; //cursors wrap from 56 back to 1
+ if (++inextp == 56) inextp=1;
+ mj=ma[inext]-ma[inextp]; //new value is the difference of two table entries
+ if (mj < int(MZ)) mj += MBIG;
+ ma[inext]=mj; //store it back for later calls
+ return mj*FAC;
+}
+
+double LinearAlgebra::ran4(int &idum) //hashes (idums, idum) with psdes and turns the masked bits into a float; idum is incremented on every call
+{
+#if defined(vax) || defined(_vax_) || defined(__vax__) || defined(VAX)
+ static const unsigned long jflone = 0x00004080;
+ static const unsigned long jflmsk = 0xffff007f;
+#else
+ static const unsigned long jflone = 0x3f800000; //OR-ed into the result word below
+ static const unsigned long jflmsk = 0x007fffff; //selects the low 23 bits of the hashed word
+#endif
+ unsigned long irword,itemp,lword;
+ static int idums = 0; //static: kept between calls
+
+ if (idum < 0) { //a negative idum reseeds: its magnitude is stored in idums and idum restarts at 1
+ idums = -idum;
+ idum=1;
+ }
+ irword=idum;
+ lword=idums;
+ psdes(lword,irword); //scrambles both words in place
+ itemp=jflone | (jflmsk & irword);
+ ++idum;
+ return (*(float *)&itemp)-1.0; //NOTE(review): reads the first sizeof(float) bytes of an unsigned long through a float pointer; presumably assumes a 32-bit unsigned long and IEEE float layout - confirm on 64-bit or big-endian targets
+}
+
+void LinearAlgebra::psdes(unsigned long &lword, unsigned long &irword) //scrambles the two words in place over NITER rounds
+{
+ const int NITER=4; //number of rounds
+ static const unsigned long c1[NITER]={ //per-round constants XOR-ed with irword
+ 0xbaa96887L, 0x1e17d32cL, 0x03bcdc3cL, 0x0f33d1b2L};
+ static const unsigned long c2[NITER]={ //per-round constants XOR-ed with the half-swapped value
+ 0x4b0f3b58L, 0xe874f0c3L, 0x6955c5a6L, 0x55a7ca46L};
+ unsigned long i,ia,ib,iswap,itmph=0,itmpl=0;
+
+ for (i=0;i<NITER;i++) {
+ ia=(iswap=irword) ^ c1[i]; //iswap remembers the incoming irword
+ itmpl = ia & 0xffff; //low 16 bits
+ itmph = ia >> 16; //remaining high bits
+ ib=itmpl*itmpl+ ~(itmph*itmph); //NOTE(review): ~ and >> act on the full width of unsigned long; results presumably differ where that type is wider than 32 bits - confirm
+ irword=lword ^ (((ia = (ib >> 16) |
+ ((ib & 0xffff) << 16)) ^ c2[i])+itmpl*itmph);
+ lword=iswap; //the old irword becomes the new lword
+ }
+}
+/*********************************************************************************************************************************/
+double LinearAlgebra::calcKendallSig(double n, double r){
+    try {
+        //n is the number of observations, r the coefficient being tested
+        const double variance = (4.0*n+10.0)/(9.0*n*(n-1.0));
+        const double zScore = r/sqrt(variance);
+
+        //significance from the complementary error function of |z|/sqrt(2)
+        const double prob = erfcc(fabs(zScore)/1.4142136);
+
+        //a non-finite result is reported as 0.0
+        if (isnan(prob) || isinf(prob)) { return 0.0; }
+        return prob;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LinearAlgebra", "calcKendallSig");
+        exit(1);
+    }
+}
+
/*********************************************************************************************************************************/
double LinearAlgebra::calcSpearman(vector<double>& x, vector<double>& y, double& sig){
try {
if (x.size() != y.size()) { m->mothurOut("[ERROR]: vector size mismatch."); m->mothurOutEndLine(); return 0.0; }
//format data
+ double sf = 0.0; //f^3 - f where f is the number of ties in x;
+ double sg = 0.0; //f^3 - f where f is the number of ties in y;
map<float, int> tableX;
map<float, int>::iterator itTable;
vector<spearmanRank> xscores;
float thisrank = rankTotal / (float) xties.size();
rankx[xties[k].name] = thisrank;
}
+ int t = xties.size();
+ sf += (t*t*t-t);
xties.clear();
rankTotal = 0;
}
float thisrank = rankTotal / (float) yties.size();
rank[yties[k].name] = thisrank;
}
+ int t = yties.size();
+ sg += (t*t*t-t);
yties.clear();
rankTotal = 0;
}
p = (SX2 + SY2 - di) / (2.0 * sqrt((SX2*SY2)));
- //signifigance calc - http://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient
- double temp = (x.size()-2) / (double) (1- (p*p));
- temp = sqrt(temp);
- sig = p*temp;
- if (isnan(sig) || isinf(sig)) { sig = 0.0; }
-
+ //Numerical Recipes 646
+ sig = calcSpearmanSig(n, sf, sg, di);
+
return p;
}
catch(exception& e) {
m->errorOut(e, "LinearAlgebra", "calcSpearman");
exit(1);
}
-}
+}
+/*********************************************************************************************************************************/
+double LinearAlgebra::calcSpearmanSig(double n, double sf, double sg, double d){
+    try {
+        //n: number of observations, sf/sg: tie corrections for x and y, d: sum of squared rank differences
+        const double en = n;
+        const double en3n = en*en*en-en;
+        const double enPlus1 = en+1.0;
+
+        //probability based on the deviation of d from its expected value
+        const double aved = en3n/6.0-(sf+sg)/12.0;
+        const double tieFac = (1.0-sf/en3n)*(1.0-sg/en3n);
+        const double vard = ((en-1.0)*en*en*(enPlus1*enPlus1)/36.0)*tieFac;
+        const double zd = (d-aved)/sqrt(vard);
+        const double probd = erfcc(fabs(zd)/1.4142136);
+
+        //probability based on the rank correlation rs; stays 0.0 unless (rs+1)(1-rs) is positive
+        const double rs = (1.0-(6.0/en3n)*(d+(sf+sg)/12.0))/sqrt(tieFac);
+        const double rsFac = (rs+1.0)*(1.0-rs);
+        double probrs = 0.0;
+        if (rsFac > 0.0) {
+            const double t = rs*sqrt((en-2.0)/rsFac);
+            const double df = en-2.0;
+            probrs = betai(0.5*df,0.5,df/(df+t*t));
+        }
+
+        //smaller of probd and probrs is sig
+        double sig = (probd < probrs) ? probd : probrs;
+
+        //a non-finite result is reported as 0.0
+        if (isnan(sig) || isinf(sig)) { sig = 0.0; }
+
+        return sig;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LinearAlgebra", "calcSpearmanSig");
+        exit(1);
+    }
+}
/*********************************************************************************************************************************/
double LinearAlgebra::calcPearson(vector<double>& x, vector<double>& y, double& sig){
try {
r = numerator / denom;
- //signifigance calc - http://faculty.vassar.edu/lowry/ch4apx.html
- double temp = (1- (r*r)) / (double) (x.size()-2);
- temp = sqrt(temp);
- sig = r / temp;
- if (isnan(sig) || isinf(sig)) { sig = 0.0; }
+ //Numerical Recipes pg.644
+ sig = calcPearsonSig(x.size(), r);
return r;
}
m->errorOut(e, "LinearAlgebra", "calcPearson");
exit(1);
}
-}
+}
+/*********************************************************************************************************************************/
+double LinearAlgebra::calcPearsonSig(double n, double r){
+    try {
+        //n is the number of observations, r the correlation coefficient
+        const double TINY = 1.0e-20;    //keeps the ratio finite when r is exactly +1 or -1
+
+        //Fisher's z transformation
+        const double z = 0.5*log((1.0+r+TINY)/(1.0-r+TINY));
+
+        //The betai-based t-test variant (df = n-2, t = r*sqrt(df/((1-r+TINY)*(1+r+TINY))))
+        //was giving an error in betacf with sop files, so it is not used.
+        //Numerical Recipes says the erfcc form below gives approximately the same result.
+        const double prob = erfcc(fabs(z*sqrt(n-1.0))/1.4142136);
+
+        //a non-finite result is reported as 0.0
+        if (isnan(prob) || isinf(prob)) { return 0.0; }
+        return prob;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "LinearAlgebra", "calcPearsonSig");
+        exit(1);
+    }
+}
/*********************************************************************************************************************************/
vector<vector<double> > LinearAlgebra::getObservedEuclideanDistance(vector<vector<double> >& relAbundData){
double calcPearson(vector<double>&, vector<double>&, double&);
double calcSpearman(vector<double>&, vector<double>&, double&);
double calcKendall(vector<double>&, vector<double>&, double&);
-
-
+
+ double calcSpearmanSig(double, double, double, double); //length, f^3 - f where f is the number of ties in x, f^3 - f where f is the number of ties in y, sum of squared diffs in ranks. - designed to find the sig of one score.
+ double calcPearsonSig(double, double); //length, coeff.
+ double calcKendallSig(double, double); //length, coeff.
+
+
private:
MothurOut* m;
double pythag(double, double);
+ double betacf(const double, const double, const double);
+ double betai(const double, const double, const double);
+ double gammln(const double);
+ double gammp(const double, const double);
+ double gammq(const double, const double);
+ double gser(double&, const double, const double, double&);
+ double gcf(double&, const double, const double, double&);
+ double erfcc(double);
+
+ double ran0(int&); //for testing
+ double ran1(int&); //for testing
+ double ran2(int&); //for testing
+ double ran3(int&); //for testing
+ double ran4(int&); //for testing
+ void psdes(unsigned long &, unsigned long &); //for testing
+
};
#endif
CYGWIN_BUILD ?= no
USECOMPRESSION ?= no
MOTHUR_FILES="\"Enter_your_default_path_here\""
-RELEASE_DATE = "\"1/9/2012\""
-VERSION = "\"1.23.0\""
+RELEASE_DATE = "\"3/16/2012\""
+VERSION = "\"1.24.1\""
FORTAN_COMPILER = gfortran
+FORTRAN_FLAGS =
# Optimize to level 3:
CXXFLAGS += -O3
#if you are a linux user use the following line
#CXXFLAGS += -mtune=native -march=native -m64
- CXXFLAGS += -DBIT_VERSION
+ CXXFLAGS += -DBIT_VERSION
+ FORTRAN_FLAGS = -m64
endif
$(CXX) $(LDFLAGS) $(TARGET_ARCH) -o $@ $(OBJECTS) $(LIBS)
strip mothur
-
+
uchime:
cd uchime_src && ./mk && mv uchime .. && cd ..
fortranSource:
- ${FORTAN_COMPILER} -c *.f
+ ${FORTAN_COMPILER} -c $(FORTRAN_FLAGS) *.f
install : mothur
# cp mothur ../Release/mothur
}
}
-//***************************************************************************************************************
+***************************************************************************************************************
vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
try {
*/
#include "matrixoutputcommand.h"
-#include "sharedsobscollectsummary.h"
-#include "sharedchao1.h"
-#include "sharedace.h"
-#include "sharednseqs.h"
-#include "sharedjabund.h"
-#include "sharedsorabund.h"
-#include "sharedjclass.h"
-#include "sharedsorclass.h"
-#include "sharedjest.h"
-#include "sharedsorest.h"
-#include "sharedthetayc.h"
-#include "sharedthetan.h"
-#include "sharedkstest.h"
-#include "whittaker.h"
-#include "sharedochiai.h"
-#include "sharedanderbergs.h"
-#include "sharedkulczynski.h"
-#include "sharedkulczynskicody.h"
-#include "sharedlennon.h"
-#include "sharedmorisitahorn.h"
-#include "sharedbraycurtis.h"
-#include "sharedjackknife.h"
-#include "whittaker.h"
-#include "odum.h"
-#include "canberra.h"
-#include "structeuclidean.h"
-#include "structchord.h"
-#include "hellinger.h"
-#include "manhattan.h"
-#include "structpearson.h"
-#include "soergel.h"
-#include "spearman.h"
-#include "structkulczynski.h"
-#include "structchi2.h"
-#include "speciesprofile.h"
-#include "hamming.h"
-#include "gower.h"
-#include "memchi2.h"
-#include "memchord.h"
-#include "memeuclidean.h"
-#include "mempearson.h"
+#include "subsample.h"
//**********************************************************************************************************************
vector<string> MatrixOutputCommand::setParameters(){
try {
CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared);
CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
+ CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample);
CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
CommandParameter pcalc("calc", "Multiple", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-whittaker-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-hamming-structchi2-gower-memchi2-memchord-memeuclidean-mempearson", "jclass-thetayc", "", "", "",true,false); parameters.push_back(pcalc);
CommandParameter poutput("output", "Multiple", "lt-square", "lt", "", "", "",false,false); parameters.push_back(poutput);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
- CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
+ CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters);
+ CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
vector<string> myArray;
try {
string helpString = "";
ValidCalculators validCalculator;
- helpString += "The dist.shared command parameters are shared, groups, calc, output, processors and label. shared is a required, unless you have a valid current file.\n";
+ helpString += "The dist.shared command parameters are shared, groups, calc, output, processors, subsample, iters and label. shared is a required, unless you have a valid current file.\n";
helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included used.\n";
helpString += "The group names are separated by dashes. The label parameter allows you to select what distance levels you would like distance matrices created for, and is also separated by dashes.\n";
+ helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample.\n";
+ helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group.\n";
helpString += "The dist.shared command should be in the following format: dist.shared(groups=yourGroups, calc=yourCalcs, label=yourLabels).\n";
helpString += "The output parameter allows you to specify format of your distance matrix. Options are lt, and square. The default is lt.\n";
helpString += "Example dist.shared(groups=A-B-C, calc=jabund-sorabund).\n";
setParameters();
vector<string> tempOutNames;
outputTypes["phylip"] = tempOutNames;
+ outputTypes["subsample"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "MatrixOutputCommand", "MatrixOutputCommand");
//initialize outputTypes
vector<string> tempOutNames;
outputTypes["phylip"] = tempOutNames;
+ outputTypes["subsample"] = tempOutNames;
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
//remove citation from list of calcs
for (int i = 0; i < Estimators.size(); i++) { if (Estimators[i] == "citation") { Estimators.erase(Estimators.begin()+i); break; } }
}
-
+
+ temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
+ m->mothurConvert(temp, iters);
+
+ temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; }
+ if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; }
+ else {
+ if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later
+ else { subsample = false; }
+ }
+
+ if (subsample == false) { iters = 1; }
+
if (abort == false) {
ValidCalculators validCalculator;
lines[i].start = int (sqrt(float(i)/float(processors)) * numGroups);
lines[i].end = int (sqrt(float(i+1)/float(processors)) * numGroups);
}
+
+ if (subsample) {
+ if (subsampleSize == -1) { //user has not set size, set size = smallest samples size
+ subsampleSize = lookup[0]->getNumSeqs();
+ for (int i = 1; i < lookup.size(); i++) {
+ int thisSize = lookup[i]->getNumSeqs();
+
+ if (thisSize < subsampleSize) { subsampleSize = thisSize; }
+ }
+ }else {
+ m->clearGroups();
+ Groups.clear();
+ vector<SharedRAbundVector*> temp;
+ for (int i = 0; i < lookup.size(); i++) {
+ if (lookup[i]->getNumSeqs() < subsampleSize) {
+ m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine();
+ delete lookup[i];
+ }else {
+ Groups.push_back(lookup[i]->getGroup());
+ temp.push_back(lookup[i]);
+ }
+ }
+ lookup = temp;
+ m->setGroups(Groups);
+ }
+ }
if (m->control_pressed) { delete input; for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } m->clearGroups(); return 0; }
}
}
/***********************************************************/
-void MatrixOutputCommand::printSims(ostream& out, vector< vector<float> >& simMatrix) {
+void MatrixOutputCommand::printSims(ostream& out, vector< vector<double> >& simMatrix) {
try {
out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
try {
EstOutput data;
vector<SharedRAbundVector*> subset;
- vector< vector<seqDist> > calcDists; calcDists.resize(matrixCalculators.size()); //one for each calc, this will be used to make .dist files
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- if(processors == 1){
- driver(thisLookup, 0, numGroups, calcDists);
- }else{
- int process = 1;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
- driver(thisLookup, lines[process].start, lines[process].end, calcDists);
-
- string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(getpid()) + ".dist";
- ofstream outtemp;
- m->openOutputFile(tempdistFileName, outtemp);
-
- for (int i = 0; i < calcDists.size(); i++) {
- outtemp << calcDists[i].size() << endl;
-
- for (int j = 0; j < calcDists[i].size(); j++) {
- outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
- }
- }
- outtemp.close();
-
- exit(0);
- }else {
- m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
- for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
- exit(0);
- }
- }
-
- //parent do your part
- driver(thisLookup, lines[0].start, lines[0].end, calcDists);
-
- //force parent to wait until all the processes are done
- for (int i = 0; i < processIDS.size(); i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- for (int i = 0; i < processIDS.size(); i++) {
- string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(processIDS[i]) + ".dist";
- ifstream intemp;
- m->openInputFile(tempdistFileName, intemp);
-
- for (int k = 0; k < calcDists.size(); k++) {
- int size = 0;
- intemp >> size; m->gobble(intemp);
-
- for (int j = 0; j < size; j++) {
- int seq1 = 0;
- int seq2 = 0;
- float dist = 1.0;
-
- intemp >> seq1 >> seq2 >> dist; m->gobble(intemp);
-
- seqDist tempDist(seq1, seq2, dist);
- calcDists[k].push_back(tempDist);
- }
- }
- intemp.close();
- m->mothurRemove(tempdistFileName);
- }
-
+ vector< vector< vector<seqDist> > > calcDistsTotals; //each iter, one for each calc, then each groupCombos dists. this will be used to make .dist files
+
+ vector< vector<seqDist> > calcDists; calcDists.resize(matrixCalculators.size());
+
+ for (int thisIter = 0; thisIter < iters; thisIter++) {
+
+ vector<SharedRAbundVector*> thisItersLookup = thisLookup;
+
+ if (subsample) {
+ SubSample sample;
+ vector<string> tempLabels; //dont need since we arent printing the sampled sharedRabunds
+
+ //make copy of lookup so we don't get access violations
+ vector<SharedRAbundVector*> newLookup;
+ for (int k = 0; k < thisItersLookup.size(); k++) {
+ SharedRAbundVector* temp = new SharedRAbundVector();
+ temp->setLabel(thisItersLookup[k]->getLabel());
+ temp->setGroup(thisItersLookup[k]->getGroup());
+ newLookup.push_back(temp);
+ }
+
+ //for each bin
+ for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) {
+ if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
+ for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); }
+ }
+
+ tempLabels = sample.getSample(newLookup, subsampleSize);
+ thisItersLookup = newLookup;
+ }
+
+ if(processors == 1){
+ driver(thisItersLookup, 0, numGroups, calcDists);
+ }else{
+ int process = 1;
+ vector<int> processIDS;
+
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid);
+ process++;
+ }else if (pid == 0){
+
+ driver(thisItersLookup, lines[process].start, lines[process].end, calcDists);
+
+ string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(getpid()) + ".dist";
+ ofstream outtemp;
+ m->openOutputFile(tempdistFileName, outtemp);
+
+ for (int i = 0; i < calcDists.size(); i++) {
+ outtemp << calcDists[i].size() << endl;
+
+ for (int j = 0; j < calcDists[i].size(); j++) {
+ outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
+ }
+ }
+ outtemp.close();
+
+ exit(0);
+ }else {
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0);
+ }
+ }
+
+ //parent do your part
+ driver(thisItersLookup, lines[0].start, lines[0].end, calcDists);
+
+ //force parent to wait until all the processes are done
+ for (int i = 0; i < processIDS.size(); i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ for (int i = 0; i < processIDS.size(); i++) {
+ string tempdistFileName = m->getRootName(m->getSimpleName(sharedfile)) + toString(processIDS[i]) + ".dist";
+ ifstream intemp;
+ m->openInputFile(tempdistFileName, intemp);
+
+ for (int k = 0; k < calcDists.size(); k++) {
+ int size = 0;
+ intemp >> size; m->gobble(intemp);
+
+ for (int j = 0; j < size; j++) {
+ int seq1 = 0;
+ int seq2 = 0;
+ float dist = 1.0;
+
+ intemp >> seq1 >> seq2 >> dist; m->gobble(intemp);
+
+ seqDist tempDist(seq1, seq2, dist);
+ calcDists[k].push_back(tempDist);
+ }
+ }
+ intemp.close();
+ m->mothurRemove(tempdistFileName);
+ }
+ #else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the distSharedData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to pass results vectors.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<distSharedData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=1; i<processors; i++ ){
+
+ //make copy of lookup so we don't get access violations
+ vector<SharedRAbundVector*> newLookup;
+ for (int k = 0; k < thisItersLookup.size(); k++) {
+ SharedRAbundVector* temp = new SharedRAbundVector();
+ temp->setLabel(thisItersLookup[k]->getLabel());
+ temp->setGroup(thisItersLookup[k]->getGroup());
+ newLookup.push_back(temp);
+ }
+
+ //for each bin
+ for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) {
+ if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
+ for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); }
+ }
+
+ // Allocate memory for thread data.
+ distSharedData* tempSum = new distSharedData(m, lines[i].start, lines[i].end, Estimators, newLookup);
+ pDataArray.push_back(tempSum);
+ processIDS.push_back(i);
+
+ hThreadArray[i-1] = CreateThread(NULL, 0, MyDistSharedThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
+ }
+
+ //parent do your part
+ driver(thisItersLookup, lines[0].start, lines[0].end, calcDists);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ for (int j = 0; j < pDataArray[i]->thisLookup.size(); j++) { delete pDataArray[i]->thisLookup[j]; }
+
+ for (int k = 0; k < calcDists.size(); k++) {
+ int size = pDataArray[i]->calcDists[k].size();
+ for (int j = 0; j < size; j++) { calcDists[k].push_back(pDataArray[i]->calcDists[k][j]); }
+ }
+
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+ #endif
+ }
+
+ calcDistsTotals.push_back(calcDists);
+
+ if (subsample) {
+
+ //clean up memory
+ for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; }
+ thisItersLookup.clear();
+ for (int i = 0; i < calcDists.size(); i++) { calcDists[i].clear(); }
+ }
}
-#else
- driver(thisLookup, 0, numGroups, calcDists);
-#endif
- for (int i = 0; i < calcDists.size(); i++) {
- if (m->control_pressed) { break; }
-
- //initialize matrix
- vector< vector<float> > matrix; //square matrix to represent the distance
- matrix.resize(thisLookup.size());
- for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
-
- for (int j = 0; j < calcDists[i].size(); j++) {
- int row = calcDists[i][j].seq1;
- int column = calcDists[i][j].seq2;
- float dist = calcDists[i][j].dist;
-
- matrix[row][column] = dist;
- matrix[column][row] = dist;
- }
-
- string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist";
- outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
- ofstream outDist;
- m->openOutputFile(distFileName, outDist);
- outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
-
- printSims(outDist, matrix);
-
- outDist.close();
- }
+        // Every iteration's distances are stored in calcDistsTotals. The working
+        // buffer calcDists is emptied after each subsampled iteration, so all of the
+        // summary code below must read from calcDistsTotals, never from calcDists.
+        // If no iteration ran (iters < 1) there is nothing to summarise or print.
+        if (calcDistsTotals.empty()) { return 0; }
+
+        if (iters != 1) {
+            //we need to find the average distance and standard deviation for each groups distance
+
+            vector< vector<seqDist> > calcAverages; calcAverages.resize(matrixCalculators.size());
+            for (int i = 0; i < calcAverages.size(); i++) {  //initialize sums to zero.
+                calcAverages[i].resize(calcDistsTotals[0][i].size());
+
+                for (int j = 0; j < calcAverages[i].size(); j++) {
+                    // take the group pair from the first stored iteration; calcDists[i] is empty here
+                    calcAverages[i][j].seq1 = calcDistsTotals[0][i][j].seq1;
+                    calcAverages[i][j].seq2 = calcDistsTotals[0][i][j].seq2;
+                    calcAverages[i][j].dist = 0.0;
+                }
+            }
+
+            for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator
+                for (int i = 0; i < calcAverages.size(); i++) {
+                    for (int j = 0; j < calcAverages[i].size(); j++) {
+                        calcAverages[i][j].dist += calcDistsTotals[thisIter][i][j].dist;
+                    }
+                }
+            }
+
+            for (int i = 0; i < calcAverages.size(); i++) {  //finds average.
+                for (int j = 0; j < calcAverages[i].size(); j++) {
+                    calcAverages[i][j].dist /= (float) iters;
+                }
+            }
+
+            //find standard deviation
+            vector< vector<seqDist> > stdDev; stdDev.resize(matrixCalculators.size());
+            for (int i = 0; i < stdDev.size(); i++) {  //initialize sums to zero.
+                stdDev[i].resize(calcDistsTotals[0][i].size());
+
+                for (int j = 0; j < stdDev[i].size(); j++) {
+                    stdDev[i][j].seq1 = calcDistsTotals[0][i][j].seq1;
+                    stdDev[i][j].seq2 = calcDistsTotals[0][i][j].seq2;
+                    stdDev[i][j].dist = 0.0;
+                }
+            }
+
+            for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each
+                for (int i = 0; i < stdDev.size(); i++) {
+                    for (int j = 0; j < stdDev[i].size(); j++) {
+                        double diff = calcDistsTotals[thisIter][i][j].dist - calcAverages[i][j].dist;
+                        stdDev[i][j].dist += (diff * diff);
+                    }
+                }
+            }
+
+            for (int i = 0; i < stdDev.size(); i++) {  //mean of the squared differences, then its square root.
+                for (int j = 0; j < stdDev[i].size(); j++) {
+                    stdDev[i][j].dist /= (float) iters;
+                    stdDev[i][j].dist = sqrt(stdDev[i][j].dist);
+                }
+            }
+
+            //print results: one .ave.dist and one .std.dist file per calculator
+            for (int i = 0; i < calcAverages.size(); i++) {
+                if (m->control_pressed) { break; }
+
+                vector< vector<double> > matrix; //square matrix to represent the distance
+                matrix.resize(thisLookup.size());
+                for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
+
+                vector< vector<double> > stdmatrix; //square matrix to represent the stdDev
+                stdmatrix.resize(thisLookup.size());
+                for (int k = 0; k < thisLookup.size(); k++) {  stdmatrix[k].resize(thisLookup.size(), 0.0); }
+
+                for (int j = 0; j < calcAverages[i].size(); j++) {
+                    int row = calcAverages[i][j].seq1;
+                    int column = calcAverages[i][j].seq2;
+                    double dist = calcAverages[i][j].dist;
+                    double stdDist = stdDev[i][j].dist;
+
+                    // distances are symmetric, so fill both triangles
+                    matrix[row][column] = dist;
+                    matrix[column][row] = dist;
+                    stdmatrix[row][column] = stdDist;
+                    stdmatrix[column][row] = stdDist;
+                }
+
+                string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".ave.dist";
+                outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
+                ofstream outAve;
+                m->openOutputFile(distFileName, outAve);
+                outAve.setf(ios::fixed, ios::floatfield); outAve.setf(ios::showpoint);
+
+                printSims(outAve, matrix);
+
+                outAve.close();
+
+                distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".std.dist";
+                outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
+                ofstream outSTD;
+                m->openOutputFile(distFileName, outSTD);
+                outSTD.setf(ios::fixed, ios::floatfield); outSTD.setf(ios::showpoint);
+
+                printSims(outSTD, stdmatrix);
+
+                outSTD.close();
+
+            }
+        }else {
+
+            // Single run. Read the stored copy of that run: when subsampling is on,
+            // calcDists has already been cleared and would yield an all-zero matrix.
+            for (int i = 0; i < calcDistsTotals[0].size(); i++) {
+                if (m->control_pressed) { break; }
+
+                const vector<seqDist>& dists = calcDistsTotals[0][i];
+
+                //initialize matrix
+                vector< vector<double> > matrix; //square matrix to represent the distance
+                matrix.resize(thisLookup.size());
+                for (int k = 0; k < thisLookup.size(); k++) {  matrix[k].resize(thisLookup.size(), 0.0); }
+
+                for (int j = 0; j < dists.size(); j++) {
+                    int row = dists[j].seq1;
+                    int column = dists[j].seq2;
+                    double dist = dists[j].dist;
+
+                    matrix[row][column] = dist;
+                    matrix[column][row] = dist;
+                }
+
+                string distFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel()  + "." + output + ".dist";
+                outputNames.push_back(distFileName); outputTypes["phylip"].push_back(distFileName);
+                ofstream outDist;
+                m->openOutputFile(distFileName, outDist);
+                outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+
+                printSims(outDist, matrix);
+
+                outDist.close();
+            }
+        }
return 0;
}
/**************************************************************************************************/
int MatrixOutputCommand::driver(vector<SharedRAbundVector*> thisLookup, int start, int end, vector< vector<seqDist> >& calcDists) {
try {
-
vector<SharedRAbundVector*> subset;
for (int k = start; k < end; k++) { // pass cdd each set of groups to compare
#include "inputdata.h"
#include "groupmap.h"
#include "validcalculator.h"
+#include "sharedsobscollectsummary.h"
+#include "sharedchao1.h"
+#include "sharedace.h"
+#include "sharednseqs.h"
+#include "sharedjabund.h"
+#include "sharedsorabund.h"
+#include "sharedjclass.h"
+#include "sharedsorclass.h"
+#include "sharedjest.h"
+#include "sharedsorest.h"
+#include "sharedthetayc.h"
+#include "sharedthetan.h"
+#include "sharedkstest.h"
+#include "whittaker.h"
+#include "sharedochiai.h"
+#include "sharedanderbergs.h"
+#include "sharedkulczynski.h"
+#include "sharedkulczynskicody.h"
+#include "sharedlennon.h"
+#include "sharedmorisitahorn.h"
+#include "sharedbraycurtis.h"
+#include "sharedjackknife.h"
+#include "whittaker.h"
+#include "odum.h"
+#include "canberra.h"
+#include "structeuclidean.h"
+#include "structchord.h"
+#include "hellinger.h"
+#include "manhattan.h"
+#include "structpearson.h"
+#include "soergel.h"
+#include "spearman.h"
+#include "structkulczynski.h"
+#include "structchi2.h"
+#include "speciesprofile.h"
+#include "hamming.h"
+#include "gower.h"
+#include "memchi2.h"
+#include "memchord.h"
+#include "memeuclidean.h"
+#include "mempearson.h"
+
// aka. dist.shared()
};
vector<linePair> lines;
- void printSims(ostream&, vector< vector<float> >&);
+ void printSims(ostream&, vector< vector<double> >&);
int process(vector<SharedRAbundVector*>);
vector<Calculator*> matrixCalculators;
InputData* input;
vector<SharedRAbundVector*> lookup;
string exportFileName, output, sharedfile;
- int numGroups, processors;
+ int numGroups, processors, iters, subsampleSize;
ofstream out;
- bool abort, allLines;
+ bool abort, allLines, subsample;
set<string> labels; //holds labels to be used
string outputFile, calc, groups, label, outputDir;
vector<string> Estimators, Groups, outputNames; //holds estimators to be used
};
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct distSharedData {
+    vector<SharedRAbundVector*> thisLookup;   // groups the worker compares; the spawning code deletes these after the thread finishes
+    vector< vector<seqDist> > calcDists;      // output: one list of pairwise distances per calculator, filled by the worker
+    vector<string> Estimators;                // names of the calculators the worker should instantiate
+    unsigned long long start;                 // first row (group index) handled by this worker
+    unsigned long long end;                   // one past the last row handled by this worker
+    MothurOut* m;                             // shared logger / control_pressed flag
+
+    // Default construction leaves an empty work item with well-defined scalars
+    // (previously start, end and m were left uninitialised).
+    distSharedData() : start(0), end(0), m(NULL) {}
+
+    // Initialiser order follows member declaration order.
+    distSharedData(MothurOut* mout, unsigned long long st, unsigned long long en, vector<string> est, vector<SharedRAbundVector*> lu)
+        : thisLookup(lu), Estimators(est), start(st), end(en), m(mout) {}
+};
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+// Thread entry point used when the fork()-based path is not compiled in.
+// lpParam is a distSharedData*: the worker builds its own calculator objects from
+// pDataArray->Estimators, computes every pair (l, k) with start <= k < end and l < k,
+// and appends one seqDist per calculator to pDataArray->calcDists.
+// Returns 0 on completion and 1 if control_pressed was seen; exits the process on exception.
+static DWORD WINAPI MyDistSharedThreadFunction(LPVOID lpParam){
+    distSharedData* pDataArray;
+    pDataArray = (distSharedData*)lpParam;
+
+    try {
+
+        vector<Calculator*> matrixCalculators;
+        ValidCalculators validCalculator;
+        for (int i=0; i<pDataArray->Estimators.size(); i++) {
+            if (validCalculator.isValidCalculator("matrix", pDataArray->Estimators[i]) == true) {
+                if (pDataArray->Estimators[i] == "sharedsobs") {
+                    matrixCalculators.push_back(new SharedSobsCS());
+                }else if (pDataArray->Estimators[i] == "sharedchao") {
+                    matrixCalculators.push_back(new SharedChao1());
+                }else if (pDataArray->Estimators[i] == "sharedace") {
+                    matrixCalculators.push_back(new SharedAce());
+                }else if (pDataArray->Estimators[i] == "jabund") {
+                    matrixCalculators.push_back(new JAbund());
+                }else if (pDataArray->Estimators[i] == "sorabund") {
+                    matrixCalculators.push_back(new SorAbund());
+                }else if (pDataArray->Estimators[i] == "jclass") {
+                    matrixCalculators.push_back(new Jclass());
+                }else if (pDataArray->Estimators[i] == "sorclass") {
+                    matrixCalculators.push_back(new SorClass());
+                }else if (pDataArray->Estimators[i] == "jest") {
+                    matrixCalculators.push_back(new Jest());
+                }else if (pDataArray->Estimators[i] == "sorest") {
+                    matrixCalculators.push_back(new SorEst());
+                }else if (pDataArray->Estimators[i] == "thetayc") {
+                    matrixCalculators.push_back(new ThetaYC());
+                }else if (pDataArray->Estimators[i] == "thetan") {
+                    matrixCalculators.push_back(new ThetaN());
+                }else if (pDataArray->Estimators[i] == "kstest") {
+                    matrixCalculators.push_back(new KSTest());
+                }else if (pDataArray->Estimators[i] == "sharednseqs") {
+                    matrixCalculators.push_back(new SharedNSeqs());
+                }else if (pDataArray->Estimators[i] == "ochiai") {
+                    matrixCalculators.push_back(new Ochiai());
+                }else if (pDataArray->Estimators[i] == "anderberg") {
+                    matrixCalculators.push_back(new Anderberg());
+                }else if (pDataArray->Estimators[i] == "kulczynski") {
+                    matrixCalculators.push_back(new Kulczynski());
+                }else if (pDataArray->Estimators[i] == "kulczynskicody") {
+                    matrixCalculators.push_back(new KulczynskiCody());
+                }else if (pDataArray->Estimators[i] == "lennon") {
+                    matrixCalculators.push_back(new Lennon());
+                }else if (pDataArray->Estimators[i] == "morisitahorn") {
+                    matrixCalculators.push_back(new MorHorn());
+                }else if (pDataArray->Estimators[i] == "braycurtis") {
+                    matrixCalculators.push_back(new BrayCurtis());
+                }else if (pDataArray->Estimators[i] == "whittaker") {
+                    matrixCalculators.push_back(new Whittaker());
+                }else if (pDataArray->Estimators[i] == "odum") {
+                    matrixCalculators.push_back(new Odum());
+                }else if (pDataArray->Estimators[i] == "canberra") {
+                    matrixCalculators.push_back(new Canberra());
+                }else if (pDataArray->Estimators[i] == "structeuclidean") {
+                    matrixCalculators.push_back(new StructEuclidean());
+                }else if (pDataArray->Estimators[i] == "structchord") {
+                    matrixCalculators.push_back(new StructChord());
+                }else if (pDataArray->Estimators[i] == "hellinger") {
+                    matrixCalculators.push_back(new Hellinger());
+                }else if (pDataArray->Estimators[i] == "manhattan") {
+                    matrixCalculators.push_back(new Manhattan());
+                }else if (pDataArray->Estimators[i] == "structpearson") {
+                    matrixCalculators.push_back(new StructPearson());
+                }else if (pDataArray->Estimators[i] == "soergel") {
+                    matrixCalculators.push_back(new Soergel());
+                }else if (pDataArray->Estimators[i] == "spearman") {
+                    matrixCalculators.push_back(new Spearman());
+                }else if (pDataArray->Estimators[i] == "structkulczynski") {
+                    matrixCalculators.push_back(new StructKulczynski());
+                }else if (pDataArray->Estimators[i] == "speciesprofile") {
+                    matrixCalculators.push_back(new SpeciesProfile());
+                }else if (pDataArray->Estimators[i] == "hamming") {
+                    matrixCalculators.push_back(new Hamming());
+                }else if (pDataArray->Estimators[i] == "structchi2") {
+                    matrixCalculators.push_back(new StructChi2());
+                }else if (pDataArray->Estimators[i] == "gower") {
+                    matrixCalculators.push_back(new Gower());
+                }else if (pDataArray->Estimators[i] == "memchi2") {
+                    matrixCalculators.push_back(new MemChi2());
+                }else if (pDataArray->Estimators[i] == "memchord") {
+                    matrixCalculators.push_back(new MemChord());
+                }else if (pDataArray->Estimators[i] == "memeuclidean") {
+                    matrixCalculators.push_back(new MemEuclidean());
+                }else if (pDataArray->Estimators[i] == "mempearson") {
+                    matrixCalculators.push_back(new MemPearson());
+                }
+            }
+        }
+
+        pDataArray->calcDists.resize(matrixCalculators.size());
+
+        // Set when control_pressed is observed, so the loops can unwind and the
+        // calculators are still freed before returning.
+        bool interrupted = false;
+
+        vector<SharedRAbundVector*> subset;
+        for (int k = pDataArray->start; (k < pDataArray->end) && !interrupted; k++) { // pass cdd each set of groups to compare
+
+            // l < k, so a group is never compared with itself
+            for (int l = 0; (l < k) && !interrupted; l++) {
+
+                for(int i=0;i<matrixCalculators.size();i++) {
+
+                    // Rebuild the subset for every calculator. Reusing one vector across
+                    // calculators appended the remaining groups again for each calculator
+                    // that needs all groups, handing duplicates to later calculators.
+                    subset.clear();
+                    subset.push_back(pDataArray->thisLookup[k]); subset.push_back(pDataArray->thisLookup[l]);
+
+                    //if this calc needs all groups to calculate the pair load all groups
+                    if (matrixCalculators[i]->getNeedsAll()) {
+                        //load subset with rest of lookup for those calcs that need everyone to calc for a pair
+                        for (int w = 0; w < pDataArray->thisLookup.size(); w++) {
+                            if ((w != k) && (w != l)) { subset.push_back(pDataArray->thisLookup[w]); }
+                        }
+                    }
+
+                    vector<double> tempdata = matrixCalculators[i]->getValues(subset); //saves the calculator outputs
+
+                    if (pDataArray->m->control_pressed) { interrupted = true; break; }
+
+                    seqDist temp(l, k, tempdata[0]);
+                    pDataArray->calcDists[i].push_back(temp);
+                }
+            }
+        }
+
+        for(int i=0;i<matrixCalculators.size();i++){  delete matrixCalculators[i]; }
+
+        if (interrupted) { return 1; }
+        return 0;
+
+    }
+    catch(exception& e) {
+        pDataArray->m->errorOut(e, "MatrixOutputCommand", "MyDistSharedThreadFunction");
+        exit(1);
+    }
+}
+#endif
#endif
+++ /dev/null
-#ifndef METASTATS2
-#define METASTATS2
-
-/*
- * metastats.h
- * Mothur
- *
- * Created by westcott on 9/16/10.
- * Copyright 2010 Schloss Lab. All rights reserved.
- *
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <time.h>
-#include <math.h>
-#include "fisher2.h"
-
-void testp(double *permuted_ttests,int *B,double *permuted,double
- *Imatrix,int *nc,int *nr,int *g,double *Tinitial,double *ps);
-void permute_matrix(double *Imatrix,int *nc,int *nr,double
- *permuted,int *g,double *trial_ts,double *Tinitial,double
- *counter);
-void permute_array(int *array, int n);
-void calc_twosample_ts(double *Pmatrix,int *g,int *nc,int *nr,double
- *Ts,double *Tinitial,double *counter1);
-void meanvar(double *pmatrix,int *g,int *nr,int *nc,double *storage);
-void start(double *Imatrix,int *g,int *nr,int *nc,double *testing,
- double storage[][9]);
-
-int metastat_main (char*, int, int, double, int, double**, int);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
-
-
-
+++ /dev/null
-
-#include "metastats.h"
-
-//The following code has been modified using the original Metastats program from White, J.R., Nagarajan, N. & Pop, M. Statistical methods for detecting differentially abundant features in clinical metagenomic samples. PLoS Comput Biol 5, e1000352 (2009).
-
-int metastat_main (char* outputFileName, int numRows, int numCols, double threshold, int numPermutations, double** data, int secondGroupingStart){
-
- int size,c=0,i=0,j=0,counter=0, bflag=0;
- int B=numPermutations;
- int row = numRows;
- int col = numCols;
- int g = secondGroupingStart;
- double thresh=threshold;
- double min=0;
-
- char output[1024];
- strcpy(output, outputFileName);
- FILE *out;
-
- if (g>=col || g<=0){
- printf("Check your g value\n");
- }
-
- // Initialize the matrices
- size = row*col;
- double matrix[row][col];
- double pmatrix[size],permuted[size];
- double storage[row][9];
-
- for (i=0;i<row;i++){
- for (j =0;j<9;j++){
- storage[i][j]=0;
- }
- }
-
- for(i=0; i<row; i++){
- for(j=0; j<col;j++){
- matrix[i][j]=data[i][j];
- pmatrix[c]=0; // initializing to zero
- permuted[c]=0;
- c++;
- }
- }
-
- // Produces the sum of each column
- double total[col],total1=0,total2=0;
- double ratio[col];
-
- for(i=0;i<col;i++){
- total[i]=0;
- ratio[i]=0; }
-
- for(i=0; i<col; i++){
- for(j=0;j<row;j++){
- total[i]=total[i]+matrix[j][i];
- }
- }
-
- for(i=0;i<g-1;i++){
- total1=total1+total[i];}
-
- for(i=g-1;i<col;i++){
- total2=total2+total[i];}
-
-
- // Creates the ratios by first finding the minimum of totals
- min = total[0];
- if (col==2){
- if (total[0]<total[1]){
- min = total[1];}
- }
- if (col >2){
- for(i=1;i<col;i++){
- if (min > total[i]){
- min = total[i];}
- }
- }
- if (min<=0){
- printf("Error, the sum of one of the columns <= 0.");
- return 0;
- }
-
-
- // Ratio time...
- for(i=0;i<col;i++){
- ratio[i]=total[i]/min;
- }
-
- //Change matrix into an array as received by R for compatibility.
-
- c=0;
- for(i=0;i<col;i++){
- for(j=0;j<row;j++){
- pmatrix[c]=matrix[j][i];
- c++;
- }
- }
-
- if(row == 1){
- for (i =0; i<col;i++){
- pmatrix[i]=pmatrix[i]/ratio[i];
- }
- }
- else {
- counter = 0;
- j=-1;
- for (i=0; i<size; i++) {
- if (counter % row == 0) {
- j++;
- }
- pmatrix[i]=pmatrix[i]/ratio[j];
- counter++;
- }
- }
- // pass everything to the rest of the code using pointers. then
- // write to output file. below pointers for most of the values are
- // created to send everything by reference.
-
- int ptt_size, *permutes,*nc,*nr,*gvalue;
-
- nc = &col;
- nr = &row;
- gvalue = &g;
-
- permutes = &B;
- ptt_size = B*row;
-
- //changing ptt_size to row
- double permuted_ttests[row], pvalues[row], tinitial[row];
-
- for(i=0;i<row;i++){
- permuted_ttests[i]=0;}
-
- for(i=0;i<row;i++){
- pvalues[i]=0;
- tinitial[i]=0; }
-
- // Find the initial values for the matrix.
- start(pmatrix,gvalue,nr,nc,tinitial,storage);
-
- // Start the calculations.
-
- if ( (col==2) || ((g-1)<8) || ((col-g+1) < 8) ){
-
- double fish[row], fish2[row];
- for(i=0;i<row;i++){
- fish[i]=0;
- fish2[i]=0;}
-
- for(i=0;i<row;i++){
-
- for(j=0;j<g-1;j++){
- fish[i]=fish[i]+matrix[i][j];
- }
-
- for(j=g-1;j<col;j++){
- fish2[i]=fish2[i]+matrix[i][j];
- }
-
- double f11,f12,f21,f22;
-
- f11=fish[i];
- f12=fish2[i];
-
- f21=total1-f11;
- f22=total2-f12;
-
- double data[] = {f11, f12, f21, f22};
-
- // CONTINGENGCY TABLE:
- // f11 f12
- // f21 f22
-
- int *nr, *nc, *ldtabl, *work;
- int nrow=2, ncol=2, ldtable=2, workspace=100000;
- double *expect, *prc, *emin,*prt,*pre;
- double e=0, prc1=0, emin1=0, prt1=0, pre1=0;
-
- nr = &nrow;
- nc = &ncol;
- ldtabl=&ldtable;
- work = &workspace;
-
- expect = &e;
- prc = &prc1;
- emin=&emin1;
- prt=&prt1;
- pre=&pre1;
-
- //MothurFisher fishtere;
- //double mothurFex = fishtere.fexact(f11, f12, f21, f22);
-
- fexact(nr,nc,data, ldtabl,expect,prc,emin,prt,pre,work);
-
- if (*pre>.999999999){
- *pre=1;
- }
-
- //printf("feaxt = %f\t%f\t%f\t%f\t%f\t%f\n", *expect, *pre, f11, f12, f21, f22);
- storage[i][8] = *pre;
- pvalues[i]=*pre;
- }
- }
- else{
-
- testp(permuted_ttests, permutes, permuted,pmatrix, nc, nr, gvalue,tinitial,pvalues);
-
- // Checks to make sure the matrix isn't sparse.
- double sparse[row], sparse2[row];
- for(i=0;i<row;i++){
- sparse[i]=0;
- sparse2[i]=0;}
-
- c=0;
- for(i=0;i<row;i++){
-
- for(j=0;j<g-1;j++){
- sparse[i]=sparse[i]+matrix[i][j];
- }
-
- if(sparse[i] < (double)(g-1)){
- c++;
- }
- for(j=g-1;j<col;j++){ // ?<= for col
- sparse2[i]=sparse2[i]+matrix[i][j];
- }
-
- if( (sparse2[i] <(double)(col-g+1))) {
- c++;
- }
-
- if (c==2){
- c=0;
-
- double f11,f12,f21,f22;
-
- f11=sparse[i];
- sparse[i]=0;
-
- f12=sparse2[i];
- sparse2[i]=0;
-
- f21=total1-f11;
- f22=total2-f12;
-
- double data[] = {f11, f12, f21, f22};
-
- int *nr, *nc, *ldtabl, *work;
- int nrow=2, ncol=2, ldtable=2, workspace=10000000; // I added two zeros for larger data sets
- double *expect, *prc, *emin,*prt,*pre;
- double e=0, prc1=0, emin1=0, prt1=0, pre1=0;
-
- nr = &nrow;
- nc = &ncol;
- ldtabl=&ldtable;
- work = &workspace;
-
- expect = &e;
- prc = &prc1;
- emin=&emin1;
- prt=&prt1;
- pre=&pre1;
-
- fexact(nr,nc,data, ldtabl,expect,prc,emin,prt,pre,work);
-
- if (*pre>.999999999){
- *pre=1;
- }
- storage[i][8] = *pre;
- pvalues[i]=*pre;
- }
- }
- // End of else statement
- bflag = 1;
- }
-
- // Calculates the mean of counts (not normalized)
- double temp[row][2];
-
- for(j=0;j<row;j++){
- for(i=0;i<2;i++){
- temp[j][i]=0;
- }
- }
-
- for (j=0;j<row;j++){
- for (i=1; i<=(g-1); i++){
- temp[j][0]=temp[j][0]+matrix[j][i-1];
- }
- temp[j][0]= (double) temp[j][0]/(g-1);
- for(i=g;i<=col;i++){
- temp[j][1]=temp[j][1]+matrix[j][i-1];
- }
- temp[j][1]= (double) temp[j][1]/(col-g+1);
- }
-
- for(i=0;i<row;i++){
- storage[i][3]=temp[i][0];
- storage[i][7]=temp[i][1];
- storage[i][8]=pvalues[i];
- }
-
- // BACKUP checks
-
- for (i=0;i<row;i++){
- if(pvalues[i]<thresh){
- printf("Feature %d is significant, p = %.10lf \n",i+1,pvalues[i]);
- }
- }
-
- // And now we write the files to a text file.
- struct tm *local;
- time_t t;
- t = time(NULL);
- local = localtime(&t);
-
- out = fopen(output,"w");
-
- fprintf(out,"Local time and date of test: %s\n", asctime(local));
- fprintf(out,"# rows = %d, # col = %d, g = %d\n\n",row,col,g);
- if (bflag == 1){
- fprintf(out,"%d permutations\n\n",B);
- }
-
- //output column headings - not really sure... documentation labels 9 columns, there are 10 in the output file
- //storage 0 = meanGroup1 - line 529, 1 = varGroup1 - line 532, 2 = err rate1 - line 534, 3 = mean of counts group1?? - line 291, 4 = meanGroup2 - line 536, 5 = varGroup2 - line 539, 6 = err rate2 - line 541, 7 = mean of counts group2?? - line 292, 8 = pvalues - line 293
- fprintf(out, "OTU\tmean(group1)\tvariance(group1)\tstderr(group1)\tmean_of_counts(group1)\tmean(group2)\tvariance(group2)\tstderr(group2)\tmean_of_counts(group1)\tp-value\n");
-
- for(i=0; i<row; i++){
- fprintf(out,"%d",(i+1));
-
- for(j=0; j<9;j++){
- fprintf(out,"\t%.12lf",storage[i][j]);
- }
- fprintf(out,"\n");
- }
-
- fprintf(out,"\n \n");
-
- // fclose(jobj);
- fclose(out);
-
- return 0;
-}
-
// testp: derives a permutation-based p-value for each of the *nr rows.
//   B        - pointer to the number of permutation rounds to run
//   permuted - scratch buffer handed to permute_matrix() for the shuffled copy
//   Imatrix, nc, nr, g, Tinitial - forwarded unchanged to permute_matrix()
//   ps       - output; ps[j] = (counter[j] + 1) / (*B + 1)
// permuted_ttests is referenced only by the commented-out code inside the loop.
-void testp(double *permuted_ttests,int *B,double *permuted,
- double *Imatrix,int *nc,int *nr,int *g,double *Tinitial,double
- *ps) {
-
- double Tvalues[*nr]; // receives the per-row statistics of the most recent round
- int a, b, n, j;
-
- a = *B;
- b = *nr;
- n = a*b; // not read again in this function
-
- double counter[b]; // per-row tally handed to permute_matrix() on every round
-
- for(j=0;j<b;j++){
- counter[j]=0;
- }
-
- for (j=1; j<=*B; j++){
- permute_matrix(Imatrix,nc,nr,permuted,g,Tvalues,Tinitial,counter);
- // for(i=0;i<*nr;i++){
- // permuted_ttests[k]=fabs(Tvalues[i]);
- // k++;
- } // closes the permutation loop; the inner loop above is commented out
-
-
- for(j=0;j<*nr;j++){
- ps[j]=((counter[j]+1)/(double)(a+1)); // +1 in numerator and denominator, so ps[j] is never 0
- }
-}
-
// permute_matrix: writes a column-shuffled copy of Imatrix into `permuted` and
// then calls calc_twosample_ts() on that copy.
// Imatrix is read as *nc consecutive runs of *nr values, one run per column:
// column f (1-based) starts at offset (f-1) * (*nr).
// g, trial_ts, Tinitial and counter1 are passed through to calc_twosample_ts().
-void permute_matrix(double *Imatrix,int *nc,int *nr,double *permuted,
- int *g,double *trial_ts,double *Tinitial,double *counter1){
-
- int i=0,j=0,n=0,a=0,b=0,f=0,c=0,k=0;
-
- a = *nr; // number of rows
- b = *nc;
- n = a*b; // not read again in this function
-
- int y[b]; // 1-based column numbers, shuffled below
-
- for (i=1; i<=*nc; i++){
- y[i-1] = i;
- }
-
- permute_array(y, b);
-
- for (i=0; i<*nc; i++){
- f = y[i]; //column number
- c=1;
- c*=(f-1);
- c*=a;
- if (f == 1){
- c = 0; // already 0 at this point because (f-1) is 0
- } // starting value position in the Imatrix
- for(j=1; j<=*nr; j++){
- permuted[k] = Imatrix[c]; // k advances across the whole output, c within the source column
- c++;
- k++;
- }
- }
-
- calc_twosample_ts(permuted,g,nc,nr,trial_ts,Tinitial,counter1);
-}
-
// permute_array: shuffles the n ints in `array` in place using rand().
// For each position i it picks an offset in [0, n-i) and swaps array[i] with
// array[i + offset].
// The generator is seeded with srand(time(NULL)) on the first call only; the
// function-local static `seeded` records that this has happened.
-void permute_array(int *array, int n) {
- static int seeded = 0;
- int i;
-
- if (! seeded) {
- seeded = 1;
- srand(time(NULL));
- }
-
- for (i = 0; i < n; i++) {
- int selection = rand() % (n - i); // offset into the not-yet-fixed tail
- int tmp = array[i + selection];
- array[i + selection] = array[i];
- array[i] = tmp;
- }
-}
-
// calc_twosample_ts: computes a two-group statistic for every row of Pmatrix and
// counts how often it exceeds the reference value.
// Means and variances come from meanvar(); for row i
//   Ts[i] = | mean1 - mean2 | / sqrt( var1/(g-1) + var2/(nc-g+1) )
// counter[i] is incremented when |Ts[i]| > |Tinitial[i]| + 1e-13.
-void calc_twosample_ts(double *Pmatrix,int *g,int *nc,int *nr,
- double *Ts,double *Tinitial,double *counter) {
- int i,a;
- a = *nr;
- a*=4; // meanvar() fills four runs of *nr values
-
- double C1[*nr][3], C2[*nr][3], storage[a],tool[a];
- double nrows,ncols,gvalue, xbardiff=0, denom=0;
-
- nrows = (double) *nr; // assigned but not read below
- ncols = (double) *nc;
- gvalue= (double) *g;
-
- meanvar(Pmatrix,g,nr,nc,storage);
- for(i=0;i<=a-1;i++){
- tool[i]=storage[i]; // plain copy of the meanvar() output
- }
- for (i=0; i<*nr;i++){
- C1[i][0]=tool[i]; // mean group 1
- C1[i][1]=tool[i+*nr+*nr]; // var group 1
- C1[i][2]=C1[i][1]/(gvalue-1);
-
- C2[i][0]=tool[i+*nr]; // mean group 2
- C2[i][1]=tool[i+*nr+*nr+*nr]; // var group 2
- C2[i][2]=C2[i][1]/(ncols-gvalue+1);
- }
-
- for (i=0; i<*nr; i++){
- xbardiff = C1[i][0]-C2[i][0];
- denom = sqrt(C1[i][2]+C2[i][2]);
- Ts[i]=fabs(xbardiff/denom);
- if (fabs(Ts[i])>(fabs(Tinitial[i])+.0000000000001)){ //13th place
- counter[i]++;
- }
- }
-}
-
// meanvar: per-row mean and variance for two groups of columns.
// pmatrix is indexed as a flat array in which entry i belongs to row (i % *nr);
// the first (*g-1) * (*nr) entries are treated as group 1, the rest as group 2.
// With a = *g-1 and b = *nc-a, store receives four consecutive runs of *nr values:
//   store[0    .. nr)   group 1 sum / a
//   store[nr   .. 2nr)  group 2 sum / b
//   store[2nr  .. 3nr)  group 1 squared deviations / (a-1)
//   store[3nr  .. 4nr)  group 2 squared deviations / (b-1)
-void meanvar(double *pmatrix,int *g,int *nr,int *nc,double *store){
- double temp[*nr], temp2[*nr],var[*nr],var2[*nr],a,b;
-
- int i,m,k,l,n;
-
- a = (double) *g-1; // number of columns in group 1
- b = (double) (*nc-a); // number of columns in group 2
-
- for (i = 0; i<*nr; i++){
- temp[i]=0;
- temp2[i]=0;
- var[i]=0;
- var2[i]=0;
- }
-
- k = *nr; // number of rows
- l = *nc;
- n = k*l; // total number of entries in pmatrix
-
- m=0;
- m=*g-1;
- k=*nr;
- m*=k; // m = (g-1) * nr now: number of entries that belong to group 1
- for (i=0;i<m;i++){
- temp[i%k]=temp[i%k]+pmatrix[i]; // row sums over group 1
- }
- for (i=0;i<n;i++){
- temp2[i%k]=temp2[i%k]+pmatrix[i]; // row sums over all columns
- }
- for (i=0;i<*nr;i++){
- temp2[i]=temp2[i]-temp[i]; // all columns minus group 1 leaves group 2
- }
- for (i=0;i<=*nr-1;i++){
- store[i]=temp[i]/a;
- store[i+*nr]=temp2[i]/b;
- }
-
- // That completes the mean calculations.
-
- for (i=0;i<m;i++){
- var[i%k]=var[i%k]+pow((pmatrix[i]-store[i%k]),2);
- }
- for (i=m;i<n;i++){
- var2[i%k]=var2[i%k]+pow((pmatrix[i]-store[(i%k)+*nr]),2);
- }
-
- for (i=0;i<=*nr-1;i++){
- store[i+2*k]=var[i]/(a-1);
- store[i+3*k]=var2[i]/(b-1);
- }
- // That completes var calculations.
-}
-
// start: computes the reference statistic for every row of Imatrix and records
// the per-group summary values.
// For row i, using the output of meanvar():
//   storage[i][0], [1], [2] = group 1 mean, variance, sqrt(variance/(g-1))
//   storage[i][4], [5], [6] = group 2 mean, variance, sqrt(variance/(nc-g+1))
//   initial[i] = | mean1 - mean2 | / sqrt( var1/(g-1) + var2/(nc-g+1) )
// Columns 3, 7 and 8 of storage are not written here.
-void start(double *Imatrix,int *g,int *nr,int *nc,double *initial,
- double storage[][9]){
- int i, a = *nr;
- a*=4; // meanvar() fills four runs of *nr values
-
- double store[a], tool[a], C1[*nr][3], C2[*nr][3];
- double nrows,ncols,gvalue, xbardiff=0, denom=0;
-
- nrows = (double) *nr; // assigned but not read below
- ncols = (double) *nc;
- gvalue= (double) *g;
-
- meanvar(Imatrix,g,nr,nc,store);
-
- for(i=0;i<=a-1;i++){
- tool[i]=store[i]; // plain copy of the meanvar() output
- }
- for (i=0; i<*nr;i++){
- C1[i][0]=tool[i]; //mean group 1
- storage[i][0]=C1[i][0];
- C1[i][1]=tool[i+*nr+*nr]; // var group 1
- storage[i][1]=C1[i][1];
- C1[i][2]=C1[i][1]/(gvalue-1);
- storage[i][2]=sqrt(C1[i][2]);
-
- C2[i][0]=tool[i+*nr]; // mean group 2
- storage[i][4]=C2[i][0];
- C2[i][1]=tool[i+*nr+*nr+*nr]; // var group 2
- storage[i][5]=C2[i][1];
- C2[i][2]=C2[i][1]/(ncols-gvalue+1);
- storage[i][6]=sqrt(C2[i][2]);
- }
- for (i=0; i<*nr; i++){
- xbardiff = C1[i][0]-C2[i][0];
- denom = sqrt(C1[i][2]+C2[i][2]);
- initial[i]=fabs(xbardiff/denom);
- }
-}
-
-
-
-
*/
#include "metastatscommand.h"
-#include "metastats.h"
#include "sharedutilities.h"
-#include "mothurmetastats.h"
+
//**********************************************************************************************************************
vector<string> MetaStatsCommand::setParameters(){
//only 1 combo
if (numGroups == 2) { processors = 1; }
else if (numGroups < 2) { m->mothurOut("Not enough sets, I need at least 2 valid sets. Unable to complete command."); m->mothurOutEndLine(); m->control_pressed = true; }
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- if(processors != 1){
- int numPairs = namesOfGroupCombos.size();
- int numPairsPerProcessor = numPairs / processors;
+
+ if(processors != 1){
+ int numPairs = namesOfGroupCombos.size();
+ int numPairsPerProcessor = numPairs / processors;
- for (int i = 0; i < processors; i++) {
- int startPos = i * numPairsPerProcessor;
- if(i == processors - 1){
- numPairsPerProcessor = numPairs - i * numPairsPerProcessor;
- }
- lines.push_back(linePair(startPos, numPairsPerProcessor));
- }
- }
- #endif
+ for (int i = 0; i < processors; i++) {
+ int startPos = i * numPairsPerProcessor;
+ if(i == processors - 1){
+ numPairsPerProcessor = numPairs - i * numPairsPerProcessor;
+ }
+ lines.push_back(linePair(startPos, numPairsPerProcessor));
+ }
+ }
//as long as you are not at the end of the file or done wih the lines you want
while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
int MetaStatsCommand::process(vector<SharedRAbundVector*>& thisLookUp){
try {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+
if(processors == 1){
driver(0, namesOfGroupCombos.size(), thisLookUp);
}else{
int process = 1;
vector<int> processIDS;
-
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
int pid = fork();
int temp = processIDS[i];
wait(&temp);
}
- }
- #else
- driver(0, namesOfGroupCombos.size(), thisLookUp);
- #endif
+ #else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the summarySharedData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to pass results vectors.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<metastatsData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=1; i<processors; i++ ){
+
+ //make copy of lookup so we don't get access violations
+ vector<SharedRAbundVector*> newLookup;
+ vector<string> designMapGroups;
+ for (int k = 0; k < thisLookUp.size(); k++) {
+ SharedRAbundVector* temp = new SharedRAbundVector();
+ temp->setLabel(thisLookUp[k]->getLabel());
+ temp->setGroup(thisLookUp[k]->getGroup());
+ newLookup.push_back(temp);
+ designMapGroups.push_back(designMap->getGroup(thisLookUp[k]->getGroup()));
+ }
+
+ //for each bin
+ for (int k = 0; k < thisLookUp[0]->getNumBins(); k++) {
+ if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
+ for (int j = 0; j < thisLookUp.size(); j++) { newLookup[j]->push_back(thisLookUp[j]->getAbundance(k), thisLookUp[j]->getGroup()); }
+ }
+
+ // Allocate memory for thread data.
+ metastatsData* tempSum = new metastatsData(sharedfile, outputDir, m, lines[i].start, lines[i].num, namesOfGroupCombos, newLookup, designMapGroups, iters, threshold);
+ pDataArray.push_back(tempSum);
+ processIDS.push_back(i);
+
+ hThreadArray[i-1] = CreateThread(NULL, 0, MyMetastatsThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
+ }
+
+ //do my part
+ driver(lines[0].start, lines[0].num, thisLookUp);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ for (int j = 0; j < pDataArray[i]->thisLookUp.size(); j++) { delete pDataArray[i]->thisLookUp[j]; }
+ for (int j = 0; j < pDataArray[i]->outputNames.size(); j++) {
+ outputNames.push_back(pDataArray[i]->outputNames[j]);
+ outputTypes["metastats"].push_back(pDataArray[i]->outputNames[j]);
+ }
+
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+ #endif
+ }
+
return 0;
}
//get set names
string setA = namesOfGroupCombos[c][0];
string setB = namesOfGroupCombos[c][1];
- //cout << setA << '\t' << setB << endl;
+
//get filename
string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + thisLookUp[0]->getLabel() + "." + setA + "-" + setB + ".metastats";
outputNames.push_back(outputFileName); outputTypes["metastats"].push_back(outputFileName);
setACount++;
}
}
-
- //for (int i = 0; i < subset.size(); i++) { cout << designMap->getGroup(subset[i]->getGroup()) << endl; }
- //cout << setACount << endl;
-
+
if ((setACount == 0) || (setBCount == 0)) {
m->mothurOut("Missing shared info for " + setA + " or " + setB + ". Skipping comparison."); m->mothurOutEndLine();
outputNames.pop_back();
}else {
+
+ ofstream outTemp;
+ string tempOut = outputDir + "data." + setA + "-" + setB + ".matrix";
+ m->openOutputFile(tempOut, outTemp);
+ for (int i = 0; i < subset.size(); i++) { outTemp << '\t' << subset[i]->getGroup(); }
+ outTemp << endl;
+
+
//fill data
for (int j = 0; j < thisLookUp[0]->getNumBins(); j++) {
//data[j] = new double[subset.size()];
data2[j].resize(subset.size(), 0.0);
+ outTemp << "OTU" << (j+1);
for (int i = 0; i < subset.size(); i++) {
- //data[j][i] = (subset[i]->getAbundance(j));
data2[j][i] = (subset[i]->getAbundance(j));
+ outTemp << '\t' << subset[i]->getAbundance(j);
}
+ outTemp << endl;
}
-
+ outTemp.close();
m->mothurOut("Comparing " + setA + " and " + setB + "..."); m->mothurOutEndLine();
//metastat_main(output, thisLookUp[0]->getNumBins(), subset.size(), threshold, iters, data, setACount);
MothurMetastats mothurMeta(threshold, iters);
mothurMeta.runMetastats(outputFileName , data2, setACount);
m->mothurOutEndLine();
-
m->mothurOutEndLine();
}
#include "command.hpp"
#include "inputdata.h"
#include "sharedrabundvector.h"
+#include "mothurmetastats.h"
class MetaStatsCommand : public Command {
int driver(int, int, vector<SharedRAbundVector*>&);
};
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+// One instance describes the slice of group combinations a single worker
+// thread processes, plus the inputs it needs. The struct has no destructor,
+// so whoever creates it remains responsible for the pointers in thisLookUp.
+struct metastatsData {
+ vector<SharedRAbundVector*> thisLookUp; // abundance vectors read by the worker; not freed by this struct
+ vector< vector<string> > namesOfGroupCombos; // each entry holds the two set names to compare ([0] and [1])
+ vector<string> designMapGroups; // designMapGroups[i] is the set name compared against for thisLookUp[i]
+ vector<string> outputNames; // filled by the worker with the output file names it produced
+ int start; // index of the first combo handled by this worker
+ int num, iters; // num: how many combos to handle; iters: passed to MothurMetastats
+ float threshold; // passed to MothurMetastats
+ MothurOut* m; // used by the worker for logging and file-name helpers
+ string sharedfile; // its root name becomes part of every output file name
+ string outputDir; // prefix of every output file name
+
+ // Default construction zeroes the scalars and nulls the pointer so that an
+ // instance never carries indeterminate values.
+ metastatsData() : start(0), num(0), iters(0), threshold(0.0f), m(NULL) {}
+
+ // sf: shared file name, oDir: output directory, mout: output/log helper,
+ // st: first combo index, en: number of combos, ns: all group combos,
+ // lu: abundance vectors, dg: set name per vector, i: iterations, thr: threshold.
+ // Members are listed in declaration order.
+ metastatsData(string sf, string oDir, MothurOut* mout, int st, int en, vector< vector<string> > ns, vector<SharedRAbundVector*> lu, vector<string> dg, int i, float thr)
+ : thisLookUp(lu), namesOfGroupCombos(ns), designMapGroups(dg),
+ start(st), num(en), iters(i), threshold(thr), m(mout),
+ sharedfile(sf), outputDir(oDir) {}
+};
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
// Thread entry point for the non-Unix build (it sits in the #else branch of the
// platform check above). lpParam must point to a metastatsData instance.
// For every combo index in [start, start+num) it:
//   1. builds the "<outputDir><root of sharedfile><label>.<setA>-<setB>.metastats" name,
//   2. collects the lookup vectors whose design group is setA (front) or setB (back),
//   3. copies their abundances into a bins x groups matrix, and
//   4. runs MothurMetastats::runMetastats on that matrix.
// Output names are appended to pDataArray->outputNames; a name is removed again
// when one of the two sets has no vectors. Returns 0 on normal completion.
// On a caught std::exception it reports through errorOut and calls exit(1).
+static DWORD WINAPI MyMetastatsThreadFunction(LPVOID lpParam){
+ metastatsData* pDataArray;
+ pDataArray = (metastatsData*)lpParam;
+
+ try {
+
+ //for each combo
+ for (int c = pDataArray->start; c < (pDataArray->start+pDataArray->num); c++) {
+
+ //get set names
+ string setA = pDataArray->namesOfGroupCombos[c][0];
+ string setB = pDataArray->namesOfGroupCombos[c][1];
+
+ //get filename
+ string outputFileName = pDataArray->outputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(pDataArray->sharedfile)) + pDataArray->thisLookUp[0]->getLabel() + "." + setA + "-" + setB + ".metastats";
+ pDataArray->outputNames.push_back(outputFileName);
+
+ vector< vector<double> > data2; data2.resize(pDataArray->thisLookUp[0]->getNumBins()); // one row per bin; reads thisLookUp[0] without an emptiness check
+
+ vector<SharedRAbundVector*> subset;
+ int setACount = 0;
+ int setBCount = 0;
+ for (int i = 0; i < pDataArray->thisLookUp.size(); i++) {
+ //is this group for a set we want to compare??
+ //sorting the sets by putting setB at the back and setA in the front
+ if (pDataArray->designMapGroups[i] == setB) {
+ subset.push_back(pDataArray->thisLookUp[i]);
+ setBCount++;
+ }else if (pDataArray->designMapGroups[i] == setA) {
+ subset.insert(subset.begin()+setACount, pDataArray->thisLookUp[i]); // keeps setA members contiguous at the front, in encounter order
+ setACount++;
+ }
+ }
+
+ if ((setACount == 0) || (setBCount == 0)) {
+ pDataArray->m->mothurOut("Missing shared info for " + setA + " or " + setB + ". Skipping comparison."); pDataArray->m->mothurOutEndLine();
+ pDataArray->outputNames.pop_back(); // drop the name pushed above; no file is written for this combo
+ }else {
+ //fill data
+ for (int j = 0; j < pDataArray->thisLookUp[0]->getNumBins(); j++) {
+ data2[j].resize(subset.size(), 0.0);
+ for (int i = 0; i < subset.size(); i++) {
+ data2[j][i] = (subset[i]->getAbundance(j));
+ }
+ }
+
+ pDataArray->m->mothurOut("Comparing " + setA + " and " + setB + "..."); pDataArray->m->mothurOutEndLine();
+
+ pDataArray->m->mothurOutEndLine();
+ MothurMetastats mothurMeta(pDataArray->threshold, pDataArray->iters);
+ mothurMeta.runMetastats(outputFileName, data2, setACount); // setACount is the column index where the second set begins
+ pDataArray->m->mothurOutEndLine();
+ pDataArray->m->mothurOutEndLine();
+ }
+ }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ pDataArray->m->errorOut(e, "MetaStatsCommand", "MyMetastatsThreadFunction");
+ exit(1);
+ }
+}
+#endif
+
+
+
#endif
m->setFileName(logFileName);
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
system("clear");
#else
system("CLS");
//add / to name if needed
string lastChar = temp.substr(temp.length()-1);
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if (lastChar != "/") { temp += "/"; }
#else
if (lastChar != "\\") { temp += "\\"; }
if (outputHeader) {
//version
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#if defined (__APPLE__) || (__MACH__)
m->mothurOutJustToLog("Mac version");
m->mothurOutEndLine(); m->mothurOutEndLine();
#include <cmath>
#include <math.h>
#include <algorithm>
+#include <numeric>
//misc
#include <cerrno>
#endif
/***********************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/resource.h>
// Two integer identifiers (seq1, seq2) paired with a distance value.
// presumably seq1/seq2 are indices of the two sequences being compared - TODO confirm.
// The change shown here widens `dist` and the matching constructor parameter
// from float to double.
struct seqDist {
int seq1;
int seq2;
- float dist;
+ double dist;
seqDist() {} // leaves seq1, seq2 and dist uninitialized
- seqDist(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {}
+ seqDist(int s1, int s2, double d) : seq1(s1), seq2(s2), dist(d) {}
~seqDist() {}
};
/************************************************************/
/***********************************************************/
MothurMetastats::~MothurMetastats() {}
/***********************************************************/
-//main metastats function
-int MothurMetastats::runMetastats(string outputFileName, vector< vector<double> >& data, int secondGroupingStart) {
- try {
- int bflag = 0;
- row = data.size(); //numBins
+ //main metastats function
+int MothurMetastats::runMetastats(string outputFileName, vector< vector<double> >& data, int secGroupingStart) {
+ try {
+ row = data.size(); //numBins
column = data[0].size(); //numGroups in subset
- int size = row*column;
-
- //consistent with original, but this should never be true
- if ((secondGroupingStart >= column) || (secondGroupingStart <= 0)) { m->mothurOut("[ERROR]: Check your g value."); m->mothurOutEndLine(); return 0; }
-
- //Initialize the matrices
- vector<double> pmatrix; pmatrix.resize(size, 0.0);
- vector<double> permuted; permuted.resize(size, 0.0);
- vector< vector<double> > storage; storage.resize(row);
- for (int i = 0; i < storage.size(); i++) { storage[i].resize(9, 0.0); }
-
- //Produces the sum of each column
- vector<double> total; total.resize(column, 0.0);
- vector<double> ratio; ratio.resize(column, 0.0);
- double total1 = 0.0; double total2 = 0.0;
-
- //total[i] = total abundance for group[i]
+ secondGroupingStart = secGroupingStart; //g
+
+ vector< vector<double> > Pmatrix; Pmatrix.resize(row);
+ for (int i = 0; i < row; i++) { Pmatrix[i].resize(column, 0.0); } // the relative proportion matrix
+ vector< vector<double> > C1; C1.resize(row);
+ for (int i = 0; i < row; i++) { C1[i].resize(3, 0.0); } // statistic profiles for class1 and class 2
+ vector< vector<double> > C2; C2.resize(row); // mean[1], variance[2], standard error[3]
+ for (int i = 0; i < row; i++) { C2[i].resize(3, 0.0); }
+ vector<double> T_statistics; T_statistics.resize(row, 1); // a place to store the true t-statistics
+ vector<double> pvalues; pvalues.resize(row, 1); // place to store pvalues
+ vector<double> qvalues; qvalues.resize(row, 1); // stores qvalues
+
+ //*************************************
+ // convert to proportions
+ // generate Pmatrix
+ //*************************************
+ vector<double> totals; totals.resize(column, 0); // sum of columns
+ //total[i] = total abundance for group[i]
for (int i = 0; i < column; i++) {
for (int j = 0; j < row; j++) {
- total[i] += data[j][i];
+ totals[i] += data[j][i];
}
- }
-
- //total for first grouping
- for (int i = 0; i < secondGroupingStart; i++) { total1 += total[i]; }
-
- //total for second grouping
- for (int i = secondGroupingStart; i < column; i++) { total2 += total[i]; }
-
- //Creates the ratios by first finding the minimum of totals
- double min = total[0];
- for (int i = 0; i < total.size(); i++) {
- if (total[i] < min) { min = total[i]; }
- }
-
- //sanity check
- if (min <= 0.0) { m->mothurOut("[ERROR]: the sum of one of the columns <= 0."); m->mothurOutEndLine(); return 0; }
-
- //Ratio time...
- for(int i = 0; i < ratio.size(); i++){ ratio[i] = total[i] / min; }
-
- //Change matrix into an array as received by R for compatibility - kept to be consistent with original
- int count = 0;
- for(int i = 0; i < column; i++){
- for(int j = 0; j < row; j++){
- pmatrix[count]=data[j][i];
- count++;
+ }
+
+ for (int i = 0; i < column; i++) {
+ for (int j = 0; j < row; j++) {
+ Pmatrix[j][i] = data[j][i]/totals[i];
+
}
- }
-
- if(row == 1){
- for (int i =0; i < column; i++){ pmatrix[i] /= ratio[i]; }
- }else {
- count = 0; int j=-1;
-
- for (int i=0; i < size; i++) {
- if (count % row == 0) { j++; }
- pmatrix[i] /= ratio[j];
- count++;
- }
- }
-
- vector<double> permuted_ttests; permuted_ttests.resize(row, 0.0);
- vector<double> pvalues; pvalues.resize(row, 0.0);
- vector<double> tinitial; tinitial.resize(row, 0.0);
-
- if (m->control_pressed) { return 1; }
-
- //Find the initial values for the matrix.
- start(pmatrix, secondGroupingStart, tinitial, storage);
-
- if (m->control_pressed) { return 1; }
-
- // Start the calculations.
- if ( (column == 2) || (secondGroupingStart < 8) || ((column-secondGroupingStart) < 8) ){
-
- vector<double> fish; fish.resize(row, 0.0);
+ }
+
+ //#********************************************************************************
+ //# ************************** STATISTICAL TESTING ********************************
+ //#********************************************************************************
+
+ if (column == 2){ //# then we have a two sample comparison
+ //#************************************************************
+ //# generate p values fisher's exact test
+ //#************************************************************
+ double total1, total2;
+ //total for first grouping
+ for (int i = 0; i < secondGroupingStart; i++) { total1 += totals[i]; }
+
+ //total for second grouping
+ for (int i = secondGroupingStart; i < column; i++) { total2 += totals[i]; }
+
+ vector<double> fish; fish.resize(row, 0.0);
vector<double> fish2; fish2.resize(row, 0.0);
-
+
for(int i = 0; i < row; i++){
for(int j = 0; j < secondGroupingStart; j++) { fish[i] += data[i][j]; }
for(int j = secondGroupingStart; j < column; j++) { fish2[i] += data[i][j]; }
- //vector<double> tempData; tempData.resize(4, 0.0);
double f11, f12, f21, f22;
f11 = fish[i];
f12 = fish2[i];
f21 = total1 - fish[i];
f22 = total2 - fish2[i];
- double pre = 0.0;
-
MothurFisher fisher;
- pre = fisher.fexact(f11, f12, f21, f22);
-
+ double pre = fisher.fexact(f11, f12, f21, f22);
+ if (pre > 0.999999999) { pre = 1.0; }
+
if (m->control_pressed) { return 1; }
- if (pre > 0.999999999) { pre = 1.0; }
- storage[i][8] = pre;
pvalues[i] = pre;
}
-
- }else {
-
- testp(permuted_ttests, permuted, pmatrix, secondGroupingStart, tinitial, pvalues);
-
- if (m->control_pressed) { return 1; }
-
- // Checks to make sure the matrix isn't sparse.
- vector<double> sparse; sparse.resize(row, 0.0);
- vector<double> sparse2; sparse2.resize(row, 0.0);
-
- int c = 0;
-
+
+ //#*************************************
+ //# calculate q values from p values
+ //#*************************************
+ qvalues = calc_qvalues(pvalues);
+
+ }else { //we have multiple subjects per population
+
+ //#*************************************
+ //# generate statistics mean, var, stderr
+ //#*************************************
+ for(int i = 0; i < row; i++){ // for each taxa
+ //# find the mean of each group
+ double g1Total = 0.0; double g2Total = 0.0;
+ for (int j = 0; j < secondGroupingStart; j++) { g1Total += Pmatrix[i][j]; }
+ C1[i][0] = g1Total/(double)(secondGroupingStart);
+ for (int j = secondGroupingStart; j < column; j++) { g2Total += Pmatrix[i][j]; }
+ C2[i][0] = g2Total/(double)(column-secondGroupingStart);
+
+ //# find the variance of each group
+ double g1Var = 0.0; double g2Var = 0.0;
+ for (int j = 0; j < secondGroupingStart; j++) { g1Var += pow((Pmatrix[i][j]-C1[i][0]), 2); }
+ C1[i][1] = g1Var/(double)(secondGroupingStart-1);
+ for (int j = secondGroupingStart; j < column; j++) { g2Var += pow((Pmatrix[i][j]-C2[i][0]), 2); }
+ C2[i][1] = g2Var/(double)(column-secondGroupingStart-1);
+
+ //# find the std error of each group -std err^2 (will change to std err at end)
+ C1[i][2] = C1[i][1]/(double)(secondGroupingStart);
+ C2[i][2] = C2[i][1]/(double)(column-secondGroupingStart);
+ }
+
+ //#*************************************
+ //# two sample t-statistics
+ //#*************************************
+ for(int i = 0; i < row; i++){ // # for each taxa
+ double xbar_diff = C1[i][0] - C2[i][0];
+ double denom = sqrt(C1[i][2] + C2[i][2]);
+ T_statistics[i] = xbar_diff/denom; // calculate two sample t-statistic
+ }
+
+ /*for (int i = 0; i < row; i++) {
+ for (int j = 0; j < 3; j++) {
+ cout << "C1[" << i+1 << "," << j+1 << "]=" << C1[i][j] << ";" << endl;
+ cout << "C2[" << i+1 << "," << j+1 << "]=" << C2[i][j] << ";" << endl;
+ }
+ cout << "T_statistics[" << i+1 << "]=" << T_statistics[i] << ";" << endl;
+ }*/
+ //#*************************************
+ //# generate initial permuted p-values
+ //#*************************************
+ pvalues = permuted_pvalues(Pmatrix, T_statistics, data);
+
+ //#*************************************
+ //# generate p values for sparse data
+ //# using fisher's exact test
+ //#*************************************
+ double total1, total2;
+ //total for first grouping
+ for (int i = 0; i < secondGroupingStart; i++) { total1 += totals[i]; }
+
+ //total for second grouping
+ for (int i = secondGroupingStart; i < column; i++) { total2 += totals[i]; }
+
+ vector<double> fish; fish.resize(row, 0.0);
+ vector<double> fish2; fish2.resize(row, 0.0);
+
for(int i = 0; i < row; i++){
- for(int j = 0; j < secondGroupingStart; j++) { sparse[i] += data[i][j]; }
- if(sparse[i] < (double)secondGroupingStart) { c++; }
+ for(int j = 0; j < secondGroupingStart; j++) { fish[i] += data[i][j]; }
+ for(int j = secondGroupingStart; j < column; j++) { fish2[i] += data[i][j]; }
+
+ if ((fish[1] < secondGroupingStart) && (fish2[i] < (column-secondGroupingStart))) {
+ double f11, f12, f21, f22;
+ f11 = fish[i];
+ f12 = fish2[i];
+ f21 = total1 - fish[i];
+ f22 = total2 - fish2[i];
- // ?<= for col
- for(int j = secondGroupingStart; j < column; j++) { sparse2[i] += data[i][j]; }
- if( (sparse2[i] < (double)(column-secondGroupingStart))) { c++; }
+ MothurFisher fisher;
+ double pre = fisher.fexact(f11, f12, f21, f22);
+ if (pre > 0.999999999) { pre = 1.0; }
+
+ if (m->control_pressed) { return 1; }
- if (c == 2) {
- c=0;
- double f11,f12,f21,f22;
-
- f11=sparse[i]; sparse[i]=0;
- f12=sparse2[i]; sparse2[i]=0;
- f21 = total1 - f11;
- f22 = total2 - f12;
-
- double pre = 0.0;
-
- MothurFisher fisher;
- pre = fisher.fexact(f11, f12, f21, f22);
-
- if (m->control_pressed) { return 1; }
-
- if (pre > 0.999999999){
- pre = 1.0;
- }
-
- storage[i][8] = pre;
- pvalues[i] = pre;
- }
+ pvalues[i] = pre;
+ }
}
-
- bflag = 1;
- }
- // Calculates the mean of counts (not normalized)
- vector< vector<double> > temp; temp.resize(row);
- for (int i = 0; i < temp.size(); i++) { temp[i].resize(2, 0.0); }
-
- for (int j = 0; j < row; j++){
- if (m->control_pressed) { return 1; }
-
- for (int i = 0; i < secondGroupingStart; i++){ temp[j][0] += data[j][i]; }
- temp[j][0] /= (double)secondGroupingStart;
-
- for(int i = secondGroupingStart; i < column; i++){ temp[j][1] += data[j][i]; }
- temp[j][1] /= (double)(column-secondGroupingStart);
- }
-
- for(int i = 0; i < row; i++){
- if (m->control_pressed) { return 1; }
-
- storage[i][3]=temp[i][0];
- storage[i][7]=temp[i][1];
- storage[i][8]=pvalues[i];
- }
-
- vector<double> qvalues = calc_qvalues(pvalues);
-
- // BACKUP checks
- cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint);
- for (int i = 0; i < row; i++){
-
- if (m->control_pressed) { return 1; }
-
- if(qvalues[i] < threshold){
- m->mothurOut("Feature " + toString((i+1)) + " is significant, q = ");
- cout << qvalues[i];
- m->mothurOutJustToLog(toString(pvalues[i])); m->mothurOutEndLine();
- }
- }
-
- // And now we write the files to a text file.
+ //#*************************************
+ //# calculate q values from p values
+ //#*************************************
+ qvalues = calc_qvalues(pvalues);
+
+ //#*************************************
+ //# convert stderr^2 to std error
+ //#*************************************
+ for(int i = 0; i < row; i++){
+ C1[i][2] = sqrt(C1[i][2]);
+ C2[i][2] = sqrt(C2[i][2]);
+ }
+ }
+
+ // And now we write the files to a text file.
struct tm *local;
time_t t; t = time(NULL);
local = localtime(&t);
ofstream out;
m->openOutputFile(outputFileName, out);
out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
-
+
out << "Local time and date of test: " << asctime(local) << endl;
out << "# rows = " << row << ", # col = " << column << ", g = " << secondGroupingStart << endl << endl;
- if (bflag == 1){ out << numPermutations << " permutations" << endl << endl; }
+ out << numPermutations << " permutations" << endl << endl;
//output column headings - not really sure... documentation labels 9 columns, there are 10 in the output file
//storage 0 = meanGroup1 - line 529, 1 = varGroup1 - line 532, 2 = err rate1 - line 534, 3 = mean of counts group1?? - line 291, 4 = meanGroup2 - line 536, 5 = varGroup2 - line 539, 6 = err rate2 - line 541, 7 = mean of counts group2?? - line 292, 8 = pvalues - line 293
- out << "OTU\tmean(group1)\tvariance(group1)\tstderr(group1)\tmean_of_counts(group1)\tmean(group2)\tvariance(group2)\tstderr(group2)\tmean_of_counts(group1)\tp-value\tq-value\n";
+ out << "OTU\tmean(group1)\tvariance(group1)\tstderr(group1)\tmean(group2)\tvariance(group2)\tstderr(group2)\tp-value\tq-value\n";
for(int i = 0; i < row; i++){
if (m->control_pressed) { out.close(); return 0; }
- out << (i+1);
- for(int j = 0; j < 9; j++){ out << '\t' << storage[i][j]; }
- out << '\t' << qvalues[i];
- out << endl;
+ //if there are binlabels use them otherwise count.
+ if (m->binLabelsInFile.size() == row) { out << m->binLabelsInFile[i] << '\t'; }
+ else { out << (i+1) << '\t'; }
+
+ out << C1[i][0] << '\t' << C1[i][1] << '\t' << C1[i][2] << '\t' << C2[i][0] << '\t' << C2[i][1] << '\t' << C2[i][2] << '\t' << pvalues[i] << '\t' << qvalues[i] << endl;
}
out << endl << endl;
out.close();
-
- return 0;
-
- }catch(exception& e) {
- m->errorOut(e, "MothurMetastats", "runMetastats");
- exit(1);
- }
-}
-/***********************************************************/
-//Find the initial values for the matrix
-int MothurMetastats::start(vector<double>& Imatrix, int secondGroupingStart, vector<double>& initial, vector< vector<double> >& storage) {
- try {
-
- int a = row; a*=4;
-
- double xbardiff = 0.0; double denom = 0.0;
- vector<double> store; store.resize(a, 0.0);
- vector<double> tool; tool.resize(a, 0.0);
- vector< vector<double> > C1; C1.resize(row);
- for (int i = 0; i < C1.size(); i++) { C1[i].resize(3, 0.0); }
- vector< vector<double> > C2; C2.resize(row);
- for (int i = 0; i < C2.size(); i++) { C2[i].resize(3, 0.0); }
-
- meanvar(Imatrix, secondGroupingStart, store);
-
- if (m->control_pressed) { return 0; }
-
- //copy store into tool
- tool = store;
-
- for (int i = 0; i < row; i++){
- C1[i][0]=tool[i]; //mean group 1
- storage[i][0]=C1[i][0];
- C1[i][1]=tool[i+row+row]; // var group 1
- storage[i][1]=C1[i][1];
- C1[i][2]=C1[i][1]/(secondGroupingStart);
- storage[i][2]=sqrt(C1[i][2]);
-
- C2[i][0]=tool[i+row]; // mean group 2
- storage[i][4]=C2[i][0];
- C2[i][1]=tool[i+row+row+row]; // var group 2
- storage[i][5]=C2[i][1];
- C2[i][2]=C2[i][1]/(column-secondGroupingStart);
- storage[i][6]=sqrt(C2[i][2]);
- }
-
- if (m->control_pressed) { return 0; }
-
- for (int i = 0; i < row; i++){
- xbardiff = C1[i][0]-C2[i][0];
- denom = sqrt(C1[i][2]+C2[i][2]);
- initial[i]=fabs(xbardiff/denom);
- }
- return 0;
-
- }catch(exception& e) {
- m->errorOut(e, "MothurMetastats", "start");
- exit(1);
- }
-}
-/***********************************************************/
-int MothurMetastats::meanvar(vector<double>& pmatrix, int secondGroupingStart, vector<double>& store) {
- try {
- vector<double> temp; temp.resize(row, 0.0);
- vector<double> temp2; temp2.resize(row, 0.0);
- vector<double> var; var.resize(row, 0.0);
- vector<double> var2; var2.resize(row, 0.0);
-
- double a = secondGroupingStart;
- double b = column - a;
- int m = a * row;
- int n = row * column;
-
- for (int i = 0; i < m; i++) { temp[i%row] += pmatrix[i]; }
- for (int i = 0; i < n; i++) { temp2[i%row]+= pmatrix[i]; }
- for (int i = 0; i < row; i++) { temp2[i] -= temp[i]; }
- for (int i = 0; i <= row-1;i++) {
- store[i] = temp[i]/a;
- store[i+row]=temp2[i]/b;
- }
-
- //That completes the mean calculations.
-
- for (int i = 0; i < m; i++) { var[i%row] += pow((pmatrix[i]-store[i%row]),2); }
- for (int i = m; i < n; i++) { var2[i%row]+= pow((pmatrix[i]-store[(i%row)+row]),2); }
- for (int i = 0; i <= row-1; i++){
- store[i+2*row]=var[i]/(a-1);
- store[i+3*row]=var2[i]/(b-1);
- }
-
- // That completes var calculations.
-
- return 0;
-
- }catch(exception& e) {
- m->errorOut(e, "MothurMetastats", "meanvar");
- exit(1);
- }
-}
-/***********************************************************/
-int MothurMetastats::testp(vector<double>& permuted_ttests, vector<double>& permuted, vector<double>& Imatrix, int secondGroupingStart, vector<double>& Tinitial, vector<double>& ps) {
- try {
-
- vector<double> Tvalues; Tvalues.resize(row, 0.0);
- vector<double> counter; counter.resize(row, 0.0);
- int a, b, n;
-
- a = numPermutations;
- b = row;
- n = a*b;
-
- for (int j = 1; j <= row; j++) {
- if (m->control_pressed) { return 0; }
- permute_matrix(Imatrix, permuted, secondGroupingStart, Tvalues, Tinitial, counter);
- }
-
- for(int j = 0; j < row; j++) {
- if (m->control_pressed) { return 0; }
- ps[j] = ((counter[j]+1)/(double)(a+1));
- }
-
- return 0;
-
- }catch(exception& e) {
- m->errorOut(e, "MothurMetastats", "testp");
- exit(1);
- }
-}
-/***********************************************************/
-int MothurMetastats::permute_matrix(vector<double>& Imatrix, vector<double>& permuted, int secondGroupingStart, vector<double>& trial_ts, vector<double>& Tinitial, vector<double>& counter1){
- try {
-
- vector<int> y; y.resize(column, 0);
- for (int i = 1; i <= column; i++){ y[i-1] = i; }
-
- permute_array(y);
-
- int f = 0; int c = 0; int k = 0;
- for (int i = 0; i < column; i++){
-
- if (m->control_pressed) { return 0; }
-
- f = y[i]; //column number
- c = 1;
- c *= (f-1);
- c *= row;
- if (f == 1){ c = 0; } // starting value position in the Imatrix
-
- for(int j = 1; j <= row; j++){
- permuted[k] = Imatrix[c];
- c++; k++;
- }
- }
-
- calc_twosample_ts(permuted, secondGroupingStart, trial_ts, Tinitial, counter1);
-
- return 0;
-
- }catch(exception& e) {
- m->errorOut(e, "MothurMetastats", "permute_matrix");
- exit(1);
- }
+
+ return 0;
+
+ }catch(exception& e) {
+ m->errorOut(e, "MothurMetastats", "runMetastats");
+ exit(1);
+ }
}
/***********************************************************/
-int MothurMetastats::permute_array(vector<int>& array) {
+//# Computes a permutation-based p-value for every taxon (row).
+//# Imatrix - matrix handed to permute_and_calc_ts() to generate the null t-statistics
+//# tstats - observed t-statistic per taxon; only its absolute value is used
+//# Fmatrix - per-taxon values whose per-group totals decide which taxa are pooled
+//# Returns a vector of size row holding one p-value per taxon.
+//# Uses the members row, column, numPermutations and secondGroupingStart.
+vector<double> MothurMetastats::permuted_pvalues(vector< vector<double> >& Imatrix, vector<double>& tstats, vector< vector<double> >& Fmatrix) {
try {
- static int seeded = 0;
-
- if (! seeded) {
- seeded = 1;
- srand(time(NULL));
- }
-
- for (int i = 0; i < array.size(); i++) {
- if (m->control_pressed) { return 0; }
-
- int selection = rand() % (array.size() - i);
- int tmp = array[i + selection];
- array[i + selection] = array[i];
- array[i] = tmp;
- }
-
- return 0;
-
- }catch(exception& e) {
- m->errorOut(e, "MothurMetastats", "permute_array");
- exit(1);
- }
+ //# permuted_ttests[trial][taxon] stores the null tstat of each taxon for each permuted trial
+ vector<double> ps; ps.resize(row, 0.0); //# to store the pvalues
+ vector< vector<double> > permuted_ttests; permuted_ttests.resize(numPermutations);
+ for (int i = 0; i < numPermutations; i++) { permuted_ttests[i].resize(row, 0.0); }
+
+ //# calculate null version of tstats using B permutations.
+ for (int i = 0; i < numPermutations; i++) {
+ permuted_ttests[i] = permute_and_calc_ts(Imatrix);
+ }
+
+ //# calculate each pvalue using the null ts
+ if ((secondGroupingStart) < 8 || (column-secondGroupingStart) < 8){
+ //# fewer than 8 samples in a group: pool the null ts of the frequently observed taxa
+ vector< vector<double> > cleanedpermuted_ttests; cleanedpermuted_ttests.resize(numPermutations); //# the array pooling just the frequently observed ts
+ //# then pool the t's together!
+ //# count how many high freq taxa there are
+ //# hfc starts at 1, so it ends up one larger than the number of pooled taxa
+ int hfc = 1;
+ for (int i = 0; i < row; i++) { // # for each taxa
+ double group1Total = 0.0; double group2Total = 0.0;
+ for(int j = 0; j < secondGroupingStart; j++) { group1Total += Fmatrix[i][j]; }
+ for(int j = secondGroupingStart; j < column; j++) { group2Total += Fmatrix[i][j]; }
+
+ if (group1Total >= secondGroupingStart || group2Total >= (column-secondGroupingStart)){
+ hfc++;
+ for (int j = 0; j < numPermutations; j++) { cleanedpermuted_ttests[j].push_back(permuted_ttests[j][i]); }
+ }
+ }
+
+ //#now for each taxa
+ for (int i = 0; i < row; i++) {
+ //# fabs rather than abs so the double overload is used regardless of which headers are visible
+ double observed = fabs(tstats[i]);
+
+ //number of cleanedpermuted_ttests greater than tstat[i]
+ int numGreater = 0;
+ for (int j = 0; j < numPermutations; j++) {
+ //# each pooled row holds hfc-1 values; scanning up to hfc would read one past the end
+ for (int k = 0; k < cleanedpermuted_ttests[j].size(); k++) {
+ if (cleanedpermuted_ttests[j][k] > observed) { numGreater++; }
+ }
+ }
+
+ //# denominator intentionally keeps hfc (pooled count + 1), unchanged from the previous formula
+ ps[i] = (1/(double)(numPermutations*hfc))*numGreater;
+ }
+ }else{
+ for (int i = 0; i < row; i++) {
+ double observed = fabs(tstats[i]);
+
+ //number of permuted_ttests[i] greater than tstat[i] //(sum(permuted_ttests[i,] > abs(tstats[i]))+1)
+ int numGreater = 1;
+ for (int j = 0; j < numPermutations; j++) { if (permuted_ttests[j][i] > observed) { numGreater++; } }
+ ps[i] = (1/(double)(numPermutations+1))*numGreater;
+ }
+ }
+
+ return ps;
+
+ }catch(exception& e) {
+ m->errorOut(e, "MothurMetastats", "permuted_pvalues");
+ exit(1);
+ }
}
/***********************************************************/
-int MothurMetastats::calc_twosample_ts(vector<double>& Pmatrix, int secondGroupingStart, vector<double>& Ts, vector<double>& Tinitial, vector<double>& counter) {
+//# Builds one null sample of two-sample t-statistics.
+//# Copies Imatrix, shuffles the values of every row, then computes per-row mean,
+//# variance and squared standard error for the two groups split at secondGroupingStart.
+//# Returns a vector of size row holding |t| for each taxon.
+vector<double> MothurMetastats::permute_and_calc_ts(vector< vector<double> >& Imatrix) {
try {
- int a = row * 4;
-
- vector< vector<double> > C1; C1.resize(row);
- for (int i = 0; i < C1.size(); i++) { C1[i].resize(3, 0.0); }
- vector< vector<double> > C2; C2.resize(row);
- for (int i = 0; i < C2.size(); i++) { C2[i].resize(3, 0.0); }
- vector<double> storage; storage.resize(a, 0.0);
- vector<double> tool; tool.resize(a, 0.0);
- double xbardiff = 0.0; double denom = 0.0;
-
- meanvar(Pmatrix, secondGroupingStart, storage);
-
- for(int i = 0;i <= (a-1); i++) {
- if (m->control_pressed) { return 0; }
- tool[i] = storage[i];
- }
-
- for (int i = 0; i < row; i++){
- if (m->control_pressed) { return 0; }
- C1[i][0]=tool[i];
- C1[i][1]=tool[i+row+row];
- C1[i][2]=C1[i][1]/(secondGroupingStart);
-
- C2[i][0]=tool[i+row];
- C2[i][1]=tool[i+row+row+row]; // var group 2
- C2[i][2]=C2[i][1]/(column-secondGroupingStart);
- }
-
- for (int i = 0; i < row; i++){
- if (m->control_pressed) { return 0; }
- xbardiff = C1[i][0]-C2[i][0];
- denom = sqrt(C1[i][2]+C2[i][2]);
- Ts[i]=fabs(xbardiff/denom);
- if (fabs(Ts[i])>(fabs(Tinitial[i])+.0000000000001)){ //13th place
- counter[i]++;
- }
- }
-
- return 0;
-
- }catch(exception& e) {
- m->errorOut(e, "MothurMetastats", "calc_twosample_ts");
- exit(1);
- }
+ vector< vector<double> > permutedMatrix = Imatrix;
+
+ //shuffle the abundances within each row; every row is shuffled independently of the others
+ for (int i = 0; i < permutedMatrix.size(); i++) { random_shuffle(permutedMatrix[i].begin(), permutedMatrix[i].end()); }
+
+ //calc ts
+ vector< vector<double> > C1; C1.resize(row);
+ for (int i = 0; i < row; i++) { C1[i].resize(3, 0.0); } // statistic profiles for class1 and class 2
+ vector< vector<double> > C2; C2.resize(row); // [0] = mean, [1] = variance, [2] = standard error squared
+ for (int i = 0; i < row; i++) { C2[i].resize(3, 0.0); }
+ vector<double> Ts; Ts.resize(row, 0.0); // a place to store the permuted t-statistics
+
+ //#*************************************
+ //# generate statistics mean, var, stderr
+ //#*************************************
+ for(int i = 0; i < row; i++){ // for each taxa
+ //# find the mean of each group
+ double g1Total = 0.0; double g2Total = 0.0;
+ for (int j = 0; j < secondGroupingStart; j++) { g1Total += permutedMatrix[i][j]; }
+ C1[i][0] = g1Total/(double)(secondGroupingStart);
+ for (int j = secondGroupingStart; j < column; j++) { g2Total += permutedMatrix[i][j]; }
+ C2[i][0] = g2Total/(double)(column-secondGroupingStart);
+
+ //# find the variance of each group (n-1 in the denominator)
+ double g1Var = 0.0; double g2Var = 0.0;
+ for (int j = 0; j < secondGroupingStart; j++) { g1Var += pow((permutedMatrix[i][j]-C1[i][0]), 2); }
+ C1[i][1] = g1Var/(double)(secondGroupingStart-1);
+ for (int j = secondGroupingStart; j < column; j++) { g2Var += pow((permutedMatrix[i][j]-C2[i][0]), 2); }
+ C2[i][1] = g2Var/(double)(column-secondGroupingStart-1);
+
+ //# squared std error of each group; only the sum under the sqrt below is needed here
+ C1[i][2] = C1[i][1]/(double)(secondGroupingStart);
+ C2[i][2] = C2[i][1]/(double)(column-secondGroupingStart);
+ }
+
+ //#*************************************
+ //# two sample t-statistics
+ //#*************************************
+ for(int i = 0; i < row; i++){ // # for each taxa
+ double xbar_diff = C1[i][0] - C2[i][0];
+ double denom = sqrt(C1[i][2] + C2[i][2]);
+ //# fabs rather than abs so the double overload is used regardless of which headers are visible
+ Ts[i] = fabs(xbar_diff/denom); // calculate two sample t-statistic
+ }
+
+ return Ts;
+
+
+ }catch(exception& e) {
+ //# report this function's own name so the error log points at the right place
+ m->errorOut(e, "MothurMetastats", "permute_and_calc_ts");
+ exit(1);
+ }
}
/***********************************************************/
vector<double> MothurMetastats::calc_qvalues(vector<double>& pValues) {
try {
+ /* cout << "x <- c(" << pValues[0];
+ for (int l = 1; l < pValues.size(); l++){
+ cout << ", " << pValues[l];
+ }
+ cout << ")\n";*/
+
int numRows = pValues.size();
vector<double> qvalues(numRows, 0.0);
private:
MothurOut* m;
- int row, column, numPermutations;
+ int row, column, numPermutations, secondGroupingStart;
double threshold;
-
+
+ vector<double> permuted_pvalues(vector< vector<double> >&, vector<double>&, vector< vector<double> >&);
+ vector<double> permute_and_calc_ts(vector< vector<double> >&);
+
int start(vector<double>&, int, vector<double>&, vector< vector<double> >&); //Find the initial values for the matrix
int meanvar(vector<double>&, int, vector<double>&);
int testp(vector<double>&, vector<double>&, vector<double>&, int, vector<double>&, vector<double>&);
//add / to name if needed
string lastChar = pathname.substr(pathname.length()-1);
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if (lastChar != "/") { pathname += "/"; }
#else
if (lastChar != "\\") { pathname += "\\"; }
if (pid == 0) { //only one process should output to screen
#endif
- cout << output;
out << output;
+ logger() << output;
#ifdef USE_MPI
}
if (pid == 0) { //only one process should output to screen
#endif
- cout << endl;
out << endl;
+ logger() << endl;
#ifdef USE_MPI
}
if (pid == 0) { //only one process should output to screen
#endif
- cout << output;
+
out << output;
outputFile << output;
+ logger() << output;
#ifdef USE_MPI
}
#endif
+
}
catch(exception& e) {
errorOut(e, "MothurOut", "MothurOut");
if (pid == 0) { //only one process should output to screen
#endif
- cout << endl;
out << endl;
outputFile << endl;
+ logger() << endl;
#ifdef USE_MPI
}
//
// On failure, returns 0.0, 0.0
int MothurOut::mem_usage(double& vm_usage, double& resident_set) {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
vm_usage = 0.0;
resident_set = 0.0;
}
/***********************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#ifdef USE_COMPRESSION
inline bool endsWith(string s, const char * suffix){
size_t suffixLength = strlen(suffix);
string rootName = longName;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#ifdef USE_COMPRESSION
if (endsWith(rootName, ".gz") || endsWith(rootName, ".bz2")) {
int pos = rootName.find_last_of('.');
string MothurOut::getExtension(string longName){
try {
- string extension = longName;
+ string extension = "";
if(longName.find_last_of('.') != longName.npos){
int pos = longName.find_last_of('.');
string cwd;
//get current working directory
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if (path.find("~") != -1) { //go to home directory
string homeDir;
}else if (path[(pos-1)] == '/') { //you want the current working dir ./
path = path.substr(0, pos);
}else if (pos == 1) { break; //you are at the end
- }else { cout << "cannot resolve path for " << fileName << endl; return fileName; }
+ }else { mothurOut("cannot resolve path for " + fileName + "\n"); return fileName; }
}
for (int i = index; i >= 0; i--) {
}else if (path[(pos-1)] == '\\') { //you want the current working dir ./
path = path.substr(0, pos);
}else if (pos == 1) { break; //you are at the end
- }else { cout << "cannot resolve path for " << fileName << endl; return fileName; }
+ }else { mothurOut("cannot resolve path for " + fileName + "\n"); return fileName; }
}
for (int i = index; i >= 0; i--) {
try {
//get full path name
string completeFileName = getFullPathName(fileName);
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#ifdef USE_COMPRESSION
// check for gzipped or bzipped file
if (endsWith(completeFileName, ".gz") || endsWith(completeFileName, ".bz2")) {
//get full path name
string completeFileName = getFullPathName(fileName);
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#ifdef USE_COMPRESSION
// check for gzipped or bzipped file
if (endsWith(completeFileName, ".gz") || endsWith(completeFileName, ".bz2")) {
int exist = openInputFile(newName, inTest, "");
inTest.close();
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if (exist == 0) { //you could open it so you want to delete it
string command = "rm " + newName;
system(command.c_str());
try {
string completeFileName = getFullPathName(fileName);
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#ifdef USE_COMPRESSION
// check for gzipped file
if (endsWith(completeFileName, ".gz") || endsWith(completeFileName, ".bz2")) {
//if you can, use the unix sort since its been optimized for years
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
string command = "sort -n -k +3 " + distFile + " -o " + outfile;
system(command.c_str());
#else //you are stuck with my best attempt...
while(isspace(d) && (d != in.eof())) { d=in.get(); count++;}
}
positions.push_back(count-1);
- cout << count-1 << endl;
+ //cout << count-1 << endl;
}
in.close();
fclose (pFile);
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//estimate file breaks
unsigned long long chunkSize = 0;
}
in.close();
- return 0;
+ return nameMap.size();
}
catch(exception& e) {
}
in.close();
- return 0;
+ return nameMap.size();
}
catch(exception& e) {
#include "mothur.h"
+/***********************************************/
+struct logger { // stateless helper: every operator<< below writes its argument to cout
+
+ logger() {}
+ ~logger() {}
+
+ template< class T >
+ logger& operator <<( const T& o ) { // generic overload: forwards any value that cout can stream
+ cout << o; return *this; // returns *this so calls can be chained
+ }
+
+ logger& operator<<(ostream& (*m)(ostream&) ) { // overload taking an ostream manipulator function pointer (the signature endl has)
+ cout << m; return *this;
+ }
+
+};
/***********************************************/
class MothurOut {
int getRandomIndex(int); //highest
int control_pressed;
- bool executing, runParse, jumble, gui;
+ bool executing, runParse, jumble, gui, mothurCalling;
//current files - if you add a new type you must edit optionParser->getParameters, get.current command and mothurOut->printCurrentFiles/clearCurrentFiles.
string getPhylipFile() { return phylipfile; }
gui = false;
printedHeaders = false;
commandInputsConvertError = false;
+ mothurCalling = false;
sharedHeaderMode = "";
}
~MothurOut();
for(int i=0;i<numRefSeqs;i++){
if(restricted[i] == 0){
- if(leftDiffs[i][l] < singleLeft[l] && sequences[i].frequency || (leftDiffs[i][l] == singleLeft[l] && sequences[i].frequency > sequences[bestLeft[l]].frequency)){
+ if(((leftDiffs[i][l] < singleLeft[l]) && sequences[i].frequency) || ((leftDiffs[i][l] == singleLeft[l]) && (sequences[i].frequency > sequences[bestLeft[l]].frequency))){
singleLeft[l] = leftDiffs[i][l];
bestLeft[l] = i;
}
for(int i=0;i<numRefSeqs;i++){
if(restricted[i] == 0){
- if(rightDiffs[i][l] < singleRight[l] && sequences[i].frequency || (rightDiffs[i][l] == singleRight[l] && sequences[i].frequency > sequences[bestRight[l]].frequency)){
+ if((rightDiffs[i][l] < singleRight[l] && sequences[i].frequency) || ((rightDiffs[i][l] == singleRight[l] && sequences[i].frequency > sequences[bestRight[l]].frequency))){
singleRight[l] = rightDiffs[i][l];
bestRight[l] = i;
}
if(restricted[i] == 0){
int delta = leftDiffs[i][y] - leftDiffs[i][x];
- if(delta < minDelta[x][y] || delta == minDelta[x][y] && sequences[i].frequency > sequences[minDeltaSeq[x][y]].frequency){
+ if(delta < minDelta[x][y] || (delta == minDelta[x][y] && sequences[i].frequency > sequences[minDeltaSeq[x][y]].frequency)){
minDelta[x][y] = delta;
minDeltaSeq[x][y] = i;
}
/**************************************************************************************************/
int correctDist::execute(string distanceFileName){
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
processors = 1;
#endif
int correctDist::createProcess(string distanceFileName){
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
vector<int> processIDs;
#include <signal.h>\r
#include <float.h>\r
\r
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)\r
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)\r
#include <sys/time.h>\r
#include <sys/resource.h>\r
#include <unistd.h>\r
return isatty(fd) != 0;\r
}\r
\r
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)\r
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)\r
#else\r
//#ifdef BIT_VERSION\r
//#include <io.h>\r
Log("fpos %ld (retval %d)\n", (long) fpos, fgetpos_retval);\r
// Log("eof %d\n", _eof(fd));\r
#endif\r
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)\r
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)\r
#else\r
#ifdef BIT_VERSION\r
__int64 pos64 = _ftelli64(f);\r
fprintf(stderr, "\n---Fatal error---\n%s\n", szStr);\r
Log("\n---Fatal error---\n%s\n", szStr);\r
\r
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)\r
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)\r
#else\r
//if (IsDebuggerPresent())\r
// __debugbreak();\r
g_Opts.insert(Opt);\r
}\r
\r
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)\r
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)\r
#else\r
#pragma warning(disable: 4505) // unreferenced local function\r
#endif\r
m->mothurConvert(temp, epsilon);
if (mindim < 1) { m->mothurOut("mindim must be at least 1."); m->mothurOutEndLine(); abort = true; }
- if (maxdim < mindim) { m->mothurOut("maxdim must be greater than mindim."); m->mothurOutEndLine(); abort = true; }
+ if (maxdim < mindim) { maxdim = mindim; }
}
}
int numGroups = shared.size();
data.clear(); data.resize(numGroups,0);
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//get bin values and set sharedByAll
bool sharedByAll = true;
for (int j = 0; j < numGroups; j++) {
public:
oneGapDist() {}
- oneGapDist(const oneGapDist& ddb) {}
void calcDist(Sequence A, Sequence B){
public:
oneGapIgnoreTermGapDist() {}
- oneGapIgnoreTermGapDist(const oneGapIgnoreTermGapDist& ddb) {}
void calcDist(Sequence A, Sequence B){
string namefile = m->getNameFile();
bool match = false;
- if (namefile != "") {
+ if ((namefile != "")&&(!m->mothurCalling)) {
string temp = m->getRootName(m->getSimpleName(namefile));
vector<string> rootName;
m->splitAtChar(temp, rootName, '.');
for (int k = 0; k < i; k++) {
if (m->control_pressed) { out.close(); return 0; }
-
+
double coef = 0.0;
double sig = 0.0;
if (method == "spearman") { coef = linear.calcSpearman(xy[i], xy[k], sig); }
out.close();
+
return 0;
}
*/
#include "pairwiseseqscommand.h"
-#include "sequence.hpp"
-
-#include "gotohoverlap.hpp"
-#include "needlemanoverlap.hpp"
-#include "blastalign.hpp"
-#include "noalign.hpp"
-
-#include "ignoregaps.h"
-#include "eachgapdist.h"
-#include "eachgapignore.h"
-#include "onegapdist.h"
-#include "onegapignore.h"
-
//**********************************************************************************************************************
vector<string> PairwiseSeqsCommand::setParameters(){
if (calc == "default") { calc = "onegap"; }
}
m->splitAtDash(calc, Estimators);
-
- ValidCalculators validCalculator;
- if (countends) {
- for (int i=0; i<Estimators.size(); i++) {
- if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) {
- if (Estimators[i] == "nogaps") { distCalculator = new ignoreGaps(); }
- else if (Estimators[i] == "eachgap") { distCalculator = new eachGapDist(); }
- else if (Estimators[i] == "onegap") { distCalculator = new oneGapDist(); }
- }
- }
- }else {
- for (int i=0; i<Estimators.size(); i++) {
- if (validCalculator.isValidCalculator("distance", Estimators[i]) == true) {
- if (Estimators[i] == "nogaps") { distCalculator = new ignoreGaps(); }
- else if (Estimators[i] == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
- else if (Estimators[i] == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
- }
- }
- }
}
}
try {
if (abort == true) { if (calledHelp) { return 0; } return 2; }
- int longestBase = 2000; //will need to update this in driver if we find sequences with more bases. hardcoded so we don't have the pre-read user fasta file.
-
- if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); }
- else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); }
- else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
- else if(align == "noalign") { alignment = new NoAlign(); }
- else {
- m->mothurOut(align + " is not a valid alignment option. I will run the command using needleman.");
- m->mothurOutEndLine();
- alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase);
- }
+ longestBase = 2000; //will need to update this in driver if we find sequences with more bases. hardcoded so we don't have the pre-read user fasta file.
cutoff += 0.005;
driverMPI(start, end, outMPI, cutoff);
- if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); m->mothurRemove(outputFile); delete distCalculator; return 0; }
+ if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); m->mothurRemove(outputFile); return 0; }
//wait on chidren
for(int i = 1; i < processors; i++) {
- if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); m->mothurRemove(outputFile); delete distCalculator; return 0; }
+ if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); m->mothurRemove(outputFile); return 0; }
char buf[5];
MPI_Recv(buf, 5, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
//do your part
driverMPI(start, end, outMPI, cutoff);
- if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); m->mothurRemove(outputFile); delete distCalculator; return 0; }
+ if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); m->mothurRemove(outputFile); return 0; }
char buf[5];
strcpy(buf, "done");
if (output != "square"){ driverMPI(start, end, outputFile, mySize); }
else { driverMPI(start, end, outputFile, mySize, output); }
- if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outputFile); delete distCalculator; return 0; }
+ if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outputFile); return 0; }
int amode=MPI_MODE_APPEND|MPI_MODE_WRONLY|MPI_MODE_CREATE; //
MPI_File outMPI;
for(int b = 1; b < processors; b++) {
unsigned long long fileSize;
- if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); m->mothurRemove(outputFile); delete distCalculator; return 0; }
+ if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&outMPI); m->mothurRemove(outputFile); return 0; }
MPI_Recv(&fileSize, 1, MPI_LONG, b, tag, MPI_COMM_WORLD, &status);
if (output != "square"){ driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size); }
else { driverMPI(start, end, (outputFile + toString(pid) + ".temp"), size, output); }
- if (m->control_pressed) { delete distCalculator; return 0; }
+ if (m->control_pressed) { return 0; }
//tell parent you are done.
MPI_Send(&size, 1, MPI_LONG, 0, tag, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//if you don't need to fork anything
if(processors == 1){
if (output != "square") { driver(0, numSeqs, outputFile, cutoff); }
createProcesses(outputFile);
}
- #else
+ //#else
//ifstream inFASTA;
- if (output != "square") { driver(0, numSeqs, outputFile, cutoff); }
- else { driver(0, numSeqs, outputFile, "square"); }
- #endif
+ //if (output != "square") { driver(0, numSeqs, outputFile, cutoff); }
+ //else { driver(0, numSeqs, outputFile, "square"); }
+ //#endif
#endif
- if (m->control_pressed) { outputTypes.clear(); delete distCalculator; m->mothurRemove(outputFile); return 0; }
+ if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outputFile); return 0; }
#ifdef USE_MPI
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
m->mothurOut("It took " + toString(time(NULL) - startTime) + " to calculate the distances for " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
- if (m->control_pressed) { outputTypes.clear(); delete distCalculator; m->mothurRemove(outputFile); return 0; }
+ if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outputFile); return 0; }
}
-
- delete distCalculator;
//set phylip file as new current phylipfile
string current = "";
/**************************************************************************************************/
void PairwiseSeqsCommand::createProcesses(string filename) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 1;
+ int process = 1;
processIDS.clear();
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+
//loop through and create all the processes you want
while (process != processors) {
int temp = processIDS[i];
wait(&temp);
}
+#else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the distanceData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //that's why the distance calculator was moved inside of the driver to make separate copies.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
- //append and remove temp files
+ vector<pairwiseData*> pDataArray; //[processors-1];
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor-1 worker threads.
+ for( int i=0; i<processors-1; i++ ){
+ string extension = toString(i) + ".temp";
+
+ // Allocate memory for thread data.
+ pairwiseData* tempDist = new pairwiseData((filename+extension), align, "square", Estimators[0], countends, output, alignDB, m, lines[i+1].start, lines[i+1].end, match, misMatch, gapOpen, gapExtend, longestBase, i);
+ pDataArray.push_back(tempDist);
+ processIDS.push_back(i);
+
+ if (output != "square") { hThreadArray[i] = CreateThread(NULL, 0, MyPairwiseThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); }
+ else { hThreadArray[i] = CreateThread(NULL, 0, MyPairwiseSquareThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); }
+ }
+
+ //do your part
+ if (output != "square") { driver(lines[0].start, lines[0].end, filename, cutoff); }
+ else { driver(lines[0].start, lines[0].end, filename, "square"); }
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+#endif
+
+ //append and remove temp files
for (int i=0;i<processIDS.size();i++) {
m->appendFiles((filename + toString(processIDS[i]) + ".temp"), filename);
m->mothurRemove((filename + toString(processIDS[i]) + ".temp"));
}
-#endif
+
}
catch(exception& e) {
m->errorOut(e, "PairwiseSeqsCommand", "createProcesses");
try {
int startTime = time(NULL);
+
+ Alignment* alignment;
+ if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); }
+ else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); }
+ else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
+ else if(align == "noalign") { alignment = new NoAlign(); }
+ else {
+ m->mothurOut(align + " is not a valid alignment option. I will run the command using needleman.");
+ m->mothurOutEndLine();
+ alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase);
+ }
+
+ ValidCalculators validCalculator;
+ Dist* distCalculator;
+ if (countends) {
+ if (validCalculator.isValidCalculator("distance", Estimators[0]) == true) {
+ if (Estimators[0] == "nogaps") { distCalculator = new ignoreGaps(); }
+ else if (Estimators[0] == "eachgap") { distCalculator = new eachGapDist(); }
+ else if (Estimators[0] == "onegap") { distCalculator = new oneGapDist(); }
+ }
+ }else {
+ if (validCalculator.isValidCalculator("distance", Estimators[0]) == true) {
+ if (Estimators[0] == "nogaps") { distCalculator = new ignoreGaps(); }
+ else if (Estimators[0] == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
+ else if (Estimators[0] == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
+ }
+ }
//column file
ofstream outFile(dFileName.c_str(), ios::trunc);
for(int j=0;j<i;j++){
- if (m->control_pressed) { outFile.close(); return 0; }
+ if (m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0; }
if (alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
alignment->resize(alignDB.get(i).getUnaligned().length()+1);
m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
outFile.close();
+ delete alignment;
+ delete distCalculator;
return 1;
}
try {
int startTime = time(NULL);
+
+ Alignment* alignment;
+ if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); }
+ else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); }
+ else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
+ else if(align == "noalign") { alignment = new NoAlign(); }
+ else {
+ m->mothurOut(align + " is not a valid alignment option. I will run the command using needleman.");
+ m->mothurOutEndLine();
+ alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase);
+ }
+ ValidCalculators validCalculator;
+ Dist* distCalculator;
+ if (countends) {
+ if (validCalculator.isValidCalculator("distance", Estimators[0]) == true) {
+ if (Estimators[0] == "nogaps") { distCalculator = new ignoreGaps(); }
+ else if (Estimators[0] == "eachgap") { distCalculator = new eachGapDist(); }
+ else if (Estimators[0] == "onegap") { distCalculator = new oneGapDist(); }
+ }
+ }else {
+ if (validCalculator.isValidCalculator("distance", Estimators[0]) == true) {
+ if (Estimators[0] == "nogaps") { distCalculator = new ignoreGaps(); }
+ else if (Estimators[0] == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
+ else if (Estimators[0] == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
+ }
+ }
+
//column file
ofstream outFile(dFileName.c_str(), ios::trunc);
outFile.setf(ios::fixed, ios::showpoint);
for(int j=0;j<alignDB.getNumSeqs();j++){
- if (m->control_pressed) { outFile.close(); return 0; }
+ if (m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0; }
if (alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
alignment->resize(alignDB.get(i).getUnaligned().length()+1);
m->mothurOut(toString(endLine-1) + "\t" + toString(time(NULL) - startTime)); m->mothurOutEndLine();
outFile.close();
+ delete alignment;
+ delete distCalculator;
return 1;
}
try {
MPI_Status status;
int startTime = time(NULL);
+
+ Alignment* alignment;
+ if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); }
+ else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); }
+ else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
+ else if(align == "noalign") { alignment = new NoAlign(); }
+ else {
+ m->mothurOut(align + " is not a valid alignment option. I will run the command using needleman.");
+ m->mothurOutEndLine();
+ alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase);
+ }
+ ValidCalculators validCalculator;
+ Dist* distCalculator;
+ if (countends) {
+ if (validCalculator.isValidCalculator("distance", Estimators[0]) == true) {
+ if (Estimators[0] == "nogaps") { distCalculator = new ignoreGaps(); }
+ else if (Estimators[0] == "eachgap") { distCalculator = new eachGapDist(); }
+ else if (Estimators[0] == "onegap") { distCalculator = new oneGapDist(); }
+ }
+ }else {
+ if (validCalculator.isValidCalculator("distance", Estimators[0]) == true) {
+ if (Estimators[0] == "nogaps") { distCalculator = new ignoreGaps(); }
+ else if (Estimators[0] == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
+ else if (Estimators[0] == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
+ }
+ }
+
string outputString = "";
for(int i=startLine;i<endLine;i++){
for(int j=0;j<i;j++){
- if (m->control_pressed) { return 0; }
+ if (m->control_pressed) { delete alignment; delete distCalculator; return 0; }
if (alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
alignment->resize(alignDB.get(i).getUnaligned().length()+1);
delete buf;
}
-
+ delete alignment;
+ delete distCalculator;
return 1;
}
catch(exception& e) {
MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
-
+ Alignment* alignment;
+ if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); }
+ else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); }
+ else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
+ else if(align == "noalign") { alignment = new NoAlign(); }
+ else {
+ m->mothurOut(align + " is not a valid alignment option. I will run the command using needleman.");
+ m->mothurOutEndLine();
+ alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase);
+ }
+
+ ValidCalculators validCalculator;
+ Dist* distCalculator;
+ if (countends) {
+ if (validCalculator.isValidCalculator("distance", Estimators[0]) == true) {
+ if (Estimators[0] == "nogaps") { distCalculator = new ignoreGaps(); }
+ else if (Estimators[0] == "eachgap") { distCalculator = new eachGapDist(); }
+ else if (Estimators[0] == "onegap") { distCalculator = new oneGapDist(); }
+ }
+ }else {
+ if (validCalculator.isValidCalculator("distance", Estimators[0]) == true) {
+ if (Estimators[0] == "nogaps") { distCalculator = new ignoreGaps(); }
+ else if (Estimators[0] == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
+ else if (Estimators[0] == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
+ }
+ }
+
string outputString = "";
size = 0;
for(int j=0;j<i;j++){
- if (m->control_pressed) { return 0; }
+ if (m->control_pressed) { delete alignment; delete distCalculator; return 0; }
if (alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
alignment->resize(alignDB.get(i).getUnaligned().length()+1);
}
MPI_File_close(&outMPI);
+ delete alignment;
+ delete distCalculator;
return 1;
}
MPI_File_open(MPI_COMM_SELF, filename, amode, MPI_INFO_NULL, &outMPI);
+ Alignment* alignment;
+ if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); }
+ else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); }
+ else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
+ else if(align == "noalign") { alignment = new NoAlign(); }
+ else {
+ m->mothurOut(align + " is not a valid alignment option. I will run the command using needleman.");
+ m->mothurOutEndLine();
+ alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase);
+ }
-
+ ValidCalculators validCalculator;
+ Dist* distCalculator;
+ if (countends) {
+ if (validCalculator.isValidCalculator("distance", Estimators[0]) == true) {
+ if (Estimators[0] == "nogaps") { distCalculator = new ignoreGaps(); }
+ else if (Estimators[0] == "eachgap") { distCalculator = new eachGapDist(); }
+ else if (Estimators[0] == "onegap") { distCalculator = new oneGapDist(); }
+ }
+ }else {
+ if (validCalculator.isValidCalculator("distance", Estimators[0]) == true) {
+ if (Estimators[0] == "nogaps") { distCalculator = new ignoreGaps(); }
+ else if (Estimators[0] == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
+ else if (Estimators[0] == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
+ }
+ }
+
string outputString = "";
size = 0;
for(int j=0;j<alignDB.getNumSeqs();j++){
- if (m->control_pressed) { return 0; }
+ if (m->control_pressed) { delete alignment; return 0; }
if (alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
alignment->resize(alignDB.get(i).getUnaligned().length()+1);
MPI_File_close(&outMPI);
+ delete alignment;
return 1;
}
catch(exception& e) {
#include "validcalculator.h"
#include "dist.h"
#include "sequencedb.h"
+#include "sequence.hpp"
+
+#include "gotohoverlap.hpp"
+#include "needlemanoverlap.hpp"
+#include "blastalign.hpp"
+#include "noalign.hpp"
+
+#include "ignoregaps.h"
+#include "eachgapdist.h"
+#include "eachgapignore.h"
+#include "onegapdist.h"
+#include "onegapignore.h"
class PairwiseSeqsCommand : public Command {
vector<int> processIDS; //end line, processid
vector<distlinePair> lines;
- Alignment* alignment;
- Dist* distCalculator;
SequenceDB alignDB;
void createProcesses(string);
string fastaFileName, align, calc, outputDir, output;
float match, misMatch, gapOpen, gapExtend, cutoff;
- int processors;
+ int processors, longestBase;
vector<string> fastaFileNames, Estimators;
vector<string> outputNames;
bool abort, countends, compress;
};
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+// Per-thread work order for the Windows pairwise-distance worker threads.
+// One instance is allocated per worker and handed to the thread function as its LPVOID argument.
+struct pairwiseData {
+    string outputFileName;                              // file the worker truncates and writes its distances to
+    string align, square, distcalcType, output;         // alignment method name, "square" tag, distance estimator name, output format
+    unsigned long long start;                           // first sequence index this worker handles
+    unsigned long long end;                             // one past the last sequence index this worker handles
+    MothurOut* m;                                       // used by the worker for logging and to poll control_pressed
+    float match, misMatch, gapOpen, gapExtend, cutoff;  // alignment scoring values and the distance cutoff
+    int count, threadID, longestBase;                   // count is set to 0 here and overwritten with end by the worker
+    bool countends;                                     // selects which family of distance calculators the worker builds
+    SequenceDB alignDB;                                 // copied by value into this struct
+
+    // Zero-initialise the scalars so a default-constructed instance never carries indeterminate values.
+    pairwiseData() : start(0), end(0), m(NULL), match(0.0f), misMatch(0.0f), gapOpen(0.0f), gapExtend(0.0f), cutoff(1.0f), count(0), threadID(0), longestBase(0), countends(false) {}
+
+    // ofn   - output file name           al  - alignment method       sq  - stored in square
+    // di    - distance estimator name    co  - countends flag         op  - output format
+    // DB    - sequences to compare       mout - MothurOut instance    st/en - row range [st, en)
+    // ma/misMa/gapO/gapE - alignment scoring values
+    // thr   - stored in longestBase      tid - thread id
+    // cut   - distance cutoff. This trailing argument is new and optional: the previous 16-argument
+    //         form never assigned cutoff, so workers compared distances against an indeterminate value.
+    //         NOTE(review): 1.0 is assumed to match the command's default cutoff - TODO confirm,
+    //         and have callers pass the user's cutoff explicitly.
+    pairwiseData(string ofn, string al, string sq, string di, bool co, string op, SequenceDB DB, MothurOut* mout, unsigned long long st, unsigned long long en, float ma, float misMa, float gapO, float gapE, int thr, int tid, float cut = 1.0f) {
+        outputFileName = ofn;
+        m = mout;
+        start = st;
+        end = en;
+        match = ma;
+        misMatch = misMa;
+        gapOpen = gapO;
+        gapExtend = gapE;
+        cutoff = cut;
+        longestBase = thr;
+        align = al;
+        square = sq;
+        distcalcType = di;
+        countends = co;
+        alignDB = DB;
+        count = 0;
+        output = op;
+        threadID = tid;
+    }
+};
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+// Windows worker-thread entry point for square-matrix output.
+// lpParam must point to a pairwiseData. For every sequence i in [start, end) the thread writes the
+// (space-padded) sequence name followed by the distance from i to every sequence in alignDB.
+// The worker whose start is 0 also writes the sequence count as the first line.
+// Returns 0 on completion, when control_pressed is seen, or when no distance calculator could be built;
+// an exception is reported through errorOut and terminates the process with exit(1).
+static DWORD WINAPI MyPairwiseSquareThreadFunction(LPVOID lpParam){
+    pairwiseData* pDataArray;
+    pDataArray = (pairwiseData*)lpParam;
+
+    try {
+        ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc);
+        outFile.setf(ios::fixed, ios::showpoint);
+        outFile << setprecision(4);
+
+        pDataArray->count = pDataArray->end;
+
+        int startTime = time(NULL);
+
+        //each thread builds its own aligner so no alignment state is shared between threads
+        Alignment* alignment;
+        if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
+        else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
+        else if(pDataArray->align == "blast") { alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); }
+        else if(pDataArray->align == "noalign") { alignment = new NoAlign(); }
+        else {
+            pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
+            pDataArray->m->mothurOutEndLine();
+            alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);
+        }
+
+        //start from NULL so an unmatched estimator is detected instead of dereferencing an uninitialised pointer
+        ValidCalculators validCalculator;
+        Dist* distCalculator = NULL;
+        if (pDataArray->countends) {
+            if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
+                if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
+                else if (pDataArray->distcalcType == "eachgap") { distCalculator = new eachGapDist(); }
+                else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapDist(); }
+            }
+        }else {
+            if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
+                if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
+                else if (pDataArray->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
+                else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
+            }
+        }
+
+        if (distCalculator == NULL) {
+            pDataArray->m->mothurOut(pDataArray->distcalcType + " is not a valid distance calculator."); pDataArray->m->mothurOutEndLine();
+            outFile.close(); delete alignment; return 0;
+        }
+
+        if(pDataArray->start == 0){ outFile << pDataArray->alignDB.getNumSeqs() << endl; }
+
+        for(int i=pDataArray->start;i<pDataArray->end;i++){
+
+            string name = pDataArray->alignDB.get(i).getName();
+            //pad with spaces to make compatible
+            if (name.length() < 10) { while (name.length() < 10) { name += " "; } }
+
+            outFile << name << '\t';
+
+            for(int j=0;j<pDataArray->alignDB.getNumSeqs();j++){
+
+                if (pDataArray->m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0; }
+
+                //grow the aligner when either sequence is longer than its current row count
+                if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
+                    alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1);
+                }
+
+                if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) {
+                    alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1);
+                }
+
+                //work on fresh copies: setAligned below overwrites their aligned strings
+                Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned());
+                Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned());
+
+                alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
+                seqI.setAligned(alignment->getSeqAAln());
+                seqJ.setAligned(alignment->getSeqBAln());
+
+                distCalculator->calcDist(seqI, seqJ);
+                double dist = distCalculator->getDist();
+
+                outFile << dist << '\t';
+            }
+
+            outFile << endl;
+
+            //progress report every 100 rows
+            if(i % 100 == 0){
+                pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
+            }
+
+        }
+        pDataArray->m->mothurOut(toString(pDataArray->end-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
+
+        outFile.close();
+        delete alignment;
+        delete distCalculator;
+
+    }
+    catch(exception& e) {
+        pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseSquareThreadFunction");
+        exit(1);
+    }
+
+    //a thread procedure must return a value; previously control flowed off the end of this DWORD function
+    return 0;
+}
+
+/**************************************************************************************************/
+// Windows worker-thread entry point for "lt" and "column" output.
+// lpParam must point to a pairwiseData. For every sequence i in [start, end) the thread computes the
+// distance from i to each earlier sequence j < i.
+//   output == "lt":     writes the padded name, then every distance; the worker whose start is 0
+//                       also writes the sequence count as the first line.
+//   output == "column": writes "nameI nameJ dist" only when dist <= cutoff.
+// Returns 0 on completion, when control_pressed is seen, or when no distance calculator could be built;
+// an exception is reported through errorOut and terminates the process with exit(1).
+static DWORD WINAPI MyPairwiseThreadFunction(LPVOID lpParam){
+    pairwiseData* pDataArray;
+    pDataArray = (pairwiseData*)lpParam;
+
+    try {
+        ofstream outFile((pDataArray->outputFileName).c_str(), ios::trunc);
+        outFile.setf(ios::fixed, ios::showpoint);
+        outFile << setprecision(4);
+
+        pDataArray->count = pDataArray->end;
+
+        int startTime = time(NULL);
+
+        //each thread builds its own aligner so no alignment state is shared between threads
+        Alignment* alignment;
+        if(pDataArray->align == "gotoh") { alignment = new GotohOverlap(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
+        else if(pDataArray->align == "needleman") { alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase); }
+        else if(pDataArray->align == "blast") { alignment = new BlastAlignment(pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch); }
+        else if(pDataArray->align == "noalign") { alignment = new NoAlign(); }
+        else {
+            pDataArray->m->mothurOut(pDataArray->align + " is not a valid alignment option. I will run the command using needleman.");
+            pDataArray->m->mothurOutEndLine();
+            alignment = new NeedlemanOverlap(pDataArray->gapOpen, pDataArray->match, pDataArray->misMatch, pDataArray->longestBase);
+        }
+
+        //start from NULL so an unmatched estimator is detected instead of dereferencing an uninitialised pointer
+        ValidCalculators validCalculator;
+        Dist* distCalculator = NULL;
+        if (pDataArray->countends) {
+            if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
+                if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
+                else if (pDataArray->distcalcType == "eachgap") { distCalculator = new eachGapDist(); }
+                else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapDist(); }
+            }
+        }else {
+            if (validCalculator.isValidCalculator("distance", pDataArray->distcalcType) == true) {
+                if (pDataArray->distcalcType == "nogaps") { distCalculator = new ignoreGaps(); }
+                else if (pDataArray->distcalcType == "eachgap"){ distCalculator = new eachGapIgnoreTermGapDist(); }
+                else if (pDataArray->distcalcType == "onegap") { distCalculator = new oneGapIgnoreTermGapDist(); }
+            }
+        }
+
+        if (distCalculator == NULL) {
+            pDataArray->m->mothurOut(pDataArray->distcalcType + " is not a valid distance calculator."); pDataArray->m->mothurOutEndLine();
+            outFile.close(); delete alignment; return 0;
+        }
+
+        if((pDataArray->output == "lt") && pDataArray->start == 0){ outFile << pDataArray->alignDB.getNumSeqs() << endl; }
+
+        for(int i=pDataArray->start;i<pDataArray->end;i++){
+
+            if(pDataArray->output == "lt") {
+                string name = pDataArray->alignDB.get(i).getName();
+                if (name.length() < 10) { //pad with spaces to make compatible
+                    while (name.length() < 10) { name += " "; }
+                }
+                outFile << name << '\t';
+            }
+
+
+            //lower triangle only: compare against sequences that come before i
+            for(int j=0;j<i;j++){
+
+                if (pDataArray->m->control_pressed) { outFile.close(); delete alignment; delete distCalculator; return 0; }
+
+                //grow the aligner when either sequence is longer than its current row count
+                if (pDataArray->alignDB.get(i).getUnaligned().length() > alignment->getnRows()) {
+                    alignment->resize(pDataArray->alignDB.get(i).getUnaligned().length()+1);
+                }
+
+                if (pDataArray->alignDB.get(j).getUnaligned().length() > alignment->getnRows()) {
+                    alignment->resize(pDataArray->alignDB.get(j).getUnaligned().length()+1);
+                }
+
+                //work on fresh copies: setAligned below overwrites their aligned strings
+                Sequence seqI(pDataArray->alignDB.get(i).getName(), pDataArray->alignDB.get(i).getAligned());
+                Sequence seqJ(pDataArray->alignDB.get(j).getName(), pDataArray->alignDB.get(j).getAligned());
+
+                alignment->align(seqI.getUnaligned(), seqJ.getUnaligned());
+                seqI.setAligned(alignment->getSeqAAln());
+                seqJ.setAligned(alignment->getSeqBAln());
+
+                distCalculator->calcDist(seqI, seqJ);
+                double dist = distCalculator->getDist();
+
+                //NOTE(review): the pairwiseData built in createProcesses is constructed without a cutoff
+                //argument - confirm pDataArray->cutoff holds the user's cutoff before relying on this filter.
+                if(dist <= pDataArray->cutoff){
+                    if (pDataArray->output == "column") { outFile << pDataArray->alignDB.get(i).getName() << ' ' << pDataArray->alignDB.get(j).getName() << ' ' << dist << endl; }
+                }
+                //"lt" rows are written regardless of the cutoff
+                if (pDataArray->output == "lt") { outFile << dist << '\t'; }
+            }
+
+            if (pDataArray->output == "lt") { outFile << endl; }
+
+            //progress report every 100 rows
+            if(i % 100 == 0){
+                pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
+            }
+
+        }
+        pDataArray->m->mothurOut(toString(pDataArray->end-1) + "\t" + toString(time(NULL) - startTime)); pDataArray->m->mothurOutEndLine();
+
+        outFile.close();
+        delete alignment;
+        delete distCalculator;
+
+    }
+    catch(exception& e) {
+        pDataArray->m->errorOut(e, "PairwiseSeqsCommand", "MyPairwiseThreadFunction");
+        exit(1);
+    }
+
+    //a thread procedure must return a value; previously control flowed off the end of this DWORD function
+    return 0;
+}
+
+#endif
+
+
#endif
try {
string helpString = "";
helpString += "The fastq.info command reads a fastq file and creates a fasta and quality file.\n";
- helpString += "The fastq.info command parameter is fastq, and it is required.\n";
+ helpString += "The fastq.info command parameters are fastq, fasta and qfile; fastq is required.\n";
helpString += "The fastq.info command should be in the following format: fastq.info(fastaq=yourFastaQFile).\n";
+ helpString += "The fasta parameter allows you to indicate whether you want a fasta file generated. Default=T.\n";
+ helpString += "The qfile parameter allows you to indicate whether you want a quality file generated. Default=T.\n";
helpString += "Example fastq.info(fastaq=test.fastaq).\n";
helpString += "Note: No spaces between parameter labels (i.e. fastq), '=' and yourFastQFile.\n";
return helpString;
}
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
}else{
EstOutput Parsimony::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
vector<int> processIDS;
--- /dev/null
+#ifndef Mothur_pcrseqscommand_h
+#define Mothur_pcrseqscommand_h
+
+//
+// pcrseqscommand.h
+// Mothur
+//
+// Created by Sarah Westcott on 3/14/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+
+#include "command.hpp"
+#include "sequence.hpp"
+#include "trimoligos.h"
+#include "alignment.hpp"
+#include "needlemanoverlap.hpp"
+
+// Command class for "pcr.seqs" (category "Sequence Processing").
+// Only the declaration lives here; the member functions without inline bodies are defined elsewhere.
+class PcrSeqsCommand : public Command {
+public:
+ // NOTE(review): the string argument is presumably the command's option string - confirm in the .cpp.
+ PcrSeqsCommand(string);
+ PcrSeqsCommand();
+ ~PcrSeqsCommand(){}
+
+ vector<string> setParameters();
+ string getCommandName() { return "pcr.seqs"; }
+ string getCommandCategory() { return "Sequence Processing"; }
+ string getHelpString();
+ string getCitation() { return "http://www.mothur.org/wiki/Pcr.seqs"; }
+ string getDescription() { return "pcr.seqs"; }
+
+ int execute();
+ // Prints the text returned by getHelpString() through MothurOut.
+ void help() { m->mothurOut(getHelpString()); }
+
+private:
+
+ // A start/end pair of unsigned 64-bit values.
+ // The default constructor leaves both members uninitialised.
+ struct linePair {
+ unsigned long long start;
+ unsigned long long end;
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
+ linePair() {}
+ };
+
+ vector<linePair> lines;
+ bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
+ bool abort, keepprimer, keepdots;
+ string fastafile, oligosfile, taxfile, groupfile, namefile, ecolifile, outputDir, nomatch;
+ int start, end, pdiffs, processors, length;
+
+ vector<string> revPrimer, outputNames;
+ vector<string> primers;
+
+ // NOTE(review): the set<string> arguments taken by value below are copied on every call;
+ // whether that is intentional cannot be told from this declaration.
+ int writeAccnos(set<string>);
+ int readName(set<string>&);
+ int readGroup(set<string>);
+ int readTax(set<string>);
+ bool readOligos();
+ bool readEcoli();
+ int driverPcr(string, string, string, set<string>&, linePair);
+ int createProcesses(string, string, string, set<string>&);
+ // The Windows thread function later in this header carries inline copies of the logic of
+ // isAligned, findForward, findReverse and compareDNASeq (see its commented-out calls),
+ // so changes to these helpers likely need mirroring there.
+ bool findForward(Sequence&, int&, int&);
+ bool findReverse(Sequence&, int&, int&);
+ bool isAligned(string, map<int, int>&);
+ bool compareDNASeq(string, string);
+ string reverseOligo(string);
+};
+
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+// Per-thread work order for the Windows pcr.seqs worker thread.
+// One instance is handed to the thread function through its LPVOID argument.
+struct pcrData {
+    string filename;                                            // input fasta file opened by the worker
+    string goodFasta, badFasta, oligosfile, ecolifile, nomatch; // output file names, option files, no-match policy
+    unsigned long long fstart;                                  // position the worker seeks to in the input file
+    unsigned long long fend;                                    // number of sequences the worker processes
+    int count, start, end, length;
+    MothurOut* m;                                               // logging and control_pressed polling
+    vector<string> primers;                                     // forward primers
+    vector<string> revPrimer;                                   // reverse primers
+    set<string> badSeqNames;                                    // starts empty
+    bool keepprimer, keepdots;
+
+
+    // Leaves the scalar members uninitialised, as before.
+    pcrData(){}
+
+    // Stores every argument in the matching member and resets count to 0.
+    pcrData(string f, string gf, string bfn, MothurOut* mout, string ol, string ec, vector<string> pr, vector<string> rpr, string nm, bool kp, bool kd, int st, int en, int l, unsigned long long fst, unsigned long long fen)
+        : filename(f),
+          goodFasta(gf), badFasta(bfn), oligosfile(ol), ecolifile(ec), nomatch(nm),
+          fstart(fst),
+          fend(fen),
+          count(0), start(st), end(en), length(l),
+          m(mout),
+          primers(pr),
+          revPrimer(rpr),
+          keepprimer(kp), keepdots(kd)
+    {}
+};
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){
+ pcrData* pDataArray;
+ pDataArray = (pcrData*)lpParam;
+
+ try {
+ ofstream goodFile;
+ pDataArray->m->openOutputFile(pDataArray->goodFasta, goodFile);
+
+ ofstream badFile;
+ pDataArray->m->openOutputFile(pDataArray->badFasta, badFile);
+
+ ifstream inFASTA;
+ pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
+
+ //print header if you are process 0
+ if ((pDataArray->fstart == 0) || (pDataArray->fstart == 1)) {
+ inFASTA.seekg(0);
+ }else { //this accounts for the difference in line endings.
+ inFASTA.seekg(pDataArray->fstart-1); pDataArray->m->gobble(inFASTA);
+ }
+
+ set<int> lengths;
+ pDataArray->count = pDataArray->fend;
+ for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
+
+ string trashCode = "";
+ if (currSeq.getName() != "") {
+
+ bool goodSeq = true;
+ if (pDataArray->oligosfile != "") {
+ map<int, int> mapAligned;
+ //bool aligned = isAligned(currSeq.getAligned(), mapAligned);
+ ///////////////////////////////////////////////////////////////
+ bool aligned = false;
+ string seq = currSeq.getAligned();
+ int countBases = 0;
+ for (int k = 0; k < seq.length(); k++) {
+ if (!isalpha(seq[k])) { aligned = true; }
+ else { mapAligned[countBases] = k; countBases++; } //maps location in unaligned -> location in aligned.
+ } //ie. the 3rd base may be at spot 10 in the alignment
+ //later when we trim we want to trim from spot 10.
+ ///////////////////////////////////////////////////////////////
+
+ //process primers
+ if (pDataArray->primers.size() != 0) {
+ int primerStart = 0; int primerEnd = 0;
+ //bool good = findForward(currSeq, primerStart, primerEnd);
+ ///////////////////////////////////////////////////////////////
+ bool good = false;
+ string rawSequence = currSeq.getUnaligned();
+
+ for(int j=0;j<pDataArray->primers.size();j++){
+ string oligo = pDataArray->primers[j];
+
+ if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; }
+
+ if(rawSequence.length() < oligo.length()) { break; }
+
+ //search for primer
+ int olength = oligo.length();
+ for (int l = 0; l < rawSequence.length()-olength; l++){
+ if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; }
+ string rawChunk = rawSequence.substr(l, olength);
+ //compareDNASeq(oligo, rawChunk)
+ ////////////////////////////////////////////////////////
+ bool success = 1;
+ for(int k=0;k<olength;k++){
+
+ if(oligo[k] != rawChunk[k]){
+ if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C') { success = 0; }
+ else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N')) { success = 0; }
+ else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G')) { success = 0; }
+ else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T')) { success = 0; }
+ else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A')) { success = 0; }
+ else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
+ else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A')) { success = 0; }
+ else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
+ else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
+ else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
+ else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C')) { success = 0; }
+ else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
+
+ if(success == 0) { break; }
+ }
+ else{
+ success = 1;
+ }
+ }
+
+ ////////////////////////////////////////////////////////////////////
+ if(success) {
+ primerStart = j;
+ primerEnd = primerStart + olength;
+ good = true; break;
+ }
+ }
+ if (good) { break; }
+ }
+
+ if (!good) { primerStart = 0; primerEnd = 0; }
+ ///////////////////////////////////////////////////////////////
+
+
+ if(!good){ if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
+ else{
+ //are you aligned
+ if (aligned) {
+ if (!pDataArray->keepprimer) {
+ if (pDataArray->keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
+ else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
+ }
+ else {
+ if (pDataArray->keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
+ else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
+ }
+ }else {
+ if (!pDataArray->keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
+ else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
+ }
+ }
+ }
+
+ //process reverse primers
+ if (pDataArray->revPrimer.size() != 0) {
+ int primerStart = 0; int primerEnd = 0;
+ bool good = false;
+ //findReverse(currSeq, primerStart, primerEnd);
+ ///////////////////////////////////////////////////////////////
+ string rawSequence = currSeq.getUnaligned();
+
+ for(int j=0;j<pDataArray->revPrimer.size();j++){
+ string oligo = pDataArray->revPrimer[j];
+ if (pDataArray->m->control_pressed) { primerStart = 0; primerEnd = 0; good = false; break; }
+ if(rawSequence.length() < oligo.length()) { break; }
+
+ //search for primer
+ int olength = oligo.length();
+ for (int l = rawSequence.length()-olength; l >= 0; l--){
+
+ string rawChunk = rawSequence.substr(l, olength);
+ //compareDNASeq(oligo, rawChunk)
+ ////////////////////////////////////////////////////////
+ bool success = 1;
+ for(int k=0;k<olength;k++){
+
+ if(oligo[k] != rawChunk[k]){
+ if(oligo[k] == 'A' || oligo[k] == 'T' || oligo[k] == 'G' || oligo[k] == 'C') { success = 0; }
+ else if((oligo[k] == 'N' || oligo[k] == 'I') && (rawChunk[k] == 'N')) { success = 0; }
+ else if(oligo[k] == 'R' && (rawChunk[k] != 'A' && rawChunk[k] != 'G')) { success = 0; }
+ else if(oligo[k] == 'Y' && (rawChunk[k] != 'C' && rawChunk[k] != 'T')) { success = 0; }
+ else if(oligo[k] == 'M' && (rawChunk[k] != 'C' && rawChunk[k] != 'A')) { success = 0; }
+ else if(oligo[k] == 'K' && (rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
+ else if(oligo[k] == 'W' && (rawChunk[k] != 'T' && rawChunk[k] != 'A')) { success = 0; }
+ else if(oligo[k] == 'S' && (rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
+ else if(oligo[k] == 'B' && (rawChunk[k] != 'C' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
+ else if(oligo[k] == 'D' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'G')) { success = 0; }
+ else if(oligo[k] == 'H' && (rawChunk[k] != 'A' && rawChunk[k] != 'T' && rawChunk[k] != 'C')) { success = 0; }
+ else if(oligo[k] == 'V' && (rawChunk[k] != 'A' && rawChunk[k] != 'C' && rawChunk[k] != 'G')) { success = 0; }
+
+ if(success == 0) { break; }
+ }
+ else{
+ success = 1;
+ }
+ }
+
+ ////////////////////////////////////////////////////////////////////
+ if(success) {
+ primerStart = j;
+ primerEnd = primerStart + olength;
+ good = true; break;
+ }
+ }
+ if (good) { break; }
+ }
+
+ if (!good) { primerStart = 0; primerEnd = 0; }
+
+ ///////////////////////////////////////////////////////////////
+ if(!good){ if (pDataArray->nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
+ else{
+ //are you aligned
+ if (aligned) {
+ if (!pDataArray->keepprimer) {
+ if (pDataArray->keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
+ else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
+ }
+ else {
+ if (pDataArray->keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
+ else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
+ } }
+ else {
+ if (!pDataArray->keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
+ else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
+ }
+ }
+ }
+ }else if (pDataArray->ecolifile != "") {
+ //make sure the seqs are aligned
+ lengths.insert(currSeq.getAligned().length());
+ if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
+ else if (currSeq.getAligned().length() != pDataArray->length) {
+ pDataArray->m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); pDataArray->m->control_pressed = true; break;
+ }else {
+ if (pDataArray->keepdots) {
+ currSeq.filterToPos(start);
+ currSeq.filterFromPos(end);
+ }else {
+ string seqString = currSeq.getAligned().substr(0, end);
+ seqString = seqString.substr(start);
+ currSeq.setAligned(seqString);
+ }
+ }
+ }else{ //using start and end to trim
+ //make sure the seqs are aligned
+ lengths.insert(currSeq.getAligned().length());
+ if (lengths.size() > 1) { pDataArray->m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); pDataArray->m->control_pressed = true; break; }
+ else {
+ if (pDataArray->end != -1) {
+ if (pDataArray->end > currSeq.getAligned().length()) { pDataArray->m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); pDataArray->m->control_pressed = true; break; }
+ else {
+ if (pDataArray->keepdots) { currSeq.filterFromPos(end); }
+ else {
+ string seqString = currSeq.getAligned().substr(0, end);
+ currSeq.setAligned(seqString);
+ }
+ }
+ }
+ if (pDataArray->start != -1) {
+ if (pDataArray->keepdots) { currSeq.filterToPos(start); }
+ else {
+ string seqString = currSeq.getAligned().substr(start);
+ currSeq.setAligned(seqString);
+ }
+ }
+
+ }
+ }
+
+ if(goodSeq == 1) { currSeq.printSequence(goodFile); }
+ else {
+ pDataArray->badSeqNames.insert(currSeq.getName());
+ currSeq.setName(currSeq.getName() + '|' + trashCode);
+ currSeq.printSequence(badFile);
+ }
+ }
+
+ //report progress
+ if((i+1) % 100 == 0){ pDataArray->m->mothurOut("Processing sequence: " + toString(i+1)); pDataArray->m->mothurOutEndLine(); }
+ }
+ //report progress
+ if((pDataArray->count) % 100 != 0){ pDataArray->m->mothurOut("Thread Processing sequence: " + toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); }
+
+ goodFile.close();
+ inFASTA.close();
+ badFile.close();
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ pDataArray->m->errorOut(e, "PcrSeqsCommand", "MyPcrThreadFunction");
+ exit(1);
+ }
+}
+
+#endif
+
+/**************************************************************************************************/
+
+
+
+#endif
//initialize Dscore
for (int i=0; i<globaldata->Groups.size(); i++) { DScore[globaldata->Groups[i]] = 0.0; }
- /********************************************************
+ ********************************************************
//calculate a D value for each group
for(int v=0;v<treeNodes.size();v++){
exit(1);
}
}
-/**************************************************************************************************/
+**************************************************************************************************/
if (numSampledList.count(diversity[mGroups[j]].size()-1) == 0) { numSampledList.insert(diversity[mGroups[j]].size()-1); }
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
driver(trees[i], diversity, sumDiversity, iters, increment, randomLeaf, numSampledList, outCollect, outSum, true);
}else{
//**********************************************************************************************************************
int PhyloDiversityCommand::createProcesses(vector<int>& procIters, Tree* t, map< string, vector<float> >& div, map<string, vector<float> >& sumDiv, int numIters, int increment, vector<int>& randomLeaf, set<int>& numSampledList, ofstream& outCollect, ofstream& outSum){
try {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
vector<int> processIDS;
maxLevel = 0;
calcTotals = true;
string name, tax;
- addSeqToTree("unknown", "unknown;");
-
#ifdef USE_MPI
int pid, num, processors;
#endif
assignHeirarchyIDs(0);
-
+
+
+ string unknownTax = "unknown;";
+ //added last taxon until you get desired level
+ for (int i = 1; i < maxLevel; i++) {
+ unknownTax += "unclassfied;";
+ }
+
+ addSeqToTree("unknown", unknownTax);
+
//create file for summary if needed
setUp(tfile);
}
ListVector list;
list.setLabel(level);
+
//go through nodes and build listvector
for (itCurrent = currentNodes.begin(); itCurrent != currentNodes.end(); itCurrent++) {
//make the names compatable with listvector
string name = "";
for (int i = 0; i < names.size(); i++) {
- if (namefile != "") {
- map<string, string>::iterator itNames = namemap.find(names[i]); //make sure this name is in namefile
-
- if (itNames != namemap.end()) { name += namemap[names[i]] + ","; } //you found it in namefile
- else { m->mothurOut(names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); exit(1); }
-
- }else{ name += names[i] + ","; }
+
+ if (names[i] != "unknown") {
+ if (namefile != "") {
+ map<string, string>::iterator itNames = namemap.find(names[i]); //make sure this name is in namefile
+
+ if (itNames != namemap.end()) { name += namemap[names[i]] + ","; } //you found it in namefile
+ else { m->mothurOut(names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); exit(1); }
+
+ }else{ name += names[i] + ","; }
+ }
}
name = name.substr(0, name.length()-1); //rip off extra ','
-
//add bin to list vector
- list.push_back(name);
+ if (name != "") { list.push_back(name); } //caused by unknown
}
//print listvector
#ifdef USE_MPI
//do nothing
#else
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//find breakup of templatefile for quantiles
if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
else {
//**************************************************************************************************
void Pintail::createProcessesQuan() {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
vector<int> processIDS;
--- /dev/null
+//
+// prcseqscommand.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 3/14/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "pcrseqscommand.h"
+
+//**********************************************************************************************************************
+// Registers every option the pcr.seqs command accepts in the 'parameters'
+// member and returns the option names in registration order.
+vector<string> PcrSeqsCommand::setParameters(){
+    try {
+        //input files
+        parameters.push_back(CommandParameter("fasta", "InputTypes", "", "", "none", "none", "none",false,true));
+        parameters.push_back(CommandParameter("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false));
+        parameters.push_back(CommandParameter("name", "InputTypes", "", "", "none", "none", "none",false,false));
+        parameters.push_back(CommandParameter("group", "InputTypes", "", "", "none", "none", "none",false,false));
+        parameters.push_back(CommandParameter("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false));
+        parameters.push_back(CommandParameter("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false));
+
+        //trimming options
+        parameters.push_back(CommandParameter("start", "Number", "", "-1", "", "", "",false,false));
+        parameters.push_back(CommandParameter("end", "Number", "", "-1", "", "", "",false,false));
+        parameters.push_back(CommandParameter("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false));
+        parameters.push_back(CommandParameter("pdiffs", "Number", "", "0", "", "", "",false,false));
+        parameters.push_back(CommandParameter("processors", "Number", "", "1", "", "", "",false,false));
+        parameters.push_back(CommandParameter("keepprimer", "Boolean", "", "F", "", "", "",false,false));
+        parameters.push_back(CommandParameter("keepdots", "Boolean", "", "T", "", "", "",false,false));
+
+        //directories
+        parameters.push_back(CommandParameter("inputdir", "String", "", "", "", "", "",false,false));
+        parameters.push_back(CommandParameter("outputdir", "String", "", "", "", "", "",false,false));
+
+        //collect just the names of everything registered so far
+        const int numParameters = parameters.size();
+        vector<string> parameterNames;
+        for (int index = 0; index != numParameters; ++index) {
+            parameterNames.push_back(parameters[index].name);
+        }
+        return parameterNames;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "setParameters");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Builds the text shown for "help" on pcr.seqs. The wording mirrors the
+// parameters registered in setParameters() and the checks made in the
+// constructor and driver; keep the three in sync when options change.
+string PcrSeqsCommand::getHelpString(){
+    try {
+        string helpString = "";
+        helpString += "The pcr.seqs command reads a fasta file and trims the sequences using primers from an oligos file, a reference sequence from an ecoli file, or the start and end positions you provide.\n";
+        helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, taxonomy, ecoli, start, end, nomatch, pdiffs, processors, keepprimer and keepdots.\n";
+        helpString += "The fasta parameter is required, unless you have a valid current fasta file.\n";
+        helpString += "The oligos parameter allows you to provide a file containing the primers to trim to. It can not be used at the same time as the ecoli parameter.\n";
+        helpString += "The ecoli parameter allows you to provide a file containing a reference sequence. Your sequences must be aligned and the same length as the reference sequence. Do not set start or end when using an ecoli file.\n";
+        helpString += "The start and end parameters allow you to provide the positions to trim to. Your sequences must be aligned to use them.\n";
+        helpString += "The name, group and taxonomy parameters allow you to provide the name, group and taxonomy files that go with your fasta file.\n";
+        helpString += "The nomatch parameter allows you to indicate what to do with a sequence when a primer is not found. Choices are reject and keep, default=reject.\n";
+        helpString += "The keepprimer parameter allows you to indicate whether you want the primer left on the trimmed sequence, default=F.\n";
+        helpString += "The keepdots parameter allows you to indicate whether the trimmed positions of an aligned sequence are kept as .'s, default=T.\n";
+        helpString += "The processors parameter allows you to specify the number of processors to use.\n";
+
+        helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
+        helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
+        return helpString;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "getHelpString");
+        exit(1);
+    }
+}
+
+
+//**********************************************************************************************************************
+
+// Default constructor: no option string is parsed. Both abort and calledHelp
+// are set, the parameter list is registered, and every output type this
+// command can produce gets an empty file list.
+PcrSeqsCommand::PcrSeqsCommand(){
+    try {
+        calledHelp = true;
+        abort = true;
+        setParameters();
+
+        //one empty entry per kind of output file
+        const char* producedTypes[] = { "fasta", "taxonomy", "group", "name", "accnos" };
+        const int numProducedTypes = sizeof(producedTypes) / sizeof(producedTypes[0]);
+        for (int t = 0; t < numProducedTypes; t++) {
+            outputTypes[producedTypes[t]] = vector<string>();
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
+        exit(1);
+    }
+}
+//***************************************************************************************************************
+
+// Parses the user's option string for pcr.seqs.
+//   option - "help", "citation", or the raw parameter string typed by the user.
+// On any invalid or inconsistent setting an error is printed and abort is set,
+// which makes execute() return without doing any work.
+PcrSeqsCommand::PcrSeqsCommand(string option) {
+    try {
+
+        abort = false; calledHelp = false;
+
+        //allow user to run help
+        if(option == "help") { help(); abort = true; calledHelp = true; }
+        else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+
+        else {
+            vector<string> myArray = setParameters();
+
+            OptionParser parser(option);
+            map<string,string> parameters = parser.getParameters();
+
+            ValidParameters validParameter;
+            map<string,string>::iterator it;
+
+            //check to make sure all parameters are valid for command
+            for (it = parameters.begin(); it != parameters.end(); it++) {
+                if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+            }
+
+            //initialize outputTypes
+            vector<string> tempOutNames;
+            outputTypes["fasta"] = tempOutNames;
+            outputTypes["taxonomy"] = tempOutNames;
+            outputTypes["group"] = tempOutNames;
+            outputTypes["name"] = tempOutNames;
+            outputTypes["accnos"] = tempOutNames;
+
+            //if the user changes the input directory command factory will send this info to us in the output parameter
+            string inputDir = validParameter.validFile(parameters, "inputdir", false);
+            if (inputDir == "not found"){ inputDir = ""; }
+            else {
+                //every file-valued parameter gets the same treatment:
+                //if the user has not given a path then, add inputdir. else leave path alone.
+                const char* fileParameters[] = { "fasta", "oligos", "ecoli", "taxonomy", "name", "group" };
+                const int numFileParameters = sizeof(fileParameters) / sizeof(fileParameters[0]);
+                for (int i = 0; i < numFileParameters; i++) {
+                    it = parameters.find(fileParameters[i]);
+                    if(it != parameters.end()){
+                        string path = m->hasPath(it->second);
+                        if (path == "") { it->second = inputDir + it->second; }
+                    }
+                }
+            }
+
+            //if the user changes the output directory command factory will send this info to us in the output parameter
+            outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+
+            //check for required parameters
+            fastafile = validParameter.validFile(parameters, "fasta", true);
+            if (fastafile == "not found") {
+                fastafile = m->getFastaFile();
+                if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
+                else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
+            }else if (fastafile == "not open") { fastafile = ""; abort = true; }
+            else { m->setFastaFile(fastafile); }
+
+
+            //check for optional parameter and set defaults
+            // ...at some point should added some additional type checking...
+            string temp;
+            temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
+            keepprimer = m->isTrue(temp);
+
+            temp = validParameter.validFile(parameters, "keepdots", false); if (temp == "not found") { temp = "t"; }
+            keepdots = m->isTrue(temp);
+
+            temp = validParameter.validFile(parameters, "oligos", true);
+            if (temp == "not found"){ oligosfile = ""; }
+            else if(temp == "not open"){ oligosfile = ""; abort = true; }
+            else { oligosfile = temp; m->setOligosFile(oligosfile); }
+
+            ecolifile = validParameter.validFile(parameters, "ecoli", true);
+            if (ecolifile == "not found"){ ecolifile = ""; }
+            else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
+
+            namefile = validParameter.validFile(parameters, "name", true);
+            if (namefile == "not found"){ namefile = ""; }
+            else if(namefile == "not open"){ namefile = ""; abort = true; }
+            else { m->setNameFile(namefile); }
+
+            groupfile = validParameter.validFile(parameters, "group", true);
+            if (groupfile == "not found"){ groupfile = ""; }
+            else if(groupfile == "not open"){ groupfile = ""; abort = true; }
+            else { m->setGroupFile(groupfile); }
+
+            taxfile = validParameter.validFile(parameters, "taxonomy", true);
+            if (taxfile == "not found"){ taxfile = ""; }
+            else if(taxfile == "not open"){ taxfile = ""; abort = true; }
+            else { m->setTaxonomyFile(taxfile); }
+
+            temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
+            m->mothurConvert(temp, pdiffs);
+
+            temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
+            m->mothurConvert(temp, start);
+
+            temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
+            m->mothurConvert(temp, end);
+
+            temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
+            m->setProcessors(temp);
+            m->mothurConvert(temp, processors);
+
+            nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
+
+            if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
+
+            //trimming by position only: -1 means "not set", anything lower is never a valid position
+            if ((oligosfile == "") && (ecolifile == "")) {
+                if ((start == -1) && (end == -1)) {
+                    //didnt set anything
+                    m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
+                }else {
+                    //checked independently so a bad start is caught even when end is set, and vice versa
+                    if (start < -1) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
+                    if (end < -1) { m->mothurOut("[ERROR]: Invalid end value.\n"); abort = true; }
+                }
+            }
+
+            //either position alone conflicts with the ecoli file, as the message below states
+            if ((ecolifile != "") && ((start != -1) || (end != -1))) {
+                m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
+            }
+
+
+            if ((oligosfile != "") && (ecolifile != "")) {
+                m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
+            }
+
+            //check to make sure you didn't forget the name file by mistake
+            if (namefile == "") {
+                vector<string> files; files.push_back(fastafile);
+                parser.getNameFile(files);
+            }
+        }
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
+        exit(1);
+    }
+}
+//***************************************************************************************************************
+
+// Runs pcr.seqs: trims the fasta file (in one process or several), writes the
+// accnos/scrap outputs when any sequence was rejected, passes the rejected
+// names to the name/group/taxonomy readers, and records the new current files.
+// Returns 0 on completion, help, or interrupt; returns 2 when the constructor
+// flagged the options as invalid.
+int PcrSeqsCommand::execute(){
+    try{
+
+        if (abort == true) { if (calledHelp) { return 0; } return 2; }
+
+        //named startTime so it does not hide the member 'start' (the trim position)
+        int startTime = time(NULL);
+
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
+        string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.fasta";
+        outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
+
+        string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.scrap.fasta";
+
+
+        length = 0;
+        if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; }
+        if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
+
+        vector<unsigned long long> positions;
+        int numFastaSeqs = 0;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+        positions = m->divideFile(fastafile, processors);
+        //(i+1) < size() instead of i < size()-1, so an empty vector cannot wrap the unsigned subtraction
+        for (int i = 0; (i+1) < positions.size(); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
+#else
+        if (processors == 1) {
+            lines.push_back(linePair(0, 1000));
+        }else {
+            positions = m->setFilePosFasta(fastafile, numFastaSeqs);
+            if (positions.size() < processors) { processors = positions.size(); }
+
+            //figure out how many sequences you have to process
+            int numSeqsPerProcessor = numFastaSeqs / processors;
+            for (int i = 0; i < processors; i++) {
+                int startIndex = i * numSeqsPerProcessor;
+                if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
+                lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
+            }
+        }
+#endif
+        if (m->control_pressed) { return 0; }
+
+        set<string> badNames;
+        if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); }
+        else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); }
+
+        //the driver has already written both files; do not leave partial output behind on interrupt
+        if (m->control_pressed) { m->mothurRemove(trimSeqFile); m->mothurRemove(badSeqFile); return 0; }
+
+        //don't write or keep if blank
+        if (badNames.size() != 0) { writeAccnos(badNames); }
+        if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); }
+        else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); }
+
+        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+        if (namefile != "") { readName(badNames); }
+        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+        if (groupfile != "") { readGroup(badNames); }
+        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+        if (taxfile != "") { readTax(badNames); }
+        if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+        m->mothurOutEndLine();
+        m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+        for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+        m->mothurOutEndLine();
+        m->mothurOutEndLine();
+
+        //set fasta file as new current fastafile
+        string current = "";
+        itTypes = outputTypes.find("fasta");
+        if (itTypes != outputTypes.end()) {
+            if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
+        }
+
+        itTypes = outputTypes.find("name");
+        if (itTypes != outputTypes.end()) {
+            if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
+        }
+
+        itTypes = outputTypes.find("group");
+        if (itTypes != outputTypes.end()) {
+            if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
+        }
+
+        itTypes = outputTypes.find("accnos");
+        if (itTypes != outputTypes.end()) {
+            if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
+        }
+
+        itTypes = outputTypes.find("taxonomy");
+        if (itTypes != outputTypes.end()) {
+            if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
+        }
+
+        m->mothurOut("It took " + toString(time(NULL) - startTime) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
+        m->mothurOutEndLine();
+
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "execute");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+// Splits the trimming work across 'processors' workers, one entry of 'lines' each.
+//   filename     - fasta file to read
+//   goodFileName - destination for sequences that are kept
+//   badFileName  - destination for sequences that are scrapped
+//   badSeqNames  - receives the names of all scrapped sequences from every worker
+// Returns the sum of the counts reported by every worker.
+// Workers write to temp files that are appended to the final files and removed.
+int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
+    try {
+
+        vector<int> processIDS;
+        int process = 1;
+        int num = 0;
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+
+        //loop through and create all the processes you want
+        while (process != processors) {
+            int pid = fork();
+
+            if (pid > 0) {
+                processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
+                process++;
+            }else if (pid == 0){
+                num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
+
+                //pass numSeqs to parent
+                ofstream out;
+                string tempFile = filename + toString(getpid()) + ".num.temp";
+                m->openOutputFile(tempFile, out);
+                out << num << '\t' << badSeqNames.size() << endl;
+                for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
+                    out << (*it) << endl;
+                }
+                out.close();
+
+                exit(0);
+            }else {
+                m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+                for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+                exit(0);
+            }
+        }
+
+        //the parent handles the first chunk itself
+        num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
+
+        //force parent to wait until all the processes are done
+        //waitpid on each recorded pid, so only the children forked above are reaped
+        for (int i=0;i<processIDS.size();i++) {
+            int status = 0;
+            waitpid(processIDS[i], &status, 0);
+        }
+
+        //gather each child's count, bad names and temp output, in the order the children were created
+        for (int i = 0; i < processIDS.size(); i++) {
+            ifstream in;
+            string tempFile = filename + toString(processIDS[i]) + ".num.temp";
+            m->openInputFile(tempFile, in);
+            int numBadNames = 0; string name = "";
+            if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
+            for (int j = 0; j < numBadNames; j++) {
+                in >> name; m->gobble(in);
+                badSeqNames.insert(name);
+            }
+            in.close(); m->mothurRemove(tempFile);
+
+            m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
+            m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
+
+            m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
+            m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
+        }
+#else
+
+        //////////////////////////////////////////////////////////////////////////////////////////////////////
+        //Windows version uses threads, which share memory, so be careful when passing variables through the pcrData struct.
+        //Above fork() will clone, so memory is separate, but that's not the case with windows.
+        //Each thread records its bad names in its own pcrData; they are merged into badSeqNames after the threads finish.
+        //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+        vector<pcrData*> pDataArray;
+        //vectors rather than variable length arrays, which are not standard C++
+        vector<DWORD> dwThreadIdArray(processors-1);
+        vector<HANDLE> hThreadArray(processors-1);
+
+        //Create processor worker threads.
+        for( int i=0; i<processors-1; i++ ){
+
+            //thread 0 writes straight to the final files, the others to numbered temp files
+            string extension = "";
+            if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
+
+            // Allocate memory for thread data.
+            pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, lines[i].start, lines[i].end);
+            pDataArray.push_back(tempPcr);
+
+            //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+            hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+        }
+
+        //do your part
+        num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
+        processIDS.push_back(processors-1);
+
+        //Wait until all threads have terminated.
+        if (!hThreadArray.empty()) { WaitForMultipleObjects(processors-1, &hThreadArray[0], TRUE, INFINITE); }
+
+        //Close all thread handles and free memory allocations.
+        for(int i=0; i < pDataArray.size(); i++){
+            num += pDataArray[i]->count;
+            for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
+            CloseHandle(hThreadArray[i]);
+            delete pDataArray[i];
+        }
+
+        for (int i = 0; i < processIDS.size(); i++) {
+            m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
+            m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
+
+            m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
+            m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
+        }
+
+#endif
+
+        return num;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "createProcesses");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
+ try {
+ ofstream goodFile;
+ m->openOutputFile(goodFasta, goodFile);
+
+ ofstream badFile;
+ m->openOutputFile(badFasta, badFile);
+
+ ifstream inFASTA;
+ m->openInputFile(filename, inFASTA);
+
+ inFASTA.seekg(filePos.start);
+
+ bool done = false;
+ int count = 0;
+ set<int> lengths;
+
+ while (!done) {
+
+ if (m->control_pressed) { break; }
+
+ Sequence currSeq(inFASTA); m->gobble(inFASTA);
+
+ string trashCode = "";
+ if (currSeq.getName() != "") {
+
+ bool goodSeq = true;
+ if (oligosfile != "") {
+ map<int, int> mapAligned;
+ bool aligned = isAligned(currSeq.getAligned(), mapAligned);
+
+ //process primers
+ if (primers.size() != 0) {
+ int primerStart = 0; int primerEnd = 0;
+ bool good = findForward(currSeq, primerStart, primerEnd);
+
+ if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
+ else{
+ //are you aligned
+ if (aligned) {
+ if (!keepprimer) {
+ if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
+ else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
+ }
+ else {
+ if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
+ else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
+ }
+ }else {
+ if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
+ else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
+ }
+ }
+ }
+
+ //process reverse primers
+ if (revPrimer.size() != 0) {
+ int primerStart = 0; int primerEnd = 0;
+ bool good = findReverse(currSeq, primerStart, primerEnd);
+ if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
+ else{
+ //are you aligned
+ if (aligned) {
+ if (!keepprimer) {
+ if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
+ else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
+ }
+ else {
+ if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
+ else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
+ }
+ }
+ else {
+ if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
+ else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
+ }
+ }
+ }
+ }else if (ecolifile != "") {
+ //make sure the seqs are aligned
+ lengths.insert(currSeq.getAligned().length());
+ if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
+ else if (currSeq.getAligned().length() != length) {
+ m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
+ }else {
+ if (keepdots) {
+ currSeq.filterToPos(start);
+ currSeq.filterFromPos(end);
+ }else {
+ string seqString = currSeq.getAligned().substr(0, end);
+ seqString = seqString.substr(start);
+ currSeq.setAligned(seqString);
+ }
+ }
+ }else{ //using start and end to trim
+ //make sure the seqs are aligned
+ lengths.insert(currSeq.getAligned().length());
+ if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
+ else {
+ if (end != -1) {
+ if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
+ else {
+ if (keepdots) { currSeq.filterFromPos(end); }
+ else {
+ string seqString = currSeq.getAligned().substr(0, end);
+ currSeq.setAligned(seqString);
+ }
+ }
+ }
+ if (start != -1) {
+ if (keepdots) { currSeq.filterToPos(start); }
+ else {
+ string seqString = currSeq.getAligned().substr(start);
+ currSeq.setAligned(seqString);
+ }
+ }
+ }
+ }
+
+ //trimming removed all bases
+ if (currSeq.getUnaligned() == "") { goodSeq = false; }
+
+ if(goodSeq == 1) { currSeq.printSequence(goodFile); }
+ else {
+ badSeqNames.insert(currSeq.getName());
+ currSeq.setName(currSeq.getName() + '|' + trashCode);
+ currSeq.printSequence(badFile);
+ }
+ count++;
+ }
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ unsigned long long pos = inFASTA.tellg();
+ if ((pos == -1) || (pos >= filePos.end)) { break; }
+#else
+ if (inFASTA.eof()) { break; }
+#endif
+
+ //report progress
+ if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
+ }
+ //report progress
+ if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
+
+ badFile.close();
+ goodFile.close();
+ inFASTA.close();
+
+ return count;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PcrSeqsCommand", "driverPcr");
+ exit(1);
+ }
+}
+//********************************************************************/
+// Looks for the first forward primer that occurs in the unaligned bases of seq.
+// Primers are tried in the order they are stored in `primers`; each one is slid
+// across the read from left to right and tested with compareDNASeq.
+//   seq          - sequence to search (only getUnaligned() is read)
+//   primerStart  - out: 0-based index of the first primer base
+//   primerEnd    - out: index one past the last primer base; this equals the
+//                  number of bases when the primer sits at the very end of the read
+// Returns true on a match. When nothing matches, or m->control_pressed is set
+// during the scan, both positions are reset to 0 and false is returned.
+bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){
+    try {
+
+        string rawSequence = seq.getUnaligned();
+        int seqLength = rawSequence.length();
+
+        for(int i=0;i<primers.size();i++){
+            string oligo = primers[i];
+            int olength = oligo.length();
+
+            //this primer cannot fit in the read, but a shorter one later in the list still might
+            if(seqLength < olength) { continue; }
+
+            //search for primer
+            //<= so the final window is tested too (primer flush with the end of the read,
+            //or a read exactly as long as the primer) - the same range findReverse covers
+            for (int j = 0; j <= seqLength-olength; j++){
+                if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
+                string rawChunk = rawSequence.substr(j, olength);
+                if(compareDNASeq(oligo, rawChunk)) {
+                    primerStart = j;
+                    primerEnd = primerStart + olength;
+                    return true;
+                }
+
+            }
+        }
+
+        primerStart = 0; primerEnd = 0;
+        return false;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "findForward");
+        exit(1);
+    }
+}
+//******************************************************************/
+// Looks for the first entry of `revPrimer` that occurs in the unaligned bases of seq.
+// Each primer is tested with compareDNASeq, starting at the last possible window
+// of the read and moving back toward position 0, so the match closest to the end wins.
+//   seq          - sequence to search (only getUnaligned() is read)
+//   primerStart  - out: 0-based index of the first primer base
+//   primerEnd    - out: index one past the last primer base; this equals the
+//                  number of bases when the primer sits at the very end of the read
+// Returns true on a match. When nothing matches, or m->control_pressed is set
+// during the scan, both positions are reset to 0 and false is returned.
+bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
+    try {
+
+        string rawSequence = seq.getUnaligned();
+
+        for(int i=0;i<revPrimer.size();i++){
+            string oligo = revPrimer[i];
+
+            //this primer cannot fit in the read, but a shorter one later in the list still might
+            if(rawSequence.length() < oligo.length()) { continue; }
+
+            //search for primer
+            int olength = oligo.length();
+            for (int j = rawSequence.length()-olength; j >= 0; j--){
+                if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
+                string rawChunk = rawSequence.substr(j, olength);
+
+                if(compareDNASeq(oligo, rawChunk)) {
+                    primerStart = j;
+                    primerEnd = primerStart + olength;
+                    return true;
+                }
+
+            }
+        }
+
+        primerStart = 0; primerEnd = 0;
+        return false;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "findReverse");
+        exit(1);
+    }
+}
+//********************************************************************/
+// Reports whether seq contains any non-letter character (gap or pad) and fills
+// `aligned` with a lookup from base number to column in seq.
+//   seq      - the sequence string to inspect
+//   aligned  - out: aligned[k] is the column of the k-th letter (0-based), i.e. it
+//              maps a location in the unaligned read to its location in seq; for
+//              example the 3rd base may be at spot 10 in the alignment, and later
+//              trimming has to cut at spot 10. An extra entry, keyed by the total
+//              number of letters, holds seq.length().
+// Returns true when at least one non-letter character was seen.
+bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
+    try {
+        bool isAligned = false;
+
+        int countBases = 0;
+        for (int i = 0; i < seq.length(); i++) {
+            //cast first: isalpha is undefined for negative char values
+            if (!isalpha(static_cast<unsigned char>(seq[i]))) { isAligned = true; }
+            else { aligned[countBases] = i; countBases++; }
+        }
+
+        //one-past-the-end entry. A primer that finishes on the last base has
+        //primerEnd == countBases; without this entry the lookup would default-insert 0
+        //and the caller would cut at column 0 instead of at the end of the alignment.
+        aligned[countBases] = seq.length();
+
+        return isAligned;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "isAligned");
+        exit(1);
+    }
+}
+//********************************************************************/
+// Returns the reverse complement of oligo.
+// The input is walked from its last character to its first and each upper-case
+// IUPAC code is replaced by its complement (U is complemented like T).
+// Any character without an entry below - including lower-case letters - becomes 'N'.
+string PcrSeqsCommand::reverseOligo(string oligo){
+    try {
+        string revComp = "";
+
+        for (string::reverse_iterator base = oligo.rbegin(); base != oligo.rend(); ++base) {
+            char complement;
+            switch (*base) {
+                case 'A': complement = 'T'; break;
+                case 'T':
+                case 'U': complement = 'A'; break;
+
+                case 'G': complement = 'C'; break;
+                case 'C': complement = 'G'; break;
+
+                case 'R': complement = 'Y'; break;
+                case 'Y': complement = 'R'; break;
+
+                case 'M': complement = 'K'; break;
+                case 'K': complement = 'M'; break;
+
+                //W and S are their own complements
+                case 'W': complement = 'W'; break;
+                case 'S': complement = 'S'; break;
+
+                case 'B': complement = 'V'; break;
+                case 'V': complement = 'B'; break;
+
+                case 'D': complement = 'H'; break;
+                case 'H': complement = 'D'; break;
+
+                default:  complement = 'N'; break;
+            }
+            revComp += complement;
+        }
+
+        return revComp;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
+        exit(1);
+    }
+}
+
+//***************************************************************************************************************
+// Reads oligosfile and collects the primers it lists.
+// Records are whitespace separated "<type> <oligo> ..." entries; a record whose
+// first token starts with '#' is skipped up to the end of its line.
+//   forward          - oligo is appended to primers; the rest of the line is discarded
+//   reverse          - reverseOligo(oligo) is appended to revPrimer
+//   barcode          - one further token (the group) is read and not used
+//   linker / spacer  - nothing is stored
+//   anything else    - a message is printed and m->control_pressed is set
+// The type is compared case-insensitively; the oligo is upper-cased and every U becomes T.
+// Returns false, after setting m->control_pressed, when neither primers nor revPrimer
+// received an entry; returns true otherwise.
+bool PcrSeqsCommand::readOligos(){
+    try {
+        ifstream oligoStream;
+        m->openInputFile(oligosfile, oligoStream);
+
+        //declared outside the loop on purpose: a failed extraction leaves the previous value in place
+        string type, oligo, group;
+
+        while(!oligoStream.eof()){
+
+            oligoStream >> type;
+
+            if(type[0] == '#'){
+                //comment record - throw away whatever is left on the line
+                while (!oligoStream.eof()) {
+                    char next = oligoStream.get();
+                    if (next == '\n' || next == '\r'){ break; }
+                }
+                m->gobble(oligoStream);
+            }else{
+                m->gobble(oligoStream);
+
+                //make type case insensitive
+                for (string::iterator ch = type.begin(); ch != type.end(); ++ch) { *ch = toupper(*ch); }
+
+                oligoStream >> oligo;
+
+                //upper-case the oligo and write U as T
+                for (string::iterator ch = oligo.begin(); ch != oligo.end(); ++ch) {
+                    *ch = toupper(*ch);
+                    if (*ch == 'U') { *ch = 'T'; }
+                }
+
+                if(type == "FORWARD"){
+                    //skip the remainder of the line in case a primer name follows the oligo
+                    while (!oligoStream.eof()) {
+                        char next = oligoStream.get();
+                        if (next == '\n' || next == '\r'){ break; }
+                    }
+                    primers.push_back(oligo);
+                }else if(type == "REVERSE"){
+                    revPrimer.push_back(reverseOligo(oligo));
+                }else if(type == "BARCODE"){
+                    oligoStream >> group;
+                }else if((type == "LINKER")||(type == "SPACER")) {
+                    //recognised, but nothing to record
+                }else{
+                    m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();
+                    m->control_pressed = true;
+                }
+            }
+            m->gobble(oligoStream);
+        }
+        oligoStream.close();
+
+        bool noPrimers = (primers.size() == 0) && (revPrimer.size() == 0);
+        if (noPrimers) {
+            m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine();
+            m->control_pressed = true;
+            return false;
+        }
+
+        return true;
+
+    }catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "readOligos");
+        exit(1);
+    }
+}
+//***************************************************************************************************************
+// Reads the first sequence of ecolifile and records its geometry in the members
+// length (length of its aligned string), start (getStartPos) and end (getEndPos).
+// Returns true once those are set. If the stream is already at end-of-file after
+// opening, m->control_pressed is set and false is returned.
+bool PcrSeqsCommand::readEcoli(){
+    try {
+        ifstream ecoliStream;
+        m->openInputFile(ecolifile, ecoliStream);
+
+        //nothing to read - flag the failure and stop
+        if (ecoliStream.eof()) {
+            ecoliStream.close();
+            m->control_pressed = true;
+            return false;
+        }
+
+        //read seq
+        Sequence ecoli(ecoliStream);
+        length = ecoli.getAligned().length();
+        start  = ecoli.getStartPos();
+        end    = ecoli.getEndPos();
+
+        ecoliStream.close();
+        return true;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "readEcoli");
+        exit(1);
+    }
+
+}
+//***************************************************************************************************************
+// Writes every name in badNames, one per line, to "<fasta root>bad.accnos".
+// The file goes to outputDir, or to the directory of fastafile when outputDir is empty,
+// and is registered in outputNames and under outputTypes["accnos"].
+// Writing stops early if m->control_pressed becomes set. Always returns 0.
+int PcrSeqsCommand::writeAccnos(set<string> badNames){
+    try {
+        string accnosDir = outputDir;
+        if (accnosDir == "") { accnosDir += m->hasPath(fastafile); }
+
+        string accnosFileName = accnosDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
+        outputNames.push_back(accnosFileName);
+        outputTypes["accnos"].push_back(accnosFileName);
+
+        ofstream accnosOut;
+        m->openOutputFile(accnosFileName, accnosOut);
+
+        set<string>::iterator badName = badNames.begin();
+        while (badName != badNames.end()) {
+            if (m->control_pressed) { break; }
+            accnosOut << *badName << endl;
+            badName++;
+        }
+
+        accnosOut.close();
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
+        exit(1);
+    }
+
+}
+//******************************************************************/
+// Tests whether seq matches oligo position by position over the length of oligo.
+//   oligo - primer, may contain IUPAC ambiguity codes
+//   seq   - chunk of sequence to test; only its first oligo.length() characters are read
+// A position matches when the two characters are identical, or when the oligo
+// character is an ambiguity code that allows the sequence character:
+//   R = A/G   Y = C/T   M = C/A   K = T/G   W = T/A   S = C/G
+//   B = C/T/G   D = A/T/G   H = A/T/C   V = A/C/G
+// An 'I' in the oligo matches anything except 'N'; an 'N' in the oligo matches anything.
+// Oligo characters not listed above are never counted as a mismatch.
+// Returns false when seq is shorter than oligo.
+bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){
+    try {
+        int olength = oligo.length();
+
+        //a chunk shorter than the oligo cannot contain it; this also keeps seq[i] in bounds below
+        if (seq.length() < oligo.length()) { return false; }
+
+        for(int i=0;i<olength;i++){
+
+            if(oligo[i] == seq[i]) { continue; }
+
+            //characters differ - still fine if the oligo holds an ambiguity code covering seq[i]
+            switch (oligo[i]) {
+                case 'A': case 'T': case 'G': case 'C':
+                    return false;
+                case 'N': case 'I':
+                    if (seq[i] == 'N') { return false; }
+                    break;
+                case 'R': if (seq[i] != 'A' && seq[i] != 'G') { return false; } break;
+                case 'Y': if (seq[i] != 'C' && seq[i] != 'T') { return false; } break;
+                case 'M': if (seq[i] != 'C' && seq[i] != 'A') { return false; } break;
+                case 'K': if (seq[i] != 'T' && seq[i] != 'G') { return false; } break;
+                case 'W': if (seq[i] != 'T' && seq[i] != 'A') { return false; } break;
+                case 'S': if (seq[i] != 'C' && seq[i] != 'G') { return false; } break;
+                case 'B': if (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G') { return false; } break;
+                case 'D': if (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G') { return false; } break;
+                case 'H': if (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C') { return false; } break;
+                case 'V': if (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G') { return false; } break;
+                default:
+                    break;
+            }
+        }
+
+        return true;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "compareDNASeq");
+        exit(1);
+    }
+
+}
+//***************************************************************************************************************
+// Copies namefile to "<name root>pcr<ext>", leaving out every line that mentions a
+// sequence contained in `names`.
+//   names - in/out: names to drop. When a line is dropped, every name in its
+//           second column is added to this set.
+// A line is kept only if none of the comma separated names in its second column
+// is in `names`. The new file is registered in outputNames and outputTypes["name"],
+// and the number of removed names is reported.
+// If m->control_pressed is set while reading, the partial output file is deleted.
+// Always returns 0.
+int PcrSeqsCommand::readName(set<string>& names){
+    try {
+        string nameOutDir = outputDir;
+        if (nameOutDir == "") { nameOutDir += m->hasPath(namefile); }
+        string outputFileName = nameOutDir + m->getRootName(m->getSimpleName(namefile)) + "pcr" + m->getExtension(namefile);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        ifstream in;
+        m->openInputFile(namefile, in);
+
+        //kept outside the loop: a failed extraction leaves the previous value in place
+        string firstCol, secondCol;
+
+        bool wroteSomething = false;
+        int removedCount = 0;
+
+        while(!in.eof()){
+            if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+            in >> firstCol; m->gobble(in);
+            in >> secondCol;
+
+            //keep a copy for output; secondCol itself is handed to splitAtComma
+            string savedSecond = secondCol;
+            vector<string> parsedNames;
+            m->splitAtComma(secondCol, parsedNames);
+
+            //how many of this line's names are flagged for removal?
+            int flagged = 0;
+            for (int i = 0; i < parsedNames.size(); i++) {
+                if (names.count(parsedNames[i]) != 0) { flagged++; }
+            }
+
+            if (flagged == 0) {
+                out << firstCol << '\t' << savedSecond << endl;
+                wroteSomething = true;
+            }else { //we want to get rid of someone, so get rid of everyone
+                for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
+                removedCount += parsedNames.size();
+            }
+            m->gobble(in);
+        }
+        in.close();
+        out.close();
+
+        if (!wroteSomething) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
+        outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+        m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "readName");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Copies groupfile to "<group root>pcr<ext>", leaving out every line whose first
+// column is contained in `names`.
+//   names - sequence names to drop
+// The new file is registered in outputNames and outputTypes["group"], and the
+// number of removed lines is reported.
+// If m->control_pressed is set while reading, the partial output file is deleted.
+// Always returns 0.
+int PcrSeqsCommand::readGroup(set<string> names){
+    try {
+        string groupOutDir = outputDir;
+        if (groupOutDir == "") { groupOutDir += m->hasPath(groupfile); }
+        string outputFileName = groupOutDir + m->getRootName(m->getSimpleName(groupfile)) + "pcr" + m->getExtension(groupfile);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        ifstream in;
+        m->openInputFile(groupfile, in);
+
+        //kept outside the loop: a failed extraction leaves the previous value in place
+        string name, group;
+
+        bool wroteSomething = false;
+        int removedCount = 0;
+
+        while(!in.eof()){
+            if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+            in >> name;  //first column - sequence name
+            in >> group; //second column - its group
+
+            bool dropThisOne = (names.count(name) != 0);
+            if (dropThisOne) { removedCount++; }
+            else {
+                wroteSomething = true;
+                out << name << '\t' << group << endl;
+            }
+
+            m->gobble(in);
+        }
+        in.close();
+        out.close();
+
+        if (!wroteSomething) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
+        outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+        m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "readGroup");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Copies taxfile to "<tax root>pcr<ext>", leaving out every line whose first
+// column is contained in `names`.
+//   names - sequence names to drop
+// The new file is registered in outputNames and outputTypes["taxonomy"], and the
+// number of removed lines is reported.
+// If m->control_pressed is set while reading, the partial output file is deleted.
+// Always returns 0.
+int PcrSeqsCommand::readTax(set<string> names){
+    try {
+        string taxOutDir = outputDir;
+        if (taxOutDir == "") { taxOutDir += m->hasPath(taxfile); }
+        string outputFileName = taxOutDir + m->getRootName(m->getSimpleName(taxfile)) + "pcr" + m->getExtension(taxfile);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        ifstream in;
+        m->openInputFile(taxfile, in);
+
+        //kept outside the loop: a failed extraction leaves the previous value in place
+        string name, tax;
+
+        bool wroteSomething = false;
+        int removedCount = 0;
+
+        while(!in.eof()){
+            if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+            in >> name; //first column - sequence name
+            in >> tax;  //second column - its taxonomy
+
+            bool dropThisOne = (names.count(name) != 0);
+            if (dropThisOne) { removedCount++; }
+            else {
+                wroteSomething = true;
+                out << name << '\t' << tax << endl;
+            }
+
+            m->gobble(in);
+        }
+        in.close();
+        out.close();
+
+        if (!wroteSomething) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
+        outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+        m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "PcrSeqsCommand", "readTax");
+        exit(1);
+    }
+}
+/**************************************************************************************/
+
+
m->mothurOutEndLine();
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
-
+ m->mothurCalling = true;
+
Command* uniqueCommand = new DeconvoluteCommand(inputString);
uniqueCommand->execute();
map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
delete uniqueCommand;
-
+ m->mothurCalling = false;
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
m->renameFile(filenames["fasta"][0], newFastaFile);
+ m->renameFile(filenames["name"][0], newNamesFile);
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run pre.cluster."); m->mothurOutEndLine();
lines.push_back(linePair(startIndex, endIndex));
}
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
m->openInputFile(fastafile, inFasta);
//string firstCol, secondCol, nameString;
- length = 0;
+ set<int> lengths;
while (!inFasta.eof()) {
else{
seqPNode tempNode(itSize->second, seq, names[seq.getName()]);
alignSeqs.push_back(tempNode);
- if (seq.getAligned().length() > length) { length = seq.getAligned().length(); }
+ lengths.insert(seq.getAligned().length());
}
}else { //no names file, you are identical to yourself
seqPNode tempNode(1, seq, seq.getName());
alignSeqs.push_back(tempNode);
- if (seq.getAligned().length() > length) { length = seq.getAligned().length(); }
+ lengths.insert(seq.getAligned().length());
}
}
}
inFasta.close();
//inNames.close();
+
+ if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
+ else if (lengths.size() == 1) { length = *(lengths.begin()); }
+
return alignSeqs.size();
}
/**************************************************************************************************/
int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs){
try {
- length = 0;
+ set<int> lengths;
alignSeqs.clear();
map<string, string>::iterator it;
bool error = false;
seqPNode tempNode(numReps, thisSeqs[i], it->second);
alignSeqs.push_back(tempNode);
- if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); }
+ lengths.insert(thisSeqs[i].getAligned().length());
}
}else { //no names file, you are identical to yourself
seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName());
alignSeqs.push_back(tempNode);
- if (thisSeqs[i].getAligned().length() > length) { length = thisSeqs[i].getAligned().length(); }
+ lengths.insert(thisSeqs[i].getAligned().length());
}
}
+ if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
+ else if (lengths.size() == 1) { length = *(lengths.begin()); }
+
//sanity check
if (error) { m->control_pressed = true; }
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
static DWORD WINAPI MyPreclusterThreadFunction(LPVOID lpParam){
preClusterData* pDataArray;
void updateQScoreErrorMap(map<char, vector<int> >&, string, int, int, int);
void updateForwardMap(vector<vector<int> >&, int, int, int);
void updateReverseMap(vector<vector<int> >&, int, int, int);
+ void setName(string n) { seqName = n; }
+ void setScores(vector<int> qs) { qScores = qs; seqLength = qScores.size(); }
+
private:
if (percentFreq < 1.0) { increment = numSeqs * percentFreq; }
else { increment = percentFreq; }
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
driver(rcd, increment, nIters);
}else{
int Rarefact::createProcesses(vector<int>& procIters, RarefactionCurveData* rcd, int increment) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
vector<int> processIDS;
if (m->control_pressed) { return 0; }
- map<string, string> nameMap;
+ map<int, string> file2Group; //index in outputNames[i] -> group
for (int p = 0; p < inputFileNames.size(); p++) {
string fileNameRoot = outputDir + m->getRootName(m->getSimpleName(inputFileNames[p]));
if (inputFileNames.size() > 1) {
m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[p]); m->mothurOutEndLine(); m->mothurOutEndLine();
- nameMap[fileNameRoot] = groups[p];
}
int i;
ValidCalculators validCalculator;
rDisplays.push_back(new RareDisplay(new NSeqs(), new ThreeColumnFile(fileNameRoot+"r_nseqs")));
outputNames.push_back(fileNameRoot+"r_nseqs"); outputTypes["r_nseqs"].push_back(fileNameRoot+"r_nseqs");
}
+ if (inputFileNames.size() > 1) { file2Group[outputNames.size()-1] = groups[p]; }
}
}
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
//create summary file containing all the groups data for each label - this function just combines the info from the files already created.
- if ((sharedfile != "") && (groupMode)) { outputNames = createGroupFile(outputNames, nameMap); }
+ if ((sharedfile != "") && (groupMode)) { outputNames = createGroupFile(outputNames, file2Group); }
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
}
}
//**********************************************************************************************************************
-vector<string> RareFactCommand::createGroupFile(vector<string>& outputNames, map<string, string> nameMap) {
+vector<string> RareFactCommand::createGroupFile(vector<string>& outputNames, map<int, string> file2Group) {
try {
vector<string> newFileNames;
//find different types of files
- map<string, vector<string> > typesFiles;
+ map<string, map<string, string> > typesFiles;
+ map<string, string> temp;
for (int i = 0; i < outputNames.size(); i++) {
string extension = m->getExtension(outputNames[i]);
newLine += "\tGroup" + labels.substr(labels.find_first_of('\t'));
- typesFiles[extension].push_back(outputNames[i]);
+ temp[outputNames[i]] = file2Group[i];
+ typesFiles[extension] = temp;
string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + extension;
//for each type create a combo file
map<int, int> lineToNumber;
- for (map<string, vector<string> >::iterator it = typesFiles.begin(); it != typesFiles.end(); it++) {
+ for (map<string, map<string, string> >::iterator it = typesFiles.begin(); it != typesFiles.end(); it++) {
ofstream out;
string combineFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + "groups" + it->first;
m->openOutputFileAppend(combineFileName, out);
newFileNames.push_back(combineFileName);
- vector<string> thisTypesFiles = it->second;
+ map<string, string> thisTypesFiles = it->second;
//open each type summary file
map<string, vector<string> > files; //maps file name to lines in file
int maxLines = 0;
int numColumns = 0;
- for (int i=0; i<thisTypesFiles.size(); i++) {
-
+ for (map<string, string>::iterator itFileNameGroup = thisTypesFiles.begin(); itFileNameGroup != thisTypesFiles.end(); itFileNameGroup++) {
+
+ string thisfilename = itFileNameGroup->first;
+ string group = itFileNameGroup->second;
+
ifstream temp;
- m->openInputFile(thisTypesFiles[i], temp);
+ m->openInputFile(thisfilename, temp);
//read through first line - labels
m->getline(temp); m->gobble(temp);
vector<string> thisFilesLines;
- string fileNameRoot = m->getRootName(thisTypesFiles[i]);
- map<string, string>::iterator itName = nameMap.find(fileNameRoot);
- string group = "";
- if (itName != nameMap.end()) {
- group = itName->second;
- }else {
- group = "not found" + i;
- m->mothurOut("[ERROR]: can't parse filename."); m->mothurOutEndLine();
- }
thisFilesLines.push_back(group);
int count = 1;
m->gobble(temp);
}
- files[thisTypesFiles[i]] = thisFilesLines;
+ files[thisfilename] = thisFilesLines;
//save longest file for below
if (maxLines < thisFilesLines.size()) { maxLines = thisFilesLines.size(); }
temp.close();
- m->mothurRemove(thisTypesFiles[i]);
+ m->mothurRemove(thisfilename);
}
for (int k = 1; k < maxLines; k++) {
//grab data for each group
- for (int i=0; i<thisTypesFiles.size(); i++) {
-
+ for (map<string, string>::iterator itFileNameGroup = thisTypesFiles.begin(); itFileNameGroup != thisTypesFiles.end(); itFileNameGroup++) {
+
+ string thisfilename = itFileNameGroup->first;
+
map<int, int>::iterator itLine = lineToNumber.find(k);
if (itLine != lineToNumber.end()) {
string output = toString(itLine->second);
- if (k < files[thisTypesFiles[i]].size()) {
- string line = files[thisTypesFiles[i]][k];
+ if (k < files[thisfilename].size()) {
+ string line = files[thisfilename][k];
output = line.substr(0, line.find_first_of('\t'));
- output += '\t' + files[thisTypesFiles[i]][0] + '\t' + line.substr(line.find_first_of('\t'));
+ output += '\t' + files[thisfilename][0] + '\t' + line.substr(line.find_first_of('\t'));
}else{
- output += '\t' + files[thisTypesFiles[i]][0] + '\t';
+ output += '\t' + files[thisfilename][0] + '\t';
for (int h = 0; h < numColumns; h++) {
output += "NA\t";
}
string outputDir;
vector<string> parseSharedFile(string);
- vector<string> createGroupFile(vector<string>&, map<string, string>);
+ vector<string> createGroupFile(vector<string>&, map<int, string>);
};
#endif
}
/*******************************************************
ReferenceDB::~ReferenceDB() { myInstance = NULL; }
-/*******************************************************/
+*******************************************************/
*/
#include "screenseqscommand.h"
-#include "sequence.hpp"
+
//**********************************************************************************************************************
vector<string> ScreenSeqsCommand::setParameters(){
getSummary(positions);
}
else {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- positions = m->divideFile(fastafile, processors);
- for (int i = 0; i < (positions.size()-1); i++) {
- lines.push_back(new linePair(positions[i], positions[(i+1)]));
- }
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ positions = m->divideFile(fastafile, processors);
+ for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
#else
- positions.push_back(0); positions.push_back(1000);
- lines.push_back(new linePair(0, 1000));
+ if(processors == 1){ lines.push_back(linePair(0, 1000)); }
+ else {
+ int numFastaSeqs = 0;
+ positions = m->setFilePosFasta(fastafile, numFastaSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
+
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
+ lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
+ }
+ }
#endif
}
-
+
string goodSeqFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "good" + m->getExtension(fastafile);
string badAccnosFile = outputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
numSeqsPerProcessor = numFastaSeqs / processors;
int startIndex = pid * numSeqsPerProcessor;
if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
- // cout << pid << '\t' << numSeqsPerProcessor << '\t' << startIndex << endl;
+
//align your part
driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
- //cout << pid << " done" << endl;
+
if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
for (int i = 1; i < processors; i++) {
-
//get bad lists
int badSize;
MPI_Recv(&badSize, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
- /*for (int j = 0; j < badSize; j++) {
- int length;
- MPI_Recv(&length, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); //recv the length of the name
- char* buf2 = new char[length]; //make space to recieve it
- MPI_Recv(buf2, length, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status); //get name
-
- string tempBuf = buf2;
- if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
- delete buf2;
-
- badSeqNames.insert(tempBuf);
- }*/
}
}else{ //you are a child process
MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
numSeqsPerProcessor = numFastaSeqs / processors;
int startIndex = pid * numSeqsPerProcessor;
if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; }
- //cout << pid << '\t' << numSeqsPerProcessor << '\t' << startIndex << endl;
+
//align your part
driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPIGood, outMPIBadAccnos, MPIPos, badSeqNames);
-//cout << pid << " done" << endl;
+
if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPIGood); MPI_File_close(&outMPIBadAccnos); return 0; }
//send bad list
int badSize = badSeqNames.size();
MPI_Send(&badSize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
-
- /*
- set<string>::iterator it;
- for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
- string name = *it;
- int length = name.length();
- char* buf2 = new char[length];
- memcpy(buf2, name.c_str(), length);
-
- MPI_Send(&length, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
- MPI_Send(buf2, length, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
- }*/
}
//close files
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- if(processors == 1){
- numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
-
- if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
-
- }else{
- processIDS.resize(0);
-
- numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames);
-
- rename((goodSeqFile + toString(processIDS[0]) + ".temp").c_str(), goodSeqFile.c_str());
- rename((badAccnosFile + toString(processIDS[0]) + ".temp").c_str(), badAccnosFile.c_str());
-
- //append alignment and report files
- for(int i=1;i<processors;i++){
- m->appendFiles((goodSeqFile + toString(processIDS[i]) + ".temp"), goodSeqFile);
- m->mothurRemove((goodSeqFile + toString(processIDS[i]) + ".temp"));
-
- m->appendFiles((badAccnosFile + toString(processIDS[i]) + ".temp"), badAccnosFile);
- m->mothurRemove((badAccnosFile + toString(processIDS[i]) + ".temp"));
- }
-
- if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
-
- //read badSeqs in because root process doesnt know what other "bad" seqs the children found
- ifstream inBad;
- int ableToOpen = m->openInputFile(badAccnosFile, inBad, "no error");
-
- if (ableToOpen == 0) {
- badSeqNames.clear();
- string tempName;
- while (!inBad.eof()) {
- inBad >> tempName; m->gobble(inBad);
- badSeqNames.insert(tempName);
- }
- inBad.close();
- }
- }
- #else
- numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames);
-
- if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
-
- #endif
-
+ if(processors == 1){ numFastaSeqs = driver(lines[0], goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
+ else{ numFastaSeqs = createProcesses(goodSeqFile, badAccnosFile, fastafile, badSeqNames); }
+
+ if (m->control_pressed) { m->mothurRemove(goodSeqFile); return 0; }
#endif
#ifdef USE_MPI
vector<int> ambigBases;
vector<int> longHomoPolymer;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- vector<unsigned long long> positions = m->divideFile(fastafile, processors);
-
- for (int i = 0; i < (positions.size()-1); i++) {
- lines.push_back(new linePair(positions[i], positions[(i+1)]));
- }
+ vector<unsigned long long> positions;
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ positions = m->divideFile(fastafile, processors);
+ for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
#else
- lines.push_back(new linePair(0, 1000));
+ if(processors == 1){ lines.push_back(linePair(0, 1000)); }
+ else {
+ int numFastaSeqs = 0;
+ positions = m->setFilePosFasta(fastafile, numFastaSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
+
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
+ lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
+ }
+ }
#endif
#ifdef USE_MPI
driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
#else
int numSeqs = 0;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ //#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
}else{
}
if (m->control_pressed) { return 0; }
- #else
- numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
- if (m->control_pressed) { return 0; }
- #endif
+ //#else
+ // numSeqs = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[0]);
+ // if (m->control_pressed) { return 0; }
+ //#endif
#endif
sort(startPosition.begin(), startPosition.end());
sort(endPosition.begin(), endPosition.end());
}
}
/**************************************************************************************/
-int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair* filePos) {
+int ScreenSeqsCommand::driverCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename, linePair filePos) {
try {
ifstream in;
m->openInputFile(filename, in);
- in.seekg(filePos->start);
+ in.seekg(filePos.start);
bool done = false;
int count = 0;
count++;
}
//if((count) % 100 == 0){ m->mothurOut("Optimizing sequence: " + toString(count)); m->mothurOutEndLine(); }
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = in.tellg();
- if ((pos == -1) || (pos >= filePos->end)) { break; }
+ if ((pos == -1) || (pos >= filePos.end)) { break; }
#else
if (in.eof()) { break; }
#endif
/**************************************************************************************************/
int ScreenSeqsCommand::createProcessesCreateSummary(vector<int>& startPosition, vector<int>& endPosition, vector<int>& seqLength, vector<int>& ambigBases, vector<int>& longHomoPolymer, string filename) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 1;
+
+ int process = 1;
int num = 0;
- processIDS.clear();
-
+ vector<int> processIDS;
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+
//loop through and create all the processes you want
while (process != processors) {
int pid = fork();
m->mothurRemove(tempFilename);
}
- return num;
+
+#else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the seqSumData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to allow both threads to add info to vectors.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<sumData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=0; i<processors-1; i++ ){
+
+ // Allocate memory for thread data.
+ sumData* tempSum = new sumData(filename, m, lines[i].start, lines[i].end, namefile, nameMap);
+ pDataArray.push_back(tempSum);
+
+ //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i] = CreateThread(NULL, 0, MySumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ }
+
+ //do your part
+ num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, lines[processors-1]);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ num += pDataArray[i]->count;
+ for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) { startPosition.push_back(pDataArray[i]->startPosition[k]); }
+ for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) { endPosition.push_back(pDataArray[i]->endPosition[k]); }
+ for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]); }
+ for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) { ambigBases.push_back(pDataArray[i]->ambigBases[k]); }
+ for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) { longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]); }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
#endif
+ return num;
}
catch(exception& e) {
m->errorOut(e, "ScreenSeqsCommand", "createProcessesCreateSummary");
}
//**********************************************************************************************************************
-int ScreenSeqsCommand::driver(linePair* filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
+int ScreenSeqsCommand::driver(linePair filePos, string goodFName, string badAccnosFName, string filename, set<string>& badSeqNames){
try {
ofstream goodFile;
m->openOutputFile(goodFName, goodFile);
ifstream inFASTA;
m->openInputFile(filename, inFASTA);
- inFASTA.seekg(filePos->start);
+ inFASTA.seekg(filePos.start);
bool done = false;
int count = 0;
count++;
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = inFASTA.tellg();
- if ((pos == -1) || (pos >= filePos->end)) { break; }
+ if ((pos == -1) || (pos >= filePos.end)) { break; }
#else
if (inFASTA.eof()) { break; }
#endif
int ScreenSeqsCommand::createProcesses(string goodFileName, string badAccnos, string filename, set<string>& badSeqNames) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
+
+ vector<int> processIDS;
+ int process = 1;
int num = 0;
-
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+
//loop through and create all the processes you want
while (process != processors) {
int pid = fork();
}
}
+ num = driver(lines[0], goodFileName, badAccnos, filename, badSeqNames);
+
//force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
+ for (int i=0;i<processIDS.size();i++) {
int temp = processIDS[i];
wait(&temp);
}
m->openInputFile(tempFile, in);
if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; }
in.close(); m->mothurRemove(tempFile);
+
+ m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
+ m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
+
+ m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
+ m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
}
- return num;
-#endif
+ //read badSeqs in because root process doesnt know what other "bad" seqs the children found
+ ifstream inBad;
+ int ableToOpen = m->openInputFile(badAccnos, inBad, "no error");
+
+ if (ableToOpen == 0) {
+ badSeqNames.clear();
+ string tempName;
+ while (!inBad.eof()) {
+ inBad >> tempName; m->gobble(inBad);
+ badSeqNames.insert(tempName);
+ }
+ inBad.close();
+ }
+#else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<sumScreenData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=0; i<processors-1; i++ ){
+
+ string extension = "";
+ if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
+
+ // Allocate memory for thread data.
+ sumScreenData* tempSum = new sumScreenData(startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength, filename, m, lines[i].start, lines[i].end,goodFileName+extension, badAccnos+extension);
+ pDataArray.push_back(tempSum);
+
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i] = CreateThread(NULL, 0, MySumScreenThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ }
+
+ //do your part
+ num = driver(lines[processors-1], (goodFileName+toString(processors-1)+".temp"), (badAccnos+toString(processors-1)+".temp"), filename, badSeqNames);
+ processIDS.push_back(processors-1);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ num += pDataArray[i]->count;
+ for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+ for (int i = 0; i < processIDS.size(); i++) {
+ m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
+ m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
+
+ m->appendFiles((badAccnos + toString(processIDS[i]) + ".temp"), badAccnos);
+ m->mothurRemove((badAccnos + toString(processIDS[i]) + ".temp"));
+ }
+
+#endif
+
+ return num;
+
}
catch(exception& e) {
m->errorOut(e, "ScreenSeqsCommand", "createProcesses");
*/
#include "mothur.h"
#include "command.hpp"
+#include "sequence.hpp"
class ScreenSeqsCommand : public Command {
linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
};
- vector<int> processIDS; //processid
- vector<linePair*> lines;
+ vector<linePair> lines;
int screenNameGroupFile(set<string>);
int screenGroupFile(set<string>);
int screenQual(set<string>);
int screenTaxonomy(set<string>);
- int driver(linePair*, string, string, string, set<string>&);
+ int driver(linePair, string, string, string, set<string>&);
int createProcesses(string, string, string, set<string>&);
#ifdef USE_MPI
int getSummary(vector<unsigned long long>&);
int createProcessesCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string);
- int driverCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string, linePair*);
+ int driverCreateSummary(vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, string, linePair);
};
+/**************************************************************************************************/
+// Work packet for one MySumThreadFunction worker thread (non-unix build path).
+// It is handed to the thread through a single void pointer (LPVOID). The worker appends
+// its per-sequence statistics to the vectors below; the code that spawned the thread
+// copies them out once the thread has finished.
+struct sumData {
+    // Per-sequence statistics collected by the worker (one entry per represented sequence).
+    vector<int> startPosition;
+    vector<int> endPosition;
+    vector<int> seqLength;
+    vector<int> ambigBases;
+    vector<int> longHomoPolymer;
+    // filename: fasta file the worker reads.
+    // namefile: when non-empty, every sequence name is looked up in nameMap.
+    string filename, namefile;
+    unsigned long long start;   // file offset to seek to (0 or 1 means "beginning of file")
+    unsigned long long end;     // number of sequences the worker should read (not an offset)
+    int count;                  // set by the worker: end on a normal run, 1 when interrupted
+    MothurOut* m;               // shared output/utility object (file helpers, control_pressed)
+    map<string, int> nameMap;   // sequence name -> how many sequences that name represents
+
+    // Default construction yields an empty, inert packet instead of indeterminate values.
+    sumData() : start(0), end(0), count(0), m(NULL) {}
+
+    // f = fasta file, mout = shared MothurOut, st = start offset, en = number of sequences,
+    // nf = name file (may be empty), nam = name -> count map.
+    sumData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string nf, map<string, int> nam)
+        : filename(f), namefile(nf), start(st), end(en), count(0), m(mout), nameMap(nam) {}
+};
+/**************************************************************************************************/
+// Work packet for one MySumScreenThreadFunction worker thread (non-unix build path).
+// It is handed to the thread through a single void pointer (LPVOID). The worker writes
+// accepted sequences and rejected names to its own files and records rejected names
+// in badSeqNames for the spawning code to merge.
+struct sumScreenData {
+    // Screening thresholds; the worker skips any check whose threshold is -1.
+    int startPos, endPos, maxAmbig, maxHomoP, minLength, maxLength;
+    unsigned long long start;   // file offset to seek to (0 or 1 means "beginning of file")
+    unsigned long long end;     // number of sequences the worker should read (not an offset)
+    int count;                  // set by the worker: end on a normal run, 1 when interrupted
+    MothurOut* m;               // shared output/utility object (file helpers, control_pressed)
+    // goodFName: output file for sequences that pass; badAccnosFName: output file listing
+    // rejected names; filename: fasta file to read.
+    string goodFName, badAccnosFName, filename;
+    set<string> badSeqNames;    // names rejected by this worker
+
+    // Default construction yields an inert packet: every check disabled, nothing to read.
+    sumScreenData()
+        : startPos(-1), endPos(-1), maxAmbig(-1), maxHomoP(-1), minLength(-1), maxLength(-1),
+          start(0), end(0), count(0), m(NULL) {}
+
+    // s/e = start/end position thresholds, a = max ambiguous bases, h = max homopolymer,
+    // minl/maxl = length bounds, f = fasta file, mout = shared MothurOut, st = start offset,
+    // en = number of sequences, gf = accepted-sequence file, bf = rejected-name file.
+    sumScreenData(int s, int e, int a, int h, int minl, int maxl, string f, MothurOut* mout, unsigned long long st, unsigned long long en, string gf, string bf)
+        : startPos(s), endPos(e), maxAmbig(a), maxHomoP(h), minLength(minl), maxLength(maxl),
+          start(st), end(en), count(0), m(mout),
+          goodFName(gf), badAccnosFName(bf), filename(f) {}
+};
+
+
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+// Thread entry point: reads `end` sequences from the fasta file described by the sumData
+// packet in lpParam and appends their statistics to the packet's vectors.
+// Returns 0 on completion, 1 when control_pressed was seen (count is then set to 1).
+static DWORD WINAPI MySumThreadFunction(LPVOID lpParam){
+    sumData* task = (sumData*)lpParam;
+
+    try {
+        ifstream fastaIn;
+        task->m->openInputFile(task->filename, fastaIn);
+
+        // Offsets 0 and 1 both mean the top of the file. Otherwise step back one byte and
+        // gobble, which accounts for the difference in line endings.
+        const bool fromTop = (task->start == 0) || (task->start == 1);
+        if (fromTop) {
+            fastaIn.seekg(0);
+        }else {
+            fastaIn.seekg(task->start-1);
+            task->m->gobble(fastaIn);
+        }
+
+        task->count = task->end;
+        for(int seqIndex = 0; seqIndex < task->end; seqIndex++){ //end is the number of sequences to process
+
+            if (task->m->control_pressed) { fastaIn.close(); task->count = 1; return 1; }
+
+            Sequence current(fastaIn);
+            task->m->gobble(fastaIn);
+
+            // blank names carry no data
+            if (current.getName() == "") { continue; }
+
+            // how many sequences this one stands for (1 unless the namefile says otherwise)
+            int copies = 1;
+            if (task->namefile != "") {
+                map<string, int>::iterator hit = task->nameMap.find(current.getName());
+
+                if (hit != task->nameMap.end()) { copies = hit->second; }
+                else {
+                    //sequence is missing from the namefile: report it and flag the run to stop
+                    task->m->mothurOut("[ERROR]: " + current.getName() + " is not in your namefile, please correct.");
+                    task->m->mothurOutEndLine();
+                    task->m->control_pressed = true;
+                }
+            }
+
+            for (int rep = 0; rep < copies; rep++) {
+                task->startPosition.push_back(current.getStartPos());
+                task->endPosition.push_back(current.getEndPos());
+                task->seqLength.push_back(current.getNumBases());
+                task->ambigBases.push_back(current.getAmbigBases());
+                task->longHomoPolymer.push_back(current.getLongHomoPolymer());
+            }
+        }
+
+        fastaIn.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        task->m->errorOut(e, "ScreenSeqsCommand", "MySumThreadFunction");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+// Thread entry point: screens `end` sequences from the fasta file described by the
+// sumScreenData packet in lpParam. Passing sequences are printed to goodFName; the names
+// of rejected ones go to badAccnosFName and into the packet's badSeqNames.
+// Returns 0 on completion, 1 when control_pressed was seen (count is then set to 1).
+static DWORD WINAPI MySumScreenThreadFunction(LPVOID lpParam){
+    sumScreenData* job = (sumScreenData*)lpParam;
+
+    try {
+
+        ofstream keptOut;
+        job->m->openOutputFile(job->goodFName, keptOut);
+
+        ofstream rejectedOut;
+        job->m->openOutputFile(job->badAccnosFName, rejectedOut);
+
+        ifstream fastaIn;
+        job->m->openInputFile(job->filename, fastaIn);
+
+        // Offsets 0 and 1 both mean the top of the file. Otherwise step back one byte and
+        // gobble, which accounts for the difference in line endings.
+        const bool fromTop = (job->start == 0) || (job->start == 1);
+        if (fromTop) {
+            fastaIn.seekg(0);
+        }else {
+            fastaIn.seekg(job->start-1);
+            job->m->gobble(fastaIn);
+        }
+
+        job->count = job->end;
+        for(int seqIndex = 0; seqIndex < job->end; seqIndex++){ //end is the number of sequences to process
+
+            if (job->m->control_pressed) { fastaIn.close(); rejectedOut.close(); keptOut.close(); job->count = 1; return 1; }
+
+            Sequence currSeq(fastaIn);
+            job->m->gobble(fastaIn);
+
+            if (currSeq.getName() != "") {
+                // A threshold of -1 disables its check. Evaluation stops at the first failing check.
+                const bool rejected =
+                       (job->startPos  != -1 && job->startPos  < currSeq.getStartPos())
+                    || (job->endPos    != -1 && job->endPos    > currSeq.getEndPos())
+                    || (job->maxAmbig  != -1 && job->maxAmbig  < currSeq.getAmbigBases())
+                    || (job->maxHomoP  != -1 && job->maxHomoP  < currSeq.getLongHomoPolymer())
+                    || (job->minLength != -1 && job->minLength > currSeq.getNumBases())
+                    || (job->maxLength != -1 && job->maxLength < currSeq.getNumBases());
+
+                if (rejected) {
+                    rejectedOut << currSeq.getName() << endl;
+                    job->badSeqNames.insert(currSeq.getName());
+                }
+                else {
+                    currSeq.printSequence(keptOut);
+                }
+            }
+
+            //report progress every 100 sequences
+            const int processed = seqIndex + 1;
+            if (processed % 100 == 0) { job->m->mothurOut("Processing sequence: " + toString(processed)); job->m->mothurOutEndLine(); }
+        }
+
+        //report the final total unless the loop above just printed it
+        if ((job->count) % 100 != 0) { job->m->mothurOut("Processing sequence: " + toString(job->count)); job->m->mothurOutEndLine(); }
+
+        fastaIn.close();
+        keptOut.close();
+        rejectedOut.close();
+
+        return 0;
+    }
+    catch(exception& e) {
+        job->m->errorOut(e, "ScreenSeqsCommand", "MySumScreenThreadFunction");
+        exit(1);
+    }
+}
+
+#endif
+
+/**************************************************************************************************/
+
+
+
#endif
if(qualFileName == "") { qLines = lines; rLines = lines; } //fills with duds
int numSeqs = 0;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
numSeqs = driver(queryFileName, qualFileName, reportFileName, errorSummaryFileName, errorSeqFileName, errorChimeraFileName, lines[0], qLines[0], rLines[0]);
}else{
processIDS.clear();
map<char, vector<int> >::iterator it;
int num = 0;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
if(numParentSeqs > 1 && ignoreChimeras == 1) { ignoreSeq = 1; }
else { ignoreSeq = 0; }
- Compare minCompare = getErrors(query, referenceSeqs[closestRefIndex]);
+ Compare minCompare;
+ getErrors(query, referenceSeqs[closestRefIndex], minCompare);
if(namesFileName != ""){
it = weights.find(query.getName());
index++;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = queryFile.tellg();
if ((pos == -1) || (pos >= line.end)) { break; }
#else
//***************************************************************************************************************
-Compare SeqErrorCommand::getErrors(Sequence query, Sequence reference){
+int SeqErrorCommand::getErrors(Sequence query, Sequence reference, Compare& errors){
try {
if(query.getAlignLength() != reference.getAlignLength()){
m->mothurOut("Warning: " + toString(query.getName()) + " and " + toString(reference.getName()) + " are different lengths\n");
string r = reference.getAligned();
int started = 0;
- Compare errors;
+ //Compare errors;
for(int i=0;i<alignLength;i++){
// cout << r[i] << '\t' << q[i] << '\t';
errors.queryName = query.getName();
errors.refName = reference.getName();
- return errors;
+ //return errors;
+ return 0;
}
catch(exception& e) {
m->errorOut(e, "SeqErrorCommand", "getErrors");
int SeqErrorCommand::setLines(string filename, string qfilename, string rfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos, vector<unsigned long long>& rfileFilePos) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//set file positions for fasta file
fastaFilePos = m->divideFile(filename, processors);
*
*/
-#include "mothur.h"
#include "command.hpp"
#include "sequence.hpp"
#include "referencedb.h"
-struct Compare {
- int AA, AT, AG, AC, TA, TT, TG, TC, GA, GT, GG, GC, CA, CT, CG, CC, NA, NT, NG, NC, Ai, Ti, Gi, Ci, Ni, dA, dT, dG, dC;
- string refName, queryName, sequence;
- double errorRate;
- int weight, matches, mismatches, total;
-
- Compare(){
- AA=0; AT=0; AG=0; AC=0;
- TA=0; TT=0; TG=0; TC=0;
- GA=0; GT=0; GG=0; GC=0;
- CA=0; CT=0; CG=0; CC=0;
- NA=0; NT=0; NG=0; NC=0;
- Ai=0; Ti=0; Gi=0; Ci=0; Ni=0;
- dA=0; dT=0; dG=0; dC=0;
- refName = "";
- queryName = "";
- weight = 1;
- matches = 0;
- mismatches = 0;
- total = 0;
- errorRate = 1.0000;
- sequence = "";
- }
-};
class SeqErrorCommand : public Command {
public:
SeqErrorCommand(string);
SeqErrorCommand();
- ~SeqErrorCommand() {}
+ ~SeqErrorCommand(){}
vector<string> setParameters();
string getCommandName() { return "seq.error"; }
unsigned long long start;
unsigned long long end;
linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
+ ~linePair(){}
};
+ struct Compare {
+     // Result record filled in by getErrors() for one query/reference pair.
+     // NOTE(review): the two-letter counters look like per-base-pair tallies, with the
+     // *i / d* groups for insertions / deletions -- confirm against getErrors().
+     int AA, AT, AG, AC, TA, TT, TG, TC, GA, GT, GG, GC, CA, CT, CG, CC, NA, NT, NG, NC, Ai, Ti, Gi, Ci, Ni, dA, dT, dG, dC;
+     string refName, queryName, sequence;
+     double errorRate;
+     int weight, matches, mismatches, total;
+
+     // Every counter starts at zero, the names are empty, weight is 1 and errorRate is 1.0.
+     Compare()
+         : AA(0), AT(0), AG(0), AC(0),
+           TA(0), TT(0), TG(0), TC(0),
+           GA(0), GT(0), GG(0), GC(0),
+           CA(0), CT(0), CG(0), CC(0),
+           NA(0), NT(0), NG(0), NC(0),
+           Ai(0), Ti(0), Gi(0), Ci(0), Ni(0),
+           dA(0), dT(0), dG(0), dC(0),
+           refName(""), queryName(""), sequence(""),
+           errorRate(1.0000),
+           weight(1), matches(0), mismatches(0), total(0) {}
+     ~Compare(){}
+ };
+
vector<int> processIDS; //processid
vector<linePair> lines;
vector<linePair> qLines;
void getReferences();
map<string,int> getWeights();
- Compare getErrors(Sequence, Sequence);
+ int getErrors(Sequence, Sequence, Compare&);
void printErrorHeader(ofstream&);
void printErrorData(Compare, int, ofstream&, ofstream&);
void printSubMatrix();
return 0;
}
-/**************************************************************************************************/
+**************************************************************************************************/
MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
vector<unsigned long long> positions;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
positions = m->divideFile(fastafile, processors);
for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); }
#else
positions = m->setFilePosFasta(fastafile, numSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
//figure out how many sequences you have to process
int numSeqsPerProcessor = numSeqs / processors;
outSummary << current.getLongHomoPolymer() << '\t' << num << endl;
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = in.tellg();
if ((pos == -1) || (pos >= filePos->end)) { break; }
#else
int num = 0;
processIDS.clear();
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
//////////////////////////////////////////////////////////////////////////////////////////////////////
vector<seqSumData*> pDataArray;
- DWORD dwThreadIdArray[processors];
- HANDLE hThreadArray[processors];
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
//Create processor worker threads.
- for( int i=0; i<processors; i++ ){
-
- //cout << i << '\t' << lines[i]->start << '\t' << lines[i]->end << endl;
+ for( int i=0; i<processors-1; i++ ){
+
+ string extension = "";
+ if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
// Allocate memory for thread data.
- seqSumData* tempSum = new seqSumData(&startPosition, &endPosition, &seqLength, &ambigBases, &longHomoPolymer, filename, (sumFile + toString(i) + ".temp"), m, lines[i]->start, lines[i]->end, namefile, nameMap);
+ seqSumData* tempSum = new seqSumData(filename, (sumFile+extension), m, lines[i]->start, lines[i]->end, namefile, nameMap);
pDataArray.push_back(tempSum);
- processIDS.push_back(i);
-
+
//MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
//default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
hThreadArray[i] = CreateThread(NULL, 0, MySeqSumThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
}
-
+
+ //do your part
+ num = driverCreateSummary(startPosition, endPosition, seqLength, ambigBases, longHomoPolymer, fastafile, (sumFile+toString(processors-1)+".temp"), lines[processors-1]);
+ processIDS.push_back(processors-1);
+
//Wait until all threads have terminated.
- WaitForMultipleObjects(processors, hThreadArray, TRUE, INFINITE);
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
num += pDataArray[i]->count;
+ for (int k = 0; k < pDataArray[i]->startPosition.size(); k++) { startPosition.push_back(pDataArray[i]->startPosition[k]); }
+ for (int k = 0; k < pDataArray[i]->endPosition.size(); k++) { endPosition.push_back(pDataArray[i]->endPosition[k]); }
+ for (int k = 0; k < pDataArray[i]->seqLength.size(); k++) { seqLength.push_back(pDataArray[i]->seqLength[k]); }
+ for (int k = 0; k < pDataArray[i]->ambigBases.size(); k++) { ambigBases.push_back(pDataArray[i]->ambigBases[k]); }
+ for (int k = 0; k < pDataArray[i]->longHomoPolymer.size(); k++) { longHomoPolymer.push_back(pDataArray[i]->longHomoPolymer[k]); }
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}
-
- //rename((sumFile + toString(processIDS[0]) + ".temp").c_str(), sumFile.c_str());
+
//append files
for(int i=0;i<processIDS.size();i++){
m->appendFiles((sumFile + toString(processIDS[i]) + ".temp"), sumFile);
// This is passed by void pointer so it can be any data type
// that can be passed using a single void pointer (LPVOID).
struct seqSumData {
- vector<int>* startPosition;
- vector<int>* endPosition;
- vector<int>* seqLength;
- vector<int>* ambigBases;
- vector<int>* longHomoPolymer;
+ vector<int> startPosition;
+ vector<int> endPosition;
+ vector<int> seqLength;
+ vector<int> ambigBases;
+ vector<int> longHomoPolymer;
string filename;
string sumFile;
unsigned long long start;
seqSumData(){}
- seqSumData(vector<int>* s, vector<int>* e, vector<int>* l, vector<int>* a, vector<int>* h, string f, string sf, MothurOut* mout, unsigned long long st, unsigned long long en, string na, map<string, int> nam) {
- startPosition = s;
- endPosition = e;
- seqLength = l;
- ambigBases = a;
- longHomoPolymer = h;
+ seqSumData(string f, string sf, MothurOut* mout, unsigned long long st, unsigned long long en, string na, map<string, int> nam) {
filename = f;
sumFile = sf;
m = mout;
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
static DWORD WINAPI MySeqSumThreadFunction(LPVOID lpParam){
seqSumData* pDataArray;
//for each sequence this sequence represents
for (int i = 0; i < num; i++) {
- pDataArray->startPosition->push_back(current.getStartPos());
- pDataArray->endPosition->push_back(current.getEndPos());
- pDataArray->seqLength->push_back(current.getNumBases());
- pDataArray->ambigBases->push_back(current.getAmbigBases());
- pDataArray->longHomoPolymer->push_back(current.getLongHomoPolymer());
+ pDataArray->startPosition.push_back(current.getStartPos());
+ pDataArray->endPosition.push_back(current.getEndPos());
+ pDataArray->seqLength.push_back(current.getNumBases());
+ pDataArray->ambigBases.push_back(current.getAmbigBases());
+ pDataArray->longHomoPolymer.push_back(current.getLongHomoPolymer());
}
outSummary << current.getName() << '\t';
}
//********************************************************************************************************************
//this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
+// Reads one fasta record from fastaFile and also hands back, through extraInfo, whatever
+// text followed the sequence name on its header line.
+//   fastaFile - stream positioned at the start of a record
+//   extraInfo - output; cleared, then filled with the remainder of the header line
+//   getInfo   - not read by this constructor
+// Records whose name starts with '#' are treated as commented out and skipped.
+Sequence::Sequence(ifstream& fastaFile, string& extraInfo, bool getInfo){
+    try {
+        m = MothurOut::getInstance();
+        initialize();
+        fastaFile >> name;
+        extraInfo = "";
+
+        if (name.length() != 0) {
+
+            name = name.substr(1);   //drop the leading marker character of the header token
+
+            string sequence;
+
+            //read comments
+            while ((name[0] == '#') && fastaFile) {
+                while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
+                sequence = getCommentString(fastaFile);
+
+                if (fastaFile) {
+                    fastaFile >> name;
+                    name = name.substr(1);
+                }else {
+                    name = "";
+                    break;
+                }
+            }
+
+            //read info after sequence name
+            while (!fastaFile.eof()) {
+                char c;
+                //stop when the read itself fails (end of file), so that no end-of-file
+                //sentinel character is appended to extraInfo
+                if (!fastaFile.get(c)) { break; }
+                if (c == 10 || c == 13){ break; }
+                extraInfo += c;
+            }
+
+            int numAmbig = 0;
+            sequence = getSequenceString(fastaFile, numAmbig);
+
+            setAligned(sequence);
+            //setUnaligned removes any gap characters for us
+            setUnaligned(sequence);
+
+            if ((numAmbig / (float) numBases) > 0.25) { m->mothurOut("[WARNING]: We found more than 25% of the bases in sequence " + name + " to be ambiguous. Mothur is not setup to process protein sequences."); m->mothurOutEndLine(); }
+
+        }else{ m->mothurOut("Error in reading your fastafile, at position " + toString(fastaFile.tellg()) + ". Blank name."); m->mothurOutEndLine(); }
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "Sequence", "Sequence");
+        exit(1);
+    }
+}
+//********************************************************************************************************************
+//this function will jump over commented out sequences, but if the last sequence in a file is commented out it makes a blank seq
Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){
try {
m = MothurOut::getInstance();
startPos = start;
}
+//********************************************************************************************************************
+// Changes every character before position start to '.', then keeps writing '.' over
+// whatever follows until the first alphabetic character, and refreshes the unaligned
+// string from the result. Always returns 0.
+int Sequence::filterToPos(int start){
+
+    if (start > aligned.length()) {
+        m->mothurOut("[ERROR]: start to large.\n");
+        start = aligned.length();
+    }
+
+    const int firstKept = start - 1;
+
+    //blank out everything ahead of the kept position
+    int pos = 0;
+    while (pos < firstKept) {
+        aligned[pos] = '.';
+        pos++;
+    }
+
+    //things like ......----------AT become ................AT
+    pos = firstKept;
+    while ((pos < aligned.length()) && !isalpha(aligned[pos])) {
+        aligned[pos] = '.';
+        pos++;
+    }
+
+    setUnaligned(aligned);
+
+    return 0;
+
+}
+//********************************************************************************************************************
+
+// Changes every character from position end onward to '.', then walks backwards from
+// just before end writing '.' over whatever it finds until the first alphabetic
+// character, and refreshes the unaligned string from the result. Always returns 0.
+int Sequence::filterFromPos(int end){
+
+    if (end > aligned.length()) { end = aligned.length(); m->mothurOut("[ERROR]: end to large.\n"); }
+
+    for(int j = end; j < aligned.length(); j++) {
+        aligned[j] = '.';
+    }
+
+    //mirror image of filterToPos: things like AT----------...... become AT................
+    //The scan has to run while j >= 0; a condition of j < 0 would never execute the body.
+    //Everything at or past end is already '.', so the scan can begin at end-1.
+    for(int j = end-1; j >= 0; j--) {
+        if (isalpha(aligned[j])) { break; }
+        else { aligned[j] = '.'; }
+    }
+
+    setUnaligned(aligned);
+
+    return 0;
+}
//********************************************************************************************************************
int Sequence::getEndPos(){
//********************************************************************************************************************
void Sequence::padFromPos(int end){
-
+ cout << end << '\t' << endPos << endl;
for(int j = end; j < endPos; j++) {
aligned[j] = '.';
}
Sequence();
Sequence(string, string);
Sequence(ifstream&);
+ Sequence(ifstream&, string&, bool);
Sequence(istringstream&);
- Sequence(const Sequence& se) : name(se.name), unaligned(se.unaligned), aligned(se.aligned), pairwise(se.pairwise), numBases(se.numBases), startPos(se.startPos), endPos(se.endPos),
- alignmentLength(se.alignmentLength), isAligned(se.isAligned), longHomoPolymer(se.longHomoPolymer), ambigBases(se.ambigBases) { m = MothurOut::getInstance(); }
-
//these constructors just set the unaligned string to save space
Sequence(string, string, string);
Sequence(ifstream&, string);
int getEndPos();
void padToPos(int);
void padFromPos(int);
+ int filterToPos(int); //any character before the pos is changed to . and aligned and unaligned strings changed
+ int filterFromPos(int); //any character after the pos is changed to . and aligned and unaligned strings changed
int getAlignLength();
int getAmbigBases();
void removeAmbigBases();
}
}else {
+ //m->mothurOut("Group " + g + " contains " + toString(seqForThisGroup.size()) + " unique seqs.\n");
for (int i = 0; i < seqForThisGroup.size(); i++) {
if(m->control_pressed) { out.close(); m->mothurRemove(filename); return 1; }
}else {
//add / to name if needed
string lastChar = output.substr(output.length()-1);
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if (lastChar != "/") { output += "/"; }
#else
if (lastChar != "\\") { output += "\\"; }
}else {
//add / to name if needed
string lastChar = input.substr(input.length()-1);
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if (lastChar != "/") { input += "/"; }
#else
if (lastChar != "\\") { input += "\\"; }
}else {
//add / to name if needed
string lastChar = tempdefault.substr(tempdefault.length()-1);
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if (lastChar != "/") { tempdefault += "/"; }
#else
if (lastChar != "\\") { tempdefault += "\\"; }
CommandParameter psff("sff", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(psff);
CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(paccnos);
CommandParameter psfftxt("sfftxt", "String", "", "", "", "", "",false,false); parameters.push_back(psfftxt);
- CommandParameter pflow("flow", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflow);
+ CommandParameter pflow("flow", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pflow);
CommandParameter ptrim("trim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(ptrim);
CommandParameter pfasta("fasta", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pfasta);
CommandParameter pqfile("name", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqfile);
helpString += "The sff parameter allows you to enter the sff file you would like to extract data from. You may enter multiple files by separating them by -'s.\n";
helpString += "The fasta parameter allows you to indicate if you would like a fasta formatted file generated. Default=True. \n";
helpString += "The qfile parameter allows you to indicate if you would like a quality file generated. Default=True. \n";
- helpString += "The flow parameter allows you to indicate if you would like a flowgram file generated. Default=False. \n";
+ helpString += "The flow parameter allows you to indicate if you would like a flowgram file generated. Default=True. \n";
helpString += "The sfftxt parameter allows you to indicate if you would like a sff.txt file generated. Default=False. \n";
helpString += "If you want to parse an existing sfftxt file into flow, fasta and quality file, enter the file name using the sfftxt parameter. \n";
helpString += "The trim parameter allows you to indicate if you would like a sequences and quality scores trimmed to the clipQualLeft and clipQualRight values. Default=True. \n";
temp = validParameter.validFile(parameters, "fasta", false); if (temp == "not found"){ temp = "T"; }
fasta = m->isTrue(temp);
- temp = validParameter.validFile(parameters, "flow", false); if (temp == "not found"){ temp = "F"; }
+ temp = validParameter.validFile(parameters, "flow", false); if (temp == "not found"){ temp = "T"; }
flow = m->isTrue(temp);
temp = validParameter.validFile(parameters, "trim", false); if (temp == "not found"){ temp = "T"; }
//**********************************************************************************************************************
int SffInfoCommand::execute(){
try {
-
if (abort == true) { if (calledHelp) { return 0; } return 2; }
for (int s = 0; s < filenames.size(); s++) {
ofstream outSfftxt, outFasta, outQual, outFlow;
string outFastaFileName, outQualFileName;
+ string rootName = outputDir + m->getRootName(m->getSimpleName(input));
+ if(rootName.find_last_of(".") == rootName.npos){ rootName += "."; }
+
string sfftxtFileName = outputDir + m->getRootName(m->getSimpleName(input)) + "sff.txt";
string outFlowFileName = outputDir + m->getRootName(m->getSimpleName(input)) + "flow";
if (trim) {
//read data
seqRead read;
readSeqData(in, read, header.numFlowsPerRead, readheader.numBases);
-
+ bool okay = sanityCheck(readheader, read);
+ if (!okay) { break; }
+
//if you have provided an accosfile and this seq is not in it, then dont print
if (seqNames.size() != 0) { if (seqNames.count(readheader.name) == 0) { print = false; } }
in.read(buffer, 2);
read.flowgram[i] = be_int2(*(unsigned short *)(&buffer));
}
-
+
//read flowIndex
read.flowIndex.resize(numBases);
for (int i = 0; i < numBases; i++) {
exit(1);
}
}
-
+//**********************************************************************************************************************
+bool SffInfoCommand::sanityCheck(Header& header, seqRead& read) { // returns false (after printing a warning) when a clip position exceeds the number of bases or quality scores read
+ try {
+ bool okay = true;
+ string message = "[WARNING]: Your sff file may be corrupted! Sequence: " + header.name + "\n"; // built up front; only printed if a check below fails
+
+ if (header.clipQualLeft > read.bases.length()) { // left clip beyond the bases read
+ okay = false; message += "Clip Qual Left = " + toString(header.clipQualLeft) + ", but we only read " + toString(read.bases.length()) + " bases.\n";
+ }
+ if (header.clipQualRight > read.bases.length()) { // right clip beyond the bases read
+ okay = false; message += "Clip Qual Right = " + toString(header.clipQualRight) + ", but we only read " + toString(read.bases.length()) + " bases.\n";
+ }
+ if (header.clipQualLeft > read.qualScores.size()) { // left clip beyond the quality scores read
+ okay = false; message += "Clip Qual Left = " + toString(header.clipQualLeft) + ", but we only read " + toString(read.qualScores.size()) + " quality scores.\n";
+ }
+ if (header.clipQualRight > read.qualScores.size()) { // right clip beyond the quality scores read
+ okay = false; message += "Clip Qual Right = " + toString(header.clipQualRight) + ", but we only read " + toString(read.qualScores.size()) + " quality scores.\n";
+ }
+
+ if (okay == false) {
+ m->mothurOut(message); m->mothurOutEndLine(); // one combined message listing every failed check
+ }
+
+ return okay;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "SffInfoCommand", "sanityCheck");
+ exit(1);
+ }
+}
//**********************************************************************************************************************
int SffInfoCommand::printSffTxtSeqData(ofstream& out, seqRead& read, Header& header) {
try {
-
out << "Flowgram: ";
for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << (read.flowgram[i]/(float)100) << '\t'; }
//**********************************************************************************************************************
int SffInfoCommand::printFastaSeqData(ofstream& out, seqRead& read, Header& header) {
try {
-
string seq = read.bases;
- if (trim) {
+ if (trim) {
if(header.clipQualRight < header.clipQualLeft){
seq = "NNNN";
}
bool abort, fasta, qual, trim, flow, sfftxt, hasAccnos;
int mycount;
set<string> seqNames;
-
+
//extract sff file functions
int extractSffInfo(string, string);
int readCommonHeader(ifstream&, CommonHeader&);
int printQualSeqData(ofstream&, seqRead&, Header&);
int readAccnosFile(string);
int parseSffTxt();
-
+ bool sanityCheck(Header&, seqRead&);
+
//parsesfftxt file functions
int parseHeaderLineToInt(ifstream&);
vector<unsigned short> parseHeaderLineToFloatVector(ifstream&, int);
S12 = number of shared OTUs in A and B
This estimator was changed to reflect Caldwell's changes, eliminating the nrare / nrare - 1 */
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(i);
tempB = shared[1]->getAbundance(i);
data.resize(1,0);
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(i);
tempB = shared[1]->getAbundance(i);
sumSharedAB = the sum of the minimum otus int all shared otus in AB.
*/
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(i);
tempB = shared[1]->getAbundance(i);
//create and initialize trees to 0.
initialTree(numGroups);
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//get bin values and calc shared
bool sharedByAll = true;
temp.clear();
m->openOutputFile(outputMisMatchName, outMisMatch);
- map<string, string> listNames;
- map<string, string>::iterator itList;
+ set<string> listNames;
+ set<string>::iterator itList;
//go through list and if group returns "not found" output it
for (int i = 0; i < SharedList->getNumBins(); i++) {
string names = SharedList->get(i);
- while (names.find_first_of(',') != -1) {
- string name = names.substr(0,names.find_first_of(','));
- names = names.substr(names.find_first_of(',')+1, names.length());
+ vector<string> binNames;
+ m->splitAtComma(names, binNames);
+
+ for (int j = 0; j < binNames.size(); j++) {
+ string name = binNames[j];
string group = groupMap->getGroup(name);
if(group == "not found") { outMisMatch << name << endl; }
itList = listNames.find(name);
if (itList != listNames.end()) { m->mothurOut(name + " is in your list file more than once. Sequence names must be unique. please correct."); m->mothurOutEndLine(); }
- else { listNames[name] = name; }
+ else { listNames.insert(name); }
}
-
- //get last name
- string group = groupMap->getGroup(names);
- if(group == "not found") { outMisMatch << names << endl; }
-
- itList = listNames.find(names);
- if (itList != listNames.end()) { m->mothurOut(names + " is in your list file more than once. Sequence names must be unique. please correct."); m->mothurOutEndLine(); }
- else { listNames[names] = names; }
-
}
outMisMatch.close();
string names = SharedList->get(i);
- while (names.find_first_of(',') != -1) {
- string name = names.substr(0,names.find_first_of(','));
- names = names.substr(names.find_first_of(',')+1, names.length());
+ vector<string> binNames;
+ m->splitAtComma(names, binNames);
+
+ for (int j = 0; j < binNames.size(); j++) {
+
+ string name = binNames[j];
itList = namesInList.find(name);
if (itList != namesInList.end()) { m->mothurOut(name + " is in your list file more than once. Sequence names must be unique. please correct."); m->mothurOutEndLine(); }
namesInList[name] = name;
}
-
- itList = namesInList.find(names);
- if (itList != namesInList.end()) { m->mothurOut(names + " is in your list file more than once. Sequence names must be unique. please correct."); m->mothurOutEndLine(); }
-
- //get last name
- namesInList[names] = names;
}
//get names of sequences in groupfile
int error = 0;
vector<string> groupMapsSeqs = groupMap->getNamesSeqs();
-
+
set<string> groupNamesSeqs;
for(int i = 0; i < groupMapsSeqs.size(); i++) {
groupNamesSeqs.insert(groupMapsSeqs[i]);
}
-
//go through list and if group returns "not found" output it
for (int i = 0; i < SharedList->getNumBins(); i++) {
if (m->control_pressed) { return 0; }
data.resize(1,0);
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(i);
tempB = shared[1]->getAbundance(i);
data.resize(1,0);
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(i);
tempB = shared[1]->getAbundance(i);
data.resize(1,0);
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(i);
tempB = shared[1]->getAbundance(i);
data.resize(1,0);
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(i);
tempB = shared[1]->getAbundance(i);
morhorn = 0.0; sumSharedA = 0.0; sumSharedB = 0.0; a = 0.0; b = 0.0; d = 0.0;
//get the total values we need to calculate the theta denominator sums
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
Atotal += shared[0]->getAbundance(i);
Btotal += shared[1]->getAbundance(i);
}
//calculate the denominator sums
- for (int j = 0; j < shared[0]->size(); j++) {
+ for (int j = 0; j < shared[0]->getNumBins(); j++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(j);
tempB = shared[1]->getAbundance(j);
data.resize(1,0);
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(i);
tempB = shared[1]->getAbundance(i);
exit(1);
}
}
-/***********************************************************************/
+***********************************************************************/
vector<SharedRAbundFloatVector*> SharedRAbundFloatVector::getSharedRAbundFloatVectors(){
try {
SharedUtil* util;
exit(1);
}
}
-/***********************************************************************/
+***********************************************************************/
SAbundVector SharedRAbundFloatVector::getSAbundVector() {
try {
exit(1);
}
}
-/***********************************************************************/
+***********************************************************************/
//this is not functional, not sure how to handle it yet, but I need the stub because it is a pure function
OrderVector SharedRAbundFloatVector::getOrderVector(map<string,int>* nameMap = NULL) {
try {
}
-/***********************************************************************/
+***********************************************************************/
//reads a shared file
SharedRAbundVector::SharedRAbundVector(ifstream& f) : DataVector(), maxRank(0), numBins(0), numSeqs(0) {
try {
double observed = 0;
//loop through the species in each group
- for (int k = 0; k < shared[0]->size(); k++) {
+ for (int k = 0; k < shared[0]->getNumBins(); k++) {
//if you have found a new species
if (shared[0]->getAbundance(k) != 0) { observed++; }
else if ((shared[0]->getAbundance(k) == 0) && (shared[1]->getAbundance(k) != 0)) { observed++; }
double observed = 0;
int numGroups = shared.size();
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//get bin values and set sharedByAll
bool sharedByAll = true;
for (int j = 0; j < numGroups; j++) {
data.resize(1,0);
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(i);
tempB = shared[1]->getAbundance(i);
numerator = 0.0; denominator = 0.0; thetaN = 0.0; sumSharedA = 0.0; sumSharedB = 0.0; a = 0.0; b = 0.0; d = 0.0;
//get the total values we need to calculate the theta denominator sums
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
Atotal += shared[0]->getAbundance(i);
Btotal += shared[1]->getAbundance(i);
}
//calculate the theta denominator sums
- for (int j = 0; j < shared[0]->size(); j++) {
+ for (int j = 0; j < shared[0]->getNumBins(); j++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(j);
tempB = shared[1]->getAbundance(j);
double sumPsqQ = 0;
//get the total values we need to calculate the theta denominator sums
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
Atotal += (double)shared[0]->getAbundance(i);
Btotal += (double)shared[1]->getAbundance(i);
}
//calculate the theta denominator sums
- for (int j = 0; j < shared[0]->size(); j++) {
+ for (int j = 0; j < shared[0]->getNumBins(); j++) {
//store in temps to avoid multiple repetitive function calls
pi = shared[0]->getAbundance(j) / Atotal;
qi = shared[1]->getAbundance(j) / Btotal;
#include "shhhercommand.h"
-#include "readcolumn.h"
-#include "readmatrix.hpp"
-#include "rabundvector.hpp"
-#include "sabundvector.hpp"
-#include "listvector.hpp"
-#include "cluster.hpp"
-#include "sparsematrix.hpp"
-#include <cfloat>
-
//**********************************************************************************************************************
vector<string> ShhherCommand::setParameters(){
try {
ShhherCommand::ShhherCommand(string option) {
try {
-
+
#ifdef USE_MPI
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
-
+
if(pid == 0){
#endif
-
-
abort = false; calledHelp = false;
-
//allow user to run help
if(option == "help") { help(); abort = true; calledHelp = true; }
else if(option == "citation") { citation(); abort = true; calledHelp = true;}
m->openOutputFile(compositeNamesFileName, temp);
temp.close();
}
+
+ if(flowFilesFileName != "not found"){
+ string fName;
+
+ ifstream flowFilesFile;
+ m->openInputFile(flowFilesFileName, flowFilesFile);
+ while(flowFilesFile){
+ fName = m->getline(flowFilesFile);
+
+ //test if file is valid
+ ifstream in;
+ int ableToOpen = m->openInputFile(fName, in, "noerror");
+ in.close();
+ if (ableToOpen == 1) {
+ if (inputDir != "") { //default path is set
+ string tryPath = inputDir + fName;
+ m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ fName = tryPath;
+ }
+ }
+
+ if (ableToOpen == 1) {
+ if (m->getDefaultPath() != "") { //default path is set
+ string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
+ m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ fName = tryPath;
+ }
+ }
+
+ //if you can't open it its not in current working directory or inputDir, try mothur excutable location
+ if (ableToOpen == 1) {
+ string exepath = m->argv;
+ string tempPath = exepath;
+ for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
+ exepath = exepath.substr(0, (tempPath.find_last_of('m')));
+
+ string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
+ m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ fName = tryPath;
+ }
+
+ if (ableToOpen == 1) { m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine(); }
+ else { flowFileVector.push_back(fName); }
+ m->gobble(flowFilesFile);
+ }
+ flowFilesFile.close();
+ if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
+ }
+ else{
+ flowFileVector.push_back(flowFileName);
+ }
+
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
}
}
-
#ifdef USE_MPI
}
#endif
-
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "ShhherCommand");
int tag = 1976;
MPI_Status status;
-
+
if(pid == 0){
for(int i=1;i<ncpus;i++){
getSingleLookUp(); if (m->control_pressed) { return 0; }
getJointLookUp(); if (m->control_pressed) { return 0; }
- vector<string> flowFileVector;
+ vector<string> flowFileVector;
if(flowFilesFileName != "not found"){
string fName;
-
+
ifstream flowFilesFile;
m->openInputFile(flowFilesFileName, flowFilesFile);
while(flowFilesFile){
else{
flowFileVector.push_back(flowFileName);
}
+
int numFiles = flowFileVector.size();
for(int i=1;i<ncpus;i++){
exit(1);
}
}
-
+/**************************************************************************************************/
+string ShhherCommand::createNamesFile(){ // writes one line per unique to "<flowFileName minus extension>.shhh.names" and returns that file name
+ try{
+
+ vector<string> duplicateNames(numUniques, "");
+ for(int i=0;i<numSeqs;i++){
+ duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ','; // append each sequence name, comma-terminated, to the entry of its unique
+ }
+
+ string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names"; // everything before the last '.' of flowFileName, plus the new suffix
+
+ ofstream nameFile;
+ m->openOutputFile(nameFileName, nameFile);
+
+ for(int i=0;i<numUniques;i++){
+
+ if (m->control_pressed) { break; } // stop writing early; the file is still closed and its name returned below
+
+ // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
+ nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl; // column 1 is the value mapUniqueToSeq[i] itself (the variant above wrote the name); column 2 drops the trailing comma
+ }
+
+ nameFile.close();
+ return nameFileName;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "createNamesFile");
+ exit(1);
+ }
+}
/**************************************************************************************************/
string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
return fDistFileName;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "flowDistParentFork");
+ m->errorOut(e, "ShhherCommand", "flowDistMPI");
exit(1);
}
}
+/**************************************************************************************************/
+
+void ShhherCommand::getOTUData(string listFileName){ // reads OTU membership from listFileName into otuData, nSeqsPerOTU, cumNumSeqs, aaP/aaI and seqNumber/seqIndex
+ try {
+
+ ifstream listFile;
+ m->openInputFile(listFileName, listFile);
+ string label;
+
+ listFile >> label >> numOTUs; // first two tokens of the file: a label and the OTU count
+
+ otuData.assign(numSeqs, 0);
+ cumNumSeqs.assign(numOTUs, 0);
+ nSeqsPerOTU.assign(numOTUs, 0);
+ aaP.clear();aaP.resize(numOTUs);
+
+ seqNumber.clear();
+ aaI.clear();
+ seqIndex.clear();
+
+ string singleOTU = "";
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ listFile >> singleOTU; // one whitespace-delimited token per OTU; it is split at commas below
+
+ istringstream otuString(singleOTU);
+
+ while(otuString){
+
+ string seqName = "";
+
+ for(int j=0;j<singleOTU.length();j++){ // consume the token one character at a time
+ char letter = otuString.get();
+
+ if(letter != ','){
+ seqName += letter;
+ }
+ else{
+ map<string,int>::iterator nmIt = nameMap.find(seqName); // NOTE(review): result is used without comparing to nameMap.end() - assumes every listed name is in nameMap
+ int index = nmIt->second;
+
+ nameMap.erase(nmIt); // each name is removed from nameMap once it has been assigned
+
+ otuData[index] = i;
+ nSeqsPerOTU[i]++;
+ aaP[i].push_back(index);
+ seqName = "";
+ }
+ }
+
+ map<string,int>::iterator nmIt = nameMap.find(seqName); // the last name in the token has no trailing comma, so it is handled here
+
+ int index = nmIt->second; // NOTE(review): same unchecked lookup as above
+ nameMap.erase(nmIt);
+
+ otuData[index] = i;
+ nSeqsPerOTU[i]++;
+ aaP[i].push_back(index);
+
+ otuString.get(); // extra read after the whole token is consumed; presumably puts the stream in a failed state so while(otuString) exits
+ }
+
+ sort(aaP[i].begin(), aaP[i].end());
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber.push_back(aaP[i][j]); // sorted member indices, appended OTU after OTU
+ }
+ for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
+ aaP[i].push_back(0); // pad aaP[i] with zeros up to numSeqs entries
+ }
+
+
+ }
+
+ for(int i=1;i<numOTUs;i++){
+ cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1]; // number of sequences in all preceding OTUs
+ }
+ aaI = aaP;
+ seqIndex = seqNumber;
+
+ listFile.close();
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getOTUData");
+ exit(1);
+ }
+}
-#else
-//**********************************************************************************************************************
-
-int ShhherCommand::execute(){
- try {
- if (abort == true) { return 0; }
-
- getSingleLookUp(); if (m->control_pressed) { return 0; }
- getJointLookUp(); if (m->control_pressed) { return 0; }
-
-
- vector<string> flowFileVector;
- if(flowFilesFileName != "not found"){
- string fName;
-
- ifstream flowFilesFile;
- m->openInputFile(flowFilesFileName, flowFilesFile);
- while(flowFilesFile){
- fName = m->getline(flowFilesFile);
- flowFileVector.push_back(fName);
- m->gobble(flowFilesFile);
- }
- }
- else{
- flowFileVector.push_back(flowFileName);
- }
- int numFiles = flowFileVector.size();
-
-
- for(int i=0;i<numFiles;i++){
-
- if (m->control_pressed) { break; }
-
- flowFileName = flowFileVector[i];
-
- m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
- m->mothurOut("Reading flowgrams...\n");
- getFlowData();
-
- if (m->control_pressed) { break; }
-
- m->mothurOut("Identifying unique flowgrams...\n");
- getUniques();
-
- if (m->control_pressed) { break; }
-
- m->mothurOut("Calculating distances between flowgrams...\n");
- string distFileName = createDistFile(processors);
- string namesFileName = createNamesFile();
-
- if (m->control_pressed) { break; }
-
- m->mothurOut("\nClustering flowgrams...\n");
- string listFileName = cluster(distFileName, namesFileName);
-
- if (m->control_pressed) { break; }
-
- getOTUData(listFileName);
-
- if (m->control_pressed) { break; }
-
- m->mothurRemove(distFileName);
- m->mothurRemove(namesFileName);
- m->mothurRemove(listFileName);
-
- initPyroCluster();
-
- if (m->control_pressed) { break; }
-
- double maxDelta = 0;
- int iter = 0;
-
- double begClock = clock();
- unsigned long long begTime = time(NULL);
+/**************************************************************************************************/
- m->mothurOut("\nDenoising flowgrams...\n");
- m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
-
- while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
-
- if (m->control_pressed) { break; }
-
- double cycClock = clock();
- unsigned long long cycTime = time(NULL);
- fill();
-
- if (m->control_pressed) { break; }
+void ShhherCommand::initPyroCluster(){ // resets the per-OTU and per-sequence work vectors and computes the per-processor break points
+ try{
+ if (numOTUs < processors) { processors = 1; } // fall back to a single processor when there are fewer OTUs than processors
+
+ dist.assign(numSeqs * numOTUs, 0); // one entry per (sequence, OTU) pair
+ change.assign(numOTUs, 1);
+ centroids.assign(numOTUs, -1);
+ weight.assign(numOTUs, 0);
+ singleTau.assign(numSeqs, 1.0);
+
+ nSeqsBreaks.assign(processors+1, 0);
+ nOTUsBreaks.assign(processors+1, 0);
+
+ nSeqsBreaks[0] = 0;
+ for(int i=0;i<processors;i++){ // each break advances by the truncated share count/processors
+ nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
+ nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
+ }
+ nSeqsBreaks[processors] = numSeqs; // final break is overwritten with the full count, so any remainder falls in the last slice
+ nOTUsBreaks[processors] = numOTUs;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "initPyroCluster");
+ exit(1);
+ }
+}
- calcCentroids();
-
- if (m->control_pressed) { break; }
+/**************************************************************************************************/
- maxDelta = getNewWeights(); if (m->control_pressed) { break; }
- double nLL = getLikelihood(); if (m->control_pressed) { break; }
- checkCentroids();
-
- if (m->control_pressed) { break; }
-
- calcNewDistances();
-
- if (m->control_pressed) { break; }
-
- iter++;
-
- m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
+void ShhherCommand::fill(){ // copies the first nSeqsPerOTU[i] entries of aaP[i]/aaI[i] into seqNumber/seqIndex, OTU by OTU
+ try {
+ int index = 0; // next write position in seqNumber/seqIndex
+ for(int i=0;i<numOTUs;i++){
+
+ if (m->control_pressed) { break; }
+
+ cumNumSeqs[i] = index; // offset at which OTU i's entries start
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber[index] = aaP[i][j];
+ seqIndex[index] = aaI[i][j];
+
+ index++;
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "fill");
+ exit(1);
+ }
+}
- }
-
- if (m->control_pressed) { break; }
-
- m->mothurOut("\nFinalizing...\n");
- fill();
+/**************************************************************************************************/
+
+void ShhherCommand::getFlowData(){ // reads flowFileName into seqNameVector, lengths, nameMap and flowDataIntI, and sets numSeqs
+ try{
+ ifstream flowFile;
+ m->openInputFile(flowFileName, flowFile);
+
+ string seqName;
+ seqNameVector.clear();
+ lengths.clear();
+ flowDataIntI.clear();
+ nameMap.clear();
+
+
+ int currentNumFlowCells;
+
+ float intensity;
+
+ flowFile >> numFlowCells; // first token of the file; used below as the number of intensities read per sequence
+ int index = 0;//pcluster
+ while(!flowFile.eof()){
+
+ if (m->control_pressed) { break; }
+
+ flowFile >> seqName >> currentNumFlowCells; // each record starts with a name and that sequence's own flow count
+ lengths.push_back(currentNumFlowCells);
+
+ seqNameVector.push_back(seqName);
+ nameMap[seqName] = index++;//pcluster
+
+ for(int i=0;i<numFlowCells;i++){
+ flowFile >> intensity;
+ if(intensity > 9.99) { intensity = 9.99; } // cap each intensity at 9.99
+ int intI = int(100 * intensity + 0.0001); // scale by 100 and truncate; the small offset presumably guards against float rounding just below an integer
+ flowDataIntI.push_back(intI);
+ }
+ m->gobble(flowFile);
+ }
+ flowFile.close();
+
+ numSeqs = seqNameVector.size();
+
+ for(int i=0;i<numSeqs;i++){
+
+ if (m->control_pressed) { break; }
+
+ int iNumFlowCells = i * numFlowCells; // start of sequence i's row in the flat flowDataIntI vector
+ for(int j=lengths[i];j<numFlowCells;j++){ // zero every flow at or past this sequence's recorded length
+ flowDataIntI[iNumFlowCells + j] = 0;
+ }
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getFlowData");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
+ // For sequences [startSeq, stopSeq): refreshes distances to changed centroids and rebuilds otuIndex / seqIndex / singleTau as three parallel lists holding every tau above MIN_TAU.
+ try{
+ vector<double> newTau(numOTUs,0);
+ vector<double> norms(numSeqs, 0);
+ otuIndex.clear();
+ seqIndex.clear();
+ singleTau.clear();
+ // Per sequence: find the smallest distance to any OTU above MIN_WEIGHT, then convert distances into normalized taus.
+ for(int i=startSeq;i<stopSeq;i++){
if (m->control_pressed) { break; }
- setOTUs();
+ double offset = 1e8; // running minimum distance for sequence i
+ int indexOffset = i * numOTUs; // start of sequence i's row in the flattened dist array
- if (m->control_pressed) { break; }
+ for(int j=0;j<numOTUs;j++){
+ // The distance is recomputed only when OTU j's centroid changed (change[j] == 1); otherwise the stored value is reused.
+ if(weight[j] > MIN_WEIGHT && change[j] == 1){
+ dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
+ }
+ if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
+ offset = dist[indexOffset + j];
+ }
+ }
- vector<int> otuCounts(numOTUs, 0);
- for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
+ for(int j=0;j<numOTUs;j++){
+ if(weight[j] > MIN_WEIGHT){
+ newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j]; // offset makes (-dist + offset) <= 0; presumably guards against underflow - TODO confirm
+ norms[i] += newTau[j];
+ }
+ else{
+ newTau[j] = 0.0;
+ }
+ }
- calcCentroidsDriver(0, numOTUs); if (m->control_pressed) { break; }
- writeQualities(otuCounts); if (m->control_pressed) { break; }
- writeSequences(otuCounts); if (m->control_pressed) { break; }
- writeNames(otuCounts); if (m->control_pressed) { break; }
- writeClusters(otuCounts); if (m->control_pressed) { break; }
- writeGroups(); if (m->control_pressed) { break; }
+ for(int j=0;j<numOTUs;j++){
+ // Normalize by the row sum. NOTE(review): norms[i] stays 0 when no OTU exceeds MIN_WEIGHT, which makes this 0/0.
+ newTau[j] /= norms[i];
+ // Keep only taus above MIN_TAU; the three vectors stay index-aligned.
+ if(newTau[j] > MIN_TAU){
+ otuIndex.push_back(j);
+ seqIndex.push_back(i);
+ singleTau.push_back(newTau[j]);
+ }
+ }
- m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
- }
-
- if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
-
-
- if(compositeFASTAFileName != ""){
- outputNames.push_back(compositeFASTAFileName);
- outputNames.push_back(compositeNamesFileName);
}
-
- m->mothurOutEndLine();
- m->mothurOut("Output File Names: "); m->mothurOutEndLine();
- for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
- m->mothurOutEndLine();
-
- return 0;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "execute");
- exit(1);
- }
+ m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
+ exit(1);
+ }
}
-#endif
+
/**************************************************************************************************/
-void ShhherCommand::getFlowData(){
- try{
- ifstream flowFile;
- m->openInputFile(flowFileName, flowFile);
-
- string seqName;
- seqNameVector.clear();
- lengths.clear();
- flowDataIntI.clear();
- nameMap.clear();
-
-
- int currentNumFlowCells;
+void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
+    // For every sequence in [startSeq, stopSeq): refresh its distance to each OTU whose
+    // centroid changed, convert the distances into normalized taus, and record every
+    // tau above MIN_TAU in singleTau together with its slot in aaP / aaI.
+    // nSeqsPerOTU is reset first and recounted while the entries are recorded.
+    try{
+        int numKept = 0;
+        vector<double> tauRow(numOTUs, 0);
+        vector<double> rowSums(numSeqs, 0);
+        nSeqsPerOTU.assign(numOTUs, 0);
+
+        for(int seq = startSeq; seq < stopSeq; seq++){
+
+            if (m->control_pressed) { break; }
+
+            const int rowStart = seq * numOTUs;   // row of this sequence in the flattened dist array
+            double minDist = 1e8;
+
+            // Refresh stale distances and track the smallest one among sufficiently weighted OTUs.
+            for(int otu = 0; otu < numOTUs; otu++){
+                if(!(weight[otu] > MIN_WEIGHT)) { continue; }
+
+                if(change[otu] == 1){
+                    dist[rowStart + otu] = getDistToCentroid(centroids[otu], seq, lengths[seq]);
+                }
+                if(dist[rowStart + otu] < minDist){
+                    minDist = dist[rowStart + otu];
+                }
+            }
+
+            // Unnormalized tau per OTU; OTUs at or below MIN_WEIGHT contribute nothing.
+            for(int otu = 0; otu < numOTUs; otu++){
+                tauRow[otu] = 0.0;
+                if(weight[otu] > MIN_WEIGHT){
+                    tauRow[otu] = exp(sigma * (-dist[rowStart + otu] + minDist)) * weight[otu];
+                    rowSums[seq] += tauRow[otu];
+                }
+            }
+
+            // Normalize, then keep the entries that exceed MIN_TAU.
+            for(int otu = 0; otu < numOTUs; otu++){
+                tauRow[otu] /= rowSums[seq];
+
+                if(tauRow[otu] > MIN_TAU){
+                    const int slot = numKept;
+                    numKept++;
+
+                    singleTau.resize(numKept, 0);
+                    seqNumber.resize(numKept, 0);
+                    seqIndex.resize(numKept, 0);
+
+                    singleTau[slot] = tauRow[otu];
+
+                    const int column = nSeqsPerOTU[otu];
+                    aaP[otu][column] = slot;
+                    aaI[otu][column] = seq;
+                    nSeqsPerOTU[otu] = column + 1;
+                }
+            }
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::setOTUs(){
+    // Hard-assigns every sequence to the OTU holding its largest tau, then rebuilds
+    // nSeqsPerOTU / aaP / aaI for that one-OTU-per-sequence assignment and calls fill().
+    try {
+        // Dense (sequence x OTU) view of the sparse tau entries; missing entries stay 0.
+        vector<double> denseTau(numOTUs * numSeqs, 0.0000);
+
+        for(int otu = 0; otu < numOTUs; otu++){
+
+            if (m->control_pressed) { break; }
+
+            for(int member = 0; member < nSeqsPerOTU[otu]; member++){
+                const int slot = cumNumSeqs[otu] + member;
+                denseTau[seqIndex[slot] * numOTUs + otu] = singleTau[seqNumber[slot]];
+            }
+        }
+
+        // Strict '>' keeps the lowest-numbered OTU on ties.
+        for(int seq = 0; seq < numSeqs; seq++){
+            const int rowStart = seq * numOTUs;
+            int bestOTU = -1;
+            double bestTau = -1.0000;
+
+            for(int otu = 0; otu < numOTUs; otu++){
+                const double candidate = denseTau[rowStart + otu];
+                if(candidate > bestTau){
+                    bestTau = candidate;
+                    bestOTU = otu;
+                }
+            }
+
+            otuData[seq] = bestOTU;
+        }
+
+        nSeqsPerOTU.assign(numOTUs, 0);
+
+        // Each sequence now belongs to exactly one OTU with tau 1; the first numSeqs
+        // entries of dist are zeroed alongside.
+        for(int seq = 0; seq < numSeqs; seq++){
+            const int owner = otuData[seq];
+
+            singleTau[seq] = 1.0000;
+            dist[seq] = 0.0000;
+
+            const int column = nSeqsPerOTU[owner];
+            aaP[owner][column] = seq;
+            aaI[owner][column] = seq;
+            nSeqsPerOTU[owner] = column + 1;
+        }
+        fill();
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "setOTUs");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::getUniques(){
+ try{
+ // Groups sequences whose rounded flowgrams agree over their shared length into "unique" flowgrams and fills the seq<->unique maps, per-unique counts and lengths.
+ // Also fills flowDataPrI with getProbIntensity() of every raw flow value.
+ numUniques = 0;
+ uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
+ uniqueCount.assign(numSeqs, 0); // anWeights
+ uniqueLengths.assign(numSeqs, 0);
+ mapSeqToUnique.assign(numSeqs, -1);
+ mapUniqueToSeq.assign(numSeqs, -1);
+ // NOTE(review): the local buffer below is filled and resized but never read inside this function.
+ vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
+
+ for(int i=0;i<numSeqs;i++){
+
+ if (m->control_pressed) { break; }
+ // index counts the uniques tried so far; it equals numUniques after the search only when none matched.
+ int index = 0;
+ // Round each stored flow value to the nearest multiple of 100 and express it in units of 100.
+ vector<short> current(numFlowCells);
+ for(int j=0;j<numFlowCells;j++){
+ current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
+ }
+
+ for(int j=0;j<numUniques;j++){
+ int offset = j * numFlowCells;
+ bool toEnd = 1;
+ // Only the first min(lengths[i], uniqueLengths[j]) flows are compared.
+ int shorterLength;
+ if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
+ else { shorterLength = uniqueLengths[j]; }
+
+ for(int k=0;k<shorterLength;k++){
+ if(current[k] != uniqueFlowgrams[offset + k]){
+ toEnd = 0;
+ break;
+ }
+ }
+ // Match: attach sequence i to unique j; the unique's recorded length grows to the longest member.
+ if(toEnd){
+ mapSeqToUnique[i] = j;
+ uniqueCount[j]++;
+ index = j;
+ if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
+ break;
+ }
+ index++;
+ }
+ // No existing unique matched: sequence i starts a new unique flowgram.
+ if(index == numUniques){
+ uniqueLengths[numUniques] = lengths[i];
+ uniqueCount[numUniques] = 1;
+ mapSeqToUnique[i] = numUniques;//anMap
+ mapUniqueToSeq[numUniques] = i;//anF
+
+ for(int k=0;k<numFlowCells;k++){
+ uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
+ uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
+ }
+
+ numUniques++;
+ }
+ }
+ uniqueFlowDataIntI.resize(numFlowCells * numUniques);
+ uniqueLengths.resize(numUniques);
+ // Precompute getProbIntensity() for every raw flow value of every sequence.
+ flowDataPrI.resize(numSeqs * numFlowCells, 0);
+ for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getUniques");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
+    // Returns the mean, over the compared flow positions, of
+    // jointLookUp[a * NUMBINS + b] - flowDataPrI(a) - flowDataPrI(b)
+    // for the flowgrams of sequences seqA and seqB.
+    try{
+        // NOTE(review): the comparison reads lengths[seqB] but the assignment reads
+        // lengths[mapSeqToUnique[seqB]] (and seqA is looked up through mapSeqToUnique);
+        // confirm which index is intended. Behavior is kept exactly as it was.
+        int compared = lengths[mapSeqToUnique[seqA]];
+        if(lengths[seqB] < compared){ compared = lengths[mapSeqToUnique[seqB]]; }
+
+        const int rowA = seqA * numFlowCells;
+        const int rowB = seqB * numFlowCells;
+
+        float total = 0;
+
+        for(int pos = 0; pos < compared; pos++){
+
+            if (m->control_pressed) { break; }
+
+            const int intensityA = flowDataIntI[rowA + pos];
+            const int intensityB = flowDataIntI[rowB + pos];
+            const float probA = flowDataPrI[rowA + pos];
+            const float probB = flowDataPrI[rowB + pos];
+
+            total += jointLookUp[intensityA * NUMBINS + intensityB] - probA - probB;
+        }
+
+        // The divisor is the full compared length even if the loop stopped early.
+        total /= (float) compared;
+        return total;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************/
+
+string ShhherCommand::cluster(string distFileName, string namesFileName){
+ try {
+ // Reads distFileName (via ReadColumnMatrix) and namesFileName, clusters with CompleteLinkage ("furthest") up to cutoff, writes the list file and returns its name.
+ ReadMatrix* read = new ReadColumnMatrix(distFileName);
+ read->setCutoff(cutoff);
+
+ NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
+ clusterNameMap->readMap();
+ read->read(clusterNameMap);
+ // The list and matrix fetched here are deleted by this function at the end.
+ ListVector* list = read->getListVector();
+ SparseMatrix* matrix = read->getMatrix();
+ // The reader and the name map are released as soon as list and matrix have been fetched.
+ delete read;
+ delete clusterNameMap;
+
+ RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
+
+ Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
+ string tag = cluster->getTag();
+ // Keep updating while the matrix's smallest distance is within the cutoff and nodes remain.
+ double clusterCutoff = cutoff;
+ while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
+
+ if (m->control_pressed) { break; }
+
+ cluster->update(clusterCutoff);
+ }
+
+ list->setLabel(toString(cutoff));
+ // NOTE(review): the list file name is derived from flowFileName alone (extension replaced); outputDir is not consulted here.
+ string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+ ofstream listFile;
+ m->openOutputFile(listFileName, listFile);
+ list->print(listFile);
+ listFile.close();
+ // NOTE(review): these deletes are not reached if anything above throws (the handler below exits the process).
+ delete matrix; delete cluster; delete rabund; delete list;
+
+ return listFileName;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "cluster");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::calcCentroidsDriver(int start, int finish){
+
+ //this function gets the most likely homopolymer length at a flow position for a group of sequences
+ //within an otu
+ // For each OTU in [start, finish) it picks as centroid the member unique flowgram with the smallest tau-weighted summed distance, and sets change[i] when the centroid differs from before.
+ try{
+
+ for(int i=start;i<finish;i++){
+
+ if (m->control_pressed) { break; }
+ // count accumulates the OTU's total tau; position counts the distinct unique flowgrams found among its members.
+ double count = 0;
+ int position = 0;
+ int minFlowGram = 100000000; // sentinel; NOTE(review): still used as an index into anL below if no adF entry is below minFlowValue
+ double minFlowValue = 1e8;
+ change[i] = 0; //FALSE
+ // Total tau currently assigned to OTU i.
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ count += singleTau[seqNumber[cumNumSeqs[i] + j]];
+ }
+ // Only OTUs with members and total tau above MIN_COUNT get a centroid; all others are reset to -1 below.
+ if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
+ vector<double> adF(nSeqsPerOTU[i]);
+ vector<int> anL(nSeqsPerOTU[i]);
+ // Pass 1: collect in anL the distinct unique-flowgram ids of the members (linear search per member).
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ int nIU = mapSeqToUnique[nI];
+
+ int k;
+ for(k=0;k<position;k++){
+ if(nIU == anL[k]){
+ break;
+ }
+ }
+ if(k == position){
+ anL[position] = nIU;
+ adF[position] = 0.0000;
+ position++;
+ }
+ }
+ // Pass 2: for every candidate, add each member's distance to it weighted by that member's tau.
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+
+ double tauValue = singleTau[seqNumber[index]];
+
+ for(int k=0;k<position;k++){
+ double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
+ adF[k] += dist * tauValue;
+ }
+ }
+ // Choose the candidate with the smallest accumulated weighted distance (first one wins ties).
+ for(int j=0;j<position;j++){
+ if(adF[j] < minFlowValue){
+ minFlowGram = j;
+ minFlowValue = adF[j];
+ }
+ }
+ // Flag a change only when the winner differs from the current centroid.
+ if(centroids[i] != anL[minFlowGram]){
+ change[i] = 1;
+ centroids[i] = anL[minFlowGram];
+ }
+ }
+ else if(centroids[i] != -1){
+ change[i] = 1;
+ centroids[i] = -1;
+ }
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
+    // Mean singleLookUp value over the first `length` flow positions, pairing the
+    // unique flowgram `cent` with the raw intensities of sequence `flow`.
+    // As before, a `length` of 0 divides by zero.
+    try{
+        const int centStart = cent * numFlowCells;
+        const int flowStart = flow * numFlowCells;
+
+        double total = 0;
+
+        for(int pos = 0; pos < length; pos++){
+            total += singleLookUp[uniqueFlowgrams[centStart + pos] * NUMBINS + flowDataIntI[flowStart + pos]];
+        }
+
+        return total / (double)length;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "getDistToCentroid");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+double ShhherCommand::getNewWeights(){
+    // Recomputes each OTU weight as the sum of the taus of its members and returns
+    // the largest absolute change seen across the OTUs that were processed.
+    try{
+        double largestChange = 0;
+
+        for(int otu = 0; otu < numOTUs; otu++){
+
+            if (m->control_pressed) { break; }
+
+            const double previous = weight[otu];
+            weight[otu] = 0;
+
+            const int members = nSeqsPerOTU[otu];
+            for(int member = 0; member < members; member++){
+                weight[otu] += singleTau[seqNumber[cumNumSeqs[otu] + member]];
+            }
+
+            const double delta = fabs(weight[otu] - previous);
+            if(delta > largestChange){ largestChange = delta; }
+        }
+        return largestChange;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "getNewWeights");
+        exit(1);
+    }
+}
+
+ /**************************************************************************************************/
+
+double ShhherCommand::getLikelihood(){
+    // Returns the negative log-likelihood of the current model:
+    //   sum over sequences of -log(P[s])  -  numSeqs * log(sigma)
+    // where P[s] accumulates weight[otu] * exp(-dist * sigma) over every OTU entry
+    // that references sequence s.
+    //
+    // Changes from the previous version: the error handler now reports this
+    // function's own name (it used to say "getNewWeights"), and the unused
+    // effNumOTUs count and `hold` string were removed.
+    try{
+
+        vector<long double> P(numSeqs, 0);
+
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            for(int j=0;j<nSeqsPerOTU[i];j++){
+                int index = cumNumSeqs[i] + j;
+                int nI = seqIndex[index];
+                double singleDist = dist[seqNumber[index]];
+
+                P[nI] += weight[i] * exp(-singleDist * sigma);
+            }
+        }
+
+        double nLL = 0.00;
+        for(int i=0;i<numSeqs;i++){
+            // A sequence with no contribution would give log(0); substitute a tiny positive value.
+            if(P[i] == 0){ P[i] = DBL_EPSILON; }
+
+            nLL += -log(P[i]);
+        }
+
+        nLL = nLL -(double)numSeqs * log(sigma);
+
+        return nLL;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "getLikelihood");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::checkCentroids(){
+    // Merges OTUs that share the same centroid: the lowest-numbered OTU keeps the
+    // centroid and absorbs the weight of each later duplicate, whose centroid is
+    // set to -1 and whose weight is set to 0.
+    try{
+        // live[k] != 0 only while OTU k has a centroid, weight of at least MIN_WEIGHT,
+        // and has not been merged into an earlier OTU.
+        vector<char> live(numOTUs, 1);
+
+        for(int otu = 0; otu < numOTUs; otu++){
+            if(centroids[otu] == -1 || weight[otu] < MIN_WEIGHT){
+                live[otu] = 0;
+            }
+        }
+
+        for(int keeper = 0; keeper < numOTUs; keeper++){
+
+            if (m->control_pressed) { break; }
+
+            if(!live[keeper]) { continue; }
+
+            for(int later = keeper + 1; later < numOTUs; later++){
+                if(!live[later]) { continue; }
+                if(centroids[later] != centroids[keeper]) { continue; }
+
+                live[later] = 0;
+                centroids[later] = -1;
+
+                weight[keeper] += weight[later];
+                weight[later] = 0.0;
+            }
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "checkCentroids");
+        exit(1);
+    }
+}
+ /**************************************************************************************************/
+
+
+
+void ShhherCommand::writeQualities(vector<int> otuCounts){
+    // Writes the "<root>shhh.qual" file: for every OTU with otuCounts[i] > 0, a '>'
+    // header line followed by one quality score per called base (the first four
+    // scores are skipped). The file name is appended to outputNames.
+    //
+    // Changes from the previous version (results are identical whenever the old
+    // code stayed inside its buffer):
+    //  - the per-OTU quality buffer now grows when more than its initial 1024
+    //    scores are produced, where the old code wrote past the end;
+    //  - the output loop stops at the end of the buffer, where the old code kept
+    //    reading until it met a -1 (including when the buffer was empty or full).
+    // NOTE(review): the header uses seqNameVector[mapUniqueToSeq[i]] with i being an
+    // OTU index, while writeSequences names the same OTU via aaI[i][0]; confirm
+    // these are meant to agree.
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+        string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
+
+        ofstream qualityFile;
+        m->openOutputFile(qualityFileName, qualityFile);
+
+        qualityFile.setf(ios::fixed, ios::floatfield);
+        qualityFile.setf(ios::showpoint);
+        qualityFile << setprecision(6);
+
+        vector<vector<int> > qualities(numOTUs);
+        vector<double> pr(HOMOPS, 0);
+
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            int index = 0;   // flow position
+            int base = 0;    // next free slot in qualities[i]
+
+            if(nSeqsPerOTU[i] > 0){
+                qualities[i].assign(1024, -1);   // -1 marks "no score written yet"
+
+                while(index < numFlowCells){
+                    double maxPrValue = 1e8;
+                    short maxPrIndex = -1;
+                    double count = 0.0000;
+
+                    pr.assign(HOMOPS, 0);
+
+                    // Tau-weighted sum of singleLookUp over the members, per homopolymer length s.
+                    for(int j=0;j<nSeqsPerOTU[i];j++){
+                        int lIndex = cumNumSeqs[i] + j;
+                        double tauValue = singleTau[seqNumber[lIndex]];
+                        int sequenceIndex = aaI[i][j];
+                        short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
+
+                        count += tauValue;
+
+                        for(int s=0;s<HOMOPS;s++){
+                            pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
+                        }
+                    }
+
+                    // The called homopolymer length at this flow is the centroid's value.
+                    maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
+                    maxPrValue = pr[maxPrIndex];
+
+                    if(count > MIN_COUNT){
+                        double U = 0.0000;
+                        double norm = 0.0000;
+
+                        for(int s=0;s<HOMOPS;s++){
+                            norm += exp(-(pr[s] - maxPrValue));
+                        }
+
+                        // One score per called base; U is the cumulative normalized
+                        // probability of the shorter homopolymer lengths.
+                        for(int s=1;s<=maxPrIndex;s++){
+                            int value = 0;
+                            double temp = 0.0000;
+
+                            U += exp(-(pr[s-1]-maxPrValue))/norm;
+
+                            if(U>0.00){
+                                temp = log10(U);
+                            }
+                            else{
+                                temp = -10.1;
+                            }
+                            temp = floor(-10 * temp);
+                            value = (int)floor(temp);
+                            if(value > 100){ value = 100; }
+
+                            // Grow the buffer so the store can never land outside it.
+                            if(base >= (int)qualities[i].size()){
+                                qualities[i].resize(2 * qualities[i].size(), -1);
+                            }
+                            qualities[i][base] = (int)value;
+                            base++;
+                        }
+                    }
+
+                    index++;
+                }
+            }
+
+            if(otuCounts[i] > 0){
+                qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
+
+                //need to get past the first four bases
+                for(size_t j = 4; j < qualities[i].size() && qualities[i][j] != -1; j++){
+                    qualityFile << qualities[i][j] << ' ';
+                }
+                qualityFile << endl;
+            }
+        }
+        qualityFile.close();
+        outputNames.push_back(qualityFileName);
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "writeQualities");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeSequences(vector<int> otuCounts){
+    // Writes the "<root>shhh.fasta" file: for every OTU with otuCounts[i] > 0, a
+    // header named after the OTU's first member (aaI[i][0]) and the centroid
+    // flowgram translated to bases, with the first four bases dropped. The file
+    // name is appended to outputNames and, when a composite FASTA name is set,
+    // the file is also appended to it.
+    //
+    // Change from the previous version: the unused `names` vector (numOTUs empty
+    // strings allocated on every call) was removed. Output is unchanged.
+    // NOTE(review): substr(4) throws std::out_of_range when fewer than four bases
+    // were produced, which ends the run through the handler below.
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
+        string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
+        ofstream fastaFile;
+        m->openOutputFile(fastaFileName, fastaFile);
+
+        for(int i=0;i<numOTUs;i++){
+
+            if (m->control_pressed) { break; }
+
+            int index = centroids[i];
+
+            if(otuCounts[i] > 0){
+                fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
+
+                string newSeq = "";
+
+                for(int j=0;j<numFlowCells;j++){
+
+                    // The flow order repeats every four flows; emit the base once per called copy.
+                    char base = flowOrder[j % 4];
+                    for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
+                        newSeq += base;
+                    }
+                }
+
+                fastaFile << newSeq.substr(4) << endl;
+            }
+        }
+        fastaFile.close();
+
+        outputNames.push_back(fastaFileName);
+
+        if(compositeFASTAFileName != ""){
+            m->appendFiles(fastaFileName, compositeFASTAFileName);
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "writeSequences");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeNames(vector<int> otuCounts){
+    // Writes the "<root>shhh.names" file: one line per OTU with otuCounts[i] > 0,
+    // holding the first member's name, a tab, then the comma-separated names of all
+    // members (first member again in front). The file name is appended to outputNames
+    // and the file is appended to the composite names file when one is set.
+    try {
+        string outDir = outputDir;
+        if (outputDir == "") { outDir += m->hasPath(flowFileName); }
+        const string namesPath = outDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
+        ofstream out;
+        m->openOutputFile(namesPath, out);
+
+        for(int otu = 0; otu < numOTUs; otu++){
+
+            if (m->control_pressed) { break; }
+
+            if(otuCounts[otu] <= 0) { continue; }
+
+            const string& representative = seqNameVector[aaI[otu][0]];
+            out << representative << '\t' << representative;
+
+            const int members = nSeqsPerOTU[otu];
+            for(int member = 1; member < members; member++){
+                out << ',' << seqNameVector[aaI[otu][member]];
+            }
+
+            out << endl;
+        }
+        out.close();
+        outputNames.push_back(namesPath);
+
+        if(compositeNamesFileName != ""){
+            m->appendFiles(namesPath, compositeNamesFileName);
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "writeNames");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeGroups(){
+    // Writes the "<root>shhh.groups" file: one "<sequence name>\t<file root>" line
+    // per sequence, then records the file name in outputNames.
+    try {
+        string outDir = outputDir;
+        if (outputDir == "") { outDir += m->hasPath(flowFileName); }
+
+        // The file root doubles as the group label written on every line.
+        const string root = outDir + m->getRootName(m->getSimpleName(flowFileName));
+        const string groupsPath = root + "shhh.groups";
+
+        ofstream out;
+        m->openOutputFile(groupsPath, out);
+
+        for(int seq = 0; seq < numSeqs; seq++){
+            if (m->control_pressed) { break; }
+            out << seqNameVector[seq] << '\t' << root << endl;
+        }
+        out.close();
+        outputNames.push_back(groupsPath);
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "writeGroups");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::writeClusters(vector<int> otuCounts){
+    // Writes the "<root>shhh.counts" file. For every OTU with otuCounts[i] > 0:
+    //   - an "ideal" line with the centroid flowgram translated to bases, starting at flow 8;
+    //   - one line per member with its name and its own flowgram translated to bases
+    //     (first four bases dropped);
+    //   - a blank line.
+    // The file name is appended to outputNames.
+    try {
+        string outDir = outputDir;
+        if (outputDir == "") { outDir += m->hasPath(flowFileName); }
+        const string countsPath = outDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
+        ofstream out;
+        m->openOutputFile(countsPath, out);
+
+        const string nucleotides = flowOrder;
+
+        for(int otu = 0; otu < numOTUs; otu++){
+
+            if (m->control_pressed) { break; }
+
+            if(otuCounts[otu] <= 0) { continue; }
+
+            //output the translated version of the centroid sequence for the otu
+            const int centroidRow = centroids[otu] * numFlowCells;
+
+            out << "ideal\t";
+            for(int flow = 8; flow < numFlowCells; flow++){
+                const int copies = uniqueFlowgrams[centroidRow + flow];
+                if(copies > 0){ out << string(copies, nucleotides[flow % 4]); }
+            }
+            out << endl;
+
+            for(int member = 0; member < nSeqsPerOTU[otu]; member++){
+                const int seq = aaI[otu][member];
+                out << seqNameVector[seq] << '\t';
+
+                string translated = "";
+
+                for(int flow = 0; flow < lengths[seq]; flow++){
+                    // Stored intensity is scaled by 0.01 and rounded to the nearest whole copy count.
+                    const int copies = int(0.01 * (double)flowDataIntI[seq * numFlowCells + flow] + 0.5);
+                    if(copies > 0){ translated.append(copies, nucleotides[flow % 4]); }
+                }
+                out << translated.substr(4) << endl;
+            }
+            out << endl;
+        }
+        out.close();
+        outputNames.push_back(countsPath);
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "writeClusters");
+        exit(1);
+    }
+}
+
+#else
+//**********************************************************************************************************************
+
+int ShhherCommand::execute(){ // loads both lookup tables, processes every file in flowFileVector, then prints the output file names; returns 0 on every path
+ try {
+ if (abort == true) { return 0; }
- float intensity;
+ getSingleLookUp(); if (m->control_pressed) { return 0; }
+ getJointLookUp(); if (m->control_pressed) { return 0; }
- flowFile >> numFlowCells;
- int index = 0;//pcluster
- while(!flowFile.eof()){
-
- if (m->control_pressed) { break; }
-
- flowFile >> seqName >> currentNumFlowCells;
- lengths.push_back(currentNumFlowCells);
+ int numFiles = flowFileVector.size();
+ // Never use more processes than there are flow files.
+ if (numFiles < processors) { processors = numFiles; }
+ // On the Unix-like platforms tested below the files may be split across processes; otherwise they are processed serially here.
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
+ else { createProcesses(flowFileVector); } //each process handles a contiguous range of the files
+#else
+ driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
+#endif
+ // The composite files are reported as outputs only when a composite name was set.
+ if(compositeFASTAFileName != ""){
+ outputNames.push_back(compositeFASTAFileName);
+ outputNames.push_back(compositeNamesFileName);
+ }
- seqNameVector.push_back(seqName);
- nameMap[seqName] = index++;//pcluster
+ m->mothurOutEndLine();
+ m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+ for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
+ m->mothurOutEndLine();
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "execute");
+ exit(1);
+ }
+}
+#endif
+/**************************************************************************************************/
- for(int i=0;i<numFlowCells;i++){
- flowFile >> intensity;
- if(intensity > 9.99) { intensity = 9.99; }
- int intI = int(100 * intensity + 0.0001);
- flowDataIntI.push_back(intI);
- }
- m->gobble(flowFile);
+int ShhherCommand::createProcesses(vector<string> filenames){ // splits filenames into one contiguous range per process, runs driver() on each range, then merges the per-worker composite temp files; returns 0
+ try {
+ vector<int> processIDS;
+ int process = 1;
+ int num = 0;
+ // num only receives the forked child's driver() result below and is not read afterwards.
+ //sanity check
+ if (filenames.size() < processors) { processors = filenames.size(); }
+ // NOTE(review): with an empty filenames vector processors becomes 0 and the division below divides by zero.
+ //divide the groups between the processors
+ vector<linePair> lines;
+ int numFilesPerProcessor = filenames.size() / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numFilesPerProcessor;
+ int endIndex = (i+1) * numFilesPerProcessor;
+ if(i == (processors - 1)){ endIndex = filenames.size(); } // the last range absorbs the remainder
+ lines.push_back(linePair(startIndex, endIndex));
}
- flowFile.close();
- numSeqs = seqNameVector.size();
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
- for(int i=0;i<numSeqs;i++){
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
- if (m->control_pressed) { break; }
-
- int iNumFlowCells = i * numFlowCells;
- for(int j=lengths[i];j<numFlowCells;j++){
- flowDataIntI[iNumFlowCells + j] = 0;
+ if (pid > 0) {
+ processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
+ process++;
+ }else if (pid == 0){
+ num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName + toString(getpid()) + ".temp", lines[process].start, lines[process].end); // child: handles range `process`, writing composites to pid-suffixed temp files
+ exit(0);
+ }else {
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0); // NOTE(review): exits with status 0 although this is the failure path
}
}
- }
+ //do my part
+ driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
+ // NOTE(review): wait() returns for any terminated child, not specifically processIDS[i]; temp is overwritten with that child's status.
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processIDS.size();i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+ // NOTE(review): anything a forked child appends to its own copy of outputNames is not visible in this process.
+ #else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<shhhFlowsData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1]; // NOTE(review): the array bound is not a constant expression unless processors is one; runtime-sized arrays are not standard C++
+ HANDLE hThreadArray[processors-1];
+ // NOTE(review): worker 0 gets an empty extension (it writes straight to the composite names) while the merge loop at the end expects "<processIDS[i]>.temp" for every worker, including 0.
+ //Create processor worker threads.
+ for( int i=0; i<processors-1; i++ ){
+ // Allocate memory for thread data.
+ string extension = "";
+ if (i != 0) { extension = toString(i) + ".temp"; }
+
+ shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
+ pDataArray.push_back(tempFlow);
+ processIDS.push_back(i);
+
+ hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ }
+
+ //using the main process as a worker saves time and memory
+ //do my part
+ driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+ #endif
+ // Fold each worker's temp composite files into the real composite files, then delete the temps.
+ for (int i=0;i<processIDS.size();i++) {
+ if (compositeFASTAFileName != "") {
+ m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
+ m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
+ m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
+ m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
+ }
+ }
+
+ return 0;
+
+ }
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getFlowData");
+ m->errorOut(e, "ShhherCommand", "createProcesses");
exit(1);
}
}
-
/**************************************************************************************************/
-void ShhherCommand::getSingleLookUp(){
+int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
+ try {
+ // Runs the whole per-file pipeline for filenames[start, end): read flowgrams, find uniques, compute distances, cluster, iterate the denoising loop, write outputs. Returns 0.
+ for(int i=start;i<end;i++){
+
+ if (m->control_pressed) { break; }
+
+ string flowFileName = filenames[i];
+
+ m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
+ m->mothurOut("Reading flowgrams...\n");
+ // All per-file state lives in locals declared inside this loop body and is passed to the helpers explicitly.
+ vector<string> seqNameVector;
+ vector<int> lengths;
+ vector<short> flowDataIntI;
+ vector<double> flowDataPrI;
+ map<string, int> nameMap;
+ vector<short> uniqueFlowgrams;
+ vector<int> uniqueCount;
+ vector<int> mapSeqToUnique;
+ vector<int> mapUniqueToSeq;
+ vector<int> uniqueLengths;
+ int numFlowCells;
+
+ int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("Identifying unique flowgrams...\n");
+ int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("Calculating distances between flowgrams...\n");
+ string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
+ unsigned long long begTime = time(NULL);
+ double begClock = clock();
+
+ flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
+
+ m->mothurOutEndLine();
+ m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
+
+ // The dist, names and list files built next are intermediates; they are removed once getOTUData() has read the list.
+ string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
+ createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("\nClustering flowgrams...\n");
+ string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+ cluster(listFileName, distFileName, namesFileName);
+
+ if (m->control_pressed) { break; }
+
+ vector<int> otuData;
+ vector<int> cumNumSeqs;
+ vector<int> nSeqsPerOTU;
+ vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
+ vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
+ vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
+ vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
+
+
+ int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
+
+ if (m->control_pressed) { break; }
+
+ m->mothurRemove(distFileName);
+ m->mothurRemove(namesFileName);
+ m->mothurRemove(listFileName);
+
+ vector<double> dist; //adDist - distance of sequences to centroids
+ vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
+ vector<int> centroids; //the representative flowgram for each cluster m
+ vector<double> weight;
+ vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
+ vector<int> nSeqsBreaks;
+ vector<int> nOTUsBreaks;
+ // Initial state: every centroid unset (-1) and flagged as changed, all weights 0, tau 1 for every sequence.
+ dist.assign(numSeqs * numOTUs, 0);
+ change.assign(numOTUs, 1);
+ centroids.assign(numOTUs, -1);
+ weight.assign(numOTUs, 0);
+ singleTau.assign(numSeqs, 1.0);
+ // NOTE(review): nSeqsBreaks and nOTUsBreaks are filled here but not passed to any call in this function.
+ nSeqsBreaks.assign(2, 0);
+ nOTUsBreaks.assign(2, 0);
+
+ nSeqsBreaks[0] = 0;
+ nSeqsBreaks[1] = numSeqs;
+ nOTUsBreaks[1] = numOTUs;
+
+ if (m->control_pressed) { break; }
+
+ double maxDelta = 0;
+ int iter = 0;
+
+ begClock = clock();
+ begTime = time(NULL);
+
+ m->mothurOut("\nDenoising flowgrams...\n");
+ m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
+ // Always run at least MIN_ITER iterations; after that continue while maxDelta > minDelta, bounded by maxIters unless maxIters is 0.
+ while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
+
+ if (m->control_pressed) { break; }
+
+ double cycClock = clock();
+ unsigned long long cycTime = time(NULL);
+ fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+
+ if (m->control_pressed) { break; }
+
+ calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+
+ if (m->control_pressed) { break; }
+
+ maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
+
+ if (m->control_pressed) { break; }
+
+ double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
+
+ if (m->control_pressed) { break; }
+
+ checkCentroids(numOTUs, centroids, weight);
+
+ if (m->control_pressed) { break; }
+
+ calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
+
+ if (m->control_pressed) { break; }
+
+ iter++;
+
+ m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
+
+ }
+
+ if (m->control_pressed) { break; }
+
+ m->mothurOut("\nFinalizing...\n");
+ fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+
+ if (m->control_pressed) { break; }
+
+ setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
+
+ if (m->control_pressed) { break; }
+ // The inner loop index below shadows the outer file index i; it is scoped to that one statement.
+ vector<int> otuCounts(numOTUs, 0);
+ for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
+
+ calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+
+ if (m->control_pressed) { break; }
+
+ writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
+ writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
+ writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (m->control_pressed) { break; }
+ writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (m->control_pressed) { break; }
+ writeGroups(flowFileName, numSeqs, seqNameVector); if (m->control_pressed) { break; }
+
+ m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
+ }
+ // On an interrupt, every file recorded in outputNames is removed before returning.
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
+
+ return 0;
+
+ }catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "driver");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
try{
- // these are the -log probabilities that a signal corresponds to a particular homopolymer length
- singleLookUp.assign(HOMOPS * NUMBINS, 0);
+ // Reads a flow file into the caller-supplied containers and returns the number of records read.
+ // The first token of the file is stored in numFlowCells. Every record is a name, a length and
+ // numFlowCells intensities. The four output containers are cleared before reading.
+
+ ifstream flowFile;
+
+ m->openInputFile(filename, flowFile);
+
+ string seqName;
+ int currentNumFlowCells;
+ float intensity;
+ thisSeqNameVector.clear();
+ thisLengths.clear();
+ thisFlowDataIntI.clear();
+ thisNameMap.clear();
+
+ flowFile >> numFlowCells;
+ int index = 0;//pcluster
+ while(!flowFile.eof()){
+
+ if (m->control_pressed) { break; }
+
+ // Stop as soon as a record header cannot be extracted. A failed extraction that is not
+ // end-of-file leaves eof() false forever, so without this check the loop would never
+ // terminate and would keep appending empty records.
+ if (!(flowFile >> seqName >> currentNumFlowCells)) {
+ if (!flowFile.eof()) { m->mothurOut("[WARNING]: stopped reading " + filename + " at a malformed record.\n"); }
+ break;
+ }
+ thisLengths.push_back(currentNumFlowCells);
+
+ thisSeqNameVector.push_back(seqName);
+ thisNameMap[seqName] = index++;//pcluster
+
+ // Intensities are capped at 9.99 and stored as integer hundredths.
+ for(int i=0;i<numFlowCells;i++){
+ flowFile >> intensity;
+ if(intensity > 9.99) { intensity = 9.99; }
+ int intI = int(100 * intensity + 0.0001);
+ thisFlowDataIntI.push_back(intI);
+ }
+ m->gobble(flowFile);
+ }
+ flowFile.close();
- int index = 0;
- ifstream lookUpFile;
- m->openInputFile(lookupFileName, lookUpFile);
+ int numSeqs = thisSeqNameVector.size();
- for(int i=0;i<HOMOPS;i++){
+ // Zero every stored value at or beyond each record's own length.
+ for(int i=0;i<numSeqs;i++){
if (m->control_pressed) { break; }
- float logFracFreq;
- lookUpFile >> logFracFreq;
-
- for(int j=0;j<NUMBINS;j++) {
- lookUpFile >> singleLookUp[index];
- index++;
+ int iNumFlowCells = i * numFlowCells;
+ for(int j=thisLengths[i];j<numFlowCells;j++){
+ thisFlowDataIntI[iNumFlowCells + j] = 0;
}
- }
- lookUpFile.close();
+ }
+
+ return numSeqs;
+
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getSingleLookUp");
+ m->errorOut(e, "ShhherCommand", "getFlowData");
exit(1);
}
}
-
/**************************************************************************************************/
-void ShhherCommand::getJointLookUp(){
- try{
-
- // the most likely joint probability (-log) that two intenities have the same polymer length
- jointLookUp.resize(NUMBINS * NUMBINS, 0);
+int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
+ try{
+ /* Calls calcPairwiseDist() for every pair (i, j) with j < i < stopSeq, translating both
+  * indices through mapUniqueToSeq, and writes the retained pairs to distFileName as
+  * tab-separated lines of the two translated indices and the distance. Always returns 0. */
+ ostringstream outStream;
+ outStream.setf(ios::fixed, ios::floatfield); // fixed notation; together with precision(6) below gives six decimals
+ outStream.setf(ios::dec, ios::basefield);
+ outStream.setf(ios::showpoint);
+ outStream.precision(6);
- for(int i=0;i<NUMBINS;i++){
+ int begTime = time(NULL);
+ double begClock = clock();
+ // begTime and begClock are the reference points for the elapsed values in the progress lines
+ for(int i=0;i<stopSeq;i++){
if (m->control_pressed) { break; }
- for(int j=0;j<NUMBINS;j++){
-
- double minSum = 100000000;
-
- for(int k=0;k<HOMOPS;k++){
- double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
-
- if(sum < minSum) { minSum = sum; }
- }
- jointLookUp[i * NUMBINS + j] = minSum;
+ for(int j=0;j<i;j++){
+ float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
+ // distances below 1e-6 are written as zero; distances above cutoff are not written at all
+ if(flowDistance < 1e-6){
+ outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
+ }
+ else if(flowDistance <= cutoff){
+ outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
+ }
}
+ if(i % 100 == 0){ // progress line every 100 rows: row index, elapsed time(), elapsed clock() in seconds
+ m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
+ m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
+ m->mothurOutEndLine();
+ }
+ }
+ // The buffered text is written out even when the loop above was interrupted. NOTE(review): the ofstream is not checked for a failed open.
+ ofstream distFile(distFileName.c_str());
+ distFile << outStream.str();
+ distFile.close();
+
+ if (m->control_pressed) {} // interrupted: the final progress line is skipped
+ else {
+ m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
+ m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
+ m->mothurOutEndLine();
}
+
+ return 0;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getJointLookUp");
+ m->errorOut(e, "ShhherCommand", "flowDistParentFork");
exit(1);
}
}
-
/**************************************************************************************************/
-double ShhherCommand::getProbIntensity(int intIntensity){
+float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
try{
-
- double minNegLogProb = 100000000;
-
+ int minLength = lengths[mapSeqToUnique[seqA]];
+ if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; } // NOTE(review): the test reads lengths[seqB] but the assignment reads lengths[mapSeqToUnique[seqB]]; confirm which index is intended
- for(int i=0;i<HOMOPS;i++){//loop signal strength
+ int ANumFlowCells = seqA * numFlowCells;
+ int BNumFlowCells = seqB * numFlowCells;
+ /* Returns the mean, over the first minLength flow positions, of
+  * jointLookUp[a * NUMBINS + b] - flowDataPrI(A) - flowDataPrI(B), where a and b are the
+  * flowDataIntI values of seqA and seqB at that position. Both sequences are addressed
+  * as rows of numFlowCells entries starting at seqA * numFlowCells and seqB * numFlowCells. */
+ float dist = 0;
+ // an interrupt leaves the loop early; the partial sum is still divided by minLength below
+ for(int i=0;i<minLength;i++){
if (m->control_pressed) { break; }
- float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
- if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; }
+ int flowAIntI = flowDataIntI[ANumFlowCells + i];
+ float flowAPrI = flowDataPrI[ANumFlowCells + i];
+
+ int flowBIntI = flowDataIntI[BNumFlowCells + i];
+ float flowBPrI = flowDataPrI[BNumFlowCells + i];
+ dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
}
- return minNegLogProb;
+ dist /= (float) minLength; // NOTE(review): there is no guard for minLength == 0
+ return dist;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getProbIntensity");
+ m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
exit(1);
}
}
/**************************************************************************************************/
-void ShhherCommand::getUniques(){
+int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
try{
-
-
- numUniques = 0;
+ int numUniques = 0;
uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
uniqueCount.assign(numSeqs, 0); // anWeights
uniqueLengths.assign(numSeqs, 0);
for(int j=0;j<numFlowCells;j++){
current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
}
-
+
for(int j=0;j<numUniques;j++){
int offset = j * numFlowCells;
bool toEnd = 1;
int shorterLength;
if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
else { shorterLength = uniqueLengths[j]; }
-
+
for(int k=0;k<shorterLength;k++){
if(current[k] != uniqueFlowgrams[offset + k]){
toEnd = 0;
flowDataPrI.resize(numSeqs * numFlowCells, 0);
for(int i=0;i<flowDataPrI.size();i++) { if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
+
+ return numUniques;
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "getUniques");
exit(1);
}
}
-
-/**************************************************************************************************/
-
-float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
- try{
- int minLength = lengths[mapSeqToUnique[seqA]];
- if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
-
- int ANumFlowCells = seqA * numFlowCells;
- int BNumFlowCells = seqB * numFlowCells;
-
- float dist = 0;
-
- for(int i=0;i<minLength;i++){
-
- if (m->control_pressed) { break; }
-
- int flowAIntI = flowDataIntI[ANumFlowCells + i];
- float flowAPrI = flowDataPrI[ANumFlowCells + i];
-
- int flowBIntI = flowDataIntI[BNumFlowCells + i];
- float flowBPrI = flowDataPrI[BNumFlowCells + i];
- dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
- }
-
- dist /= (float) minLength;
- return dist;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
- try{
-
- ostringstream outStream;
- outStream.setf(ios::fixed, ios::floatfield);
- outStream.setf(ios::dec, ios::basefield);
- outStream.setf(ios::showpoint);
- outStream.precision(6);
-
- int begTime = time(NULL);
- double begClock = clock();
-
- for(int i=startSeq;i<stopSeq;i++){
-
- if (m->control_pressed) { break; }
-
- for(int j=0;j<i;j++){
- float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
-
- if(flowDistance < 1e-6){
- outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
- }
- else if(flowDistance <= cutoff){
- outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
- }
- }
- if(i % 100 == 0){
- m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
- m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
- m->mothurOutEndLine();
- }
- }
-
- ofstream distFile(distFileName.c_str());
- distFile << outStream.str();
- distFile.close();
-
- if (m->control_pressed) {}
- else {
- m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
- m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
- m->mothurOutEndLine();
- }
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "flowDistParentFork");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-string ShhherCommand::createDistFile(int processors){
- try{
-//////////////////////// until I figure out the shared memory issue //////////////////////
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-#else
- processors=1;
-#endif
-//////////////////////// until I figure out the shared memory issue //////////////////////
-
- string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
-
- unsigned long long begTime = time(NULL);
- double begClock = clock();
-
- if (numSeqs < processors){ processors = 1; }
-
- if(processors == 1) { flowDistParentFork(fDistFileName, 0, numUniques); }
-
- else{ //you have multiple processors
-
- vector<int> start(processors, 0);
- vector<int> end(processors, 0);
-
- int process = 1;
- vector<int> processIDs;
-
- for (int i = 0; i < processors; i++) {
- start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
- end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
- }
-
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDs.push_back(pid); //create map from line number to pid so you can append files in correct order later
- process++;
- }else if (pid == 0){
- flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
- exit(0);
- }else {
- m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine();
- perror(" : ");
- for (int i=0;i<processIDs.size();i++) { int temp = processIDs[i]; kill (temp, SIGINT); }
- exit(0);
- }
- }
-
- //parent does its part
- flowDistParentFork(fDistFileName, start[0], end[0]);
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processIDs.size();i++) {
- int temp = processIDs[i];
- wait(&temp);
- }
-#else
- //////////////////////////////////////////////////////////////////////////////////////////////////////
- //Windows version shared memory, so be careful when passing variables through the flowDistParentForkData struct.
- //Above fork() will clone, so memory is separate, but that's not the case with windows,
- //////////////////////////////////////////////////////////////////////////////////////////////////////
-
- vector<flowDistParentForkData*> pDataArray;
- DWORD dwThreadIdArray[processors-1];
- HANDLE hThreadArray[processors-1];
-
- //Create processor worker threads.
- for(int i = 0; i < processors-1; i++){
- // Allocate memory for thread data.
- string extension = extension = toString(i) + ".temp";
-
- flowDistParentForkData* tempdist = new flowDistParentForkData((fDistFileName + extension), mapUniqueToSeq, mapSeqToUnique, lengths, flowDataIntI, flowDataPrI, jointLookUp, m, start[i+1], end[i+1], numFlowCells, cutoff, i);
- pDataArray.push_back(tempdist);
- processIDs.push_back(i);
-
- //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
- //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
- hThreadArray[i] = CreateThread(NULL, 0, MyflowDistParentForkThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
- }
-
- //parent does its part
- flowDistParentFork(fDistFileName, start[0], end[0]);
-
- //Wait until all threads have terminated.
- WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
-
- //Close all thread handles and free memory allocations.
- for(int i=0; i < pDataArray.size(); i++){
- CloseHandle(hThreadArray[i]);
- delete pDataArray[i];
- }
-
-#endif
-
- //append and remove temp files
- for (int i=0;i<processIDs.size();i++) {
- m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
- m->mothurRemove((fDistFileName + toString(processIDs[i]) + ".temp"));
- }
-
- }
-
- m->mothurOutEndLine();
- m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
-
- return fDistFileName;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "createDistFile");
- exit(1);
- }
-
-}
-
/**************************************************************************************************/
-
-string ShhherCommand::createNamesFile(){
+int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
try{
vector<string> duplicateNames(numUniques, "");
duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
}
- string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
-
ofstream nameFile;
- m->openOutputFile(nameFileName, nameFile);
+ m->openOutputFile(filename, nameFile);
for(int i=0;i<numUniques;i++){
if (m->control_pressed) { break; }
-// nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
+ // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
}
nameFile.close();
- return nameFileName;
+
+ return 0;
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "createNamesFile");
exit(1);
}
}
-
//**********************************************************************************************************************
-string ShhherCommand::cluster(string distFileName, string namesFileName){
+int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
try {
ReadMatrix* read = new ReadColumnMatrix(distFileName);
delete read;
delete clusterNameMap;
-
+
RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest");
list->setLabel(toString(cutoff));
- string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
ofstream listFile;
- m->openOutputFile(listFileName, listFile);
+ m->openOutputFile(filename, listFile);
list->print(listFile);
listFile.close();
delete matrix; delete cluster; delete rabund; delete list;
-
- return listFileName;
+
+ return 0;
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "cluster");
exit(1);
}
}
-
/**************************************************************************************************/
-void ShhherCommand::getOTUData(string listFileName){
+int ShhherCommand::getOTUData(int numSeqs, string fileName, vector<int>& otuData,
+ vector<int>& cumNumSeqs,
+ vector<int>& nSeqsPerOTU,
+ vector<vector<int> >& aaP, //tMaster->aanP: each row is a different otu / each col contains the sequence indices
+ vector<vector<int> >& aaI, //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
+ vector<int>& seqNumber, //tMaster->anP: the sequence id number sorted by OTU
+ vector<int>& seqIndex,
+ map<string, int>& nameMap){
try {
-
+
ifstream listFile;
- m->openInputFile(listFileName, listFile);
+ m->openInputFile(fileName, listFile);
string label;
+ int numOTUs;
listFile >> label >> numOTUs;
-
+
otuData.assign(numSeqs, 0);
cumNumSeqs.assign(numOTUs, 0);
nSeqsPerOTU.assign(numOTUs, 0);
for(int i=0;i<numOTUs;i++){
if (m->control_pressed) { break; }
-
+
listFile >> singleOTU;
istringstream otuString(singleOTU);
-
+
while(otuString){
string seqName = "";
}
map<string,int>::iterator nmIt = nameMap.find(seqName);
-
+
int index = nmIt->second;
nameMap.erase(nmIt);
-
+
otuData[index] = i;
nSeqsPerOTU[i]++;
aaP[i].push_back(index);
seqIndex = seqNumber;
listFile.close();
+
+ return numOTUs;
}
catch(exception& e) {
exit(1);
}
}
-
-/**************************************************************************************************/
-
-void ShhherCommand::initPyroCluster(){
- try{
- if (numOTUs < processors) { processors = 1; }
-
- dist.assign(numSeqs * numOTUs, 0);
- change.assign(numOTUs, 1);
- centroids.assign(numOTUs, -1);
- weight.assign(numOTUs, 0);
- singleTau.assign(numSeqs, 1.0);
-
- nSeqsBreaks.assign(processors+1, 0);
- nOTUsBreaks.assign(processors+1, 0);
-
- nSeqsBreaks[0] = 0;
- for(int i=0;i<processors;i++){
- nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
- nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
- }
- nSeqsBreaks[processors] = numSeqs;
- nOTUsBreaks[processors] = numOTUs;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "initPyroCluster");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::fill(){
- try {
- int index = 0;
- for(int i=0;i<numOTUs;i++){
-
- if (m->control_pressed) { break; }
-
- cumNumSeqs[i] = index;
- for(int j=0;j<nSeqsPerOTU[i];j++){
- seqNumber[index] = aaP[i][j];
- seqIndex[index] = aaI[i][j];
-
- index++;
- }
- }
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "fill");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::calcCentroids(){
- try{
-
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-
- if(processors == 1) {
- calcCentroidsDriver(0, numOTUs);
- }
- else{ //you have multiple processors
- if (numOTUs < processors){ processors = 1; }
-
- int process = 1;
- vector<int> processIDs;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = vfork();
-
- if (pid > 0) {
- processIDs.push_back(pid); //create map from line number to pid so you can append files in correct order later
- process++;
- }else if (pid == 0){
- calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
- exit(0);
- }else {
- m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine();
- perror(" : ");
- for (int i=0;i<processIDs.size();i++) { int temp = processIDs[i]; kill (temp, SIGINT); }
- exit(0);
- }
- }
-
- //parent does its part
- calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processIDs.size();i++) {
- int temp = processIDs[i];
- wait(&temp);
- }
- }
-
-#else
- calcCentroidsDriver(0, numOTUs);
-#endif
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
- exit(1);
- }
-}
-
/**************************************************************************************************/
-void ShhherCommand::calcCentroidsDriver(int start, int finish){
+int ShhherCommand::calcCentroidsDriver(int numOTUs,
+ vector<int>& cumNumSeqs,
+ vector<int>& nSeqsPerOTU,
+ vector<int>& seqIndex,
+ vector<short>& change, //did the centroid sequence change? 0 = no; 1 = yes
+ vector<int>& centroids, //the representative flowgram for each cluster m
+ vector<double>& singleTau, //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
+ vector<int>& mapSeqToUnique,
+ vector<short>& uniqueFlowgrams,
+ vector<short>& flowDataIntI,
+ vector<int>& lengths,
+ int numFlowCells,
+ vector<int>& seqNumber){
//this function gets the most likely homopolymer length at a flow position for a group of sequences
//within an otu
try{
- for(int i=start;i<finish;i++){
+ for(int i=0;i<numOTUs;i++){
if (m->control_pressed) { break; }
for(int j=0;j<nSeqsPerOTU[i];j++){
count += singleTau[seqNumber[cumNumSeqs[i] + j]];
}
-
+
if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
vector<double> adF(nSeqsPerOTU[i]);
vector<int> anL(nSeqsPerOTU[i]);
double tauValue = singleTau[seqNumber[index]];
for(int k=0;k<position;k++){
- double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
+ double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
adF[k] += dist * tauValue;
}
}
centroids[i] = -1;
}
}
+
+ return 0;
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
exit(1);
}
}
-
/**************************************************************************************************/
-double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
+double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
+ vector<short>& flowDataIntI, int numFlowCells){
try{
int flowAValue = cent * numFlowCells;
int flowBValue = flow * numFlowCells;
double dist = 0;
-
+
for(int i=0;i<length;i++){
dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
flowAValue++;
exit(1);
}
}
-
/**************************************************************************************************/
-double ShhherCommand::getNewWeights(){
+double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
try{
double maxChange = 0;
weight[i] += tauValue;
}
- difference = fabs(weight[i] - difference);
- if(difference > maxChange){ maxChange = difference; }
- }
- return maxChange;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getNewWeights");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-double ShhherCommand::getLikelihood(){
-
- try{
-
- vector<long double> P(numSeqs, 0);
- int effNumOTUs = 0;
-
- for(int i=0;i<numOTUs;i++){
- if(weight[i] > MIN_WEIGHT){
- effNumOTUs++;
- }
- }
-
- string hold;
- for(int i=0;i<numOTUs;i++){
-
- if (m->control_pressed) { break; }
-
- for(int j=0;j<nSeqsPerOTU[i];j++){
- int index = cumNumSeqs[i] + j;
- int nI = seqIndex[index];
- double singleDist = dist[seqNumber[index]];
-
- P[nI] += weight[i] * exp(-singleDist * sigma);
- }
- }
- double nLL = 0.00;
- for(int i=0;i<numSeqs;i++){
- if(P[i] == 0){ P[i] = DBL_EPSILON; }
-
- nLL += -log(P[i]);
- }
-
- nLL = nLL -(double)numSeqs * log(sigma);
-
- return nLL;
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "getNewWeights");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::checkCentroids(){
- try{
- vector<int> unique(numOTUs, 1);
-
- for(int i=0;i<numOTUs;i++){
- if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
- unique[i] = -1;
- }
- }
-
- for(int i=0;i<numOTUs;i++){
-
- if (m->control_pressed) { break; }
-
- if(unique[i] == 1){
- for(int j=i+1;j<numOTUs;j++){
- if(unique[j] == 1){
-
- if(centroids[j] == centroids[i]){
- unique[j] = 0;
- centroids[j] = -1;
-
- weight[i] += weight[j];
- weight[j] = 0.0;
- }
- }
- }
- }
- }
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "checkCentroids");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void ShhherCommand::calcNewDistances(){
- try{
-
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-
- if(processors == 1) {
- calcNewDistancesParent(0, numSeqs);
- }
- else{ //you have multiple processors
- if (numSeqs < processors){ processors = 1; }
-
- vector<vector<int> > child_otuIndex(processors);
- vector<vector<int> > child_seqIndex(processors);
- vector<vector<double> > child_singleTau(processors);
- vector<int> totals(processors);
-
- int process = 1;
- vector<int> processIDs;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = vfork();
-
- if (pid > 0) {
- processIDs.push_back(pid); //create map from line number to pid so you can append files in correct order later
- process++;
- }else if (pid == 0){
- calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
- totals[process] = child_otuIndex[process].size();
-
- exit(0);
- }else {
- m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine();
- perror(" : ");
- for (int i=0;i<processIDs.size();i++) { int temp = processIDs[i]; kill (temp, SIGINT); }
- exit(0);
- }
- }
-
- //parent does its part
- calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
- int total = seqIndex.size();
-
- //force parent to wait until all the processes are done
- for (int i=0;i<processIDs.size();i++) {
- int temp = processIDs[i];
- wait(&temp);
- }
-
- for(int i=1;i<processors;i++){
- int oldTotal = total;
- total += totals[i];
-
- singleTau.resize(total, 0);
- seqIndex.resize(total, 0);
- seqNumber.resize(total, 0);
-
- int childIndex = 0;
-
- for(int j=oldTotal;j<total;j++){
- int otuI = child_otuIndex[i][childIndex];
- int seqI = child_seqIndex[i][childIndex];
-
- singleTau[j] = child_singleTau[i][childIndex];
- aaP[otuI][nSeqsPerOTU[otuI]] = j;
- aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
- nSeqsPerOTU[otuI]++;
-
- childIndex++;
- }
- }
- }
-#else
- calcNewDistancesParent(0, numSeqs);
-#endif
- }
- catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistances");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-#ifdef USE_MPI
-void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
-
- try{
- vector<double> newTau(numOTUs,0);
- vector<double> norms(numSeqs, 0);
- otuIndex.clear();
- seqIndex.clear();
- singleTau.clear();
-
- for(int i=startSeq;i<stopSeq;i++){
-
- if (m->control_pressed) { break; }
-
- double offset = 1e8;
- int indexOffset = i * numOTUs;
-
- for(int j=0;j<numOTUs;j++){
-
- if(weight[j] > MIN_WEIGHT && change[j] == 1){
- dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
- }
- if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
- offset = dist[indexOffset + j];
- }
- }
-
- for(int j=0;j<numOTUs;j++){
- if(weight[j] > MIN_WEIGHT){
- newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
- norms[i] += newTau[j];
- }
- else{
- newTau[j] = 0.0;
- }
- }
-
- for(int j=0;j<numOTUs;j++){
-
- newTau[j] /= norms[i];
-
- if(newTau[j] > MIN_TAU){
- otuIndex.push_back(j);
- seqIndex.push_back(i);
- singleTau.push_back(newTau[j]);
- }
- }
-
+ difference = fabs(weight[i] - difference);
+ if(difference > maxChange){ maxChange = difference; }
}
+ return maxChange;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
+ m->errorOut(e, "ShhherCommand", "getNewWeights");
exit(1);
}
}
-#endif
+
/**************************************************************************************************/
-void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
+double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
try{
- vector<double> newTau(numOTUs,0);
- vector<double> norms(numSeqs, 0);
- child_otuIndex.resize(0);
- child_seqIndex.resize(0);
- child_singleTau.resize(0);
- for(int i=startSeq;i<stopSeq;i++){
+ // Returns the sum over all sequences of -log(P[s]), minus numSeqs * log(sigma).
+ // P[s] accumulates weight[otu] * exp(-dist * sigma) over every (otu, member) entry whose
+ // seqIndex value is s. sigma is not a parameter; it is read from outside this function.
+ // The previously unused locals (an OTU counter and a string) have been removed.
+ vector<long double> P(numSeqs, 0);
+
+ for(int i=0;i<numOTUs;i++){
if (m->control_pressed) { break; }
- double offset = 1e8;
- int indexOffset = i * numOTUs;
-
-
- for(int j=0;j<numOTUs;j++){
- if(weight[j] > MIN_WEIGHT && change[j] == 1){
- dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
- }
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ // cumNumSeqs[i] is the offset of OTU i's first entry in seqNumber and seqIndex
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ double singleDist = dist[seqNumber[index]];
- if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
- offset = dist[indexOffset + j];
- }
+ P[nI] += weight[i] * exp(-singleDist * sigma);
}
-
- for(int j=0;j<numOTUs;j++){
- if(weight[j] > MIN_WEIGHT){
- newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
- norms[i] += newTau[j];
- }
- else{
- newTau[j] = 0.0;
- }
+ }
+ double nLL = 0.00;
+ for(int i=0;i<numSeqs;i++){
+ // a zero probability is replaced by DBL_EPSILON so that log() stays finite
+ if(P[i] == 0){ P[i] = DBL_EPSILON; }
+
+ nLL += -log(P[i]);
+ }
+
+ nLL = nLL -(double)numSeqs * log(sigma);
+
+ return nLL;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ShhherCommand", "getLikelihood");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
+ try{
+ vector<int> unique(numOTUs, 1);
+ /* Merges OTUs that share a centroid: the lower-indexed OTU keeps the centroid and
+  * receives the other's weight; the other gets centroid -1 and weight 0. Always returns 0.
+  * unique[i] is 1 for a candidate, -1 for an excluded OTU, and 0 once merged away. */
+ for(int i=0;i<numOTUs;i++){
+ if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){ // no centroid, or weight below MIN_WEIGHT: excluded from merging
+ unique[i] = -1;
}
+ }
+ // second pass: compare every remaining candidate against the candidates after it
+ for(int i=0;i<numOTUs;i++){
- for(int j=0;j<numOTUs;j++){
- newTau[j] /= norms[i];
-
- if(newTau[j] > MIN_TAU){
- child_otuIndex.push_back(j);
- child_seqIndex.push_back(i);
- child_singleTau.push_back(newTau[j]);
+ if (m->control_pressed) { break; }
+
+ if(unique[i] == 1){
+ for(int j=i+1;j<numOTUs;j++){
+ if(unique[j] == 1){
+
+ if(centroids[j] == centroids[i]){
+ unique[j] = 0;
+ centroids[j] = -1;
+
+ weight[i] += weight[j];
+ weight[j] = 0.0;
+ }
+ }
}
}
}
+
+ return 0;
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
+ m->errorOut(e, "ShhherCommand", "checkCentroids");
exit(1);
}
}
-
/**************************************************************************************************/
-void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
+void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist,
+ vector<double>& weight, vector<short>& change, vector<int>& centroids,
+ vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,
+ vector<int>& seqNumber, vector<int>& seqIndex,
+ vector<short>& uniqueFlowgrams,
+ vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
try{
vector<double> newTau(numOTUs,0);
vector<double> norms(numSeqs, 0);
nSeqsPerOTU.assign(numOTUs, 0);
-
- for(int i=startSeq;i<stopSeq;i++){
+
+ for(int i=0;i<numSeqs;i++){
if (m->control_pressed) { break; }
int indexOffset = i * numOTUs;
-
+
double offset = 1e8;
for(int j=0;j<numOTUs;j++){
-
+
if(weight[j] > MIN_WEIGHT && change[j] == 1){
- dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
+ dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
}
-
+
if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
offset = dist[indexOffset + j];
}
}
-
+
for(int j=0;j<numOTUs;j++){
if(weight[j] > MIN_WEIGHT){
newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
newTau[j] = 0.0;
}
}
-
+
for(int j=0;j<numOTUs;j++){
newTau[j] /= norms[i];
}
-
+
for(int j=0;j<numOTUs;j++){
if(newTau[j] > MIN_TAU){
nSeqsPerOTU[j]++;
}
}
-
+
}
-
+
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
+ m->errorOut(e, "ShhherCommand", "calcNewDistances");
exit(1);
}
}
+/**************************************************************************************************/
+int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
+    // Flattens the per-OTU rows of aaP / aaI into the 1-D arrays seqNumber / seqIndex.
+    // cumNumSeqs[i] receives the position in the flat arrays where OTU i begins;
+    // only the first nSeqsPerOTU[i] entries of each row are copied.
+    // Returns 0 both on completion and when interrupted via m->control_pressed.
+    try {
+        int writePos = 0;
+        int otu = 0;
+
+        while (otu < numOTUs && !m->control_pressed) {
+
+            // remember where this OTU's members start in the flat arrays
+            cumNumSeqs[otu] = writePos;
+
+            for (int member = 0; member < nSeqsPerOTU[otu]; ++member, ++writePos) {
+                seqNumber[writePos] = aaP[otu][member];
+                seqIndex[writePos] = aaI[otu][member];
+            }
+
+            ++otu;
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "fill");
+        exit(1);
+    }
+}
/**************************************************************************************************/
-void ShhherCommand::setOTUs(){
+void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
+ vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
try {
vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
nSeqsPerOTU[index]++;
}
- fill();
+
+ fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
}
catch(exception& e) {
- m->errorOut(e, "ShhherCommand", "calcNewDistances");
+ m->errorOut(e, "ShhherCommand", "setOTUs");
exit(1);
}
}
-
/**************************************************************************************************/
-void ShhherCommand::writeQualities(vector<int> otuCounts){
+void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
+ vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
+ vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
try {
string thisOutputDir = outputDir;
- if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
- string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.qual";
-
+ if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
+ string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual";
+
ofstream qualityFile;
m->openOutputFile(qualityFileName, qualityFile);
-
+
qualityFile.setf(ios::fixed, ios::floatfield);
qualityFile.setf(ios::showpoint);
qualityFile << setprecision(6);
if(otuCounts[i] > 0){
qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
-
+
int j=4; //need to get past the first four bases
while(qualities[i][j] != -1){
- qualityFile << qualities[i][j] << ' ';
- j++;
+ qualityFile << qualities[i][j] << ' ';
+ if (j > qualities[i].size()) { break; }
+ j++;
}
qualityFile << endl;
}
}
qualityFile.close();
outputNames.push_back(qualityFileName);
-
+
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "writeQualities");
/**************************************************************************************************/
-void ShhherCommand::writeSequences(vector<int> otuCounts){
+void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
try {
string thisOutputDir = outputDir;
- if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
- string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.fasta";
+ if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
+ string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta";
ofstream fastaFile;
m->openOutputFile(fastaFileName, fastaFile);
}
}
fastaFile.close();
-
+
outputNames.push_back(fastaFileName);
-
- if(compositeFASTAFileName != ""){
- m->appendFiles(fastaFileName, compositeFASTAFileName);
+
+ if(thisCompositeFASTAFileName != ""){
+ m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
}
}
catch(exception& e) {
/**************************************************************************************************/
-void ShhherCommand::writeNames(vector<int> otuCounts){
+void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
try {
string thisOutputDir = outputDir;
- if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
- string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.names";
+ if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
+ string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names";
ofstream nameFile;
m->openOutputFile(nameFileName, nameFile);
outputNames.push_back(nameFileName);
- if(compositeNamesFileName != ""){
- m->appendFiles(nameFileName, compositeNamesFileName);
+ if(thisCompositeNamesFileName != ""){
+ m->appendFiles(nameFileName, thisCompositeNamesFileName);
}
}
catch(exception& e) {
/**************************************************************************************************/
-void ShhherCommand::writeGroups(){
+void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& seqNameVector){
try {
string thisOutputDir = outputDir;
- if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
- string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
- string groupFileName = fileRoot + ".shhh.groups";
+ if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
+ string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename));
+ string groupFileName = fileRoot + "shhh.groups";
ofstream groupFile;
m->openOutputFile(groupFileName, groupFile);
}
groupFile.close();
outputNames.push_back(groupFileName);
-
+
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "writeGroups");
/**************************************************************************************************/
-void ShhherCommand::writeClusters(vector<int> otuCounts){
+void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
try {
string thisOutputDir = outputDir;
- if (outputDir == "") { thisOutputDir += m->hasPath(flowFileName); }
- string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + ".shhh.counts";
+ if (outputDir == "") { thisOutputDir += m->hasPath(filename); }
+ string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts";
ofstream otuCountsFile;
m->openOutputFile(otuCountsFileName, otuCountsFile);
for(int k=0;k<lengths[sequence];k++){
char base = bases[k % 4];
int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
-
+
for(int s=0;s<freq;s++){
newSeq += base;
//otuCountsFile << base;
}
otuCountsFile.close();
outputNames.push_back(otuCountsFileName);
-
+
}
catch(exception& e) {
m->errorOut(e, "ShhherCommand", "writeClusters");
}
}
-//**********************************************************************************************************************
+/**************************************************************************************************/
+
+void ShhherCommand::getSingleLookUp(){
+    // Loads the single-signal lookup table from lookupFileName into singleLookUp.
+    // The file is read as HOMOPS rows; each row is one leading value (read and
+    // discarded) followed by NUMBINS table entries, stored row-major.
+    // If a row cannot be read completely the error is reported and
+    // m->control_pressed is set, so the command stops instead of denoising
+    // with a table that is still partly zero-filled.
+    try{
+        // these are the -log probabilities that a signal corresponds to a particular homopolymer length
+        singleLookUp.assign(HOMOPS * NUMBINS, 0);
+
+        int index = 0;
+        ifstream lookUpFile;
+        m->openInputFile(lookupFileName, lookUpFile);
+
+        for(int i=0;i<HOMOPS;i++){
+
+            if (m->control_pressed) { break; }
+
+            // first column of each row is not part of the table
+            float logFracFreq;
+            lookUpFile >> logFracFreq;
+
+            for(int j=0;j<NUMBINS;j++) {
+                lookUpFile >> singleLookUp[index];
+                index++;
+            }
+
+            // failbit means at least one extraction in this row did not produce a number
+            // (short or malformed file); hitting EOF right after the last value does not set it
+            if (!lookUpFile) {
+                m->mothurOut("[ERROR]: unable to read row " + toString(i+1) + " of the lookup file " + lookupFileName + ", it appears to be truncated or malformed.\n");
+                m->control_pressed = true;
+                break;
+            }
+        }
+        lookUpFile.close();
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "getSingleLookUp");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+void ShhherCommand::getJointLookUp(){
+    // Builds jointLookUp (NUMBINS x NUMBINS, row-major) from singleLookUp:
+    // entry (a, b) is the smallest value, over all homopolymer lengths, of
+    // singleLookUp[len][a] + singleLookUp[len][b].
+    try{
+
+        // the most likely joint probability (-log) that two intensities have the same polymer length
+        jointLookUp.resize(NUMBINS * NUMBINS, 0);
+
+        for (int binA = 0; binA < NUMBINS; binA++) {
+
+            if (m->control_pressed) { break; }
+
+            const int rowStart = binA * NUMBINS;
+
+            for (int binB = 0; binB < NUMBINS; binB++) {
+
+                // start above any value the table is expected to hold
+                double best = 100000000;
+
+                for (int homop = 0; homop < HOMOPS; homop++) {
+                    const int tableRow = homop * NUMBINS;
+                    const double candidate = singleLookUp[tableRow + binA] + singleLookUp[tableRow + binB];
+
+                    if (candidate < best) { best = candidate; }
+                }
+
+                jointLookUp[rowStart + binB] = best;
+            }
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "getJointLookUp");
+        exit(1);
+    }
+}
+
+/**************************************************************************************************/
+
+double ShhherCommand::getProbIntensity(int intIntensity){
+    // Returns the smallest singleLookUp entry for the given intensity bin,
+    // taken over all homopolymer lengths. If interrupted via m->control_pressed
+    // the minimum found so far is returned.
+    try{
+
+        double best = 100000000;
+
+        int homop = 0;
+        while (homop < HOMOPS) { // one table row per homopolymer length
+
+            if (m->control_pressed) { break; }
+
+            // the table value is narrowed to float before the comparison
+            float candidate = singleLookUp[homop * NUMBINS + intIntensity];
+            best = (candidate < best) ? candidate : best;
+
+            homop++;
+        }
+
+        return best;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ShhherCommand", "getProbIntensity");
+        exit(1);
+    }
+}
+
+
+
+
#include "mothur.h"
#include "command.hpp"
+#include "readcolumn.h"
+#include "readmatrix.hpp"
+#include "rabundvector.hpp"
+#include "sabundvector.hpp"
+#include "listvector.hpp"
+#include "cluster.hpp"
+#include "sparsematrix.hpp"
+#include <cfloat>
//**********************************************************************************************************************
void help() { m->mothurOut(getHelpString()); }
private:
+ struct linePair { // simple pair of ints; presumably the slice of work given to one process - TODO confirm against createProcesses
+ int start;
+ int end;
+ linePair(int i, int j) : start(i), end(j) {} // stores i as start and j as end; there is no default constructor
+ };
+
int abort;
-
string outputDir, flowFileName, flowFilesFileName, lookupFileName, compositeFASTAFileName, compositeNamesFileName;
int processors, maxIters;
float cutoff, sigma, minDelta;
string flowOrder;
-
- vector<int> nSeqsBreaks;
- vector<int> nOTUsBreaks;
+
+ vector<string> outputNames;
vector<double> singleLookUp;
vector<double> jointLookUp;
+ vector<string> flowFileVector;
+
+ int driver(vector<string>, string, string, int, int);
+ int createProcesses(vector<string>);
+ int getFlowData(string, vector<string>&, vector<int>&, vector<short>&, map<string, int>&, int&);
+ int getUniques(int, int, vector<short>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
+ int flowDistParentFork(int, string, int, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
+ float calcPairwiseDist(int, int, int, vector<int>&, vector<int>&, vector<double>&, vector<short>&);
+ int createNamesFile(int, int, string, vector<string>&, vector<int>&, vector<int>&);
+ int cluster(string, string, string);
+ int getOTUData(int numSeqs, string, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&, vector<int>&, vector<int>&,map<string, int>&);
+ int calcCentroidsDriver(int numOTUs, vector<int>&, vector<int>&, vector<int>&, vector<short>&, vector<int>&, vector<double>&, vector<int>&, vector<short>&, vector<short>&, vector<int>&, int, vector<int>&);
+ double getDistToCentroid(int, int, int, vector<short>&, vector<short>&, int);
+ double getNewWeights(int, vector<int>&, vector<int>&, vector<double>&, vector<int>&, vector<double>&);
+
+ double getLikelihood(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<double>&, vector<double>&);
+ int checkCentroids(int, vector<int>&, vector<double>&);
+ void calcNewDistances(int, int, vector<int>& , vector<double>&,vector<double>& , vector<short>& change, vector<int>&,vector<vector<int> >&, vector<double>&, vector<vector<int> >&, vector<int>&, vector<int>&, vector<short>&, vector<short>&, int, vector<int>&);
+ int fill(int, vector<int>&, vector<int>&, vector<int>&, vector<int>&, vector<vector<int> >&, vector<vector<int> >&);
+ void setOTUs(int, int, vector<int>&, vector<int>&, vector<int>&, vector<int>&,
+ vector<int>&, vector<double>&, vector<double>&, vector<vector<int> >&, vector<vector<int> >&);
+ void writeQualities(int, int, string, vector<int>, vector<int>&, vector<int>&, vector<double>&, vector<short>&, vector<short>&, vector<int>&, vector<int>&, vector<string>&, vector<int>&, vector<vector<int> >&);
+ void writeSequences(string, int, int, string, vector<int>, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&);
+ void writeNames(string, int, string, vector<int>, vector<string>&, vector<vector<int> >&, vector<int>&);
+ void writeGroups(string, int, vector<string>&);
+ void writeClusters(string, int, int, vector<int>, vector<int>&, vector<short>&, vector<string>&, vector<vector<int> >&, vector<int>&, vector<int>&, vector<short>&);
+
+ void getSingleLookUp();
+ void getJointLookUp();
+ double getProbIntensity(int);
- vector<string> seqNameVector;
+
+#ifdef USE_MPI
+ string flowDistMPI(int, int);
+ void calcNewDistancesChildMPI(int, int, vector<int>&);
+
+ int pid, ncpus;
+
+ void getFlowData();
+ void getUniques();
+
+ float calcPairwiseDist(int, int);
+ void flowDistParentFork(string, int, int);
+
+ string createDistFile(int);
+ string createNamesFile();
+ string cluster(string, string);
+
+ void getOTUData(string);
+ void initPyroCluster();
+ void fill();
+ void calcCentroids();
+ void calcCentroidsDriver(int, int);
+ double getDistToCentroid(int, int, int);
+ double getNewWeights();
+ double getLikelihood();
+ void checkCentroids();
+ void calcNewDistances();
+ void calcNewDistancesParent(int, int);
+ void calcNewDistancesChild(int, int, vector<int>&, vector<int>&, vector<double>&);
+
+
+ void setOTUs();
+ void writeQualities(vector<int>);
+ void writeSequences(vector<int>);
+ void writeNames(vector<int>);
+ void writeGroups();
+ void writeClusters(vector<int>);
+
+ vector<string> seqNameVector;
vector<int> lengths;
vector<short> flowDataIntI;
vector<double> flowDataPrI;
vector<int> mapSeqToUnique;
vector<int> mapUniqueToSeq;
vector<int> uniqueLengths;
+ int numSeqs, numUniques, numOTUs, numFlowCells;
+ vector<int> nSeqsBreaks;
+ vector<int> nOTUsBreaks;
- vector<string> outputNames;
-
- int numSeqs, numUniques, numOTUs, numFlowCells;
-
- void getSingleLookUp();
- void getJointLookUp();
- void getFlowData();
- void getUniques();
- double getProbIntensity(int);
- float calcPairwiseDist(int, int);
- void flowDistParentFork(string, int, int);
-
- string createDistFile(int);
- string createNamesFile();
- string cluster(string, string);
-
- void getOTUData(string);
- void initPyroCluster();
- void fill();
- void calcCentroids();
- void calcCentroidsDriver(int, int);
- double getDistToCentroid(int, int, int);
- double getNewWeights();
- double getLikelihood();
- void checkCentroids();
- void calcNewDistances();
- void calcNewDistancesParent(int, int);
- void calcNewDistancesChild(int, int, vector<int>&, vector<int>&, vector<double>&);
-
-
- void setOTUs();
- void writeQualities(vector<int>);
- void writeSequences(vector<int>);
- void writeNames(vector<int>);
- void writeGroups();
- void writeClusters(vector<int>);
-
-
-#ifdef USE_MPI
- string flowDistMPI(int, int);
- void calcNewDistancesChildMPI(int, int, vector<int>&);
-
- int pid, ncpus;
#endif
};
//custom data structure for threads to use.
// This is passed by void pointer so it can be any data type
// that can be passed using a single void pointer (LPVOID).
-struct flowDistParentForkData {
- string distFileName;
- vector<int> mapUniqueToSeq;
- vector<int> mapSeqToUnique;
- vector<int> lengths;
- vector<short> flowDataIntI;
- vector<double> flowDataPrI;
+struct shhhFlowsData { // argument bundle for ShhhFlowsThreadFunction, which receives it through its LPVOID parameter
+ int threadID, maxIters;
+ float cutoff, sigma, minDelta;
+ string flowOrder;
+ vector<double> singleLookUp;
vector<double> jointLookUp;
+ vector<string> filenames;
+ vector<string> outputNames; // not assigned by either constructor, so it starts empty
+ string thisCompositeFASTAFileName, thisCompositeNameFileName, outputDir;
+ int start, stop; // the thread processes filenames[start] up to, but not including, filenames[stop]
MothurOut* m;
- int threadID, startSeq, stopSeq, numFlowCells;
- float cutoff;
- flowDistParentForkData(){}
- flowDistParentForkData(string d, vector<int> mapU, vector<int> mapS, vector<int> l, vector<short> flowD, vector<double> flowDa, vector<double> j, MothurOut* mout, int st, int sp, int n, float cut, int tid) {
- distFileName = d;
- mapUniqueToSeq = mapU;
- mapSeqToUnique = mapS;
- lengths = l;
- flowDataIntI = flowD;
- flowDataPrI = flowDa;
- jointLookUp = j;
+ shhhFlowsData(){} // leaves the int, float and pointer members uninitialized
+ shhhFlowsData(vector<string> f, string cf, string cn, string ou, string flor, vector<double> jl, vector<double> sl, MothurOut* mout, int st, int sp, float cut, float si, float mD, int mx, int tid) { // all arguments are copied by value; note the joint table (jl) comes before the single table (sl)
+ filenames = f;
+ thisCompositeFASTAFileName = cf;
+ thisCompositeNameFileName = cn;
+ outputDir = ou;
+ flowOrder = flor;
m = mout;
- startSeq = st;
- stopSeq = sp;
- numFlowCells = n;
+ start = st;
+ stop = sp;
cutoff= cut;
+ sigma = si;
+ minDelta = mD;
+ maxIters = mx;
+ jointLookUp = jl;
+ singleLookUp = sl;
threadID = tid;
}
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
-static DWORD WINAPI MyflowDistParentForkThreadFunction(LPVOID lpParam){
- flowDistParentForkData* pDataArray;
- pDataArray = (flowDistParentForkData*)lpParam;
+static DWORD WINAPI ShhhFlowsThreadFunction(LPVOID lpParam){
+ shhhFlowsData* pDataArray;
+ pDataArray = (shhhFlowsData*)lpParam;
try {
- ostringstream outStream;
- outStream.setf(ios::fixed, ios::floatfield);
- outStream.setf(ios::dec, ios::basefield);
- outStream.setf(ios::showpoint);
- outStream.precision(6);
-
- int begTime = time(NULL);
- double begClock = clock();
- string tempOut = "start and end = " + toString(pDataArray->startSeq) +'\t' + toString(pDataArray->stopSeq) + "-";
- cout << tempOut << endl;
+
+ for(int l=pDataArray->start;l<pDataArray->stop;l++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ string flowFileName = pDataArray->filenames[l];
+
+ pDataArray->m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(l+1) + " of " + toString(pDataArray->filenames.size()) + ")\t<<<<<\n");
+ pDataArray->m->mothurOut("Reading flowgrams...\n");
+
+ vector<string> seqNameVector;
+ vector<int> lengths;
+ vector<short> flowDataIntI;
+ vector<double> flowDataPrI;
+ map<string, int> nameMap;
+ vector<short> uniqueFlowgrams;
+ vector<int> uniqueCount;
+ vector<int> mapSeqToUnique;
+ vector<int> mapUniqueToSeq;
+ vector<int> uniqueLengths;
+ int numFlowCells;
+
+ //int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
+ /*****************************************************************************************************/
+
+ ifstream flowFile;
+ // cout << "herethread " << flowFileName << '\t' << &flowFile << endl;
+ pDataArray->m->openInputFile(flowFileName, flowFile);
+
+ // cout << "herethread " << flowFileName << endl;
+ string seqName;
+ int currentNumFlowCells;
+ float intensity;
+
+ flowFile >> numFlowCells;
+ int index = 0;//pcluster
+ while(!flowFile.eof()){
+
+ if (pDataArray->m->control_pressed) { flowFile.close(); return 0; }
+
+ flowFile >> seqName >> currentNumFlowCells;
+ lengths.push_back(currentNumFlowCells);
+ // cout << "herethread " << seqName << endl;
+ seqNameVector.push_back(seqName);
+ nameMap[seqName] = index++;//pcluster
+
+ for(int i=0;i<numFlowCells;i++){
+ flowFile >> intensity;
+ if(intensity > 9.99) { intensity = 9.99; }
+ int intI = int(100 * intensity + 0.0001);
+ flowDataIntI.push_back(intI);
+ }
+ pDataArray->m->gobble(flowFile);
+ }
+ flowFile.close();
+
+ int numSeqs = seqNameVector.size();
+ // cout << numSeqs << endl;
+ for(int i=0;i<numSeqs;i++){
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ int iNumFlowCells = i * numFlowCells;
+ for(int j=lengths[i];j<numFlowCells;j++){
+ flowDataIntI[iNumFlowCells + j] = 0;
+ }
+ }
+ // cout << "here" << endl;
+ /*****************************************************************************************************/
- for(int i=pDataArray->startSeq;i<pDataArray->stopSeq;i++){
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ pDataArray->m->mothurOut("Identifying unique flowgrams...\n");
+ //int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
+ /*****************************************************************************************************/
+ int numUniques = 0;
+ uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
+ uniqueCount.assign(numSeqs, 0); // anWeights
+ uniqueLengths.assign(numSeqs, 0);
+ mapSeqToUnique.assign(numSeqs, -1);
+ mapUniqueToSeq.assign(numSeqs, -1);
+
+ vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
+
+ for(int i=0;i<numSeqs;i++){
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ int index = 0;
+
+ vector<short> current(numFlowCells);
+ for(int j=0;j<numFlowCells;j++){
+ current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
+ }
+
+ for(int j=0;j<numUniques;j++){
+ int offset = j * numFlowCells;
+ bool toEnd = 1;
+
+ int shorterLength;
+ if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
+ else { shorterLength = uniqueLengths[j]; }
+
+ for(int k=0;k<shorterLength;k++){
+ if(current[k] != uniqueFlowgrams[offset + k]){
+ toEnd = 0;
+ break;
+ }
+ }
+
+ if(toEnd){
+ mapSeqToUnique[i] = j;
+ uniqueCount[j]++;
+ index = j;
+ if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
+ break;
+ }
+ index++;
+ }
+
+ if(index == numUniques){
+ uniqueLengths[numUniques] = lengths[i];
+ uniqueCount[numUniques] = 1;
+ mapSeqToUnique[i] = numUniques;//anMap
+ mapUniqueToSeq[numUniques] = i;//anF
+
+ for(int k=0;k<numFlowCells;k++){
+ uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
+ uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
+ }
+
+ numUniques++;
+ }
+ }
+ uniqueFlowDataIntI.resize(numFlowCells * numUniques);
+ uniqueLengths.resize(numUniques);
+
+ flowDataPrI.resize(numSeqs * numFlowCells, 0);
+ for(int i=0;i<flowDataPrI.size();i++) {
+ if (pDataArray->m->control_pressed) { return 0; }
+ //flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);
+
+ flowDataPrI[i] = 100000000;
+
+ for(int j=0;j<HOMOPS;j++){//loop signal strength
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ float negLogProb = pDataArray->singleLookUp[j * NUMBINS + flowDataIntI[i]];
+ if(negLogProb < flowDataPrI[i]) { flowDataPrI[i] = negLogProb; }
+ }
+ }
+
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ pDataArray->m->mothurOut("Calculating distances between flowgrams...\n");
+ string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
+ unsigned long long begTime = time(NULL);
+ double begClock = clock();
+
+ //flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
+ /*****************************************************************************************************/
+ ostringstream outStream;
+ outStream.setf(ios::fixed, ios::floatfield);
+ outStream.setf(ios::dec, ios::basefield);
+ outStream.setf(ios::showpoint);
+ outStream.precision(6);
+
+ int thisbegTime = time(NULL);
+ double thisbegClock = clock();
+
+ for(int i=0;i<numUniques;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ for(int j=0;j<i;j++){
+ //float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
+ /*****************************************************************************************************/
+ int seqA = mapUniqueToSeq[i]; int seqB = mapUniqueToSeq[j];
+ int minLength = lengths[mapSeqToUnique[seqA]];
+ if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
+
+ int ANumFlowCells = seqA * numFlowCells;
+ int BNumFlowCells = seqB * numFlowCells;
+
+ float flowDistance = 0;
+
+ for(int k=0;k<minLength;k++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ int flowAIntI = flowDataIntI[ANumFlowCells + k];
+ float flowAPrI = flowDataPrI[ANumFlowCells + k];
+
+ int flowBIntI = flowDataIntI[BNumFlowCells + k];
+ float flowBPrI = flowDataPrI[BNumFlowCells + k];
+ flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
+ }
+
+ flowDistance /= (float) minLength;
+ /*****************************************************************************************************/
+
+ if(flowDistance < 1e-6){
+ outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
+ }
+ else if(flowDistance <= pDataArray->cutoff){
+ outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
+ }
+ }
+ if(i % 100 == 0){
+ pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - thisbegTime));
+ pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC));
+ pDataArray->m->mothurOutEndLine();
+ }
+ }
+
+ ofstream distFile(distFileName.c_str());
+ distFile << outStream.str();
+ distFile.close();
+
+ if (pDataArray->m->control_pressed) {}
+ else {
+ pDataArray->m->mothurOut(toString(numUniques-1) + "\t" + toString(time(NULL) - thisbegTime));
+ pDataArray->m->mothurOut("\t" + toString((clock()-thisbegClock)/CLOCKS_PER_SEC));
+ pDataArray->m->mothurOutEndLine();
+ }
+ /*****************************************************************************************************/
+
+ pDataArray->m->mothurOutEndLine();
+ pDataArray->m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
+
+ string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
+ //createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
+ /*****************************************************************************************************/
+ vector<string> duplicateNames(numUniques, "");
+ for(int i=0;i<numSeqs;i++){
+ duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
+ }
+
+ ofstream nameFile;
+ pDataArray->m->openOutputFile(namesFileName, nameFile);
+
+ for(int i=0;i<numUniques;i++){
+ if (pDataArray->m->control_pressed) { nameFile.close(); return 0; }
+ nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
+ }
+ nameFile.close();
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ pDataArray->m->mothurOut("\nClustering flowgrams...\n");
+ string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
+ //cluster(listFileName, distFileName, namesFileName);
+ /*****************************************************************************************************/
+ ReadMatrix* read = new ReadColumnMatrix(distFileName);
+ read->setCutoff(pDataArray->cutoff);
+
+ NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
+ clusterNameMap->readMap();
+ read->read(clusterNameMap);
+
+ ListVector* list = read->getListVector();
+ SparseMatrix* matrix = read->getMatrix();
+
+ delete read;
+ delete clusterNameMap;
+
+ RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
+
+ Cluster* cluster = new CompleteLinkage(rabund, list, matrix, pDataArray->cutoff, "furthest");
+ string tag = cluster->getTag();
+
+ double clusterCutoff = pDataArray->cutoff;
+ while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ cluster->update(clusterCutoff);
+ }
+
+ list->setLabel(toString(pDataArray->cutoff));
+
+ ofstream listFileOut;
+ pDataArray->m->openOutputFile(listFileName, listFileOut);
+ list->print(listFileOut);
+ listFileOut.close();
+
+ delete matrix; delete cluster; delete rabund; delete list;
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ vector<int> otuData;
+ vector<int> cumNumSeqs;
+ vector<int> nSeqsPerOTU;
+ vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
+ vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
+ vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
+ vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
+
+
+ //int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
+ /*****************************************************************************************************/
+ ifstream listFile;
+ pDataArray->m->openInputFile(listFileName, listFile);
+ string label;
+ int numOTUs;
+
+ listFile >> label >> numOTUs;
+
+ otuData.assign(numSeqs, 0);
+ cumNumSeqs.assign(numOTUs, 0);
+ nSeqsPerOTU.assign(numOTUs, 0);
+ aaP.clear();aaP.resize(numOTUs);
+
+ seqNumber.clear();
+ aaI.clear();
+ seqIndex.clear();
+
+ string singleOTU = "";
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ listFile >> singleOTU;
+
+ istringstream otuString(singleOTU);
+
+ while(otuString){
+
+ string seqName = "";
+
+ for(int j=0;j<singleOTU.length();j++){
+ char letter = otuString.get();
+
+ if(letter != ','){
+ seqName += letter;
+ }
+ else{
+ map<string,int>::iterator nmIt = nameMap.find(seqName);
+ int index = nmIt->second;
+
+ nameMap.erase(nmIt);
+
+ otuData[index] = i;
+ nSeqsPerOTU[i]++;
+ aaP[i].push_back(index);
+ seqName = "";
+ }
+ }
+
+ map<string,int>::iterator nmIt = nameMap.find(seqName);
+
+ int index = nmIt->second;
+ nameMap.erase(nmIt);
+
+ otuData[index] = i;
+ nSeqsPerOTU[i]++;
+ aaP[i].push_back(index);
+
+ otuString.get();
+ }
+
+ sort(aaP[i].begin(), aaP[i].end());
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber.push_back(aaP[i][j]);
+ }
+ for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
+ aaP[i].push_back(0);
+ }
+
+
+ }
+
+ for(int i=1;i<numOTUs;i++){
+ cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
+ }
+ aaI = aaP;
+ seqIndex = seqNumber;
+
+ listFile.close();
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ pDataArray->m->mothurRemove(distFileName);
+ pDataArray->m->mothurRemove(namesFileName);
+ pDataArray->m->mothurRemove(listFileName);
+
+ vector<double> dist; //adDist - distance of sequences to centroids
+ vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
+ vector<int> centroids; //the representative flowgram for each cluster m
+ vector<double> weight;
+ vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
+ vector<int> nSeqsBreaks;
+ vector<int> nOTUsBreaks;
+
+ dist.assign(numSeqs * numOTUs, 0);
+ change.assign(numOTUs, 1);
+ centroids.assign(numOTUs, -1);
+ weight.assign(numOTUs, 0);
+ singleTau.assign(numSeqs, 1.0);
+
+ nSeqsBreaks.assign(2, 0);
+ nOTUsBreaks.assign(2, 0);
+
+ nSeqsBreaks[0] = 0;
+ nSeqsBreaks[1] = numSeqs;
+ nOTUsBreaks[1] = numOTUs;
if (pDataArray->m->control_pressed) { break; }
- cout << "thread i = " << i << endl;
- for(int j=0;j<i;j++){
+
+ double maxDelta = 0;
+ int iter = 0;
+
+ begClock = clock();
+ begTime = time(NULL);
+
+ pDataArray->m->mothurOut("\nDenoising flowgrams...\n");
+ pDataArray->m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
+
+ while((pDataArray->maxIters == 0 && maxDelta > pDataArray->minDelta) || iter < MIN_ITER || (maxDelta > pDataArray->minDelta && iter < pDataArray->maxIters)){
+
+ if (pDataArray->m->control_pressed) { break; }
- cout << "thread j = " << j << endl;
- float flowDistance = 0.0;
- ////////////////// calcPairwiseDist ///////////////////
- //needed because this is a static global function that can't see the classes internal functions
+ double cycClock = clock();
+ unsigned long long cycTime = time(NULL);
+ //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+ /*****************************************************************************************************/
+ int indexFill = 0;
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ cumNumSeqs[i] = indexFill;
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber[indexFill] = aaP[i][j];
+ seqIndex[indexFill] = aaI[i][j];
+
+ indexFill++;
+ }
+ }
+ /*****************************************************************************************************/
+
- int minLength = pDataArray->lengths[pDataArray->mapSeqToUnique[pDataArray->mapUniqueToSeq[i]]];
- if(pDataArray->lengths[pDataArray->mapUniqueToSeq[j]] < minLength){ minLength = pDataArray->lengths[pDataArray->mapSeqToUnique[pDataArray->mapUniqueToSeq[j]]]; }
+ if (pDataArray->m->control_pressed) { break; }
+
+ //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+ /*****************************************************************************************************/
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ double count = 0;
+ int position = 0;
+ int minFlowGram = 100000000;
+ double minFlowValue = 1e8;
+ change[i] = 0; //FALSE
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ count += singleTau[seqNumber[cumNumSeqs[i] + j]];
+ }
+
+ if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
+ vector<double> adF(nSeqsPerOTU[i]);
+ vector<int> anL(nSeqsPerOTU[i]);
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ int nIU = mapSeqToUnique[nI];
+
+ int k;
+ for(k=0;k<position;k++){
+ if(nIU == anL[k]){
+ break;
+ }
+ }
+ if(k == position){
+ anL[position] = nIU;
+ adF[position] = 0.0000;
+ position++;
+ }
+ }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+
+ double tauValue = singleTau[seqNumber[index]];
+
+ for(int k=0;k<position;k++){
+ // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
+ /*****************************************************************************************************/
+ int flowAValue = anL[k] * numFlowCells;
+ int flowBValue = nI * numFlowCells;
+
+ double dist = 0;
+
+ for(int l=0;l<lengths[nI];l++){
+ dist += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
+ flowAValue++;
+ flowBValue++;
+ }
+
+ dist = dist / (double)lengths[nI];
+ /*****************************************************************************************************/
+ adF[k] += dist * tauValue;
+ }
+ }
+
+ for(int j=0;j<position;j++){
+ if(adF[j] < minFlowValue){
+ minFlowGram = j;
+ minFlowValue = adF[j];
+ }
+ }
+
+ if(centroids[i] != anL[minFlowGram]){
+ change[i] = 1;
+ centroids[i] = anL[minFlowGram];
+ }
+ }
+ else if(centroids[i] != -1){
+ change[i] = 1;
+ centroids[i] = -1;
+ }
+ }
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ //maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
+ /*****************************************************************************************************/
+ double maxChange = 0;
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ double difference = weight[i];
+ weight[i] = 0;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[index]];
+ weight[i] += tauValue;
+ }
+
+ difference = fabs(weight[i] - difference);
+ if(difference > maxChange){ maxChange = difference; }
+ }
+ maxDelta = maxChange;
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ //double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
+ /*****************************************************************************************************/
+ vector<long double> P(numSeqs, 0);
+ int effNumOTUs = 0;
+
+ for(int i=0;i<numOTUs;i++){
+ if(weight[i] > MIN_WEIGHT){
+ effNumOTUs++;
+ }
+ }
+
+ string hold;
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ double singleDist = dist[seqNumber[index]];
+
+ P[nI] += weight[i] * exp(-singleDist * pDataArray->sigma);
+ }
+ }
+ double nLL = 0.00;
+ for(int i=0;i<numSeqs;i++){
+ if(P[i] == 0){ P[i] = DBL_EPSILON; }
+
+ nLL += -log(P[i]);
+ }
+
+ nLL = nLL -(double)numSeqs * log(pDataArray->sigma);
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ //checkCentroids(numOTUs, centroids, weight);
+ /*****************************************************************************************************/
+ vector<int> unique(numOTUs, 1);
+
+ for(int i=0;i<numOTUs;i++){
+ if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
+ unique[i] = -1;
+ }
+ }
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ if(unique[i] == 1){
+ for(int j=i+1;j<numOTUs;j++){
+ if(unique[j] == 1){
+
+ if(centroids[j] == centroids[i]){
+ unique[j] = 0;
+ centroids[j] = -1;
+
+ weight[i] += weight[j];
+ weight[j] = 0.0;
+ }
+ }
+ }
+ }
+ }
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
- int ANumFlowCells = pDataArray->mapUniqueToSeq[i] * pDataArray->numFlowCells;
- int BNumFlowCells = pDataArray->mapUniqueToSeq[j] * pDataArray->numFlowCells;
+ //calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
+ /*****************************************************************************************************/
+ int total = 0;
+ vector<double> newTau(numOTUs,0);
+ vector<double> norms(numSeqs, 0);
+ nSeqsPerOTU.assign(numOTUs, 0);
+
+ for(int i=0;i<numSeqs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ int indexOffset = i * numOTUs;
+
+ double offset = 1e8;
+
+ for(int j=0;j<numOTUs;j++){
+
+ if(weight[j] > MIN_WEIGHT && change[j] == 1){
+ //dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
+ /*****************************************************************************************************/
+ int flowAValue = centroids[j] * numFlowCells;
+ int flowBValue = i * numFlowCells;
+
+ double distTemp = 0;
+
+ for(int l=0;l<lengths[i];l++){
+ distTemp += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
+ flowAValue++;
+ flowBValue++;
+ }
+
+ dist[indexOffset + j] = distTemp / (double)lengths[i];
+ /*****************************************************************************************************/
+
+ }
+
+ if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
+ offset = dist[indexOffset + j];
+ }
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ if(weight[j] > MIN_WEIGHT){
+ newTau[j] = exp(pDataArray->sigma * (-dist[indexOffset + j] + offset)) * weight[j];
+ norms[i] += newTau[j];
+ }
+ else{
+ newTau[j] = 0.0;
+ }
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ newTau[j] /= norms[i];
+ }
+
+ for(int j=0;j<numOTUs;j++){
+ if(newTau[j] > MIN_TAU){
+
+ int oldTotal = total;
+
+ total++;
+
+ singleTau.resize(total, 0);
+ seqNumber.resize(total, 0);
+ seqIndex.resize(total, 0);
+
+ singleTau[oldTotal] = newTau[j];
+
+ aaP[j][nSeqsPerOTU[j]] = oldTotal;
+ aaI[j][nSeqsPerOTU[j]] = i;
+ nSeqsPerOTU[j]++;
+ }
+ }
+
+ }
+
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
- for(int k=0;k<minLength;k++){
-
- if (pDataArray->m->control_pressed) { break; }
-
- int flowAIntI = pDataArray->flowDataIntI[ANumFlowCells + k];
- float flowAPrI = pDataArray->flowDataPrI[ANumFlowCells + k];
-
- int flowBIntI = pDataArray->flowDataIntI[BNumFlowCells + k];
- float flowBPrI = pDataArray->flowDataPrI[BNumFlowCells + k];
- flowDistance += pDataArray->jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
- }
+ iter++;
- flowDistance /= (float) minLength;
- //cout << flowDistance << endl;
- ////////////////// end of calcPairwiseDist ///////////////////
-
- if(flowDistance < 1e-6){
- outStream << pDataArray->mapUniqueToSeq[i] << '\t' << pDataArray->mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
- }
- else if(flowDistance <= pDataArray->cutoff){
- outStream << pDataArray->mapUniqueToSeq[i] << '\t' << pDataArray->mapUniqueToSeq[j] << '\t' << flowDistance << endl;
- }
- }
- if(i % 100 == 0){
- pDataArray->m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
- pDataArray->m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
- pDataArray->m->mothurOutEndLine();
- }
+ pDataArray->m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
+
+ }
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ pDataArray->m->mothurOut("\nFinalizing...\n");
+ //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+ /*****************************************************************************************************/
+ int indexFill = 0;
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ cumNumSeqs[i] = indexFill;
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber[indexFill] = aaP[i][j];
+ seqIndex[indexFill] = aaI[i][j];
+
+ indexFill++;
+ }
+ }
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ //setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
+ /*****************************************************************************************************/
+ vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[index]];
+ int sIndex = seqIndex[index];
+ bigTauMatrix[sIndex * numOTUs + i] = tauValue;
+ }
+ }
+
+ for(int i=0;i<numSeqs;i++){
+ double maxTau = -1.0000;
+ int maxOTU = -1;
+ for(int j=0;j<numOTUs;j++){
+ if(bigTauMatrix[i * numOTUs + j] > maxTau){
+ maxTau = bigTauMatrix[i * numOTUs + j];
+ maxOTU = j;
+ }
+ }
+
+ otuData[i] = maxOTU;
+ }
+
+ nSeqsPerOTU.assign(numOTUs, 0);
+
+ for(int i=0;i<numSeqs;i++){
+ int index = otuData[i];
+
+ singleTau[i] = 1.0000;
+ dist[i] = 0.0000;
+
+ aaP[index][nSeqsPerOTU[index]] = i;
+ aaI[index][nSeqsPerOTU[index]] = i;
+
+ nSeqsPerOTU[index]++;
+ }
+
+ //fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
+ /*****************************************************************************************************/
+ indexFill = 0;
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { return 0; }
+
+ cumNumSeqs[i] = indexFill;
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ seqNumber[indexFill] = aaP[i][j];
+ seqIndex[indexFill] = aaI[i][j];
+
+ indexFill++;
+ }
+ }
+ /*****************************************************************************************************/
+
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ vector<int> otuCounts(numOTUs, 0);
+ for(int i=0;i<numSeqs;i++) { otuCounts[otuData[i]]++; }
+
+ //calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
+ /*****************************************************************************************************/
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ double count = 0;
+ int position = 0;
+ int minFlowGram = 100000000;
+ double minFlowValue = 1e8;
+ change[i] = 0; //FALSE
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ count += singleTau[seqNumber[cumNumSeqs[i] + j]];
+ }
+
+ if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
+ vector<double> adF(nSeqsPerOTU[i]);
+ vector<int> anL(nSeqsPerOTU[i]);
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+ int nIU = mapSeqToUnique[nI];
+
+ int k;
+ for(k=0;k<position;k++){
+ if(nIU == anL[k]){
+ break;
+ }
+ }
+ if(k == position){
+ anL[position] = nIU;
+ adF[position] = 0.0000;
+ position++;
+ }
+ }
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int index = cumNumSeqs[i] + j;
+ int nI = seqIndex[index];
+
+ double tauValue = singleTau[seqNumber[index]];
+
+ for(int k=0;k<position;k++){
+ // double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
+ /*****************************************************************************************************/
+ int flowAValue = anL[k] * numFlowCells;
+ int flowBValue = nI * numFlowCells;
+
+ double dist = 0;
+
+ for(int l=0;l<lengths[nI];l++){
+ dist += pDataArray->singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
+ flowAValue++;
+ flowBValue++;
+ }
+
+ dist = dist / (double)lengths[nI];
+ /*****************************************************************************************************/
+ adF[k] += dist * tauValue;
+ }
+ }
+
+ for(int j=0;j<position;j++){
+ if(adF[j] < minFlowValue){
+ minFlowGram = j;
+ minFlowValue = adF[j];
+ }
+ }
+
+ if(centroids[i] != anL[minFlowGram]){
+ change[i] = 1;
+ centroids[i] = anL[minFlowGram];
+ }
+ }
+ else if(centroids[i] != -1){
+ change[i] = 1;
+ centroids[i] = -1;
+ }
+ }
+
+ /*****************************************************************************************************/
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ //writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI);
+ if (pDataArray->m->control_pressed) { break; }
+ /*****************************************************************************************************/
+ string thisOutputDir = pDataArray->outputDir;
+ if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
+ string qualityFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.qual";
+
+ ofstream qualityFile;
+ pDataArray->m->openOutputFile(qualityFileName, qualityFile);
+
+ qualityFile.setf(ios::fixed, ios::floatfield);
+ qualityFile.setf(ios::showpoint);
+ qualityFile << setprecision(6);
+
+ vector<vector<int> > qualities(numOTUs);
+ vector<double> pr(HOMOPS, 0);
+
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ int index = 0;
+ int base = 0;
+
+ if(nSeqsPerOTU[i] > 0){
+ qualities[i].assign(1024, -1);
+
+ while(index < numFlowCells){
+ double maxPrValue = 1e8;
+ short maxPrIndex = -1;
+ double count = 0.0000;
+
+ pr.assign(HOMOPS, 0);
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int lIndex = cumNumSeqs[i] + j;
+ double tauValue = singleTau[seqNumber[lIndex]];
+ int sequenceIndex = aaI[i][j];
+ short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
+
+ count += tauValue;
+
+ for(int s=0;s<HOMOPS;s++){
+ pr[s] += tauValue * pDataArray->singleLookUp[s * NUMBINS + intensity];
+ }
+ }
+
+ maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
+ maxPrValue = pr[maxPrIndex];
+
+ if(count > MIN_COUNT){
+ double U = 0.0000;
+ double norm = 0.0000;
+
+ for(int s=0;s<HOMOPS;s++){
+ norm += exp(-(pr[s] - maxPrValue));
+ }
+
+ for(int s=1;s<=maxPrIndex;s++){
+ int value = 0;
+ double temp = 0.0000;
+
+ U += exp(-(pr[s-1]-maxPrValue))/norm;
+
+ if(U>0.00){
+ temp = log10(U);
+ }
+ else{
+ temp = -10.1;
+ }
+ temp = floor(-10 * temp);
+ value = (int)floor(temp);
+ if(value > 100){ value = 100; }
+
+ qualities[i][base] = (int)value;
+ base++;
+ }
+ }
+
+ index++;
+ }
+ }
+
+
+ if(otuCounts[i] > 0){
+ qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
+
+ int j=4; //need to get past the first four bases
+ while(qualities[i][j] != -1){
+ qualityFile << qualities[i][j] << ' ';
+ j++;
+ }
+ qualityFile << endl;
+ }
+ }
+ qualityFile.close();
+ pDataArray->outputNames.push_back(qualityFileName);
+ /*****************************************************************************************************/
+
+ // writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);
+ if (pDataArray->m->control_pressed) { break; }
+ /*****************************************************************************************************/
+ thisOutputDir = pDataArray->outputDir;
+ if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
+ string fastaFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.fasta";
+ ofstream fastaFile;
+ pDataArray->m->openOutputFile(fastaFileName, fastaFile);
+
+ vector<string> names(numOTUs, "");
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ int index = centroids[i];
+
+ if(otuCounts[i] > 0){
+ fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
+
+ string newSeq = "";
+
+ for(int j=0;j<numFlowCells;j++){
+
+ char base = pDataArray->flowOrder[j % 4];
+ for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
+ newSeq += base;
+ }
+ }
+
+ fastaFile << newSeq.substr(4) << endl;
+ }
+ }
+ fastaFile.close();
+
+ pDataArray->outputNames.push_back(fastaFileName);
+
+ if(pDataArray->thisCompositeFASTAFileName != ""){
+ pDataArray->m->appendFiles(fastaFileName, pDataArray->thisCompositeFASTAFileName);
+ }
+
+ /*****************************************************************************************************/
+
+ //writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);
+ if (pDataArray->m->control_pressed) { break; }
+ /*****************************************************************************************************/
+ thisOutputDir = pDataArray->outputDir;
+ if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
+ string nameFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.names";
+ ofstream nameFileOut;
+ pDataArray->m->openOutputFile(nameFileName, nameFileOut);
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) { break; }
+
+ if(otuCounts[i] > 0){
+ nameFileOut << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
+
+ for(int j=1;j<nSeqsPerOTU[i];j++){
+ nameFileOut << ',' << seqNameVector[aaI[i][j]];
+ }
+
+ nameFileOut << endl;
+ }
+ }
+ nameFileOut.close();
+ pDataArray->outputNames.push_back(nameFileName);
+
+
+ if(pDataArray->thisCompositeNameFileName != ""){
+ pDataArray->m->appendFiles(nameFileName, pDataArray->thisCompositeNameFileName);
+ }
+ /*****************************************************************************************************/
+
+ //writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);
+ if (pDataArray->m->control_pressed) { break; }
+ /*****************************************************************************************************/
+ thisOutputDir = pDataArray->outputDir;
+ if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
+ string otuCountsFileName = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName)) + "shhh.counts";
+ ofstream otuCountsFile;
+ pDataArray->m->openOutputFile(otuCountsFileName, otuCountsFile);
+
+ string bases = pDataArray->flowOrder;
+
+ for(int i=0;i<numOTUs;i++){
+
+ if (pDataArray->m->control_pressed) {
+ break;
+ }
+ //output the translated version of the centroid sequence for the otu
+ if(otuCounts[i] > 0){
+ int index = centroids[i];
+
+ otuCountsFile << "ideal\t";
+ for(int j=8;j<numFlowCells;j++){
+ char base = bases[j % 4];
+ for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
+ otuCountsFile << base;
+ }
+ }
+ otuCountsFile << endl;
+
+ for(int j=0;j<nSeqsPerOTU[i];j++){
+ int sequence = aaI[i][j];
+ otuCountsFile << seqNameVector[sequence] << '\t';
+
+ string newSeq = "";
+
+ for(int k=0;k<lengths[sequence];k++){
+ char base = bases[k % 4];
+ int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
+
+ for(int s=0;s<freq;s++){
+ newSeq += base;
+ //otuCountsFile << base;
+ }
+ }
+ otuCountsFile << newSeq.substr(4) << endl;
+ }
+ otuCountsFile << endl;
+ }
+ }
+ otuCountsFile.close();
+ pDataArray->outputNames.push_back(otuCountsFileName);
+ /*****************************************************************************************************/
+
+ //writeGroups(flowFileName, numSeqs, seqNameVector);
+ if (pDataArray->m->control_pressed) { break; }
+ /*****************************************************************************************************/
+ thisOutputDir = pDataArray->outputDir;
+ if (pDataArray->outputDir == "") { thisOutputDir += pDataArray->m->hasPath(flowFileName); }
+ string fileRoot = thisOutputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(flowFileName));
+ string groupFileName = fileRoot + "shhh.groups";
+ ofstream groupFile;
+ pDataArray->m->openOutputFile(groupFileName, groupFile);
+
+ for(int i=0;i<numSeqs;i++){
+ if (pDataArray->m->control_pressed) { break; }
+ groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
+ }
+ groupFile.close();
+ pDataArray->outputNames.push_back(groupFileName);
+ /*****************************************************************************************************/
+
+ pDataArray->m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
}
- ofstream distFile(pDataArray->distFileName.c_str());
- distFile << outStream.str();
- distFile.close();
-
- if (pDataArray->m->control_pressed) {}
- else {
- pDataArray->m->mothurOut(toString(pDataArray->stopSeq-1) + "\t" + toString(time(NULL) - begTime));
- pDataArray->m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
- pDataArray->m->mothurOutEndLine();
- }
+ if (pDataArray->m->control_pressed) { for (int i = 0; i < pDataArray->outputNames.size(); i++) { pDataArray->m->mothurRemove(pDataArray->outputNames[i]); } return 0; }
+
+ return 0;
}
catch(exception& e) {
- pDataArray->m->errorOut(e, "ShhherCommand", "MyflowDistParentForkThreadFunction");
+ pDataArray->m->errorOut(e, "ShhherCommand", "ShhhFlowsThreadFunction");
exit(1);
}
}
lines.push_back(linePair(startIndex, endIndex));
}
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
string inputString = "fasta=" + fastaFile + ", name=" + nameFile;
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
-
+ m->mothurCalling = true;
+
Command* uniqueCommand = new DeconvoluteCommand(inputString);
uniqueCommand->execute();
map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
delete uniqueCommand;
-
+ m->mothurCalling = false;
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
string newnameFile = filenames["name"][0];
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
static DWORD WINAPI MyShhhSeqsThreadFunction(LPVOID lpParam){
shhhseqsData* pDataArray;
--- /dev/null
+//
+// sortseqscommand.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 2/3/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "sortseqscommand.h"
+#include "sequence.hpp"
+#include "qualityscores.h"
+
+//**********************************************************************************************************************
+//registers the parameters accepted by sort.seqs in the parameters list and
+//returns the names of everything currently held in that list.
+vector<string> SortSeqsCommand::setParameters(){
+    try {
+        //file inputs
+        parameters.push_back(CommandParameter("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false));
+        parameters.push_back(CommandParameter("flow", "InputTypes", "", "", "none", "FNGLT", "none",false,false));
+        parameters.push_back(CommandParameter("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false));
+        parameters.push_back(CommandParameter("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false));
+        parameters.push_back(CommandParameter("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false));
+        parameters.push_back(CommandParameter("qfile", "InputTypes", "", "", "none", "FNGLT", "none",false,false));
+
+        //switches and the remaining inputs
+        parameters.push_back(CommandParameter("large", "Boolean", "", "F", "", "", "",false,false));
+        parameters.push_back(CommandParameter("accnos", "InputTypes", "", "", "none", "none", "none",false,false));
+        parameters.push_back(CommandParameter("inputdir", "String", "", "", "", "", "",false,false));
+        parameters.push_back(CommandParameter("outputdir", "String", "", "", "", "", "",false,false));
+
+        //collect just the names for the caller
+        vector<string> paramNames;
+        for (int idx = 0; idx < parameters.size(); idx++) {
+            paramNames.push_back(parameters[idx].name);
+        }
+        return paramNames;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "setParameters");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//builds the help text shown for sort.seqs: what the command does, which
+//parameters it takes and an example call.
+string SortSeqsCommand::getHelpString(){
+    try {
+        string helpString = "";
+        //fixed: the file type list was missing a comma after accnos
+        helpString += "The sort.seqs command puts the sequences in the same order for the following file types: accnos, fasta, name, group, taxonomy, flow or quality file.\n";
+        helpString += "The sort.seqs command parameters are accnos, fasta, name, group, taxonomy, flow, qfile and large.\n";
+        helpString += "The accnos file allows you to specify the order you want the files in. If none is provided, mothur will use the order of the first file it reads.\n";
+        //fixed: "parameters is" -> "parameter is"
+        helpString += "The large parameter is used to indicate your files are too large to fit in RAM.\n";
+        helpString += "The sort.seqs command should be in the following format: sort.seqs(fasta=yourFasta).\n";
+        helpString += "Example sort.seqs(fasta=amazon.fasta).\n";
+        helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
+        return helpString;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "getHelpString");
+        exit(1);
+    }
+}
+
+
+//**********************************************************************************************************************
+//default constructor: flags the command as aborted and as a help call,
+//registers the parameters and creates an empty file list for each output type.
+SortSeqsCommand::SortSeqsCommand(){
+    try {
+        calledHelp = true;
+        abort = true;
+        setParameters();
+
+        //one empty entry per output type this command can produce
+        const char* producedTypes[] = { "fasta", "taxonomy", "name", "group", "qfile", "flow" };
+        const int numProducedTypes = sizeof(producedTypes) / sizeof(producedTypes[0]);
+        for (int t = 0; t < numProducedTypes; t++) {
+            outputTypes[producedTypes[t]] = vector<string>();
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "SortSeqsCommand");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//constructor used when the command is run with an option string.
+//"help" and "citation" are answered immediately; otherwise the options are
+//parsed and validated, inputdir is applied to file names given without a path,
+//and the file name members (accnos, fasta, flow, name, group, taxonomy, qfile)
+//plus the large flag are filled in. abort is set when an option is invalid,
+//a file cannot be opened or no sortable file was supplied.
+SortSeqsCommand::SortSeqsCommand(string option) {
+    try {
+        abort = false; calledHelp = false;
+
+        //allow user to run help
+        if(option == "help") { help(); abort = true; calledHelp = true; }
+        else if(option == "citation") { citation(); abort = true; calledHelp = true;}
+
+        else {
+            vector<string> myArray = setParameters();
+
+            OptionParser parser(option);
+            map<string,string> parameters = parser.getParameters();
+
+            ValidParameters validParameter;
+            map<string,string>::iterator it;
+
+            //check to make sure all parameters are valid for command
+            for (it = parameters.begin(); it != parameters.end(); it++) {
+                if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+            }
+
+            //initialize outputTypes
+            vector<string> tempOutNames;
+            outputTypes["fasta"] = tempOutNames;
+            outputTypes["taxonomy"] = tempOutNames;
+            outputTypes["name"] = tempOutNames;
+            outputTypes["group"] = tempOutNames;
+            outputTypes["qfile"] = tempOutNames;
+            outputTypes["flow"] = tempOutNames;
+
+            //if the user changes the output directory command factory will send this info to us in the output parameter
+            outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
+
+            //if the user changes the input directory command factory will send this info to us in the output parameter
+            string inputDir = validParameter.validFile(parameters, "inputdir", false);
+            if (inputDir == "not found"){ inputDir = ""; }
+            else {
+                //every file parameter that was given without a path gets inputdir prepended,
+                //one that already carries a path is left alone
+                const char* fileParameters[] = { "fasta", "name", "group", "taxonomy", "qfile", "accnos", "flow" };
+                const int numFileParameters = sizeof(fileParameters) / sizeof(fileParameters[0]);
+                for (int i = 0; i < numFileParameters; i++) {
+                    it = parameters.find(fileParameters[i]);
+                    if (it != parameters.end()) {
+                        string path = m->hasPath(it->second);
+                        if (path == "") { parameters[fileParameters[i]] = inputDir + it->second; }
+                    }
+                }
+            }
+
+
+            //check for parameters
+            accnosfile = validParameter.validFile(parameters, "accnos", true);
+            if (accnosfile == "not open") { accnosfile = ""; abort = true; }
+            else if (accnosfile == "not found") { accnosfile = ""; }
+            else { m->setAccnosFile(accnosfile); }
+
+            fastafile = validParameter.validFile(parameters, "fasta", true);
+            if (fastafile == "not open") { fastafile = ""; abort = true; }
+            else if (fastafile == "not found") { fastafile = ""; }
+            else { m->setFastaFile(fastafile); }
+
+            flowfile = validParameter.validFile(parameters, "flow", true);
+            if (flowfile == "not open") { flowfile = ""; abort = true; }
+            else if (flowfile == "not found") { flowfile = ""; }
+            else { m->setFlowFile(flowfile); }
+
+            namefile = validParameter.validFile(parameters, "name", true);
+            if (namefile == "not open") { namefile = ""; abort = true; }
+            else if (namefile == "not found") { namefile = ""; }
+            else { m->setNameFile(namefile); }
+
+            //group, taxonomy and qfile now clear the member on "not open" as well, so the
+            //sentinel text is never left behind as a file name (matches the files above)
+            groupfile = validParameter.validFile(parameters, "group", true);
+            if (groupfile == "not open") { groupfile = ""; abort = true; }
+            else if (groupfile == "not found") { groupfile = ""; }
+            else { m->setGroupFile(groupfile); }
+
+            taxfile = validParameter.validFile(parameters, "taxonomy", true);
+            if (taxfile == "not open") { taxfile = ""; abort = true; }
+            else if (taxfile == "not found") { taxfile = ""; }
+            else { m->setTaxonomyFile(taxfile); }
+
+            qualfile = validParameter.validFile(parameters, "qfile", true);
+            if (qualfile == "not open") { qualfile = ""; abort = true; }
+            else if (qualfile == "not found") { qualfile = ""; }
+            else { m->setQualFile(qualfile); }
+
+            string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "f"; }
+            large = m->isTrue(temp);
+
+            //an accnos file alone is not enough, there has to be something to sort
+            if ((fastafile == "") && (namefile == "") && (groupfile == "") && (taxfile == "") && (flowfile == "") && (qualfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, flow or quality."); m->mothurOutEndLine(); abort = true; }
+
+            if ((fastafile != "") && (namefile == "")) {
+                vector<string> files; files.push_back(fastafile);
+                parser.getNameFile(files);
+            }
+        }
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "SortSeqsCommand");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+
+//runs the command: calls the reader for every file that was supplied, lists the
+//files that were written and makes them mothur's current files.
+//returns 2 when the command was aborted for a reason other than help, 0 otherwise.
+int SortSeqsCommand::execute(){
+    try {
+
+        if (abort == true) { return (calledHelp ? 0 : 2); }
+
+        //the readers below check whether an order has already been recorded in names,
+        //so the sequence of these calls matters
+        if (accnosfile != "") { readAccnos(); }
+        if (fastafile != "") { readFasta(); }
+        if (flowfile != "") { readFlow(); }
+        if (qualfile != "") { readQual(); }
+        if (namefile != "") { readName(); }
+        if (groupfile != "") { readGroup(); }
+        if (taxfile != "") { readTax(); }
+
+        //interrupted: throw away whatever was written
+        if (m->control_pressed) {
+            for (int f = 0; f < outputNames.size(); f++) { m->mothurRemove(outputNames[f]); }
+            return 0;
+        }
+
+        //nothing written means nothing to report
+        if (outputNames.size() == 0) { return 0; }
+
+        m->mothurOutEndLine();
+        m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+        for (int f = 0; f < outputNames.size(); f++) {
+            m->mothurOut(outputNames[f]);
+            m->mothurOutEndLine();
+        }
+        m->mothurOutEndLine();
+
+        //the first file written for each type becomes the current file of that type
+        itTypes = outputTypes.find("fasta");
+        if ((itTypes != outputTypes.end()) && ((itTypes->second).size() != 0)) {
+            string firstFile = (itTypes->second)[0];
+            m->setFastaFile(firstFile);
+        }
+
+        itTypes = outputTypes.find("name");
+        if ((itTypes != outputTypes.end()) && ((itTypes->second).size() != 0)) {
+            string firstFile = (itTypes->second)[0];
+            m->setNameFile(firstFile);
+        }
+
+        itTypes = outputTypes.find("group");
+        if ((itTypes != outputTypes.end()) && ((itTypes->second).size() != 0)) {
+            string firstFile = (itTypes->second)[0];
+            m->setGroupFile(firstFile);
+        }
+
+        itTypes = outputTypes.find("taxonomy");
+        if ((itTypes != outputTypes.end()) && ((itTypes->second).size() != 0)) {
+            string firstFile = (itTypes->second)[0];
+            m->setTaxonomyFile(firstFile);
+        }
+
+        itTypes = outputTypes.find("qfile");
+        if ((itTypes != outputTypes.end()) && ((itTypes->second).size() != 0)) {
+            string firstFile = (itTypes->second)[0];
+            m->setQualFile(firstFile);
+        }
+
+        itTypes = outputTypes.find("flow");
+        if ((itTypes != outputTypes.end()) && ((itTypes->second).size() != 0)) {
+            string firstFile = (itTypes->second)[0];
+            m->setFlowFile(firstFile);
+        }
+
+        return 0;
+    }
+
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "execute");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+//writes the sequences of fastafile to <root>sorted<extension>.
+//if names already holds an order the sequences are written in that order and
+//sequences missing from names are appended to it; otherwise the sequences are
+//copied as they are read and their positions are recorded in names.
+//always returns 0, also when the run was interrupted (the output file is removed then).
+int SortSeqsCommand::readFasta(){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
+        string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "sorted" + m->getExtension(fastafile);
+        outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        ifstream in;
+        m->openInputFile(fastafile, in);
+        string name;
+
+        if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
+
+            if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
+                //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
+                //this way we only store 1000 seqs in memory at a time.
+
+                int numNames = names.size();
+                int numNamesInFile = 0;
+
+                //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
+                while(!in.eof()){
+                    if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                    Sequence currSeq(in);
+                    name = currSeq.getName();
+
+                    if (name != "") {
+                        numNamesInFile++;
+                        map<string, int>::iterator it = names.find(name);
+                        if (it == names.end()) {
+                            names[name] = numNames; numNames++;
+                            m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
+                        }
+                    }
+                    m->gobble(in);
+                }
+                in.close();
+                out.close();
+
+                //a record of this file can sit in any slot of names, so every slot has to be covered.
+                //capping this at numNamesInFile dropped the records whose slot was beyond that count.
+                int numLeft = names.size();
+
+                int size = 1000; //assume that user can hold 1000 seqs in memory
+                if (numLeft < size) { size = numLeft; }
+                int times = 0;
+                int totalFound = 0; //records placed so far, lets us stop once the whole file has been written
+
+                vector<Sequence> seqs; seqs.resize(size);
+
+                while ((numLeft > 0) && (totalFound < numNamesInFile)) {
+
+                    //blank the window on every pass so a slot with no record in this file
+                    //does not print the record left over from the previous window
+                    for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); }
+
+                    ifstream in2;
+                    m->openInputFile(fastafile, in2);
+
+                    if (m->control_pressed) { in2.close(); m->mothurRemove(outputFileName); return 0; }
+
+                    int found = 0;
+                    int needToFind = size;
+                    if (numLeft < size) { needToFind = numLeft; }
+
+                    while(!in2.eof()){
+                        if (m->control_pressed) { in2.close(); m->mothurRemove(outputFileName); return 0; }
+
+                        //stop reading if we already found the seqs we are looking for
+                        if (found >= needToFind) { break; }
+
+                        Sequence currSeq(in2);
+                        name = currSeq.getName();
+
+                        if (name != "") {
+                            map<string, int>::iterator it = names.find(name);
+                            if (it != names.end()) { //we found it, so put it in the vector in the right place.
+                                //is it in the set of seqs we are looking for this time around
+                                int thisSeqsPlace = it->second;
+                                thisSeqsPlace -= (times * size);
+                                if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
+                                    seqs[thisSeqsPlace] = currSeq;
+                                    found++;
+                                }
+                            }else { m->mothurOut("[ERROR]: in logic of readFasta function.\n"); m->control_pressed = true; }
+                        }
+                        m->gobble(in2);
+                    }
+                    in2.close();
+
+                    ofstream out2;
+                    m->openOutputFileAppend(outputFileName, out2);
+
+                    //only the first needToFind slots belong to this window
+                    for (int i = 0; i < needToFind; i++) {
+                        if (seqs[i].getName() != "") { seqs[i].printSequence(out2); }
+                    }
+                    out2.close();
+
+                    times++;
+                    numLeft -= needToFind;
+                    totalFound += found;
+                }
+
+                m->mothurOut("Ordered " + toString(numNamesInFile) + " sequences from " + fastafile + ".\n");
+            }else {
+
+                vector<Sequence> seqs; seqs.resize(names.size());
+                for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
+
+                while(!in.eof()){
+                    if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                    Sequence currSeq(in);
+                    name = currSeq.getName();
+
+                    if (name != "") {
+                        map<string, int>::iterator it = names.find(name);
+                        if (it != names.end()) { //we found it, so put it in the vector in the right place.
+                            seqs[it->second] = currSeq;
+                        }else { //if we cant find it then add it to the end
+                            names[name] = seqs.size();
+                            seqs.push_back(currSeq);
+                            m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
+                        }
+                    }
+                    m->gobble(in);
+                }
+                in.close();
+
+                int count = 0;
+                for (int i = 0; i < seqs.size(); i++) {
+                    if (seqs[i].getName() != "") {
+                        seqs[i].printSequence(out); count++;
+                    }
+                }
+                out.close();
+
+                m->mothurOut("Ordered " + toString(count) + " sequences from " + fastafile + ".\n");
+            }
+
+        }else { //read in file to fill names
+            int count = 0;
+
+            while(!in.eof()){
+                if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                Sequence currSeq(in);
+                name = currSeq.getName();
+
+                if (name != "") {
+                    //remember the position of this sequence, it defines the order for the files read later
+                    names[name] = count;
+                    count++;
+                    currSeq.printSequence(out);
+                }
+                m->gobble(in);
+            }
+            in.close();
+            out.close();
+
+            m->mothurOut("\nUsing " + fastafile + " to determine the order. It contains " + toString(count) + " sequences.\n");
+        }
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "readFasta");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//writes the flowgrams of flowfile to <root>sorted<extension>.
+//the leading flow count of the input is copied to the output first.
+//if names already holds an order the flowgrams are written in that order and
+//names missing from it are appended; otherwise the flowgrams are copied as
+//they are read and their positions are recorded in names.
+//always returns 0, also when the run was interrupted (the output file is removed then).
+int SortSeqsCommand::readFlow(){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(flowfile); }
+        string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowfile)) + "sorted" + m->getExtension(flowfile);
+        outputTypes["flow"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        ifstream in;
+        m->openInputFile(flowfile, in);
+        int numFlows;
+        string name;
+
+        in >> numFlows; m->gobble(in);
+
+        //the sorted file has to start with the flow count as well, otherwise it cannot be
+        //read back the way this function reads its input. in large mode the records are
+        //appended later, so the header written here stays on top.
+        out << numFlows << endl;
+
+        if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
+
+            if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
+                //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
+                //this way we only store 1000 seqs in memory at a time.
+
+                int numNames = names.size();
+                int numNamesInFile = 0;
+
+                //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
+                while(!in.eof()){
+                    if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                    in >> name;
+                    string rest = m->getline(in);
+
+                    if (name != "") {
+                        numNamesInFile++;
+                        map<string, int>::iterator it = names.find(name);
+                        if (it == names.end()) {
+                            names[name] = numNames; numNames++;
+                            m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
+                        }
+                    }
+                    m->gobble(in);
+                }
+                in.close();
+                out.close();
+
+                //a record of this file can sit in any slot of names, so every slot has to be covered.
+                //capping this at numNamesInFile dropped the records whose slot was beyond that count.
+                int numLeft = names.size();
+
+                int size = 1000; //assume that user can hold 1000 seqs in memory
+                if (numLeft < size) { size = numLeft; }
+                int times = 0;
+                int totalFound = 0; //records placed so far, lets us stop once the whole file has been written
+
+                vector<string> seqs; seqs.resize(size, "");
+
+                while ((numLeft > 0) && (totalFound < numNamesInFile)) {
+
+                    //blank the window on every pass so a slot with no record in this file
+                    //does not print the record left over from the previous window
+                    for (int i = 0; i < seqs.size(); i++) { seqs[i] = ""; }
+
+                    ifstream in2;
+                    m->openInputFile(flowfile, in2); in2 >> numFlows; m->gobble(in2);
+
+                    if (m->control_pressed) { in2.close(); m->mothurRemove(outputFileName); return 0; }
+
+                    int found = 0;
+                    int needToFind = size;
+                    if (numLeft < size) { needToFind = numLeft; }
+
+                    while(!in2.eof()){
+                        if (m->control_pressed) { in2.close(); m->mothurRemove(outputFileName); return 0; }
+
+                        //stop reading if we already found the seqs we are looking for
+                        if (found >= needToFind) { break; }
+
+                        in2 >> name;
+                        string rest = m->getline(in2);
+
+                        if (name != "") {
+                            map<string, int>::iterator it = names.find(name);
+                            if (it != names.end()) { //we found it, so put it in the vector in the right place.
+                                //is it in the set of seqs we are looking for this time around
+                                int thisSeqsPlace = it->second;
+                                thisSeqsPlace -= (times * size);
+                                if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
+                                    seqs[thisSeqsPlace] = (name +'\t' + rest);
+                                    found++;
+                                }
+                            }else { m->mothurOut("[ERROR]: in logic of readFlow function.\n"); m->control_pressed = true; }
+                        }
+                        m->gobble(in2);
+                    }
+                    in2.close();
+
+                    ofstream out2;
+                    m->openOutputFileAppend(outputFileName, out2);
+
+                    //only the first needToFind slots belong to this window
+                    for (int i = 0; i < needToFind; i++) {
+                        if (seqs[i] != "") {
+                            out2 << seqs[i] << endl;
+                        }
+                    }
+                    out2.close();
+
+                    times++;
+                    numLeft -= needToFind;
+                    totalFound += found;
+                }
+
+                m->mothurOut("Ordered " + toString(numNamesInFile) + " flows from " + flowfile + ".\n");
+            }else {
+
+                vector<string> seqs; seqs.resize(names.size(), "");
+
+                while(!in.eof()){
+                    if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                    in >> name;
+                    string rest = m->getline(in);
+
+                    if (name != "") {
+                        map<string, int>::iterator it = names.find(name);
+                        if (it != names.end()) { //we found it, so put it in the vector in the right place.
+                            seqs[it->second] = (name + '\t' + rest);
+                        }else { //if we cant find it then add it to the end
+                            names[name] = seqs.size();
+                            seqs.push_back((name + '\t' + rest));
+                            m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
+                        }
+                    }
+                    m->gobble(in);
+                }
+                in.close();
+
+                int count = 0;
+                for (int i = 0; i < seqs.size(); i++) {
+                    if (seqs[i] != "") {
+                        out << seqs[i] << endl;
+                        count++;
+                    }
+                }
+                out.close();
+
+                m->mothurOut("Ordered " + toString(count) + " flows from " + flowfile + ".\n");
+            }
+
+        }else { //read in file to fill names
+            int count = 0;
+
+            while(!in.eof()){
+                if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                in >> name;
+                string rest = m->getline(in);
+
+                if (name != "") {
+                    //remember the position of this flowgram, it defines the order for the files read later
+                    names[name] = count;
+                    count++;
+                    out << name << '\t' << rest << endl;
+                }
+                m->gobble(in);
+            }
+            in.close();
+            out.close();
+
+            m->mothurOut("\nUsing " + flowfile + " to determine the order. It contains " + toString(count) + " flows.\n");
+        }
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "readFlow");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+//writes the quality scores of qualfile to <root>sorted<extension>.
+//if names already holds an order the records are written in that order and
+//names missing from it are appended; otherwise the records are copied as
+//they are read and their positions are recorded in names.
+//always returns 0, also when the run was interrupted (the output file is removed then).
+int SortSeqsCommand::readQual(){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(qualfile); }
+        string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(qualfile)) + "sorted" + m->getExtension(qualfile);
+        outputTypes["qfile"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        ifstream in;
+        m->openInputFile(qualfile, in);
+        string name;
+
+        if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
+
+            if (large) { //if the file is too large to fit in memory we can still process it, but the io will be very time consuming.
+                //read through the file looking for 1000 seqs at a time. Once we find them output them and start looking for the next 1000.
+                //this way we only store 1000 seqs in memory at a time.
+
+                int numNames = names.size();
+                int numNamesInFile = 0;
+
+                //to make sure we dont miss any seqs, add any seqs that are not in names but in the file to the end of names
+                while(!in.eof()){
+                    if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                    QualityScores currQual;
+                    currQual = QualityScores(in);
+                    name = currQual.getName();
+
+                    if (name != "") {
+                        numNamesInFile++;
+                        map<string, int>::iterator it = names.find(name);
+                        if (it == names.end()) {
+                            names[name] = numNames; numNames++;
+                            m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
+                        }
+                    }
+                    m->gobble(in);
+                }
+                in.close();
+                out.close();
+
+                //a record of this file can sit in any slot of names, so every slot has to be covered.
+                //capping this at numNamesInFile dropped the records whose slot was beyond that count.
+                int numLeft = names.size();
+
+                int size = 1000; //assume that user can hold 1000 seqs in memory
+                if (numLeft < size) { size = numLeft; }
+                int times = 0;
+                int totalFound = 0; //records placed so far, lets us stop once the whole file has been written
+
+                vector<QualityScores> seqs; seqs.resize(size);
+
+                while ((numLeft > 0) && (totalFound < numNamesInFile)) {
+
+                    //blank the window on every pass so a slot with no record in this file
+                    //does not print the record left over from the previous window
+                    for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); }
+
+                    ifstream in2;
+                    m->openInputFile(qualfile, in2);
+
+                    if (m->control_pressed) { in2.close(); m->mothurRemove(outputFileName); return 0; }
+
+                    int found = 0;
+                    int needToFind = size;
+                    if (numLeft < size) { needToFind = numLeft; }
+
+                    while(!in2.eof()){
+                        if (m->control_pressed) { in2.close(); m->mothurRemove(outputFileName); return 0; }
+
+                        //stop reading if we already found the seqs we are looking for
+                        if (found >= needToFind) { break; }
+
+                        QualityScores currQual;
+                        currQual = QualityScores(in2);
+                        name = currQual.getName();
+
+                        if (name != "") {
+                            map<string, int>::iterator it = names.find(name);
+                            if (it != names.end()) { //we found it, so put it in the vector in the right place.
+                                //is it in the set of seqs we are looking for this time around
+                                int thisSeqsPlace = it->second;
+                                thisSeqsPlace -= (times * size);
+                                if ((thisSeqsPlace < size) && (thisSeqsPlace >= 0)) {
+                                    seqs[thisSeqsPlace] = currQual;
+                                    found++;
+                                }
+                            }else { m->mothurOut("[ERROR]: in logic of readQual function.\n"); m->control_pressed = true; }
+                        }
+                        m->gobble(in2);
+                    }
+                    in2.close();
+
+                    ofstream out2;
+                    m->openOutputFileAppend(outputFileName, out2);
+
+                    //only the first needToFind slots belong to this window
+                    for (int i = 0; i < needToFind; i++) {
+                        if (seqs[i].getName() != "") {
+                            seqs[i].printQScores(out2);
+                        }
+                    }
+                    out2.close();
+
+                    times++;
+                    numLeft -= needToFind;
+                    totalFound += found;
+                }
+
+                m->mothurOut("Ordered " + toString(numNamesInFile) + " sequences from " + qualfile + ".\n");
+
+            }else {
+
+                vector<QualityScores> seqs; seqs.resize(names.size());
+                for (int i = 0; i < seqs.size(); i++) { seqs[i].setName(""); } //this is so if some of the seqs are missing we dont print out garbage
+
+                while(!in.eof()){
+                    if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                    QualityScores currQual;
+                    currQual = QualityScores(in);
+                    name = currQual.getName();
+
+                    if (name != "") {
+                        map<string, int>::iterator it = names.find(name);
+                        if (it != names.end()) { //we found it, so put it in the vector in the right place.
+                            seqs[it->second] = currQual;
+                        }else { //if we cant find it then add it to the end
+                            names[name] = seqs.size();
+                            seqs.push_back(currQual);
+                            m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
+                        }
+                    }
+                    m->gobble(in);
+                }
+                in.close();
+
+                int count = 0;
+                for (int i = 0; i < seqs.size(); i++) {
+                    if (seqs[i].getName() != "") { seqs[i].printQScores(out); count++; }
+                }
+                out.close();
+
+                m->mothurOut("Ordered " + toString(count) + " sequences from " + qualfile + ".\n");
+            }
+
+        }else { //read in file to fill names
+            int count = 0;
+
+            while(!in.eof()){
+                if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                QualityScores currQual;
+                currQual = QualityScores(in);
+
+                if (currQual.getName() != "") {
+                    //remember the position of this record, it defines the order for the files read later
+                    names[currQual.getName()] = count;
+                    count++;
+                    currQual.printQScores(out);
+                }
+                //one gobble per record, like the other loops (the second call was redundant)
+                m->gobble(in);
+            }
+            in.close();
+            out.close();
+
+            m->mothurOut("\nUsing " + qualfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
+        }
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "readQual");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+//writes the two column records of namefile to <root>sorted<extension>.
+//with an order already in names the records are placed by their first column and
+//unknown names are appended; without one the file is copied as read and the
+//position of every first column is recorded in names. always returns 0.
+int SortSeqsCommand::readName(){
+    try {
+        //the result goes to outputdir, or next to the input when none was set
+        string destDir = outputDir;
+        if (outputDir == "") { destDir += m->hasPath(namefile); }
+        string sortedFileName = destDir + m->getRootName(m->getSimpleName(namefile)) + "sorted" + m->getExtension(namefile);
+        outputTypes["name"].push_back(sortedFileName);
+        outputNames.push_back(sortedFileName);
+
+        ofstream sortedOut;
+        m->openOutputFile(sortedFileName, sortedOut);
+
+        ifstream nameIn;
+        m->openInputFile(namefile, nameIn);
+        string firstCol, secondCol;
+
+        if (names.size() == 0) { //no order yet, this file defines it
+            int numRead = 0;
+
+            while(!nameIn.eof()){
+                if (m->control_pressed) { nameIn.close(); sortedOut.close(); m->mothurRemove(sortedFileName); return 0; }
+
+                nameIn >> firstCol; m->gobble(nameIn);
+                nameIn >> secondCol; m->gobble(nameIn);
+
+                if (firstCol != "") {
+                    names[firstCol] = numRead;
+                    numRead++;
+                    sortedOut << firstCol << '\t' << secondCol << endl;
+                }
+                m->gobble(nameIn);
+            }
+            nameIn.close();
+            sortedOut.close();
+
+            m->mothurOut("\nUsing " + namefile + " to determine the order. It contains " + toString(numRead) + " representative sequences.\n");
+
+            return 0;
+        }
+
+        //an order exists: one slot per known name, empty slots are skipped when writing
+        vector<string> ordered(names.size(), "");
+
+        while(!nameIn.eof()){
+            if (m->control_pressed) { nameIn.close(); sortedOut.close(); m->mothurRemove(sortedFileName); return 0; }
+
+            nameIn >> firstCol; m->gobble(nameIn);
+            nameIn >> secondCol; m->gobble(nameIn);
+
+            if (firstCol == "") { continue; }
+
+            string record = firstCol + '\t' + secondCol;
+            map<string, int>::iterator pos = names.find(firstCol);
+            if (pos == names.end()) { //unknown name, it goes to the end
+                names[firstCol] = ordered.size();
+                ordered.push_back(record);
+                m->mothurOut(firstCol + " was not in the contained the file which determined the order, adding it to the end.\n");
+            }else {
+                ordered[pos->second] = record;
+            }
+        }
+        nameIn.close();
+
+        int numWritten = 0;
+        for (int slot = 0; slot < ordered.size(); slot++) {
+            if (ordered[slot] != "") {
+                sortedOut << ordered[slot] << endl;
+                numWritten++;
+            }
+        }
+        sortedOut.close();
+
+        m->mothurOut("Ordered " + toString(numWritten) + " sequences from " + namefile + ".\n");
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "readName");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+//writes the name/group pairs of groupfile to <root>sorted<extension>.
+//with an order already in names the pairs are placed by sequence name and
+//unknown names are appended; without one the file is copied as read and the
+//position of every name is recorded in names. always returns 0.
+int SortSeqsCommand::readGroup(){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
+        //use the "sorted" tag like the fasta, flow, qfile and name readers of this command
+        //(this one was tagged "pick")
+        string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "sorted" + m->getExtension(groupfile);
+        outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        ifstream in;
+        m->openInputFile(groupfile, in);
+        string name, group;
+
+        if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
+
+            //one slot per known name, slots that stay empty are skipped when writing
+            vector<string> seqs; seqs.resize(names.size(), "");
+
+            while(!in.eof()){
+                if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                in >> name; m->gobble(in);
+                in >> group; m->gobble(in);
+
+                if (name != "") {
+                    map<string, int>::iterator it = names.find(name);
+                    if (it != names.end()) { //we found it, so put it in the vector in the right place.
+                        seqs[it->second] = name + '\t' + group;
+                    }else { //if we cant find it then add it to the end
+                        names[name] = seqs.size();
+                        seqs.push_back((name + '\t' + group));
+                        m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
+                    }
+                }
+            }
+            in.close();
+
+            int count = 0;
+            for (int i = 0; i < seqs.size(); i++) {
+                if (seqs[i] != "") { out << seqs[i] << endl; count++; }
+            }
+            out.close();
+
+            m->mothurOut("Ordered " + toString(count) + " sequences from " + groupfile + ".\n");
+
+        }else { //read in file to fill names
+            int count = 0;
+
+            while(!in.eof()){
+                if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                in >> name; m->gobble(in);
+                in >> group; m->gobble(in);
+
+                if (name != "") {
+                    //remember the position of this name, it defines the order for the files read later
+                    names[name] = count;
+                    count++;
+                    out << name << '\t' << group << endl;
+                }
+                m->gobble(in);
+            }
+            in.close();
+            out.close();
+
+            m->mothurOut("\nUsing " + groupfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "readGroup");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Writes the "name<TAB>taxonomy" records of taxfile to <root>pick<ext> in the shared sequence order.
+// If `names` already holds an order (an earlier file was read), each record is stored at its saved
+// position and names that are not in `names` are appended to the end. Otherwise this file defines
+// the order: records are copied through as read and `names` is filled with name -> position.
+// Returns 0; when m->control_pressed is set the streams are closed, mothurRemove is called on the
+// partial output file and 0 is returned as well.
+int SortSeqsCommand::readTax(){
+    try {
+        string thisOutputDir = outputDir;
+        if (outputDir == "") {  thisOutputDir += m->hasPath(taxfile);  }
+        string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pick" + m->getExtension(taxfile);
+        outputTypes["taxonomy"].push_back(outputFileName);  outputNames.push_back(outputFileName);
+
+        ofstream out;
+        m->openOutputFile(outputFileName, out);
+
+        ifstream in;
+        m->openInputFile(taxfile, in);
+        string name, tax;
+
+        if (names.size() != 0) {//this is not the first file we are reading so we need to use the order we already have
+
+            //size the buffer from the largest stored position, not only from names.size(): if the
+            //file that set the order repeated a name, positions can run past names.size() and the
+            //seqs[it->second] write below would land outside the vector.
+            int numSlots = names.size();
+            for (map<string, int>::iterator itName = names.begin(); itName != names.end(); itName++) {
+                if (itName->second >= numSlots) { numSlots = itName->second + 1; }
+            }
+            vector<string> seqs; seqs.resize(numSlots, "");
+
+            while(!in.eof()){
+                if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                in >> name; m->gobble(in);
+                in >> tax; m->gobble(in);
+
+                if (name != "") {
+                    map<string, int>::iterator it = names.find(name);
+                    if (it != names.end()) { //we found it, so put it in the vector in the right place.
+                        seqs[it->second] = name + '\t' + tax;
+                    }else { //if we cant find it then add it to the end
+                        names[name] = seqs.size();
+                        seqs.push_back((name + '\t' + tax));
+                        m->mothurOut(name + " was not in the contained the file which determined the order, adding it to the end.\n");
+                    }
+                }
+            }
+            in.close();
+
+            //empty slots belong to names that never showed up in this file; skip them
+            int count = 0;
+            for (int i = 0; i < seqs.size(); i++) {
+                if (seqs[i] != "") { out << seqs[i] << '\n'; count++; } //'\n' avoids a flush per record; close() flushes
+            }
+            out.close();
+
+            m->mothurOut("Ordered " + toString(count) + " sequences from " + taxfile + ".\n");
+
+        }else { //read in file to fill names
+            int count = 0;
+
+            while(!in.eof()){
+                if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
+
+                in >> name; m->gobble(in);
+                in >> tax; m->gobble(in);
+
+                if (name != "") {
+                    //the first occurrence of a name fixes its position; a repeated name must not
+                    //advance count, otherwise positions stop being contiguous and later readers
+                    //that size their buffers from names.size() index past the end.
+                    if (names.find(name) == names.end()) { names[name] = count; count++; }
+                    out << name << '\t' << tax << '\n';
+                }
+                m->gobble(in);
+            }
+            in.close();
+            out.close();
+
+            m->mothurOut("\nUsing " + taxfile + " to determine the order. It contains " + toString(count) + " sequences.\n");
+        }
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "readTax");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Reads accnosfile one whitespace-separated name at a time and records in `names` the position at
+// which each name first appears; that order is what the other read* functions then follow.
+// Stops early (without error) when m->control_pressed is set. Always returns 0.
+int SortSeqsCommand::readAccnos(){
+    try {
+
+        ifstream in;
+        m->openInputFile(accnosfile, in);
+        string name;
+        int count = 0;
+
+        while(!in.eof()){
+
+            if (m->control_pressed) { break; }
+
+            in >> name; m->gobble(in);
+
+            if (name != "") {
+                //only a name's first occurrence gets a position. Letting a repeated name advance
+                //count would leave gaps, so positions could exceed names.size() and the readers
+                //that size their buffers from names.size() would write out of bounds.
+                if (names.find(name) == names.end()) {
+                    names[name] = count;
+                    count++;
+                }
+            }
+        }
+        in.close();
+
+        m->mothurOut("\nUsing " + accnosfile + " to determine the order. It contains " + toString(count) + " representative sequences.\n");
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SortSeqsCommand", "readAccnos");
+        exit(1);
+    }
+}
+
+//**********************************************************************************************************************
+
+
+
+
+
--- /dev/null
+#ifndef Mothur_sortseqscommand_h
+#define Mothur_sortseqscommand_h
+
+
+//
+// sortseqscommand.h
+// Mothur
+//
+// Created by Sarah Westcott on 2/3/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+
+
+#include "command.hpp"
+
+class SortSeqsCommand : public Command {
+ // Command object for "sort.seqs": per getDescription(), puts the sequences of the supplied files in the same order.
+public:
+ // NOTE(review): the string argument is presumably the user's option string - confirm in the .cpp.
+ SortSeqsCommand(string);
+ SortSeqsCommand();
+ ~SortSeqsCommand(){}
+ // Command metadata: name, category, help text, citation URL and one-line description.
+ vector<string> setParameters();
+ string getCommandName() { return "sort.seqs"; }
+ string getCommandCategory() { return "Sequence Processing"; }
+ string getHelpString();
+ string getCitation() { return "http://www.mothur.org/wiki/Sort.seqs"; }
+ string getDescription() { return "puts sequences from a fasta, name, group, quality, flow or taxonomy file in the same order"; }
+ // execute() runs the command; help() prints getHelpString() through mothurOut.
+ int execute();
+ void help() { m->mothurOut(getHelpString()); }
+
+
+private:
+ map<string, int> names; // sequence name -> position in the output order; filled by whichever file is read first
+ string accnosfile, fastafile, namefile, groupfile, taxfile, qualfile, flowfile, outputDir; // input file names and output directory
+ bool abort, large; // NOTE(review): meaning of `large` is not visible in the header - confirm in the .cpp
+ vector<string> outputNames; // names of the files this command has created
+ // One reader per input type. readGroup/readTax write a re-ordered copy; readAccnos only fills `names`.
+ int readFasta();
+ int readFlow();
+ int readName();
+ int readGroup();
+ int readAccnos();
+ int readTax();
+ int readQual();
+
+};
+
+#endif
+
+
--- /dev/null
+//
+// subsample.cpp
+// Mothur
+//
+// Created by Sarah Westcott on 4/2/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "subsample.h"
+
+//**********************************************************************************************************************
+// Randomly subsamples, in place, every group in thislookup whose sequence count differs from `size`
+// so that it holds `size` sequences, then drops the OTUs that ended up empty in all groups.
+//   thislookup - one SharedRAbundVector per group; entries are deleted and replaced.
+//   size       - number of sequences to keep per group.
+// Returns the bin labels that match the subsampled vectors. m->currentBinLabels is put back to the
+// value it had on entry before returning, so the function can be called again for the next label.
+vector<string> SubSample::getSample(vector<SharedRAbundVector*>& thislookup, int size) {
+    try {
+
+        //save mothurOut's binLabels to restore for next label
+        vector<string> saveBinLabels = m->currentBinLabels;
+
+        //nothing to subsample; also keeps thislookup[0] below from indexing an empty vector
+        if (thislookup.empty()) { return saveBinLabels; }
+
+        int numBins = thislookup[0]->getNumBins();
+        for (int i = 0; i < thislookup.size(); i++) {
+            int thisSize = thislookup[i]->getNumSeqs();
+
+            if (thisSize != size) {
+
+                string thisgroup = thislookup[i]->getGroup();
+
+                //one entry per sequence, holding the bin that sequence belongs to
+                OrderVector order;
+                for(int p=0;p<numBins;p++){
+                    int binAbund = thislookup[i]->getAbundance(p);
+                    for(int j=0;j<binAbund;j++){
+                        order.push_back(p);
+                    }
+                }
+                random_shuffle(order.begin(), order.end());
+
+                SharedRAbundVector* temp = new SharedRAbundVector(numBins);
+                temp->setLabel(thislookup[i]->getLabel());
+                temp->setGroup(thislookup[i]->getGroup());
+
+                delete thislookup[i];
+                thislookup[i] = temp;
+
+                //the first `size` entries of the shuffled order are the sample
+                //NOTE(review): if a group has fewer than `size` sequences, order holds fewer than
+                //`size` entries and order.get(j) is asked for positions past its end - callers
+                //presumably filter such groups out first; confirm.
+                for (int j = 0; j < size; j++) {
+
+                    //currentBinLabels has not been touched yet, so there is nothing to restore here
+                    if (m->control_pressed) {  return m->currentBinLabels; }
+
+                    int bin = order.get(j);
+
+                    int abund = thislookup[i]->getAbundance(bin);
+                    thislookup[i]->set(bin, (abund+1), thisgroup);
+                }
+            }
+        }
+
+        //subsampling may have created some otus with no sequences in them
+        eliminateZeroOTUS(thislookup);
+
+        //hand back the labels eliminateZeroOTUS left in mothurOut and restore the caller's labels.
+        //This also covers a cancelled run: the returned value is the same as before, but
+        //m->currentBinLabels is no longer left pointing at the subsample's labels.
+        vector<string> subsampleBinLabels = m->currentBinLabels;
+        m->currentBinLabels = saveBinLabels;
+
+        return subsampleBinLabels;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSample", "getSample");
+        exit(1);
+    }
+}
+//**********************************************************************************************************************
+// Rebuilds thislookup without the bins (OTUs) whose abundance is zero in every group and sets
+// m->currentBinLabels to the labels of the bins that were kept. A kept bin reuses its existing
+// label when m->currentBinLabels has one at that index; otherwise a zero-padded "Otu<n>" label is
+// generated. The old vectors are deleted and replaced by newly allocated ones.
+// Returns 0. If m->control_pressed is set part-way, the new vectors are freed and thislookup and
+// m->currentBinLabels are left as they were.
+int SubSample::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
+    try {
+
+        //no groups means no bins to inspect; bail out before thislookup[0] is dereferenced
+        if (thislookup.empty()) { return 0; }
+
+        vector<SharedRAbundVector*> newLookup;
+        for (int i = 0; i < thislookup.size(); i++) {
+            SharedRAbundVector* temp = new SharedRAbundVector();
+            temp->setLabel(thislookup[i]->getLabel());
+            temp->setGroup(thislookup[i]->getGroup());
+            newLookup.push_back(temp);
+        }
+
+        //for each bin
+        vector<string> newBinLabels;
+        int numBins = thislookup[0]->getNumBins();
+        string snumBins = toString(numBins);
+        for (int i = 0; i < numBins; i++) {
+            if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
+
+            //look at each sharedRabund and make sure they are not all zero
+            bool allZero = true;
+            for (int j = 0; j < thislookup.size(); j++) {
+                if (thislookup[j]->getAbundance(i) != 0) { allZero = false;  break;  }
+            }
+
+            //if they are not all zero add this bin
+            if (!allZero) {
+                for (int j = 0; j < thislookup.size(); j++) {
+                    newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
+                }
+
+                //if there is a bin label use it otherwise make one
+                string binLabel;
+                if (i < m->currentBinLabels.size()) {
+                    binLabel = m->currentBinLabels[i];
+                }else {
+                    //pad the bin number with zeros to the width of the largest bin number
+                    binLabel = "Otu";
+                    string sbinNumber = toString(i+1);
+                    if (sbinNumber.length() < snumBins.length()) {
+                        int diff = snumBins.length() - sbinNumber.length();
+                        for (int h = 0; h < diff; h++) { binLabel += "0"; }
+                    }
+                    binLabel += sbinNumber;
+                }
+
+                newBinLabels.push_back(binLabel);
+            }
+        }
+
+        for (int j = 0; j < thislookup.size(); j++) {  delete thislookup[j];  }
+        thislookup.clear();
+
+        thislookup = newLookup;
+        m->currentBinLabels = newBinLabels;
+
+        return 0;
+
+    }
+    catch(exception& e) {
+        m->errorOut(e, "SubSample", "eliminateZeroOTUS");
+        exit(1);
+    }
+}
+
+
+//**********************************************************************************************************************
+
+
--- /dev/null
+#ifndef Mothur_subsample_h
+#define Mothur_subsample_h
+
+//
+// subsample.h
+// Mothur
+//
+// Created by Sarah Westcott on 4/2/12.
+// Copyright (c) 2012 Schloss Lab. All rights reserved.
+//
+
+#include "mothurout.h"
+#include "sharedrabundvector.h"
+
+//subsampling overwrites the sharedRabunds. If you need to reuse the originals, deep copy the vector before calling getSample.
+
+class SubSample {
+
+    public:
+
+        //grabs the shared MothurOut instance used for bin labels, cancellation and error reporting
+        SubSample() : m(MothurOut::getInstance()) {}
+        ~SubSample() {}
+
+        //Subsamples each group in thislookup to `size` sequences and returns the bin labels of the
+        //subsample. mothurOut's bin labels are preserved, so this can be run multiple times.
+        //The vector passed in is overwritten; deep copy it first if the original is still needed.
+        vector<string> getSample(vector<SharedRAbundVector*>& thislookup, int size);
+
+    private:
+
+        MothurOut* m;
+
+        //removes the bins that are zero in every group after subsampling
+        int eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup);
+
+};
+
+#endif
#include "subsamplecommand.h"
#include "sharedutilities.h"
#include "deconvolutecommand.h"
+#include "subsample.h"
//**********************************************************************************************************************
vector<string> SubSampleCommand::setParameters(){
string inputString = "fasta=" + outputFileName;
m->mothurOut("/******************************************/"); m->mothurOutEndLine();
m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
-
+ m->mothurCalling = true;
+
Command* uniqueCommand = new DeconvoluteCommand(inputString);
uniqueCommand->execute();
map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
delete uniqueCommand;
-
+ m->mothurCalling = false;
+
outputTypes["name"].push_back(filenames["name"][0]); outputNames.push_back(filenames["name"][0]);
m->mothurRemove(outputFileName);
outputFileName = filenames["fasta"][0];
string thisOutputDir = outputDir;
if (outputDir == "") { thisOutputDir += m->hasPath(sharedfile); }
string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(sharedfile)) + thislookup[0]->getLabel() + ".subsample" + m->getExtension(sharedfile);
-
-
- ofstream out;
+
+ SubSample sample;
+ vector<string> subsampledLabels = sample.getSample(thislookup, size);
+
+ if (m->control_pressed) { return 0; }
+
+ ofstream out;
m->openOutputFile(outputFileName, out);
outputTypes["shared"].push_back(outputFileName); outputNames.push_back(outputFileName);
- int numBins = thislookup[0]->getNumBins();
- for (int i = 0; i < thislookup.size(); i++) {
- int thisSize = thislookup[i]->getNumSeqs();
-
- if (thisSize != size) {
-
- string thisgroup = thislookup[i]->getGroup();
-
- OrderVector* order = new OrderVector();
- for(int p=0;p<numBins;p++){
- for(int j=0;j<thislookup[i]->getAbundance(p);j++){
- order->push_back(p);
- }
- }
- random_shuffle(order->begin(), order->end());
-
- SharedRAbundVector* temp = new SharedRAbundVector(numBins);
- temp->setLabel(thislookup[i]->getLabel());
- temp->setGroup(thislookup[i]->getGroup());
-
- delete thislookup[i];
- thislookup[i] = temp;
-
-
- for (int j = 0; j < size; j++) {
-
- if (m->control_pressed) { delete order; out.close(); return 0; }
-
- //get random number to sample from order between 0 and thisSize-1.
- //don't need this because of the random shuffle above
- //int myrand = int((float)(thisSize) * (float)(rand()) / ((float)RAND_MAX+1.0));
-
- int bin = order->get(j);
-
- int abund = thislookup[i]->getAbundance(bin);
- thislookup[i]->set(bin, (abund+1), thisgroup);
- }
- delete order;
- }
- }
-
- //subsampling may have created some otus with no sequences in them
- eliminateZeroOTUS(thislookup);
-
- if (m->control_pressed) { out.close(); return 0; }
-
+ m->currentBinLabels = subsampledLabels;
+
thislookup[0]->printHeaders(out);
for (int i = 0; i < thislookup.size(); i++) {
out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t';
thislookup[i]->print(out);
}
-
out.close();
-
- //save mothurOut's binLabels to restore for next label
+
+
+ //save mothurOut's binLabels to restore for next label
m->currentBinLabels = saveBinLabels;
return 0;
}
}
//**********************************************************************************************************************
-int SubSampleCommand::eliminateZeroOTUS(vector<SharedRAbundVector*>& thislookup) {
- try {
-
- vector<SharedRAbundVector*> newLookup;
- for (int i = 0; i < thislookup.size(); i++) {
- SharedRAbundVector* temp = new SharedRAbundVector();
- temp->setLabel(thislookup[i]->getLabel());
- temp->setGroup(thislookup[i]->getGroup());
- newLookup.push_back(temp);
- }
-
- //for each bin
- vector<string> newBinLabels;
- string snumBins = toString(thislookup[0]->getNumBins());
- for (int i = 0; i < thislookup[0]->getNumBins(); i++) {
- if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
-
- //look at each sharedRabund and make sure they are not all zero
- bool allZero = true;
- for (int j = 0; j < thislookup.size(); j++) {
- if (thislookup[j]->getAbundance(i) != 0) { allZero = false; break; }
- }
-
- //if they are not all zero add this bin
- if (!allZero) {
- for (int j = 0; j < thislookup.size(); j++) {
- newLookup[j]->push_back(thislookup[j]->getAbundance(i), thislookup[j]->getGroup());
- }
- //if there is a bin label use it otherwise make one
- string binLabel = "Otu";
- string sbinNumber = toString(i+1);
- if (sbinNumber.length() < snumBins.length()) {
- int diff = snumBins.length() - sbinNumber.length();
- for (int h = 0; h < diff; h++) { binLabel += "0"; }
- }
- binLabel += sbinNumber;
- if (i < m->currentBinLabels.size()) { binLabel = m->currentBinLabels[i]; }
-
- newBinLabels.push_back(binLabel);
- }
- }
-
- for (int j = 0; j < thislookup.size(); j++) { delete thislookup[j]; }
- thislookup.clear();
-
- thislookup = newLookup;
- m->currentBinLabels = newBinLabels;
-
- return 0;
-
- }
- catch(exception& e) {
- m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS");
- exit(1);
- }
-}
-
-//**********************************************************************************************************************
vector<string> names;
map<string, vector<string> > nameMap;
- int eliminateZeroOTUS(vector<SharedRAbundVector*>&);
int getSubSampleShared();
int getSubSampleList();
int getSubSampleRabund();
#include "mothur.h"
#include "database.hpp"
#include "suffixtree.hpp"
-//class SuffixTree;
class SuffixDB : public Database {
public:
SuffixDB(int);
SuffixDB();
- SuffixDB(const SuffixDB& sdb) : count(sdb.count), Database(sdb) {
- for (int i = 0; i < sdb.suffixForest.size(); i++) {
- SuffixTree temp(sdb.suffixForest[i]);
- suffixForest.push_back(temp);
- }
- }
~SuffixDB();
void generateDB() {}; //adding sequences generates the db
public:
SuffixNode(int, int, int);
- SuffixNode(const SuffixNode& sn) : parentNode(sn.parentNode), startCharPosition(sn.startCharPosition), endCharPosition(sn.endCharPosition) {m = MothurOut::getInstance();}
virtual ~SuffixNode() {}
virtual void print(string, int) = 0;
virtual void setChildren(char, int);
public:
SuffixBranch(int, int, int);
- SuffixBranch(const SuffixBranch& sb) : suffixNode(sb.suffixNode), childNodes(sb.childNodes), SuffixNode(sb.parentNode, sb.startCharPosition, sb.endCharPosition) {}
~SuffixBranch() {}
void print(string, int); // need a special method for printing the node because there are children
void eraseChild(char); // need a special method for erasing the children
return (left->getParentNode() < right->getParentNode()); // nodes in order of their parent
}
-//********************************************************************************************************************
-
-SuffixTree::SuffixTree(const SuffixTree& st) : root(st.root), activeEndPosition(st.activeEndPosition), activeStartPosition(st.activeStartPosition), activeNode(st.activeNode),
- nodeCounter(st.nodeCounter), seqName(st.seqName), sequence(st.sequence) {
- try {
- m = MothurOut::getInstance();
-
- for (int i = 0; i < st.nodeVector.size(); i++) {
- SuffixNode* temp = new SuffixBranch(*((SuffixBranch*)st.nodeVector[i]));
- nodeVector.push_back(temp);
- }
-
-
- }catch(exception& e) {
- m->errorOut(e, "SuffixTree", "SuffixTree");
- exit(1);
- }
-}
-
//********************************************************************************************************************
SuffixTree::SuffixTree(){ m = MothurOut::getInstance(); }
public:
SuffixTree();
~SuffixTree();
-// SuffixTree(string, string);
- SuffixTree(const SuffixTree&);
void loadSequence(Sequence);
string getSeqName();
if (namefile != "") { nameMap = m->readNames(namefile); }
vector<unsigned long long> positions;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
positions = m->divideFile(qualfile, processors);
for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
#else
lines.push_back(linePair(0, 1000));
}else {
positions = m->setFilePosFasta(qualfile, numSeqs);
+ if (positions.size() < processors) { processors = positions.size(); }
//figure out how many sequences you have to process
int numSeqsPerProcessor = numSeqs / processors;
count += num;
}
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = in.tellg();
if ((pos == -1) || (pos >= filePos.end)) { break; }
#else
int numSeqs = 0;
processIDS.clear();
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//loop through and create all the processes you want
while (process != processors) {
//////////////////////////////////////////////////////////////////////////////////////////////////////
//Windows version shared memory, so be careful when passing variables through the seqSumQualData struct.
//Above fork() will clone, so memory is separate, but that's not the case with windows,
- //Taking advantage of shared memory to allow both threads to add info to vectors.
+ //Taking advantage of shared memory to pass results vectors.
//////////////////////////////////////////////////////////////////////////////////////////////////////
vector<seqSumQualData*> pDataArray;
for( int i=0; i<processors; i++ ){
// Allocate memory for thread data.
- seqSumQualData* tempSum = new seqSumQualData(&position, &averageQ, &scores, filename, m, lines[i].start, lines[i].end, namefile, nameMap);
+ seqSumQualData* tempSum = new seqSumQualData(filename, m, lines[i].start, lines[i].end, namefile, nameMap);
pDataArray.push_back(tempSum);
processIDS.push_back(i);
-
- //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
- //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+
hThreadArray[i] = CreateThread(NULL, 0, MySeqSumQualThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
}
//Close all thread handles and free memory allocations.
for(int i=0; i < pDataArray.size(); i++){
numSeqs += pDataArray[i]->count;
+ int tempNum = pDataArray[i]->position.size();
+ if (position.size() < tempNum) { position.resize(tempNum, 0); }
+ if (averageQ.size() < tempNum) { averageQ.resize(tempNum, 0); }
+ if (scores.size() < tempNum) {
+ scores.resize(tempNum);
+ for (int i = 0; i < scores.size(); i++) { scores[i].resize(41, 0); }
+ }
+
+ for (int k = 0; k < tempNum; k++) { position[k] += pDataArray[i]->position[k]; }
+ for (int k = 0; k < tempNum; k++) { averageQ[k] += pDataArray[i]->averageQ[k]; }
+ for (int k = 0; k < tempNum; k++) { for (int j = 0; j < 41; j++) { scores[k][j] += pDataArray[i]->scores[k][j]; } }
+
CloseHandle(hThreadArray[i]);
delete pDataArray[i];
}
// This is passed by void pointer so it can be any data type
// that can be passed using a single void pointer (LPVOID).
struct seqSumQualData {
- vector<int>* position;
- vector<int>* averageQ;
- vector< vector<int> >* scores;
+ vector<int> position;
+ vector<int> averageQ;
+ vector< vector<int> > scores;
string filename, namefile;
unsigned long long start;
unsigned long long end;
map<string, int> nameMap;
~seqSumQualData(){}
- seqSumQualData(vector<int>* p, vector<int>* a, vector< vector<int> >* s, string f, MothurOut* mout, unsigned long long st, unsigned long long en, string n, map<string, int> nam) {
- position = p;
- averageQ = a;
- scores = s;
+ seqSumQualData(string f, MothurOut* mout, unsigned long long st, unsigned long long en, string n, map<string, int> nam) {
filename = f;
m = mout;
start = st;
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
static DWORD WINAPI MySeqSumQualThreadFunction(LPVOID lpParam){
seqSumQualData* pDataArray;
vector<int> thisScores = current.getQualityScores();
//resize to num of positions setting number of seqs with that size to 1
- if (pDataArray->position->size() < thisScores.size()) { pDataArray->position->resize(thisScores.size(), 0); }
- if (pDataArray->averageQ->size() < thisScores.size()) { pDataArray->averageQ->resize(thisScores.size(), 0); }
- if (pDataArray->scores->size() < thisScores.size()) {
- pDataArray->scores->resize(thisScores.size());
- for (int i = 0; i < pDataArray->scores->size(); i++) { pDataArray->scores->at(i).resize(41, 0); }
+ if (pDataArray->position.size() < thisScores.size()) { pDataArray->position.resize(thisScores.size(), 0); }
+ if (pDataArray->averageQ.size() < thisScores.size()) { pDataArray->averageQ.resize(thisScores.size(), 0); }
+ if (pDataArray->scores.size() < thisScores.size()) {
+ pDataArray->scores.resize(thisScores.size());
+ for (int i = 0; i < pDataArray->scores.size(); i++) { pDataArray->scores.at(i).resize(41, 0); }
}
//increase counts of number of seqs with this position
//average is really the total, we will average in execute
for (int i = 0; i < thisScores.size(); i++) {
- pDataArray->position->at(i) += num;
- pDataArray->averageQ->at(i) += (thisScores[i] * num); //weighting for namesfile
+ pDataArray->position.at(i) += num;
+ pDataArray->averageQ.at(i) += (thisScores[i] * num); //weighting for namesfile
if (thisScores[i] > 40) { pDataArray->m->mothurOut("[ERROR]: " + current.getName() + " has a quality scores of " + toString(thisScores[i]) + ", expecting values to be less than 40."); pDataArray->m->mothurOutEndLine(); pDataArray->m->control_pressed = true; }
- else { pDataArray->scores->at(i)[thisScores[i]] += num; }
+ else { pDataArray->scores.at(i)[thisScores[i]] += num; }
}
count += num;
*/
#include "summarysharedcommand.h"
-#include "sharedsobscollectsummary.h"
-#include "sharedchao1.h"
-#include "sharedace.h"
-#include "sharednseqs.h"
-#include "sharedjabund.h"
-#include "sharedsorabund.h"
-#include "sharedjclass.h"
-#include "sharedsorclass.h"
-#include "sharedjest.h"
-#include "sharedsorest.h"
-#include "sharedthetayc.h"
-#include "sharedthetan.h"
-#include "sharedkstest.h"
-#include "whittaker.h"
-#include "sharedochiai.h"
-#include "sharedanderbergs.h"
-#include "sharedkulczynski.h"
-#include "sharedkulczynskicody.h"
-#include "sharedlennon.h"
-#include "sharedmorisitahorn.h"
-#include "sharedbraycurtis.h"
-#include "sharedjackknife.h"
-#include "whittaker.h"
-#include "odum.h"
-#include "canberra.h"
-#include "structeuclidean.h"
-#include "structchord.h"
-#include "hellinger.h"
-#include "manhattan.h"
-#include "structpearson.h"
-#include "soergel.h"
-#include "spearman.h"
-#include "structkulczynski.h"
-#include "structchi2.h"
-#include "speciesprofile.h"
-#include "hamming.h"
-#include "gower.h"
-#include "memchi2.h"
-#include "memchord.h"
-#include "memeuclidean.h"
-#include "mempearson.h"
//**********************************************************************************************************************
vector<string> SummarySharedCommand::setParameters(){
/***********************************************************/
int SummarySharedCommand::process(vector<SharedRAbundVector*> thisLookup, string sumFileName, string sumAllFileName) {
try {
- vector< vector<seqDist> > calcDists; //vector containing vectors that contains the summary results for each group compare
- calcDists.resize(sumCalculators.size()); //one for each calc, this will be used to make .dist files
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- if(processors == 1){
- driver(thisLookup, 0, numGroups, sumFileName+".temp", sumAllFileName+".temp", calcDists);
- m->appendFiles((sumFileName + ".temp"), sumFileName);
- m->mothurRemove((sumFileName + ".temp"));
- if (mult) {
- m->appendFiles((sumAllFileName + ".temp"), sumAllFileName);
- m->mothurRemove((sumAllFileName + ".temp"));
- }
- }else{
- int process = 1;
- vector<int> processIDS;
-
- //loop through and create all the processes you want
- while (process != processors) {
- int pid = fork();
-
- if (pid > 0) {
- processIDS.push_back(pid);
- process++;
- }else if (pid == 0){
- driver(thisLookup, lines[process].start, lines[process].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
-
- //only do this if you want a distance file
- if (createPhylip) {
- string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(getpid()) + ".dist";
- ofstream outtemp;
- m->openOutputFile(tempdistFileName, outtemp);
-
- for (int i = 0; i < calcDists.size(); i++) {
- outtemp << calcDists[i].size() << endl;
-
- for (int j = 0; j < calcDists[i].size(); j++) {
- outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
- }
- }
- outtemp.close();
- }
-
- exit(0);
- }else {
- m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
- for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
- exit(0);
- }
- }
-
- //parent do your part
- driver(thisLookup, lines[0].start, lines[0].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
- m->appendFiles((sumFileName + toString(getpid()) + ".temp"), sumFileName);
- m->mothurRemove((sumFileName + toString(getpid()) + ".temp"));
- if (mult) { m->appendFiles((sumAllFileName + toString(getpid()) + ".temp"), sumAllFileName); }
-
- //force parent to wait until all the processes are done
- for (int i = 0; i < processIDS.size(); i++) {
- int temp = processIDS[i];
- wait(&temp);
- }
-
- for (int i = 0; i < processIDS.size(); i++) {
- m->appendFiles((sumFileName + toString(processIDS[i]) + ".temp"), sumFileName);
- m->mothurRemove((sumFileName + toString(processIDS[i]) + ".temp"));
- if (mult) { m->mothurRemove((sumAllFileName + toString(processIDS[i]) + ".temp")); }
-
- if (createPhylip) {
- string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(processIDS[i]) + ".dist";
- ifstream intemp;
- m->openInputFile(tempdistFileName, intemp);
-
- for (int k = 0; k < calcDists.size(); k++) {
- int size = 0;
- intemp >> size; m->gobble(intemp);
-
- for (int j = 0; j < size; j++) {
- int seq1 = 0;
- int seq2 = 0;
- float dist = 1.0;
-
- intemp >> seq1 >> seq2 >> dist; m->gobble(intemp);
-
- seqDist tempDist(seq1, seq2, dist);
- calcDists[k].push_back(tempDist);
- }
- }
- intemp.close();
- m->mothurRemove(tempdistFileName);
- }
- }
+ vector< vector<seqDist> > calcDists; //vector containing vectors that contains the summary results for each group compare
+ calcDists.resize(sumCalculators.size()); //one for each calc, this will be used to make .dist files
+
+
+ if(processors == 1){
+ driver(thisLookup, 0, numGroups, sumFileName+".temp", sumAllFileName+".temp", calcDists);
+ m->appendFiles((sumFileName + ".temp"), sumFileName);
+ m->mothurRemove((sumFileName + ".temp"));
+ if (mult) {
+ m->appendFiles((sumAllFileName + ".temp"), sumAllFileName);
+ m->mothurRemove((sumAllFileName + ".temp"));
+ }
+ }else{
+
+ int process = 1;
+ vector<int> processIDS;
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid);
+ process++;
+ }else if (pid == 0){
+ driver(thisLookup, lines[process].start, lines[process].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
+
+ //only do this if you want a distance file
+ if (createPhylip) {
+ string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(getpid()) + ".dist";
+ ofstream outtemp;
+ m->openOutputFile(tempdistFileName, outtemp);
+
+ for (int i = 0; i < calcDists.size(); i++) {
+ outtemp << calcDists[i].size() << endl;
+
+ for (int j = 0; j < calcDists[i].size(); j++) {
+ outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl;
+ }
+ }
+ outtemp.close();
+ }
+
+ exit(0);
+ }else {
+ m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
+ for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
+ exit(0);
+ }
+ }
+
+ //parent do your part
+ driver(thisLookup, lines[0].start, lines[0].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists);
+ m->appendFiles((sumFileName + toString(getpid()) + ".temp"), sumFileName);
+ m->mothurRemove((sumFileName + toString(getpid()) + ".temp"));
+ if (mult) { m->appendFiles((sumAllFileName + toString(getpid()) + ".temp"), sumAllFileName); }
+
+ //force parent to wait until all the processes are done
+ for (int i = 0; i < processIDS.size(); i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ for (int i = 0; i < processIDS.size(); i++) {
+ m->appendFiles((sumFileName + toString(processIDS[i]) + ".temp"), sumFileName);
+ m->mothurRemove((sumFileName + toString(processIDS[i]) + ".temp"));
+ if (mult) { m->mothurRemove((sumAllFileName + toString(processIDS[i]) + ".temp")); }
+
+ if (createPhylip) {
+ string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(processIDS[i]) + ".dist";
+ ifstream intemp;
+ m->openInputFile(tempdistFileName, intemp);
+
+ for (int k = 0; k < calcDists.size(); k++) {
+ int size = 0;
+ intemp >> size; m->gobble(intemp);
+
+ for (int j = 0; j < size; j++) {
+ int seq1 = 0;
+ int seq2 = 0;
+ float dist = 1.0;
+
+ intemp >> seq1 >> seq2 >> dist; m->gobble(intemp);
+
+ seqDist tempDist(seq1, seq2, dist);
+ calcDists[k].push_back(tempDist);
+ }
+ }
+ intemp.close();
+ m->mothurRemove(tempdistFileName);
+ }
+ }
+#else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the summarySharedData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to pass results vectors.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
- }
- #else
- driver(thisLookup, 0, numGroups, (sumFileName + ".temp"), (sumAllFileName + ".temp"), calcDists);
- m->appendFiles((sumFileName + ".temp"), sumFileName);
- m->mothurRemove((sumFileName + ".temp"));
- if (mult) {
- m->appendFiles((sumAllFileName + ".temp"), sumAllFileName);
- m->mothurRemove((sumAllFileName + ".temp"));
- }
- #endif
-
- if (createPhylip) {
- for (int i = 0; i < calcDists.size(); i++) {
- if (m->control_pressed) { break; }
+ vector<summarySharedData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=1; i<processors; i++ ){
+
+ //make copy of lookup so we don't get access violations
+ vector<SharedRAbundVector*> newLookup;
+ for (int k = 0; k < thisLookup.size(); k++) {
+ SharedRAbundVector* temp = new SharedRAbundVector();
+ temp->setLabel(thisLookup[k]->getLabel());
+ temp->setGroup(thisLookup[k]->getGroup());
+ newLookup.push_back(temp);
+ }
+
+ //for each bin
+ for (int k = 0; k < thisLookup[0]->getNumBins(); k++) {
+ if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
+ for (int j = 0; j < thisLookup.size(); j++) { newLookup[j]->push_back(thisLookup[j]->getAbundance(k), thisLookup[j]->getGroup()); }
+ }
+
+ // Allocate memory for thread data.
+ summarySharedData* tempSum = new summarySharedData((sumFileName+toString(i)+".temp"), m, lines[i].start, lines[i].end, Estimators, newLookup);
+ pDataArray.push_back(tempSum);
+ processIDS.push_back(i);
+
+ hThreadArray[i-1] = CreateThread(NULL, 0, MySummarySharedThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
+ }
+
+ //parent do your part
+ driver(thisLookup, lines[0].start, lines[0].end, sumFileName +"0.temp", sumAllFileName + "0.temp", calcDists);
+ m->appendFiles((sumFileName + "0.temp"), sumFileName);
+ m->mothurRemove((sumFileName + "0.temp"));
+ if (mult) { m->appendFiles((sumAllFileName + "0.temp"), sumAllFileName); }
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ m->appendFiles((sumFileName + toString(processIDS[i]) + ".temp"), sumFileName);
+ m->mothurRemove((sumFileName + toString(processIDS[i]) + ".temp"));
+
+ for (int j = 0; j < pDataArray[i]->thisLookup.size(); j++) { delete pDataArray[i]->thisLookup[j]; }
+
+ if (createPhylip) {
+ for (int k = 0; k < calcDists.size(); k++) {
+ int size = pDataArray[i]->calcDists[k].size();
+ for (int j = 0; j < size; j++) { calcDists[k].push_back(pDataArray[i]->calcDists[k][j]); }
+ }
+ }
+
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+#endif
+ }
+
+ if (createPhylip) {
+ for (int i = 0; i < calcDists.size(); i++) {
+ if (m->control_pressed) { break; }
- string distFileName = outputDir + m->getRootName(m->getSimpleName(sumFileName)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".dist";
- outputNames.push_back(distFileName);
- ofstream outDist;
- m->openOutputFile(distFileName, outDist);
- outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
-
- //initialize matrix
- vector< vector<float> > matrix; //square matrix to represent the distance
- matrix.resize(thisLookup.size());
- for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
-
-
- for (int j = 0; j < calcDists[i].size(); j++) {
- int row = calcDists[i][j].seq1;
- int column = calcDists[i][j].seq2;
- float dist = calcDists[i][j].dist;
-
- matrix[row][column] = dist;
- matrix[column][row] = dist;
- }
+ string distFileName = outputDir + m->getRootName(m->getSimpleName(sumFileName)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".dist";
+ outputNames.push_back(distFileName);
+ ofstream outDist;
+ m->openOutputFile(distFileName, outDist);
+ outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint);
+
+ //initialize matrix
+ vector< vector<float> > matrix; //square matrix to represent the distance
+ matrix.resize(thisLookup.size());
+ for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); }
+
+
+ for (int j = 0; j < calcDists[i].size(); j++) {
+ int row = calcDists[i][j].seq1;
+ int column = calcDists[i][j].seq2;
+ float dist = calcDists[i][j].dist;
+
+ matrix[row][column] = dist;
+ matrix[column][row] = dist;
+ }
+
+ //output to file
+ outDist << thisLookup.size() << endl;
+ for (int r=0; r<thisLookup.size(); r++) {
+ //output name
+ string name = thisLookup[r]->getGroup();
+ if (name.length() < 10) { //pad with spaces to make compatible
+ while (name.length() < 10) { name += " "; }
+ }
+ outDist << name << '\t';
- //output to file
- outDist << thisLookup.size() << endl;
- for (int r=0; r<thisLookup.size(); r++) {
- //output name
- string name = thisLookup[r]->getGroup();
- if (name.length() < 10) { //pad with spaces to make compatible
- while (name.length() < 10) { name += " "; }
- }
- outDist << name << '\t';
-
- //output distances
- for (int l = 0; l < r; l++) { outDist << matrix[r][l] << '\t'; }
- outDist << endl;
- }
-
- outDist.close();
- }
- }
+ //output distances
+ for (int l = 0; l < r; l++) { outDist << matrix[r][l] << '\t'; }
+ outDist << endl;
+ }
+
+ outDist.close();
+ }
+ }
return 0;
}
catch(exception& e) {
#include "inputdata.h"
#include "calculator.h"
#include "validcalculator.h"
+#include "sharedsobscollectsummary.h"
+#include "sharedchao1.h"
+#include "sharedace.h"
+#include "sharednseqs.h"
+#include "sharedjabund.h"
+#include "sharedsorabund.h"
+#include "sharedjclass.h"
+#include "sharedsorclass.h"
+#include "sharedjest.h"
+#include "sharedsorest.h"
+#include "sharedthetayc.h"
+#include "sharedthetan.h"
+#include "sharedkstest.h"
+#include "whittaker.h"
+#include "sharedochiai.h"
+#include "sharedanderbergs.h"
+#include "sharedkulczynski.h"
+#include "sharedkulczynskicody.h"
+#include "sharedlennon.h"
+#include "sharedmorisitahorn.h"
+#include "sharedbraycurtis.h"
+#include "sharedjackknife.h"
+#include "whittaker.h"
+#include "odum.h"
+#include "canberra.h"
+#include "structeuclidean.h"
+#include "structchord.h"
+#include "hellinger.h"
+#include "manhattan.h"
+#include "structpearson.h"
+#include "soergel.h"
+#include "spearman.h"
+#include "structkulczynski.h"
+#include "structchi2.h"
+#include "speciesprofile.h"
+#include "hamming.h"
+#include "gower.h"
+#include "memchi2.h"
+#include "memchord.h"
+#include "memeuclidean.h"
+#include "mempearson.h"
class SummarySharedCommand : public Command {
};
+/**************************************************************************************************/
+//custom data structure for threads to use.
+//main process handling the calcs that can do more than 2 groups
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct summarySharedData {
+    vector<SharedRAbundVector*> thisLookup;   //group vectors the worker compares; the struct does not free these pointers
+    vector< vector<seqDist> > calcDists;      //one list of pairwise results per calculator, filled in by the worker
+    vector<string> Estimators;                //names of the calculators the worker should instantiate
+    unsigned long long start;                 //first index (inclusive) of the outer comparison loop
+    unsigned long long end;                   //last index (exclusive) of the outer comparison loop
+    MothurOut* m;                             //shared output / control object
+    string sumFile;                           //file the worker writes its summary lines to
+
+    //zero the scalar members so a default-constructed object never holds indeterminate values
+    summarySharedData() : start(0), end(0), m(NULL) {}
+
+    //sf = output file name, mout = MothurOut handle, [st, en) = range of outer-loop indices,
+    //est = calculator names, lu = lookup vectors for the worker to read
+    summarySharedData(string sf, MothurOut* mout, unsigned long long st, unsigned long long en, vector<string> est, vector<SharedRAbundVector*> lu)
+        : thisLookup(lu), Estimators(est), start(st), end(en), m(mout), sumFile(sf) {}
+};
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+//Thread entry point: lpParam is a summarySharedData*. Builds one calculator per accepted
+//estimator name, writes one summary line per (k, l) pair with start <= k < end and l < k to
+//pDataArray->sumFile, and records the first value of every calculator in pDataArray->calcDists.
+//Returns 0 on completion and 1 if m->control_pressed was seen; an exception is reported and the process exits.
+static DWORD WINAPI MySummarySharedThreadFunction(LPVOID lpParam){
+ summarySharedData* pDataArray;
+ pDataArray = (summarySharedData*)lpParam;
+
+ try {
+
+ //instantiate a calculator for every estimator name that is valid for "sharedsummary"
+ vector<Calculator*> sumCalculators;
+ ValidCalculators validCalculator;
+ for (int i=0; i<pDataArray->Estimators.size(); i++) {
+ if (validCalculator.isValidCalculator("sharedsummary", pDataArray->Estimators[i]) == true) {
+ if (pDataArray->Estimators[i] == "sharedsobs") {
+ sumCalculators.push_back(new SharedSobsCS());
+ }else if (pDataArray->Estimators[i] == "sharedchao") {
+ sumCalculators.push_back(new SharedChao1());
+ }else if (pDataArray->Estimators[i] == "sharedace") {
+ sumCalculators.push_back(new SharedAce());
+ }else if (pDataArray->Estimators[i] == "jabund") {
+ sumCalculators.push_back(new JAbund());
+ }else if (pDataArray->Estimators[i] == "sorabund") {
+ sumCalculators.push_back(new SorAbund());
+ }else if (pDataArray->Estimators[i] == "jclass") {
+ sumCalculators.push_back(new Jclass());
+ }else if (pDataArray->Estimators[i] == "sorclass") {
+ sumCalculators.push_back(new SorClass());
+ }else if (pDataArray->Estimators[i] == "jest") {
+ sumCalculators.push_back(new Jest());
+ }else if (pDataArray->Estimators[i] == "sorest") {
+ sumCalculators.push_back(new SorEst());
+ }else if (pDataArray->Estimators[i] == "thetayc") {
+ sumCalculators.push_back(new ThetaYC());
+ }else if (pDataArray->Estimators[i] == "thetan") {
+ sumCalculators.push_back(new ThetaN());
+ }else if (pDataArray->Estimators[i] == "kstest") {
+ sumCalculators.push_back(new KSTest());
+ }else if (pDataArray->Estimators[i] == "sharednseqs") {
+ sumCalculators.push_back(new SharedNSeqs());
+ }else if (pDataArray->Estimators[i] == "ochiai") {
+ sumCalculators.push_back(new Ochiai());
+ }else if (pDataArray->Estimators[i] == "anderberg") {
+ sumCalculators.push_back(new Anderberg());
+ }else if (pDataArray->Estimators[i] == "kulczynski") {
+ sumCalculators.push_back(new Kulczynski());
+ }else if (pDataArray->Estimators[i] == "kulczynskicody") {
+ sumCalculators.push_back(new KulczynskiCody());
+ }else if (pDataArray->Estimators[i] == "lennon") {
+ sumCalculators.push_back(new Lennon());
+ }else if (pDataArray->Estimators[i] == "morisitahorn") {
+ sumCalculators.push_back(new MorHorn());
+ }else if (pDataArray->Estimators[i] == "braycurtis") {
+ sumCalculators.push_back(new BrayCurtis());
+ }else if (pDataArray->Estimators[i] == "whittaker") {
+ sumCalculators.push_back(new Whittaker());
+ }else if (pDataArray->Estimators[i] == "odum") {
+ sumCalculators.push_back(new Odum());
+ }else if (pDataArray->Estimators[i] == "canberra") {
+ sumCalculators.push_back(new Canberra());
+ }else if (pDataArray->Estimators[i] == "structeuclidean") {
+ sumCalculators.push_back(new StructEuclidean());
+ }else if (pDataArray->Estimators[i] == "structchord") {
+ sumCalculators.push_back(new StructChord());
+ }else if (pDataArray->Estimators[i] == "hellinger") {
+ sumCalculators.push_back(new Hellinger());
+ }else if (pDataArray->Estimators[i] == "manhattan") {
+ sumCalculators.push_back(new Manhattan());
+ }else if (pDataArray->Estimators[i] == "structpearson") {
+ sumCalculators.push_back(new StructPearson());
+ }else if (pDataArray->Estimators[i] == "soergel") {
+ sumCalculators.push_back(new Soergel());
+ }else if (pDataArray->Estimators[i] == "spearman") {
+ sumCalculators.push_back(new Spearman());
+ }else if (pDataArray->Estimators[i] == "structkulczynski") {
+ sumCalculators.push_back(new StructKulczynski());
+ }else if (pDataArray->Estimators[i] == "speciesprofile") {
+ sumCalculators.push_back(new SpeciesProfile());
+ }else if (pDataArray->Estimators[i] == "hamming") {
+ sumCalculators.push_back(new Hamming());
+ }else if (pDataArray->Estimators[i] == "structchi2") {
+ sumCalculators.push_back(new StructChi2());
+ }else if (pDataArray->Estimators[i] == "gower") {
+ sumCalculators.push_back(new Gower());
+ }else if (pDataArray->Estimators[i] == "memchi2") {
+ sumCalculators.push_back(new MemChi2());
+ }else if (pDataArray->Estimators[i] == "memchord") {
+ sumCalculators.push_back(new MemChord());
+ }else if (pDataArray->Estimators[i] == "memeuclidean") {
+ sumCalculators.push_back(new MemEuclidean());
+ }else if (pDataArray->Estimators[i] == "mempearson") {
+ sumCalculators.push_back(new MemPearson());
+ }
+ }
+ }
+
+ //one result list per calculator; the creating thread reads these after this thread finishes
+ pDataArray->calcDists.resize(sumCalculators.size());
+
+ ofstream outputFileHandle;
+ pDataArray->m->openOutputFile(pDataArray->sumFile, outputFileHandle);
+
+ vector<SharedRAbundVector*> subset;
+ for (int k = pDataArray->start; k < pDataArray->end; k++) { // pass cdd each set of groups to compare
+
+ for (int l = 0; l < k; l++) {
+
+ outputFileHandle << pDataArray->thisLookup[0]->getLabel() << '\t';
+
+ subset.clear(); //clear out old pair of sharedrabunds
+ //add new pair of sharedrabunds
+ subset.push_back(pDataArray->thisLookup[k]); subset.push_back(pDataArray->thisLookup[l]);
+
+ //sort groups to be alphanumeric
+ if (pDataArray->thisLookup[k]->getGroup() > pDataArray->thisLookup[l]->getGroup()) {
+ outputFileHandle << (pDataArray->thisLookup[l]->getGroup() +'\t' + pDataArray->thisLookup[k]->getGroup()) << '\t'; //print out groups
+ }else{
+ outputFileHandle << (pDataArray->thisLookup[k]->getGroup() +'\t' + pDataArray->thisLookup[l]->getGroup()) << '\t'; //print out groups
+ }
+
+ for(int i=0;i<sumCalculators.size();i++) {
+
+ //if this calc needs all groups to calculate the pair load all groups
+ //NOTE(review): subset is only reset once per (k, l) pair, so the groups appended here stay in
+ //subset for the remaining calculators of this pair, and a second getNeedsAll() calculator
+ //appends them again -- confirm this is intended.
+ if (sumCalculators[i]->getNeedsAll()) {
+ //load subset with rest of lookup for those calcs that need everyone to calc for a pair
+ for (int w = 0; w < pDataArray->thisLookup.size(); w++) {
+ if ((w != k) && (w != l)) { subset.push_back(pDataArray->thisLookup[w]); }
+ }
+ }
+
+ vector<double> tempdata = sumCalculators[i]->getValues(subset); //saves the calculator outputs
+
+ //on abort: free the calculators, close the file and report 1 to the creating thread
+ if (pDataArray->m->control_pressed) { for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; } outputFileHandle.close(); return 1; }
+
+ outputFileHandle << '\t';
+ sumCalculators[i]->print(outputFileHandle);
+
+ //only the first returned value is kept as the distance for the pair (l, k)
+ seqDist temp(l, k, tempdata[0]);
+ pDataArray->calcDists[i].push_back(temp);
+ }
+ outputFileHandle << endl;
+ }
+ }
+
+ outputFileHandle.close();
+ for(int i=0;i<sumCalculators.size();i++){ delete sumCalculators[i]; }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ pDataArray->m->errorOut(e, "SummarySharedCommand", "MySummarySharedThreadFunction");
+ exit(1);
+ }
+}
+#endif
+
+
#endif
exit(1);
}
}
-/**************************************************************************************************/
+**************************************************************************************************/
void Tree::randomBlengths() {
try {
for(int i=numNodes-1;i>=0;i--){
--- /dev/null
+#include "trialswap2.h"
+
+
+//The sum_of_squares, havel_hakimi and calc_c_score algorithms have been adapted from I. Miklos and J. Podani. 2004. Randomization of presence-absence matrices: comments and new algorithms. Ecology 85:86-92.
+
+
+/**************************************************************************************************
+int TrialSwap2::intrand(int n){
+ try {
+ double z;
+
+ z = (double)random() * (double)n / (double)RAND_MAX;
+ if(z>=n)
+ z=n-1;
+ if(z<0)
+ z=0;
+ return((int)floor(z));
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrialSwap2", "intrand");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+/* completely random matrix, all column and row totals are variable, matrix size is the same
+ *
+ *
+ */
+/**************************************************************************************************/
+int TrialSwap2::sim1(vector<vector<int> > &co_matrix){
+    try {
+        //replace co_matrix with a matrix of the same dimensions whose cells are independent coin flips
+        const int numRows = co_matrix.size();
+        const int numCols = co_matrix[0].size();
+        vector<vector<int> > randomized;
+
+        for (int r = 0; r < numRows; r++) {
+            if (m->control_pressed) { break; }
+
+            vector<int> cells(numCols, 0);
+            for (int c = 0; c < numCols; c++) {
+                const double draw = rand() / double(RAND_MAX);
+                cells[c] = (draw > 0.5) ? 1 : 0;
+            }
+            randomized.push_back(cells);
+        }
+
+        //if the run was interrupted only the rows built so far are kept
+        co_matrix = randomized;
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "sim1");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+/*
+ *row sums fixed, columns equiprobable
+ */
+void TrialSwap2::sim2(vector<vector<int> > &co_matrix)
+{
+    try {
+        //shuffle the cells inside every row; each row keeps its own total
+        for (vector<vector<int> >::iterator row = co_matrix.begin(); row != co_matrix.end(); ++row) {
+            if (m->control_pressed) { break; }
+            random_shuffle(row->begin(), row->end());
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "sim2");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+int TrialSwap2::sim2plus(vector<int> rowtotal, vector<vector<int> > &co_matrix)
+{
+    try {
+        //rebuild every row by drawing columns with equal probability until the row holds rowtotal[r] ones
+        const int numRows = co_matrix.size();
+        const int numCols = co_matrix[0].size();
+        const double step = 1.0/numCols;
+
+        //upperBound[c] is the running sum of c+1 equal column probabilities
+        vector<double> upperBound;
+        double running = 0.0;
+        for (int c = 0; c < numCols; c++) {
+            if (m->control_pressed) { return 0; }
+            running = running + step;
+            upperBound.push_back(running);
+        }
+
+        vector<vector<int> > rebuilt;
+        for (int r = 0; r < numRows; r++) {
+            vector<int> cells(numCols, 0);
+
+            while (accumulate(cells.begin(), cells.end(), 0) < rowtotal[r]) {
+                if (m->control_pressed) { return 0; }
+                const double draw = rand() / double(RAND_MAX);
+
+                if (draw <= upperBound[0]) {
+                    cells[0] = 1;
+                }else {
+                    for (int c = 1; c < numCols; c++) {
+                        if ((cells[c] != 1) && (draw > upperBound[c-1]) && (draw <= upperBound[c])) { cells[c] = 1; }
+                    }
+                }
+            }
+            rebuilt.push_back(cells);
+        }
+        co_matrix = rebuilt;
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "sim2plus");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+/*
+ * same as sim2 but using initmatrix which is the initial co-occurrence matrix before transposition
+ * may have to be changed depending on what matrix 'seed' is used. One way to use is to transpose
+ * every null matrix before using an index and use the random matrix as a seed for the next null.
+ */
+/**************************************************************************************************/
+void TrialSwap2::sim3(vector<vector<int> > &initmatrix)
+{
+    try {
+        //shuffle the cells inside every row of the initial (pre-transposition) matrix
+        for (vector<vector<int> >::iterator row = initmatrix.begin(); row != initmatrix.end(); ++row) {
+            if (m->control_pressed) { break; }
+            random_shuffle(row->begin(), row->end());
+        }
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "sim3");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+/*
+ *
+ *
+ *
+ */
+/**************************************************************************************************/
+int TrialSwap2::sim4(vector<int> columntotal, vector<int> rowtotal, vector<vector<int> > &co_matrix)
+{
+    try {
+        //Rebuilds co_matrix row by row: row i receives rowtotal[i] ones, and each one is placed in a
+        //column drawn with probability proportional to that column's total. Returns 0 (also when interrupted).
+        const int ncols = columntotal.size();
+        const int nrows = rowtotal.size();
+
+        const double colSum = accumulate( columntotal.begin(), columntotal.end(), 0 );
+
+        //range[j] is the cumulative probability up to and including column j
+        vector<double> range;
+        int fillable = 0;   //columns that can actually be drawn (positive total)
+        double start = 0.0;
+        for(int i=0;i<ncols;i++)
+        {
+            if (m->control_pressed) { return 0; }
+            start = start + columntotal[i]/colSum;
+            range.push_back(start);
+            if (columntotal[i] > 0) { fillable++; }
+        }
+
+        vector<vector<int> > tmpmatrix;
+        for(int i=0;i<nrows;i++)
+        {
+            if (m->control_pressed) { return 0; }
+            vector<int> tmprow(ncols, 0);
+
+            //a row can never hold more ones than there are drawable columns; without this cap the draw loop would never finish
+            const int target = (rowtotal[i] < fillable) ? rowtotal[i] : fillable;
+            int filled = 0;   //number of ones currently in tmprow
+
+            while (filled < target)
+            {
+                if (m->control_pressed) { return 0; }
+
+                double randNum = rand() / double(RAND_MAX);
+                if(randNum <= range[0])
+                {
+                    if (tmprow[0] != 1) { tmprow[0] = 1; filled++; }
+                    continue;
+                }
+                for(int j=1;j<ncols;j++)
+                {
+                    if(randNum <= range[j] && randNum > range[j-1] && tmprow[j] != 1)
+                    {
+                        tmprow[j] = 1;
+                        filled++;
+                    }
+                }
+            }
+            tmpmatrix.push_back(tmprow);
+        }
+
+        co_matrix = tmpmatrix;
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "sim4");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+/*
+ * inverse of sim4, MUST BE TRANSPOSED BEFORE CO-OCCURRENCE ANALYSIS
+ *
+ *
+ */
+/**************************************************************************************************/
+int TrialSwap2::sim5(vector<int> initcolumntotal,vector<int> initrowtotal, vector<vector<int> > &initmatrix)
+{
+    try {
+        //The procedure is the same as sim4 (rows filled to their totals, columns drawn in proportion
+        //to the column totals); only the inputs differ: here they describe the initial matrix.
+        //Delegating keeps the two implementations from drifting apart.
+        return sim4(initcolumntotal, initrowtotal, initmatrix);
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "sim5");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+/*
+ *
+ *
+ *
+ */
+/**************************************************************************************************/
+int TrialSwap2::sim6(vector<int> columntotal, vector<vector<int> > &co_matrix)
+{
+    try {
+        //Rebuilds co_matrix with random row totals: every row gets a total drawn from 0..10 and is
+        //filled by drawing columns with probability proportional to the column totals. Returns 0 (also when interrupted).
+        const int ncols = columntotal.size();
+        const int nrows = co_matrix.size();
+
+        const int colSum = accumulate( columntotal.begin(), columntotal.end(), 0 );
+
+        //range[j] is the cumulative probability up to and including column j
+        vector<double> range;
+        int fillable = 0;   //columns that can actually be drawn (positive total)
+        double start = 0.0;
+        for(int i=0;i<ncols;i++)
+        {
+            if (m->control_pressed) { return 0; }
+            start = start + columntotal[i]/double (colSum);
+            range.push_back(start);
+            if (columntotal[i] > 0) { fillable++; }
+        }
+
+        vector<vector<int> > tmpmatrix;
+        for(int i=0;i<nrows;i++)
+        {
+            if (m->control_pressed) { return 0; }
+            vector<int> tmprow(ncols, 0);
+
+            //Redraw the row total until the row can really hold it. Comparing against ncols alone is not
+            //enough: a total larger than the number of drawable columns could never be reached.
+            int tmprowtotal = (rand() / double (RAND_MAX)) * 10;
+            while ( tmprowtotal > fillable) {
+                if (m->control_pressed) { return 0; }
+                tmprowtotal = (rand() / double (RAND_MAX)) * 10;
+            }
+
+            int filled = 0;   //number of ones currently in tmprow
+            while (filled < tmprowtotal)
+            {
+                if (m->control_pressed) { return 0; }
+                double randNum = rand() / double(RAND_MAX);
+                if(randNum <= range[0])
+                {
+                    if (tmprow[0] != 1) { tmprow[0] = 1; filled++; }
+                    continue;
+                }
+                for(int j=1;j<ncols;j++)
+                {
+                    if(randNum <= range[j] && randNum > range[j-1] && tmprow[j] != 1)
+                    {
+                        tmprow[j] = 1;
+                        filled++;
+                    }
+                }
+            }
+
+            tmpmatrix.push_back(tmprow);
+        }
+
+        co_matrix = tmpmatrix;
+
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "sim6");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+/*
+ * MUST BE TRANSPOSED BEFORE CO-OCCURRENCE ANALYSIS
+ *
+ *
+ */
+/**************************************************************************************************/
+int TrialSwap2::sim7(vector<int> initrowtotal, vector<vector<int> > &co_matrix)
+{
+    try {
+        //Replaces co_matrix with a matrix of the same size holding sum(initrowtotal) ones. Each one is
+        //placed in a cell drawn with probability proportional to the total of the row it lies in.
+        //Assumes the row totals are not negative. Returns 0 (also when interrupted).
+        const int nrows = co_matrix.size();
+        const int ncols = co_matrix[0].size();
+
+        vector<vector<int> > tmpmatrix(nrows, vector<int>(ncols, 0));
+
+        const int rowsum = accumulate( initrowtotal.begin(), initrowtotal.end(), 0 );
+        const double nc = rowsum * ncols;
+
+        //upper[r*ncols + c] is the cumulative probability up to and including cell (r, c);
+        //the running sum is never reset so the bounds cover 0-1 across the whole matrix
+        vector<double> upper;
+        int fillable = 0;   //cells that can actually be drawn (row total is positive)
+        double start = 0.0;
+        for(int i=0;i<nrows;i++)
+        {
+            if (m->control_pressed) { return 0; }
+            const double cellprob = initrowtotal[i]/double(nc);
+            for(int j=0;j<ncols;j++)
+            {
+                start = start + cellprob;
+                upper.push_back(start);
+            }
+            if (initrowtotal[i] > 0) { fillable += ncols; }
+        }
+
+        //never ask for more ones than there are drawable cells, otherwise the loop below cannot end
+        const int target = (rowsum < fillable) ? rowsum : fillable;
+
+        int k = 0;   //ones placed so far
+        while(k < target)
+        {
+            if (m->control_pressed) { return 0; }
+
+            const double randNum = rand() / double(RAND_MAX);
+
+            //first cell whose cumulative bound reaches the draw; comparing against the previous cell in
+            //flattened order avoids reading one element before the start of a row
+            vector<double>::iterator hit = lower_bound(upper.begin(), upper.end(), randNum);
+            if (hit == upper.end()) { continue; }   //rounding left the last bound just below the draw
+
+            const int cell = hit - upper.begin();
+            const int row = cell / ncols;
+            const int col = cell % ncols;
+
+            if (initrowtotal[row] <= 0) { continue; }   //zero-probability cell, only reachable with a draw of exactly 0
+
+            if (tmpmatrix[row][col] != 1)
+            {
+                tmpmatrix[row][col] = 1;
+                k++;
+            }
+        }
+
+        co_matrix = tmpmatrix;
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "sim7");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+/*
+ *
+ *
+ *
+ */
+/**************************************************************************************************/
+int TrialSwap2::sim8(vector<int> columntotal, vector<int> rowtotal, vector<vector<int> > &co_matrix)
+{
+    try {
+        //Overwrites co_matrix (rowtotal.size() x columntotal.size()) with sum(rowtotal) ones. Each one is
+        //placed in a cell drawn with probability rowtotal[i]*columntotal[j]/sum(rowtotal)^2.
+        //Assumes the totals are not negative. Returns 0 (also when interrupted).
+        const int ncols = columntotal.size();
+        const int nrows = rowtotal.size();
+        const int ncells = nrows * ncols;
+
+        const double rowSum = accumulate( rowtotal.begin(), rowtotal.end(), 0 );
+
+        if (m->control_pressed) { return 0; }
+
+        const int grandtotal = rowSum;
+
+        //upper[ncols*i + j] is the cumulative probability up to and including cell (i, j).
+        //The bounds are kept separate from the "chosen" flags so marking a cell never disturbs
+        //the interval of its neighbours.
+        vector<double> upper(ncells, 0.0);
+        vector<char> chosen(ncells, 0);
+        int fillable = 0;   //cells with a non-zero probability
+        double start = 0.0;
+        for (int i=0;i<nrows;i++) {
+            if (m->control_pressed) { return 0; }
+            for (int j=0;j<ncols;j++) {
+                const double prob = (rowtotal[i] * columntotal[j])/(rowSum*rowSum);
+                start += prob;
+                upper[ncols * i + j] = start;
+                if (prob > 0.0) { fillable++; }
+            }
+        }
+
+        //never ask for more ones than there are drawable cells, otherwise the loop below cannot end
+        const int target = (grandtotal < fillable) ? grandtotal : fillable;
+
+        //generate a random number between 0 and 1 and locate the cell whose interval contains it
+        int total = 0;
+        while (total < target) {
+            if (m->control_pressed) { return 0; }
+            const double randnum = rand() / double(RAND_MAX);
+
+            vector<double>::iterator hit = lower_bound(upper.begin(), upper.end(), randnum);
+            if (hit == upper.end()) { continue; }   //rounding left the last bound just below the draw
+
+            const int cell = hit - upper.begin();
+            const double lower = (cell == 0) ? 0.0 : upper[cell-1];
+
+            //skip zero-probability cells and cells that already hold a one
+            if ((upper[cell] > lower) && !chosen[cell]) {
+                chosen[cell] = 1;
+                total++;
+            }
+        }
+
+        for(int i=0;i<nrows;i++) {
+            if (m->control_pressed) { return 0; }
+            for(int j=0;j<ncols;j++) {
+                co_matrix[i][j] = chosen[ncols * i + j] ? 1 : 0;
+            }
+        }
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "sim8");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+double TrialSwap2::calc_c_score (vector<vector<int> > &co_matrix,vector<int> rowtotal)
+{
+    try {
+        //Average over all row pairs (i, j) of (rowtotal[i] - J) * (rowtotal[j] - J), where J is the number
+        //of columns in which both rows hold a 1. Returns 0 when interrupted or when there is no pair.
+        const int nrows = rowtotal.size();
+        if (nrows < 2) { return 0.0; }   //no pair of rows: avoid dividing zero by zero
+
+        const int ncols = co_matrix[0].size();
+        double cscore = 0.0;
+
+        for(int i=0;i<nrows-1;i++)
+        {
+            for(int j=i+1;j<nrows;j++)
+            {
+                if (m->control_pressed) { return 0; }
+
+                //count co-occurrences of rows i and j
+                double shared = 0.0;
+                for(int k=0;k<ncols;k++)
+                {
+                    if((co_matrix[i][k]==1)&&(co_matrix[j][k]==1)) { shared++; }
+                }
+
+                //rowtotal[i] = A, rowtotal[j] = B, shared = J
+                cscore += (rowtotal[i]-shared)*(rowtotal[j]-shared);
+            }
+        }
+
+        //number of pairs, computed in double so large row counts cannot overflow an int
+        const double pairs = nrows * (nrows - 1.0) / 2.0;
+
+        return cscore/pairs;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "calc_c_score");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+int TrialSwap2::calc_checker (vector<vector<int> > &co_matrix, vector<int> rowtotal)
+{
+    try {
+        //Counts the row pairs (i, j), i < j, that never hold a 1 in the same column.
+        //Returns 0 when interrupted.
+        const int nrows = rowtotal.size();
+        if (nrows < 2) { return 0; }   //no pair of rows to compare
+
+        const int ncols = co_matrix[0].size();
+        int cunits=0;
+
+        for(int i=0;i<nrows-1;i++)
+        {
+            for(int j=i+1;j<nrows;j++)
+            {
+                if (m->control_pressed) { return 0; }
+
+                //only "no co-occurrence at all" matters, so stop at the first shared column
+                bool together = false;
+                for(int k=0;k<ncols;k++)
+                {
+                    if((co_matrix[i][k]==1)&&(co_matrix[j][k]==1)) { together = true; break; }
+                }
+
+                if (!together) { cunits+=1; }
+            }
+        }
+
+        return cunits;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "calc_checker");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+// Computes colVar / rowVar, where
+//   rowVar = sum over rows of p*(1-p), with p = rowtotal[i] / number of columns, and
+//   colVar = mean squared deviation of the column totals from their average.
+// Returns 0 if m->control_pressed is seen during either pass.
+double TrialSwap2::calc_vratio (vector<int> rowtotal, vector<int> columntotal)
+{
+    try {
+        const int numRows = rowtotal.size();
+        const int numCols = columntotal.size();
+
+        //average of the column totals
+        const int colSum = accumulate(columntotal.begin(), columntotal.end(), 0 );
+        const double colMean = (double) colSum / (double) numCols;
+
+        //row term: each row contributes p*(1-p)
+        double rowVar = 0.0;
+        for (int r = 0; r < numRows; r++) {
+            if (m->control_pressed) { return 0; }
+            const double frac = (double) rowtotal[r]/(double) numCols;
+            rowVar += frac * (1.0-frac);
+        }
+
+        //column term: squared distance of each column total from the average
+        double colVar = 0.0;
+        for (int c = 0; c < numCols; c++) {
+            if (m->control_pressed) { return 0; }
+            colVar += pow(((double) columntotal[c]-colMean),2);
+        }
+        colVar = (1.0/(double)numCols) * colVar;
+
+        return colVar/rowVar;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "calc_vratio");
+        exit(1);
+    }
+
+}
+/**************************************************************************************************/
+// Counts the distinct rows of initmatrix.
+// A row is counted when no later row is equal to it, so each group of identical
+// rows is counted exactly once (at its last occurrence).
+// Returns that count, or 0 if m->control_pressed is seen while scanning.
+int TrialSwap2::calc_combo (vector<vector<int> > &initmatrix)
+{
+    try {
+        int initrows = initmatrix.size();
+        int unique = 0;
+
+        for(int i=0;i<initrows;i++)
+        {
+            bool matched = false;
+
+            //compare only against later rows; j must stay below initrows,
+            //otherwise initmatrix[j] reads one row past the end of the matrix
+            for(int j=i+1;j<initrows;j++)
+            {
+                if (m->control_pressed) { return 0; }
+                if (initmatrix[i] == initmatrix[j]) { matched = true; break; }
+            }
+
+            //the last copy of a repeated row matches nothing after it, so that combination is counted here
+            if (!matched) { unique++; }
+        }
+        return unique;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "calc_combo");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+// Attempts one checkerboard swap on co_matrix in place.
+// Picks two different rows (i, j) and two different columns (k, l) via
+// m->getRandomIndex, and if the 2x2 submatrix is 1/0 over 0/1 or 0/1 over 1/0,
+// flips all four cells (which leaves every row and column total unchanged).
+// Always returns 0.
+int TrialSwap2::swap_checkerboards (vector<vector<int> > &co_matrix)
+{
+    try {
+        //a swap needs two distinct rows and two distinct columns; without them the
+        //redraw loops below could never find a different index
+        int nrows = co_matrix.size();
+        if (nrows < 2) { return 0; }
+        int ncols = co_matrix[0].size();
+        if (ncols < 2) { return 0; }
+
+        //NOTE(review): getRandomIndex(n) presumably returns an index in [0, n] - confirm against MothurOut
+        int i = m->getRandomIndex(nrows-1);
+        int j = m->getRandomIndex(nrows-1);
+        while (j == i) {
+            if (m->control_pressed) { return 0; }
+            j = m->getRandomIndex(nrows-1);
+        }
+
+        int k = m->getRandomIndex(ncols-1);
+        int l = m->getRandomIndex(ncols-1);
+        while (l == k) {
+            if (m->control_pressed) { return 0; }
+            l = m->getRandomIndex(ncols-1);
+        }
+
+        //checkerboard test: one diagonal is all 1s and the other is all 0s
+        bool mainDiagonalOnes = (co_matrix[i][k]*co_matrix[j][l]==1 && co_matrix[i][l]+co_matrix[j][k]==0);
+        bool antiDiagonalOnes = (co_matrix[i][k]+co_matrix[j][l]==0 && co_matrix[i][l]*co_matrix[j][k]==1);
+
+        if (mainDiagonalOnes || antiDiagonalOnes)
+        {
+            co_matrix[i][k]=1-co_matrix[i][k];
+            co_matrix[i][l]=1-co_matrix[i][l];
+            co_matrix[j][k]=1-co_matrix[j][k];
+            co_matrix[j][l]=1-co_matrix[j][l];
+        }
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "swap_checkerboards");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+// Returns the fraction of entries in scorevec that are >= initialscore.
+// Returns 0 if m->control_pressed is seen while counting.
+double TrialSwap2::calc_pvalue_greaterthan (vector<double> scorevec, double initialscore)
+{
+    try {
+        const int total = scorevec.size();
+        double hits = 0.0;
+
+        for (vector<double>::iterator score = scorevec.begin(); score != scorevec.end(); ++score) {
+            if (m->control_pressed) { return 0; }
+            if (*score >= initialscore) { hits++; }
+        }
+
+        return hits/(double)total;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "calc_pvalue_greaterthan");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+// Returns the fraction of entries in scorevec that are <= initialscore.
+// Returns 0 if m->control_pressed is seen while counting.
+double TrialSwap2::calc_pvalue_lessthan (vector<double> scorevec, double initialscore)
+{
+    try {
+        const int total = scorevec.size();
+        double hits = 0.0;
+
+        for (vector<double>::iterator score = scorevec.begin(); score != scorevec.end(); ++score) {
+            if (m->control_pressed) { return 0; }
+            if (*score <= initialscore) { hits++; }
+        }
+
+        return hits/(double)total;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "calc_pvalue_lessthan");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+// Computes t = (nullMean - initialscore) / (sampleSD / sqrt(runs)), where
+// sampleSD = sqrt( (1/runs) * sum of (scorevec[i] - nullMean)^2 ) over the first
+// `runs` entries of scorevec. Logs nullMean, the sum and sampleSD through mothurOut.
+// Returns 0 if m->control_pressed is seen while summing.
+// The result is not finite when runs is 0 or when every score equals nullMean.
+double TrialSwap2::t_test (double initialscore, int runs, double nullMean, vector<double> scorevec)
+{
+    try {
+        //sum of squared deviations from the null mean
+        double sum = 0;
+        for(int i=0;i<runs;i++)
+        {
+            if (m->control_pressed) { return 0; }
+            sum += pow((scorevec[i] - nullMean),2);
+        }
+
+        m->mothurOut("nullMean: " + toString(nullMean)); m->mothurOutEndLine();
+
+        m->mothurOut("sum: " + toString(sum)); m->mothurOutEndLine();
+
+        //the division must be done in floating point: the integer expression 1/runs
+        //is 0 for every runs > 1, which would force sampleSD to 0
+        double sampleSD = sqrt( (1.0/(double)runs) * sum );
+
+        m->mothurOut("samplSD: " + toString(sampleSD)); m->mothurOutEndLine();
+
+        double t = (nullMean - initialscore) / (sampleSD / sqrt((double)runs));
+
+        return t;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "t_test");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+// Writes "matrix:" and then the first nrows x ncols cells of matrix through
+// m->mothurOut, one matrix row per output line with no separator between cells.
+// Stops early (still returning 0) if m->control_pressed is seen at the start of a row.
+int TrialSwap2::print_matrix(vector<vector<int> > &matrix, int nrows, int ncols)
+{
+    try {
+        m->mothurOut("matrix:"); m->mothurOutEndLine();
+
+        int row = 0;
+        while (row < nrows) {
+            if (m->control_pressed) { return 0; }
+
+            for (int col = 0; col < ncols; col++) { m->mothurOut(toString(matrix[row][col])); }
+            m->mothurOutEndLine();
+
+            row++;
+        }
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "print_matrix");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+// Replaces co_matrix with the transpose of initmatrix:
+// co_matrix[i][j] = initmatrix[j][i]. The output dimensions are taken from
+// initmatrix.size() and initmatrix[0].size(), so every row of initmatrix is
+// expected to be at least as long as the first one.
+// An empty initmatrix yields an empty co_matrix. Always returns 0; if
+// m->control_pressed is seen, co_matrix holds only the rows built so far.
+int TrialSwap2::transpose_matrix (vector<vector<int> > &initmatrix, vector<vector<int> > &co_matrix)
+{
+    try {
+        co_matrix.clear();
+
+        //nothing to transpose, and initmatrix[0] must not be touched
+        if (initmatrix.empty()) { return 0; }
+
+        int ncols = initmatrix.size(); int nrows = initmatrix[0].size();
+
+        for (int i=0;i<nrows;i++)
+        {
+            if (m->control_pressed) { return 0; }
+
+            //row i of the result is column i of the input
+            vector<int> tmprow(ncols);
+            for (int j=0;j<ncols;j++)
+            {
+                tmprow[j] = initmatrix[j][i];
+            }
+            co_matrix.push_back(tmprow);
+        }
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "transpose_matrix");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+// Recomputes the number of 1s in each row and each column of co_matrix and
+// stores them in rowtotal and columntotal (both are overwritten).
+// The totals are built in temporaries, so if m->control_pressed is seen part-way
+// the function returns 0 and leaves rowtotal/columntotal as they were.
+int TrialSwap2::update_row_col_totals(vector<vector<int> > &co_matrix, vector<int> &rowtotal, vector<int> &columntotal)
+{
+    try {
+        const int numRows = co_matrix.size();
+        const int numCols = co_matrix[0].size();
+
+        vector<int> colSums(numCols, 0);
+        vector<int> rowSums(numRows, 0);
+
+        for (int r = 0; r < numRows; r++) {
+            if (m->control_pressed) { return 0; }
+
+            int onesInRow = 0;
+            for (int c = 0; c < numCols; c++) {
+                if (co_matrix[r][c] != 1) { continue; }
+                onesInRow++;
+                colSums[c]++;
+            }
+            rowSums[r] = onesInRow;
+        }
+
+        //publish the results only once the whole matrix has been scanned
+        columntotal = colSums;
+        rowtotal = rowSums;
+        return 0;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrialSwap2", "update_row_col_totals");
+        exit(1);
+    }
+}
+/**************************************************************************************************/
+
+
+
+
+
--- /dev/null
+#ifndef TRIALSWAP2
+#define TRIALSWAP2
+
+/*
+ * trialswap2.h
+ * Mothur
+ *
+ * Created by Kathryn Iverson on June 27, 2011.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothurout.h"
+
+
+// Helper routines that operate on 0/1 matrices stored as vector<vector<int> >:
+// score calculations, a checkerboard swap, p-value helpers, and the sim* routines.
+// All progress/error output goes through the shared MothurOut instance.
+class TrialSwap2 {
+
+public:
+ // Grabs the MothurOut singleton; the object holds no other state.
+ TrialSwap2(){ m = MothurOut::getInstance(); };
+ ~TrialSwap2(){};
+
+ // Fraction of the scores that are <= (lessthan) or >= (greaterthan) the given score.
+ double calc_pvalue_lessthan (vector<double>, double);
+ double calc_pvalue_greaterthan (vector<double>, double);
+ // Picks two rows and two columns at random and flips the 2x2 submatrix if it is a checkerboard.
+ int swap_checkerboards (vector<vector<int> > &);
+ // Counts rows that are not repeated by any later row.
+ int calc_combo (vector<vector<int> > &);
+ // Arguments are (row totals, column totals); returns column variance term / row variance term.
+ double calc_vratio (vector<int>, vector<int>);
+ // Arguments are (matrix, row totals); counts row pairs that share no column of 1s.
+ int calc_checker (vector<vector<int> > &,vector<int>);
+ // Arguments are (matrix, row totals).
+ double calc_c_score (vector<vector<int> > &,vector<int>);
+
+ // NOTE(review): the sim* routines are defined in trialswap2.cpp; their contracts
+ // are not documented here - presumably the randomization models, confirm there.
+ int sim1 (vector<vector<int> > &);
+ void sim2(vector<vector<int> >&);
+ int sim2plus(vector<int>, vector<vector<int> > &);
+ void sim3(vector<vector<int> > &);
+ int sim4(vector<int>, vector<int>, vector<vector<int> > &);
+ int sim5(vector<int>, vector<int>, vector<vector<int> > &);
+ int sim6(vector<int>, vector<vector<int> > &);
+ int sim7(vector<int>, vector<vector<int> > &);
+ int sim8(vector<int>, vector<int>, vector<vector<int> > &);
+ // Arguments are (input matrix, output matrix); the output is replaced by the transpose of the input.
+ int transpose_matrix (vector<vector<int> > &, vector<vector<int> > &);
+ // Arguments are (matrix, row totals out, column totals out); recounts the 1s per row and per column.
+ int update_row_col_totals(vector<vector<int> > &, vector<int>&, vector<int>&);
+
+
+private:
+ // Shared output/control object, set in the constructor.
+ MothurOut* m;
+
+ // Arguments are (initial score, number of runs, null mean, scores).
+ double t_test (double, int, double, vector<double>);
+ // Arguments are (matrix, rows to print, columns to print).
+ int print_matrix(vector<vector<int> > &, int, int);
+
+
+
+};
+
+#endif
+
+
CommandParameter pminflows("minflows", "Number", "", "450", "", "", "",false,false); parameters.push_back(pminflows);
CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
- CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
+ CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
+ CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
+ CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
CommandParameter psignal("signal", "Number", "", "0.50", "", "", "",false,false); parameters.push_back(psignal);
CommandParameter pnoise("noise", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pnoise);
temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found"){ temp = "0"; }
m->mothurConvert(temp, pdiffs);
- temp = validParameter.validFile(parameters, "tdiffs", false);
- if (temp == "not found"){ int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
+ temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, ldiffs);
+
+ temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, sdiffs);
+
+ temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
m->mothurConvert(temp, tdiffs);
- if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
+
+ if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
+
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
}
vector<unsigned long long> flowFilePos;
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
flowFilePos = getFlowFileBreaks();
for (int i = 0; i < (flowFilePos.size()-1); i++) {
lines.push_back(new linePair(flowFilePos[i], flowFilePos[(i+1)]));
m->mothurRemove(barcodePrimerComboFileNames[i][j]);
}
else{
- output << barcodePrimerComboFileNames[i][j] << endl;
+ output << m->getFullPathName(barcodePrimerComboFileNames[i][j]) << endl;
outputNames.push_back(barcodePrimerComboFileNames[i][j]);
outputTypes["flow"].push_back(barcodePrimerComboFileNames[i][j]);
}
flowFilesFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + "flow.files";
m->openOutputFile(flowFilesFileName, output);
- output << trimFlowFileName << endl;
+ output << m->getFullPathName(trimFlowFileName) << endl;
output.close();
}
int count = 0;
bool moreSeqs = 1;
- TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
+ TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
while(moreSeqs) {
//cout << "driver " << count << endl;
int primerIndex = 0;
int barcodeIndex = 0;
+ if(numLinkers != 0){
+ success = trimOligos.stripLinker(currSeq);
+ if(success > ldiffs) { trashCode += 'k'; }
+ else{ currentSeqDiffs += success; }
+
+ }
+
if(barcodes.size() != 0){
success = trimOligos.stripBarcode(currSeq, barcodeIndex);
if(success > bdiffs) { trashCode += 'b'; }
else{ currentSeqDiffs += success; }
}
+ if(numSpacers != 0){
+ success = trimOligos.stripSpacer(currSeq);
+ if(success > sdiffs) { trashCode += 's'; }
+ else{ currentSeqDiffs += success; }
+
+ }
+
if(numFPrimers != 0){
success = trimOligos.stripForward(currSeq, primerIndex);
if(success > pdiffs) { trashCode += 'f'; }
//report progress
if((count) % 10000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); }
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = flowFile.tellg();
if ((pos == -1) || (pos >= line->end)) { break; }
}
else if(type == "REVERSE"){
- Sequence oligoRC("reverse", oligo);
- oligoRC.reverseComplement();
- revPrimer.push_back(oligoRC.getUnaligned());
+ string oligoRC = reverseOligo(oligo);
+ revPrimer.push_back(oligoRC);
}
else if(type == "BARCODE"){
oligosFile >> group;
barcodes[oligo]=indexBarcode; indexBarcode++;
barcodeNameVector.push_back(group);
+ }else if(type == "LINKER"){
+ linker.push_back(oligo);
+ }else if(type == "SPACER"){
+ spacer.push_back(oligo);
}
else{
m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine();
numFPrimers = primers.size();
numRPrimers = revPrimer.size();
+ numLinkers = linker.size();
+ numSpacers = spacer.size();
}
catch(exception& e) {
exit(1);
}
}
+//********************************************************************/
+// Returns the reverse complement of oligo.
+// The input is walked from its last character to its first; each upper-case
+// nucleotide / IUPAC ambiguity code is replaced by its complement (U is treated
+// like T), and any other character becomes 'N'.
+string TrimFlowsCommand::reverseOligo(string oligo){
+    try {
+        string complement = "";
+
+        for (string::reverse_iterator base = oligo.rbegin(); base != oligo.rend(); ++base) {
+            switch (*base) {
+                case 'A': complement += 'T'; break;
+                case 'T':
+                case 'U': complement += 'A'; break;
+
+                case 'G': complement += 'C'; break;
+                case 'C': complement += 'G'; break;
+
+                case 'R': complement += 'Y'; break;
+                case 'Y': complement += 'R'; break;
+
+                case 'M': complement += 'K'; break;
+                case 'K': complement += 'M'; break;
+
+                //W and S are their own complements
+                case 'W': complement += 'W'; break;
+                case 'S': complement += 'S'; break;
+
+                case 'B': complement += 'V'; break;
+                case 'V': complement += 'B'; break;
+
+                case 'D': complement += 'H'; break;
+                case 'H': complement += 'D'; break;
+
+                default:  complement += 'N'; break;
+            }
+        }
+
+        return complement;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrimFlowsCommand", "reverseOligo");
+        exit(1);
+    }
+}
+
/**************************************************************************************************/
vector<unsigned long long> TrimFlowsCommand::getFlowFileBreaks() {
processIDS.clear();
int exitCommand = 1;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
//loop through and create all the processes you want
vector<unsigned long long> getFlowFileBreaks();
int createProcessesCreateTrim(string, string, string, string, vector<vector<string> >);
int driverCreateTrim(string, string, string, string, vector<vector<string> >, linePair*);
-
+ string reverseOligo(string);
+
vector<string> outputNames;
set<string> filesToRemove;
map<string, int> barcodes;
map<string, int> primers;
vector<string> revPrimer;
+ vector<string> linker;
+ vector<string> spacer;
vector<string> primerNameVector; //needed here?
vector<string> barcodeNameVector; //needed here?
};
/**************************************************************************************************/
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
#else
static DWORD WINAPI MyTrimFlowThreadFunction(LPVOID lpParam){
trimFlowData* pDataArray;
#include "needlemanoverlap.hpp"
+/********************************************************************/
+//arguments: pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linkers, spacers
+//stores the allowed-difference limits and copies of the oligo collections
+TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
+    try {
+        //must be set first: the catch block below reports through m
+        m = MothurOut::getInstance();
+
+        //oligo collections
+        primers = pr;
+        barcodes = br;
+        revPrimer = r;
+        linker = lk;
+        spacer = sp;
+
+        //allowed differences for each oligo type
+        pdiffs = p;
+        bdiffs = b;
+        ldiffs = l;
+        sdiffs = s;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrimOligos", "TrimOligos");
+        exit(1);
+    }
+}
/********************************************************************/
//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
TrimOligos::TrimOligos(int p, int b, map<string, int> pr, map<string, int> br, vector<string> r){
Alignment* alignment;
if (barcodes.size() > 0) {
- map<string,int>::iterator it=barcodes.begin();
+ map<string,int>::iterator it;
- for(it;it!=barcodes.end();it++){
+ for(it=barcodes.begin();it!=barcodes.end();it++){
if(it->first.length() > maxLength){
maxLength = it->first.length();
}
else{ //use the best match
group = minGroup;
seq.setUnaligned(rawSequence.substr(minPos));
-
+
if(qual.getName() != ""){
qual.trimQScores(minPos, -1);
}
Alignment* alignment;
if (primers.size() > 0) {
- map<string,int>::iterator it=primers.begin();
+ map<string,int>::iterator it;
- for(it;it!=primers.end();it++){
+ for(it=primers.begin();it!=primers.end();it++){
if(it->first.length() > maxLength){
maxLength = it->first.length();
}
}
}
//*******************************************************************/
-int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){
+int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, bool keepForward){
try {
string rawSequence = seq.getUnaligned();
int success = pdiffs + 1; //guilty until proven innocent
if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
group = it->second;
- seq.setUnaligned(rawSequence.substr(oligo.length()));
+ if (!keepForward) { seq.setUnaligned(rawSequence.substr(oligo.length())); }
if(qual.getName() != ""){
- qual.trimQScores(oligo.length(), -1);
+ if (!keepForward) { qual.trimQScores(oligo.length(), -1); }
}
success = 0;
break;
Alignment* alignment;
if (primers.size() > 0) {
- map<string,int>::iterator it=primers.begin();
+ map<string,int>::iterator it;
- for(it;it!=primers.end();it++){
+ for(it=primers.begin();it!=primers.end();it++){
if(it->first.length() > maxLength){
maxLength = it->first.length();
}
else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
else{ //use the best match
group = minGroup;
- seq.setUnaligned(rawSequence.substr(minPos));
+ if (!keepForward) { seq.setUnaligned(rawSequence.substr(minPos)); }
if(qual.getName() != ""){
- qual.trimQScores(minPos, -1);
+ if (!keepForward) { qual.trimQScores(minPos, -1); }
}
success = minDiff;
}
exit(1);
}
}
+//******************************************************************/
+// Looks for one of the stored linker oligos at the start of seq and, when found,
+// removes it from seq and calls qual.trimQScores with the same offset (only when
+// qual has a non-empty name).
+// Step 1: exact prefix comparison with compareDNASeq for each linker.
+// Step 2 (only when ldiffs > 0 and step 1 failed): align each linker against the
+// first linker-length + ldiffs bases and accept a single best match with at most
+// ldiffs differences.
+// Returns false only when the final status is 0 (exact match, or an aligned match
+// with zero differences); every other outcome returns true.
+// NOTE(review): success is a bool, so the int codes assigned to it (ldiffs + 1,
+// ldiffs + 10, ldiffs + 100, minDiff) all collapse to true. An aligned match with
+// 1..ldiffs differences trims the sequence but still returns true, and a caller
+// comparing the result with ldiffs only ever sees 0 or 1. The other strip*
+// routines declared in the header return int - confirm whether bool is intended.
+bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){
+ try {
+ string rawSequence = seq.getUnaligned();
+ bool success = ldiffs + 1; //guilty until proven innocent
+
+ // pass 1: exact match of a linker against the start of the read
+ for(int i=0;i<linker.size();i++){
+ string oligo = linker[i];
+
+ // read is shorter than this linker: stop looking at the remaining linkers
+ if(rawSequence.length() < oligo.length()){
+ success = ldiffs + 10;
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+ seq.setUnaligned(rawSequence.substr(oligo.length()));
+ if(qual.getName() != ""){
+ qual.trimQScores(oligo.length(), -1);
+ }
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the linker or if you don't want to allow for diffs
+ if ((ldiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ // size the aligner from the longest linker plus the allowed differences
+ Alignment* alignment;
+ if (linker.size() > 0) {
+ for(int i = 0; i < linker.size(); i++){
+ if(linker[i].length() > maxLength){
+ maxLength = linker[i].length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the linker
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minPos = 0;
+
+ for(int i = 0; i < linker.size(); i++){
+ string oligo = linker[i];
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){ //let's just assume that the linkers are the same length
+ success = ldiffs + 10;
+ break;
+ }
+
+ //use needleman to align first linker.length()+numdiffs of sequence to each linker
+ alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ // cut both aligned strings after the last non-gap character of the linker
+ for(int i=oligo.length()-1;i>=0;i--){
+ if(oligo[i] != '-'){ alnLength = i+1; break; }
+ }
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minPos = 0;
+ // minPos = number of read bases covered by the aligned linker
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > ldiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple linkers
+ else{ //use the best match
+ seq.setUnaligned(rawSequence.substr(minPos));
+
+ if(qual.getName() != ""){
+ qual.trimQScores(minPos, -1);
+ }
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripLinker");
+ exit(1);
+ }
+}
+//******************************************************************/
+// Looks for one of the stored linker oligos at the start of seq and removes it
+// when found (sequence-only variant; no quality scores are touched).
+// Step 1: exact prefix comparison with compareDNASeq for each linker.
+// Step 2 (only when ldiffs > 0 and step 1 failed): align each linker against the
+// first linker-length + ldiffs bases and accept a single best match with at most
+// ldiffs differences.
+// Returns false only when the final status is 0 (exact match, or an aligned match
+// with zero differences); every other outcome returns true.
+// NOTE(review): success is a bool, so the int codes assigned to it (ldiffs + 1,
+// ldiffs + 10, ldiffs + 100, minDiff) all collapse to true. An aligned match with
+// 1..ldiffs differences trims the sequence but still returns true, and a caller
+// comparing the result with ldiffs only ever sees 0 or 1 - confirm whether bool
+// is the intended return type.
+bool TrimOligos::stripLinker(Sequence& seq){
+ try {
+
+ string rawSequence = seq.getUnaligned();
+ bool success = ldiffs +1; //guilty until proven innocent
+
+ // pass 1: exact match of a linker against the start of the read
+ for(int i=0;i<linker.size();i++){
+ string oligo = linker[i];
+
+ // read is shorter than this linker: stop looking at the remaining linkers
+ if(rawSequence.length() < oligo.length()){
+ success = ldiffs +10;
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+ seq.setUnaligned(rawSequence.substr(oligo.length()));
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the linker or if you don't want to allow for diffs
+ if ((ldiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ // size the aligner from the longest linker plus the allowed differences
+ Alignment* alignment;
+ if (linker.size() > 0) {
+ for(int i = 0; i < linker.size(); i++){
+ if(linker[i].length() > maxLength){
+ maxLength = linker[i].length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the linker
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minPos = 0;
+
+ for(int i = 0; i < linker.size(); i++){
+ string oligo = linker[i];
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){ //let's just assume that the linkers are the same length
+ success = ldiffs + 10;
+ break;
+ }
+
+ //use needleman to align first linker.length()+numdiffs of sequence to each linker
+ alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ // cut both aligned strings after the last non-gap character of the linker
+ for(int i=oligo.length()-1;i>=0;i--){
+ if(oligo[i] != '-'){ alnLength = i+1; break; }
+ }
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minPos = 0;
+ // minPos = number of read bases covered by the aligned linker
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > ldiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple linkers
+ else{ //use the best match
+ seq.setUnaligned(rawSequence.substr(minPos));
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripLinker");
+ exit(1);
+ }
+}
+
+//******************************************************************/
+// Looks for one of the stored spacer oligos at the start of seq and, when found,
+// removes it from seq and calls qual.trimQScores with the same offset (only when
+// qual has a non-empty name).
+// Step 1: exact prefix comparison with compareDNASeq for each spacer.
+// Step 2 (only when sdiffs > 0 and step 1 failed): align each spacer against the
+// first spacer-length + sdiffs bases and accept a single best match with at most
+// sdiffs differences.
+// Returns false only when the final status is 0 (exact match, or an aligned match
+// with zero differences); every other outcome returns true.
+// NOTE(review): success is a bool, so the int codes assigned to it (sdiffs + 1,
+// sdiffs + 10, sdiffs + 100, minDiff) all collapse to true. An aligned match with
+// 1..sdiffs differences trims the sequence but still returns true, and a caller
+// comparing the result with sdiffs only ever sees 0 or 1 - confirm whether bool
+// is the intended return type.
+bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){
+ try {
+ string rawSequence = seq.getUnaligned();
+ bool success = sdiffs+1; //guilty until proven innocent
+
+ // pass 1: exact match of a spacer against the start of the read
+ for(int i=0;i<spacer.size();i++){
+ string oligo = spacer[i];
+
+ // read is shorter than this spacer: stop looking at the remaining spacers
+ if(rawSequence.length() < oligo.length()){
+ success = sdiffs+10;
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+ seq.setUnaligned(rawSequence.substr(oligo.length()));
+ if(qual.getName() != ""){
+ qual.trimQScores(oligo.length(), -1);
+ }
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the spacer or if you don't want to allow for diffs
+ if ((sdiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ // size the aligner from the longest spacer plus the allowed differences
+ Alignment* alignment;
+ if (spacer.size() > 0) {
+ for(int i = 0; i < spacer.size(); i++){
+ if(spacer[i].length() > maxLength){
+ maxLength = spacer[i].length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the spacer
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minPos = 0;
+
+ for(int i = 0; i < spacer.size(); i++){
+ string oligo = spacer[i];
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){ //let's just assume that the spacers are the same length
+ success = sdiffs + 10;
+ break;
+ }
+
+ //use needleman to align first spacer.length()+numdiffs of sequence to each spacer
+ alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ // cut both aligned strings after the last non-gap character of the spacer
+ for(int i=oligo.length()-1;i>=0;i--){
+ if(oligo[i] != '-'){ alnLength = i+1; break; }
+ }
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minPos = 0;
+ // minPos = number of read bases covered by the aligned spacer
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > sdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple spacers
+ else{ //use the best match
+ seq.setUnaligned(rawSequence.substr(minPos));
+
+ if(qual.getName() != ""){
+ qual.trimQScores(minPos, -1);
+ }
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripSpacer");
+ exit(1);
+ }
+}
+//******************************************************************/
+// Looks for one of the stored spacer oligos at the start of seq and removes it
+// when found (sequence-only variant; no quality scores are touched).
+// Step 1: exact prefix comparison with compareDNASeq for each spacer.
+// Step 2 (only when sdiffs > 0 and step 1 failed): align each spacer against the
+// first spacer-length + sdiffs bases and accept a single best match with at most
+// sdiffs differences.
+// Returns false only when the final status is 0 (exact match, or an aligned match
+// with zero differences); every other outcome returns true.
+// NOTE(review): success is a bool, so the int codes assigned to it (sdiffs + 1,
+// sdiffs + 10, sdiffs + 100, minDiff) all collapse to true. An aligned match with
+// 1..sdiffs differences trims the sequence but still returns true, and a caller
+// comparing the result with sdiffs only ever sees 0 or 1 - confirm whether bool
+// is the intended return type.
+bool TrimOligos::stripSpacer(Sequence& seq){
+ try {
+
+ string rawSequence = seq.getUnaligned();
+ bool success = sdiffs+1; //guilty until proven innocent
+
+ // pass 1: exact match of a spacer against the start of the read
+ for(int i=0;i<spacer.size();i++){
+ string oligo = spacer[i];
+
+ // read is shorter than this spacer: stop looking at the remaining spacers
+ if(rawSequence.length() < oligo.length()){
+ success = sdiffs+10;
+ break;
+ }
+
+ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+ seq.setUnaligned(rawSequence.substr(oligo.length()));
+ success = 0;
+ break;
+ }
+ }
+
+ //if you found the spacer or if you don't want to allow for diffs
+ if ((sdiffs == 0) || (success == 0)) { return success; }
+
+ else { //try aligning and see if you can find it
+
+ int maxLength = 0;
+
+ // size the aligner from the longest spacer plus the allowed differences
+ Alignment* alignment;
+ if (spacer.size() > 0) {
+ for(int i = 0; i < spacer.size(); i++){
+ if(spacer[i].length() > maxLength){
+ maxLength = spacer[i].length();
+ }
+ }
+ alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1));
+
+ }else{ alignment = NULL; }
+
+ //can you find the spacer
+ int minDiff = 1e6;
+ int minCount = 1;
+ int minPos = 0;
+
+ for(int i = 0; i < spacer.size(); i++){
+ string oligo = spacer[i];
+ // int length = oligo.length();
+
+ if(rawSequence.length() < maxLength){ //let's just assume that the spacers are the same length
+ success = sdiffs + 10;
+ break;
+ }
+
+ //use needleman to align first spacer.length()+numdiffs of sequence to each spacer
+ alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs));
+ oligo = alignment->getSeqAAln();
+ string temp = alignment->getSeqBAln();
+
+ int alnLength = oligo.length();
+
+ // cut both aligned strings after the last non-gap character of the spacer
+ for(int i=oligo.length()-1;i>=0;i--){
+ if(oligo[i] != '-'){ alnLength = i+1; break; }
+ }
+ oligo = oligo.substr(0,alnLength);
+ temp = temp.substr(0,alnLength);
+
+ int numDiff = countDiffs(oligo, temp);
+
+ if(numDiff < minDiff){
+ minDiff = numDiff;
+ minCount = 1;
+ minPos = 0;
+ // minPos = number of read bases covered by the aligned spacer
+ for(int i=0;i<alnLength;i++){
+ if(temp[i] != '-'){
+ minPos++;
+ }
+ }
+ }
+ else if(numDiff == minDiff){
+ minCount++;
+ }
+
+ }
+
+ if(minDiff > sdiffs) { success = minDiff; } //no good matches
+ else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple spacers
+ else{ //use the best match
+ seq.setUnaligned(rawSequence.substr(minPos));
+ success = minDiff;
+ }
+
+ if (alignment != NULL) { delete alignment; }
+
+ }
+
+ return success;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "TrimOligos", "stripSpacer");
+ exit(1);
+ }
+}
//******************************************************************/
bool TrimOligos::compareDNASeq(string oligo, string seq){
class TrimOligos {
public:
- TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
+ TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
+ TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer
~TrimOligos();
int stripBarcode(Sequence&, int&);
int stripBarcode(Sequence&, QualityScores&, int&);
int stripForward(Sequence&, int&);
- int stripForward(Sequence&, QualityScores&, int&);
+ int stripForward(Sequence&, QualityScores&, int&, bool);
bool stripReverse(Sequence&);
bool stripReverse(Sequence&, QualityScores&);
+
+ bool stripLinker(Sequence&);
+ bool stripLinker(Sequence&, QualityScores&);
+
+ bool stripSpacer(Sequence&);
+ bool stripSpacer(Sequence&, QualityScores&);
private:
- int pdiffs, bdiffs;
+ int pdiffs, bdiffs, ldiffs, sdiffs;
map<string, int> barcodes;
map<string, int> primers;
vector<string> revPrimer;
+ vector<string> linker;
+ vector<string> spacer;
MothurOut* m;
CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength);
CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs);
- CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
+ CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs);
+ CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs);
+ CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles);
+ CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward);
CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim);
CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold);
CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage);
helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n";
helpString += "The minlength parameter allows you to set and minimum sequence length. \n";
helpString += "The maxlength parameter allows you to set and maximum sequence length. \n";
- helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n";
+ helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n";
helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n";
helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
+ helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n";
+ helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n";
helpString += "The qfile parameter allows you to provide a quality file.\n";
helpString += "The qthreshold parameter allows you to set a minimum quality score allowed. \n";
helpString += "The qaverage parameter allows you to set a minimum average quality score allowed. \n";
helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n";
helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n";
helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n";
+ helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n";
helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n";
helpString += "The keepfirst parameter trims the sequence to the first keepfirst number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements. \n";
helpString += "The removelast removes the last removelast number of bases after the barcode or primers are removed, before the sequence is checked to see if it meets the other requirements.\n";
temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
m->mothurConvert(temp, pdiffs);
+
+ temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, ldiffs);
+
+ temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; }
+ m->mothurConvert(temp, sdiffs);
- temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); }
+ temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); }
m->mothurConvert(temp, tdiffs);
- if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; }
+ if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; }
temp = validParameter.validFile(parameters, "qfile", true);
if (temp == "not found") { qFileName = ""; }
temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; }
allFiles = m->isTrue(temp);
+
+ temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; }
+ keepforward = m->isTrue(temp);
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
numFPrimers = 0; //this needs to be initialized
numRPrimers = 0;
+ numSpacers = 0;
+ numLinkers = 0;
createGroup = false;
vector<vector<string> > fastaFileNames;
vector<vector<string> > qualFileNames;
outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName);
}
}
+
+ //fills lines and qlines
+ setLines(fastaFile, qFileName);
- vector<unsigned long long> fastaFilePos;
- vector<unsigned long long> qFilePos;
+ if(processors == 1){
+ driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
+ }else{
+ createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
+ }
- setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
-
- for (int i = 0; i < (fastaFilePos.size()-1); i++) {
- lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
- if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); }
- }
- if(qFileName == "") { qLines = lines; } //files with duds
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- if(processors == 1){
- driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
- }else{
- createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames);
- }
- #else
- driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]);
- #endif
if (m->control_pressed) { return 0; }
/**************************************************************************************/
-int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair* line, linePair* qline) {
+int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames, linePair line, linePair qline) {
try {
ifstream inFASTA;
m->openInputFile(filename, inFASTA);
- inFASTA.seekg(line->start);
+ inFASTA.seekg(line.start);
ifstream qFile;
if(qFileName != "") {
m->openInputFile(qFileName, qFile);
- qFile.seekg(qline->start);
+ qFile.seekg(qline.start);
}
int count = 0;
bool moreSeqs = 1;
- TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
+ TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
while (moreSeqs) {
int barcodeIndex = 0;
int primerIndex = 0;
+ if(numLinkers != 0){
+ success = trimOligos.stripLinker(currSeq, currQual);
+ if(success > ldiffs) { trashCode += 'k'; }
+ else{ currentSeqsDiffs += success; }
+
+ }
+
if(barcodes.size() != 0){
success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
if(success > bdiffs) { trashCode += 'b'; }
else{ currentSeqsDiffs += success; }
}
+ if(numSpacers != 0){
+ success = trimOligos.stripSpacer(currSeq, currQual);
+ if(success > sdiffs) { trashCode += 's'; }
+ else{ currentSeqsDiffs += success; }
+
+ }
+
if(numFPrimers != 0){
- success = trimOligos.stripForward(currSeq, currQual, primerIndex);
+ success = trimOligos.stripForward(currSeq, currQual, primerIndex, keepforward);
if(success > pdiffs) { trashCode += 'f'; }
else{ currentSeqsDiffs += success; }
}
count++;
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
unsigned long long pos = inFASTA.tellg();
- if ((pos == -1) || (pos >= line->end)) { break; }
+ if ((pos == -1) || (pos >= line.end)) { break; }
#else
if (inFASTA.eof()) { break; }
int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector<vector<string> > fastaFileNames, vector<vector<string> > qualFileNames, vector<vector<string> > nameFileNames) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 1;
+
+ int process = 1;
int exitCommand = 1;
processIDS.clear();
- //loop through and create all the processes you want
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ //loop through and create all the processes you want
while (process != processors) {
int pid = fork();
int temp = processIDS[i];
wait(&temp);
}
-
- //append files
+#else
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the trimData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<trimData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
+
+ //Create processor worker threads.
+ for( int i=0; i<processors-1; i++){
+
+ string extension = "";
+ if (i != 0) { extension = toString(i) + ".temp"; processIDS.push_back(i); }
+ vector<vector<string> > tempFASTAFileNames = fastaFileNames;
+ vector<vector<string> > tempPrimerQualFileNames = qualFileNames;
+ vector<vector<string> > tempNameFileNames = nameFileNames;
+
+ if(allFiles){
+ ofstream temp;
+
+ for(int i=0;i<tempFASTAFileNames.size();i++){
+ for(int j=0;j<tempFASTAFileNames[i].size();j++){
+ if (tempFASTAFileNames[i][j] != "") {
+ tempFASTAFileNames[i][j] += extension;
+ m->openOutputFile(tempFASTAFileNames[i][j], temp); temp.close();
+
+ if(qFileName != ""){
+ tempPrimerQualFileNames[i][j] += extension;
+ m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close();
+ }
+ if(nameFile != ""){
+ tempNameFileNames[i][j] += extension;
+ m->openOutputFile(tempNameFileNames[i][j], temp); temp.close();
+ }
+ }
+ }
+ }
+ }
+
+
+ trimData* tempTrim = new trimData(filename,
+ qFileName, nameFile,
+ (trimFASTAFileName+extension),
+ (scrapFASTAFileName+extension),
+ (trimQualFileName+extension),
+ (scrapQualFileName+extension),
+ (trimNameFileName+extension),
+ (scrapNameFileName+extension),
+ (groupFile+extension),
+ tempFASTAFileNames,
+ tempPrimerQualFileNames,
+ tempNameFileNames,
+ lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
+ pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer,
+ primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
+ qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
+ minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
+ pDataArray.push_back(tempTrim);
+
+ hThreadArray[i] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
+ }
+
+ //parent do my part
+ ofstream temp;
+ m->openOutputFile(trimFASTAFileName, temp); temp.close();
+ m->openOutputFile(scrapFASTAFileName, temp); temp.close();
+ if(qFileName != ""){
+ m->openOutputFile(trimQualFileName, temp); temp.close();
+ m->openOutputFile(scrapQualFileName, temp); temp.close();
+ }
+ if (nameFile != "") {
+ m->openOutputFile(trimNameFileName, temp); temp.close();
+ m->openOutputFile(scrapNameFileName, temp); temp.close();
+ }
+
+ driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), fastaFileNames, qualFileNames, nameFileNames, lines[processors-1], qLines[processors-1]);
+ processIDS.push_back(processors-1);
+
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ for (map<string, int>::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) {
+ map<string, int>::iterator it2 = groupCounts.find(it->first);
+ if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; }
+ else { groupCounts[it->first] += it->second; }
+ }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
+#endif
+
+
+ //append files
for(int i=0;i<processIDS.size();i++){
m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine();
}
}
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(createGroup){
ifstream in;
string tempFile = filename + toString(processIDS[i]) + ".num.temp";
if (tempNum != 0) {
while (!in.eof()) {
in >> group >> tempNum; m->gobble(in);
-
+
map<string, int>::iterator it = groupCounts.find(group);
if (it == groupCounts.end()) { groupCounts[group] = tempNum; }
else { groupCounts[it->first] += tempNum; }
}
in.close(); m->mothurRemove(tempFile);
}
-
+ #endif
}
-
- return exitCommand;
-#endif
+
+ return exitCommand;
}
catch(exception& e) {
m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim");
/**************************************************************************************************/
-int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos) {
+int TrimSeqsCommand::setLines(string filename, string qfilename) {
try {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+
+ vector<unsigned long long> fastaFilePos;
+ vector<unsigned long long> qfileFilePos;
+
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
//set file positions for fasta file
fastaFilePos = m->divideFile(filename, processors);
- if (qfilename == "") { return processors; }
-
//get name of first sequence in each chunk
map<string, int> firstSeqNames;
for (int i = 0; i < (fastaFilePos.size()-1); i++) {
in.close();
}
-
- //seach for filePos of each first name in the qfile and save in qfileFilePos
- ifstream inQual;
- m->openInputFile(qfilename, inQual);
-
- string input;
- while(!inQual.eof()){
- input = m->getline(inQual);
-
- if (input.length() != 0) {
- if(input[0] == '>'){ //this is a sequence name line
- istringstream nameStream(input);
-
- string sname = ""; nameStream >> sname;
- sname = sname.substr(1);
-
- map<string, int>::iterator it = firstSeqNames.find(sname);
-
- if(it != firstSeqNames.end()) { //this is the start of a new chunk
- unsigned long long pos = inQual.tellg();
- qfileFilePos.push_back(pos - input.length() - 1);
- firstSeqNames.erase(it);
- }
- }
- }
-
- if (firstSeqNames.size() == 0) { break; }
- }
- inQual.close();
-
- if (firstSeqNames.size() != 0) {
- for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
- m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
- }
- qFileName = "";
- return processors;
- }
-
- //get last file position of qfile
- FILE * pFile;
- unsigned long long size;
-
- //get num bytes in file
- pFile = fopen (qfilename.c_str(),"rb");
- if (pFile==NULL) perror ("Error opening file");
- else{
- fseek (pFile, 0, SEEK_END);
- size=ftell (pFile);
- fclose (pFile);
- }
-
- qfileFilePos.push_back(size);
+ if(qfilename != "") {
+ //seach for filePos of each first name in the qfile and save in qfileFilePos
+ ifstream inQual;
+ m->openInputFile(qfilename, inQual);
+
+ string input;
+ while(!inQual.eof()){
+ input = m->getline(inQual);
+
+ if (input.length() != 0) {
+ if(input[0] == '>'){ //this is a sequence name line
+ istringstream nameStream(input);
+
+ string sname = ""; nameStream >> sname;
+ sname = sname.substr(1);
+
+ map<string, int>::iterator it = firstSeqNames.find(sname);
+
+ if(it != firstSeqNames.end()) { //this is the start of a new chunk
+ unsigned long long pos = inQual.tellg();
+ qfileFilePos.push_back(pos - input.length() - 1);
+ firstSeqNames.erase(it);
+ }
+ }
+ }
+
+ if (firstSeqNames.size() == 0) { break; }
+ }
+ inQual.close();
+
+
+ if (firstSeqNames.size() != 0) {
+ for (map<string, int>::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) {
+ m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine();
+ }
+ qFileName = "";
+ return processors;
+ }
+
+ //get last file position of qfile
+ FILE * pFile;
+ unsigned long long size;
+
+ //get num bytes in file
+ pFile = fopen (qfilename.c_str(),"rb");
+ if (pFile==NULL) perror ("Error opening file");
+ else{
+ fseek (pFile, 0, SEEK_END);
+ size=ftell (pFile);
+ fclose (pFile);
+ }
+
+ qfileFilePos.push_back(size);
+ }
+
+ for (int i = 0; i < (fastaFilePos.size()-1); i++) {
+ lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)]));
+ if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); }
+ }
+ if(qfilename == "") { qLines = lines; } //files with duds
return processors;
#else
-
- fastaFilePos.push_back(0); qfileFilePos.push_back(0);
- fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
+
+ if (processors == 1) { //save time
+ //fastaFilePos.push_back(0); qfileFilePos.push_back(0);
+ //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000);
+ lines.push_back(linePair(0, 1000));
+ if (qfilename != "") { qLines.push_back(linePair(0, 1000)); }
+ }else{
+ int numFastaSeqs = 0;
+ fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs);
+ if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); }
+
+ if (qfilename != "") {
+ int numQualSeqs = 0;
+ qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs);
+
+ if (numFastaSeqs != numQualSeqs) {
+ m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true;
+ }
+ }
+
+ //figure out how many sequences you have to process
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+ for (int i = 0; i < processors; i++) {
+ int startIndex = i * numSeqsPerProcessor;
+ if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
+ lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
+ cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
+ if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
+ }
+
+ if(qfilename == "") { qLines = lines; } //files with duds
+ }
return 1;
#endif
primerNameVector.push_back(group);
}
else if(type == "REVERSE"){
- Sequence oligoRC("reverse", oligo);
- oligoRC.reverseComplement();
- revPrimer.push_back(oligoRC.getUnaligned());
+ //Sequence oligoRC("reverse", oligo);
+ //oligoRC.reverseComplement();
+ string oligoRC = reverseOligo(oligo);
+ revPrimer.push_back(oligoRC);
}
else if(type == "BARCODE"){
inOligos >> group;
barcodes[oligo]=indexBarcode; indexBarcode++;
barcodeNameVector.push_back(group);
+ }else if(type == "LINKER"){
+ linker.push_back(oligo);
+ }else if(type == "SPACER"){
+ spacer.push_back(oligo);
}
else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); }
}
}
numFPrimers = primers.size();
numRPrimers = revPrimer.size();
+ numLinkers = linker.size();
+ numSpacers = spacer.size();
bool allBlank = true;
for (int i = 0; i < barcodeNameVector.size(); i++) {
}
}
+//********************************************************************/
+string TrimSeqsCommand::reverseOligo(string oligo){ // builds and returns the reverse complement of oligo
+ try {
+ string reverse = "";
+ // Walk the oligo from its last character to its first, appending the complement of each base, so the result is reversed and complemented in one pass.
+ for(int i=oligo.length()-1;i>=0;i--){
+ // Only uppercase letters are compared, so a lowercase base falls through to the final else and becomes 'N'.
+ if(oligo[i] == 'A') { reverse += 'T'; }
+ else if(oligo[i] == 'T'){ reverse += 'A'; }
+ else if(oligo[i] == 'U'){ reverse += 'A'; } // 'U' is complemented to 'A', same as 'T'
+
+ else if(oligo[i] == 'G'){ reverse += 'C'; }
+ else if(oligo[i] == 'C'){ reverse += 'G'; }
+ // The remaining branches swap ambiguity codes pairwise: R<->Y, M<->K, B<->V, D<->H.
+ else if(oligo[i] == 'R'){ reverse += 'Y'; }
+ else if(oligo[i] == 'Y'){ reverse += 'R'; }
+
+ else if(oligo[i] == 'M'){ reverse += 'K'; }
+ else if(oligo[i] == 'K'){ reverse += 'M'; }
+
+ else if(oligo[i] == 'W'){ reverse += 'W'; } // 'W' maps to itself
+ else if(oligo[i] == 'S'){ reverse += 'S'; } // 'S' maps to itself
+
+ else if(oligo[i] == 'B'){ reverse += 'V'; }
+ else if(oligo[i] == 'V'){ reverse += 'B'; }
+
+ else if(oligo[i] == 'D'){ reverse += 'H'; }
+ else if(oligo[i] == 'H'){ reverse += 'D'; }
+
+ else { reverse += 'N'; } // every other character (including 'N' itself) is emitted as 'N'
+ }
+
+
+ return reverse;
+ }
+ catch(exception& e) { // report through the shared MothurOut object, then terminate the process
+ m->errorOut(e, "TrimSeqsCommand", "reverseOligo");
+ exit(1);
+ }
+}
//***************************************************************************************************************
#include "sequence.hpp"
#include "qualityscores.h"
#include "groupmap.h"
+#include "trimoligos.h"
+
class TrimSeqsCommand : public Command {
public:
private:
GroupMap* groupMap;
-
- struct linePair {
- unsigned long long start;
- unsigned long long end;
- linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
- };
-
+
+ struct linePair { // describes the slice of an input file that one driverCreateTrim call processes
+ unsigned long long start; // position handed to seekg() before reading begins
+ unsigned long long end; // fork-based build: reading stops once tellg() >= end; the non-Unix multi-processor setLines branch stores a sequence count here instead
+ linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
+ linePair() {} // NOTE(review): leaves start and end uninitialized; assign both before use
+ };
+
bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
bool keepFirstTrim(Sequence&, QualityScores&);
bool removeLastTrim(Sequence&, QualityScores&);
bool cullLength(Sequence&);
bool cullHomoP(Sequence&);
bool cullAmbigs(Sequence&);
+ string reverseOligo(string);
bool abort, createGroup;
string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
- bool flip, allFiles, qtrim;
- int numFPrimers, numRPrimers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, comboStarts;
+ bool flip, allFiles, qtrim, keepforward;
+ int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
int qWindowSize, qWindowStep, keepFirst, removeLast;
double qRollAverage, qThreshold, qWindowAverage, qAverage;
vector<string> revPrimer, outputNames;
map<string, int> barcodes;
vector<string> groupVector;
map<string, int> primers;
+ vector<string> linker;
+ vector<string> spacer;
map<string, int> combos;
map<string, int> groupToIndex;
vector<string> primerNameVector; //needed here?
map<string, string> nameMap;
vector<int> processIDS; //processid
- vector<linePair*> lines;
- vector<linePair*> qLines;
+ vector<linePair> lines;
+ vector<linePair> qLines;
- int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair*, linePair*);
+ int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);
int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
- int setLines(string, string, vector<unsigned long long>&, vector<unsigned long long>&);
+ int setLines(string, string);
+};
+
+/**************************************************************************************************/
+//Custom data structure handed to each worker thread.
+// The thread entry point receives it through a single void pointer (LPVOID) and casts it
+// back to trimData*, so every input, setting and per-thread result travels in this object.
+struct trimData {
+    unsigned long long start, end;   //not populated from any constructor argument; kept zeroed
+    MothurOut* m;                    //shared logger / file-utility object, not owned here
+    string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
+    vector<vector<string> > fastaFileNames;   //indexed [barcodeIndex][primerIndex] by the thread function
+    vector<vector<string> > qualFileNames;
+    vector<vector<string> > nameFileNames;
+    unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
+    bool flip, allFiles, qtrim, keepforward, createGroup;
+    int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
+    int qWindowSize, qWindowStep, keepFirst, removeLast, count;
+    double qRollAverage, qThreshold, qWindowAverage, qAverage;
+    vector<string> revPrimer;
+    map<string, int> barcodes;
+    map<string, int> primers;
+    vector<string> linker;
+    vector<string> spacer;
+    map<string, int> combos;
+    vector<string> primerNameVector;
+    vector<string> barcodeNameVector;
+    map<string, int> groupCounts;             //filled by the thread: sequences written per group
+    map<string, string> nameMap;
+
+    //Default construction zeroes every scalar and nulls the MothurOut pointer so that an
+    //object created before its settings are known never holds indeterminate values.
+    trimData() : start(0), end(0), m(0),
+        lineStart(0), lineEnd(0), qlineStart(0), qlineEnd(0),
+        flip(false), allFiles(false), qtrim(false), keepforward(false), createGroup(false),
+        numFPrimers(0), numRPrimers(0), numLinkers(0), numSpacers(0),
+        maxAmbig(0), maxHomoP(0), minLength(0), maxLength(0),
+        tdiffs(0), bdiffs(0), pdiffs(0), ldiffs(0), sdiffs(0),
+        qWindowSize(0), qWindowStep(0), keepFirst(0), removeLast(0), count(0),
+        qRollAverage(0), qThreshold(0), qWindowAverage(0), qAverage(0) {}
+
+    //Full constructor: copies every argument into the member of the matching purpose.
+    //  fn/qn/nf                - input fasta, quality and name files
+    //  tn/sn, tqn/sqn, tnn/snn - trim/scrap outputs for fasta, quality and names
+    //  gn                      - group output file
+    //  ffn/qfn/nfn             - per barcode/primer output file names
+    //  lstart/lend, qstart/qend - slice of the fasta / quality file for this thread
+    //  pd/bd/ld/sd/td          - primer, barcode, linker, spacer and total diff limits
+    //  WindowAverage is taken as double: the member it feeds (qWindowAverage) is a double,
+    //  and an int parameter silently dropped the fractional part of the caller's value.
+    //The oligo counts (numFPrimers, numRPrimers, numLinkers, numSpacers) are derived from
+    //the sizes of the containers passed in; count starts at 0.
+    trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout,
+        int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa,
+        vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
+        int WindowStep, int WindowSize, double WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
+        int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
+        start = 0;      //no constructor argument feeds these two; keep them defined
+        end = 0;
+        filename = fn;
+        qFileName = qn;
+        nameFile = nf;
+        trimFileName = tn;
+        scrapFileName = sn;
+        trimQFileName = tqn;
+        scrapQFileName = sqn;
+        trimNFileName = tnn;
+        scrapNFileName = snn;
+        groupFileName = gn;
+        fastaFileNames = ffn;
+        qualFileNames = qfn;
+        nameFileNames = nfn;
+        lineStart = lstart;
+        lineEnd = lend;
+        qlineStart = qstart;
+        qlineEnd = qend;
+        m = mout;
+
+        pdiffs = pd;
+        bdiffs = bd;
+        ldiffs = ld;
+        sdiffs = sd;
+        tdiffs = td;
+        barcodes = bar;
+        primers = pri; numFPrimers = primers.size();
+        revPrimer = revP; numRPrimers = revPrimer.size();
+        linker = li; numLinkers = linker.size();
+        spacer = spa; numSpacers = spacer.size();
+        primerNameVector = priNameVector;
+        barcodeNameVector = barNameVector;
+
+        createGroup = cGroup;
+        allFiles = aFiles;
+        keepforward = keepF;
+        keepFirst = keepfi;
+        removeLast = removeL;
+        qWindowStep = WindowStep;
+        qWindowSize = WindowSize;
+        qWindowAverage = WindowAverage;
+        qtrim = trim;
+        qThreshold = Threshold;
+        qAverage = Average;
+        qRollAverage = RollAverage;
+        minLength = minL;
+        maxAmbig = maxA;
+        maxHomoP = maxH;
+        maxLength = maxL;
+        flip = fli;
+        nameMap = nm;
+        count = 0;
+    }
};
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+#else
+static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ // thread entry point (non-Unix branch of the #if above): trims/filters the sequences described by the trimData behind lpParam, writes trim and scrap outputs, returns 0
+ trimData* pDataArray; // every input, setting and per-thread result (count, groupCounts) is reached through this pointer
+ pDataArray = (trimData*)lpParam; // lpParam is treated as a trimData* without any check
+
+ try {
+ ofstream trimFASTAFile;
+ pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
+
+ ofstream scrapFASTAFile;
+ pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
+
+ ofstream trimQualFile;
+ ofstream scrapQualFile;
+ if(pDataArray->qFileName != ""){ // quality outputs are opened only when a quality file name was supplied
+ pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
+ pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
+ }
+
+ ofstream trimNameFile;
+ ofstream scrapNameFile;
+ if(pDataArray->nameFile != ""){ // same rule for the name-file outputs
+ pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
+ pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
+ }
+
+
+ ofstream outGroupsFile;
+ if (pDataArray->createGroup){ pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile); }
+ if(pDataArray->allFiles){ // per barcode/primer outputs are appended to later, so open-and-close each one first
+ for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
+ for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
+ if (pDataArray->fastaFileNames[i][j] != "") {
+ ofstream temp;
+ pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp); temp.close();
+ if(pDataArray->qFileName != ""){
+ pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp); temp.close();
+ }
+
+ if(pDataArray->nameFile != ""){
+ pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp); temp.close();
+ }
+ }
+ }
+ }
+ }
+
+ ifstream inFASTA;
+ pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
+ if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { // 0 and 1 are both treated as "start of file"
+ inFASTA.seekg(0);
+ }else { //this accounts for the difference in line endings.
+ inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA);
+ }
+
+ ifstream qFile;
+ if(pDataArray->qFileName != "") {
+ pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
+ if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) { // same positioning rule as for the fasta file
+ qFile.seekg(0);
+ }else { //this accounts for the difference in line endings.
+ qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile);
+ }
+ }
+
+
+ TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer); // helper used below for the strip* calls; receives the per-oligo diff limits and the oligo lists
+
+ pDataArray->count = pDataArray->lineEnd; // count is set up front to the number of sequences requested, not incremented per sequence
+ for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
+
+ if (pDataArray->m->control_pressed) { // abort requested: close a subset of the streams and leave early
+ inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
+ if (pDataArray->createGroup) { outGroupsFile.close(); }
+ if(pDataArray->qFileName != ""){ qFile.close(); }
+ return 0;
+ }
+
+ int success = 1; // reused below: a diff count after the linker/barcode/spacer/forward steps, a pass/fail flag elsewhere
+ string trashCode = ""; // one letter is appended per failed check; an empty string means the sequence is kept
+ int currentSeqsDiffs = 0; // running sum of diffs from the linker/barcode/spacer/forward steps, compared with tdiffs
+
+ Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
+
+ QualityScores currQual;
+ if(pDataArray->qFileName != ""){
+ currQual = QualityScores(qFile); pDataArray->m->gobble(qFile);
+ }
+
+ string origSeq = currSeq.getUnaligned(); // kept so a scrapped sequence can be written back unmodified
+ if (origSeq != "") { // a record with no bases is skipped entirely (no output at all)
+
+ int barcodeIndex = 0;
+ int primerIndex = 0;
+
+ if(pDataArray->numLinkers != 0){
+ success = trimOligos.stripLinker(currSeq, currQual);
+ if(success > pDataArray->ldiffs) { trashCode += 'k'; } // 'k': linker result exceeded ldiffs
+ else{ currentSeqsDiffs += success; }
+ }
+
+ if(pDataArray->barcodes.size() != 0){
+ success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
+ if(success > pDataArray->bdiffs) { trashCode += 'b'; } // 'b': barcode result exceeded bdiffs
+ else{ currentSeqsDiffs += success; }
+ }
+
+ if(pDataArray->numSpacers != 0){
+ success = trimOligos.stripSpacer(currSeq, currQual);
+ if(success > pDataArray->sdiffs) { trashCode += 's'; } // 's': spacer result exceeded sdiffs
+ else{ currentSeqsDiffs += success; }
+
+ }
+
+ if(pDataArray->numFPrimers != 0){
+ success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
+ if(success > pDataArray->pdiffs) { trashCode += 'f'; } // 'f': forward primer result exceeded pdiffs
+ else{ currentSeqsDiffs += success; }
+ }
+
+ if (currentSeqsDiffs > pDataArray->tdiffs) { trashCode += 't'; } // 't': summed diffs exceeded tdiffs
+
+ if(pDataArray->numRPrimers != 0){
+ success = trimOligos.stripReverse(currSeq, currQual);
+ if(!success) { trashCode += 'r'; } // 'r': here the return value is used as a plain pass/fail flag
+ }
+
+ if(pDataArray->keepFirst != 0){
+ //success = keepFirstTrim(currSeq, currQual);
+ success = 1; // this step never marks the sequence as failed
+ if(currQual.getName() != ""){
+ currQual.trimQScores(-1, pDataArray->keepFirst);
+ }
+ currSeq.trim(pDataArray->keepFirst);
+ }
+
+ if(pDataArray->removeLast != 0){
+ //success = removeLastTrim(currSeq, currQual);
+ success = 0;
+ int length = currSeq.getNumBases() - pDataArray->removeLast; // bases that would remain after dropping removeLast of them
+
+ if(length > 0){
+ if(currQual.getName() != ""){
+ currQual.trimQScores(-1, length);
+ }
+ currSeq.trim(length);
+ success = 1;
+ }
+ else{ success = 0; }
+
+ if(!success) { trashCode += 'l'; } // 'l': nothing would be left after removing removeLast bases
+ }
+
+
+ if(pDataArray->qFileName != ""){
+ int origLength = currSeq.getNumBases();
+
+ if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); } // else-if chain: only the first non-zero quality option is applied
+ else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); }
+ else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); }
+ else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); }
+ else { success = 1; }
+
+ //you don't want to trim, if it fails above then scrap it
+ if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) { success = 0; }
+
+ if(!success) { trashCode += 'q'; } // 'q': quality check failed
+ }
+
+ if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
+ //success = cullLength(currSeq);
+ int length = currSeq.getNumBases();
+ success = 0; //guilty until proven innocent
+ if(length >= pDataArray->minLength && pDataArray->maxLength == 0) { success = 1; } // maxLength == 0 means no upper bound
+ else if(length >= pDataArray->minLength && length <= pDataArray->maxLength) { success = 1; }
+ else { success = 0; }
+
+ if(!success) { trashCode += 'l'; } // 'l' is also used for the length filter
+ }
+ if(pDataArray->maxHomoP > 0){
+ //success = cullHomoP(currSeq);
+ int longHomoP = currSeq.getLongHomoPolymer();
+ success = 0; //guilty until proven innocent
+ if(longHomoP <= pDataArray->maxHomoP){ success = 1; }
+ else { success = 0; }
+
+ if(!success) { trashCode += 'h'; } // 'h': homopolymer value above maxHomoP
+ }
+ if(pDataArray->maxAmbig != -1){ // -1 switches the ambiguous-base check off
+ //success = cullAmbigs(currSeq);
+ int numNs = currSeq.getAmbigBases();
+ success = 0; //guilty until proven innocent
+ if(numNs <= pDataArray->maxAmbig) { success = 1; }
+ else { success = 0; }
+ if(!success) { trashCode += 'n'; } // 'n': more ambiguous bases than maxAmbig
+ }
+
+ if(pDataArray->flip){ // should go last
+ currSeq.reverseComplement();
+ if(pDataArray->qFileName != ""){
+ currQual.flipQScores();
+ }
+ }
+
+ if(trashCode.length() == 0){ // every check passed: write to the trim outputs
+ currSeq.setAligned(currSeq.getUnaligned());
+ currSeq.printSequence(trimFASTAFile);
+
+ if(pDataArray->qFileName != ""){
+ currQual.printQScores(trimQualFile);
+ }
+
+ if(pDataArray->nameFile != ""){
+ map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+ if (itName != pDataArray->nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; }
+ else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+ }
+
+ if (pDataArray->createGroup) {
+ if(pDataArray->barcodes.size() != 0){ // a group line is only written when barcodes are in use
+ string thisGroup = pDataArray->barcodeNameVector[barcodeIndex]; // group name = barcode name, optionally followed by "." + primer name
+ if (pDataArray->primers.size() != 0) {
+ if (pDataArray->primerNameVector[primerIndex] != "") {
+ if(thisGroup != "") {
+ thisGroup += "." + pDataArray->primerNameVector[primerIndex];
+ }else {
+ thisGroup = pDataArray->primerNameVector[primerIndex];
+ }
+ }
+ }
+
+ outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
+
+ if (pDataArray->nameFile != "") {
+ map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+ if (itName != pDataArray->nameMap.end()) {
+ vector<string> thisSeqsNames;
+ pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
+ for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+ outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
+ }
+ }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+ }
+
+ map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup); // per-thread tally of kept sequences per group
+ if (it == pDataArray->groupCounts.end()) { pDataArray->groupCounts[thisGroup] = 1; }
+ else { pDataArray->groupCounts[it->first]++; }
+
+ }
+ }
+
+ if(pDataArray->allFiles){ // also append the record to the file chosen by its barcode and primer index
+ ofstream output;
+ pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
+ currSeq.printSequence(output);
+ output.close();
+
+ if(pDataArray->qFileName != ""){
+ pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
+ currQual.printQScores(output);
+ output.close();
+ }
+
+ if(pDataArray->nameFile != ""){
+ map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+ if (itName != pDataArray->nameMap.end()) {
+ pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
+ output << itName->first << '\t' << itName->second << endl;
+ output.close();
+ }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+ }
+ }
+ }
+ else{ // at least one check failed: write to the scrap outputs
+ if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
+ map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+ if (itName != pDataArray->nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; }
+ else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+ }
+ currSeq.setName(currSeq.getName() + '|' + trashCode); // scrapped records carry their failure letters after a '|'
+ currSeq.setUnaligned(origSeq); // the untrimmed bases are restored before writing
+ currSeq.setAligned(origSeq);
+ currSeq.printSequence(scrapFASTAFile);
+ if(pDataArray->qFileName != ""){
+ currQual.printQScores(scrapQualFile); // NOTE(review): currQual is written in its current state, it is not restored the way the bases are - confirm intended
+ }
+ }
+
+ }
+
+ //report progress
+ if((i) % 1000 == 0){ pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine(); } // fires for i == 0 as well
+
+ }
+ //report progress
+ if((pDataArray->count) % 1000 != 0){ pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine(); } // final total, unless it is a multiple of 1000
+
+
+ inFASTA.close();
+ trimFASTAFile.close();
+ scrapFASTAFile.close();
+ if (pDataArray->createGroup) { outGroupsFile.close(); }
+ if(pDataArray->qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); }
+ if(pDataArray->nameFile != "") { scrapNameFile.close(); trimNameFile.close(); }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
+ exit(1); // NOTE(review): exit() ends the whole process from inside a worker thread
+ }
+ }
+#endif
+
+
+/**************************************************************************************************/
#endif
lines.clear();
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors != 1){
int numPairs = namesOfGroupCombos.size();
int numPairsPerProcessor = numPairs / processors;
//get scores for random trees
for (int j = 0; j < iters; j++) {
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
driver(T[i], namesOfGroupCombos, 0, namesOfGroupCombos.size(), rScores);
}else{
int UnifracWeightedCommand::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, vector< vector<double> >& scores) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
vector<int> processIDS;
}
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
}else{
EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
vector<int> processIDS;
}
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size(), true);
}else{
EstOutput Unweighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos, bool usingGroups) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
vector<int> processIDS;
sumSharedA1 = the sum of all shared otus in A where B = 1
sumSharedB1 = the sum of all shared otus in B where A = 1 */
- for (int i = 0; i < shared[0]->size(); i++) {
+ for (int i = 0; i < shared[0]->getNumBins(); i++) {
//store in temps to avoid multiple repetitive function calls
tempA = shared[0]->getAbundance(i);
tempB = shared[1]->getAbundance(i);
}
}
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
data = driver(t, namesOfGroupCombos, 0, namesOfGroupCombos.size());
}else{
EstOutput Weighted::createProcesses(Tree* t, vector< vector<string> > namesOfGroupCombos) {
try {
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
int process = 1;
vector<int> processIDS;