3 # Copyright (C) 2007-2010 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
21 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
23 # This program is part of the Biopieces framework (www.biopieces.org).
25 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
27 # Find and count MID tags in sequences in the stream.
29 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
32 require 'maasha/biopieces'
# Ordered list of known MID tag sequences, written as a %w word array.
# The order is significant: further down, a MID's reported number is its
# index in this list plus one.
# NOTE(review): the closing brace of this literal is not visible in this
# listing - confirm where the list ends and whether more tags follow.
# Do not place comments inside the literal; within %w{} they would be
# parsed as additional array elements.
36 mids = %w{ ACGAGTGCGT ACGCTCGACA AGACGCACTC AGCACTGTAG ATCAGACACG
37 ATATCGCGAG CGTGTCTCTA CTCGCGTGTC TAGTATCAGC TCTCTATGCG
38 TGATACGTCT TACTGAGCTA CATAGTAGTG CGAGAGATAC ATACGACGTA
39 TCACGTACTA CGTCTAGTAC TCTACGTAGC TGTACTACTC ACGACTACAG
40 CGTAGACTAG TACGAGTATG TACTCTCGTG TAGAGACGAG TCGTCGCTCG
41 ACATACGCGT ACGCGAGTAT ACTACTATGT ACTGTACAGT AGACTATACT
42 AGCGTCGTCT AGTACGCTAT ATAGAGTACT CACGCTACGT CAGTAGACGT
43 CGACGTGACT TACACACACT TACACGTGAT TACAGATCGT TACGCTGTCT
44 TAGTGTAGAT TCGATCACGT TCGCACTAGT TCTAGCGACT TCTATACTAT
45 TGACGTATGT TGTGAGTAGT ACAGTATATA ACGCGATCGA ACTAGCAGTA
46 AGCTCACGTA AGTATACATA AGTCGAGAGA AGTGCTACGA CGATCGTATA
47 CGCAGTACGA CGCGTATACA CGTACAGTCA CGTACTCAGA CTACGCTCTA
48 CTATAGCGTA TACGTCATCA TAGTCGCATA TATATATACA TATGCTAGTA
49 TCACGCGAGA TCGATAGTGA TCGCTGCGTA TCTGACGTCA TGAGTCAGTA
50 TGTAGTGTGA TGTCACACGA TGTCGTCGCA ACACATACGC ACAGTCGTGC
51 ACATGACGAC ACGACAGCTC ACGTCTCATC ACTCATCTAC ACTCGCGCAC
52 AGAGCGTCAC AGCGACTAGC AGTAGTGATC AGTGACACAC AGTGTATGTC
53 ATAGATAGAC ATATAGTCGC ATCTACTGAC CACGTAGATC CACGTGTCGC
54 CATACTCTAC CGACACTATC CGAGACGCGC CGTATGCGAC CGTCGATCTC
55 CTACGACTGC CTAGTCACTC CTCTACGCTC CTGTACATAC TAGACTGCAC
56 TAGCGCGCGC TAGCTCTATC TATAGACATC TATGATACGC TCACTCATAC
57 TCATCGAGTC TCGAGCTCTC TCGCAGACAC TCTGTCTCGC TGAGTGACGC
58 TGATGTGTAC TGCTATAGAC TGCTCGCTAC ACGTGCAGCG ACTCACAGAG
59 AGACTCAGCG AGAGAGTGTG AGCTATCGCG AGTCTGACTG AGTGAGCTCG
60 ATAGCTCTCG ATCACGTGCG ATCGTAGCAG ATCGTCTGTG ATGTACGATG
61 ATGTGTCTAG CACACGATAG CACTCGCACG CAGACGTCTG CAGTACTGCG
62 CGACAGCGAG CGATCTGTCG CGCGTGCTAG CGCTCGAGTG CGTGATGACG
63 CTATGTACAG CTCGATATAG CTCGCACGCG CTGCGTCACG CTGTGCGTCG
64 TAGCATACTG TATACATGTG TATCACTCAG TATCTGATAG TCGTGACATG
65 TCTGATCGAG TGACATCTCG TGAGCTAGAG TGATAGAGCG TGCGTGTGCG
66 TGCTAGTCAG TGTATCACAG TGTGCGCGTG ACACGACGAC ACACGTAGTA
67 ACACTACTCG ACGACACGTA ACGAGTAGAC ACGCGTCTAG ACGTACACAC
68 ACGTACTGTG ACGTAGATCG ACTACGTCTC ACTATACGAG ACTCGCGTCG
# Main script body: tally how often each known MID tag is found at a fixed
# offset of the sequences in the stream, then build one summary record per
# MID that was seen.
# NOTE(review): several statements are absent from this listing (block
# bodies, closing `end`s, and the definitions of mid_hash, pos, MID_LEN and
# casts). The comments below describe only the lines that are present.

# Per-MID tally. The default block stores 0 for a key on its first lookup,
# so reading a missing key also inserts it.
72 count_hash = Hash.new { |hash, key| hash[key] = 0 }
# Walk the MID list together with each tag's zero-based index.
# NOTE(review): the loop body is not visible; presumably it populates
# mid_hash, which is queried further down - confirm.
75 mids.each_with_index do |mid, i|
# Declare an optional option with long name 'pos' and short name 'p':
# type 'uint', default 0, no allowed/disallowed value lists.
80 casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
# Parse the command-line arguments against the declared option casts.
82 options = Biopieces.options_parse(ARGV, casts)
# Open the streams named by the :stream_in and :stream_out options; the
# block receives an input and an output handle.
86 Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
87 input.each_record do |record|
# Only records that carry a :SEQ key are inspected for a tag.
88 if record.has_key? :SEQ
# Slice MID_LEN characters starting at offset pos (exclusive range) and
# upper-case them, so the lookup below ignores the case of the input.
# NOTE(review): if :SEQ is a String shorter than pos, the slice is nil and
# .upcase raises NoMethodError - confirm whether such records can occur.
89 tag = record[:SEQ][pos ... pos + MID_LEN].upcase
# Continue only when the extracted tag is a key of mid_hash.
# NOTE(review): the branch body is not visible; presumably it increments
# count_hash[tag] - confirm.
91 if mid_hash.has_key? tag
# Second pass over the MID list, in list order.
98 mids.each_with_index do |mid, i|
# Skip MIDs whose tally is still zero.
99 if count_hash[mid] > 0
# Fill in the summary fields: record type "MID", the MID's one-based
# position in the list, its tally, and the tag sequence itself.
# NOTE(review): the creation and output of `record` are not visible here.
101 record[:REC_TYPE] = "MID"
102 record[:MID_NUM] = i + 1
103 record[:MID_COUNT] = count_hash[mid]
104 record[:MID_SEQ] = mid
111 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<