#!/usr/bin/env ruby # Copyright (C) 2007-2010 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # http://www.gnu.org/copyleft/gpl.html # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # This program is part of the Biopieces framework (www.biopieces.org). # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # Find and count MID tags in sequences in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'maasha/biopieces' MID_LEN = 10 mids = %w{ ACGAGTGCGT ACGCTCGACA AGACGCACTC AGCACTGTAG ATCAGACACG ATATCGCGAG CGTGTCTCTA CTCGCGTGTC TAGTATCAGC TCTCTATGCG TGATACGTCT TACTGAGCTA CATAGTAGTG CGAGAGATAC ATACGACGTA TCACGTACTA CGTCTAGTAC TCTACGTAGC TGTACTACTC ACGACTACAG CGTAGACTAG TACGAGTATG TACTCTCGTG TAGAGACGAG TCGTCGCTCG ACATACGCGT ACGCGAGTAT ACTACTATGT ACTGTACAGT AGACTATACT AGCGTCGTCT AGTACGCTAT ATAGAGTACT CACGCTACGT CAGTAGACGT CGACGTGACT TACACACACT TACACGTGAT TACAGATCGT TACGCTGTCT TAGTGTAGAT TCGATCACGT TCGCACTAGT TCTAGCGACT TCTATACTAT TGACGTATGT TGTGAGTAGT ACAGTATATA ACGCGATCGA ACTAGCAGTA AGCTCACGTA AGTATACATA AGTCGAGAGA AGTGCTACGA CGATCGTATA CGCAGTACGA CGCGTATACA CGTACAGTCA CGTACTCAGA CTACGCTCTA CTATAGCGTA TACGTCATCA TAGTCGCATA TATATATACA TATGCTAGTA TCACGCGAGA TCGATAGTGA TCGCTGCGTA TCTGACGTCA TGAGTCAGTA TGTAGTGTGA TGTCACACGA TGTCGTCGCA ACACATACGC ACAGTCGTGC ACATGACGAC ACGACAGCTC ACGTCTCATC ACTCATCTAC ACTCGCGCAC AGAGCGTCAC AGCGACTAGC AGTAGTGATC AGTGACACAC AGTGTATGTC ATAGATAGAC ATATAGTCGC ATCTACTGAC CACGTAGATC CACGTGTCGC CATACTCTAC CGACACTATC CGAGACGCGC CGTATGCGAC CGTCGATCTC CTACGACTGC CTAGTCACTC CTCTACGCTC CTGTACATAC TAGACTGCAC TAGCGCGCGC TAGCTCTATC TATAGACATC TATGATACGC TCACTCATAC TCATCGAGTC TCGAGCTCTC TCGCAGACAC TCTGTCTCGC TGAGTGACGC TGATGTGTAC TGCTATAGAC TGCTCGCTAC ACGTGCAGCG ACTCACAGAG AGACTCAGCG AGAGAGTGTG AGCTATCGCG AGTCTGACTG AGTGAGCTCG ATAGCTCTCG ATCACGTGCG ATCGTAGCAG ATCGTCTGTG ATGTACGATG ATGTGTCTAG CACACGATAG CACTCGCACG CAGACGTCTG CAGTACTGCG CGACAGCGAG CGATCTGTCG CGCGTGCTAG CGCTCGAGTG CGTGATGACG CTATGTACAG CTCGATATAG CTCGCACGCG CTGCGTCACG CTGTGCGTCG TAGCATACTG TATACATGTG TATCACTCAG TATCTGATAG TCGTGACATG TCTGATCGAG TGACATCTCG TGAGCTAGAG TGATAGAGCG TGCGTGTGCG TGCTAGTCAG TGTATCACAG TGTGCGCGTG ACACGACGAC ACACGTAGTA ACACTACTCG ACGACACGTA ACGAGTAGAC ACGCGTCTAG ACGTACACAC ACGTACTGTG ACGTAGATCG ACTACGTCTC ACTATACGAG ACTCGCGTCG } count_hash = Hash.new { |hash, key| hash[key] = 0 } mid_hash = {} mids.each_with_index do |mid, i| mid_hash[mid] = true end casts = [] casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) pos = options[:pos] Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| if record.has_key? :SEQ tag = record[:SEQ][pos ... pos + MID_LEN].upcase if mid_hash.has_key? tag count_hash[tag] += 1 end end output.puts record end mids.each_with_index do |mid, i| if count_hash[mid] > 0 record = {} record[:REC_TYPE] = "MID" record[:MID_NUM] = i + 1 record[:MID_COUNT] = count_hash[mid] record[:MID_SEQ] = mid output.puts record end end end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< __END__