#!/usr/bin/env ruby # Copyright (C) 2007-2011 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # http://www.gnu.org/copyleft/gpl.html # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # This program is part of the Biopieces framework (www.biopieces.org). # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # Find MID tags in sequences in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'maasha/biopieces' require 'maasha/bits' require 'pp' GS_SIZE = 10 RL_SIZE = 11 GSMID_HASH = { ACGAGTGCGT: "MID1", TCTCTATGCG: "MID10", TAGACTGCAC: "MID100", TAGCGCGCGC: "MID101", TAGCTCTATC: "MID102", TATAGACATC: "MID103", TATGATACGC: "MID104", TCACTCATAC: "MID105", TCATCGAGTC: "MID106", TCGAGCTCTC: "MID107", TCGCAGACAC: "MID108", TCTGTCTCGC: "MID109", TGATACGTCT: "MID11", TGAGTGACGC: "MID110", TGATGTGTAC: "MID111", TGCTATAGAC: "MID112", TGCTCGCTAC: "MID113", ACGTGCAGCG: "MID114", ACTCACAGAG: "MID115", AGACTCAGCG: "MID116", AGAGAGTGTG: "MID117", AGCTATCGCG: "MID118", AGTCTGACTG: "MID119", TACTGAGCTA: "MID12", AGTGAGCTCG: "MID120", ATAGCTCTCG: "MID121", ATCACGTGCG: "MID122", ATCGTAGCAG: "MID123", ATCGTCTGTG: "MID124", ATGTACGATG: "MID125", ATGTGTCTAG: "MID126", CACACGATAG: "MID127", CACTCGCACG: "MID128", CAGACGTCTG: "MID129", CATAGTAGTG: "MID13", CAGTACTGCG: "MID130", CGACAGCGAG: "MID131", CGATCTGTCG: "MID132", CGCGTGCTAG: "MID133", CGCTCGAGTG: "MID134", CGTGATGACG: "MID135", CTATGTACAG: "MID136", CTCGATATAG: "MID137", CTCGCACGCG: "MID138", CTGCGTCACG: "MID139", CGAGAGATAC: "MID14", CTGTGCGTCG: "MID140", TAGCATACTG: "MID141", TATACATGTG: "MID142", TATCACTCAG: "MID143", TATCTGATAG: "MID144", TCGTGACATG: "MID145", TCTGATCGAG: "MID146", TGACATCTCG: "MID147", TGAGCTAGAG: "MID148", TGATAGAGCG: "MID149", ATACGACGTA: "MID15", TGCGTGTGCG: "MID150", TGCTAGTCAG: "MID151", TGTATCACAG: "MID152", TGTGCGCGTG: "MID153", TCACGTACTA: "MID16", CGTCTAGTAC: "MID17", TCTACGTAGC: "MID18", TGTACTACTC: "MID19", ACGCTCGACA: "MID2", ACGACTACAG: "MID20", CGTAGACTAG: "MID21", TACGAGTATG: "MID22", TACTCTCGTG: "MID23", TAGAGACGAG: "MID24", TCGTCGCTCG: "MID25", ACATACGCGT: "MID26", ACGCGAGTAT: "MID27", ACTACTATGT: "MID28", ACTGTACAGT: "MID29", AGACGCACTC: "MID3", AGACTATACT: "MID30", AGCGTCGTCT: "MID31", AGTACGCTAT: "MID32", ATAGAGTACT: "MID33", CACGCTACGT: "MID34", CAGTAGACGT: "MID35", CGACGTGACT: "MID36", TACACACACT: "MID37", TACACGTGAT: "MID38", TACAGATCGT: "MID39", AGCACTGTAG: "MID4", TACGCTGTCT: "MID40", TAGTGTAGAT: "MID41", TCGATCACGT: "MID42", TCGCACTAGT: "MID43", TCTAGCGACT: "MID44", TCTATACTAT: "MID45", TGACGTATGT: "MID46", TGTGAGTAGT: "MID47", ACAGTATATA: "MID48", ACGCGATCGA: "MID49", ATCAGACACG: "MID5", ACTAGCAGTA: "MID50", AGCTCACGTA: "MID51", AGTATACATA: "MID52", AGTCGAGAGA: "MID53", AGTGCTACGA: "MID54", CGATCGTATA: "MID55", CGCAGTACGA: "MID56", CGCGTATACA: "MID57", CGTACAGTCA: "MID58", CGTACTCAGA: "MID59", ATATCGCGAG: "MID6", CTACGCTCTA: "MID60", CTATAGCGTA: "MID61", TACGTCATCA: "MID62", TAGTCGCATA: "MID63", TATATATACA: "MID64", TATGCTAGTA: "MID65", TCACGCGAGA: "MID66", TCGATAGTGA: "MID67", TCGCTGCGTA: "MID68", TCTGACGTCA: "MID69", CGTGTCTCTA: "MID7", TGAGTCAGTA: "MID70", TGTAGTGTGA: "MID71", TGTCACACGA: "MID72", TGTCGTCGCA: "MID73", ACACATACGC: "MID74", ACAGTCGTGC: "MID75", ACATGACGAC: "MID76", ACGACAGCTC: "MID77", ACGTCTCATC: "MID78", ACTCATCTAC: "MID79", CTCGCGTGTC: "MID8", ACTCGCGCAC: "MID80", AGAGCGTCAC: "MID81", AGCGACTAGC: "MID82", AGTAGTGATC: "MID83", AGTGACACAC: "MID84", AGTGTATGTC: "MID85", ATAGATAGAC: "MID86", ATATAGTCGC: "MID87", ATCTACTGAC: "MID88", CACGTAGATC: "MID89", TAGTATCAGC: "MID9", CACGTGTCGC: "MID90", CATACTCTAC: "MID91", CGACACTATC: "MID92", CGAGACGCGC: "MID93", CGTATGCGAC: "MID94", CGTCGATCTC: "MID95", CTACGACTGC: "MID96", CTAGTCACTC: "MID97", CTCTACGCTC: "MID98", CTGTACATAC: "MID99", } RLMID_HASH = { ACACGACGACT: "RL1", ACACGTAGTAT: "RL2", ACACTACTCGT: "RL3", ACGACACGTAT: "RL4", ACGAGTAGACT: "RL5", ACGCGTCTAGT: "RL6", ACGTACACACT: "RL7", ACGTACTGTGT: "RL8", ACGTAGATCGT: "RL9", ACTACGTCTCT: "RL10", ACTATACGAGT: "RL11", ACTCGCGTCGT: "RL12" } class MIDfinder def initialize(mid_hash, pos, size, max_mismatches) @mid_hash = mid_hash @pos = pos @size = size @max_mismatches = max_mismatches end def find_mid(seq) hamming_dist = 0 tag = seq[@pos ... @pos + @size].upcase.to_sym if @mid_hash.has_key? tag return MID.new(tag, @mid_hash[tag], @pos, @size, hamming_dist) elsif @max_mismatches > 0 @mid_hash.each_pair do |mid, mid_name| hamming_dist = tag.to_s.hamming_distance(mid.to_s) if hamming_dist <= @max_mismatches return MID.new(mid, @mid_hash[mid], @pos, @size, hamming_dist) end end end nil end private # Class to hold a MID object. class MID attr_accessor :tag, :name, :pos, :len, :mismatches def initialize(tag, name, pos, len, mismatches) @tag = tag @name = name @pos = pos @len = len @mismatches = mismatches end def to_hash hash = {} hash[:MID] = @tag hash[:MID_NAME] = @name hash[:MID_POS] = @pos hash[:MID_LEN] = @len hash[:MID_MISMATCHES] = @mismatches hash end def start @pos + @len end end end casts = [] casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>"0,1,2", :disallowed=>nil} casts << {:long=>'gsmids', :short=>'g', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'rlmids', :short=>'r', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) raise "Can't remove only GSMIDs and only RLMIDs simultaniously." if options[:rlmids] and options[:gsmids] pos = options[:pos] mismatches = options[:mismatches] gsfinder = MIDfinder.new(GSMID_HASH, options[:pos], GS_SIZE, options[:mismatches]) rlfinder = MIDfinder.new(RLMID_HASH, options[:pos], RL_SIZE, options[:mismatches]) Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| if record.has_key? :SEQ if options[:gsmids] mid = gsfinder.find_mid(record[:SEQ]) elsif options[:rlmids] mid = rlfinder.find_mid(record[:SEQ]) else mid = gsfinder.find_mid(record[:SEQ]) || rlfinder.find_mid(record[:SEQ]) end if mid record.merge!(mid.to_hash) record[:SEQ] = record[:SEQ][mid.start .. -1] record[:SCORES] = record[:SCORES][mid.start .. -1] if record[:SCORES] record[:SEQ_LEN] = record[:SEQ].length end end output.puts record end end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< __END__