#!/usr/bin/env ruby # Copyright (C) 2007-2011 Martin A. Hansen. # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # http://www.gnu.org/copyleft/gpl.html # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # This program is part of the Biopieces framework (www.biopieces.org). # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< # Find barcodes in sequences in the stream. # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< require 'maasha/biopieces' require 'maasha/bits' require 'pp' # Hard coded Roche Genome Sequencing MIDs GSMID_HASH = { ACGAGTGCGT: "MID1", TCTCTATGCG: "MID10", TAGACTGCAC: "MID100", TAGCGCGCGC: "MID101", TAGCTCTATC: "MID102", TATAGACATC: "MID103", TATGATACGC: "MID104", TCACTCATAC: "MID105", TCATCGAGTC: "MID106", TCGAGCTCTC: "MID107", TCGCAGACAC: "MID108", TCTGTCTCGC: "MID109", TGATACGTCT: "MID11", TGAGTGACGC: "MID110", TGATGTGTAC: "MID111", TGCTATAGAC: "MID112", TGCTCGCTAC: "MID113", ACGTGCAGCG: "MID114", ACTCACAGAG: "MID115", AGACTCAGCG: "MID116", AGAGAGTGTG: "MID117", AGCTATCGCG: "MID118", AGTCTGACTG: "MID119", TACTGAGCTA: "MID12", AGTGAGCTCG: "MID120", ATAGCTCTCG: "MID121", ATCACGTGCG: "MID122", ATCGTAGCAG: "MID123", ATCGTCTGTG: "MID124", ATGTACGATG: "MID125", ATGTGTCTAG: "MID126", CACACGATAG: "MID127", CACTCGCACG: "MID128", CAGACGTCTG: "MID129", CATAGTAGTG: "MID13", CAGTACTGCG: "MID130", CGACAGCGAG: "MID131", CGATCTGTCG: "MID132", CGCGTGCTAG: "MID133", CGCTCGAGTG: "MID134", CGTGATGACG: "MID135", CTATGTACAG: "MID136", CTCGATATAG: "MID137", CTCGCACGCG: "MID138", CTGCGTCACG: "MID139", CGAGAGATAC: "MID14", CTGTGCGTCG: "MID140", TAGCATACTG: "MID141", TATACATGTG: "MID142", TATCACTCAG: "MID143", TATCTGATAG: "MID144", TCGTGACATG: "MID145", TCTGATCGAG: "MID146", TGACATCTCG: "MID147", TGAGCTAGAG: "MID148", TGATAGAGCG: "MID149", ATACGACGTA: "MID15", TGCGTGTGCG: "MID150", TGCTAGTCAG: "MID151", TGTATCACAG: "MID152", TGTGCGCGTG: "MID153", TCACGTACTA: "MID16", CGTCTAGTAC: "MID17", TCTACGTAGC: "MID18", TGTACTACTC: "MID19", ACGCTCGACA: "MID2", ACGACTACAG: "MID20", CGTAGACTAG: "MID21", TACGAGTATG: "MID22", TACTCTCGTG: "MID23", TAGAGACGAG: "MID24", TCGTCGCTCG: "MID25", ACATACGCGT: "MID26", ACGCGAGTAT: "MID27", ACTACTATGT: "MID28", ACTGTACAGT: "MID29", AGACGCACTC: "MID3", AGACTATACT: "MID30", AGCGTCGTCT: "MID31", AGTACGCTAT: "MID32", ATAGAGTACT: "MID33", CACGCTACGT: "MID34", CAGTAGACGT: "MID35", CGACGTGACT: "MID36", TACACACACT: "MID37", TACACGTGAT: "MID38", TACAGATCGT: "MID39", AGCACTGTAG: "MID4", TACGCTGTCT: "MID40", TAGTGTAGAT: "MID41", TCGATCACGT: "MID42", TCGCACTAGT: "MID43", TCTAGCGACT: "MID44", TCTATACTAT: "MID45", TGACGTATGT: "MID46", TGTGAGTAGT: "MID47", ACAGTATATA: "MID48", ACGCGATCGA: "MID49", ATCAGACACG: "MID5", ACTAGCAGTA: "MID50", AGCTCACGTA: "MID51", AGTATACATA: "MID52", AGTCGAGAGA: "MID53", AGTGCTACGA: "MID54", CGATCGTATA: "MID55", CGCAGTACGA: "MID56", CGCGTATACA: "MID57", CGTACAGTCA: "MID58", CGTACTCAGA: "MID59", ATATCGCGAG: "MID6", CTACGCTCTA: "MID60", CTATAGCGTA: "MID61", TACGTCATCA: "MID62", TAGTCGCATA: "MID63", TATATATACA: "MID64", TATGCTAGTA: "MID65", TCACGCGAGA: "MID66", TCGATAGTGA: "MID67", TCGCTGCGTA: "MID68", TCTGACGTCA: "MID69", CGTGTCTCTA: "MID7", TGAGTCAGTA: "MID70", TGTAGTGTGA: "MID71", TGTCACACGA: "MID72", TGTCGTCGCA: "MID73", ACACATACGC: "MID74", ACAGTCGTGC: "MID75", ACATGACGAC: "MID76", ACGACAGCTC: "MID77", ACGTCTCATC: "MID78", ACTCATCTAC: "MID79", CTCGCGTGTC: "MID8", ACTCGCGCAC: "MID80", AGAGCGTCAC: "MID81", AGCGACTAGC: "MID82", AGTAGTGATC: "MID83", AGTGACACAC: "MID84", AGTGTATGTC: "MID85", ATAGATAGAC: "MID86", ATATAGTCGC: "MID87", ATCTACTGAC: "MID88", CACGTAGATC: "MID89", TAGTATCAGC: "MID9", CACGTGTCGC: "MID90", CATACTCTAC: "MID91", CGACACTATC: "MID92", CGAGACGCGC: "MID93", CGTATGCGAC: "MID94", CGTCGATCTC: "MID95", CTACGACTGC: "MID96", CTAGTCACTC: "MID97", CTCTACGCTC: "MID98", CTGTACATAC: "MID99" } # Hard coded Roche Rapid Labrary MIDs RLMID_HASH = { ACACGACGACT: "RL1", ACACGTAGTAT: "RL2", ACACTACTCGT: "RL3", ACGACACGTAT: "RL4", ACGAGTAGACT: "RL5", ACGCGTCTAGT: "RL6", ACGTACACACT: "RL7", ACGTACTGTGT: "RL8", ACGTAGATCGT: "RL9", ACTACGTCTCT: "RL10", ACTATACGAGT: "RL11", ACTCGCGTCGT: "RL12", AGACTCGACGT: "RL13", AGTACGAGAGT: "RL14", AGTACTACTAT: "RL15", AGTAGACGTCT: "RL16", AGTCGTACACT: "RL17", AGTGTAGTAGT: "RL18", ATAGTATACGT: "RL19", CAGTACGTACT: "RL20", CGACGACGCGT: "RL21", CGACGAGTACT: "RL22", CGATACTACGT: "RL23", CGTACGTCGAT: "RL24", CTACTCGTAGT: "RL25", GTACAGTACGT: "RL26", GTCGTACGTAT: "RL27", GTGTACGACGT: "RL28", ACACAGTGAGT: "RL29", ACACTCATACT: "RL30", ACAGACAGCGT: "RL31", ACAGACTATAT: "RL32", ACAGAGACTCT: "RL33", ACAGCTCGTGT: "RL34", ACAGTGTCGAT: "RL35", ACGAGCGCGCT: "RL36", ACGATGAGTGT: "RL37", ACGCGAGAGAT: "RL38", ACGCTCTCTCT: "RL39", ACGTCGCTGAT: "RL40", ACGTCTAGCAT: "RL41", ACTAGTGATAT: "RL42", ACTCACACTGT: "RL43", ACTCACTAGCT: "RL44", ACTCTATATAT: "RL45", ACTGATCTCGT: "RL46", ACTGCTGTACT: "RL47", ACTGTAGCGCT: "RL48", AGACACTCACT: "RL49", AGACATATAGT: "RL50" } class BarCodeFinderError < StandardError; end class BarCodeFinder def initialize(pos, max_mismatches) @pos = pos @max_mismatches = max_mismatches @barcode_hash = {} @barcode_sizes = [] end def parse_barcodes(file) length = 0 hash = {} File.open(file, "r") do |ios| while not ios.eof? name, barcode = ios.readline.chomp.split(/\s+/) if length > 0 raise BarCodeFinderError, "Uneven barcode length detected" if length != barcode.length else length = barcode.length end hash[barcode.upcase.to_sym] = name end end load_barcodes(hash) end def load_barcodes(hash) @barcode_sizes << hash.keys.first.to_s.size @barcode_hash.merge!(hash) end def find_barcode(seq) raise BarCodeFinderError, "No barcodes to find" if @barcode_hash.empty? @barcode_sizes.each do |size| hamming_dist = 0 barcode = seq[@pos ... @pos + size].upcase.to_sym if @barcode_hash[barcode] return BarCode.new(barcode, @barcode_hash[barcode], @pos, size, hamming_dist) elsif @max_mismatches > 0 @barcode_hash.each_key do |key| if key.to_s.length == barcode.to_s.size hamming_dist = barcode.to_s.hamming_distance(key.to_s) if hamming_dist <= @max_mismatches return BarCode.new(key, @barcode_hash[key], @pos, size, hamming_dist) end end end end end nil end private # Class to hold a BacCode object. class BarCode attr_accessor :barcode, :name, :pos, :len, :mismatches def initialize(barcode, name, pos, len, mismatches) @barcode = barcode @name = name @pos = pos @len = len @mismatches = mismatches end def to_hash hash = {} hash[:BARCODE] = @barcode hash[:BARCODE_NAME] = @name hash[:BARCODE_POS] = @pos hash[:BARCODE_LEN] = @len hash[:BARCODE_MISMATCHES] = @mismatches hash end def start @pos + @len end end end casts = [] casts << {:long=>'barcodes_in', :short=>'b', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>"0,1,2", :disallowed=>nil} casts << {:long=>'gsmids', :short=>'g', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'rlmids', :short=>'r', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} casts << {:long=>'remove', :short=>'R', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} options = Biopieces.options_parse(ARGV, casts) bc_finder = BarCodeFinder.new(options[:pos], options[:mismatches]) bc_finder.parse_barcodes(options[:barcodes_in]) if options[:barcodes_in] bc_finder.load_barcodes(GSMID_HASH) if options[:gsmids] bc_finder.load_barcodes(RLMID_HASH) if options[:rlmids] Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| input.each_record do |record| if record[:SEQ] if barcode = bc_finder.find_barcode(record[:SEQ]) record.merge!(barcode.to_hash) if options[:remove] record[:SEQ] = record[:SEQ][barcode.start .. -1] record[:SCORES] = record[:SCORES][barcode.start .. -1] if record[:SCORES] record[:SEQ_LEN] = record[:SEQ].length end end end output.puts record end end # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< __END__