--- /dev/null
+#!/usr/bin/env ruby
+
+# Copyright (C) 2007-2011 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# This program is part of the Biopieces framework (www.biopieces.org).
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+# Find barcodes in sequences in the stream.
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+require 'maasha/biopieces'
+require 'maasha/bits'
+require 'pp'
+
+# Hard coded Roche Genome Sequencing MIDs
+GSMID_HASH = {
+ ACGAGTGCGT: "MID1",
+ TCTCTATGCG: "MID10",
+ TAGACTGCAC: "MID100",
+ TAGCGCGCGC: "MID101",
+ TAGCTCTATC: "MID102",
+ TATAGACATC: "MID103",
+ TATGATACGC: "MID104",
+ TCACTCATAC: "MID105",
+ TCATCGAGTC: "MID106",
+ TCGAGCTCTC: "MID107",
+ TCGCAGACAC: "MID108",
+ TCTGTCTCGC: "MID109",
+ TGATACGTCT: "MID11",
+ TGAGTGACGC: "MID110",
+ TGATGTGTAC: "MID111",
+ TGCTATAGAC: "MID112",
+ TGCTCGCTAC: "MID113",
+ ACGTGCAGCG: "MID114",
+ ACTCACAGAG: "MID115",
+ AGACTCAGCG: "MID116",
+ AGAGAGTGTG: "MID117",
+ AGCTATCGCG: "MID118",
+ AGTCTGACTG: "MID119",
+ TACTGAGCTA: "MID12",
+ AGTGAGCTCG: "MID120",
+ ATAGCTCTCG: "MID121",
+ ATCACGTGCG: "MID122",
+ ATCGTAGCAG: "MID123",
+ ATCGTCTGTG: "MID124",
+ ATGTACGATG: "MID125",
+ ATGTGTCTAG: "MID126",
+ CACACGATAG: "MID127",
+ CACTCGCACG: "MID128",
+ CAGACGTCTG: "MID129",
+ CATAGTAGTG: "MID13",
+ CAGTACTGCG: "MID130",
+ CGACAGCGAG: "MID131",
+ CGATCTGTCG: "MID132",
+ CGCGTGCTAG: "MID133",
+ CGCTCGAGTG: "MID134",
+ CGTGATGACG: "MID135",
+ CTATGTACAG: "MID136",
+ CTCGATATAG: "MID137",
+ CTCGCACGCG: "MID138",
+ CTGCGTCACG: "MID139",
+ CGAGAGATAC: "MID14",
+ CTGTGCGTCG: "MID140",
+ TAGCATACTG: "MID141",
+ TATACATGTG: "MID142",
+ TATCACTCAG: "MID143",
+ TATCTGATAG: "MID144",
+ TCGTGACATG: "MID145",
+ TCTGATCGAG: "MID146",
+ TGACATCTCG: "MID147",
+ TGAGCTAGAG: "MID148",
+ TGATAGAGCG: "MID149",
+ ATACGACGTA: "MID15",
+ TGCGTGTGCG: "MID150",
+ TGCTAGTCAG: "MID151",
+ TGTATCACAG: "MID152",
+ TGTGCGCGTG: "MID153",
+ TCACGTACTA: "MID16",
+ CGTCTAGTAC: "MID17",
+ TCTACGTAGC: "MID18",
+ TGTACTACTC: "MID19",
+ ACGCTCGACA: "MID2",
+ ACGACTACAG: "MID20",
+ CGTAGACTAG: "MID21",
+ TACGAGTATG: "MID22",
+ TACTCTCGTG: "MID23",
+ TAGAGACGAG: "MID24",
+ TCGTCGCTCG: "MID25",
+ ACATACGCGT: "MID26",
+ ACGCGAGTAT: "MID27",
+ ACTACTATGT: "MID28",
+ ACTGTACAGT: "MID29",
+ AGACGCACTC: "MID3",
+ AGACTATACT: "MID30",
+ AGCGTCGTCT: "MID31",
+ AGTACGCTAT: "MID32",
+ ATAGAGTACT: "MID33",
+ CACGCTACGT: "MID34",
+ CAGTAGACGT: "MID35",
+ CGACGTGACT: "MID36",
+ TACACACACT: "MID37",
+ TACACGTGAT: "MID38",
+ TACAGATCGT: "MID39",
+ AGCACTGTAG: "MID4",
+ TACGCTGTCT: "MID40",
+ TAGTGTAGAT: "MID41",
+ TCGATCACGT: "MID42",
+ TCGCACTAGT: "MID43",
+ TCTAGCGACT: "MID44",
+ TCTATACTAT: "MID45",
+ TGACGTATGT: "MID46",
+ TGTGAGTAGT: "MID47",
+ ACAGTATATA: "MID48",
+ ACGCGATCGA: "MID49",
+ ATCAGACACG: "MID5",
+ ACTAGCAGTA: "MID50",
+ AGCTCACGTA: "MID51",
+ AGTATACATA: "MID52",
+ AGTCGAGAGA: "MID53",
+ AGTGCTACGA: "MID54",
+ CGATCGTATA: "MID55",
+ CGCAGTACGA: "MID56",
+ CGCGTATACA: "MID57",
+ CGTACAGTCA: "MID58",
+ CGTACTCAGA: "MID59",
+ ATATCGCGAG: "MID6",
+ CTACGCTCTA: "MID60",
+ CTATAGCGTA: "MID61",
+ TACGTCATCA: "MID62",
+ TAGTCGCATA: "MID63",
+ TATATATACA: "MID64",
+ TATGCTAGTA: "MID65",
+ TCACGCGAGA: "MID66",
+ TCGATAGTGA: "MID67",
+ TCGCTGCGTA: "MID68",
+ TCTGACGTCA: "MID69",
+ CGTGTCTCTA: "MID7",
+ TGAGTCAGTA: "MID70",
+ TGTAGTGTGA: "MID71",
+ TGTCACACGA: "MID72",
+ TGTCGTCGCA: "MID73",
+ ACACATACGC: "MID74",
+ ACAGTCGTGC: "MID75",
+ ACATGACGAC: "MID76",
+ ACGACAGCTC: "MID77",
+ ACGTCTCATC: "MID78",
+ ACTCATCTAC: "MID79",
+ CTCGCGTGTC: "MID8",
+ ACTCGCGCAC: "MID80",
+ AGAGCGTCAC: "MID81",
+ AGCGACTAGC: "MID82",
+ AGTAGTGATC: "MID83",
+ AGTGACACAC: "MID84",
+ AGTGTATGTC: "MID85",
+ ATAGATAGAC: "MID86",
+ ATATAGTCGC: "MID87",
+ ATCTACTGAC: "MID88",
+ CACGTAGATC: "MID89",
+ TAGTATCAGC: "MID9",
+ CACGTGTCGC: "MID90",
+ CATACTCTAC: "MID91",
+ CGACACTATC: "MID92",
+ CGAGACGCGC: "MID93",
+ CGTATGCGAC: "MID94",
+ CGTCGATCTC: "MID95",
+ CTACGACTGC: "MID96",
+ CTAGTCACTC: "MID97",
+ CTCTACGCTC: "MID98",
+ CTGTACATAC: "MID99",
+}
+
+# Hard coded Roche Rapid Labrary MIDs
+RLMID_HASH = {
+ ACACGACGACT: "RL1",
+ ACACGTAGTAT: "RL2",
+ ACACTACTCGT: "RL3",
+ ACGACACGTAT: "RL4",
+ ACGAGTAGACT: "RL5",
+ ACGCGTCTAGT: "RL6",
+ ACGTACACACT: "RL7",
+ ACGTACTGTGT: "RL8",
+ ACGTAGATCGT: "RL9",
+ ACTACGTCTCT: "RL10",
+ ACTATACGAGT: "RL11",
+ ACTCGCGTCGT: "RL12"
+}
+
+class BarCodeFinderError < StandardError; end
+
+class BarCodeFinder
+ def initialize(pos, max_mismatches)
+ @pos = pos
+ @max_mismatches = max_mismatches
+ @barcode_hash = {}
+ @barcode_sizes = []
+ end
+
+ def parse_barcodes(file)
+ length = 0
+ hash = {}
+
+ File.open(file, "r") do |ios|
+ while not ios.eof?
+ name, barcode = ios.readline.chomp.split(/\s+/)
+
+ if length > 0
+ raise BarCodeFinderError, "Uneven barcode length detected" if length != barcode.length
+ else
+ length = barcode.length
+ end
+
+ hash[barcode.upcase.to_sym] = name
+ end
+ end
+
+ load_barcodes(hash)
+ end
+
+ def load_barcodes(hash)
+ @barcode_sizes << hash.keys.first.to_s.size
+
+ @barcode_hash.merge!(hash)
+ end
+
+ def find_barcode(seq)
+ raise BarCodeFinderError, "No barcodes to find" if @barcode_hash.empty?
+
+ @barcode_sizes.each do |size|
+ hamming_dist = 0
+ barcode = seq[@pos ... @pos + size].upcase.to_sym
+
+ if @barcode_hash.has_key? barcode
+ return BarCode.new(barcode, @barcode_hash[barcode], @pos, size, hamming_dist)
+ elsif @max_mismatches > 0
+ @barcode_hash.each_key do |key|
+ if key.to_s.length == barcode.to_s.size
+ hamming_dist = barcode.to_s.hamming_distance(key.to_s)
+
+ if hamming_dist <= @max_mismatches
+ return BarCode.new(key, @barcode_hash[key], @pos, size, hamming_dist)
+ end
+ end
+ end
+ end
+ end
+
+ nil
+ end
+
+ private
+
+ # Class to hold a BacCode object.
+ class BarCode
+ attr_accessor :barcode, :name, :pos, :len, :mismatches
+
+ def initialize(barcode, name, pos, len, mismatches)
+ @barcode = barcode
+ @name = name
+ @pos = pos
+ @len = len
+ @mismatches = mismatches
+ end
+
+ def to_hash
+ hash = {}
+ hash[:BARCODE] = @barcode
+ hash[:BARCODE_NAME] = @name
+ hash[:BARCODE_POS] = @pos
+ hash[:BARCODE_LEN] = @len
+ hash[:BARCODE_MISMATCHES] = @mismatches
+ hash
+ end
+
+ def start
+ @pos + @len
+ end
+ end
+end
+
+casts = []
+casts << {:long=>'barcodes_in', :short=>'b', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>"0,1,2", :disallowed=>nil}
+casts << {:long=>'gsmids', :short=>'g', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'rlmids', :short=>'r', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'remove', :short=>'R', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+
+options = Biopieces.options_parse(ARGV, casts)
+
+bc_finder = BarCodeFinder.new(options[:pos], options[:mismatches])
+bc_finder.parse_barcodes(options[:barcodes_in]) if options[:barcodes_in]
+bc_finder.load_barcodes(GSMID_HASH) if options[:gsmids]
+bc_finder.load_barcodes(RLMID_HASH) if options[:rlmids]
+
+Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output|
+ input.each_record do |record|
+ if record.has_key? :SEQ
+ if barcode = bc_finder.find_barcode(record[:SEQ])
+ record.merge!(barcode.to_hash)
+
+ if options[:remove]
+ record[:SEQ] = record[:SEQ][barcode.start .. -1]
+ record[:SCORES] = record[:SCORES][barcode.start .. -1] if record[:SCORES]
+ record[:SEQ_LEN] = record[:SEQ].length
+ end
+ end
+ end
+
+ output.puts record
+ end
+end
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+__END__
--- /dev/null
+#!/bin/bash
+
+source "$BP_DIR/bp_test/lib/test.sh"
+
+run "$bp -I $in.1 -r -O $tmp"
+assert_no_diff $tmp $out.1
+clean
+
+run "$bp -I $in.1 -rg -O $tmp"
+assert_no_diff $tmp $out.2
+clean
+
+run "$bp -I $in.1 -rgb $in.2 -O $tmp"
+assert_no_diff $tmp $out.3
+clean
+
+run "$bp -I $in.1 -p 4 -r -O $tmp"
+assert_no_diff $tmp $out.4
+clean
+
+run "$bp -I $in.1 -p 4 -rg -O $tmp"
+assert_no_diff $tmp $out.5
+clean
+
+run "$bp -I $in.1 -p 4 -rgb $in.2 -O $tmp"
+assert_no_diff $tmp $out.6
+clean
+
+run "$bp -I $in.1 -p 4 -m 1 -r -O $tmp"
+assert_no_diff $tmp $out.7
+clean
+
+run "$bp -I $in.1 -p 4 -m 1 -rg -O $tmp"
+assert_no_diff $tmp $out.8
+clean
+
+run "$bp -I $in.1 -p 4 -m 1 -rgb $in.2 -O $tmp"
+assert_no_diff $tmp $out.9
+clean
+
+run "$bp -I $in.1 -p 4 -m 2 -r -O $tmp"
+assert_no_diff $tmp $out.10
+clean
+
+run "$bp -I $in.1 -p 4 -m 2 -rg -O $tmp"
+assert_no_diff $tmp $out.11
+clean
+
+run "$bp -I $in.1 -p 4 -m 2 -rgb $in.2 -O $tmp"
+assert_no_diff $tmp $out.12
+clean
+
+run "$bp -I $in.1 -p 4 -m 2 -Rrg -O $tmp"
+assert_no_diff $tmp $out.13
+clean