From: martinahansen Date: Tue, 25 Oct 2011 14:52:19 +0000 (+0000) Subject: added find_barcodes biopiece X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=a271a3dec9077d8306cdf0e30196f994927eb7c5;p=biopieces.git added find_barcodes biopiece git-svn-id: http://biopieces.googlecode.com/svn/trunk@1599 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/find_barcodes b/bp_bin/find_barcodes new file mode 100755 index 0000000..aedea09 --- /dev/null +++ b/bp_bin/find_barcodes @@ -0,0 +1,337 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Find barcodes in sequences in the stream. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +require 'maasha/biopieces' +require 'maasha/bits' +require 'pp' + +# Hard coded Roche Genome Sequencing MIDs +GSMID_HASH = { + ACGAGTGCGT: "MID1", + TCTCTATGCG: "MID10", + TAGACTGCAC: "MID100", + TAGCGCGCGC: "MID101", + TAGCTCTATC: "MID102", + TATAGACATC: "MID103", + TATGATACGC: "MID104", + TCACTCATAC: "MID105", + TCATCGAGTC: "MID106", + TCGAGCTCTC: "MID107", + TCGCAGACAC: "MID108", + TCTGTCTCGC: "MID109", + TGATACGTCT: "MID11", + TGAGTGACGC: "MID110", + TGATGTGTAC: "MID111", + TGCTATAGAC: "MID112", + TGCTCGCTAC: "MID113", + ACGTGCAGCG: "MID114", + ACTCACAGAG: "MID115", + AGACTCAGCG: "MID116", + AGAGAGTGTG: "MID117", + AGCTATCGCG: "MID118", + AGTCTGACTG: "MID119", + TACTGAGCTA: "MID12", + AGTGAGCTCG: "MID120", + ATAGCTCTCG: "MID121", + ATCACGTGCG: "MID122", + ATCGTAGCAG: "MID123", + ATCGTCTGTG: "MID124", + ATGTACGATG: "MID125", + ATGTGTCTAG: "MID126", + CACACGATAG: "MID127", + CACTCGCACG: "MID128", + CAGACGTCTG: "MID129", + CATAGTAGTG: "MID13", + CAGTACTGCG: "MID130", + CGACAGCGAG: "MID131", + CGATCTGTCG: "MID132", + CGCGTGCTAG: "MID133", + CGCTCGAGTG: "MID134", + CGTGATGACG: "MID135", + CTATGTACAG: "MID136", + CTCGATATAG: "MID137", + CTCGCACGCG: "MID138", + CTGCGTCACG: "MID139", + CGAGAGATAC: "MID14", + CTGTGCGTCG: "MID140", + TAGCATACTG: "MID141", + TATACATGTG: "MID142", + TATCACTCAG: "MID143", + TATCTGATAG: "MID144", + TCGTGACATG: "MID145", + TCTGATCGAG: "MID146", + TGACATCTCG: "MID147", + TGAGCTAGAG: "MID148", + TGATAGAGCG: "MID149", + ATACGACGTA: "MID15", + TGCGTGTGCG: "MID150", + TGCTAGTCAG: "MID151", + TGTATCACAG: "MID152", + TGTGCGCGTG: "MID153", + TCACGTACTA: "MID16", + CGTCTAGTAC: "MID17", + TCTACGTAGC: "MID18", + TGTACTACTC: "MID19", + ACGCTCGACA: "MID2", + ACGACTACAG: "MID20", + CGTAGACTAG: "MID21", + TACGAGTATG: "MID22", + TACTCTCGTG: "MID23", + TAGAGACGAG: "MID24", + TCGTCGCTCG: "MID25", + ACATACGCGT: "MID26", + ACGCGAGTAT: "MID27", + ACTACTATGT: "MID28", + ACTGTACAGT: "MID29", + AGACGCACTC: "MID3", + AGACTATACT: "MID30", + AGCGTCGTCT: "MID31", + AGTACGCTAT: "MID32", + ATAGAGTACT: "MID33", + CACGCTACGT: "MID34", + CAGTAGACGT: "MID35", + CGACGTGACT: "MID36", + TACACACACT: "MID37", + TACACGTGAT: "MID38", + TACAGATCGT: "MID39", + AGCACTGTAG: "MID4", + TACGCTGTCT: "MID40", + TAGTGTAGAT: "MID41", + TCGATCACGT: "MID42", + TCGCACTAGT: "MID43", + TCTAGCGACT: "MID44", + TCTATACTAT: "MID45", + TGACGTATGT: "MID46", + TGTGAGTAGT: "MID47", + ACAGTATATA: "MID48", + ACGCGATCGA: "MID49", + ATCAGACACG: "MID5", + ACTAGCAGTA: "MID50", + AGCTCACGTA: "MID51", + AGTATACATA: "MID52", + AGTCGAGAGA: "MID53", + AGTGCTACGA: "MID54", + CGATCGTATA: "MID55", + CGCAGTACGA: "MID56", + CGCGTATACA: "MID57", + CGTACAGTCA: "MID58", + CGTACTCAGA: "MID59", + ATATCGCGAG: "MID6", + CTACGCTCTA: "MID60", + CTATAGCGTA: "MID61", + TACGTCATCA: "MID62", + TAGTCGCATA: "MID63", + TATATATACA: "MID64", + TATGCTAGTA: "MID65", + TCACGCGAGA: "MID66", + TCGATAGTGA: "MID67", + TCGCTGCGTA: "MID68", + TCTGACGTCA: "MID69", + CGTGTCTCTA: "MID7", + TGAGTCAGTA: "MID70", + TGTAGTGTGA: "MID71", + TGTCACACGA: "MID72", + TGTCGTCGCA: "MID73", + ACACATACGC: "MID74", + ACAGTCGTGC: "MID75", + ACATGACGAC: "MID76", + ACGACAGCTC: "MID77", + ACGTCTCATC: "MID78", + ACTCATCTAC: "MID79", + CTCGCGTGTC: "MID8", + ACTCGCGCAC: "MID80", + AGAGCGTCAC: "MID81", + AGCGACTAGC: "MID82", + AGTAGTGATC: "MID83", + AGTGACACAC: "MID84", + AGTGTATGTC: "MID85", + ATAGATAGAC: "MID86", + ATATAGTCGC: "MID87", + ATCTACTGAC: "MID88", + CACGTAGATC: "MID89", + TAGTATCAGC: "MID9", + CACGTGTCGC: "MID90", + CATACTCTAC: "MID91", + CGACACTATC: "MID92", + CGAGACGCGC: "MID93", + CGTATGCGAC: "MID94", + CGTCGATCTC: "MID95", + CTACGACTGC: "MID96", + CTAGTCACTC: "MID97", + CTCTACGCTC: "MID98", + CTGTACATAC: "MID99", +} + +# Hard coded Roche Rapid Labrary MIDs +RLMID_HASH = { + ACACGACGACT: "RL1", + ACACGTAGTAT: "RL2", + ACACTACTCGT: "RL3", + ACGACACGTAT: "RL4", + ACGAGTAGACT: "RL5", + ACGCGTCTAGT: "RL6", + ACGTACACACT: "RL7", + ACGTACTGTGT: "RL8", + ACGTAGATCGT: "RL9", + ACTACGTCTCT: "RL10", + ACTATACGAGT: "RL11", + ACTCGCGTCGT: "RL12" +} + +class BarCodeFinderError < StandardError; end + +class BarCodeFinder + def initialize(pos, max_mismatches) + @pos = pos + @max_mismatches = max_mismatches + @barcode_hash = {} + @barcode_sizes = [] + end + + def parse_barcodes(file) + length = 0 + hash = {} + + File.open(file, "r") do |ios| + while not ios.eof? + name, barcode = ios.readline.chomp.split(/\s+/) + + if length > 0 + raise BarCodeFinderError, "Uneven barcode length detected" if length != barcode.length + else + length = barcode.length + end + + hash[barcode.upcase.to_sym] = name + end + end + + load_barcodes(hash) + end + + def load_barcodes(hash) + @barcode_sizes << hash.keys.first.to_s.size + + @barcode_hash.merge!(hash) + end + + def find_barcode(seq) + raise BarCodeFinderError, "No barcodes to find" if @barcode_hash.empty? + + @barcode_sizes.each do |size| + hamming_dist = 0 + barcode = seq[@pos ... @pos + size].upcase.to_sym + + if @barcode_hash.has_key? barcode + return BarCode.new(barcode, @barcode_hash[barcode], @pos, size, hamming_dist) + elsif @max_mismatches > 0 + @barcode_hash.each_key do |key| + if key.to_s.length == barcode.to_s.size + hamming_dist = barcode.to_s.hamming_distance(key.to_s) + + if hamming_dist <= @max_mismatches + return BarCode.new(key, @barcode_hash[key], @pos, size, hamming_dist) + end + end + end + end + end + + nil + end + + private + + # Class to hold a BacCode object. + class BarCode + attr_accessor :barcode, :name, :pos, :len, :mismatches + + def initialize(barcode, name, pos, len, mismatches) + @barcode = barcode + @name = name + @pos = pos + @len = len + @mismatches = mismatches + end + + def to_hash + hash = {} + hash[:BARCODE] = @barcode + hash[:BARCODE_NAME] = @name + hash[:BARCODE_POS] = @pos + hash[:BARCODE_LEN] = @len + hash[:BARCODE_MISMATCHES] = @mismatches + hash + end + + def start + @pos + @len + end + end +end + +casts = [] +casts << {:long=>'barcodes_in', :short=>'b', :type=>'file!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'pos', :short=>'p', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'mismatches', :short=>'m', :type=>'uint', :mandatory=>false, :default=>0, :allowed=>"0,1,2", :disallowed=>nil} +casts << {:long=>'gsmids', :short=>'g', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'rlmids', :short=>'r', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} +casts << {:long=>'remove', :short=>'R', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil} + +options = Biopieces.options_parse(ARGV, casts) + +bc_finder = BarCodeFinder.new(options[:pos], options[:mismatches]) +bc_finder.parse_barcodes(options[:barcodes_in]) if options[:barcodes_in] +bc_finder.load_barcodes(GSMID_HASH) if options[:gsmids] +bc_finder.load_barcodes(RLMID_HASH) if options[:rlmids] + +Biopieces.open(options[:stream_in], options[:stream_out]) do |input, output| + input.each_record do |record| + if record.has_key? :SEQ + if barcode = bc_finder.find_barcode(record[:SEQ]) + record.merge!(barcode.to_hash) + + if options[:remove] + record[:SEQ] = record[:SEQ][barcode.start .. -1] + record[:SCORES] = record[:SCORES][barcode.start .. -1] if record[:SCORES] + record[:SEQ_LEN] = record[:SEQ].length + end + end + end + + output.puts record + end +end + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/bp_test/in/find_barcodes.in.1 b/bp_test/in/find_barcodes.in.1 new file mode 100644 index 0000000..1a74b61 --- /dev/null +++ b/bp_test/in/find_barcodes.in.1 @@ -0,0 +1,48 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/in/find_barcodes.in.2 b/bp_test/in/find_barcodes.in.2 new file mode 100644 index 0000000..2e88d2f --- /dev/null +++ b/bp_test/in/find_barcodes.in.2 @@ -0,0 +1 @@ +Index_1 ATCACG diff --git a/bp_test/out/find_barcodes.out.1 b/bp_test/out/find_barcodes.out.1 new file mode 100644 index 0000000..dfa125e --- /dev/null +++ b/bp_test/out/find_barcodes.out.1 @@ -0,0 +1,53 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 0 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/out/find_barcodes.out.10 b/bp_test/out/find_barcodes.out.10 new file mode 100644 index 0000000..5d1912a --- /dev/null +++ b/bp_test/out/find_barcodes.out.10 @@ -0,0 +1,63 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 2 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/out/find_barcodes.out.11 b/bp_test/out/find_barcodes.out.11 new file mode 100644 index 0000000..392dd2b --- /dev/null +++ b/bp_test/out/find_barcodes.out.11 @@ -0,0 +1,78 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +BARCODE: ACATGACGAC +BARCODE_NAME: MID76 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +BARCODE: ACATGACGAC +BARCODE_NAME: MID76 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 2 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 2 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 2 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/out/find_barcodes.out.12 b/bp_test/out/find_barcodes.out.12 new file mode 100644 index 0000000..7187155 --- /dev/null +++ b/bp_test/out/find_barcodes.out.12 @@ -0,0 +1,93 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +BARCODE: ACATGACGAC +BARCODE_NAME: MID76 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +BARCODE: ATCACG +BARCODE_NAME: Index_1 +BARCODE_POS: 4 +BARCODE_LEN: 6 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +BARCODE: ACATGACGAC +BARCODE_NAME: MID76 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 2 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +BARCODE: ATCACG +BARCODE_NAME: Index_1 +BARCODE_POS: 4 +BARCODE_LEN: 6 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 2 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 2 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +BARCODE: ATCACG +BARCODE_NAME: Index_1 +BARCODE_POS: 4 +BARCODE_LEN: 6 +BARCODE_MISMATCHES: 2 +--- diff --git a/bp_test/out/find_barcodes.out.13 b/bp_test/out/find_barcodes.out.13 new file mode 100644 index 0000000..fed8c2d --- /dev/null +++ b/bp_test/out/find_barcodes.out.13 @@ -0,0 +1,78 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: +SEQ_LEN: 0 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_rl_4p +SEQ: T +SEQ_LEN: 1 +BARCODE: ACATGACGAC +BARCODE_NAME: MID76 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: +SEQ_LEN: 0 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: T +SEQ_LEN: 1 +BARCODE: ACATGACGAC +BARCODE_NAME: MID76 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 2 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: +SEQ_LEN: 0 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 2 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: +SEQ_LEN: 0 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 2 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/out/find_barcodes.out.2 b/bp_test/out/find_barcodes.out.2 new file mode 100644 index 0000000..88e38ef --- /dev/null +++ b/bp_test/out/find_barcodes.out.2 @@ -0,0 +1,58 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 0 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 0 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/out/find_barcodes.out.3 b/bp_test/out/find_barcodes.out.3 new file mode 100644 index 0000000..f6939ab --- /dev/null +++ b/bp_test/out/find_barcodes.out.3 @@ -0,0 +1,63 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 0 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 0 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +BARCODE: ATCACG +BARCODE_NAME: Index_1 +BARCODE_POS: 0 +BARCODE_LEN: 6 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/out/find_barcodes.out.4 b/bp_test/out/find_barcodes.out.4 new file mode 100644 index 0000000..63655c1 --- /dev/null +++ b/bp_test/out/find_barcodes.out.4 @@ -0,0 +1,53 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/out/find_barcodes.out.5 b/bp_test/out/find_barcodes.out.5 new file mode 100644 index 0000000..b3bb5da --- /dev/null +++ b/bp_test/out/find_barcodes.out.5 @@ -0,0 +1,58 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/out/find_barcodes.out.6 b/bp_test/out/find_barcodes.out.6 new file mode 100644 index 0000000..d01dc99 --- /dev/null +++ b/bp_test/out/find_barcodes.out.6 @@ -0,0 +1,63 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +BARCODE: ATCACG +BARCODE_NAME: Index_1 +BARCODE_POS: 4 +BARCODE_LEN: 6 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/out/find_barcodes.out.7 b/bp_test/out/find_barcodes.out.7 new file mode 100644 index 0000000..6895222 --- /dev/null +++ b/bp_test/out/find_barcodes.out.7 @@ -0,0 +1,58 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/out/find_barcodes.out.8 b/bp_test/out/find_barcodes.out.8 new file mode 100644 index 0000000..4cae1ef --- /dev/null +++ b/bp_test/out/find_barcodes.out.8 @@ -0,0 +1,68 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +BARCODE: ACATGACGAC +BARCODE_NAME: MID76 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/out/find_barcodes.out.9 b/bp_test/out/find_barcodes.out.9 new file mode 100644 index 0000000..178ebda --- /dev/null +++ b/bp_test/out/find_barcodes.out.9 @@ -0,0 +1,78 @@ +SEQ_NAME: test_gs +SEQ: ACGAGTGCGT +SEQ_LEN: 10 +--- +SEQ_NAME: test_rl +SEQ: ACACGACGACT +SEQ_LEN: 11 +--- +SEQ_NAME: test_Il +SEQ: ATCACG +SEQ_LEN: 6 +--- +SEQ_NAME: test_gs_4p +SEQ: atcgACGAGTGCGT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_rl_4p +SEQ: atcgACACGACGACT +SEQ_LEN: 15 +BARCODE: ACATGACGAC +BARCODE_NAME: MID76 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_Il_4p +SEQ: atcgATCACG +SEQ_LEN: 10 +BARCODE: ATCACG +BARCODE_NAME: Index_1 +BARCODE_POS: 4 +BARCODE_LEN: 6 +BARCODE_MISMATCHES: 0 +--- +SEQ_NAME: test_gs_4p_1mm +SEQ: atcgAgGAGTGCGT +SEQ_LEN: 14 +BARCODE: ACGAGTGCGT +BARCODE_NAME: MID1 +BARCODE_POS: 4 +BARCODE_LEN: 10 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_rl_4p_1mm +SEQ: atcgAgACGACGACT +SEQ_LEN: 15 +BARCODE: ACACGACGACT +BARCODE_NAME: RL1 +BARCODE_POS: 4 +BARCODE_LEN: 11 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_Il_4p_1mm +SEQ: atcgAaCACG +SEQ_LEN: 10 +BARCODE: ATCACG +BARCODE_NAME: Index_1 +BARCODE_POS: 4 +BARCODE_LEN: 6 +BARCODE_MISMATCHES: 1 +--- +SEQ_NAME: test_gs_4p_2mm +SEQ: atcgAgGAGTGCcT +SEQ_LEN: 14 +--- +SEQ_NAME: test_rl_4p_2mm +SEQ: atcgAgACGACGAgT +SEQ_LEN: 15 +--- +SEQ_NAME: test_Il_4p_2mm +SEQ: atcgAaCAgG +SEQ_LEN: 10 +--- diff --git a/bp_test/test/test_find_barcodes b/bp_test/test/test_find_barcodes new file mode 100755 index 0000000..c8317bf --- /dev/null +++ b/bp_test/test/test_find_barcodes @@ -0,0 +1,55 @@ +#!/bin/bash + +source "$BP_DIR/bp_test/lib/test.sh" + +run "$bp -I $in.1 -r -O $tmp" +assert_no_diff $tmp $out.1 +clean + +run "$bp -I $in.1 -rg -O $tmp" +assert_no_diff $tmp $out.2 +clean + +run "$bp -I $in.1 -rgb $in.2 -O $tmp" +assert_no_diff $tmp $out.3 +clean + +run "$bp -I $in.1 -p 4 -r -O $tmp" +assert_no_diff $tmp $out.4 +clean + +run "$bp -I $in.1 -p 4 -rg -O $tmp" +assert_no_diff $tmp $out.5 +clean + +run "$bp -I $in.1 -p 4 -rgb $in.2 -O $tmp" +assert_no_diff $tmp $out.6 +clean + +run "$bp -I $in.1 -p 4 -m 1 -r -O $tmp" +assert_no_diff $tmp $out.7 +clean + +run "$bp -I $in.1 -p 4 -m 1 -rg -O $tmp" +assert_no_diff $tmp $out.8 +clean + +run "$bp -I $in.1 -p 4 -m 1 -rgb $in.2 -O $tmp" +assert_no_diff $tmp $out.9 +clean + +run "$bp -I $in.1 -p 4 -m 2 -r -O $tmp" +assert_no_diff $tmp $out.10 +clean + +run "$bp -I $in.1 -p 4 -m 2 -rg -O $tmp" +assert_no_diff $tmp $out.11 +clean + +run "$bp -I $in.1 -p 4 -m 2 -rgb $in.2 -O $tmp" +assert_no_diff $tmp $out.12 +clean + +run "$bp -I $in.1 -p 4 -m 2 -Rrg -O $tmp" +assert_no_diff $tmp $out.13 +clean diff --git a/code_ruby/lib/maasha/seq.rb b/code_ruby/lib/maasha/seq.rb index 394d6fb..c3e4839 100644 --- a/code_ruby/lib/maasha/seq.rb +++ b/code_ruby/lib/maasha/seq.rb @@ -282,6 +282,7 @@ class Seq # Method to shuffle a sequence readomly inline. def shuffle! self.seq = self.seq.split('').shuffle!.join + self end # Method that returns a subsequence of from a given start position