From: martinahansen Date: Thu, 4 Dec 2008 10:18:04 +0000 (+0000) Subject: added python code X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=cd93aa2405f5ca4e2b4d0a0c36508dcb9f5ce73b;p=biopieces.git added python code git-svn-id: http://biopieces.googlecode.com/svn/trunk@328 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/lowercase_seq b/bp_bin/lowercase_seq new file mode 120000 index 0000000..16158e9 --- /dev/null +++ b/bp_bin/lowercase_seq @@ -0,0 +1 @@ +../code_python/Cjung/lowercase_seq \ No newline at end of file diff --git a/bp_doc/logo.svg b/bp_doc/logo.svg new file mode 100644 index 0000000..1759293 --- /dev/null +++ b/bp_doc/logo.svg @@ -0,0 +1,334 @@ + + + + + N + + + + A + + + + C + + + + G + + + + U + + + + C + + + + G + + + + U + + + + A + + + + G + + + + A + + + + U + + + + C + + + + U + + + + C + + + + A + + + + G + + + + U + + + + A + + + + C + + + + G + + + + G + + + + U + + + + C + + + + U + + + + A + + + + G + + + + C + + + + A + + + + G + + + + U + + + + G + + + + U + + + + C + + + + A + + + + G + + + + U + + + + C + + + + C + + + + G + + + + U + + + + A + + + + A + + + + G + + + + C + + + + U + + + + C + + + + U + + + + G + + + + A + + + + C + + + + U + + + + A + + + + G + + + + G + + + + U + + + + C + + + + U + + + + C + + + + A + + + + G + + + + C + + + + U + + + + A + + + + G + + + + A + + + + U + + + + G + + + + C + + + + U + + + + C + + + + A + + + + G + + + + C + + + + A + + + + G + + + + G + + + + U + + + + + + 2 + + + 1 + + + 0 + + + bits + + \ No newline at end of file diff --git a/code_c/Maasha/src/bed2fixedstep.c b/code_c/Maasha/src/bed2fixedstep.c index a060437..0463641 100644 --- a/code_c/Maasha/src/bed2fixedstep.c +++ b/code_c/Maasha/src/bed2fixedstep.c @@ -1,6 +1,5 @@ #include "common.h" #include "mem.h" -#include "filesys.h" #include "list.h" #include "ucsc.h" #include "hash.h" @@ -10,6 +9,19 @@ #define HASH_SIZE 8 #define BARRAY_SIZE ( 1 << 16 ) +static void usage() +{ + fprintf( stderr, + "\n" + "bed2fixedstep - collapse overlapping BED entries using a score\n" + "based on the last number following a _ in the 'name' column.\n" + "if no such number is found 1 is used.\n" + "\n" + "Usage: bed2fixedstep < > \n\n" + ); + + exit( EXIT_FAILURE ); +} long get_score( char *str ) { @@ -32,8 +44,6 @@ long get_score( char *str ) int main( int argc, char *argv[] ) { - char *file = NULL; - FILE *fp = NULL; bed_entry *entry = NULL; hash *chr_hash = NULL; hash_elem *bucket = NULL; @@ -49,10 +59,11 @@ int main( int argc, char *argv[] ) entry = bed_entry_new( BED_COLS ); chr_hash = hash_new( HASH_SIZE ); - file = argv[ argc - 1 ]; - fp = read_open( file ); - - while ( ( bed_entry_get( fp, &entry ) ) ) + if ( ! stdin ) { + usage(); + } + + while ( ( bed_entry_get( stdin, &entry ) ) ) { // bed_entry_put( entry, entry->cols ); @@ -70,8 +81,6 @@ int main( int argc, char *argv[] ) barray_interval_inc( ba, entry->chr_beg, entry->chr_end - 1, score ); } - close_stream( fp ); - // barray_print( ba ); for ( i = 0; i < chr_hash->table_size; i++ ) @@ -87,7 +96,7 @@ int main( int argc, char *argv[] ) { // printf( "chr: %s pos: %zu beg: %zu end: %zu\n", chr, pos, beg, end ); - printf( "fixedStep chrom=%s start=%zu step=1\n", chr, beg ); + printf( "fixedStep chrom=%s start=%zu step=1\n", chr, beg + 1 ); for ( j = beg; j <= end; j++ ) { printf( "%hd\n", ba->array[ j ] ); diff --git a/code_c/Maasha/src/lib/barray.c b/code_c/Maasha/src/lib/barray.c index 68ee72e..08fdc02 100644 --- a/code_c/Maasha/src/lib/barray.c +++ b/code_c/Maasha/src/lib/barray.c @@ -3,7 +3,8 @@ #include "common.h" #include "mem.h" #include "barray.h" - + +#define SIZE_BLOCK 10000000 barray *barray_new( size_t nmemb ) { @@ -32,9 +33,11 @@ size_t barray_new_size( size_t nmemb_old ) size_t nmemb_new = 1; while ( nmemb_new < nmemb_old ) { - nmemb_new <<= 1; + nmemb_new += SIZE_BLOCK; } +// fprintf( stderr, "New size: %zu\n", nmemb_new ); + return nmemb_new; } diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 53a0a24..5f6dd07 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -3153,6 +3153,10 @@ sub script_get_genome_align { $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } ); } + elsif ( $record->{ "REC_TYPE" } eq "VMATCH" ) + { + $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } ); + } elsif ( $record->{ "REC_TYPE" } eq "PSL" ) { $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } ); @@ -3675,7 +3679,7 @@ sub script_blast_seq $tmp_in = "$BP_TMP/blast_query.seq"; $tmp_out = "$BP_TMP/blast.result"; - $fh_out = Maasha::Common::write_open( $tmp_in ); + $fh_out = Maasha::Filesys::file_write_open( $tmp_in ); while ( $record = get_record( $in ) ) { @@ -3748,7 +3752,7 @@ sub script_blast_seq unlink $tmp_in; - $fh_out = Maasha::Common::read_open( $tmp_out ); + $fh_out = Maasha::Filesys::file_read_open( $tmp_out ); undef $record; diff --git a/code_perl/Maasha/C_bitarray.pm b/code_perl/Maasha/C_bitarray.pm index 523cbae..50ffcab 100644 --- a/code_perl/Maasha/C_bitarray.pm +++ b/code_perl/Maasha/C_bitarray.pm @@ -57,10 +57,12 @@ void c_array_interval_fill( SV *array, int beg, int end, int score ) size = len / sizeof( int ); + printf( "C: len: %d size: %d sizeof( int ): %ld\n", len, size, sizeof(int) ); + assert( beg >= 0 ); assert( end >= 0 ); assert( beg <= size ); - assert( end < size ); + assert( end <= size ); assert( score > 0 ); for ( i = beg; i < end + 1; i++ ) { @@ -168,8 +170,7 @@ sub c_array_init # Martin A. Hansen, November 2008. # Initializes a zeroed C integer array using - # Perls vec function to create a bit - # vector. + # Perls vec function to create a bit array. my ( $size, # number of elements in array $bits, # bit size @@ -179,9 +180,12 @@ sub c_array_init my ( $vec ); - $vec = ''; + $vec = ''; + + #vec( $vec, $size - 1, $bits ) = 0; + vec( $vec, 4 * $size - 1, $bits / 4 ) = 0; - vec( $vec, $size - 1, $bits ) = 0; + printf STDERR "P: size: %d bits: %d len: %d size: %d\n", $size, $bits, length( $vec ), length( $vec ) / 4; return $vec; } diff --git a/code_perl/Maasha/Match.pm b/code_perl/Maasha/Match.pm index 9ea1b43..240dd8d 100644 --- a/code_perl/Maasha/Match.pm +++ b/code_perl/Maasha/Match.pm @@ -156,7 +156,7 @@ sub match_vmatch foreach $record ( @{ $records } ) { - if ( $entry = Maasha::Biopieces::record2fasta( $record ) ) + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { next if length $entry->[ SEQ ] < 12; # assuming that the index is created for 12 as minimum length diff --git a/code_perl/Maasha/UCSC/BED.pm b/code_perl/Maasha/UCSC/BED.pm index 6a80b13..3dcf45f 100644 --- a/code_perl/Maasha/UCSC/BED.pm +++ b/code_perl/Maasha/UCSC/BED.pm @@ -228,7 +228,8 @@ sub bed_entry_check Maasha::Common::error( qq(Bad BED entry - score must be a whole number - not "$bed->[ score ]") ); } - if ( $bed->[ score ] < 0 or $bed->[ score ] > 1000 ) { + # if ( $bed->[ score ] < 0 or $bed->[ score ] > 1000 ) { # disabled - too restrictive ! + if ( $bed->[ score ] < 0 ) { Maasha::Common::error( qq(Bad BED entry - score must be between 0 and 1000 - not "$bed->[ score ]") ); } diff --git a/code_perl/Maasha/UCSC/Wiggle.pm b/code_perl/Maasha/UCSC/Wiggle.pm index 40bdc40..8718b31 100644 --- a/code_perl/Maasha/UCSC/Wiggle.pm +++ b/code_perl/Maasha/UCSC/Wiggle.pm @@ -50,7 +50,7 @@ require Exporter; use constant { BITS => 32, # Number of bits in an integer - SEQ_MAX => 250_000_000, # Maximum sequence size + SEQ_MAX => 200_000_000, # Maximum sequence size chrom => 0, # BED field names chromStart => 1, chromEnd => 2, diff --git a/code_python/Cjung/Args.py b/code_python/Cjung/Args.py new file mode 100644 index 0000000..52027cc --- /dev/null +++ b/code_python/Cjung/Args.py @@ -0,0 +1,157 @@ +""" +Handling of arguments: options, arguments, file(s) content iterator + +For small scripts that: +- read some command line options +- read some command line positional arguments +- iterate over all lines of some files given on the command line, or stdin if none given +- give usage message if positional arguments are missing +- give usage message if input files are missing and stdin is not redirected +""" + +__author__ = 'Peter Kleiweg' +__version__ = '0.2' +__date__ = '2004/08/28' + +import os, sys, getopt + +class Args: + """ + Perform common tasks on command line arguments + + Instance data: + progname (string) -- name of program + opt (dictionary) -- options with values + infile (string) -- name of current file being processed + lineno (int) -- line number of last line read in current file + linesum (int) -- total of lines read + """ + + def __init__(self, usage='Usage: %(progname)s [opt...] [file...]'): + "init, usage string: embed program name as %(progname)s" + self.progname = os.path.basename(sys.argv[0]) + self.opt = {} + self.infile = None + self.lineno = 0 + self.linesum = 0 + self._argv = sys.argv[1:] + self._usage = usage + + def __iter__(self): + "iterator: set-up" + if self._argv: + self.infile = self._argv.pop(0) + self._in = open(self.infile, 'r') + self._stdin = False + else: + if sys.stdin.isatty(): + print "### USAGE in __iter__" + #self.usage() # Doesn't return + return None + self.infile = '' + self._in = sys.stdin + self._stdin = True + return self + + def next(self): + "iterator: get next line, possibly from next file" + while True: + line = self._in.readline() + if line: + self.lineno += 1 + self.linesum += 1 + return line + + if self._stdin: + break + + self._in.close() + try: + self.infile = self._argv.pop(0) + except IndexError: + break + self.lineno = 0 + self._in = open(self.infile, 'r') + + self.lineno = -1 + self.infile = None + raise StopIteration + + def getopt(self, shortopts, longopts=[]): + "get options and merge into dict 'opt'" + try: + options, self._argv = getopt.getopt(self._argv, shortopts, longopts) + except getopt.GetoptError: + print "### USAGE in getopt" + #self.usage() + return None + self.opt.update(dict(options)) + + def shift(self): + "pop first of remaining arguments (shift)" + try: + return self._argv.pop(0) + except IndexError: + #print "### USAGE in shift" + #self.usage() + return None + + + def pop(self): + "pop last of remaining arguments" + try: + return self._argv.pop() + except IndexError: + print "### USAGE in pop" + #self.usage() + return None + + def warning(self, text): + "print warning message to stderr, possibly with filename and lineno" + if self.lineno > 0: + print >> sys.stderr, '%s:%i: warning: %s' % (self.infile, self.lineno, text) + else: + print >> sys.stderr, '\nWarning %s: %s\n' % (self.progname, text) + + def error(self, text): + "print error message to stderr, possibly with filename and lineno, and exit" + if self.lineno > 0: + print >> sys.stderr, '%s:%i: %s' % (self.infile, self.lineno, text) + else: + print >> sys.stderr, '\nError %s: %s\n' % (self.progname, text) + sys.exit(1) + + def usage(self): + "print usage message, and exit" + print >> sys.stderr + print >> sys.stderr, self._usage % {'progname': self.progname} + print >> sys.stderr + #sys.exit(1) + + +if __name__ == '__main__': + + a = Args('Usage: %(progname)s [-a value] [-b value] [-c] word [file...]') + + a.opt['-a'] = 'option a' # set some default option values + a.opt['-b'] = 'option b' # + a.getopt('a:b:c') # get user supplied option values + + word = a.shift() # get the first of the remaining arguments + # use a.pop() to get the last instead + + for line in a: # iterate over the contents of all remaining arguments (file names) + if a.lineno == 1: + print 'starting new file:', a.infile + a.warning(line.rstrip()) + + print 'Options:', a.opt + print 'Word:', word + print 'Total number of lines:', a.linesum + + print 'Command line:', sys.argv # unchanged + + a.warning('warn 1') # print a warning + a.error('error') # print an error message and exit + a.warning('warn 2') # this won't show + diff --git a/code_python/Cjung/Args.pyc b/code_python/Cjung/Args.pyc new file mode 100644 index 0000000..8b12178 Binary files /dev/null and b/code_python/Cjung/Args.pyc differ diff --git a/code_python/Cjung/lowercase_seq b/code_python/Cjung/lowercase_seq new file mode 100755 index 0000000..02d3514 --- /dev/null +++ b/code_python/Cjung/lowercase_seq @@ -0,0 +1,179 @@ +#!/usr/bin/python + +import os, string, sys, getopt, Args + +record_delimiter = "\n---\n" + +class Lowercase_seq: + in_stream = None + out_stream = None + eo_buffer = False + buffer = '' + rec_dic = {} + rec_num = 0 + + ########################################### + def __init__(self): + pass + ########################################### + + ########################################### + def open_streams(self, input_file, output_file): + #print input_file, output_file + if input_file == '': + self.in_stream = sys.stdin + #print "in_stream = " + else: + try: + self.in_stream = open(input_file, 'r') + #print "in_stream = %s" % (input_file) + except: + raise IOError + + if output_file == '': + self.out_stream = sys.stdout + #print "out_stream = " + else: + try: + self.out_stream = open(output_file, 'w') + #print "out_stream = %s" % (output_file) + except: + raise IOError + ########################################### + + ########################################### + def close_streams(self): + if self.in_stream: + self.in_stream.close() + if self.out_stream: + self.out_stream.close() + ########################################### + + ########################################### + def get_record(self): + rec = '' + eof_flag = False + while not self.eo_buffer: + if eof_flag: + if self.buffer == '': + self.eo_buffer = True + break + else: + tmp = self.in_stream.read(1000) + if not tmp: + eof_flag = True + + self.buffer = self.buffer + tmp + delim_index = self.buffer.find(record_delimiter) + if delim_index >= 0: + rec = self.buffer[:delim_index] + self.buffer = self.buffer[delim_index + len(record_delimiter):] + break + return rec + ########################################### + + ########################################### + def process_record(self, rec): + #print "PARSE_RECORD" + #print rec + #print "===" + lines = rec.split("\n") + self.rec_num += 1 + self.rec_dic[self.rec_num] = {} + for l in lines: + toks = l.split(": ") + if toks[0]=="SEQ": + self.rec_dic[self.rec_num][toks[0]] = toks[1].lower() + else: + self.rec_dic[self.rec_num][toks[0]] = toks[1] + #self.rec_dic[self.rec_num][toks[0]] = toks[1] + #print self.rec_dic[self.rec_num] + return self.rec_num + ########################################### + + ########################################### + def put_record(self, r_num): + rec = self.rec_dic[r_num] + for k in rec.keys(): + #print "%s: %s" % (k, rec[k]) + self.out_stream.write("%s: %s\n" % (k, rec[k])) + #print "---" + self.out_stream.write("---\n") + ########################################### + + ########################################### + def print_usage(self, opt): + bp_dir = os.environ['BP_DIR'] + usage_path = bp_dir + os.path.sep + "bp_usage" + os.path.sep + "lowercase_seq.wiki" + os.system("print_usage -i %s %s" % (usage_path, opt)) + ########################################### + + +# main + +""" +print "############" +print len(sys.argv) +print sys.argv +print "############" +""" + + +lc_seq = Lowercase_seq() + +a = Args.Args('Usage: %(progname)s [-a value] [-b value] [-c] word [file...]') + +a.opt['-I'] = '' # input file +a.opt['-O'] = '' # output file +a.getopt('I:O:?:v') # get user supplied option values + +print >> sys.stderr, a.opt + +word = a.shift() # get the first of the remaining arguments + # use a.pop() to get the last instead +if not word == None: + sys.stderr.write("Unknown argument %s\n" % (word)) + sys.exit(1) + +if sys.stdin.isatty(): + lc_seq.print_usage('') + sys.exit(1) + +#for line in a: # iterate over the contents of all remaining arguments (file names) +# if a.lineno == 1: +# print 'starting new file:', a.infile +# a.warning(line.rstrip()) + +#print 'Options:', a.opt +#print 'Word:', word +#print 'Total number of lines:', a.linesum + + +if a.opt.has_key('-?'): + lc_seq.print_usage('-?') + sys.exit(1) +else: + try: + lc_seq.open_streams(a.opt['-I'], a.opt['-O']) + except: + sys.stderr.write("%s\n" % ("IOError")) + sys.exit(1) + + +while True: + rec = lc_seq.get_record() + if rec=='': + break + rec_num = lc_seq.process_record(rec) + lc_seq.put_record(rec_num) + +lc_seq.close_streams() + +#source = "Dmel_tRNAs_key_record_tuples.txt" +#lc_seq.open_stream(source) + +#lc_seq.get_record() +#lc_seq.put_record() + + +