From 1edb6821b6fd2c2d3d3e06f127d508610e325aec Mon Sep 17 00:00:00 2001 From: Steve Hancock Date: Tue, 12 Oct 2021 15:07:04 -0700 Subject: [PATCH] revise random file generator --- dev-bin/random_file_generator.pl | 225 ++++++++++++++++++++----------- 1 file changed, 148 insertions(+), 77 deletions(-) diff --git a/dev-bin/random_file_generator.pl b/dev-bin/random_file_generator.pl index 1650e042..6f41d6a2 100755 --- a/dev-bin/random_file_generator.pl +++ b/dev-bin/random_file_generator.pl @@ -30,36 +30,42 @@ Example: generate 100 random files from the scripts in the upper directory: EOM -my @source_files = @ARGV; +my @source_files = @ARGV; +my $rsource_files = []; +my $max_cases = 100; -my $max_cases = pop @source_files; -if ( $max_cases !~ /^\d+$/ ) { - push @source_files, $max_cases; - $max_cases = 100; +if (@source_files) { + $max_cases = pop @source_files; + if ( $max_cases !~ /^\d+$/ ) { + push @source_files, $max_cases; + $max_cases = 100; + } } -# Only work on regular source_files with non-zero length -@source_files=grep {-f $_ && !-z $_} @source_files; +if (@source_files) { -# Sort from largest to smallest; that way we work with the largest files -@source_files = - map { $_->[0] } - sort { $b->[1] <=> $a->[1] } - map { [ $_, -e $_ ? -s $_ : 0 ] } @source_files; + # Only work on regular source_files with non-zero length + @source_files = grep { -f $_ && !-z $_ } @source_files; -if ( !@source_files ) { die "$usage" } + # Sort from largest to smallest; that way we work with the largest files + @source_files = + map { $_->[0] } + sort { $b->[1] <=> $a->[1] } + map { [ $_, -e $_ ? -s $_ : 0 ] } @source_files; -my $rsource_files = []; -my $i = -1; -foreach my $file (@source_files) { - $i++; - last if ( $i >= $MAX_SOURCES ); - open( IN, '<', $file ) || die "cannot open $file: $!\n"; - $rsource_files->[$i] = []; - foreach my $line () { - push @{ $rsource_files->[$i] }, $line; + #if ( !@source_files ) { die "$usage" } + + my $i = -1; + foreach my $file (@source_files) { + $i++; + last if ( $i >= $MAX_SOURCES ); + open( IN, '<', $file ) || die "cannot open $file: $!\n"; + $rsource_files->[$i] = []; + foreach my $line () { + push @{ $rsource_files->[$i] }, $line; + } + close(IN); } - close(IN); } my $basename = "ranfile"; @@ -70,32 +76,38 @@ for ( my $nf = 1 ; $nf <= $max_cases ; $nf++ ) { my $frac = rand(1); my $ix = int( rand($nsources) ); $ix = random_index( $nsources - 1 ); - my $NMETH = 4; + my $NMETH = 4; my $method = random_index(3); my $rfile; - if ( $method == 3 ) { - my $nchars=1+random_index(1000); - $rfile = random_characters($nchars); -print STDERR "Method $method, nchars=$nchars\n"; + if ( $method == 3 || !$nsources ) { + + #my $nchars=1+random_index(1000); + #$rfile = random_characters($nchars); + my $nlines = 1 + random_index(100); + $rfile = random_printable_lines( $nlines, 100 ); + + #print STDERR "Method $method, nchars=$nchars\n"; + print STDERR "Method $method, nlines=$nlines\n"; } elsif ( $method == 2 ) { $rfile = skip_random_lines( $rsource_files->[$ix], $frac ); -print STDERR "Method $method, frac=$frac, file=$ix\n"; + print STDERR "Method $method, frac=$frac, file=$ix\n"; } elsif ( $method == 1 ) { $rfile = select_random_lines( $rsource_files->[$ix], $frac ); -print STDERR "Method $method, frac=$frac, file=$ix\n"; + print STDERR "Method $method, frac=$frac, file=$ix\n"; } elsif ( $method == 0 ) { $rfile = reverse_random_lines( $rsource_files->[$ix], $frac ); -print STDERR "Method $method, frac=$frac, file=$ix\n"; + print STDERR "Method $method, frac=$frac, file=$ix\n"; } # Shouldn't happen else { - my $nchars=1+random_index(1000); + my $nchars = 1 + random_index(1000); $rfile = random_characters($nchars); -print STDERR "FIXME: method=$method but NMETH=$NMETH; Method $method, nchars=$nchars\n"; + print STDERR +"FIXME: method=$method but NMETH=$NMETH; Method $method, nchars=$nchars\n"; } open( OUT, ">", $fname ) || die "cannot open $fname: $!\n"; foreach my $line ( @{$rfile} ) { @@ -104,7 +116,7 @@ print STDERR "FIXME: method=$method but NMETH=$NMETH; Method $method, nchars=$nc close OUT; } -sub random_index { +sub OLD_random_index { my ($ix_max) = @_; $ix_max = 0 if ( $ix_max < 0 ); my $ix_min = 0; @@ -114,17 +126,75 @@ sub random_index { return $ix; } +sub random_printable_lines { + my ( $nlines, $max_chars ) = @_; + $max_chars = 100 unless ($max_chars); + my @lines; + for ( 1 .. $nlines ) { + my $len = int( rand($max_chars) + 0.5 ); + my $str = random_printable_string($len); + push @lines, $str . "\n"; + } + return \@lines; +} + +sub random_printable_string { + my ($len) = @_; + $len = 1 unless ($len); + + # This is the decimal range of printable characters in ASCII. It is used to + # make quick preliminary checks before resorting to using a regex. + my $ord_printable_min = 33; + my $ord_printable_max = 126; + my $min = 200; + my $max = 0; + my $str = ""; + foreach ( 1 .. $len ) { + my $ord = random_index( $ord_printable_min, $ord_printable_max ); + my $ch = chr($ord); + + #print "$ord $ch\n"; + $str .= $ch; + if ( $ord > $max ) { $max = $ord } + if ( $ord < $min ) { $min = $ord } + } + + #print "min=$min max=$max [range is 33 - 126 ]\n"; + #print "$str\n"; + return ($str); +} + +sub random_index { + + # 1 arg: generate random index between 0 and arg1 + # 2 args: generate random index between arg1 and arg2 + my ( $ix_min, $ix_max ) = @_; + + if ( !defined($ix_max) ) { + $ix_max = $ix_min; + $ix_min = 0; + } + $ix_max = 0 if ( $ix_max < 0 ); + my $idiff = $ix_max - $ix_min; + my $ix = $ix_min + int( rand($idiff) + 0.5 ); + $ix = $ix_max if ( $ix > $ix_max ); + $ix = $ix_min if ( $ix < $ix_min ); + return $ix; +} + sub random_characters { my ($nchars) = @_; - my @qset1 = qw# { [ ( } ] ) , ; #; + my @qset1 = qw# { [ ( } ] ) ; #; + push @qset1, ','; my @qset2 = ( qw{a b c f g m q r s t w x y z V W X 0 1 8 9}, ';', '[', ']', '{', '}', '(', ')', '=', '?', '|', '+', '<', - '>', '.', '!', '~', '^', '*', '$', '@', '&', ':', '%', ',', + '>', '.', '!', '~', '^', '*', '$', '@', '&', ':', '%', '\\', '/', '_', ' ', "\n", "\t", '-', "'", '"', '`', '#', ); + push @qset2, ','; my @qset3 = ( '!%:', '!%:', '!%:', '!%:', @@ -186,18 +256,19 @@ sub random_characters { my @lines; my $ncpl = 0; my $line = ""; + for ( my $ich = 0 ; $ich < $nchars ; $ich++ ) { my $nset = random_index(2); my $ch; - if ($nset==0) { + if ( $nset == 0 ) { my $ix = random_index( @qset1 - 1 ); $ch = $qset1[$ix]; } - elsif ($nset==1) { + elsif ( $nset == 1 ) { my $ix = random_index( @qset2 - 1 ); $ch = $qset2[$ix]; } - elsif ($nset==2) { + elsif ( $nset == 2 ) { my $ix = random_index( @qset3 - 1 ); $ch = $qset3[$ix]; } @@ -214,75 +285,75 @@ sub random_characters { return \@lines; } -sub select_random_lines { +sub select_random_lines { # select some fraction of the lines in a source file # in any order - my ($rsource, $fkeep) = @_; + my ( $rsource, $fkeep ) = @_; - my $nlines = @{$rsource}; - my $num_keep = $nlines*$fkeep; - my $count=0; + my $nlines = @{$rsource}; + my $num_keep = $nlines * $fkeep; + my $count = 0; my @selected; - while ($count < $num_keep) { - my $ii = random_index($nlines-1); - push @selected, $rsource->[$ii]; - $count++; + while ( $count < $num_keep ) { + my $ii = random_index( $nlines - 1 ); + push @selected, $rsource->[$ii]; + $count++; } return \@selected; } -sub reverse_random_lines { +sub reverse_random_lines { # skip some fraction of the lines in a source file # and reverse the characters on a line - my ($rsource, $frand) = @_; + my ( $rsource, $frand ) = @_; my %select; - my $nlines = @{$rsource}; - my $num_delete = $nlines*$frand; - my $count=0; - while ($count < $num_delete) { - my $ii = rand($nlines); - $ii = int($ii); - $select{$ii} = 1; - $count++; + my $nlines = @{$rsource}; + my $num_delete = $nlines * $frand; + my $count = 0; + while ( $count < $num_delete ) { + my $ii = rand($nlines); + $ii = int($ii); + $select{$ii} = 1; + $count++; } - + my @lines; my $jj = -1; - foreach my $line (@{$rsource}) { + foreach my $line ( @{$rsource} ) { $jj++; - if ($select{$jj} ) { - chomp $line; - $line = reverse($line); - $line .= "\n"; - }; + if ( $select{$jj} ) { + chomp $line; + $line = reverse($line); + $line .= "\n"; + } push @lines, $line; } return \@lines; } -sub skip_random_lines { +sub skip_random_lines { # skip some fraction of the lines in a source file # but keep lines in the original order - my ($rsource, $fskip) = @_; + my ( $rsource, $fskip ) = @_; my %skip; - my $nlines = @{$rsource}; - my $num_delete = $nlines*$fskip; - my $count=0; - while ($count < $num_delete) { - my $ii = rand($nlines); - $ii = int($ii); - $skip{$ii} = 1; - $count++; + my $nlines = @{$rsource}; + my $num_delete = $nlines * $fskip; + my $count = 0; + while ( $count < $num_delete ) { + my $ii = rand($nlines); + $ii = int($ii); + $skip{$ii} = 1; + $count++; } - + my @selected; my $jj = -1; - foreach my $line (@{$rsource}) { + foreach my $line ( @{$rsource} ) { $jj++; next if $skip{$jj}; push @selected, $line; -- 2.39.5