From c23b3f0d83dc42b0d8eef92b3361293d48c3e6a5 Mon Sep 17 00:00:00 2001 From: Nathaniel Wesley Filardo Date: Thu, 6 Mar 2014 13:28:47 -0500 Subject: [PATCH] Rabin-fingerprint splitter --- rabinsplit.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++++ rabinsplit.pl | 138 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 319 insertions(+) create mode 100644 rabinsplit.c create mode 100644 rabinsplit.pl diff --git a/rabinsplit.c b/rabinsplit.c new file mode 100644 index 0000000..c8a1af1 --- /dev/null +++ b/rabinsplit.c @@ -0,0 +1,181 @@ +/* + * (C) 2014 Nathaniel Wesley Filardo + * + * Rabin Fingerprint stream/file splitter + * + * This is intended as a pre-processing stage to content-addressed storage + * systems (e.g. Venti). Either take a large file and split it, or make a + * tarball and split that, then vac up the resulting files. The idea is that + * we find stable parts of the files between signatures, so even if the + * offsets change, we'll recover. + * + * For technical discussion, see + * http://gsoc.cat-v.org/people/mjl/blog/2007/08/06/1_Rabin_fingerprints/ + * and + * Center for Research in Computing Technology, Harvard University. Tech + * Report TR-CSE-03-01. http://www.xmailserver.org/rabin.pdf + * + * Build with + * gcc -Wall --std=gnu99 -o rabinsplit rabinsplit.c + */ + +#include +#include +#include +#include +#include + +const unsigned int mulprime = 7; +const unsigned int win_data_offset = 1; + +struct { + unsigned int debug; + unsigned int writing; + unsigned int winsize; + unsigned int modulus; + unsigned int minsize; + char *outpfx; + unsigned int shiftout[256]; +} params; + +struct { + unsigned int *wind_head; + unsigned int *wind_end; + unsigned int *wind_cur; + unsigned int wind_sum; + unsigned long chunksize; + unsigned long totalsize; + unsigned int chunkcount; + FILE *outfile; +} ss; + +void build_shiftout(unsigned int so[256]) { + for (int i = 0; i < 256; i++) { + so[i] = i + win_data_offset; + } + for (int x = 0; x < params.winsize; x++) { + for (int i = 0; i < 256; i++) { + so[i] = (so[i] * mulprime) % params.modulus; + } + } +} + +void dosplit(void) { + if (params.debug) { + printf("SPLIT chunk=%u chunkbytes=%lu totalbytes=%lu\n", + ss.chunkcount, ss.chunksize, ss.totalsize); + } + + ss.chunksize = 0; + if(ss.outfile) { + fclose(ss.outfile); + } + if(params.writing) { + char buf[128]; + int n = snprintf(buf, sizeof(buf), "%s%08d", params.outpfx, ss.chunkcount); + if (n >= sizeof(buf)) { + fprintf(stderr, "Overlong output filename; bailing out."); + exit(-1); + } + + ss.outfile = fopen(buf, "w+b"); + if(!ss.outfile) { + fprintf(stderr, "Unable to open file: %s (errno=%d)", buf, errno); + exit(-1); + } + } + ss.chunkcount++; +} + +void procbyte(uint8_t in) { + ss.chunksize++; + ss.totalsize++; + + ss.wind_sum *= mulprime; + ss.wind_sum += in + win_data_offset; + ss.wind_sum -= *ss.wind_cur; + ss.wind_sum %= params.modulus; + + *ss.wind_cur = params.shiftout[in]; + + if(++ss.wind_cur == ss.wind_end) { ss.wind_cur = ss.wind_head; } + if(ss.outfile) { fputc(in,ss.outfile); } + if((ss.wind_sum == params.modulus - 1) + && ss.chunksize >= params.minsize) { dosplit(); } +} + +long +safe_strtoul(char *p, char *err) { + char *endp; + unsigned long int r; + + errno = 0; + r = strtoul(p, &endp, 10); + if ((*endp != '\0') || (errno != 0)) { + fprintf(stderr, "Could not understand %s %s (tail=%s, errno=%d)\n", err, p, endp, errno); + exit(-1); + } + + return r; +} + +void +help(void) { + printf("Rabin split options:\n"); + printf("\t-m \n"); + printf("\t-n disable writing\n"); + printf("\t-o \n"); + printf("\t-v enables verbose debugging output\n"); + printf("\t-w \n"); + printf("\t-z \n"); + exit(0); +} + +int +main(int argc, char **argv) { + params.debug = 0; + params.writing = 1; + params.winsize = 31; + params.minsize = 4096; + params.modulus = 10*1024*1024; + + { + int opt; + while((opt = getopt(argc, argv, "hm:no:vw:z:")) != -1) { + switch(opt) { + case 'h': help(); break; + case 'm': params.minsize = safe_strtoul(optarg, "minsize"); break; + case 'n': params.writing = 0; break; + case 'o': params.outpfx = optarg; break; + case 'v': params.debug++; break; + case 'w': params.winsize = safe_strtoul(optarg, "window size"); break; + case 'z': params.modulus = safe_strtoul(optarg, "modulus"); break; + default: fprintf(stderr, "Unrecognized option: '%c'\n", opt); help(); break; + } + } + } + + { + int bsize = params.winsize * sizeof(*ss.wind_head); + ss.wind_head = alloca(bsize); + ss.wind_end = ss.wind_head + params.winsize; + ss.wind_cur = ss.wind_head; + ss.wind_sum = 0; + for (int i = 0; i < params.winsize; i++) + { + ss.wind_head[i] = 0; + } + ss.chunkcount = 0; + ss.chunksize = 0; + ss.totalsize = 0; + ss.outfile = NULL; + } + + build_shiftout(params.shiftout); + dosplit(); // Get the party started + + int ci; + while((ci = fgetc(stdin)) >= 0) { + procbyte((uint8_t) ci); + } +} diff --git a/rabinsplit.pl b/rabinsplit.pl new file mode 100644 index 0000000..d54f45f --- /dev/null +++ b/rabinsplit.pl @@ -0,0 +1,138 @@ +#!/usr/bin/perl +# +# (C) 2010 Nathaniel Wesley Filardo +# +# Rabin Fingerprint stream/file splitter +# +# This is intended as a pre-processing stage to content-addressed storage +# systems (e.g. Venti). Either take a large file and split it, or make a +# tarball and split that, then vac up the resulting files. The idea is that +# we find stable parts of the files between signatures, so even if the +# offsets change, we'll recover. +# +# Ideally, set --max equal to (a positive integer multiple of) the block +# size of vac. +# +# For technical discussion, see +# http://gsoc.cat-v.org/people/mjl/blog/2007/08/06/1_Rabin_fingerprints/ +# and +# Center for Research in Computing Technology, Harvard University. Tech +# Report TR-CSE-03-01. http://www.xmailserver.org/rabin.pdf + +use strict; +use warnings; + +use Getopt::Long; + +my $WINDOW = 31; # Rabin fingerprinting window (bytes) +my $PRIME = 5; # Prime multiplier +my $MOD = 2**12; # Modulus (expected block size!) +my $CHUNKSIZE_MAX = 0; # Split at least this often +my $CHUNKSIZE_MIN = 0; # Split at most this often +my $CISHIFT = 0; # Another possible knob (for testing) +my $INFILE = "/dev/fd/0"; # Input file name +my $OUTPFX = undef; # Output prefix +my $OUTSWD = 8; # Output width +my $FINGERPRINT = $MOD-1; # Which value from Z_{$MOD} are we using? +my $WRITE = 1; # Disable output production; merely count chunks +my $VERBOSE = 0; + +my %OPTIONS = ( + 'in=s' => \$INFILE + , 'prefix=s' => \$OUTPFX + , 'swidth=i' => \$OUTSWD + , 'write!' => \$WRITE + + # Parameters to slicing and dicing + , 'max=i' => \$CHUNKSIZE_MAX + , 'min=i' => \$CHUNKSIZE_MIN + , 'mod=i' => \$MOD + , 'verbose+' => \$VERBOSE + + # Less likely to fiddle with these + , 'cishift=i' => \$CISHIFT + , 'fingerprint=i' => \$FINGERPRINT + , 'prime=i' => \$PRIME + , 'window=i' => \$WINDOW + ); +GetOptions(%OPTIONS) or die $!; + +die "Window too small!" if $WINDOW < 2; +die "Fingerprint out of range!" if $FINGERPRINT < 0 or $FINGERPRINT >= $MOD; +die "Don't set --no-write and --prefix." if defined $OUTPFX and not $WRITE; +die "Need input file." if not defined $INFILE; +$OUTPFX = "x" if not defined $OUTPFX; + +# Compute the shift-out table; we do this in iterated maps to avoid overflow +my $shiftout; +@$shiftout = map { ($_+$CISHIFT)%$MOD } (0..255); +for my $i (1..$WINDOW-1) { + @$shiftout = map { ($_*$PRIME)%$MOD } @$shiftout; +} + +my @buf = ( ); +my $sum = 0; +my $chunksize = 0; +my $curr_outn = 0; +my $curr_outf = undef; + +sub dosplit () { + (close $curr_outf or die) if defined $curr_outf; + $chunksize = 0; + (open ($curr_outf, ">:bytes", $OUTPFX.(sprintf "%0${OUTSWD}x", $curr_outn)) + or die "...: $!") + if $WRITE; + $curr_outn++; +} + +my $bytes_proc = 0; +sub procbyte ($) { + my ($_) = @_; + + $_ += $CISHIFT; + + push @buf, $_; + $sum *= $PRIME; + $sum += $_; + $sum %= $MOD; + $bytes_proc++; + $chunksize++; + if($CHUNKSIZE_MAX != 0 and $chunksize == $CHUNKSIZE_MAX) { + print STDERR "CHUNKFORCE \@$bytes_proc ($chunksize)\n" if $VERBOSE; + dosplit(); + } + if($sum == $FINGERPRINT) { + if ($chunksize > $CHUNKSIZE_MIN) { + print STDERR "CHUNKCHECK \@$bytes_proc ($chunksize)\n" if $VERBOSE; + dosplit(); + } else { + print STDERR "CHUNKSKIP \@$bytes_proc ($chunksize)\n" if $VERBOSE; + } + } + print STDERR "$_ : $sum : ", (join "-", @buf), "\n" if $VERBOSE > 3; + print $curr_outf (chr $_) if $WRITE; +} + +my $if; +open( $if, "<:bytes", $INFILE ) or die "Can't open '$INFILE': $!"; + +# Get things started. +dosplit(); + +while(scalar @buf < $WINDOW) { + my $r = read($if, $_, 1); + last if not defined $r or $r < 1; + $_ = ord; + procbyte($_); +} + +while(1 == read($if, $_, 1)) { + $_ = ord; + my $s = $$shiftout[(shift @buf) - $CISHIFT]; + $sum -= $s; + $sum %= $MOD; + procbyte($_); +} + +print STDERR "CHUNKEOF \@$bytes_proc ($chunksize)\n" if $VERBOSE; +printf "DONE bytes=%d maxchunk='%0${OUTSWD}x'\n", $bytes_proc, $curr_outn-1 if $VERBOSE; -- 2.50.1