From 2457393342a7be8b6ab24ae2608b1718a36d9816 Mon Sep 17 00:00:00 2001
From: Nathaniel Wesley Filardo <nwf@cs.jhu.edu>
Date: Thu, 6 Mar 2014 13:30:12 -0500
Subject: [PATCH] "Object manager"

Keep track of a read-mostly collection; check hashes, find duplicates, merge
avoiding duplicates, etc.
---
 objman.pl | 509 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 509 insertions(+)
 create mode 100755 objman.pl

diff --git a/objman.pl b/objman.pl
new file mode 100755
index 0000000..d90a413
--- /dev/null
+++ b/objman.pl
@@ -0,0 +1,509 @@
+#!/usr/bin/perl -w
+
+my $C_INDEXDB = "__objmandb";
+my $O_VERBOSE;
+
+use strict;
+use warnings;
+use BerkeleyDB;
+use Cwd 'abs_path';
+use Data::Dumper;
+require Digest;
+require Digest::SHA;
+use File::Basename;
+use File::Compare;
+use File::Copy;
+use File::Next;
+use Getopt::Long;
+use IO::Handle;
+use IO::File;
+use POSIX;
+
+# For testing hash collisions, use this.
+#my $G_DIGEST = 'CRC';
+#my $G_DIGESTER = Digest->new($G_DIGEST, type=>"crc8");
+
+# For day-to-day operations. use a real hash function, like SHA1:
+my $G_DIGEST = 'SHA1';
+my $G_DIGESTER = Digest->new($G_DIGEST);
+
+sub hashfile ($) {
+  my $fh = IO::File->new(@_);
+
+  $G_DIGESTER->addfile($fh);
+  return $G_DIGESTER->hexdigest;
+
+  # Note that reading ->hexdigest triggers a reset of the object,
+  # so we're perfectly OK having just one.
+}
+
+sub del_cursor_and_file($$$) {
+  my ($db_fh, $cursor, $file) = @_;
+  $cursor->c_del();
+  $db_fh->db_del($file);
+}
+
+sub open_db ($$$) {
+  my ($dbfile, $subname, $dupes) = @_;
+
+  $dbfile = $dbfile . "-" . $subname;
+
+  my $db = new BerkeleyDB::Btree
+# XXX Why don't sub databases work?!
+# Can I really not have two concurrent operations through this API?
+              -Filename => $dbfile,
+#							-Subname => $subname,
+              -Flags		=> DB_CREATE ,
+              -Property	=> ($dupes ? DB_DUP : 0)
+  or die "Cannot open file $dbfile: '$!' '$BerkeleyDB::Error'\n" ;
+
+  return $db;
+}
+
+sub build_index ($$$$) {
+  my ($db_hf, $db_fh, $collection, $newonly) = @_;
+
+  my $ignored = "";
+  my $fileiter = File::Next::files ( $collection );
+
+  while ( defined ( my $file = $fileiter->() ) ) {
+    # XXX HACK to prevent the index from trying to contain itself
+    next if $file =~ /$collection\/$C_INDEXDB.*/;
+    next if $newonly and $db_fh->db_get($file, $ignored) == 0;
+
+    my $hash = hashfile($file);
+    print STDERR "  Digest of $file is $hash\n" if $O_VERBOSE;
+    print STDERR "  Added $file\n" if $newonly and not $O_VERBOSE;
+    $db_hf->db_put($hash, $file);
+    $db_fh->db_put($file, $hash);
+  }
+}
+
+sub printdb ($) {
+  my ($db) = @_;
+
+  my $cursor = $db->db_cursor();
+  my ($k, $v) = ("","");
+
+  while ($cursor->c_get ($k, $v, DB_NEXT) == 0)
+  {
+    print " $k:\n";
+    do {
+      print "  $v\n";
+    } while ( $cursor->c_get ($k, $v, DB_NEXT_DUP) == 0 );
+  }
+}
+
+
+sub filecheck ($$$) {
+  my ($db_hf, $db_fh, $collection) = @_;
+
+  my $cursor = $db_fh->db_cursor();
+  my ($k, $v) = ("","");
+
+  print STDERR "Checking over files...\n";
+
+  while ($cursor->c_get ($k, $v, DB_NEXT) == 0)
+  {
+    my $count;
+    $cursor->c_count($count);
+
+    print STDERR " Checking $k:\n" if $O_VERBOSE;
+
+    if (not -e $k)
+    {
+      print STDERR "   $k seems to have gone away by itself...\n";
+      $cursor->c_del();
+      next;
+    }
+
+    if (not -r $k)
+    {
+      print STDERR "   No longer allowed to read $k?\n".
+      $cursor->c_del();
+      next;
+    }
+
+    my $newhash = hashfile($k);
+
+    if ($newhash ne $v)
+    {
+      print STDERR "   File $k changed from $v to $newhash"
+                   . "; pointwise semi-updating database.\n";
+      # XXX
+      # Can't delete the specific entry in the hash->file map.
+      # So we'll just let the deletion detector take care of it.
+      $cursor->c_del();
+      $db_hf->db_put($newhash, $k);
+      $db_fh->db_put($k, $newhash);
+      next;
+    }
+  }
+
+  print STDERR "Done checking.\n";
+}
+
+sub dupecheck ($$$$) {
+  my ($db_hf, $db_fh, $collection, $prune) = @_;
+
+  my $cursor = $db_hf->db_cursor();
+  my ($k, $v) = ("","");
+
+  print STDERR "Searching for duplicates...\n";
+
+  while ($cursor->c_get ($k, $v, DB_NEXT) == 0)
+  {
+    my $count;
+    $cursor->c_count($count);
+
+    my $firstfile = $v;
+
+    if ($count > 1)
+    {
+      if ($O_VERBOSE)
+      {
+        print STDERR " Database thinks duplicate files at $k:\n";
+      } else {
+        print STDERR " Database thinks these files are duplicates:\n";
+      }
+
+      my $index = 0;
+
+      do {{
+        print STDERR "  $v\n";
+
+        if ($index != 0 and $firstfile eq $v)
+        {
+          print STDERR "   WARNING: DATABASE CORRUPTED ".
+                       "(duplicate entry in hash)\n";
+          $cursor->c_del();
+        }
+
+        if (not -e $v)
+        {
+          print STDERR "   File seems to have gone away by itself...\n";
+          del_cursor_and_file($db_fh, $cursor, $v);
+          next;
+        }
+
+        if (not -r $v)
+        {
+          print STDERR "   No longer allowed to read file?\n".
+          del_cursor_and_file($db_fh, $cursor, $v);
+          next;
+        }
+
+        my $newhash = hashfile($v);
+
+        if ($newhash ne $k)
+        {
+          print STDERR "   File hash changed from $k to $newhash"
+                       . "; pointwise updating database.\n";
+          del_cursor_and_file($db_fh, $cursor, $v);
+          $db_hf->db_put($newhash, $v);
+          $db_fh->db_put($v, $newhash);
+      
+          print STDERR "Restarting...\n";
+          print STDERR "=" x 76 . "\n";
+
+          # Create a new cursor and start over.
+          $cursor = $db_hf->db_cursor();
+          ($k, $v) = ("","");
+          last;
+        }
+
+        ### Avoid comparing the first file to itself.
+        if ($firstfile ne $v and (compare($v, $firstfile) == 0))
+        {
+          print STDERR "   Seems genuine...\n" if $O_VERBOSE;
+          
+          if ($prune)
+          {
+            print STDERR "   Removing file: $v\n";
+            del_cursor_and_file($db_fh, $cursor, $v);
+            unlink $v;
+            last;
+          }
+        } elsif ( $firstfile ne $v ) {
+          print STDERR "   Perhaps you should buy lottery tickets.\n";
+        }
+
+        # TODO A pairwise compare would be better
+
+        $index++;
+      }} while ( $cursor->c_get ($k, $v, DB_NEXT_DUP) == 0 );
+    }	
+  }
+
+  print STDERR "Done searching for duplicates.\n";
+}
+
+  # Merge core
+  # Return values:
+  #	 0 - no merge necessary (duplicates detected)
+  #	 1 - file merged successfully
+  #	 -1 - unable to merge file
+  #	 -2 - want to merge, but told not to
+  #		-3 - please search for new files and re-attempt this merge
+  #				 (mergefile found deletions or alterations while searching and
+  #					did not find a copy of the original file; therefore, it is
+  #					unsure of whether the file should be merged)
+sub mergefile ($$$$$) {
+  my ($db_hf, $db_fh, $destination, $newfile, $noact) = @_;
+
+  my $hash = hashfile($newfile);
+  my $findnew = 0;
+
+  print STDERR " Hash is $hash\n" if $O_VERBOSE;
+
+  my $cursor = $db_hf->db_cursor();
+  my $oldfile;
+  if ($cursor->c_get ($hash, $oldfile, DB_SET) == 0)
+  {
+    do {{ # Note the doubled braces so that "next" works.	See perlsyn.
+      print STDERR "  Database suggests $oldfile ...\n" if $O_VERBOSE;
+
+      if (not -r $oldfile)
+      {
+        print STDERR "   Detected a deletion...\n";
+        del_cursor_and_file($db_fh, $cursor, $oldfile);
+
+        $findnew = 1;
+        next;
+      }
+
+      if (compare($oldfile, $newfile) == 0)
+      {
+        print STDERR "   And good thing too!	No merge necessary.\n"
+          if $O_VERBOSE;
+        return 0;
+      }
+
+      my $newhash = hashfile($oldfile);
+      if ($newhash ne $hash)
+      {
+        print STDERR "   Hash mismatch; pointwise update...\n";
+        del_cursor_and_file($db_fh, $cursor, $oldfile);
+        $db_hf->db_put($newhash, $oldfile);
+        $db_fh->db_put($oldfile, $newhash);
+
+        $findnew = 1;
+      }
+
+    }} while ($cursor->c_get ($hash, $oldfile, DB_NEXT_DUP) == 0);
+  }
+
+  return -3 if $findnew == 1;
+
+  my $destfile = $destination."/".basename($newfile);
+
+  print STDERR "  MERGE $newfile INTO $destfile \n" if $O_VERBOSE;
+
+  if (not $noact) {
+    if(not sysopen DESTINATION, $destfile, O_RDWR|O_CREAT|O_EXCL )
+    {
+      print STDERR "   Bad juju while opening destination file: $!\n";
+      return -1;
+    }
+
+    $db_hf->db_put($hash, $destfile);
+    $db_fh->db_put($destfile, $hash);
+    copy($newfile, *DESTINATION);
+
+    close DESTINATION;
+    return 1;
+  } else {
+    # Simulate a failure on no-action so that we don't delete files.
+    return -2;
+  }
+}
+
+my $O_COLLECTION;
+my $O_DUPECHECK;
+my $O_FILECHECK;
+my $O_HELP;
+my $O_LIST;
+my $O_MERGE;
+my $O_NEWDIRNAME;
+my $O_NEWONLY;
+my $O_NOMERGE;
+my $O_PRUNE;
+my $O_REBUILDINDEX;
+my $O_SOURCEDIR;
+
+my %cmdopts=(
+  "c:s"       => \$O_COLLECTION,
+  "d"         => \$O_DUPECHECK,
+  "f"         => \$O_FILECHECK,
+  "h|help"    => \$O_HELP,
+  "l"         => \$O_LIST,
+  "m"         => \$O_MERGE,
+  "n:s"       => \$O_NEWDIRNAME,
+  "o"         => \$O_NOMERGE,
+  "prune"     => \$O_PRUNE,
+  "r"         => \$O_REBUILDINDEX,
+  "s:s"       => \$O_SOURCEDIR,
+  "v"         => \$O_VERBOSE,
+  "w"          => \$O_NEWONLY,
+);
+GetOptions(%cmdopts);
+
+$O_HELP = 1 if not ($O_LIST or $O_MERGE or $O_DUPECHECK or $O_FILECHECK or $O_REBUILDINDEX);
+
+if($O_HELP)
+{
+  print "Dump index:      $0 -l [-c Collection]\n";
+  print "Rebuild index:   $0 -r [-w] [-c Collection]\n";
+  print "Duplicate check: $0 -d [-c Collection]\n";
+  print "File check:      $0 -f [-c Collection]\n";
+  print "Merge usage:     $0 -m [-o] [-c Collection] [-n New] [-s Src]\n";
+  print "Simulating merges:\n";
+  print "  -o will cause -m to check what would be merged, but not merge files.\n";
+  print "Deleting files:\n";
+  print "  --prune will cause -d to delete duplicates from the collection.\n";
+  print "  --prune will cause -m to delete duplicates from the source.\n";
+  print "          Combined with -o, this will delete only already copied files.\n";
+  print "          Combined with -w, this will delete only newly merged files.\n";
+  print "          Combined with both -o and -w, this will have no effect.\n";
+  print "Other options:\n";
+  print "  -r optionally takes -w to search only for new files.\n";
+  print "  If -v is given, I will verbosely explain my actions.\n";
+  print "  If -h is given, I will display this help message.\n";
+  print "Note that -lrdm are not exclusive, but mixes may be funny.\n";
+  print "  The order of operations is -r, -d, -m, then -l.\n";
+  exit 0;
+}
+
+if(not defined $O_COLLECTION)
+{
+  warn "Undefined collection: Assuming current directory." ;
+  $O_COLLECTION=abs_path(".");
+} else {
+  $O_COLLECTION=abs_path($O_COLLECTION);
+}
+
+print STDERR "Working with collection '$O_COLLECTION'\n";
+
+#die "Refusing to work on non-extant collection" unless -d $O_COLLECTION;
+#die "Refusing to work on non-readable collection" unless -r $O_COLLECTION;
+#die "Refusing to work on non-writable collection" unless -w $O_COLLECTION;
+#die "Refusing to work on non-enterable collection" unless -x $O_COLLECTION;
+
+my $indexfile = $O_COLLECTION . "/" . $C_INDEXDB;
+
+# Force rebuild if index file doesn't exist
+# TODO: Depends on proper access mechanisms to multiple sub databases at
+# once...
+#$O_REBUILDINDEX=1 if not -r $indexfile;
+
+print STDERR " Index file is '$indexfile'\n" if $O_VERBOSE;
+print STDERR " Berkely DB version is $BerkeleyDB::db_version\n" if $O_VERBOSE;
+
+# Open the database
+my $db_hf = open_db($indexfile, "hf", 1);		# Hash -> file name, dupes on
+my $db_fh = open_db($indexfile, "fh", 0);		# File -> hash, dupes off
+
+# If we are supposed to build the index, go do that now
+if($O_REBUILDINDEX)
+{
+  print STDERR "Building index...\n";
+
+  my ($oldh, $oldf) = (0,0);
+  if (not $O_NEWONLY)
+  {
+    $db_hf->truncate($oldh);
+    $db_fh->truncate($oldf);
+  } else {
+    $oldh = ${$db_hf->db_stat()}{'bt_ndata'};
+    $oldf = ${$db_fh->db_stat()}{'bt_ndata'};
+  }
+  build_index ($db_hf, $db_fh, $O_COLLECTION, $O_NEWONLY);
+
+  my $dbstat_hf = $db_hf->db_stat();
+  my $dbstat_fh = $db_fh->db_stat();
+
+  if ($O_VERBOSE)
+  {
+    print STDERR "Old database had $oldh hashes and $oldf files.\n";
+    print STDERR "Now have ";
+    print STDERR $$dbstat_hf {'bt_ndata'};
+    print STDERR " hashes and ";
+    print STDERR $$dbstat_fh {'bt_ndata'};
+    print STDERR " files\n";
+  }
+}
+
+if($O_DUPECHECK)
+{
+  dupecheck ($db_hf, $db_fh, $O_COLLECTION, $O_PRUNE);
+}
+
+if($O_FILECHECK)
+{
+  filecheck ($db_hf, $db_fh, $O_COLLECTION);
+}
+
+MERGE: {
+  if($O_MERGE)
+  {
+    warn "Must specify a new directory for merge" and last MERGE
+      unless $O_NEWDIRNAME or $O_NOMERGE;
+    warn "Must specify a source directory for merge" and last MERGE
+      unless $O_SOURCEDIR;
+
+    my $newdir = abs_path($O_COLLECTION."/".$O_NEWDIRNAME);
+
+    warn "Must specify a _new_ directory for merge!" and last MERGE
+      if -d $newdir and not $O_NOMERGE;
+
+    mkdir $newdir if not $O_NOMERGE;
+
+    my $fileiter = File::Next::files ( $O_SOURCEDIR );
+
+    while ( defined ( my $file = $fileiter->() ) )
+    {
+      MERGECORE: {
+        print STDERR "Merging $file ...\n";
+
+        my $retval = mergefile($db_hf, $db_fh, $newdir, $file, $O_NOMERGE);
+
+        if ( $O_PRUNE and $retval == 1) {
+          # Not keeping file and successfully merged
+          print STDERR " Merge successful; removing source file: $file\n";
+          unlink $file;
+        } elsif ($O_PRUNE and not $O_NEWONLY and $retval == 0) {
+          # Not keeping file and duplicate detected
+          print STDERR " No merge needed; removing source file: $file\n";
+          unlink $file;
+        } elsif ( $retval >= 0 ) {
+          # Keeping file (all deletions above) and some success case
+          print STDERR " No need to merge this file.\n";
+        } elsif ( $retval == -2 ) {
+          if ($O_NOMERGE)
+          {
+            print STDERR " Would merge this file, except told not to.\n"
+          } else {
+            print STDERR " Now I seem to be really confused...\n"
+          }
+        } elsif ( $retval == -3 ) {
+          print STDERR " Scanning for new files in collection first...\n";
+          build_index ($db_hf, $db_fh, $O_COLLECTION, 1);
+          goto MERGECORE;
+        } elsif ( $retval == -1 ) {
+          print STDERR " Something bad happened there... trying to go on.\n";
+        }
+      }
+    }
+  }
+}
+
+if($O_LIST)
+{
+  print "Printing hash to file database ...\n";
+  printdb ($db_hf);
+  print "Printing file to hash database ...\n";
+  printdb ($db_fh);
+}
+
+$db_fh->db_close();
+$db_hf->db_close();
-- 
2.50.1