From 0efc81f3969b3e5a21abad2ceb72426cad4d5dc4 Mon Sep 17 00:00:00 2001 From: Nathaniel Wesley Filardo Date: Mon, 3 Jan 2022 16:17:04 +0000 Subject: [PATCH] Initial checkin --- .gitignore | 1 + README.rst | 215 ++++++++++++++++ cdb | 597 +++++++++++++++++++++++++++++++++++++++++++++ cdb-digestrelative | 39 +++ cdblib.lua | 135 ++++++++++ 5 files changed, 987 insertions(+) create mode 100644 .gitignore create mode 100644 README.rst create mode 100755 cdb create mode 100755 cdb-digestrelative create mode 100644 cdblib.lua diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1377554 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.swp diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..07c7cbe --- /dev/null +++ b/README.rst @@ -0,0 +1,215 @@ +What is this? +############# + +Resurrecting an ancient project, a "mostly read-only file management tool". +It's intended for keeping a large list of checksums in a database so that +duplication, movement, and corruption of files can be detected. In addition to +maintaining a singular database, it also offers cross-database functionality. + +We speak of "observations" to mean an association of a file path and its +contents (or, at least, their cryptographic checksum). Most operations on the +checksum database pertain to one or more observations. + +Theory of Operation +################### + +This program is just a shim around a database; it does not interact with the +filesystem much itself. Instead, it should be used in composition with things +like ``find`` and the GNU coreutils digest programs (e.g. ``sha512sum``), +delegating details of filesystem traversal and choice of hash and so on to the +user. + +Supported Operations +#################### + +Initialize A Database +===================== + +:: + + cdb --db ${DB} init + +Observe A Path +============== + +Add the checksum of a single path to the database. 
This will create a new +checksum and/or a new path identifier as needed and will bind them together. :: + + sha512sum $FILE | cdb --db ${DB} addh + +Or, for all files under a path:: + + find $DIR -type f -exec sha512sum {} \+ | cdb --db ${DB} addh + +If we have a pile of digest files already, each of which contains digests of +paths relative to its location, we can generate a database, ``${DB2}``, from them +with the assistance of the ``cdb-digestrelative`` tool:: + + find ${DIR} -type f -name SHA512SUMS -print0 | cdb-digestrelative --inul | cdb --db ${DB2} addh + +Revalidate A Path Observation +============================= + +Measure the checksum of a path and confirm that the database already held that +observation. Reports unexpected files as well as mis-checksummed contents. :: + + sha512sum $FILE | cdb --db ${DB} verh + +Or for all files under a path:: + + find $DIR -type f -exec sha512sum {} \; | cdb --db ${DB} verh + +This processing of digest streams is to be preferred to verifying a digest +stream as generated by the database, e.g.:: + + cdb --db ${DB} look \* | sha512sum -c + +because the former can be more informative in the case of mismatching digests +(specifically, the database can look for other paths that have the reported +digest). 
If it's easier to have the database generate the set of files, that +can be done:: + + cdb --db ${DB} look \* --no-hashes --unescape --nul | xargs -0 sha512sum | cdb --db ${DB} verh + +Add Checksums For Missing Files +=============================== + +We can quickly construct a "just paths" database, which associates all paths +with a single digest, from the current state of the file system as follows:: + + cdb --db ${JPDB} init + find ${DIR} -type f -printf "0  %p\\0" | cdb --db ${JPDB} addh --inul + +This database may not seem very useful, but when combined with ``cdb diff`` we +can quickly find all paths whose checksums are unknown to the database:: + + cdb --db ${DB} diff ${JPDB} --flavor=path --which=super + +We can then script computing those files' checksums and adding the new reports +to the database:: + + cdb --db ${DB} diff ${JPDB} --flavor=path --which=super --no-headers --nul --unescape > ${JPDB}.new-files0 + xargs -0 sha512sum > ${JPDB}.new < ${JPDB}.new-files0 + cdb --db ${DB} addh < ${JPDB}.new + +From Another Database +--------------------- + +If we have another database that knows digests for our files, rather than +computing digests again, we can extract checksums from ``${DB2}`` and install +them into ``${DB}``:: + + cdb --db ${DB2} mapp --inul < ${JPDB}.new-files0 | cdb --db ${DB} addh + +Responding to File Moves +======================== + +Armed with a "just paths" database as per the above, we can then direct the +database to prune tracked paths not in the "just paths" database if the hashes +are observed elsewhere:: + + cdb --db ${DB} diff ${JPDB} --flavor=path --which=sub --no-headers --nul --unescape > ${JPDB}.missing-files0 + cdb --db ${DB} domv --inul < ${JPDB}.missing-files0 + cdb --db ${DB} gc > ${DB}.gc + sqlite3 ${DB} < ${DB}.gc + +.. TODO or if the observed digest is now superseded? 
+ +Find Duplicates +=============== + +Given a path prefix (possibly empty), report all logged observations below that +path of contents that exist in multiple locations (i.e., files with checksum +collisions). + +.. TODO + +Remove Path +=========== + +Cease to consider a particular path part of the database and remove all +observations made of it. Since this application is primarily for data hoarders +who tend not to delete things, one should prefer to :ref:`Respond to File Moves +<Responding to File Moves>` rather than risk removing the last observation of a +given hash. + +.. TODO + +Add Superseder +============== + +Indicate that some file contents are to be considered a lesser version of some +other contents. + +.. TODO + +Report Novelty +============== + +Given a path, measure its checksum and report if it does not match, and has not +been superseded by, any observation already recorded in the database. + +.. TODO + +.. This command would be useful for ingesting things into a library or pruning + collections of files outside the library. + +-------------------------------------------------------------------------------- + +Example Uses +############ + +A photo library +=============== + +Suppose ``/mnt/photos`` contains a collection of photos. We might want to... 
+ +* measure all the files in that directory, flagging new and updated contents:: + + $ cksdb /mnt/photos/.cksdb observe /mnt/photos + +* measure all the files in that directory, automatically updating the database:: + + $ cksdb /mnt/photos/.cksdb observe --new --changed /mnt/photos + +* report duplicates anywhere in the library:: + + $ cksdb /mnt/photos/.cksdb ls --duplicate + +* report files in a particular directory that also exist anywhere else in the + library:: + + $ cksdb /mnt/photos/.cksdb ls --duplicate /mnt/photos/dir1 + +* restrict the search for duplication to another directory:: + + $ cksdb /mnt/photos/.cksdb ls --duplicate /mnt/photos/dir1 \ + --also /mnt/photos/dir2 + +* explicitly acknowledge a deletion by removing observations of it:: + + $ cksdb /mnt/photos/.cksdb rm /mnt/photos/filename + +* indicate that the last observed content of ``foo.jpg`` is superseded by the + last observed content of ``foo.raw``:: + + $ cksdb /mnt/photos/.cksdb supersede /mnt/photos/foo.jpg /mnt/photos/foo.raw + +* import files from outside the library, say, in ``/mnt/sdcard``, skipping + duplicate and superseded files and removing all examined files (that is, + imported, duplicate, and superseded; ``--harvest``):: + + $ cksdb /mnt/photos/.cksdb import --harvest /mnt/photos/newdir /mnt/sdcard + +* import from another database:: + + $ cksdb /mnt/photos/.cksdb import-db /mnt/oldphotos/.cksdb + +Cross-Database Operations +========================= + +Compute violations of set-theoretic relationships between a database and the +union of one or more other databases:: + + $ cksdb /mnt/photos/.cksdb is-subset /mnt/backups/photos/.cksdb + $ cksdb /mnt/photos/.cksdb is-superset /mnt/backups/photos/.cksdb diff --git a/cdb b/cdb new file mode 100755 index 0000000..c8040fd --- /dev/null +++ b/cdb @@ -0,0 +1,597 @@ +#!/usr/bin/env lua5.3 + +--------------------------------------------------------- Imports {{{ + +local argparse = require "argparse" +local dbi = require "DBI" +local 
plpath = require "pl.path" -- completes the `local` that ends the preceding line
local plstringx = require "pl.stringx"
local pltablex = require "pl.tablex"

local cdblib = require "cdblib"

----------------------------------------------------------------- }}}
--------------------------------------------------- SQL utilities {{{

-- Prepare `sql` on `dbh` and execute it once with the given bind values.
-- Returns the executed statement handle, or `false, err` when either the
-- prepare or the execute step reports failure.
local function sql_do(dbh, sql, ...)
  local sth, prep_err = dbh:prepare(sql)
  if not sth then return false, prep_err end
  local ok, exec_err = sth:execute(...)
  if not ok then return false, exec_err end
  return sth
end

-- Execute an already prepared statement with the given bind values and
-- return whatever `sth:fetch()` yields for the first row.  Returns
-- `false, err` when the execute step fails.
local function sql_run_one(sth, ...)
  local ok, err = sth:execute(...)
  if not ok then return false, err end
  return sth:fetch()
end

----------------------------------------------------------------- }}}
-------------------------------------------------- SQL statements {{{

-- Prepared statement: insert a path (or touch the existing row) and return
-- its pathid.
local function sql_mk_path_upsert(dbh)
  return dbh:prepare([[INSERT INTO paths (path) VALUES (?)
    ON CONFLICT DO UPDATE SET path = path RETURNING pathid]])
end

-- Prepared statement: look up the pathid of an exact path.
local function sql_mk_path_find(dbh)
  return dbh:prepare([[SELECT pathid FROM paths WHERE path = ?]])
end

-- Prepared statement: insert a hash (or touch the existing row) and return
-- its hashid.
local function sql_mk_hash_upsert(dbh)
  return dbh:prepare([[INSERT INTO hashes (hash) VALUES (?)
    ON CONFLICT DO UPDATE SET hash = hash RETURNING hashid]])
end

-- Prepared statement: look up the hashid of an exact hash.
local function sql_mk_hash_find(dbh)
  return dbh:prepare([[SELECT hashid FROM hashes WHERE hash = ?]])
end

-- Prepared statement: list every path observed with a given hash.
local function sql_mk_path_find_by_hash(dbh)
  return dbh:prepare([[SELECT path
    FROM path_hash NATURAL JOIN paths NATURAL JOIN hashes WHERE hash = ?]])
end

----------------------------------------------------------------- }}}
------------------------------------------------ Argparse, part 1 {{{

-- Add the output-rendering flags (--unescape, --nul) to command `c`.
local function argparse_for_render(c)
  c:flag("--unescape")
    :description("Do not escape the filenames; ambiguous without --nul")
    :default(false)
  c:flag("--nul -0")
    :description("NUL-terminate lines rather than newline")
    :default(false)
end

-- Build the (hash+path, path-only) renderer pair from parsed render flags.
local function renderers_for(args)
  return cdblib.renderers_for(args.nul, args.unescape)
end

-- Add the --inul input-framing flag to command `c`; returns the flag object.
local function argparse_flag_inul(c)
  return c:flag("--inul -1")
    :description("Input is NUL-delimited, not newline")
    :default(false)
end

-- Add the --no-hashes flag to command `c`.
local function argparse_no_hashes(c)
  c:flag("--no-hashes")
    :description("Elide hashes in output; no leading space with --unescape")
end

-- Options shared by the filter commands (filh, filp).
local function argparse_for_db_filter(c)
  c:option("--predicate"):default("in")
  -- TODO: :choices({"in", "out"})
  argparse_no_hashes(c)
  argparse_for_render(c)
end

local argp = argparse("cdb", "checksum database tool")

-- global options
argp:option("--database --db")
  :args(1)
  :description("Indicate primary checksum database")

-- Commands collected per help-group name; turned into argparse groups once
-- every command has been declared.
local argp_groups = {}
local function argp_group(gname, cmd)
  argp_groups[gname] = argp_groups[gname] or {}
  table.insert(argp_groups[gname], cmd)
end

-- Declare a subcommand.  `cmdinit(cmd)` configures the argparse command;
-- `body(args, dbh)` is stored as `args.command_fn` when the command is
-- selected, and is invoked by the top-level executive.
local function mksubcmd(cmdinit, body)
  local cmd = argp:command()
  cmdinit(cmd)
  cmd:action(function(args, name)
    args.command = name
    args.command_fn = body
  end)
  return cmd
end

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: init {{{

-- init: create tables, indices and the v_path_hash view if they are missing.
mksubcmd(function(c)
    c:name("init")
      :description("Initialize the database")
    argp_group("Administrative Commands", c)
  end,
  function(args, dbh)
    local function ddo(sql) assert(dbi.Do(dbh, sql)) end
    ddo([[CREATE TABLE IF NOT EXISTS paths (
      pathid INTEGER PRIMARY KEY ASC,
      path TEXT NOT NULL UNIQUE ON CONFLICT FAIL)]])
    ddo([[CREATE TABLE IF NOT EXISTS hashes (
      hashid INTEGER PRIMARY KEY ASC,
      hash TEXT NOT NULL UNIQUE ON CONFLICT FAIL)]])
    ddo([[CREATE TABLE IF NOT EXISTS hash_hash_superseders (
      supersederid INTEGER PRIMARY KEY ASC,
      oldid INTEGER REFERENCES hashes(hashid),
      newid INTEGER REFERENCES hashes(hashid),
      note TEXT,
      UNIQUE(oldid, newid) ON CONFLICT FAIL)]])
    ddo([[CREATE INDEX IF NOT EXISTS hash_hash_superseders_idx_old
      ON hash_hash_superseders (oldid)]])
    ddo([[CREATE TABLE IF NOT EXISTS path_hash (
      pairid INTEGER PRIMARY KEY ASC,
      pathid INTEGER REFERENCES paths(pathid),
      hashid INTEGER REFERENCES hashes(hashid),
      timestamp INTEGER DEFAULT CURRENT_TIMESTAMP,
      UNIQUE(pathid, hashid) ON CONFLICT FAIL)]])
    ddo([[CREATE INDEX IF NOT EXISTS path_hash_idx_pathid
      ON path_hash (pathid)]])
    ddo([[CREATE INDEX IF NOT EXISTS path_hash_idx_hashid
      ON path_hash (hashid)]])
    ddo([[CREATE VIEW IF NOT EXISTS v_path_hash AS SELECT
      pairid, pathid, hashid, path, hash, timestamp
      FROM path_hash NATURAL JOIN paths NATURAL JOIN hashes]])
    dbh:commit()
  end)

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: stat {{{

-- stat: print row counts of hashes, paths and observations.
mksubcmd(function(c)
    c:name("stat")
      :description("Report statistics")
    argp_group("Administrative Commands", c)
  end,
  function(args, dbh)
    local nhash, npath, nobsv
    nhash = assert(sql_do(dbh, "SELECT COUNT(*) FROM hashes"   )):fetch()[1]
    npath = assert(sql_do(dbh, "SELECT COUNT(*) FROM paths"    )):fetch()[1]
    nobsv = assert(sql_do(dbh, "SELECT COUNT(*) FROM path_hash")):fetch()[1]
    print(("nhash=%d npath=%d nobsv=%d"):format(nhash, npath, nobsv))
  end)

-- gc: print (but do not run) DELETE statements for paths and hashes that no
-- observation, and for hashes no superseder row, refers to.
mksubcmd(function(c)
    c:name("gc")
      :description("Generate SQL to prune identifiers not used by observations")
    argp_group("Administrative Commands", c)
  end,
  function(args, dbh)
    local sth_paths_dead = assert(sql_do(dbh,
      [[SELECT pathid, path FROM paths
        WHERE pathid NOT IN (SELECT pathid FROM path_hash)]]))
    for p in sth_paths_dead:rows() do
      print("-- DEAD PATH", p[2])
      print(("DELETE FROM paths WHERE pathid = %d;"):format(p[1]))
    end

    local sth_hashes_dead = assert(sql_do(dbh,
      [[SELECT hashid, hash FROM hashes
        WHERE hashid NOT IN (SELECT hashid FROM path_hash)
        AND hashid NOT IN (SELECT oldid FROM hash_hash_superseders)
        AND hashid NOT IN (SELECT newid FROM hash_hash_superseders)]]))
    for h in sth_hashes_dead:rows() do
      print("-- DEAD HASH", h[2])
      print(("DELETE FROM hashes WHERE hashid = %d;"):format(h[1]))
    end
  end)

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: look {{{

-- look: print the observations whose path matches any of the given globs.
mksubcmd(function(c)
    c:name("look")
      :description("Look up checksums for path glob(s)")
    c:argument("glob", "Path globs to search"):args("+")
    argparse_for_render(c)
    argparse_no_hashes(c)
    argp_group("Queries", c)
  end,
  function(args, dbh)
    local sth = assert(dbh:prepare(
      [[SELECT hash, path FROM v_path_hash WHERE path GLOB ?]]))
    local render_both, render_path = renderers_for(args)
    local renderer = args.no_hashes
      and function(h, p) return render_path(p) end
      or render_both
    for _, glob in ipairs(args.glob) do
      -- fail loudly on a query error, as domv does
      assert(sth:execute(glob))
      for row in sth:rows() do
        io.write(renderer(table.unpack(row)))
      end
    end
  end)

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: mapp {{{

-- mapp: read exact paths from stdin and print every hash recorded for each.
mksubcmd(function(c)
    c:name("mapp")
      :description("Map paths to hashes in the database, like look")
    c:flag("--no-paths")
      :description("Print only the resulting hashes")
      :default(false)
    argparse_for_render(c)
    argparse_flag_inul(c)
    argp_group("Queries", c)
  end,
  function(args, dbh)
    local sth = assert(dbh:prepare(
      [[SELECT hash FROM v_path_hash WHERE path = ?]]))
    -- the and/or expression keeps only the first (hash+path) renderer
    local render = args.no_paths
      and function(h, p) return h, args.nul and '\0' or '\n' end
      or renderers_for(args)

    local mkiter = cdblib.iter_lines_or_nul(args.inul)
    for p in mkiter() do
      assert(sth:execute(p))
      for row in sth:rows() do
        io.write(render(row[1], p))
      end
    end
  end)

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: maph {{{

-- maph: read hashes from stdin and print every path recorded for each.
mksubcmd(function(c)
    c:name("maph")
      :description("Map hashes to paths in the database")
    argparse_for_render(c)
    argparse_flag_inul(c)
    argp_group("Queries", c)
  end,
  function(args, dbh)
    local sth = assert(dbh:prepare(
      [[SELECT path FROM v_path_hash WHERE hash = ?]]))
    local render = renderers_for(args)
    local mkiter = cdblib.iter_lines_or_nul(args.inul)
    for h in mkiter() do
      assert(sth:execute(h))
      for row in sth:rows() do
        io.write(render(h, row[1]))
      end
    end
  end)

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: cflx {{{

-- cflx: list paths that have more than one observation, with each of their
-- recorded hashes.
mksubcmd(function(c)
    c:name("cflx")
      :description("Find conflicting measurements of paths")
    argp_group("Queries", c)
  end,
  function(args, dbh)
    -- ORDER BY keeps all rows of one path adjacent; the PATH header below is
    -- printed whenever the path differs from the previous row's.
    local sth = assert(sql_do(dbh,
      [[SELECT path, hash, pairid, timestamp FROM v_path_hash NATURAL JOIN
        (SELECT pathid, pcount FROM
          (SELECT pathid, COUNT(pairid) AS pcount FROM path_hash GROUP BY pathid)
         WHERE pcount > 1)
        ORDER BY path, pairid]]))
    local lastpath = nil
    for row in sth:rows() do
      local p, h, pairid, ts = table.unpack(row)
      if p ~= lastpath then
        lastpath = p
        print("PATH", (cdblib.escape_gnu_digest(p)))
      end
      print((" observed hash %s with id %d at %s"):format(h, pairid, ts))
    end
  end)

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: addh {{{

-- addh: read GNU digest lines from stdin and record each (path, hash) pair
-- as an observation, creating path and hash rows as needed.
mksubcmd(function(c)
    c:name("addh")
      :description("Ingest digest tool output")
    c:option("--graft"):default("")
      :description("Graft a prefix to input file names")
    c:flag("--replace-paths")
      :description("Remove all existing observations of reported paths")
    argparse_flag_inul(c)
    argp_group("Updates", c)
  end,
  function(args, dbh)
    local sth_path_upsert = assert(sql_mk_path_upsert(dbh))
    local sth_hash_upsert = assert(sql_mk_hash_upsert(dbh))
    local sth_obsv_del = assert(dbh:prepare(
      [[DELETE FROM path_hash WHERE pathid = ?]]))
    local sth_obsv_upsert = assert(dbh:prepare(
      [[INSERT OR REPLACE INTO path_hash (pathid, hashid) VALUES (?, ?)]]))

    local mkiter = cdblib.iter_lines_or_nul(args.inul)

    local nadded = 0

    for h, p in cdblib.iter_gnu_digest(mkiter)() do
      p = plpath.normpath(plpath.join(args.graft, p))
      -- assert turns a failed upsert into its SQL error instead of an
      -- "attempt to index a boolean" on the [1] below
      local pid = assert(sql_run_one(sth_path_upsert, p))[1]
      -- the --replace-paths flag is stored by argparse as `replace_paths`
      if args.replace_paths then assert(sth_obsv_del:execute(pid)) end
      local hid = assert(sql_run_one(sth_hash_upsert, h))[1]
      assert(sth_obsv_upsert:execute(pid, hid))
      if sth_obsv_upsert:affected() > 0 then dbh:commit() end

      nadded = nadded + 1
      io.write(("Processed %d hashes\r"):format(nadded)); io.flush()
    end
    io.write("\n")
  end)

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: filh {{{

-- filh: pass through the stdin digest lines whose hash is (--predicate in)
-- or is not (any other --predicate value) present in the database.
mksubcmd(function(c)
    c:name("filh")
      :description("Filter digest tool lines against database by hash")
    argparse_for_db_filter(c)
    argp_group("Queries", c)
  end,
  function(args, dbh)
    local rex = (args.predicate == "in")
    local rend_both, rend_path = renderers_for(args)
    local renderer = args.no_hashes
      and function(h, p) return rend_path(p) end
      or rend_both
    local sth = assert(sql_mk_hash_find(dbh))
    for h, p in cdblib.iter_gnu_digest(cdblib.mk_lines_iter())() do
      local res, err = sql_run_one(sth, h)
      if res == false and err ~= nil then error(err) end
      if (res ~= nil) == rex then io.write(renderer(h, p)) end
    end
  end)

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: filp {{{

-- filp: like filh, but the membership test is on the path.  With
-- --just-paths the input holds bare paths rather than digest lines.
mksubcmd(function(c)
    c:name("filp")
      :description("Filter digest tool lines against database by path")
    argparse_for_db_filter(c)
    c:flag("--just-paths")
      :description("Input is a series of paths without digests")
    argparse_flag_inul(c)
    argp_group("Queries", c)
  end,
  function(args, dbh)
    local rex = (args.predicate == "in")
    local rend_both, rend_path = renderers_for(args)
    local renderer = args.no_hashes
      and function(h, p) return rend_path(p) end
      or rend_both

    local mkrecords = cdblib.iter_lines_or_nul(args.inul)
    local mkiter = args.just_paths
      and cdblib.iter_just_paths_as_digest(mkrecords)
      or cdblib.iter_gnu_digest(mkrecords)

    local sth = assert(sql_mk_path_find(dbh))
    for h, p in mkiter() do
      local res, err = sql_run_one(sth, p)
      if res == false and err ~= nil then error(err) end
      if (res ~= nil) == rex then io.write(renderer(h, p)) end
    end
  end)

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: verh {{{

-- verh: compare the stdin digest lines against the database and report
-- unknown paths, paths lacking the reported hash and, with --also-mismatch,
-- other hashes recorded for the path.
mksubcmd(function(c)
    c:name("verh")
      :description("Verify reported digests against database")
    c:option("--graft")
      :description("Graft a prefix to input file names")
      :default("")
    c:flag("--also-mismatch")
      :description("Also report other hashes associated with a path")
    argp_group("Queries", c)
  end,
  function(args, dbh)
    local sth_path_find = assert(sql_mk_path_find(dbh))
    local sth_path_find_by_hash = assert(sql_mk_path_find_by_hash(dbh))
    local sth_obsv_find_by_pathid_hash =
      assert(dbh:prepare([[SELECT pairid
        FROM path_hash NATURAL JOIN hashes WHERE pathid = ? AND hash = ?]]))
    local sth_obsv_find_by_pathid_nothash =
      assert(dbh:prepare([[SELECT hash
        FROM path_hash NATURAL JOIN hashes WHERE pathid = ? AND hash != ?]]))

    local fail = 0
    for h, p in cdblib.iter_gnu_digest(cdblib.mk_lines_iter())() do
      p = plpath.normpath(plpath.join(args.graft, p))
      -- sql_run_one yields false, err on a failed execute and nil for no row
      local pid, perr = sql_run_one(sth_path_find, p)
      if pid == false then error(perr) end
      if pid == nil then
        -- Path not in database
        print(("Path '%s' is not in database"):format(p))

        fail = fail + 1

        local is_elsewhere = false
        assert(sth_path_find_by_hash:execute(h))
        for pp in sth_path_find_by_hash:rows() do
          is_elsewhere = true
          print(("... but matching hash at '%s'"):format(pp[1]))
        end

        if not is_elsewhere then
          print(("... nor is its hash '%s'"):format(h))
        end
      else
        -- Path in database
        pid = pid[1]
        local didfail = false
        local obsvid, oerr = sql_run_one(sth_obsv_find_by_pathid_hash, pid, h)
        if obsvid == false then error(oerr) end
        if obsvid == nil then
          -- Observation not in database
          print(("Path '%s' not associated with that hash in database"):format(p))
          didfail = true
        else print("OK: ", p) -- XXX
        end
        if args.also_mismatch then
          assert(sth_obsv_find_by_pathid_nothash:execute(pid, h))
          for row in sth_obsv_find_by_pathid_nothash:rows() do
            if not didfail then
              print(("Path '%s' at expected hash, but"):format(p))
            end
            didfail = true
            print(("... additional hash '%s' in database"):format(row[1]))
          end
        end
        if didfail then fail = fail + 1 end
      end
    end
    if fail ~= 0 then print(("%d total errors"):format(fail)) end
  end)

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: diff {{{

-- diff: attach a second database as "other" and list paths and/or hashes
-- present on only one side.  --which selects the direction (sub: local not
-- in remote, super: remote not in local, symm: both).
mksubcmd(function(c)
    c:name("diff")
      :description("Compare against another database")
    c:argument("db2")
    c:option("--flavor"):default("both")
      :description("Database aspects to compare")
      -- TODO :choices("hash", "path", "both")
    c:option("--which"):default("symm")
      :description("Direction of comparison")
      -- TODO :choices("sub", "super", "symm")
    c:flag("--no-headers")
      :description("Suppress headers in output")
    argparse_for_render(c)
    argp_group("Queries", c)
  end,
  function(args, dbh)
    local rend_hash, rend_path = renderers_for(args)
    local function header(x) if not args.no_headers then print(x) end end
    assert(dbi.Do(dbh, "ATTACH DATABASE ? AS other", args.db2))

    -- Run a parameterless query and write each row through `render`.
    local function report(sql, render)
      for row in assert(sql_do(dbh, sql)):rows() do
        io.write(render(table.unpack(row)))
      end
    end

    if pltablex.find({"path", "both"}, args.flavor) then
      if pltablex.find({"sub", "symm"}, args.which) then
        header("-- Paths in local database not in remote:")
        report([[SELECT path FROM paths
          WHERE path NOT IN (SELECT path FROM other.paths)
          ORDER BY path]], rend_path)
      end
      if pltablex.find({"super", "symm"}, args.which) then
        header("-- Paths in remote database not in local:")
        report([[SELECT path FROM other.paths
          WHERE path NOT IN (SELECT path FROM paths)
          ORDER BY path]], rend_path)
      end
    end

    if pltablex.find({"hash", "both"}, args.flavor) then
      if pltablex.find({"sub", "symm"}, args.which) then
        header("-- Hashes in local database not in remote:")
        report([[SELECT hash, path
          FROM hashes NATURAL JOIN path_hash NATURAL JOIN paths
          WHERE hash NOT IN (SELECT hash FROM other.hashes)
          ORDER BY path]], rend_hash)
      end
      if pltablex.find({"super", "symm"}, args.which) then
        header("-- Hashes in remote database not in local:")
        report([[SELECT hash, path
          FROM other.hashes NATURAL JOIN other.path_hash NATURAL JOIN other.paths
          WHERE hash NOT IN (SELECT hash FROM hashes)
          ORDER BY path]], rend_hash)
      end
    end
    header("-- End of diff report")
  end)

----------------------------------------------------------------- }}}
--------------------------------------------------- Command: domv {{{

-- domv: for each path read from stdin, delete its observations when some
-- other path in the database shares one of its hashes.
mksubcmd(function(c)
    c:name("domv")
      :description("Remove given paths if hashes exist elsewhere")
    c:flag("--dry-run -n")
      :description("Do not perform deletions")
      :default(false)
    argparse_flag_inul(c)
    argp_group("Updates", c)
  end,
  function(args, dbh)
    local qsth = assert(dbh:prepare(
      [[SELECT path FROM v_path_hash
        WHERE hash IN (SELECT hash FROM v_path_hash WHERE path = ?1)
        AND path != ?1]]))
    local dsth = assert(dbh:prepare(
      [[DELETE FROM path_hash WHERE pathid IN
        (SELECT pathid FROM paths WHERE path = ?)]]))

    local mkiter = cdblib.iter_lines_or_nul(args.inul)
    for p in mkiter() do
      assert(qsth:execute(p))

      print("Trying mv:", p)

      local ok = false
      for row in qsth:rows() do
        print("Found", row[1])
        ok = true
      end

      if ok and not args.dry_run then
        assert(dsth:execute(p))
        dbh:commit()
        print("OK", dsth:affected())
      end
    end
  end)

---------------------------------------------------------------------------- }}}
--------------------------------------------------- Command Grouping, Part 2 {{{

-- Emit the known groups in a fixed order.  The names must match the strings
-- passed to argp_group() by the commands.
for _, g in ipairs{"Queries", "Updates", "Administrative Commands"} do
  argp:group(g, table.unpack(argp_groups[g] or {}))
  argp_groups[g] = nil
end

-- Any stragglers?
for k,v in pairs(argp_groups) do argp:group(k, table.unpack(v)) end

---------------------------------------------------------------------------- }}}
-------------------------------------------------------- Top-level executive {{{

local args = argp:parse()
if not args.database then error "--database is required" end

local dbh, err = dbi.Connect("SQLite3", args.database)
if not dbh then
  error ("Database error: " .. tostring(err))
end

-- command_fn was stored by the selected subcommand's action (see mksubcmd);
-- the method-call form passes `args` as its first parameter.
args:command_fn(dbh)

---------------------------------------------------------------------------- }}}
--[==[ Patch metadata between files, preserved verbatim:
diff --git a/cdb-digestrelative b/cdb-digestrelative
new file mode 100755
index 0000000..925c259
--- /dev/null
+++ b/cdb-digestrelative
@@ -0,0 +1,39 @@
#!/usr/bin/env lua5.3
]==]

-- Read a stream of GNU digest filenames and concat them together, adjusting
-- paths by prefixing the relative path of each digest file.
local argparse = require "argparse"
local plpath = require "pl.path"

local cdblib = require "cdblib"

local argp = argparse("digestmangle", "GNU digest stream mangling tool")
argp:flag("--nul -0")
  :description("NUL-terminate lines rather than newline")
  :default(false)
argp:flag("--inul -1")
  :description("Input is NUL terminated rather than newline")
  :default(false)
argp:flag("--fnul -2")
  :description("Input files are NUL terminated rather than newline")
  :default(false)

local args = argp:parse()

-- Only the first (hash + path) renderer is kept; output is always escaped.
local render = cdblib.renderers_for(args.nul, false)

-- Build an iterator factory yielding (hash, path) pairs from the open digest
-- file `f`, framed according to --fnul.
local function mklineiter(f)
  return cdblib.iter_gnu_digest(cdblib.iter_lines_or_nul(args.fnul, f))
end

-- stdin lists digest files; every path inside a digest file is re-rooted at
-- that digest file's directory before being written out.
for fileline in cdblib.iter_lines_or_nul(args.inul)() do
  local prefix = plpath.dirname(fileline)

  -- A nil handle would make the cdblib iterators fall back to io.input()
  -- and consume the file list itself, so stop on an open failure.
  local f = assert(io.open(fileline, "r"))
  for h, p in mklineiter(f)() do
    local np = plpath.normpath(plpath.join(prefix, p))
    io.write(render(h, np))
  end
  f:close()
end
--[==[ Patch metadata between files, preserved verbatim:
diff --git a/cdblib.lua b/cdblib.lua
new file mode 100644
index 0000000..a2bb2c5
--- /dev/null
+++ b/cdblib.lua
@@ -0,0 +1,135 @@
]==]
local plstringx = require "pl.stringx"

local _M = {}

-- Escape file name for GNU digest; returns new form and number, which is 0 if
-- string is unaltered and positive if escaping was necessary.
--
-- The GNU digest specification is incomplete and does not promise that all
-- file names with backslashes are escaped, though that seems to be true in
-- practice.  That is, while the tools appear to always generate the first of
-- these two options, the second appears to be permitted by documentation as
-- well: "\012...ef as\\df" and "012...ef as\df".  We follow along and always
-- escape backslashes even if there are no \r or \n characters in the rest of
-- the string.
local function escape_gnu_digest(fn)
  -- gsub returns the rewritten string and the number of substitutions made
  return fn:gsub("[\\\r\n]", {['\\']='\\\\', ['\r']='\\r', ['\n']='\\n'})
end
_M.escape_gnu_digest = escape_gnu_digest

-- The inverse transformation of escape_gnu_digest.  Applied unconditionally, so
-- please condition invocation on knowing that the line needs to be escaped.
-- Backslash pairs other than \\, \r and \n have no table entry and are left
-- as they are.  Returns the new string and the number of backslash pairs
-- examined.
local function unescape_gnu_digest(fn)
  return fn:gsub("\\.", {['\\\\']='\\', ['\\r']='\r', ['\\n']='\n'})
end
_M.unescape_gnu_digest = unescape_gnu_digest

-- Wrap a record-iterator factory so that it yields (hash, filename) pairs
-- parsed from GNU digest lines:
--   optional leading backslash, hex digest, a space, a mode character
--   (space or '*'), file name.
-- File names of lines carrying the leading backslash are unescaped.  Lines
-- that do not parse are reported on stderr and skipped; stdout is left alone
-- because callers write their results there.
-- :: (() -> () -!> string) -> () -> () -!> (string, string)
function _M.iter_gnu_digest(baseiter)
  return function() return coroutine.wrap(function()
    for line in baseiter() do
      local esc, h, fn = line:match("^(\\?)(%x*) [ *](.*)$")
      if esc == nil then
        io.stderr:write("Bad line:\t", line, "\n") -- XXX
      else
        -- parentheses/and-or keep only the string result of the unescape
        coroutine.yield(h, (esc == "") and fn or unescape_gnu_digest(fn))
      end
    end
  end) end
end

-- Wrap a record-iterator factory of bare paths so that it yields
-- ("-", path) pairs, i.e. the shape produced by iter_gnu_digest with a
-- placeholder in the hash position.
function _M.iter_just_paths_as_digest(baseiter)
  return function() return coroutine.wrap(function()
    for line in baseiter() do
      coroutine.yield("-", line)
    end
  end) end
end

-- a custom delimited string iterator, useful for nul-separated records, e.g.
-- :: (string, () -> () -!> string) -> () -> () -!> string
--
-- `baseiter()` must return an iterator over arbitrary chunks of the input.
-- The returned factory produces an iterator yielding (index, record) pairs,
-- one per `delim`-terminated record, in input order.  A non-empty fragment
-- left unterminated at end of input is yielded as a final record.
function _M.mk_delim_iter(delim, baseiter)
  local ix = 0
  -- fin: records completed by the latest chunk, in input order
  -- incomplete: pieces of the record still being assembled
  local s = { fin = {}, incomplete = {} }

  local function proc(chunk)
    local splits = plstringx.split(chunk, delim)

    if #splits == 1 then -- zero or one delimiter
      if #splits[1] == 0 then -- one delimiter (necessarily the whole string)
        if #s.incomplete > 0 then -- and a prefix exists
          s.fin = { table.concat(s.incomplete) }
          s.incomplete = {}
        end
      else -- zero delimiters
        table.insert(s.incomplete, chunk) -- grow incomplete fragment
      end
    else -- one or more delimiters
      -- the last piece follows the final delimiter: it starts the next record
      local ni = table.remove(splits)

      table.insert(s.incomplete, splits[1])
      splits[1] = table.concat(s.incomplete)
      s.fin = splits

      s.incomplete = {}
      if #ni ~= 0 then s.incomplete[1] = ni end
    end
  end

  return function() return coroutine.wrap(function()
    for chunk in baseiter() do
      proc(chunk)

      -- hand out every record this chunk completed, first to last
      local done = s.fin
      s.fin = {}
      for i = 1, #done do
        ix = ix + 1
        coroutine.yield(ix, done[i])
      end
    end

    -- input ended without a closing delimiter: do not lose the tail
    if #s.incomplete > 0 then
      local tail = table.concat(s.incomplete)
      s.incomplete = {}
      ix = ix + 1
      coroutine.yield(ix, tail)
    end
  end) end
end

-- Adapt a factory of (key, value) iterators into one that yields only the
-- values.
function _M.iter_just_2nd(baseiter)
  return function() return coroutine.wrap(function()
    for _, v in baseiter() do coroutine.yield(v) end
  end) end
end

-- Factory of iterators returning successive chunks of up to 1024 bytes from
-- `f` (the default input when `f` is nil).
-- :: (file or nil) -> () -> () -!> string
function _M.mk_read_iter(f)
  f = f or io.input()
  return function() return function() return f:read(1024) end end
end

-- Factory of line iterators over `f` (the default input when `f` is nil).
function _M.mk_lines_iter(f)
  return function() return (f or io.input()):lines() end
end

-- Iterate `f` (stdin by default) as either newline-terminated or
-- NUL-terminated records
-- :: (boolean, file or nil) -> () -!> string
function _M.iter_lines_or_nul(nul, f)
  assert(type(nul) == "boolean")
  return nul and _M.iter_just_2nd(_M.mk_delim_iter("\0", _M.mk_read_iter(f)))
              or _M.mk_lines_iter(f)
end

-- Build the two output renderers.  Each returns the pieces of one output
-- record as multiple values, ready to be passed to io.write:
--   mangle_full(hash, path) -- digest line
--   mangle_path(path)       -- path only
-- `nul` selects the record terminator (NUL or newline); `unescape` selects
-- raw paths instead of GNU-digest escaping with its leading backslash marker.
function _M.renderers_for(nul, unescape)
  assert(type(nul) == "boolean")
  assert(type(unescape) == "boolean")
  local fin = nul and '\0' or '\n'
  -- NOTE(review): the escaped path-only form is kept with the single-space
  -- separator found here; confirm whether the two-character GNU separator
  -- was intended for it as well.
  local mangle_path = unescape
    and function(p) return p, fin end
    or function(p)
         local np, nesc = escape_gnu_digest(p)
         return (nesc == 0 and "" or "\\"), " ", np, fin
       end
  -- Hash and name are separated by a space plus the text-mode marker (a
  -- second space), which is what iter_gnu_digest's pattern expects.
  local mangle_full = unescape
    and function(h, f) return "", h, "  ", f, fin end
    or function(h, f)
         local nf, nesc = escape_gnu_digest(f)
         return (nesc == 0 and "" or "\\"), h, "  ", nf, fin
       end
  return mangle_full, mangle_path
end

return _M
-- 2.50.1