#!/usr/local/bin/perl
## $Id: bloomer,v 1.10 1994/10/20 23:54:31 mikew Exp $
## ========================================================================
## bloomer -- Bloom-filter maintenance/lookup
## Author          : Mike Williams <mikew@gopher.dosli.govt.nz>
## ========================================================================

# Copyright (C) 1994 Mike Williams
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# bloomer maintains and does lookups on a Bloom-filter, which is a
# probabilistic word-membership checker, implemented as a large bitmap.
# 
# When loading words into the filter, each word is hashed using a number of
# independent hash-functions, generating a number of discrete values.  Each
# of these is used as an index into the bitmap, and the corresponding bits
# are set.
# 
# To perform a lookup, the candidate word is hashed using the same hash-
# functions, and the corresponding locations in the bitmap are checked.  If
# any of the bits are false, we can be sure the candidate word wasn't
# present in the original word-list.  If all bits are set, it is assumed
# that the word was (probably) present.
# 
# The accuracy of the Bloom-filter depends on the size of the bitmap, the
# number of hash-functions used, and the number of words entered into the
# filter.  To be more concrete, if
# 
#   h = number of hash-functions
#   N = bitmap (hash table) size
#   w = number of words entered
#   u = proportion of bits left unset
#   b = probability of a "bogus" hit
#   
# then	u = (1 - (h/N)) ^ w		
#  and 	b = (1 - u) ^ h
# 
# eg. let h = 7		(7 hash functions)
#         N = 524280 	(64k bitmap)
#         w = 24474	(/usr/dict/words, 200k)
#         
#    then u = 0.7212
#         b = 0.000131	(ie. 0.013% chance of a bogus hit)

#=== Usage ================================================================

# bloomer -n <filter> <size> <var>
#   Creates a new Bloom-filter file <filter>, using parameters <size> for
#   size of filter, and <var> for number of hash functions to use.  The
#   <size> parameter may be specified with the suffix "k", in which case
#   it specifies a file-size (in kilobytes) rather than a bitmap-size (in
#   bits).

# bloomer -a <filter> 	
#   Reads words from stdin and inserts them into the Bloom-filter file
#   <filter>.  Words are echoed as they are added.  With the "-v" option,
#   hash values for the words are printed as well.

# bloomer <filter> 	
#   Reads words from stdin, echoing the ones that are matched by the filter
#   to stdout.  With the "-v" option, hash values for the words are printed
#   as well.

# bloomer -i <filter> 	
#   Shows info about the Bloom-filter file <filter>, showing the <size> and
#   <var> parameters it was created with, and calculating the percentage of
#   bits set in the filter.  With the "-v" option, it also prints each
#   bitmap location that is set.

# Usage message, printed (via die) whenever the command line is malformed.
# Note that -t does not take a filter file: it takes the <size> and <nvar>
# parameters directly and only prints the hash values each word would get.
$usage = <<EOF;
usage: bloomer [-v] -n <filter> <size> <nvar> 	create a new filter
       bloomer [-v] -a <filter> 		add words from STDIN
       bloomer [-v] <filter> 			test words from STDIN
       bloomer -t <size> <nvar> 		show hash values of words from STDIN
       bloomer [-v] -i <filter> 		show info
EOF

# syscall.ph supplies &SYS_lseek, which the lseek helper in this file uses.
require 'syscall.ph';

# Getopt::Std replaces the Perl 4 library getopts.pl, which is no longer
# shipped with perl.  getopts() sets the same $opt_n / $opt_a / $opt_i /
# $opt_v / $opt_t globals that &Getopts() did, and likewise returns false
# when it meets an unknown switch.
use Getopt::Std;
getopts('naivt') || die $usage;

#=== Initialisation =======================================================

$HSIZE = 8;			# Size (in bytes) of filter-file header
$MAGIC = 'BF1';			# Magic-string

# Filter-file layout: a $HSIZE-byte header (the 3-byte magic string, one
# byte holding the number of hash variants, then the bitmap size in bits as
# a 32-bit big-endian integer), followed by the bitmap itself.

# Convert a <size> argument into a bitmap size in bits.  A plain number is
# returned unchanged; "<n>k" is read as a total file size of n kilobytes,
# from which the header is subtracted before converting bytes to bits.
sub parseSize
{
    my ($size) = @_;
    if ($size =~ /^([0-9]+)k$/i) {
	$size = ((int($1) * 1024) - $HSIZE) * 8;
    }
    return $size;
}

# Die unless $size and $variants are positive whole numbers.  When $stored
# is true the values must also fit the header fields they are packed into
# (32 bits for the size, one byte for the variant count).
sub checkParams
{
    my ($size, $variants, $stored) = @_;

    die "bloomer: <size> must be a positive whole number\n$usage"
	unless ($size =~ /^[0-9]+$/ && $size > 0);
    die "bloomer: <nvar> must be a positive whole number\n$usage"
	unless ($variants =~ /^[0-9]+$/ && $variants > 0);

    if ($stored) {
	die "bloomer: <size> does not fit the 32-bit header field\n"
	    if ($size > 0xffffffff);
	die "bloomer: <nvar> must not exceed 255\n"
	    if ($variants > 255);
    }
}

if ($opt_t) {			# Test mode: no filter file, hashing only

    die $usage unless (@ARGV == 2);
    ($SIZE, $VARIANTS) = @ARGV;
    $SIZE = &parseSize($SIZE);
    &checkParams($SIZE, $VARIANTS, 0);

} elsif ($opt_n) {		# New file

    die $usage unless (@ARGV == 3);
    ($filter, $SIZE, $VARIANTS) = @ARGV;
    $SIZE = &parseSize($SIZE);
    &checkParams($SIZE, $VARIANTS, 1);

    # Create filter file (read/write, truncated)
    unlink($filter);
    open (FILTER, '+>', $filter) || die "$filter: $!\n";

    # Write header
    $buffer = pack ('a3CN', $MAGIC, $VARIANTS, $SIZE);
    (syswrite (FILTER, $buffer, $HSIZE) == $HSIZE) || die "$filter: $!\n";

    # Quit unless -a or -i are specified
    exit(0) unless ($opt_a || $opt_i);

} else {			# Open existing file

    die $usage unless (@ARGV == 1);
    ($filter) = @ARGV;
    die "$filter: no such file\n" unless (-f $filter);

    # Adding words needs read/write access WITHOUT append mode: with an
    # append-mode handle every write goes to end-of-file no matter where
    # the preceding seek left the file position, so bits could never be
    # set in place.
    open (FILTER, ($opt_a ? '+<' : '<'), $filter) || die "$filter: $!\n";

    # Read and verify the header
    my $got = sysread (FILTER, $buffer, $HSIZE);
    die "$filter: $!\n" unless (defined($got));
    die "$filter: file too short to hold a filter header\n"
	unless ($got == $HSIZE);
    die "bad magic number in file $filter\n"
	unless (substr($buffer,0,3) eq $MAGIC);
    ($VARIANTS) = unpack ('C', substr($buffer,3,1));
    ($SIZE)     = unpack ('N', substr($buffer,4,4));

    # A zero size would make every later "% $SIZE" a fatal error.
    die "$filter: corrupt header (size $SIZE, variants $VARIANTS)\n"
	unless ($SIZE > 0 && $VARIANTS > 0);

}

#=== Hash function ========================================================

# Work out the shift limit used by &hash: the smallest count (never less
# than 1) for which (1 << $maxShift) is no longer below $SIZE.
$maxShift = 1;
$maxShift++ while ((1 << $maxShift) < $SIZE);

# Rotate a byte value (0..255) left by $n bit positions, $n being taken
# modulo 8.  Returns the rotated byte.
sub rotl
{
    my ($byte, $n) = @_;

    my $wide    = int($byte << ($n % 8));
    my $wrapped = ($wide & 0xff00) >> 8;	# bits shifted past bit 7
    my $kept    = $wide & 0xff;			# bits still inside the byte

    return $wrapped | $kept;
}

# Hash $str with hash-function number $variant (default 1) and return a
# bit index in the range 0 .. $SIZE-1.
#
# Each character code is rotated left by $variant bits, shifted left by a
# running amount and added to the total.  The running shift grows by
# 8 - int($variant / 8) per character and wraps modulo $maxShift.
sub hash
{
    my ($str, $variant) = @_;
    $variant = 1 unless (defined($variant));

    my $hashval;
    my $shift = 0;
    my $step  = 8 - int($variant / 8);

    foreach my $code (unpack('C*', $str)) {
	# Rotate the character, shift it over, and accumulate
	$hashval += (&rotl($code, $variant) << $shift);

	# Advance the shift for the next character
	$shift = ($shift + $step) % $maxShift;
    }

    return ($hashval % $SIZE);
}

#=== Set/get values =======================================================

# Position the FILTER handle at absolute byte offset $byte, for use with
# sysread/syswrite.
#
# Uses the builtin sysseek rather than syscall(&SYS_lseek, ...): sysseek
# needs no generated syscall.ph and is not limited by syscall()'s passing
# of numeric arguments as C ints.  Returns the new position (the string
# "0 but true" for position zero) or undef on failure.
sub lseek
{
    my ($byte) = @_;
    return sysseek (FILTER, $byte, 0);
}

# Set bit number $bit of the bitmap held in the FILTER file.
#
# The byte containing the bit is read, modified and written back in place.
# Reading at or past end-of-file yields an empty string, which vec() then
# extends to a single byte, so the bitmap grows on demand.  Dies on an I/O
# error instead of silently dropping the update.
sub setBit
{
    my ($bit) = @_;
    my ($offset, $relbit) = (int($bit/8)+$HSIZE, ($bit%8));

    my $byte;
    &lseek ($offset);
    defined(sysread (FILTER, $byte, 1))
	|| die "bloomer: cannot read filter byte $offset: $!\n";

    vec($byte,$relbit,1) = 1;

    &lseek ($offset);
    (syswrite (FILTER, $byte, 1) == 1)
	|| die "bloomer: cannot write filter byte $offset: $!\n";
}

# Return the value (0 or 1) of bit number $bit of the bitmap held in the
# FILTER file.
#
# A position at or past end-of-file reads as an empty string and therefore
# as a clear bit.  A genuine read error is fatal: treating it as a clear
# bit would report a stored word as absent.
sub getBit
{
    my ($bit) = @_;
    my ($offset, $relbit) = (int($bit/8)+$HSIZE, ($bit%8));

    my $byte;
    &lseek ($offset);
    defined(sysread (FILTER, $byte, 1))
	|| die "bloomer: cannot read filter byte $offset: $!\n";

    return vec($byte,$relbit,1);
}

#=== Main loop ============================================================

$| = 1;				# Unbuffered output

if ($opt_i) {

    # Info mode: report the header parameters, then walk the bitmap one
    # byte at a time counting (and with -v, listing) the bits that are set.
    print "SIZE = $SIZE; VARIANTS = $VARIANTS\n";
    seek (FILTER, 0, 0);
    my $byte;
    read (FILTER, $byte, $HSIZE);	# skip the header
    my ($hits, $offset) = (0, 0);
    while (read (FILTER, $byte, 1)) {
	for my $bit (0 .. 7) {
	    if (vec($byte,$bit,1)) {
		printf ("%6d\n", $offset*8+$bit) if ($opt_v);
		$hits++;
	    }
	}
	$offset++;
    }
    # Guard the division so a zero size cannot abort the report
    printf ("filter is %.2f%% full\n", ($SIZE ? 100*$hits/$SIZE : 0));

} else {

    # Show the word being looked up on the terminal only when doing verbose
    # lookups with input redirected and STDERR attached to a tty.
    my $progress = ($opt_v && !$opt_a && !$opt_t
		    && !(-t STDIN) && (-t STDERR));

  WORD:
    while (<STDIN>) {
	chomp;			# chop would eat the last character of a
				# final line that has no newline
	s/#.*//;		# ignore comments
	next if /^\s*$/;	# skip blank lines

	print STDERR "$_             \r" if ($progress);

	my @hashes = ();
	for my $variant (1 .. $VARIANTS) {
	    my $value = &hash($_, $variant);
	    push (@hashes, $value);

	    # -t has no filter file open: only the hash values are wanted
	    next if ($opt_t);

	    if ($opt_a) {
		&setBit($value);
	    } else {
		# One clear bit proves the word was never added
		&getBit($value) || next WORD;
	    }
	}

	print "$_";
	print "\t", join (',', @hashes) if ($opt_t || $opt_v);
	print "\n";
    }
}

##=== END of bloomer ======================================================
