very simple bloom filters in elixir:
defmodule Bloom do
defstruct [:bit_array, :size, :hash_count]
# it is not in there -> that's for sure.
# it might be there
def new(size, hash_count) do
bit_array = :binary.copy(<<0>>, div(size + 7, 8)) |> dbg()
%__MODULE__{
bit_array: bit_array,
size: size,
hash_count: hash_count
}
end
defp set_bit(bit_array, index) do
byte_index = div(index, 8)
bit_offset = rem(index, 8)
<> = bit_array
updated_byte = Bitwise.bor(byte, Bitwise.bsl(1, 7 - bit_offset))
<>
end
defp get_bit(bit_array, index) do
byte_index = div(index, 8)
bit_offset = rem(index, 8)
<<_::binary-size(byte_index), byte, _::binary>> = bit_array
Bitwise.band(byte, Bitwise.bsl(1, 7 - bit_offset)) != 0
end
def k_hashes(data, k, bit_array_size) do
# for i <- 0..(k - 1) do
# :crypto.hash(:sha256, "#{i}:#{data}")
# |> :binary.decode_unsigned()
# |> rem(bit_array_size)
# end
# Kirsch-Mitzenmacher optimization
h1 = :crypto.hash(:sha256, data) |> :binary.decode_unsigned()
h2 = :crypto.hash(:sha256, "salt" <> data) |> :binary.decode_unsigned()
for i <- 0..(k - 1) do
rem(h1 + i * h2, bit_array_size)
end
end
def insert(%__MODULE__{} = bloom, value) do
indices = k_hashes(value, bloom.hash_count, bloom.size)
updated_array =
Enum.reduce(indices, bloom.bit_array, fn index, acc ->
set_bit(acc, index)
end)
%{bloom | bit_array: updated_array}
end
def contains?(%__MODULE__{} = bloom, value) do
indices = k_hashes(value, bloom.hash_count, bloom.size)
Enum.all?(indices, fn index ->
get_bit(bloom.bit_array, index)
end)
end
end