Workspaces
Sunday 2025-05-11 (21 days ago)

very simple bloom filters in elixir:

defmodule Bloom do
  defstruct [:bit_array, :size, :hash_count]

  # it is not in there -> that's for sure.
  # it might be there

  def new(size, hash_count) do
    bit_array = :binary.copy(<<0>>, div(size + 7, 8)) |> dbg()

    %__MODULE__{
      bit_array: bit_array,
      size: size,
      hash_count: hash_count
    }
  end

  defp set_bit(bit_array, index) do
    byte_index = div(index, 8)
    bit_offset = rem(index, 8)

    <> = bit_array
    updated_byte = Bitwise.bor(byte, Bitwise.bsl(1, 7 - bit_offset))
    <>
  end

  defp get_bit(bit_array, index) do
    byte_index = div(index, 8)
    bit_offset = rem(index, 8)
    <<_::binary-size(byte_index), byte, _::binary>> = bit_array
    Bitwise.band(byte, Bitwise.bsl(1, 7 - bit_offset)) != 0
  end

  def k_hashes(data, k, bit_array_size) do
    # for i <- 0..(k - 1) do
    #   :crypto.hash(:sha256, "#{i}:#{data}")
    #   |> :binary.decode_unsigned()
    #   |> rem(bit_array_size)
    # end

    # Kirsch-Mitzenmacher optimization
    h1 = :crypto.hash(:sha256, data) |> :binary.decode_unsigned()
    h2 = :crypto.hash(:sha256, "salt" <> data) |> :binary.decode_unsigned()

    for i <- 0..(k - 1) do
      rem(h1 + i * h2, bit_array_size)
    end
  end

  def insert(%__MODULE__{} = bloom, value) do
    indices = k_hashes(value, bloom.hash_count, bloom.size)

    updated_array =
      Enum.reduce(indices, bloom.bit_array, fn index, acc ->
        set_bit(acc, index)
      end)

    %{bloom | bit_array: updated_array}
  end

  def contains?(%__MODULE__{} = bloom, value) do
    indices = k_hashes(value, bloom.hash_count, bloom.size)

    Enum.all?(indices, fn index ->
      get_bit(bloom.bit_array, index)
    end)
  end
end