Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions common/src/serialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,19 @@ impl FixedSize for u64 {
const SIZE_IN_BYTES: usize = 8;
}

impl BinarySerializable for u128 {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u128::<Endianness>(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
reader.read_u128::<Endianness>()
}
}

impl FixedSize for u128 {
const SIZE_IN_BYTES: usize = 16;
}

impl BinarySerializable for f32 {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_f32::<Endianness>(*self)
Expand Down
5 changes: 3 additions & 2 deletions fastfield_codecs/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,10 @@ mod tests {

fn get_u128_column_from_data(data: &[u128]) -> Arc<dyn Column<u128>> {
let mut out = vec![];
serialize_u128(VecColumn::from(&data), &mut out).unwrap();
let iter_gen = || data.iter().cloned();
serialize_u128(iter_gen, data.len() as u64, &mut out).unwrap();
let out = OwnedBytes::new(out);
open_u128(out).unwrap()
open_u128::<u128>(out).unwrap()
}

#[bench]
Expand Down
108 changes: 74 additions & 34 deletions fastfield_codecs/src/column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ use std::ops::RangeInclusive;

use tantivy_bitpacker::minmax;

use crate::monotonic_mapping::StrictlyMonotonicFn;

pub trait Column<T: PartialOrd = u64>: Send + Sync {
/// Return the value associated with the given idx.
///
Expand Down Expand Up @@ -143,16 +145,30 @@ struct MonotonicMappingColumn<C, T, Input> {
_phantom: PhantomData<Input>,
}

/// Creates a view of a column transformed by a monotonic mapping.
pub fn monotonic_map_column<C, T, Input: PartialOrd, Output: PartialOrd>(
/// Creates a view of a column transformed by a strictly monotonic mapping. See
/// [`StrictlyMonotonicFn`].
///
/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3]
/// monotonic_mapping.mapping() is expected to be injective, and we should always have
/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el
///
/// The inverse of the mapping is required for:
/// `fn get_between_vals(&self, range: RangeInclusive<T>) -> Vec<u64> `
/// The user provides the original value range and we need to monotonic map them in the same way the
/// serialization does before calling the underlying column.
///
/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping
/// during serialization. And therefore the monotonic_mapping_inv when opening is the same as
/// monotonic_mapping during serialization.
pub fn monotonic_map_column<C, T, Input, Output>(
from_column: C,
monotonic_mapping: T,
) -> impl Column<Output>
where
C: Column<Input>,
T: Fn(Input) -> Output + Send + Sync,
Input: Send + Sync,
Output: Send + Sync,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Clone,
{
MonotonicMappingColumn {
from_column,
Expand All @@ -161,36 +177,46 @@ where
}
}

impl<C, T, Input: PartialOrd, Output: PartialOrd> Column<Output>
for MonotonicMappingColumn<C, T, Input>
impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
where
C: Column<Input>,
T: Fn(Input) -> Output + Send + Sync,
Input: Send + Sync,
Output: Send + Sync,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Clone,
{
#[inline]
fn get_val(&self, idx: u64) -> Output {
let from_val = self.from_column.get_val(idx);
(self.monotonic_mapping)(from_val)
self.monotonic_mapping.mapping(from_val)
}

fn min_value(&self) -> Output {
let from_min_value = self.from_column.min_value();
(self.monotonic_mapping)(from_min_value)
self.monotonic_mapping.mapping(from_min_value)
}

fn max_value(&self) -> Output {
let from_max_value = self.from_column.max_value();
(self.monotonic_mapping)(from_max_value)
self.monotonic_mapping.mapping(from_max_value)
}

fn num_vals(&self) -> u64 {
self.from_column.num_vals()
}

fn iter(&self) -> Box<dyn Iterator<Item = Output> + '_> {
Box::new(self.from_column.iter().map(&self.monotonic_mapping))
Box::new(
self.from_column
.iter()
.map(|el| self.monotonic_mapping.mapping(el)),
)
}

fn get_between_vals(&self, range: RangeInclusive<Output>) -> Vec<u64> {
self.from_column.get_between_vals(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
)
}

// We voluntarily do not implement get_range as it yields a regression,
Expand Down Expand Up @@ -236,19 +262,22 @@ where
#[cfg(test)]
mod tests {
use super::*;
use crate::MonotonicallyMappableToU64;
use crate::monotonic_mapping::{
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternalBaseval,
StrictlyMonotonicMappingToInternalGCDBaseval,
};

#[test]
fn test_monotonic_mapping() {
let vals = &[1u64, 3u64][..];
let vals = &[3u64, 5u64][..];
let col = VecColumn::from(vals);
let mapped = monotonic_map_column(col, |el| el + 4);
assert_eq!(mapped.min_value(), 5u64);
assert_eq!(mapped.max_value(), 7u64);
let mapped = monotonic_map_column(col, StrictlyMonotonicMappingToInternalBaseval::new(2));
assert_eq!(mapped.min_value(), 1u64);
assert_eq!(mapped.max_value(), 3u64);
assert_eq!(mapped.num_vals(), 2);
assert_eq!(mapped.num_vals(), 2);
assert_eq!(mapped.get_val(0), 5);
assert_eq!(mapped.get_val(1), 7);
assert_eq!(mapped.get_val(0), 1);
assert_eq!(mapped.get_val(1), 3);
}

#[test]
Expand All @@ -260,31 +289,42 @@ mod tests {

#[test]
fn test_monotonic_mapping_iter() {
let vals: Vec<u64> = (-1..99).map(i64::to_u64).collect();
let vals: Vec<u64> = (10..110u64).map(|el| el * 10).collect();
let col = VecColumn::from(&vals);
let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64);
let val_i64s: Vec<i64> = mapped.iter().collect();
let mapped = monotonic_map_column(
col,
StrictlyMonotonicMappingInverter::from(
StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100),
),
);
let val_i64s: Vec<u64> = mapped.iter().collect();
for i in 0..100 {
assert_eq!(val_i64s[i as usize], mapped.get_val(i));
}
}

#[test]
fn test_monotonic_mapping_get_range() {
let vals: Vec<u64> = (-1..99).map(i64::to_u64).collect();
let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect();
let col = VecColumn::from(&vals);
let mapped = monotonic_map_column(col, |el| i64::from_u64(el) * 10i64);
assert_eq!(mapped.min_value(), -10i64);
assert_eq!(mapped.max_value(), 980i64);
let mapped = monotonic_map_column(
col,
StrictlyMonotonicMappingInverter::from(
StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 0),
),
);

assert_eq!(mapped.min_value(), 0u64);
assert_eq!(mapped.max_value(), 9900u64);
assert_eq!(mapped.num_vals(), 100);
let val_i64s: Vec<i64> = mapped.iter().collect();
assert_eq!(val_i64s.len(), 100);
let val_u64s: Vec<u64> = mapped.iter().collect();
assert_eq!(val_u64s.len(), 100);
for i in 0..100 {
assert_eq!(val_i64s[i as usize], mapped.get_val(i));
assert_eq!(val_i64s[i as usize], i64::from_u64(vals[i as usize]) * 10);
assert_eq!(val_u64s[i as usize], mapped.get_val(i));
assert_eq!(val_u64s[i as usize], vals[i as usize] * 10);
}
let mut buf = [0i64; 20];
let mut buf = [0u64; 20];
mapped.get_range(7, &mut buf[..]);
assert_eq!(&val_i64s[7..][..20], &buf);
assert_eq!(&val_u64s[7..][..20], &buf);
}
}
19 changes: 12 additions & 7 deletions fastfield_codecs/src/compact_space/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,10 +171,10 @@ pub struct IPCodecParams {

impl CompactSpaceCompressor {
/// Taking the vals as Vec may cost a lot of memory. It is used to sort the vals.
pub fn train_from(column: &impl Column<u128>) -> Self {
pub fn train_from(iter: impl Iterator<Item = u128>, num_vals: u64) -> Self {
Comment thread
PSeitz marked this conversation as resolved.
let mut values_sorted = BTreeSet::new();
values_sorted.extend(column.iter());
let total_num_values = column.num_vals();
values_sorted.extend(iter);
let total_num_values = num_vals;

let compact_space =
get_compact_space(&values_sorted, total_num_values, COST_PER_BLANK_IN_BITS);
Expand Down Expand Up @@ -443,7 +443,7 @@ impl CompactSpaceDecompressor {
mod tests {

use super::*;
use crate::{open_u128, serialize_u128, VecColumn};
use crate::{open_u128, serialize_u128};

#[test]
fn compact_space_test() {
Expand Down Expand Up @@ -513,7 +513,12 @@ mod tests {

fn test_aux_vals(u128_vals: &[u128]) -> OwnedBytes {
let mut out = Vec::new();
serialize_u128(VecColumn::from(u128_vals), &mut out).unwrap();
serialize_u128(
|| u128_vals.iter().cloned(),
u128_vals.len() as u64,
&mut out,
)
.unwrap();

let data = OwnedBytes::new(out);
test_all(data.clone(), u128_vals);
Expand Down Expand Up @@ -603,8 +608,8 @@ mod tests {
5_000_000_000,
];
let mut out = Vec::new();
serialize_u128(VecColumn::from(vals), &mut out).unwrap();
let decomp = open_u128(OwnedBytes::new(out)).unwrap();
serialize_u128(|| vals.iter().cloned(), vals.len() as u64, &mut out).unwrap();
let decomp = open_u128::<u128>(OwnedBytes::new(out)).unwrap();

assert_eq!(decomp.get_between_vals(199..=200), vec![0]);
assert_eq!(decomp.get_between_vals(199..=201), vec![0, 1]);
Expand Down
42 changes: 35 additions & 7 deletions fastfield_codecs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ use std::sync::Arc;

use common::BinarySerializable;
use compact_space::CompactSpaceDecompressor;
use monotonic_mapping::{
StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
};
use ownedbytes::OwnedBytes;
use serialize::Header;

Expand All @@ -22,6 +26,7 @@ mod compact_space;
mod line;
mod linear;
mod monotonic_mapping;
mod monotonic_mapping_u128;

mod column;
mod gcd;
Expand All @@ -31,7 +36,8 @@ use self::bitpacked::BitpackedCodec;
use self::blockwise_linear::BlockwiseLinearCodec;
pub use self::column::{monotonic_map_column, Column, VecColumn};
use self::linear::LinearCodec;
pub use self::monotonic_mapping::MonotonicallyMappableToU64;
pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
pub use self::serialize::{
estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader,
};
Expand Down Expand Up @@ -73,8 +79,13 @@ impl FastFieldCodecType {
}

/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<u128>>> {
Ok(Arc::new(CompactSpaceDecompressor::open(bytes)?))
pub fn open_u128<Item: MonotonicallyMappableToU128>(
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<Item>>> {
let reader = CompactSpaceDecompressor::open(bytes)?;
let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
StrictlyMonotonicMappingToInternal::<Item>::new().into();
Ok(Arc::new(monotonic_map_column(reader, inverted)))
}

/// Returns the correct codec reader wrapped in the `Arc` for the data.
Expand All @@ -99,11 +110,15 @@ fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
let reader = C::open_from_bytes(bytes, normalized_header)?;
let min_value = header.min_value;
if let Some(gcd) = header.gcd {
let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val * gcd.get());
Ok(Arc::new(monotonic_map_column(reader, monotonic_mapping)))
let mapping = StrictlyMonotonicMappingInverter::from(
StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value),
);
Ok(Arc::new(monotonic_map_column(reader, mapping)))
} else {
let monotonic_mapping = move |val: u64| Item::from_u64(min_value + val);
Ok(Arc::new(monotonic_map_column(reader, monotonic_mapping)))
let mapping = StrictlyMonotonicMappingInverter::from(
StrictlyMonotonicMappingToInternalBaseval::new(min_value),
);
Ok(Arc::new(monotonic_map_column(reader, mapping)))
}
}

Expand Down Expand Up @@ -143,6 +158,7 @@ pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [

#[cfg(test)]
mod tests {

use proptest::prelude::*;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
Expand Down Expand Up @@ -177,6 +193,18 @@ mod tests {
`{data:?}`",
);
}

if !data.is_empty() {
let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1);
let expected_positions: Vec<u64> = data
.iter()
.enumerate()
.filter(|(_, el)| **el == data[test_rand_idx])
.map(|(pos, _)| pos as u64)
.collect();
let positions = reader.get_between_vals(data[test_rand_idx]..=data[test_rand_idx]);
assert_eq!(expected_positions, positions);
}
Some((estimation, actual_compression))
}

Expand Down
9 changes: 6 additions & 3 deletions fastfield_codecs/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ fn bench_ip() {
{
let mut data = vec![];
for dataset in dataset.chunks(500_000) {
serialize_u128(VecColumn::from(dataset), &mut data).unwrap();
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
}
let compression = data.len() as f64 / (dataset.len() * 16) as f64;
println!("Compression 50_000 chunks {:.4}", compression);
Expand All @@ -101,7 +101,10 @@ fn bench_ip() {
}

let mut data = vec![];
serialize_u128(VecColumn::from(&dataset), &mut data).unwrap();
{
print_time!("creation");
serialize_u128(|| dataset.iter().cloned(), dataset.len() as u64, &mut data).unwrap();
}

let compression = data.len() as f64 / (dataset.len() * 16) as f64;
println!("Compression {:.2}", compression);
Expand All @@ -110,7 +113,7 @@ fn bench_ip() {
(data.len() * 8) as f32 / dataset.len() as f32
);

let decompressor = open_u128(OwnedBytes::new(data)).unwrap();
let decompressor = open_u128::<u128>(OwnedBytes::new(data)).unwrap();
// Sample some ranges
for value in dataset.iter().take(1110).skip(1100).cloned() {
print_time!("get range");
Expand Down
Loading