use base64::prelude::*; pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) { let (sign, magnitude): (bool, u32) = if n >= 0 { (false, n.try_into().unwrap()) } else { (true, (-n).try_into().unwrap()) }; // zig-zag encoding let value: u32 = (magnitude << 1) | (if sign { 1 } else { 0 }); // Self-terminating hex use capital letters for everything but the // least significant digit, which is lowercase. For example, decimal 17 // would be `` Aa `` if zig-zag encoding weren't used. // // Zig-zag encoding, however, stores the sign bit as the last bit. // This means, in the last hexit, 1 is actually `c`, -1 is `b` // (`a` is the imaginary -0), and, because all the bits are shifted // by one, `` A` `` is actually 8 and `` Aa `` is -8. // // https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html // describes the encoding in more detail. let mut shift: u32 = 28; let mut mask: u32 = 0xF0_00_00_00; // first skip leading zeroes while shift < 32 { let hexit = (value & mask) >> shift; if hexit != 0 || shift == 0 { break; } shift = shift.wrapping_sub(4); mask >>= 4; } // now write the rest while shift < 32 { let hexit = (value & mask) >> shift; let hex = char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap(); string.push(hex); shift = shift.wrapping_sub(4); mask >>= 4; } } // Used during bitmap encoding enum Container { /// number of ones, bits Bits(Box<[u64; 1024]>), /// list of entries Array(Vec), /// list of (start, len-1) Run(Vec<(u16, u16)>), } impl Container { fn popcount(&self) -> u32 { match self { Container::Bits(bits) => bits.iter().copied().map(|x| x.count_ones()).sum(), Container::Array(array) => { array.len().try_into().expect("array can't be bigger than 2**32") } Container::Run(runs) => { runs.iter().copied().map(|(_, lenm1)| u32::from(lenm1) + 1).sum() } } } fn push(&mut self, value: u16) { match self { Container::Bits(bits) => bits[value as usize >> 6] |= 1 << (value & 0x3F), Container::Array(array) => { array.push(value); if array.len() >= 4096 { let array = std::mem::take(array); *self = Container::Bits(Box::new([0; 1024])); for value in array { self.push(value); } } } Container::Run(runs) => { if let Some(r) = runs.last_mut() && r.0 + r.1 + 1 == value { r.1 += 1; } else { runs.push((value, 0)); } } } } fn try_make_run(&mut self) -> bool { match self { Container::Bits(bits) => { let mut r: u64 = 0; for (i, chunk) in bits.iter().copied().enumerate() { let next_chunk = i.checked_add(1).and_then(|i| bits.get(i)).copied().unwrap_or(0); r += !chunk & u64::from((chunk << 1).count_ones()); r += !next_chunk & u64::from((chunk >> 63).count_ones()); } if (2 + 4 * r) >= 8192 { return false; } let bits = std::mem::replace(bits, Box::new([0; 1024])); *self = Container::Run(Vec::new()); for (i, bits) in bits.iter().copied().enumerate() { if bits == 0 { continue; } for j in 0..64 { let value = (u16::try_from(i).unwrap() << 6) | j; if bits & (1 << j) != 0 { self.push(value); } } } true } Container::Array(array) if array.len() <= 5 => false, Container::Array(array) => { let mut r = 0; let mut prev = None; for value in array.iter().copied() { if value.checked_sub(1) != prev { r += 1; } prev = Some(value); } if 2 + 4 * r >= 2 * array.len() + 2 { return false; } let array = std::mem::take(array); *self = Container::Run(Vec::new()); for value in array { self.push(value); } true } Container::Run(_) => true, } } } // checked against roaring-rs in // https://gitlab.com/notriddle/roaring-test pub(crate) fn write_bitmap_to_bytes( domain: &[u32], mut out: impl std::io::Write, ) -> std::io::Result<()> { // https://arxiv.org/pdf/1603.06549.pdf let mut keys = Vec::::new(); let mut containers = Vec::::new(); let mut key: u16; let mut domain_iter = domain.iter().copied().peekable(); let mut has_run = false; while let Some(entry) = domain_iter.next() { key = (entry >> 16).try_into().expect("shifted off the top 16 bits, so it should fit"); let value: u16 = (entry & 0x00_00_FF_FF).try_into().expect("AND 16 bits, so it should fit"); let mut container = Container::Array(vec![value]); while let Some(entry) = domain_iter.peek().copied() { let entry_key: u16 = (entry >> 16).try_into().expect("shifted off the top 16 bits, so it should fit"); if entry_key != key { break; } domain_iter.next().expect("peeking just succeeded"); container .push((entry & 0x00_00_FF_FF).try_into().expect("AND 16 bits, so it should fit")); } keys.push(key); has_run = container.try_make_run() || has_run; containers.push(container); } // https://github.com/RoaringBitmap/RoaringFormatSpec const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; const SERIAL_COOKIE: u32 = 12347; const NO_OFFSET_THRESHOLD: u32 = 4; let size: u32 = containers.len().try_into().unwrap(); let start_offset = if has_run { out.write_all(&u32::to_le_bytes(SERIAL_COOKIE | ((size - 1) << 16)))?; for set in containers.chunks(8) { let mut b = 0; for (i, container) in set.iter().enumerate() { if matches!(container, &Container::Run(..)) { b |= 1 << i; } } out.write_all(&[b])?; } if size < NO_OFFSET_THRESHOLD { 4 + 4 * size + size.div_ceil(8) } else { 4 + 8 * size + size.div_ceil(8) } } else { out.write_all(&u32::to_le_bytes(SERIAL_COOKIE_NO_RUNCONTAINER))?; out.write_all(&u32::to_le_bytes(containers.len().try_into().unwrap()))?; 4 + 4 + 4 * size + 4 * size }; for (&key, container) in keys.iter().zip(&containers) { // descriptive header let key: u32 = key.into(); let count: u32 = container.popcount() - 1; out.write_all(&u32::to_le_bytes((count << 16) | key))?; } if !has_run || size >= NO_OFFSET_THRESHOLD { // offset header let mut starting_offset = start_offset; for container in &containers { out.write_all(&u32::to_le_bytes(starting_offset))?; starting_offset += match container { Container::Bits(_) => 8192u32, Container::Array(array) => u32::try_from(array.len()).unwrap() * 2, Container::Run(runs) => 2 + u32::try_from(runs.len()).unwrap() * 4, }; } } for container in &containers { match container { Container::Bits(bits) => { for chunk in bits.iter() { out.write_all(&u64::to_le_bytes(*chunk))?; } } Container::Array(array) => { for value in array.iter() { out.write_all(&u16::to_le_bytes(*value))?; } } Container::Run(runs) => { out.write_all(&u16::to_le_bytes(runs.len().try_into().unwrap()))?; for (start, lenm1) in runs.iter().copied() { out.write_all(&u16::to_le_bytes(start))?; out.write_all(&u16::to_le_bytes(lenm1))?; } } } } Ok(()) } pub(crate) fn bitmap_to_string(domain: &[u32]) -> String { let mut buf = Vec::new(); let mut strbuf = String::new(); write_bitmap_to_bytes(domain, &mut buf).unwrap(); BASE64_STANDARD.encode_string(&buf, &mut strbuf); strbuf }