From 86b9550811b359a0af7a93523fbd14c5cdcc635f Mon Sep 17 00:00:00 2001 From: Michael Howell Date: Sat, 30 Dec 2023 20:56:59 -0700 Subject: rustdoc-search: tighter encoding for `f` index Two optimizations for the function signature search: * Instead of using JSON arrays, like `[1,20]`, it uses VLQ hex with no commas, like `[aAd]`. * This also adds backrefs: if you have more than one function with exactly the same signature, it'll not only store it once, it'll *decode* it once, and store in the typeIdMap only once. Size change ----------- standard library ```console $ du -bs search-index-old.js search-index-new.js 4976370 search-index-old.js 4404391 search-index-new.js ``` ((4976370-4404391)/4404391)*100% = 12.9% Benchmarks are similarly shrunk: ```console $ du -hs tmp/{arti,cortex-m,sqlx,stm32f4,ripgrep}/toolchain_{old,new}/doc/search-index.js 10555067 tmp/arti/toolchain_old/doc/search-index.js 8921236 tmp/arti/toolchain_new/doc/search-index.js 77018 tmp/cortex-m/toolchain_old/doc/search-index.js 66676 tmp/cortex-m/toolchain_new/doc/search-index.js 2876330 tmp/sqlx/toolchain_old/doc/search-index.js 2436812 tmp/sqlx/toolchain_new/doc/search-index.js 63632890 tmp/stm32f4/toolchain_old/doc/search-index.js 52337438 tmp/stm32f4/toolchain_new/doc/search-index.js 631150 tmp/ripgrep/toolchain_old/doc/search-index.js 541646 tmp/ripgrep/toolchain_new/doc/search-index.js ``` --- src/librustdoc/html/render/mod.rs | 146 +++++++++++++++++++---------- src/librustdoc/html/render/search_index.rs | 29 ++---- 2 files changed, 104 insertions(+), 71 deletions(-) (limited to 'src/librustdoc/html/render') diff --git a/src/librustdoc/html/render/mod.rs b/src/librustdoc/html/render/mod.rs index 118c6eeb289..345a47a21b8 100644 --- a/src/librustdoc/html/render/mod.rs +++ b/src/librustdoc/html/render/mod.rs @@ -58,7 +58,7 @@ use rustc_span::{ symbol::{sym, Symbol}, BytePos, FileName, RealFileName, }; -use serde::ser::{SerializeMap, SerializeSeq}; +use serde::ser::SerializeMap; use serde::{Serialize, Serializer}; use crate::clean::{self, ItemId, RenderedLink, SelfTy}; @@ -123,44 +123,53 @@ pub(crate) struct IndexItem { } /// A type used for the search index. -#[derive(Debug)] +#[derive(Debug, Eq, PartialEq)] pub(crate) struct RenderType { id: Option, generics: Option>, bindings: Option)>>, } -impl Serialize for RenderType { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let id = match &self.id { - // 0 is a sentinel, everything else is one-indexed - None => 0, - // concrete type - Some(RenderTypeId::Index(idx)) if *idx >= 0 => idx + 1, - // generic type parameter - Some(RenderTypeId::Index(idx)) => *idx, - _ => panic!("must convert render types to indexes before serializing"), - }; +impl RenderType { + pub fn write_to_string(&self, string: &mut String) { if self.generics.is_some() || self.bindings.is_some() { - let mut seq = serializer.serialize_seq(None)?; - seq.serialize_element(&id)?; - seq.serialize_element(self.generics.as_ref().map(Vec::as_slice).unwrap_or_default())?; + string.push('{'); + // 0 is a sentinel, everything else is one-indexed + match self.id { + Some(id) => id.write_to_string(string), + None => string.push('`'), + } + string.push('{'); + for generic in &self.generics.as_ref().map(Vec::as_slice).unwrap_or_default()[..] { + generic.write_to_string(string); + } + string.push('}'); if self.bindings.is_some() { - seq.serialize_element( - self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default(), - )?; + string.push('{'); + for binding in &self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default()[..] { + string.push('{'); + binding.0.write_to_string(string); + string.push('{'); + for constraint in &binding.1[..] { + constraint.write_to_string(string); + } + string.push('}'); + string.push('}'); + } + string.push('}'); } - seq.end() + string.push('}'); } else { - id.serialize(serializer) + // 0 is a sentinel, everything else is one-indexed + match self.id { + Some(id) => id.write_to_string(string), + None => string.push('`'), + } } } } -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] pub(crate) enum RenderTypeId { DefId(DefId), Primitive(clean::PrimitiveType), @@ -168,36 +177,50 @@ pub(crate) enum RenderTypeId { Index(isize), } -impl Serialize for RenderTypeId { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let id = match &self { +impl RenderTypeId { + pub fn write_to_string(&self, string: &mut String) { + // (sign, value) + let (sign, id): (bool, u32) = match &self { // 0 is a sentinel, everything else is one-indexed // concrete type - RenderTypeId::Index(idx) if *idx >= 0 => idx + 1, + RenderTypeId::Index(idx) if *idx >= 0 => (false, (idx + 1isize).try_into().unwrap()), // generic type parameter - RenderTypeId::Index(idx) => *idx, + RenderTypeId::Index(idx) => (true, (-*idx).try_into().unwrap()), _ => panic!("must convert render types to indexes before serializing"), }; - id.serialize(serializer) + // zig-zag notation + let value: u32 = (id << 1) | (if sign { 1 } else { 0 }); + // encode + let mut shift: u32 = 28; + let mut mask: u32 = 0xF0_00_00_00; + while shift < 32 { + let hexit = (value & mask) >> shift; + if hexit != 0 || shift == 0 { + let hex = + char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap(); + string.push(hex); + } + shift = shift.wrapping_sub(4); + mask = mask >> 4; + } } } /// Full type of functions/methods in the search index. -#[derive(Debug)] +#[derive(Debug, Eq, PartialEq)] pub(crate) struct IndexItemFunctionType { inputs: Vec, output: Vec, where_clause: Vec>, } -impl Serialize for IndexItemFunctionType { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { +impl IndexItemFunctionType { + pub fn write_to_string<'a>( + &'a self, + string: &mut String, + backref_queue: &mut VecDeque<&'a IndexItemFunctionType>, + ) { + assert!(backref_queue.len() < 16); // If we couldn't figure out a type, just write `0`. let has_missing = self .inputs @@ -205,33 +228,58 @@ impl Serialize for IndexItemFunctionType { .chain(self.output.iter()) .any(|i| i.id.is_none() && i.generics.is_none()); if has_missing { - 0.serialize(serializer) + string.push('`'); + } else if let Some(idx) = backref_queue.iter().position(|other| *other == self) { + string.push( + char::try_from('0' as u32 + u32::try_from(idx).unwrap()) + .expect("last possible value is '?'"), + ); } else { - let mut seq = serializer.serialize_seq(None)?; + backref_queue.push_front(self); + if backref_queue.len() >= 16 { + backref_queue.pop_back(); + } + string.push('{'); match &self.inputs[..] { [one] if one.generics.is_none() && one.bindings.is_none() => { - seq.serialize_element(one)? + one.write_to_string(string); + } + _ => { + string.push('{'); + for item in &self.inputs[..] { + item.write_to_string(string); + } + string.push('}'); } - _ => seq.serialize_element(&self.inputs)?, } match &self.output[..] { [] if self.where_clause.is_empty() => {} [one] if one.generics.is_none() && one.bindings.is_none() => { - seq.serialize_element(one)? + one.write_to_string(string); + } + _ => { + string.push('{'); + for item in &self.output[..] { + item.write_to_string(string); + } + string.push('}'); } - _ => seq.serialize_element(&self.output)?, } for constraint in &self.where_clause { if let [one] = &constraint[..] && one.generics.is_none() && one.bindings.is_none() { - seq.serialize_element(one)?; + one.write_to_string(string); } else { - seq.serialize_element(constraint)?; + string.push('{'); + for item in &constraint[..] { + item.write_to_string(string); + } + string.push('}'); } } - seq.end() + string.push('}'); } } } diff --git a/src/librustdoc/html/render/search_index.rs b/src/librustdoc/html/render/search_index.rs index a1029320d2d..e49df400c83 100644 --- a/src/librustdoc/html/render/search_index.rs +++ b/src/librustdoc/html/render/search_index.rs @@ -1,5 +1,5 @@ use std::collections::hash_map::Entry; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, VecDeque}; use rustc_data_structures::fx::{FxHashMap, FxIndexMap}; use rustc_middle::ty::TyCtxt; @@ -409,9 +409,11 @@ pub(crate) fn build_index<'tcx>( let mut full_paths = Vec::with_capacity(self.items.len()); let mut descriptions = Vec::with_capacity(self.items.len()); let mut parents = Vec::with_capacity(self.items.len()); - let mut functions = Vec::with_capacity(self.items.len()); + let mut functions = String::with_capacity(self.items.len()); let mut deprecated = Vec::with_capacity(self.items.len()); + let mut backref_queue = VecDeque::new(); + for (index, item) in self.items.iter().enumerate() { let n = item.ty as u8; let c = char::try_from(n + b'A').expect("item types must fit in ASCII"); @@ -434,27 +436,10 @@ pub(crate) fn build_index<'tcx>( full_paths.push((index, &item.path)); } - // Fake option to get `0` out as a sentinel instead of `null`. - // We want to use `0` because it's three less bytes. - enum FunctionOption<'a> { - Function(&'a IndexItemFunctionType), - None, - } - impl<'a> Serialize for FunctionOption<'a> { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - match self { - FunctionOption::None => 0.serialize(serializer), - FunctionOption::Function(ty) => ty.serialize(serializer), - } - } + match &item.search_type { + Some(ty) => ty.write_to_string(&mut functions, &mut backref_queue), + None => functions.push('`'), } - functions.push(match &item.search_type { - Some(ty) => FunctionOption::Function(ty), - None => FunctionOption::None, - }); if item.deprecation.is_some() { deprecated.push(index); -- cgit 1.4.1-3-g733a5