diff options
| author | Michael Goulet <michael@errs.io> | 2024-01-05 23:41:42 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-01-05 23:41:42 -0500 |
| commit | b3f307434e2bcf8d26f7f0cd413f98caa70c2e26 (patch) | |
| tree | 3cb887a7b9c71fb23f7bfe11190c58a5ab553b8a /src/librustdoc/html/render | |
| parent | 9585ebc269d5e59d2cb36d53ec685e9650459dcd (diff) | |
| parent | 004bfc5eb2aa6789741055d3e6c479e34eb960cb (diff) | |
| download | rust-b3f307434e2bcf8d26f7f0cd413f98caa70c2e26.tar.gz rust-b3f307434e2bcf8d26f7f0cd413f98caa70c2e26.zip | |
Rollup merge of #119468 - notriddle:notriddle/compression, r=GuillaumeGomez
rustdoc-search: tighter encoding for f index
Depends on https://github.com/rust-lang/rust/pull/119457
Two optimizations for the function signature search:
* Instead of using JSON arrays, like `[1,20]`, it uses VLQ
hex with no commas, like `[aAd]`.
* This also adds backrefs: if you have more than one function
with exactly the same signature, it'll not only store it once,
it'll *decode* it once, and store in the typeIdMap only once.
Based partially on discussions on zulip:
https://rust-lang.zulipchat.com/#narrow/stream/266220-t-rustdoc/topic/search.20index.20size
Performance
-----------
https://notriddle.com/rustdoc-html-demo-8/compression-perf-v2/index.html
### memory/time profiler output (for more details, consult the above link)
<table>
<thead><tr><th>benchmark<th>before<th>after</tr></thead>
<tbody>
<tr><th>arti<td>
```
user: 002.789 s
sys: 000.390 s
wall: 002.096 s
child_RSS_high: 440796 KiB
group_mem_high: 414924 KiB
```
</td><td>
```
user: 002.295 s
sys: 000.278 s
wall: 001.738 s
child_RSS_high: 314588 KiB
group_mem_high: 285220 KiB
```
</td></tr><tr><th>cortex-m<td>
```
user: 000.127 s
sys: 000.030 s
wall: 000.134 s
child_RSS_high: 60264 KiB
group_mem_high: 23824 KiB
```
</td><td>
```
user: 000.136 s
sys: 000.038 s
wall: 000.137 s
child_RSS_high: 59204 KiB
group_mem_high: 22712 KiB
```
</td></tr><tr><th>sqlx<td>
```
user: 000.887 s
sys: 000.118 s
wall: 000.592 s
child_RSS_high: 190408 KiB
group_mem_high: 157804 KiB
```
</td><td>
```
user: 000.798 s
sys: 000.101 s
wall: 000.525 s
child_RSS_high: 159292 KiB
group_mem_high: 126292 KiB
```
</td></tr><tr><th>stm32f4<td>
```
user: 013.884 s
sys: 005.399 s
wall: 013.149 s
child_RSS_high: 1942244 KiB
group_mem_high: 1954916 KiB
```
</td><td>
```
user: 006.128 s
sys: 003.297 s
wall: 007.994 s
child_RSS_high: 1038108 KiB
group_mem_high: 1023900 KiB
```
</td></tr><tr><th>ripgrep<td>
```
user: 000.441 s
sys: 000.063 s
wall: 000.264 s
child_RSS_high: 109180 KiB
group_mem_high: 74272 KiB
```
</td><td>
```
user: 000.408 s
sys: 000.044 s
wall: 000.238 s
child_RSS_high: 101488 KiB
group_mem_high: 66000 KiB
```
</td></tr></tbody></table>
Size change
-----------
standard library without gzip:
```console
$ du -bs search-index-old.js search-index-new.js
4976370 search-index-old.js
4404391 search-index-new.js
```
((4976370-4404391)/4404391)*100% = 12.9%
with gzip:
```console
$ du -hs search-index-old.js.gz search-index-new.js.gz
520K search-index-old.js.gz
504K search-index-new.js.gz
$ du -bs search-index-old.js.gz search-index-new.js.gz
522092 search-index-old.js.gz
507654 search-index-new.js.gz
```
((522092-507654)/507654)*100% = 2.8%
Benchmarks are similarly shrunk.
Without gzip:
```console
$ du -hs tmp/{arti,cortex-m,sqlx,stm32f4,ripgrep}/toolchain_{old,new}/doc/search-index.js
10555067 tmp/arti/toolchain_old/doc/search-index.js
8921236 tmp/arti/toolchain_new/doc/search-index.js
77018 tmp/cortex-m/toolchain_old/doc/search-index.js
66676 tmp/cortex-m/toolchain_new/doc/search-index.js
2876330 tmp/sqlx/toolchain_old/doc/search-index.js
2436812 tmp/sqlx/toolchain_new/doc/search-index.js
63632890 tmp/stm32f4/toolchain_old/doc/search-index.js
52337438 tmp/stm32f4/toolchain_new/doc/search-index.js
631150 tmp/ripgrep/toolchain_old/doc/search-index.js
541646 tmp/ripgrep/toolchain_new/doc/search-index.js
```
With gzip:
```console
$ du -bs tmp/{arti,cortex-m,sqlx,stm32f4,ripgrep}/toolchain_{old,new}/doc/search-index.js.gz
1618852 tmp/arti/toolchain_old/doc/search-index.js.gz
1582007 tmp/arti/toolchain_new/doc/search-index.js.gz
16109 tmp/cortex-m/toolchain_old/doc/search-index.js.gz
15831 tmp/cortex-m/toolchain_new/doc/search-index.js.gz
422257 tmp/sqlx/toolchain_old/doc/search-index.js.gz
411507 tmp/sqlx/toolchain_new/doc/search-index.js.gz
4454761 tmp/stm32f4/toolchain_old/doc/search-index.js.gz
4334924 tmp/stm32f4/toolchain_new/doc/search-index.js.gz
98312 tmp/ripgrep/toolchain_old/doc/search-index.js.gz
96864 tmp/ripgrep/toolchain_new/doc/search-index.js.gz
$ du -hs tmp/{arti,cortex-m,sqlx,stm32f4,ripgrep}/toolchain_{old,new}/doc/search-index.j
s.gz
1.6M tmp/arti/toolchain_old/doc/search-index.js.gz
1.6M tmp/arti/toolchain_new/doc/search-index.js.gz
24K tmp/cortex-m/toolchain_old/doc/search-index.js.gz
24K tmp/cortex-m/toolchain_new/doc/search-index.js.gz
424K tmp/sqlx/toolchain_old/doc/search-index.js.gz
412K tmp/sqlx/toolchain_new/doc/search-index.js.gz
4.3M tmp/stm32f4/toolchain_old/doc/search-index.js.gz
4.2M tmp/stm32f4/toolchain_new/doc/search-index.js.gz
108K tmp/ripgrep/toolchain_old/doc/search-index.js.gz
104K tmp/ripgrep/toolchain_new/doc/search-index.js.gz
```
Diffstat (limited to 'src/librustdoc/html/render')
| -rw-r--r-- | src/librustdoc/html/render/mod.rs | 164 | ||||
| -rw-r--r-- | src/librustdoc/html/render/search_index.rs | 29 |
2 files changed, 122 insertions, 71 deletions
diff --git a/src/librustdoc/html/render/mod.rs b/src/librustdoc/html/render/mod.rs index 118c6eeb289..bea5ccd7c86 100644 --- a/src/librustdoc/html/render/mod.rs +++ b/src/librustdoc/html/render/mod.rs @@ -58,7 +58,7 @@ use rustc_span::{ symbol::{sym, Symbol}, BytePos, FileName, RealFileName, }; -use serde::ser::{SerializeMap, SerializeSeq}; +use serde::ser::SerializeMap; use serde::{Serialize, Serializer}; use crate::clean::{self, ItemId, RenderedLink, SelfTy}; @@ -123,44 +123,58 @@ pub(crate) struct IndexItem { } /// A type used for the search index. -#[derive(Debug)] +#[derive(Debug, Eq, PartialEq)] pub(crate) struct RenderType { id: Option<RenderTypeId>, generics: Option<Vec<RenderType>>, bindings: Option<Vec<(RenderTypeId, Vec<RenderType>)>>, } -impl Serialize for RenderType { - fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> - where - S: Serializer, - { - let id = match &self.id { +impl RenderType { + // Types are rendered as lists of lists, because that's pretty compact. + // The contents of the lists are always integers in self-terminating hex + // form, handled by `RenderTypeId::write_to_string`, so no commas are + // needed to separate the items. + pub fn write_to_string(&self, string: &mut String) { + fn write_optional_id(id: Option<RenderTypeId>, string: &mut String) { // 0 is a sentinel, everything else is one-indexed - None => 0, - // concrete type - Some(RenderTypeId::Index(idx)) if *idx >= 0 => idx + 1, - // generic type parameter - Some(RenderTypeId::Index(idx)) => *idx, - _ => panic!("must convert render types to indexes before serializing"), - }; + match id { + Some(id) => id.write_to_string(string), + None => string.push('`'), + } + } + // Either just the type id, or `{type, generics, bindings?}` + // where generics is a list of types, + // and bindings is a list of `{id, typelist}` pairs. if self.generics.is_some() || self.bindings.is_some() { - let mut seq = serializer.serialize_seq(None)?; - seq.serialize_element(&id)?; - seq.serialize_element(self.generics.as_ref().map(Vec::as_slice).unwrap_or_default())?; + string.push('{'); + write_optional_id(self.id, string); + string.push('{'); + for generic in &self.generics.as_ref().map(Vec::as_slice).unwrap_or_default()[..] { + generic.write_to_string(string); + } + string.push('}'); if self.bindings.is_some() { - seq.serialize_element( - self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default(), - )?; + string.push('{'); + for binding in &self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default()[..] { + string.push('{'); + binding.0.write_to_string(string); + string.push('{'); + for constraint in &binding.1[..] { + constraint.write_to_string(string); + } + string.push_str("}}"); + } + string.push('}'); } - seq.end() + string.push('}'); } else { - id.serialize(serializer) + write_optional_id(self.id, string); } } } -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] pub(crate) enum RenderTypeId { DefId(DefId), Primitive(clean::PrimitiveType), @@ -168,70 +182,122 @@ pub(crate) enum RenderTypeId { Index(isize), } -impl Serialize for RenderTypeId { - fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> - where - S: Serializer, - { - let id = match &self { +impl RenderTypeId { + pub fn write_to_string(&self, string: &mut String) { + // (sign, value) + let (sign, id): (bool, u32) = match &self { // 0 is a sentinel, everything else is one-indexed // concrete type - RenderTypeId::Index(idx) if *idx >= 0 => idx + 1, + RenderTypeId::Index(idx) if *idx >= 0 => (false, (idx + 1isize).try_into().unwrap()), // generic type parameter - RenderTypeId::Index(idx) => *idx, + RenderTypeId::Index(idx) => (true, (-*idx).try_into().unwrap()), _ => panic!("must convert render types to indexes before serializing"), }; - id.serialize(serializer) + // zig-zag encoding + let value: u32 = (id << 1) | (if sign { 1 } else { 0 }); + // Self-terminating hex use capital letters for everything but the + // least significant digit, which is lowercase. For example, decimal 17 + // would be `` Aa `` if zig-zag encoding weren't used. + // + // Zig-zag encoding, however, stores the sign bit as the last bit. + // This means, in the last hexit, 1 is actually `c`, -1 is `b` + // (`a` is the imaginary -0), and, because all the bits are shifted + // by one, `` A` `` is actually 8 and `` Aa `` is -8. + // + // https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html + // describes the encoding in more detail. + let mut shift: u32 = 28; + let mut mask: u32 = 0xF0_00_00_00; + while shift < 32 { + let hexit = (value & mask) >> shift; + if hexit != 0 || shift == 0 { + let hex = + char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap(); + string.push(hex); + } + shift = shift.wrapping_sub(4); + mask = mask >> 4; + } } } /// Full type of functions/methods in the search index. -#[derive(Debug)] +#[derive(Debug, Eq, PartialEq)] pub(crate) struct IndexItemFunctionType { inputs: Vec<RenderType>, output: Vec<RenderType>, where_clause: Vec<Vec<RenderType>>, } -impl Serialize for IndexItemFunctionType { - fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> - where - S: Serializer, - { - // If we couldn't figure out a type, just write `0`. +impl IndexItemFunctionType { + pub fn write_to_string<'a>( + &'a self, + string: &mut String, + backref_queue: &mut VecDeque<&'a IndexItemFunctionType>, + ) { + assert!(backref_queue.len() <= 16); + // If we couldn't figure out a type, just write 0, + // which is encoded as `` ` `` (see RenderTypeId::write_to_string). let has_missing = self .inputs .iter() .chain(self.output.iter()) .any(|i| i.id.is_none() && i.generics.is_none()); if has_missing { - 0.serialize(serializer) + string.push('`'); + } else if let Some(idx) = backref_queue.iter().position(|other| *other == self) { + // The backref queue has 16 items, so backrefs use + // a single hexit, disjoint from the ones used for numbers. + string.push( + char::try_from('0' as u32 + u32::try_from(idx).unwrap()) + .expect("last possible value is '?'"), + ); } else { - let mut seq = serializer.serialize_seq(None)?; + backref_queue.push_front(self); + if backref_queue.len() > 16 { + backref_queue.pop_back(); + } + string.push('{'); match &self.inputs[..] { [one] if one.generics.is_none() && one.bindings.is_none() => { - seq.serialize_element(one)? + one.write_to_string(string); + } + _ => { + string.push('{'); + for item in &self.inputs[..] { + item.write_to_string(string); + } + string.push('}'); } - _ => seq.serialize_element(&self.inputs)?, } match &self.output[..] { [] if self.where_clause.is_empty() => {} [one] if one.generics.is_none() && one.bindings.is_none() => { - seq.serialize_element(one)? + one.write_to_string(string); + } + _ => { + string.push('{'); + for item in &self.output[..] { + item.write_to_string(string); + } + string.push('}'); } - _ => seq.serialize_element(&self.output)?, } for constraint in &self.where_clause { if let [one] = &constraint[..] && one.generics.is_none() && one.bindings.is_none() { - seq.serialize_element(one)?; + one.write_to_string(string); } else { - seq.serialize_element(constraint)?; + string.push('{'); + for item in &constraint[..] { + item.write_to_string(string); + } + string.push('}'); } } - seq.end() + string.push('}'); } } } diff --git a/src/librustdoc/html/render/search_index.rs b/src/librustdoc/html/render/search_index.rs index a1029320d2d..e49df400c83 100644 --- a/src/librustdoc/html/render/search_index.rs +++ b/src/librustdoc/html/render/search_index.rs @@ -1,5 +1,5 @@ use std::collections::hash_map::Entry; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, VecDeque}; use rustc_data_structures::fx::{FxHashMap, FxIndexMap}; use rustc_middle::ty::TyCtxt; @@ -409,9 +409,11 @@ pub(crate) fn build_index<'tcx>( let mut full_paths = Vec::with_capacity(self.items.len()); let mut descriptions = Vec::with_capacity(self.items.len()); let mut parents = Vec::with_capacity(self.items.len()); - let mut functions = Vec::with_capacity(self.items.len()); + let mut functions = String::with_capacity(self.items.len()); let mut deprecated = Vec::with_capacity(self.items.len()); + let mut backref_queue = VecDeque::new(); + for (index, item) in self.items.iter().enumerate() { let n = item.ty as u8; let c = char::try_from(n + b'A').expect("item types must fit in ASCII"); @@ -434,27 +436,10 @@ pub(crate) fn build_index<'tcx>( full_paths.push((index, &item.path)); } - // Fake option to get `0` out as a sentinel instead of `null`. - // We want to use `0` because it's three less bytes. - enum FunctionOption<'a> { - Function(&'a IndexItemFunctionType), - None, - } - impl<'a> Serialize for FunctionOption<'a> { - fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> - where - S: Serializer, - { - match self { - FunctionOption::None => 0.serialize(serializer), - FunctionOption::Function(ty) => ty.serialize(serializer), - } - } + match &item.search_type { + Some(ty) => ty.write_to_string(&mut functions, &mut backref_queue), + None => functions.push('`'), } - functions.push(match &item.search_type { - Some(ty) => FunctionOption::Function(ty), - None => FunctionOption::None, - }); if item.deprecation.is_some() { deprecated.push(index); |
