From 86da4be47f3a0d52b894535a227ee3f2515b9af3 Mon Sep 17 00:00:00 2001 From: Michael Howell Date: Wed, 13 Nov 2024 10:46:27 -0700 Subject: rustdoc: use a trie for name-based search Preview and profiler results ---------------------------- Here's some quick profiling in Firefox done on the rust compiler docs: - Before: https://share.firefox.dev/3UPm3M8 - After: https://share.firefox.dev/40LXvYb Here's the results for the node.js profiler: - https://notriddle.com/rustdoc-html-demo-15/trie-perf/index.html Here's a copy that you can use to try it out. Compare it with [the nightly]. Try typing `typecheckercontext` one character at a time, slowly. - https://notriddle.com/rustdoc-html-demo-15/compiler-doc-trie/index.html [the nightly]: https://doc.rust-lang.org/nightly/nightly-rustc/ The fuzzy match algo is based on [Fast String Correction with Levenshtein-Automata] and the corresponding implementation code in [moman] and [Lucene]; the bit-packing representation comes from Lucene, but the actual matcher is more based on `fsc.py`. As suggested in the paper, a trie is used to represent the FSA dictionary. The same trie is used for prefix matching. Substring matching is done with a side table of three-character[^1] windows that point into the trie. [Fast String Correction with Levenshtein-Automata]: https://github.com/tpn/pdfs/blob/master/Fast%20String%20Correction%20with%20Levenshtein-Automata%20(2002)%20(10.1.1.16.652).pdf [Lucene]: https://fossies.org/linux/lucene/lucene/core/src/java/org/apache/lucene/util/automaton/Lev1TParametricDescription.java [moman]: https://gitlab.com/notriddle/moman-rustdoc User-visible changes -------------------- I don't expect anybody to notice anything, but it does cause two changes: - Substring matches, in the middle of a name, only apply if there's three or more characters in the search query. - Levenshtein distance limit now maxes out at two. In the old version, the limit was w/3, so you could get looser matches for queries with 9 or more characters[^1] in them. [^1]: technically utf-16 code units --- tests/rustdoc-js/non-english-identifier.js | 26 +++++++++++++++++++------- tests/rustdoc-js/path-maxeditdistance.js | 25 +++++++++++++++++++++---- tests/rustdoc-js/prototype.js | 4 +++- 3 files changed, 43 insertions(+), 12 deletions(-) (limited to 'tests/rustdoc-js') diff --git a/tests/rustdoc-js/non-english-identifier.js b/tests/rustdoc-js/non-english-identifier.js index 6a2c4cc637c..f2180b4c755 100644 --- a/tests/rustdoc-js/non-english-identifier.js +++ b/tests/rustdoc-js/non-english-identifier.js @@ -133,22 +133,34 @@ const EXPECTED = [ path: "non_english_identifier", href: "../non_english_identifier/trait.加法.html", desc: "Add" - }, + }], + in_args: [{ + name: "加上", + path: "non_english_identifier::加法", + href: "../non_english_identifier/trait.加法.html#tymethod.加上", + }], + returned: [], + }, + { // levensthein and substring checking only kick in at three characters + query: '加法宏', + others: [ { name: "中文名称的加法宏", path: "non_english_identifier", href: "../non_english_identifier/macro.中文名称的加法宏.html", - }, + }], + in_args: [], + returned: [], + }, + { // levensthein and substring checking only kick in at three characters + query: '加法A', + others: [ { name: "中文名称的加法API", path: "non_english_identifier", href: "../non_english_identifier/fn.中文名称的加法API.html", }], - in_args: [{ - name: "加上", - path: "non_english_identifier::加法", - href: "../non_english_identifier/trait.加法.html#tymethod.加上", - }], + in_args: [], returned: [], }, { // Extensive type-based search is still buggy, experimental & work-in-progress. diff --git a/tests/rustdoc-js/path-maxeditdistance.js b/tests/rustdoc-js/path-maxeditdistance.js index 73b24a6dddf..cf700193ac4 100644 --- a/tests/rustdoc-js/path-maxeditdistance.js +++ b/tests/rustdoc-js/path-maxeditdistance.js @@ -14,21 +14,38 @@ const EXPECTED = [ ], }, { - // swap br/rb; that's edit distance 2, where maxPathEditDistance = 3 (11 / 3) + // swap br/rb; that's edit distance 1, where maxPathEditDistance = 2 'query': 'arbacadarba::hocuspocusprestidigitation', 'others': [ { 'path': 'abracadabra', 'name': 'HocusPocusPrestidigitation' }, ], }, { - // truncate 5 chars, where maxEditDistance = 7 (21 / 3) - 'query': 'abracadarba::hocusprestidigitation', + // swap p/o o/p, that's also edit distance 1 + 'query': 'abracadabra::hocusopcusprestidigitation', 'others': [ { 'path': 'abracadabra', 'name': 'HocusPocusPrestidigitation' }, ], }, { - // truncate 9 chars, where maxEditDistance = 5 (17 / 3) + // swap p/o o/p and gi/ig, that's edit distance 2 + 'query': 'abracadabra::hocusopcusprestidiigtation', + 'others': [ + { 'path': 'abracadabra', 'name': 'HocusPocusPrestidigitation' }, + ], + }, + { + // swap p/o o/p, gi/ig, and ti/it, that's edit distance 3 and not shown (we stop at 2) + 'query': 'abracadabra::hocusopcusprestidiigtaiton', + 'others': [], + }, + { + // truncate 5 chars, where maxEditDistance = 2 + 'query': 'abracadarba::hocusprestidigitation', + 'others': [], + }, + { + // truncate 9 chars, where maxEditDistance = 2 'query': 'abracadarba::hprestidigitation', 'others': [], }, diff --git a/tests/rustdoc-js/prototype.js b/tests/rustdoc-js/prototype.js index da72fdce3db..0862acd8ffa 100644 --- a/tests/rustdoc-js/prototype.js +++ b/tests/rustdoc-js/prototype.js @@ -9,7 +9,9 @@ const EXPECTED = [ }, { 'query': '__proto__', - 'others': [], + 'others': [ + {"path": "", "name": "prototype"}, + ], 'returned': [], 'in_args': [], }, -- cgit 1.4.1-3-g733a5