diff options
| author | Matthias Krüger <476013+matthiaskrgr@users.noreply.github.com> | 2025-06-26 15:47:17 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-06-26 15:47:17 +0200 |
| commit | 158340f5617f7e6a555046802133536ab61764ba (patch) | |
| tree | dff498ed9af7d3ed9884615633c91cd423d8a5c2 /src | |
| parent | d5d5eb471e7ff3a2876569f229703274f70cbb49 (diff) | |
| parent | 1dfc8406dcb742453b36daf0ce7486183b1da79c (diff) | |
| download | rust-158340f5617f7e6a555046802133536ab61764ba.tar.gz rust-158340f5617f7e6a555046802133536ab61764ba.zip | |
Rollup merge of #141311 - folkertdev:tidy-natural-sort, r=jieyouxu
make `tidy-alphabetical` use a natural sort
The idea here is that these lines should be correctly sorted, even though a naive string comparison would say they are not:
```
foo2
foo10
```
This is the ["natural sort order"](https://en.wikipedia.org/wiki/Natural_sort_order).
There is more discussion in [#t-compiler/help > tidy natural sort](https://rust-lang.zulipchat.com/#narrow/channel/182449-t-compiler.2Fhelp/topic/tidy.20natural.20sort/with/519111079)
Unfortunately, no standard sorting tools are smart enough to to this automatically (casting some doubt on whether we should make this change). Here are some sort outputs:
```
> cat foo.txt | sort
foo
foo1
foo10
foo2
mp
mp1e2
np",
np1e2",
> cat foo.txt | sort -n
foo
foo1
foo10
foo2
mp
mp1e2
np",
np1e2",
> cat foo.txt | sort -V
foo
foo1
foo2
foo10
mp
mp1e2
np1e2",
np",
```
Disappointingly, "numeric" sort does not actually have the behavior we want. It only sorts by numeric value if the line starts with a number. The "version" sort looks promising, but does something very unintuitive if you look at the final 4 values. None of the other options seem to have the desired behavior in all cases:
```
-b, --ignore-leading-blanks ignore leading blanks
-d, --dictionary-order consider only blanks and alphanumeric characters
-f, --ignore-case fold lower case to upper case characters
-g, --general-numeric-sort compare according to general numerical value
-i, --ignore-nonprinting consider only printable characters
-M, --month-sort compare (unknown) < 'JAN' < ... < 'DEC'
-h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)
-n, --numeric-sort compare according to string numerical value
-R, --random-sort shuffle, but group identical keys. See shuf(1)
--random-source=FILE get random bytes from FILE
-r, --reverse reverse the result of comparisons
--sort=WORD sort according to WORD:
general-numeric -g, human-numeric -h, month -M,
numeric -n, random -R, version -V
-V, --version-sort natural sort of (version) numbers within text
```
r? ```@Noratrieb``` (it sounded like you know this code?)
Diffstat (limited to 'src')
| -rw-r--r-- | src/bootstrap/src/core/build_steps/dist.rs | 10 | ||||
| -rw-r--r-- | src/tools/tidy/src/alphabetical.rs | 59 | ||||
| -rw-r--r-- | src/tools/tidy/src/alphabetical/tests.rs | 147 |
3 files changed, 209 insertions, 7 deletions
diff --git a/src/bootstrap/src/core/build_steps/dist.rs b/src/bootstrap/src/core/build_steps/dist.rs index 95fc2f1aef9..25b7e5a1b5d 100644 --- a/src/bootstrap/src/core/build_steps/dist.rs +++ b/src/bootstrap/src/core/build_steps/dist.rs @@ -1056,18 +1056,18 @@ impl Step for PlainSourceTarball { let src_files = [ // tidy-alphabetical-start ".gitmodules", - "bootstrap.example.toml", - "Cargo.lock", - "Cargo.toml", - "configure", "CONTRIBUTING.md", "COPYRIGHT", + "Cargo.lock", + "Cargo.toml", "LICENSE-APACHE", - "license-metadata.json", "LICENSE-MIT", "README.md", "RELEASES.md", "REUSE.toml", + "bootstrap.example.toml", + "configure", + "license-metadata.json", "x", "x.ps1", "x.py", diff --git a/src/tools/tidy/src/alphabetical.rs b/src/tools/tidy/src/alphabetical.rs index a29286fa2c5..141083290c6 100644 --- a/src/tools/tidy/src/alphabetical.rs +++ b/src/tools/tidy/src/alphabetical.rs @@ -19,7 +19,9 @@ //! If a line ends with an opening delimiter, we effectively join the following line to it before //! checking it. E.g. `foo(\nbar)` is treated like `foo(bar)`. +use std::cmp::Ordering; use std::fmt::Display; +use std::iter::Peekable; use std::path::Path; use crate::walk::{filter_dirs, walk}; @@ -99,9 +101,9 @@ fn check_section<'a>( continue; } - let prev_line_trimmed_lowercase = prev_line.trim_start_matches(' ').to_lowercase(); + let prev_line_trimmed_lowercase = prev_line.trim_start_matches(' '); - if trimmed_line.to_lowercase() < prev_line_trimmed_lowercase { + if version_sort(&trimmed_line, &prev_line_trimmed_lowercase).is_lt() { tidy_error_ext!(err, bad, "{file}:{}: line not in alphabetical order", idx + 1); } @@ -143,3 +145,56 @@ pub fn check(path: &Path, bad: &mut bool) { check_lines(file, lines, &mut crate::tidy_error, bad) }); } + +fn consume_numeric_prefix<I: Iterator<Item = char>>(it: &mut Peekable<I>) -> String { + let mut result = String::new(); + + while let Some(&c) = it.peek() { + if !c.is_numeric() { + break; + } + + result.push(c); + it.next(); + } + + result +} + +// A sorting function that is case-sensitive, and sorts sequences of digits by their numeric value, +// so that `9` sorts before `12`. +fn version_sort(a: &str, b: &str) -> Ordering { + let mut it1 = a.chars().peekable(); + let mut it2 = b.chars().peekable(); + + while let (Some(x), Some(y)) = (it1.peek(), it2.peek()) { + match (x.is_numeric(), y.is_numeric()) { + (true, true) => { + let num1: String = consume_numeric_prefix(it1.by_ref()); + let num2: String = consume_numeric_prefix(it2.by_ref()); + + let int1: u64 = num1.parse().unwrap(); + let int2: u64 = num2.parse().unwrap(); + + // Compare strings when the numeric value is equal to handle "00" versus "0". + match int1.cmp(&int2).then_with(|| num1.cmp(&num2)) { + Ordering::Equal => continue, + different => return different, + } + } + (false, false) => match x.cmp(y) { + Ordering::Equal => { + it1.next(); + it2.next(); + continue; + } + different => return different, + }, + (false, true) | (true, false) => { + return x.cmp(y); + } + } + } + + it1.next().cmp(&it2.next()) +} diff --git a/src/tools/tidy/src/alphabetical/tests.rs b/src/tools/tidy/src/alphabetical/tests.rs index 29e89a693bf..4d05bc33ced 100644 --- a/src/tools/tidy/src/alphabetical/tests.rs +++ b/src/tools/tidy/src/alphabetical/tests.rs @@ -3,6 +3,7 @@ use std::str::from_utf8; use super::*; +#[track_caller] fn test(lines: &str, name: &str, expected_msg: &str, expected_bad: bool) { let mut actual_msg = Vec::new(); let mut actual_bad = false; @@ -15,10 +16,12 @@ fn test(lines: &str, name: &str, expected_msg: &str, expected_bad: bool) { assert_eq!(expected_bad, actual_bad); } +#[track_caller] fn good(lines: &str) { test(lines, "good", "", false); } +#[track_caller] fn bad(lines: &str, expected_msg: &str) { test(lines, "bad", expected_msg, true); } @@ -187,3 +190,147 @@ fn test_double_end() { "; bad(lines, "bad:5 found `tidy-alphabetical-end` expecting `tidy-alphabetical-start`"); } + +#[test] +fn test_numeric_good() { + good( + "\ + # tidy-alphabetical-start + rustc_ast = { path = \"../rustc_ast\" } + rustc_ast_lowering = { path = \"../rustc_ast_lowering\" } + # tidy-alphabetical-end + ", + ); + + good( + "\ + # tidy-alphabetical-start + fp-armv8 + fp16 + # tidy-alphabetical-end + ", + ); + + good( + "\ + # tidy-alphabetical-start + item1 + item2 + item10 + # tidy-alphabetical-end + ", + ); + + good( + "\ + # tidy-alphabetical-start + foo + foo_ + # tidy-alphabetical-end + ", + ); + + good( + "\ + # tidy-alphabetical-start + foo-bar + foo_bar + # tidy-alphabetical-end + ", + ); + + good( + "\ + # tidy-alphabetical-start + sme-lutv2 + sme2 + # tidy-alphabetical-end + ", + ); + + good( + "\ + # tidy-alphabetical-start + v5te + v6 + v6k + v6t2 + # tidy-alphabetical-end + ", + ); + + good( + "\ + # tidy-alphabetical-start + zve64d + zve64f + # tidy-alphabetical-end + ", + ); + + // Case is significant. + good( + "\ + # tidy-alphabetical-start + _ZYXW + _abcd + # tidy-alphabetical-end + ", + ); + + good( + "\ + # tidy-alphabetical-start + v0 + v00 + v000 + # tidy-alphabetical-end + ", + ); + + good( + "\ + # tidy-alphabetical-start + w005s09t + w5s009t + # tidy-alphabetical-end + ", + ); + + good( + "\ + # tidy-alphabetical-start + v0s + v00t + # tidy-alphabetical-end + ", + ); +} + +#[test] +fn test_numeric_bad() { + let lines = "\ + # tidy-alphabetical-start + item1 + item10 + item2 + # tidy-alphabetical-end + "; + bad(lines, "bad:4: line not in alphabetical order"); + + let lines = "\ + # tidy-alphabetical-start + zve64f + zve64d + # tidy-alphabetical-end + "; + bad(lines, "bad:3: line not in alphabetical order"); + + let lines = "\ + # tidy-alphabetical-start + 000 + 00 + # tidy-alphabetical-end + "; + bad(lines, "bad:3: line not in alphabetical order"); +} |
