about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
authorMatthias Krüger <476013+matthiaskrgr@users.noreply.github.com>2025-06-26 15:47:17 +0200
committerGitHub <noreply@github.com>2025-06-26 15:47:17 +0200
commit158340f5617f7e6a555046802133536ab61764ba (patch)
treedff498ed9af7d3ed9884615633c91cd423d8a5c2 /src
parentd5d5eb471e7ff3a2876569f229703274f70cbb49 (diff)
parent1dfc8406dcb742453b36daf0ce7486183b1da79c (diff)
downloadrust-158340f5617f7e6a555046802133536ab61764ba.tar.gz
rust-158340f5617f7e6a555046802133536ab61764ba.zip
Rollup merge of #141311 - folkertdev:tidy-natural-sort, r=jieyouxu
make `tidy-alphabetical` use a natural sort

The idea here is that these lines should be correctly sorted, even though a naive string comparison would say they are not:

```
foo2
foo10
```

This is the ["natural sort order"](https://en.wikipedia.org/wiki/Natural_sort_order).

There is more discussion in [#t-compiler/help > tidy natural sort](https://rust-lang.zulipchat.com/#narrow/channel/182449-t-compiler.2Fhelp/topic/tidy.20natural.20sort/with/519111079)

Unfortunately, no standard sorting tools are smart enough to to this automatically (casting some doubt on whether we should make this change). Here are some sort outputs:

```
> cat foo.txt | sort
foo
foo1
foo10
foo2
mp
mp1e2
np",
np1e2",
> cat foo.txt | sort -n
foo
foo1
foo10
foo2
mp
mp1e2
np",
np1e2",
> cat foo.txt | sort -V
foo
foo1
foo2
foo10
mp
mp1e2
np1e2",
np",
```

Disappointingly, "numeric" sort does not actually have the behavior we want. It only sorts by numeric value if the line starts with a number. The "version" sort looks promising, but does something very unintuitive if you look at the final 4 values. None of the other options seem to have the desired behavior in all cases:

```
  -b, --ignore-leading-blanks  ignore leading blanks
  -d, --dictionary-order      consider only blanks and alphanumeric characters
  -f, --ignore-case           fold lower case to upper case characters
  -g, --general-numeric-sort  compare according to general numerical value
  -i, --ignore-nonprinting    consider only printable characters
  -M, --month-sort            compare (unknown) < 'JAN' < ... < 'DEC'
  -h, --human-numeric-sort    compare human readable numbers (e.g., 2K 1G)
  -n, --numeric-sort          compare according to string numerical value
  -R, --random-sort           shuffle, but group identical keys.  See shuf(1)
      --random-source=FILE    get random bytes from FILE
  -r, --reverse               reverse the result of comparisons
      --sort=WORD             sort according to WORD:
                                general-numeric -g, human-numeric -h, month -M,
                                numeric -n, random -R, version -V
  -V, --version-sort          natural sort of (version) numbers within text
```

r? ```@Noratrieb``` (it sounded like you know this code?)
Diffstat (limited to 'src')
-rw-r--r--src/bootstrap/src/core/build_steps/dist.rs10
-rw-r--r--src/tools/tidy/src/alphabetical.rs59
-rw-r--r--src/tools/tidy/src/alphabetical/tests.rs147
3 files changed, 209 insertions, 7 deletions
diff --git a/src/bootstrap/src/core/build_steps/dist.rs b/src/bootstrap/src/core/build_steps/dist.rs
index 95fc2f1aef9..25b7e5a1b5d 100644
--- a/src/bootstrap/src/core/build_steps/dist.rs
+++ b/src/bootstrap/src/core/build_steps/dist.rs
@@ -1056,18 +1056,18 @@ impl Step for PlainSourceTarball {
         let src_files = [
             // tidy-alphabetical-start
             ".gitmodules",
-            "bootstrap.example.toml",
-            "Cargo.lock",
-            "Cargo.toml",
-            "configure",
             "CONTRIBUTING.md",
             "COPYRIGHT",
+            "Cargo.lock",
+            "Cargo.toml",
             "LICENSE-APACHE",
-            "license-metadata.json",
             "LICENSE-MIT",
             "README.md",
             "RELEASES.md",
             "REUSE.toml",
+            "bootstrap.example.toml",
+            "configure",
+            "license-metadata.json",
             "x",
             "x.ps1",
             "x.py",
diff --git a/src/tools/tidy/src/alphabetical.rs b/src/tools/tidy/src/alphabetical.rs
index a29286fa2c5..141083290c6 100644
--- a/src/tools/tidy/src/alphabetical.rs
+++ b/src/tools/tidy/src/alphabetical.rs
@@ -19,7 +19,9 @@
 //! If a line ends with an opening delimiter, we effectively join the following line to it before
 //! checking it. E.g. `foo(\nbar)` is treated like `foo(bar)`.
 
+use std::cmp::Ordering;
 use std::fmt::Display;
+use std::iter::Peekable;
 use std::path::Path;
 
 use crate::walk::{filter_dirs, walk};
@@ -99,9 +101,9 @@ fn check_section<'a>(
             continue;
         }
 
-        let prev_line_trimmed_lowercase = prev_line.trim_start_matches(' ').to_lowercase();
+        let prev_line_trimmed_lowercase = prev_line.trim_start_matches(' ');
 
-        if trimmed_line.to_lowercase() < prev_line_trimmed_lowercase {
+        if version_sort(&trimmed_line, &prev_line_trimmed_lowercase).is_lt() {
             tidy_error_ext!(err, bad, "{file}:{}: line not in alphabetical order", idx + 1);
         }
 
@@ -143,3 +145,56 @@ pub fn check(path: &Path, bad: &mut bool) {
         check_lines(file, lines, &mut crate::tidy_error, bad)
     });
 }
+
+fn consume_numeric_prefix<I: Iterator<Item = char>>(it: &mut Peekable<I>) -> String {
+    let mut result = String::new();
+
+    while let Some(&c) = it.peek() {
+        if !c.is_numeric() {
+            break;
+        }
+
+        result.push(c);
+        it.next();
+    }
+
+    result
+}
+
+// A sorting function that is case-sensitive, and sorts sequences of digits by their numeric value,
+// so that `9` sorts before `12`.
+fn version_sort(a: &str, b: &str) -> Ordering {
+    let mut it1 = a.chars().peekable();
+    let mut it2 = b.chars().peekable();
+
+    while let (Some(x), Some(y)) = (it1.peek(), it2.peek()) {
+        match (x.is_numeric(), y.is_numeric()) {
+            (true, true) => {
+                let num1: String = consume_numeric_prefix(it1.by_ref());
+                let num2: String = consume_numeric_prefix(it2.by_ref());
+
+                let int1: u64 = num1.parse().unwrap();
+                let int2: u64 = num2.parse().unwrap();
+
+                // Compare strings when the numeric value is equal to handle "00" versus "0".
+                match int1.cmp(&int2).then_with(|| num1.cmp(&num2)) {
+                    Ordering::Equal => continue,
+                    different => return different,
+                }
+            }
+            (false, false) => match x.cmp(y) {
+                Ordering::Equal => {
+                    it1.next();
+                    it2.next();
+                    continue;
+                }
+                different => return different,
+            },
+            (false, true) | (true, false) => {
+                return x.cmp(y);
+            }
+        }
+    }
+
+    it1.next().cmp(&it2.next())
+}
diff --git a/src/tools/tidy/src/alphabetical/tests.rs b/src/tools/tidy/src/alphabetical/tests.rs
index 29e89a693bf..4d05bc33ced 100644
--- a/src/tools/tidy/src/alphabetical/tests.rs
+++ b/src/tools/tidy/src/alphabetical/tests.rs
@@ -3,6 +3,7 @@ use std::str::from_utf8;
 
 use super::*;
 
+#[track_caller]
 fn test(lines: &str, name: &str, expected_msg: &str, expected_bad: bool) {
     let mut actual_msg = Vec::new();
     let mut actual_bad = false;
@@ -15,10 +16,12 @@ fn test(lines: &str, name: &str, expected_msg: &str, expected_bad: bool) {
     assert_eq!(expected_bad, actual_bad);
 }
 
+#[track_caller]
 fn good(lines: &str) {
     test(lines, "good", "", false);
 }
 
+#[track_caller]
 fn bad(lines: &str, expected_msg: &str) {
     test(lines, "bad", expected_msg, true);
 }
@@ -187,3 +190,147 @@ fn test_double_end() {
     ";
     bad(lines, "bad:5 found `tidy-alphabetical-end` expecting `tidy-alphabetical-start`");
 }
+
+#[test]
+fn test_numeric_good() {
+    good(
+        "\
+        # tidy-alphabetical-start
+        rustc_ast = { path = \"../rustc_ast\" }
+        rustc_ast_lowering = { path = \"../rustc_ast_lowering\" }
+        # tidy-alphabetical-end
+    ",
+    );
+
+    good(
+        "\
+        # tidy-alphabetical-start
+        fp-armv8
+        fp16
+        # tidy-alphabetical-end
+    ",
+    );
+
+    good(
+        "\
+        # tidy-alphabetical-start
+        item1
+        item2
+        item10
+        # tidy-alphabetical-end
+    ",
+    );
+
+    good(
+        "\
+        # tidy-alphabetical-start
+        foo
+        foo_
+        # tidy-alphabetical-end
+    ",
+    );
+
+    good(
+        "\
+        # tidy-alphabetical-start
+        foo-bar
+        foo_bar
+        # tidy-alphabetical-end
+    ",
+    );
+
+    good(
+        "\
+        # tidy-alphabetical-start
+        sme-lutv2
+        sme2
+        # tidy-alphabetical-end
+    ",
+    );
+
+    good(
+        "\
+        # tidy-alphabetical-start
+        v5te
+        v6
+        v6k
+        v6t2
+        # tidy-alphabetical-end
+    ",
+    );
+
+    good(
+        "\
+        # tidy-alphabetical-start
+        zve64d
+        zve64f
+        # tidy-alphabetical-end
+    ",
+    );
+
+    // Case is significant.
+    good(
+        "\
+        # tidy-alphabetical-start
+        _ZYXW
+        _abcd
+        # tidy-alphabetical-end
+    ",
+    );
+
+    good(
+        "\
+        # tidy-alphabetical-start
+        v0
+        v00
+        v000
+        # tidy-alphabetical-end
+    ",
+    );
+
+    good(
+        "\
+        # tidy-alphabetical-start
+        w005s09t
+        w5s009t
+        # tidy-alphabetical-end
+    ",
+    );
+
+    good(
+        "\
+        # tidy-alphabetical-start
+        v0s
+        v00t
+        # tidy-alphabetical-end
+    ",
+    );
+}
+
+#[test]
+fn test_numeric_bad() {
+    let lines = "\
+        # tidy-alphabetical-start
+        item1
+        item10
+        item2
+        # tidy-alphabetical-end
+    ";
+    bad(lines, "bad:4: line not in alphabetical order");
+
+    let lines = "\
+        # tidy-alphabetical-start
+        zve64f
+        zve64d
+        # tidy-alphabetical-end
+    ";
+    bad(lines, "bad:3: line not in alphabetical order");
+
+    let lines = "\
+        # tidy-alphabetical-start
+        000
+        00
+        # tidy-alphabetical-end
+    ";
+    bad(lines, "bad:3: line not in alphabetical order");
+}