about summary refs log tree commit diff
diff options
context:
space:
mode:
authorStuart Cook <Zalathar@users.noreply.github.com>2025-07-29 16:16:41 +1000
committerGitHub <noreply@github.com>2025-07-29 16:16:41 +1000
commit95ce5af4fe4575a558e27fcfebc0646936f39618 (patch)
treeef69e90c85868ec4f24f72a9373ee8f07f8f232f
parentcb6785f73df1aa3f558796a22a4ab9652cf38e26 (diff)
parent6693b3908f36374cd998e43445760f2b1552973d (diff)
downloadrust-95ce5af4fe4575a558e27fcfebc0646936f39618.tar.gz
rust-95ce5af4fe4575a558e27fcfebc0646936f39618.zip
Rollup merge of #143883 - pietroalbini:pa-linkchecker-extra-target, r=ehuss
Add `--link-targets-dir` argument to linkchecker

In my release notes API list tool (rust-lang/rust#143053) I want to check whether all links generated by the tool are actually valid, and using linkchecker seems to be the most sensible choice.

Linkchecker currently has a fairly big limitation though: it can only check a single directory, it checks *all* of the files within it, and link targets must point inside that same directory. This works great when checking the whole documentation package, but in my case I only need to check that one file contains valid links to the standard library docs.

To solve that, this PR adds a new `--link-targets-dir` flag to linkchecker. Directories passed to it will be valid link targets (with lower priority than the root being checked), but links within them will not be checked.

I'm not that happy with the name of the flag, happy for it to be bikeshedded.
-rw-r--r--src/bootstrap/src/core/build_steps/check.rs6
-rw-r--r--src/bootstrap/src/core/builder/mod.rs1
-rw-r--r--src/tools/linkchecker/Cargo.toml2
-rw-r--r--src/tools/linkchecker/main.rs118
4 files changed, 101 insertions, 26 deletions
diff --git a/src/bootstrap/src/core/build_steps/check.rs b/src/bootstrap/src/core/build_steps/check.rs
index cfe090b22dc..b4232409ba8 100644
--- a/src/bootstrap/src/core/build_steps/check.rs
+++ b/src/bootstrap/src/core/build_steps/check.rs
@@ -556,3 +556,9 @@ tool_check_step!(Compiletest {
     allow_features: COMPILETEST_ALLOW_FEATURES,
     default: false,
 });
+
+tool_check_step!(Linkchecker {
+    path: "src/tools/linkchecker",
+    mode: |_builder| Mode::ToolBootstrap,
+    default: false
+});
diff --git a/src/bootstrap/src/core/builder/mod.rs b/src/bootstrap/src/core/builder/mod.rs
index 923c3a9a935..020622d1c12 100644
--- a/src/bootstrap/src/core/builder/mod.rs
+++ b/src/bootstrap/src/core/builder/mod.rs
@@ -1033,6 +1033,7 @@ impl<'a> Builder<'a> {
                 check::Compiletest,
                 check::FeaturesStatusDump,
                 check::CoverageDump,
+                check::Linkchecker,
                 // This has special staging logic, it may run on stage 1 while others run on stage 0.
                 // It takes quite some time to build stage 1, so put this at the end.
                 //
diff --git a/src/tools/linkchecker/Cargo.toml b/src/tools/linkchecker/Cargo.toml
index 7123d43eb56..fb5bff3fe63 100644
--- a/src/tools/linkchecker/Cargo.toml
+++ b/src/tools/linkchecker/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "linkchecker"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 
 [[bin]]
 name = "linkchecker"
diff --git a/src/tools/linkchecker/main.rs b/src/tools/linkchecker/main.rs
index 84cba3f8c44..1dc45728c90 100644
--- a/src/tools/linkchecker/main.rs
+++ b/src/tools/linkchecker/main.rs
@@ -17,12 +17,13 @@
 //! should catch the majority of "broken link" cases.
 
 use std::cell::{Cell, RefCell};
+use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
-use std::io::ErrorKind;
+use std::fs;
+use std::iter::once;
 use std::path::{Component, Path, PathBuf};
 use std::rc::Rc;
 use std::time::Instant;
-use std::{env, fs};
 
 use html5ever::tendril::ByteTendril;
 use html5ever::tokenizer::{
@@ -110,10 +111,25 @@ macro_rules! t {
     };
 }
 
+struct Cli {
+    docs: PathBuf,
+    link_targets_dirs: Vec<PathBuf>,
+}
+
 fn main() {
-    let docs = env::args_os().nth(1).expect("doc path should be first argument");
-    let docs = env::current_dir().unwrap().join(docs);
-    let mut checker = Checker { root: docs.clone(), cache: HashMap::new() };
+    let cli = match parse_cli() {
+        Ok(cli) => cli,
+        Err(err) => {
+            eprintln!("error: {err}");
+            usage_and_exit(1);
+        }
+    };
+
+    let mut checker = Checker {
+        root: cli.docs.clone(),
+        link_targets_dirs: cli.link_targets_dirs,
+        cache: HashMap::new(),
+    };
     let mut report = Report {
         errors: 0,
         start: Instant::now(),
@@ -125,7 +141,7 @@ fn main() {
         intra_doc_exceptions: 0,
         has_broken_urls: false,
     };
-    checker.walk(&docs, &mut report);
+    checker.walk(&cli.docs, &mut report);
     report.report();
     if report.errors != 0 {
         println!("found some broken links");
@@ -133,8 +149,50 @@ fn main() {
     }
 }
 
+fn parse_cli() -> Result<Cli, String> {
+    fn to_absolute_path(arg: &str) -> Result<PathBuf, String> {
+        std::path::absolute(arg).map_err(|e| format!("could not convert to absolute {arg}: {e}"))
+    }
+
+    let mut verbatim = false;
+    let mut docs = None;
+    let mut link_targets_dirs = Vec::new();
+
+    let mut args = std::env::args().skip(1);
+    while let Some(arg) = args.next() {
+        if !verbatim && arg == "--" {
+            verbatim = true;
+        } else if !verbatim && (arg == "-h" || arg == "--help") {
+            usage_and_exit(0)
+        } else if !verbatim && arg == "--link-targets-dir" {
+            link_targets_dirs.push(to_absolute_path(
+                &args.next().ok_or("missing value for --link-targets-dir")?,
+            )?);
+        } else if !verbatim && let Some(value) = arg.strip_prefix("--link-targets-dir=") {
+            link_targets_dirs.push(to_absolute_path(value)?);
+        } else if !verbatim && arg.starts_with('-') {
+            return Err(format!("unknown flag: {arg}"));
+        } else if docs.is_none() {
+            docs = Some(arg);
+        } else {
+            return Err("too many positional arguments".into());
+        }
+    }
+
+    Ok(Cli {
+        docs: to_absolute_path(&docs.ok_or("missing first positional argument")?)?,
+        link_targets_dirs,
+    })
+}
+
+fn usage_and_exit(code: i32) -> ! {
+    eprintln!("usage: linkchecker PATH [--link-targets-dir=PATH ...]");
+    std::process::exit(code)
+}
+
 struct Checker {
     root: PathBuf,
+    link_targets_dirs: Vec<PathBuf>,
     cache: Cache,
 }
 
@@ -420,37 +478,34 @@ impl Checker {
 
     /// Load a file from disk, or from the cache if available.
     fn load_file(&mut self, file: &Path, report: &mut Report) -> (String, &FileEntry) {
-        // https://docs.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
-        #[cfg(windows)]
-        const ERROR_INVALID_NAME: i32 = 123;
-
         let pretty_path =
             file.strip_prefix(&self.root).unwrap_or(file).to_str().unwrap().to_string();
 
-        let entry =
-            self.cache.entry(pretty_path.clone()).or_insert_with(|| match fs::metadata(file) {
+        for base in once(&self.root).chain(self.link_targets_dirs.iter()) {
+            let entry = self.cache.entry(pretty_path.clone());
+            if let Entry::Occupied(e) = &entry
+                && !matches!(e.get(), FileEntry::Missing)
+            {
+                break;
+            }
+
+            let file = base.join(&pretty_path);
+            entry.insert_entry(match fs::metadata(&file) {
                 Ok(metadata) if metadata.is_dir() => FileEntry::Dir,
                 Ok(_) => {
                     if file.extension().and_then(|s| s.to_str()) != Some("html") {
                         FileEntry::OtherFile
                     } else {
                         report.html_files += 1;
-                        load_html_file(file, report)
+                        load_html_file(&file, report)
                     }
                 }
-                Err(e) if e.kind() == ErrorKind::NotFound => FileEntry::Missing,
-                Err(e) => {
-                    // If a broken intra-doc link contains `::`, on windows, it will cause `ERROR_INVALID_NAME` rather than `NotFound`.
-                    // Explicitly check for that so that the broken link can be allowed in `LINKCHECK_EXCEPTIONS`.
-                    #[cfg(windows)]
-                    if e.raw_os_error() == Some(ERROR_INVALID_NAME)
-                        && file.as_os_str().to_str().map_or(false, |s| s.contains("::"))
-                    {
-                        return FileEntry::Missing;
-                    }
-                    panic!("unexpected read error for {}: {}", file.display(), e);
-                }
+                Err(e) if is_not_found_error(&file, &e) => FileEntry::Missing,
+                Err(e) => panic!("unexpected read error for {}: {}", file.display(), e),
             });
+        }
+
+        let entry = self.cache.get(&pretty_path).unwrap();
         (pretty_path, entry)
     }
 }
@@ -629,3 +684,16 @@ fn parse_ids(ids: &mut HashSet<String>, file: &str, source: &str, report: &mut R
         ids.insert(encoded);
     }
 }
+
+fn is_not_found_error(path: &Path, error: &std::io::Error) -> bool {
+    // https://docs.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499-
+    const WINDOWS_ERROR_INVALID_NAME: i32 = 123;
+
+    error.kind() == std::io::ErrorKind::NotFound
+        // If a broken intra-doc link contains `::`, on windows, it will cause `ERROR_INVALID_NAME`
+        // rather than `NotFound`. Explicitly check for that so that the broken link can be allowed
+        // in `LINKCHECK_EXCEPTIONS`.
+        || (cfg!(windows)
+            && error.raw_os_error() == Some(WINDOWS_ERROR_INVALID_NAME)
+            && path.as_os_str().to_str().map_or(false, |s| s.contains("::")))
+}