7 files changed, 280 insertions, 2 deletions
diff --git a/src/bootstrap/bootstrap.py b/src/bootstrap/bootstrap.py
index 6659894a171..5de7e6957c6 100644
--- a/src/bootstrap/bootstrap.py
+++ b/src/bootstrap/bootstrap.py
@@ -73,7 +73,8 @@ class RustBuild:
 
         if self.rustc().startswith(self.bin_root()) and \
            (not os.path.exists(self.rustc()) or self.rustc_out_of_date()):
-            shutil.rmtree(self.bin_root())
+            if os.path.exists(self.bin_root()):
+                shutil.rmtree(self.bin_root())
             filename = "rust-std-nightly-" + self.build + ".tar.gz"
             url = "https://static.rust-lang.org/dist/" + self.snap_rustc_date()
             tarball = os.path.join(rustc_cache, filename)
diff --git a/src/bootstrap/build/check.rs b/src/bootstrap/build/check.rs
new file mode 100644
index 00000000000..19293e80217
--- /dev/null
+++ b/src/bootstrap/build/check.rs
@@ -0,0 +1,21 @@
+// Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::process::Command;
+
+use build::{Build, Compiler};
+
+/// Runs the `linkchecker` tool for the `stage`/`host` compiler on `<out>/<host>/doc`.
+pub fn linkcheck(build: &Build, stage: u32, host: &str) {
+    println!("Linkcheck stage{} ({})", stage, host);
+    let compiler = Compiler::new(stage, host);
+    let docs = build.out.join(host).join("doc");
+    build.run(Command::new(&build.tool(&compiler, "linkchecker")).arg(docs));
+}
diff --git a/src/bootstrap/build/mod.rs b/src/bootstrap/build/mod.rs
index 825cca6563c..9f24fba5843 100644
--- a/src/bootstrap/build/mod.rs
+++ b/src/bootstrap/build/mod.rs
@@ -30,6 +30,7 @@ macro_rules! t {
 
 mod cc;
 mod channel;
+mod check;
 mod clean;
 mod compile;
 mod config;
@@ -171,6 +172,9 @@ impl Build {
                 Rustc { stage } => {
                     compile::assemble_rustc(self, stage, target.target);
                 }
+                ToolLinkchecker { stage } => {
+                    compile::tool(self, stage, target.target, "linkchecker");
+                }
                 ToolRustbook { stage } => {
                     compile::tool(self, stage, target.target, "rustbook");
                 }
@@ -195,6 +199,10 @@ impl Build {
                     doc::rustc(self, stage, target.target, &doc_out);
                 }
 
+                CheckLinkcheck { stage } => {
+                    check::linkcheck(self, stage, target.target);
+                }
+
                 Doc { .. } | // pseudo-steps
                 Check { .. } => {}
             }
diff --git a/src/bootstrap/build/step.rs b/src/bootstrap/build/step.rs
index 23c678df9ac..7921edcff55 100644
--- a/src/bootstrap/build/step.rs
+++ b/src/bootstrap/build/step.rs
@@ -46,6 +46,7 @@ macro_rules! targets {
             }),
 
             // Various tools that we can build as part of the build.
+            (tool_linkchecker, ToolLinkchecker { stage: u32 }),
             (tool_rustbook, ToolRustbook { stage: u32 }),
 
             // Steps for long-running native builds. Ideally these wouldn't
@@ -71,6 +72,7 @@ macro_rules! targets {
             // Steps for running tests. The 'check' target is just a pseudo
             // target to depend on a bunch of others.
             (check, Check { stage: u32, compiler: Compiler<'a> }),
+            (check_linkcheck, CheckLinkcheck { stage: u32 }),
         }
     }
 }
@@ -200,6 +202,8 @@ fn add_steps<'a>(build: &'a Build,
         }
 
         targets!(add_step);
+
+        panic!("unknown step: {}", step);
     }
 }
 
@@ -273,7 +277,15 @@ impl<'a> Step<'a> {
                      self.doc_std(stage)]
             }
             Source::Check { stage, compiler: _ } => {
-                vec![]
+                vec![self.check_linkcheck(stage)]
+            }
+            Source::CheckLinkcheck { stage } => {
+                vec![self.tool_linkchecker(stage), self.doc(stage)]
+            }
+
+            Source::ToolLinkchecker { stage } => {
+                vec![self.libstd(stage, self.compiler(stage))]
+            }
             Source::ToolRustbook { stage } => {
                 vec![self.librustc(stage, self.compiler(stage))]
             }
diff --git a/src/tools/linkchecker/Cargo.lock b/src/tools/linkchecker/Cargo.lock
new file mode 100644
index 00000000000..8e94137d213
--- /dev/null
+++ b/src/tools/linkchecker/Cargo.lock
@@ -0,0 +1,64 @@
+[root]
+name = "linkchecker"
+version = "0.1.0"
+dependencies = [
+ "url 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "matches"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "rand"
+version = "0.3.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "libc 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "rustc-serialize"
+version = "0.3.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "unicode-bidi"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "matches 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "unicode-normalization"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
+name = "url"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "matches 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "rustc-serialize 0.3.18 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unicode-bidi 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unicode-normalization 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "uuid 0.1.18 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "uuid"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)",
+ "rustc-serialize 0.3.18 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
diff --git a/src/tools/linkchecker/Cargo.toml b/src/tools/linkchecker/Cargo.toml
new file mode 100644
index 00000000000..29fc78a65e9
--- /dev/null
+++ b/src/tools/linkchecker/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "linkchecker"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+
+[dependencies]
+url = "0.5"
+
+[[bin]]
+name = "linkchecker"
+path = "main.rs"
diff --git a/src/tools/linkchecker/main.rs b/src/tools/linkchecker/main.rs
new file mode 100644
index 00000000000..e5e88081bc4
--- /dev/null
+++ b/src/tools/linkchecker/main.rs
@@ -0,0 +1,161 @@
+// Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Script to check the validity of `href` links in our HTML documentation.
+//!
+//! In the past we've been quite prone to writing broken links, as most of
+//! them are added manually rather than automatically. As files move over
+//! time or APIs change, old links become stale or broken. The purpose of this
+//! script is to check all relative links in our documentation to make sure they
+//! actually point to a valid place.
+//!
+//! Currently this doesn't actually do any HTML parsing or anything fancy like
+//! that, it just has a simple "regex" to search for `href` attributes. These
+//! values are then translated to file URLs if possible and then the destination
+//! is asserted to exist.
+//!
+//! A few whitelisted exceptions are allowed as there are known bugs in rustdoc,
+//! but this should catch the majority of "broken link" cases.
+
+extern crate url;
+
+use std::env;
+use std::fs::File;
+use std::io::prelude::*;
+use std::path::Path;
+
+use url::{Url, UrlParser};
+
+// Unwraps an `Ok`; on `Err`, panics with the expression's source text and the error.
+macro_rules! t {
+    ($e:expr) => {
+        match $e { Ok(val) => val, Err(err) => panic!("{} failed with {}", stringify!($e), err) }
+    }
+}
+
+/// Entry point: checks all links under the docs directory given as argv[1].
+fn main() {
+    let docs = env::args().nth(1).expect("usage: linkchecker <path-to-docs>");
+    let docs = t!(env::current_dir()).join(docs);
+    let mut url = Url::from_file_path(&docs).expect("docs path cannot become a file URL");
+    let mut errors = false;
+    walk(&docs, &docs, &mut url, &mut errors);
+    // Panic so that the process exits with a failure status.
+    if errors { panic!("found some broken links"); }
+}
+
+/// Recurses through `dir`, running `check` on each non-directory entry; every
+/// entry name is pushed onto `url`'s path while visited and popped afterwards.
+fn walk(root: &Path, dir: &Path, url: &mut Url, errors: &mut bool) {
+    for entry in t!(dir.read_dir()).map(|e| t!(e)) {
+        let (child, is_dir) = (entry.path(), t!(entry.file_type()).is_dir());
+        url.path_mut().unwrap().push(entry.file_name().into_string().unwrap());
+        match is_dir {
+            true => walk(root, &child, url, errors),
+            false => check(root, &child, url, errors),
+        }
+        url.path_mut().unwrap().pop();
+    }
+}
+
+/// Checks every `href` in `file` (whose URL is `base`): a link that resolves
+/// to a local file path must exist on disk, otherwise it is printed relative
+/// to `root` where possible and `*errors` is set. `.js` files, files that
+/// cannot be read as UTF-8 text and a few whitelisted pages are skipped.
+fn check(root: &Path, file: &Path, base: &Url, errors: &mut bool) {
+    // ignore js files as they are not prone to errors as the rest of the
+    // documentation is and they otherwise bring up false positives.
+    if file.extension().and_then(|s| s.to_str()) == Some("js") { return }
+
+    let pretty_file = file.strip_prefix(root).unwrap_or(file);
+
+    // Unfortunately we're not 100% full of valid links today so we need a few
+    // whitelists to get this past `make check` today.
+    if let Some(path) = pretty_file.to_str() {
+        // FIXME(#32129)
+        if path == "std/string/struct.String.html" {
+            return
+        }
+        // FIXME(#32130)
+        if path.contains("btree_set/struct.BTreeSet.html") ||
+           path == "collections/struct.BTreeSet.html" {
+            return
+        }
+        // FIXME(#31948)
+        if path.contains("ParseFloatError") {
+            return
+        }
+
+        // currently whitelisted too (NOTE(review): the reason isn't recorded here)
+        if path == "std/sys/ext/index.html" {
+            return
+        }
+
+        // weird reexports, but this module is on its way out, so chalk it up to
+        // "rustdoc weirdness" and move on from there
+        if path.contains("scoped_tls") {
+            return
+        }
+    }
+
+    let mut parser = UrlParser::new();
+    parser.base_url(base);
+    let mut contents = String::new();
+    if t!(File::open(file)).read_to_string(&mut contents).is_err() {
+        return
+    }
+
+    for (i, mut line) in contents.lines().enumerate() {
+        // Search for anything that's the regex 'href[ ]*=[ ]*(".*?"|'.*?')'
+        while let Some(j) = line.find(" href") {
+            let rest = &line[j + 5..];
+            line = rest;
+            let pos_equals = match rest.find("=") {
+                Some(i) => i,
+                None => continue,
+            };
+            if rest[..pos_equals].trim_left_matches(" ") != "" { continue }
+            let rest = &rest[pos_equals + 1..];
+            // The opening quote is whichever of `"` / `'` comes first, and the
+            // value runs up to the next occurrence of that same character.
+            let pos_quote = match rest.find(|c: char| c == '"' || c == '\'') {
+                Some(i) => i,
+                None => continue,
+            };
+            if rest[..pos_quote].trim_left_matches(" ") != "" { continue }
+            let (quote, rest) = rest[pos_quote..].split_at(1);
+            let url = match rest.find(quote) {
+                Some(i) => &rest[..i],
+                None => continue,
+            };
+
+            // Once we've plucked out the URL, parse it using our base url and
+            // then try to extract a file path. If either of these fail then we
+            // just keep going.
+            let parsed_url = match parser.parse(url) {
+                Ok(url) => url,
+                Err(..) => continue,
+            };
+            let path = match parsed_url.to_file_path() {
+                Ok(path) => path,
+                Err(..) => continue,
+            };
+
+            // Alright, if we've found a file name then this file had better
+            // exist! If it doesn't then we register and print an error.
+            if !path.exists() {
+                *errors = true;
+                print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
+                let pretty_path = path.strip_prefix(root).unwrap_or(&path);
+                println!("{}", pretty_path.display());
+            }
+        }
+    }
+}