about summary refs log tree commit diff
diff options
context:
space:
mode:
authorStuart Cook <Zalathar@users.noreply.github.com>2025-08-26 14:19:16 +1000
committerGitHub <noreply@github.com>2025-08-26 14:19:16 +1000
commitaecc0287efcb91105bfd4751fc5a4e1873348758 (patch)
tree5d5d962bce08c66eb17b3900b61c3ffd0995bf2e
parente011dd47ee04cd1e62786b5a0b3bfe2d5e58ae35 (diff)
parentd022089f58fa6bf8f4f0bb020640836eb10eae7a (diff)
downloadrust-aecc0287efcb91105bfd4751fc5a4e1873348758.tar.gz
rust-aecc0287efcb91105bfd4751fc5a4e1873348758.zip
Rollup merge of #145535 - lolbinarycat:rustdoc-invalid_html_tags-svg-145529, r=GuillaumeGomez
make rustdoc::invalid_html_tags more robust

Best reviewed one commit at a time.

I kept finding more edge cases, so I ended up having to make quite significant changes to the parser in order to make it preserve state across events and handle multiline attributes correctly.

fixes rust-lang/rust#145529
-rw-r--r--src/librustdoc/lib.rs1
-rw-r--r--src/librustdoc/passes/lint/html_tags.rs476
-rw-r--r--src/librustdoc/passes/lint/html_tags/tests.rs73
-rw-r--r--tests/rustdoc-ui/lints/invalid-html-tags.rs94
-rw-r--r--tests/rustdoc-ui/lints/invalid-html-tags.stderr71
5 files changed, 534 insertions, 181 deletions
diff --git a/src/librustdoc/lib.rs b/src/librustdoc/lib.rs
index d891d1fba25..e2682045ab4 100644
--- a/src/librustdoc/lib.rs
+++ b/src/librustdoc/lib.rs
@@ -11,6 +11,7 @@
 #![feature(file_buffered)]
 #![feature(format_args_nl)]
 #![feature(if_let_guard)]
+#![feature(iter_advance_by)]
 #![feature(iter_intersperse)]
 #![feature(round_char_boundary)]
 #![feature(rustc_private)]
diff --git a/src/librustdoc/passes/lint/html_tags.rs b/src/librustdoc/passes/lint/html_tags.rs
index 19cf15d40a3..da09117b1bb 100644
--- a/src/librustdoc/passes/lint/html_tags.rs
+++ b/src/librustdoc/passes/lint/html_tags.rs
@@ -1,9 +1,11 @@
 //! Detects invalid HTML (like an unclosed `<span>`) in doc comments.
 
+use std::borrow::Cow;
 use std::iter::Peekable;
 use std::ops::Range;
 use std::str::CharIndices;
 
+use itertools::Itertools as _;
 use pulldown_cmark::{BrokenLink, Event, LinkType, Parser, Tag, TagEnd};
 use rustc_hir::HirId;
 use rustc_resolve::rustdoc::source_span_for_markdown_range;
@@ -101,7 +103,7 @@ pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item, hir_id: HirId, dox: &
         });
     };
 
-    let mut tags = Vec::new();
+    let mut tagp = TagParser::new();
     let mut is_in_comment = None;
     let mut in_code_block = false;
 
@@ -126,70 +128,65 @@ pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item, hir_id: HirId, dox: &
     };
 
     let p = Parser::new_with_broken_link_callback(dox, main_body_opts(), Some(&mut replacer))
-        .into_offset_iter();
+        .into_offset_iter()
+        .coalesce(|a, b| {
+            // for some reason, pulldown-cmark splits html blocks into separate events for each line.
+            // we undo this, in order to handle multi-line tags.
+            match (a, b) {
+                ((Event::Html(_), ra), (Event::Html(_), rb)) if ra.end == rb.start => {
+                    let merged = ra.start..rb.end;
+                    Ok((Event::Html(Cow::Borrowed(&dox[merged.clone()]).into()), merged))
+                }
+                x => Err(x),
+            }
+        });
 
     for (event, range) in p {
         match event {
             Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
             Event::Html(text) | Event::InlineHtml(text) if !in_code_block => {
-                extract_tags(&mut tags, &text, range, &mut is_in_comment, &report_diag)
+                tagp.extract_tags(&text, range, &mut is_in_comment, &report_diag)
             }
             Event::End(TagEnd::CodeBlock) => in_code_block = false,
             _ => {}
         }
     }
 
-    for (tag, range) in tags.iter().filter(|(t, _)| {
-        let t = t.to_lowercase();
-        !ALLOWED_UNCLOSED.contains(&t.as_str())
-    }) {
-        report_diag(format!("unclosed HTML tag `{tag}`"), range, true);
-    }
-
     if let Some(range) = is_in_comment {
         report_diag("Unclosed HTML comment".to_string(), &range, false);
+    } else if let &Some(quote_pos) = &tagp.quote_pos {
+        let qr = Range { start: quote_pos, end: quote_pos };
+        report_diag(
+            format!("unclosed quoted HTML attribute on tag `{}`", &tagp.tag_name),
+            &qr,
+            false,
+        );
+    } else {
+        if !tagp.tag_name.is_empty() {
+            report_diag(
+                format!("incomplete HTML tag `{}`", &tagp.tag_name),
+                &(tagp.tag_start_pos..dox.len()),
+                false,
+            );
+        }
+        for (tag, range) in tagp.tags.iter().filter(|(t, _)| {
+            let t = t.to_lowercase();
+            !is_implicitly_self_closing(&t)
+        }) {
+            report_diag(format!("unclosed HTML tag `{tag}`"), range, true);
+        }
     }
 }
 
+/// These tags are interpreted as self-closing if they lack an explicit closing tag.
 const ALLOWED_UNCLOSED: &[&str] = &[
     "area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param",
     "source", "track", "wbr",
 ];
 
-fn drop_tag(
-    tags: &mut Vec<(String, Range<usize>)>,
-    tag_name: String,
-    range: Range<usize>,
-    f: &impl Fn(String, &Range<usize>, bool),
-) {
-    let tag_name_low = tag_name.to_lowercase();
-    if let Some(pos) = tags.iter().rposition(|(t, _)| t.to_lowercase() == tag_name_low) {
-        // If the tag is nested inside a "<script>" or a "<style>" tag, no warning should
-        // be emitted.
-        let should_not_warn = tags.iter().take(pos + 1).any(|(at, _)| {
-            let at = at.to_lowercase();
-            at == "script" || at == "style"
-        });
-        for (last_tag_name, last_tag_span) in tags.drain(pos + 1..) {
-            if should_not_warn {
-                continue;
-            }
-            let last_tag_name_low = last_tag_name.to_lowercase();
-            if ALLOWED_UNCLOSED.contains(&last_tag_name_low.as_str()) {
-                continue;
-            }
-            // `tags` is used as a queue, meaning that everything after `pos` is included inside it.
-            // So `<h2><h3></h2>` will look like `["h2", "h3"]`. So when closing `h2`, we will still
-            // have `h3`, meaning the tag wasn't closed as it should have.
-            f(format!("unclosed HTML tag `{last_tag_name}`"), &last_tag_span, true);
-        }
-        // Remove the `tag_name` that was originally closed
-        tags.pop();
-    } else {
-        // It can happen for example in this case: `<h2></script></h2>` (the `h2` tag isn't required
-        // but it helps for the visualization).
-        f(format!("unopened HTML tag `{tag_name}`"), &range, false);
-    }
+/// Allows constructs like `<img>`, but not `<img`.
+fn is_implicitly_self_closing(tag_name: &str) -> bool {
+    ALLOWED_UNCLOSED.contains(&tag_name)
 }
 
 fn extract_path_backwards(text: &str, end_pos: usize) -> Option<usize> {
@@ -252,151 +249,292 @@ fn is_valid_for_html_tag_name(c: char, is_empty: bool) -> bool {
     c.is_ascii_alphabetic() || !is_empty && (c == '-' || c.is_ascii_digit())
 }
 
-fn extract_html_tag(
-    tags: &mut Vec<(String, Range<usize>)>,
-    text: &str,
-    range: &Range<usize>,
-    start_pos: usize,
-    iter: &mut Peekable<CharIndices<'_>>,
-    f: &impl Fn(String, &Range<usize>, bool),
-) {
-    let mut tag_name = String::new();
-    let mut is_closing = false;
-    let mut prev_pos = start_pos;
+/// Parse html tags to ensure they are well-formed
+#[derive(Debug, Clone)]
+struct TagParser {
+    tags: Vec<(String, Range<usize>)>,
+    /// Name of the tag that is being parsed, if we are within a tag.
+    ///
+    /// Since the `<` and name of a tag must appear on the same line with no whitespace,
+    /// if this is the empty string, we are not in a tag.
+    tag_name: String,
+    tag_start_pos: usize,
+    is_closing: bool,
+    /// `true` if we are within a tag, but not within its name.
+    in_attrs: bool,
+    /// If we are in a quoted attribute, what quote char does it use?
+    ///
+    /// This needs to be stored in the struct since HTML5 allows newlines in quoted attrs.
+    quote: Option<char>,
+    quote_pos: Option<usize>,
+    after_eq: bool,
+}
 
-    loop {
-        let (pos, c) = match iter.peek() {
-            Some((pos, c)) => (*pos, *c),
-            // In case we reached the of the doc comment, we want to check that it's an
-            // unclosed HTML tag. For example "/// <h3".
-            None => (prev_pos, '\0'),
-        };
-        prev_pos = pos;
-        // Checking if this is a closing tag (like `</a>` for `<a>`).
-        if c == '/' && tag_name.is_empty() {
-            is_closing = true;
-        } else if is_valid_for_html_tag_name(c, tag_name.is_empty()) {
-            tag_name.push(c);
-        } else {
-            if !tag_name.is_empty() {
-                let mut r = Range { start: range.start + start_pos, end: range.start + pos };
-                if c == '>' {
-                    // In case we have a tag without attribute, we can consider the span to
-                    // refer to it fully.
-                    r.end += 1;
+impl TagParser {
+    fn new() -> Self {
+        Self {
+            tags: Vec::new(),
+            tag_name: String::with_capacity(8),
+            tag_start_pos: 0,
+            is_closing: false,
+            in_attrs: false,
+            quote: None,
+            quote_pos: None,
+            after_eq: false,
+        }
+    }
+
+    fn drop_tag(&mut self, range: Range<usize>, f: &impl Fn(String, &Range<usize>, bool)) {
+        let tag_name_low = self.tag_name.to_lowercase();
+        if let Some(pos) = self.tags.iter().rposition(|(t, _)| t.to_lowercase() == tag_name_low) {
+            // If the tag is nested inside a "<script>" or a "<style>" tag, no warning should
+            // be emitted.
+            let should_not_warn = self.tags.iter().take(pos + 1).any(|(at, _)| {
+                let at = at.to_lowercase();
+                at == "script" || at == "style"
+            });
+            for (last_tag_name, last_tag_span) in self.tags.drain(pos + 1..) {
+                if should_not_warn {
+                    continue;
                 }
-                if is_closing {
-                    // In case we have "</div >" or even "</div         >".
-                    if c != '>' {
-                        if !c.is_whitespace() {
-                            // It seems like it's not a valid HTML tag.
-                            break;
-                        }
-                        let mut found = false;
-                        for (new_pos, c) in text[pos..].char_indices() {
+                let last_tag_name_low = last_tag_name.to_lowercase();
+                if is_implicitly_self_closing(&last_tag_name_low) {
+                    continue;
+                }
+                // `tags` is used as a queue, meaning that everything after `pos` is included inside it.
+                // So `<h2><h3></h2>` will look like `["h2", "h3"]`. So when closing `h2`, we will still
+                // have `h3`, meaning the tag wasn't closed as it should have.
+                f(format!("unclosed HTML tag `{last_tag_name}`"), &last_tag_span, true);
+            }
+            // Remove the `tag_name` that was originally closed
+            self.tags.pop();
+        } else {
+            // It can happen for example in this case: `<h2></script></h2>` (the `h2` tag isn't required
+            // but it helps for the visualization).
+            f(format!("unopened HTML tag `{}`", &self.tag_name), &range, false);
+        }
+    }
+
+    /// Handle a `<` that appeared while parsing a tag.
+    fn handle_lt_in_tag(
+        &mut self,
+        range: Range<usize>,
+        lt_pos: usize,
+        f: &impl Fn(String, &Range<usize>, bool),
+    ) {
+        let global_pos = range.start + lt_pos;
+        // is this check needed?
+        if global_pos == self.tag_start_pos {
+            // `<` is in the tag because it is the start.
+            return;
+        }
+        // tried to start a new tag while in a tag
+        f(
+            format!("incomplete HTML tag `{}`", &self.tag_name),
+            &(self.tag_start_pos..global_pos),
+            false,
+        );
+        self.tag_parsed();
+    }
+
+    fn extract_html_tag(
+        &mut self,
+        text: &str,
+        range: &Range<usize>,
+        start_pos: usize,
+        iter: &mut Peekable<CharIndices<'_>>,
+        f: &impl Fn(String, &Range<usize>, bool),
+    ) {
+        let mut prev_pos = start_pos;
+
+        'outer_loop: loop {
+            let (pos, c) = match iter.peek() {
+                Some((pos, c)) => (*pos, *c),
+                // In case we reached the end of the doc comment, we want to check that it's an
+                // unclosed HTML tag. For example "/// <h3".
+                None if self.tag_name.is_empty() => (prev_pos, '\0'),
+                None => break,
+            };
+            prev_pos = pos;
+            if c == '/' && self.tag_name.is_empty() {
+                // Checking if this is a closing tag (like `</a>` for `<a>`).
+                self.is_closing = true;
+            } else if !self.in_attrs && is_valid_for_html_tag_name(c, self.tag_name.is_empty()) {
+                self.tag_name.push(c);
+            } else {
+                if !self.tag_name.is_empty() {
+                    self.in_attrs = true;
+                    let mut r = Range { start: range.start + start_pos, end: range.start + pos };
+                    if c == '>' {
+                        // In case we have a tag without attribute, we can consider the span to
+                        // refer to it fully.
+                        r.end += 1;
+                    }
+                    if self.is_closing {
+                        // In case we have "</div >" or even "</div         >".
+                        if c != '>' {
                             if !c.is_whitespace() {
-                                if c == '>' {
-                                    r.end = range.start + new_pos + 1;
-                                    found = true;
-                                }
+                                // It seems like it's not a valid HTML tag.
                                 break;
                             }
-                        }
-                        if !found {
-                            break;
-                        }
-                    }
-                    drop_tag(tags, tag_name, r, f);
-                } else {
-                    let mut is_self_closing = false;
-                    let mut quote_pos = None;
-                    if c != '>' {
-                        let mut quote = None;
-                        let mut after_eq = false;
-                        for (i, c) in text[pos..].char_indices() {
-                            if !c.is_whitespace() {
-                                if let Some(q) = quote {
-                                    if c == q {
-                                        quote = None;
-                                        quote_pos = None;
-                                        after_eq = false;
+                            let mut found = false;
+                            for (new_pos, c) in text[pos..].char_indices() {
+                                if !c.is_whitespace() {
+                                    if c == '>' {
+                                        r.end = range.start + new_pos + 1;
+                                        found = true;
+                                    } else if c == '<' {
+                                        self.handle_lt_in_tag(range.clone(), pos + new_pos, f);
                                     }
-                                } else if c == '>' {
                                     break;
-                                } else if c == '/' && !after_eq {
-                                    is_self_closing = true;
-                                } else {
-                                    if is_self_closing {
-                                        is_self_closing = false;
-                                    }
-                                    if (c == '"' || c == '\'') && after_eq {
-                                        quote = Some(c);
-                                        quote_pos = Some(pos + i);
-                                    } else if c == '=' {
-                                        after_eq = true;
-                                    }
                                 }
-                            } else if quote.is_none() {
-                                after_eq = false;
+                            }
+                            if !found {
+                                break 'outer_loop;
                             }
                         }
-                    }
-                    if let Some(quote_pos) = quote_pos {
-                        let qr = Range { start: quote_pos, end: quote_pos };
-                        f(
-                            format!("unclosed quoted HTML attribute on tag `{tag_name}`"),
-                            &qr,
-                            false,
-                        );
-                    }
-                    if is_self_closing {
-                        // https://html.spec.whatwg.org/#parse-error-non-void-html-element-start-tag-with-trailing-solidus
-                        let valid = ALLOWED_UNCLOSED.contains(&&tag_name[..])
-                            || tags.iter().take(pos + 1).any(|(at, _)| {
-                                let at = at.to_lowercase();
-                                at == "svg" || at == "math"
-                            });
-                        if !valid {
-                            f(format!("invalid self-closing HTML tag `{tag_name}`"), &r, false);
-                        }
+                        self.drop_tag(r, f);
+                        self.tag_parsed();
                     } else {
-                        tags.push((tag_name, r));
+                        self.extract_opening_tag(text, range, r, pos, c, iter, f)
                     }
                 }
+                break;
             }
-            break;
+            iter.next();
         }
-        iter.next();
     }
-}
-
-fn extract_tags(
-    tags: &mut Vec<(String, Range<usize>)>,
-    text: &str,
-    range: Range<usize>,
-    is_in_comment: &mut Option<Range<usize>>,
-    f: &impl Fn(String, &Range<usize>, bool),
-) {
-    let mut iter = text.char_indices().peekable();
 
-    while let Some((start_pos, c)) = iter.next() {
-        if is_in_comment.is_some() {
-            if text[start_pos..].starts_with("-->") {
-                *is_in_comment = None;
+    fn extract_opening_tag(
+        &mut self,
+        text: &str,
+        range: &Range<usize>,
+        r: Range<usize>,
+        pos: usize,
+        c: char,
+        iter: &mut Peekable<CharIndices<'_>>,
+        f: &impl Fn(String, &Range<usize>, bool),
+    ) {
+        // we can store this as a local, since html5 does require the `/` and `>`
+        // to not be separated by whitespace.
+        let mut is_self_closing = false;
+        if c != '>' {
+            'parse_til_gt: {
+                for (i, c) in text[pos..].char_indices() {
+                    if !c.is_whitespace() {
+                        debug_assert_eq!(self.quote_pos.is_some(), self.quote.is_some());
+                        if let Some(q) = self.quote {
+                            if c == q {
+                                self.quote = None;
+                                self.quote_pos = None;
+                                self.after_eq = false;
+                            }
+                        } else if c == '>' {
+                            break 'parse_til_gt;
+                        } else if c == '<' {
+                            self.handle_lt_in_tag(range.clone(), pos + i, f);
+                        } else if c == '/' && !self.after_eq {
+                            is_self_closing = true;
+                        } else {
+                            if is_self_closing {
+                                is_self_closing = false;
+                            }
+                            if (c == '"' || c == '\'') && self.after_eq {
+                                self.quote = Some(c);
+                                self.quote_pos = Some(pos + i);
+                            } else if c == '=' {
+                                self.after_eq = true;
+                            }
+                        }
+                    } else if self.quote.is_none() {
+                        self.after_eq = false;
+                    }
+                    if !is_self_closing && !self.tag_name.is_empty() {
+                        iter.next();
+                    }
+                }
+                // if we've run out of text but still haven't found a `>`,
+                // return early without calling `tag_parsed` or emitting lints.
+                // this allows us to either find the `>` in a later event
+                // or emit a lint about it being missing.
+                return;
             }
-        } else if c == '<' {
-            if text[start_pos..].starts_with("<!--") {
-                // We skip the "!--" part. (Once `advance_by` is stable, might be nice to use it!)
-                iter.next();
-                iter.next();
-                iter.next();
-                *is_in_comment = Some(Range {
-                    start: range.start + start_pos,
-                    end: range.start + start_pos + 3,
+        }
+        if is_self_closing {
+            // https://html.spec.whatwg.org/#parse-error-non-void-html-element-start-tag-with-trailing-solidus
+            let valid = ALLOWED_UNCLOSED.contains(&&self.tag_name[..])
+                || self.tags.iter().take(pos + 1).any(|(at, _)| {
+                    let at = at.to_lowercase();
+                    at == "svg" || at == "math"
                 });
-            } else {
-                extract_html_tag(tags, text, &range, start_pos, &mut iter, f);
+            if !valid {
+                f(format!("invalid self-closing HTML tag `{}`", self.tag_name), &r, false);
+            }
+        } else if !self.tag_name.is_empty() {
+            self.tags.push((std::mem::take(&mut self.tag_name), r));
+        }
+        self.tag_parsed();
+    }
+    /// Finished parsing a tag, reset related data.
+    fn tag_parsed(&mut self) {
+        self.tag_name.clear();
+        self.is_closing = false;
+        self.in_attrs = false;
+    }
+
+    fn extract_tags(
+        &mut self,
+        text: &str,
+        range: Range<usize>,
+        is_in_comment: &mut Option<Range<usize>>,
+        f: &impl Fn(String, &Range<usize>, bool),
+    ) {
+        let mut iter = text.char_indices().peekable();
+        let mut prev_pos = 0;
+        loop {
+            if self.quote.is_some() {
+                debug_assert!(self.in_attrs && self.quote_pos.is_some());
+            }
+            if self.in_attrs
+                && let Some(&(start_pos, _)) = iter.peek()
+            {
+                self.extract_html_tag(text, &range, start_pos, &mut iter, f);
+                // if no progress is being made, move forward forcefully.
+                if prev_pos == start_pos {
+                    iter.next();
+                }
+                prev_pos = start_pos;
+                continue;
+            }
+            let Some((start_pos, c)) = iter.next() else { break };
+            if is_in_comment.is_some() {
+                if text[start_pos..].starts_with("-->") {
+                    *is_in_comment = None;
+                }
+            } else if c == '<' {
+                // "<!--" is a valid attribute name under html5, so don't treat it as a comment if we're in a tag.
+                if self.tag_name.is_empty() && text[start_pos..].starts_with("<!--") {
+                    // We skip the "!--" part. (Once `advance_by` is stable, might be nice to use it!)
+                    iter.next();
+                    iter.next();
+                    iter.next();
+                    *is_in_comment = Some(Range {
+                        start: range.start + start_pos,
+                        end: range.start + start_pos + 4,
+                    });
+                } else {
+                    if self.tag_name.is_empty() {
+                        self.tag_start_pos = range.start + start_pos;
+                    }
+                    self.extract_html_tag(text, &range, start_pos, &mut iter, f);
+                }
+            } else if !self.tag_name.is_empty() {
+                // partially inside html tag that spans across events
+                self.extract_html_tag(text, &range, start_pos, &mut iter, f);
             }
         }
     }
 }
+
+#[cfg(test)]
+mod tests;
diff --git a/src/librustdoc/passes/lint/html_tags/tests.rs b/src/librustdoc/passes/lint/html_tags/tests.rs
new file mode 100644
index 00000000000..81c1d21a55d
--- /dev/null
+++ b/src/librustdoc/passes/lint/html_tags/tests.rs
@@ -0,0 +1,73 @@
+use std::cell::RefCell;
+
+use super::*;
+
+#[test]
+fn test_extract_tags_nested_unclosed() {
+    let mut tagp = TagParser::new();
+    let diags = RefCell::new(Vec::new());
+    let dox = "<div>\n<br</div>";
+    tagp.extract_tags(dox, 0..dox.len(), &mut None, &|s, r, b| {
+        diags.borrow_mut().push((s, r.clone(), b));
+    });
+    assert_eq!(diags.borrow().len(), 1, "did not get expected diagnostics: {diags:?}");
+    assert_eq!(diags.borrow()[0].1, 6..9)
+}
+
+#[test]
+fn test_extract_tags_taglike_in_attr() {
+    let mut tagp = TagParser::new();
+    let diags = RefCell::new(Vec::new());
+    let dox = "<img src='<div>'>";
+    tagp.extract_tags(dox, 0..dox.len(), &mut None, &|s, r, b| {
+        diags.borrow_mut().push((s, r.clone(), b));
+    });
+    assert_eq!(diags.borrow().len(), 0, "unexpected diagnostics: {diags:?}");
+}
+
+#[test]
+fn test_extract_tags_taglike_in_multiline_attr() {
+    let mut tagp = TagParser::new();
+    let diags = RefCell::new(Vec::new());
+    let dox = "<img src=\"\nasd\n<div>\n\">";
+    tagp.extract_tags(dox, 0..dox.len(), &mut None, &|s, r, b| {
+        diags.borrow_mut().push((s, r.clone(), b));
+    });
+    assert_eq!(diags.borrow().len(), 0, "unexpected diagnostics: {diags:?}");
+}
+
+#[test]
+fn test_extract_tags_taglike_in_multievent_attr() {
+    let mut tagp = TagParser::new();
+    let diags = RefCell::new(Vec::new());
+    let dox = "<img src='<div>'>";
+    let split_point = 10;
+    let mut p = |range: Range<usize>| {
+        tagp.extract_tags(&dox[range.clone()], range, &mut None, &|s, r, b| {
+            diags.borrow_mut().push((s, r.clone(), b));
+        })
+    };
+    p(0..split_point);
+    p(split_point..dox.len());
+    assert_eq!(diags.borrow().len(), 0, "unexpected diagnostics: {diags:?}");
+}
+
+#[test]
+fn test_extract_tags_taglike_in_multiline_multievent_attr() {
+    let mut tagp = TagParser::new();
+    let diags = RefCell::new(Vec::new());
+    let dox = "<img src='\n foo:\n </div>\n <p/>\n <div>\n'>";
+    let mut p = |range: Range<usize>| {
+        tagp.extract_tags(&dox[range.clone()], range, &mut None, &|s, r, b| {
+            diags.borrow_mut().push((s, r.clone(), b));
+        })
+    };
+    let mut offset = 0;
+    for ln in dox.split_inclusive('\n') {
+        let new_offset = offset + ln.len();
+        p(offset..new_offset);
+        offset = new_offset;
+    }
+    assert_eq!(diags.borrow().len(), 0, "unexpected diagnostics: {diags:?}");
+    assert_eq!(tagp.tags.len(), 1);
+}
diff --git a/tests/rustdoc-ui/lints/invalid-html-tags.rs b/tests/rustdoc-ui/lints/invalid-html-tags.rs
index 317f1fd1d46..8003e5efdd5 100644
--- a/tests/rustdoc-ui/lints/invalid-html-tags.rs
+++ b/tests/rustdoc-ui/lints/invalid-html-tags.rs
@@ -43,7 +43,7 @@ pub fn b() {}
 ///   <h3>
 //~^ ERROR unclosed HTML tag `h3`
 /// <script
-//~^ ERROR unclosed HTML tag `script`
+//~^ ERROR incomplete HTML tag `script`
 pub fn c() {}
 
 // Unclosed tags shouldn't warn if they are nested inside a <script> elem.
@@ -72,6 +72,7 @@ pub fn e() {}
 /// <div></div >
 /// <div></div
 //~^ ERROR unclosed HTML tag `div`
+//~| ERROR incomplete HTML tag `div`
 pub fn f() {}
 
 /// <!---->
@@ -105,7 +106,7 @@ pub fn j() {}
 /// uiapp.run(&env::args().collect::<Vec<_>>());
 /// ```
 ///
-/// <Vec<_> shouldn't warn!
+// <Vec<_> shouldn't warn!
 /// ``````
 pub fn k() {}
 
@@ -121,3 +122,92 @@ pub fn no_error_1() {}
 /// backslashed \<<a href="">
 //~^ ERROR unclosed HTML tag `a`
 pub fn p() {}
+
+/// <svg width="512" height="512" viewBox="0 0 512" fill="none" xmlns="http://www.w3.org/2000/svg">
+///     <rect
+///        width="256"
+///        height="256"
+///        fill="#5064C8"
+///        stroke="black"
+///     />
+/// </svg>
+pub fn no_error_2() {}
+
+/// <div>
+///     <img
+///         src="https://example.com/ferris.png"
+///         width="512"
+///         height="512"
+///     />
+/// </div>
+pub fn no_error_3() {}
+
+/// > <div
+/// > class="foo">
+/// > </div>
+pub fn no_error_4() {}
+
+/// unfinished ALLOWED_UNCLOSED
+///
+/// note: CommonMark doesn't allow an html block to start with a multiline tag,
+/// so we use `<br>` a bunch to force these to be parsed as html blocks.
+///
+/// <br>
+/// <img
+//~^ ERROR incomplete HTML tag `img`
+pub fn q() {}
+
+/// nested unfinished ALLOWED_UNCLOSED
+/// <p><img</p>
+//~^ ERROR incomplete HTML tag `img`
+pub fn r() {}
+
+/// > <br>
+/// > <img
+//~^ ERROR incomplete HTML tag `img`
+/// > href="#broken"
+pub fn s() {}
+
+/// <br>
+/// <br<br>
+//~^ ERROR incomplete HTML tag `br`
+pub fn t() {}
+
+/// <br>
+/// <br
+//~^ ERROR incomplete HTML tag `br`
+pub fn u() {}
+
+/// <a href=">" alt="<">html5 allows this</a>
+pub fn no_error_5() {}
+
+/// <br>
+/// <img title="
+/// html5
+/// allows
+/// multiline
+/// attr
+/// values
+/// these are just text, not tags:
+/// </div>
+/// <p/>
+/// <div>
+/// ">
+pub fn no_error_6() {}
+
+/// <br>
+/// <a href="data:text/html,<!DOCTYPE>
+/// <html>
+/// <body><b>this is allowed for some reason</b></body>
+/// </html>
+/// ">what</a>
+pub fn no_error_7() {}
+
+/// Technically this is allowed per the html5 spec,
+/// but there's basically no legitimate reason to do it,
+/// so we don't allow it.
+///
+/// <p <!-->foobar</p>
+//~^ ERROR Unclosed HTML comment
+//~| ERROR incomplete HTML tag `p`
+pub fn v() {}
diff --git a/tests/rustdoc-ui/lints/invalid-html-tags.stderr b/tests/rustdoc-ui/lints/invalid-html-tags.stderr
index 9c2bfcf2c3d..b6ec22c2479 100644
--- a/tests/rustdoc-ui/lints/invalid-html-tags.stderr
+++ b/tests/rustdoc-ui/lints/invalid-html-tags.stderr
@@ -52,6 +52,12 @@ error: unclosed HTML tag `p`
 LL | ///    <br/> <p>
    |              ^^^
 
+error: incomplete HTML tag `script`
+  --> $DIR/invalid-html-tags.rs:45:5
+   |
+LL | /// <script
+   |     ^^^^^^^
+
 error: unclosed HTML tag `div`
   --> $DIR/invalid-html-tags.rs:41:5
    |
@@ -64,11 +70,11 @@ error: unclosed HTML tag `h3`
 LL | ///   <h3>
    |       ^^^^
 
-error: unclosed HTML tag `script`
-  --> $DIR/invalid-html-tags.rs:45:5
+error: incomplete HTML tag `div`
+  --> $DIR/invalid-html-tags.rs:73:10
    |
-LL | /// <script
-   |     ^^^^^^
+LL | /// <div></div
+   |          ^^^^^
 
 error: unclosed HTML tag `div`
   --> $DIR/invalid-html-tags.rs:73:5
@@ -77,28 +83,73 @@ LL | /// <div></div
    |     ^^^^^
 
 error: Unclosed HTML comment
-  --> $DIR/invalid-html-tags.rs:87:5
+  --> $DIR/invalid-html-tags.rs:88:5
    |
 LL | /// <!--
-   |     ^^^
+   |     ^^^^
 
 error: unopened HTML tag `unopened-tag`
-  --> $DIR/invalid-html-tags.rs:114:26
+  --> $DIR/invalid-html-tags.rs:115:26
    |
 LL | /// Web Components style </unopened-tag>
    |                          ^^^^^^^^^^^^^^^
 
 error: unclosed HTML tag `dashed-tags`
-  --> $DIR/invalid-html-tags.rs:112:26
+  --> $DIR/invalid-html-tags.rs:113:26
    |
 LL | /// Web Components style <dashed-tags>
    |                          ^^^^^^^^^^^^^
 
 error: unclosed HTML tag `a`
-  --> $DIR/invalid-html-tags.rs:121:19
+  --> $DIR/invalid-html-tags.rs:122:19
    |
 LL | /// backslashed \<<a href="">
    |                   ^^
 
-error: aborting due to 16 previous errors
+error: incomplete HTML tag `img`
+  --> $DIR/invalid-html-tags.rs:156:5
+   |
+LL | /// <img
+   |     ^^^^
+
+error: incomplete HTML tag `img`
+  --> $DIR/invalid-html-tags.rs:161:8
+   |
+LL | /// <p><img</p>
+   |        ^^^^
+
+error: incomplete HTML tag `img`
+  --> $DIR/invalid-html-tags.rs:166:7
+   |
+LL |   /// > <img
+   |  _______^
+LL | |
+LL | | /// > href="#broken"
+   | |____________________^
+
+error: incomplete HTML tag `br`
+  --> $DIR/invalid-html-tags.rs:172:5
+   |
+LL | /// <br<br>
+   |     ^^^
+
+error: incomplete HTML tag `br`
+  --> $DIR/invalid-html-tags.rs:177:5
+   |
+LL | /// <br
+   |     ^^^
+
+error: incomplete HTML tag `p`
+  --> $DIR/invalid-html-tags.rs:210:5
+   |
+LL | /// <p <!-->foobar</p>
+   |     ^^^
+
+error: Unclosed HTML comment
+  --> $DIR/invalid-html-tags.rs:210:8
+   |
+LL | /// <p <!-->foobar</p>
+   |        ^^^^
+
+error: aborting due to 24 previous errors