diff --git a/Cargo.lock b/Cargo.lock index fff30b0f27b..d552bb655b4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -147,7 +147,21 @@ dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", - "anstyle-wincon", + "anstyle-wincon 2.1.0", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstream" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2e1ebcb11de5c03c67de28a7df593d32191b44939c482e97702baaaa6ab6a5" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon 3.0.2", "colorchoice", "utf8parse", ] @@ -186,6 +200,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "anstyle-wincon" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + [[package]] name = "anyhow" version = "1.0.75" @@ -520,7 +544,7 @@ version = "4.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5179bb514e4d7c2051749d8fcefa2ed6d06a9f4e6d69faf3805f5d80b8cf8d56" dependencies = [ - "anstream", + "anstream 0.5.0", "anstyle", "clap_lex", "strsim", @@ -558,7 +582,7 @@ checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" name = "clippy" version = "0.1.78" dependencies = [ - "anstream", + "anstream 0.5.0", "clippy_config", "clippy_lints", "clippy_utils", @@ -1234,6 +1258,16 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "env_filter" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" +dependencies = [ + "log", + "regex", +] + [[package]] name = "env_logger" version = "0.10.0" @@ -1247,6 +1281,19 @@ dependencies = [ "termcolor", ] +[[package]] +name = "env_logger" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05e7cf40684ae96ade6232ed84582f40ce0a66efcd43a5117aef610534f8e0b8" +dependencies = [ + "anstream 0.6.11", + "anstyle", + "env_filter", + "humantime", + "log", +] + [[package]] name = "equivalent" version = "1.0.0" @@ -1638,9 +1685,9 @@ dependencies = [ [[package]] name = "handlebars" -version = "4.3.7" +version = "5.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83c3372087601b532857d332f5957cbae686da52bb7810bf038c3e3c3cc2fa0d" +checksum = "ab283476b99e66691dee3f1640fea91487a8d81f50fb5ecc75538f8f8879a1e4" dependencies = [ "log", "pest", @@ -2227,6 +2274,7 @@ dependencies = [ name = "linkchecker" version = "0.1.0" dependencies = [ + "html5ever", "once_cell", "regex", ] @@ -2335,9 +2383,9 @@ dependencies = [ [[package]] name = "mdbook" -version = "0.4.36" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80992cb0e05f22cc052c99f8e883f1593b891014b96a8b4637fd274d7030c85e" +checksum = "0c33564061c3c640bed5ace7d6a2a1b65f2c64257d1ac930c15e94ed0fb561d3" dependencies = [ "ammonia", "anyhow", @@ -2345,14 +2393,13 @@ dependencies = [ "clap", "clap_complete", "elasticlunr-rs", - "env_logger", + "env_logger 0.11.1", "handlebars", "log", "memchr", "once_cell", "opener", - "pathdiff", - "pulldown-cmark", + "pulldown-cmark 0.10.0", "regex", "serde", "serde_json", @@ -2471,7 +2518,7 @@ dependencies = [ "aes", "colored", "ctrlc", - "env_logger", + "env_logger 0.10.0", "getrandom", "jemalloc-sys", "lazy_static", @@ -2689,7 +2736,7 @@ dependencies = [ "camino", "clap", "derive_builder", - "env_logger", + "env_logger 0.10.0", "fs_extra", "glob", "humansize", @@ -3012,6 +3059,24 @@ dependencies = [ "unicase", ] +[[package]] +name = "pulldown-cmark" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce76ce678ffc8e5675b22aa1405de0b7037e2fdf8913fea40d1926c6fe1e6e7" +dependencies = [ + "bitflags 2.4.1", + "memchr", + "pulldown-cmark-escape", + "unicase", +] + +[[package]] +name = "pulldown-cmark-escape" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5d8f9aa0e3cbcfaf8bf00300004ee3b72f74770f9cbac93f6928771f613276b" + [[package]] name = "punycode" version = "0.4.1" @@ -3271,7 +3336,7 @@ name = "rustbook" version = "0.1.0" dependencies = [ "clap", - "env_logger", + "env_logger 0.10.0", "mdbook", ] @@ -4427,7 +4492,7 @@ name = "rustc_resolve" version = "0.0.0" dependencies = [ "bitflags 2.4.1", - "pulldown-cmark", + "pulldown-cmark 0.9.6", "rustc_arena", "rustc_ast", "rustc_ast_pretty", @@ -4971,9 +5036,9 @@ checksum = "45bb67a18fa91266cc7807181f62f9178a6873bfad7dc788c42e6430db40184f" [[package]] name = "shlex" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "siphasher" diff --git a/src/tools/linkchecker/Cargo.toml b/src/tools/linkchecker/Cargo.toml index 1d8f2f91882..318a69ab835 100644 --- a/src/tools/linkchecker/Cargo.toml +++ b/src/tools/linkchecker/Cargo.toml @@ -10,3 +10,4 @@ path = "main.rs" [dependencies] regex = "1" once_cell = "1" +html5ever = "0.26.0" diff --git a/src/tools/linkchecker/main.rs b/src/tools/linkchecker/main.rs index 7f73cac63cb..f49c6e79f13 100644 --- a/src/tools/linkchecker/main.rs +++ b/src/tools/linkchecker/main.rs @@ -14,6 +14,12 @@ //! A few exceptions are allowed as there's known bugs in rustdoc, but this //! should catch the majority of "broken link" cases. +use html5ever::tendril::ByteTendril; +use html5ever::tokenizer::{ + BufferQueue, TagToken, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, +}; +use once_cell::sync::Lazy; +use regex::Regex; use std::cell::RefCell; use std::collections::{HashMap, HashSet}; use std::env; @@ -23,9 +29,6 @@ use std::path::{Component, Path, PathBuf}; use std::rc::Rc; use std::time::Instant; -use once_cell::sync::Lazy; -use regex::Regex; - // Add linkcheck exceptions here // If at all possible you should use intra-doc links to avoid linkcheck issues. These // are cases where that does not work @@ -182,163 +185,10 @@ impl Checker { } }; - // Search for anything that's the regex 'href[ ]*=[ ]*".*?"' - with_attrs_in_source(&source, " href", |url, i, base| { - // Ignore external URLs - if url.starts_with("http:") - || url.starts_with("https:") - || url.starts_with("javascript:") - || url.starts_with("ftp:") - || url.starts_with("irc:") - || url.starts_with("data:") - || url.starts_with("mailto:") - { - report.links_ignored_external += 1; - return; - } - report.links_checked += 1; - let (url, fragment) = match url.split_once('#') { - None => (url, None), - Some((url, fragment)) => (url, Some(fragment)), - }; - // NB: the `splitn` always succeeds, even if the delimiter is not present. - let url = url.splitn(2, '?').next().unwrap(); - - // Once we've plucked out the URL, parse it using our base url and - // then try to extract a file path. - let mut path = file.to_path_buf(); - if !base.is_empty() || !url.is_empty() { - path.pop(); - for part in Path::new(base).join(url).components() { - match part { - Component::Prefix(_) | Component::RootDir => { - // Avoid absolute paths as they make the docs not - // relocatable by making assumptions on where the docs - // are hosted relative to the site root. - report.errors += 1; - println!( - "{}:{}: absolute path - {}", - pretty_path, - i + 1, - Path::new(base).join(url).display() - ); - return; - } - Component::CurDir => {} - Component::ParentDir => { - path.pop(); - } - Component::Normal(s) => { - path.push(s); - } - } - } - } - - let (target_pretty_path, target_entry) = self.load_file(&path, report); - let (target_source, target_ids) = match target_entry { - FileEntry::Missing => { - if is_exception(file, &target_pretty_path) { - report.links_ignored_exception += 1; - } else { - report.errors += 1; - println!( - "{}:{}: broken link - `{}`", - pretty_path, - i + 1, - target_pretty_path - ); - } - return; - } - FileEntry::Dir => { - // Links to directories show as directory listings when viewing - // the docs offline so it's best to avoid them. - report.errors += 1; - println!( - "{}:{}: directory link to `{}` \ - (directory links should use index.html instead)", - pretty_path, - i + 1, - target_pretty_path - ); - return; - } - FileEntry::OtherFile => return, - FileEntry::Redirect { target } => { - let t = target.clone(); - let (target, redir_entry) = self.load_file(&t, report); - match redir_entry { - FileEntry::Missing => { - report.errors += 1; - println!( - "{}:{}: broken redirect from `{}` to `{}`", - pretty_path, - i + 1, - target_pretty_path, - target - ); - return; - } - FileEntry::Redirect { target } => { - // Redirect to a redirect, this link checker - // currently doesn't support this, since it would - // require cycle checking, etc. - report.errors += 1; - println!( - "{}:{}: redirect from `{}` to `{}` \ - which is also a redirect (not supported)", - pretty_path, - i + 1, - target_pretty_path, - target.display() - ); - return; - } - FileEntry::Dir => { - report.errors += 1; - println!( - "{}:{}: redirect from `{}` to `{}` \ - which is a directory \ - (directory links should use index.html instead)", - pretty_path, - i + 1, - target_pretty_path, - target - ); - return; - } - FileEntry::OtherFile => return, - FileEntry::HtmlFile { source, ids } => (source, ids), - } - } - FileEntry::HtmlFile { source, ids } => (source, ids), - }; - - // Alright, if we've found an HTML file for the target link. If - // this is a fragment link, also check that the `id` exists. - if let Some(ref fragment) = fragment { - // Fragments like `#1-6` are most likely line numbers to be - // interpreted by javascript, so we're ignoring these - if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) { - return; - } - - parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report); - - if target_ids.borrow().contains(*fragment) { - return; - } - - if is_exception(file, &format!("#{}", fragment)) { - report.links_ignored_exception += 1; - } else { - report.errors += 1; - print!("{}:{}: broken link fragment ", pretty_path, i + 1); - println!("`#{}` pointing to `{}`", fragment, target_pretty_path); - }; - } - }); + let (base, urls) = get_urls(&source); + for (i, url) in urls { + self.check_url(file, &pretty_path, report, &base, i, &url); + } self.check_intra_doc_links(file, &pretty_path, &source, report); @@ -350,6 +200,159 @@ impl Checker { } } + fn check_url( + &mut self, + file: &Path, + pretty_path: &str, + report: &mut Report, + base: &Option, + i: u64, + url: &str, + ) { + // Ignore external URLs + if url.starts_with("http:") + || url.starts_with("https:") + || url.starts_with("javascript:") + || url.starts_with("ftp:") + || url.starts_with("irc:") + || url.starts_with("data:") + || url.starts_with("mailto:") + { + report.links_ignored_external += 1; + return; + } + report.links_checked += 1; + let (url, fragment) = match url.split_once('#') { + None => (url, None), + Some((url, fragment)) => (url, Some(fragment)), + }; + // NB: the `splitn` always succeeds, even if the delimiter is not present. + let url = url.splitn(2, '?').next().unwrap(); + + // Once we've plucked out the URL, parse it using our base url and + // then try to extract a file path. + let mut path = file.to_path_buf(); + if base.is_some() || !url.is_empty() { + let base = base.as_deref().unwrap_or(""); + path.pop(); + for part in Path::new(base).join(url).components() { + match part { + Component::Prefix(_) | Component::RootDir => { + // Avoid absolute paths as they make the docs not + // relocatable by making assumptions on where the docs + // are hosted relative to the site root. + report.errors += 1; + println!( + "{}:{}: absolute path - {}", + pretty_path, + i, + Path::new(base).join(url).display() + ); + return; + } + Component::CurDir => {} + Component::ParentDir => { + path.pop(); + } + Component::Normal(s) => { + path.push(s); + } + } + } + } + + let (target_pretty_path, target_entry) = self.load_file(&path, report); + let (target_source, target_ids) = match target_entry { + FileEntry::Missing => { + if is_exception(file, &target_pretty_path) { + report.links_ignored_exception += 1; + } else { + report.errors += 1; + println!("{}:{}: broken link - `{}`", pretty_path, i, target_pretty_path); + } + return; + } + FileEntry::Dir => { + // Links to directories show as directory listings when viewing + // the docs offline so it's best to avoid them. + report.errors += 1; + println!( + "{}:{}: directory link to `{}` \ + (directory links should use index.html instead)", + pretty_path, i, target_pretty_path + ); + return; + } + FileEntry::OtherFile => return, + FileEntry::Redirect { target } => { + let t = target.clone(); + let (target, redir_entry) = self.load_file(&t, report); + match redir_entry { + FileEntry::Missing => { + report.errors += 1; + println!( + "{}:{}: broken redirect from `{}` to `{}`", + pretty_path, i, target_pretty_path, target + ); + return; + } + FileEntry::Redirect { target } => { + // Redirect to a redirect, this link checker + // currently doesn't support this, since it would + // require cycle checking, etc. + report.errors += 1; + println!( + "{}:{}: redirect from `{}` to `{}` \ + which is also a redirect (not supported)", + pretty_path, + i, + target_pretty_path, + target.display() + ); + return; + } + FileEntry::Dir => { + report.errors += 1; + println!( + "{}:{}: redirect from `{}` to `{}` \ + which is a directory \ + (directory links should use index.html instead)", + pretty_path, i, target_pretty_path, target + ); + return; + } + FileEntry::OtherFile => return, + FileEntry::HtmlFile { source, ids } => (source, ids), + } + } + FileEntry::HtmlFile { source, ids } => (source, ids), + }; + + // Alright, if we've found an HTML file for the target link. If + // this is a fragment link, also check that the `id` exists. + if let Some(ref fragment) = fragment { + // Fragments like `#1-6` are most likely line numbers to be + // interpreted by javascript, so we're ignoring these + if fragment.splitn(2, '-').all(|f| f.chars().all(|c| c.is_numeric())) { + return; + } + + parse_ids(&mut target_ids.borrow_mut(), &pretty_path, target_source, report); + + if target_ids.borrow().contains(*fragment) { + return; + } + + if is_exception(file, &format!("#{}", fragment)) { + report.links_ignored_exception += 1; + } else { + report.errors += 1; + print!("{}:{}: broken link fragment ", pretty_path, i); + println!("`#{}` pointing to `{}`", fragment, target_pretty_path); + }; + } + } + fn check_intra_doc_links( &mut self, file: &Path, @@ -496,59 +499,93 @@ fn maybe_redirect(source: &str) -> Option { find_redirect(REDIRECT_RUSTDOC).or_else(|| find_redirect(REDIRECT_MDBOOK)) } -fn with_attrs_in_source(source: &str, attr: &str, mut f: F) { - let mut base = ""; - for (i, mut line) in source.lines().enumerate() { - while let Some(j) = line.find(attr) { - let rest = &line[j + attr.len()..]; - // The base tag should always be the first link in the document so - // we can get away with using one pass. - let is_base = line[..j].ends_with("(source: &str, sink: Sink) -> Sink { + let tendril: ByteTendril = source.as_bytes().into(); + let mut input = BufferQueue::new(); + input.push_back(tendril.try_reinterpret().unwrap()); - let rest = &rest[pos_equals + 1..]; + let mut tok = Tokenizer::new(sink, TokenizerOpts::default()); + let _ = tok.feed(&mut input); + assert!(input.is_empty()); + tok.end(); + tok.sink +} - let pos_quote = match rest.find(&['"', '\''][..]) { - Some(i) => i, - None => continue, - }; - let quote_delim = rest.as_bytes()[pos_quote] as char; +#[derive(Default)] +struct AttrCollector { + attr_name: &'static [u8], + base: Option, + found_attrs: Vec<(u64, String)>, + /// Tracks whether or not it is inside a