Auto merge of #107141 - notriddle:notriddle/max-lev-distance-2023, r=GuillaumeGomez

rustdoc: compute maximum Levenshtein distance based on the query

Preview: https://notriddle.com/notriddle-rustdoc-demos/search-lev-distance-2023/std/index.html?search=regex

The heuristic is pretty close to the one used by the name resolver: `maxLevDistance = Math.floor(queryLen / 3)`.

Fixes #103357
Fixes #82131

This is similar to https://github.com/rust-lang/rust/pull/103710, but follows the suggestion in https://github.com/rust-lang/rust/pull/103710#issuecomment-1296360267 to use `floor` instead of `ceil`. It is unblocked now that https://github.com/rust-lang/rust/pull/105796 made it so that setting the max lev distance to `0` doesn't cause substring matches to be removed.
This commit is contained in:
bors 2023-02-06 02:09:00 +00:00
commit 7c3f0d6f30
6 changed files with 108 additions and 74 deletions

View file

@ -112,7 +112,6 @@ function levenshtein(s1, s2) {
} }
function initSearch(rawSearchIndex) { function initSearch(rawSearchIndex) {
const MAX_LEV_DISTANCE = 3;
const MAX_RESULTS = 200; const MAX_RESULTS = 200;
const NO_TYPE_FILTER = -1; const NO_TYPE_FILTER = -1;
/** /**
@ -897,13 +896,13 @@ function initSearch(rawSearchIndex) {
* @param {QueryElement} elem - The element from the parsed query. * @param {QueryElement} elem - The element from the parsed query.
* @param {integer} defaultLev - This is the value to return in case there are no generics. * @param {integer} defaultLev - This is the value to return in case there are no generics.
* *
* @return {integer} - Returns the best match (if any) or `MAX_LEV_DISTANCE + 1`. * @return {integer} - Returns the best match (if any) or `maxLevDistance + 1`.
*/ */
function checkGenerics(row, elem, defaultLev) { function checkGenerics(row, elem, defaultLev, maxLevDistance) {
if (row.generics.length === 0) { if (row.generics.length === 0) {
return elem.generics.length === 0 ? defaultLev : MAX_LEV_DISTANCE + 1; return elem.generics.length === 0 ? defaultLev : maxLevDistance + 1;
} else if (row.generics.length > 0 && row.generics[0].name === null) { } else if (row.generics.length > 0 && row.generics[0].name === null) {
return checkGenerics(row.generics[0], elem, defaultLev); return checkGenerics(row.generics[0], elem, defaultLev, maxLevDistance);
} }
// The names match, but we need to be sure that all generics kinda // The names match, but we need to be sure that all generics kinda
// match as well. // match as well.
@ -914,8 +913,8 @@ function initSearch(rawSearchIndex) {
elem_name = entry.name; elem_name = entry.name;
if (elem_name === "") { if (elem_name === "") {
// Pure generic, needs to check into it. // Pure generic, needs to check into it.
if (checkGenerics(entry, elem, MAX_LEV_DISTANCE + 1) !== 0) { if (checkGenerics(entry, elem, maxLevDistance + 1, maxLevDistance) !== 0) {
return MAX_LEV_DISTANCE + 1; return maxLevDistance + 1;
} }
continue; continue;
} }
@ -942,7 +941,7 @@ function initSearch(rawSearchIndex) {
} }
} }
if (match === null) { if (match === null) {
return MAX_LEV_DISTANCE + 1; return maxLevDistance + 1;
} }
elems[match] -= 1; elems[match] -= 1;
if (elems[match] === 0) { if (elems[match] === 0) {
@ -951,7 +950,7 @@ function initSearch(rawSearchIndex) {
} }
return 0; return 0;
} }
return MAX_LEV_DISTANCE + 1; return maxLevDistance + 1;
} }
/** /**
@ -963,10 +962,10 @@ function initSearch(rawSearchIndex) {
* *
* @return {integer} - Returns a Levenshtein distance to the best match. * @return {integer} - Returns a Levenshtein distance to the best match.
*/ */
function checkIfInGenerics(row, elem) { function checkIfInGenerics(row, elem, maxLevDistance) {
let lev = MAX_LEV_DISTANCE + 1; let lev = maxLevDistance + 1;
for (const entry of row.generics) { for (const entry of row.generics) {
lev = Math.min(checkType(entry, elem, true), lev); lev = Math.min(checkType(entry, elem, true, maxLevDistance), lev);
if (lev === 0) { if (lev === 0) {
break; break;
} }
@ -983,15 +982,15 @@ function initSearch(rawSearchIndex) {
* @param {boolean} literalSearch * @param {boolean} literalSearch
* *
* @return {integer} - Returns a Levenshtein distance to the best match. If there is * @return {integer} - Returns a Levenshtein distance to the best match. If there is
* no match, returns `MAX_LEV_DISTANCE + 1`. * no match, returns `maxLevDistance + 1`.
*/ */
function checkType(row, elem, literalSearch) { function checkType(row, elem, literalSearch, maxLevDistance) {
if (row.name === null) { if (row.name === null) {
// This is a pure "generic" search, no need to run other checks. // This is a pure "generic" search, no need to run other checks.
if (row.generics.length > 0) { if (row.generics.length > 0) {
return checkIfInGenerics(row, elem); return checkIfInGenerics(row, elem, maxLevDistance);
} }
return MAX_LEV_DISTANCE + 1; return maxLevDistance + 1;
} }
let lev = levenshtein(row.name, elem.name); let lev = levenshtein(row.name, elem.name);
@ -1005,9 +1004,9 @@ function initSearch(rawSearchIndex) {
return 0; return 0;
} }
} }
return MAX_LEV_DISTANCE + 1; return maxLevDistance + 1;
} else if (elem.generics.length > 0) { } else if (elem.generics.length > 0) {
return checkGenerics(row, elem, MAX_LEV_DISTANCE + 1); return checkGenerics(row, elem, maxLevDistance + 1, maxLevDistance);
} }
return 0; return 0;
} else if (row.generics.length > 0) { } else if (row.generics.length > 0) {
@ -1017,22 +1016,20 @@ function initSearch(rawSearchIndex) {
} }
// The name didn't match so we now check if the type we're looking for is inside // The name didn't match so we now check if the type we're looking for is inside
// the generics! // the generics!
lev = checkIfInGenerics(row, elem); lev = Math.min(lev, checkIfInGenerics(row, elem, maxLevDistance));
// Now whatever happens, the returned distance is "less good" so we should mark return lev;
// it as such, and so we add 0.5 to the distance to make it "less good". } else if (lev > maxLevDistance) {
return lev + 0.5;
} else if (lev > MAX_LEV_DISTANCE) {
// So our item's name doesn't match at all and has generics. // So our item's name doesn't match at all and has generics.
// //
// Maybe it's present in a sub generic? For example "f<A<B<C>>>()", if we're // Maybe it's present in a sub generic? For example "f<A<B<C>>>()", if we're
// looking for "B<C>", we'll need to go down. // looking for "B<C>", we'll need to go down.
return checkIfInGenerics(row, elem); return checkIfInGenerics(row, elem, maxLevDistance);
} else { } else {
// At this point, the name kinda match and we have generics to check, so // At this point, the name kinda match and we have generics to check, so
// let's go! // let's go!
const tmp_lev = checkGenerics(row, elem, lev); const tmp_lev = checkGenerics(row, elem, lev, maxLevDistance);
if (tmp_lev > MAX_LEV_DISTANCE) { if (tmp_lev > maxLevDistance) {
return MAX_LEV_DISTANCE + 1; return maxLevDistance + 1;
} }
// We compute the median value of both checks and return it. // We compute the median value of both checks and return it.
return (tmp_lev + lev) / 2; return (tmp_lev + lev) / 2;
@ -1040,7 +1037,7 @@ function initSearch(rawSearchIndex) {
} else if (elem.generics.length > 0) { } else if (elem.generics.length > 0) {
// In this case, we were expecting generics but there isn't so we simply reject this // In this case, we were expecting generics but there isn't so we simply reject this
// one. // one.
return MAX_LEV_DISTANCE + 1; return maxLevDistance + 1;
} }
// No generics on our query or on the target type so we can return without doing // No generics on our query or on the target type so we can return without doing
// anything else. // anything else.
@ -1055,23 +1052,26 @@ function initSearch(rawSearchIndex) {
* @param {integer} typeFilter * @param {integer} typeFilter
* *
* @return {integer} - Returns a Levenshtein distance to the best match. If there is no * @return {integer} - Returns a Levenshtein distance to the best match. If there is no
* match, returns `MAX_LEV_DISTANCE + 1`. * match, returns `maxLevDistance + 1`.
*/ */
function findArg(row, elem, typeFilter) { function findArg(row, elem, typeFilter, maxLevDistance) {
let lev = MAX_LEV_DISTANCE + 1; let lev = maxLevDistance + 1;
if (row && row.type && row.type.inputs && row.type.inputs.length > 0) { if (row && row.type && row.type.inputs && row.type.inputs.length > 0) {
for (const input of row.type.inputs) { for (const input of row.type.inputs) {
if (!typePassesFilter(typeFilter, input.ty)) { if (!typePassesFilter(typeFilter, input.ty)) {
continue; continue;
} }
lev = Math.min(lev, checkType(input, elem, parsedQuery.literalSearch)); lev = Math.min(
lev,
checkType(input, elem, parsedQuery.literalSearch, maxLevDistance)
);
if (lev === 0) { if (lev === 0) {
return 0; return 0;
} }
} }
} }
return parsedQuery.literalSearch ? MAX_LEV_DISTANCE + 1 : lev; return parsedQuery.literalSearch ? maxLevDistance + 1 : lev;
} }
/** /**
@ -1082,10 +1082,10 @@ function initSearch(rawSearchIndex) {
* @param {integer} typeFilter * @param {integer} typeFilter
* *
* @return {integer} - Returns a Levenshtein distance to the best match. If there is no * @return {integer} - Returns a Levenshtein distance to the best match. If there is no
* match, returns `MAX_LEV_DISTANCE + 1`. * match, returns `maxLevDistance + 1`.
*/ */
function checkReturned(row, elem, typeFilter) { function checkReturned(row, elem, typeFilter, maxLevDistance) {
let lev = MAX_LEV_DISTANCE + 1; let lev = maxLevDistance + 1;
if (row && row.type && row.type.output.length > 0) { if (row && row.type && row.type.output.length > 0) {
const ret = row.type.output; const ret = row.type.output;
@ -1093,20 +1093,23 @@ function initSearch(rawSearchIndex) {
if (!typePassesFilter(typeFilter, ret_ty.ty)) { if (!typePassesFilter(typeFilter, ret_ty.ty)) {
continue; continue;
} }
lev = Math.min(lev, checkType(ret_ty, elem, parsedQuery.literalSearch)); lev = Math.min(
lev,
checkType(ret_ty, elem, parsedQuery.literalSearch, maxLevDistance)
);
if (lev === 0) { if (lev === 0) {
return 0; return 0;
} }
} }
} }
return parsedQuery.literalSearch ? MAX_LEV_DISTANCE + 1 : lev; return parsedQuery.literalSearch ? maxLevDistance + 1 : lev;
} }
function checkPath(contains, ty) { function checkPath(contains, ty, maxLevDistance) {
if (contains.length === 0) { if (contains.length === 0) {
return 0; return 0;
} }
let ret_lev = MAX_LEV_DISTANCE + 1; let ret_lev = maxLevDistance + 1;
const path = ty.path.split("::"); const path = ty.path.split("::");
if (ty.parent && ty.parent.name) { if (ty.parent && ty.parent.name) {
@ -1116,7 +1119,7 @@ function initSearch(rawSearchIndex) {
const length = path.length; const length = path.length;
const clength = contains.length; const clength = contains.length;
if (clength > length) { if (clength > length) {
return MAX_LEV_DISTANCE + 1; return maxLevDistance + 1;
} }
for (let i = 0; i < length; ++i) { for (let i = 0; i < length; ++i) {
if (i + clength > length) { if (i + clength > length) {
@ -1126,7 +1129,7 @@ function initSearch(rawSearchIndex) {
let aborted = false; let aborted = false;
for (let x = 0; x < clength; ++x) { for (let x = 0; x < clength; ++x) {
const lev = levenshtein(path[i + x], contains[x]); const lev = levenshtein(path[i + x], contains[x]);
if (lev > MAX_LEV_DISTANCE) { if (lev > maxLevDistance) {
aborted = true; aborted = true;
break; break;
} }
@ -1231,7 +1234,7 @@ function initSearch(rawSearchIndex) {
* following condition: * following condition:
* *
* * If it is a "literal search" (`parsedQuery.literalSearch`), then `lev` must be 0. * * If it is a "literal search" (`parsedQuery.literalSearch`), then `lev` must be 0.
* * If it is not a "literal search", `lev` must be <= `MAX_LEV_DISTANCE`. * * If it is not a "literal search", `lev` must be <= `maxLevDistance`.
* *
* The `results` map contains information which will be used to sort the search results: * The `results` map contains information which will be used to sort the search results:
* *
@ -1249,8 +1252,8 @@ function initSearch(rawSearchIndex) {
* @param {integer} lev * @param {integer} lev
* @param {integer} path_lev * @param {integer} path_lev
*/ */
function addIntoResults(results, fullId, id, index, lev, path_lev) { function addIntoResults(results, fullId, id, index, lev, path_lev, maxLevDistance) {
const inBounds = lev <= MAX_LEV_DISTANCE || index !== -1; const inBounds = lev <= maxLevDistance || index !== -1;
if (lev === 0 || (!parsedQuery.literalSearch && inBounds)) { if (lev === 0 || (!parsedQuery.literalSearch && inBounds)) {
if (results[fullId] !== undefined) { if (results[fullId] !== undefined) {
const result = results[fullId]; const result = results[fullId];
@ -1289,7 +1292,8 @@ function initSearch(rawSearchIndex) {
elem, elem,
results_others, results_others,
results_in_args, results_in_args,
results_returned results_returned,
maxLevDistance
) { ) {
if (!row || (filterCrates !== null && row.crate !== filterCrates)) { if (!row || (filterCrates !== null && row.crate !== filterCrates)) {
return; return;
@ -1298,13 +1302,13 @@ function initSearch(rawSearchIndex) {
const fullId = row.id; const fullId = row.id;
const searchWord = searchWords[pos]; const searchWord = searchWords[pos];
const in_args = findArg(row, elem, parsedQuery.typeFilter); const in_args = findArg(row, elem, parsedQuery.typeFilter, maxLevDistance);
const returned = checkReturned(row, elem, parsedQuery.typeFilter); const returned = checkReturned(row, elem, parsedQuery.typeFilter, maxLevDistance);
// path_lev is 0 because no parent path information is currently stored // path_lev is 0 because no parent path information is currently stored
// in the search index // in the search index
addIntoResults(results_in_args, fullId, pos, -1, in_args, 0); addIntoResults(results_in_args, fullId, pos, -1, in_args, 0, maxLevDistance);
addIntoResults(results_returned, fullId, pos, -1, returned, 0); addIntoResults(results_returned, fullId, pos, -1, returned, 0, maxLevDistance);
if (!typePassesFilter(parsedQuery.typeFilter, row.ty)) { if (!typePassesFilter(parsedQuery.typeFilter, row.ty)) {
return; return;
@ -1328,16 +1332,16 @@ function initSearch(rawSearchIndex) {
// No need to check anything else if it's a "pure" generics search. // No need to check anything else if it's a "pure" generics search.
if (elem.name.length === 0) { if (elem.name.length === 0) {
if (row.type !== null) { if (row.type !== null) {
lev = checkGenerics(row.type, elem, MAX_LEV_DISTANCE + 1); lev = checkGenerics(row.type, elem, maxLevDistance + 1, maxLevDistance);
// path_lev is 0 because we know it's empty // path_lev is 0 because we know it's empty
addIntoResults(results_others, fullId, pos, index, lev, 0); addIntoResults(results_others, fullId, pos, index, lev, 0, maxLevDistance);
} }
return; return;
} }
if (elem.fullPath.length > 1) { if (elem.fullPath.length > 1) {
path_lev = checkPath(elem.pathWithoutLast, row); path_lev = checkPath(elem.pathWithoutLast, row, maxLevDistance);
if (path_lev > MAX_LEV_DISTANCE) { if (path_lev > maxLevDistance) {
return; return;
} }
} }
@ -1351,11 +1355,11 @@ function initSearch(rawSearchIndex) {
lev = levenshtein(searchWord, elem.pathLast); lev = levenshtein(searchWord, elem.pathLast);
if (index === -1 && lev + path_lev > MAX_LEV_DISTANCE) { if (index === -1 && lev + path_lev > maxLevDistance) {
return; return;
} }
addIntoResults(results_others, fullId, pos, index, lev, path_lev); addIntoResults(results_others, fullId, pos, index, lev, path_lev, maxLevDistance);
} }
/** /**
@ -1367,7 +1371,7 @@ function initSearch(rawSearchIndex) {
* @param {integer} pos - Position in the `searchIndex`. * @param {integer} pos - Position in the `searchIndex`.
* @param {Object} results * @param {Object} results
*/ */
function handleArgs(row, pos, results) { function handleArgs(row, pos, results, maxLevDistance) {
if (!row || (filterCrates !== null && row.crate !== filterCrates)) { if (!row || (filterCrates !== null && row.crate !== filterCrates)) {
return; return;
} }
@ -1379,7 +1383,7 @@ function initSearch(rawSearchIndex) {
function checkArgs(elems, callback) { function checkArgs(elems, callback) {
for (const elem of elems) { for (const elem of elems) {
// There is more than one parameter to the query so all checks should be "exact" // There is more than one parameter to the query so all checks should be "exact"
const lev = callback(row, elem, NO_TYPE_FILTER); const lev = callback(row, elem, NO_TYPE_FILTER, maxLevDistance);
if (lev <= 1) { if (lev <= 1) {
nbLev += 1; nbLev += 1;
totalLev += lev; totalLev += lev;
@ -1400,12 +1404,21 @@ function initSearch(rawSearchIndex) {
return; return;
} }
const lev = Math.round(totalLev / nbLev); const lev = Math.round(totalLev / nbLev);
addIntoResults(results, row.id, pos, 0, lev, 0); addIntoResults(results, row.id, pos, 0, lev, 0, maxLevDistance);
} }
function innerRunQuery() { function innerRunQuery() {
let elem, i, nSearchWords, in_returned, row; let elem, i, nSearchWords, in_returned, row;
let queryLen = 0;
for (const elem of parsedQuery.elems) {
queryLen += elem.name.length;
}
for (const elem of parsedQuery.returned) {
queryLen += elem.name.length;
}
const maxLevDistance = Math.floor(queryLen / 3);
if (parsedQuery.foundElems === 1) { if (parsedQuery.foundElems === 1) {
if (parsedQuery.elems.length === 1) { if (parsedQuery.elems.length === 1) {
elem = parsedQuery.elems[0]; elem = parsedQuery.elems[0];
@ -1418,7 +1431,8 @@ function initSearch(rawSearchIndex) {
elem, elem,
results_others, results_others,
results_in_args, results_in_args,
results_returned results_returned,
maxLevDistance
); );
} }
} else if (parsedQuery.returned.length === 1) { } else if (parsedQuery.returned.length === 1) {
@ -1426,13 +1440,18 @@ function initSearch(rawSearchIndex) {
elem = parsedQuery.returned[0]; elem = parsedQuery.returned[0];
for (i = 0, nSearchWords = searchWords.length; i < nSearchWords; ++i) { for (i = 0, nSearchWords = searchWords.length; i < nSearchWords; ++i) {
row = searchIndex[i]; row = searchIndex[i];
in_returned = checkReturned(row, elem, parsedQuery.typeFilter); in_returned = checkReturned(
addIntoResults(results_others, row.id, i, -1, in_returned); row,
elem,
parsedQuery.typeFilter,
maxLevDistance
);
addIntoResults(results_others, row.id, i, -1, in_returned, maxLevDistance);
} }
} }
} else if (parsedQuery.foundElems > 0) { } else if (parsedQuery.foundElems > 0) {
for (i = 0, nSearchWords = searchWords.length; i < nSearchWords; ++i) { for (i = 0, nSearchWords = searchWords.length; i < nSearchWords; ++i) {
handleArgs(searchIndex[i], i, results_others); handleArgs(searchIndex[i], i, results_others, maxLevDistance);
} }
} }
} }
@ -1470,7 +1489,7 @@ function initSearch(rawSearchIndex) {
* *
* @return {boolean} - Whether the result is valid or not * @return {boolean} - Whether the result is valid or not
*/ */
function validateResult(name, path, keys, parent) { function validateResult(name, path, keys, parent, maxLevDistance) {
if (!keys || !keys.length) { if (!keys || !keys.length) {
return true; return true;
} }
@ -1485,7 +1504,7 @@ function initSearch(rawSearchIndex) {
(parent !== undefined && parent.name !== undefined && (parent !== undefined && parent.name !== undefined &&
parent.name.toLowerCase().indexOf(key) > -1) || parent.name.toLowerCase().indexOf(key) > -1) ||
// lastly check to see if the name was a levenshtein match // lastly check to see if the name was a levenshtein match
levenshtein(name, key) <= MAX_LEV_DISTANCE)) { levenshtein(name, key) <= maxLevDistance)) {
return false; return false;
} }
} }

View file

@ -340,7 +340,7 @@ pub mod details {
pub mod doc_block_table { pub mod doc_block_table {
pub trait DocBlockTableTrait { pub trait DocBlockTableTrait {
fn func(); fn foo();
} }
/// Struct doc. /// Struct doc.
@ -359,7 +359,7 @@ pub mod doc_block_table {
/// | header1 | header2 | /// | header1 | header2 |
/// |--------------------------|--------------------------| /// |--------------------------|--------------------------|
/// | Lorem Ipsum, Lorem Ipsum | Lorem Ipsum, Lorem Ipsum | /// | Lorem Ipsum, Lorem Ipsum | Lorem Ipsum, Lorem Ipsum |
fn func() { fn foo() {
println!(); println!();
} }
} }
@ -474,3 +474,15 @@ impl TypeWithImplDoc {
/// ///
/// </sub> /// </sub>
pub mod codeblock_sub {} pub mod codeblock_sub {}
// Fixture module declaring a struct with a public `foo` field and an exported
// `foo!` macro.
// NOTE(review): presumably exists so that items of different kinds share the
// name `foo` for search-result tests — confirm against the tests that use it.
pub mod search_results {
// Struct whose only field is named `foo`.
pub struct SearchResults {
pub foo: i32,
}
// Macro named `foo`; its single arm accepts no tokens and expands to nothing.
// `#[macro_export]` makes it available at the crate root, not under this module.
#[macro_export]
macro_rules! foo {
() => {};
}
}

View file

@ -0,0 +1,10 @@
// exact-check
// https://github.com/rust-lang/rust/issues/103357
// Regression test for the issue linked above: the query `regex` is expected to
// produce no entries in any of the three result lists declared in EXPECTED.
// NOTE(review): `exact-check` on the first line looks like a harness directive
// requiring results to match EXPECTED exactly — confirm against the test runner.
const QUERY = 'regex';
const EXPECTED = {
'others': [],
'in_args': [],
'returned': [],
};

View file

@ -9,9 +9,5 @@ const EXPECTED = {
{ 'path': 'std', 'name': 'println' }, { 'path': 'std', 'name': 'println' },
{ 'path': 'std', 'name': 'eprint' }, { 'path': 'std', 'name': 'eprint' },
{ 'path': 'std', 'name': 'eprintln' }, { 'path': 'std', 'name': 'eprintln' },
{ 'path': 'std::pin', 'name': 'pin' },
{ 'path': 'std::future', 'name': 'join' },
{ 'path': 'std', 'name': 'line' },
{ 'path': 'std', 'name': 'write' },
], ],
}; };

View file

@ -1,5 +1,3 @@
// exact-check
const QUERY = [ const QUERY = [
'StructItem', 'StructItem',
'StructFieldItem', 'StructFieldItem',

View file

@ -4,6 +4,5 @@ const EXPECTED = {
'others': [ 'others': [
{ 'path': 'module_substring::Sig', 'name': 'pc' }, { 'path': 'module_substring::Sig', 'name': 'pc' },
{ 'path': 'module_substring::Si', 'name': 'pc' }, { 'path': 'module_substring::Si', 'name': 'pc' },
{ 'path': 'module_substring::Si', 'name': 'pa' },
], ],
}; };