Split internal and external link checker into separate programs (#916)

Closes https://github.com/Qiskit/documentation/issues/876.

This split brings several improvements:

1. Simpler internal link checker code.
2. The external link checker now deduplicates links across all files,
which avoids repeating requests.
3. The external link checker logs its progress every 20 links.
4. It's possible to check external links without checking internal
links.

We run the external link checker in our weekly cron job over all files,
other than translations and historical Qiskit versions. That change was
added in https://github.com/Qiskit/documentation/pull/913.
This commit is contained in:
Eric Arellano 2024-02-28 14:57:29 -05:00 committed by GitHub
parent a2edf63200
commit b91049d822
8 changed files with 127 additions and 77 deletions

View File

@ -19,7 +19,7 @@ on:
- "docs/api/qiskit-ibm-provider/**/*"
- "docs/api/qiskit-ibm-runtime/**/*"
- "public/api/**/*"
- "scripts/checkLinks.ts"
- "scripts/checkInternalLinks.ts"
- "scripts/lib/links/ignores.ts"
jobs:
@ -36,7 +36,7 @@ jobs:
run: npm ci
- name: Check internal links
run: >
npm run check:links --
npm run check:internal-links --
--qiskit-release-notes
--current-apis
--dev-apis

View File

@ -31,7 +31,7 @@ jobs:
- name: Spellcheck
run: npm run check:spelling
- name: Internal link checker
run: npm run check:links
run: npm run check:internal-links
- name: Formatting
run: npm run check:fmt
- name: Typecheck
@ -54,4 +54,4 @@ jobs:
if: steps.changed-docs-files.outputs.any_changed == 'true'
run: |
echo "${{ steps.changed-docs-files.outputs.all_changed_files }}" > changed.txt
npm run check-pages-render -- --from-file changed.txt
npm run check:pages-render -- --from-file changed.txt

View File

@ -34,7 +34,7 @@ jobs:
sleep 20
- name: Check all pages render
run: >
npm run check-pages-render --
npm run check:pages-render --
--non-api
--qiskit-release-notes
--current-apis
@ -54,10 +54,6 @@ jobs:
run: npm ci
- name: Check external links
run: >
npm run check:links --
--qiskit-release-notes
--current-apis
--dev-apis
--historical-apis
--skip-broken-historical
--external
npm run check:external-links --
'docs/**/*.{md,mdx,ipynb}'
'!docs/api/qiskit/[0-9]*/*'

View File

@ -178,27 +178,35 @@ about it.
## Check for broken links
CI will check for broken links. You can also check locally:
We have two broken link checkers: for internal links and for external links.
To check internal links:
```bash
# Only check for internal broken links
npm run check:links
# Only check non-API docs
npm run check:internal-links
# Enable the validation of external links
npm run check:links -- --external
# By default, only the non-API docs are checked. You can add any of the
# below arguments to also check API docs.
npm run check:links -- --current-apis --dev-apis --historical-apis --qiskit-release-notes
# You can add any of the below arguments to also check API docs.
npm run check:internal-links -- --current-apis --dev-apis --historical-apis --qiskit-release-notes
# However, `--historical-apis` currently has failing versions, so you may
# want to add `--skip-broken-historical`.
npm run check:links -- --historical-apis --skip-broken-historical
npm run check:internal-links -- --historical-apis --skip-broken-historical
# Or, run all the checks. Although this only checks non-API docs.
npm run check
```
To check external links:
```bash
# Specify the files you want after `--`
npm run check:external-links -- docs/run/index.md docs/run/circuit-execution.mdx
# You can also use globs
npm run check:external-links -- 'docs/run/*' '!docs/run/index.mdx'
```
## Check file metadata
Every file needs to have a `title` and `description`. The `lint` job in CI will fail with instructions for any bad file.
@ -260,9 +268,9 @@ It's possible to write broken pages that crash when loaded. This is usually due
To check that all the non-API docs render:
1. Start up the local preview with `./start` by following the instructions at [Preview the docs locally](#preview-the-docs-locally)
2. In a new tab, `npm run check-pages-render -- --non-api`
2. In a new tab, `npm run check:pages-render -- --non-api`
You can also check that API docs and translations render by using any of these arguments: `npm run check-pages-render -- --non-api --qiskit-release-notes --current-apis --dev-apis --historical-apis --translations`. Warning that this is exponentially slower.
You can also check that API docs and translations render by using any of these arguments: `npm run check:pages-render -- --non-api --qiskit-release-notes --current-apis --dev-apis --historical-apis --translations`. Warning that this is exponentially slower.
CI will check on every PR that any changed files render correctly. We also run a weekly cron job to check that every page renders correctly.

View File

@ -5,12 +5,13 @@
"author": "Qiskit Development Team",
"license": "Apache-2.0",
"scripts": {
"check": "npm run check:metadata && npm run check:spelling && npm run check:links && npm run check:fmt",
"check": "npm run check:metadata && npm run check:spelling && npm run check:internal-links && npm run check:fmt",
"check:metadata": "node -r esbuild-register scripts/commands/checkMetadata.ts",
"check:links": "node -r esbuild-register scripts/commands/checkLinks.ts",
"check:spelling": "cspell --relative --no-progress docs/**/*.md* docs/api/**/*.md* --config cspell/cSpell.json",
"check:fmt": "prettier --check .",
"check-pages-render": "node -r esbuild-register scripts/commands/checkPagesRender.ts",
"check:internal-links": "node -r esbuild-register scripts/commands/checkInternalLinks.ts",
"check:external-links": "node -r esbuild-register scripts/commands/checkExternalLinks.ts",
"check:pages-render": "node -r esbuild-register scripts/commands/checkPagesRender.ts",
"fmt": "prettier --write .",
"test": "jest",
"typecheck": "tsc",

View File

@ -0,0 +1,78 @@
// This code is a Qiskit project.
//
// (C) Copyright IBM 2024.
//
// This code is licensed under the Apache License, Version 2.0. You may
// obtain a copy of this license in the LICENSE file in the root directory
// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
//
// Any modifications or derivative works of this code must retain this
// copyright notice, and modified files need to carry a notice indicating
// that they have been altered from the originals.
import { globby } from "globby";
import yargs from "yargs/yargs";
import { hideBin } from "yargs/helpers";
import { ExternalLink } from "../lib/links/ExternalLink";
import { parseFile } from "../lib/links/extractLinks";
import { addLinksToMap } from "../lib/links/FileBatch";
interface Arguments {
[x: string]: unknown;
globs?: string[];
}
const readArgs = (): Arguments => {
return yargs(hideBin(process.argv))
.version(false)
.command("$0 [globs..]", "")
.parseSync();
};
async function main() {
const args = readArgs();
const filePaths = await globby(args.globs || []);
const links = await getLinks(filePaths);
// We use a for loop to reduce the risk of rate limiting.
let allGood = true;
let numLinksChecked = 1;
for (const link of links) {
const result = await link.check();
// This script can be slow, so log progress every 20 links.
if (numLinksChecked % 20 == 0) {
console.log(`Checked ${numLinksChecked} / ${links.length} links`);
}
numLinksChecked++;
if (result === undefined) continue;
console.error(result);
allGood = false;
}
if (!allGood) {
console.error(
`\n\n💔 Some external links appear broken, although it's possible they are flakes. Checked ${links.length} links.`,
);
process.exit(1);
}
console.log(
`\n\n✅ No external links are broken. Checked ${links.length} links.`,
);
}
async function getLinks(filePaths: string[]): Promise<ExternalLink[]> {
const linksToOriginFiles = new Map<string, string[]>();
for (const filePath of filePaths) {
const parsed = await parseFile(filePath);
addLinksToMap(filePath, parsed.externalLinks, linksToOriginFiles);
}
return Array.from(linksToOriginFiles.entries()).map(
([link, originFiles]) => new ExternalLink(link, originFiles),
);
}
main().then(() => process.exit());

View File

@ -29,7 +29,6 @@ const SYNTHETIC_FILES: string[] = [
interface Arguments {
[x: string]: unknown;
external: boolean;
currentApis: boolean;
devApis: boolean;
historicalApis: boolean;
@ -40,12 +39,6 @@ interface Arguments {
const readArgs = (): Arguments => {
return yargs(hideBin(process.argv))
.version(false)
.option("external", {
type: "boolean",
default: false,
description:
"Should external links be checked? This slows down the script, but is useful to check.",
})
.option("current-apis", {
type: "boolean",
default: false,
@ -91,7 +84,7 @@ async function main() {
let allGood = true;
for (const fileBatch of fileBatches) {
const allValidLinks = await fileBatch.check(args.external, otherFiles);
const allValidLinks = await fileBatch.checkInternalLinks(otherFiles);
if (!allValidLinks) {
allGood = false;
}

View File

@ -13,7 +13,6 @@
import { globby } from "globby";
import { InternalLink, File } from "./InternalLink";
import { ExternalLink } from "./ExternalLink";
import {
ALWAYS_IGNORED_URLS,
FILES_TO_IGNORES,
@ -59,75 +58,50 @@ export class FileBatch {
/**
* Load and process the file batch.
*
* Returns a triplet:
* Returns a pair:
* 1. A list of `File` objects with their anchors. These represent
* the universe of valid internal links for this batch, other
* than any additional we may add at check-time, e.g. images.
* 2. A list of InternalLink objects to validate.
* 3. A list of ExternalLink objects to validate.
*/
async load(
loadExternalLinks: boolean,
): Promise<[File[], InternalLink[], ExternalLink[]]> {
async load(): Promise<[File[], InternalLink[]]> {
const files: File[] = [];
for (let filePath of this.toLoad) {
const parsed = await parseFile(filePath);
files.push(new File(filePath, parsed.anchors));
}
const internalLinksToOriginFiles = new Map<string, string[]>();
const externalLinksToOriginFiles = new Map<string, string[]>();
const linksToOriginFiles = new Map<string, string[]>();
for (const filePath of this.toCheck) {
const parsed = await parseFile(filePath);
files.push(new File(filePath, parsed.anchors));
addLinksToMap(filePath, parsed.internalLinks, internalLinksToOriginFiles);
if (loadExternalLinks) {
addLinksToMap(
filePath,
parsed.externalLinks,
externalLinksToOriginFiles,
);
}
addLinksToMap(filePath, parsed.internalLinks, linksToOriginFiles);
}
const internalLinks = Array.from(internalLinksToOriginFiles.entries()).map(
const links = Array.from(linksToOriginFiles.entries()).map(
([link, originFiles]) => new InternalLink(link, originFiles),
);
const externalLinks = Array.from(externalLinksToOriginFiles.entries()).map(
([link, originFiles]) => new ExternalLink(link, originFiles),
);
return [files, internalLinks, externalLinks];
return [files, links];
}
/**
* Check that all links in this file batch are valid.
* Check that all internal links in this file batch are valid.
*
* Logs the results to the console and returns `true` if there were no issues.
*/
async check(
checkExternalLinks: boolean,
otherFiles: File[],
): Promise<boolean> {
console.log(`\n\nChecking links for ${this.description}`);
async checkInternalLinks(otherFiles: File[]): Promise<boolean> {
console.log(`\n\nChecking internal links for ${this.description}`);
const [docsFiles, internalLinkList, externalLinkList] =
await this.load(checkExternalLinks);
const [docsFiles, links] = await this.load();
const existingFiles = docsFiles.concat(otherFiles);
const results = internalLinkList.map((link) => link.check(existingFiles));
// For loop reduces the risk of rate-limiting.
for (let link of externalLinkList) {
results.push(await link.check());
}
let allGood = true;
results
.filter((res) => res !== undefined)
.forEach((linkError) => {
console.error(linkError);
allGood = false;
});
links.forEach((link) => {
const result = link.check(existingFiles);
if (result === undefined) return;
console.error(result);
allGood = false;
});
return allGood;
}
}