ci: crawl docs site for new deployments to track broken links (#7473)

Daniel Roe 2022-09-13 17:06:15 +01:00 committed by GitHub
parent 4e604f664d
commit 7685c5948f
4 changed files with 773 additions and 15 deletions

.github/workflows/docs-e2e.yml (new file, 32 additions)

@@ -0,0 +1,32 @@
name: docs-e2e

on:
  workflow_dispatch:
    inputs:
      url:
        required: false
        description: The URL to run the test suite against.
        type: string
  deployment_status:

jobs:
  crawl-docs:
    environment:
      name: ${{ github.event.deployment.environment || 'Production' }}
      url: ${{ github.event.inputs.url || github.event.deployment.payload.web_url || github.event.deployment_status.target_url }}
    if: github.event.deployment_status.state == 'success' || github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-node@v3
        with:
          node-version: ${{ matrix.node }}
          cache: "yarn"
      - name: Install dependencies
        run: yarn --immutable
      - run: node ./scripts/crawl.mjs
        env:
          BASE_URL: ${{ github.event.inputs.url || github.event.deployment.payload.web_url || github.event.deployment_status.target_url }}
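
The job runs either when a deployment reports success (the deployment_status event) or when the workflow is dispatched by hand, and the URL to crawl is resolved from the optional url input, the deployment payload's web_url, or the deployment status target_url, in that order. As a minimal sketch (assuming the GitHub CLI is installed and this workflow file exists on the default branch; the preview URL is a placeholder), a manual run against an arbitrary deployment might look like:

    # hypothetical example: dispatch the docs crawl against a preview deployment
    gh workflow run docs-e2e.yml -f url=https://deploy-preview.example.com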

package.json

@@ -47,12 +47,15 @@
"unbuild": "^0.8.11"
},
"devDependencies": {
"@actions/core": "^1.9.1",
"@nuxtjs/eslint-config-typescript": "^11.0.0",
"@types/crawler": "^1.2.2",
"@types/node": "^16.11.58",
"@types/rimraf": "^3",
"@unocss/reset": "^0.45.21",
"case-police": "^0.5.10",
"changelogen": "^0.3.0",
"crawler": "^1.3.0",
"eslint": "^8.23.1",
"eslint-plugin-jsdoc": "^39.3.6",
"execa": "^6.1.0",

scripts/crawl.mjs (new executable file, 109 additions)

@@ -0,0 +1,109 @@
import Crawler from 'crawler'
import consola from 'consola'
import { parseURL, withoutTrailingSlash } from 'ufo'
import chalk from 'chalk'
import * as actions from '@actions/core'
import { isCI } from 'std-env'

const logger = consola.withTag('crawler')

const baseURL = withoutTrailingSlash(process.env.BASE_URL || 'https://v3.nuxtjs.org')
const startingURL = baseURL + '/'
const excludedExtensions = ['svg', 'png', 'jpg', 'sketch', 'ico', 'gif']
const urlsToOmit = ['http://localhost:3000']

// TODO: remove when migrating to Nuxt 3/Docus
const errorsToIgnore = [
  '/guide/directory-structure/nuxt.config',
  '/guide/directory-structure',
  '/guide/directory-structure/app.config',
  '/api/configuration/nuxt.config',
  '/guide/deploy',
  '/guide/features/app-config'
]

// GLOBALS
const urls = new Set([startingURL])
const erroredUrls = new Set()

/**
 * @param {string} path Path to check
 * @param {string | undefined} referrer The referring page
 */
function queue (path, referrer) {
  if (urlsToOmit.some(url => path.startsWith(url))) { return }

  const { pathname, origin } = new URL(path, referrer)

  // Don't crawl the same page more than once
  const url = `${origin}${pathname}`
  if (!url || urls.has(url) || !crawler) { return }

  // Don't try to visit linked assets (e.g. SVGs)
  const extension = url.split('.').pop()
  if (extension && excludedExtensions.includes(extension)) { return }

  // Don't crawl external URLs
  if (origin !== baseURL) { return }

  urls.add(url)
  crawler.queue(url)
}

const crawler = new Crawler({
  maxConnections: 100,
  callback (error, res, done) {
    const { $ } = res
    const { uri } = res.options
    // @ts-ignore
    const { statusCode } = res.request.response

    if (error || ![200, 301, 302].includes(statusCode) || !$) {
      if (errorsToIgnore.includes(parseURL(uri).pathname)) {
        const message = chalk.gray(`${chalk.bold('✗')} ${uri} (${statusCode}) (ignored)`)
        logger.log(message)
        return done()
      }
      const message = chalk.red(`${chalk.bold('✗')} ${uri} (${statusCode})`)
      if (isCI) { actions.error(message) }
      logger.log(message)
      erroredUrls.add(uri)
      return done()
    }

    if (!$) {
      const message = `Could not parse HTML for ${uri}`
      logger.error(message)
      if (isCI) { actions.warning(message) }
      return done()
    }

    $('a:not([href*=mailto])').each((_, el) => 'attribs' in el && queue(el.attribs.href, uri))

    logger.success(chalk.green(uri))
    logger.debug(uri, `[${crawler.queueSize} / ${urls.size}]`)

    if (!isCI && crawler.queueSize === 1) {
      logger.log('')
      logger.info(`Checked \`${urls.size}\` pages.`)

      // Tasks to run at the end.
      if (erroredUrls.size) {
        const message = `${chalk.bold(erroredUrls.size)} errors found on ${chalk.bold(baseURL)}.`
        const error = new Error(`\n\n${message}\n`)
        error.message = message
        error.stack = ''
        throw error
      }
    }

    done()
  }
})

logger.log('')
logger.info(`Checking \`${baseURL}\`.`)
logger.info(`Ignoring file extensions: \`${excludedExtensions.join(', ')}.\`\n`)

crawler.queue(startingURL)
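
The script reads only BASE_URL from the environment (falling back to https://v3.nuxtjs.org), queues every same-origin link it finds, and, when run outside CI, throws at the end if any broken links were collected. A minimal sketch of running it locally against an arbitrary deployment, assuming dependencies are already installed with yarn and using a placeholder URL:

    # hypothetical example: crawl a specific deployment from a local checkout
    BASE_URL=https://docs.example.com node ./scripts/crawl.mjs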

yarn.lock (644 changed lines)

File diff suppressed because it is too large.