Nuxt/scripts/crawl.mjs

import Crawler from 'crawler'
import consola from 'consola'
import { parseURL, withoutTrailingSlash } from 'ufo'
import chalk from 'chalk'
import * as actions from '@actions/core'
import { isCI } from 'std-env'

// Every line this script prints goes through a logger tagged `crawler`.
const logger = consola.withTag('crawler')

// Site to check: `BASE_URL` when set (and non-empty), otherwise the default
// docs site. Stored without a trailing slash; the crawl starts at its root.
const baseURL = withoutTrailingSlash(
  process.env.BASE_URL || 'https://v3.nuxtjs.org'
)
const startingURL = `${baseURL}/`

// Links whose URL ends in one of these extensions are never queued.
const excludedExtensions = [
  'svg',
  'png',
  'jpg',
  'sketch',
  'ico',
  'gif'
]

// Links starting with one of these prefixes are skipped outright.
const urlsToOmit = [
  'http://localhost:3000'
]

// Pathnames whose failures are logged as "(ignored)" rather than counted.
// TODO: remove when migrating to Nuxt 3/Docus
const errorsToIgnore = [
  '/guide/directory-structure/nuxt.config',
  '/guide/directory-structure',
  '/guide/directory-structure/app.config',
  '/api/configuration/nuxt.config',
  '/guide/deploy',
  '/guide/features/app-config'
]

// GLOBALS
// `urls`: every URL already queued (seeded with the starting page).
// `erroredUrls`: every URL that failed and was not ignored.
const urls = new Set([startingURL])
const erroredUrls = new Set()

/**
 * Queue a link discovered on a crawled page.
 *
 * Nothing is queued when the link is empty or malformed (both are reported as
 * broken links), starts with one of `urlsToOmit`, was already queued, points
 * at an excluded asset extension, or lives on another origin than `baseURL`.
 *
 * @param {string} path Path to check
 * @param {string | undefined} referrer The referring page
 */
function queue (path, referrer) {
  if (!path) {
    const message = chalk.red(`${chalk.bold('✗')} ${referrer} linked to empty href`)
    if (isCI) { actions.error(message) }
    logger.log(message)
    return
  }

  if (urlsToOmit.some(url => path.startsWith(url))) { return }

  // `new URL` throws on hrefs it cannot parse (e.g. `http://[invalid`).
  // Report those as broken links instead of letting the exception abort the crawl.
  let parsed
  try {
    parsed = new URL(path, referrer)
  } catch {
    const message = chalk.red(`${chalk.bold('✗')} ${referrer} linked to invalid href \`${path}\``)
    if (isCI) { actions.error(message) }
    logger.log(message)
    return
  }
  const { pathname, origin } = parsed

  // Don't crawl the same page more than once (query string and hash are dropped)
  const url = `${origin}${pathname}`
  if (urls.has(url) || !crawler) { return }

  // Don't try to visit linked assets (e.g. SVGs)
  const extension = url.split('.').pop()
  if (extension && excludedExtensions.includes(extension)) { return }

  // Don't crawl external URLs
  if (origin !== baseURL) { return }

  urls.add(url)

  crawler.queue(url)
}

/**
 * Crawler whose callback runs once per fetched page: it records failures,
 * queues every link found on successful pages and, outside CI, throws once the
 * last page has been handled if any non-ignored failure was recorded.
 */
const crawler = new Crawler({
  maxConnections: 100,
  callback (error, res, done) {
    const { $ } = res
    const { uri } = res.options
    // NOTE(review): when the request itself fails, `res` may not carry a
    // `request`/`response`; read the status defensively so the failure is
    // reported rather than crashing on a property access.
    // @ts-ignore
    const response = res.request && res.request.response
    const statusCode = response ? response.statusCode : undefined
    // Without a status code, fall back to the error's code/message for the log line.
    const status = statusCode === undefined && error
      ? (error.code || error.message)
      : statusCode

    // A page fails on a request error, an unexpected status, or when no parsed
    // document (`$`) is available to extract links from.
    if (error || ![200, 301, 302].includes(statusCode) || !$) {
      if (errorsToIgnore.includes(parseURL(uri).pathname)) {
        const message = chalk.gray(`${chalk.bold('✗')} ${uri} (${status}) (ignored)`)
        logger.log(message)
      } else {
        const message = chalk.red(`${chalk.bold('✗')} ${uri} (${status})`)
        if (isCI) { actions.error(message) }
        logger.log(message)
        erroredUrls.add(uri)
      }
    } else {
      $('a:not([href*=mailto]):not([href*=tel])').each((_, el) => {
        if ('attribs' in el && 'href' in el.attribs) {
          queue(el.attribs.href, uri)
        }
      })

      logger.success(chalk.green(uri))
      logger.debug(uri, `[${crawler.queueSize} / ${urls.size}]`)
    }

    // Tasks to run at the end. This runs for failed pages too, so a failing
    // last page still produces the summary. A queue size of 1 is taken to mean
    // this page is the last one still in flight.
    if (!isCI && crawler.queueSize === 1) {
      logger.log('')
      logger.info(`Checked \`${urls.size}\` pages.`)
      if (erroredUrls.size) {
        const message = `${chalk.bold(erroredUrls.size)} errors found on ${chalk.bold(baseURL)}.`
        const failure = new Error(message)
        // Empty the stack so only the summary message is attached to the error.
        failure.stack = ''
        throw failure
      }
    }

    done()
  }
})

// Print a short banner describing the run, then seed the crawl with the
// starting page; every further page is queued from the crawler callback.
logger.log('')
for (const line of [
  `Checking \`${baseURL}\`.`,
  `Ignoring file extensions: \`${excludedExtensions.join(', ')}.\`\n`
]) {
  logger.info(line)
}

crawler.queue(startingURL)
ci: crawl docs site for new deployments to track broken links (#7473) 2022-09-13 16:06:15 +00:00			`import Crawler from 'crawler'`
			`import consola from 'consola'`
			`import { parseURL, withoutTrailingSlash } from 'ufo'`
			`import chalk from 'chalk'`
			`import * as actions from '@actions/core'`
			`import { isCI } from 'std-env'`

			`const logger = consola.withTag('crawler')`

			`const baseURL = withoutTrailingSlash(process.env.BASE_URL \|\| 'https://v3.nuxtjs.org')`
			`const startingURL = baseURL + '/'`

			`const excludedExtensions = ['svg', 'png', 'jpg', 'sketch', 'ico', 'gif']`
			`const urlsToOmit = ['http://localhost:3000']`

			`// TODO: remove when migrating to Nuxt 3/Docus`
			`const errorsToIgnore = [`
			`'/guide/directory-structure/nuxt.config',`
			`'/guide/directory-structure',`
			`'/guide/directory-structure/app.config',`
			`'/api/configuration/nuxt.config',`
			`'/guide/deploy',`
			`'/guide/features/app-config'`
			`]`

			`// GLOBALS`
			`const urls = new Set([startingURL])`
			`const erroredUrls = new Set()`

			`/**`
			`* @param {string} path Path to check`
			`* @param {string \| undefined} referrer The referring page`
			`*/`
			`function queue (path, referrer) {`
ci(crawl): ignore `<a>` links without hrefs (#7925) Co-authored-by: Alexander Lichter <github@lichter.io> 2022-10-03 13:38:06 +00:00			`if (!path) {`
			const message = chalk.red(`${chalk.bold('✗')} ${referrer} linked to empty href`)
			`if (isCI) { actions.error(message) }`
			`logger.log(message)`
			`return`
			`}`

ci: crawl docs site for new deployments to track broken links (#7473) 2022-09-13 16:06:15 +00:00			`if (urlsToOmit.some(url => path.startsWith(url))) { return }`

			`const { pathname, origin } = new URL(path, referrer)`

			`// Don't crawl the same page more than once`
			const url = `${origin}${pathname}`
			`if (!url \|\| urls.has(url) \|\| !crawler) { return }`

			`// Don't try to visit linked assets (e.g. SVGs)`
			`const extension = url.split('.').pop()`
			`if (extension && excludedExtensions.includes(extension)) { return }`

			`// Don't crawl external URLs`
			`if (origin !== baseURL) { return }`

			`urls.add(url)`

			`crawler.queue(url)`
			`}`

			`const crawler = new Crawler({`
			`maxConnections: 100,`
			`callback (error, res, done) {`
			`const { $ } = res`
			`const { uri } = res.options`
			`// @ts-ignore`
			`const { statusCode } = res.request.response`

			`if (error \|\| ![200, 301, 302].includes(statusCode) \|\| !$) {`
			`if (errorsToIgnore.includes(parseURL(uri).pathname)) {`
			const message = chalk.gray(`${chalk.bold('✗')} ${uri} (${statusCode}) (ignored)`)
			`logger.log(message)`
			`return done()`
			`}`
			const message = chalk.red(`${chalk.bold('✗')} ${uri} (${statusCode})`)
			`if (isCI) { actions.error(message) }`
			`logger.log(message)`
			`erroredUrls.add(uri)`
			`return done()`
			`}`

			`if (!$) {`
			const message = `Could not parse HTML for ${uri}`
			`logger.error(message)`
			`if (isCI) { actions.warning(message) }`
			`return done()`
			`}`

ci(crawl): ignore `<a>` links without hrefs (#7925) Co-authored-by: Alexander Lichter <github@lichter.io> 2022-10-03 13:38:06 +00:00			`$('a:not([href=mailto]):not([href=tel])').each((_, el) => {`
			`if ('attribs' in el && 'href' in el.attribs) {`
			`queue(el.attribs.href, uri)`
			`}`
			`})`
ci: crawl docs site for new deployments to track broken links (#7473) 2022-09-13 16:06:15 +00:00
			`logger.success(chalk.green(uri))`
			logger.debug(uri, `[${crawler.queueSize} / ${urls.size}]`)

			`if (!isCI && crawler.queueSize === 1) {`
			`logger.log('')`
			logger.info(`Checked \`${urls.size}\` pages.`)
			`// Tasks to run at the end.`
			`if (erroredUrls.size) {`
			const message = `${chalk.bold(erroredUrls.size)} errors found on ${chalk.bold(baseURL)}.`
			const error = new Error(`\n\n${message}\n`)
			`error.message = message`
			`error.stack = ''`
			`throw error`
			`}`
			`}`

			`done()`
			`}`
			`})`

			`logger.log('')`
			logger.info(`Checking \`${baseURL}\`.`)
			logger.info(`Ignoring file extensions: \`${excludedExtensions.join(', ')}.\`\n`)

			`crawler.queue(startingURL)`