1
0
mirror of synced 2025-12-19 18:10:59 -05:00

New broken link report (#16412)

* add linkinator npm package

* add new script that uses Linkinator

* reorg the excluded links file and update comments

* replace blc artifacts with linkinator artifacts in .gitignore

* update the scheduled workflow to use the new script

* dismantle BLC scripts

* add workflow_dispatch event so we can test this manually

* npm uninstall broken-link-checker

* use different exit codes depending on whether broken links are found
This commit is contained in:
Sarah Schneider
2020-11-10 15:28:44 -05:00
committed by GitHub
parent fa649bf494
commit ce33df1cd3
9 changed files with 471 additions and 1007 deletions

View File

@@ -1,6 +1,7 @@
name: Check all English links name: Check all English links
on: on:
workflow_dispatch:
schedule: schedule:
- cron: "40 19 * * *" # once a day at 19:40 UTC / 11:40 PST - cron: "40 19 * * *" # once a day at 19:40 UTC / 11:40 PST
@@ -10,21 +11,16 @@ jobs:
if: github.repository == 'github/docs-internal' if: github.repository == 'github/docs-internal'
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@5a4ac9002d0be2fb38bd78e4b4dbde5606d7042f
- name: npm ci
run: npm ci
- name: npm run build
run: npm run build
- name: Run script - name: Run script
run: script/check-external-links en > broken_links.md run: script/check-english-links > broken_links.md
- name: Check if any broken links - name: Check if any broken links
id: check id: check
run: | run: |
if [ "$(grep 'All links are good' broken_links.md)" ]; then if [ "$(grep '0 broken links found' broken_links.md)" ]; then
echo ::set-output name=continue::no echo ::set-output name=continue::no
else else
echo "::set-output name=continue::yes" echo "::set-output name=continue::yes"
echo "::set-output name=title::$(grep 'found on help.github.com' broken_links.md)" echo "::set-output name=title::$(head -1 broken_links.md)"
fi fi
- if: ${{ steps.check.outputs.continue == 'yes' }} - if: ${{ steps.check.outputs.continue == 'yes' }}
name: Create issue from file name: Create issue from file

8
.gitignore vendored
View File

@@ -4,8 +4,6 @@
node_modules node_modules
npm-debug.log npm-debug.log
coverage coverage
.linkinator
# blc: broken link checker broken_links.md
blc_output.log dist
blc_output_internal.log
dist

View File

@@ -1,28 +1,20 @@
// Linkinator treats the following as regex.
module.exports = [ module.exports = [
// GitHub search links fail with "429 Too Many Requests" // Skip GitHub search links.
'https://github.com/search?*', 'https://github.com/search?.*',
'https://github.com/github/gitignore/search?', 'https://github.com/github/gitignore/search?',
// LinkedIn links fail due to bug: https://github.com/stevenvachon/broken-link-checker/issues/91 // These links require auth.
'https://www.linkedin.com/*', 'https://github.com/settings/profile',
'https://github.com/github/docs/edit',
// blc returns "BLC_UNKNOWN" on this link, even though cURL returns "302 Found"
'https://www.ilo.org/dyn/normlex/en/f?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
// the codercat link works but blc reports a false 404
'https://github.com/Codertocat/hello-world-npm/packages/10696?version=1.0.1',
// this URL started returning 403 to blc and cURL even though it works in a browser; see docs-internal #10124
'https://haveibeenpwned.com/',
'https://haveibeenpwned.com/*',
// this is a private repo customers are given access to when they purchase Insights; see docs-internal #12037
'https://github.com/github/insights-releases/releases/latest', 'https://github.com/github/insights-releases/releases/latest',
// developer content uses these for examples; they should not be checked // Developer content uses these for examples; they should not be checked.
'http://localhost:1234/*', 'http://localhost:1234',
'localhost:3000', 'localhost:3000',
// this URL works but blc reports a false 404 // Oneoff links that link checkers think are broken but are not.
'https://haveibeenpwned.com/',
'https://www.ilo.org/dyn/normlex/en/f?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
'http://www.w3.org/wiki/LinkHeader/' 'http://www.w3.org/wiki/LinkHeader/'
] ]

1100
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -50,6 +50,7 @@
"js-cookie": "^2.2.1", "js-cookie": "^2.2.1",
"js-yaml": "^3.14.0", "js-yaml": "^3.14.0",
"lil-env-thing": "^1.0.0", "lil-env-thing": "^1.0.0",
"linkinator": "^2.2.2",
"liquid": "^5.1.0", "liquid": "^5.1.0",
"lodash": "^4.17.19", "lodash": "^4.17.19",
"mini-css-extract-plugin": "^0.9.0", "mini-css-extract-plugin": "^0.9.0",
@@ -79,7 +80,6 @@
"await-sleep": "0.0.1", "await-sleep": "0.0.1",
"aws-sdk": "^2.610.0", "aws-sdk": "^2.610.0",
"babel-eslint": "^10.1.0", "babel-eslint": "^10.1.0",
"broken-link-checker": "^0.7.8",
"chalk": "^4.0.0", "chalk": "^4.0.0",
"commander": "^6.2.0", "commander": "^6.2.0",
"count-array-values": "^1.2.1", "count-array-values": "^1.2.1",

95
script/check-english-links.js Executable file
View File

@@ -0,0 +1,95 @@
#!/usr/bin/env node
const path = require('path')
const fs = require('fs')
const linkinator = require('linkinator')
const dedent = require('dedent')
const program = require('commander')
const { escapeRegExp } = require('lodash')
const checker = new linkinator.LinkChecker()
const rimraf = require('rimraf').sync
const root = 'https://docs.github.com'
const englishRoot = `${root}/en`
const { deprecated } = require('../lib/enterprise-server-releases')
// [start-readme]
//
// This script runs once per day via a scheduled GitHub Action to check all links in
// English content, not including deprecated Enterprise Server content. It opens an issue
// if it finds broken links. To exclude a link, add it to `lib/excluded-links.js`.
//
// [end-readme]
// Command-line interface: the only option is a dry-run switch that disables recursion.
program
  .description('Check all links in the English docs.')
  .option('-d, --dry-run', 'Turn off recursion to get a fast minimal report (useful for previewing output).')
  .parse(process.argv)

// Skip excluded links defined in separate file. Every entry is regex-escaped so that
// it is matched as a literal string when the value is treated as a regex.
// NOTE(review): escaping also neutralizes any regex syntax (such as `.*`) that an entry
// may contain on purpose; confirm the entries in `lib/excluded-links.js` are meant
// to be literals.
const excludedLinks = require('../lib/excluded-links')
  .map(link => escapeRegExp(link))

// Skip non-English content.
const languagesToSkip = Object.keys(require('../lib/languages'))
  .filter(code => code !== 'en')
  .map(code => `${root}/${code}`)

// Skip deprecated Enterprise content.
// Capture the old format https://docs.github.com/enterprise/2.1/
// and the new format https://docs.github.com/enterprise-server@2.19/.
// The root URL and the version numbers are escaped so that their dots only match
// literal dots (an unescaped `2.1` would also match e.g. `2x1`).
const deprecatedVersions = deprecated
  .map(version => escapeRegExp(version))
  .join('|')
const enterpriseReleasesToSkip = new RegExp(`${escapeRegExp(root)}.+?[/@](${deprecatedVersions})/`)

// Options handed to the link checker's `check()` call.
const config = {
  path: englishRoot,
  concurrency: 300,
  // If this is a dry run, turn off recursion.
  recurse: !program.dryRun,
  silent: true,
  // The values in this array are treated as regexes.
  linksToSkip: [
    enterpriseReleasesToSkip,
    ...languagesToSkip,
    ...excludedLinks
  ]
}
// Run the scan. Unexpected failures (network errors, filesystem errors, ...) are
// reported on stderr, so they do not end up in a redirected report, and use their
// own exit code: 0 = no broken links, 1 = broken links found, 2 = the scan crashed.
main().catch(error => {
  console.error(error)
  process.exit(2)
})

/**
 * Checks every link reachable from the English docs root, writes each link result
 * to `.linkinator/full.log` (one JSON object per line), prints a summary to stdout
 * and terminates the process.
 *
 * Exits with status 1 when at least one broken link was found, otherwise with 0.
 *
 * @returns {Promise<void>} Never resolves normally because the process exits first;
 *   rejects if the scan or the log-file handling throws.
 */
async function main () {
  const startTime = new Date()

  // Clear and recreate a directory for logs.
  const logFile = path.join(__dirname, '../.linkinator/full.log')
  const logDir = path.dirname(logFile)
  rimraf(logDir)
  fs.mkdirSync(logDir, { recursive: true })

  // Append to the logfile after each checked link.
  checker.on('link', result => {
    fs.appendFileSync(logFile, JSON.stringify(result) + '\n')
  })

  // Start the scan; events will be logged as they occur.
  const result = await checker.check(config)

  // Scan is complete! Display the results.
  const endTime = new Date()
  const skippedLinks = result.links.filter(x => x.state === 'SKIPPED')
  const brokenLinks = result.links.filter(x => x.state === 'BROKEN')

  console.log(dedent`
    ${brokenLinks.length} broken links found on docs.github.com
    Link scan completed in ${endTime - startTime}ms
    Total links: ${result.links.length}
    Skipped links: ${skippedLinks.length}
    Broken links: ${brokenLinks.length}
    For more details see ${path.relative(process.cwd(), logFile)}
  `)

  // List the broken links in full and signal failure through the exit code.
  if (brokenLinks.length) {
    console.log('\n\n' + JSON.stringify(brokenLinks, null, 2))
    process.exit(1)
  }

  process.exit(0)
}

View File

@@ -1,134 +0,0 @@
#!/usr/bin/env bash

# [start-readme]
# The script is run once per day via a scheduled GitHub Action to check all links in the site. It automatically opens an issue if it finds broken links.
# To exclude a URL from the link check, add it to `lib/excluded-links.js`.
#
# For checking internal links, see `script/check-internal-links`.
# [end-readme]

# Extra flag forwarded to script/get-blc-command.js; empty means "check all links".
internal=""

# Stop the background server started by this script (no-op if it is already gone).
stop_server () {
  kill -INT "${serverPid}" >/dev/null 2>&1
}

while getopts "h?i" opt; do
  case "${opt}" in
    h|\?)
      echo "Usage:"
      echo " script/check-external-links [OPTIONS] [two-letter language code]"
      echo ""
      echo " script/check-external-links -i Check internal links. Without this flag, check all links."
      echo " script/check-external-links -h Display this help message."
      exit 0
      ;;
    i)
      internal=" --internalOnly"
      ;;
  esac
done
shift $((OPTIND - 1))

if [ -z "${1}" ]; then
  echo "error: must provide two-letter language code"
  exit 1
fi

languageCode="${1}"

# Pass options to script to construct blc command.
# ${internal} is intentionally unquoted: when empty it must expand to no argument at all.
# Exit if the script returned any error (not only exit status 1).
if ! blcCommand="$(./script/get-blc-command.js ${internal} --language "${languageCode}")"; then
  exit 1
fi

# Determine logfile name based on options
if [ -z "${internal}" ]; then
  logfile="blc_output.log"
else
  logfile="blc_output_internal.log"
fi

# Kill any server running in the background, then start the server
# and remember its pid so that exactly this process can be stopped later.
killall node >/dev/null 2>&1
node server.js >/dev/null &
serverPid=$!
sleep 5

host="http://localhost:4000"

# Check whether localhost is accessible (status line must contain a 2xx or 3xx code)
hostStatus=$(curl -I --silent "${host}" | head -1)
isHostOK=$(echo "${hostStatus}" | grep "[23][0-9][0-9]")
if [ -z "${isHostOK}" ]; then
  echo "Can't connect to ${host}!"
  echo "${hostStatus}"
  echo "${isHostOK}"
  # Do not leave the server running in the background when bailing out.
  stop_server
  exit 1
fi

# Execute blc and save output.
# Split the command on whitespace into an array without pathname expansion, because
# the exclusion patterns contain glob characters such as `*` and `?`.
read -r -a blcArgs <<< "${blcCommand}"
"${blcArgs[@]}" > "${logfile}"

# We're done with the server now, so end the process
# killall node will also terminate this script, so kill the specific pid
stop_server

# Recheck "403 Forbidden" results due to a bug
# https://github.com/stevenvachon/broken-link-checker/issues/58
# Also recheck "429" GitHub results
urlsToRecheck=$(grep -E "HTTP_4(03|29)" "${logfile}" | grep -o "http.* ")
if [ -n "${urlsToRecheck}" ]; then
  # Unquoted on purpose: the list is split on whitespace into individual URLs.
  for url in ${urlsToRecheck}; do
    # Curl each URL and grep for 4xx or 5xx in status code response
    status=$(curl -I --silent "${url}" | head -1 | grep "[45][0-9][0-9]")
    if [ -z "${status}" ]; then
      # If no 4/5xx found, the link is NOT really broken, so remove it from the list
      # This command needs to work in all implementations of sed (Mac/GNU/etc)
      # NOTE(review): the URL is used as a sed pattern without escaping.
      sed -i'.bak' -e "s|^.*$url.*$||" "${logfile}"
      # Remove backup file
      rm -f "${logfile}.bak"
    fi
  done
fi

# Count number of broken links in output
# Ignore "308 Permanent Redirect" results, which are not actually broken
numberOfBrokenLinks=$(grep "BROKEN" "${logfile}" | grep -vc HTTP_308)
brokenLinks=$(grep "BROKEN" "${logfile}" | grep -v HTTP_308)

if [ "${numberOfBrokenLinks}" -gt 0 ]; then
  # Print "links" or "link" in message depending on the number found
  if [ "${numberOfBrokenLinks}" -gt 1 ]; then
    linkOrLinks="links"
  else
    linkOrLinks="link"
  fi

  echo -e "\n${numberOfBrokenLinks} broken ${linkOrLinks} found on help.github.com\n"
  echo -e "Note: links that start with 'http://localhost:4000/' are internal links.\n"

  # List broken links
  echo "${brokenLinks}"

  # Update final number of broken links
  echo -e "\n$(tail -2 "${logfile}")" | sed "s|. [0-9]* broken.|. ${numberOfBrokenLinks} broken.|"

  # Exit without failure when checking all links so script/open-broken-links-issue can run;
  # when only internal links are checked, exit with status 1 so the check run fails
  if [ -z "${internal}" ]; then
    exit 0
  else
    exit 1
  fi
else
  echo "All links are good!"
  echo -e "\n$(tail -2 "${logfile}")"
  exit 0
fi

View File

@@ -1,36 +0,0 @@
#!/usr/bin/env bash

# [start-readme]
# This script wraps tests/links-and-images.js and provides an option to output results to a file.
#
# For more information, see `tests/README.md#broken-link-test`.
# [end-readme]

# check if npx is installed
command -v npx >/dev/null 2>&1 || { echo -e "npx is not installed. Run:\n\n\$ npm install -g npx" >&2; exit 1; }

# Parse options: -f FILE sends the test output to FILE, -h (or an unknown flag) prints usage.
while getopts "h?f:" opt; do
  case "${opt}" in
    h|\?)
      echo "Usage:"
      echo " script/check-internal-links [OPTIONS]"
      echo ""
      echo " script/check-internal-links -f [FILENAME] Output the results of tests/links-and-images to a file."
      echo " script/check-internal-links -h Display this help message."
      exit 0
      ;;
    f)
      FILENAME="${OPTARG}"
      ;;
  esac
done
shift $((OPTIND - 1))

if [ -z "${FILENAME}" ]; then
  # No output file requested: run the test with its normal output.
  npx jest links-and-images
else
  echo -e "Running tests/links-and-images.js\n"
  # Quote the target so that file names containing spaces work; capture stdout and stderr.
  npx jest links-and-images --no-color > "${FILENAME}" 2>&1
  echo "Done! Results in ${FILENAME}."
fi

View File

@@ -1,61 +0,0 @@
#!/usr/bin/env node
const supportedLanguages = Object.values(require('../lib/languages')).map(language => language.code)
const excludedLinks = require('../lib/excluded-links')
const program = require('commander')
// [start-readme]
//
// This script parses options for `script/check-external-links`.
//
// [end-readme]
program
  .description('Construct a blc command to run in script/check-external-links')
  .option('-L, --language <lang_code>', 'required language code')
  .option('-i, --internalOnly', 'check internal links only')
  .parse(process.argv)

const { language: languageCode, internalOnly } = program

// Refuse to continue without a language code that the site supports.
const isSupported = Boolean(languageCode) && supportedLanguages.includes(languageCode)
if (!isSupported) {
  console.error(`error: you must provide a currently supported language code: ${supportedLanguages.join(', ')}\n`)
  process.exit(1)
}

// options for blc: https://github.com/stevenvachon/broken-link-checker#command-line-usage
const host = 'http://localhost:4000'
const commandParts = [
  './node_modules/broken-link-checker/bin/blc',
  `${host}/${languageCode}`,
  '--ordered', // maintain order of links in html
  '--recursive',
  '--get',
  '--filter-level 3' // level 3 checks the most assets
]
if (internalOnly) {
  commandParts.push('--exclude-external')
}

// prevent link checker from crawling other languages
// for example, if we're checking links on the English site
// we need to exclude links on Japanese, Chinese, Spanish, etc.
const otherLanguageLinks = supportedLanguages
  .filter(code => code !== languageCode)
  .reduce(
    (links, code) => links.concat(`${host}/${code}/*`, `https://help.github.com/${code}/*`),
    []
  )

// blc can only accept one string per --exclude option
// so every exclusion gets its own --exclude flag
const excludeFlags = excludedLinks
  .concat(otherLanguageLinks)
  .map(link => `--exclude ${link}`)

// output final command
console.log([...commandParts, ...excludeFlags].join(' '))