New broken link report (#16412)
* add linkinator npm package
* add new script that uses Linkinator
* reorg the excluded links file and update comments
* replace blc artifacts with linkinator artifacts in .gitignore
* update the scheduled workflow to use the new script
* dismantle BLC scripts
* add workflow_dispatch event so we can test this manually
* npm uninstall broken-link-checker
* use different exit codes depending on whether broken links are found
This commit is contained in:
12
.github/workflows/check-all-english-links.yml
vendored
12
.github/workflows/check-all-english-links.yml
vendored
@@ -1,6 +1,7 @@
|
|||||||
name: Check all English links
|
name: Check all English links
|
||||||
|
|
||||||
on:
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
schedule:
|
schedule:
|
||||||
- cron: "40 19 * * *" # once a day at 19:40 UTC / 11:40 PST
|
- cron: "40 19 * * *" # once a day at 19:40 UTC / 11:40 PST
|
||||||
|
|
||||||
@@ -10,21 +11,16 @@ jobs:
|
|||||||
if: github.repository == 'github/docs-internal'
|
if: github.repository == 'github/docs-internal'
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@5a4ac9002d0be2fb38bd78e4b4dbde5606d7042f
|
|
||||||
- name: npm ci
|
|
||||||
run: npm ci
|
|
||||||
- name: npm run build
|
|
||||||
run: npm run build
|
|
||||||
- name: Run script
|
- name: Run script
|
||||||
run: script/check-external-links en > broken_links.md
|
run: script/check-english-links > broken_links.md
|
||||||
- name: Check if any broken links
|
- name: Check if any broken links
|
||||||
id: check
|
id: check
|
||||||
run: |
|
run: |
|
||||||
if [ "$(grep 'All links are good' broken_links.md)" ]; then
|
if [ "$(grep '0 broken links found' broken_links.md)" ]; then
|
||||||
echo ::set-output name=continue::no
|
echo ::set-output name=continue::no
|
||||||
else
|
else
|
||||||
echo "::set-output name=continue::yes"
|
echo "::set-output name=continue::yes"
|
||||||
echo "::set-output name=title::$(grep 'found on help.github.com' broken_links.md)"
|
echo "::set-output name=title::$(head -1 broken_links.md)"
|
||||||
fi
|
fi
|
||||||
- if: ${{ steps.check.outputs.continue == 'yes' }}
|
- if: ${{ steps.check.outputs.continue == 'yes' }}
|
||||||
name: Create issue from file
|
name: Create issue from file
|
||||||
|
|||||||
8
.gitignore
vendored
8
.gitignore
vendored
@@ -4,8 +4,6 @@
|
|||||||
node_modules
|
node_modules
|
||||||
npm-debug.log
|
npm-debug.log
|
||||||
coverage
|
coverage
|
||||||
|
.linkinator
|
||||||
# blc: broken link checker
|
broken_links.md
|
||||||
blc_output.log
|
dist
|
||||||
blc_output_internal.log
|
|
||||||
dist
|
|
||||||
|
|||||||
@@ -1,28 +1,20 @@
|
|||||||
|
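// Linkinator treats its skip list as regexes, but script/check-english-links.js runs each entry below through escapeRegExp first, so regex syntax in these strings (e.g. `.*`) is matched literally.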
|
||||||
module.exports = [
|
module.exports = [
|
||||||
// GitHub search links fail with "429 Too Many Requests"
|
// Skip GitHub search links.
|
||||||
'https://github.com/search?*',
|
'https://github.com/search?.*',
|
||||||
'https://github.com/github/gitignore/search?',
|
'https://github.com/github/gitignore/search?',
|
||||||
|
|
||||||
// LinkedIn links fail due to bug: https://github.com/stevenvachon/broken-link-checker/issues/91
|
// These links require auth.
|
||||||
'https://www.linkedin.com/*',
|
'https://github.com/settings/profile',
|
||||||
|
'https://github.com/github/docs/edit',
|
||||||
// blc returns "BLC_UNKNOWN" on this link, even though cURL returns "302 Found"
|
|
||||||
'https://www.ilo.org/dyn/normlex/en/f?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
|
|
||||||
|
|
||||||
// the codercat link works but blc reports a false 404
|
|
||||||
'https://github.com/Codertocat/hello-world-npm/packages/10696?version=1.0.1',
|
|
||||||
|
|
||||||
// this URL started returning 403 to blc and cURL even though it works in a browser; see docs-internal #10124
|
|
||||||
'https://haveibeenpwned.com/',
|
|
||||||
'https://haveibeenpwned.com/*',
|
|
||||||
|
|
||||||
// this is a private repo customers are given access to when they purchase Insights; see docs-internal #12037
|
|
||||||
'https://github.com/github/insights-releases/releases/latest',
|
'https://github.com/github/insights-releases/releases/latest',
|
||||||
|
|
||||||
// developer content uses these for examples; they should not be checked
|
// Developer content uses these for examples; they should not be checked.
|
||||||
'http://localhost:1234/*',
|
'http://localhost:1234',
|
||||||
'localhost:3000',
|
'localhost:3000',
|
||||||
|
|
||||||
// this URL works but blc reports a false 404
|
// One-off links that link checkers think are broken but are not.
|
||||||
|
'https://haveibeenpwned.com/',
|
||||||
|
'https://www.ilo.org/dyn/normlex/en/f?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
|
||||||
'http://www.w3.org/wiki/LinkHeader/'
|
'http://www.w3.org/wiki/LinkHeader/'
|
||||||
]
|
]
|
||||||
|
|||||||
1100
package-lock.json
generated
1100
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -50,6 +50,7 @@
|
|||||||
"js-cookie": "^2.2.1",
|
"js-cookie": "^2.2.1",
|
||||||
"js-yaml": "^3.14.0",
|
"js-yaml": "^3.14.0",
|
||||||
"lil-env-thing": "^1.0.0",
|
"lil-env-thing": "^1.0.0",
|
||||||
|
"linkinator": "^2.2.2",
|
||||||
"liquid": "^5.1.0",
|
"liquid": "^5.1.0",
|
||||||
"lodash": "^4.17.19",
|
"lodash": "^4.17.19",
|
||||||
"mini-css-extract-plugin": "^0.9.0",
|
"mini-css-extract-plugin": "^0.9.0",
|
||||||
@@ -79,7 +80,6 @@
|
|||||||
"await-sleep": "0.0.1",
|
"await-sleep": "0.0.1",
|
||||||
"aws-sdk": "^2.610.0",
|
"aws-sdk": "^2.610.0",
|
||||||
"babel-eslint": "^10.1.0",
|
"babel-eslint": "^10.1.0",
|
||||||
"broken-link-checker": "^0.7.8",
|
|
||||||
"chalk": "^4.0.0",
|
"chalk": "^4.0.0",
|
||||||
"commander": "^6.2.0",
|
"commander": "^6.2.0",
|
||||||
"count-array-values": "^1.2.1",
|
"count-array-values": "^1.2.1",
|
||||||
|
|||||||
95
script/check-english-links.js
Executable file
95
script/check-english-links.js
Executable file
@@ -0,0 +1,95 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
|
||||||
|
const path = require('path')
|
||||||
|
const fs = require('fs')
|
||||||
|
const linkinator = require('linkinator')
|
||||||
|
const dedent = require('dedent')
|
||||||
|
const program = require('commander')
|
||||||
|
const { escapeRegExp } = require('lodash')
|
||||||
|
const checker = new linkinator.LinkChecker()
|
||||||
|
const rimraf = require('rimraf').sync
|
||||||
|
const root = 'https://docs.github.com'
|
||||||
|
const englishRoot = `${root}/en`
|
||||||
|
const { deprecated } = require('../lib/enterprise-server-releases')
|
||||||
|
|
||||||
|
// [start-readme]
|
||||||
|
//
|
||||||
|
// This script runs once per day via a scheduled GitHub Action to check all links in
|
||||||
|
// English content, not including deprecated Enterprise Server content. It opens an issue
|
||||||
|
// if it finds broken links. To exclude a link, add it to `lib/excluded-links.js`.
|
||||||
|
//
|
||||||
|
// [end-readme]
|
||||||
|
|
||||||
|
program
|
||||||
|
.description('Check all links in the English docs.')
|
||||||
|
.option('-d, --dry-run', 'Turn off recursion to get a fast minimal report (useful for previewing output).')
|
||||||
|
.parse(process.argv)
|
||||||
|
|
||||||
|
// Skip excluded links defined in separate file.
|
||||||
|
const excludedLinks = require('../lib/excluded-links')
|
||||||
|
.map(link => escapeRegExp(link))
|
||||||
|
|
||||||
|
// Skip non-English content.
|
||||||
|
const languagesToSkip = Object.keys(require('../lib/languages'))
|
||||||
|
.filter(code => code !== 'en')
|
||||||
|
.map(code => `${root}/${code}`)
|
||||||
|
|
||||||
|
// Skip deprecated Enterprise content.
|
||||||
|
// Capture the old format https://docs.github.com/enterprise/2.1/
|
||||||
|
// and the new format https://docs.github.com/enterprise-server@2.19/.
|
||||||
|
const enterpriseReleasesToSkip = new RegExp(`${root}.+?[/@](${deprecated.join('|')})/`)
|
||||||
|
|
||||||
|
const config = {
|
||||||
|
path: englishRoot,
|
||||||
|
concurrency: 300,
|
||||||
|
// If this is a dry run, turn off recursion.
|
||||||
|
recurse: !program.dryRun,
|
||||||
|
silent: true,
|
||||||
|
// The values in this array are treated as regexes.
|
||||||
|
linksToSkip: [
|
||||||
|
enterpriseReleasesToSkip,
|
||||||
|
...languagesToSkip,
|
||||||
|
...excludedLinks
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
main()
|
||||||
|
|
||||||
|
async function main () {
|
||||||
|
const startTime = new Date()
|
||||||
|
|
||||||
|
// Clear and recreate a directory for logs.
|
||||||
|
const logFile = path.join(__dirname, '../.linkinator/full.log')
|
||||||
|
rimraf(path.dirname(logFile))
|
||||||
|
fs.mkdirSync(path.dirname(logFile), { recursive: true })
|
||||||
|
|
||||||
|
// Append each checked link's result to the logfile as the scan progresses.
|
||||||
|
checker.on('link', result => {
|
||||||
|
fs.appendFileSync(logFile, JSON.stringify(result) + '\n')
|
||||||
|
})
|
||||||
|
|
||||||
|
// Start the scan; events will be logged as they occur.
|
||||||
|
const result = await checker.check(config)
|
||||||
|
|
||||||
|
// Scan is complete! Display the results.
|
||||||
|
const endTime = new Date()
|
||||||
|
const skippedLinks = result.links.filter(x => x.state === 'SKIPPED')
|
||||||
|
const brokenLinks = result.links.filter(x => x.state === 'BROKEN')
|
||||||
|
|
||||||
|
console.log(dedent`
|
||||||
|
${brokenLinks.length} broken links found on docs.github.com
|
||||||
|
|
||||||
|
Link scan completed in ${endTime - startTime}ms
|
||||||
|
Total links: ${result.links.length}
|
||||||
|
Skipped links: ${skippedLinks.length}
|
||||||
|
Broken links: ${brokenLinks.length}
|
||||||
|
For more details see ${path.relative(process.cwd(), logFile)}
|
||||||
|
`)
|
||||||
|
|
||||||
|
if (brokenLinks.length) {
|
||||||
|
console.log('\n\n' + JSON.stringify(brokenLinks, null, 2))
|
||||||
|
process.exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
process.exit(0)
|
||||||
|
}
|
||||||
@@ -1,134 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# [start-readme]
|
|
||||||
# The script is run once per day via a scheduled GitHub Action to check all links in the site. It automatically opens an issue if it finds broken links.
|
|
||||||
# To exclude a URL from the link check, add it to `lib/excluded-links.js`.
|
|
||||||
#
|
|
||||||
# For checking internal links, see `script/check-internal-links`.
|
|
||||||
# [end-readme]
|
|
||||||
|
|
||||||
internal=""
|
|
||||||
while getopts "h?i" opt; do
|
|
||||||
case "${opt}" in
|
|
||||||
h|\?) echo "Usage:"
|
|
||||||
echo " script/check-external-links [OPTIONS] [two-letter language code]"
|
|
||||||
echo ""
|
|
||||||
echo " script/check-external-links -i Check internal links. Without this flag, check all links."
|
|
||||||
echo " script/check-external-links -h Display this help message."
|
|
||||||
exit 0
|
|
||||||
;;
|
|
||||||
i) internal=" --internalOnly"
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
shift $((OPTIND -1))
|
|
||||||
|
|
||||||
if [ -z "${1}" ]
|
|
||||||
then
|
|
||||||
echo "error: must provide two-letter language code"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
languageCode=${1}
|
|
||||||
|
|
||||||
# Pass options to script to construct blc command
|
|
||||||
blcCommand="$(./script/get-blc-command.js ${internal} --language ${languageCode})"
|
|
||||||
|
|
||||||
# Exit if script returned an error
|
|
||||||
if test $? -eq 1
|
|
||||||
then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Determine logfile name based on options
|
|
||||||
logfile=""
|
|
||||||
if [ -z "${internal}" ]
|
|
||||||
then
|
|
||||||
logfile="blc_output.log"
|
|
||||||
else
|
|
||||||
logfile="blc_output_internal.log"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Kill any server running in the background, then start the server
|
|
||||||
killall node >/dev/null 2>&1
|
|
||||||
node server.js >/dev/null &
|
|
||||||
sleep 5
|
|
||||||
|
|
||||||
host="http://localhost:4000"
|
|
||||||
|
|
||||||
# Check whether localhost is accessible
|
|
||||||
hostStatus=$(curl -I --silent "${host}" | head -1)
|
|
||||||
isHostOK=$(echo "${hostStatus}" | grep "[2|3][0-9][0-9]")
|
|
||||||
if [ -z "${isHostOK}" ]
|
|
||||||
then
|
|
||||||
echo "Can't connect to ${host}!"
|
|
||||||
echo ${hostStatus}
|
|
||||||
echo ${isHostOK}
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Execute blc and save output
|
|
||||||
${blcCommand[@]} > ${logfile}
|
|
||||||
|
|
||||||
# We're done with the server now, so end the process
|
|
||||||
# killall node will also terminate this script, so find and kill the specific pid
|
|
||||||
pid=$(ps aux | grep "node server.js" | grep -v "grep" | awk '{ print $2 }'); kill -INT $pid >/dev/null 2>&1
|
|
||||||
|
|
||||||
# Recheck "403 Forbidden" results due to a bug
|
|
||||||
# https://github.com/stevenvachon/broken-link-checker/issues/58
|
|
||||||
# Also recheck "429" GitHub results
|
|
||||||
urlsToRecheck=$(egrep "HTTP_4(03|29)" ${logfile} | grep -o "http.* ")
|
|
||||||
|
|
||||||
if [ ! -z "${urlsToRecheck}" ]
|
|
||||||
then
|
|
||||||
for url in ${urlsToRecheck}
|
|
||||||
do
|
|
||||||
# Curl each URL and grep for 4xx or 5xx in status code response
|
|
||||||
status=$(curl -I --silent "${url}" | head -1 | grep "[4|5][0-9][0-9]")
|
|
||||||
if [ -z "${status}" ]
|
|
||||||
then
|
|
||||||
# If no 4/5xx found, the link is NOT really broken, so remove it from the list
|
|
||||||
# This command needs to work in all implementations of sed (Mac/GNU/etc)
|
|
||||||
sed -i'.bak' -e "s|^.*$url.*$||" ${logfile}
|
|
||||||
# Remove backup file
|
|
||||||
find . -name "${logfile}.bak" | xargs rm
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Count number of broken links in output
|
|
||||||
# Ignore "308 Permanent Redirect" results, which are not actually broken
|
|
||||||
numberOfBrokenLinks=$(grep "BROKEN" ${logfile} | grep -vc HTTP_308)
|
|
||||||
brokenLinks=$(grep "BROKEN" ${logfile} | grep -v HTTP_308)
|
|
||||||
|
|
||||||
# If broken links are found, exit with status 1 so the check run fails
|
|
||||||
if [ ${numberOfBrokenLinks} -gt 0 ]
|
|
||||||
then
|
|
||||||
# Print "links" or "link" in message depending on the number found
|
|
||||||
if [ ${numberOfBrokenLinks} -gt 1 ]
|
|
||||||
then
|
|
||||||
linkOrLinks="links"
|
|
||||||
else
|
|
||||||
linkOrLinks="link"
|
|
||||||
fi
|
|
||||||
echo -e "\n${numberOfBrokenLinks} broken ${linkOrLinks} found on help.github.com\n"
|
|
||||||
echo -e "Note: links that start with 'http://localhost:4000/' are internal links.\n"
|
|
||||||
|
|
||||||
# List broken links
|
|
||||||
echo "${brokenLinks}"
|
|
||||||
|
|
||||||
# Update final number of broken links
|
|
||||||
echo -e "\n$(tail -2 ${logfile})" | sed "s|. [0-9]* broken.|. ${numberOfBrokenLinks} broken.|"
|
|
||||||
|
|
||||||
# Exit without failure when checking all links so script/open-broken-links-issue can run
|
|
||||||
if [ -z "${internal}" ]
|
|
||||||
then
|
|
||||||
exit 0
|
|
||||||
else
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
echo "All links are good!"
|
|
||||||
echo -e "\n$(tail -2 ${logfile})"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# [start-readme]
|
|
||||||
# This script wraps tests/links-and-images.js and provides an option to output results to a file.
|
|
||||||
#
|
|
||||||
# For more information, see `tests/README.md#broken-link-test`.
|
|
||||||
# [end-readme]
|
|
||||||
|
|
||||||
# check if npx is installed
|
|
||||||
command -v npx >/dev/null 2>&1 || { echo -e "npx is not installed. Run:\n\n\$ npm install -g npx" >&2; exit 1; }
|
|
||||||
|
|
||||||
while getopts "h?f:" opt; do
|
|
||||||
case "${opt}" in
|
|
||||||
h|\?) echo "Usage:"
|
|
||||||
echo " script/check-internal-links [OPTIONS]"
|
|
||||||
echo ""
|
|
||||||
echo " script/check-internal-links -f [FILENAME] Output the results of tests/links-and-images to a file."
|
|
||||||
echo " script/check-internal-links -h Display this help message."
|
|
||||||
exit 0
|
|
||||||
;;
|
|
||||||
f) FILENAME="${OPTARG}"
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
shift $((OPTIND -1))
|
|
||||||
|
|
||||||
if [ "${FILENAME}" = "" ]
|
|
||||||
then
|
|
||||||
npx jest links-and-images
|
|
||||||
else
|
|
||||||
echo -e "Running tests/links-and-images.js\n"
|
|
||||||
|
|
||||||
npx jest links-and-images --no-color > ${FILENAME} 2>&1
|
|
||||||
|
|
||||||
echo "Done! Results in ${FILENAME}."
|
|
||||||
fi
|
|
||||||
@@ -1,61 +0,0 @@
|
|||||||
#!/usr/bin/env node
|
|
||||||
|
|
||||||
const supportedLanguages = Object.values(require('../lib/languages')).map(language => language.code)
|
|
||||||
const excludedLinks = require('../lib/excluded-links')
|
|
||||||
const program = require('commander')
|
|
||||||
|
|
||||||
// [start-readme]
|
|
||||||
//
|
|
||||||
// This script parses options for `script/check-external-links`.
|
|
||||||
//
|
|
||||||
// [end-readme]
|
|
||||||
|
|
||||||
program
|
|
||||||
.description('Construct a blc command to run in script/check-external-links')
|
|
||||||
.option('-L, --language <lang_code>', 'required language code')
|
|
||||||
.option('-i, --internalOnly', 'check internal links only')
|
|
||||||
.parse(process.argv)
|
|
||||||
|
|
||||||
const languageCode = program.language
|
|
||||||
|
|
||||||
if (!languageCode || !supportedLanguages.includes(languageCode)) {
|
|
||||||
console.error(`error: you must provide a currently supported language code: ${supportedLanguages.join(', ')}\n`)
|
|
||||||
process.exit(1)
|
|
||||||
}
|
|
||||||
|
|
||||||
// options for blc: https://github.com/stevenvachon/broken-link-checker#command-line-usage
|
|
||||||
const blcPackage = './node_modules/broken-link-checker/bin/blc'
|
|
||||||
const host = 'http://localhost:4000'
|
|
||||||
const hostWithLanguage = `${host}/${languageCode}`
|
|
||||||
const maintainOrder = '--ordered' // maintain order of links in html
|
|
||||||
const recursive = '--recursive'
|
|
||||||
const requestMethod = '--get'
|
|
||||||
const filterLevel = '--filter-level 3' // level 3 checks the most assets
|
|
||||||
const excludeExternal = program.internalOnly ? ' --exclude-external' : ''
|
|
||||||
|
|
||||||
let command = `${blcPackage} ${hostWithLanguage} ${maintainOrder} ${recursive} ${requestMethod} ${filterLevel}${excludeExternal}`
|
|
||||||
|
|
||||||
let excludeStrings = ''
|
|
||||||
|
|
||||||
// blc can only accept one string per --exclude option
|
|
||||||
// so we need to construct multiple strings from exclusions array
|
|
||||||
excludedLinks.forEach(excludedLink => {
|
|
||||||
excludeStrings = `${excludeStrings} --exclude ${excludedLink}`
|
|
||||||
})
|
|
||||||
|
|
||||||
// prevent link checker from crawling other languages
|
|
||||||
// for example, if we're checking links on the English site
|
|
||||||
// we need to exclude links on Japanese, Chinese, Spanish, etc.
|
|
||||||
supportedLanguages.forEach(supportedLanguage => {
|
|
||||||
if (supportedLanguage === languageCode) return
|
|
||||||
const internalLink = `${host}/${supportedLanguage}/*`
|
|
||||||
const externalLink = `https://help.github.com/${supportedLanguage}/*`
|
|
||||||
excludeStrings = `${excludeStrings} --exclude ${internalLink}`
|
|
||||||
excludeStrings = `${excludeStrings} --exclude ${externalLink}`
|
|
||||||
})
|
|
||||||
|
|
||||||
// get final command
|
|
||||||
command = command + excludeStrings
|
|
||||||
|
|
||||||
// output final command
|
|
||||||
console.log(command)
|
|
||||||
Reference in New Issue
Block a user