225 lines
8.0 KiB
JavaScript
Executable File
225 lines
8.0 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
|
|
// [start-readme]
|
|
//
|
|
// Run this script during the Enterprise deprecation process to download
|
|
// static copies of all pages for the oldest supported Enterprise version.
|
|
// See the Enterprise deprecation issue template for instructions.
|
|
//
|
|
// NOTE: If you get this error:
|
|
//
|
|
// Error [ERR_MODULE_NOT_FOUND]: Cannot find package 'website-scraper' ...
|
|
//
|
|
// it's because you haven't installed all the *optional* dependencies.
|
|
// To do that, run:
|
|
//
|
|
// npm install --include=optional
|
|
//
|
|
// [end-readme]
|
|
|
|
import path from 'path'
|
|
import fs from 'fs'
|
|
import { execSync } from 'child_process'
|
|
import scrape from 'website-scraper'
|
|
import { program } from 'commander'
|
|
import rimraf from 'rimraf'
|
|
import http from 'http'
|
|
|
|
import createApp from '../../lib/app.js'
|
|
import EnterpriseServerReleases from '../../lib/enterprise-server-releases.js'
|
|
import loadRedirects from '../../lib/redirects/precompile.js'
|
|
import { loadPageMap } from '../../lib/page-data.js'
|
|
import { languageKeys } from '../../lib/languages.js'
|
|
|
|
const port = '4001'
|
|
const host = `http://localhost:${port}`
|
|
const version = EnterpriseServerReleases.oldestSupported
|
|
const REMOTE_ENTERPRISE_STORAGE_URL = 'https://githubdocs.azureedge.net/enterprise'
|
|
|
|
program
|
|
.description(
|
|
'Scrape HTML of the oldest supported Enterprise version and add it to a temp output directory.'
|
|
)
|
|
.option(
|
|
'-o, --output <PATH>',
|
|
`output directory to place scraped HTML files and redirects. By default, this temp directory is named 'tmpArchivalDir_<VERSION_TO_DEPRECATE>'`
|
|
)
|
|
.option('-d, --dry-run', 'only scrape the first 10 pages for testing purposes')
|
|
.option(
|
|
'-p, --page <PATH>',
|
|
'Note: this option is only used to re-scrape a page after the version was deprecated. Redirects will not be re-created because most of the deprecated content is already removed. This option scrapes a specific page in all languages. Pass the relative path to the page without a version or language prefix. ex: /admin/release-notes'
|
|
)
|
|
.parse(process.argv)
|
|
|
|
const output = program.opts().output
|
|
const dryRun = program.opts().dryRun
|
|
const singlePage = program.opts().page
|
|
const tmpArchivalDirectory = output
|
|
? path.join(process.cwd(), output)
|
|
: path.join(process.cwd(), `tmpArchivalDir_${version}`)
|
|
|
|
main()
|
|
|
|
class RewriteAssetPathsPlugin {
|
|
constructor(version, tempDirectory) {
|
|
this.version = version
|
|
this.tempDirectory = tempDirectory
|
|
}
|
|
|
|
apply(registerAction) {
|
|
registerAction('onResourceSaved', async ({ resource }) => {
|
|
// Show some activity
|
|
process.stdout.write('.')
|
|
|
|
// Only operate on HTML files
|
|
if (!resource.isHtml() && !resource.isCss()) return
|
|
|
|
// Get the text contents of the resource
|
|
const text = resource.getText()
|
|
let newBody = ''
|
|
|
|
// Rewrite HTML asset paths. Example:
|
|
// ../assets/images/foo/bar.png ->
|
|
// https://githubdocs.azureedge.net/github-images/enterprise/2.17/assets/images/foo/bar.png
|
|
if (resource.isHtml()) {
|
|
newBody = text.replace(
|
|
/(?<attribute>src|href)="(?:\.\.\/|\/)*(?<basepath>_next\/static|javascripts|stylesheets|assets\/fonts|assets\/cb-\d+\/images|node_modules)/g,
|
|
(match, attribute, basepath) => {
|
|
const replaced = `${REMOTE_ENTERPRISE_STORAGE_URL}/${this.version}/${basepath}`
|
|
return `${attribute}="${replaced}`
|
|
}
|
|
)
|
|
}
|
|
|
|
// Rewrite CSS asset paths. Example
|
|
// url("../assets/fonts/alliance/alliance-no-1-regular.woff") ->
|
|
// url("https://githubdocs.azureedge.net/github-images/enterprise/2.20/assets/fonts/alliance/alliance-no-1-regular.woff")
|
|
// url(../../../assets/cb-303/images/octicons/search-24.svg) ->
|
|
// url(https://githubdocs.azureedge.net/github-images/enterprise/2.20/assets/cb-303/images/octicons/search-24.svg)
|
|
if (resource.isCss()) {
|
|
newBody = text.replace(
|
|
/(?<attribute>url)(?<paren>\("|\()(?:\.\.\/)*(?<basepath>_next\/static|assets\/fonts|assets\/images|assets\/cb-\d+\/images)/g,
|
|
(match, attribute, paren, basepath) => {
|
|
const replaced = `${REMOTE_ENTERPRISE_STORAGE_URL}/${this.version}/${basepath}`
|
|
return `${attribute}${paren}${replaced}`
|
|
}
|
|
)
|
|
}
|
|
|
|
const filePath = path.join(this.tempDirectory, resource.getFilename())
|
|
await fs.promises.writeFile(filePath, newBody, 'binary')
|
|
})
|
|
}
|
|
}
|
|
|
|
async function main() {
|
|
// Build the production assets, to simulate a production deployment
|
|
console.log('Running `npm run build` for production assets')
|
|
execSync('npm run build', { stdio: 'inherit' })
|
|
console.log('Finish building production assets')
|
|
if (dryRun) {
|
|
console.log(
|
|
'\nThis is a dry run! Creating HTML for redirects and scraping the first 10 pages only.'
|
|
)
|
|
}
|
|
if (singlePage) {
|
|
console.log(`\nScraping HTML for a single page only ${singlePage}.`)
|
|
}
|
|
console.log(`Enterprise version to archive: ${version}`)
|
|
const pageName =
|
|
singlePage && singlePage.trim().startsWith('/') ? singlePage.slice(1) : singlePage
|
|
const pageMap = singlePage
|
|
? languageKeys.map((key) => `/${key}/enterprise-server@${version}/${pageName}`)
|
|
: await loadPageMap()
|
|
const permalinksPerVersion = singlePage
|
|
? pageMap
|
|
: Object.keys(pageMap).filter((key) => key.includes(`/enterprise-server@${version}`))
|
|
|
|
const urls = dryRun
|
|
? permalinksPerVersion.slice(0, 10).map((href) => `${host}${href}`)
|
|
: permalinksPerVersion.map((href) => `${host}${href}`)
|
|
|
|
console.log(`Found ${urls.length} pages for version ${version}`)
|
|
|
|
if (dryRun || singlePage) {
|
|
console.log(`\nScraping html for these pages only:\n${urls.join('\n')}\n`)
|
|
}
|
|
|
|
// remove temp directory
|
|
rimraf.sync(tmpArchivalDirectory)
|
|
|
|
const app = createApp()
|
|
const server = http.createServer(app)
|
|
server
|
|
.listen(port, async () => {
|
|
console.log(`started server on ${host}`)
|
|
|
|
await scrape({
|
|
urls,
|
|
urlFilter: (url) => {
|
|
// Do not download assets from other hosts like S3 or octodex.github.com
|
|
// (this will keep them as remote references in the downloaded pages)
|
|
return url.startsWith(`http://localhost:${port}/`)
|
|
},
|
|
directory: tmpArchivalDirectory,
|
|
filenameGenerator: 'bySiteStructure',
|
|
requestConcurrency: 6,
|
|
plugins: [new RewriteAssetPathsPlugin(version, tmpArchivalDirectory)],
|
|
}).catch((err) => {
|
|
console.error('scraping error')
|
|
console.error(err)
|
|
})
|
|
|
|
fs.renameSync(
|
|
path.join(tmpArchivalDirectory, `/localhost_${port}`),
|
|
path.join(tmpArchivalDirectory, version)
|
|
)
|
|
|
|
console.log(`\n\ndone scraping! added files to ${tmpArchivalDirectory}\n`)
|
|
if (!singlePage) {
|
|
// create redirect html files to preserve frontmatter redirects
|
|
await createRedirectsFile(
|
|
permalinksPerVersion,
|
|
pageMap,
|
|
path.join(tmpArchivalDirectory, version)
|
|
)
|
|
console.log(`next step: deprecate ${version} in lib/enterprise-server-releases.js`)
|
|
} else {
|
|
console.log('🏁 Scraping a single page is complete')
|
|
}
|
|
server.close()
|
|
})
|
|
.on('error', (err) => {
|
|
console.log('error listening to port ', port, err)
|
|
server.close()
|
|
})
|
|
}
|
|
|
|
async function createRedirectsFile(permalinks, pageMap, outputDirectory) {
|
|
console.log('Creating redirects file...')
|
|
const pagesPerVersion = permalinks.map((permalink) => pageMap[permalink])
|
|
const redirects = await loadRedirects(pagesPerVersion, pageMap)
|
|
const redirectsPerVersion = {}
|
|
|
|
Object.entries(redirects).forEach(([oldPath, newPath]) => {
|
|
// remove any liquid variables that sneak in
|
|
oldPath = oldPath.replace('/{{ page.version }}', '').replace('/{{ currentVersion }}', '')
|
|
// ignore any old paths that are not in this version
|
|
if (
|
|
!(
|
|
oldPath.includes(`/enterprise-server@${version}`) ||
|
|
oldPath.includes(`/enterprise/${version}`)
|
|
)
|
|
)
|
|
return
|
|
|
|
redirectsPerVersion[oldPath] = newPath
|
|
})
|
|
|
|
fs.writeFileSync(
|
|
path.join(outputDirectory, 'redirects.json'),
|
|
JSON.stringify(redirectsPerVersion, null, 2)
|
|
)
|
|
console.log(`Wrote ${outputDirectory}/redirects.json`)
|
|
}
|