#!/usr/bin/env node

// [start-readme]
//
// Run this script during the Enterprise deprecation process to download
// static copies of all pages for the oldest supported Enterprise version.
// See the Enterprise deprecation issue template for instructions.
//
// NOTE: If you get this error:
//
//   Error [ERR_MODULE_NOT_FOUND]: Cannot find package 'website-scraper' ...
//
// it's because you haven't installed all the *optional* dependencies.
// To do that, run:
//
//   npm install --include=optional
//
// [end-readme]

import path from 'path'
import fs from 'fs'
import { execSync } from 'child_process'
import scrape from 'website-scraper'
import { program } from 'commander'
import rimraf from 'rimraf'
import http from 'http'
import createApp from '../../lib/app.js'
import EnterpriseServerReleases from '../../lib/enterprise-server-releases.js'
import loadRedirects from '../../lib/redirects/precompile.js'
import { loadPageMap } from '../../lib/page-data.js'
import { languageKeys } from '../../lib/languages.js'

const port = '4001'
const host = `http://localhost:${port}`
const version = EnterpriseServerReleases.oldestSupported
const REMOTE_ENTERPRISE_STORAGE_URL = 'https://githubdocs.azureedge.net/enterprise'

program
  .description(
    'Scrape HTML of the oldest supported Enterprise version and add it to a temp output directory.'
  )
  .option(
    '-o, --output <output>',
    `Output directory for the scraped HTML files and redirects. By default, this temp directory is named 'tmpArchivalDir_<version>'.`
  )
  .option('-d, --dry-run', 'Only scrape the first 10 pages, for testing purposes.')
  .option(
    '-p, --page <page>',
    'Scrape a specific page in all languages. Pass the relative path to the page without a version or language prefix, e.g., /admin/release-notes. Note: this option is only used to re-scrape a page after the version was deprecated. Redirects will not be re-created because most of the deprecated content is already removed.'
  )
  .parse(process.argv)

const output = program.opts().output
const dryRun = program.opts().dryRun
const singlePage = program.opts().page

const tmpArchivalDirectory = output
  ? path.join(process.cwd(), output)
  : path.join(process.cwd(), `tmpArchivalDir_${version}`)

main()

class RewriteAssetPathsPlugin {
  constructor(version, tempDirectory) {
    this.version = version
    this.tempDirectory = tempDirectory
  }

  apply(registerAction) {
    registerAction('onResourceSaved', async ({ resource }) => {
      // Show some activity
      process.stdout.write('.')

      // Only operate on HTML and CSS files
      if (!resource.isHtml() && !resource.isCss()) return

      // Get the text contents of the resource
      const text = resource.getText()
      let newBody = ''

      // Rewrite HTML asset paths. Example:
      // ../assets/images/foo/bar.png ->
      // https://githubdocs.azureedge.net/enterprise/2.17/assets/images/foo/bar.png
      if (resource.isHtml()) {
        newBody = text.replace(
          /(?<attribute>src|href)="(?:\.\.\/|\/)*(?<basepath>_next\/static|javascripts|stylesheets|assets\/fonts|assets\/cb-\d+\/images|node_modules)/g,
          (match, attribute, basepath) => {
            const replaced = `${REMOTE_ENTERPRISE_STORAGE_URL}/${this.version}/${basepath}`
            return `${attribute}="${replaced}`
          }
        )
      }

      // Rewrite CSS asset paths. Examples:
      // url("../assets/fonts/alliance/alliance-no-1-regular.woff") ->
      // url("https://githubdocs.azureedge.net/enterprise/2.20/assets/fonts/alliance/alliance-no-1-regular.woff")
      // url(../../../assets/cb-303/images/octicons/search-24.svg) ->
      // url(https://githubdocs.azureedge.net/enterprise/2.20/assets/cb-303/images/octicons/search-24.svg)
      if (resource.isCss()) {
        newBody = text.replace(
          /(?<attribute>url)(?<paren>\("|\()(?:\.\.\/)*(?<basepath>_next\/static|assets\/fonts|assets\/images|assets\/cb-\d+\/images)/g,
          (match, attribute, paren, basepath) => {
            const replaced = `${REMOTE_ENTERPRISE_STORAGE_URL}/${this.version}/${basepath}`
            return `${attribute}${paren}${replaced}`
          }
        )
      }

      const filePath = path.join(this.tempDirectory, resource.getFilename())
      await fs.promises.writeFile(filePath, newBody, 'binary')
    })
  }
}

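// For orientation: website-scraper calls each plugin's apply(registerAction)
// when scraping starts, and the 'onResourceSaved' action fires after a
// resource has been written to disk. So the plugin above rewrites files that
// are already saved: it reads the resource text, swaps relative asset paths
// for REMOTE_ENTERPRISE_STORAGE_URL ones, and writes the file back in place.
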
async function main() {
  // Build the production assets, to simulate a production deployment
  console.log('Running `npm run build` for production assets')
  execSync('npm run build', { stdio: 'inherit' })
  console.log('Finished building production assets')

  if (dryRun) {
    console.log(
      '\nThis is a dry run! Creating HTML for redirects and scraping the first 10 pages only.'
    )
  }

  if (singlePage) {
    console.log(`\nScraping HTML for a single page only: ${singlePage}`)
  }

  console.log(`Enterprise version to archive: ${version}`)

  const pageName =
    singlePage && singlePage.trim().startsWith('/') ? singlePage.slice(1) : singlePage

  const pageMap = singlePage
    ? languageKeys.map((key) => `/${key}/enterprise-server@${version}/${pageName}`)
    : await loadPageMap()

  const permalinksPerVersion = singlePage
    ? pageMap
    : Object.keys(pageMap).filter((key) => key.includes(`/enterprise-server@${version}`))

  const urls = dryRun
    ? permalinksPerVersion.slice(0, 10).map((href) => `${host}${href}`)
    : permalinksPerVersion.map((href) => `${host}${href}`)

  console.log(`Found ${urls.length} pages for version ${version}`)

  if (dryRun || singlePage) {
    console.log(`\nScraping HTML for these pages only:\n${urls.join('\n')}\n`)
  }

  // Remove any leftover temp directory from a previous run
  rimraf.sync(tmpArchivalDirectory)

  const app = createApp()
  const server = http.createServer(app)

  server
    .listen(port, async () => {
      console.log(`started server on ${host}`)

      await scrape({
        urls,
        urlFilter: (url) => {
          // Do not download assets from other hosts like S3 or octodex.github.com
          // (this will keep them as remote references in the downloaded pages)
          return url.startsWith(`http://localhost:${port}/`)
        },
        directory: tmpArchivalDirectory,
        filenameGenerator: 'bySiteStructure',
        requestConcurrency: 6,
        plugins: [new RewriteAssetPathsPlugin(version, tmpArchivalDirectory)],
      }).catch((err) => {
        console.error('scraping error')
        console.error(err)
      })

      fs.renameSync(
        path.join(tmpArchivalDirectory, `localhost_${port}`),
        path.join(tmpArchivalDirectory, version)
      )

      console.log(`\n\ndone scraping! added files to ${tmpArchivalDirectory}\n`)

      if (!singlePage) {
        // Create a redirects.json file to preserve frontmatter redirects
        await createRedirectsFile(
          permalinksPerVersion,
          pageMap,
          path.join(tmpArchivalDirectory, version)
        )
        console.log(`next step: deprecate ${version} in lib/enterprise-server-releases.js`)
      } else {
        console.log('🏁 Scraping a single page is complete')
      }

      server.close()
    })
    .on('error', (err) => {
      console.log('error listening to port ', port, err)
      server.close()
    })
}

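// The redirects file written by createRedirectsFile() below is a flat JSON
// object mapping old paths to new paths. Roughly (these entries are
// hypothetical, for illustration only):
//
//   {
//     "/en/enterprise/2.17/admin/guides/installation":
//       "/en/enterprise-server@2.17/admin/installation"
//   }
//
// Only old paths that belong to the version being archived are kept.
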
async function createRedirectsFile(permalinks, pageMap, outputDirectory) {
  console.log('Creating redirects file...')
  const pagesPerVersion = permalinks.map((permalink) => pageMap[permalink])
  const redirects = await loadRedirects(pagesPerVersion, pageMap)
  const redirectsPerVersion = {}

  Object.entries(redirects).forEach(([oldPath, newPath]) => {
    // Remove any liquid variables that sneak in
    oldPath = oldPath.replace('/{{ page.version }}', '').replace('/{{ currentVersion }}', '')
    // Ignore any old paths that are not in this version
    if (
      !(
        oldPath.includes(`/enterprise-server@${version}`) ||
        oldPath.includes(`/enterprise/${version}`)
      )
    )
      return
    redirectsPerVersion[oldPath] = newPath
  })

  fs.writeFileSync(
    path.join(outputDirectory, 'redirects.json'),
    JSON.stringify(redirectsPerVersion, null, 2)
  )
  console.log(`Wrote ${outputDirectory}/redirects.json`)
}
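
// Example invocations (illustrative; the filename below is hypothetical,
// substitute the actual path of this script in the repo):
//
//   node archive-version.js --dry-run
//   node archive-version.js --output tmpArchivalDir_test
//   node archive-version.js --page /admin/release-notes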