From 44db141ee3c51b78a39b571828fa84b0d093ef91 Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Thu, 30 Mar 2023 10:47:55 -0400 Subject: [PATCH] Disk cache getRemoteJSON (#35872) --- .github/workflows/test.yml | 9 ++ .gitignore | 3 + middleware/archived-enterprise-versions.js | 24 +---- middleware/get-remote-json.js | 87 ++++++++++++++++ tests/unit/get-remote-json.js | 115 +++++++++++++++++++++ 5 files changed, 215 insertions(+), 23 deletions(-) create mode 100644 middleware/get-remote-json.js create mode 100644 tests/unit/get-remote-json.js diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4234b8f29f..04348af4ae 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -153,6 +153,15 @@ jobs: - name: Run build script run: npm run build + - name: Disk cache used by getRemoteJSON function in middleware + uses: actions/cache@9b0c1fce7a93df8e3bb8926b0d6e9d89e92f20a7 + with: + path: .remotejson-cache + # Very liberal cache key. Note, for this to become populated + # for other branches, you have to manually run this workflow + # at least once using the "Run workflow" button. + key: ${{ runner.os }}-remotejson + - name: Index fixtures into the local Elasticsearch # For the sake of saving time, only run this step if the group # is one that will run tests against an Elasticsearch on localhost. 
diff --git a/.gitignore b/.gitignore index 1efe6cc410..4276911e17 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,6 @@ semmle-code .installed.package-lock.json assets/images/help/writing/unordered-list-rendered (1).png + +# Used by getRemoteJSON() +.remotejson-cache/ diff --git a/middleware/archived-enterprise-versions.js b/middleware/archived-enterprise-versions.js index cf2ff340c1..3cf1ed1888 100644 --- a/middleware/archived-enterprise-versions.js +++ b/middleware/archived-enterprise-versions.js @@ -16,6 +16,7 @@ import { readCompressedJsonFileFallbackLazily } from '../lib/read-json-file.js' import { archivedCacheControl, languageCacheControl } from './cache-control.js' import { pathLanguagePrefixed, languagePrefixPathRegex } from '../lib/languages.js' import getRedirect, { splitPathByLanguage } from '../lib/get-redirect.js' +import getRemoteJSON from './get-remote-json.js' const REMOTE_ENTERPRISE_STORAGE_URL = 'https://githubdocs.azureedge.net/enterprise' @@ -75,29 +76,6 @@ const retryConfiguration = { limit: 3 } // unnecessary error reporting. const timeoutConfiguration = { response: 1500 } -async function getRemoteJSON(url, config) { - let fromCache = true - if (!_getRemoteJSONCache.has(url)) { - fromCache = false - // got will, by default, follow redirects and it will throw if the ultimate - // response is not a 2xx. - // But it's possible that the page is a 200 OK but it's just not a JSON - // page at all. Then we can't assume we can deserialize it. 
import path from 'path'
import fs from 'fs'
import crypto from 'crypto'

import got from 'got'
import statsd from '../lib/statsd.js'

// Exported only so the unit tests can purge entries with a mutation and
// thereby simulate an in-memory cache miss.
export const cache = new Map()

const inProd = process.env.NODE_ENV === 'production'

/**
 * Wrapper on `got()` that caches both in memory and on disk.
 *
 * We use this for downloading `redirects.json` files from the
 * help-docs-archived-enterprise-versions repo as a proxy. A lot of those
 * .json files are large and they're also static, which makes them ideal
 * for caching.
 *
 * There are 2 layers of caching here:
 *   1. Is it in the in-memory cache (the exported `cache` Map)?
 *   2. No, is it on disk (in `.remotejson-cache/` — never in production)?
 *   3. No, download from the internet, then store the response in
 *      memory and on disk.
 *
 * @param {string} url - URL to fetch; also serves as the cache key
 * @param {object} config - options forwarded to `got()` on a cache miss
 * @returns {Promise<any>} the deserialized JSON payload
 * @throws {Error} if the ultimate response is not `application/json`
 */
export default async function getRemoteJSON(url, config) {
  // We could get fancy and make the cache key depend on the `config` too
  // given that this is A) only used for archived enterprise stuff,
  // and B) the config is only applicable on cache miss when doing the `got()`.
  const cacheKey = url

  // Assume it's in the in-memory cache first.
  // Later we'll update this if we find we need to.
  let fromCache = 'memory'

  if (!cache.has(cacheKey)) {
    fromCache = 'not'

    let foundOnDisk = false
    const tempFilename = crypto.createHash('md5').update(url).digest('hex')

    // Do this here instead of at the top of the file so that it becomes
    // possible to override this in unit tests.
    const ROOT = process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT || '.remotejson-cache'

    const onDisk = path.join(ROOT, `${tempFilename}.json`)
    // Never even try reading from disk in production.
    if (!inProd && fs.existsSync(onDisk)) {
      const body = fs.readFileSync(onDisk, 'utf-8')
      // It might exist on disk, but it could be empty
      if (body) {
        try {
          // It might be corrupted JSON.
          cache.set(cacheKey, JSON.parse(body))
          fromCache = 'disk'
          foundOnDisk = true
        } catch (error) {
          // Corrupt JSON on disk falls through to a fresh network fetch;
          // anything unexpected is rethrown.
          if (!(error instanceof SyntaxError)) {
            throw error
          }
        }
      }
    }

    if (!foundOnDisk) {
      // got will, by default, follow redirects and it will throw if the ultimate
      // response is not a 2xx.
      // But it's possible that the page is a 200 OK but it's just not a JSON
      // page at all. Then we can't assume we can deserialize it.
      const res = await got(url, config)
      // The Content-Type header can be entirely absent; default to '' so we
      // throw our descriptive error below instead of a TypeError from
      // calling `.startsWith` on `undefined`.
      const contentType = res.headers['content-type'] || ''
      if (!contentType.startsWith('application/json')) {
        throw new Error(`Fetching '${url}' resulted in a non-JSON response (${contentType})`)
      }
      cache.set(cacheKey, JSON.parse(res.body))

      // Only write to disk for testing and local preview.
      // In production, we never write to disk. Only in-memory.
      if (!inProd) {
        fs.mkdirSync(path.dirname(onDisk), { recursive: true })
        fs.writeFileSync(onDisk, res.body, 'utf-8')
      }
    }
  }
  const tags = [`url:${url}`, `from_cache:${fromCache}`]
  statsd.increment('middleware.get_remote_json', 1, tags)
  return cache.get(cacheKey)
}
+ const data2 = await getRemoteJSON(url, {}) + expect(data2.foo).toBe('bar') + expect(cache.get(url)).toBeTruthy() + }) + + test('benefit from disk-based caching', async () => { + const url = 'http://example.com/cool.json' + const { origin, pathname } = new URL(url) + nock(origin).get(pathname).reply(200, { cool: true }) + const data = await getRemoteJSON(url, {}) + expect(data.cool).toBe(true) + expect(cache.get(url)).toBeTruthy() + cache.delete(url) + + // This time, the nock won't fail despite not using `.persist()`. + // That means it didn't need the network because it was able to + // use the disk cache. + const data2 = await getRemoteJSON(url, {}) + expect(data2.cool).toBe(true) + }) + + test('recover from disk corruption (empty)', async () => { + const tempTempDir = path.join(tempDir, 'empty-files') + process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = tempTempDir + const url = 'http://example.com/empty.json' + const { origin, pathname } = new URL(url) + nock(origin).get(pathname).reply(200, { cool: true }) + await getRemoteJSON(url, {}) + + // Make every file in the cache directory an empty file + for (const file of fs.readdirSync(tempTempDir)) { + fs.writeFileSync(path.join(tempTempDir, file), '') + } + + cache.delete(url) + // If we don't do this, nock will fail because a second network + // request became necessary. 
+ nock(origin).get(pathname).reply(200, { cool: true }) + + const data = await getRemoteJSON(url, {}) + expect(data.cool).toBe(true) + }) + + test('recover from disk corruption (bad JSON)', async () => { + const tempTempDir = path.join(tempDir, 'corrupt-files') + process.env.GET_REMOTE_JSON_DISK_CACHE_ROOT = tempTempDir + const url = 'http://example.com/corrupt.json' + const { origin, pathname } = new URL(url) + nock(origin).get(pathname).reply(200, { cool: true }) + await getRemoteJSON(url, {}) + + // Make every file in the cache directory an empty file + for (const file of fs.readdirSync(tempTempDir)) { + fs.writeFileSync(path.join(tempTempDir, file), '{"not:JSON{') + } + + cache.delete(url) + // If we don't do this, nock will fail because a second network + // request became necessary. + nock(origin).get(pathname).reply(200, { cool: true }) + + const data = await getRemoteJSON(url, {}) + expect(data.cool).toBe(true) + }) + + test('not-actually JSON despite URL', async () => { + const url = 'http://example.com/might-look-like.json' + const { origin, pathname } = new URL(url) + nock(origin).get(pathname).reply(200, 'here', { + 'Content-Type': 'text/html', + }) + await expect(getRemoteJSON(url, {})).rejects.toThrowError(/resulted in a non-JSON response/) + }) +})