import rateLimit from 'express-rate-limit'

import statsd from '../lib/statsd.js'
import { noCacheControl } from './cache-control.js'

// Length of one rate limit window, in seconds.
const EXPIRES_IN_AS_SECONDS = 60

// Maximum number of requests per key per window. Taken from the
// RATE_LIMIT_MAX environment variable when it's set (and non-empty),
// otherwise 100. An unparsable value is a configuration error, so fail
// loudly at import time.
const MAX = process.env.RATE_LIMIT_MAX ? Number.parseInt(process.env.RATE_LIMIT_MAX, 10) : 100
if (Number.isNaN(MAX)) {
  throw new Error(`process.env.RATE_LIMIT_MAX (${process.env.RATE_LIMIT_MAX}) not a number`)
}

// Matches an IPv4 address immediately followed by `:port`, capturing only
// the address part. E.g. `75.40.90.27:56675` captures `75.40.90.27`.
const ipv4WithPort = /^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):\d{1,5}$/

export default rateLimit({
  // 1 minute
  windowMs: EXPIRES_IN_AS_SECONDS * 1000,
  // limit each IP to X requests per windowMs
  // We currently have about 25 instances in production. That's routed
  // in Azure to spread the requests to each healthy instance.
  // So, the true rate limit, per `windowMs`, is this number multiplied
  // by the current number of instances.
  max: MAX,

  // Return rate limit info in the `RateLimit-*` headers
  standardHeaders: true,
  // Disable the `X-RateLimit-*` headers
  legacyHeaders: false,

  keyGenerator: (req) => {
    const { ip } = req
    // Only strings can be normalized. If `req.ip` is anything else
    // (presumably `undefined` when the connection is already gone; to be
    // confirmed), hand it back untouched rather than throwing a TypeError
    // from inside the middleware.
    if (typeof ip !== 'string') return ip

    // In our Azure preview environment, with the way the proxying works,
    // the `x-forwarded-for` is always the origin IP with a port number
    // attached. E.g. `75.40.90.27:56675, 169.254.129.1`
    // This port number portion changes with every request, so we strip it.
    return ip.replace(ipv4WithPort, '$1')
  },

  skip: (req) => {
    // Always ignore these
    if (req.path === '/api/events') return true

    // If the query string looks totally regular, then skip
    if (!isSuspiciousRequest(req)) return true

    // This is so we can get a sense of how many requests are being
    // treated as suspicious. They don't necessarily get rate limited.
    const tags = [
      `url:${req.url}`,
      `ip:${req.ip}`,
      `path:${req.path}`,
      `qs:${req.url.split('?')[1]}`,
    ]
    statsd.increment('middleware.rate_limit_dont_skip', 1, tags)

    return false
  },

  // Called when a request is rejected for exceeding the limit: count it,
  // make sure the rejection isn't cached, and respond with the status code
  // and message supplied in `options`.
  handler: (req, res, next, options) => {
    const tags = [`url:${req.url}`, `ip:${req.ip}`, `path:${req.path}`]
    statsd.increment('middleware.rate_limit', 1, tags)
    noCacheControl(res)
    res.status(options.statusCode).send(options.message)
  },
})

// Query string keys that are considered legitimate for paths that start
// with the given prefix.
const RECOGNIZED_KEYS_BY_PREFIX = {
  '/_next/data/': ['versionId', 'productId', 'restPage', 'apiVersion', 'category', 'subcategory'],
  '/api/search': ['query', 'language', 'version', 'page', 'product', 'autocomplete', 'limit'],
  '/api/anchor-redirect': ['hash', 'path'],
  '/api/webhooks': ['category', 'version'],
  '/api/pageinfo': ['pathname'],
}

// Query string keys that are considered legitimate on the search page,
// i.e. paths whose second segment is `search`. E.g. `/fr/search`.
const RECOGNIZED_KEYS = {
  search: ['query', 'page'],
}

// Query string keys that are considered legitimate on any other path.
const MISC_KEYS = [
  // Learning track pages
  'learn',
  'learnProduct',
  // Platform picker
  'platform',
  // Tool picker
  'tool',
  // When apiVersion isn't the only one. E.g. ?apiVersion=XXX&tool=vscode
  'apiVersion',
  // Lowercase for rest pages
  'apiversion',
]

/**
 * Return true if at least one of the keys is not in the list of
 * recognized keys.
 *
 * @param {string[]} keys
 * @param {string[]} recognizedKeys
 * @returns {boolean}
 */
function hasUnrecognizedKeys(keys, recognizedKeys) {
  return keys.some((key) => !recognizedKeys.includes(key))
}

/**
 * Return true if the request looks like a DoS request. I.e. suspicious.
 *
 * We've seen lots of requests slip past the CDN and its edge rate limiter
 * that clearly are not realistic URLs that you'd get in a browser.
 * For example `?action=octrh&api=h9vcd&immagine=jzs3c&lang=xb0kp&m=rrmek`
 * There are certain URLs that have query strings that are valid, but
 * have one or more query string keys. In particular the `/api/..` endpoints.
 *
 * Remember, just because this function might return true, it doesn't mean
 * the request will be rate limited. It has to be both suspicious AND
 * have lots and lots of requests.
 *
 * @param {Request} req
 * @returns {boolean}
 */
function isSuspiciousRequest(req) {
  const keys = Object.keys(req.query)

  // Since this function can only speculate by query strings (at the
  // moment), if the URL doesn't have any query strings it's not suspicious.
  if (!keys.length) {
    return false
  }

  // E.g. `/en/rest/actions?apiVersion=YYYY-MM-DD`
  if (keys.length === 1 && keys[0] === 'apiVersion') return false

  // Now check what query string keys are *left* based on a list of
  // recognized keys per different prefixes. The first matching prefix
  // decides the outcome.
  for (const [prefix, recognizedKeys] of Object.entries(RECOGNIZED_KEYS_BY_PREFIX)) {
    if (req.path.startsWith(prefix)) {
      return hasUnrecognizedKeys(keys, recognizedKeys)
    }
  }

  // E.g. `/fr/search?query=foo`
  if (req.path.split('/')[2] === 'search') {
    return hasUnrecognizedKeys(keys, RECOGNIZED_KEYS.search)
  }

  return hasUnrecognizedKeys(keys, MISC_KEYS)
}