
better search scraping error handling (don't fail on single page) (#58004)

Evan Bonsignori
2025-10-17 11:49:27 -07:00
committed by GitHub
parent 0d415645a9
commit 2f78652f55
4 changed files with 269 additions and 53 deletions
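In outline: buildRecords no longer rejects (and thus fails the whole index build) when a single page fails to scrape. It now resolves with both the successful records and a list of per-page failures; scrapeIntoIndexJson aggregates those failures and writes them to failures-summary.json; and the workflow reads that file to post a Slack warning instead of failing the run. A minimal sketch of the new caller contract, using only names that appear in the diff below:

  // Sketch, not code from the commit: the new result shape of buildRecords
  const { records, failedPages } = await buildRecords(
    indexName,
    indexablePages,
    indexVersion,
    languageCode,
    redirects,
    config,
  )
  // records: pages that scraped cleanly and will be indexed
  // failedPages: Array<{ url?, relativePath?, error, errorType }> — reported, not thrown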


@@ -180,6 +180,19 @@ jobs:
           ls -lh /tmp/records
+      - name: Check for scraping failures
+        id: check-failures
+        run: |
+          if [ -f /tmp/records/failures-summary.json ]; then
+            FAILED_PAGES=$(jq -r '.totalFailedPages' /tmp/records/failures-summary.json)
+            echo "failed_pages=$FAILED_PAGES" >> $GITHUB_OUTPUT
+            echo "has_failures=true" >> $GITHUB_OUTPUT
+            echo "⚠️ Warning: $FAILED_PAGES page(s) failed to scrape"
+          else
+            echo "has_failures=false" >> $GITHUB_OUTPUT
+            echo "✅ All pages scraped successfully"
+          fi
       - name: Check that Elasticsearch is accessible
         run: |
           curl --fail --retry-connrefused --retry 5 -I ${{ env.ELASTICSEARCH_URL }}
@@ -211,6 +224,19 @@ jobs:
           FASTLY_SURROGATE_KEY: api-search:${{ matrix.language }}
         run: npm run purge-fastly-edge-cache
+      - name: Alert on scraping failures
+        if: ${{ steps.check-failures.outputs.has_failures == 'true' && github.event_name != 'workflow_dispatch' }}
+        uses: ./.github/actions/slack-alert
+        with:
+          slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
+          slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
+          message: |
+            :warning: ${{ steps.check-failures.outputs.failed_pages }} page(s) failed to scrape for general search indexing (language: ${{ matrix.language }})
+            The indexing completed but some pages could not be scraped. This may affect search results for those pages.
+            Workflow: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
       - uses: ./.github/actions/slack-alert
         if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
         with:
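For reference, the failures-summary.json that the check-failures step reads with jq is written by scrapeIntoIndexJson (the last file in this commit). Its shape, with purely illustrative values, would be roughly:

  {
    "totalFailedPages": 2,
    "failures": [
      {
        "indexName": "(name returned by getElasticSearchIndex)",
        "languageCode": "en",
        "indexVersion": "free-pro-team@latest",
        "failures": [
          {
            "relativePath": "path/to/page.md",
            "error": "HTTP 503: Service Unavailable",
            "errorType": "HTTP 503"
          },
          {
            "url": "/en/some/page",
            "error": "The operation timed out",
            "errorType": "Timeout"
          }
        ]
      }
    ]
  }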


@@ -48,6 +48,18 @@ const MIN_TIME = parseInt(process.env.BUILD_RECORDS_MIN_TIME || '200', 10)
 // when multiple docs match on a certain keyword(s).
 const FORCE_0_POPULARITY_PRODUCTS = new Set(['contributing'])

+interface FailedPage {
+  url?: string
+  relativePath?: string
+  error: string
+  errorType: string
+}
+
+export interface BuildRecordsResult {
+  records: Record[]
+  failedPages: FailedPage[]
+}
+
 export default async function buildRecords(
   indexName: string,
   indexablePages: Page[],
@@ -55,7 +67,7 @@
   languageCode: string,
   redirects: Redirects,
   config: Config = {} as Config,
-): Promise<Record[]> {
+): Promise<BuildRecordsResult> {
   // Determine the page version from the index version
   const pageVersion = getAllVersionsKeyFromIndexVersion(indexVersion)
@@ -96,6 +108,9 @@
   const hasPopularPages = Object.keys(popularPages).length > 0

+  // Track failed pages
+  const failedPages: FailedPage[] = []
+
   const waiter = domwaiter(permalinks, { maxConcurrent: MAX_CONCURRENT, minTime: MIN_TIME })
     .on('page', (page) => {
       if (!noMarkers) process.stdout.write(pageMarker)
@@ -114,23 +129,105 @@
       records.push(newRecord)
     })
     .on('error', (err) => {
-      if (err instanceof HTTPError && !err.response.ok) {
-        console.log(
-          '\n' +
-            boxen(chalk.bold(err.request.requestUrl?.pathname), {
-              title: chalk.red('The URL it failed on was'),
-              padding: 1,
-              borderColor: 'red',
-            }) +
-            '\n',
-        )
+      // Track the failure
+      const url = (err as any).url
+      const relativePath = (err as any).relativePath
+
+      // Check for HTTPError by name since it may come from a different module
+      if ((err instanceof HTTPError || err?.name === 'HTTPError') && (err as any).response) {
+        const httpErr = err as any
+        failedPages.push({
+          url: httpErr.request?.requestUrl?.pathname || url,
+          relativePath,
+          error: err.message,
+          errorType: `HTTP ${httpErr.response?.statusCode || 'Error'}`,
+        })
+        if (!noMarkers) process.stdout.write(chalk.red('✗'))
+      } else if (err instanceof Error) {
+        // Enhanced error handling for timeout and network errors
+        const errorType = (err.cause as any)?.code || err.name
+        const isTimeout =
+          errorType === 'UND_ERR_HEADERS_TIMEOUT' ||
+          errorType === 'UND_ERR_CONNECT_TIMEOUT' ||
+          err.message.includes('timed out')
+        failedPages.push({
+          url,
+          relativePath,
+          error: err.message,
+          errorType: isTimeout ? 'Timeout' : errorType || 'Unknown Error',
+        })
+        if (!noMarkers) process.stdout.write(chalk.red('✗'))
       } else {
         console.error(err)
+        failedPages.push({
+          url,
+          relativePath,
+          error: String(err),
+          errorType: 'Unknown Error',
+        })
+        if (!noMarkers) process.stdout.write(chalk.red('✗'))
       }
     })

-  return eventToPromise(waiter, 'done').then(() => {
+  // Wait for 'done' event but ignore 'error' events (they're handled by the error listener above)
+  return eventToPromise(waiter, 'done', { ignoreErrors: true }).then(() => {
     console.log('\nrecords in index: ', records.length)
-    return records
+
+    // Report failed pages if any
+    if (failedPages.length > 0) {
+      console.log(
+        '\n' +
+          boxen(
+            chalk.bold.red(`${failedPages.length} page(s) failed to scrape\n\n`) +
+              failedPages
+                .slice(0, 10) // Show first 10 failures
+                .map((failure, idx) => {
+                  return (
+                    chalk.gray(`${idx + 1}. `) +
+                    chalk.yellow(failure.errorType) +
+                    '\n' +
+                    (failure.relativePath
+                      ? chalk.cyan(`   Path: `) + failure.relativePath + '\n'
+                      : '') +
+                    (failure.url ? chalk.cyan(`   URL: `) + failure.url + '\n' : '') +
+                    chalk.gray(`   Error: ${failure.error}`)
+                  )
+                })
+                .join('\n\n') +
+              (failedPages.length > 10
+                ? `\n\n${chalk.gray(`... and ${failedPages.length - 10} more`)}`
+                : ''),
+            {
+              title: chalk.red('⚠ Failed Pages'),
+              padding: 1,
+              borderColor: 'yellow',
+            },
+          ) +
+          '\n',
+      )
+
+      // Log suggestion
+      console.log(
+        chalk.yellow(
+          `💡 Tip: These failures won't stop the scraping process. The script will continue with the remaining pages.`,
+        ),
+      )
+      if (failedPages.some((f) => f.errorType === 'Timeout')) {
+        console.log(
+          chalk.gray(
+            `   For timeout errors, try: export BUILD_RECORDS_MAX_CONCURRENT=50 (currently ${MAX_CONCURRENT})`,
+          ),
+        )
+      }
+    }
+
+    return {
+      records,
+      failedPages,
+    }
   })
 }
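The { ignoreErrors: true } passed to eventToPromise is what keeps one failed page from aborting the whole build: without it, the first 'error' event rejects the returned promise. A minimal self-contained sketch of the semantics being relied on here (not the actual event-to-promise implementation):

  import { EventEmitter } from 'node:events'

  function eventToPromiseSketch(
    emitter: EventEmitter,
    event: string,
    opts: { ignoreErrors?: boolean } = {},
  ): Promise<unknown> {
    return new Promise((resolve, reject) => {
      emitter.once(event, resolve)
      // Without ignoreErrors, any 'error' event rejects the promise
      if (!opts.ignoreErrors) emitter.once('error', reject)
    })
  }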


@@ -32,6 +32,15 @@ interface DomWaiterOptions {
 export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {}): EventEmitter {
   const emitter = new EventEmitter()

+  // Add a default no-op error handler to prevent EventEmitter from throwing
+  // when errors are emitted before the caller attaches their error handler
+  // This will be overridden/supplemented by the caller's error handler
+  const defaultErrorHandler = () => {
+    // No-op: prevents EventEmitter from throwing
+    // External handlers will still receive the error
+  }
+  emitter.on('error', defaultErrorHandler)
+
   const defaults = {
     parseDOM: true,
     json: false,
@@ -43,7 +52,12 @@ export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {
   const limiter = new Bottleneck(opts)

   pages.forEach((page) => {
-    limiter.schedule(() => getPage(page, emitter, opts))
+    limiter
+      .schedule(() => getPage(page, emitter, opts))
+      .catch((err) => {
+        // Catch any unhandled promise rejections
+        emitter.emit('error', err)
+      })
   })

   limiter.on('idle', () => {
@@ -58,46 +72,87 @@ export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {
 }

 async function getPage(page: Permalink, emitter: EventEmitter, opts: DomWaiterOptions) {
-  emitter.emit('beforePageLoad', page)
+  // Wrap everything in a try-catch to ensure no errors escape
+  try {
+    emitter.emit('beforePageLoad', page)

-  if (opts.json) {
-    try {
-      const response = await fetchWithRetry(page.url!, undefined, { retries: 3, timeout: 60000 })
-      if (!response.ok) {
-        throw new HTTPError(
-          `HTTP ${response.status}: ${response.statusText}`,
-          { ok: response.ok, statusCode: response.status },
-          { requestUrl: { pathname: page.url } },
-        )
-      }
-      const json = await response.json()
-      const pageCopy = Object.assign({}, page, { json })
-      emitter.emit('page', pageCopy)
-    } catch (err) {
-      if (err instanceof Error) {
-        err.message = `Failed to fetch ${page.url}: ${err.message}`
-      }
-      emitter.emit('error', err)
-    }
-  } else {
-    try {
-      const response = await fetchWithRetry(page.url!, undefined, { retries: 3, timeout: 60000 })
-      if (!response.ok) {
-        throw new HTTPError(
-          `HTTP ${response.status}: ${response.statusText}`,
-          { ok: response.ok, statusCode: response.status },
-          { requestUrl: { pathname: page.url } },
-        )
-      }
-      const body = await response.text()
-      const pageCopy = Object.assign({}, page, { body })
-      if (opts.parseDOM) (pageCopy as any).$ = cheerio.load(body)
-      emitter.emit('page', pageCopy)
-    } catch (err) {
-      if (err instanceof Error) {
-        err.message = `Failed to fetch ${page.url}: ${err.message}`
-      }
-      emitter.emit('error', err)
-    }
-  }
+    if (opts.json) {
+      try {
+        const response = await fetchWithRetry(page.url!, undefined, {
+          retries: 3,
+          throwHttpErrors: false,
+          timeout: 60000,
+        })
+        if (!response.ok) {
+          const httpError = new HTTPError(
+            `HTTP ${response.status}: ${response.statusText}`,
+            { ok: response.ok, statusCode: response.status },
+            { requestUrl: { pathname: page.url } },
+          )
+          // Add URL and path info directly to the HTTPError
+          ;(httpError as any).url = page.url
+          ;(httpError as any).relativePath = page.relativePath
+          // Emit error instead of throwing
+          emitter.emit('error', httpError)
+          return // Exit early, don't continue processing
+        }
+        const json = await response.json()
+        const pageCopy = Object.assign({}, page, { json })
+        emitter.emit('page', pageCopy)
+      } catch (err) {
+        // Enhance error with URL information
+        if (err instanceof Error && page.url) {
+          const enhancedError = new Error(err.message, { cause: err.cause })
+          enhancedError.name = err.name
+          enhancedError.stack = err.stack
+          ;(enhancedError as any).url = page.url
+          ;(enhancedError as any).relativePath = page.relativePath
+          emitter.emit('error', enhancedError)
+        } else {
+          emitter.emit('error', err)
+        }
+      }
+    } else {
+      try {
+        const response = await fetchWithRetry(page.url!, undefined, {
+          retries: 3,
+          throwHttpErrors: false,
+          timeout: 60000,
+        })
+        if (!response.ok) {
+          const httpError = new HTTPError(
+            `HTTP ${response.status}: ${response.statusText}`,
+            { ok: response.ok, statusCode: response.status },
+            { requestUrl: { pathname: page.url } },
+          )
+          // Add URL and path info directly to the HTTPError
+          ;(httpError as any).url = page.url
+          ;(httpError as any).relativePath = page.relativePath
+          // Emit error instead of throwing
+          emitter.emit('error', httpError)
+          return // Exit early, don't continue processing
+        }
+        const body = await response.text()
+        const pageCopy = Object.assign({}, page, { body })
+        if (opts.parseDOM) (pageCopy as any).$ = cheerio.load(body)
+        emitter.emit('page', pageCopy)
+      } catch (err) {
+        // Enhance error with URL information
+        if (err instanceof Error && page.url) {
+          const enhancedError = new Error(err.message, { cause: err.cause })
+          enhancedError.name = err.name
+          enhancedError.stack = err.stack
+          ;(enhancedError as any).url = page.url
+          ;(enhancedError as any).relativePath = page.relativePath
+          emitter.emit('error', enhancedError)
+        } else {
+          emitter.emit('error', err)
+        }
+      }
+    }
+  } catch (err) {
+    // Ultimate catch-all to ensure nothing escapes
+    console.error('Unexpected error in getPage:', err)
+    emitter.emit('error', err)
+  }
 }
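The default no-op 'error' handler added above guards against a Node.js EventEmitter rule: emitting 'error' on an emitter with no 'error' listener throws synchronously and can crash the process. A small self-contained illustration of that behavior:

  import { EventEmitter } from 'node:events'

  const em = new EventEmitter()

  // With no 'error' listener attached, the next line would throw:
  // em.emit('error', new Error('boom'))

  // A no-op listener makes the emit safe; other listeners still run.
  em.on('error', () => {})
  em.on('error', (err) => console.error('handled:', (err as Error).message))
  em.emit('error', new Error('boom')) // logs "handled: boom"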


@@ -45,13 +45,21 @@ export default async function scrapeIntoIndexJson({
   })

   let countRecordsTotal = 0
+  let totalFailedPages = 0
+  const allFailures: Array<{
+    indexName: string
+    languageCode: string
+    indexVersion: string
+    failures: Array<{ url?: string; relativePath?: string; error: string; errorType: string }>
+  }> = []
+
   // Build and validate all indices
   for (const languageCode of languagesToBuild) {
     for (const indexVersion of versionsToBuild) {
       const { indexName } = getElasticSearchIndex('generalSearch', indexVersion, languageCode)

       // The page version will be the new version, e.g., free-pro-team@latest, enterprise-server@3.7
-      const records = await buildRecords(
+      const { records, failedPages } = await buildRecords(
         indexName,
         indexablePages,
         indexVersion,
@@ -60,6 +68,17 @@
         config,
       )
       countRecordsTotal += records.length
+
+      if (failedPages.length > 0) {
+        totalFailedPages += failedPages.length
+        allFailures.push({
+          indexName,
+          languageCode,
+          indexVersion,
+          failures: failedPages,
+        })
+      }
+
       const fileWritten = await writeIndexRecords(indexName, records, outDirectory)
       console.log(`wrote records to ${fileWritten}`)
     }
@@ -71,6 +90,25 @@
   console.log(`Took ${chalk.bold(formatSeconds(tookSec))}`)
   const rate = (countRecordsTotal / tookSec).toFixed(1)
   console.log(`Rate ~${chalk.bold(rate)} pages per second.`)
+
+  // Write failures summary to a file for GitHub Actions to read
+  if (totalFailedPages > 0) {
+    const fs = await import('fs')
+    const path = await import('path')
+    const failuresSummaryPath = path.join(outDirectory, 'failures-summary.json')
+    await fs.promises.writeFile(
+      failuresSummaryPath,
+      JSON.stringify(
+        {
+          totalFailedPages,
+          failures: allFailures,
+        },
+        null,
+        2,
+      ),
+    )
+    console.log(`\n${chalk.yellow('⚠')} Wrote failures summary to ${failuresSummaryPath}`)
+  }
 }

 function formatSeconds(seconds: number): string {