better search scraping error handling (don't fail on single page) (#58004)
.github/workflows/index-general-search.yml (vendored, 26 changed lines)
@@ -180,6 +180,19 @@ jobs:
          ls -lh /tmp/records

      - name: Check for scraping failures
        id: check-failures
        run: |
          if [ -f /tmp/records/failures-summary.json ]; then
            FAILED_PAGES=$(jq -r '.totalFailedPages' /tmp/records/failures-summary.json)
            echo "failed_pages=$FAILED_PAGES" >> $GITHUB_OUTPUT
            echo "has_failures=true" >> $GITHUB_OUTPUT
            echo "⚠️ Warning: $FAILED_PAGES page(s) failed to scrape"
          else
            echo "has_failures=false" >> $GITHUB_OUTPUT
            echo "✅ All pages scraped successfully"
          fi

      - name: Check that Elasticsearch is accessible
        run: |
          curl --fail --retry-connrefused --retry 5 -I ${{ env.ELASTICSEARCH_URL }}
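For reference, the `jq -r '.totalFailedPages'` query above reads the summary file that the scrape script writes later in this diff. A minimal sketch of that shape, with field names taken from the `scrapeIntoIndexJson` changes below and purely hypothetical example values:

```ts
// Hypothetical example of /tmp/records/failures-summary.json, matching the
// { totalFailedPages, failures } object written by scrapeIntoIndexJson below.
const exampleFailuresSummary = {
  totalFailedPages: 2,
  failures: [
    {
      indexName: 'github-docs-dotcom-en', // hypothetical index name
      languageCode: 'en',
      indexVersion: 'free-pro-team', // hypothetical version key
      failures: [
        {
          url: '/en/example-page', // hypothetical page
          relativePath: 'content/example-page.md',
          error: 'HTTP 502: Bad Gateway',
          errorType: 'HTTP 502',
        },
        { url: '/en/other-page', error: 'fetch failed', errorType: 'Timeout' },
      ],
    },
  ],
}
```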
@@ -211,6 +224,19 @@ jobs:
          FASTLY_SURROGATE_KEY: api-search:${{ matrix.language }}
        run: npm run purge-fastly-edge-cache

      - name: Alert on scraping failures
        if: ${{ steps.check-failures.outputs.has_failures == 'true' && github.event_name != 'workflow_dispatch' }}
        uses: ./.github/actions/slack-alert
        with:
          slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
          slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
          message: |
            :warning: ${{ steps.check-failures.outputs.failed_pages }} page(s) failed to scrape for general search indexing (language: ${{ matrix.language }})

            The indexing completed but some pages could not be scraped. This may affect search results for those pages.

            Workflow: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

      - uses: ./.github/actions/slack-alert
        if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
        with:
@@ -48,6 +48,18 @@ const MIN_TIME = parseInt(process.env.BUILD_RECORDS_MIN_TIME || '200', 10)
// when multiple docs match on a certain keyword(s).
const FORCE_0_POPULARITY_PRODUCTS = new Set(['contributing'])

interface FailedPage {
  url?: string
  relativePath?: string
  error: string
  errorType: string
}

export interface BuildRecordsResult {
  records: Record[]
  failedPages: FailedPage[]
}

export default async function buildRecords(
  indexName: string,
  indexablePages: Page[],
@@ -55,7 +67,7 @@ export default async function buildRecords(
  languageCode: string,
  redirects: Redirects,
  config: Config = {} as Config,
): Promise<Record[]> {
): Promise<BuildRecordsResult> {
  // Determine the page version from the index version
  const pageVersion = getAllVersionsKeyFromIndexVersion(indexVersion)
@@ -96,6 +108,9 @@ export default async function buildRecords(

  const hasPopularPages = Object.keys(popularPages).length > 0

  // Track failed pages
  const failedPages: FailedPage[] = []

  const waiter = domwaiter(permalinks, { maxConcurrent: MAX_CONCURRENT, minTime: MIN_TIME })
    .on('page', (page) => {
      if (!noMarkers) process.stdout.write(pageMarker)
@@ -114,23 +129,105 @@ export default async function buildRecords(
      records.push(newRecord)
    })
    .on('error', (err) => {
      if (err instanceof HTTPError && !err.response.ok) {
        console.log(
          '\n' +
            boxen(chalk.bold(err.request.requestUrl?.pathname), {
              title: chalk.red('The URL it failed on was'),
              padding: 1,
              borderColor: 'red',
            }) +
            '\n',
        )
      // Track the failure
      const url = (err as any).url
      const relativePath = (err as any).relativePath

      // Check for HTTPError by name since it may come from a different module
      if ((err instanceof HTTPError || err?.name === 'HTTPError') && (err as any).response) {
        const httpErr = err as any
        failedPages.push({
          url: httpErr.request?.requestUrl?.pathname || url,
          relativePath,
          error: err.message,
          errorType: `HTTP ${httpErr.response?.statusCode || 'Error'}`,
        })

        if (!noMarkers) process.stdout.write(chalk.red('✗'))
      } else if (err instanceof Error) {
        // Enhanced error handling for timeout and network errors
        const errorType = (err.cause as any)?.code || err.name
        const isTimeout =
          errorType === 'UND_ERR_HEADERS_TIMEOUT' ||
          errorType === 'UND_ERR_CONNECT_TIMEOUT' ||
          err.message.includes('timed out')

        failedPages.push({
          url,
          relativePath,
          error: err.message,
          errorType: isTimeout ? 'Timeout' : errorType || 'Unknown Error',
        })

        if (!noMarkers) process.stdout.write(chalk.red('✗'))
      } else {
        console.error(err)
        failedPages.push({
          url,
          relativePath,
          error: String(err),
          errorType: 'Unknown Error',
        })

        if (!noMarkers) process.stdout.write(chalk.red('✗'))
      }
    })
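The `(err.cause as any)?.code` check above relies on Node's built-in fetch (undici) attaching a coded cause to network failures, such as `UND_ERR_CONNECT_TIMEOUT` or `UND_ERR_HEADERS_TIMEOUT`. A minimal sketch of the same classification in isolation:

```ts
// Minimal sketch: classify a fetch failure the same way the handler above does.
// Assumes Node 18+ fetch (undici), where network failures surface as an Error
// whose `cause` carries a `code` such as UND_ERR_CONNECT_TIMEOUT.
function classifyFetchError(err: Error): string {
  const code = (err.cause as { code?: string } | undefined)?.code || err.name
  const isTimeout =
    code === 'UND_ERR_HEADERS_TIMEOUT' ||
    code === 'UND_ERR_CONNECT_TIMEOUT' ||
    err.message.includes('timed out')
  return isTimeout ? 'Timeout' : code || 'Unknown Error'
}

// Hypothetical usage:
// fetch(url).catch((err) => console.log(classifyFetchError(err)))
```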
  return eventToPromise(waiter, 'done').then(() => {
  // Wait for 'done' event but ignore 'error' events (they're handled by the error listener above)
  return eventToPromise(waiter, 'done', { ignoreErrors: true }).then(() => {
    console.log('\nrecords in index: ', records.length)
    return records

    // Report failed pages if any
    if (failedPages.length > 0) {
      console.log(
        '\n' +
          boxen(
            chalk.bold.red(`${failedPages.length} page(s) failed to scrape\n\n`) +
              failedPages
                .slice(0, 10) // Show first 10 failures
                .map((failure, idx) => {
                  return (
                    chalk.gray(`${idx + 1}. `) +
                    chalk.yellow(failure.errorType) +
                    '\n' +
                    (failure.relativePath
                      ? chalk.cyan(` Path: `) + failure.relativePath + '\n'
                      : '') +
                    (failure.url ? chalk.cyan(` URL: `) + failure.url + '\n' : '') +
                    chalk.gray(` Error: ${failure.error}`)
                  )
                })
                .join('\n\n') +
              (failedPages.length > 10
                ? `\n\n${chalk.gray(`... and ${failedPages.length - 10} more`)}`
                : ''),
            {
              title: chalk.red('⚠ Failed Pages'),
              padding: 1,
              borderColor: 'yellow',
            },
          ) +
          '\n',
      )

      // Log suggestion
      console.log(
        chalk.yellow(
          `💡 Tip: These failures won't stop the scraping process. The script will continue with the remaining pages.`,
        ),
      )

      if (failedPages.some((f) => f.errorType === 'Timeout')) {
        console.log(
          chalk.gray(
            ` For timeout errors, try: export BUILD_RECORDS_MAX_CONCURRENT=50 (currently ${MAX_CONCURRENT})`,
          ),
        )
      }
    }

    return {
      records,
      failedPages,
    }
  })
}
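The new `{ ignoreErrors: true }` option assumes the local `eventToPromise` helper can be told not to reject when the emitter fires 'error'. That helper is not shown in this diff; a minimal sketch of what such an option might look like, purely as an assumption about the behavior the call site relies on:

```ts
import { EventEmitter } from 'node:events'

// Hypothetical sketch of an eventToPromise helper with an ignoreErrors option.
// The repo's real helper is not part of this diff; this only illustrates the
// contract the buildRecords call site above depends on.
function eventToPromise(
  emitter: EventEmitter,
  event: string,
  { ignoreErrors = false }: { ignoreErrors?: boolean } = {},
): Promise<void> {
  return new Promise((resolve, reject) => {
    emitter.once(event, () => resolve())
    if (!ignoreErrors) {
      // Only reject on 'error' when the caller has not opted out.
      emitter.once('error', reject)
    }
  })
}
```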
@@ -32,6 +32,15 @@ interface DomWaiterOptions {
export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {}): EventEmitter {
  const emitter = new EventEmitter()

  // Add a default no-op error handler to prevent EventEmitter from throwing
  // when errors are emitted before the caller attaches their error handler
  // This will be overridden/supplemented by the caller's error handler
  const defaultErrorHandler = () => {
    // No-op: prevents EventEmitter from throwing
    // External handlers will still receive the error
  }
  emitter.on('error', defaultErrorHandler)

  const defaults = {
    parseDOM: true,
    json: false,
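The no-op handler addresses Node's default behavior: an EventEmitter with no 'error' listener turns an emitted 'error' into a thrown exception that can crash the process. A small, self-contained illustration of that behavior (not code from this repo):

```ts
import { EventEmitter } from 'node:events'

// Without any 'error' listener, emitting 'error' throws synchronously
// (and would crash the process if left unhandled).
const bare = new EventEmitter()
try {
  bare.emit('error', new Error('boom'))
} catch (err) {
  console.log('threw:', (err as Error).message) // threw: boom
}

// With a no-op listener registered (as domwaiter now does), the same emit is
// delivered to listeners instead of being thrown.
const guarded = new EventEmitter()
guarded.on('error', () => {}) // default no-op handler
guarded.on('error', (err: Error) => console.log('handled:', err.message)) // caller's handler
guarded.emit('error', new Error('boom')) // handled: boom
```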
@@ -43,7 +52,12 @@ export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {
  const limiter = new Bottleneck(opts)

  pages.forEach((page) => {
    limiter.schedule(() => getPage(page, emitter, opts))
    limiter
      .schedule(() => getPage(page, emitter, opts))
      .catch((err) => {
        // Catch any unhandled promise rejections
        emitter.emit('error', err)
      })
  })

  limiter.on('idle', () => {
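The `.catch()` added here matters because `limiter.schedule()` returns a promise that rejects when the scheduled function rejects; without a handler, each failed page would surface as an unhandled rejection. A minimal illustration, assuming the `bottleneck` package used above:

```ts
import Bottleneck from 'bottleneck'

// Sketch: a rejected task's promise must be caught, otherwise Node reports an
// unhandled rejection even though the remaining tasks keep running.
const limiter = new Bottleneck({ maxConcurrent: 2, minTime: 100 })

async function flakyTask(id: number): Promise<void> {
  // Hypothetical task that fails for every third id.
  if (id % 3 === 0) throw new Error(`task ${id} failed`)
}

for (let id = 1; id <= 6; id++) {
  limiter
    .schedule(() => flakyTask(id))
    .catch((err) => {
      // Route the failure somewhere useful instead of crashing the run.
      console.warn('scheduled task failed:', (err as Error).message)
    })
}
```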
@@ -58,46 +72,87 @@ export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {
}

async function getPage(page: Permalink, emitter: EventEmitter, opts: DomWaiterOptions) {
  emitter.emit('beforePageLoad', page)
  // Wrap everything in a try-catch to ensure no errors escape
  try {
    emitter.emit('beforePageLoad', page)

  if (opts.json) {
    try {
      const response = await fetchWithRetry(page.url!, undefined, { retries: 3, timeout: 60000 })
      if (!response.ok) {
        throw new HTTPError(
          `HTTP ${response.status}: ${response.statusText}`,
          { ok: response.ok, statusCode: response.status },
          { requestUrl: { pathname: page.url } },
        )
    if (opts.json) {
      try {
        const response = await fetchWithRetry(page.url!, undefined, {
          retries: 3,
          throwHttpErrors: false,
          timeout: 60000,
        })
        if (!response.ok) {
          const httpError = new HTTPError(
            `HTTP ${response.status}: ${response.statusText}`,
            { ok: response.ok, statusCode: response.status },
            { requestUrl: { pathname: page.url } },
          )
          // Add URL and path info directly to the HTTPError
          ;(httpError as any).url = page.url
          ;(httpError as any).relativePath = page.relativePath
          // Emit error instead of throwing
          emitter.emit('error', httpError)
          return // Exit early, don't continue processing
        }
        const json = await response.json()
        const pageCopy = Object.assign({}, page, { json })
        emitter.emit('page', pageCopy)
      } catch (err) {
        // Enhance error with URL information
        if (err instanceof Error && page.url) {
          const enhancedError = new Error(err.message, { cause: err.cause })
          enhancedError.name = err.name
          enhancedError.stack = err.stack
          ;(enhancedError as any).url = page.url
          ;(enhancedError as any).relativePath = page.relativePath
          emitter.emit('error', enhancedError)
        } else {
          emitter.emit('error', err)
        }
      }
      const json = await response.json()
      const pageCopy = Object.assign({}, page, { json })
      emitter.emit('page', pageCopy)
    } catch (err) {
      if (err instanceof Error) {
        err.message = `Failed to fetch ${page.url}: ${err.message}`
      }
    } else {
      try {
        const response = await fetchWithRetry(page.url!, undefined, {
          retries: 3,
          throwHttpErrors: false,
          timeout: 60000,
        })
        if (!response.ok) {
          const httpError = new HTTPError(
            `HTTP ${response.status}: ${response.statusText}`,
            { ok: response.ok, statusCode: response.status },
            { requestUrl: { pathname: page.url } },
          )
          // Add URL and path info directly to the HTTPError
          ;(httpError as any).url = page.url
          ;(httpError as any).relativePath = page.relativePath
          // Emit error instead of throwing
          emitter.emit('error', httpError)
          return // Exit early, don't continue processing
        }
        const body = await response.text()
        const pageCopy = Object.assign({}, page, { body })
        if (opts.parseDOM) (pageCopy as any).$ = cheerio.load(body)
        emitter.emit('page', pageCopy)
      } catch (err) {
        // Enhance error with URL information
        if (err instanceof Error && page.url) {
          const enhancedError = new Error(err.message, { cause: err.cause })
          enhancedError.name = err.name
          enhancedError.stack = err.stack
          ;(enhancedError as any).url = page.url
          ;(enhancedError as any).relativePath = page.relativePath
          emitter.emit('error', enhancedError)
        } else {
          emitter.emit('error', err)
        }
      }
      emitter.emit('error', err)
    }
  } else {
    try {
      const response = await fetchWithRetry(page.url!, undefined, { retries: 3, timeout: 60000 })
      if (!response.ok) {
        throw new HTTPError(
          `HTTP ${response.status}: ${response.statusText}`,
          { ok: response.ok, statusCode: response.status },
          { requestUrl: { pathname: page.url } },
        )
      }
      const body = await response.text()
      const pageCopy = Object.assign({}, page, { body })
      if (opts.parseDOM) (pageCopy as any).$ = cheerio.load(body)
      emitter.emit('page', pageCopy)
    } catch (err) {
      if (err instanceof Error) {
        err.message = `Failed to fetch ${page.url}: ${err.message}`
      }
      emitter.emit('error', err)
    }
  } catch (err) {
    // Ultimate catch-all to ensure nothing escapes
    console.error('Unexpected error in getPage:', err)
    emitter.emit('error', err)
  }
}
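The `HTTPError` used throughout `getPage` is a local class whose definition is not part of this diff. Judging only from its call sites above, `new HTTPError(message, { ok, statusCode }, { requestUrl })`, and from how the build-records handler reads `err.response` and `err.request.requestUrl?.pathname`, its shape is presumably something like the following sketch. This is an assumption for readability, not the repo's actual definition:

```ts
// Assumed shape of the local HTTPError class, inferred from its call sites in
// this diff. The real implementation may differ.
class HTTPError extends Error {
  name = 'HTTPError'
  response: { ok: boolean; statusCode: number }
  request: { requestUrl?: { pathname?: string } }

  constructor(
    message: string,
    response: { ok: boolean; statusCode: number },
    request: { requestUrl?: { pathname?: string } },
  ) {
    super(message)
    this.response = response
    this.request = request
  }
}

// Example consistent with the emit in getPage (hypothetical path):
const err = new HTTPError(
  'HTTP 502: Bad Gateway',
  { ok: false, statusCode: 502 },
  { requestUrl: { pathname: '/en/example-page' } },
)
console.log(err.response.statusCode, err.request.requestUrl?.pathname)
```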
@@ -45,13 +45,21 @@ export default async function scrapeIntoIndexJson({
  })

  let countRecordsTotal = 0
  let totalFailedPages = 0
  const allFailures: Array<{
    indexName: string
    languageCode: string
    indexVersion: string
    failures: Array<{ url?: string; relativePath?: string; error: string; errorType: string }>
  }> = []

  // Build and validate all indices
  for (const languageCode of languagesToBuild) {
    for (const indexVersion of versionsToBuild) {
      const { indexName } = getElasticSearchIndex('generalSearch', indexVersion, languageCode)

      // The page version will be the new version, e.g., free-pro-team@latest, enterprise-server@3.7
      const records = await buildRecords(
      const { records, failedPages } = await buildRecords(
        indexName,
        indexablePages,
        indexVersion,
@@ -60,6 +68,17 @@ export default async function scrapeIntoIndexJson({
        config,
      )
      countRecordsTotal += records.length

      if (failedPages.length > 0) {
        totalFailedPages += failedPages.length
        allFailures.push({
          indexName,
          languageCode,
          indexVersion,
          failures: failedPages,
        })
      }

      const fileWritten = await writeIndexRecords(indexName, records, outDirectory)
      console.log(`wrote records to ${fileWritten}`)
    }
@@ -71,6 +90,25 @@ export default async function scrapeIntoIndexJson({
  console.log(`Took ${chalk.bold(formatSeconds(tookSec))}`)
  const rate = (countRecordsTotal / tookSec).toFixed(1)
  console.log(`Rate ~${chalk.bold(rate)} pages per second.`)

  // Write failures summary to a file for GitHub Actions to read
  if (totalFailedPages > 0) {
    const fs = await import('fs')
    const path = await import('path')
    const failuresSummaryPath = path.join(outDirectory, 'failures-summary.json')
    await fs.promises.writeFile(
      failuresSummaryPath,
      JSON.stringify(
        {
          totalFailedPages,
          failures: allFailures,
        },
        null,
        2,
      ),
    )
    console.log(`\n${chalk.yellow('⚠')} Wrote failures summary to ${failuresSummaryPath}`)
  }
}

function formatSeconds(seconds: number): string {