better search scraping error handling (don't fail on single page) (#58004)
.github/workflows/index-general-search.yml (vendored, 26 changed lines)
@@ -180,6 +180,19 @@ jobs:
           ls -lh /tmp/records
 
+      - name: Check for scraping failures
+        id: check-failures
+        run: |
+          if [ -f /tmp/records/failures-summary.json ]; then
+            FAILED_PAGES=$(jq -r '.totalFailedPages' /tmp/records/failures-summary.json)
+            echo "failed_pages=$FAILED_PAGES" >> $GITHUB_OUTPUT
+            echo "has_failures=true" >> $GITHUB_OUTPUT
+            echo "⚠️ Warning: $FAILED_PAGES page(s) failed to scrape"
+          else
+            echo "has_failures=false" >> $GITHUB_OUTPUT
+            echo "✅ All pages scraped successfully"
+          fi
+
       - name: Check that Elasticsearch is accessible
         run: |
           curl --fail --retry-connrefused --retry 5 -I ${{ env.ELASTICSEARCH_URL }}
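The new step reads a summary file written by the scrape script (see the scrapeIntoIndexJson hunk at the bottom of this commit). For reference, a sketch of the JSON shape the jq query assumes; the `FailuresSummary` name is illustrative, the commit defines no such type:

```typescript
// Shape of /tmp/records/failures-summary.json as written by scrapeIntoIndexJson.
interface FailuresSummary {
  totalFailedPages: number // read by: jq -r '.totalFailedPages'
  failures: Array<{
    indexName: string
    languageCode: string
    indexVersion: string
    failures: Array<{ url?: string; relativePath?: string; error: string; errorType: string }>
  }>
}
```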
@@ -211,6 +224,19 @@ jobs:
           FASTLY_SURROGATE_KEY: api-search:${{ matrix.language }}
         run: npm run purge-fastly-edge-cache
 
+      - name: Alert on scraping failures
+        if: ${{ steps.check-failures.outputs.has_failures == 'true' && github.event_name != 'workflow_dispatch' }}
+        uses: ./.github/actions/slack-alert
+        with:
+          slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
+          slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
+          message: |
+            :warning: ${{ steps.check-failures.outputs.failed_pages }} page(s) failed to scrape for general search indexing (language: ${{ matrix.language }})
+
+            The indexing completed but some pages could not be scraped. This may affect search results for those pages.
+
+            Workflow: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
       - uses: ./.github/actions/slack-alert
         if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
         with:
@@ -48,6 +48,18 @@ const MIN_TIME = parseInt(process.env.BUILD_RECORDS_MIN_TIME || '200', 10)
 // when multiple docs match on a certain keyword(s).
 const FORCE_0_POPULARITY_PRODUCTS = new Set(['contributing'])
 
+interface FailedPage {
+  url?: string
+  relativePath?: string
+  error: string
+  errorType: string
+}
+
+export interface BuildRecordsResult {
+  records: Record[]
+  failedPages: FailedPage[]
+}
+
 export default async function buildRecords(
   indexName: string,
   indexablePages: Page[],
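A hedged sketch of a caller consuming the new result shape (the import path and wrapper name are assumptions; the call itself mirrors the scrapeIntoIndexJson hunk further down):

```typescript
import buildRecords, { BuildRecordsResult } from './build-records' // path assumed

// buildRecords now resolves with records plus failures instead of rejecting
// on the first bad page, so callers inspect both halves of the result.
async function buildAndReport(...args: Parameters<typeof buildRecords>) {
  const { records, failedPages }: BuildRecordsResult = await buildRecords(...args)
  console.log(`${records.length} records built, ${failedPages.length} pages failed`)
  return failedPages
}
```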
@@ -55,7 +67,7 @@ export default async function buildRecords(
   languageCode: string,
   redirects: Redirects,
   config: Config = {} as Config,
-): Promise<Record[]> {
+): Promise<BuildRecordsResult> {
   // Determine the page version from the index version
   const pageVersion = getAllVersionsKeyFromIndexVersion(indexVersion)
@@ -96,6 +108,9 @@ export default async function buildRecords(
   const hasPopularPages = Object.keys(popularPages).length > 0
 
+  // Track failed pages
+  const failedPages: FailedPage[] = []
+
   const waiter = domwaiter(permalinks, { maxConcurrent: MAX_CONCURRENT, minTime: MIN_TIME })
     .on('page', (page) => {
       if (!noMarkers) process.stdout.write(pageMarker)
@@ -114,23 +129,105 @@ export default async function buildRecords(
       records.push(newRecord)
     })
     .on('error', (err) => {
-      if (err instanceof HTTPError && !err.response.ok) {
-        console.log(
-          '\n' +
-            boxen(chalk.bold(err.request.requestUrl?.pathname), {
-              title: chalk.red('The URL it failed on was'),
-              padding: 1,
-              borderColor: 'red',
-            }) +
-            '\n',
-        )
+      // Track the failure
+      const url = (err as any).url
+      const relativePath = (err as any).relativePath
+
+      // Check for HTTPError by name since it may come from a different module
+      if ((err instanceof HTTPError || err?.name === 'HTTPError') && (err as any).response) {
+        const httpErr = err as any
+        failedPages.push({
+          url: httpErr.request?.requestUrl?.pathname || url,
+          relativePath,
+          error: err.message,
+          errorType: `HTTP ${httpErr.response?.statusCode || 'Error'}`,
+        })
+
+        if (!noMarkers) process.stdout.write(chalk.red('✗'))
+      } else if (err instanceof Error) {
+        // Enhanced error handling for timeout and network errors
+        const errorType = (err.cause as any)?.code || err.name
+        const isTimeout =
+          errorType === 'UND_ERR_HEADERS_TIMEOUT' ||
+          errorType === 'UND_ERR_CONNECT_TIMEOUT' ||
+          err.message.includes('timed out')
+
+        failedPages.push({
+          url,
+          relativePath,
+          error: err.message,
+          errorType: isTimeout ? 'Timeout' : errorType || 'Unknown Error',
+        })
+
+        if (!noMarkers) process.stdout.write(chalk.red('✗'))
       } else {
-        console.error(err)
+        failedPages.push({
+          url,
+          relativePath,
+          error: String(err),
+          errorType: 'Unknown Error',
+        })
+
+        if (!noMarkers) process.stdout.write(chalk.red('✗'))
       }
     })
 
-  return eventToPromise(waiter, 'done').then(() => {
+  // Wait for 'done' event but ignore 'error' events (they're handled by the error listener above)
+  return eventToPromise(waiter, 'done', { ignoreErrors: true }).then(() => {
     console.log('\nrecords in index: ', records.length)
-    return records
+
+    // Report failed pages if any
+    if (failedPages.length > 0) {
+      console.log(
+        '\n' +
+          boxen(
+            chalk.bold.red(`${failedPages.length} page(s) failed to scrape\n\n`) +
+              failedPages
+                .slice(0, 10) // Show first 10 failures
+                .map((failure, idx) => {
+                  return (
+                    chalk.gray(`${idx + 1}. `) +
+                    chalk.yellow(failure.errorType) +
+                    '\n' +
+                    (failure.relativePath
+                      ? chalk.cyan(` Path: `) + failure.relativePath + '\n'
+                      : '') +
+                    (failure.url ? chalk.cyan(` URL: `) + failure.url + '\n' : '') +
+                    chalk.gray(` Error: ${failure.error}`)
+                  )
+                })
+                .join('\n\n') +
+              (failedPages.length > 10
+                ? `\n\n${chalk.gray(`... and ${failedPages.length - 10} more`)}`
+                : ''),
+            {
+              title: chalk.red('⚠ Failed Pages'),
+              padding: 1,
+              borderColor: 'yellow',
+            },
+          ) +
+          '\n',
+      )
+
+      // Log suggestion
+      console.log(
+        chalk.yellow(
+          `💡 Tip: These failures won't stop the scraping process. The script will continue with the remaining pages.`,
+        ),
+      )
+
+      if (failedPages.some((f) => f.errorType === 'Timeout')) {
+        console.log(
+          chalk.gray(
+            ` For timeout errors, try: export BUILD_RECORDS_MAX_CONCURRENT=50 (currently ${MAX_CONCURRENT})`,
+          ),
+        )
+      }
+    }
+
+    return {
+      records,
+      failedPages,
+    }
   })
 }
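The hunk above passes a new `{ ignoreErrors: true }` option to eventToPromise. That helper's source is not part of this commit; a minimal sketch of the semantics the option implies, assuming the helper previously rejected on the first 'error' event:

```typescript
import { EventEmitter } from 'node:events'

// Sketch only, not the repo's actual helper. Without ignoreErrors, one
// 'error' event rejects the promise and the whole scrape fails; with it,
// only the named event settles the promise and 'error' events are left to
// the caller's .on('error') listener.
function eventToPromise(
  emitter: EventEmitter,
  eventName: string,
  opts: { ignoreErrors?: boolean } = {},
): Promise<unknown[]> {
  return new Promise((resolve, reject) => {
    emitter.once(eventName, (...args) => resolve(args))
    if (!opts.ignoreErrors) emitter.once('error', reject)
  })
}
```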
@@ -32,6 +32,15 @@ interface DomWaiterOptions {
 export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {}): EventEmitter {
   const emitter = new EventEmitter()
 
+  // Add a default no-op error handler to prevent EventEmitter from throwing
+  // when errors are emitted before the caller attaches their error handler
+  // This will be overridden/supplemented by the caller's error handler
+  const defaultErrorHandler = () => {
+    // No-op: prevents EventEmitter from throwing
+    // External handlers will still receive the error
+  }
+  emitter.on('error', defaultErrorHandler)
+
   const defaults = {
     parseDOM: true,
     json: false,
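For context on why the no-op handler is needed: a Node.js EventEmitter with no 'error' listener throws the emitted error synchronously, which would crash the scrape before the caller attaches its own listener. A minimal demonstration:

```typescript
import { EventEmitter } from 'node:events'

const emitter = new EventEmitter()

// With no 'error' listener attached, the next line would throw:
// emitter.emit('error', new Error('boom')) // Uncaught Error: boom

emitter.on('error', () => {}) // the no-op guard from the diff above
emitter.emit('error', new Error('boom')) // now delivered to listeners only
```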
@@ -43,7 +52,12 @@ export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {
   const limiter = new Bottleneck(opts)
 
   pages.forEach((page) => {
-    limiter.schedule(() => getPage(page, emitter, opts))
+    limiter
+      .schedule(() => getPage(page, emitter, opts))
+      .catch((err) => {
+        // Catch any unhandled promise rejections
+        emitter.emit('error', err)
+      })
   })
 
   limiter.on('idle', () => {
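Why the added .catch matters: Bottleneck's schedule() returns the task's promise, so a getPage that rejects would otherwise surface as an unhandled promise rejection, which terminates modern Node processes by default. A minimal sketch:

```typescript
import Bottleneck from 'bottleneck'

const limiter = new Bottleneck({ maxConcurrent: 2 })

limiter
  .schedule(async () => {
    throw new Error('fetch failed')
  })
  .catch((err) => {
    // Without this catch the rejection is unhandled; here it is rerouted
    // to the same channel the emitter's 'error' listeners already watch.
    console.error('routed to handler:', (err as Error).message)
  })
```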
@@ -58,46 +72,87 @@ export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {
 }
 
 async function getPage(page: Permalink, emitter: EventEmitter, opts: DomWaiterOptions) {
+  // Wrap everything in a try-catch to ensure no errors escape
+  try {
   emitter.emit('beforePageLoad', page)
 
   if (opts.json) {
     try {
-      const response = await fetchWithRetry(page.url!, undefined, { retries: 3, timeout: 60000 })
+      const response = await fetchWithRetry(page.url!, undefined, {
+        retries: 3,
+        throwHttpErrors: false,
+        timeout: 60000,
+      })
       if (!response.ok) {
-        throw new HTTPError(
+        const httpError = new HTTPError(
           `HTTP ${response.status}: ${response.statusText}`,
           { ok: response.ok, statusCode: response.status },
           { requestUrl: { pathname: page.url } },
         )
+        // Add URL and path info directly to the HTTPError
+        ;(httpError as any).url = page.url
+        ;(httpError as any).relativePath = page.relativePath
+        // Emit error instead of throwing
+        emitter.emit('error', httpError)
+        return // Exit early, don't continue processing
       }
       const json = await response.json()
       const pageCopy = Object.assign({}, page, { json })
       emitter.emit('page', pageCopy)
     } catch (err) {
-      if (err instanceof Error) {
-        err.message = `Failed to fetch ${page.url}: ${err.message}`
-      }
-      emitter.emit('error', err)
+      // Enhance error with URL information
+      if (err instanceof Error && page.url) {
+        const enhancedError = new Error(err.message, { cause: err.cause })
+        enhancedError.name = err.name
+        enhancedError.stack = err.stack
+        ;(enhancedError as any).url = page.url
+        ;(enhancedError as any).relativePath = page.relativePath
+        emitter.emit('error', enhancedError)
+      } else {
+        emitter.emit('error', err)
+      }
     }
   } else {
     try {
-      const response = await fetchWithRetry(page.url!, undefined, { retries: 3, timeout: 60000 })
+      const response = await fetchWithRetry(page.url!, undefined, {
+        retries: 3,
+        throwHttpErrors: false,
+        timeout: 60000,
+      })
       if (!response.ok) {
-        throw new HTTPError(
+        const httpError = new HTTPError(
           `HTTP ${response.status}: ${response.statusText}`,
           { ok: response.ok, statusCode: response.status },
           { requestUrl: { pathname: page.url } },
         )
+        // Add URL and path info directly to the HTTPError
+        ;(httpError as any).url = page.url
+        ;(httpError as any).relativePath = page.relativePath
+        // Emit error instead of throwing
+        emitter.emit('error', httpError)
+        return // Exit early, don't continue processing
       }
       const body = await response.text()
       const pageCopy = Object.assign({}, page, { body })
       if (opts.parseDOM) (pageCopy as any).$ = cheerio.load(body)
       emitter.emit('page', pageCopy)
     } catch (err) {
-      if (err instanceof Error) {
-        err.message = `Failed to fetch ${page.url}: ${err.message}`
-      }
-      emitter.emit('error', err)
+      // Enhance error with URL information
+      if (err instanceof Error && page.url) {
+        const enhancedError = new Error(err.message, { cause: err.cause })
+        enhancedError.name = err.name
+        enhancedError.stack = err.stack
+        ;(enhancedError as any).url = page.url
+        ;(enhancedError as any).relativePath = page.relativePath
+        emitter.emit('error', enhancedError)
+      } else {
+        emitter.emit('error', err)
+      }
     }
   }
+  } catch (err) {
+    // Ultimate catch-all to ensure nothing escapes
+    console.error('Unexpected error in getPage:', err)
+    emitter.emit('error', err)
+  }
 }
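The four catch blocks above repeat one pattern; extracted here as a standalone sketch (the helper name `withPageInfo` is illustrative, the commit inlines this logic):

```typescript
// Clone the error rather than mutating a possibly shared object, preserve
// name/stack/cause, and attach the page coordinates that buildRecords'
// 'error' listener reads back off the error.
function withPageInfo(err: Error, page: { url?: string; relativePath?: string }): Error {
  const enhanced = new Error(err.message, { cause: err.cause })
  enhanced.name = err.name
  enhanced.stack = err.stack
  ;(enhanced as any).url = page.url
  ;(enhanced as any).relativePath = page.relativePath
  return enhanced
}
```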
@@ -45,13 +45,21 @@ export default async function scrapeIntoIndexJson({
   })
 
   let countRecordsTotal = 0
+  let totalFailedPages = 0
+  const allFailures: Array<{
+    indexName: string
+    languageCode: string
+    indexVersion: string
+    failures: Array<{ url?: string; relativePath?: string; error: string; errorType: string }>
+  }> = []
 
   // Build and validate all indices
   for (const languageCode of languagesToBuild) {
     for (const indexVersion of versionsToBuild) {
       const { indexName } = getElasticSearchIndex('generalSearch', indexVersion, languageCode)
 
       // The page version will be the new version, e.g., free-pro-team@latest, enterprise-server@3.7
-      const records = await buildRecords(
+      const { records, failedPages } = await buildRecords(
         indexName,
         indexablePages,
         indexVersion,
@@ -60,6 +68,17 @@ export default async function scrapeIntoIndexJson({
         config,
       )
       countRecordsTotal += records.length
+
+      if (failedPages.length > 0) {
+        totalFailedPages += failedPages.length
+        allFailures.push({
+          indexName,
+          languageCode,
+          indexVersion,
+          failures: failedPages,
+        })
+      }
+
       const fileWritten = await writeIndexRecords(indexName, records, outDirectory)
       console.log(`wrote records to ${fileWritten}`)
     }
@@ -71,6 +90,25 @@ export default async function scrapeIntoIndexJson({
   console.log(`Took ${chalk.bold(formatSeconds(tookSec))}`)
   const rate = (countRecordsTotal / tookSec).toFixed(1)
   console.log(`Rate ~${chalk.bold(rate)} pages per second.`)
+
+  // Write failures summary to a file for GitHub Actions to read
+  if (totalFailedPages > 0) {
+    const fs = await import('fs')
+    const path = await import('path')
+    const failuresSummaryPath = path.join(outDirectory, 'failures-summary.json')
+    await fs.promises.writeFile(
+      failuresSummaryPath,
+      JSON.stringify(
+        {
+          totalFailedPages,
+          failures: allFailures,
+        },
+        null,
+        2,
+      ),
+    )
+    console.log(`\n${chalk.yellow('⚠')} Wrote failures summary to ${failuresSummaryPath}`)
+  }
 }
 
 function formatSeconds(seconds: number): string {
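Putting the pieces together, an illustrative payload (made-up values) of the failures-summary.json file that the workflow's check-failures step reads:

```typescript
// Values are invented for illustration; the shape matches what
// scrapeIntoIndexJson writes above.
const exampleSummary = {
  totalFailedPages: 2,
  failures: [
    {
      indexName: 'example-general-search-index', // illustrative
      languageCode: 'en',
      indexVersion: 'free-pro-team@latest', // illustrative
      failures: [
        {
          url: '/en/example/page',
          relativePath: 'example/page.md',
          error: 'HTTP 502: Bad Gateway',
          errorType: 'HTTP 502',
        },
        { relativePath: 'other/page.md', error: 'request timed out', errorType: 'Timeout' },
      ],
    },
  ],
}
```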