mirror of
https://github.com/Lissy93/web-check.git
synced 2026-05-13 06:01:02 -04:00
- Sitemap endpoint now recursively expands sitemap-index files - Fixes #165 - Strips :port from target URLs in get-ip, dns, dns-server, ports, mail-config - Fixes #203 - Configurable trust proxy (TRUST_PROXY env) so app works behind Traefik/nginx - Fixes #157 - Tranco rank now correctly says "top 1 million" (was "100 million") - Fixes #257 - Adds engines.node ">=20" so Vercel picks a supported runtime - Re #212 - Raises Vercel maxDuration from 10s to 60s, cutting most 504 timeouts - Re #251 - Re #287 - Bumps axios from 1.4.8 to 1.16, closing 4 high-severity SSRF/DoS CVEs - Re #289 - Fixes mail-config crash where the dns module was awaited as if it were promise-based - Adds a reusable structured-logging util for the API - Bumps a whole bunch of deps, and resolves lots of open npm CVEs
74 lines
2.3 KiB
JavaScript
74 lines
2.3 KiB
JavaScript
import axios from 'axios';
|
|
import xml2js from 'xml2js';
|
|
import middleware from './_common/middleware.js';
|
|
|
|
// Per-request timeout (ms) applied to every outbound HTTP call below.
const HARD_TIMEOUT = 5000;
// Maximum recursion depth when expanding nested sitemap-index files.
const MAX_DEPTH = 3;
// Cap on child sitemaps fetched per index level, to bound fan-out.
const MAX_CHILD_SITEMAPS = 25;
|
|
|
|
// Download one XML sitemap and return xml2js's parsed object form of it.
const fetchSitemap = async (sitemapUrl) => {
  const response = await axios.get(sitemapUrl, { timeout: HARD_TIMEOUT });
  const parser = new xml2js.Parser();
  return parser.parseStringPromise(response.data);
};
|
|
|
|
// Find a sitemap URL listed in robots.txt as a fallback when /sitemap.xml is missing.
// Returns the first non-empty Sitemap directive value, or null when none exists.
const findSitemapInRobots = async (baseUrl) => {
  const robots = await axios.get(`${baseUrl}/robots.txt`, { timeout: HARD_TIMEOUT });
  for (const rawLine of String(robots.data).split('\n')) {
    // trim() tolerates leading whitespace and strips any trailing \r from CRLF files.
    const line = rawLine.trim();
    if (!line.toLowerCase().startsWith('sitemap:')) continue;
    // Take everything after the directive name: handles both "Sitemap: <url>"
    // and the space-less "Sitemap:<url>" form (the old split(/\s+/)[1] missed it).
    const value = line.slice('sitemap:'.length).trim();
    if (value) return value;
    // An empty directive is skipped so a later valid one can still match.
  }
  return null;
};
|
|
|
|
// Recursively expand a sitemap-index into its child url sets.
const expandSitemap = async (parsed, depth) => {
  // Plain urlset documents — or recursion past the depth cap — pass through untouched.
  if (!parsed?.sitemapindex?.sitemap || depth >= MAX_DEPTH) return parsed;

  // Collect up to MAX_CHILD_SITEMAPS child locations from the index.
  const childLocs = [];
  for (const entry of parsed.sitemapindex.sitemap) {
    const loc = entry?.loc?.[0];
    if (loc) childLocs.push(loc);
    if (childLocs.length === MAX_CHILD_SITEMAPS) break;
  }

  // Fetch + expand every child in parallel; a failed fetch becomes an
  // { error, loc } marker instead of aborting the whole expansion.
  const expanded = await Promise.all(
    childLocs.map(async (loc) => {
      let child;
      try {
        child = await fetchSitemap(loc);
      } catch (err) {
        return { error: err.message, loc };
      }
      return expandSitemap(child, depth + 1);
    })
  );

  // Merge all discovered page URLs into a single flat url set.
  const urls = [];
  for (const child of expanded) {
    const childUrls = child?.urlset?.url;
    if (childUrls) urls.push(...childUrls);
  }

  return {
    sitemapindex: parsed.sitemapindex,
    urlset: urls.length ? { url: urls } : undefined,
    sources: childLocs,
  };
};
|
|
|
|
// Endpoint entry point: locate, fetch and recursively expand the site's sitemap.
// Never throws — all failures are shaped into { error } / { skipped } objects.
const sitemapHandler = async (url) => {
  try {
    let parsed;
    try {
      // Try the conventional location first.
      parsed = await fetchSitemap(`${url}/sitemap.xml`);
    } catch (error) {
      // Anything other than a clean 404 is a real failure for the outer catch.
      if (error.response?.status !== 404) throw error;
      // /sitemap.xml is absent — fall back to whatever robots.txt advertises.
      const robotsSitemap = await findSitemapInRobots(url);
      if (!robotsSitemap) return { skipped: 'No sitemap found' };
      parsed = await fetchSitemap(robotsSitemap);
    }
    return await expandSitemap(parsed, 0);
  } catch (error) {
    // axios flags its own timeouts with ECONNABORTED.
    return error.code === 'ECONNABORTED'
      ? { error: `Request timed-out after ${HARD_TIMEOUT}ms` }
      : { error: error.message };
  }
};
|
|
|
|
// Wrap the raw handler with the project's shared API middleware
// (from ./_common/middleware.js) before exporting it for the platform runtime.
export const handler = middleware(sitemapHandler);
export default handler;
|