// Mirror of https://github.com/Lissy93/web-check.git
import middleware from './_common/middleware.js';
import { httpGet } from './_common/http.js';
import { parseTarget } from './_common/parse-target.js';
import { upstreamError } from './_common/upstream.js';

// Extract User-agent / Allow / Disallow rules from a robots.txt body
const parseRobotsTxt = (content) => {
  const rules = [];
  for (let line of content.split('\n')) {
    line = line.trim();
    const ruleMatch = line.match(/^(Allow|Disallow|User-agent):\s*(\S*)$/i);
    if (ruleMatch) rules.push({ lbl: ruleMatch[1], val: ruleMatch[2] });
  }
  return { robots: rules };
};

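// Example (illustrative): a robots.txt body containing
//   User-agent: *
//   Disallow: /admin
// parses to { robots: [{ lbl: 'User-agent', val: '*' }, { lbl: 'Disallow', val: '/admin' }] }
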
const robotsHandler = async (url) => {
  const { protocol, hostname } = parseTarget(url);
  // Wrap IPv6 literals in brackets so the robots.txt URL stays valid
  const host = hostname.includes(':') ? `[${hostname}]` : hostname;
  try {
    const res = await httpGet(`${protocol}//${host}/robots.txt`);
    const parsed = parseRobotsTxt(res.data || '');
    return parsed.robots.length ? parsed : { skipped: 'No robots.txt rules found for this host' };
  } catch (error) {
    const status = error.response?.status;
    // A 4xx response means the host is reachable but simply has no robots.txt file
    if (status >= 400 && status < 500) {
      return { skipped: 'No robots.txt file present on this host' };
    }
    return upstreamError(error, 'robots.txt fetch');
  }
};
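
// Possible results (illustrative summary of the branches above):
//   { robots: [...] }                                        (one entry per parsed rule)
//   { skipped: 'No robots.txt rules found for this host' }   (fetched, but no rules parsed)
//   { skipped: 'No robots.txt file present on this host' }   (host answered with a 4xx)
// Any other failure is passed to upstreamError(error, 'robots.txt fetch').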

export const handler = middleware(robotsHandler);
export default handler;