const mdastToHTML = require('./mdast-to-html'); // Captures hanzi (pinyin) pairs (hanzi, optional whitespace, then pinyin parentheses) const HANZI_PINYIN_PAIR = '([\u4e00-\u9fff]+)\\s*\\(([^)]+)\\)'; // Matches the BLANK placeholder const BLANK_TOKEN = 'BLANK'; // Matches Chinese and English punctuation const PUNCTUATION = '[,。?!!?,;:;:、]+'; // Matches Latin text with spaces const OTHER_TEXT = '([a-zA-Z\\s]+)'; const HANZI_PINYIN_REGEX = new RegExp( `${HANZI_PINYIN_PAIR}|${BLANK_TOKEN}|${PUNCTUATION}|${OTHER_TEXT}`, 'g' ); /** * Parses all hanzi-pinyin pairs from text * @param {string} text - Text potentially containing multiple hanzi (pinyin) patterns * @returns {Array<{hanzi: string, pinyin: string}>} Array of parsed pairs */ function parseHanziPinyinPairs(text) { const pairs = []; const regex = new RegExp(HANZI_PINYIN_REGEX); let match; while ((match = regex.exec(text)) !== null) { if (match[1] && match[2]) { pairs.push({ hanzi: match[1].trim(), pinyin: match[2].trim() }); } } return pairs; } /** * Custom handler for Chinese inline code to render as ruby elements * Matches hanzi-pinyin pairs, BLANK, and punctuation as separate elements * @param {object} state - The state object from mdast-util-to-hast * @param {object} node - The inlineCode node * @returns {object|Array} Hast element node or array of nodes */ function chineseInlineCodeHandler(state, node) { const rubyPairs = parseHanziPinyinPairs(node.value); if (rubyPairs.length > 0) { const matches = [...node.value.matchAll(HANZI_PINYIN_REGEX)]; const nodes = matches.map(fullMatch => { if (fullMatch[1] && fullMatch[2]) { return { type: 'element', tagName: 'ruby', properties: {}, children: [ { type: 'text', value: fullMatch[1].trim() }, { type: 'element', tagName: 'rp', properties: {}, children: [{ type: 'text', value: '(' }] }, { type: 'element', tagName: 'rt', properties: {}, children: [{ type: 'text', value: fullMatch[2].trim() }] }, { type: 'element', tagName: 'rp', properties: {}, children: [{ type: 'text', value: ')' }] } ] }; } // Other captures (BLANK, punctuation, other text including spaces) should preserve exactly return { type: 'text', value: fullMatch[0] }; }); return nodes.length === 1 ? nodes[0] : nodes; } // If static text, return code return { type: 'element', tagName: 'span', properties: { className: 'highlighted-text' }, children: [{ type: 'text', value: node.value }] }; } /** * Custom handler for inline code to render as span elements * @param {object} state - The state object from mdast-util-to-hast * @param {object} node - The inlineCode node * @returns {object} Hast element node */ function spanInlineCodeHandler(state, node) { return { type: 'element', tagName: 'span', properties: { className: 'highlighted-text' }, children: [{ type: 'text', value: node.value }] }; } const spanOrRubyOptions = { handlers: { inlineCode: chineseInlineCodeHandler } }; const spanOptions = { handlers: { inlineCode: spanInlineCodeHandler } }; const createMdastToHtml = lang => { if (lang === 'zh-CN') { return x => mdastToHTML(x, spanOrRubyOptions); } else if (lang === 'en-US' || lang === 'es') { return x => mdastToHTML(x, spanOptions); } else { return mdastToHTML; } }; module.exports = { parseHanziPinyinPairs, createMdastToHtml };