const mdastToHTML = require('./mdast-to-html');
// Captures a hanzi (pinyin) pair: group 1 is a run of characters in the range
// U+4E00-U+9FFF, then optional whitespace, then group 2 is the text inside
// ASCII parentheses.
const HANZI_PINYIN_PAIR = '([\u4e00-\u9fff]+)\\s*\\(([^)]+)\\)';
// Matches the literal BLANK placeholder
const BLANK_TOKEN = 'BLANK';
// Matches a run of Chinese and/or English punctuation.
// ASCII marks: , ? ! ; :
// CJK marks (written as escapes so they survive re-encoding of this file):
//   U+3002 ideographic full stop, U+3001 ideographic comma,
//   U+FF0C fullwidth comma, U+FF01 fullwidth exclamation mark,
//   U+FF1F fullwidth question mark, U+FF1B fullwidth semicolon,
//   U+FF1A fullwidth colon.
const PUNCTUATION = '[,?!;:\u3002\u3001\uff0c\uff01\uff1f\uff1b\uff1a]+';
// Matches Latin text with spaces (group 3: ASCII letters and whitespace)
const OTHER_TEXT = '([a-zA-Z\\s]+)';
// Global tokenizer built from the alternatives above, tried in this order.
// Capture groups: 1 = hanzi, 2 = pinyin, 3 = other Latin text. BLANK and
// punctuation matches populate no group.
// The regex carries the `g` flag, so `exec`/`test` advance its `lastIndex`;
// callers should copy it or reset `lastIndex` before scanning.
const HANZI_PINYIN_REGEX = new RegExp(
  `${HANZI_PINYIN_PAIR}|${BLANK_TOKEN}|${PUNCTUATION}|${OTHER_TEXT}`,
  'g'
);
/**
 * Collects every hanzi (pinyin) pair found in a piece of text.
 *
 * Only matches that populate both the hanzi group (1) and the pinyin
 * group (2) of HANZI_PINYIN_REGEX are reported; every other token the
 * pattern matches is ignored. Both captured values are trimmed.
 *
 * @param {string} text - Text potentially containing multiple hanzi (pinyin) patterns
 * @returns {Array<{hanzi: string, pinyin: string}>} Parsed pairs, in order of appearance
 */
function parseHanziPinyinPairs(text) {
  // Scan with a private copy so this call never reads or moves the
  // `lastIndex` of the shared module-level regex.
  const scanner = new RegExp(HANZI_PINYIN_REGEX);
  const found = [];

  for (let hit = scanner.exec(text); hit !== null; hit = scanner.exec(text)) {
    const [, hanzi, pinyin] = hit;
    if (!hanzi || !pinyin) {
      // Not a hanzi/pinyin match (some other alternative matched).
      continue;
    }
    found.push({ hanzi: hanzi.trim(), pinyin: pinyin.trim() });
  }

  return found;
}
/**
* Custom handler for Chinese inline code to render as ruby elements
* Matches hanzi-pinyin pairs, BLANK, and punctuation as separate elements
* @param {object} state - The state object from mdast-util-to-hast
* @param {object} node - The inlineCode node
* @returns {object|Array