feat(client,challenge-parser): update fill-in-the-blank to support Chinese (#63741)

This commit is contained in:
Huyen Nguyen
2025-11-25 11:02:22 -08:00
committed by GitHub
parent b6fff6e2b7
commit 33325b9002
24 changed files with 964 additions and 176 deletions

View File

@@ -1,61 +1,94 @@
const mdastToHTML = require('./mdast-to-html');
/**
* Parses Chinese text in format: hanzi (pinyin)
* @param {string} text - Text in format: hanzi (pinyin)
* @returns {{ hanzi: string, pinyin: string } | null} Parsed hanzi and pinyin, or null if not matching
*/
function parseChinesePattern(text) {
const match = text.match(/^(.+?)\s*\((.+?)\)$/);
// Captures one hanzi (pinyin) pair. Group 1 is a run of characters in the
// U+4E00–U+9FFF range; optional whitespace may follow; group 2 is the text
// between the parentheses (one or more characters other than ')').
const HANZI_PINYIN_PAIR = '([\u4e00-\u9fff]+)\\s*\\(([^)]+)\\)';
if (!match) {
return null;
// Matches the literal BLANK placeholder (no capture group)
const BLANK_TOKEN = 'BLANK';
// Matches a run of one or more punctuation marks (no capture group).
// NOTE(review): several characters appear twice in this class; presumably the
// full-width forms were normalised to ASCII somewhere along the way — verify
// against the committed source before relying on it.
const PUNCTUATION = '[,。?!!?,;:;:、]+';
// Captures a run of ASCII letters and/or whitespace, so the spaces between
// tokens are matched by this alternative as well
const OTHER_TEXT = '([a-zA-Z\\s]+)';
// Global tokenizer built from the four patterns, tried in the order listed.
// Because HANZI_PINYIN_PAIR comes first, group 1 is the hanzi and group 2 the
// pinyin; group 3 is the OTHER_TEXT capture. BLANK and punctuation matches
// leave all groups undefined.
const HANZI_PINYIN_REGEX = new RegExp(
`${HANZI_PINYIN_PAIR}|${BLANK_TOKEN}|${PUNCTUATION}|${OTHER_TEXT}`,
'g'
);
/**
* Collects every hanzi (pinyin) pair found in the text, in order of appearance.
* Tokens matched by the other alternatives of HANZI_PINYIN_REGEX (BLANK,
* punctuation, Latin text/whitespace) are skipped.
* @param {string} text - Text potentially containing multiple hanzi (pinyin) patterns
* @returns {Array<{hanzi: string, pinyin: string}>} Array of parsed pairs
*/
function parseHanziPinyinPairs(text) {
const pairs = [];
// Work on a copy so the shared global regex's lastIndex is never advanced here
const regex = new RegExp(HANZI_PINYIN_REGEX);
let match;
while ((match = regex.exec(text)) !== null) {
// Groups 1 and 2 are only populated by the hanzi (pinyin) alternative
if (match[1] && match[2]) {
pairs.push({
hanzi: match[1].trim(),
pinyin: match[2].trim()
});
}
}
// NOTE(review): the next four lines look like the return statement of the
// removed parseChinesePattern, carried over by the diff rendering. If they are
// really present in the file, `match` is null at this point and `return pairs`
// below is unreachable — confirm against the committed source.
return {
hanzi: match[1].trim(),
pinyin: match[2].trim()
};
return pairs;
}
/**
* Custom handler for Chinese inline code to render as ruby elements
* Matches hanzi-pinyin pairs, BLANK, and punctuation as separate elements
* @param {object} state - The state object from mdast-util-to-hast
* @param {object} node - The inlineCode node
* @returns {object} Hast element node
* @returns {object|Array<object>} Hast element node or array of nodes
*/
function chineseInlineCodeHandler(state, node) {
const parsed = parseChinesePattern(node.value);
const rubyPairs = parseHanziPinyinPairs(node.value);
if (parsed) {
return {
type: 'element',
tagName: 'ruby',
properties: {},
children: [
{ type: 'text', value: parsed.hanzi },
{
if (rubyPairs.length > 0) {
const matches = [...node.value.matchAll(HANZI_PINYIN_REGEX)];
const nodes = matches.map(fullMatch => {
if (fullMatch[1] && fullMatch[2]) {
return {
type: 'element',
tagName: 'rp',
tagName: 'ruby',
properties: {},
children: [{ type: 'text', value: '(' }]
},
{
type: 'element',
tagName: 'rt',
properties: {},
children: [{ type: 'text', value: parsed.pinyin }]
},
{
type: 'element',
tagName: 'rp',
properties: {},
children: [{ type: 'text', value: ')' }]
}
]
};
children: [
{ type: 'text', value: fullMatch[1].trim() },
{
type: 'element',
tagName: 'rp',
properties: {},
children: [{ type: 'text', value: '(' }]
},
{
type: 'element',
tagName: 'rt',
properties: {},
children: [{ type: 'text', value: fullMatch[2].trim() }]
},
{
type: 'element',
tagName: 'rp',
properties: {},
children: [{ type: 'text', value: ')' }]
}
]
};
}
// Other captures (BLANK, punctuation, other text including spaces) should preserve exactly
return { type: 'text', value: fullMatch[0] };
});
return nodes.length === 1 ? nodes[0] : nodes;
}
// If static text, return code
return {
type: 'element',
// TODO: change this to span
@@ -75,4 +108,7 @@ const rubyOptions = {
/**
 * Picks the mdast-to-HTML converter for a challenge language.
 * @param {string} lang - Language code of the challenge
 * @returns {Function} For 'zh-CN', a one-argument wrapper that calls
 * mdastToHTML with rubyOptions; for any other value, mdastToHTML itself
 */
const createMdastToHtml = lang =>
  // Strict comparison: only the exact string 'zh-CN' selects the ruby-aware converter
  lang === 'zh-CN' ? x => mdastToHTML(x, rubyOptions) : mdastToHTML;
module.exports = { parseChinesePattern, createMdastToHtml };
// Exposes the hanzi/pinyin pair parser and the language-aware converter factory
module.exports = {
parseHanziPinyinPairs,
createMdastToHtml
};