Files
dify/web/app/components/workflow/skill/utils/file-utils.ts
yyh 7161c3dd80 fix(web): exclude PDF from text-like file detection
PDF files were incorrectly parsed as text because isTextLikeFile
did not exclude PDF after removing it from BINARY_EXTENSIONS.
2026-02-05 17:45:38 +08:00

171 lines
4.1 KiB
TypeScript

import { FileAppearanceTypeEnum } from '@/app/components/base/file-uploader/types'
// Extension groups used for file classification. Every entry is lowercase and
// has no leading dot; lookups against these sets are exact matches.

/** Markdown documents. */
const MARKDOWN_EXTENSIONS = new Set<string>([
  'md',
  'markdown',
  'mdx',
])

/** Structured data, config and source files shown with the "code" icon. */
const CODE_EXTENSIONS = new Set<string>([
  'json',
  'yaml',
  'yml',
  'toml',
  'js',
  'jsx',
  'ts',
  'tsx',
  'py',
  'schema',
])

/** Image formats (note that `gif` and `svg` are both listed here). */
const IMAGE_EXTENSIONS = new Set<string>([
  'png',
  'jpg',
  'jpeg',
  'gif',
  'webp',
  'svg',
  'bmp',
  'ico',
  'tiff',
  'psd',
  'heic',
  'heif',
  'avif',
])

/** Video container formats. */
const VIDEO_EXTENSIONS = new Set<string>([
  'mp4',
  'mov',
  'webm',
  'mpeg',
  'mpg',
  'm4v',
  'avi',
  'mkv',
  'flv',
  'wmv',
  '3gp',
])

/** SQLite database files. */
const SQLITE_EXTENSIONS = new Set<string>([
  'db',
  'sqlite',
  'sqlite3',
])

/** PDF documents, kept in their own set rather than in the binary list. */
const PDF_EXTENSIONS_SET = new Set<string>(['pdf'])
/**
 * Extensions whose content is treated as binary (see `isBinaryFile`), and
 * which `isTextLikeFile` therefore rejects.
 *
 * Images, videos and PDFs are not listed here; they have their own sets and
 * predicates. Entries are lowercase without a leading dot.
 */
const BINARY_EXTENSIONS = new Set([
  // Audio. `amr` and `mpga` are included so that every extension in
  // AUDIO_EXTENSIONS is also recognised as binary; without them those audio
  // files would be classified as text-like.
  'mp3',
  'wav',
  'ogg',
  'flac',
  'm4a',
  'aac',
  'wma',
  'aiff',
  'opus',
  'amr',
  'mpga',
  // Archives and disk images
  'zip',
  'tar',
  'gz',
  'rar',
  '7z',
  'bz2',
  'xz',
  'tgz',
  'tbz2',
  'lz',
  'lzma',
  'cab',
  'iso',
  'dmg',
  // Executables, libraries, compiled artifacts and installers
  'exe',
  'dll',
  'so',
  'dylib',
  'bin',
  'o',
  'obj',
  'class',
  'pyc',
  'pyo',
  'pyd',
  'wasm',
  'app',
  'msi',
  'deb',
  'rpm',
  // Office documents and e-books
  'doc',
  'docx',
  'xls',
  'xlsx',
  'ppt',
  'pptx',
  'odt',
  'ods',
  'odp',
  'rtf',
  'epub',
  'mobi',
  // Fonts
  'ttf',
  'otf',
  'woff',
  'woff2',
  'eot',
  // Databases
  'db',
  'sqlite',
  'sqlite3',
  'mdb',
  'accdb',
  // Application packages
  'jar',
  'war',
  'ear',
  'apk',
  'ipa',
  'aab',
  // Lock files are classified as binary by this list
  'lock',
])
/**
 * Resolve the normalized extension of a file: lowercase, without a leading dot.
 *
 * @param name - File name to derive the extension from when `extension` is
 *   missing or empty.
 * @param extension - Explicit extension; when non-empty it takes precedence
 *   over `name`. A single leading dot is stripped.
 * @returns The normalized extension, or `''` when none can be determined
 *   (no arguments, a name without any dot, or a name ending in a dot).
 */
export function getFileExtension(name?: string, extension?: string): string {
  if (extension)
    return extension.replace(/^\./, '').toLowerCase()
  if (!name)
    return ''

  const lastDot = name.lastIndexOf('.')
  // A name without a dot has no extension. Returning the whole name here
  // would make files literally called "lock", "bin" or "db" look binary.
  if (lastDot === -1)
    return ''

  // Dotfiles such as ".gitignore" still yield "gitignore".
  return name.slice(lastDot + 1).toLowerCase()
}
// Extension groups that are only used to pick a file icon. Entries are
// lowercase and have no leading dot.

/** Audio formats. */
const AUDIO_EXTENSIONS: string[] = [
  'mp3',
  'm4a',
  'wav',
  'amr',
  'mpga',
  'ogg',
  'flac',
  'aac',
  'wma',
  'aiff',
  'opus',
]

/** Spreadsheet formats, including plain CSV. */
const EXCEL_EXTENSIONS: string[] = [
  'xlsx',
  'xls',
  'csv',
]

/** Word-processor documents. */
const WORD_EXTENSIONS: string[] = [
  'doc',
  'docx',
]

/** Presentation decks. */
const PPT_EXTENSIONS: string[] = [
  'ppt',
  'pptx',
]
/**
 * Icon rules in priority order: when an extension appears in more than one
 * group, the rule listed first decides its icon. This is what lets the
 * dedicated `gif` rule take precedence over the generic image rule even
 * though `gif` is also a member of IMAGE_EXTENSIONS.
 */
const ICON_TYPE_RULES: [Iterable<string>, FileAppearanceTypeEnum][] = [
  [['gif'], FileAppearanceTypeEnum.gif],
  [IMAGE_EXTENSIONS, FileAppearanceTypeEnum.image],
  [VIDEO_EXTENSIONS, FileAppearanceTypeEnum.video],
  [AUDIO_EXTENSIONS, FileAppearanceTypeEnum.audio],
  [PDF_EXTENSIONS_SET, FileAppearanceTypeEnum.pdf],
  [MARKDOWN_EXTENSIONS, FileAppearanceTypeEnum.markdown],
  [EXCEL_EXTENSIONS, FileAppearanceTypeEnum.excel],
  [WORD_EXTENSIONS, FileAppearanceTypeEnum.word],
  [PPT_EXTENSIONS, FileAppearanceTypeEnum.ppt],
  [CODE_EXTENSIONS, FileAppearanceTypeEnum.code],
  [SQLITE_EXTENSIONS, FileAppearanceTypeEnum.database],
]

/**
 * Lookup table from normalized extension to icon type, flattened from
 * ICON_TYPE_RULES.
 */
const EXTENSION_TO_ICON_TYPE = new Map<string, FileAppearanceTypeEnum>()
for (const [extensions, iconType] of ICON_TYPE_RULES) {
  for (const extension of extensions) {
    // Keep the first mapping. Passing all entries to `new Map(...)` would let
    // a later duplicate key silently overwrite an earlier, more specific rule.
    if (!EXTENSION_TO_ICON_TYPE.has(extension))
      EXTENSION_TO_ICON_TYPE.set(extension, iconType)
  }
}
/**
 * Pick the icon type for a file.
 *
 * @param name - File name; its last dot-separated segment is used as the
 *   extension when `ext` is not provided.
 * @param ext - Optional explicit extension (a single leading dot is
 *   stripped, case is ignored). `null`, `undefined` and the empty string are
 *   all treated as "not provided".
 * @returns The icon type registered for the extension in
 *   EXTENSION_TO_ICON_TYPE, or `FileAppearanceTypeEnum.document` when the
 *   extension is not registered.
 */
export function getFileIconType(name: string, ext?: string | null): FileAppearanceTypeEnum {
  // Truthiness check rather than `??` alone: an empty-string `ext` must fall
  // back to the file name, matching how getFileExtension treats it.
  const explicitExtension = ext ? ext.replace(/^\./, '').toLowerCase() : undefined
  const extension = explicitExtension ?? name.split('.').pop()?.toLowerCase() ?? ''
  return EXTENSION_TO_ICON_TYPE.get(extension) ?? FileAppearanceTypeEnum.document
}
/**
 * Tell whether an extension denotes a Markdown document.
 *
 * The lookup is an exact match against MARKDOWN_EXTENSIONS, so pass a
 * lowercase extension without a leading dot.
 */
export function isMarkdownFile(extension: string): boolean {
  const isMarkdown = MARKDOWN_EXTENSIONS.has(extension)
  return isMarkdown
}
/**
 * Tell whether an extension is listed in BINARY_EXTENSIONS.
 *
 * The lookup is an exact match, so pass a lowercase extension without a
 * leading dot.
 */
export function isBinaryFile(extension: string): boolean {
  const isBinary = BINARY_EXTENSIONS.has(extension)
  return isBinary
}
/**
 * Tell whether a file can be handled as text.
 *
 * This is defined by exclusion: an extension is text-like unless it is
 * binary, an image, a video or a PDF. Unknown extensions, including the
 * empty string, therefore count as text-like.
 */
export function isTextLikeFile(extension: string): boolean {
  const isNonText = isBinaryFile(extension)
    || isImageFile(extension)
    || isVideoFile(extension)
    || isPdfFile(extension)
  return !isNonText
}
/**
 * Tell whether an extension denotes an image.
 *
 * The lookup is an exact match against IMAGE_EXTENSIONS, so pass a lowercase
 * extension without a leading dot.
 */
export function isImageFile(extension: string): boolean {
  const isImage = IMAGE_EXTENSIONS.has(extension)
  return isImage
}
/**
 * Tell whether an extension denotes a video.
 *
 * The lookup is an exact match against VIDEO_EXTENSIONS, so pass a lowercase
 * extension without a leading dot.
 */
export function isVideoFile(extension: string): boolean {
  const isVideo = VIDEO_EXTENSIONS.has(extension)
  return isVideo
}
/**
 * Tell whether an extension denotes a SQLite database file.
 *
 * The lookup is an exact match against SQLITE_EXTENSIONS, so pass a lowercase
 * extension without a leading dot.
 */
export function isSQLiteFile(extension: string): boolean {
  const isSQLite = SQLITE_EXTENSIONS.has(extension)
  return isSQLite
}
/**
 * Tell whether an extension denotes a PDF document.
 *
 * The lookup is an exact match against PDF_EXTENSIONS_SET, so pass a
 * lowercase extension without a leading dot.
 */
export function isPdfFile(extension: string): boolean {
  const isPdf = PDF_EXTENSIONS_SET.has(extension)
  return isPdf
}
/**
 * Editor language identifier for each supported extension.
 *
 * A Map is used instead of a plain object so that only the keys listed here
 * can match; inherited names such as `constructor` or `toString` cannot.
 */
const LANGUAGE_BY_EXTENSION = new Map<string, string>([
  ['md', 'markdown'],
  ['markdown', 'markdown'],
  ['mdx', 'markdown'],
  ['json', 'json'],
  ['jsonl', 'json'],
  ['yaml', 'yaml'],
  ['yml', 'yaml'],
  ['js', 'javascript'],
  ['jsx', 'javascript'],
  ['ts', 'typescript'],
  ['tsx', 'typescript'],
  ['py', 'python'],
  ['html', 'html'],
  ['css', 'css'],
  ['xml', 'xml'],
  ['sql', 'sql'],
  ['sh', 'shell'],
  ['bash', 'shell'],
])

/**
 * Map a file name to the language identifier used for syntax highlighting.
 *
 * The last dot-separated segment of `name` is lowercased and looked up; a
 * name without a dot is looked up as a whole.
 *
 * @param name - File name, with or without a directory prefix.
 * @returns The language identifier, or `'plaintext'` when the extension is
 *   not recognised.
 */
export function getFileLanguage(name: string): string {
  const extension = name.split('.').pop()?.toLowerCase() ?? ''
  return LANGUAGE_BY_EXTENSION.get(extension) ?? 'plaintext'
}