/*
 * Recovered from a whitespace-collapsed git patch:
 *   commit   c924b71a5c4a7169351280c7094f4de80bdb719e
 *   author   Dream Hunter, Fri, 12 Jun 2026 00:41:19 +0800
 *   subject  fix: compact HTML before AI email extraction (#1057)
 *   files    CHANGELOG.md, CHANGELOG_EN.md, worker/src/email/ai_extract.ts
 *
 * Changelog entry added under "Bug Fixes" (CHANGELOG.md carries the same
 * entry in Chinese):
 *   fix: |AI Extract| Convert HTML-only mail bodies into compact readable text
 *   before sending them to Workers AI, preventing long templates from pushing
 *   verification codes past the 4000-character truncation window
 *
 * The helpers below are the ones the patch inserts into
 * worker/src/email/ai_extract.ts right after saveExtractMetadata().
 * The patch also changes one line inside extractEmailInfo():
 *   - const emailContent = parsedEmail?.text || parsedEmail?.html || "";
 *   + const emailContent = getEmailContentForExtract(parsedEmail);
 *
 * NOTE(review): the patch text had its angle-bracketed fragments stripped
 * (generic arguments and parts of two regular expressions). They are
 * reconstructed here from context -- confirm against the upstream commit.
 * `commonParseMail` is a project symbol that extractEmailInfo() already calls
 * in that file; it is only referenced in a type position below.
 */

/**
 * Named entities understood by decodeHtmlEntities().
 * A Map is used instead of an object literal so that inherited keys such as
 * "constructor" can never be mistaken for an entity name.
 */
const HTML_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

/**
 * Decodes the HTML character references found in `text`.
 *
 * Handles hexadecimal (`&#x41;`) and decimal (`&#65;`) numeric references and
 * the small set of named entities in HTML_NAMED_ENTITIES (matched
 * case-insensitively). Anything that cannot be decoded is left untouched.
 * The input is scanned once, so `&amp;lt;` becomes `&lt;`, not `<`.
 *
 * @param text - Text that may contain HTML character references.
 * @returns The text with every recognised reference replaced.
 */
function decodeHtmlEntities(text: string): string {
  const decodeCodePoint = (value: number, fallback: string): string => {
    // Lone surrogates are not valid scalar values; emitting them would
    // produce a malformed string, so keep the original reference instead.
    const isSurrogate = value >= 0xd800 && value <= 0xdfff;
    if (!Number.isInteger(value) || value < 0 || value > 0x10ffff || isSurrogate) {
      return fallback;
    }
    return String.fromCodePoint(value);
  };

  return text.replace(
    /&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]+);/gi,
    (match: string, entity: string): string => {
      const normalized = entity.toLowerCase();
      if (normalized.startsWith('#x')) {
        return decodeCodePoint(Number.parseInt(normalized.slice(2), 16), match);
      }
      if (normalized.startsWith('#')) {
        return decodeCodePoint(Number.parseInt(normalized.slice(1), 10), match);
      }
      return HTML_NAMED_ENTITIES.get(normalized) ?? match;
    }
  );
}

/**
 * Reduces an HTML mail body to compact plain text.
 *
 * Steps, in order:
 *  1. drop `<script>`, `<style>`, `<head>` and `<svg>` elements with their content;
 *  2. drop HTML comments;
 *  3. rewrite `<a href="URL">label</a>` as `label URL` so the link target survives;
 *  4. turn `<br>` and the closing tags of block-level elements into newlines;
 *  5. replace every remaining tag with a space;
 *  6. decode character references (after tag removal, so `&lt;b&gt;` is kept as text);
 *  7. collapse whitespace: one space between words, one newline between lines.
 *
 * @param html - Raw HTML markup.
 * @returns Trimmed plain text; empty when the markup carries no text.
 */
function htmlToTextForAi(html: string): string {
  const withoutMarkup = html
    // `\b` after the tag name keeps `<header>` from being treated as `<head>`.
    .replace(/<\s*(script|style|head|svg)\b[^>]*>[\s\S]*?<\s*\/\s*\1\s*>/gi, ' ')
    .replace(/<!--[\s\S]*?-->/g, ' ')
    .replace(/<a\b[^>]*\bhref=(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, ' $3 $2 ')
    .replace(/<\s*br\s*\/?>/gi, '\n')
    .replace(/<\/\s*(p|div|tr|td|th|li|table|section|article|header|footer|h[1-6])\s*>/gi, '\n')
    .replace(/<[^>]+>/g, ' ');

  return (
    decodeHtmlEntities(withoutMarkup)
      // U+00A0 is included so `&#160;` collapses the same way `&nbsp;` does.
      .replace(/[ \t\r\f\v\u00a0]+/g, ' ')
      // Removes the space left before a newline, the indentation after it and
      // any blank lines, so consecutive lines are separated by exactly one "\n".
      .replace(/ ?\n\s*/g, '\n')
      .trim()
  );
}

/**
 * Picks the text that is handed to the extraction step for a parsed mail.
 *
 * Preference order:
 *  1. the plain-text part, when it contains something other than whitespace;
 *  2. the HTML part converted with htmlToTextForAi();
 *  3. the raw HTML, when the conversion yields an empty string;
 *  4. whatever the plain-text part held (possibly only whitespace), or "".
 *
 * @param parsedEmail - Result of commonParseMail(); may be null or undefined.
 * @returns The content to extract from, or "" when the mail has no body.
 */
function getEmailContentForExtract(
  parsedEmail: Awaited<ReturnType<typeof commonParseMail>>
): string {
  const text = parsedEmail?.text;
  // A whitespace-only text part must not hide a usable HTML body.
  if (text && text.trim()) {
    return text;
  }

  const html = parsedEmail?.html;
  if (!html) {
    return text || "";
  }

  return htmlToTextForAi(html) || html;
}