From c924b71a5c4a7169351280c7094f4de80bdb719e Mon Sep 17 00:00:00 2001
From: Dream Hunter <dreamhunter2333@gmail.com>
Date: Fri, 12 Jun 2026 00:41:19 +0800
Subject: [PATCH] fix: compact HTML before AI email extraction (#1057)

fix: compact html before ai extraction
---
 CHANGELOG.md                   |  2 ++
 CHANGELOG_EN.md                |  2 ++
 worker/src/email/ai_extract.ts | 61 +++++++++++++++++++++++++++++++++-
 3 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 27a0f3e0..5aa9db51 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,8 @@
 
 ### Bug Fixes
 
+- fix: |AI 提取| HTML-only 邮件在发送给 Workers AI 前会先压缩为可读文本，避免样式模板过长导致验证码位于 4000 字截断之后而无法识别
+
 ### Improvements
 
 ## v1.9.0
diff --git a/CHANGELOG_EN.md b/CHANGELOG_EN.md
index 37e8190d..0dcde639 100644
--- a/CHANGELOG_EN.md
+++ b/CHANGELOG_EN.md
@@ -12,6 +12,8 @@
 
 ### Bug Fixes
 
+- fix: |AI Extract| Convert HTML-only mail bodies into compact readable text before sending them to Workers AI, preventing long templates from pushing verification codes past the 4000-character truncation window
+
 ### Improvements
 
 ## v1.9.0
diff --git a/worker/src/email/ai_extract.ts b/worker/src/email/ai_extract.ts
index 28eacf4e..2d891ab2 100644
--- a/worker/src/email/ai_extract.ts
+++ b/worker/src/email/ai_extract.ts
@@ -166,6 +166,65 @@ async function saveExtractMetadata(
     }
 }
 
+/**
+ * Decodes numeric (`&#65;`, `&#x41;`) and a few named HTML entities in `text`.
+ * References that are unknown or outside 0..0x10FFFF are returned unchanged.
+ */
+function decodeHtmlEntities(text: string): string {
+    // A Map (not an object literal) so names like "constructor" cannot resolve to Object.prototype members.
+    const namedEntities = new Map<string, string>([
+        ['amp', '&'],
+        ['lt', '<'],
+        ['gt', '>'],
+        ['quot', '"'],
+        ['apos', "'"],
+        ['nbsp', ' '],
+    ]);
+
+    // Returns the character for `value`, or `fallback` when it is NaN or outside the code point range.
+    const fromCodePointOr = (value: number, fallback: string): string =>
+        Number.isFinite(value) && value >= 0 && value <= 0x10ffff ? String.fromCodePoint(value) : fallback;
+
+    return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]+);/gi, (match: string, entity: string) => {
+        const name = entity.toLowerCase();
+        if (name.startsWith('#x')) {
+            return fromCodePointOr(Number.parseInt(name.slice(2), 16), match);
+        }
+        if (name.startsWith('#')) {
+            return fromCodePointOr(Number.parseInt(name.slice(1), 10), match);
+        }
+        return namedEntities.get(name) ?? match;
+    });
+}
+
+/** Reduces an HTML body to compact plain text (link text followed by its href), then decodes entities. */
+function htmlToTextForAi(html: string): string {
+    const withoutMarkup = html
+        // `\b` stops "head" from matching <header ...>, which could swallow body text up to a later </head>.
+        .replace(/<\s*(script|style|head|svg)\b[^>]*>[\s\S]*?<\s*\/\s*\1\s*>/gi, ' ')
+        .replace(/<!--[\s\S]*?-->/g, ' ')
+        .replace(/<a\b[^>]*\bhref=(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, ' $3 $2 ')
+        .replace(/<\s*br\s*\/?>/gi, '\n')
+        .replace(/<\/\s*(p|div|tr|td|th|li|table|section|article|header|footer|h[1-6])\s*>/gi, '\n')
+        .replace(/<[^>]+>/g, ' ');
+    return decodeHtmlEntities(withoutMarkup) // decoded after tag stripping so escaped markup stays as text
+        .replace(/[ \t\r\f\v]+/g, ' ')
+        .replace(/\n\s+/g, '\n') // also collapses runs of blank lines into a single newline
+        .trim();
+}
+
+/**
+ * Picks the content used for extraction: the plain-text part when present, otherwise the HTML part
+ * compacted by htmlToTextForAi (falling back to the raw HTML when compaction yields an empty string).
+ */
+function getEmailContentForExtract(parsedEmail: Awaited<ReturnType<typeof commonParseMail>>): string {
+    const plainText = parsedEmail?.text;
+    const rawHtml = parsedEmail?.html;
+    if (plainText) return plainText;
+    if (!rawHtml) return "";
+    return htmlToTextForAi(rawHtml) || rawHtml;
+}
+
 /**
  * Main extraction function
  * Checks if extraction is enabled, processes the email content, and saves to database.
@@ -219,7 +278,7 @@ export async function extractEmailInfo(
 
         // Parse email to get content (shared by the AI path and the regex fallback)
         const parsedEmail = await commonParseMail(parsedEmailContext);
-        const emailContent = parsedEmail?.text || parsedEmail?.html || "";
+        const emailContent = getEmailContentForExtract(parsedEmail);
 
         if (!emailContent) {
             return null;