fix: validate extracted links against full email content

fix: validate AI extracted link domains
2026-07-04 13:51:35 +08:00 · 2026-07-03 23:53:15 +08:00 · 2026-07-03 23:11:09 +08:00
3 changed files with 73 additions and 5 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@

 ### Bug Fixes

+- fix: |AI 提取| 校验 AI 提取出的链接域名必须存在于邮件内容中，避免小模型将验证链接域名替换成临时邮箱域名导致错误跳转（issue #1072）
 - fix: |AI 提取| HTML-only 邮件在发送给 Workers AI 前会先压缩为可读文本，避免样式模板过长导致验证码位于 4000 字截断之后而无法识别
 - fix: |Frontend| 移动端 Header 增加页头内边距，避免标题、菜单按钮与屏幕边缘过近

--- a/CHANGELOG_EN.md
+++ b/CHANGELOG_EN.md
@@ -12,6 +12,7 @@

 ### Bug Fixes

+- fix: |AI Extract| Validate extracted link domains against domains present in the mail content to prevent small models from replacing verification-link domains with the temporary mailbox domain (issue #1072)
 - fix: |AI Extract| Convert HTML-only mail bodies into compact readable text before sending them to Workers AI, preventing long templates from pushing verification codes past the 4000-character truncation window
 - fix: |Frontend| Add mobile Header page padding so the title and menu button no longer sit too close to the screen edge

--- a/worker/src/email/ai_extract.ts
+++ b/worker/src/email/ai_extract.ts
@@ -75,7 +75,8 @@ If the extracted content is in markdown link format [text](url):
 2. **Single Selection**: Choose ONLY ONE type based on the highest priority match
 3. **Real Data Only**: Never invent, guess, or fabricate content
 4. **Complete URLs**: Links must be full, valid URLs as they appear in the email
-5. **Clean Extraction**: Return only the raw extracted content, no extra text
+5. **No Domain Modification**: Never modify, rewrite, or substitute URL domains. If the exact URL domain is uncertain, return none
+6. **Clean Extraction**: Return only the raw extracted content, no extra text

 # Output Format (JSON only)
 {
@@ -225,6 +226,70 @@ function getEmailContentForExtract(parsedEmail: Awaited<ReturnType<typeof common
    return htmlToTextForAi(parsedEmail.html) || parsedEmail.html;
 }

+const LINK_EXTRACT_TYPES = new Set<ExtractResult['type']>([
+    'auth_link',
+    'service_link',
+    'subscription_link',
+    'other_link',
+]);
+
+function normalizeHostname(hostname: string): string {
+    return hostname.toLowerCase().replace(/\.$/, '');
+}
+
+function trimUrlBoundaryPunctuation(value: string): string {
+    return value.replace(/[),.;!?]+$/g, '');
+}
+
+function getUrlHostname(value: string): string | null {
+    const match = value.match(/https?:\/\/[^\s<>"'`]+/i);
+    if (!match) {
+        return null;
+    }
+
+    try {
+        const url = new URL(trimUrlBoundaryPunctuation(match[0]));
+        return normalizeHostname(url.hostname);
+    } catch {
+        return null;
+    }
+}
+
+function extractHostnamesFromContent(content: string): Set<string> {
+    const hostnames = new Set<string>();
+    const matches = content.matchAll(/https?:\/\/[^\s<>"'`]+/gi);
+
+    for (const match of matches) {
+        try {
+            const url = new URL(trimUrlBoundaryPunctuation(match[0]));
+            hostnames.add(normalizeHostname(url.hostname));
+        } catch {
+            continue;
+        }
+    }
+
+    return hostnames;
+}
+
+function validateExtractedLinkDomain(result: ExtractResult, content: string): ExtractResult {
+    if (!LINK_EXTRACT_TYPES.has(result.type)) {
+        return result;
+    }
+
+    const resultHostname = getUrlHostname(result.result);
+    if (!resultHostname) {
+        return { type: 'none', result: '', result_text: '' };
+    }
+
+    const contentHostnames = extractHostnamesFromContent(content);
+    if (contentHostnames.has(resultHostname)) {
+        return result;
+    }
+
+    console.warn(`AI extraction discarded link with hallucinated domain: ${resultHostname}`);
+    return { type: 'none', result: '', result_text: '' };
+}
+
 /**
 * Main extraction function
 * Checks if extraction is enabled, processes the email content, and saves to database.
@@ -304,13 +369,14 @@ export async function extractEmailInfo(
            : emailContent;

        const result = await extractWithCloudflareAI(truncatedContent, env);
+        const validatedResult = validateExtractedLinkDomain(result, emailContent);

        // If extraction found something useful, save it to database
-        if (result.type !== 'none' && result.result) {
-            await saveExtractMetadata(env, message_id, result);
-            console.log(`AI extraction completed for ${message_id}: ${result.type}`);
+        if (validatedResult.type !== 'none' && validatedResult.result) {
+            await saveExtractMetadata(env, message_id, validatedResult);
+            console.log(`AI extraction completed for ${message_id}: ${validatedResult.type}`);
        }
-        return result;
+        return validatedResult;
    } catch (e) {
        console.error('AI email extraction error:', e);
        return null;
Author	SHA1	Message	Date
dreamhunter2333	19cf9518b3	fix: validate extracted links against full email content	2026-07-03 23:53:15 +08:00
dreamhunter2333	7e0e9376ba	fix: validate AI extracted link domains	2026-07-03 23:11:09 +08:00