Compare commits

...

2 Commits

Author SHA1 Message Date
dreamhunter2333
19cf9518b3 fix: validate extracted links against full email content 2026-07-03 23:53:15 +08:00
dreamhunter2333
7e0e9376ba fix: validate AI extracted link domains 2026-07-03 23:11:09 +08:00
3 changed files with 73 additions and 5 deletions

View File

@@ -12,6 +12,7 @@
### Bug Fixes
- fix: |AI 提取| 校验 AI 提取出的链接域名必须存在于邮件内容中，避免小模型将验证链接域名替换成临时邮箱域名导致错误跳转（issue #1072）
- fix: |AI 提取| HTML-only 邮件在发送给 Workers AI 前会先压缩为可读文本,避免样式模板过长导致验证码位于 4000 字截断之后而无法识别
- fix: |Frontend| 移动端 Header 增加页头内边距,避免标题、菜单按钮与屏幕边缘过近

View File

@@ -12,6 +12,7 @@
### Bug Fixes
- fix: |AI Extract| Validate extracted link domains against domains present in the mail content to prevent small models from replacing verification-link domains with the temporary mailbox domain (issue #1072)
- fix: |AI Extract| Convert HTML-only mail bodies into compact readable text before sending them to Workers AI, preventing long templates from pushing verification codes past the 4000-character truncation window
- fix: |Frontend| Add mobile Header page padding so the title and menu button no longer sit too close to the screen edge

View File

@@ -75,7 +75,8 @@ If the extracted content is in markdown link format [text](url):
2. **Single Selection**: Choose ONLY ONE type based on the highest priority match
3. **Real Data Only**: Never invent, guess, or fabricate content
4. **Complete URLs**: Links must be full, valid URLs as they appear in the email
5. **Clean Extraction**: Return only the raw extracted content, no extra text
5. **No Domain Modification**: Never modify, rewrite, or substitute URL domains. If the exact URL domain is uncertain, return none
6. **Clean Extraction**: Return only the raw extracted content, no extra text
# Output Format (JSON only)
{
@@ -225,6 +226,70 @@ function getEmailContentForExtract(parsedEmail: Awaited<ReturnType<typeof common
return htmlToTextForAi(parsedEmail.html) || parsedEmail.html;
}
/**
 * Extraction result types that are treated as links.
 * `validateExtractedLinkDomain` only checks results whose `type` is in this
 * set; results of any other type are returned unchanged.
 */
const LINK_EXTRACT_TYPES = new Set<ExtractResult['type']>([
'auth_link',
'service_link',
'subscription_link',
'other_link',
]);
/**
 * Puts a hostname into a canonical form for equality comparison.
 *
 * The hostname is lower-cased and, if it ends with a dot (the fully-qualified
 * form such as `example.com.`), exactly one trailing dot is dropped.
 *
 * @param hostname - Hostname to canonicalize.
 * @returns The lower-cased hostname without a single trailing dot.
 */
function normalizeHostname(hostname: string): string {
    const lowered = hostname.toLowerCase();
    if (lowered.endsWith('.')) {
        return lowered.slice(0, -1);
    }
    return lowered;
}
/**
 * Removes sentence/markup punctuation that trails a URL token.
 *
 * URL tokens are cut out of free text, so they often carry characters that
 * belong to the surrounding prose rather than the URL itself, such as the
 * closing `)` of a markdown link or a full stop ending a sentence. Every
 * trailing character from the set `) , . ; ! ?` is stripped; characters
 * elsewhere in the string are left untouched.
 *
 * The scan walks backwards from the end instead of using an unanchored
 * `/[...]+$/` regex. That regex retries the greedy run from every start
 * position, which is quadratic on input like `")))…)))x"`, and the text
 * reaching this function comes from untrusted email content.
 *
 * @param value - Raw URL token, possibly followed by punctuation.
 * @returns `value` with all trailing boundary punctuation removed.
 */
function trimUrlBoundaryPunctuation(value: string): string {
    const boundaryPunctuation = '),.;!?';
    let end = value.length;
    // `end > 0` guarantees charAt() returns a real one-character string.
    while (end > 0 && boundaryPunctuation.includes(value.charAt(end - 1))) {
        end -= 1;
    }
    return value.slice(0, end);
}
/**
 * Extracts the normalized hostname of the first http(s) URL found in `value`.
 *
 * The first `http://` or `https://` token (case-insensitive scheme, ending at
 * whitespace or one of `< > " ' \``) is located, stripped of trailing
 * boundary punctuation and parsed with `URL`.
 *
 * @param value - Text expected to contain a URL.
 * @returns The normalized hostname, or `null` when no URL token is present or
 *          the token cannot be parsed as a URL.
 */
function getUrlHostname(value: string): string | null {
    const firstUrl = /https?:\/\/[^\s<>"'`]+/i.exec(value);
    if (firstUrl === null) {
        return null;
    }

    try {
        const { hostname } = new URL(trimUrlBoundaryPunctuation(firstUrl[0]));
        return normalizeHostname(hostname);
    } catch {
        // `new URL` throws on malformed input; treat that as "no hostname".
        return null;
    }
}
/**
 * Collects the normalized hostnames of every http(s) URL found in `content`.
 *
 * Each URL token is stripped of trailing boundary punctuation and parsed with
 * `URL`; tokens that fail to parse are skipped.
 *
 * @param content - Text to scan for URLs.
 * @returns The set of distinct normalized hostnames found (possibly empty).
 */
function extractHostnamesFromContent(content: string): Set<string> {
    const found = new Set<string>();
    const urlPattern = /https?:\/\/[^\s<>"'`]+/gi;

    // Every match is non-empty, so the global exec loop always advances.
    let hit: RegExpExecArray | null = urlPattern.exec(content);
    while (hit !== null) {
        try {
            const parsed = new URL(trimUrlBoundaryPunctuation(hit[0]));
            found.add(normalizeHostname(parsed.hostname));
        } catch {
            // Unparseable token: ignore it and move on to the next match.
        }
        hit = urlPattern.exec(content);
    }

    return found;
}
/**
 * Rejects link-type extraction results whose hostname does not occur in the
 * given content.
 *
 * Results whose type is not in `LINK_EXTRACT_TYPES` are returned unchanged.
 * For link types, the hostname of the first URL in `result.result` must equal
 * the hostname of some URL found in `content`; otherwise the result is
 * replaced with an empty `none` result. A warning is logged when a hostname
 * was parsed but is absent from the content.
 *
 * @param result - Extraction result to validate.
 * @param content - Text whose URLs define the set of acceptable hostnames.
 * @returns `result` itself when it passes, otherwise an empty `none` result.
 */
function validateExtractedLinkDomain(result: ExtractResult, content: string): ExtractResult {
    const isLinkResult = LINK_EXTRACT_TYPES.has(result.type);
    if (!isLinkResult) {
        return result;
    }

    const resultHostname = getUrlHostname(result.result);
    if (resultHostname) {
        const knownHostnames = extractHostnamesFromContent(content);
        if (knownHostnames.has(resultHostname)) {
            return result;
        }
        console.warn(`AI extraction discarded link with hallucinated domain: ${resultHostname}`);
    }

    // Either no parseable URL in the result, or its hostname is not in the content.
    return { type: 'none', result: '', result_text: '' };
}
/**
* Main extraction function
* Checks if extraction is enabled, processes the email content, and saves to database.
@@ -304,13 +369,14 @@ export async function extractEmailInfo(
: emailContent;
const result = await extractWithCloudflareAI(truncatedContent, env);
const validatedResult = validateExtractedLinkDomain(result, emailContent);
// If extraction found something useful, save it to database
if (result.type !== 'none' && result.result) {
await saveExtractMetadata(env, message_id, result);
console.log(`AI extraction completed for ${message_id}: ${result.type}`);
if (validatedResult.type !== 'none' && validatedResult.result) {
await saveExtractMetadata(env, message_id, validatedResult);
console.log(`AI extraction completed for ${message_id}: ${validatedResult.type}`);
}
return result;
return validatedResult;
} catch (e) {
console.error('AI email extraction error:', e);
return null;