fix: compact HTML before AI email extraction (#1057)

fix: compact html before ai extraction
This commit is contained in:
Dream Hunter
2026-06-12 00:41:19 +08:00
committed by GitHub
parent 4f0b44de2e
commit c924b71a5c
3 changed files with 64 additions and 1 deletions

View File

@@ -12,6 +12,8 @@
### Bug Fixes
- fix: |AI 提取| HTML-only 邮件在发送给 Workers AI 前会先压缩为可读文本,避免样式模板过长导致验证码位于 4000 字截断之后而无法识别
### Improvements
## v1.9.0

View File

@@ -12,6 +12,8 @@
### Bug Fixes
- fix: |AI Extract| Convert HTML-only mail bodies into compact readable text before sending them to Workers AI, preventing long templates from pushing verification codes past the 4000-character truncation window
### Improvements
## v1.9.0

View File

@@ -166,6 +166,65 @@ async function saveExtractMetadata(
}
}
/**
 * Decodes HTML character references found in `text`.
 *
 * Supported forms:
 * - the named references `amp`, `lt`, `gt`, `quot`, `apos` and `nbsp`
 *   (matched case-insensitively; `nbsp` is decoded to a regular space),
 * - decimal numeric references such as `&#65;`,
 * - hexadecimal numeric references such as `&#x41;`.
 *
 * Unknown names and numeric values outside the Unicode code point range
 * are left in the output exactly as they appeared in the input.
 *
 * @param text - Text that may contain HTML character references.
 * @returns The text with every recognised reference replaced by its character.
 */
function decodeHtmlEntities(text: string): string {
    // A Map is used instead of an object literal on purpose: indexing a plain
    // object with untrusted names also finds inherited keys, so `&constructor;`
    // would be replaced by the stringified `Object` function.
    const namedEntities = new Map<string, string>([
        ['amp', '&'],
        ['lt', '<'],
        ['gt', '>'],
        ['quot', '"'],
        ['apos', "'"],
        ['nbsp', ' '],
    ]);

    // String.fromCodePoint throws a RangeError for values outside
    // 0..0x10FFFF, so out-of-range references fall back to the original text.
    const decodeCodePoint = (value: number, fallback: string): string => {
        if (!Number.isFinite(value) || value < 0 || value > 0x10ffff) {
            return fallback;
        }
        return String.fromCodePoint(value);
    };

    return text.replace(
        /&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]+);/gi,
        (match: string, entity: string): string => {
            const normalized = entity.toLowerCase();
            if (normalized.startsWith('#x')) {
                return decodeCodePoint(Number.parseInt(normalized.slice(2), 16), match);
            }
            if (normalized.startsWith('#')) {
                return decodeCodePoint(Number.parseInt(normalized.slice(1), 10), match);
            }
            return namedEntities.get(normalized) ?? match;
        },
    );
}
/**
 * Reduces an HTML mail body to compact plain text.
 *
 * Steps, in order:
 * 1. drop `<script>`, `<style>`, `<head>` and `<svg>` elements with their content,
 * 2. drop HTML comments,
 * 3. rewrite `<a href="URL">label</a>` as `label URL` so link targets stay visible,
 * 4. turn `<br>` and the closing tags of common block elements into newlines,
 * 5. strip every remaining tag,
 * 6. decode character references (done after tag stripping so that escaped
 *    text such as `&lt;b&gt;` is never mistaken for markup),
 * 7. collapse whitespace: runs of non-newline whitespace become one space,
 *    and any whitespace around a newline collapses to a single newline.
 *
 * @param html - Raw HTML source of the message.
 * @returns Trimmed plain text; empty when the HTML contains no visible text.
 */
function htmlToTextForAi(html: string): string {
    const withoutMarkup = html
        // `\b` after the tag name keeps `<header>` from being read as an opening
        // `<head>`, which would delete everything up to a later `</head>`.
        .replace(/<\s*(script|style|head|svg)\b[^>]*>[\s\S]*?<\s*\/\s*\1\s*>/gi, ' ')
        .replace(/<!--[\s\S]*?-->/g, ' ')
        // $3 is the link's inner markup, $2 the href value.
        .replace(/<a\b[^>]*\bhref=(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, ' $3 $2 ')
        // Also matches <br> tags that carry attributes, e.g. <br class="x">.
        .replace(/<\s*br\b[^>]*>/gi, '\n')
        .replace(/<\/\s*(p|div|tr|td|th|li|table|section|article|header|footer|h[1-6])\s*>/gi, '\n')
        .replace(/<[^>]+>/g, ' ');

    return decodeHtmlEntities(withoutMarkup)
        // Any whitespace except newline, including U+00A0 produced by `&#160;`.
        .replace(/[^\S\n]+/g, ' ')
        // Drop the space left before a line break and fold every run of
        // line breaks / indentation into one newline.
        .replace(/ ?\n\s*/g, '\n')
        .trim();
}
/**
 * Picks the content of a parsed mail that is handed to the extraction step.
 *
 * A non-empty `text` value is returned unchanged. Otherwise a non-empty `html`
 * value is passed through `htmlToTextForAi`; if that yields an empty string,
 * the original HTML is returned instead. With neither field present the result
 * is the empty string.
 *
 * @param parsedEmail - Result of `commonParseMail`; may be nullish.
 * @returns The content to extract from, or `""` when the mail has none.
 */
function getEmailContentForExtract(parsedEmail: Awaited<ReturnType<typeof commonParseMail>>): string {
    const plainText = parsedEmail?.text;
    if (plainText) {
        return plainText;
    }

    const rawHtml = parsedEmail?.html;
    return rawHtml ? (htmlToTextForAi(rawHtml) || rawHtml) : "";
}
/**
* Main extraction function
* Checks if extraction is enabled, processes the email content, and saves to database.
@@ -219,7 +278,7 @@ export async function extractEmailInfo(
// Parse email to get content (shared by the AI path and the regex fallback)
const parsedEmail = await commonParseMail(parsedEmailContext);
const emailContent = parsedEmail?.text || parsedEmail?.html || "";
const emailContent = getEmailContentForExtract(parsedEmail);
if (!emailContent) {
return null;