mirror of
https://github.com/dreamhunter2333/cloudflare_temp_email.git
synced 2026-06-21 07:24:41 +08:00
fix: compact HTML before AI email extraction (#1057)
fix: compact html before ai extraction
This commit is contained in:
@@ -12,6 +12,8 @@
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- fix: |AI 提取| HTML-only 邮件在发送给 Workers AI 前会先压缩为可读文本,避免样式模板过长导致验证码位于 4000 字截断之后而无法识别
|
||||
|
||||
### Improvements
|
||||
|
||||
## v1.9.0
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
|
||||
### Bug Fixes
|
||||
|
||||
- fix: |AI Extract| Convert HTML-only mail bodies into compact readable text before sending them to Workers AI, preventing long templates from pushing verification codes past the 4000-character truncation window
|
||||
|
||||
### Improvements
|
||||
|
||||
## v1.9.0
|
||||
|
||||
@@ -166,6 +166,65 @@ async function saveExtractMetadata(
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Decodes the small set of HTML character references that matter when an
 * HTML body is flattened to plain text.
 *
 * Handles the named references `amp`, `lt`, `gt`, `quot`, `apos` and `nbsp`
 * (matched case-insensitively) as well as decimal (`&#65;`) and hexadecimal
 * (`&#x41;`) numeric references. Anything that cannot be decoded is left in
 * the output exactly as it appeared in the input. The replacement is a single
 * pass, so `&amp;lt;` becomes `&lt;` rather than `<`.
 *
 * @param text - Text that may contain HTML character references.
 * @returns The text with every recognised reference replaced by its character.
 */
function decodeHtmlEntities(text: string): string {
    // A Map instead of an object literal: looking up a name such as
    // "constructor" on a plain object returns an inherited Object.prototype
    // member, which would then be stringified into the output.
    const namedEntities = new Map<string, string>([
        ['amp', '&'],
        ['lt', '<'],
        ['gt', '>'],
        ['quot', '"'],
        ['apos', "'"],
        ['nbsp', ' '],
    ]);

    const decodeCodePoint = (value: number, fallback: string): string => {
        // Surrogate code points are not valid scalar values; decoding them
        // would leave a lone surrogate (an ill-formed string) in the result.
        const isSurrogate = value >= 0xd800 && value <= 0xdfff;
        if (!Number.isFinite(value) || value < 0 || value > 0x10ffff || isSurrogate) {
            return fallback;
        }
        return String.fromCodePoint(value);
    };

    return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]+);/gi, (match: string, entity: string) => {
        const normalized = entity.toLowerCase();

        // "#x" must be tested before "#" because both start with the same character.
        if (normalized.startsWith('#x')) {
            return decodeCodePoint(Number.parseInt(normalized.slice(2), 16), match);
        }
        if (normalized.startsWith('#')) {
            return decodeCodePoint(Number.parseInt(normalized.slice(1), 10), match);
        }

        return namedEntities.get(normalized) ?? match;
    });
}
|
||||
|
||||
/**
 * Flattens an HTML mail body into compact plain text for AI extraction.
 *
 * Steps, in order:
 *  1. Remove `<script>`, `<style>`, `<head>` and `<svg>` elements with their content.
 *  2. Remove HTML comments.
 *  3. Rewrite `<a href="URL">label</a>` as `label URL` so the link target survives.
 *  4. Turn `<br>` and the closing tags of common block elements into newlines.
 *  5. Replace every remaining tag with a space.
 *  6. Decode character references (after tag stripping, so an escaped `&lt;b&gt;`
 *     is kept as literal text instead of being stripped as markup).
 *  7. Collapse whitespace: runs of horizontal whitespace become one space and
 *     every run of line breaks becomes a single `\n`.
 *
 * @param html - Raw HTML of the mail body.
 * @returns Trimmed plain text; an empty string when no text content remains.
 */
function htmlToTextForAi(html: string): string {
    const withoutMarkup = html
        // The lookahead requires the tag name to end here. Without it "<header"
        // is read as "<head" + "er", and everything up to the next "</head>"
        // (e.g. the head of a forwarded/nested document) is thrown away.
        .replace(/<\s*(script|style|head|svg)(?=[\s/>])[^>]*>[\s\S]*?<\s*\/\s*\1\s*>/gi, ' ')
        .replace(/<!--[\s\S]*?-->/g, ' ')
        .replace(/<a\b[^>]*\bhref=(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, ' $3 $2 ')
        .replace(/<\s*br\s*\/?>/gi, '\n')
        .replace(/<\/\s*(p|div|tr|td|th|li|table|section|article|header|footer|h[1-6])\s*>/gi, '\n')
        .replace(/<[^>]+>/g, ' ');

    return decodeHtmlEntities(withoutMarkup)
        // U+00A0 is included so that no-break spaces written numerically
        // (&#160;) or present as raw characters are compacted as well.
        .replace(/[ \t\r\f\v\u00a0]+/g, ' ')
        // One pattern drops the space left before a line break and folds the
        // whole following whitespace run (including further line breaks) into
        // a single "\n". A separate "\n{3,}" pass could never match after this.
        .replace(/ ?\n\s*/g, '\n')
        .trim();
}
|
||||
|
||||
/**
 * Chooses the mail content that is handed to the extraction step.
 *
 * Preference order:
 *  1. the parsed plain-text part, when it is non-empty;
 *  2. the HTML part flattened by `htmlToTextForAi`;
 *  3. the raw HTML, when flattening produced an empty string;
 *  4. an empty string, when the mail has neither part.
 *
 * @param parsedEmail - Result of `commonParseMail`; may be null/undefined.
 * @returns The content to extract from, or `""` when nothing is available.
 */
function getEmailContentForExtract(parsedEmail: Awaited<ReturnType<typeof commonParseMail>>): string {
    const plainText = parsedEmail?.text;
    if (plainText) {
        return plainText;
    }

    const rawHtml = parsedEmail?.html;
    if (!rawHtml) {
        return "";
    }

    // Keep the raw HTML as a last resort so a body that flattens to nothing
    // (e.g. only markup) is still passed on instead of being dropped.
    const compactText = htmlToTextForAi(rawHtml);
    return compactText || rawHtml;
}
|
||||
|
||||
/**
|
||||
* Main extraction function
|
||||
* Checks if extraction is enabled, processes the email content, and saves to database.
|
||||
@@ -219,7 +278,7 @@ export async function extractEmailInfo(
|
||||
|
||||
// Parse email to get content (shared by the AI path and the regex fallback)
|
||||
const parsedEmail = await commonParseMail(parsedEmailContext);
|
||||
const emailContent = parsedEmail?.text || parsedEmail?.html || "";
|
||||
const emailContent = getEmailContentForExtract(parsedEmail);
|
||||
|
||||
if (!emailContent) {
|
||||
return null;
|
||||
|
||||
Reference in New Issue
Block a user