From b7718100c5db50157a7bac345b1d07e267e9bbdd Mon Sep 17 00:00:00 2001 From: Gene Dai Date: Tue, 2 Jun 2026 15:35:43 +0800 Subject: [PATCH] feat: regex fallback for verification code extraction without Workers AI (#1048) feat: add regex fallback for verification code extraction without Workers AI When AI email extraction is enabled but no Workers AI binding is available, fall back to a built-in, zero-dependency regex extractor so self-hosted deployments without Workers AI still surface verification codes in Telegram notifications and webhooks. - Add worker/src/email/extract_code.ts: rule-based multilingual (English / Chinese / Japanese / Korean) verification-code extractor with year and YYYYMMDD date rejection to avoid false positives. - ai_extract.ts: share the allowlist check and content parsing across both paths, extract a saveExtractMetadata helper, and use the regex fallback when env.AI is absent. - Reuse the existing aiExtractResult pipeline (auth_code type), so Telegram and webhook output need no changes. - Update bilingual CHANGELOG and AI-extract feature docs. 
--- CHANGELOG.md | 1 + CHANGELOG_EN.md | 1 + .../docs/en/guide/feature/ai-extract.md | 12 +++ .../docs/zh/guide/feature/ai-extract.md | 12 +++ worker/src/email/ai_extract.ts | 78 ++++++++++++------ worker/src/email/extract_code.ts | 80 +++++++++++++++++++ 6 files changed, 158 insertions(+), 26 deletions(-) create mode 100644 worker/src/email/extract_code.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 64295519..496474ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ ### Features +- feat: |AI 识别| 未配置 Workers AI 绑定时,自动回退到内置正则提取验证码(支持中英日韩,并排除年份与 `YYYYMMDD` 日期误判),让无 Workers AI 的自部署用户也能在 Telegram 推送与 Webhook 中拿到验证码 - feat: |Telegram| Telegram 新邮件推送与 `/mails` 历史邮件查看支持展示 AI 提取结果,包含验证码、验证链接、服务链接、订阅链接等关键信息 - feat: |Webhook| 邮件 Webhook 模板支持填充 AI 提取结果占位符,包括 `aiExtractType`、`aiExtractResult`、`aiExtractResultText` - feat: |Frontend| 新增 `DISABLE_SHOW_GITHUB_FOR_USER` 配置,可仅对普通用户隐藏 Header 的 GitHub/版本入口,admin 仍可见(issue #1041) diff --git a/CHANGELOG_EN.md b/CHANGELOG_EN.md index bb916cae..4aea67e2 100644 --- a/CHANGELOG_EN.md +++ b/CHANGELOG_EN.md @@ -10,6 +10,7 @@ ### Features +- feat: |AI Extract| Fall back to a built-in regex verification-code extractor (English / Chinese / Japanese / Korean, with year and `YYYYMMDD` date rejection) when no Workers AI binding is configured, so self-hosted deployments without Workers AI still surface codes in Telegram pushes and webhooks - feat: |Telegram| Show AI extraction results in Telegram new-mail notifications and `/mails` history views, including verification codes, auth links, service links, and subscription links - feat: |Webhook| Support AI extraction placeholders in mail webhook templates, including `aiExtractType`, `aiExtractResult`, and `aiExtractResultText` - feat: |Frontend| Add `DISABLE_SHOW_GITHUB_FOR_USER` to hide the Header GitHub/version entry from normal users while keeping it visible to admin users (issue #1041) diff --git a/vitepress-docs/docs/en/guide/feature/ai-extract.md 
b/vitepress-docs/docs/en/guide/feature/ai-extract.md index 39e2a085..55a354dd 100644 --- a/vitepress-docs/docs/en/guide/feature/ai-extract.md +++ b/vitepress-docs/docs/en/guide/feature/ai-extract.md @@ -43,6 +43,18 @@ Or add in Cloudflare Dashboard Worker settings: - **Variable name**: `AI` - **Type**: Workers AI +## Fallback Without a Workers AI Binding + +If `ENABLE_AI_EMAIL_EXTRACT` is enabled but **no Workers AI binding is configured** (e.g. a self-hosted deployment without Workers AI), the system automatically falls back to a built-in **regex verification-code extractor**: + +- Extracts **verification codes** (`auth_code`) only; links are not extracted (link extraction requires AI) +- Zero dependency, zero cost, runs locally inside the Worker +- Supports common verification-code formats in English, Chinese, Japanese and Korean +- Rejects years (e.g. `2026`) and `YYYYMMDD` dates to reduce false positives +- Results are written to `metadata` and reuse the same Telegram / webhook placeholders (`aiExtractType` is `auth_code` in this case) + +When a Workers AI binding is configured, AI extraction is still preferred (recognizing both codes and links) and this fallback does not apply. 
+ ## Address Allowlist (Optional) To control costs and resource usage, you can configure an address allowlist in the Admin console's **AI Extract Settings** page: diff --git a/vitepress-docs/docs/zh/guide/feature/ai-extract.md b/vitepress-docs/docs/zh/guide/feature/ai-extract.md index c3b72b96..6c549e6d 100644 --- a/vitepress-docs/docs/zh/guide/feature/ai-extract.md +++ b/vitepress-docs/docs/zh/guide/feature/ai-extract.md @@ -43,6 +43,18 @@ binding = "AI" - **Variable name**: `AI` - **Type**: Workers AI +## 无 Workers AI 绑定时的正则兜底 + +如果启用了 `ENABLE_AI_EMAIL_EXTRACT` 但**没有配置 Workers AI 绑定**(例如自部署时未开通 Workers AI),系统会自动回退到内置的**正则验证码提取**: + +- 仅提取**验证码**(`auth_code`),不提取链接(链接提取依赖 AI) +- 零依赖、零成本,在 Worker 内本地完成 +- 支持中文、英文、日文、韩文常见验证码格式 +- 自动排除年份(如 `2026`)与 `YYYYMMDD` 日期,降低误判 +- 提取结果同样写入 `metadata`,并复用 Telegram 推送与 Webhook 占位符(此时 `aiExtractType` 为 `auth_code`) + +当配置了 Workers AI 绑定时,仍优先使用 AI 提取(可识别验证码与各类链接),不受此回退影响。 + ## 地址白名单(可选) 为了控制成本和资源使用,可以在 Admin 控制台的 **AI 提取设置** 页面配置地址白名单: diff --git a/worker/src/email/ai_extract.ts b/worker/src/email/ai_extract.ts index bfc7c6f1..28eacf4e 100644 --- a/worker/src/email/ai_extract.ts +++ b/worker/src/email/ai_extract.ts @@ -7,6 +7,7 @@ */ import { commonParseMail } from "../common"; +import { extractCode } from "./extract_code"; import { getBooleanValue, getJsonSetting } from "../utils"; import { CONSTANTS } from "../constants"; import { Context } from "hono"; @@ -137,9 +138,39 @@ async function extractWithCloudflareAI( throw new Error('Unexpected response format from Cloudflare AI'); } +/** + * Persist an extraction result to the raw_mails metadata column. + * Shared by the Workers AI path and the regex fallback path. 
+ * + * @param env - Cloudflare Workers environment bindings + * @param message_id - The email message ID + * @param result - The extraction result to persist + */ +async function saveExtractMetadata( + env: Bindings, + message_id: string | null, + result: ExtractResult +): Promise { + try { + const metadata = JSON.stringify({ + ai_extract: result, + extracted_at: new Date().toISOString() + }); + + // Update the raw_mails record with metadata + await env.DB.prepare( + `UPDATE raw_mails SET metadata = ? WHERE message_id = ?` + ).bind(metadata, message_id).run(); + } catch (e) { + console.error('AI extraction metadata save error:', e); + } +} + /** * Main extraction function - * Checks if AI extraction is enabled, processes the email content, and saves to database + * Checks if extraction is enabled, processes the email content, and saves to database. + * Uses Cloudflare Workers AI when the `AI` binding is available; otherwise falls back + * to a built-in regex extractor that surfaces verification codes only. 
* * @param parsedEmailContext - The parsed email context * @param env - Cloudflare Workers environment bindings @@ -154,18 +185,12 @@ export async function extractEmailInfo( address: string ): Promise { try { - // Check if AI extraction is enabled via environment variable + // Check if extraction is enabled via environment variable if (!getBooleanValue(env.ENABLE_AI_EMAIL_EXTRACT)) { return null; } - // Ensure AI binding is available - if (!env.AI) { - console.error('AI binding not available'); - return null; - } - - // Check allowlist if enabled + // Check allowlist if enabled (applies to both AI and the regex fallback) const aiSettings = await getJsonSetting( { env: env } as Context, CONSTANTS.AI_EXTRACT_SETTINGS_KEY @@ -187,12 +212,12 @@ export async function extractEmailInfo( }); if (!isAllowed) { - console.log(`AI extraction skipped for ${address}: not in allowlist`); + console.log(`Email extraction skipped for ${address}: not in allowlist`); return null; } } - // Parse email to get content + // Parse email to get content (shared by the AI path and the regex fallback) const parsedEmail = await commonParseMail(parsedEmailContext); const emailContent = parsedEmail?.text || parsedEmail?.html || ""; @@ -200,6 +225,20 @@ export async function extractEmailInfo( return null; } + // Fallback: when no Workers AI binding is available, use a built-in regex + // extractor so self-hosted deployments without Workers AI still surface + // verification codes. Telegram / webhook reuse the same ExtractResult. + if (!env.AI) { + const code = extractCode(emailContent); + if (!code) { + return null; + } + const result: ExtractResult = { type: 'auth_code', result: code, result_text: '' }; + await saveExtractMetadata(env, message_id, result); + console.log(`Regex code extraction completed for ${message_id}`); + return result; + } + // Truncate content if too long (max 4000 characters to avoid token limits) const truncatedContent = emailContent.length > 4000 ? 
emailContent.substring(0, 4000) + '...[truncated]' @@ -209,21 +248,8 @@ export async function extractEmailInfo( // If extraction found something useful, save it to database if (result.type !== 'none' && result.result) { - try { - const metadata = JSON.stringify({ - ai_extract: result, - extracted_at: new Date().toISOString() - }); - - // Update the raw_mails record with metadata - await env.DB.prepare( - `UPDATE raw_mails SET metadata = ? WHERE message_id = ?` - ).bind(metadata, message_id).run(); - - console.log(`AI extraction completed for ${message_id}: ${result.type}`); - } catch (e) { - console.error('AI extraction metadata save error:', e); - } + await saveExtractMetadata(env, message_id, result); + console.log(`AI extraction completed for ${message_id}: ${result.type}`); } return result; } catch (e) { diff --git a/worker/src/email/extract_code.ts b/worker/src/email/extract_code.ts new file mode 100644 index 00000000..dac2564f --- /dev/null +++ b/worker/src/email/extract_code.ts @@ -0,0 +1,80 @@ +/** + * Regex-based verification code extraction. + * + * Extracts verification codes (4-12 digit / alphanumeric) from email text. + * Covers common formats across English, Chinese, Japanese and Korean. + * + * This is a zero-dependency fallback used when Workers AI is not bound, so + * self-hosted deployments without Workers AI still surface verification codes. + * + * Design principles (learned from QA): + * 1. Prefer codes that appear after an explicit code keyword. + * 2. Reject plausible dates/years (e.g. "2026", "20260411") — too common as + * date markers in subject lines, almost never a real verification code. + * 3. Allow multi-character delimiters (a keyword followed by a colon or "is"). + * 4. Fall back to standalone digits ONLY if no keyword-guided match found + * AND the digits don't look like a year or YYYYMMDD date. 
/**
 * Regex-based verification code extraction (zero-dependency fallback used
 * when no Workers AI binding is available).
 *
 * Candidates are tried in this priority order; the first candidate that does
 * not look like a year / YYYYMMDD date wins:
 *   1. bare "code" keyword + delimiter + 4-12 digits
 *   2. full keyword (EN / CN / JA / KO) + delimiter + 4-12 digits
 *   3. bare "code" keyword + delimiter + 4-12 char alphanumeric token
 *   4. full keyword + delimiter + 4-12 char alphanumeric token
 *   5. any standalone 4-12 digit run (whitespace-delimited, may be followed
 *      by "." or ",")
 *
 * Every match of a pattern is examined, not just the first one, so a leading
 * year such as "(c) 2026" no longer hides a real code later in the text.
 *
 * @param text - Plain text (or HTML) body of the email
 * @returns The extracted code, or `null` when nothing plausible is found
 */
export function extractCode(text: string): string | null {
  // DELIM: at least one explicit delimiter must follow the keyword before the
  // code itself: an ASCII colon, a full-width colon (U+FF1A), the word "is",
  // or a common CJK particle. Extra colons / whitespace may follow.
  //
  // Why mandatory: without a delimiter, "verification code Your email" would
  // capture "Your" as a false alphanumeric code. Bare "verification code 123456"
  // still falls through to the standalone-digit fallback.
  const DELIM = '\\s*(?:[:\\uFF1A]|\\bis\\b|是|为|です)[\\s:\\uFF1A]*';

  // Keyword groups — labels that precede a verification code
  const CN_JA_KO_KW = '验证码|认证码|确认码|認証コード|인증\\s*코드|코드';
  const EN_KW = 'verification\\s*code|confirm(?:ation)?\\s*code|security\\s*code|passcode|OTP|pin\\s*code';
  const ALL_KW = `${CN_JA_KO_KW}|${EN_KW}`;

  const DIGITS = '(\\d{4,12})\\b';
  // Alphanumeric tokens must contain at least one digit; otherwise ordinary
  // words ("code is valid for 10 minutes" -> "valid") are reported as codes.
  const ALNUM = '((?=[A-Za-z0-9]*\\d)[A-Za-z0-9]{4,12})\\b';

  // Regexes are built per call: they carry the `g` flag, so sharing them
  // across calls would leak `lastIndex` state.
  const candidatePatterns: RegExp[] = [
    new RegExp(`\\bcode${DELIM}${DIGITS}`, 'gi'),
    new RegExp(`(?:${ALL_KW})${DELIM}${DIGITS}`, 'gi'),
    new RegExp(`\\bcode${DELIM}${ALNUM}`, 'gi'),
    new RegExp(`(?:${ALL_KW})${DELIM}${ALNUM}`, 'gi'),
    // Standalone digits. Lookarounds (instead of consuming the separators)
    // let two adjacent candidates such as "2026 483920" both be examined.
    /(?<!\S)(\d{4,12})(?=[\s.,]|$)/g,
  ];

  for (const pattern of candidatePatterns) {
    const found = firstNonDateCapture(text, pattern);
    if (found !== null) return found;
  }
  return null;
}

/**
 * Scan every match of `pattern` (which must be global and have one capture
 * group) and return the first captured value that is not date-like.
 */
function firstNonDateCapture(text: string, pattern: RegExp): string | null {
  let match: RegExpExecArray | null;
  while ((match = pattern.exec(text)) !== null) {
    const candidate = match[1];
    if (candidate && !looksLikeDate(candidate)) return candidate;
  }
  return null;
}

/**
 * Heuristic: does this token look like a date/year we should reject?
 * - 4 digits in 1900-2099 → year (e.g. 2026)
 * - 8 digits forming YYYYMMDD with plausible month/day → date (e.g. 20260411)
 *
 * Tokens containing any non-digit are never dates; this guards against
 * `parseInt` leniently parsing pieces of an alphanumeric code ("20261a2b").
 */
function looksLikeDate(digits: string): boolean {
  if (!/^\d+$/.test(digits)) return false;

  // 4-digit year: 1900-2099
  if (digits.length === 4) {
    const year = parseInt(digits, 10);
    return year >= 1900 && year <= 2099;
  }

  // 8-digit YYYYMMDD
  if (digits.length === 8) {
    const year = parseInt(digits.slice(0, 4), 10);
    const month = parseInt(digits.slice(4, 6), 10);
    const day = parseInt(digits.slice(6, 8), 10);
    return year >= 1900 && year <= 2099 && month >= 1 && month <= 12 && day >= 1 && day <= 31;
  }

  return false;
}