mirror of
https://github.com/dreamhunter2333/cloudflare_temp_email.git
synced 2026-06-21 15:37:01 +08:00
feat: regex fallback for verification code extraction without Workers AI (#1048)
feat: add regex fallback for verification code extraction without Workers AI. When AI email extraction is enabled but no Workers AI binding is available, fall back to a built-in, zero-dependency regex extractor so self-hosted deployments without Workers AI still surface verification codes in Telegram notifications and webhooks. Changes: - Add worker/src/email/extract_code.ts: rule-based multilingual (English / Chinese / Japanese / Korean) verification-code extractor with year and YYYYMMDD date rejection to avoid false positives. - ai_extract.ts: share the allowlist check and content parsing across both paths, extract a saveExtractMetadata helper, and use the regex fallback when env.AI is absent. - Reuse the existing aiExtractResult pipeline (auth_code type), so Telegram and webhook output need no changes. - Update bilingual CHANGELOG and AI-extract feature docs.
This commit is contained in:
@@ -10,6 +10,7 @@
|
||||
|
||||
### Features
|
||||
|
||||
- feat: |AI 识别| 未配置 Workers AI 绑定时,自动回退到内置正则提取验证码(支持中英日韩,并排除年份与 `YYYYMMDD` 日期误判),让无 Workers AI 的自部署用户也能在 Telegram 推送与 Webhook 中拿到验证码
|
||||
- feat: |Telegram| Telegram 新邮件推送与 `/mails` 历史邮件查看支持展示 AI 提取结果,包含验证码、验证链接、服务链接、订阅链接等关键信息
|
||||
- feat: |Webhook| 邮件 Webhook 模板支持填充 AI 提取结果占位符,包括 `aiExtractType`、`aiExtractResult`、`aiExtractResultText`
|
||||
- feat: |Frontend| 新增 `DISABLE_SHOW_GITHUB_FOR_USER` 配置,可仅对普通用户隐藏 Header 的 GitHub/版本入口,admin 仍可见(issue #1041)
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
### Features
|
||||
|
||||
- feat: |AI Extract| Fall back to a built-in regex verification-code extractor (English / Chinese / Japanese / Korean, with year and `YYYYMMDD` date rejection) when no Workers AI binding is configured, so self-hosted deployments without Workers AI still surface codes in Telegram pushes and webhooks
|
||||
- feat: |Telegram| Show AI extraction results in Telegram new-mail notifications and `/mails` history views, including verification codes, auth links, service links, and subscription links
|
||||
- feat: |Webhook| Support AI extraction placeholders in mail webhook templates, including `aiExtractType`, `aiExtractResult`, and `aiExtractResultText`
|
||||
- feat: |Frontend| Add `DISABLE_SHOW_GITHUB_FOR_USER` to hide the Header GitHub/version entry from normal users while keeping it visible to admin users (issue #1041)
|
||||
|
||||
@@ -43,6 +43,18 @@ Or add in Cloudflare Dashboard Worker settings:
|
||||
- **Variable name**: `AI`
|
||||
- **Type**: Workers AI
|
||||
|
||||
## Fallback Without a Workers AI Binding
|
||||
|
||||
If `ENABLE_AI_EMAIL_EXTRACT` is enabled but **no Workers AI binding is configured** (e.g. a self-hosted deployment without Workers AI), the system automatically falls back to a built-in **regex verification-code extractor**:
|
||||
|
||||
- Extracts **verification codes** (`auth_code`) only; links are not extracted (link extraction requires AI)
|
||||
- Zero dependency, zero cost, runs locally inside the Worker
|
||||
- Supports common verification-code formats in English, Chinese, Japanese and Korean
|
||||
- Rejects years (e.g. `2026`) and `YYYYMMDD` dates to reduce false positives
|
||||
- Results are written to `metadata` and reuse the same Telegram / webhook placeholders (`aiExtractType` is `auth_code` in this case)
|
||||
|
||||
When a Workers AI binding is configured, AI extraction is still preferred (recognizing both codes and links) and this fallback does not apply.
|
||||
|
||||
## Address Allowlist (Optional)
|
||||
|
||||
To control costs and resource usage, you can configure an address allowlist in the Admin console's **AI Extract Settings** page:
|
||||
|
||||
@@ -43,6 +43,18 @@ binding = "AI"
|
||||
- **Variable name**: `AI`
|
||||
- **Type**: Workers AI
|
||||
|
||||
## 无 Workers AI 绑定时的正则兜底
|
||||
|
||||
如果启用了 `ENABLE_AI_EMAIL_EXTRACT` 但**没有配置 Workers AI 绑定**(例如自部署时未开通 Workers AI),系统会自动回退到内置的**正则验证码提取**:
|
||||
|
||||
- 仅提取**验证码**(`auth_code`),不提取链接(链接提取依赖 AI)
|
||||
- 零依赖、零成本,在 Worker 内本地完成
|
||||
- 支持中文、英文、日文、韩文常见验证码格式
|
||||
- 自动排除年份(如 `2026`)与 `YYYYMMDD` 日期,降低误判
|
||||
- 提取结果同样写入 `metadata`,并复用 Telegram 推送与 Webhook 占位符(此时 `aiExtractType` 为 `auth_code`)
|
||||
|
||||
当配置了 Workers AI 绑定时,仍优先使用 AI 提取(可识别验证码与各类链接),不受此回退影响。
|
||||
|
||||
## 地址白名单(可选)
|
||||
|
||||
为了控制成本和资源使用,可以在 Admin 控制台的 **AI 提取设置** 页面配置地址白名单:
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
*/
|
||||
|
||||
import { commonParseMail } from "../common";
|
||||
import { extractCode } from "./extract_code";
|
||||
import { getBooleanValue, getJsonSetting } from "../utils";
|
||||
import { CONSTANTS } from "../constants";
|
||||
import { Context } from "hono";
|
||||
@@ -137,9 +138,39 @@ async function extractWithCloudflareAI(
|
||||
throw new Error('Unexpected response format from Cloudflare AI');
|
||||
}
|
||||
|
||||
/**
 * Persist an extraction result to the `metadata` column of `raw_mails`.
 * Shared by the Workers AI path and the regex fallback path.
 *
 * The stored value is a JSON document `{ ai_extract, extracted_at }`; it
 * replaces whatever the column held before for the matching row.
 *
 * This is best-effort: a serialization or database failure is logged and
 * swallowed so that a metadata problem never aborts mail processing.
 *
 * @param env - Cloudflare Workers environment bindings (uses `env.DB`)
 * @param message_id - The email message ID; when null/undefined nothing is written
 * @param result - The extraction result to persist
 */
async function saveExtractMetadata(
    env: Bindings,
    message_id: string | null,
    result: ExtractResult
): Promise<void> {
    // In SQL, `message_id = NULL` is never true, so an UPDATE keyed on a
    // missing id cannot touch any row. Skip the pointless database round-trip.
    if (message_id == null) {
        return;
    }

    try {
        const metadata = JSON.stringify({
            ai_extract: result,
            extracted_at: new Date().toISOString()
        });

        // Update the raw_mails record with metadata
        await env.DB.prepare(
            `UPDATE raw_mails SET metadata = ? WHERE message_id = ?`
        ).bind(metadata, message_id).run();
    } catch (e) {
        console.error('AI extraction metadata save error:', e);
    }
}
|
||||
|
||||
/**
|
||||
* Main extraction function
|
||||
* Checks if AI extraction is enabled, processes the email content, and saves to database
|
||||
* Checks if extraction is enabled, processes the email content, and saves to database.
|
||||
* Uses Cloudflare Workers AI when the `AI` binding is available; otherwise falls back
|
||||
* to a built-in regex extractor that surfaces verification codes only.
|
||||
*
|
||||
* @param parsedEmailContext - The parsed email context
|
||||
* @param env - Cloudflare Workers environment bindings
|
||||
@@ -154,18 +185,12 @@ export async function extractEmailInfo(
|
||||
address: string
|
||||
): Promise<ExtractResult | null> {
|
||||
try {
|
||||
// Check if AI extraction is enabled via environment variable
|
||||
// Check if extraction is enabled via environment variable
|
||||
if (!getBooleanValue(env.ENABLE_AI_EMAIL_EXTRACT)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Ensure AI binding is available
|
||||
if (!env.AI) {
|
||||
console.error('AI binding not available');
|
||||
return null;
|
||||
}
|
||||
|
||||
// Check allowlist if enabled
|
||||
// Check allowlist if enabled (applies to both AI and the regex fallback)
|
||||
const aiSettings = await getJsonSetting<AiExtractSettings>(
|
||||
{ env: env } as Context<HonoCustomType>,
|
||||
CONSTANTS.AI_EXTRACT_SETTINGS_KEY
|
||||
@@ -187,12 +212,12 @@ export async function extractEmailInfo(
|
||||
});
|
||||
|
||||
if (!isAllowed) {
|
||||
console.log(`AI extraction skipped for ${address}: not in allowlist`);
|
||||
console.log(`Email extraction skipped for ${address}: not in allowlist`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// Parse email to get content
|
||||
// Parse email to get content (shared by the AI path and the regex fallback)
|
||||
const parsedEmail = await commonParseMail(parsedEmailContext);
|
||||
const emailContent = parsedEmail?.text || parsedEmail?.html || "";
|
||||
|
||||
@@ -200,6 +225,20 @@ export async function extractEmailInfo(
|
||||
return null;
|
||||
}
|
||||
|
||||
// Fallback: when no Workers AI binding is available, use a built-in regex
|
||||
// extractor so self-hosted deployments without Workers AI still surface
|
||||
// verification codes. Telegram / webhook reuse the same ExtractResult.
|
||||
if (!env.AI) {
|
||||
const code = extractCode(emailContent);
|
||||
if (!code) {
|
||||
return null;
|
||||
}
|
||||
const result: ExtractResult = { type: 'auth_code', result: code, result_text: '' };
|
||||
await saveExtractMetadata(env, message_id, result);
|
||||
console.log(`Regex code extraction completed for ${message_id}`);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Truncate content if too long (max 4000 characters to avoid token limits)
|
||||
const truncatedContent = emailContent.length > 4000
|
||||
? emailContent.substring(0, 4000) + '...[truncated]'
|
||||
@@ -209,21 +248,8 @@ export async function extractEmailInfo(
|
||||
|
||||
// If extraction found something useful, save it to database
|
||||
if (result.type !== 'none' && result.result) {
|
||||
try {
|
||||
const metadata = JSON.stringify({
|
||||
ai_extract: result,
|
||||
extracted_at: new Date().toISOString()
|
||||
});
|
||||
|
||||
// Update the raw_mails record with metadata
|
||||
await env.DB.prepare(
|
||||
`UPDATE raw_mails SET metadata = ? WHERE message_id = ?`
|
||||
).bind(metadata, message_id).run();
|
||||
|
||||
console.log(`AI extraction completed for ${message_id}: ${result.type}`);
|
||||
} catch (e) {
|
||||
console.error('AI extraction metadata save error:', e);
|
||||
}
|
||||
await saveExtractMetadata(env, message_id, result);
|
||||
console.log(`AI extraction completed for ${message_id}: ${result.type}`);
|
||||
}
|
||||
return result;
|
||||
} catch (e) {
|
||||
|
||||
80
worker/src/email/extract_code.ts
Normal file
80
worker/src/email/extract_code.ts
Normal file
@@ -0,0 +1,80 @@
|
||||
/**
 * Regex-based verification code extraction.
 *
 * Extracts verification codes (4-12 digit / alphanumeric) from email text.
 * Covers common formats across English, Chinese, Japanese and Korean.
 *
 * This is a zero-dependency fallback used when Workers AI is not bound, so
 * self-hosted deployments without Workers AI still surface verification codes.
 *
 * Design principles:
 *   1. Prefer codes that appear after an explicit code keyword.
 *   2. Reject plausible dates/years (e.g. "2026", "20260411") — too common as
 *      date markers in subject lines, almost never a real verification code.
 *   3. Require an explicit delimiter (a colon, "is", or a CJK particle)
 *      between the keyword and the code.
 *   4. Reject letters-only candidates that contain lowercase letters: they
 *      are ordinary words ("code is below"), not codes.
 *   5. Examine EVERY match of a pattern, not just the first one, so an
 *      early year/date/word does not hide a real code later in the text.
 *   6. Fall back to standalone digits ONLY if no keyword-guided match is
 *      found AND the digits don't look like a year or YYYYMMDD date.
 */

// Delimiter that must follow the code keyword before the code itself: an
// ASCII or full-width (U+FF1A) colon, the word "is", or a common CJK
// particle, optionally followed by more whitespace/colons.
//
// Why mandatory: without a delimiter, "verification code Your email" would
// capture "Your" as a false alphanumeric code. Bare "verification code 123456"
// without any delimiter still falls through to the standalone-digit fallback.
const CODE_DELIM = '\\s*(?:[:\\uFF1A]|\\bis\\b|是|为|です)[\\s:\\uFF1A]*';

// Keyword groups — labels that precede a verification code.
const CN_JA_KO_KEYWORDS = '验证码|认证码|确认码|認証コード|인증\\s*코드|코드';
const EN_KEYWORDS = 'verification\\s*code|confirm(?:ation)?\\s*code|security\\s*code|passcode|OTP|pin\\s*code';
const ANY_KEYWORD = `(?:${CN_JA_KO_KEYWORDS}|${EN_KEYWORDS})`;

// Keyword-guided patterns in priority order (numeric before alphanumeric).
// Compiled once at module load; the `g` flag is needed for `matchAll`, which
// works on an internal copy and therefore never mutates `lastIndex` here.
const KEYWORD_PATTERNS: readonly RegExp[] = [
    // 1. Bare "code" + delimiter + digits ("code is 123456" / "code: 123456")
    new RegExp(`\\bcode${CODE_DELIM}(\\d{4,12})\\b`, 'gi'),
    // 2. Full keyword + delimiter + digits
    new RegExp(`${ANY_KEYWORD}${CODE_DELIM}(\\d{4,12})\\b`, 'gi'),
    // 3. Bare "code" + delimiter + alphanumeric
    new RegExp(`\\bcode${CODE_DELIM}([A-Za-z0-9]{4,12})\\b`, 'gi'),
    // 4. Full keyword + delimiter + alphanumeric
    new RegExp(`${ANY_KEYWORD}${CODE_DELIM}([A-Za-z0-9]{4,12})\\b`, 'gi'),
];

// Standalone digit run delimited by whitespace / line edges / "." / ",".
// The trailing delimiter is a lookahead so it is not consumed: in
// "2026 123456" the single space must remain available as the leading
// delimiter of the second number.
const STANDALONE_DIGITS = /(?:^|\s)(\d{4,12})(?=\s|$|\.|,)/gm;

/**
 * Extract the most likely verification code from email text.
 *
 * @param text - Plain-text (or HTML) email content
 * @returns The extracted code, or `null` when nothing plausible is found
 */
export function extractCode(text: string): string | null {
    if (!text) {
        return null;
    }

    for (const pattern of KEYWORD_PATTERNS) {
        const code = firstPlausibleCapture(text, pattern);
        if (code !== null) {
            return code;
        }
    }

    // Fallback: standalone digit sequence, skipping year/date look-alikes.
    return firstPlausibleCapture(text, STANDALONE_DIGITS);
}

/**
 * Return capture group 1 of the first match of `pattern` whose capture
 * passes {@link isPlausibleCode}, scanning all matches left to right.
 */
function firstPlausibleCapture(text: string, pattern: RegExp): string | null {
    for (const match of text.matchAll(pattern)) {
        const candidate = match[1];
        if (candidate && isPlausibleCode(candidate)) {
            return candidate;
        }
    }
    return null;
}

/**
 * Heuristic filter for a captured candidate.
 * Rejects year/date look-alikes and plain words: a candidate without any
 * digit is only accepted when it has no lowercase letters (e.g. "ABCDEF"),
 * because lowercase/mixed-case letter runs after "code is" are almost
 * always prose ("below", "valid", "required").
 */
function isPlausibleCode(candidate: string): boolean {
    if (looksLikeDate(candidate)) {
        return false;
    }
    return /\d/.test(candidate) || !/[a-z]/.test(candidate);
}

/**
 * Heuristic: does this digit sequence look like a date/year we should reject?
 *   - 4 digits matching 19xx or 20xx → year (e.g. 2026)
 *   - 8 digits matching YYYYMMDD with plausible month/day → date (e.g. 20260411)
 *
 * Only all-digit input can be a date. This guard matters because `parseInt`
 * silently ignores trailing letters ("1a" → 1), which would otherwise make
 * an alphanumeric code such as "20261a1b" look like a valid YYYYMMDD.
 */
function looksLikeDate(digits: string): boolean {
    if (!/^\d+$/.test(digits)) {
        return false;
    }

    // 4-digit year: 1900-2099
    if (digits.length === 4) {
        const year = parseInt(digits, 10);
        return year >= 1900 && year <= 2099;
    }

    // 8-digit YYYYMMDD
    if (digits.length === 8) {
        const year = parseInt(digits.slice(0, 4), 10);
        const month = parseInt(digits.slice(4, 6), 10);
        const day = parseInt(digits.slice(6, 8), 10);
        return year >= 1900 && year <= 2099
            && month >= 1 && month <= 12
            && day >= 1 && day <= 31;
    }

    return false;
}
|
||||
Reference in New Issue
Block a user