feat: add AI email extraction with Cloudflare Workers AI

Add AI-powered email content extraction feature using Cloudflare Workers AI to automatically identify and extract important information from emails including verification codes, authentication links, service links, and subscription links.

Features:
- AI extraction with priority-based logic (auth_code > auth_link > service_link > subscription_link > other_link)
- Admin allowlist configuration with wildcard support (*@example.com)
- Frontend display in both email list (compact) and detail view (full mode)
- Bilingual documentation (Chinese/English)
- Database migration: add metadata field to raw_mails (v0.0.3 -> v0.0.4)

Technical highlights:
- Proper regex escaping for wildcard pattern matching
- Content truncation to avoid AI token limits
- Error handling that won't affect email receiving
- JSON schema validation for AI responses
- Type-safe TypeScript implementation
- Vue I18n support with special character escaping

References:
- Inspired by Alle Project: https://github.com/bestruirui/Alle
- Uses Cloudflare Workers AI JSON Mode

🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Dream Hunter
2025-12-06 16:28:19 +08:00
committed by GitHub
parent a2a9f9e25f
commit dbb55d948f
27 changed files with 2473 additions and 1637 deletions

View File

@@ -11,26 +11,26 @@
"build": "wrangler deploy --dry-run --outdir dist --minify"
},
"devDependencies": {
"@cloudflare/workers-types": "^4.20251111.0",
"@cloudflare/workers-types": "^4.20251205.0",
"@eslint/js": "9.18.0",
"@simplewebauthn/types": "10.0.0",
"@types/node": "^22.19.1",
"eslint": "9.18.0",
"globals": "^15.15.0",
"typescript-eslint": "^8.46.4",
"wrangler": "^4.47.0"
"typescript-eslint": "^8.48.1",
"wrangler": "^4.53.0"
},
"dependencies": {
"@aws-sdk/client-s3": "3.888.0",
"@aws-sdk/s3-request-presigner": "3.888.0",
"@simplewebauthn/server": "10.0.1",
"hono": "^4.10.5",
"hono": "^4.10.7",
"jsonpath-plus": "^10.3.0",
"mimetext": "^3.0.27",
"postal-mime": "^2.6.0",
"postal-mime": "^2.6.1",
"resend": "^4.8.0",
"telegraf": "4.16.3",
"worker-mailer": "^1.2.0"
"worker-mailer": "^1.2.1"
},
"pnpm": {
"patchedDependencies": {

833
worker/pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,27 @@
import { Context } from "hono";
import { CONSTANTS } from "../constants";
import { getJsonSetting, saveSetting } from "../utils";
/**
 * Admin-configurable settings for AI email extraction.
 * Persisted as a JSON string under CONSTANTS.AI_EXTRACT_SETTINGS_KEY.
 */
export type AiExtractSettings = {
// Whether the allowlist below should be consulted at all.
enableAllowList: boolean;
// Address patterns (e.g. "*@example.com") for which extraction is permitted.
allowList: string[];
}
/**
 * Admin API handler: respond with the stored AI extraction settings.
 *
 * When no usable settings are stored, a default object with the allowlist
 * disabled and no entries is returned instead.
 *
 * @param c - Hono request context
 * @returns JSON response containing the settings
 */
async function getAiExtractSettings(c: Context<HonoCustomType>): Promise<Response> {
    const stored = await getJsonSetting<AiExtractSettings>(c, CONSTANTS.AI_EXTRACT_SETTINGS_KEY);
    if (stored) {
        return c.json(stored);
    }
    const defaults: AiExtractSettings = { enableAllowList: false, allowList: [] };
    return c.json(defaults);
}
/**
 * Admin API handler: validate and persist the AI extraction settings.
 *
 * The request body must be a JSON object with a boolean `enableAllowList`
 * and an `allowList` array of strings. Anything else is rejected with HTTP 400
 * so that a malformed value can never be stored and later break allowlist
 * matching at mail-receive time. Only the known fields are persisted;
 * allowlist entries are trimmed and blank entries are dropped.
 *
 * @param c - Hono request context
 * @returns `{ success: true }` on success, or a 400 text response on invalid input
 */
async function saveAiExtractSettings(c: Context<HonoCustomType>): Promise<Response> {
    let payload: unknown;
    try {
        payload = await c.req.json();
    } catch {
        return c.text("Invalid JSON body", 400);
    }
    if (!payload || typeof payload !== 'object') {
        return c.text("Invalid settings", 400);
    }

    const { enableAllowList, allowList } = payload as Record<string, unknown>;
    if (typeof enableAllowList !== 'boolean') {
        return c.text("Invalid enableAllowList value", 400);
    }
    if (
        !Array.isArray(allowList)
        || !allowList.every((entry: unknown): entry is string => typeof entry === 'string')
    ) {
        return c.text("Invalid allowList value", 400);
    }

    // Rebuild the object explicitly so unknown keys from the client are not stored.
    const settings: AiExtractSettings = {
        enableAllowList,
        allowList: allowList
            .map((entry: string) => entry.trim())
            .filter((entry: string) => entry.length > 0),
    };
    await saveSetting(c, CONSTANTS.AI_EXTRACT_SETTINGS_KEY, JSON.stringify(settings));
    return c.json({ success: true })
}
// Handlers for reading and saving the AI extraction settings.
export default {
getAiExtractSettings,
saveAiExtractSettings,
}

View File

@@ -9,6 +9,7 @@ CREATE TABLE IF NOT EXISTS raw_mails (
source TEXT,
address TEXT,
raw TEXT,
metadata TEXT,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
@@ -142,8 +143,11 @@ export default {
const query = `ALTER TABLE address ADD password TEXT;`
await c.env.DB.exec(query);
}
if (version == "v0.0.3") {
// migration from v0.0.3 to v0.0.4
await c.env.DB.exec(`ALTER TABLE raw_mails ADD COLUMN metadata TEXT;`);
}
if (version != CONSTANTS.DB_VERSION) {
// TODO: Perform migration logic here
// remove all \r and \n characters from the query string
// split by ; and join with a ;\n
const query = DB_INIT_QUERIES.replace(/[\r\n]/g, "")

View File

@@ -15,6 +15,7 @@ import admin_mail_api from './admin_mail_api'
import { sendMailbyAdmin } from './send_mail'
import db_api from './db_api'
import ip_blacklist_settings from './ip_blacklist_settings'
import ai_extract_settings from './ai_extract_settings'
import { EmailRuleSettings } from '../models'
export const api = new Hono<HonoCustomType>()
@@ -377,3 +378,7 @@ api.post('admin/db_migration', db_api.migrate);
// IP blacklist settings
api.get("/admin/ip_blacklist/settings", ip_blacklist_settings.getIpBlacklistSettings);
api.post("/admin/ip_blacklist/settings", ip_blacklist_settings.saveIpBlacklistSettings);
// AI extract settings
api.get("/admin/ai_extract/settings", ai_extract_settings.getAiExtractSettings);
api.post("/admin/ai_extract/settings", ai_extract_settings.saveAiExtractSettings);

View File

@@ -3,7 +3,7 @@ export const CONSTANTS = {
// DB Version
DB_VERSION_KEY: 'db_version',
DB_VERSION: "v0.0.3",
DB_VERSION: "v0.0.4",
// DB settings
ADDRESS_BLOCK_LIST_KEY: 'address_block_list',
@@ -16,6 +16,7 @@ export const CONSTANTS = {
EMAIL_RULE_SETTINGS_KEY: 'email_rule_settings',
ROLE_ADDRESS_CONFIG_KEY: 'role_address_config',
IP_BLACKLIST_SETTINGS_KEY: 'ip_blacklist_settings',
AI_EXTRACT_SETTINGS_KEY: 'ai_extract_settings',
// KV
TG_KV_PREFIX: "temp-mail-telegram",

View File

@@ -0,0 +1,235 @@
/**
* AI Email Extraction Module
*
* This module provides email content analysis using Cloudflare Workers AI.
* It extracts important information like verification codes, authentication links,
* service links, and subscription management links from email content.
*/
import { commonParseMail } from "../common";
import { getBooleanValue, getJsonSetting } from "../utils";
import { CONSTANTS } from "../constants";
import { Context } from "hono";
import type { AiExtractSettings } from "../admin_api/ai_extract_settings";
// Instruction prompt for email analysis.
// It tells the model to understand the email first, then pick exactly ONE item
// by priority (auth_code > auth_link > service_link > subscription_link >
// other_link > none) and answer with a JSON object containing
// `type`, `result` and `result_text`.
// NOTE: this text is runtime behavior; any edit changes what the model is asked to do.
const PROMPT = `
You are an expert email analyzer. Your task is to first UNDERSTAND the email content, then EXTRACT the most relevant information based on priority.
# Step 1: UNDERSTAND the Email
Read the entire email carefully and determine its:
- Overall purpose (verification, marketing, notification, etc.)
- Key context and situation
- What the sender wants the recipient to do
- Any security-sensitive content
# Step 2: EXTRACT Based on Priority
After understanding, extract the most important item according to this priority order:
**Priority 1: auth_code (Authentication Code)**
- Numeric or alphanumeric codes used for login verification
- Keywords: verification code, OTP, security code, confirmation code, auth code, 验证码, 校验码
- Extract ONLY the code itself (remove spaces, hyphens, etc.)
- Example: "123456" from "Your verification code is 123-456"
**Priority 2: auth_link (Authentication Link)**
- Links used for login, email verification, account activation, or password reset
- Keywords: verify, confirm, activate, login, signin, signup, reset, 验证, 激活, 登录
- Must be a real, complete URL (http:// or https://)
- Never fabricate or infer links that don't exist in the content
- Example: "https://example.com/verify?token=abc123"
**Priority 3: service_link (Service Link)**
- Links related to specific services or actions
- Keywords: commit, pull request, issue, repository, deployment, GitHub, GitLab, code review
- Real URLs for technical or service-related notifications
- Example: GitHub commit link, deployment notification link
**Priority 4: subscription_link (Subscription Management Link)**
- Links for managing email subscriptions, typically unsubscribe
- Keywords: unsubscribe, opt-out, manage preferences, 退订, 取消订阅
- Usually found at the bottom of marketing emails
- Real URLs for subscription control
**Priority 5: other_link (Other Valuable Link)**
- Any other link that might be useful or important
- Only extract if no higher-priority items exist
- Must be a real, complete URL from the content
**Priority 6: none**
- No relevant codes, links, or valuable content found
- Email appears to be plain text or irrelevant
# Special Case: Markdown Link Format
If the extracted content is in markdown link format [text](url):
- Extract the text inside the brackets as result_text
- When brackets are empty, analyze the email context and language
- Generate a concise, meaningful description (2-5 words) for result_text
- Match the email's language (Chinese → Chinese description, English → English)
# Critical Rules
1. **Understand First**: Always analyze the email's purpose before extracting
2. **Single Selection**: Choose ONLY ONE type based on the highest priority match
3. **Real Data Only**: Never invent, guess, or fabricate content
4. **Complete URLs**: Links must be full, valid URLs as they appear in the email
5. **Clean Extraction**: Return only the raw extracted content, no extra text
# Output Format (JSON only)
{
"type": "auth_code|auth_link|service_link|subscription_link|other_link|none",
"result": "the extracted code/link OR empty string",
"result_text": "the display text from markdown-format links."
}
IMPORTANT: Return ONLY the JSON, no explanations or additional text.
`;
/**
 * The extraction categories the model is allowed to return.
 * Used both for the JSON schema sent to the model and for validating its reply.
 */
const EXTRACT_RESULT_TYPES: readonly string[] = [
    'auth_code', 'auth_link', 'service_link', 'subscription_link', 'other_link', 'none',
];

/**
 * Convert a raw model response into a validated ExtractResult.
 *
 * The response may arrive as a JSON string or as an already-parsed object.
 * The schema passed to the model is only a request, so the shape is checked
 * again here before the value is trusted.
 *
 * @param payload - The `response` field returned by the AI binding
 * @returns The validated extraction result
 * @throws Error (or SyntaxError from JSON.parse) when the payload is malformed
 */
function parseExtractResult(payload: unknown): ExtractResult {
    const data: unknown = typeof payload === 'string' ? JSON.parse(payload) : payload;
    if (!data || typeof data !== 'object') {
        throw new Error('Unexpected response format from Cloudflare AI');
    }
    const { type, result, result_text } = data as Record<string, unknown>;
    if (typeof type !== 'string' || !EXTRACT_RESULT_TYPES.includes(type)) {
        throw new Error(`Unexpected extraction type from Cloudflare AI: ${String(type)}`);
    }
    if (typeof result !== 'string') {
        throw new Error('Unexpected extraction result from Cloudflare AI');
    }
    return {
        type: type as ExtractResult['type'],
        result: result.trim(),
        // result_text is display-only; tolerate a missing or non-string value.
        result_text: typeof result_text === 'string' ? result_text : '',
    };
}

/**
 * Extract important information from email content using Cloudflare Workers AI
 *
 * Sends PROMPT as the system message and the email content as the user message,
 * requesting a JSON-schema-constrained reply, then validates that reply.
 *
 * @param content - The email content to analyze (plain text or HTML)
 * @param env - Cloudflare Workers environment bindings
 * @returns Promise<ExtractResult> - The extracted information
 * @throws Error when the AI call fails or its response is not a valid ExtractResult
 */
async function extractWithCloudflareAI(
    content: string,
    env: Bindings
): Promise<ExtractResult> {
    // Get the AI model name from environment variable or use default
    const modelName = env.AI_EXTRACT_MODEL || '@cf/meta/llama-3.1-8b-instruct';
    const result = await env.AI.run(modelName as keyof AiModels, {
        messages: [
            { role: 'system', content: PROMPT },
            { role: 'user', content },
        ],
        response_format: {
            type: 'json_schema',
            json_schema: {
                type: 'object',
                properties: {
                    type: {
                        type: 'string',
                        enum: [...EXTRACT_RESULT_TYPES]
                    },
                    result: { type: 'string' },
                    result_text: { type: 'string' },
                },
                required: ['type', 'result', 'result_text'],
            },
        },
        stream: false,
    });
    // @ts-expect-error result.response
    const response = result.response;
    return parseExtractResult(response);
}
/**
 * Check whether an address matches any allowlist pattern.
 * A pattern containing `*` is treated as a wildcard (any run of characters);
 * any other pattern must equal the address exactly. Non-string entries are ignored.
 */
function isAddressInAllowList(address: string, patterns: readonly unknown[]): boolean {
    return patterns.some(pattern => {
        if (typeof pattern !== 'string') {
            return false;
        }
        if (!pattern.includes('*')) {
            return address === pattern;
        }
        // Escape regex metacharacters except `*`, then turn each run of `*` into `.*`.
        // Collapsing consecutive wildcards keeps the same meaning with a simpler regex.
        const source = pattern
            .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
            .replace(/\*+/g, '.*');
        return new RegExp('^' + source + '$').test(address);
    });
}

/**
 * Main extraction function
 * Checks if AI extraction is enabled, processes the email content, and saves to database
 *
 * Extraction is best-effort: every error is caught and logged, so this
 * function never rejects and cannot interrupt the caller.
 *
 * Steps: feature flag and AI binding checks -> message id check -> optional
 * allowlist check -> parse mail -> truncate content -> call the model ->
 * store the result in raw_mails.metadata when something useful was found.
 *
 * @param parsedEmailContext - The parsed email context
 * @param env - Cloudflare Workers environment bindings
 * @param message_id - The email message ID; when missing, extraction is skipped
 *                     because the result could not be attached to any row
 * @param address - The recipient email address
 * @returns Promise<void>
 */
export async function extractEmailInfo(
    parsedEmailContext: ParsedEmailContext,
    env: Bindings,
    message_id: string | null,
    address: string
): Promise<void> {
    try {
        // Check if AI extraction is enabled via environment variable
        if (!getBooleanValue(env.ENABLE_AI_EMAIL_EXTRACT)) {
            return;
        }
        // Ensure AI binding is available
        if (!env.AI) {
            console.error('AI binding not available');
            return;
        }
        // Without a message id the UPDATE below cannot match a row
        // (`message_id = NULL` is never true in SQL), so skip the AI call entirely.
        if (!message_id) {
            console.log(`AI extraction skipped for ${address}: missing message_id`);
            return;
        }

        // Check allowlist if enabled
        const aiSettings = await getJsonSetting<AiExtractSettings>(
            { env: env } as Context<HonoCustomType>,
            CONSTANTS.AI_EXTRACT_SETTINGS_KEY
        );
        const allowList: unknown = aiSettings?.allowList;
        if (aiSettings?.enableAllowList && Array.isArray(allowList) && allowList.length > 0) {
            if (!isAddressInAllowList(address, allowList)) {
                console.log(`AI extraction skipped for ${address}: not in allowlist`);
                return;
            }
        }

        // Parse email to get content
        const parsedEmail = await commonParseMail(parsedEmailContext);
        const emailContent = parsedEmail?.text || parsedEmail?.html || "";
        if (!emailContent) {
            return;
        }

        // Truncate content if too long (max 4000 characters to avoid token limits)
        const truncatedContent = emailContent.length > 4000
            ? emailContent.substring(0, 4000) + '...[truncated]'
            : emailContent;
        const result = await extractWithCloudflareAI(truncatedContent, env);

        // If extraction found something useful, save it to database
        if (result.type !== 'none' && result.result) {
            const metadata = JSON.stringify({
                ai_extract: result,
                extracted_at: new Date().toISOString()
            });
            // Update the raw_mails record with metadata
            await env.DB.prepare(
                `UPDATE raw_mails SET metadata = ? WHERE message_id = ?`
            ).bind(metadata, message_id).run();
            console.log(`AI extraction completed for ${message_id}: ${result.type}`);
        }
    } catch (e) {
        console.error('AI email extraction error:', e);
    }
}
/**
 * Type definition for extraction result
 *
 * This is the shape requested from the model and stored (under `ai_extract`)
 * in the raw_mails.metadata JSON.
 */
export type ExtractResult = {
// Category of the extracted item; 'none' means nothing useful was found.
type: 'auth_code' | 'auth_link' | 'service_link' | 'subscription_link' | 'other_link' | 'none';
// The extracted code or link itself (the prompt asks for an empty string when none).
result: string;
// Display text for the link, per the prompt's markdown-link rule.
result_text: string;
};

View File

@@ -7,6 +7,7 @@ import { isBlocked } from "./black_list";
import { triggerWebhook, triggerAnotherWorker, commonParseMail } from "../common";
import { check_if_junk_mail } from "./check_junk";
import { remove_attachment_if_need } from "./check_attachment";
import { extractEmailInfo } from "./ai_extract";
import { EmailRuleSettings } from "../models";
import { CONSTANTS } from "../constants";
@@ -155,6 +156,9 @@ async function email(message: ForwardableEmailMessage, env: Bindings, ctx: Execu
// auto reply email
await auto_reply(message, env);
// AI email content extraction
await extractEmailInfo(parsedEmailContext, env, message_id, message.to);
}
export { email }

View File

@@ -11,6 +11,7 @@ type Bindings = {
RATE_LIMITER: any
SEND_MAIL: any
ASSETS: Fetcher
AI: Ai
// config
DEFAULT_LANG: string | undefined
@@ -86,6 +87,10 @@ type Bindings = {
// webhook config
FRONTEND_URL: string | undefined
// AI extraction config
ENABLE_AI_EMAIL_EXTRACT: string | boolean | undefined
AI_EXTRACT_MODEL: string | undefined
}
type JwtPayload = {

View File

@@ -107,6 +107,10 @@ ENABLE_AUTO_REPLY = false
# REMOVE_EXCEED_SIZE_ATTACHMENT = true
# remove all attachments; the mail may be missing some information due to parsing
# REMOVE_ALL_ATTACHMENT = true
# AI email extraction, automatically extract verification codes, auth links, etc.
# ENABLE_AI_EMAIL_EXTRACT = true
# AI model name, choose from https://developers.cloudflare.com/workers-ai/models/#text-generation
# AI_EXTRACT_MODEL = "@cf/meta/llama-3.1-8b-instruct"
# Call another worker to process the email
# ENABLE_ANOTHER_WORKER = false
# ANOTHER_WORKER_LIST = """
@@ -127,6 +131,10 @@ binding = "DB"
database_name = "xxx"
database_id = "xxx"
# Workers AI binding (required for AI email extraction)
# [ai]
# binding = "AI"
# kv config for send email verification code
# [[kv_namespaces]]
# binding = "KV"