feat(extractor): 实现智能字段提取器 smartExtractList
- 添加 smartExtractList() 智能列表提取函数
- 支持基于起始模式和结束关键词的列表边界识别
- 修复 insurance_period 和 withdrawal_modes 字段类型处理
- 优化 payment_periods 过滤逻辑,排除无效项

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Showing
2 changed files
with
670 additions
and
0 deletions
| 1 | +## [2026-02-15] - 智能字段提取器完善 | ||
| 2 | + | ||
| 3 | +### 新增 | ||
| 4 | +- 实现 `smartExtractList()` 智能列表提取函数 | ||
| 5 | +- 支持基于起始模式和结束关键词的列表边界识别 | ||
| 6 | + | ||
| 7 | +### 修复 | ||
| 8 | +- 修复 `insurance_period` 和 `withdrawal_modes` 字段的 `postProcess` 函数类型处理问题 | ||
| 9 | +- 优化 `payment_periods` 字段的过滤逻辑,排除"投保年龄"等无效项 | ||
| 10 | + | ||
| 11 | +--- | ||
| 12 | + | ||
| 13 | +**详细信息**: | ||
| 14 | +- **影响文件**: scripts/smart-field-extractor.js | ||
| 15 | +- **技术栈**: Node.js, 正则表达式, 智能提取 | ||
| 16 | +- **测试状态**: 单元测试通过 | ||
| 17 | +- **备注**: 提升了字段提取的准确性和健壮性 | ||
| 18 | + | ||
| 19 | +--- | ||
| 20 | + | ||
| 21 | +## [2026-02-14] - 文档解析使用说明完善 | ||
| 22 | + | ||
| 23 | +### 优化 | ||
| 24 | +- 补充解析链路与使用思路,明确审核与合并边界 | ||
| 25 | +- 更新解析命令说明与默认行为 | ||
| 26 | + | ||
| 27 | +--- | ||
| 28 | + | ||
| 29 | +**详细信息**: | ||
| 30 | +- **影响文件**: docs/to-parse/README.md, README.md | ||
| 31 | +- **技术栈**: 文档维护 | ||
| 32 | +- **测试状态**: 未运行(仅文档更新) | ||
| 33 | +- **备注**: 使用方式与链路更清晰 | ||
| 34 | + | ||
| 35 | +--- | ||
| 36 | + | ||
| 37 | +## [2026-02-14] - 文档解析审核流程落地 | ||
| 38 | + | ||
| 39 | +### 优化 | ||
| 40 | +- 修复审核模板重复定义与内容断裂,统一字段命名与展示结构 | ||
| 41 | +- 完善审核流程指引,明确 pending/approved 目录治理与合并步骤 | ||
| 42 | +- 默认解析仅生成待审核文件,写入配置需显式开启 | ||
| 43 | + | ||
| 44 | +--- | ||
| 45 | + | ||
| 46 | +**详细信息**: | ||
| 47 | +- **影响文件**: scripts/parse-docs.js, docs/to-parse/README.md, docs/tasks/plan/改进文档解析工具-添加审核流程.md, README.md | ||
| 48 | +- **技术栈**: Node.js, 文档维护 | ||
| 49 | +- **测试状态**: pnpm test 通过;pnpm lint 30 warnings | ||
| 50 | +- **备注**: 已生成待审核文件并完成可读性校验 | ||
| 51 | + | ||
| 52 | +--- | ||
| 53 | + | ||
| 1 | ## [2026-02-14] - 文档解析审核方案整理 | 54 | ## [2026-02-14] - 文档解析审核方案整理 |
| 2 | 55 | ||
| 3 | ### 优化 | 56 | ### 优化 | ... | ... |
scripts/smart-field-extractor.js
0 → 100644
| 1 | +/** | ||
| 2 | + * 智能字段提取器 | ||
| 3 | + * | ||
| 4 | + * @description 从保险产品文档中智能提取配置字段,支持中英文、繁简体 | ||
| 5 | + * @module scripts/smart-field-extractor | ||
| 6 | + * @author Claude Code | ||
| 7 | + * @created 2026-02-14 | ||
| 8 | + */ | ||
| 9 | + | ||
/**
 * Field extraction rule table.
 *
 * Each entry describes one config field:
 * - `priority`    extraction order (ascending)
 * - `patterns`    matchers tried in order; either a RegExp or a structured
 *                 matcher object carrying a `type` discriminator
 * - `fallback`    value used when no pattern matches
 * - `required`    whether a missing value must be reported
 * - `productType` optional list of product types the field applies to
 * - `postProcess` optional normaliser applied to the raw match
 */
const FIELD_RULES = {
  // Product name (required, no fallback value).
  product_name: {
    priority: 1,
    patterns: [
      /产品名称[::]\s*([^\n]+)/,
      /计划书名称[::]\s*([^\n]+)/,
      /Product\s+Name[::]\s*([^\n]+)/i,
      /^#\s+(.+)$/m // Markdown heading
    ],
    fallback: null,
    required: true
  },

  // Product type, inferred from keywords found anywhere in the document.
  product_type: {
    priority: 2,
    patterns: [
      {
        type: 'content_match',
        rules: [
          { keywords: ['储蓄', 'saving', '传承', '家传', '红利', '提取'], value: 'savings' },
          { keywords: ['重疾', 'critical', '守护', '严重疾病'], value: 'critical-illness' },
          { keywords: ['人寿', 'life', '创富', '身故保障'], value: 'life-insurance' }
        ]
      }
    ],
    fallback: 'savings',
    required: true
  },

  // Currency.
  currency: {
    priority: 3,
    patterns: [
      // Pick the currency whose symbol occurs most often.
      {
        type: 'count_match',
        rules: [
          // The lookbehind keeps "HK$" from also being counted as a bare "$";
          // without it the USD count is always >= the HKD count and, because
          // ties go to the first rule, HKD could never be selected.
          { pattern: /(?<!HK)\$/g, value: 'USD' },
          { pattern: /HK\$/g, value: 'HKD' },
          { pattern: /¥|人民币/g, value: 'CNY' },
          { pattern: /€/g, value: 'EUR' }
        ]
      },
      /币种[::]\s*(USD|CNY|HKD|EUR)/i,
      /Currency[::]\s*(USD|CNY|HKD|EUR)/i
    ],
    fallback: 'USD',
    required: true
  },

  // Premium payment periods.
  payment_periods: {
    priority: 4,
    patterns: [
      // Collect the list that follows a "年繳保費繳費年期" / "缴费年期" heading
      // (Traditional or Simplified), keeping lines that look like "N年" or
      // "整付" until one of the end keywords is reached.
      {
        type: 'smart_list_extract',
        startPattern: /(?:年[繳缴]保[費费])?[繳缴][費费]年期[::\s]*\n/,
        endKeywords: ['提取', '保險期間', '保险期间', '投保年龄', '投保年齡', '選是', '選項', 'GC宏', 'FA宏', 'LV2'],
        itemFilter: (line) => {
          const trimmed = line.trim()
          if (!trimmed) return false
          // Reject lines that talk about the entry age rather than a period.
          if (trimmed.includes('投保') || trimmed.includes('年龄') || trimmed.includes('年齡')) {
            return false
          }
          // Accept exactly "整付", "N年", or a bulleted "- N年".
          return trimmed === '整付' || /^(?:[-•·]\s*)?\d+\s*年$/.test(trimmed)
        }
      }
    ],
    fallback: ['整付', '3年', '5年'],
    required: true,
    postProcess: (values) => {
      const normalized = values
        .map(v => v.trim())
        // Drop entry-age lines and anything that is not a period.
        .filter(v => v && !v.includes('投保') && !v.includes('年龄') && !v.includes('年齡'))
        .filter(v => v.includes('年') || v.includes('整付'))
        .map(v => {
          // Canonical form is "N年" or "整付".
          const match = v.match(/(\d+)\s*年|整付/i)
          if (match) {
            return match[0].includes('整付') ? '整付' : `${match[1]}年`
          }
          return v
        })

      // De-duplicate; "整付" sorts first, the rest ascending by year count.
      return [...new Set(normalized)].sort((a, b) => {
        if (a === '整付') return -1
        if (b === '整付') return 1
        return parseInt(a, 10) - parseInt(b, 10)
      })
    }
  },

  // Entry age range.
  age_range: {
    priority: 5,
    patterns: [
      // Labelled ranges are tried first: the unlabelled pattern below would
      // otherwise capture any earlier "digits-digits" text such as a date.
      /投保年[龄齡][::]\s*(\d+)\s*[-~至]\s*(\d+)\s*[岁歲]?/,
      /年[龄齡][范範][围圍][::]\s*(\d+)\s*[-~至]\s*(\d+)\s*[岁歲]?/,
      // Last resort: any bare "0-75岁" style range.
      {
        type: 'range_extract',
        pattern: /(\d+)\s*[-~至]\s*(\d+)\s*岁?/
      }
    ],
    fallback: { min: 0, max: 75 },
    required: true,
    postProcess: (match) => {
      // Already a { min, max } object (range_extract result).
      if (match && typeof match === 'object' && match.min !== undefined) {
        return match
      }
      // Raw RegExp match array: groups 1 and 2 hold the bounds.
      if (Array.isArray(match) && match.length >= 2) {
        return { min: parseInt(match[1], 10), max: parseInt(match[2], 10) }
      }
      return null
    }
  },

  // Insurance period.
  insurance_period: {
    priority: 6,
    patterns: [
      /保險期間[::]\s*([^\n]+)/,
      /保险期间[::]\s*([^\n]+)/,
      /Insurance\s+Period[::]\s*([^\n]+)/i,
      /保障期间[::]\s*([^\n]+)/
    ],
    fallback: '终身',
    required: true,
    postProcess: (value) => {
      // Accept either a RegExp match array or a plain string.
      let str = value
      if (Array.isArray(value)) {
        str = value[1] || value[0] || ''
      }
      if (!str || typeof str !== 'string') return '终身'

      const normalized = str.trim()
      // Collapse the common "whole life" spellings into one value.
      if (normalized.includes('终身') || normalized.includes('終身') || normalized.toLowerCase().includes('whole life')) {
        return '终身'
      }
      return normalized
    }
  },

  // Withdrawal modes (savings products only).
  withdrawal_modes: {
    priority: 7,
    patterns: [
      {
        type: 'list_extract',
        pattern: /提取选项[::]\s*([^\n]+)/,
        itemPattern: /指定提取金额|最高固定提取金额/g
      },
      /提取方式[::]\s*([^\n]+)/
    ],
    fallback: ['年龄指定金额', '最高固定金额'],
    required: false,
    productType: ['savings'],
    postProcess: (values) => {
      // A RegExp match array: element 0 is the full match, element 1 the capture.
      if (Array.isArray(values) && values.length > 1 && typeof values[1] === 'string') {
        values = values[1]
      }

      if (typeof values === 'string') {
        // Pick the known modes out of a single line of text.
        const modes = []
        if (values.includes('指定提取金额')) modes.push('指定提取金额')
        if (values.includes('最高固定提取金额')) modes.push('最高固定提取金额')
        return modes.length > 0 ? modes : ['年龄指定金额', '最高固定金额']
      }
      return Array.isArray(values) ? values : ['年龄指定金额', '最高固定金额']
    }
  },

  // Withdrawal periods (savings products only).
  withdrawal_periods: {
    priority: 8,
    patterns: [
      {
        type: 'list_extract',
        // The section ends at the first blank line (LF or CRLF) or at end of input.
        pattern: /提取期[((]年[))][::]\s*([\s\S]*?)(?=\n\n|\r\n\r\n|$)/,
        itemPattern: /^\s*[-•·]\s*(\d+\s*年)|^\s*(\d+)\s*年\s*$/gm
      }
    ],
    fallback: ['1年', '3年', '5年', '10年'],
    required: false,
    productType: ['savings'],
    postProcess: (values) => {
      const normalized = values.map(v => {
        const match = v.match(/(\d+)\s*年/)
        return match ? `${match[1]}年` : v.trim()
      })
      return [...new Set(normalized)].sort((a, b) => parseInt(a, 10) - parseInt(b, 10))
    }
  }
}
| 228 | + | ||
/**
 * Extract a single field value from the document text.
 *
 * The field's patterns are tried in declaration order and the first one that
 * yields a truthy match wins. When nothing matches, the rule's fallback is
 * returned with `matched: false`.
 *
 * @param {string} content - Document text
 * @param {string} fieldName - Key into FIELD_RULES
 * @returns {{value: any, matched: boolean, pattern: string|null}} Extraction result
 */
function extractField(content, fieldName) {
  const rule = FIELD_RULES[fieldName]
  if (!rule) {
    return { value: null, matched: false, pattern: null }
  }

  // One entry per structured matcher type; each yields [rawMatch, description].
  const strategies = {
    content_match: (spec) => [
      matchByContent(content, spec.rules),
      `content_match(${spec.rules.length} rules)`
    ],
    count_match: (spec) => [
      matchByCount(content, spec.rules),
      `count_match(${spec.rules.length} rules)`
    ],
    list_extract: (spec) => [
      extractList(content, spec.pattern, spec.itemPattern),
      `list_extract`
    ],
    smart_list_extract: (spec) => [
      smartExtractList(content, spec.startPattern, spec.endKeywords, spec.itemFilter),
      `smart_list_extract`
    ],
    range_extract: (spec) => [
      extractRange(content, spec.pattern),
      `range_extract`
    ]
  }

  for (const candidate of rule.patterns) {
    let raw = null
    let description = ''

    if (typeof candidate === 'object' && candidate.type) {
      // Structured matcher; unknown types simply produce no match.
      if (Object.prototype.hasOwnProperty.call(strategies, candidate.type)) {
        [raw, description] = strategies[candidate.type](candidate)
      }
    } else if (candidate instanceof RegExp) {
      raw = content.match(candidate)
      description = candidate.toString()
    }

    if (!raw) continue

    let value = raw
    if (rule.postProcess) {
      value = rule.postProcess(raw)
    } else if (Array.isArray(raw) && raw.length > 1) {
      // Plain RegExp match: use the first capture group.
      value = raw[1]
    }

    return { value, matched: true, pattern: description }
  }

  // Nothing matched: hand back the configured default.
  return { value: rule.fallback, matched: false, pattern: null }
}
| 313 | + | ||
/**
 * Classify content by keyword presence.
 *
 * Matching is case-insensitive. Rules are checked in order and the first rule
 * with at least one keyword present in the content wins.
 *
 * @param {string} content - Document text
 * @param {Array<{keywords: string[], value: any}>} rules - Keyword rules
 * @returns {any|null} Value of the first matching rule, or null
 */
function matchByContent(content, rules) {
  const haystack = content.toLowerCase()

  const winner = rules.find(({ keywords }) =>
    keywords.some((keyword) => haystack.includes(keyword.toLowerCase()))
  )

  return winner ? winner.value : null
}
| 332 | + | ||
/**
 * Pick the rule whose pattern occurs most often in the content.
 *
 * Ties keep the earlier rule; a rule needs at least one occurrence to win.
 *
 * @param {string} content - Document text
 * @param {Array<{pattern: RegExp, value: any}>} rules - Counting rules
 * @returns {any|null} Value of the most frequent rule, or null when none occur
 */
function matchByCount(content, rules) {
  const leader = rules.reduce(
    (best, { pattern, value }) => {
      const hits = content.match(pattern)
      const count = hits ? hits.length : 0
      return count > best.count ? { count, value } : best
    },
    { count: 0, value: null }
  )

  return leader.value
}
| 352 | + | ||
/**
 * Extract list items from a section of the document.
 *
 * The section is located with `pattern` (first capture group, or the whole
 * match when the pattern has no group). Items are then collected with
 * `itemPattern`; for each hit the first non-blank capture group is used,
 * falling back to the whole hit. If no item is found, every non-empty line of
 * the section shorter than 50 characters is returned instead.
 *
 * @param {string} content - Document text
 * @param {RegExp} pattern - Locates the section
 * @param {RegExp|string} itemPattern - Matches one item; strings are compiled with flags "gm"
 * @returns {string[]|null} Extracted items, or null when nothing was found
 */
function extractList(content, pattern, itemPattern) {
  const sectionMatch = content.match(pattern)
  if (!sectionMatch) return null

  const section = sectionMatch[1] ?? sectionMatch[0]

  // Always scan with a private, global copy of the item pattern: a non-global
  // pattern would never advance, and a shared /g literal carries lastIndex
  // state between calls.
  let regex
  if (typeof itemPattern === 'string') {
    regex = new RegExp(itemPattern, 'gm')
  } else {
    const flags = itemPattern.flags.includes('g') ? itemPattern.flags : `${itemPattern.flags}g`
    regex = new RegExp(itemPattern.source, flags)
  }

  const items = []
  for (const match of section.matchAll(regex)) {
    const captured = match.slice(1).find((group) => group && group.trim())
    const item = (captured ?? match[0]).trim()
    if (item) {
      items.push(item)
    }
  }

  // No item matched: fall back to the section's short, non-empty lines.
  if (items.length === 0) {
    for (const line of section.split('\n')) {
      const trimmed = line.trim()
      if (trimmed && trimmed.length < 50) {
        items.push(trimmed)
      }
    }
  }

  return items.length > 0 ? items : null
}
| 404 | + | ||
/**
 * Extract a numeric range from the content.
 *
 * @param {string} content - Document text
 * @param {RegExp} pattern - Pattern whose groups 1 and 2 capture the lower and upper bound
 * @returns {{min: number, max: number}|null} Parsed bounds, or null when the pattern does not match
 */
function extractRange(content, pattern) {
  const found = content.match(pattern)
  if (!found) return null

  const [, lowerText, upperText] = found
  return { min: parseInt(lowerText), max: parseInt(upperText) }
}
| 417 | + | ||
/**
 * Extract a list that follows a heading, scanning line by line.
 *
 * Scanning starts right after the first `startPattern` match and stops at the
 * first line containing any of `endKeywords`. Each trimmed line before that is
 * kept when `itemFilter` accepts it; without a filter function, non-empty
 * lines shorter than 100 characters are kept.
 *
 * @param {string} content - Document text
 * @param {RegExp} startPattern - Marks where the list begins
 * @param {string[]} endKeywords - Any of these on a line ends the scan
 * @param {Function} itemFilter - Predicate called with each trimmed line
 * @returns {string[]|null} Collected items, or null when none were found
 */
function smartExtractList(content, startPattern, endKeywords, itemFilter) {
  const anchor = content.match(startPattern)
  if (!anchor) return null

  // Everything after the heading, one trimmed entry per line.
  const tail = content.slice(anchor.index + anchor[0].length)
  const candidates = tail.split('\n').map((rawLine) => rawLine.trim())

  // Cut the scan at the first line that mentions an end keyword.
  const stopAt = candidates.findIndex((text) =>
    endKeywords.some((keyword) => text.includes(keyword))
  )
  const scanned = stopAt === -1 ? candidates : candidates.slice(0, stopAt)

  const accept = typeof itemFilter === 'function'
    ? itemFilter
    : (text) => text && text.length > 0 && text.length < 100

  const items = scanned.filter((text) => accept(text))
  return items.length > 0 ? items : null
}
| 464 | + | ||
/**
 * Extract every configured field from a document.
 *
 * Fields are processed in ascending `priority`. A field with a non-null value
 * (matched or fallback) is written to `config`; required fields that fell back
 * to their default produce a warning, and required fields with no value at all
 * are reported in `unmatched`. A missing product name is replaced by the file
 * name without its extension. Fields that declare `productType` are dropped
 * when the detected product type is not in that list.
 *
 * @param {string} content - Document text
 * @param {string} fileName - File name, used as the product name of last resort
 * @returns {{config: Object, unmatched: Array, warnings: Array, matchDetails: Array}} Extraction result
 */
export function smartExtractFields(content, fileName) {
  const config = {}
  const unmatched = []
  const warnings = []
  const matchDetails = []

  const sortedFields = Object.entries(FIELD_RULES).sort(([, a], [, b]) => a.priority - b.priority)

  for (const [fieldName, rule] of sortedFields) {
    const result = extractField(content, fieldName)

    matchDetails.push({
      field: fieldName,
      matched: result.matched,
      pattern: result.pattern,
      value: result.value
    })

    if (result.value !== null) {
      config[fieldName] = result.value

      // A required field that fell back to its default deserves a warning.
      if (!result.matched && rule.required) {
        warnings.push({
          field: fieldName,
          message: `未找到字段 "${fieldName}",使用默认值: ${JSON.stringify(rule.fallback)}`,
          severity: 'warning'
        })
      }
    } else if (rule.required) {
      unmatched.push({
        field: fieldName,
        reason: '未找到匹配内容',
        suggestions: generateSuggestions(fieldName, content)
      })
    }
  }

  // Last resort for the product name: the file name without its extension.
  // Skipped when no file name was supplied, instead of throwing.
  if (!config.product_name && typeof fileName === 'string') {
    const baseName = fileName.replace(/\.[^/.]+$/, '')
    config.product_name = baseName
    warnings.push({
      field: 'product_name',
      message: `未找到产品名称,使用文件名: "${baseName}"`,
      severity: 'info'
    })
  }

  // Drop fields whose rule restricts them to other product types.
  for (const [fieldName, rule] of sortedFields) {
    if (Array.isArray(rule.productType) && !rule.productType.includes(config.product_type)) {
      delete config[fieldName]
    }
  }

  return {
    config,
    unmatched,
    warnings,
    matchDetails
  }
}
| 538 | + | ||
/**
 * Suggest candidate values for a field that could not be extracted.
 *
 * @param {string} fieldName - Field name
 * @param {string} content - Document text (currently unused; kept for interface compatibility)
 * @returns {Array<string>} Suggested values, or a single "enter manually" hint for unknown fields
 */
function generateSuggestions(fieldName, content) {
  const suggestions = {
    product_name: ['从文档标题提取', '从第一行提取', '手动输入产品全称'],
    product_type: ['savings - 储蓄型产品', 'critical-illness - 重疾型产品', 'life-insurance - 人寿型产品'],
    currency: ['USD - 美元', 'CNY - 人民币', 'HKD - 港币', 'EUR - 欧元'],
    payment_periods: ['整付', '3年', '5年', '10年', '15年', '20年'],
    age_range: ['0-75岁(常见范围)', '0-70岁', '1-65岁'],
    insurance_period: ['终身', '至100岁', '20年', '30年'],
    withdrawal_modes: ['指定提取金额', '最高固定提取金额'],
    withdrawal_periods: ['1年', '3年', '5年', '10年', '15年', '20年']
  }

  // Own-property check so names such as "toString" or "constructor" do not
  // resolve to members inherited from Object.prototype.
  if (Object.prototype.hasOwnProperty.call(suggestions, fieldName)) {
    return suggestions[fieldName]
  }
  return ['请手动输入']
}
| 560 | + | ||
/**
 * Render the manual-review report for an extraction result.
 *
 * The report contains match statistics, a per-field detail table, the warning
 * list (when any) and, for every unmatched field, the reason and suggested
 * values.
 *
 * @param {Object} result - Result object with `unmatched`, `warnings` and `matchDetails`
 * @returns {string} Report in Markdown format
 */
export function generateAuditReport(result) {
  const { unmatched, warnings, matchDetails } = result

  // Escape "|" so regex descriptions such as "(USD|CNY)" or extracted values
  // cannot split a table row into extra columns.
  const cell = (text) => String(text).replace(/\|/g, '\\|')

  let report = `## 📊 字段提取报告\n\n`

  // Match statistics
  const matchedCount = matchDetails.filter(m => m.matched).length
  const totalCount = matchDetails.length
  report += `### 匹配统计\n\n`
  report += `- ✅ 成功匹配: ${matchedCount}/${totalCount} 字段\n`
  report += `- ⚠️ 使用默认值: ${warnings.length} 字段\n`
  report += `- ❌ 未匹配(需人工补充): ${unmatched.length} 字段\n\n`

  // Per-field detail table
  report += `### 匹配详情\n\n`
  report += `| 字段 | 状态 | 提取方式 | 值 |\n`
  report += `|------|------|----------|----|\n`

  for (const detail of matchDetails) {
    const status = detail.matched ? '✅' : (FIELD_RULES[detail.field]?.required ? '⚠️' : 'ℹ️')
    const method = detail.matched ? detail.pattern : '默认值'
    // JSON.stringify returns undefined for an undefined value; show it literally.
    const valuePreview = (JSON.stringify(detail.value) ?? 'undefined').substring(0, 50)
    report += `| ${cell(detail.field)} | ${status} | ${cell(method)} | ${cell(valuePreview)} |\n`
  }

  // Warnings
  if (warnings.length > 0) {
    report += `\n### ⚠️ 警告信息\n\n`
    for (const warning of warnings) {
      report += `- **${warning.field}**: ${warning.message}\n`
    }
  }

  // Unmatched fields that need manual input
  if (unmatched.length > 0) {
    report += `\n### ❌ 未匹配字段(需要人工补充)\n\n`
    for (const item of unmatched) {
      report += `#### ${item.field}\n\n`
      report += `- **原因**: ${item.reason}\n`
      report += `- **建议值**:\n`
      for (const suggestion of item.suggestions) {
        report += `  - ${suggestion}\n`
      }
      report += `\n`
    }
  }

  return report
}
| 616 | + | ||
| 617 | +export { FIELD_RULES } |
-
Please register or login to post a comment