feat(extractor): 实现智能字段提取器 smartExtractList

- 添加 smartExtractList() 智能列表提取函数
- 支持基于起始模式和结束关键词的列表边界识别
- 修复 insurance_period 和 withdrawal_modes 字段类型处理
- 优化 payment_periods 过滤逻辑，排除无效项

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

feat(extractor): 实现智能字段提取器 smartExtractList
- 添加 smartExtractList() 智能列表提取函数
- 支持基于起始模式和结束关键词的列表边界识别
- 修复 insurance_period 和 withdrawal_modes 字段类型处理
- 优化 payment_periods 过滤逻辑，排除无效项

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
hookehuyr
Commit 3ddf8b87a5c5c6a79cedf6a5f3e739621ff3ed97 3ddf8b87 1 parent 7a564df0
Showing 2 changed files with 670 additions and 0 deletions
docs/CHANGELOG.md
scripts/smart-field-extractor.js
--- a/docs/CHANGELOG.md
View file @3ddf8b8
+++ b/docs/CHANGELOG.md
View file @3ddf8b8
+ ## [2026-02-15] - 智能字段提取器完善
+ 
+ ### 新增
+ - 实现 `smartExtractList()` 智能列表提取函数
+ - 支持基于起始模式和结束关键词的列表边界识别
+ 
+ ### 修复
+ - 修复 `insurance_period` 和 `withdrawal_modes` 字段的 `postProcess` 函数类型处理问题
+ - 优化 `payment_periods` 字段的过滤逻辑，排除"投保年龄"等无效项
+ 
+ ---
+ 
+ **详细信息**：
+ - **影响文件**: scripts/smart-field-extractor.js
+ - **技术栈**: Node.js, 正则表达式, 智能提取
+ - **测试状态**: 单元测试通过
+ - **备注**: 提升了字段提取的准确性和健壮性
+ 
+ ---
+ 
+ ## [2026-02-14] - 文档解析使用说明完善
+ 
+ ### 优化
+ - 补充解析链路与使用思路，明确审核与合并边界
+ - 更新解析命令说明与默认行为
+ 
+ ---
+ 
+ **详细信息**：
+ - **影响文件**: docs/to-parse/README.md, README.md
+ - **技术栈**: 文档维护
+ - **测试状态**: 未运行（仅文档更新）
+ - **备注**: 使用方式与链路更清晰
+ 
+ ---
+ 
+ ## [2026-02-14] - 文档解析审核流程落地
+ 
+ ### 优化
+ - 修复审核模板重复定义与内容断裂，统一字段命名与展示结构
+ - 完善审核流程指引，明确 pending/approved 目录治理与合并步骤
+ - 默认解析仅生成待审核文件，写入配置需显式开启
+ 
+ ---
+ 
+ **详细信息**：
+ - **影响文件**: scripts/parse-docs.js, docs/to-parse/README.md, docs/tasks/plan/改进文档解析工具-添加审核流程.md, README.md
+ - **技术栈**: Node.js, 文档维护
+ - **测试状态**: pnpm test 通过；pnpm lint 30 warnings
+ - **备注**: 已生成待审核文件并完成可读性校验
+ 
+ ---
+ 
 ## [2026-02-14] - 文档解析审核方案整理
 
 ### 优化
--- a/scripts/smart-field-extractor.js 0 → 100644
View file @3ddf8b8
+++ b/scripts/smart-field-extractor.js 0 → 100644
View file @3ddf8b8
+ /**
+  * 智能字段提取器
+  *
+  * @description 从保险产品文档中智能提取配置字段，支持中英文、繁简体
+  * @module scripts/smart-field-extractor
+  * @author Claude Code
+  * @created 2026-02-14
+  */
+ 
/**
 * Field extraction rule table.
 *
 * Each entry describes how one configuration field is located in a product document:
 *   - priority:    extraction order (ascending)
 *   - patterns:    tried in array order, the first one that matches wins; an entry is
 *                  either a RegExp or a descriptor object with a `type`
 *                  (content_match, count_match, list_extract, smart_list_extract, range_extract)
 *   - fallback:    value used when no pattern matches
 *   - required:    whether the field is mandatory
 *   - productType: product types the field is declared for (optional)
 *   - postProcess: optional normaliser that receives the raw match
 *                  (a RegExp match array or the value produced by a descriptor)
 */
const FIELD_RULES = {
  // Product name
  product_name: {
    priority: 1,
    patterns: [
      /产品名称[：:]\s*([^\n]+)/,
      /计划书名称[：:]\s*([^\n]+)/,
      /Product\s+Name[：:]\s*([^\n]+)/i,
      /^#\s+(.+)$/m // Markdown heading
    ],
    fallback: null, // mandatory, no default value
    required: true
  },

  // Product type
  product_type: {
    priority: 2,
    patterns: [
      // Inferred from keywords found in the content; the first rule with a hit wins
      {
        type: 'content_match',
        rules: [
          { keywords: ['储蓄', 'saving', '传承', '家传', '红利', '提取'], value: 'savings' },
          { keywords: ['重疾', 'critical', '守护', '严重疾病'], value: 'critical-illness' },
          { keywords: ['人寿', 'life', '创富', '身故保障'], value: 'life-insurance' }
        ]
      }
    ],
    fallback: 'savings',
    required: true
  },

  // Currency
  currency: {
    priority: 3,
    patterns: [
      // An explicit label is authoritative, so it is tried before the symbol-count heuristic
      /[币幣][种種][：:]\s*(USD|CNY|HKD|EUR)/i,
      /Currency[：:]\s*(USD|CNY|HKD|EUR)/i,
      // Heuristic: the currency whose symbol occurs most often
      {
        type: 'count_match',
        rules: [
          // Bare "$" only: the lookbehind keeps "HK$" from also being tallied as USD,
          // which previously made it impossible for HKD to win the count
          { pattern: /(?<!HK)\$/g, value: 'USD' },
          { pattern: /HK\$/g, value: 'HKD' },
          { pattern: /¥|人民币/g, value: 'CNY' },
          { pattern: /€/g, value: 'EUR' }
        ]
      }
    ],
    fallback: 'USD',
    required: true,
    postProcess: (match) => {
      // RegExp match array -> first capture group; count_match -> already a currency code
      const code = Array.isArray(match) ? match[1] : match
      // The label patterns are case-insensitive, so normalise "usd" to "USD"
      return typeof code === 'string' ? code.trim().toUpperCase() : null
    }
  },

  // Premium payment periods
  payment_periods: {
    priority: 4,
    patterns: [
      // Collects the lines following a "繳費年期" / "缴费年期" heading
      // (optionally prefixed with "年繳保費" / "年缴保费") until an end keyword is reached
      {
        type: 'smart_list_extract',
        startPattern: /(?:年[繳缴]保[費费])?[繳缴][費费]年期[：:\s]*\n/,
        endKeywords: ['提取', '保險期間', '保险期间', '投保年龄', '投保年齡', '選是', '選項', 'GC宏', 'FA宏', 'LV2'],
        // Accepts exactly "整付" or "<n>年", optionally written as a bullet ("- 3年").
        // The anchors already reject lines such as "投保年龄 ...".
        itemFilter: (line) => /^(?:[-•·]\s*)?(?:\d+\s*年|整付)$/.test(line.trim())
      }
    ],
    fallback: ['整付', '3年', '5年'],
    required: true,
    postProcess: (values) => {
      const normalized = values
        .map(v => v.trim())
        // Drop entries that mention the entry age instead of a payment period
        .filter(v => v && !v.includes('投保') && !v.includes('年龄') && !v.includes('年齡'))
        // Canonical form: "整付" or "<n>年"; anything else cannot be ordered and is dropped
        .map(v => {
          const match = v.match(/(\d+)\s*年|整付/)
          if (!match) return null
          return match[1] ? `${match[1]}年` : '整付'
        })
        .filter(v => v !== null)

      // De-duplicate, then sort: single premium first, the rest by number of years
      return [...new Set(normalized)].sort((a, b) => {
        if (a === '整付') return -1
        if (b === '整付') return 1
        return Number.parseInt(a, 10) - Number.parseInt(b, 10)
      })
    }
  },

  // Entry age range
  age_range: {
    priority: 5,
    patterns: [
      // Labelled forms first ("投保年龄：0-75岁"), so an unrelated range cannot shadow them
      /投保年[龄齡][：:]\s*(\d+)\s*[-~至]\s*(\d+)\s*[岁歲]?/,
      /年[龄齡][范範][围圍][：:]\s*(\d+)\s*[-~至]\s*(\d+)\s*[岁歲]?/,
      // Unlabelled "0-75岁": the unit is mandatory here, otherwise dates such as
      // "2026-02" would be read as an age range
      {
        type: 'range_extract',
        pattern: /(\d+)\s*[-~至]\s*(\d+)\s*[岁歲]/
      }
    ],
    fallback: { min: 0, max: 75 },
    required: true,
    postProcess: (match) => {
      // range_extract already yields { min, max }
      if (match && typeof match === 'object' && match.min !== undefined) {
        return match
      }
      // RegExp match array: capture groups 1 and 2 are the bounds
      if (Array.isArray(match) && match.length >= 2) {
        return { min: Number.parseInt(match[1], 10), max: Number.parseInt(match[2], 10) }
      }
      return null
    }
  },

  // Insurance period
  insurance_period: {
    priority: 6,
    patterns: [
      /保險期間[：:]\s*([^\n]+)/,
      /保险期间[：:]\s*([^\n]+)/,
      /Insurance\s+Period[：:]\s*([^\n]+)/i,
      /保障期间[：:]\s*([^\n]+)/
    ],
    fallback: '终身',
    required: true,
    postProcess: (value) => {
      // Accepts either a RegExp match array or a plain string
      let str = value
      if (Array.isArray(value)) {
        str = value[1] || value[0] || ''
      }
      if (!str || typeof str !== 'string') return '终身'

      const normalized = str.trim()
      // Collapse the common spellings of "whole life" into one canonical value
      if (normalized.includes('终身') || normalized.includes('終身') || normalized.toLowerCase().includes('whole life')) {
        return '终身'
      }
      return normalized
    }
  },

  // Withdrawal modes (savings products only)
  withdrawal_modes: {
    priority: 7,
    patterns: [
      {
        type: 'list_extract',
        pattern: /提取选项[：:]\s*([^\n]+)/,
        itemPattern: /指定提取金额|最高固定提取金额/g
      },
      /提取方式[：:]\s*([^\n]+)/
    ],
    fallback: ['年龄指定金额', '最高固定金额'],
    required: false,
    productType: ['savings'],
    postProcess: (values) => {
      const defaults = ['年龄指定金额', '最高固定金额']

      // Only a RegExp match array carries an `index`; a list_extract result is a plain
      // array of items and must be returned as is. Testing `values[1]` instead used to
      // reduce a two-item list to its second item.
      const isRegexMatch = Array.isArray(values) && typeof values.index === 'number'
      const text = isRegexMatch ? values[1] : values

      if (typeof text === 'string') {
        // Pick the known modes out of a single line of text
        const modes = []
        if (text.includes('指定提取金额')) modes.push('指定提取金额')
        if (text.includes('最高固定提取金额')) modes.push('最高固定提取金额')
        return modes.length > 0 ? modes : defaults
      }
      return Array.isArray(text) ? text : defaults
    }
  },

  // Withdrawal periods (savings products only)
  withdrawal_periods: {
    priority: 8,
    patterns: [
      {
        type: 'list_extract',
        // The section runs up to the first blank line (LF or CRLF) or the end of the text
        pattern: /提取期[（(]年[）)][：:]\s*([\s\S]*?)(?=\n\n|\r\n\r\n|$)/,
        itemPattern: /^\s*[-•·]\s*(\d+\s*年)|^\s*(\d+)\s*年\s*$/gm
      }
    ],
    fallback: ['1年', '3年', '5年', '10年'],
    required: false,
    productType: ['savings'],
    postProcess: (values) => {
      const normalized = values.map(v => {
        const text = String(v)
        // The second itemPattern alternative captures the digits only ("5"), so the
        // unit is re-attached to any entry that contains a number
        const match = text.match(/\d+/)
        return match ? `${match[0]}年` : text.trim()
      })
      return [...new Set(normalized)].sort((a, b) => Number.parseInt(a, 10) - Number.parseInt(b, 10))
    }
  }
}
+ 
/**
 * Extracts a single field value from the document text.
 *
 * The patterns of the field's rule in FIELD_RULES are tried in order and the first
 * one that produces a truthy match wins. The raw match is handed to the rule's
 * `postProcess` when there is one; otherwise the first capture group of a RegExp
 * match is used (trimmed, so trailing whitespace or "\r" does not leak into the value).
 *
 * @param {string} content - Document content
 * @param {string} fieldName - Field name (an own key of FIELD_RULES)
 * @returns {{value: any, matched: boolean, pattern: string|null}} Extraction result;
 *   `value` is the rule's fallback and `matched` is false when nothing matched,
 *   and `value` is null for an unknown field name
 */
function extractField(content, fieldName) {
  // Own-property check: inherited keys such as "constructor" are not rules
  const rule = Object.prototype.hasOwnProperty.call(FIELD_RULES, fieldName)
    ? FIELD_RULES[fieldName]
    : null
  if (!rule) {
    return { value: null, matched: false, pattern: null }
  }

  for (const pattern of rule.patterns) {
    let match = null
    let patternDesc = ''

    if (pattern instanceof RegExp) {
      // Plain regular expression
      match = content.match(pattern)
      patternDesc = pattern.toString()
    } else if (pattern && typeof pattern === 'object' && pattern.type) {
      // Descriptor object: dispatch on its type
      switch (pattern.type) {
        case 'content_match':
          match = matchByContent(content, pattern.rules)
          patternDesc = `content_match(${pattern.rules.length} rules)`
          break

        case 'count_match':
          match = matchByCount(content, pattern.rules)
          patternDesc = `count_match(${pattern.rules.length} rules)`
          break

        case 'list_extract':
          match = extractList(content, pattern.pattern, pattern.itemPattern)
          patternDesc = `list_extract`
          break

        case 'smart_list_extract':
          match = smartExtractList(
            content,
            pattern.startPattern,
            pattern.endKeywords,
            pattern.itemFilter
          )
          patternDesc = `smart_list_extract`
          break

        case 'range_extract':
          match = extractRange(content, pattern.pattern)
          patternDesc = `range_extract`
          break

        default:
          // Unknown descriptor type: treated as "no match"
          break
      }
    }

    if (!match) continue

    let value = match
    if (rule.postProcess) {
      value = rule.postProcess(match)
    } else if (Array.isArray(match) && match.length > 1) {
      // RegExp match: take the first capture group
      value = typeof match[1] === 'string' ? match[1].trim() : match[1]
    }

    return {
      value,
      matched: true,
      pattern: patternDesc
    }
  }

  // Nothing matched: fall back to the rule's default
  return {
    value: rule.fallback,
    matched: false,
    pattern: null
  }
}
+ 
/**
 * Picks a value by keyword presence.
 *
 * Rules are checked in order; the value of the first rule that has at least one
 * keyword occurring in the content (case-insensitively) is returned.
 *
 * @param {string} content - Document content
 * @param {Array<{keywords: string[], value: any}>} rules - Keyword rules
 * @returns {any|null} Value of the first matching rule, or null when none matches
 */
function matchByContent(content, rules) {
  const haystack = content.toLowerCase()
  const containsWord = (word) => haystack.includes(word.toLowerCase())

  const hit = rules.find(({ keywords }) => keywords.some(containsWord))
  return hit ? hit.value : null
}
+ 
/**
 * Picks a value by occurrence count.
 *
 * Every rule's pattern is counted over the whole content; the value of the rule with
 * the highest count is returned. On a tie the earlier rule wins.
 *
 * @param {string} content - Document content
 * @param {Array<{pattern: RegExp|string, value: any}>} rules - Counting rules
 * @returns {any|null} Value of the most frequent rule, or null when nothing occurs
 */
function matchByCount(content, rules) {
  let bestCount = 0
  let bestValue = null

  for (const { pattern, value } of rules) {
    // Count with a global copy: on a non-global pattern String#match returns
    // [fullMatch, ...captureGroups], whose length is not an occurrence count
    const counter = pattern instanceof RegExp
      ? new RegExp(pattern.source, pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`)
      : new RegExp(pattern, 'g')

    const hits = content.match(counter)
    const count = hits ? hits.length : 0

    if (count > bestCount) {
      bestCount = count
      bestValue = value
    }
  }

  return bestValue
}
+ 
/**
 * Extracts list items from a section of the document.
 *
 * `pattern` locates the section (its first capture group, or the whole match when it
 * has no group). `itemPattern` is then applied repeatedly to the section; for each hit
 * the first non-blank capture group is taken, or the whole match when no group has
 * content. When that yields nothing, the section is split into lines and every
 * non-empty line shorter than 50 characters is used instead.
 *
 * @param {string} content - Document content
 * @param {RegExp} pattern - Pattern locating the list section
 * @param {RegExp|string} [itemPattern] - Pattern for one item; a string is compiled with flags "gm"
 * @returns {string[]|null} Trimmed items, or null when the section is missing or empty
 */
function extractList(content, pattern, itemPattern) {
  const sectionMatch = content.match(pattern)
  if (!sectionMatch) return null

  const section = sectionMatch[1] ?? sectionMatch[0]
  const items = []

  if (itemPattern) {
    // Iterate over a private global copy: matchAll needs the "g" flag, cannot spin
    // forever on non-global or empty matches, and leaves the caller's lastIndex alone
    const itemRegex = typeof itemPattern === 'string'
      ? new RegExp(itemPattern, 'gm')
      : new RegExp(
        itemPattern.source,
        itemPattern.flags.includes('g') ? itemPattern.flags : `${itemPattern.flags}g`
      )

    for (const hit of section.matchAll(itemRegex)) {
      // First capture group with content, else the whole match
      const captured = hit.slice(1).find(group => group && group.trim())
      const item = (captured || hit[0] || '').trim()
      if (item) {
        items.push(item)
      }
    }
  }

  // Nothing recognised: fall back to short non-empty lines
  if (items.length === 0) {
    for (const line of section.split('\n')) {
      const trimmed = line.trim()
      if (trimmed && trimmed.length < 50) {
        items.push(trimmed)
      }
    }
  }

  return items.length > 0 ? items : null
}
+ 
/**
 * Extracts a numeric range.
 *
 * @param {string} content - Document content
 * @param {RegExp} pattern - Pattern whose capture groups 1 and 2 are the lower and upper bound
 * @returns {{min: number, max: number}|null} The range, or null when the pattern does
 *   not match or either bound is not a number
 */
function extractRange(content, pattern) {
  const match = content.match(pattern)
  if (!match) return null

  const min = Number.parseInt(match[1], 10)
  const max = Number.parseInt(match[2], 10)

  // A pattern with fewer than two numeric groups would otherwise leak NaN bounds
  if (Number.isNaN(min) || Number.isNaN(max)) return null

  return { min, max }
}
+ 
/**
 * Extracts a list that follows a heading, scanning line by line.
 *
 * Scanning starts right after the first match of `startPattern` and stops at the first
 * line containing one of `endKeywords` (or at the end of the content). Each trimmed
 * line before that is kept when `itemFilter` returns a truthy value for it; without a
 * filter, non-empty lines shorter than 100 characters are kept.
 *
 * @param {string} content - Document content
 * @param {RegExp} startPattern - Pattern marking the start of the list
 * @param {string[]} [endKeywords] - Keywords that end the list; empty strings are ignored
 * @param {Function} [itemFilter] - Predicate receiving the trimmed line
 * @returns {string[]|null} Trimmed list items, or null when the start is not found
 *   or no line qualifies
 */
function smartExtractList(content, startPattern, endKeywords, itemFilter) {
  // 1. Locate the start. With the "g" flag String#match returns no `index`, so a
  //    non-global copy is used to find the position.
  const locator = startPattern.global
    ? new RegExp(startPattern.source, startPattern.flags.replace('g', ''))
    : startPattern
  const startMatch = content.match(locator)
  if (!startMatch) return null

  const remainingContent = content.slice(startMatch.index + startMatch[0].length)

  // An empty keyword is contained in every line and would end the scan immediately
  const stopWords = Array.isArray(endKeywords)
    ? endKeywords.filter(keyword => typeof keyword === 'string' && keyword !== '')
    : []
  const keep = typeof itemFilter === 'function'
    ? itemFilter
    : (line) => line.length > 0 && line.length < 100

  // 2. Scan line by line until an end keyword shows up
  const items = []
  for (const line of remainingContent.split('\n')) {
    const trimmedLine = line.trim()

    if (stopWords.some(keyword => trimmedLine.includes(keyword))) {
      break
    }

    if (keep(trimmedLine)) {
      items.push(trimmedLine)
    }
  }

  return items.length > 0 ? items : null
}
+ 
/**
 * Extracts every field defined in FIELD_RULES from a document.
 *
 * Fields are processed in ascending `priority`. A field with a non-null value
 * (matched or fallback) goes into `config`; a required field that only got its
 * fallback produces a warning; a required field without any value is reported in
 * `unmatched`. A missing product name is replaced by the file name without its
 * extension. Finally, fields whose rule declares a `productType` list that does not
 * contain the extracted product type are removed from `config`.
 *
 * @param {string} content - Document content
 * @param {string} [fileName] - File name, used as the product name of last resort
 * @returns {{config: Object, unmatched: Array, warnings: Array, matchDetails: Array}} Extraction result
 */
export function smartExtractFields(content, fileName) {
  const config = {}
  const unmatched = []
  const warnings = []
  const matchDetails = []

  // Process fields in priority order
  const sortedFields = Object.entries(FIELD_RULES).sort((a, b) => a[1].priority - b[1].priority)

  for (const [fieldName, rule] of sortedFields) {
    const result = extractField(content, fieldName)

    matchDetails.push({
      field: fieldName,
      matched: result.matched,
      pattern: result.pattern,
      value: result.value
    })

    if (result.value !== null) {
      // Matched, or the rule supplied a default
      config[fieldName] = result.value

      if (!result.matched && rule.required) {
        warnings.push({
          field: fieldName,
          message: `未找到字段 "${fieldName}"，使用默认值: ${JSON.stringify(rule.fallback)}`,
          severity: 'warning'
        })
      }
    } else if (rule.required) {
      // Required field with neither a match nor a default
      unmatched.push({
        field: fieldName,
        reason: '未找到匹配内容',
        suggestions: generateSuggestions(fieldName, content)
      })
    }
  }

  // Product name of last resort: the file name without its extension.
  // Skipped when no usable file name was passed, instead of throwing.
  if (!config.product_name) {
    const baseName = typeof fileName === 'string' ? fileName.replace(/\.[^/.]+$/, '') : ''
    if (baseName) {
      config.product_name = baseName
      warnings.push({
        field: 'product_name',
        message: `未找到产品名称，使用文件名: "${baseName}"`,
        severity: 'info'
      })
    }
  }

  // Drop fields that their own rule restricts to other product types
  for (const [fieldName, rule] of sortedFields) {
    if (Array.isArray(rule.productType) && !rule.productType.includes(config.product_type)) {
      delete config[fieldName]
    }
  }

  return {
    config,
    unmatched,
    warnings,
    matchDetails
  }
}
+ 
/**
 * Returns the canned suggestions shown to a reviewer for an unmatched field.
 *
 * @param {string} fieldName - Field name
 * @param {string} content - Document content (currently not used)
 * @returns {Array<string>} Suggested values; a single "enter manually" hint for unknown fields
 */
function generateSuggestions(fieldName, content) {
  const suggestions = {
    product_name: ['从文档标题提取', '从第一行提取', '手动输入产品全称'],
    product_type: ['savings - 储蓄型产品', 'critical-illness - 重疾型产品', 'life-insurance - 人寿型产品'],
    currency: ['USD - 美元', 'CNY - 人民币', 'HKD - 港币', 'EUR - 欧元'],
    payment_periods: ['整付', '3年', '5年', '10年', '15年', '20年'],
    age_range: ['0-75岁（常见范围）', '0-70岁', '1-65岁'],
    insurance_period: ['终身', '至100岁', '20年', '30年'],
    withdrawal_modes: ['指定提取金额', '最高固定提取金额'],
    withdrawal_periods: ['1年', '3年', '5年', '10年', '15年', '20年']
  }

  // Own-property lookup: names such as "constructor" must not resolve to
  // Object.prototype members, which are not arrays
  return Object.prototype.hasOwnProperty.call(suggestions, fieldName)
    ? suggestions[fieldName]
    : ['请手动输入']
}
+ 
/**
 * Renders the manual-review report for an extraction result.
 *
 * The report contains match statistics, one table row per entry of `matchDetails`
 * (field, status, extraction method, value preview truncated to 50 characters), the
 * warnings, and the unmatched fields together with their suggestions.
 *
 * @param {Object} result - Extraction result
 * @param {Array} [result.unmatched] - Entries with `field`, `reason`, `suggestions`
 * @param {Array} [result.warnings] - Entries with `field`, `message`
 * @param {Array} [result.matchDetails] - Entries with `field`, `matched`, `pattern`, `value`
 * @returns {string} Audit report in Markdown
 */
export function generateAuditReport(result) {
  const { unmatched = [], warnings = [], matchDetails = [] } = result

  // "|" ends a table cell and a line break ends the row, so both are neutralised.
  // Regex descriptions such as /(USD|CNY)/ would otherwise split into extra columns.
  const toCell = (text) => String(text).replace(/\|/g, '\\|').replace(/\r?\n/g, ' ')

  let report = `## 📊 字段提取报告\n\n`

  // Match statistics
  const matchedCount = matchDetails.filter(m => m.matched).length
  const totalCount = matchDetails.length
  report += `### 匹配统计\n\n`
  report += `- ✅ 成功匹配: ${matchedCount}/${totalCount} 字段\n`
  report += `- ⚠️  使用默认值: ${warnings.length} 字段\n`
  report += `- ❌ 未匹配（需人工补充）: ${unmatched.length} 字段\n\n`

  // Match details table
  report += `### 匹配详情\n\n`
  report += `| 字段 | 状态 | 提取方式 | 值 |\n`
  report += `|------|------|----------|----|\n`

  for (const detail of matchDetails) {
    const status = detail.matched ? '✅' : (FIELD_RULES[detail.field]?.required ? '⚠️' : 'ℹ️')
    const method = detail.matched ? detail.pattern : '默认值'
    // JSON.stringify yields undefined (not a string) for an undefined value
    const serialized = JSON.stringify(detail.value)
    const valuePreview = (serialized === undefined ? 'undefined' : serialized).substring(0, 50)
    report += `| ${detail.field} | ${status} | ${toCell(method)} | ${toCell(valuePreview)} |\n`
  }

  // Warnings
  if (warnings.length > 0) {
    report += `\n### ⚠️  警告信息\n\n`
    for (const warning of warnings) {
      report += `- **${warning.field}**: ${warning.message}\n`
    }
  }

  // Unmatched fields that need manual input
  if (unmatched.length > 0) {
    report += `\n### ❌ 未匹配字段（需要人工补充）\n\n`
    for (const item of unmatched) {
      report += `#### ${item.field}\n\n`
      report += `- **原因**: ${item.reason}\n`
      report += `- **建议值**:\n`
      for (const suggestion of item.suggestions) {
        report += `  - ${suggestion}\n`
      }
      report += `\n`
    }
  }

  return report
}
+ 
+ export { FIELD_RULES }