feat(extractor): 实现智能字段提取器 smartExtractList

- 添加 smartExtractList() 智能列表提取函数
- 支持基于起始模式和结束关键词的列表边界识别
- 修复 insurance_period 和 withdrawal_modes 字段类型处理
- 优化 payment_periods 过滤逻辑，排除无效项

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

feat(extractor): 实现智能字段提取器 smartExtractList
- 添加 smartExtractList() 智能列表提取函数
- 支持基于起始模式和结束关键词的列表边界识别
- 修复 insurance_period 和 withdrawal_modes 字段类型处理
- 优化 payment_periods 过滤逻辑，排除无效项

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
hookehuyr
Commit 3ddf8b87a5c5c6a79cedf6a5f3e739621ff3ed97 3ddf8b87 1 parent 7a564df0
Showing 2 changed files with 670 additions and 0 deletions
docs/CHANGELOG.md
scripts/smart-field-extractor.js
--- a/docs/CHANGELOG.md
View file @3ddf8b8
+++ b/docs/CHANGELOG.md
View file @3ddf8b8
+## [2026-02-15] - 智能字段提取器完善
+
+### 新增
+- 实现 `smartExtractList()` 智能列表提取函数
+- 支持基于起始模式和结束关键词的列表边界识别
+
+### 修复
+- 修复 `insurance_period` 和 `withdrawal_modes` 字段的 `postProcess` 函数类型处理问题
+- 优化 `payment_periods` 字段的过滤逻辑，排除"投保年龄"等无效项
+
+---
+
+**详细信息**：
+- **影响文件**: scripts/smart-field-extractor.js
+- **技术栈**: Node.js, 正则表达式, 智能提取
+- **测试状态**: 单元测试通过
+- **备注**: 提升了字段提取的准确性和健壮性
+
+---
+
+## [2026-02-14] - 文档解析使用说明完善
+
+### 优化
+- 补充解析链路与使用思路，明确审核与合并边界
+- 更新解析命令说明与默认行为
+
+---
+
+**详细信息**：
+- **影响文件**: docs/to-parse/README.md, README.md
+- **技术栈**: 文档维护
+- **测试状态**: 未运行（仅文档更新）
+- **备注**: 使用方式与链路更清晰
+
+---
+
+## [2026-02-14] - 文档解析审核流程落地
+
+### 优化
+- 修复审核模板重复定义与内容断裂，统一字段命名与展示结构
+- 完善审核流程指引，明确 pending/approved 目录治理与合并步骤
+- 默认解析仅生成待审核文件，写入配置需显式开启
+
+---
+
+**详细信息**：
+- **影响文件**: scripts/parse-docs.js, docs/to-parse/README.md, docs/tasks/plan/改进文档解析工具-添加审核流程.md, README.md
+- **技术栈**: Node.js, 文档维护
+- **测试状态**: pnpm test 通过；pnpm lint 30 warnings
+- **备注**: 已生成待审核文件并完成可读性校验
+
+---
+
 ## [2026-02-14] - 文档解析审核方案整理
 ### 优化
--- a/scripts/smart-field-extractor.js 0 → 100644
View file @3ddf8b8
+++ b/scripts/smart-field-extractor.js 0 → 100644
View file @3ddf8b8
+/**
+ * 智能字段提取器
+ *
+ * @description 从保险产品文档中智能提取配置字段，支持中英文、繁简体
+ * @module scripts/smart-field-extractor
+ * @author Claude Code
+ * @created 2026-02-14
+ */
+
/**
 * Field extraction rules.
 *
 * Each key is a config field name. A rule has:
 * - `priority`    – extraction order (lower values are extracted first).
 * - `patterns`    – candidates tried in declaration order. An entry is either
 *                   a RegExp or a descriptor `{ type, ... }` whose `type` is
 *                   one of `content_match`, `count_match`, `list_extract`,
 *                   `smart_list_extract`, `range_extract`.
 * - `fallback`    – value reported when no pattern matches.
 * - `required`    – whether a missing value is surfaced to the reviewer.
 * - `postProcess` – optional normalizer for the raw match. Depending on the
 *                   pattern that matched it receives a RegExp match array, a
 *                   plain string, a list of strings or a `{ min, max }` object.
 * - `productType` – optional list of product types the field applies to.
 */
const FIELD_RULES = {
  // Product name
  product_name: {
    priority: 1,
    patterns: [
      /产品名称[：:]\s*([^\n]+)/,
      /计划书名称[：:]\s*([^\n]+)/,
      /Product\s+Name[：:]\s*([^\n]+)/i,
      /^#\s+(.+)$/m // Markdown heading
    ],
    fallback: null, // mandatory field, no default
    required: true
  },

  // Product type, inferred from keywords found anywhere in the document
  product_type: {
    priority: 2,
    patterns: [
      {
        type: 'content_match',
        rules: [
          { keywords: ['储蓄', 'saving', '传承', '家传', '红利', '提取'], value: 'savings' },
          { keywords: ['重疾', 'critical', '守护', '严重疾病'], value: 'critical-illness' },
          { keywords: ['人寿', 'life', '创富', '身故保障'], value: 'life-insurance' }
        ]
      }
    ],
    fallback: 'savings',
    required: true
  },

  // Currency
  currency: {
    priority: 3,
    patterns: [
      // Count currency symbols; the most frequent one wins.
      {
        type: 'count_match',
        rules: [
          // A `$` that is part of `HK$` belongs to the HKD rule. Without the
          // lookbehind every `HK$` would be counted for USD as well, and HKD
          // could never reach a strictly higher count.
          { pattern: /(?<!HK)\$/g, value: 'USD' },
          { pattern: /HK\$/g, value: 'HKD' },
          { pattern: /¥|人民币/g, value: 'CNY' },
          { pattern: /€/g, value: 'EUR' }
        ]
      },
      /币种[：:]\s*(USD|CNY|HKD|EUR)/i,
      /Currency[：:]\s*(USD|CNY|HKD|EUR)/i
    ],
    fallback: 'USD',
    required: true,
    postProcess: (match) => {
      // The labelled patterns are case-insensitive, so normalize the code to upper case.
      const raw = Array.isArray(match) ? match[1] : match
      return typeof raw === 'string' ? raw.toUpperCase() : raw
    }
  },

  // Premium payment periods
  payment_periods: {
    priority: 4,
    patterns: [
      // Collect the lines that follow a "年繳保費繳費年期" / "缴费年期" heading
      // (Traditional or Simplified) until one of the end keywords shows up.
      {
        type: 'smart_list_extract',
        startPattern: /(?:年[繳缴]保[費费])?[繳缴][費费]年期[：:\s]*\n/,
        endKeywords: ['提取', '保險期間', '保险期间', '投保年龄', '投保年齡', '選是', '選項', 'GC宏', 'FA宏', 'LV2'],
        itemFilter: (line) => {
          const trimmed = line.trim()
          // Lines about the insured's age are never payment periods.
          if (trimmed.includes('投保') || trimmed.includes('年龄') || trimmed.includes('年齡')) {
            return false
          }
          // Accept exactly "整付" or "N年", optionally written as a bullet ("- 3年").
          return trimmed === '整付' || /^(?:[-•·]\s*)?\d+\s*年$/.test(trimmed)
        }
      }
    ],
    fallback: ['整付', '3年', '5年'],
    required: true,
    postProcess: (values) => {
      const normalized = values
        .map(v => v.trim())
        // Drop empty entries and anything mentioning the insured's age.
        .filter(v => v && !v.includes('投保') && !v.includes('年龄') && !v.includes('年齡'))
        .filter(v => v.includes('年') || v.includes('整付'))
        .map(v => {
          // Reduce to the canonical "N年" / "整付" form.
          const match = v.match(/(\d+)\s*年|整付/)
          if (match) {
            return match[0].includes('整付') ? '整付' : `${match[1]}年`
          }
          return v
        })

      // De-duplicate; "整付" sorts first, the rest ascending by year count.
      return [...new Set(normalized)].sort((a, b) => {
        if (a === '整付') return -1
        if (b === '整付') return 1
        return Number.parseInt(a, 10) - Number.parseInt(b, 10)
      })
    }
  },

  // Issue age range
  age_range: {
    priority: 5,
    patterns: [
      // Labelled forms go first: "投保年龄：0-75岁", "年龄范围: 0~75".
      /投保年[龄齡][：:\s]*(\d+)\s*[-~至]\s*(\d+)\s*[岁歲]?/,
      /年[龄齡][范範][围圍][：:\s]*(\d+)\s*[-~至]\s*(\d+)\s*[岁歲]?/,
      // Last resort: any bare "N-M" range. Kept last because it also matches
      // unrelated text such as dates ("2026-02").
      {
        type: 'range_extract',
        pattern: /(\d+)\s*[-~至]\s*(\d+)\s*[岁歲]?/
      }
    ],
    fallback: { min: 0, max: 75 },
    required: true,
    postProcess: (match) => {
      // range_extract already yields { min, max }.
      if (match && typeof match === 'object' && match.min !== undefined) {
        return match
      }
      // RegExp match array: groups 1 and 2 are the bounds.
      if (Array.isArray(match) && match.length >= 2) {
        return { min: Number.parseInt(match[1], 10), max: Number.parseInt(match[2], 10) }
      }
      return null
    }
  },

  // Insurance period
  insurance_period: {
    priority: 6,
    patterns: [
      /保險期間[：:]\s*([^\n]+)/,
      /保险期间[：:]\s*([^\n]+)/,
      /Insurance\s+Period[：:]\s*([^\n]+)/i,
      /保障期间[：:]\s*([^\n]+)/
    ],
    fallback: '终身',
    required: true,
    postProcess: (value) => {
      // Accept either a RegExp match array or a plain string.
      let str = value
      if (Array.isArray(value)) {
        str = value[1] || value[0] || ''
      }
      if (!str || typeof str !== 'string') return '终身'

      const normalized = str.trim()
      // Collapse the common "whole life" wordings into one value.
      if (normalized.includes('终身') || normalized.includes('終身') || normalized.toLowerCase().includes('whole life')) {
        return '终身'
      }
      return normalized
    }
  },

  // Withdrawal modes (savings products only)
  withdrawal_modes: {
    priority: 7,
    patterns: [
      {
        type: 'list_extract',
        pattern: /提取选项[：:]\s*([^\n]+)/,
        itemPattern: /指定提取金额|最高固定提取金额/g
      },
      /提取方式[：:]\s*([^\n]+)/
    ],
    fallback: ['年龄指定金额', '最高固定金额'],
    required: false,
    productType: ['savings'],
    postProcess: (values) => {
      // A RegExp match result is an array that also carries `index`; a list
      // produced by list_extract does not. Checking only "array with a string
      // at [1]" would mistake a two-item list for a match and drop item 0.
      const isRegexMatch = Array.isArray(values) && typeof values.index === 'number'
      const raw = isRegexMatch ? values[1] : values

      if (typeof raw === 'string') {
        // Pick the known modes out of a single line of text.
        const modes = []
        if (raw.includes('指定提取金额')) modes.push('指定提取金额')
        if (raw.includes('最高固定提取金额')) modes.push('最高固定提取金额')
        return modes.length > 0 ? modes : ['年龄指定金额', '最高固定金额']
      }
      return Array.isArray(raw) ? raw : ['年龄指定金额', '最高固定金额']
    }
  },

  // Withdrawal periods (savings products only)
  withdrawal_periods: {
    priority: 8,
    patterns: [
      {
        type: 'list_extract',
        // The section ends at the first blank line (LF or CRLF) or at end of input.
        pattern: /提取期[（(]年[）)][：:]\s*([\s\S]*?)(?=\r?\n\r?\n|$)/,
        // Both alternatives capture the full "N年" token.
        itemPattern: /^\s*[-•·]\s*(\d+\s*年)|^\s*(\d+\s*年)\s*$/gm
      }
    ],
    fallback: ['1年', '3年', '5年', '10年'],
    required: false,
    productType: ['savings'],
    postProcess: (values) => {
      const normalized = values.map(v => {
        // "N年" anywhere in the item, or an item that is only a number.
        const match = v.match(/(\d+)\s*年/) || v.match(/^\s*(\d+)\s*$/)
        return match ? `${match[1]}年` : v.trim()
      })
      return [...new Set(normalized)].sort((a, b) => Number.parseInt(a, 10) - Number.parseInt(b, 10))
    }
  }
}
+
/**
 * Extract a single field from document text using its rule in FIELD_RULES.
 *
 * The rule's patterns are tried in declaration order and the first truthy
 * match wins. The raw match is passed through the rule's `postProcess` when
 * one is defined; otherwise the first capture group of a RegExp match is used.
 * When nothing matches, the rule's `fallback` is returned with `matched: false`.
 *
 * @param {string} content - Document text; non-string input is treated as empty text.
 * @param {string} fieldName - Key of the rule in FIELD_RULES.
 * @returns {{value: any, matched: boolean, pattern: string|null}} Extraction
 *   result; `pattern` describes the pattern that matched. An unknown field
 *   yields `{ value: null, matched: false, pattern: null }`.
 */
function extractField(content, fieldName) {
  // Own-property lookup: names such as "constructor" must not resolve to
  // Object.prototype members, which have no `patterns` to iterate.
  const rule = Object.prototype.hasOwnProperty.call(FIELD_RULES, fieldName)
    ? FIELD_RULES[fieldName]
    : null
  if (!rule) {
    return { value: null, matched: false, pattern: null }
  }

  const text = typeof content === 'string' ? content : ''

  for (const pattern of rule.patterns) {
    let match = null
    let patternDesc = ''

    if (pattern instanceof RegExp) {
      // Plain regular expression
      match = text.match(pattern)
      patternDesc = pattern.toString()
    } else if (pattern && typeof pattern === 'object' && pattern.type) {
      // Descriptor object: dispatch on the strategy name
      switch (pattern.type) {
        case 'content_match':
          match = matchByContent(text, pattern.rules)
          patternDesc = `content_match(${pattern.rules.length} rules)`
          break

        case 'count_match':
          match = matchByCount(text, pattern.rules)
          patternDesc = `count_match(${pattern.rules.length} rules)`
          break

        case 'list_extract':
          match = extractList(text, pattern.pattern, pattern.itemPattern)
          patternDesc = `list_extract`
          break

        case 'smart_list_extract':
          match = smartExtractList(
            text,
            pattern.startPattern,
            pattern.endKeywords,
            pattern.itemFilter
          )
          patternDesc = `smart_list_extract`
          break

        case 'range_extract':
          match = extractRange(text, pattern.pattern)
          patternDesc = `range_extract`
          break

        default:
          // Unknown strategy: leave `match` empty so the next pattern is tried.
          break
      }
    }

    if (!match) continue

    let value = match
    if (rule.postProcess) {
      value = rule.postProcess(match)
    } else if (Array.isArray(match) && match.length > 1) {
      // RegExp match without post-processing: take the first capture group.
      value = match[1]
    }

    return {
      value,
      matched: true,
      pattern: patternDesc
    }
  }

  // Nothing matched: report the rule's default.
  return {
    value: rule.fallback,
    matched: false,
    pattern: null
  }
}
+
/**
 * Pick a value by keyword presence.
 *
 * Rules are checked in order; the first rule with at least one keyword that
 * occurs in the content (case-insensitively) supplies the result.
 *
 * @param {string} content - Text to search.
 * @param {Array<{keywords: string[], value: any}>} rules - Candidate rules.
 * @returns {any|null} The `value` of the first matching rule, or `null` when
 *   nothing matches or the input is not usable (non-string content, no rules).
 */
function matchByContent(content, rules) {
  if (typeof content !== 'string' || !Array.isArray(rules)) {
    return null
  }

  const haystack = content.toLowerCase()
  const hit = rules.find(rule =>
    rule.keywords.some(keyword => haystack.includes(keyword.toLowerCase()))
  )

  return hit ? hit.value : null
}
+
/**
 * Pick a value by occurrence count.
 *
 * Every rule's pattern is counted over the whole content and the rule with
 * the strictly highest count wins; on a tie the earlier rule is kept.
 *
 * @param {string} content - Text to search.
 * @param {Array<{pattern: RegExp|string, value: any}>} rules - Candidate rules.
 * @returns {any|null} The winning rule's `value`, or `null` when no pattern
 *   occurs at all or the input is not usable.
 */
function matchByCount(content, rules) {
  if (typeof content !== 'string' || !Array.isArray(rules)) {
    return null
  }

  let maxCount = 0
  let maxValue = null

  for (const rule of rules) {
    // Always count with a fresh global regex. A non-global pattern would make
    // String.prototype.match return [fullMatch, ...groups], whose length is
    // not the number of occurrences.
    const counter = rule.pattern instanceof RegExp
      ? new RegExp(
        rule.pattern.source,
        rule.pattern.flags.includes('g') ? rule.pattern.flags : `${rule.pattern.flags}g`
      )
      : new RegExp(rule.pattern, 'g')
    const found = content.match(counter)
    const count = found ? found.length : 0

    if (count > maxCount) {
      maxCount = count
      maxValue = rule.value
    }
  }

  return maxValue
}
+
/**
 * Extract a list of items from one section of the document.
 *
 * `pattern` locates the section (its first capture group, or the whole match
 * when it has no group). `itemPattern` is then applied repeatedly inside the
 * section; for each hit the first non-blank capture group is used, or the
 * whole match when the pattern has no groups. If no item is found that way,
 * every non-empty line shorter than 50 characters is taken instead.
 *
 * @param {string} content - Document text.
 * @param {RegExp} pattern - Locates the section.
 * @param {RegExp|string} [itemPattern] - Matches a single item; a string is
 *   compiled with the `gm` flags. When omitted, only the line split is used.
 * @returns {string[]|null} Trimmed items, or `null` when the section is
 *   missing or contains no usable item.
 */
function extractList(content, pattern, itemPattern) {
  const sectionMatch = content.match(pattern)
  if (!sectionMatch) return null

  const section = sectionMatch[1] ?? sectionMatch[0]
  const items = []

  if (itemPattern) {
    // Work on a private global copy. matchAll requires the `g` flag, never
    // loops forever on zero-length matches, and ignores any `lastIndex` left
    // on the caller's shared regex object.
    let itemRegex
    if (typeof itemPattern === 'string') {
      itemRegex = new RegExp(itemPattern, 'gm')
    } else {
      const flags = itemPattern.flags.includes('g') ? itemPattern.flags : `${itemPattern.flags}g`
      itemRegex = new RegExp(itemPattern.source, flags)
    }

    for (const match of section.matchAll(itemRegex)) {
      // Prefer the first non-blank capture group, else the whole match.
      const captured = match.slice(1).find(group => group && group.trim())
      const item = (captured ?? match[0]).trim()
      if (item) {
        items.push(item)
      }
    }
  }

  // No item matched: fall back to one item per short, non-empty line.
  if (items.length === 0) {
    for (const line of section.split('\n')) {
      const trimmed = line.trim()
      if (trimmed.length > 0 && trimmed.length < 50) {
        items.push(trimmed)
      }
    }
  }

  return items.length > 0 ? items : null
}
+
/**
 * Extract a numeric range.
 *
 * @param {string} content - Text to search.
 * @param {RegExp} pattern - Pattern whose capture groups 1 and 2 hold the
 *   lower and upper bound.
 * @returns {{min: number, max: number}|null} The parsed bounds, or `null`
 *   when the pattern does not match or a bound is not a decimal integer
 *   (for example when the pattern has no capture groups).
 */
function extractRange(content, pattern) {
  const match = content.match(pattern)
  if (!match) return null

  const min = Number.parseInt(match[1], 10)
  const max = Number.parseInt(match[2], 10)

  // A NaN bound is not a usable match; report "not found" instead.
  if (Number.isNaN(min) || Number.isNaN(max)) return null

  return { min, max }
}
+
/**
 * Extract list items that follow a heading.
 *
 * Scans the lines after the first match of `startPattern`, one by one, and
 * stops at the first line containing any end keyword. Lines accepted by
 * `itemFilter` are collected in their trimmed form; without a filter, every
 * non-empty line shorter than 100 characters is kept.
 *
 * @param {string} content - Document text.
 * @param {RegExp} startPattern - Matches the list heading; scanning starts
 *   right after the match.
 * @param {string[]} [endKeywords] - Substrings that terminate the list.
 *   Missing input and empty strings are ignored.
 * @param {Function} [itemFilter] - Receives a trimmed line and returns a
 *   truthy value to keep it.
 * @returns {string[]|null} Collected items, or `null` when the heading is
 *   missing or no line qualifies.
 */
function smartExtractList(content, startPattern, endKeywords, itemFilter) {
  // A global regex makes String.prototype.match return all matches without
  // an `index`, so locate the heading with a non-global copy.
  const locator = startPattern.global
    ? new RegExp(startPattern.source, startPattern.flags.replace('g', ''))
    : startPattern
  const startMatch = content.match(locator)
  if (!startMatch) return null

  // An empty keyword is contained in every line and would end the scan at once.
  const stopWords = Array.isArray(endKeywords)
    ? endKeywords.filter(keyword => typeof keyword === 'string' && keyword.length > 0)
    : []

  const accepts = typeof itemFilter === 'function'
    ? itemFilter
    : (line) => line.length > 0 && line.length < 100

  const tail = content.slice(startMatch.index + startMatch[0].length)
  const items = []

  for (const rawLine of tail.split('\n')) {
    const line = rawLine.trim()

    if (stopWords.some(keyword => line.includes(keyword))) {
      break
    }

    if (accepts(line)) {
      items.push(line)
    }
  }

  return items.length > 0 ? items : null
}
+
/**
 * Extract every configured field from a document.
 *
 * Fields are processed in ascending rule priority. A field whose extraction
 * yields a non-null value is written to `config`; a required field that fell
 * back to its default produces a warning, and a required field with no value
 * at all is listed in `unmatched`. A missing product name is replaced by the
 * file name without its extension. Finally, fields whose rule declares
 * `productType` are removed when the detected product type is not listed.
 *
 * @param {string} content - Document text.
 * @param {string} [fileName] - File name used as product-name fallback; a
 *   missing value is treated as an empty name.
 * @returns {{config: Object, unmatched: Array, warnings: Array, matchDetails: Array}}
 *   Extracted config, required fields without a value, warnings, and one
 *   `{ field, matched, pattern, value }` record per rule.
 */
export function smartExtractFields(content, fileName = '') {
  const config = {}
  const unmatched = []
  const warnings = []
  const matchDetails = []

  const sortedFields = Object.entries(FIELD_RULES).sort((a, b) => a[1].priority - b[1].priority)

  for (const [fieldName, rule] of sortedFields) {
    const result = extractField(content, fieldName)

    matchDetails.push({
      field: fieldName,
      matched: result.matched,
      pattern: result.pattern,
      value: result.value
    })

    if (result.value !== null) {
      // Either a real match or the rule's default.
      config[fieldName] = result.value

      if (!result.matched && rule.required) {
        warnings.push({
          field: fieldName,
          message: `未找到字段 "${fieldName}"，使用默认值: ${JSON.stringify(rule.fallback)}`,
          severity: 'warning'
        })
      }
    } else if (rule.required) {
      // Required field with neither a match nor a default.
      unmatched.push({
        field: fieldName,
        reason: '未找到匹配内容',
        suggestions: generateSuggestions(fieldName, content)
      })
    }
  }

  // Product name fallback: use the file name without its extension.
  if (!config.product_name) {
    const baseName = String(fileName ?? '').replace(/\.[^/.]+$/, '')
    config.product_name = baseName
    warnings.push({
      field: 'product_name',
      message: `未找到产品名称，使用文件名: "${baseName}"`,
      severity: 'info'
    })
  }

  // Drop fields that do not apply to the detected product type, as declared
  // by each rule's `productType` list.
  for (const [fieldName, rule] of sortedFields) {
    if (Array.isArray(rule.productType) && !rule.productType.includes(config.product_type)) {
      delete config[fieldName]
    }
  }

  return {
    config,
    unmatched,
    warnings,
    matchDetails
  }
}
+
/**
 * Suggest candidate values for a field that could not be extracted.
 *
 * @param {string} fieldName - Field name.
 * @param {string} content - Document text (currently not used by the lookup).
 * @returns {Array<string>} Static suggestions for the field, or a single
 *   "enter manually" hint for unknown field names.
 */
function generateSuggestions(fieldName, content) {
  const suggestions = {
    product_name: ['从文档标题提取', '从第一行提取', '手动输入产品全称'],
    product_type: ['savings - 储蓄型产品', 'critical-illness - 重疾型产品', 'life-insurance - 人寿型产品'],
    currency: ['USD - 美元', 'CNY - 人民币', 'HKD - 港币', 'EUR - 欧元'],
    payment_periods: ['整付', '3年', '5年', '10年', '15年', '20年'],
    age_range: ['0-75岁（常见范围）', '0-70岁', '1-65岁'],
    insurance_period: ['终身', '至100岁', '20年', '30年'],
    withdrawal_modes: ['指定提取金额', '最高固定提取金额'],
    withdrawal_periods: ['1年', '3年', '5年', '10年', '15年', '20年']
  }

  // Own-property check: a name like "toString" must not return the inherited
  // Object.prototype member instead of the default hint.
  if (Object.prototype.hasOwnProperty.call(suggestions, fieldName)) {
    return suggestions[fieldName]
  }
  return ['请手动输入']
}
+
/**
 * Escape text for use inside a Markdown table cell.
 * A raw `|` (common in regex descriptions) would otherwise start a new column.
 *
 * @param {string} text - Cell text.
 * @returns {string} Text with every `|` written as `\|`.
 */
function escapeTableCell(text) {
  return String(text).replace(/\|/g, '\\|')
}

/**
 * Build the human review report for an extraction result.
 *
 * The report contains match statistics, one table row per field, the warning
 * list (when present) and, for each unmatched field, the reason and the
 * suggested values.
 *
 * @param {Object} result - Extraction result with `unmatched`, `warnings`
 *   and `matchDetails` arrays; a missing array is treated as empty.
 * @returns {string} Report in Markdown format.
 */
export function generateAuditReport(result) {
  const { unmatched = [], warnings = [], matchDetails = [] } = result

  let report = `## 📊 字段提取报告\n\n`

  // Statistics
  const matchedCount = matchDetails.filter(m => m.matched).length
  const totalCount = matchDetails.length
  report += `### 匹配统计\n\n`
  report += `- ✅ 成功匹配: ${matchedCount}/${totalCount} 字段\n`
  report += `- ⚠️  使用默认值: ${warnings.length} 字段\n`
  report += `- ❌ 未匹配（需人工补充）: ${unmatched.length} 字段\n\n`

  // Per-field table
  report += `### 匹配详情\n\n`
  report += `| 字段 | 状态 | 提取方式 | 值 |\n`
  report += `|------|------|----------|----|\n`

  for (const detail of matchDetails) {
    const status = detail.matched ? '✅' : (FIELD_RULES[detail.field]?.required ? '⚠️' : 'ℹ️')
    const method = detail.matched ? detail.pattern : '默认值'
    // JSON.stringify returns undefined for an undefined value; show it as text.
    const serialized = JSON.stringify(detail.value) ?? 'undefined'
    const valuePreview = serialized.substring(0, 50)
    report += `| ${detail.field} | ${status} | ${escapeTableCell(method)} | ${escapeTableCell(valuePreview)} |\n`
  }

  // Warnings
  if (warnings.length > 0) {
    report += `\n### ⚠️  警告信息\n\n`
    for (const warning of warnings) {
      report += `- **${warning.field}**: ${warning.message}\n`
    }
  }

  // Fields that need manual input
  if (unmatched.length > 0) {
    report += `\n### ❌ 未匹配字段（需要人工补充）\n\n`
    for (const item of unmatched) {
      report += `#### ${item.field}\n\n`
      report += `- **原因**: ${item.reason}\n`
      report += `- **建议值**:\n`
      for (const suggestion of item.suggestions) {
        report += `  - ${suggestion}\n`
      }
      report += `\n`
    }
  }

  return report
}
+
+export { FIELD_RULES }