feat(parse): 支持多产品文档解析

- 新增 product-splitter.js 产品边界检测模块
  - 支持产品代码前缀识别（GS、GC、FA、LV2 等）
  - 支持产品命名模式（以"計劃"、"保障"、"保险"、"壽險"结尾）
  - 自动检测和分割多产品文档
- 增强 parse-docs.js 多产品处理
  - parseSingleFile() 返回数组支持多产品
  - generateAuditFile() 支持产品索引参数
  - 单文件模式 (--file=) 正确处理多产品结果
  - buildParseSummary() 统计多产品数量
- 优化 smart-field-extractor.js
  - 新增 smartExtractFieldsForProduct() 单产品提取
  - 移除重复的函数定义
  - 包装函数兼容新旧调用方式

测试结果：
- 成功解析计划书模版2.docx 中的 4 个保险产品
- 每个产品生成独立的审核文件

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

feat(parse): 支持多产品文档解析
- 新增 product-splitter.js 产品边界检测模块
  - 支持产品代码前缀识别（GS、GC、FA、LV2 等）
  - 支持产品命名模式（以"計劃"、"保障"、"保险"、"壽險"结尾）
  - 自动检测和分割多产品文档
- 增强 parse-docs.js 多产品处理
  - parseSingleFile() 返回数组支持多产品
  - generateAuditFile() 支持产品索引参数
  - 单文件模式 (--file=) 正确处理多产品结果
  - buildParseSummary() 统计多产品数量
- 优化 smart-field-extractor.js
  - 新增 smartExtractFieldsForProduct() 单产品提取
  - 移除重复的函数定义
  - 包装函数兼容新旧调用方式

测试结果：
- 成功解析计划书模版2.docx 中的 4 个保险产品
- 每个产品生成独立的审核文件

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
hookehuyr
Commit 6ea29159e6b4547711fb646752541674e5de2f8f 6ea29159 1 parent d5f911d1
Showing 3 changed files with 372 additions and 74 deletions
scripts/parse-docs.js
scripts/product-splitter.js
scripts/smart-field-extractor.js
--- a/scripts/parse-docs.js
View file @6ea2915
+++ b/scripts/parse-docs.js
View file @6ea2915
--- a/scripts/product-splitter.js 0 → 100644
View file @6ea2915
+++ b/scripts/product-splitter.js 0 → 100644
View file @6ea2915
+/**
+ * 产品分割器
+ *
+ * @description 从包含多个保险产品的文档中识别并分割出各个产品
+ * @module scripts/product-splitter
+ * @author Claude Code
+ * @created 2026-02-15
+ */
+
+/**
+ * Product-title matching patterns.
+ *
+ * @description Regular expressions intended to recognise product title lines
+ * in a document. Every pattern is anchored to the start of a line (`^` with
+ * the `m` flag) and looks for one of the keywords
+ * 計劃 / 计划 / 保障 / 保险 / 壽險 / 壽险, or for a leading product code.
+ * Example title lines:
+ * - GS宏摯傳承保障計劃 - 性別, 年齡, 出生年月日
+ * - GC宏摯家傳承保險計劃- 性別, 年齡, 出生年月日
+ * - FA 宏浚傳承保障計劃
+ * - LV2 赤霞珠終身壽險計劃2基本人壽保障選項
+ *
+ * Within this module the array is only exported; findProductTitles builds
+ * its own regexes and does not read it.
+ *
+ * NOTE: all four patterns carry the `g` flag, so each RegExp object keeps
+ * `lastIndex` between `exec`/`test` calls. Code that reuses these shared
+ * objects should reset `lastIndex` or iterate with `String.prototype.matchAll`.
+ */
+const PRODUCT_TITLE_PATTERNS = [
+  // Product code + product name + optional trailing descriptor
+  // e.g. GS宏摯傳承保障計劃 - 性別, 年齡, 出生年月日
+  /^([A-Z]{2,4}\d?)\s*([^\n\-]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm,
+
+  // Product code + mandatory whitespace + product name
+  // e.g. FA 宏浚傳承保障計劃
+  /^([A-Z]{2,4}\d?)\s+([^\n]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险))/gm,
+
+  // Bare product name without a code (the line contains one of the keywords)
+  // e.g. 宏摯傳承保障計劃
+  /^([^\n]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm,
+
+  // Line that starts with a product code followed by "-" or ":"
+  /^([A-Z]{2,4}\d?)\s*[-:]\s*([^\n]+)/gm
+]
+
+/**
+ * Known product-code prefixes, tried first when looking for product titles.
+ *
+ * findProductTitles interpolates each entry, unescaped, into a regular
+ * expression followed by an optional digit (`\d?`). Because of that optional
+ * digit the 'LV' entry also matches titles that start with 'LV2';
+ * findProductTitles removes such duplicates by captured code.
+ */
+const PRODUCT_CODE_PREFIXES = [
+  'GS', 'GC', 'FA', 'LV2', 'LV', 'CR', 'HR', 'PR', 'SR',
+  'TR', 'UR', 'WR', 'XR', 'YR', 'ZR'
+]
+
+/**
+ * Count the products a document appears to contain.
+ *
+ * Delegates to findProductTitles and reports how many title entries it found.
+ *
+ * @param {string} content - Document text
+ * @returns {number} Number of product titles detected
+ */
+export function detectProductCount(content) {
+  return findProductTitles(content).length
+}
+
+/**
+ * Find every product title line in a document.
+ *
+ * Two strategies are applied:
+ * 1. For each entry of PRODUCT_CODE_PREFIXES, match lines that start with the
+ *    prefix (plus an optional digit) followed by CJK characters containing one
+ *    of the keywords 計劃 / 计划 / 保障 / 保险 / 壽險 / 壽险. Only the first
+ *    title found for a given code is kept.
+ * 2. Only when strategy 1 finds nothing: match any line containing one of the
+ *    keywords, with an optional leading code. Titles shorter than 5 characters
+ *    after trimming are ignored. No de-duplication is done here.
+ *
+ * @param {string} content - Document text. Non-string values are converted
+ *   with String(); null/undefined are treated as an empty document.
+ * @returns {Array<{index: number, code: (string|null), name: string, fullTitle: string}>}
+ *   Title entries sorted by their position (`index`) in the document. `code`
+ *   is null when the fallback strategy matched a line without a code.
+ */
+export function findProductTitles(content) {
+  const text = String(content ?? '')
+  const products = []
+  const seenCodes = new Set()
+
+  // Strategy 1: known product-code prefixes take priority
+  for (const prefix of PRODUCT_CODE_PREFIXES) {
+    // Escape regex metacharacters so a prefix is always matched literally
+    const literalPrefix = prefix.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
+
+    // Matches "GS宏摯傳承保障計劃" as well as "GS 宏摯傳承保障計劃"
+    const titlePattern = new RegExp(
+      `^(${literalPrefix}\\d?)\\s*([\\u4e00-\\u9fa5]+(?:計劃|计划|保障|保险|壽險|壽险)[^\\n]*)`,
+      'gm'
+    )
+
+    for (const match of text.matchAll(titlePattern)) {
+      const code = match[1]
+
+      // Keep only the first title per code ('LV' + optional digit also hits 'LV2')
+      if (seenCodes.has(code)) continue
+      seenCodes.add(code)
+
+      products.push({
+        index: match.index,
+        code,
+        name: match[2].trim(),
+        fullTitle: match[0].trim()
+      })
+    }
+  }
+
+  // Strategy 2: nothing found via prefixes, fall back to a generic keyword match
+  if (products.length === 0) {
+    const genericPattern = /^([A-Z]{2,4}\d?)?\s*([^\n]*?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm
+
+    for (const match of text.matchAll(genericPattern)) {
+      const fullTitle = match[0].trim()
+      if (fullTitle.length < 5) continue // too short to be a real title
+
+      products.push({
+        index: match.index,
+        code: match[1] || null,
+        // Trimmed like strategy 1, so CRLF input does not leave "\r" in the name
+        name: match[2].trim() || fullTitle,
+        fullTitle
+      })
+    }
+  }
+
+  // Order by position in the document
+  products.sort((a, b) => a.index - b.index)
+
+  return products
+}
+
+/**
+ * Cut a document into one fragment per detected product.
+ *
+ * Title positions come from findProductTitles. With zero or one title the
+ * whole document is returned as a single entry (all metadata null when no
+ * title was found). With several titles, each fragment runs from its own
+ * title to the start of the next title (or the end of the document) and is
+ * trimmed; text that precedes the first title is not part of any fragment.
+ *
+ * @param {string} content - Document text
+ * @returns {Array<{code: (string|null), name: (string|null), content: string, fullTitle: (string|null)}>}
+ *   One entry per product, in document order
+ */
+export function splitByProducts(content) {
+  const titles = findProductTitles(content)
+
+  // Zero or one title: nothing to cut, the whole document is the only entry
+  if (titles.length < 2) {
+    const sole = titles[0]
+    return [{
+      code: sole ? sole.code : null,
+      name: sole ? sole.name : null,
+      content,
+      fullTitle: sole ? sole.fullTitle : null
+    }]
+  }
+
+  // Several titles: each fragment ends where the following title begins
+  return titles.map(({ index, code, name, fullTitle }, position) => {
+    const following = titles[position + 1]
+    const sliceEnd = following ? following.index : content.length
+
+    return {
+      code,
+      name,
+      content: content.slice(index, sliceEnd).trim(),
+      fullTitle
+    }
+  })
+}
+
+/**
+ * Normalise a raw title into a product name.
+ *
+ * Strips a leading product code (2-4 capitals, optional digit, optional
+ * "-" / ":" / "：" separator) and everything from the first "-", "—", ":" or
+ * "：" onwards.
+ *
+ * @param {string} rawTitle - Title or matched text to clean
+ * @returns {string|null} Cleaned name, or null when 2 characters or fewer remain
+ */
+function cleanProductTitle(rawTitle) {
+  const withoutCode = rawTitle.replace(/^[A-Z]{2,4}\d?\s*[-:：]?\s*/, '')
+  const name = withoutCode.split(/[-—:：]/)[0].trim()
+  return name.length > 2 ? name : null
+}
+
+/**
+ * Extract a normalised product name.
+ *
+ * @description The name is taken from the full title when that yields more
+ * than 2 characters after cleaning. Otherwise the content is searched, in
+ * order, for a "产品名称:" label, a "计划书名称:" label, and a
+ * code + CJK name ending in one of the product keywords.
+ * @param {string} [fullTitle] - Full product title line
+ * @param {string} [content] - Product content fragment, used as a fallback
+ * @returns {string|null} Product name, or null when none can be determined
+ */
+export function extractProductName(fullTitle, content) {
+  if (!fullTitle && !content) return null
+
+  // Prefer the full title
+  if (fullTitle) {
+    const fromTitle = cleanProductTitle(fullTitle)
+    if (fromTitle) return fromTitle
+  }
+
+  // The title gave nothing usable and there is no content to search.
+  // Without this guard content.match would throw on null/undefined.
+  if (!content) return null
+
+  // Fall back to searching the content
+  const patterns = [
+    /产品名称[：:]\s*([^\n]+)/,
+    /计划书名称[：:]\s*([^\n]+)/,
+    /([A-Z]{2,4}\d?\s*[\u4e00-\u9fa5]+(?:計劃|计划|保障|保险|壽險|壽险))/
+  ]
+
+  for (const pattern of patterns) {
+    const match = content.match(pattern)
+    if (!match) continue
+
+    const fromContent = cleanProductTitle(match[1] || match[0])
+    if (fromContent) return fromContent
+  }
+
+  return null
+}
+
+/**
+ * Render a Markdown report describing how a document was split.
+ *
+ * The report has a statistics section (document length, product count) and a
+ * table with one row per product: ordinal, code, name and fragment length.
+ * A missing code is shown as "-". The name column falls back to the first 20
+ * characters of fullTitle, then to "-", and is capped at 30 characters.
+ *
+ * @param {string} content - Original document text
+ * @param {Array} products - Split result; each entry provides code, name,
+ *   fullTitle and content
+ * @returns {string} Markdown report; every line ends with a newline
+ */
+export function generateSplitReport(content, products) {
+  const lines = [
+    '## 📊 产品分割报告',
+    '',
+    '### 分割统计',
+    '',
+    `- 文档总长度: ${content.length} 字符`,
+    `- 识别产品数: ${products.length} 个`,
+    '',
+    '### 产品列表',
+    '',
+    '| 序号 | 产品代码 | 产品名称 | 内容长度 |',
+    '|------|---------|---------|----------|'
+  ]
+
+  products.forEach((entry, position) => {
+    const codeCell = entry.code || '-'
+    const nameCell = (entry.name || entry.fullTitle?.slice(0, 20) || '-').slice(0, 30)
+    const size = entry.content.length
+    lines.push(`| ${position + 1} | ${codeCell} | ${nameCell} | ${size} 字符 |`)
+  })
+
+  // Terminate every line, including the last one, with a newline
+  return lines.map((line) => `${line}\n`).join('')
+}
+
+export {
+  PRODUCT_TITLE_PATTERNS,
+  PRODUCT_CODE_PREFIXES
+}
--- a/scripts/smart-field-extractor.js
View file @6ea2915
+++ b/scripts/smart-field-extractor.js
View file @6ea2915
@@ -463,80 +463,6 @@ function smartExtractList(content, startPattern, endKeywords, itemFilter) {
 }
 /**
- * 智能提取所有字段
- *
- * @param {string} content - 文档内容
- * @param {string} fileName - 文件名（用于推断产品名称）
- * @returns {{config: Object, unmatched: Array, warnings: Array}} 提取结果
- */
-export function smartExtractFields(content, fileName) {
-  const config = {}
-  const unmatched = []
-  const warnings = []
-  const matchDetails = []
-
-  // 按优先级提取字段
-  const sortedFields = Object.entries(FIELD_RULES).sort((a, b) => a[1].priority - b[1].priority)
-
-  for (const [fieldName, rule] of sortedFields) {
-    const result = extractField(content, fieldName)
-
-    // 记录匹配详情
-    matchDetails.push({
-      field: fieldName,
-      matched: result.matched,
-      pattern: result.pattern,
-      value: result.value
-    })
-
-    // 如果匹配成功或字段有默认值
-    if (result.value !== null) {
-      config[fieldName] = result.value
-
-      // 如果使用了默认值，记录警告
-      if (!result.matched && rule.required) {
-        warnings.push({
-          field: fieldName,
-          message: `未找到字段 "${fieldName}"，使用默认值: ${JSON.stringify(rule.fallback)}`,
-          severity: 'warning'
-        })
-      }
-    } else if (rule.required) {
-      // 必填字段未匹配
-      unmatched.push({
-        field: fieldName,
-        reason: '未找到匹配内容',
-        suggestions: generateSuggestions(fieldName, content)
-      })
-    }
-  }
-
-  // 产品名称特殊处理：如果未匹配，使用文件名
-  if (!config.product_name) {
-    const baseName = fileName.replace(/\.[^/.]+$/, '')
-    config.product_name = baseName
-    warnings.push({
-      field: 'product_name',
-      message: `未找到产品名称，使用文件名: "${baseName}"`,
-      severity: 'info'
-    })
-  }
-
-  // 根据产品类型过滤字段
-  if (config.product_type !== 'savings') {
-    delete config.withdrawal_modes
-    delete config.withdrawal_periods
-  }
-
-  return {
-    config,
-    unmatched,
-    warnings,
-    matchDetails
-  }
-}
-
-/**
  * 生成字段建议值
  *
  * @param {string} fieldName - 字段名称
@@ -614,4 +540,131 @@ export function generateAuditReport(result) {
   return report
 }
+/**
+ * Extract all configured fields from one product's content fragment.
+ *
+ * @description Every FIELD_RULES entry except `product_name` is run through
+ * extractField in ascending `priority` order. A non-null value is stored in
+ * the config; a required field whose value did not come from a match is
+ * reported as a warning, and a required field with a null value is reported
+ * as unmatched. `product_name` is resolved separately with this precedence:
+ * options.productName > value matched in the content > file name without its
+ * extension. Its match detail is always placed first in `matchDetails`.
+ * `withdrawal_modes` and `withdrawal_periods` are dropped unless
+ * `product_type` is 'savings'.
+ * @param {string} content - Product content fragment
+ * @param {string} fileName - Source file name (fallback for the product name)
+ * @param {Object} options - Extra options
+ * @param {string} options.productCode - Product code (e.g. GS, GC, FA); stored as `product_code` when given
+ * @param {string} options.productName - Product name supplied by the splitter
+ * @returns {{config: Object, unmatched: Array, warnings: Array, matchDetails: Array}} Extraction result
+ */
+export function smartExtractFieldsForProduct(content, fileName, options = {}) {
+  const { productCode, productName } = options
+  const config = {}
+  const unmatched = []
+  const warnings = []
+  const matchDetails = []
+
+  // product_name gets its own resolution below, so it is left out of the generic pass
+  const rankedRules = Object.entries(FIELD_RULES)
+    .sort(([, ruleA], [, ruleB]) => ruleA.priority - ruleB.priority)
+    .filter(([fieldName]) => fieldName !== 'product_name')
+
+  for (const [fieldName, rule] of rankedRules) {
+    const { matched, pattern, value } = extractField(content, fieldName)
+    matchDetails.push({ field: fieldName, matched, pattern, value })
+
+    if (value === null) {
+      // Nothing extracted and no default: only required fields are reported
+      if (rule.required) {
+        unmatched.push({
+          field: fieldName,
+          reason: '未找到匹配内容',
+          suggestions: generateSuggestions(fieldName, content)
+        })
+      }
+      continue
+    }
+
+    config[fieldName] = value
+
+    // A value without a match means the default was used
+    if (rule.required && !matched) {
+      warnings.push({
+        field: fieldName,
+        message: `未找到字段 "${fieldName}"，使用默认值: ${JSON.stringify(rule.fallback)}`,
+        severity: 'warning'
+      })
+    }
+  }
+
+  // ---------- product name: splitter value > content match > file name ----------
+  let nameDetail
+  if (productName) {
+    nameDetail = {
+      field: 'product_name',
+      matched: true,
+      pattern: 'product_splitter',
+      value: productName
+    }
+  } else {
+    const extracted = extractField(content, 'product_name')
+    if (extracted.matched && extracted.value) {
+      nameDetail = {
+        field: 'product_name',
+        matched: true,
+        pattern: extracted.pattern,
+        value: extracted.value
+      }
+    } else {
+      // Last resort: the file name with its extension removed
+      const baseName = fileName.replace(/\.[^/.]+$/, '')
+      warnings.push({
+        field: 'product_name',
+        message: `未找到产品名称，使用文件名: "${baseName}"`,
+        severity: 'info'
+      })
+      nameDetail = {
+        field: 'product_name',
+        matched: false,
+        pattern: 'filename_fallback',
+        value: baseName
+      }
+    }
+  }
+  config.product_name = nameDetail.value
+  matchDetails.unshift(nameDetail)
+
+  if (productCode) {
+    config.product_code = productCode
+  }
+
+  // Withdrawal settings are kept only for savings products
+  if (config.product_type !== 'savings') {
+    delete config.withdrawal_modes
+    delete config.withdrawal_periods
+  }
+
+  return { config, unmatched, warnings, matchDetails }
+}
+
+/**
+ * Extract all fields from a document (original entry point, kept for
+ * backward compatibility).
+ *
+ * Forwards to smartExtractFieldsForProduct with an empty options object, so
+ * no splitter-supplied product code or name is applied.
+ *
+ * @param {string} content - Document text
+ * @param {string} fileName - File name (fallback source for the product name)
+ * @returns {{config: Object, unmatched: Array, warnings: Array, matchDetails: Array}} Extraction result
+ */
+export function smartExtractFields(content, fileName) {
+  const noProductOverrides = {}
+  return smartExtractFieldsForProduct(content, fileName, noProductOverrides)
+}
+
 export { FIELD_RULES }