feat(parse): 支持多产品文档解析

- 新增 product-splitter.js 产品边界检测模块
  - 支持产品代码前缀识别（GS、GC、FA、LV2 等）
  - 支持产品命名模式（以"計劃"、"保障"、"保险"、"壽險"结尾）
  - 自动检测和分割多产品文档
- 增强 parse-docs.js 多产品处理
  - parseSingleFile() 返回数组支持多产品
  - generateAuditFile() 支持产品索引参数
  - 单文件模式 (--file=) 正确处理多产品结果
  - buildParseSummary() 统计多产品数量
- 优化 smart-field-extractor.js
  - 新增 smartExtractFieldsForProduct() 单产品提取
  - 移除重复的函数定义
  - 包装函数兼容新旧调用方式

测试结果：
- 成功解析计划书模版2.docx 中的 4 个保险产品
- 每个产品生成独立的审核文件

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

feat(parse): 支持多产品文档解析
- 新增 product-splitter.js 产品边界检测模块
  - 支持产品代码前缀识别（GS、GC、FA、LV2 等）
  - 支持产品命名模式（以"計劃"、"保障"、"保险"、"壽險"结尾）
  - 自动检测和分割多产品文档
- 增强 parse-docs.js 多产品处理
  - parseSingleFile() 返回数组支持多产品
  - generateAuditFile() 支持产品索引参数
  - 单文件模式 (--file=) 正确处理多产品结果
  - buildParseSummary() 统计多产品数量
- 优化 smart-field-extractor.js
  - 新增 smartExtractFieldsForProduct() 单产品提取
  - 移除重复的函数定义
  - 包装函数兼容新旧调用方式

测试结果：
- 成功解析计划书模版2.docx 中的 4 个保险产品
- 每个产品生成独立的审核文件

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
hookehuyr
Commit 6ea29159e6b4547711fb646752541674e5de2f8f 6ea29159 1 parent d5f911d1
Showing 3 changed files with 372 additions and 74 deletions
scripts/parse-docs.js
scripts/product-splitter.js
scripts/smart-field-extractor.js
--- a/scripts/parse-docs.js
View file @6ea2915
+++ b/scripts/parse-docs.js
View file @6ea2915
--- a/scripts/product-splitter.js 0 → 100644
View file @6ea2915
+++ b/scripts/product-splitter.js 0 → 100644
View file @6ea2915
+ /**
+  * 产品分割器
+  *
+  * @description 从包含多个保险产品的文档中识别并分割出各个产品
+  * @module scripts/product-splitter
+  * @author Claude Code
+  * @created 2026-02-15
+  */
+ 
+ /**
+  * Regular expressions written to recognise product title lines.
+  *
+  * Every pattern carries the `g` and `m` flags, so `^` anchors at the start of
+  * each line. Because of the `g` flag, each RegExp object keeps its own
+  * `lastIndex` between `exec()` / `test()` calls; reset it (or build a fresh
+  * RegExp from the pattern) before reusing one on another string.
+  *
+  * No function visible in this module reads the list; it is only exported.
+  *
+  * Title formats the patterns target:
+  * - GS宏摯傳承保障計劃 - 性別, 年齡, 出生年月日
+  * - GC宏摯家傳承保險計劃- 性別, 年齡, 出生年月日
+  * - FA 宏浚傳承保障計劃
+  * - LV2 赤霞珠終身壽險計劃2基本人壽保障選項
+  */
+ const PRODUCT_TITLE_PATTERNS = [
+   // [0] code, optional whitespace, a hyphen-free name reaching a keyword,
+   //     then the rest of the line
+   //     e.g. "GS宏摯傳承保障計劃 - 性別, 年齡, 出生年月日"
+   /^([A-Z]{2,4}\d?)\s*([^\n\-]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm,
+ 
+   // [1] code, mandatory whitespace, name ending at the keyword
+   //     e.g. "FA 宏浚傳承保障計劃"
+   /^([A-Z]{2,4}\d?)\s+([^\n]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险))/gm,
+ 
+   // [2] bare product name without a code
+   //     e.g. "宏摯傳承保障計劃"
+   /^([^\n]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm,
+ 
+   // [3] any line opening with a code followed by "-" or ":"
+   /^([A-Z]{2,4}\d?)\s*[-:]\s*([^\n]+)/gm
+ ]
+ 
+ /**
+  * Known product-code prefixes, tried first when looking for product titles.
+  *
+  * `findProductTitles` turns each entry into its own line-anchored pattern
+  * that also accepts one optional trailing digit after the prefix.
+  */
+ const PRODUCT_CODE_PREFIXES = [
+   'GS',
+   'GC',
+   'FA',
+   'LV2',
+   'LV',
+   'CR',
+   'HR',
+   'PR',
+   'SR',
+   'TR',
+   'UR',
+   'WR',
+   'XR',
+   'YR',
+   'ZR'
+ ]
+ 
+ /**
+  * Count the product titles found in a document.
+  *
+  * @param {string} content - Document text
+  * @returns {number} Number of entries returned by `findProductTitles(content)`
+  */
+ export function detectProductCount(content) {
+   return findProductTitles(content).length
+ }
+ 
+ /**
+  * Locate every product title in a document.
+  *
+  * Strategy 1 scans for lines that begin with one of `PRODUCT_CODE_PREFIXES`
+  * (plus an optional digit) followed by CJK text containing a product keyword.
+  * Each distinct code is reported once, at its first occurrence.
+  *
+  * Strategy 2 runs only when strategy 1 finds nothing: any line containing a
+  * product keyword is reported, with an optional leading upper-case code.
+  *
+  * @param {string} content - Document text
+  * @returns {Array<{index: number, code: (string|null), name: string, fullTitle: string}>}
+  *   Titles ordered by their offset in `content`; `code` is null when the
+  *   fallback strategy matched a line without a code
+  */
+ export function findProductTitles(content) {
+   const products = []
+   const seenCodes = new Set()
+ 
+   // Strategy 1: known product-code prefixes first
+   for (const prefix of PRODUCT_CODE_PREFIXES) {
+     // Matches "GS宏摯傳承保障計劃" as well as "GS 宏摯傳承保障計劃"
+     const regex = new RegExp(
+       `^(${prefix}\\d?)\\s*([\\u4e00-\\u9fa5]+(?:計劃|计划|保障|保险|壽險|壽险)[^\\n]*)`,
+       'gm'
+     )
+ 
+     let match
+     while ((match = regex.exec(content)) !== null) {
+       const code = match[1]
+ 
+       // Keep only the first occurrence of each code
+       if (seenCodes.has(code)) continue
+       seenCodes.add(code)
+ 
+       products.push({
+         index: match.index,
+         code,
+         name: match[2].trim(),
+         fullTitle: match[0].trim()
+       })
+     }
+   }
+ 
+   // Strategy 2: generic fallback when no known prefix matched
+   if (products.length === 0) {
+     // `[^\S\n]*` is whitespace without line feeds. Plain `\s*` would let a
+     // match start on a blank line, or pair an upper-case token on one line
+     // with a title on the next line.
+     const regex = /^([A-Z]{2,4}\d?)?[^\S\n]*([^\n]*?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm
+ 
+     let match
+     while ((match = regex.exec(content)) !== null) {
+       const fullTitle = match[0].trim()
+       if (fullTitle.length < 5) continue // too short to be a title
+ 
+       products.push({
+         index: match.index,
+         code: match[1] || null,
+         // Trimmed so both strategies report names the same way
+         name: match[2].trim() || fullTitle,
+         fullTitle
+       })
+     }
+   }
+ 
+   // Order by position in the document
+   products.sort((a, b) => a.index - b.index)
+ 
+   return products
+ }
+ 
+ /**
+  * Split a document into one entry per detected product.
+  *
+  * With zero or one detected title the whole document is returned as a single
+  * entry. With two or more, each entry holds the trimmed text from its own
+  * title up to the next title (or the end of the document); text that
+  * precedes the first title is not part of any entry.
+  *
+  * @param {string} content - Document text
+  * @returns {Array<{code: (string|null), name: (string|null), content: string, fullTitle: (string|null)}>}
+  *   One entry per product, in document order
+  */
+ export function splitByProducts(content) {
+   const titles = findProductTitles(content)
+ 
+   // Zero or one title: nothing to cut, hand back the document unchanged
+   if (titles.length <= 1) {
+     const [only] = titles
+     return [{
+       code: only ? only.code : null,
+       name: only ? only.name : null,
+       content,
+       fullTitle: only ? only.fullTitle : null
+     }]
+   }
+ 
+   // Several titles: each section runs from its title to the next one
+   return titles.map((title, position) => {
+     const following = titles[position + 1]
+     const sectionEnd = following ? following.index : content.length
+ 
+     return {
+       code: title.code,
+       name: title.name,
+       content: content.slice(title.index, sectionEnd).trim(),
+       fullTitle: title.fullTitle
+     }
+   })
+ }
+ 
+ /**
+  * Derive a normalised product name from a title line or, failing that, from
+  * the product's content.
+  *
+  * A candidate is cleaned by removing a leading product code (2-4 upper-case
+  * letters plus an optional digit) and cutting everything from the first
+  * "-", "—", ":" or "：" onward. It is accepted only when more than two
+  * characters remain.
+  *
+  * @param {string} [fullTitle] - Full product title line
+  * @param {string} [content] - Product content, searched only when the title
+  *   does not yield a usable name
+  * @returns {string|null} The product name, or null when none can be derived
+  */
+ export function extractProductName(fullTitle, content) {
+   if (!fullTitle && !content) return null
+ 
+   // Strip the product-code prefix, then drop suffix notes such as
+   // "- 性別, 年齡, 出生年月日"
+   const cleanName = (raw) => raw
+     .replace(/^[A-Z]{2,4}\d?\s*[-:：]?\s*/, '')
+     .split(/[-—:：]/)[0]
+     .trim()
+   const isUsable = (name) => name.length > 2
+ 
+   // The title line takes precedence
+   if (fullTitle) {
+     const name = cleanName(fullTitle)
+     if (isUsable(name)) return name
+   }
+ 
+   // The title was unusable and there is no content to search; without this
+   // guard `content.match` below would throw on null/undefined.
+   if (!content) return null
+ 
+   // Fall back to labelled fields or a code + name phrase inside the content
+   const patterns = [
+     /产品名称[：:]\s*([^\n]+)/,
+     /计划书名称[：:]\s*([^\n]+)/,
+     /([A-Z]{2,4}\d?\s*[\u4e00-\u9fa5]+(?:計劃|计划|保障|保险|壽險|壽险))/
+   ]
+ 
+   for (const pattern of patterns) {
+     const match = content.match(pattern)
+     if (!match) continue
+ 
+     const name = cleanName(match[1] || match[0])
+     if (isUsable(name)) return name
+   }
+ 
+   return null
+ }
+ 
+ /**
+  * Build a Markdown report describing how a document was split.
+  *
+  * The report contains the document length, the number of products and one
+  * table row per product. Code and name cells have "|" escaped so that a pipe
+  * inside a title cannot break the table into extra columns.
+  *
+  * @param {string} content - Original document text
+  * @param {Array<{code: (string|null), name: (string|null), content: string, fullTitle: (string|null)}>} products
+  *   Products as returned by the splitter
+  * @returns {string} Report in Markdown
+  */
+ export function generateSplitReport(content, products) {
+   // A literal "|" would start a new table column, so escape it
+   const toCell = (text) => text.replace(/\|/g, '\\|')
+ 
+   const rows = products.map((product, index) => {
+     const code = product.code || '-'
+     // Fall back to the start of the title when no name was extracted
+     const name = product.name || product.fullTitle?.slice(0, 20) || '-'
+     const length = product.content.length
+     return `| ${index + 1} | ${toCell(code)} | ${toCell(name.slice(0, 30))} | ${length} 字符 |\n`
+   })
+ 
+   return [
+     `## 📊 产品分割报告\n\n`,
+     `### 分割统计\n\n`,
+     `- 文档总长度: ${content.length} 字符\n`,
+     `- 识别产品数: ${products.length} 个\n\n`,
+     `### 产品列表\n\n`,
+     `| 序号 | 产品代码 | 产品名称 | 内容长度 |\n`,
+     `|------|---------|---------|----------|\n`,
+     ...rows
+   ].join('')
+ }
+ 
+ export {
+   PRODUCT_TITLE_PATTERNS,
+   PRODUCT_CODE_PREFIXES
+ }
--- a/scripts/smart-field-extractor.js
View file @6ea2915
+++ b/scripts/smart-field-extractor.js
View file @6ea2915
@@ -463,80 +463,6 @@ function smartExtractList(content, startPattern, endKeywords, itemFilter) {
 }
 
 /**
-  * 智能提取所有字段
-  *
-  * @param {string} content - 文档内容
-  * @param {string} fileName - 文件名（用于推断产品名称）
-  * @returns {{config: Object, unmatched: Array, warnings: Array}} 提取结果
-  */
- export function smartExtractFields(content, fileName) {
-   const config = {}
-   const unmatched = []
-   const warnings = []
-   const matchDetails = []
- 
-   // 按优先级提取字段
-   const sortedFields = Object.entries(FIELD_RULES).sort((a, b) => a[1].priority - b[1].priority)
- 
-   for (const [fieldName, rule] of sortedFields) {
-     const result = extractField(content, fieldName)
- 
-     // 记录匹配详情
-     matchDetails.push({
-       field: fieldName,
-       matched: result.matched,
-       pattern: result.pattern,
-       value: result.value
-     })
- 
-     // 如果匹配成功或字段有默认值
-     if (result.value !== null) {
-       config[fieldName] = result.value
- 
-       // 如果使用了默认值，记录警告
-       if (!result.matched && rule.required) {
-         warnings.push({
-           field: fieldName,
-           message: `未找到字段 "${fieldName}"，使用默认值: ${JSON.stringify(rule.fallback)}`,
-           severity: 'warning'
-         })
-       }
-     } else if (rule.required) {
-       // 必填字段未匹配
-       unmatched.push({
-         field: fieldName,
-         reason: '未找到匹配内容',
-         suggestions: generateSuggestions(fieldName, content)
-       })
-     }
-   }
- 
-   // 产品名称特殊处理：如果未匹配，使用文件名
-   if (!config.product_name) {
-     const baseName = fileName.replace(/\.[^/.]+$/, '')
-     config.product_name = baseName
-     warnings.push({
-       field: 'product_name',
-       message: `未找到产品名称，使用文件名: "${baseName}"`,
-       severity: 'info'
-     })
-   }
- 
-   // 根据产品类型过滤字段
-   if (config.product_type !== 'savings') {
-     delete config.withdrawal_modes
-     delete config.withdrawal_periods
-   }
- 
-   return {
-     config,
-     unmatched,
-     warnings,
-     matchDetails
-   }
- }
- 
- /**
  * 生成字段建议值
  *
  * @param {string} fieldName - 字段名称
@@ -614,4 +540,131 @@ export function generateAuditReport(result) {
   return report
 }
 
+ /**
+  * Extract every configured field from the text of a single product.
+  *
+  * All entries of `FIELD_RULES` except `product_name` are run through
+  * `extractField` in ascending `priority` order. `product_name` is resolved
+  * afterwards from, in order: `options.productName`, a matched extraction
+  * from `content`, then `fileName` without its extension.
+  *
+  * @param {string} content - Text of one product
+  * @param {string} fileName - Source file name, used as the last-resort product name
+  * @param {Object} [options] - Extra hints from the product splitter
+  * @param {string} [options.productCode] - Product code (e.g. GS, GC, FA); copied to `config.product_code`
+  * @param {string} [options.productName] - Product name found by the splitter
+  * @returns {{config: Object, unmatched: Array, warnings: Array, matchDetails: Array}}
+  *   Extracted values, required fields that produced no value, warnings, and
+  *   one match record per field with `product_name` first
+  */
+ export function smartExtractFieldsForProduct(content, fileName, options = {}) {
+   const { productCode, productName } = options
+   const config = {}
+   const unmatched = []
+   const warnings = []
+   const matchDetails = []
+ 
+   // Walk the rules from lowest to highest `priority` value
+   const rulesByPriority = Object.entries(FIELD_RULES)
+     .sort(([, left], [, right]) => left.priority - right.priority)
+ 
+   for (const [fieldName, rule] of rulesByPriority) {
+     // product_name is resolved separately once the loop is done
+     if (fieldName === 'product_name') continue
+ 
+     const { matched, pattern, value } = extractField(content, fieldName)
+     matchDetails.push({ field: fieldName, matched, pattern, value })
+ 
+     if (value === null) {
+       // No value at all: only required fields are reported
+       if (rule.required) {
+         unmatched.push({
+           field: fieldName,
+           reason: '未找到匹配内容',
+           suggestions: generateSuggestions(fieldName, content)
+         })
+       }
+       continue
+     }
+ 
+     config[fieldName] = value
+ 
+     // A value without a match is treated as a default; flag it when required
+     if (!matched && rule.required) {
+       warnings.push({
+         field: fieldName,
+         message: `未找到字段 "${fieldName}"，使用默认值: ${JSON.stringify(rule.fallback)}`,
+         severity: 'warning'
+       })
+     }
+   }
+ 
+   // ---- product name: splitter hint > extraction from content > file name ----
+   let nameDetail
+   if (productName) {
+     nameDetail = {
+       field: 'product_name',
+       matched: true,
+       pattern: 'product_splitter',
+       value: productName
+     }
+   } else {
+     const extracted = extractField(content, 'product_name')
+     if (extracted.matched && extracted.value) {
+       nameDetail = {
+         field: 'product_name',
+         matched: true,
+         pattern: extracted.pattern,
+         value: extracted.value
+       }
+     } else {
+       // Last resort: the file name with its extension removed
+       const baseName = fileName.replace(/\.[^/.]+$/, '')
+       warnings.push({
+         field: 'product_name',
+         message: `未找到产品名称，使用文件名: "${baseName}"`,
+         severity: 'info'
+       })
+       nameDetail = {
+         field: 'product_name',
+         matched: false,
+         pattern: 'filename_fallback',
+         value: baseName
+       }
+     }
+   }
+   config.product_name = nameDetail.value
+   matchDetails.unshift(nameDetail)
+ 
+   // Carry the product code through when the splitter supplied one
+   if (productCode) {
+     config.product_code = productCode
+   }
+ 
+   // Withdrawal settings are kept only for the 'savings' product type
+   if (config.product_type !== 'savings') {
+     delete config.withdrawal_modes
+     delete config.withdrawal_periods
+   }
+ 
+   return { config, unmatched, warnings, matchDetails }
+ }
+ 
+ /**
+  * Extract all fields from a document (original entry point, kept for
+  * existing callers).
+  *
+  * Delegates to `smartExtractFieldsForProduct` with an empty options object,
+  * so no product code or splitter-provided name is applied.
+  *
+  * @param {string} content - Document text
+  * @param {string} fileName - File name, used as the fallback product name
+  * @returns {{config: Object, unmatched: Array, warnings: Array, matchDetails: Array}} Extraction result
+  */
+ export function smartExtractFields(content, fileName) {
+   const noProductHints = {}
+   return smartExtractFieldsForProduct(content, fileName, noProductHints)
+ }
+ 
 export { FIELD_RULES }