feat(parse): 支持多产品文档解析
- 新增 product-splitter.js 产品边界检测模块
  - 支持产品代码前缀识别(GS、GC、FA、LV2 等)
  - 支持产品命名模式(以"計劃"、"保障"、"保险"、"壽險"结尾)
  - 自动检测和分割多产品文档
- 增强 parse-docs.js 多产品处理
  - parseSingleFile() 返回数组支持多产品
  - generateAuditFile() 支持产品索引参数
  - 单文件模式 (--file=) 正确处理多产品结果
  - buildParseSummary() 统计多产品数量
- 优化 smart-field-extractor.js
  - 新增 smartExtractFieldsForProduct() 单产品提取
  - 移除重复的函数定义
  - 包装函数兼容新旧调用方式

测试结果:
- 成功解析 计划书模版2.docx 中的 4 个保险产品
- 每个产品生成独立的审核文件

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Showing
3 changed files
with
372 additions
and
74 deletions
This diff is collapsed. Click to expand it.
scripts/product-splitter.js
0 → 100644
| 1 | +/** | ||
| 2 | + * 产品分割器 | ||
| 3 | + * | ||
| 4 | + * @description 从包含多个保险产品的文档中识别并分割出各个产品 | ||
| 5 | + * @module scripts/product-splitter | ||
| 6 | + * @author Claude Code | ||
| 7 | + * @created 2026-02-15 | ||
| 8 | + */ | ||
| 9 | + | ||
/**
 * Regular expressions that recognise a product title line.
 *
 * Sample titles the patterns were written against:
 *   - GS宏摯傳承保障計劃 - 性別, 年齡, 出生年月日
 *   - GC宏摯家傳承保險計劃- 性別, 年齡, 出生年月日
 *   - FA 宏浚傳承保障計劃
 *   - LV2 赤霞珠終身壽險計劃2基本人壽保障選項
 *
 * Every pattern is anchored to the start of a line (`m` flag) and carries
 * the `g` flag, so `exec`/`test` on these shared objects advance `lastIndex`.
 */
const PRODUCT_TITLE_PATTERNS = [
  // Product code, then a name containing a product keyword, then any
  // trailing text on the same line.
  // e.g. "GS宏摯傳承保障計劃 - 性別, 年齡, 出生年月日"
  /^([A-Z]{2,4}\d?)\s*([^\n\-]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm,

  // Product code, mandatory whitespace, then a name ending at the keyword.
  // e.g. "FA 宏浚傳承保障計劃"
  /^([A-Z]{2,4}\d?)\s+([^\n]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险))/gm,

  // A bare product name with no code in front of it.
  // e.g. "宏摯傳承保障計劃"
  /^([^\n]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm,

  // A line that opens with a product code followed by "-" or ":".
  /^([A-Z]{2,4}\d?)\s*[-:]\s*([^\n]+)/gm
]
| 36 | + | ||
/**
 * Known product code prefixes, tried before the generic title patterns.
 *
 * 'LV2' is listed ahead of 'LV' so the more specific code comes first.
 */
const PRODUCT_CODE_PREFIXES = [
  'GS',
  'GC',
  'FA',
  'LV2',
  'LV',
  'CR',
  'HR',
  'PR',
  'SR',
  'TR',
  'UR',
  'WR',
  'XR',
  'YR',
  'ZR'
]
| 44 | + | ||
/**
 * Count the product titles found in a document.
 *
 * @param {string} content - Document text.
 * @returns {number} Number of entries returned by `findProductTitles`.
 */
export function detectProductCount(content) {
  return findProductTitles(content).length
}
| 55 | + | ||
/**
 * Locate every product title line in a document.
 *
 * Two strategies are used:
 *   1. Lines that start with one of `PRODUCT_CODE_PREFIXES` (optionally
 *      followed by one digit) and a Chinese name containing a product
 *      keyword. Only the first occurrence of each code is kept.
 *   2. If strategy 1 finds nothing, any line containing a product keyword,
 *      with an optional leading 2-4 letter code. Matches whose trimmed text
 *      is shorter than 5 characters are ignored.
 *
 * @param {string} content - Document text.
 * @returns {Array<{index: number, code: (string|null), name: string, fullTitle: string}>}
 *   Titles ordered by position; `index` is the offset of the title's first
 *   non-whitespace character.
 */
export function findProductTitles(content) {
  // Same string coercion RegExp.prototype.exec would apply to its argument.
  const text = String(content)
  const products = []

  // Strategy 1: one pass over the text with every known prefix in a single
  // alternation. Longest prefixes go first so "LV2" is tried before "LV";
  // prefixes are escaped so a future entry with regex metacharacters is safe.
  const prefixAlternatives = [...PRODUCT_CODE_PREFIXES]
    .sort((a, b) => b.length - a.length)
    .map((prefix) => prefix.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))

  // An empty alternation would match everywhere, so skip straight to the
  // fallback when no prefixes are configured.
  if (prefixAlternatives.length > 0) {
    const codedTitle = new RegExp(
      `^((?:${prefixAlternatives.join('|')})\\d?)\\s*([\\u4e00-\\u9fa5]+(?:計劃|计划|保障|保险|壽險|壽险)[^\\n]*)`,
      'gm'
    )
    const seenCodes = new Set()

    for (const match of text.matchAll(codedTitle)) {
      const code = match[1]

      // Keep only the first title for each product code.
      if (seenCodes.has(code)) continue
      seenCodes.add(code)

      products.push({
        index: match.index,
        code,
        name: match[2].trim(),
        fullTitle: match[0].trim()
      })
    }
  }

  // Strategy 2: generic fallback when no known code prefix was found.
  if (products.length === 0) {
    const genericTitle = /^([A-Z]{2,4}\d?)?\s*([^\n]*?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm

    for (const match of text.matchAll(genericTitle)) {
      const fullTitle = match[0].trim()
      if (fullTitle.length < 5) continue // too short to be a real title

      // `\s*` may have swallowed blank lines or indentation in front of the
      // title; report where the title text itself begins.
      const leadingWhitespace = match[0].length - match[0].trimStart().length

      products.push({
        index: match.index + leadingWhitespace,
        code: match[1] || null,
        name: match[2].trim(),
        fullTitle
      })
    }
  }

  // Present titles in document order.
  products.sort((a, b) => a.index - b.index)

  return products
}
| 116 | + | ||
/**
 * Cut a document into one chunk per detected product.
 *
 * With zero or one detected title the whole document is returned as a single
 * entry (fields are `null` when no title was found). With several titles,
 * each chunk runs from its title to the next title (or to the end of the
 * document) and is trimmed.
 *
 * @param {string} content - Document text.
 * @returns {Array<{code: (string|null), name: (string|null), content: string, fullTitle: (string|null)}>}
 *   One entry per product.
 */
export function splitByProducts(content) {
  const titles = findProductTitles(content)

  // Zero or one title: nothing to split, hand back the document untouched.
  if (titles.length < 2) {
    const sole = titles[0]
    return [{
      code: sole ? sole.code : null,
      name: sole ? sole.name : null,
      content,
      fullTitle: sole ? sole.fullTitle : null
    }]
  }

  // Several titles: each product owns the text up to the next title.
  // NOTE(review): text in front of the first title is not part of any
  // chunk - confirm that is intended.
  return titles.map((title, position) => {
    const following = titles[position + 1]
    const sliceEnd = following ? following.index : content.length

    return {
      code: title.code,
      name: title.name,
      content: content.slice(title.index, sliceEnd).trim(),
      fullTitle: title.fullTitle
    }
  })
}
| 166 | + | ||
/**
 * Strip a leading product code and any trailing description from a raw
 * title, e.g. "GS宏摯傳承保障計劃 - 性別" -> "宏摯傳承保障計劃".
 *
 * Both ASCII and full-width colons are treated as separators.
 *
 * @param {string} raw - Raw title text.
 * @returns {string|null} Cleaned name, or null when 2 characters or fewer remain.
 */
function cleanProductName(raw) {
  // Drop the product code prefix and an optional separator after it.
  const withoutCode = raw.replace(/^[A-Z]{2,4}\d?\s*[-::]?\s*/, '')

  // Keep only the text in front of the first separator.
  const name = withoutCode.split(/[-—::]/)[0].trim()

  return name.length > 2 ? name : null
}

/**
 * Derive a normalised product name from a title, falling back to the
 * product content.
 *
 * The title is tried first. If it yields no usable name, the content is
 * searched for a "产品名称:" label, then a "计划书名称:" label, then a
 * code-plus-Chinese-name phrase; the first pattern that yields a usable
 * name wins.
 *
 * @param {string} fullTitle - Full product title line (may be empty).
 * @param {string} content - Product content fragment (may be empty or omitted).
 * @returns {string|null} Product name, or null when none could be derived.
 */
export function extractProductName(fullTitle, content) {
  if (!fullTitle && !content) return null

  // Prefer the title when it produces a usable name.
  if (typeof fullTitle === 'string' && fullTitle) {
    const fromTitle = cleanProductName(fullTitle)
    if (fromTitle) return fromTitle
  }

  // Without textual content there is nothing left to search; previously
  // this path threw a TypeError on `content.match`.
  if (typeof content !== 'string' || !content) return null

  const patterns = [
    /产品名称[::]\s*([^\n]+)/,
    /计划书名称[::]\s*([^\n]+)/,
    /([A-Z]{2,4}\d?\s*[\u4e00-\u9fa5]+(?:計劃|计划|保障|保险|壽險|壽险))/
  ]

  for (const pattern of patterns) {
    const match = content.match(pattern)
    if (!match) continue

    const fromContent = cleanProductName(match[1] || match[0])
    if (fromContent) return fromContent
  }

  return null
}
| 213 | + | ||
/**
 * Render a Markdown report summarising how a document was split.
 *
 * The report has a statistics section (document length, product count) and
 * a table with one row per product: 1-based number, code, name (at most 30
 * characters) and content length. Missing codes and names are shown as "-";
 * a missing name falls back to the first 20 characters of `fullTitle`.
 * Pipe characters in cell text are escaped so they cannot break the table.
 *
 * @param {string} content - Original document text.
 * @param {Array<{code: ?string, name: ?string, content: ?string, fullTitle: ?string}>} products
 *   Products as returned by the splitter.
 * @returns {string} Markdown report.
 */
export function generateSplitReport(content, products) {
  // Tolerate nullish inputs instead of throwing on `.length`.
  const documentLength = (content ?? '').length
  const entries = products ?? []

  // A raw "|" inside a cell would start a new table column.
  const escapeCell = (text) => String(text).replace(/\|/g, '\\|')

  const rows = entries.map((product, position) => {
    const code = product.code || '-'
    const name = product.name || product.fullTitle?.slice(0, 20) || '-'
    const length = product.content?.length ?? 0
    return `| ${position + 1} | ${escapeCell(code)} | ${escapeCell(name.slice(0, 30))} | ${length} 字符 |\n`
  })

  return [
    `## 📊 产品分割报告\n\n`,
    `### 分割统计\n\n`,
    `- 文档总长度: ${documentLength} 字符\n`,
    `- 识别产品数: ${entries.length} 个\n\n`,
    `### 产品列表\n\n`,
    `| 序号 | 产品代码 | 产品名称 | 内容长度 |\n`,
    `|------|---------|---------|----------|\n`,
    ...rows
  ].join('')
}
| 241 | + | ||
| 242 | +export { | ||
| 243 | + PRODUCT_TITLE_PATTERNS, | ||
| 244 | + PRODUCT_CODE_PREFIXES | ||
| 245 | +} |
| ... | @@ -463,80 +463,6 @@ function smartExtractList(content, startPattern, endKeywords, itemFilter) { | ... | @@ -463,80 +463,6 @@ function smartExtractList(content, startPattern, endKeywords, itemFilter) { |
| 463 | } | 463 | } |
| 464 | 464 | ||
| 465 | /** | 465 | /** |
| 466 | - * 智能提取所有字段 | ||
| 467 | - * | ||
| 468 | - * @param {string} content - 文档内容 | ||
| 469 | - * @param {string} fileName - 文件名(用于推断产品名称) | ||
| 470 | - * @returns {{config: Object, unmatched: Array, warnings: Array}} 提取结果 | ||
| 471 | - */ | ||
| 472 | -export function smartExtractFields(content, fileName) { | ||
| 473 | - const config = {} | ||
| 474 | - const unmatched = [] | ||
| 475 | - const warnings = [] | ||
| 476 | - const matchDetails = [] | ||
| 477 | - | ||
| 478 | - // 按优先级提取字段 | ||
| 479 | - const sortedFields = Object.entries(FIELD_RULES).sort((a, b) => a[1].priority - b[1].priority) | ||
| 480 | - | ||
| 481 | - for (const [fieldName, rule] of sortedFields) { | ||
| 482 | - const result = extractField(content, fieldName) | ||
| 483 | - | ||
| 484 | - // 记录匹配详情 | ||
| 485 | - matchDetails.push({ | ||
| 486 | - field: fieldName, | ||
| 487 | - matched: result.matched, | ||
| 488 | - pattern: result.pattern, | ||
| 489 | - value: result.value | ||
| 490 | - }) | ||
| 491 | - | ||
| 492 | - // 如果匹配成功或字段有默认值 | ||
| 493 | - if (result.value !== null) { | ||
| 494 | - config[fieldName] = result.value | ||
| 495 | - | ||
| 496 | - // 如果使用了默认值,记录警告 | ||
| 497 | - if (!result.matched && rule.required) { | ||
| 498 | - warnings.push({ | ||
| 499 | - field: fieldName, | ||
| 500 | - message: `未找到字段 "${fieldName}",使用默认值: ${JSON.stringify(rule.fallback)}`, | ||
| 501 | - severity: 'warning' | ||
| 502 | - }) | ||
| 503 | - } | ||
| 504 | - } else if (rule.required) { | ||
| 505 | - // 必填字段未匹配 | ||
| 506 | - unmatched.push({ | ||
| 507 | - field: fieldName, | ||
| 508 | - reason: '未找到匹配内容', | ||
| 509 | - suggestions: generateSuggestions(fieldName, content) | ||
| 510 | - }) | ||
| 511 | - } | ||
| 512 | - } | ||
| 513 | - | ||
| 514 | - // 产品名称特殊处理:如果未匹配,使用文件名 | ||
| 515 | - if (!config.product_name) { | ||
| 516 | - const baseName = fileName.replace(/\.[^/.]+$/, '') | ||
| 517 | - config.product_name = baseName | ||
| 518 | - warnings.push({ | ||
| 519 | - field: 'product_name', | ||
| 520 | - message: `未找到产品名称,使用文件名: "${baseName}"`, | ||
| 521 | - severity: 'info' | ||
| 522 | - }) | ||
| 523 | - } | ||
| 524 | - | ||
| 525 | - // 根据产品类型过滤字段 | ||
| 526 | - if (config.product_type !== 'savings') { | ||
| 527 | - delete config.withdrawal_modes | ||
| 528 | - delete config.withdrawal_periods | ||
| 529 | - } | ||
| 530 | - | ||
| 531 | - return { | ||
| 532 | - config, | ||
| 533 | - unmatched, | ||
| 534 | - warnings, | ||
| 535 | - matchDetails | ||
| 536 | - } | ||
| 537 | -} | ||
| 538 | - | ||
| 539 | -/** | ||
| 540 | * 生成字段建议值 | 466 | * 生成字段建议值 |
| 541 | * | 467 | * |
| 542 | * @param {string} fieldName - 字段名称 | 468 | * @param {string} fieldName - 字段名称 |
| ... | @@ -614,4 +540,131 @@ export function generateAuditReport(result) { | ... | @@ -614,4 +540,131 @@ export function generateAuditReport(result) { |
| 614 | return report | 540 | return report |
| 615 | } | 541 | } |
| 616 | 542 | ||
/**
 * Decide which product name to use and describe where it came from.
 *
 * Precedence: name supplied by the caller > name extracted from the content
 * > file name with its extension removed.
 *
 * @param {string} content - Product content fragment.
 * @param {string} fileName - Source file name (used only as a last resort).
 * @param {string} [productName] - Name supplied by the caller, if any.
 * @returns {{detail: Object, warning: (Object|null)}} Match detail for the
 *   `product_name` field, plus a warning when the file name had to be used.
 */
function resolveProductNameForExtraction(content, fileName, productName) {
  if (productName) {
    return {
      detail: {
        field: 'product_name',
        matched: true,
        pattern: 'product_splitter',
        value: productName
      },
      warning: null
    }
  }

  const nameResult = extractField(content, 'product_name')
  if (nameResult.matched && nameResult.value) {
    return {
      detail: {
        field: 'product_name',
        matched: true,
        pattern: nameResult.pattern,
        value: nameResult.value
      },
      warning: null
    }
  }

  // Last resort: the file name without its extension. A missing file name
  // yields an empty string rather than a TypeError.
  const baseName = String(fileName ?? '').replace(/\.[^/.]+$/, '')
  return {
    detail: {
      field: 'product_name',
      matched: false,
      pattern: 'filename_fallback',
      value: baseName
    },
    warning: {
      field: 'product_name',
      message: `未找到产品名称,使用文件名: "${baseName}"`,
      severity: 'info'
    }
  }
}

/**
 * Extract every configured field from one product's content.
 *
 * Fields from `FIELD_RULES` are processed in ascending `priority` order.
 * `product_name` is resolved separately (caller-supplied name > extracted
 * name > file name) and its match detail is placed first in `matchDetails`.
 * `withdrawal_modes` and `withdrawal_periods` are removed unless
 * `product_type` is 'savings'.
 *
 * @param {string} content - Product content fragment.
 * @param {string} fileName - Source file name.
 * @param {Object} [options] - Extra options; `null` is treated as `{}`.
 * @param {string} [options.productCode] - Product code (e.g. GS, GC, FA);
 *   stored as `config.product_code` when present.
 * @param {string} [options.productName] - Product name from the splitter.
 * @returns {{config: Object, unmatched: Array, warnings: Array, matchDetails: Array}}
 *   Extraction result.
 */
export function smartExtractFieldsForProduct(content, fileName, options = {}) {
  // The default only covers `undefined`; guard against an explicit null too.
  const { productCode, productName } = options ?? {}
  const config = {}
  const unmatched = []
  const warnings = []
  const matchDetails = []

  // Process fields in priority order.
  const sortedFields = Object.entries(FIELD_RULES).sort((a, b) => a[1].priority - b[1].priority)

  for (const [fieldName, rule] of sortedFields) {
    // product_name is resolved separately below.
    if (fieldName === 'product_name') continue

    const result = extractField(content, fieldName)

    matchDetails.push({
      field: fieldName,
      matched: result.matched,
      pattern: result.pattern,
      value: result.value
    })

    if (result.value !== null) {
      // Either a real match or a default value.
      config[fieldName] = result.value

      // A required field that was not matched but still has a value:
      // tell the reviewer a default was used.
      if (!result.matched && rule.required) {
        warnings.push({
          field: fieldName,
          message: `未找到字段 "${fieldName}",使用默认值: ${JSON.stringify(rule.fallback)}`,
          severity: 'warning'
        })
      }
    } else if (rule.required) {
      // Required field with no value at all.
      unmatched.push({
        field: fieldName,
        reason: '未找到匹配内容',
        suggestions: generateSuggestions(fieldName, content)
      })
    }
  }

  // Product name: caller-supplied > extracted from content > file name.
  const resolvedName = resolveProductNameForExtraction(content, fileName, productName)
  config.product_name = resolvedName.detail.value
  if (resolvedName.warning) warnings.push(resolvedName.warning)
  matchDetails.unshift(resolvedName.detail)

  if (productCode) {
    config.product_code = productCode
  }

  // Withdrawal settings only apply to savings products.
  if (config.product_type !== 'savings') {
    delete config.withdrawal_modes
    delete config.withdrawal_periods
  }

  return {
    config,
    unmatched,
    warnings,
    matchDetails
  }
}
| 658 | + | ||
/**
 * Extract every configured field from a document (original entry point,
 * kept for existing callers).
 *
 * Delegates to `smartExtractFieldsForProduct` with no product options.
 *
 * @param {string} content - Document text.
 * @param {string} fileName - File name, used to infer the product name.
 * @returns {{config: Object, unmatched: Array, warnings: Array, matchDetails: Array}}
 *   Whatever `smartExtractFieldsForProduct` returns.
 */
export function smartExtractFields(content, fileName) {
  const noProductOptions = {}
  return smartExtractFieldsForProduct(content, fileName, noProductOptions)
}
| 669 | + | ||
| 617 | export { FIELD_RULES } | 670 | export { FIELD_RULES } | ... | ... |
-
Please register or login to post a comment