feat(parse): 支持多产品文档解析
- 新增 product-splitter.js 产品边界检测模块
  - 支持产品代码前缀识别(GS、GC、FA、LV2 等)
  - 支持产品命名模式(以"計劃"、"保障"、"保险"、"壽險"结尾)
  - 自动检测和分割多产品文档
- 增强 parse-docs.js 多产品处理
  - parseSingleFile() 返回数组支持多产品
  - generateAuditFile() 支持产品索引参数
  - 单文件模式 (--file=) 正确处理多产品结果
  - buildParseSummary() 统计多产品数量
- 优化 smart-field-extractor.js
  - 新增 smartExtractFieldsForProduct() 单产品提取
  - 移除重复的函数定义
  - 包装函数兼容新旧调用方式

测试结果:
- 成功解析 计划书模版2.docx 中的 4 个保险产品
- 每个产品生成独立的审核文件

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Showing
3 changed files
with
642 additions
and
112 deletions
| ... | @@ -30,7 +30,8 @@ import { | ... | @@ -30,7 +30,8 @@ import { |
| 30 | MARKITDOWN_CONFIG, | 30 | MARKITDOWN_CONFIG, |
| 31 | AI_SERVICE_CONFIG | 31 | AI_SERVICE_CONFIG |
| 32 | } from './parse-config.js' | 32 | } from './parse-config.js' |
| 33 | -import { smartExtractFields, generateAuditReport } from './smart-field-extractor.js' | 33 | +import { smartExtractFields, smartExtractFieldsForProduct, generateAuditReport } from './smart-field-extractor.js' |
| 34 | +import { splitByProducts, findProductTitles, generateSplitReport } from './product-splitter.js' | ||
| 34 | 35 | ||
| 35 | // ========== 配置区 ========== | 36 | // ========== 配置区 ========== |
| 36 | 37 | ||
| ... | @@ -470,7 +471,7 @@ const AI_PARSE_PROMPT = `你是一个保险产品配置专家。请从以下文 | ... | @@ -470,7 +471,7 @@ const AI_PARSE_PROMPT = `你是一个保险产品配置专家。请从以下文 |
| 470 | * | 471 | * |
| 471 | * @description 使用 markitdown + AI 智能解析文档并提取配置 | 472 | * @description 使用 markitdown + AI 智能解析文档并提取配置 |
| 472 | * @param {string} docPath - 文档路径 | 473 | * @param {string} docPath - 文档路径 |
| 473 | - * @returns {Promise<Object>} 解析后的配置对象 | 474 | + * @returns {Promise<Object|Array<Object>>} 解析后的配置对象或配置数组(多产品) |
| 474 | */ | 475 | */ |
| 475 | async function parseDocumentWithAI(docPath) { | 476 | async function parseDocumentWithAI(docPath) { |
| 476 | console.log(`\n🤖 正在智能解析: ${path.basename(docPath)}`) | 477 | console.log(`\n🤖 正在智能解析: ${path.basename(docPath)}`) |
| ... | @@ -493,7 +494,78 @@ async function parseDocumentWithAI(docPath) { | ... | @@ -493,7 +494,78 @@ async function parseDocumentWithAI(docPath) { |
| 493 | const content = parse_result.text | 494 | const content = parse_result.text |
| 494 | const fileName = path.basename(docPath) | 495 | const fileName = path.basename(docPath) |
| 495 | 496 | ||
| 496 | - // 步骤 2: 使用智能字段提取器 | 497 | + // ========== 步骤 2: 检测并分割多产品 ========== |
| 498 | + const productTitles = findProductTitles(content) | ||
| 499 | + | ||
| 500 | + if (productTitles.length > 1) { | ||
| 501 | + // 多产品文档 | ||
| 502 | + console.log(`\n📦 检测到 ${productTitles.length} 个产品:`) | ||
| 503 | + productTitles.forEach((p, i) => { | ||
| 504 | + console.log(` ${i + 1}. [${p.code || '?'}] ${p.name || p.fullTitle?.slice(0, 30)}`) | ||
| 505 | + }) | ||
| 506 | + | ||
| 507 | + // 分割文档 | ||
| 508 | + const products = splitByProducts(content) | ||
| 509 | + const splitReport = generateSplitReport(content, products) | ||
| 510 | + console.log('\n' + splitReport) | ||
| 511 | + | ||
| 512 | + // 对每个产品分别提取字段 | ||
| 513 | + const configs = [] | ||
| 514 | + for (let i = 0; i < products.length; i++) { | ||
| 515 | + const product = products[i] | ||
| 516 | + console.log(`\n${'='.repeat(40)}`) | ||
| 517 | + console.log(`📋 处理产品 ${i + 1}/${products.length}: ${product.name || product.code || '未命名'}`) | ||
| 518 | + console.log('='.repeat(40)) | ||
| 519 | + | ||
| 520 | + const extractResult = smartExtractFieldsForProduct( | ||
| 521 | + product.content, | ||
| 522 | + fileName, | ||
| 523 | + { | ||
| 524 | + productCode: product.code, | ||
| 525 | + productName: product.name | ||
| 526 | + } | ||
| 527 | + ) | ||
| 528 | + | ||
| 529 | + // 生成审核报告 | ||
| 530 | + const auditReport = generateAuditReport(extractResult) | ||
| 531 | + console.log('\n' + auditReport) | ||
| 532 | + | ||
| 533 | + // 构建配置对象 | ||
| 534 | + const config = { | ||
| 535 | + ...extractResult.config, | ||
| 536 | + is_savings: extractResult.config.product_type === 'savings', | ||
| 537 | + form_schema: { base_fields: [], withdrawal_fields: [], reset_map: {} }, | ||
| 538 | + submit_mapping: {} | ||
| 539 | + } | ||
| 540 | + | ||
| 541 | + // 保存匹配详情 | ||
| 542 | + config._extractDetails = { | ||
| 543 | + matched: extractResult.matchDetails.filter(m => m.matched).map(m => m.field), | ||
| 544 | + unmatched: extractResult.unmatched, | ||
| 545 | + warnings: extractResult.warnings, | ||
| 546 | + productIndex: i, | ||
| 547 | + totalProducts: products.length | ||
| 548 | + } | ||
| 549 | + | ||
| 550 | + config.form_sn = generateFormSn(config) | ||
| 551 | + | ||
| 552 | + const matchedCount = extractResult.matchDetails.filter(m => m.matched).length | ||
| 553 | + const totalCount = extractResult.matchDetails.length | ||
| 554 | + | ||
| 555 | + console.log(`\n✅ 产品 ${i + 1} 解析成功 (智能匹配 ${matchedCount}/${totalCount} 字段)`) | ||
| 556 | + console.log(` 产品名称: ${config.product_name}`) | ||
| 557 | + console.log(` 产品代码: ${product.code || '-'}`) | ||
| 558 | + console.log(` 产品类型: ${config.product_type}`) | ||
| 559 | + console.log(` 币种: ${config.currency}`) | ||
| 560 | + console.log(` 缴费年期: ${JSON.stringify(config.payment_periods)}`) | ||
| 561 | + | ||
| 562 | + configs.push(config) | ||
| 563 | + } | ||
| 564 | + | ||
| 565 | + return configs // 返回数组 | ||
| 566 | + } | ||
| 567 | + | ||
| 568 | + // ========== 单产品文档 ========== | ||
| 497 | console.log('🧠 使用智能字段提取器...') | 569 | console.log('🧠 使用智能字段提取器...') |
| 498 | const extractResult = smartExtractFields(content, fileName) | 570 | const extractResult = smartExtractFields(content, fileName) |
| 499 | 571 | ||
| ... | @@ -598,6 +670,10 @@ function inferCurrency(content) { | ... | @@ -598,6 +670,10 @@ function inferCurrency(content) { |
| 598 | 670 | ||
| 599 | /** | 671 | /** |
| 600 | * 解析单个文档 | 672 | * 解析单个文档 |
| 673 | + * | ||
| 674 | + * @description 支持单产品和多产品文档解析 | ||
| 675 | + * - 单产品文档:返回单个结果对象 | ||
| 676 | + * - 多产品文档:返回结果数组(每个产品一个结果) | ||
| 601 | */ | 677 | */ |
| 602 | async function parseSingleFile(filePath) { | 678 | async function parseSingleFile(filePath) { |
| 603 | const fileName = path.basename(filePath) | 679 | const fileName = path.basename(filePath) |
| ... | @@ -605,21 +681,46 @@ async function parseSingleFile(filePath) { | ... | @@ -605,21 +681,46 @@ async function parseSingleFile(filePath) { |
| 605 | console.log("📄 处理文件: " + fileName) | 681 | console.log("📄 处理文件: " + fileName) |
| 606 | console.log("=".repeat(60)) | 682 | console.log("=".repeat(60)) |
| 607 | 683 | ||
| 608 | - // 解析文档 | 684 | + // 解析文档(可能返回单个 config 或 configs 数组) |
| 609 | - const config = await parseDocumentWithAI(filePath) | 685 | + const parseResult = await parseDocumentWithAI(filePath) |
| 610 | 686 | ||
| 611 | - if (!config) { | 687 | + if (!parseResult) { |
| 612 | console.log("⏭️ 跳过文件: " + fileName + " (解析失败)") | 688 | console.log("⏭️ 跳过文件: " + fileName + " (解析失败)") |
| 613 | return { success: false, file: fileName, reason: 'parse_failed' } | 689 | return { success: false, file: fileName, reason: 'parse_failed' } |
| 614 | } | 690 | } |
| 615 | 691 | ||
| 692 | + // 统一处理为数组形式 | ||
| 693 | + const configs = Array.isArray(parseResult) ? parseResult : [parseResult] | ||
| 694 | + | ||
| 695 | + // 多产品提示 | ||
| 696 | + if (configs.length > 1) { | ||
| 697 | + console.log("\n📦 检测到多产品文档,共 " + configs.length + " 个产品") | ||
| 698 | + } | ||
| 699 | + | ||
| 700 | + // 处理每个产品配置 | ||
| 701 | + const results = [] | ||
| 702 | + for (let i = 0; i < configs.length; i++) { | ||
| 703 | + const config = configs[i] | ||
| 704 | + const productIndex = configs.length > 1 ? ` [${i + 1}/${configs.length}]` : '' | ||
| 705 | + | ||
| 706 | + if (configs.length > 1 && config.product_name) { | ||
| 707 | + console.log("\n--- 处理产品: " + config.product_name + " ---") | ||
| 708 | + } | ||
| 709 | + | ||
| 616 | const validation = validateParsedConfig(config) | 710 | const validation = validateParsedConfig(config) |
| 617 | if (!validation.valid) { | 711 | if (!validation.valid) { |
| 618 | - console.error("❌ 校验失败: " + fileName) | 712 | + console.error("❌ 校验失败" + productIndex + ": " + (config.product_name || fileName)) |
| 619 | validation.errors.forEach(message => { | 713 | validation.errors.forEach(message => { |
| 620 | console.error(" - " + message) | 714 | console.error(" - " + message) |
| 621 | }) | 715 | }) |
| 622 | - return { success: false, file: fileName, reason: 'validation_failed', errors: validation.errors } | 716 | + results.push({ |
| 717 | + success: false, | ||
| 718 | + file: fileName, | ||
| 719 | + productName: config.product_name || `产品${i + 1}`, | ||
| 720 | + reason: 'validation_failed', | ||
| 721 | + errors: validation.errors | ||
| 722 | + }) | ||
| 723 | + continue | ||
| 623 | } | 724 | } |
| 624 | 725 | ||
| 625 | // 添加源文件信息 | 726 | // 添加源文件信息 |
| ... | @@ -628,18 +729,42 @@ async function parseSingleFile(filePath) { | ... | @@ -628,18 +729,42 @@ async function parseSingleFile(filePath) { |
| 628 | // 生成配置代码 | 729 | // 生成配置代码 |
| 629 | const { formSn, code } = generateConfigCode(config) | 730 | const { formSn, code } = generateConfigCode(config) |
| 630 | 731 | ||
| 631 | - console.log("\n📝 生成 form_sn: " + formSn) | 732 | + console.log("\n📝 生成 form_sn: " + formSn + productIndex) |
| 632 | console.log("📋 生成配置代码:\n" + code) | 733 | console.log("📋 生成配置代码:\n" + code) |
| 633 | 734 | ||
| 634 | - // ✨ 新增:生成待审核文件(不直接写入正式配置) | 735 | + // 生成待审核文件 |
| 635 | - const auditFile = await generateAuditFile(fileName, config, code) | 736 | + const auditFile = await generateAuditFile(fileName, config, code, i, configs.length) |
| 636 | if (auditFile) { | 737 | if (auditFile) { |
| 637 | console.log("\n✅ 已生成待审核文件: " + auditFile) | 738 | console.log("\n✅ 已生成待审核文件: " + auditFile) |
| 638 | console.log("📋 请审核后手动移动到 src/config/plan-templates.js") | 739 | console.log("📋 请审核后手动移动到 src/config/plan-templates.js") |
| 639 | - return { success: true, formSn, code, file: fileName, config, auditFile } | ||
| 640 | } | 740 | } |
| 641 | 741 | ||
| 642 | - return { success: true, formSn, code, file: fileName, config, auditFile } | 742 | + results.push({ |
| 743 | + success: true, | ||
| 744 | + formSn, | ||
| 745 | + code, | ||
| 746 | + file: fileName, | ||
| 747 | + productName: config.product_name || `产品${i + 1}`, | ||
| 748 | + config, | ||
| 749 | + auditFile | ||
| 750 | + }) | ||
| 751 | + } | ||
| 752 | + | ||
| 753 | + // 单产品时返回单个结果对象(保持向后兼容) | ||
| 754 | + // 多产品时返回数组 | ||
| 755 | + if (configs.length === 1) { | ||
| 756 | + return results[0] | ||
| 757 | + } | ||
| 758 | + | ||
| 759 | + // 多产品返回特殊结构 | ||
| 760 | + return { | ||
| 761 | + success: results.some(r => r.success), | ||
| 762 | + file: fileName, | ||
| 763 | + multiProduct: true, | ||
| 764 | + productCount: configs.length, | ||
| 765 | + successCount: results.filter(r => r.success).length, | ||
| 766 | + results // 每个产品的详细结果 | ||
| 767 | + } | ||
| 643 | } | 768 | } |
| 644 | 769 | ||
| 645 | /** | 770 | /** |
| ... | @@ -649,16 +774,31 @@ async function parseSingleFile(filePath) { | ... | @@ -649,16 +774,31 @@ async function parseSingleFile(filePath) { |
| 649 | * @param {string} fileName - 原始文件名 | 774 | * @param {string} fileName - 原始文件名 |
| 650 | * @param {Object} config - 解析的配置对象 | 775 | * @param {Object} config - 解析的配置对象 |
| 651 | * @param {string} code - 生成的配置代码 | 776 | * @param {string} code - 生成的配置代码 |
| 777 | + * @param {number} productIndex - 产品索引(多产品文档时使用,从 0 开始) | ||
| 778 | + * @param {number} totalProducts - 产品总数(多产品文档时使用) | ||
| 652 | * @returns {Promise<string|null>} 审核文件路径 | 779 | * @returns {Promise<string|null>} 审核文件路径 |
| 653 | */ | 780 | */ |
| 654 | -async function generateAuditFile(fileName, config, code) { | 781 | +async function generateAuditFile(fileName, config, code, productIndex = 0, totalProducts = 1) { |
| 655 | const AUDIT_PENDING_DIR = path.resolve(process.cwd(), 'docs/parse-audit/pending') | 782 | const AUDIT_PENDING_DIR = path.resolve(process.cwd(), 'docs/parse-audit/pending') |
| 656 | const AUDIT_APPROVED_DIR = path.resolve(process.cwd(), 'docs/parse-audit/approved') | 783 | const AUDIT_APPROVED_DIR = path.resolve(process.cwd(), 'docs/parse-audit/approved') |
| 657 | ensureDir(AUDIT_PENDING_DIR) | 784 | ensureDir(AUDIT_PENDING_DIR) |
| 658 | ensureDir(AUDIT_APPROVED_DIR) | 785 | ensureDir(AUDIT_APPROVED_DIR) |
| 659 | 786 | ||
| 660 | const date = new Date().toISOString().split('T')[0] | 787 | const date = new Date().toISOString().split('T')[0] |
| 661 | - const auditFileName = `${date}-${fileName.replace(/\.[^/.]+$/, '')}.md` | 788 | + const baseFileName = fileName.replace(/\.[^/.]+$/, '') |
| 789 | + | ||
| 790 | + // 多产品文档时,为每个产品生成独立文件 | ||
| 791 | + let auditFileName | ||
| 792 | + if (totalProducts > 1 && config.product_name) { | ||
| 793 | + // 使用产品名称作为文件名的一部分 | ||
| 794 | + const productSlug = config.product_name | ||
| 795 | + .replace(/[^a-zA-Z0-9\u4e00-\u9fa5]/g, '-') // 保留中文、英文、数字 | ||
| 796 | + .replace(/-+/g, '-') | ||
| 797 | + .slice(0, 30) // 限制长度 | ||
| 798 | + auditFileName = `${date}-${baseFileName}-${productSlug}.md` | ||
| 799 | + } else { | ||
| 800 | + auditFileName = `${date}-${baseFileName}.md` | ||
| 801 | + } | ||
| 662 | const auditFilePath = path.join(AUDIT_PENDING_DIR, auditFileName) | 802 | const auditFilePath = path.join(AUDIT_PENDING_DIR, auditFileName) |
| 663 | const formSn = generateFormSn(config) | 803 | const formSn = generateFormSn(config) |
| 664 | const formSchemaPreview = config.form_schema ? JSON.stringify(config.form_schema, null, 2) : '// 请手动补充' | 804 | const formSchemaPreview = config.form_schema ? JSON.stringify(config.form_schema, null, 2) : '// 请手动补充' |
| ... | @@ -857,9 +997,16 @@ export function updateConfigContent(existingContent, newConfigs) { | ... | @@ -857,9 +997,16 @@ export function updateConfigContent(existingContent, newConfigs) { |
| 857 | return null | 997 | return null |
| 858 | } | 998 | } |
| 859 | 999 | ||
| 860 | - const insertContent = newConfigs.map((item, index) => { | 1000 | + // 过滤掉没有 code 的配置项 |
| 1001 | + const validConfigs = newConfigs.filter(item => item && item.code) | ||
| 1002 | + if (validConfigs.length === 0) { | ||
| 1003 | + console.warn('⚠️ 没有有效的配置代码可插入') | ||
| 1004 | + return null | ||
| 1005 | + } | ||
| 1006 | + | ||
| 1007 | + const insertContent = validConfigs.map((item, index) => { | ||
| 861 | const code = item.code.trimEnd() | 1008 | const code = item.code.trimEnd() |
| 862 | - return index === newConfigs.length - 1 ? code : code + ',' | 1009 | + return index === validConfigs.length - 1 ? code : code + ',' |
| 863 | }).join('\n\n') | 1010 | }).join('\n\n') |
| 864 | 1011 | ||
| 865 | const before = existingContent.substring(0, range.endIndex) | 1012 | const before = existingContent.substring(0, range.endIndex) |
| ... | @@ -1099,7 +1246,8 @@ export function buildConfigUpdateResult(existingContent, newConfigs, options = { | ... | @@ -1099,7 +1246,8 @@ export function buildConfigUpdateResult(existingContent, newConfigs, options = { |
| 1099 | 1246 | ||
| 1100 | export function buildParseSummary(results, duration_ms) { | 1247 | export function buildParseSummary(results, duration_ms) { |
| 1101 | const summary = { | 1248 | const summary = { |
| 1102 | - total: results.length, | 1249 | + total_docs: results.length, |
| 1250 | + total_products: 0, | ||
| 1103 | success: 0, | 1251 | success: 0, |
| 1104 | failed: 0, | 1252 | failed: 0, |
| 1105 | duration_ms, | 1253 | duration_ms, |
| ... | @@ -1108,6 +1256,34 @@ export function buildParseSummary(results, duration_ms) { | ... | @@ -1108,6 +1256,34 @@ export function buildParseSummary(results, duration_ms) { |
| 1108 | } | 1256 | } |
| 1109 | 1257 | ||
| 1110 | results.forEach(result => { | 1258 | results.forEach(result => { |
| 1259 | + // 处理多产品文档 | ||
| 1260 | + if (result.multiProduct) { | ||
| 1261 | + summary.total_products += result.productCount | ||
| 1262 | + | ||
| 1263 | + if (result.results) { | ||
| 1264 | + result.results.forEach(r => { | ||
| 1265 | + if (r.success) { | ||
| 1266 | + summary.success += 1 | ||
| 1267 | + summary.success_list.push({ | ||
| 1268 | + form_sn: r.formSn, | ||
| 1269 | + product_name: r.config?.product_name || r.productName, | ||
| 1270 | + file: r.file | ||
| 1271 | + }) | ||
| 1272 | + } else { | ||
| 1273 | + summary.failed += 1 | ||
| 1274 | + summary.failed_list.push({ | ||
| 1275 | + file: r.file, | ||
| 1276 | + product_name: r.productName, | ||
| 1277 | + reason: r.reason || 'unknown', | ||
| 1278 | + errors: r.errors || [] | ||
| 1279 | + }) | ||
| 1280 | + } | ||
| 1281 | + }) | ||
| 1282 | + } | ||
| 1283 | + } else { | ||
| 1284 | + // 单产品文档 | ||
| 1285 | + summary.total_products += 1 | ||
| 1286 | + | ||
| 1111 | if (result.success) { | 1287 | if (result.success) { |
| 1112 | summary.success += 1 | 1288 | summary.success += 1 |
| 1113 | summary.success_list.push({ | 1289 | summary.success_list.push({ |
| ... | @@ -1123,6 +1299,7 @@ export function buildParseSummary(results, duration_ms) { | ... | @@ -1123,6 +1299,7 @@ export function buildParseSummary(results, duration_ms) { |
| 1123 | errors: result.errors || [] | 1299 | errors: result.errors || [] |
| 1124 | }) | 1300 | }) |
| 1125 | } | 1301 | } |
| 1302 | + } | ||
| 1126 | }) | 1303 | }) |
| 1127 | 1304 | ||
| 1128 | return summary | 1305 | return summary |
| ... | @@ -1246,6 +1423,8 @@ function updateConfigFile(newConfigs, options = {}) { | ... | @@ -1246,6 +1423,8 @@ function updateConfigFile(newConfigs, options = {}) { |
| 1246 | 1423 | ||
| 1247 | /** | 1424 | /** |
| 1248 | * 处理所有文档 | 1425 | * 处理所有文档 |
| 1426 | + * | ||
| 1427 | + * @description 支持单产品和多产品文档的批量处理 | ||
| 1249 | */ | 1428 | */ |
| 1250 | async function parseAllDocs(docs, options = {}) { | 1429 | async function parseAllDocs(docs, options = {}) { |
| 1251 | if (docs.length === 0) { | 1430 | if (docs.length === 0) { |
| ... | @@ -1263,19 +1442,45 @@ async function parseAllDocs(docs, options = {}) { | ... | @@ -1263,19 +1442,45 @@ async function parseAllDocs(docs, options = {}) { |
| 1263 | 1442 | ||
| 1264 | for (const doc of docs) { | 1443 | for (const doc of docs) { |
| 1265 | const result = await parseSingleFile(doc.fullPath) | 1444 | const result = await parseSingleFile(doc.fullPath) |
| 1445 | + | ||
| 1446 | + // 处理多产品返回值 | ||
| 1447 | + if (result.multiProduct) { | ||
| 1448 | + // 多产品文档 | ||
| 1449 | + console.log("\n📦 文档 " + result.file + " 包含 " + result.productCount + " 个产品,成功 " + result.successCount + " 个") | ||
| 1450 | + | ||
| 1451 | + // 添加文档级结果 | ||
| 1266 | results.push(result) | 1452 | results.push(result) |
| 1267 | - if (result.success) { | 1453 | + |
| 1454 | + // 展开每个成功的产品到 successResults | ||
| 1455 | + if (result.results) { | ||
| 1456 | + result.results.forEach(r => { | ||
| 1457 | + if (r.success && r.code) { | ||
| 1458 | + successResults.push(r) | ||
| 1459 | + } | ||
| 1460 | + }) | ||
| 1461 | + } | ||
| 1462 | + } else { | ||
| 1463 | + // 单产品文档 | ||
| 1464 | + results.push(result) | ||
| 1465 | + if (result.success && result.code) { | ||
| 1268 | successResults.push(result) | 1466 | successResults.push(result) |
| 1269 | } | 1467 | } |
| 1270 | } | 1468 | } |
| 1469 | + } | ||
| 1470 | + | ||
| 1471 | + // 计算实际产品数量 | ||
| 1472 | + const totalProducts = results.reduce((sum, r) => { | ||
| 1473 | + return sum + (r.multiProduct ? r.productCount : 1) | ||
| 1474 | + }, 0) | ||
| 1475 | + const successProducts = successResults.length | ||
| 1271 | 1476 | ||
| 1272 | // 汇总 | 1477 | // 汇总 |
| 1273 | console.log("\n" + "=".repeat(60)) | 1478 | console.log("\n" + "=".repeat(60)) |
| 1274 | console.log("📊 解析结果汇总") | 1479 | console.log("📊 解析结果汇总") |
| 1275 | console.log("=".repeat(60)) | 1480 | console.log("=".repeat(60)) |
| 1276 | - console.log("总计: " + docs.length + " 个文档") | 1481 | + console.log("文档: " + docs.length + " 个") |
| 1277 | - console.log("成功: " + successResults.length + " 个") | 1482 | + console.log("产品: " + totalProducts + " 个(成功: " + successProducts + ", 失败: " + (totalProducts - successProducts) + ")") |
| 1278 | - console.log("失败: " + (results.length - successResults.length) + " 个") | 1483 | + |
| 1279 | const summary = buildParseSummary(results, Date.now() - start_time) | 1484 | const summary = buildParseSummary(results, Date.now() - start_time) |
| 1280 | console.log("耗时: " + summary.duration_ms + "ms") | 1485 | console.log("耗时: " + summary.duration_ms + "ms") |
| 1281 | 1486 | ||
| ... | @@ -1283,13 +1488,21 @@ async function parseAllDocs(docs, options = {}) { | ... | @@ -1283,13 +1488,21 @@ async function parseAllDocs(docs, options = {}) { |
| 1283 | if (successResults.length > 0) { | 1488 | if (successResults.length > 0) { |
| 1284 | console.log("\n✅ 成功解析的产品:") | 1489 | console.log("\n✅ 成功解析的产品:") |
| 1285 | successResults.forEach(r => { | 1490 | successResults.forEach(r => { |
| 1286 | - console.log(" - " + r.formSn + ": " + r.config.product_name) | 1491 | + const productInfo = r.config?.product_name || r.productName || '未知产品' |
| 1492 | + console.log(" - " + r.formSn + ": " + productInfo) | ||
| 1287 | }) | 1493 | }) |
| 1288 | } | 1494 | } |
| 1289 | - if (summary.failed_list.length > 0) { | 1495 | + |
| 1290 | - console.log("\n⚠️ 失败明细:") | 1496 | + // 显示失败信息 |
| 1291 | - summary.failed_list.forEach(item => { | 1497 | + const failedResults = results.filter(r => !r.success || (r.multiProduct && r.successCount < r.productCount)) |
| 1292 | - console.log(" - " + item.file + " (" + item.reason + ")") | 1498 | + if (failedResults.length > 0) { |
| 1499 | + console.log("\n⚠️ 失败/部分失败:") | ||
| 1500 | + failedResults.forEach(r => { | ||
| 1501 | + if (r.multiProduct) { | ||
| 1502 | + console.log(" - " + r.file + " (" + r.successCount + "/" + r.productCount + " 成功)") | ||
| 1503 | + } else { | ||
| 1504 | + console.log(" - " + r.file + " (" + (r.reason || 'unknown') + ")") | ||
| 1505 | + } | ||
| 1293 | }) | 1506 | }) |
| 1294 | } | 1507 | } |
| 1295 | 1508 | ||
| ... | @@ -1298,7 +1511,7 @@ async function parseAllDocs(docs, options = {}) { | ... | @@ -1298,7 +1511,7 @@ async function parseAllDocs(docs, options = {}) { |
| 1298 | if (successResults.length > 0) { | 1511 | if (successResults.length > 0) { |
| 1299 | update_result = updateConfigFile(successResults, options) | 1512 | update_result = updateConfigFile(successResults, options) |
| 1300 | } else { | 1513 | } else { |
| 1301 | - console.log("\n❌ 没有成功解析的文档,配置文件未更新") | 1514 | + console.log("\n❌ 没有成功解析的产品,配置文件未更新") |
| 1302 | } | 1515 | } |
| 1303 | const audit_record = buildAuditRecord(summary, options, update_result, 'batch') | 1516 | const audit_record = buildAuditRecord(summary, options, update_result, 'batch') |
| 1304 | writeAuditLog(audit_record) | 1517 | writeAuditLog(audit_record) |
| ... | @@ -1362,21 +1575,40 @@ async function main() { | ... | @@ -1362,21 +1575,40 @@ async function main() { |
| 1362 | 1575 | ||
| 1363 | if (targetDoc) { | 1576 | if (targetDoc) { |
| 1364 | const start_time = Date.now() | 1577 | const start_time = Date.now() |
| 1365 | - const result = await parseSingleFile(targetDoc.fullPath, parserMode) | 1578 | + const result = await parseSingleFile(targetDoc.fullPath) |
| 1366 | const summary = buildParseSummary([result], Date.now() - start_time) | 1579 | const summary = buildParseSummary([result], Date.now() - start_time) |
| 1580 | + | ||
| 1581 | + // 计算产品数量 | ||
| 1582 | + const productCount = result.multiProduct ? result.productCount : 1 | ||
| 1583 | + const successCount = result.multiProduct ? result.successCount : (result.success ? 1 : 0) | ||
| 1584 | + | ||
| 1367 | console.log("\n📊 解析结果汇总") | 1585 | console.log("\n📊 解析结果汇总") |
| 1368 | - console.log("总计: " + summary.total + " 个文档") | 1586 | + console.log("文档: 1 个") |
| 1369 | - console.log("成功: " + summary.success + " 个") | 1587 | + console.log("产品: " + productCount + " 个(成功: " + successCount + ", 失败: " + (productCount - successCount) + ")") |
| 1370 | - console.log("失败: " + summary.failed + " 个") | ||
| 1371 | console.log("耗时: " + summary.duration_ms + "ms") | 1588 | console.log("耗时: " + summary.duration_ms + "ms") |
| 1372 | - if (result.success) { | 1589 | + |
| 1373 | - const update_result = updateConfigFile([result], { dry_run: dryRunMode }) | 1590 | + // 收集成功的产品配置 |
| 1374 | - const audit_record = buildAuditRecord(summary, { dry_run: dryRunMode }, update_result, 'single') | 1591 | + let successConfigs = [] |
| 1375 | - writeAuditLog(audit_record) | 1592 | + if (result.multiProduct) { |
| 1593 | + // 多产品文档:展开子结果 | ||
| 1594 | + if (result.results) { | ||
| 1595 | + successConfigs = result.results.filter(r => r.success && r.code) | ||
| 1596 | + } | ||
| 1597 | + } else if (result.success && result.code) { | ||
| 1598 | + // 单产品文档 | ||
| 1599 | + successConfigs = [result] | ||
| 1600 | + } | ||
| 1601 | + | ||
| 1602 | + // 更新配置文件 | ||
| 1603 | + let update_result = null | ||
| 1604 | + if (successConfigs.length > 0) { | ||
| 1605 | + update_result = updateConfigFile(successConfigs, { dry_run: dryRunMode }) | ||
| 1376 | } else { | 1606 | } else { |
| 1377 | - const audit_record = buildAuditRecord(summary, { dry_run: dryRunMode }, null, 'single') | 1607 | + console.log("\n❌ 没有成功解析的产品,配置文件未更新") |
| 1378 | - writeAuditLog(audit_record) | ||
| 1379 | } | 1608 | } |
| 1609 | + | ||
| 1610 | + const audit_record = buildAuditRecord(summary, { dry_run: dryRunMode }, update_result, 'single') | ||
| 1611 | + writeAuditLog(audit_record) | ||
| 1380 | } else { | 1612 | } else { |
| 1381 | console.log("❌ 找不到文件: " + fileName) | 1613 | console.log("❌ 找不到文件: " + fileName) |
| 1382 | } | 1614 | } | ... | ... |
scripts/product-splitter.js
0 → 100644
| 1 | +/** | ||
| 2 | + * 产品分割器 | ||
| 3 | + * | ||
| 4 | + * @description 从包含多个保险产品的文档中识别并分割出各个产品 | ||
| 5 | + * @module scripts/product-splitter | ||
| 6 | + * @author Claude Code | ||
| 7 | + * @created 2026-02-15 | ||
| 8 | + */ | ||
| 9 | + | ||
| 10 | +/** | |
| 11 | + * Product title matching rules | |
| 12 | + * | |
| 13 | + * @description Regular expressions for recognizing product title lines; every pattern carries the g and m flags, so exec()/test() on these shared objects advance lastIndex. NOTE(review): within this module the array only appears in the export list - confirm external consumers. | |
| 14 | + * Example title formats: | |
| 15 | + * - GS宏摯傳承保障計劃 - 性別, 年齡, 出生年月日 | |
| 16 | + * - GC宏摯家傳承保險計劃- 性別, 年齡, 出生年月日 | |
| 17 | + * - FA 宏浚傳承保障計劃 | |
| 18 | + * - LV2 赤霞珠終身壽險計劃2基本人壽保障選項 | |
| 19 | + */ | |
| 20 | +const PRODUCT_TITLE_PATTERNS = [ | |
| 21 | + // Product code + product name + optional trailing text | |
| 22 | + // e.g. GS宏摯傳承保障計劃 - 性別, 年齡, 出生年月日 | |
| 23 | + /^([A-Z]{2,4}\d?)\s*([^\n\-]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm, | |
| 24 | + | |
| 25 | + // Product code + whitespace + product name ending in one of the keywords | |
| 26 | + // e.g. FA 宏浚傳承保障計劃 | |
| 27 | + /^([A-Z]{2,4}\d?)\s+([^\n]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险))/gm, | |
| 28 | + | |
| 29 | + // Product name without a code (line containing one of the keywords within its first characters) | |
| 30 | + // e.g. 宏摯傳承保障計劃 | |
| 31 | + /^([^\n]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm, | |
| 32 | + | |
| 33 | + // Line starting with a product code followed by "-" or ":" and arbitrary text | |
| 34 | + /^([A-Z]{2,4}\d?)\s*[-:]\s*([^\n]+)/gm | |
| 35 | +] | |
| 36 | + | ||
| 37 | +/** | |
| 38 | + * Known product code prefixes; findProductTitles tries each of these (with an optional trailing digit) before falling back to its generic pattern | |
| 39 | + */ | |
| 40 | +const PRODUCT_CODE_PREFIXES = [ | |
| 41 | + 'GS', 'GC', 'FA', 'LV2', 'LV', 'CR', 'HR', 'PR', 'SR', | |
| 42 | + 'TR', 'UR', 'WR', 'XR', 'YR', 'ZR' | |
| 43 | +] | |
| 44 | + | ||
/**
 * Count the product titles detected in a document.
 *
 * @param {string} content - Document text
 * @returns {number} Number of entries returned by findProductTitles(content)
 */
export function detectProductCount(content) {
  const { length } = findProductTitles(content)
  return length
}
| 55 | + | ||
/**
 * Locate every product title line in a document.
 *
 * Two passes are used:
 *   1. For each entry of PRODUCT_CODE_PREFIXES, look for lines that start with
 *      that prefix (plus an optional digit) followed by CJK text containing one
 *      of the product keywords. Only the first hit per code is kept.
 *   2. Only when pass 1 found nothing, accept any line containing a product
 *      keyword, optionally preceded by a 2-4 letter uppercase code.
 *
 * @param {string} content - Document text
 * @returns {Array<{index: number, code: string|null, name: string, fullTitle: string}>}
 *   Titles ordered by their position in the document; `code` is null when the
 *   generic fallback matched a line without a code
 */
export function findProductTitles(content) {
  const text = String(content)
  const found = []
  const knownCodes = new Set()

  // Pass 1: titles introduced by a known product code prefix,
  // e.g. "GS宏摯傳承保障計劃" or "GS 宏摯傳承保障計劃".
  PRODUCT_CODE_PREFIXES.forEach((prefix) => {
    const titlePattern = new RegExp(
      `^(${prefix}\\d?)\\s*([\\u4e00-\\u9fa5]+(?:計劃|计划|保障|保险|壽險|壽险)[^\\n]*)`,
      'gm'
    )

    for (const hit of text.matchAll(titlePattern)) {
      const [whole, code, rawName] = hit

      // Keep only the first title seen for each code.
      if (knownCodes.has(code)) continue
      knownCodes.add(code)

      found.push({
        index: hit.index,
        code,
        name: rawName.trim(),
        fullTitle: whole.trim()
      })
    }
  })

  // Pass 2: generic fallback, used only when no prefixed title was found.
  if (found.length === 0) {
    const genericPattern = /^([A-Z]{2,4}\d?)?\s*([^\n]*?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm

    for (const hit of text.matchAll(genericPattern)) {
      const fullTitle = hit[0].trim()

      // Skip matches that are too short to be a title.
      if (fullTitle.length >= 5) {
        found.push({
          index: hit.index,
          code: hit[1] || null,
          name: hit[2] || fullTitle,
          fullTitle
        })
      }
    }
  }

  // Order by position in the document.
  found.sort((left, right) => left.index - right.index)

  return found
}
| 116 | + | ||
/**
 * Split a document into one chunk per detected product.
 *
 * With zero or one detected title the whole document is returned as a single
 * entry. With several titles, each entry spans from its own title up to the
 * next title (or the end of the document) and is trimmed; text located before
 * the first title is not part of any entry.
 *
 * @param {string} content - Document text
 * @returns {Array<{code: string|null, name: string|null, content: string, fullTitle: string|null}>}
 *   Products in document order
 */
export function splitByProducts(content) {
  const titles = findProductTitles(content)

  // Zero or one title: the entire document belongs to a single product.
  if (titles.length <= 1) {
    const [only] = titles
    return [{
      code: only ? only.code : null,
      name: only ? only.name : null,
      content,
      fullTitle: only ? only.fullTitle : null
    }]
  }

  // Several titles: cut the text at each title position.
  return titles.map(({ index, code, name, fullTitle }, position) => {
    const following = titles[position + 1]
    const stop = following ? following.index : content.length

    return {
      code,
      name,
      content: content.slice(index, stop).trim(),
      fullTitle
    }
  })
}
| 166 | + | ||
/**
 * Strip a leading product code and any trailing remark from a raw title.
 *
 * @param {string} raw - Raw title text, e.g. "GS宏摯傳承保障計劃 - 性別, 年齡"
 * @returns {string|null} Cleaned name, or null when 2 characters or fewer remain
 */
function normalizeProductName(raw) {
  // Drop a leading code such as "GS", "LV2" plus an optional "-" / ":" separator.
  const withoutCode = raw.replace(/^[A-Z]{2,4}\d?\s*[-:\uFF1A]?\s*/, '')

  // Keep only the text before the first dash or colon (ASCII or full-width).
  const name = withoutCode.split(/[-—:\uFF1A]/)[0].trim()

  return name.length > 2 ? name : null
}

/**
 * Extract a normalized product name.
 *
 * The full title is tried first; when it does not yield a usable name, the
 * content is searched for a "产品名称:" / "计划书名称:" label or for a
 * "<code><CJK text><keyword>" phrase.
 *
 * @param {string|null|undefined} fullTitle - Complete product title line
 * @param {string|null|undefined} content - Product content fragment
 * @returns {string|null} Product name, or null when none can be determined
 *   (including when `content` is missing, which previously raised a TypeError)
 */
export function extractProductName(fullTitle, content) {
  // Prefer the title line.
  if (typeof fullTitle === 'string' && fullTitle) {
    const fromTitle = normalizeProductName(fullTitle)
    if (fromTitle) return fromTitle
  }

  // Nothing left to search when there is no content.
  if (typeof content !== 'string' || !content) return null

  const patterns = [
    /产品名称[:\uFF1A]\s*([^\n]+)/,
    /计划书名称[:\uFF1A]\s*([^\n]+)/,
    /([A-Z]{2,4}\d?\s*[\u4e00-\u9fa5]+(?:計劃|计划|保障|保险|壽險|壽险))/
  ]

  for (const pattern of patterns) {
    const match = content.match(pattern)
    if (!match) continue

    const fromContent = normalizeProductName(match[1] || match[0])
    if (fromContent) return fromContent
  }

  return null
}
| 213 | + | ||
/**
 * Build a Markdown report summarizing how a document was split into products.
 *
 * @description Emits a statistics section (document length, product count)
 * followed by a table with one row per product. Cell text is sanitized so a
 * "|" or line break inside a product code or name cannot break the table.
 * @param {string} content - Original document content; only its length is reported.
 * @param {Array<{code?: string, name?: string, content?: string, fullTitle?: string}>} products
 *   Split products. A missing `content` is reported as 0 characters.
 * @returns {string} The report in Markdown format.
 */
export function generateSplitReport(content, products) {
  const items = Array.isArray(products) ? products : []

  // Keep each cell on one line and escape the column delimiter.
  const toCell = (text) => String(text).replace(/\r?\n/g, ' ').replace(/\|/g, '\\|')

  let report = `## 📊 产品分割报告\n\n`

  report += `### 分割统计\n\n`
  report += `- 文档总长度: ${content?.length ?? 0} 字符\n`
  report += `- 识别产品数: ${items.length} 个\n\n`

  report += `### 产品列表\n\n`
  report += `| 序号 | 产品代码 | 产品名称 | 内容长度 |\n`
  report += `|------|---------|---------|----------|\n`

  items.forEach((product, index) => {
    const code = product.code || '-'
    const name = product.name || product.fullTitle?.slice(0, 20) || '-'
    const length = product.content?.length ?? 0
    // Truncate before escaping so an escape sequence is never cut in half.
    report += `| ${index + 1} | ${toCell(code)} | ${toCell(String(name).slice(0, 30))} | ${length} 字符 |\n`
  })

  return report
}
| 241 | + | ||
| 242 | +export { | ||
| 243 | + PRODUCT_TITLE_PATTERNS, | ||
| 244 | + PRODUCT_CODE_PREFIXES | ||
| 245 | +} |
| ... | @@ -463,80 +463,6 @@ function smartExtractList(content, startPattern, endKeywords, itemFilter) { | ... | @@ -463,80 +463,6 @@ function smartExtractList(content, startPattern, endKeywords, itemFilter) { |
| 463 | } | 463 | } |
| 464 | 464 | ||
| 465 | /** | 465 | /** |
| 466 | - * 智能提取所有字段 | ||
| 467 | - * | ||
| 468 | - * @param {string} content - 文档内容 | ||
| 469 | - * @param {string} fileName - 文件名(用于推断产品名称) | ||
| 470 | - * @returns {{config: Object, unmatched: Array, warnings: Array}} 提取结果 | ||
| 471 | - */ | ||
| 472 | -export function smartExtractFields(content, fileName) { | ||
| 473 | - const config = {} | ||
| 474 | - const unmatched = [] | ||
| 475 | - const warnings = [] | ||
| 476 | - const matchDetails = [] | ||
| 477 | - | ||
| 478 | - // 按优先级提取字段 | ||
| 479 | - const sortedFields = Object.entries(FIELD_RULES).sort((a, b) => a[1].priority - b[1].priority) | ||
| 480 | - | ||
| 481 | - for (const [fieldName, rule] of sortedFields) { | ||
| 482 | - const result = extractField(content, fieldName) | ||
| 483 | - | ||
| 484 | - // 记录匹配详情 | ||
| 485 | - matchDetails.push({ | ||
| 486 | - field: fieldName, | ||
| 487 | - matched: result.matched, | ||
| 488 | - pattern: result.pattern, | ||
| 489 | - value: result.value | ||
| 490 | - }) | ||
| 491 | - | ||
| 492 | - // 如果匹配成功或字段有默认值 | ||
| 493 | - if (result.value !== null) { | ||
| 494 | - config[fieldName] = result.value | ||
| 495 | - | ||
| 496 | - // 如果使用了默认值,记录警告 | ||
| 497 | - if (!result.matched && rule.required) { | ||
| 498 | - warnings.push({ | ||
| 499 | - field: fieldName, | ||
| 500 | - message: `未找到字段 "${fieldName}",使用默认值: ${JSON.stringify(rule.fallback)}`, | ||
| 501 | - severity: 'warning' | ||
| 502 | - }) | ||
| 503 | - } | ||
| 504 | - } else if (rule.required) { | ||
| 505 | - // 必填字段未匹配 | ||
| 506 | - unmatched.push({ | ||
| 507 | - field: fieldName, | ||
| 508 | - reason: '未找到匹配内容', | ||
| 509 | - suggestions: generateSuggestions(fieldName, content) | ||
| 510 | - }) | ||
| 511 | - } | ||
| 512 | - } | ||
| 513 | - | ||
| 514 | - // 产品名称特殊处理:如果未匹配,使用文件名 | ||
| 515 | - if (!config.product_name) { | ||
| 516 | - const baseName = fileName.replace(/\.[^/.]+$/, '') | ||
| 517 | - config.product_name = baseName | ||
| 518 | - warnings.push({ | ||
| 519 | - field: 'product_name', | ||
| 520 | - message: `未找到产品名称,使用文件名: "${baseName}"`, | ||
| 521 | - severity: 'info' | ||
| 522 | - }) | ||
| 523 | - } | ||
| 524 | - | ||
| 525 | - // 根据产品类型过滤字段 | ||
| 526 | - if (config.product_type !== 'savings') { | ||
| 527 | - delete config.withdrawal_modes | ||
| 528 | - delete config.withdrawal_periods | ||
| 529 | - } | ||
| 530 | - | ||
| 531 | - return { | ||
| 532 | - config, | ||
| 533 | - unmatched, | ||
| 534 | - warnings, | ||
| 535 | - matchDetails | ||
| 536 | - } | ||
| 537 | -} | ||
| 538 | - | ||
| 539 | -/** | ||
| 540 | * 生成字段建议值 | 466 | * 生成字段建议值 |
| 541 | * | 467 | * |
| 542 | * @param {string} fieldName - 字段名称 | 468 | * @param {string} fieldName - 字段名称 |
| ... | @@ -614,4 +540,131 @@ export function generateAuditReport(result) { | ... | @@ -614,4 +540,131 @@ export function generateAuditReport(result) { |
| 614 | return report | 540 | return report |
| 615 | } | 541 | } |
| 616 | 542 | ||
/**
 * Extract all configured fields from a single product's content fragment.
 *
 * @description Runs `extractField` for every entry of `FIELD_RULES` in
 * ascending `priority` order, except `product_name`, which is resolved
 * separately with this precedence: `options.productName` > value matched in
 * the content > file name without its extension.
 * @param {string} content - Product content fragment.
 * @param {string} fileName - Source file name, used as the last-resort product name.
 * @param {Object} [options] - Extra options; `null` is treated like `{}`.
 * @param {string} [options.productCode] - Product code (e.g. GS, GC, FA); stored as `config.product_code` when set.
 * @param {string} [options.productName] - Product name supplied by the product splitter.
 * @returns {{config: Object, unmatched: Array, warnings: Array, matchDetails: Array}} Extraction result.
 */
export function smartExtractFieldsForProduct(content, fileName, options = {}) {
  // `options ?? {}` guards against an explicit null, which the default parameter does not cover.
  const { productCode, productName } = options ?? {}
  const config = {}
  const unmatched = []
  const warnings = []
  const matchDetails = []

  // Process fields in ascending priority order.
  const sortedFields = Object.entries(FIELD_RULES).sort((a, b) => a[1].priority - b[1].priority)

  for (const [fieldName, rule] of sortedFields) {
    // product_name is resolved after the loop.
    if (fieldName === 'product_name') continue

    const result = extractField(content, fieldName)

    // Record how (and whether) this field was matched.
    matchDetails.push({
      field: fieldName,
      matched: result.matched,
      pattern: result.pattern,
      value: result.value
    })

    if (result.value !== null) {
      config[fieldName] = result.value

      // A non-null value without a match is reported as a default; warn for required fields.
      if (!result.matched && rule.required) {
        warnings.push({
          field: fieldName,
          message: `未找到字段 "${fieldName}",使用默认值: ${JSON.stringify(rule.fallback)}`,
          severity: 'warning'
        })
      }
    } else if (rule.required) {
      // Required field with no value at all.
      unmatched.push({
        field: fieldName,
        reason: '未找到匹配内容',
        suggestions: generateSuggestions(fieldName, content)
      })
    }
  }

  // Product name resolution; its detail entry is placed first in matchDetails.
  if (productName) {
    // Name supplied by the product splitter wins.
    config.product_name = productName
    matchDetails.unshift({
      field: 'product_name',
      matched: true,
      pattern: 'product_splitter',
      value: productName
    })
  } else {
    const nameResult = extractField(content, 'product_name')
    if (nameResult.matched && nameResult.value) {
      config.product_name = nameResult.value
      matchDetails.unshift({
        field: 'product_name',
        matched: true,
        pattern: nameResult.pattern,
        value: nameResult.value
      })
    } else {
      // Last resort: file name without extension. String() keeps a missing
      // fileName from throwing; it then yields an empty name.
      const baseName = String(fileName ?? '').replace(/\.[^/.]+$/, '')
      config.product_name = baseName
      warnings.push({
        field: 'product_name',
        message: `未找到产品名称,使用文件名: "${baseName}"`,
        severity: 'info'
      })
      matchDetails.unshift({
        field: 'product_name',
        matched: false,
        pattern: 'filename_fallback',
        value: baseName
      })
    }
  }

  // Carry the product code into the config when one was supplied.
  if (productCode) {
    config.product_code = productCode
  }

  // Withdrawal settings are kept only when product_type is 'savings'.
  if (config.product_type !== 'savings') {
    delete config.withdrawal_modes
    delete config.withdrawal_periods
  }

  return {
    config,
    unmatched,
    warnings,
    matchDetails
  }
}
| 658 | + | ||
/**
 * Extract all fields from a document (original entry point, kept for
 * backward compatibility).
 *
 * @description Delegates to `smartExtractFieldsForProduct` with an empty
 * options object, so no splitter-provided product code or name is applied.
 * @param {string} content - Document content.
 * @param {string} fileName - File name (used to infer the product name).
 * @returns {{config: Object, unmatched: Array, warnings: Array, matchDetails: Array}} Extraction result.
 */
export function smartExtractFields(content, fileName) {
  const noProductOptions = {}
  const extraction = smartExtractFieldsForProduct(content, fileName, noProductOptions)
  return extraction
}
| 669 | + | ||
| 617 | export { FIELD_RULES } | 670 | export { FIELD_RULES } | ... | ... |
-
Please register or login to post a comment