docs(parse): 文档解析审核流程完善
- 整理审核流程文档并对齐字段命名与目录规范 - 补充审核模板修复重点与解析策略改进点 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Showing
11 changed files
with
420 additions
and
37 deletions
| ... | @@ -77,6 +77,7 @@ pnpm lint | ... | @@ -77,6 +77,7 @@ pnpm lint |
| 77 | - ✅ **写入稳态化** - 结构化插入、重复检测与 dry-run 预览已接入 | 77 | - ✅ **写入稳态化** - 结构化插入、重复检测与 dry-run 预览已接入 |
| 78 | - ✅ **输出结构补齐** - 解析输出 JSON 结构与稳定 form_sn 规则已明确 | 78 | - ✅ **输出结构补齐** - 解析输出 JSON 结构与稳定 form_sn 规则已明确 |
| 79 | - ✅ **审计与摘要** - 解析摘要与审计日志输出已接入 | 79 | - ✅ **审计与摘要** - 解析摘要与审计日志输出已接入 |
| 80 | +- ✅ **审核流程规划** - 整理审核流程方案并对齐字段与目录规范 | ||
| 80 | 81 | ||
| 81 | ### 测试与验证 | 82 | ### 测试与验证 |
| 82 | - ✅ **回归测试** - pnpm test 通过,pnpm lint 存在 30 个现存警告 | 83 | - ✅ **回归测试** - pnpm test 通过,pnpm lint 存在 30 个现存警告 | ... | ... |
| 1 | +## [2026-02-14] - 文档解析审核方案整理 | ||
| 2 | + | ||
| 3 | +### 优化 | ||
| 4 | +- 整理审核流程文档并对齐字段命名与目录规范 | ||
| 5 | +- 补充审核模板修复重点与解析策略改进点 | ||
| 6 | + | ||
| 7 | +--- | ||
| 8 | + | ||
| 9 | +**详细信息**: | ||
| 10 | +- **影响文件**: docs/tasks/plan/改进文档解析工具-添加审核流程.md, README.md | ||
| 11 | +- **技术栈**: 文档维护 | ||
| 12 | +- **测试状态**: 未运行(仅文档更新) | ||
| 13 | +- **备注**: 明确审核流程现状与修复范围 | ||
| 14 | + | ||
| 15 | +--- | ||
| 16 | + | ||
| 17 | +## [2026-02-14] - markitdown 文档解析服务集成 | ||
| 18 | + | ||
| 19 | +### 新增 | ||
| 20 | +- 集成 markitdown CLI 工具支持 PDF/DOCX 文档解析 | ||
| 21 | +- 创建 parse-config.js 统一配置管理模块 | ||
| 22 | +- 添加配置状态检查命令 `npm run parse:docs:status` | ||
| 23 | +- 创建 .env.example 环境变量模板 | ||
| 24 | +- 新增 scripts/README.md 使用指南 | ||
| 25 | + | ||
| 26 | +### 优化 | ||
| 27 | +- MD/TXT 文件直接读取,无需 markitdown 处理 | ||
| 28 | +- PDF/DOCX 文件通过 markitdown CLI 转换 | ||
| 29 | +- 添加 markitdown 失败时的本地库回退机制 | ||
| 30 | + | ||
| 31 | +--- | ||
| 32 | + | ||
| 33 | +**详细信息**: | ||
| 34 | +- **影响文件**: scripts/parse-config.js, scripts/parse-docs.js, scripts/.env.example, scripts/README.md, package.json | ||
| 35 | +- **技术栈**: Node.js, Python (markitdown v0.1.4), child_process | ||
| 36 | +- **测试状态**: 已通过(MD 文件解析验证) | ||
| 37 | +- **备注**: markitdown CLI 已安装,配置已启用 (type: 'cli') | ||
| 38 | + | ||
| 39 | +--- | ||
| 40 | + | ||
| 1 | ## [2026-02-14] - 空表单回退规则补齐 | 41 | ## [2026-02-14] - 空表单回退规则补齐 |
| 2 | 42 | ||
| 3 | ### 修复 | 43 | ### 修复 | ... | ... |
| ... | @@ -4,3 +4,4 @@ | ... | @@ -4,3 +4,4 @@ |
| 4 | {"action":"update","backup_file":"/Users/huyirui/program/itomix/git/manulife-weapp/docs/parsed-backup/plan-templates.backup.1771077989896.js","target_file":"/Users/huyirui/program/itomix/git/manulife-weapp/src/config/plan-templates.js","form_sn_list":["savings-readme-a4296d1f"],"at":"2026-02-14T14:06:29.896Z"} | 4 | {"action":"update","backup_file":"/Users/huyirui/program/itomix/git/manulife-weapp/docs/parsed-backup/plan-templates.backup.1771077989896.js","target_file":"/Users/huyirui/program/itomix/git/manulife-weapp/src/config/plan-templates.js","form_sn_list":["savings-readme-a4296d1f"],"at":"2026-02-14T14:06:29.896Z"} |
| 5 | {"action":"update","backup_file":"/Users/huyirui/program/itomix/git/manulife-weapp/docs/parsed-backup/plan-templates.backup.1771078080604.js","target_file":"/Users/huyirui/program/itomix/git/manulife-weapp/src/config/plan-templates.js","form_sn_list":["savings-readme-a4296d1f"],"at":"2026-02-14T14:08:00.605Z"} | 5 | {"action":"update","backup_file":"/Users/huyirui/program/itomix/git/manulife-weapp/docs/parsed-backup/plan-templates.backup.1771078080604.js","target_file":"/Users/huyirui/program/itomix/git/manulife-weapp/src/config/plan-templates.js","form_sn_list":["savings-readme-a4296d1f"],"at":"2026-02-14T14:08:00.605Z"} |
| 6 | {"action":"update","backup_file":"/Users/huyirui/program/itomix/git/manulife-weapp/docs/parsed-backup/plan-templates.backup.1771078351660.js","target_file":"/Users/huyirui/program/itomix/git/manulife-weapp/src/config/plan-templates.js","form_sn_list":["savings-2-148b3acd"],"at":"2026-02-14T14:12:31.660Z"} | 6 | {"action":"update","backup_file":"/Users/huyirui/program/itomix/git/manulife-weapp/docs/parsed-backup/plan-templates.backup.1771078351660.js","target_file":"/Users/huyirui/program/itomix/git/manulife-weapp/src/config/plan-templates.js","form_sn_list":["savings-2-148b3acd"],"at":"2026-02-14T14:12:31.660Z"} |
| 7 | +{"action":"update","backup_file":"/Users/huyirui/program/itomix/git/manulife-weapp/docs/parsed-backup/plan-templates.backup.1771080130974.js","target_file":"/Users/huyirui/program/itomix/git/manulife-weapp/src/config/plan-templates.js","form_sn_list":["savings-2-55bcffc2"],"at":"2026-02-14T14:42:10.974Z"} | ... | ... |
| ... | @@ -5,3 +5,9 @@ | ... | @@ -5,3 +5,9 @@ |
| 5 | {"at":"2026-02-14T14:06:29.897Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":2,"success_list":[{"form_sn":"savings-readme-a4296d1f","product_name":"README","file":"README.md"}],"failed_list":[]},"change_summary":{"ok":true,"dry_run":false,"updated_count":1,"form_sn_list":["savings-readme-a4296d1f"],"conflicts":[],"reason":null}} | 5 | {"at":"2026-02-14T14:06:29.897Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":2,"success_list":[{"form_sn":"savings-readme-a4296d1f","product_name":"README","file":"README.md"}],"failed_list":[]},"change_summary":{"ok":true,"dry_run":false,"updated_count":1,"form_sn_list":["savings-readme-a4296d1f"],"conflicts":[],"reason":null}} |
| 6 | {"at":"2026-02-14T14:08:00.605Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":2,"success_list":[{"form_sn":"savings-readme-a4296d1f","product_name":"README","file":"README.md"}],"failed_list":[]},"change_summary":{"ok":true,"dry_run":false,"updated_count":1,"form_sn_list":["savings-readme-a4296d1f"],"conflicts":[],"reason":null}} | 6 | {"at":"2026-02-14T14:08:00.605Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":2,"success_list":[{"form_sn":"savings-readme-a4296d1f","product_name":"README","file":"README.md"}],"failed_list":[]},"change_summary":{"ok":true,"dry_run":false,"updated_count":1,"form_sn_list":["savings-readme-a4296d1f"],"conflicts":[],"reason":null}} |
| 7 | {"at":"2026-02-14T14:12:31.661Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":1,"success_list":[{"form_sn":"savings-2-148b3acd","product_name":"测试计划书-智享未来2","file":"测试计划书-智享未来2.md"}],"failed_list":[]},"change_summary":{"ok":true,"dry_run":false,"updated_count":1,"form_sn_list":["savings-2-148b3acd"],"conflicts":[],"reason":null}} | 7 | {"at":"2026-02-14T14:12:31.661Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":1,"success_list":[{"form_sn":"savings-2-148b3acd","product_name":"测试计划书-智享未来2","file":"测试计划书-智享未来2.md"}],"failed_list":[]},"change_summary":{"ok":true,"dry_run":false,"updated_count":1,"form_sn_list":["savings-2-148b3acd"],"conflicts":[],"reason":null}} |
| 8 | +{"at":"2026-02-14T14:34:05.582Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":2,"success_list":[{"form_sn":"savings-2-148b3acd","product_name":"测试计划书-智享未来2","file":"测试计划书-智享未来2.md"}],"failed_list":[]},"change_summary":{"ok":false,"dry_run":false,"updated_count":0,"form_sn_list":[],"conflicts":["savings-2-148b3acd"],"reason":"conflict"}} | ||
| 9 | +{"at":"2026-02-14T14:34:22.438Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":2,"success_list":[{"form_sn":"savings-2-148b3acd","product_name":"测试计划书-智享未来2","file":"测试计划书-智享未来2.md"}],"failed_list":[]},"change_summary":{"ok":false,"dry_run":false,"updated_count":0,"form_sn_list":[],"conflicts":["savings-2-148b3acd"],"reason":"conflict"}} | ||
| 10 | +{"at":"2026-02-14T14:34:50.292Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":1153,"success_list":[{"form_sn":"savings-2-148b3acd","product_name":"测试计划书-智享未来2","file":"测试计划书-智享未来2.md"}],"failed_list":[]},"change_summary":{"ok":false,"dry_run":false,"updated_count":0,"form_sn_list":[],"conflicts":["savings-2-148b3acd"],"reason":"conflict"}} | ||
| 11 | +{"at":"2026-02-14T14:35:12.489Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":6,"success_list":[{"form_sn":"savings-2-148b3acd","product_name":"测试计划书-智享未来2","file":"测试计划书-智享未来2.md"}],"failed_list":[]},"change_summary":{"ok":false,"dry_run":false,"updated_count":0,"form_sn_list":[],"conflicts":["savings-2-148b3acd"],"reason":"conflict"}} | ||
| 12 | +{"at":"2026-02-14T14:35:32.726Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":2,"success_list":[{"form_sn":"savings-2-148b3acd","product_name":"测试计划书-智享未来2","file":"测试计划书-智享未来2.md"}],"failed_list":[]},"change_summary":{"ok":false,"dry_run":false,"updated_count":0,"form_sn_list":[],"conflicts":["savings-2-148b3acd"],"reason":"conflict"}} | ||
| 13 | +{"at":"2026-02-14T14:42:10.975Z","mode":"single","options":{"dry_run":false},"summary":{"total":1,"success":1,"failed":0,"duration_ms":28,"success_list":[{"form_sn":"savings-2-55bcffc2","product_name":"计划书模版2","file":"计划书模版2.docx"}],"failed_list":[]},"change_summary":{"ok":true,"dry_run":false,"updated_count":1,"form_sn_list":["savings-2-55bcffc2"],"conflicts":[],"reason":null}} | ... | ... |
This diff is collapsed. Click to expand it.
| ... | @@ -24,16 +24,28 @@ | ... | @@ -24,16 +24,28 @@ |
| 24 | - 需要"人工辅助"的半自动化方式 | 24 | - 需要"人工辅助"的半自动化方式 |
| 25 | - 在自动解析和直接生成配置之间增加审核环节 | 25 | - 在自动解析和直接生成配置之间增加审核环节 |
| 26 | 26 | ||
| 27 | +### 现状评审与差距 | ||
| 28 | +1. **审核流程已接入但模板不稳定**: | ||
| 29 | + - parse-docs.js 已生成待审核文件并阻断写入配置,但审核模板存在重复定义与内容断裂风险 | ||
| 30 | +2. **字段命名不一致**: | ||
| 31 | + - 现有方案示例使用 name/type,与实际解析配置字段 product_name/product_type 不一致 | ||
| 32 | +3. **审核指引不清晰**: | ||
| 33 | + - 审核文件的“通过后操作”指向备份文件,未明确 pending/approved 目录治理 | ||
| 34 | +4. **解析结果可读性不足**: | ||
| 35 | + - 审核模板对 form_schema 与 submit_mapping 预览不足,无法快速确认关键字段 | ||
| 36 | +5. **解析方式描述需要更新**: | ||
| 37 | + - mammoth 的 Markdown 输出存在局限,复杂表格准确性不足,需要明确替代策略 | ||
| 38 | + | ||
| 27 | --- | 39 | --- |
| 28 | 40 | ||
| 29 | ## 解决方案 | 41 | ## 解决方案 |
| 30 | 42 | ||
| 31 | ### 方案设计 | 43 | ### 方案设计 |
| 32 | -采用 **"解析 → 审核 → 生成"** 三步流程,支持多种解析方式: | 44 | +采用 **"解析 → 审核 → 人工合并"** 三步流程,支持多种解析方式: |
| 33 | 45 | ||
| 34 | ``` | 46 | ``` |
| 35 | ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ | 47 | ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ |
| 36 | -│ 选择解析方式 │ → │ 生成待审核文件 │ → │ 人工审核后移动 │ | 48 | +│ 选择解析方式 │ → │ 生成待审核文件 │ → │ 审核通过后合并 │ |
| 37 | │ mammoth/MCP │ │ (markdown) │ │ 到正式配置 │ | 49 | │ mammoth/MCP │ │ (markdown) │ │ 到正式配置 │ |
| 38 | └─────────────────┘ └─────────────────┘ └─────────────────┘ | 50 | └─────────────────┘ └─────────────────┘ └─────────────────┘ |
| 39 | ``` | 51 | ``` |
| ... | @@ -52,17 +64,21 @@ | ... | @@ -52,17 +64,21 @@ |
| 52 | - **mammoth**: 快速预览、简单文档、离线使用 | 64 | - **mammoth**: 快速预览、简单文档、离线使用 |
| 53 | - **MCP**: 复杂文档、准确度要求高、有网络连接 | 65 | - **MCP**: 复杂文档、准确度要求高、有网络连接 |
| 54 | 66 | ||
| 67 | +#### 注意事项 | ||
| 68 | +- mammoth 的 Markdown 输出能力有限,复杂结构建议使用 HTML 输出后再转 Markdown | ||
| 69 | +- 审核文件统一输出到 `docs/parse-audit/pending/`,通过后移动到 `docs/parse-audit/approved/` | ||
| 70 | + | ||
| 55 | ### 技术实现 | 71 | ### 技术实现 |
| 56 | 72 | ||
| 57 | -#### 1. 改进extractProductBasicInfo | 73 | +#### 1. 改进基础信息抽取 |
| 58 | -尝试从多个位置提取产品基本信息: | 74 | +从多个位置提取产品基本信息(与实际配置字段对齐): |
| 59 | 75 | ||
| 60 | ```javascript | 76 | ```javascript |
| 61 | // 尝试从文档标题、表格、特定文本模式提取 | 77 | // 尝试从文档标题、表格、特定文本模式提取 |
| 62 | async function extractProductBasicInfo(content, fileName) { | 78 | async function extractProductBasicInfo(content, fileName) { |
| 63 | const info = { | 79 | const info = { |
| 64 | - name: '', | 80 | + product_name: '', |
| 65 | - type: 'savings', // 默认储蓄型 | 81 | + product_type: 'savings', |
| 66 | currency: 'USD', | 82 | currency: 'USD', |
| 67 | form_sn: generateFormSn(fileName) | 83 | form_sn: generateFormSn(fileName) |
| 68 | } | 84 | } |
| ... | @@ -70,7 +86,7 @@ async function extractProductBasicInfo(content, fileName) { | ... | @@ -70,7 +86,7 @@ async function extractProductBasicInfo(content, fileName) { |
| 70 | // 策略1: 从文档标题提取 | 86 | // 策略1: 从文档标题提取 |
| 71 | const titleMatch = content.match(/^#\s+(.+)$/m) | 87 | const titleMatch = content.match(/^#\s+(.+)$/m) |
| 72 | if (titleMatch) { | 88 | if (titleMatch) { |
| 73 | - info.name = cleanProductName(titleMatch[1].trim()) | 89 | + info.product_name = cleanProductName(titleMatch[1].trim()) |
| 74 | } | 90 | } |
| 75 | 91 | ||
| 76 | // 策略2: 从表格中提取"币种"信息 | 92 | // 策略2: 从表格中提取"币种"信息 |
| ... | @@ -81,9 +97,9 @@ async function extractProductBasicInfo(content, fileName) { | ... | @@ -81,9 +97,9 @@ async function extractProductBasicInfo(content, fileName) { |
| 81 | 97 | ||
| 82 | // 策略3: 从表格中提取"产品类型"信息 | 98 | // 策略3: 从表格中提取"产品类型"信息 |
| 83 | if (content.includes('重疾') || content.includes('危疾')) { | 99 | if (content.includes('重疾') || content.includes('危疾')) { |
| 84 | - info.type = 'critical-illness' | 100 | + info.product_type = 'critical-illness' |
| 85 | } else if (content.includes('人寿')) { | 101 | } else if (content.includes('人寿')) { |
| 86 | - info.type = 'life-insurance' | 102 | + info.product_type = 'life-insurance' |
| 87 | } | 103 | } |
| 88 | 104 | ||
| 89 | return info | 105 | return info |
| ... | @@ -110,8 +126,8 @@ async function generateAuditFile(fileName, config, code) { | ... | @@ -110,8 +126,8 @@ async function generateAuditFile(fileName, config, code) { |
| 110 | 126 | ||
| 111 | | 字段 | 提取值 | 需要确认 | | 127 | | 字段 | 提取值 | 需要确认 | |
| 112 | |------|--------|---------| | 128 | |------|--------|---------| |
| 113 | -| 产品名称 | ${config.name || '未提取'} | ✅ 请核对产品名称 | | 129 | +| 产品名称 | ${config.product_name || '未提取'} | ✅ 请核对产品名称 | |
| 114 | -| 产品类型 | ${config.type || '未提取'} | ✅ 请确认产品类型 | | 130 | +| 产品类型 | ${config.product_type || '未提取'} | ✅ 请确认产品类型 | |
| 115 | | 币种 | ${config.currency || 'USD'} | ✅ 请确认币种 | | 131 | | 币种 | ${config.currency || 'USD'} | ✅ 请确认币种 | |
| 116 | form_sn | \`${config.form_sn || '未生成'}\` | ✅ 请确认form_sn唯一性 | | 132 | form_sn | \`${config.form_sn || '未生成'}\` | ✅ 请确认form_sn唯一性 | |
| 117 | 133 | ||
| ... | @@ -151,9 +167,9 @@ ${code.submit_mapping || '// 请手动补充'} | ... | @@ -151,9 +167,9 @@ ${code.submit_mapping || '// 请手动补充'} |
| 151 | 167 | ||
| 152 | ### 确认无误 | 168 | ### 确认无误 |
| 153 | \`\`\`bash | 169 | \`\`\`bash |
| 154 | -# 1. 移动配置到正式文件 | 170 | +# 1. 移动到 approved 目录 |
| 155 | mv docs/parse-audit/pending/${auditFileName} \\ | 171 | mv docs/parse-audit/pending/${auditFileName} \\ |
| 156 | - src/config/plan-templates.backup.js | 172 | + docs/parse-audit/approved/ |
| 157 | 173 | ||
| 158 | # 2. 合并到正式配置 | 174 | # 2. 合并到正式配置 |
| 159 | # 手动复制或使用工具合并 | 175 | # 手动复制或使用工具合并 |
| ... | @@ -185,28 +201,23 @@ rm docs/parse-audit/pending/${auditFileName} | ... | @@ -185,28 +201,23 @@ rm docs/parse-audit/pending/${auditFileName} |
| 185 | 201 | ||
| 186 | ## 实施计划 | 202 | ## 实施计划 |
| 187 | 203 | ||
| 188 | -### 阶段1: 改进解析逻辑 (30分钟) | 204 | +### 阶段1: 修复审核模板与字段对齐 |
| 189 | -- [ ] 改进extractProductBasicInfo函数 | 205 | +- [ ] 清理 generateAuditFile 重复定义与模板断裂问题 |
| 190 | - - [ ] 添加文档标题提取 | 206 | +- [ ] 统一字段命名为 product_name/product_type/currency/form_sn |
| 191 | - - [ ] 添加币种信息提取 | 207 | +- [ ] 优化审核模板展示 form_schema 与 submit_mapping |
| 192 | - - [ ] 添加产品类型推断 | 208 | + |
| 193 | - - [ ] 测试验证提取效果 | 209 | +### 阶段2: 审核流程治理 |
| 194 | - | 210 | +- [ ] 确认 pending/approved 目录结构 |
| 195 | -### 阶段2: 实现审核文件生成 (20分钟) | 211 | +- [ ] 明确审核通过后的合并指引 |
| 196 | -- [ ] 实现generateAuditFile函数 | 212 | +- [ ] 补齐审核状态与审核意见模板 |
| 197 | - - [ ] 创建待审核目录结构 | 213 | + |
| 198 | - - [ ] 测试生成markdown格式 | 214 | +### 阶段3: 解析策略补齐 |
| 199 | - - [ ] 添加文件路径返回 | 215 | +- [ ] 增加标题/币种/类型的启发式补位策略 |
| 200 | - | 216 | +- [ ] 补齐文档样本验证与失败兜底说明 |
| 201 | -### 阶段3: 集成到主流程 (10分钟) | 217 | + |
| 202 | -- [ ] 更新parse-docs.js主函数 | 218 | +### 阶段4: 测试验证 |
| 203 | - - [ ] 添加成功提示和审核引导 | 219 | +- [ ] 使用实际文档回归生成审核文件 |
| 204 | - - [ ] 错误处理和日志输出 | 220 | +- [ ] 校验审核模板完整性与可读性 |
| 205 | - | ||
| 206 | -### 阶段4: 测试验证 (10分钟) | ||
| 207 | -- [ ] 使用实际文档测试 | ||
| 208 | -- [ ] 验证生成的审核文件格式 | ||
| 209 | - - [ ] 确认目录结构正确 | ||
| 210 | 221 | ||
| 211 | --- | 222 | --- |
| 212 | 223 | ||
| ... | @@ -234,6 +245,7 @@ rm docs/parse-audit/pending/${auditFileName} | ... | @@ -234,6 +245,7 @@ rm docs/parse-audit/pending/${auditFileName} |
| 234 | | 提取仍不准确 | 需要大量人工补充 | 提供清晰的标记和默认值 | | 245 | | 提取仍不准确 | 需要大量人工补充 | 提供清晰的标记和默认值 | |
| 235 | | 审核文件过多 | 难以管理 | 定期清理已审核文件 | | 246 | | 审核文件过多 | 难以管理 | 定期清理已审核文件 | |
| 236 | | 目录权限问题 | 无法写入文件 | 提前创建目录并检查权限 | | 247 | | 目录权限问题 | 无法写入文件 | 提前创建目录并检查权限 | |
| 248 | +| mammoth 输出限制 | 表格/结构信息丢失 | 使用 HTML 输出后再转 Markdown | | ||
| 237 | 249 | ||
| 238 | --- | 250 | --- |
| 239 | 251 | ||
| ... | @@ -254,6 +266,6 @@ rm docs/parse-audit/pending/${auditFileName} | ... | @@ -254,6 +266,6 @@ rm docs/parse-audit/pending/${auditFileName} |
| 254 | --- | 266 | --- |
| 255 | 267 | ||
| 256 | ## 相关文档 | 268 | ## 相关文档 |
| 257 | -- [mamoth使用文档](https://github.com/mwilliamtohman/mammoth) | 269 | +- [mammoth 使用文档](https://github.com/mwilliamson/mammoth.js) |
| 258 | - [计划书模板配置规范](../../src/config/CLAUDE.md) | 270 | - [计划书模板配置规范](../../src/config/CLAUDE.md) |
| 259 | - [代码注释规范](~/.claude/rules/code-commenting.md) | 271 | - [代码注释规范](~/.claude/rules/code-commenting.md) | ... | ... |
| ... | @@ -36,6 +36,7 @@ | ... | @@ -36,6 +36,7 @@ |
| 36 | "prepare": "husky", | 36 | "prepare": "husky", |
| 37 | "parse:docs": "node scripts/parse-docs.js", | 37 | "parse:docs": "node scripts/parse-docs.js", |
| 38 | "parse:docs:list": "node scripts/parse-docs.js --list", | 38 | "parse:docs:list": "node scripts/parse-docs.js --list", |
| 39 | + "parse:docs:status": "node scripts/parse-docs.js --status", | ||
| 39 | "parse:docs:file": "node scripts/parse-docs.js --file=", | 40 | "parse:docs:file": "node scripts/parse-docs.js --file=", |
| 40 | "release": "standard-version" | 41 | "release": "standard-version" |
| 41 | }, | 42 | }, | ... | ... |
scripts/.env.example
0 → 100644
| 1 | +# 文档解析服务配置示例 | ||
| 2 | +# | ||
| 3 | +# 使用方式: | ||
| 4 | +# 1. 复制此文件为 .env | ||
| 5 | +# 2. 填写你的 API Key | ||
| 6 | +# 3. 运行 npm run parse:docs -- --status 检查配置状态 | ||
| 7 | + | ||
| 8 | +# ========== markitdown 配置 ========== | ||
| 9 | +# markitdown 服务类型: cli | docker | http | disabled(注意:scripts/parse-config.js 中 MARKITDOWN_CONFIG.type 目前固定为 'cli',该文件尚未读取 MARKITDOWN_TYPE) | ||
| 10 | +MARKITDOWN_TYPE=disabled | ||
| 11 | + | ||
| 12 | +# markitdown HTTP API 地址(仅当 MARKITDOWN_TYPE=http 时需要) | ||
| 13 | +# MARKITDOWN_URL=http://localhost:8000/convert | ||
| 14 | + | ||
| 15 | +# ========== AI 服务配置 ========== | ||
| 16 | +# AI 服务类型: openai | anthropic | openrouter | disabled | ||
| 17 | +AI_SERVICE_TYPE=disabled | ||
| 18 | + | ||
| 19 | +# OpenAI 配置 | ||
| 20 | +OPENAI_API_KEY=sk-your-openai-api-key-here | ||
| 21 | +OPENAI_BASE_URL=https://api.openai.com/v1 | ||
| 22 | +OPENAI_MODEL=gpt-4-turbo | ||
| 23 | + | ||
| 24 | +# Anthropic 配置 | ||
| 25 | +ANTHROPIC_API_KEY=sk-ant-your-anthropic-api-key-here | ||
| 26 | +ANTHROPIC_BASE_URL=https://api.anthropic.com/v1 | ||
| 27 | +ANTHROPIC_MODEL=claude-3-sonnet-20240229 | ||
| 28 | + | ||
| 29 | +# OpenRouter 配置 | ||
| 30 | +OPENROUTER_API_KEY=sk-or-your-openrouter-api-key-here | ||
| 31 | +OPENROUTER_BASE_URL=https://openrouter.ai/api/v1 | ||
| 32 | +OPENROUTER_MODEL=anthropic/claude-3-sonnet |
scripts/README.md
0 → 100644
| 1 | +# 文档解析工具使用指南 | ||
| 2 | + | ||
| 3 | +## 功能概述 | ||
| 4 | + | ||
| 5 | +文档解析工具用于将保险产品文档(PDF、DOCX)自动解析为计划书配置,支持智能识别产品类型、币种、缴费年期等信息。 | ||
| 6 | + | ||
| 7 | +## 快速开始 | ||
| 8 | + | ||
| 9 | +### 1. 查看待处理文档 | ||
| 10 | + | ||
| 11 | +```bash | ||
| 12 | +npm run parse:docs:list | ||
| 13 | +``` | ||
| 14 | + | ||
| 15 | +### 2. 查看配置状态 | ||
| 16 | + | ||
| 17 | +```bash | ||
| 18 | +npm run parse:docs:status | ||
| 19 | +``` | ||
| 20 | + | ||
| 21 | +输出示例: | ||
| 22 | +``` | ||
| 23 | +🔧 文档解析服务配置状态: | ||
| 24 | +────────────────────────────────────────────────── | ||
| 25 | +📄 markitdown: ❌ 未配置 | ||
| 26 | +🤖 AI 服务: ❌ 未配置 | ||
| 27 | +────────────────────────────────────────────────── | ||
| 28 | + | ||
| 29 | +💡 配置提示: | ||
| 30 | + 1. 使用 markitdown: 安装 Python 并运行 "pip install markitdown" | ||
| 31 | + 2. 配置 AI 服务: 设置环境变量(.env 文件) | ||
| 32 | +``` | ||
| 33 | + | ||
| 34 | +### 3. 解析所有文档 | ||
| 35 | + | ||
| 36 | +```bash | ||
| 37 | +npm run parse:docs | ||
| 38 | +``` | ||
| 39 | + | ||
| 40 | +### 4. 解析单个文档 | ||
| 41 | + | ||
| 42 | +```bash | ||
| 43 | +npm run parse:docs -- --file="产品说明书.pdf" | ||
| 44 | +``` | ||
| 45 | + | ||
| 46 | +## 配置 AI 服务(可选) | ||
| 47 | + | ||
| 48 | +如需启用智能解析功能,请配置以下环境变量: | ||
| 49 | + | ||
| 50 | +### 方法 1: 使用 .env 文件 | ||
| 51 | + | ||
| 52 | +```bash | ||
| 53 | +# 复制示例配置 | ||
| 54 | +cp scripts/.env.example scripts/.env | ||
| 55 | + | ||
| 56 | +# 编辑 .env 文件,填写 API Key | ||
| 57 | +vim scripts/.env | ||
| 58 | +``` | ||
| 59 | + | ||
| 60 | +### 方法 2: 使用环境变量 | ||
| 61 | + | ||
| 62 | +```bash | ||
| 63 | +export AI_SERVICE_TYPE=openai | ||
| 64 | +export OPENAI_API_KEY=sk-your-key-here | ||
| 65 | +npm run parse:docs | ||
| 66 | +``` | ||
| 67 | + | ||
| 68 | +## 支持的 AI 服务 | ||
| 69 | + | ||
| 70 | +| 服务 | 说明 | 环境变量 | | ||
| 71 | +|------|------|---------| | ||
| 72 | +| OpenAI | GPT-4/GPT-3.5 | `OPENAI_API_KEY` | | ||
| 73 | +| Anthropic | Claude 3 Sonnet | `ANTHROPIC_API_KEY` | | ||
| 74 | +| OpenRouter | 聚合服务 | `OPENROUTER_API_KEY` | | ||
| 75 | + | ||
| 76 | +## 解析流程 | ||
| 77 | + | ||
| 78 | +1. **文档转换**:将 PDF/DOCX 转换为可读文本 | ||
| 79 | +2. **AI 解析**:从文本中提取结构化配置(产品类型、币种、年期等) | ||
| 80 | +3. **生成代码**:生成 `plan-templates.js` 配置代码 | ||
| 81 | +4. **更新配置**:自动更新到配置文件 | ||
| 82 | + | ||
| 83 | +## 当前状态 | ||
| 84 | + | ||
| 85 | +- ✅ **基础功能**:支持 PDF、DOCX 文本提取 | ||
| 86 | +- ✅ **启发式推断**:根据文件名和内容推断产品类型和币种 | ||
| 87 | +- ⏳ **AI 解析**:待集成 AI 服务(需要配置 API Key) | ||
| 88 | + | ||
| 89 | +## 文档位置 | ||
| 90 | + | ||
| 91 | +待解析文档放在:`docs/to-parse/` 文件夹 | ||
| 92 | + | ||
| 93 | +支持格式:`.pdf`, `.docx`, `.doc`, `.txt`, `.md` |
scripts/parse-config.js
0 → 100644
| 1 | +/** | ||
| 2 | + * 文档解析服务配置 | ||
| 3 | + * | ||
| 4 | + * @description 配置 markitdown 和 AI 服务的访问信息 | ||
| 5 | + * @module scripts/parse-config | ||
| 6 | + * @author Claude Code | ||
| 7 | + * @created 2026-02-14 | ||
| 8 | + */ | ||
| 9 | + | ||
| 10 | +/** | ||
| 11 | + * markitdown 服务配置 | ||
| 12 | + * | ||
| 13 | + * @description markitdown 是一个将 PDF/DOCX/PPTX/XLSX 等多种格式转换为 Markdown 的工具 | ||
| 14 | + * | ||
| 15 | + * 安装方式: | ||
| 16 | + * 1. Python: pip install markitdown | ||
| 17 | + * 2. Docker: docker pull ghcr.io/onurtemiz/markitdown | ||
| 18 | + * 3. HTTP API: 部署 markitdown 服务 | ||
| 19 | + * | ||
| 20 | + * 使用方式: | ||
| 21 | + * - CLI: markitdown input.docx -o output.md | ||
| 22 | + * - Docker: docker run --rm -v $(pwd):/app markitdown input.docx output.md | ||
| 23 | + * - HTTP: POST /convert with file upload | ||
| 24 | + */ | ||
| 25 | +export const MARKITDOWN_CONFIG = { | ||
| 26 | + // markitdown 服务类型 | ||
| 27 | + // - 'cli': 使用命令行工具(需要本地安装 Python) | ||
| 28 | + // - 'docker': 使用 Docker 容器 | ||
| 29 | + // - 'http': 使用 HTTP API | ||
| 30 | + // - 'disabled': 禁用,使用本地库 | ||
| 31 | + type: 'cli', | ||
| 32 | + | ||
| 33 | + // CLI 配置 | ||
| 34 | + cli: { | ||
| 35 | + command: 'markitdown', // 命令行工具路径 | ||
| 36 | + timeout: 30000 // 超时时间(毫秒) | ||
| 37 | + }, | ||
| 38 | + | ||
| 39 | + // Docker 配置 | ||
| 40 | + docker: { | ||
| 41 | + image: 'ghcr.io/onurtemiz/markitdown', | ||
| 42 | + timeout: 30000 | ||
| 43 | + }, | ||
| 44 | + | ||
| 45 | + // HTTP API 配置 | ||
| 46 | + http: { | ||
| 47 | + url: process.env.MARKITDOWN_URL || 'http://localhost:8000/convert', | ||
| 48 | + timeout: 30000 | ||
| 49 | + } | ||
| 50 | +} | ||
| 51 | + | ||
| 52 | +/** | ||
| 53 | + * AI 解析服务配置 | ||
| 54 | + * | ||
| 55 | + * @description 配置用于智能解析文档内容的 AI 服务 | ||
| 56 | + * | ||
| 57 | + * 支持的 AI 服务: | ||
| 58 | + * - 'openai': OpenAI GPT-4/GPT-3.5 | ||
| 59 | + * - 'anthropic': Anthropic Claude | ||
| 60 | + * - 'openrouter': OpenRouter(聚合服务) | ||
| 61 | + * - 'disabled': 禁用 AI 解析 | ||
| 62 | + */ | ||
| 63 | +export const AI_SERVICE_CONFIG = { | ||
| 64 | + // AI 服务类型 | ||
| 65 | + type: process.env.AI_SERVICE_TYPE || 'disabled', | ||
| 66 | + | ||
| 67 | + // OpenAI 配置 | ||
| 68 | + openai: { | ||
| 69 | + apiKey: process.env.OPENAI_API_KEY || '', | ||
| 70 | + baseURL: process.env.OPENAI_BASE_URL || 'https://api.openai.com/v1', | ||
| 71 | + model: process.env.OPENAI_MODEL || 'gpt-4-turbo', | ||
| 72 | + timeout: 60000 | ||
| 73 | + }, | ||
| 74 | + | ||
| 75 | + // Anthropic 配置 | ||
| 76 | + anthropic: { | ||
| 77 | + apiKey: process.env.ANTHROPIC_API_KEY || '', | ||
| 78 | + baseURL: process.env.ANTHROPIC_BASE_URL || 'https://api.anthropic.com/v1', | ||
| 79 | + model: process.env.ANTHROPIC_MODEL || 'claude-3-sonnet-20240229', | ||
| 80 | + timeout: 60000 | ||
| 81 | + }, | ||
| 82 | + | ||
| 83 | + // OpenRouter 配置 | ||
| 84 | + openrouter: { | ||
| 85 | + apiKey: process.env.OPENROUTER_API_KEY || '', | ||
| 86 | + baseURL: process.env.OPENROUTER_BASE_URL || 'https://openrouter.ai/api/v1', | ||
| 87 | + model: process.env.OPENROUTER_MODEL || 'anthropic/claude-3-sonnet', | ||
| 88 | + timeout: 60000 | ||
| 89 | + } | ||
| 90 | +} | ||
| 91 | + | ||
| 92 | +/** | ||
| 93 | + * 检查 markitdown 服务是否可用 | ||
| 94 | + * | ||
| 95 | + * @returns {Promise<boolean>} 是否可用 | ||
| 96 | + */ | ||
| 97 | +export async function checkMarkitdownAvailable() { | ||
| 98 | + const { type } = MARKITDOWN_CONFIG | ||
| 99 | + | ||
| 100 | + if (type === 'disabled') { | ||
| 101 | + return false | ||
| 102 | + } | ||
| 103 | + | ||
| 104 | + if (type === 'cli') { | ||
| 105 | + // 检查命令是否存在 | ||
| 106 | + const { exec } = require('child_process') | ||
| 107 | + return new Promise((resolve) => { | ||
| 108 | + exec(`${MARKITDOWN_CONFIG.cli.command} --version`, (error) => { | ||
| 109 | + resolve(!error) | ||
| 110 | + }) | ||
| 111 | + }) | ||
| 112 | + } | ||
| 113 | + | ||
| 114 | + if (type === 'http') { | ||
| 115 | + // 检查 HTTP 服务是否可访问 | ||
| 116 | + try { | ||
| 117 | + const response = await fetch(MARKITDOWN_CONFIG.http.url, { | ||
| 118 | + method: 'HEAD', | ||
| 119 | + timeout: 5000 | ||
| 120 | + }) | ||
| 121 | + return response.ok | ||
| 122 | + } catch { | ||
| 123 | + return false | ||
| 124 | + } | ||
| 125 | + } | ||
| 126 | + | ||
| 127 | + if (type === 'docker') { | ||
| 128 | + // 检查 Docker 是否可用 | ||
| 129 | + const { exec } = require('child_process') | ||
| 130 | + return new Promise((resolve) => { | ||
| 131 | + exec('docker --version', (error) => { | ||
| 132 | + resolve(!error) | ||
| 133 | + }) | ||
| 134 | + }) | ||
| 135 | + } | ||
| 136 | + | ||
| 137 | + return false | ||
| 138 | +} | ||
| 139 | + | ||
| 140 | +/** | ||
| 141 | + * 检查 AI 服务是否配置 | ||
| 142 | + * | ||
| 143 | + * @returns {boolean} 是否已配置 API Key | ||
| 144 | + */ | ||
| 145 | +export function checkAIServiceConfigured() { | ||
| 146 | + const { type } = AI_SERVICE_CONFIG | ||
| 147 | + | ||
| 148 | + if (type === 'disabled') { | ||
| 149 | + return false | ||
| 150 | + } | ||
| 151 | + | ||
| 152 | + if (type === 'openai') { | ||
| 153 | + return !!AI_SERVICE_CONFIG.openai.apiKey | ||
| 154 | + } | ||
| 155 | + | ||
| 156 | + if (type === 'anthropic') { | ||
| 157 | + return !!AI_SERVICE_CONFIG.anthropic.apiKey | ||
| 158 | + } | ||
| 159 | + | ||
| 160 | + if (type === 'openrouter') { | ||
| 161 | + return !!AI_SERVICE_CONFIG.openrouter.apiKey | ||
| 162 | + } | ||
| 163 | + | ||
| 164 | + return false | ||
| 165 | +} | ||
| 166 | + | ||
| 167 | +/** | ||
| 168 | + * 打印配置状态 | ||
| 169 | + */ | ||
| 170 | +export function printConfigStatus() { | ||
| 171 | + console.log('\n🔧 文档解析服务配置状态:') | ||
| 172 | + console.log('─'.repeat(50)) | ||
| 173 | + | ||
| 174 | + // markitdown 状态 | ||
| 175 | + const markitdownType = MARKITDOWN_CONFIG.type | ||
| 176 | + console.log(`📄 markitdown: ${markitdownType === 'disabled' ? '❌ 未配置' : '✅ ' + markitdownType}`) | ||
| 177 | + | ||
| 178 | + // AI 服务状态 | ||
| 179 | + const aiType = AI_SERVICE_CONFIG.type | ||
| 180 | + console.log(`🤖 AI 服务: ${aiType === 'disabled' ? '❌ 未配置' : '✅ ' + aiType}`) | ||
| 181 | + | ||
| 182 | + if (aiType !== 'disabled') { | ||
| 183 | + const configured = checkAIServiceConfigured() | ||
| 184 | + console.log(` API Key: ${configured ? '✅ 已配置' : '❌ 未配置'}`) | ||
| 185 | + } | ||
| 186 | + | ||
| 187 | + console.log('─'.repeat(50)) | ||
| 188 | + console.log('') | ||
| 189 | + console.log('💡 配置提示:') | ||
| 190 | + console.log(' 1. 使用 markitdown: 安装 Python 并运行 "pip install markitdown"') | ||
| 191 | + console.log(' 2. 配置 AI 服务: 设置环境变量(.env 文件)') | ||
| 192 | + console.log(' - OPENAI_API_KEY: OpenAI API Key') | ||
| 193 | + console.log(' - ANTHROPIC_API_KEY: Anthropic API Key') | ||
| 194 | + console.log(' - AI_SERVICE_TYPE: openai | anthropic | openrouter') | ||
| 195 | + console.log('') | ||
| 196 | +} |
| ... | @@ -365,7 +365,8 @@ export const PLAN_TEMPLATES = { | ... | @@ -365,7 +365,8 @@ export const PLAN_TEMPLATES = { |
| 365 | form_schema: savingsFormSchema, | 365 | form_schema: savingsFormSchema, |
| 366 | submit_mapping: savingsSubmitMapping | 366 | submit_mapping: savingsSubmitMapping |
| 367 | } | 367 | } |
| 368 | - }} | 368 | + }, |
| 369 | +} | ||
| 369 | 370 | ||
| 370 | /** | 371 | /** |
| 371 | * 全局功能开关 | 372 | * 全局功能开关 | ... | ... |
-
Please register or login to post a comment