// product-splitter.js (6.54 KB)
//
// Raw | Blame | History | Permalink

/**
 * 产品分割器
 *
 * @description 从包含多个保险产品的文档中识别并分割出各个产品
 * @module scripts/product-splitter
 * @author Claude Code
 * @created 2026-02-15
 */

/**
 * Regular expressions that recognise a product title line.
 *
 * Every pattern is anchored at the start of a line (`^` with the `m` flag) and
 * looks for one of the product keywords (計劃 / 计划 / 保障 / 保险 / 壽險 / 壽险)
 * and/or a leading product code made of 2-4 uppercase letters plus an optional
 * digit.
 *
 * Example title lines:
 * - GS宏摯傳承保障計劃 - 性別, 年齡, 出生年月日
 * - GC宏摯家傳承保險計劃- 性別, 年齡, 出生年月日
 * - FA 宏浚傳承保障計劃
 * - LV2 赤霞珠終身壽險計劃2基本人壽保障選項
 *
 * NOTE: all patterns carry the `g` flag, so `exec`/`test` on these shared
 * objects is stateful through `lastIndex`; reset it (or clone the regex)
 * before reuse.
 * NOTE(review): within the source visible here this array is only exported,
 * not used by the functions below — confirm whether external callers need it.
 */
const PRODUCT_TITLE_PATTERNS = [
  // Product code, optional whitespace, then a name of 2-30 non-hyphen
  // characters leading up to a keyword, followed by the rest of the line.
  // e.g. GS宏摯傳承保障計劃 - 性別, 年齡, 出生年月日
  /^([A-Z]{2,4}\d?)\s*([^\n\-]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm,

  // Product code, at least one whitespace character, then a name that ends
  // at the first keyword reached after 2-30 characters.
  // e.g. FA 宏浚傳承保障計劃
  /^([A-Z]{2,4}\d?)\s+([^\n]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险))/gm,

  // Name only (no code): 2-30 characters leading up to a keyword, followed by
  // the rest of the line.
  // e.g. 宏摯傳承保障計劃
  /^([^\n]{2,30}?(?:計劃|计划|保障|保险|壽險|壽险)[^\n]*)/gm,

  // Any line that starts with a product code followed by "-" or ":"; the
  // remainder of the line is captured as the name.
  /^([A-Z]{2,4}\d?)\s*[-:]\s*([^\n]+)/gm
]

/**
 * Known product-code prefixes, used for priority matching of title lines.
 *
 * The order of the entries is preserved as written; 'LV2' is listed ahead of
 * the shorter 'LV'.
 */
const PRODUCT_CODE_PREFIXES = [
  'GS',
  'GC',
  'FA',
  'LV2',
  'LV',
  'CR',
  'HR',
  'PR',
  'SR',
  'TR',
  'UR',
  'WR',
  'XR',
  'YR',
  'ZR'
]

/**
 * Count the products contained in a document.
 *
 * The count is simply the number of title entries reported by
 * findProductTitles for the same content.
 *
 * @param {string} content - Document text
 * @returns {number} Number of product titles found
 */
export function detectProductCount(content) {
  const { length: productCount } = findProductTitles(content)
  return productCount
}

/**
 * Find every product title line in a document.
 *
 * Two strategies are applied:
 * 1. For each entry of PRODUCT_CODE_PREFIXES, look for lines that start with
 *    that prefix (plus an optional digit) followed by a CJK name containing
 *    one of the product keywords. A code that shows up more than once is
 *    reported only for its first title.
 * 2. Only when strategy 1 finds nothing, accept any line that contains a
 *    product keyword, optionally preceded by a 2-4 letter code (plus an
 *    optional digit). Titles shorter than 5 characters are ignored.
 *
 * A code and its name must be on the same line: only horizontal whitespace is
 * allowed between them, so a stray uppercase token on one line is never glued
 * to a title on the following line, and `index` always points into the title
 * line itself.
 *
 * @param {string} content - Document text
 * @returns {Array<{index: number, code: (string|null), name: string, fullTitle: string}>}
 *   Title entries ordered by position; empty when `content` is not a
 *   non-empty string or nothing matches
 */
export function findProductTitles(content) {
  if (typeof content !== 'string' || content.length === 0) return []

  // Whitespace that stays on the current line (`\s` would also match "\n").
  const SAME_LINE_GAP = '[^\\S\\r\\n]*'
  const KEYWORDS = '(?:計劃|计划|保障|保险|壽險|壽险)'
  // Prefixes are plain text, so neutralise any regex metacharacters in them.
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')

  const products = []
  const seenCodes = new Set()

  // Strategy 1: titles that start with a known product-code prefix,
  // e.g. "GS宏摯傳承保障計劃" or "GS 宏摯傳承保障計劃".
  for (const prefix of PRODUCT_CODE_PREFIXES) {
    const prefixedTitle = new RegExp(
      `^(${escapeRegExp(prefix)}\\d?)${SAME_LINE_GAP}([\\u4e00-\\u9fa5]+${KEYWORDS}[^\\n]*)`,
      'gm'
    )

    for (const match of content.matchAll(prefixedTitle)) {
      const code = match[1]

      // Overlapping prefixes ('LV2' and 'LV') and repeated headers yield the
      // same code; keep a single entry per code.
      if (seenCodes.has(code)) continue
      seenCodes.add(code)

      products.push({
        index: match.index,
        code,
        name: match[2].trim(),
        fullTitle: match[0].trim()
      })
    }
  }

  // Strategy 2: generic fallback — any line containing a product keyword.
  if (products.length === 0) {
    const genericTitle = new RegExp(
      `^([A-Z]{2,4}\\d?)?${SAME_LINE_GAP}([^\\n]*?${KEYWORDS}[^\\n]*)`,
      'gm'
    )

    for (const match of content.matchAll(genericTitle)) {
      const fullTitle = match[0].trim()
      if (fullTitle.length < 5) continue // too short to be a real title

      products.push({
        index: match.index,
        code: match[1] || null,
        // Trim so CRLF input does not leave a trailing "\r" in the name.
        name: match[2].trim() || fullTitle,
        fullTitle
      })
    }
  }

  // Strategy 1 collects per prefix, so restore document order.
  products.sort((a, b) => a.index - b.index)

  return products
}

/**
 * Split a document into one entry per product.
 *
 * - No title found: a single entry holding the whole document, with `code`,
 *   `name` and `fullTitle` set to null.
 * - Exactly one title: a single entry holding the whole (untrimmed) document,
 *   labelled with that title.
 * - Several titles: one entry per title, whose content runs from the title's
 *   position up to the next title (or the end of the document) and is
 *   trimmed. Text located before the first title is not part of any entry.
 *
 * @param {string} content - Document text
 * @returns {Array<{code: (string|null), name: (string|null), content: string, fullTitle: (string|null)}>}
 *   Entries in document order
 */
export function splitByProducts(content) {
  const titles = findProductTitles(content)

  // Shared shape of every returned entry.
  const toEntry = ({ code, name, fullTitle }, body) => ({
    code,
    name,
    content: body,
    fullTitle
  })

  // Zero or one title: the document is returned as a single piece.
  if (titles.length <= 1) {
    const [only = { code: null, name: null, fullTitle: null }] = titles
    return [toEntry(only, content)]
  }

  // Several titles: cut at each title position.
  return titles.map((title, position) => {
    const following = titles[position + 1]
    const stop = following ? following.index : content.length
    return toEntry(title, content.slice(title.index, stop).trim())
  })
}

/**
 * Extract a normalised product name from a title, falling back to the content.
 *
 * The title is tried first: a leading product code (2-4 uppercase letters plus
 * an optional digit, optionally followed by "-", ":" or "：") is removed and
 * everything from the first "-", "—", ":" or "：" onwards is dropped (e.g. the
 * "- 性別, 年齡, 出生年月日" suffix). If that leaves fewer than 3 characters,
 * the content is searched for a "产品名称:" line, a "计划书名称:" line, or a
 * code-plus-name phrase ending in a product keyword, and the same clean-up is
 * applied to the first usable hit.
 *
 * @param {string} fullTitle - Full product title (may be empty or missing)
 * @param {string} content - Product content fragment (may be empty or missing)
 * @returns {string|null} The product name, or null when none can be derived
 */
export function extractProductName(fullTitle, content) {
  // Strip the code prefix and trailing description; reject names of <= 2 chars.
  const cleanName = (raw) => {
    const withoutCode = raw.replace(/^[A-Z]{2,4}\d?\s*[-:：]?\s*/, '')
    const name = withoutCode.split(/[-—:：]/)[0].trim()
    return name.length > 2 ? name : null
  }

  // Prefer the title when it yields a usable name.
  if (typeof fullTitle === 'string' && fullTitle) {
    const fromTitle = cleanName(fullTitle)
    if (fromTitle) return fromTitle
  }

  // The title may be unusable while no content was supplied; without this
  // guard the lookups below would throw on a null/undefined content.
  if (typeof content !== 'string' || !content) return null

  const patterns = [
    /产品名称[：:]\s*([^\n]+)/,
    /计划书名称[：:]\s*([^\n]+)/,
    /([A-Z]{2,4}\d?\s*[\u4e00-\u9fa5]+(?:計劃|计划|保障|保险|壽險|壽险))/
  ]

  for (const pattern of patterns) {
    const match = content.match(pattern)
    if (!match) continue

    const fromContent = cleanName(match[1] || match[0])
    if (fromContent) return fromContent
  }

  return null
}

/**
 * Build a Markdown report describing the result of a product split.
 *
 * The report has a statistics section (document length and product count)
 * followed by a table with one row per product: 1-based position, code, name
 * (capped at 30 characters) and content length. A missing code is shown as
 * "-"; a missing name falls back to the first 20 characters of `fullTitle`,
 * then to "-".
 *
 * @param {string} content - Original document text
 * @param {Array} products - Split entries (each with code, name, content, fullTitle)
 * @returns {string} Report in Markdown format
 */
export function generateSplitReport(content, products) {
  const header = [
    `## 📊 产品分割报告\n\n`,
    `### 分割统计\n\n`,
    `- 文档总长度: ${content.length} 字符\n`,
    `- 识别产品数: ${products.length} 个\n\n`,
    `### 产品列表\n\n`,
    `| 序号 | 产品代码 | 产品名称 | 内容长度 |\n`,
    `|------|---------|---------|----------|\n`
  ].join('')

  const rows = products.map((entry, position) => {
    const codeCell = entry.code || '-'
    const label = entry.name || entry.fullTitle?.slice(0, 20) || '-'
    const nameCell = label.slice(0, 30)
    return `| ${position + 1} | ${codeCell} | ${nameCell} | ${entry.content.length} 字符 |\n`
  })

  return header + rows.join('')
}

// Expose the module-level constants as named exports, in addition to the
// functions that are exported at their declarations.
export {
  PRODUCT_TITLE_PATTERNS,
  PRODUCT_CODE_PREFIXES
}