# htmlProcess/htmlReportProcess_picHtml/htmlReportProcess_picHtml_2kV1.py

import os
import re
import sys
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from matplotlib.lines import Line2D
from typing import Optional, Tuple, List, Dict, Any, Union
from pathlib import Path
import numpy as np
import base64
from io import BytesIO
from jinja2 import Template
from colorama import Fore, Style, init
import multiprocessing as mp
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import psutil

# Initialize colorama; autoreset=True is passed so colors do not leak into later prints
init(autoreset=True)

# Silence pandas' SettingWithCopy warning so it does not clutter the console output
pd.options.mode.chained_assignment = None

# Font fallback list so matplotlib can render Chinese text
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'Microsoft YaHei']
# Disable the Unicode minus glyph on axes (the fonts above may not provide it)
plt.rcParams['axes.unicode_minus'] = False

# Performance tuning options used by the file-loading code.
OPTIMIZATION_CONFIG = {
    'max_workers': min(mp.cpu_count(), 8),  # cap the number of worker threads at 8
    'chunk_size': 50000,  # chunk size intended for chunked reads
    'use_threading': True,  # load multiple files with a thread pool
    # Memory budget in GB: 70% of the RAM available at import time. True division
    # keeps the fractional gigabytes; floor division would round down to whole GB
    # first (and yield 0.0 whenever less than 1 GB is free).
    'memory_limit_gb': psutil.virtual_memory().available / (1024 ** 3) * 0.7,
}

# Jinja2 HTML report template: file list, test cards grouped by keyword, and an optional summary comparison section
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>测试报告分析 - {{ keyword }}</title>
    <style>
        /* 样式保持不变，根据需要调整 */
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .header {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 20px;
            border-radius: 10px;
            margin-bottom: 20px;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
        }
        .test-card {
            background: white;
            border-radius: 10px;
            padding: 20px;
            margin-bottom: 20px;
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
            transition: transform 0.2s ease;
        }
        .test-card:hover {
            transform: translateY(-2px);
            box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15);
        }
        .test-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 15px;
            padding-bottom: 10px;
            border-bottom: 2px solid #eaeaea;
        }
        .test-title {
            font-size: 18px;
            font-weight: bold;
            color: #333;
        }
        .test-stats {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 15px;
            margin-bottom: 15px;
        }
        .stat-item {
            background: #f8f9fa;
            padding: 12px;
            border-radius: 8px;
            text-align: center;
        }
        .stat-label {
            font-size: 12px;
            color: #666;
            margin-bottom: 5px;
        }
        .stat-value {
            font-size: 16px;
            font-weight: bold;
            color: #333;
        }
        .plot-container {
            text-align: center;
            margin: 20px 0;
        }
        .plot-image {
            max-width: 100%;
            height: auto;
            border-radius: 8px;
            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
        }
        .sn-plots-container {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
            gap: 20px;
            margin: 20px 0;
        }
        .sn-plot-item {
            background: #f8f9fa;
            padding: 15px;
            border-radius: 8px;
            text-align: center;
        }
        .sn-plot-title {
            font-size: 14px;
            font-weight: bold;
            margin-bottom: 10px;
            color: #555;
        }
        .summary {
            background: white;
            border-radius: 10px;
            padding: 20px;
            margin-top: 20px;
            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
        }
        .summary-item {
            margin: 10px 0;
            padding: 10px;
            background: #f8f9fa;
            border-radius: 6px;
        }
        .timestamp {
            text-align: center;
            color: #666;
            margin-top: 30px;
            font-size: 12px;
        }
        .badge {
            padding: 4px 8px;
            border-radius: 12px;
            font-size: 12px;
            font-weight: bold;
        }
        .badge-success {
            background: #d4edda;
            color: #155724;
        }
        .badge-warning {
            background: #fff3cd;
            color: #856404;
        }
        .badge-danger {
            background: #f8d7da;
            color: #721c24;
        }
        .section-title {
            font-size: 16px;
            font-weight: bold;
            margin: 20px 0 10px 0;
            color: #333;
            border-left: 4px solid #667eea;
            padding-left: 10px;
        }
        .file-info {
            background: #e7f3ff;
            padding: 10px;
            border-radius: 6px;
            margin: 10px 0;
        }
        .comparison-section {
            background: white;
            border-radius: 10px;
            padding: 20px;
            margin-top: 30px;
        }
        .comparison-plots {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(500px, 1fr));
            gap: 20px;
            margin: 20px 0;
        }
        .keyword-group {
            background: #f0f8ff;
            border-left: 4px solid #4169e1;
            padding: 15px;
            margin: 20px 0;
            border-radius: 8px;
        }
        .keyword-title {
            font-size: 18px;
            font-weight: bold;
            color: #4169e1;
            margin-bottom: 10px;
        }
        .sub-test-card {
            background: #f8f9fa;
            border-radius: 8px;
            padding: 15px;
            margin: 10px 0;
            border-left: 3px solid #667eea;
        }
    </style>
</head>
<body>
    <div class="header">
        <h1>📊 测试报告分析 (多文件合并)</h1>
        <p>关键词: <strong>{{ keyword }}</strong> | 生成时间: {{ timestamp }}</p>
        <p>共分析 {{ test_count }} 个测试项，{{ total_points }} 个数据点，来自 {{ file_count }} 个文件</p>
        {% if keyword_groups|length > 1 %}
        <p>关键词分组: {{ keyword_groups|length }} 组</p>
        {% endif %}
    </div>

    <div class="summary">
        <h3>📁 处理的文件列表</h3>
        {% for file_info in file_infos %}
        <div class="file-info">
            <strong>{{ loop.index }}. {{ file_info.filename }}</strong><br>
            路径: {{ file_info.path }}<br>
            数据行数: {{ file_info.rows }} | 测试项数: {{ file_info.tests }}
        </div>
        {% endfor %}
    </div>

    <!-- 按关键词分组显示测试项 -->
    {% for group_name, group_tests in keyword_groups.items() %}
    <div class="keyword-group">
        <div class="keyword-title">🔍 关键词组: {{ group_name }}</div>

        <!-- 显示该关键词组下的所有测试项 -->
        {% for test_name, test_list in group_tests.items() %}
        <div class="sub-test-card">
            <div class="test-header">
                <div class="test-title">📋 {{ test_name }}</div>
                {% if test_list[0].status %}
                <div class="badge badge-{{ test_list[0].status }}">
                    {{ test_list[0].status_display }}
                </div>
                {% endif %}
            </div>

            {% for test in test_list %}
            <div class="test-stats">
                <div class="stat-item">
                    <div class="stat-label">数据点数</div>
                    <div class="stat-value">{{ test.stats.count }}</div>
                </div>
                <div class="stat-item">
                    <div class="stat-label">平均值</div>
                    <div class="stat-value">{{ "%.4f"|format(test.stats.mean) }}</div>
                </div>
                <div class="stat-item">
                    <div class="stat-label">中位数</div>
                    <div class="stat-value">{{ "%.4f"|format(test.stats.median) }}</div>
                </div>
                <div class="stat-item">
                    <div class="stat-label">标准差</div>
                    <div class="stat-value">{{ "%.4f"|format(test.stats.std) }}</div>
                </div>
                <div class="stat-item">
                    <div class="stat-label">最小值</div>
                    <div class="stat-value">{{ "%.4f"|format(test.stats.min) }}</div>
                </div>
                <div class="stat-item">
                    <div class="stat-label">最大值</div>
                    <div class="stat-value">{{ "%.4f"|format(test.stats.max) }}</div>
                </div>
            </div>

            {% if test.limits.lower is not none or test.limits.upper is not none %}
            <div class="test-stats">
                {% if test.limits.lower is not none %}
                <div class="stat-item">
                    <div class="stat-label">下限值</div>
                    <div class="stat-value">{{ "%.4f"|format(test.limits.lower) }}</div>
                </div>
                {% endif %}
                {% if test.limits.upper is not none %}
                <div class="stat-item">
                    <div class="stat-label">上限值</div>
                    <div class="stat-value">{{ "%.4f"|format(test.limits.upper) }}</div>
                </div>
                {% endif %}
            </div>
            {% endif %}

            <!-- 汇总图 -->
            <div class="section-title">📈 汇总视图 (所有SN)</div>
            <div class="plot-container">
                <img src="data:image/png;base64,{{ test.summary_plot_image }}" alt="{{ test.name }} 汇总散点图" class="plot-image">
            </div>

            <!-- SN独立图 -->
            {% if test.sn_plot_images %}
            <div class="section-title">🔍 SN独立视图 ({{ test.sn_plot_images|length }}个SN)</div>
            <div class="sn-plots-container">
                {% for sn_plot in test.sn_plot_images %}
                <div class="sn-plot-item">
                    <div class="sn-plot-title">
                        {% if sn_plot.cell_no %}
                            Cell: {{ sn_plot.cell_no }}, SN: {{ sn_plot.sn_no }}
                        {% else %}
                            SN: {{ sn_plot.sn }}
                        {% endif %}
                    </div>
                    {% if sn_plot.has_cell_data %}
                    <div class="cell-info">
                        <small>Cell编号信息:</small>
                        <table style="width:100%; font-size:12px; margin:10px 0; border-collapse: collapse;">
                            <tr style="background-color:#f0f0f0;">
                                <th style="padding:5px; border:1px solid #ddd;">Cell编号</th>
                                <th style="padding:5px; border:1px solid #ddd;">数据点</th>
                                <th style="padding:5px; border:1px solid #ddd;">均值</th>
                                <th style="padding:5px; border:1px solid #ddd;">标准差</th>
                                <th style="padding:5px; border:1px solid #ddd;">最小值</th>
                                <th style="padding:5px; border:1px solid #ddd;">最大值</th>
                            </tr>
                            {% for cell_no, stats_dict in sn_plot.cell_info.items() %}
                            <tr>
                                <td style="padding:5px; border:1px solid #ddd;">{{ cell_no }}</td>
                                <td style="padding:5px; border:1px solid #ddd;">{{ stats_dict.count }}</td>
                                <td style="padding:5px; border:1px solid #ddd;">{{ "%.4f"|format(stats_dict.mean) }}</td>
                                <td style="padding:5px; border:1px solid #ddd;">{{ "%.4f"|format(stats_dict.std) }}</td>
                                <td style="padding:5px; border:1px solid #ddd;">{{ "%.4f"|format(stats_dict.min) }}</td>
                                <td style="padding:5px; border:1px solid #ddd;">{{ "%.4f"|format(stats_dict.max) }}</td>
                            </tr>
                            {% endfor %}

                        </table>
                    </div>
                    {% endif %}
                    <img src="data:image/png;base64,{{ sn_plot.image }}" alt="{{ test.name }} - SN {{ sn_plot.sn }} 散点图" class="plot-image">
                </div>
                {% endfor %}
            </div>
            {% endif %}


            {% if not loop.last %}
            <hr style="margin: 20px 0; border: none; border-top: 1px solid #e0e0e0;">
            {% endif %}
            {% endfor %}
        </div>
        {% endfor %}
    </div>
    {% endfor %}

    <!-- 汇总对比报告 -->
    {% if comparison_plots %}
    <div class="comparison-section">
        <h2>📊 汇总对比报告</h2>
        <div class="comparison-plots">
            {% for plot in comparison_plots %}
            <div class="plot-container">
                <h3>{{ plot.title }}</h3>
                <img src="data:image/png;base64,{{ plot.image }}" alt="{{ plot.title }}" class="plot-image">
                {% if plot.description %}
                <p>{{ plot.description }}</p>
                {% endif %}
            </div>
            {% endfor %}
        </div>
    </div>
    {% endif %}

    <div class="summary">
        <h3>📈 分析摘要</h3>
        <div class="summary-item">
            <strong>文件夹路径:</strong> {{ folder_path }}
        </div>
        <div class="summary-item">
            <strong>分析时间:</strong> {{ analysis_time }}秒
        </div>
        <div class="summary-item">
            <strong>关键词分组:</strong> {{ keyword_groups|length }} 组
            <ul>
                {% for group_name, group_tests in keyword_groups.items() %}
                <li>{{ group_name }}: {{ group_tests|length }} 个测试项</li>
                {% endfor %}
            </ul>
        </div>
        <div class="summary-item">
            <strong>测试项分布:</strong>
            <ul>
                <li>正常: {{ status_counts.normal }} 个</li>
                <li>警告: {{ status_counts.warning }} 个</li>
                <li>异常: {{ status_counts.abnormal }} 个</li>
            </ul>
        </div>
        <div class="summary-item">
            <strong>数据摘要:</strong>
            <ul>
                <li>总文件数: {{ file_count }} 个</li>
                <li>总数据行数: {{ total_rows }} 行</li>
                <li>总测试项数: {{ test_count }} 个</li>
                <li>总数据点数: {{ total_points }} 个</li>
            </ul>
        </div>
    </div>

    <div class="timestamp">
        报告生成于 {{ timestamp }} | 多文件测试报告分析系统
    </div>
</body>
</html>
"""


class MultiFileTestReportScatterPlotter:
    def __init__(self):
        """Create a plotter with no folder selected, no data loaded and no output paths."""
        # Columns every input sheet must provide; "Cell" holds the cell number.
        self.required_columns = [
            "Test Name New", "SN", "Measurement", "Test Time",
            "Lower Limit", "Upper Limit", "Cell",
        ]

        # Input folder, merged data and per-file metadata.
        self.folder_path: Optional[str] = None
        self.df: Optional[pd.DataFrame] = None
        self.file_infos: List[Dict[str, Any]] = []

        # Names of the lower/upper limit columns (unknown until data is loaded).
        self.col_lower: Optional[str] = None
        self.col_upper: Optional[str] = None

        # Output directory and report file paths.
        self.output_dir: Optional[str] = None
        self.html_report_path: Optional[str] = None
        self.comparison_report_path: Optional[str] = None

        # Cache for data that has already been processed.
        self._processed_data_cache: Dict[str, Any] = {}

        # Performance monitoring samples, one independent list per metric.
        self.performance_stats = {
            metric: [] for metric in ('load_times', 'memory_usage', 'file_sizes')
        }

    def _print_stage(self, msg: str, color=Fore.CYAN) -> None:
        """Print a stage banner: ``msg`` framed by two 50-character rules in ``color``."""
        rule = '=' * 50
        print(f"\n{color}{rule}")
        print(f"📋 {msg}")
        print(f"{rule}{Style.RESET_ALL}")

    def _print_progress(self, current: int, total: int, prefix: str = "进度",
                        color=Fore.YELLOW) -> None:
        """Redraw a single-line console progress bar.

        Args:
            current: Number of finished items.
            total: Number of items overall; nothing is printed when it is <= 0.
            prefix: Label shown before the bar.
            color: Colour prefix written before the bar.
        """
        if total <= 0:
            return

        width = 40
        done_cells = int(width * current / total)
        bar = "█" * done_cells + "░" * (width - done_cells)
        percent = (current / total) * 100

        # "\r" returns to the start of the line so the bar overwrites itself.
        line = f"\r{color}{prefix}: [{bar}] {current}/{total} ({percent:.1f}%){Style.RESET_ALL}"
        sys.stdout.write(line)
        sys.stdout.flush()

        # Finish the line once the last item has been reported.
        if current == total:
            print(f"{Fore.GREEN} ✅ 完成{Style.RESET_ALL}")

    def _print_warning(self, msg: str) -> None:
        """Print ``msg`` as a yellow warning line."""
        line = f"{Fore.YELLOW}⚠️  {msg}{Style.RESET_ALL}"
        print(line)

    def _print_success(self, msg: str) -> None:
        """Print ``msg`` as a green success line."""
        line = f"{Fore.GREEN}✅  {msg}{Style.RESET_ALL}"
        print(line)

    def _print_error(self, msg: str) -> None:
        """Print ``msg`` as a red error line."""
        line = f"{Fore.RED}❌  {msg}{Style.RESET_ALL}"
        print(line)

    def _get_memory_usage(self) -> float:
        """Return the resident set size (RSS) of the current process in GB."""
        rss_bytes = psutil.Process().memory_info().rss
        return rss_bytes / (1024 ** 3)

    def _check_memory_safe(self, file_size_mb: float) -> bool:
        """Return True when loading a file of ``file_size_mb`` MB looks safe.

        The file is estimated to need five times its on-disk size in memory, and
        1 GB of available RAM is kept in reserve on top of that estimate.
        """
        estimated_need = file_size_mb * 5 / 1024  # estimated requirement in GB
        free_gb = psutil.virtual_memory().available / (1024 ** 3)
        return free_gb > estimated_need + 1

    def _load_single_file_optimized(self, file_info: Dict[str, Any]) -> Optional[pd.DataFrame]:
        """Load the required columns of one Excel workbook into a DataFrame.

        The sheet is picked in this order: "Merged All Tests", "All Tests", then
        the first sheet of the workbook. Cells are read with ``dtype='object'``
        and ``na_filter=False``, so empty cells are not converted to NaN.

        Args:
            file_info: Dict with at least ``'path'`` and ``'filename'``. On
                success it is updated in place with ``'load_time'``,
                ``'file_size_mb'``, ``'engine'`` and ``'rows'``.

        Returns:
            The loaded frame with an added ``'_source_file'`` column, or ``None``
            when the file is skipped (memory check failed, unsupported
            extension, no sheet, empty sheet, missing required columns) or any
            error occurs. Problems are reported on the console, never raised.
        """
        file_path = file_info['path']
        filename = file_info['filename']

        try:
            start_time = time.time()
            file_size_mb = os.path.getsize(file_path) / (1024 ** 2)

            # Memory safety check
            if not self._check_memory_safe(file_size_mb):
                self._print_warning(f"内存不足，跳过大文件: {filename} ({file_size_mb:.1f}MB)")
                return None

            # Pick the reader engine from the file extension
            lowered_path = file_path.lower()
            if lowered_path.endswith('.xlsx'):
                engine = 'openpyxl'
            elif lowered_path.endswith('.xls'):
                engine = 'xlrd'
            else:
                self._print_warning(f"不支持的文件格式: {filename}")
                return None

            try:
                excel_file = pd.ExcelFile(file_path, engine=engine)
            except Exception as e:
                self._print_warning(f"无法读取工作表信息 {filename}: {e}")
                return None

            required = set(self.required_columns)

            # The workbook is opened once and reused for both the sheet lookup and
            # the read; the context manager closes the file handle afterwards.
            with excel_file:
                sheet_names = excel_file.sheet_names

                target_sheets = ["Merged All Tests", "All Tests", sheet_names[0] if sheet_names else None]
                selected_sheet = next((s for s in target_sheets if s and s in sheet_names), None)

                if not selected_sheet:
                    self._print_warning(f"未找到目标工作表: {filename}")
                    return None

                # read_excel has no chunked mode (no ``chunksize`` argument), so the
                # sheet is read in one call regardless of file size; the memory
                # check above is the guard against oversized files.
                # A callable ``usecols`` keeps only the required columns without
                # raising when one is absent, so the explicit missing-column check
                # below can report exactly which ones are missing.
                df = pd.read_excel(
                    excel_file,
                    sheet_name=selected_sheet,
                    dtype='object',
                    na_filter=False,
                    usecols=lambda column: column in required,
                )

            if df.empty:
                self._print_warning(f"文件为空: {filename}")
                return None

            # Check that every required column is present
            missing_columns = [col for col in self.required_columns if col not in df.columns]
            if missing_columns:
                self._print_warning(f"缺少必要列 {filename}: {missing_columns}")
                return None

            # Tag each row with the file it came from
            df['_source_file'] = filename

            load_time = time.time() - start_time
            file_info.update({
                'load_time': round(load_time, 2),
                'file_size_mb': round(file_size_mb, 2),
                'engine': engine,
                'rows': len(df)
            })

            self.performance_stats['load_times'].append(load_time)
            self.performance_stats['file_sizes'].append(file_size_mb)
            self.performance_stats['memory_usage'].append(self._get_memory_usage())

            self._print_success(f"加载完成: {filename} ({len(df)}行, {load_time:.2f}s)")
            return df

        except Exception as e:
            # Best-effort loader: one bad file must not abort the whole batch.
            self._print_error(f"加载文件失败 {filename}: {e}")
            return None

    def _find_column_case_insensitive(self, candidates: List[str]) -> Optional[str]:
        """优化的大小写不敏感列查找"""
        if self.df is None:
            return None

        columns_lower = {col.lower().strip(): col for col in self.df.columns}
        for candidate in candidates:
            key = candidate.lower().strip()
            if key in columns_lower:
                return columns_lower[key]
        return None

    # The methods below are unchanged from the previous revision.
    def get_folder_path(self) -> None:
        """Prompt until the user enters an existing directory.

        The resolved absolute path is stored in ``self.folder_path``. Blank
        input re-prompts silently; an invalid path prints an error first.
        """
        self._print_stage("输入文件夹路径")

        while True:
            print(f"{Fore.WHITE}请输入包含Excel文件的文件夹路径: ")
            raw_path = input("> ").strip()
            if not raw_path:
                continue

            candidate = Path(raw_path)
            if not (candidate.exists() and candidate.is_dir()):
                self._print_error(f"文件夹不存在: {raw_path}，请重新输入")
                continue

            self.folder_path = str(candidate.resolve())
            print(f"{Fore.GREEN}已选择文件夹: {self.folder_path}{Style.RESET_ALL}")
            return

    def find_excel_files(self) -> List[str]:
        """Recursively collect all ``.xlsx`` / ``.xls`` files under ``self.folder_path``.

        Returns:
            Resolved absolute paths sorted as strings (i.e. by full path). An
            empty list is returned, after printing the error, if scanning fails.
        """
        self._print_stage("扫描Excel文件")

        valid_extensions = ('.xlsx', '.xls')

        try:
            # Sorted by the full path string.
            excel_files = sorted(
                str(entry.resolve())
                for entry in Path(self.folder_path).rglob('*')
                if entry.suffix.lower() in valid_extensions and entry.is_file()
            )

            self._print_success(f"找到 {len(excel_files)} 个Excel文件")
            for position, found in enumerate(excel_files, 1):
                print(f"  {position:2d}. {os.path.basename(found)}")

            return excel_files

        except Exception as e:
            self._print_error(f"扫描文件夹时发生错误: {e}")
            return []

    def load_multiple_files_optimized(self, excel_files: List[str]) -> None:
        """Load all workbooks, merge them into ``self.df`` and locate the limit columns.

        Files are loaded with a thread pool when ``OPTIMIZATION_CONFIG['use_threading']``
        is set and more than one file is given, otherwise one after another.
        Successfully loaded files are recorded in ``self.file_infos``; the names
        of the lower/upper limit columns are stored in ``self.col_lower`` and
        ``self.col_upper``.

        Args:
            excel_files: Paths of the Excel files to load.

        Raises:
            ValueError: If no file could be loaded.
            Exception: Any error raised while concatenating is reported and re-raised.
        """
        self._print_stage("并行加载Excel文件")
        start_time = time.time()

        # One metadata dict per file; the loader enriches it on success
        file_infos = [{'path': path, 'filename': os.path.basename(path)} for path in excel_files]
        total = len(file_infos)

        all_dataframes = []
        self.file_infos = []

        if OPTIMIZATION_CONFIG['use_threading'] and total > 1:
            # Load in parallel with a thread pool
            with ThreadPoolExecutor(max_workers=OPTIMIZATION_CONFIG['max_workers']) as executor:
                futures = {executor.submit(self._load_single_file_optimized, file_info): file_info
                           for file_info in file_infos}

                # Results are collected in submission order so the merged rows
                # keep the order of ``excel_files``.
                for completed, (future, file_info) in enumerate(futures.items(), 1):
                    try:
                        df = future.result(timeout=300)  # 5-minute timeout per file
                    except Exception as e:
                        self._print_error(f"加载失败 {file_info['filename']}: {e}")
                    else:
                        if df is not None:
                            all_dataframes.append(df)
                            self.file_infos.append(file_info)
                    # Failed files count as processed too; otherwise the bar
                    # would never reach 100%.
                    self._print_progress(completed, total, "并行加载文件")
        else:
            # Sequential loading; progress is reported after each file is done
            for i, file_info in enumerate(file_infos, 1):
                df = self._load_single_file_optimized(file_info)
                if df is not None:
                    all_dataframes.append(df)
                    self.file_infos.append(file_info)
                self._print_progress(i, total, "加载文件")

        if not all_dataframes:
            raise ValueError("没有成功加载任何Excel文件")

        # Merge the data
        self._print_stage("合并数据")
        merge_start = time.time()

        try:
            self.df = pd.concat(all_dataframes, ignore_index=True, sort=False)
            merge_time = time.time() - merge_start

            total_time = time.time() - start_time
            load_times = self.performance_stats['load_times']
            avg_load_time = np.mean(load_times) if load_times else 0

            self._print_success(f"合并完成: {len(self.df)}行, {len(all_dataframes)}个文件")
            self._print_success(f"加载耗时: {total_time:.2f}s (平均: {avg_load_time:.2f}s/文件)")
            self._print_success(f"合并耗时: {merge_time:.2f}s")

            # Performance statistics; ``default`` keeps max() safe on an empty list
            peak_memory = max(self.performance_stats['memory_usage'], default=0.0)
            print(f"\n{Fore.CYAN}📊 性能统计:")
            print(f"  平均加载时间: {avg_load_time:.2f}s")
            print(f"  峰值内存使用: {peak_memory:.2f}GB")
            print(f"  总文件大小: {sum(self.performance_stats['file_sizes']):.1f}MB{Style.RESET_ALL}")

        except Exception as e:
            self._print_error(f"合并数据失败: {e}")
            raise

        # Remember the actual names of the lower/upper limit columns
        self.col_lower = self._find_column_case_insensitive([
            "Lower Limit", "lower limit", "lower_limit", "ll", "lower"
        ])
        self.col_upper = self._find_column_case_insensitive([
            "Upper Limit", "upper limit", "upper_limit", "ul", "upper"
        ])

    def get_keywords(self) -> Tuple[pd.DataFrame, str, List[str], Dict[str, List[str]]]:
        """Prompt for comma-separated keywords and filter ``self.df`` on 'Test Name New'.

        Each keyword is matched case-insensitively as a regular expression. A
        keyword that is not a valid pattern (for example one containing an
        unbalanced "(") is matched as literal text instead of aborting the search.

        Returns:
            A tuple ``(filtered_df, keyword_input, unique_tests, keyword_groups)``:

            * on a match: the matching rows, the raw input, the distinct test
              names, and a mapping keyword -> matched test names;
            * when the user chooses to use all data: a copy of ``self.df``, an
              empty keyword string, all test names, and a single default group;
            * when there is no data, the column is missing, or the user quits:
              ``(empty DataFrame, keyword_input, [], {})``.
        """
        self._print_stage("筛选关键词")

        while True:
            keyword_input = input("请输入筛选关键词(多个关键词用','分割，匹配 'Test Name New'): ").strip()

            if not keyword_input:
                print("❌ 关键词不能为空，请重新输入")
                continue

            # Data not loaded yet (None) is handled like an empty frame instead
            # of failing with AttributeError.
            if self.df is None or self.df.empty:
                print("⚠️ 数据框为空，无法进行筛选")
                return pd.DataFrame(), keyword_input, [], {}

            # The filter column must exist
            if "Test Name New" not in self.df.columns:
                print("❌ 列 'Test Name New' 不存在于数据框中")
                print(f"可用列: {list(self.df.columns)}")
                return pd.DataFrame(), keyword_input, [], {}

            try:
                # Split the input into individual keywords
                keywords = [k.strip() for k in keyword_input.split(',') if k.strip()]

                # Converted once and reused for every keyword
                test_names = self.df["Test Name New"].astype(str)

                # OR together the per-keyword masks
                mask = pd.Series(False, index=self.df.index)
                keyword_groups = {}

                for keyword in keywords:
                    try:
                        keyword_mask = test_names.str.contains(keyword, case=False, na=False)
                    except re.error:
                        # Not a valid regular expression: fall back to a plain
                        # substring match so such keywords remain searchable.
                        keyword_mask = test_names.str.contains(
                            keyword, case=False, na=False, regex=False
                        )
                    mask = mask | keyword_mask

                    # Record the test items matched by this keyword
                    matched_tests = self.df.loc[keyword_mask, "Test Name New"].unique().tolist()
                    keyword_groups[keyword] = matched_tests

                filtered_df = self.df.loc[mask].copy()

                if filtered_df.empty:
                    print(f"⚠️ 没有找到包含关键词 '{keyword_input}' 的测试项")

                    # Show a few available test items as a hint
                    available_tests = self.df["Test Name New"].dropna().unique()
                    if len(available_tests) > 0:
                        print("📋 可用的测试项示例:")
                        for test in available_tests[:5]:
                            print(f"  - {test}")
                        if len(available_tests) > 5:
                            print(f"  ... 还有 {len(available_tests) - 5} 个测试项")

                    # Let the user retry, use everything, or quit
                    choice = input("请选择: 1-重新输入关键词 2-使用所有数据 3-退出当前操作: ").strip()
                    if choice == "1":
                        continue
                    elif choice == "2":
                        filtered_df = self.df.copy()
                        unique_tests = filtered_df["Test Name New"].unique().tolist()
                        # A single default group holding every test item
                        keyword_groups = {"所有测试项": unique_tests}
                        print(f"✅ 使用所有数据: {len(filtered_df)} 行，{len(unique_tests)} 个测试项")
                        return filtered_df, "", unique_tests, keyword_groups
                    else:
                        print("👋 退出筛选操作")
                        return pd.DataFrame(), keyword_input, [], {}
                else:
                    unique_tests = filtered_df["Test Name New"].unique().tolist()
                    print(f"✅ 匹配到 {len(filtered_df)} 行数据，涉及 {len(unique_tests)} 个不同测试项")
                    print(f"📊 关键词分组: {len(keyword_groups)} 组")
                    for keyword, tests in keyword_groups.items():
                        print(f"  - '{keyword}': {len(tests)} 个测试项")

                    return filtered_df, keyword_input, unique_tests, keyword_groups

            except Exception as e:
                # Interactive loop: report the problem and ask again.
                print(f"❌ 筛选过程中发生错误: {e}")
                print("请检查数据格式或重新输入关键词")
                continue

    def create_output_dir(self, keyword) -> None:
        """Create the output directory and derive the report file paths.

        The directory is ``scatter_report_out`` inside ``self.folder_path``.
        Both report names combine the sanitized keyword (``all_data`` when the
        keyword is empty) with the current timestamp.

        Raises:
            ValueError: If ``self.folder_path`` has not been set.
        """
        self._print_stage("创建输出目录")

        if not self.folder_path:
            raise ValueError("文件夹路径未设置")

        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_stem = self._safe_filename(keyword) if keyword else "all_data"

        out_dir = os.path.join(self.folder_path, "scatter_report_out")
        self.output_dir = out_dir
        self.html_report_path = os.path.join(out_dir, f"{file_stem}_report_{stamp}.html")
        # The summary comparison report gets its own file.
        self.comparison_report_path = os.path.join(
            out_dir, f"{file_stem}_comparison_report_{stamp}.html"
        )

        os.makedirs(out_dir, exist_ok=True)
        print(f"输出目录: {self.output_dir}")

    @staticmethod
    def _safe_filename(name: str) -> str:
        """生成安全的文件名"""
        safe = "".join(c for c in str(name) if c.isalnum() or c in (","," ", "_", "-")).strip()
        return safe or "Unknown_Test"

    def _extract_limits(self, df_one_test: pd.DataFrame) -> Tuple[
        Optional[float], Optional[float], List[float], List[float]]:
        """提取某个测试项的上下限数值"""
        lower_plot = upper_plot = None
        lower_set = []
        upper_set = []

        if self.col_lower and self.col_lower in df_one_test.columns:
            lower_vals = self._clean_and_convert_series(df_one_test[self.col_lower], 'numeric').dropna().unique()
            lower_set = sorted(lower_vals.tolist()) if len(lower_vals) > 0 else []
            if lower_set:
                lower_plot = min(lower_set)

        if self.col_upper and self.col_upper in df_one_test.columns:
            upper_vals = self._clean_and_convert_series(df_one_test[self.col_upper], 'numeric').dropna().unique()
            upper_set = sorted(upper_vals.tolist()) if len(upper_vals) > 0 else []
            if upper_set:
                upper_plot = max(upper_set)

        return lower_plot, upper_plot, lower_set, upper_set

    @staticmethod
    def _clean_and_convert_series(series: pd.Series, target_type: str = 'numeric') -> pd.Series:
        """统一的系列清洗和转换方法"""
        if series.empty:
            return series

        if target_type == 'numeric':
            # 数值转换优化
            if pd.api.types.is_numeric_dtype(series):
                return series.astype(float)

            # 批量字符串处理
            cleaned = series.astype(str).str.replace(r'[, ]', '', regex=True).str.strip()
            return pd.to_numeric(cleaned, errors='coerce')

        elif target_type == 'datetime':
            return MultiFileTestReportScatterPlotter._convert_to_datetime(series)

        return series

    @staticmethod
    def _convert_to_datetime(series: pd.Series) -> pd.Series:
        """Convert a mixed-format series to ``datetime64[ns]`` values.

        A series that already has a datetime dtype is returned unchanged.
        Otherwise each entry is interpreted by the first rule that applies:

        1. numeric value >= 1e12              -> epoch milliseconds
        2. numeric value in [1e9, 1e12)       -> epoch seconds (fractional
           values are scaled by 1000 and parsed as milliseconds)
        3. numeric value in (20000, 60000)    -> day count added to 1899-12-30
        4. text shaped ``YYYY-MM-DD HH-MM-SS`` -> parsed with that exact format
        5. anything still unparsed            -> generic ``pd.to_datetime``

        Entries no rule can parse are left as ``NaT``.
        """
        if pd.api.types.is_datetime64_any_dtype(series):
            return series

        # Pre-compute both a numeric and a stripped-text view of the input.
        numeric_series = pd.to_numeric(series, errors='coerce')
        string_series = series.astype(str).str.strip()

        result = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns]')

        # Numeric timestamps: classify by magnitude. NaN compares False, so
        # non-numeric entries fall through to the string handling below.
        masks = {
            'ms': numeric_series >= 1e12,  # large enough to be epoch milliseconds
            's_ms': (numeric_series >= 1e9) & (numeric_series < 1e12),  # epoch seconds, possibly fractional
            'excel': (numeric_series > 20000) & (numeric_series < 60000)
        }

        for mask_type, mask in masks.items():
            if mask.any():
                if mask_type == 'ms':
                    result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='ms')
                elif mask_type == 's_ms':
                    # Timestamps with a fractional part are handled as floats.
                    timestamp_values = numeric_series.loc[mask]

                    # Detect which values carry a fractional part.
                    has_decimal = (timestamp_values % 1 != 0)

                    # Whole-number values: second-resolution timestamps.
                    # NOTE(review): has_decimal is indexed only by the masked rows, so the
                    # `&` below relies on pandas index alignment — confirm rows outside the
                    # subset come out False on the pandas version in use.
                    if (~has_decimal).any():
                        integer_mask = mask & (~has_decimal)
                        result.loc[integer_mask] = pd.to_datetime(
                            numeric_series.loc[integer_mask].astype('int64'), unit='s'
                        )

                    # Fractional values: scale seconds to milliseconds before parsing.
                    if has_decimal.any():
                        decimal_mask = mask & has_decimal
                        # Parse using the millisecond unit.
                        result.loc[decimal_mask] = pd.to_datetime(
                            numeric_series.loc[decimal_mask] * 1000, unit='ms'
                        )

                elif mask_type == 'excel':
                    # Day offsets from 1899-12-30 (the key name suggests Excel serial dates).
                    origin = pd.Timestamp('1899-12-30')
                    result.loc[mask] = origin + pd.to_timedelta(numeric_series.loc[mask], unit='D')

        # String dates: everything the numeric rules left as NaT.
        remaining_mask = result.isna()
        if remaining_mask.any():
            remaining_strings = string_series.loc[remaining_mask]

            # Known explicit formats are tried first.
            format_patterns = [
                (r'^\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}$', '%Y-%m-%d %H-%M-%S'),
            ]

            for pattern, date_format in format_patterns:
                format_mask = remaining_strings.str.match(pattern)
                if format_mask.any():
                    # Index labels of the still-unparsed rows whose text matches the pattern.
                    result.loc[remaining_mask[remaining_mask].index[format_mask]] = pd.to_datetime(
                        remaining_strings.loc[format_mask], format=date_format, errors='coerce'
                    )

            # Generic parsing for whatever is still unparsed.
            still_na_mask = result.isna() & remaining_mask
            if still_na_mask.any():
                result.loc[still_na_mask] = pd.to_datetime(
                    string_series.loc[still_na_mask], errors='coerce'
                )

        return result

    def _preprocess_test_data(self, test_data: pd.DataFrame) -> pd.DataFrame:
        """数据预处理"""
        # 数值转换
        test_data['Measurement_num'] = self._clean_and_convert_series(
            test_data['Measurement'], 'numeric'
        )
        test_data['TestTime_dt'] = self._clean_and_convert_series(
            test_data['Test Time'], 'datetime'
        )
        # 确保Cell编号存在（如果原数据中有）
        if 'Cell' in test_data.columns:
            # 清理数据：去除首尾空格，并尝试转换为数值类型
            test_data['Cell编号'] = test_data['Cell'].astype(str).str.strip()

            # 尝试将清理后的字符串转换为数值（例如整数）
            # errors='coerce' 会将无法转换的值设为NaN（非数字）
            test_data['Cell编号_数值'] = pd.to_numeric(test_data['Cell编号'], errors='coerce')

            # 检查是否存在转换失败的值（即NaN）
            failed_conversions = test_data['Cell编号_数值'].isna().sum()
            if failed_conversions > 0:
                print(f"警告：发现 {failed_conversions} 个 'Cell' 值无法转换为数字，这些条目将保留为字符串或根据业务逻辑处理。")
                # 业务决策：对于无法转换的，可以保留原字符串，或使用一个默认值
		        # 例如，将无法转换的条目其数值编号设为-1或一个特定的标识值
		        # test_data.loc[test_data['Cell编号_数值'].isna(), 'Cell编号_数值'] = -1

            # 此时，您可以根据需求选择使用 'Cell编号'（字符串）或 'Cell编号_数值'（数字）进行后续分组和可视化
            # 对于绘图着色和排序，使用 'Cell编号_数值' 列
            grouping_column = 'Cell编号_数值'

        # 去除无效数据
        valid_data = test_data.dropna(subset=['Measurement_num', 'TestTime_dt'])
        return valid_data.sort_values('TestTime_dt')

    def _calculate_statistics(self, y_data: pd.Series) -> Dict[str, float]:
        """计算统计信息"""
        stats = {
            'count': len(y_data),
            'mean': y_data.mean(),
            'median': y_data.median(),
            'min': y_data.min(),
            'max': y_data.max(),
            'std': y_data.std(),
            'q1': y_data.quantile(0.25),
            'q3': y_data.quantile(0.75)
        }
        return stats

    def _plot_to_base64(self, fig) -> str:
        """Render *fig* as a PNG and return it as a base64-encoded string.

        The figure is always closed afterwards, including when ``savefig``
        raises, so a failed render does not leave the figure registered with
        pyplot.

        Args:
            fig: the matplotlib figure to serialise.

        Returns:
            The PNG bytes (150 dpi, tight bounding box) encoded as base64 text.
        """
        try:
            with BytesIO() as buf:
                fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
                return base64.b64encode(buf.getvalue()).decode('utf-8')
        finally:
            plt.close(fig)

    def _create_summary_plot(self, test_data: pd.DataFrame, test_name: str,
                             lower_plot: Optional[float], upper_plot: Optional[float]) -> str:
        """Draw one scatter plot containing every SN of a test item.

        Rows with a numeric cell number (``Cell编号_数值``) are grouped and
        coloured per (cell, SN); rows whose cell is missing or non-numeric are
        grouped per SN only.  Previously such rows were silently left out of
        the scatter while still contributing to the mean/median lines.

        Args:
            test_data: preprocessed rows of a single test item; must contain
                ``SN``, ``TestTime_dt`` and ``Measurement_num``.
            test_name: test item name shown in the title.
            lower_plot: lower limit to draw, or ``None``.
            upper_plot: upper limit to draw, or ``None``.

        Returns:
            The rendered figure as a base64-encoded PNG string.
        """
        fig, ax = plt.subplots(figsize=(12, 8))

        # Split rows by whether a numeric cell number is available.
        cell_col = 'Cell编号_数值'
        if cell_col in test_data.columns:
            cell_known = test_data[cell_col].notna()
            with_cell, without_cell = test_data[cell_known], test_data[~cell_known]
        else:
            with_cell, without_cell = test_data.iloc[0:0], test_data

        # Collect (legend label, rows) pairs: cell groups first, then SN-only groups.
        series_to_plot = []
        if not with_cell.empty:
            ordered = with_cell.sort_values([cell_col, 'SN'])
            for (cell_no, sn), group in ordered.groupby([cell_col, 'SN']):
                series_to_plot.append((f"Cell:{cell_no}, SN:{sn}", group))
        if not without_cell.empty:
            for sn, group in without_cell.sort_values('SN').groupby("SN"):
                series_to_plot.append((f"SN: {sn}", group))

        # One colour per plotted group.
        colors = plt.cm.Set3(np.linspace(0, 1, len(series_to_plot)))
        for color, (label, group) in zip(colors, series_to_plot):
            ax.scatter(group['TestTime_dt'], group['Measurement_num'],
                       color=color, alpha=0.7, s=25, label=label)

        # Statistics are computed over all rows of the test item.
        stats = self._calculate_statistics(test_data['Measurement_num'])

        # Limit lines span the whole axis; statistic lines span the data's time range.
        x_min, x_max = test_data['TestTime_dt'].min(), test_data['TestTime_dt'].max()

        if lower_plot is not None:
            ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2, label="Lower Limit")
        if upper_plot is not None:
            ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2, label="Upper Limit")

        ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, colors='orange',
                  linestyles='-', linewidth=1.5, alpha=0.7, label='Mean')
        ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, colors='purple',
                  linestyles='-.', linewidth=1.5, alpha=0.7, label='Median')

        title = f"汇总图 - {test_name}"
        if not with_cell.empty:
            # Only advertise cell ordering when cell grouping was actually used.
            title += " (按Cell→SN排序)"
        ax.set_title(title)
        ax.set_xlabel("Test Time")
        ax.set_ylabel("Measurement Value")
        ax.grid(True, alpha=0.3)
        ax.tick_params(axis='x', rotation=45)
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

        return self._plot_to_base64(fig)

    def _create_sn_plots(self, test_data: pd.DataFrame, test_name: str,
                         lower_plot: Optional[float], upper_plot: Optional[float]) -> List[Dict[str, Any]]:
        """Create one stand-alone scatter plot per SN (per cell and SN when known).

        Rows with a numeric cell number (``Cell编号_数值``) produce one plot per
        (cell, SN) pair, ordered by cell then SN.  Rows whose cell is missing or
        non-numeric produce one plot per SN; previously they were silently
        dropped, and a frame whose cells were all non-numeric yielded no plots.

        Args:
            test_data: preprocessed rows of a single test item.
            test_name: test item name shown in the titles.
            lower_plot: lower limit to draw, or ``None``.
            upper_plot: upper limit to draw, or ``None``.

        Returns:
            A list of dicts with keys ``sn``, ``image`` (base64 PNG),
            ``cell_info``, ``has_cell_data``, ``cell_no`` and ``sn_no``.
            Empty when *test_data* has no ``SN`` column.
        """
        sn_plots = []

        if "SN" not in test_data.columns:
            return sn_plots

        # Split rows by whether a numeric cell number is available.
        cell_col = 'Cell编号_数值'
        if cell_col in test_data.columns:
            cell_known = test_data[cell_col].notna()
            with_cell, without_cell = test_data[cell_known], test_data[~cell_known]
        else:
            with_cell, without_cell = test_data.iloc[0:0], test_data

        if not with_cell.empty:
            ordered = with_cell.sort_values([cell_col, 'SN'])
            for (cell_no, sn), group in ordered.groupby([cell_col, 'SN']):
                if group.empty:
                    continue

                plot_image = self._render_single_sn_plot(
                    group,
                    title=f"SN独立图 - {test_name} (Cell: {cell_no}, SN: {sn})",
                    point_label=f"SN:{sn}, Cell:{cell_no}",
                    lower_plot=lower_plot,
                    upper_plot=upper_plot,
                )

                # Statistics of this (cell, SN) combination, keyed by the cell number.
                measurements = group['Measurement_num']
                cell_info = {
                    str(cell_no): {
                        'count': len(group),
                        'mean': float(measurements.mean()),
                        'std': float(measurements.std()),
                        'min': float(measurements.min()),
                        'max': float(measurements.max()),
                    }
                }

                sn_plots.append({
                    "sn": f"Cell_{cell_no}_SN_{sn}",
                    "image": plot_image,
                    "cell_info": cell_info,
                    "has_cell_data": True,
                    "cell_no": str(cell_no),
                    "sn_no": str(sn)
                })

        if not without_cell.empty:
            for sn, group in without_cell.sort_values('SN').groupby("SN"):
                if group.empty:
                    continue

                plot_image = self._render_single_sn_plot(
                    group,
                    title=f"SN独立图 - {test_name} (SN: {sn})",
                    point_label=f"SN: {sn}",
                    lower_plot=lower_plot,
                    upper_plot=upper_plot,
                )

                sn_plots.append({
                    "sn": str(sn),
                    "image": plot_image,
                    "cell_info": None,
                    "has_cell_data": False,
                    "cell_no": None,
                    "sn_no": str(sn)
                })

        return sn_plots

    def _render_single_sn_plot(self, group: pd.DataFrame, title: str, point_label: str,
                               lower_plot: Optional[float], upper_plot: Optional[float]) -> str:
        """Draw the scatter, limit lines and mean/median lines for one group of rows."""
        fig, ax = plt.subplots(figsize=(10, 6))

        ax.scatter(group['TestTime_dt'], group['Measurement_num'],
                   color='blue', alpha=0.7, s=30, label=point_label)

        stats = self._calculate_statistics(group['Measurement_num'])

        # Statistic lines only span the time range covered by this group.
        x_min, x_max = group['TestTime_dt'].min(), group['TestTime_dt'].max()

        if lower_plot is not None:
            ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2, label="Lower Limit")
        if upper_plot is not None:
            ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2, label="Upper Limit")

        ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, colors='orange',
                  linestyles='-', linewidth=1.5, alpha=0.7, label='Mean')
        ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, colors='purple',
                  linestyles='-.', linewidth=1.5, alpha=0.7, label='Median')

        ax.set_title(title)
        ax.set_xlabel("Test Time")
        ax.set_ylabel("Measurement Value")
        ax.grid(True, alpha=0.3)
        ax.tick_params(axis='x', rotation=45)
        ax.legend()

        return self._plot_to_base64(fig)

    def _determine_test_status(self, stats: Dict[str, float],
                               lower_limit: Optional[float],
                               upper_limit: Optional[float]) -> Dict[str, Any]:
        """确定测试状态"""
        status = "success"
        status_display = "正常"

        if lower_limit is not None and upper_limit is not None:
            # 检查是否超出限值
            if stats['min'] < lower_limit or stats['max'] > upper_limit:
                status = "danger"
                status_display = "异常"
            elif (stats['mean'] < lower_limit * 1.1 or stats['mean'] > upper_limit * 0.9 or
                  stats['std'] > (upper_limit - lower_limit) * 0.2):
                status = "warning"
                status_display = "警告"

        return {"status": status, "status_display": status_display}

    def _create_comparison_plots(self, filtered_df: pd.DataFrame, keyword_groups: Dict[str, List[str]]) -> List[
        Dict[str, str]]:
        """Build the comparison figures for the summary section of the report.

        Up to three figures are produced:

        1. a grid of time-series scatter plots, one subplot per test item;
        2. one box-plot row per keyword group;
        3. a grid of histograms (with KDE), one subplot per test item.

        Args:
            filtered_df: the rows selected for this report; must contain
                ``Test Name New`` for anything to be drawn.
            keyword_groups: mapping of keyword -> test item names.

        Returns:
            A list of dicts with keys ``title``, ``image`` (base64 PNG) and
            ``description``; empty when there is nothing to plot.
        """
        comparison_plots = []

        if filtered_df.empty or "Test Name New" not in filtered_df.columns:
            return comparison_plots

        # Work on a copy: preprocessing adds columns in place.
        processed_data = self._preprocess_test_data(filtered_df.copy())

        if processed_data.empty:
            return comparison_plots

        def rows_for(test_name):
            """Rows of processed_data belonging to one test item."""
            return processed_data[processed_data["Test Name New"] == test_name]

        def draw_limits(ax, data, vertical=False):
            """Draw the lower/upper limit of *data* as dashed lines on *ax*."""
            lower, upper, _, _ = self._extract_limits(data)
            draw_line = ax.axvline if vertical else ax.axhline
            if lower is not None:
                draw_line(lower, color='green', linestyle='--', linewidth=2, label="Lower Limit")
            if upper is not None:
                draw_line(upper, color='red', linestyle='--', linewidth=2, label="Upper Limit")

        def show_legend(ax, **legend_kwargs):
            """Add a legend only when the axes has labelled artists to show."""
            handles, _ = ax.get_legend_handles_labels()
            if handles:
                ax.legend(**legend_kwargs)

        num_tests_total = sum(len(tests) for tests in keyword_groups.values())
        max_cols = 2
        num_rows = (num_tests_total + max_cols - 1) // max_cols

        # 1. Time-series scatter plots - one subplot per test item.
        if num_tests_total > 0:
            # squeeze=False always yields a 2-D array, whatever the grid size.
            fig1, grid = plt.subplots(num_rows, max_cols, figsize=(16, 6 * num_rows), squeeze=False)
            axes = grid.flatten()

            plot_idx = 0
            colors = plt.cm.Set3(np.linspace(0, 1, 12))

            for keyword, test_names in keyword_groups.items():
                for test_name in test_names:
                    test_data = rows_for(test_name)
                    if test_data.empty:
                        continue

                    ax = axes[plot_idx]

                    # One colour per SN, cycling through the palette.
                    for color_idx, (sn, group) in enumerate(test_data.groupby("SN")):
                        ax.scatter(group['TestTime_dt'], group['Measurement_num'],
                                   c=[colors[color_idx % len(colors)]], label=str(sn), alpha=0.7, s=30)

                    draw_limits(ax, test_data)

                    ax.set_title(f"{test_name}\n(关键词: {keyword})", fontsize=10)
                    ax.set_xlabel("测试时间")
                    ax.set_ylabel("测量值")
                    ax.grid(True, alpha=0.3)
                    ax.tick_params(axis='x', rotation=45)
                    show_legend(ax, fontsize=8)

                    plot_idx += 1

            # Hide the subplots that received no data.
            for unused_ax in axes[plot_idx:]:
                unused_ax.set_visible(False)

            fig1.tight_layout()
            comparison_plots.append({
                "title": "时间序列散点图（按测试项分组）",
                "image": self._plot_to_base64(fig1),
                "description": "每个测试项单独显示，按SN区分不同数据点"
            })

        # 2. Box plots - one row per keyword group, one box per test item.
        if keyword_groups:
            fig2, box_grid = plt.subplots(len(keyword_groups), 1,
                                          figsize=(14, 6 * len(keyword_groups)), squeeze=False)

            for ax, (keyword, test_names) in zip(box_grid[:, 0], keyword_groups.items()):
                boxplot_data = []
                boxplot_labels = []

                for test_name in test_names:
                    test_data = rows_for(test_name)
                    if not test_data.empty:
                        boxplot_data.append(test_data['Measurement_num'].values)
                        boxplot_labels.append(test_name)

                if not boxplot_data:
                    continue

                box_plot = ax.boxplot(boxplot_data, tick_labels=boxplot_labels, patch_artist=True)

                colors_box = plt.cm.Set3(np.linspace(0, 1, len(boxplot_data)))
                for patch, color in zip(box_plot['boxes'], colors_box):
                    patch.set_facecolor(color)

                # Limits come from the first test item that actually has data;
                # test_names[0] may be an item with no rows and therefore no limits.
                draw_limits(ax, rows_for(boxplot_labels[0]))

                ax.set_title(f"箱线图 - {keyword}组", fontsize=12)
                ax.set_ylabel("测量值")
                ax.tick_params(axis='x', rotation=45)
                ax.grid(True, alpha=0.3)
                show_legend(ax)

            fig2.tight_layout()
            comparison_plots.append({
                "title": "箱线图（按关键词分组）",
                "image": self._plot_to_base64(fig2),
                "description": "每个关键词组单独显示，组内测试项分别绘制箱线图"
            })

        # 3. Histograms with KDE - one subplot per test item.
        if num_tests_total > 0:
            fig3, grid = plt.subplots(num_rows, max_cols, figsize=(16, 6 * num_rows), squeeze=False)
            axes = grid.flatten()

            plot_idx = 0
            colors_hist = plt.cm.Set3(np.linspace(0, 1, 12))

            for keyword, test_names in keyword_groups.items():
                for test_name in test_names:
                    test_data = rows_for(test_name)

                    # A distribution needs at least two points.
                    if test_data.empty or len(test_data) < 2:
                        continue

                    ax = axes[plot_idx]

                    sns.histplot(test_data['Measurement_num'], kde=True,
                                 color=colors_hist[plot_idx % len(colors_hist)], alpha=0.7, ax=ax)

                    # Measurements are on the x axis here, so limits are vertical lines.
                    draw_limits(ax, test_data, vertical=True)

                    ax.set_title(f"{test_name}\n(关键词: {keyword})", fontsize=10)
                    ax.set_xlabel("测量值")
                    ax.set_ylabel("频率")
                    ax.grid(True, alpha=0.3)
                    show_legend(ax)

                    plot_idx += 1

            # Hide the subplots that received no data.
            for unused_ax in axes[plot_idx:]:
                unused_ax.set_visible(False)

            fig3.tight_layout()
            comparison_plots.append({
                "title": "概率分布直方图（按测试项分组）",
                "image": self._plot_to_base64(fig3),
                "description": "每个测试项单独显示概率分布和上下限"
            })

        return comparison_plots

    def _organize_tests_by_keyword_groups(self, test_results: List[Dict[str, Any]],
                                          keyword_groups: Dict[str, List[str]]) -> Dict[
        str, Dict[str, List[Dict[str, Any]]]]:
        """按关键词分组组织测试结果，每个测试项单独显示"""
        organized_groups = {}

        # 构建测试名称到测试结果的映射
        test_name_to_result = {test['name']: test for test in test_results}

        # 为每个关键词组分配测试结果
        for keyword, test_names in keyword_groups.items():
            group_tests = {}
            for test_name in test_names:
                if test_name in test_name_to_result:
                    # 每个测试项单独作为一个分组
                    group_tests[test_name] = [test_name_to_result[test_name]]

            if group_tests:
                organized_groups[keyword] = group_tests

        return organized_groups

    def generate_html_report(self, filtered_df: pd.DataFrame, keyword: str,
                             unique_tests: List[str], keyword_groups: Dict[str, List[str]]) -> None:
        """Render the HTML report for the selected tests and write it to disk.

        For every name in *unique_tests* the matching rows are preprocessed,
        summarised and plotted (one summary plot plus per-SN plots).  The
        results, together with the comparison plots, are rendered through
        ``HTML_TEMPLATE`` and written to ``self.html_report_path``.

        Args:
            filtered_df: rows selected for this report.
            keyword: keyword shown in the report; a falsy value is displayed
                as "所有数据".
            unique_tests: test item names to report on.
            keyword_groups: mapping of keyword -> test item names.
        """
        self._print_stage("生成HTML报告")
        start_time = time.time()

        test_results = []
        total_points = 0
        status_counts = {"success": 0, "warning": 0, "danger": 0}

        for i, test_name in enumerate(unique_tests, 1):
            self._print_progress(i, len(unique_tests), "生成测试报告")

            # Work on a copy: preprocessing adds columns in place.
            test_data = filtered_df[filtered_df["Test Name New"] == test_name].copy()
            test_data = self._preprocess_test_data(test_data)

            if test_data.empty:
                continue

            lower_plot, upper_plot, _, _ = self._extract_limits(test_data)

            y_data = test_data['Measurement_num']
            stats = self._calculate_statistics(y_data)
            total_points += stats['count']

            summary_plot_image = self._create_summary_plot(test_data, test_name, lower_plot, upper_plot)
            sn_plot_images = self._create_sn_plots(test_data, test_name, lower_plot, upper_plot)

            status_info = self._determine_test_status(stats, lower_plot, upper_plot)
            status_counts[status_info["status"]] += 1

            test_results.append({
                "name": test_name,
                "stats": stats,
                "limits": {"lower": lower_plot, "upper": upper_plot},
                "summary_plot_image": summary_plot_image,
                "sn_plot_images": sn_plot_images,
                "status": status_info["status"],
                "status_display": status_info["status_display"]
            })

        # Group results by keyword; each test item is shown on its own.
        organized_keyword_groups = self._organize_tests_by_keyword_groups(test_results, keyword_groups)

        comparison_plots = self._create_comparison_plots(filtered_df, keyword_groups)

        template = Template(HTML_TEMPLATE)
        html_content = template.render(
            keyword=keyword if keyword else "所有数据",
            timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            test_count=len(test_results),
            total_points=total_points,
            tests=test_results,
            folder_path=self.folder_path,
            analysis_time=round(time.time() - start_time, 2),
            status_counts={"normal": status_counts["success"], "warning": status_counts["warning"],
                           "abnormal": status_counts["danger"]},
            file_count=len(self.file_infos),
            file_infos=self.file_infos,
            total_rows=len(self.df) if self.df is not None else 0,
            comparison_plots=comparison_plots,
            keyword_groups=organized_keyword_groups
        )

        # Debug aid: check that the comparison section made it into the output.
        if comparison_plots:
            if "comparison_plots" in html_content or "时间序列散点图" in html_content:
                self._print_success("对比图已成功嵌入HTML")
            else:
                self._print_error("对比图未正确嵌入HTML")

        with open(self.html_report_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        self._print_success(f"HTML报告已生成: {self.html_report_path}")
        self._print_success(
            f"共处理 {len(self.file_infos)} 个文件，{len(test_results)} 个测试项，{total_points} 个数据点")
        if len(keyword_groups) > 1:
            # Joined outside the f-string: reusing the enclosing quote character inside
            # a replacement field is a SyntaxError before Python 3.12.
            joined_keywords = ", ".join(keyword_groups)
            self._print_success(
                f"已生成 {len(comparison_plots)} 个对比图表，{len(keyword_groups)}个关键词：{joined_keywords}。")

    def run(self) -> None:
        """Interactive entry point: load the files, then analyse keywords in a loop.

        After the Excel files are located and loaded, the user is repeatedly
        asked for keywords; each round writes one HTML report.  The loop ends
        when no data matches or the user declines to continue.  Ctrl-C and
        unexpected exceptions are reported instead of propagating.
        """
        affirmative_answers = ('y', 'yes', '是')

        try:
            self.get_folder_path()
            excel_files = self.find_excel_files()

            if not excel_files:
                self._print_error("没有找到可用的Excel文件")
                return

            self.load_multiple_files_optimized(excel_files)

            keep_going = True
            while keep_going:
                # get_keywords also returns how the tests are grouped by keyword.
                filtered_df, keyword, unique_tests, keyword_groups = self.get_keywords()
                if filtered_df.empty:
                    self._print_warning("没有数据可处理，退出程序")
                    break

                self.create_output_dir(keyword)
                self.generate_html_report(filtered_df, keyword, unique_tests, keyword_groups)

                self._print_success("分析完成！")
                print(f"📊 报告文件: {self.html_report_path}")
                print(f"📁 输出目录: {self.output_dir}")
                if len(keyword_groups) > 1:
                    print(f"🔍 对比关键词/组: {', '.join(keyword_groups)}")

                # Offer another round with different keywords.
                answer = input("\n是否继续分析其他关键词？(y/n): ").strip().lower()
                keep_going = answer in affirmative_answers

        except KeyboardInterrupt:
            self._print_warning("用户中断程序")
        except Exception as e:
            self._print_error(f"发生错误: {type(e).__name__}: {str(e)}")
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    # Script entry point: build the plotter and start the interactive session.
    MultiFileTestReportScatterPlotter().run()