\d+)', base)
+ if m:
+ sn = m.group('sn').strip()
+ ts = m.group('time').strip()
+ cell = m.group('cell').strip()
+ return sn, ts, cell
+
+ # 兜底:尝试匹配 SN(以 F 开头的字母数字串)、时间(括号内)、Cell(-数字)
+ sn_match = re.search(r'\bF[A-Z0-9]+\b', base)
+ time_match = re.search(r'\(([^)]+)\)', base)
+ cell_match = re.search(r'-(\d+)(?:\.\w+)?$', base)
+
+ if sn_match:
+ sn = sn_match.group(0).strip()
+ if time_match:
+ ts = time_match.group(1).strip()
+ if cell_match:
+ cell = cell_match.group(1).strip()
+
+ return sn, ts, cell
+
+ def merge_reports(self):
+     """Merge the 'All Tests' sheet of every source workbook into self.merged_data.
+
+     The first header row found is extended with SN / TestCycleTime / Cell /
+     数据来源 columns; per row the first three come from
+     self._parse_source_filename applied to the "Source File name" cell and
+     the last is the workbook's file name.  A file's rows are committed only
+     after its sheet was read completely, so a failing file leaves no partial
+     rows behind, and every opened workbook is closed again.
+
+     Returns True when a header and at least one data row were merged.
+     """
+     if not self.source_files:
+         return False
+
+     self._print_stage("合并报告数据")
+     start_time = time.time()
+
+     self.merged_data = []
+     header_added = False
+     source_col_idx = None  # index of the "Source File name" column
+
+     total_files = self.stats["total_files"]
+     for idx, file_path in enumerate(self.source_files, start=1):
+         filename = os.path.basename(file_path)
+         self._print_progress(idx, total_files, prefix="文件处理")
+         wb = None
+         try:
+             wb = load_workbook(file_path, read_only=True, data_only=True)
+             if 'All Tests' not in wb.sheetnames:
+                 self.stats["skipped_no_sheet"] += 1
+                 print(f"\n文件 {filename} 中没有 'All Tests' 工作表,已跳过")
+                 continue
+
+             sheet = wb['All Tests']
+             # read-only sheets report max_row as None when the file has no
+             # dimension record, so it only feeds the row estimate
+             data_rows_count = max((sheet.max_row or 0) - 1, 0)
+
+             # stream the sheet once: the first row is the header, the rest data
+             rows_iter = sheet.iter_rows(min_row=1, values_only=True)
+             header_row = next(rows_iter, None)
+
+             pending_header = None
+             if not header_added and header_row is not None:
+                 header = list(header_row)
+                 # locate "Source File name" case-insensitively
+                 source_col_idx = next(
+                     (i for i, h in enumerate(header)
+                      if h and str(h).strip().lower() == "source file name"),
+                     None,
+                 )
+                 pending_header = header + ["SN", "TestCycleTime", "Cell", "数据来源"]
+
+             file_rows = []
+             for row in rows_iter:
+                 if row is None or all(value is None for value in row):
+                     continue
+                 row_list = list(row)
+                 sn, ts, cell = "", "", ""
+                 if source_col_idx is not None and source_col_idx < len(row_list):
+                     sn, ts, cell = self._parse_source_filename(row_list[source_col_idx])
+                 file_rows.append(row_list + [sn, ts, cell, filename])
+
+             # commit this file's contribution in one step
+             if pending_header is not None:
+                 self.merged_data.append(pending_header)
+                 header_added = True
+             self.merged_data.extend(file_rows)
+             added_rows = len(file_rows)
+             self.stats["processed_files"] += 1
+             self.stats["total_rows_merged"] += added_rows
+             print(f"\n→ 已处理: {filename} | 预估行数: {data_rows_count} | 实际合并行数: {added_rows} | 累计合并行数: {self.stats['total_rows_merged']}")
+         except Exception as e:
+             self.stats["errors"] += 1
+             print(f"\n处理文件 {filename} 时出错: {type(e).__name__}: {str(e)}")
+         finally:
+             if wb is not None:
+                 wb.close()
+
+     elapsed = time.time() - start_time
+     print(f"\n合并阶段完成,耗时: {elapsed:.1f} 秒")
+     return len(self.merged_data) > 1
+
+ def save_merged_report(self):
+     """Write self.merged_data to a timestamped .xlsx file in self.selected_folder.
+
+     Returns True on success, False when data or folder is missing or on any error.
+     """
+     if not (self.merged_data and self.selected_folder):
+         return False
+
+     self._print_stage("保存合并结果")
+     stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     self.output_filepath = os.path.join(self.selected_folder, f"测试报告合并_{stamp}.xlsx")
+
+     try:
+         book = Workbook()
+         target = book.active
+         target.title = "Merged All Tests"
+         row_total = len(self.merged_data)
+         last_report = time.time()
+         for written, values in enumerate(self.merged_data, start=1):
+             target.append(values)
+             # report on the final row or after 0.5 s, so printing stays cheap
+             if written == row_total or (time.time() - last_report) > 0.5:
+                 self._print_progress(written, row_total, prefix="写入Excel行")
+                 last_report = time.time()
+
+         book.save(self.output_filepath)
+         print(f"\n文件已保存: {self.output_filepath}")
+         return True
+     except Exception as exc:
+         print(f"保存合并报告时出错: {type(exc).__name__}: {str(exc)}")
+         return False
+
+ def save_merged_report_xlsxwriter(self):
+     """Save self.merged_data with the xlsxwriter engine, printing progress.
+
+     Falls back to save_merged_report() when xlsxwriter is not installed.
+     pandas is no longer imported: it was never used, yet its absence forced
+     the fallback.  The workbook is closed even when a write fails.
+
+     Returns:
+         bool: True when the file was written; False when data or folder is
+         missing or writing fails.
+     """
+     if not self.merged_data or not self.selected_folder:
+         return False
+
+     self._print_stage("保存合并结果")
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     output_filename = f"测试报告合并_{timestamp}.xlsx"
+     self.output_filepath = os.path.join(self.selected_folder, output_filename)
+
+     # xlsxwriter is optional: without it the openpyxl-based writer is used
+     try:
+         import xlsxwriter
+     except ImportError:
+         print("xlsxwriter未安装,使用备选方案")
+         return self.save_merged_report()
+
+     headers = self.merged_data[0]
+     data_rows = self.merged_data[1:]
+     total_rows = len(data_rows)
+     # print about every 5% of the rows (every row for very small inputs)
+     report_every = max(1, total_rows // 20)
+
+     print(f"开始保存,共{total_rows}行数据到工作表 'Merged All Tests'...")
+
+     try:
+         # the context manager closes the workbook on exit, including when
+         # one of the writes below raises
+         with xlsxwriter.Workbook(self.output_filepath) as workbook:
+             worksheet = workbook.add_worksheet('Merged All Tests')
+
+             # header cells: bold, light-green fill, thin border
+             header_format = workbook.add_format({
+                 'bold': True,
+                 'fg_color': '#D7E4BC',
+                 'border': 1
+             })
+             for col_num, header in enumerate(headers):
+                 worksheet.write(0, col_num, header, header_format)
+
+             # data rows start below the header, so sheet row == row number
+             for processed_rows, row_data in enumerate(data_rows, start=1):
+                 for col_num, cell_value in enumerate(row_data):
+                     worksheet.write(processed_rows, col_num, cell_value)
+
+                 if processed_rows % report_every == 0 or processed_rows == total_rows:
+                     percentage = int((processed_rows / total_rows) * 100)
+                     print(f"\r保存进度: {percentage}% ({processed_rows}/{total_rows}行)", end="", flush=True)
+
+             # fixed default width for every column
+             for idx, _ in enumerate(headers):
+                 worksheet.set_column(idx, idx, 15)
+     except Exception as e:
+         print(f"\n保存合并报告时出错: {type(e).__name__}: {str(e)}")
+         return False
+
+     print("\r保存完成!" + " " * 40)  # clear the progress line
+     print(f"文件已保存: {self.output_filepath}")
+     print(f"工作表名: Merged All Tests")
+     return True
+
+ def save_merged_report_xlsxwriter_with_progress(self):
+     """Save self.merged_data with xlsxwriter, showing a progress indicator.
+
+     tqdm is optional: without it a plain percentage line is printed.  Only a
+     missing xlsxwriter triggers the fallback to
+     save_merged_report_xlsxwriter(); previously an unconditional tqdm import
+     made tqdm mandatory and left the plain-progress branch unreachable.
+
+     Returns:
+         bool: True when the file was written; False when data or folder is
+         missing or writing fails.
+     """
+     if not self.merged_data or not self.selected_folder:
+         return False
+
+     self._print_stage("保存合并结果")
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     output_filename = f"测试报告合并_{timestamp}.xlsx"
+     self.output_filepath = os.path.join(self.selected_folder, output_filename)
+
+     try:
+         import xlsxwriter
+     except ImportError:
+         print("xlsxwriter 未安装,使用备选方案")
+         return self.save_merged_report_xlsxwriter()
+
+     headers = self.merged_data[0]
+     data_rows = self.merged_data[1:]
+     total_rows = len(data_rows)
+     print(f"开始保存,共{len(headers)}列{total_rows}行数据...")
+
+     # use the tqdm bar when available, otherwise a plain percentage line
+     try:
+         from tqdm import tqdm
+         pbar = tqdm(total=total_rows, desc="保存进度", unit="行")
+     except ImportError:
+         pbar = None
+     report_every = max(1, total_rows // 10)  # plain mode: about every 10%
+
+     try:
+         # the context manager closes the workbook even if a write raises
+         with xlsxwriter.Workbook(self.output_filepath) as workbook:
+             worksheet = workbook.add_worksheet('Merged All Tests')
+             for col_num, header in enumerate(headers):
+                 worksheet.write(0, col_num, header)
+
+             for rows_saved, row_data in enumerate(data_rows, start=1):
+                 for col_num, cell_value in enumerate(row_data):
+                     worksheet.write(rows_saved, col_num, cell_value)
+
+                 # compare with None: a tqdm bar whose total is 0 is falsy
+                 if pbar is not None:
+                     pbar.update(1)
+                 elif rows_saved % report_every == 0 or rows_saved == total_rows:
+                     percentage = int((rows_saved / total_rows) * 100)
+                     print(f"\r保存进度: {percentage}% ({rows_saved}/{total_rows}行)", end="", flush=True)
+     except Exception as e:
+         print(f"\n保存合并报告时出错: {type(e).__name__}: {str(e)}")
+         return False
+     finally:
+         if pbar is not None:
+             pbar.close()
+
+     if pbar is None:
+         print("\r保存完成!" + " " * 30)  # clear the progress line
+     print(f"文件已保存: {self.output_filepath}")
+     return True
+
+ def run(self):
+     """Run the console workflow end to end.
+
+     Steps:
+         1. read the source folder from the console;
+         2. scan it for Excel files;
+         3. merge the reports;
+         4. save the merged data (only when the merge produced data).
+
+     Steps 1 and 2 return early on failure.  Once the merge step has run, the
+     processing summary is printed whether or not any data was merged.
+     """
+     print("=== 测试报告合并工具 ===")
+
+     # Step 1: ask for the source folder on the console
+     # NOTE(review): the returned path is only checked for truthiness here;
+     # scan_files() presumably reads the folder from instance state - confirm
+     chosen_dir = self._get_directory_from_console()
+     if not chosen_dir:
+         print(f"{Fore.RED}❌ 未选择目录,程序退出")
+         return
+
+     # Step 2: look for Excel files
+     if not self.scan_files():
+         print("指定目录中没有找到Excel文件")
+         return
+
+     print(f"准备处理 {len(self.source_files)} 个文件...")
+
+     # Step 3: merge the reports
+     if self.merge_reports():
+         # Step 4: save through the xlsxwriter writer with progress display
+         if self.save_merged_report_xlsxwriter_with_progress():
+             print("合并完成!")
+         else:
+             print("保存合并报告时出错")
+     else:
+         print("没有找到包含 'All Tests' 工作表的文件或合并数据为空")
+
+     # Summary: the same five counters are printed for both outcomes of the
+     # merge step
+     self._print_stage("处理摘要")
+     print(f"总文件数: {self.stats['total_files']}")
+     print(f"成功处理: {self.stats['processed_files']}")
+     print(f"跳过(无工作表): {self.stats['skipped_no_sheet']}")
+     print(f"错误文件: {self.stats['errors']}")
+     print(f"合并总行数: {self.stats['total_rows_merged']}")
+
+
+if __name__ == "__main__":
+    # Script entry point: build the merger and run the console workflow.
+    TestReportMerger().run()
diff --git a/htmlProcess/htmlReportProcess_cmd_p/.gitignore b/htmlProcess/htmlReportProcess_cmd_p/.gitignore
new file mode 100644
index 0000000..681aa08
--- /dev/null
+++ b/htmlProcess/htmlReportProcess_cmd_p/.gitignore
@@ -0,0 +1,26 @@
+/build/*
+/build
+/dist/*
+/dist
+/source/*
+/source
+
+
+
+
+htmlReportProcess_Merge_picHtml_V3.py
+
+htmlReportProcess_Merge_picHtml_V2.py
+
+htmlReportProcess_Merge_pic_V2.py
+
+#/htmlReportProcess*/
+
+
+htmlReportProcess_cmd_pV2.py
+htmlReportProcess_cmd_pV3.py
+
+
+htmlReportProcess_cmd_V2.py
+
+htmlReportProcess.py
\ No newline at end of file
diff --git a/htmlProcess/htmlReportProcess_cmd_p/htmlReportProcess_cmd_V1.py b/htmlProcess/htmlReportProcess_cmd_p/htmlReportProcess_cmd_V1.py
new file mode 100644
index 0000000..5c56053
--- /dev/null
+++ b/htmlProcess/htmlReportProcess_cmd_p/htmlReportProcess_cmd_V1.py
@@ -0,0 +1,620 @@
+import pandas as pd
+from bs4 import BeautifulSoup
+import os
+import re
+import sys
+from datetime import datetime
+from colorama import Fore, Style, init
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+init(autoreset=True)
+
+
+class ProgressTracker:
+    """Single-line console progress display for a batch of tasks."""
+
+    def __init__(self):
+        self.processed = 0  # tasks reported through update() so far
+        self.total = 0  # expected number of tasks, set by begin()
+        self.start_time = datetime.now()
+
+    def begin(self, total_files, sn_file_counts=None):
+        """Restart the clock and print the task count plus an optional SN -> file-count listing."""
+        self.total = total_files
+        self.start_time = datetime.now()
+
+        # one line per SN (cut to 31 characters); the raw list is no longer echoed as debug output
+        dist_info = [f"{sn[:31]}: {count}个 html文件。" for sn, count in (sn_file_counts or {}).items()]
+
+        stats_line = f"{Fore.CYAN}▶ 开始处理 {self.total} 个任务"
+        print(f"\n{stats_line.ljust(80)}")
+        if dist_info:
+            # joined outside the f-string: a backslash inside an f-string expression is a SyntaxError before Python 3.12
+            dist_text = '\n'.join(dist_info)
+            dist_line = f"{Fore.MAGENTA}⚫ SN文件分布:\n{dist_text}"
+            print(f"{dist_line.ljust(580)}{Style.RESET_ALL}")
+
+    def update(self, success=True, prefix=''):
+        """Count one finished task and redraw the status line."""
+        self.processed += 1
+        elapsed = datetime.now() - self.start_time
+        time_used = self._format_timedelta(elapsed)
+        seconds = elapsed.total_seconds()
+        # guard both divisions: total may still be 0 and a coarse clock can report 0 s elapsed
+        percent = self.processed / self.total * 100 if self.total > 0 else 0
+        speed = self.processed / seconds if seconds > 0 else 0
+        filled = min(20, int(percent / 5))  # 20-cell bar, one cell per 5%
+
+        status_icon = f"{Fore.GREEN}✓" if success else f"{Fore.RED}✗"
+        status_text = f"{status_icon} {self.processed}/{self.total} [{(percent / 5):.0f}|{'▉' * filled}{' ' * (20 - filled)}|]"
+
+        sys_info = [
+            f"{prefix}{status_text.ljust(40)}",
+            f"进度: {percent:.1f}%".ljust(15),
+            f"耗时: {time_used}".ljust(15),
+            f"速度: {speed:.1f} 任务/秒"
+        ]
+
+        print('\x1b[2K\r' + ' ‖ '.join(sys_info), end='', flush=True)
+
+    def end(self, prefix=''):
+        """Print the completion line with the total elapsed seconds."""
+        print(f"\n{Fore.GREEN}✔ {prefix}处理完成! 总耗时: {(datetime.now() - self.start_time).total_seconds():.1f}秒\n")
+
+    def _format_timedelta(self, delta):
+        """Format a timedelta as HH:MM:SS."""
+        seconds = delta.total_seconds()
+        return f"{int(seconds // 3600):02}:{int((seconds % 3600) // 60):02}:{int(seconds % 60):02}"
+
+
+class HTMLReportProcessor:
+ """HTML报告处理核心类(增强版:并行+线程安全)"""
+
+ def __init__(self):
+     """Create the tracker and the empty per-SN containers filled in by process_files."""
+     self.progress = ProgressTracker()
+     # SN -> {'headers': [...], 'data': [...]}, written by _store_data
+     self.sn_data_map = dict()
+     # SN -> set of HTML file names that contributed rows
+     self.sn_source_files = dict()
+     # SN -> FAIL rows counted while parsing; SN -> files counted by the pre-scan
+     self.sn_fail_counts = dict()
+     self.sn_file_counts = dict()
+
+ @staticmethod
+ def _clean_test_name(raw_name):
+     """Strip a leading 'Round<digits>_<digits>_' prefix from a test name."""
+     return re.compile(r'^Round\d+_\d+_').sub('', raw_name)
+
+ def _extract_sn(self, soup, filename):
+     """Extract the serial number (SN) from the report, falling back to the file name.
+
+     Search order: the <h3> element containing 'Serial Number:', the whole
+     document text, then the file name.  Previously the file name was only
+     consulted when soup was None, so a parsed report without an SN in its
+     body always ended up as "UNKNOWN_SN".
+
+     Returns:
+         str: the matched SN, "UNKNOWN_SN" when nothing matches, or
+         "ERROR_SN" when the lookup itself raises.
+     """
+     # 'F' + 15 alphanumerics (short form), optionally 5 or more extra (long form)
+     sn_pattern = re.compile(r'\b(F[A-Z0-9]{15}(?:[A-Z0-9]{5,})?)\b', flags=re.I)
+     try:
+         found = None
+         if soup is not None:
+             sn_tag = soup.find('h3', string=re.compile(r'Serial Number:', re.I))
+             if sn_tag:
+                 found = sn_pattern.search(sn_tag.get_text())
+             # no usable tag: scan the visible text of the whole document
+             found = found or sn_pattern.search(soup.get_text(" ", strip=True))
+         # last resort: the SN embedded in the file name
+         found = found or sn_pattern.search(filename)
+         return found.group(1) if found else "UNKNOWN_SN"
+     except Exception as e:
+         print(f"SN提取失败: {filename} - {str(e)}")
+         return "ERROR_SN"
+
+ def process_files(self, source_dir):
+ """Parse every HTML file under source_dir in worker threads and return the per-SN data map."""
+ all_files = self._scan_files(source_dir)
+
+ # Pre-scan the file names to collect the per-SN file distribution
+ self._collect_sn_distribution(all_files)
+
+ # Hand the SN distribution to the progress tracker
+ self.progress.begin(len(all_files), self.sn_file_counts)
+
+ # Parse in parallel; this thread merges the results and prints progress
+ max_workers = self._calc_max_workers(env_var="OVERRIDE_WORKERS")
+ print(f"{Fore.CYAN}▶ 使用线程并发数(HTML解析): {max_workers}")
+
+ futures = []
+ results = []
+ errors = []
+
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ for f in all_files:
+ futures.append(executor.submit(self._process_single_file, f))
+
+ for future in as_completed(futures):
+ try:
+ res = future.result()
+ except Exception as e:
+ # An exception escaped the worker: turn it into a failure result
+ res = {'success': False, 'error': f"未知异常: {type(e).__name__}: {e}"}
+
+ # Update the progress display (from the submitting thread)
+ self.progress.update(res.get('success', False), prefix='HTML解析: ')
+
+ # Sort the outcome into successes and failures
+ if res.get('success'):
+ results.append(res)
+ else:
+ errors.append(res.get('error'))
+
+ self.progress.end(prefix='HTML解析')
+
+ # List the failures (at most 50) to help locate problems
+ if errors:
+ print(f"\n{Fore.RED}✗ 以下文件处理失败(共 {len(errors)} 个):")
+ for err in errors[:50]:
+ print(f" - {err}")
+ if len(errors) > 50:
+ print(f" ... 其余 {len(errors) - 50} 条省略")
+
+ # Merge the results into the shared per-SN structures
+ for res in results:
+ sn = res['sn']
+ headers = res['headers']
+ rows = res['rows']
+ filename = res['filename']
+ fail_count = res['file_fail_count']
+
+ self._store_data(sn, headers, rows)
+
+ if sn not in self.sn_source_files:
+ self.sn_source_files[sn] = set()
+ self.sn_source_files[sn].add(filename)
+
+ self.sn_fail_counts[sn] = self.sn_fail_counts.get(sn, 0) + fail_count
+
+ # Attach the per-SN report statistics
+ self._add_report_statistics()
+ return self.sn_data_map
+
+ def _calc_max_workers(self, env_var="OVERRIDE_WORKERS"):
+     """Return the thread count: a digit-only env_var value (at least 1), else 2x CPUs clamped to 4..32."""
+     configured = os.getenv(env_var)
+     if not (configured and configured.isdigit()):
+         # mixed I/O and moderate CPU work: oversubscribe the cores a little
+         cores = os.cpu_count() or 2
+         return max(4, min(32, cores * 2))
+     return max(1, int(configured))
+
+ def _collect_sn_distribution(self, file_list):
+     """Pre-scan file names and count, per SN, how many files carry it.
+
+     Only the base name is inspected (no file is opened); counts accumulate
+     in self.sn_file_counts and a failing name is reported, not raised.
+     """
+     print(f"{Fore.YELLOW}⌛ 正在扫描文件分布...")
+     for base_name in map(os.path.basename, file_list):
+         try:
+             key = self._extract_sn_from_filename(base_name)
+             self.sn_file_counts.setdefault(key, 0)
+             self.sn_file_counts[key] += 1
+         except Exception as err:
+             print(f"\n{Fore.RED}⚠ 处理失败: {base_name} - :{str(err)}")
+
+     print(f"{Fore.GREEN}✔ SN分布扫描完成!")
+
+ def _extract_sn_from_filename(self, filename):
+     """Return the SN in the file name ('F' + 15 alphanumerics, optionally 5 or more extra), else 'UNKNOWN_SN'."""
+     sn_regex = r'F[A-Z0-9]{15}(?:[A-Z0-9]{5,})?'
+     hit = re.compile(rf'\b({sn_regex})\b', re.I).search(filename)
+     return "UNKNOWN_SN" if hit is None else hit.group(1)
+
+ def _add_report_statistics(self):
+     """Attach a 'report_stats' dict (source file count, parse-time FAIL count) to every SN entry."""
+     for sn in self.sn_data_map:
+         stats = {
+             'source_files_count': len(self.sn_source_files.get(sn, [])),
+             # FAIL count from the parsing stage; the Excel stage recounts
+             # from its own FAIL rows when it writes the report
+             'fail_count': self.sn_fail_counts.get(sn, 0),
+         }
+         self.sn_data_map[sn]['report_stats'] = stats
+
+ def _scan_files(self, source_dir):
+     """Walk source_dir recursively and return the paths of all .html/.htm files (case-insensitive)."""
+     found = []
+     for folder, _subdirs, names in os.walk(source_dir):
+         for name in names:
+             if name.lower().endswith(('.html', '.htm')):
+                 found.append(os.path.join(folder, name))
+     return found
+
+ def _process_single_file(self, file_path):
+     """Parse one HTML report and return its data without touching shared state.
+
+     Returns a dict: on success 'success': True plus 'sn', 'headers', 'rows',
+     'file_fail_count' and 'filename'; otherwise 'success': False plus an
+     'error' message (also used when the report contains no table).
+     """
+     try:
+         with open(file_path, 'r', encoding='utf-8') as handle:
+             markup = handle.read()
+         document = BeautifulSoup(markup, 'html.parser')
+
+         base_name = os.path.basename(file_path)
+         serial = self._extract_sn(document, base_name)
+
+         # prefer a table with border=1, otherwise the first table found
+         data_table = document.find('table', border=1) or document.find('table')
+         if not data_table:
+             raise ValueError("未找到有效数据表格")
+
+         column_names, records, fails = self._process_table(data_table, serial, base_name, markup)
+     except Exception as exc:
+         return {
+             'success': False,
+             'error': f"{os.path.basename(file_path)} - {type(exc).__name__}: {str(exc)}"
+         }
+     return {
+         'success': True, 'sn': serial, 'headers': column_names, 'rows': records,
+         'file_fail_count': fails, 'filename': base_name,
+     }
+
+ def _find_status_index(self, headers):
+     """Return the index of the first header that names a status/result column, or None.
+
+     A header qualifies when, stripped and lower-cased, it equals 'status',
+     'result' or 'test status' or merely contains 'status' or 'result'.
+     """
+     exact = ('status', 'result', 'test status')
+     normalised = (str(h).strip().lower() for h in headers or ())
+     hits = (i for i, name in enumerate(normalised) if name in exact or re.search(r'status|result', name, flags=re.I))
+     return next(hits, None)
+
+ def _process_table(self, table, sn, filename, html_content):
+     """Convert the report's result table into (headers, rows, file_fail_count).
+
+     Adds 'Test Name New' (cleaned test name) after the test-name column and
+     appends 'Test Time' (time base plus the row's elapsed seconds) and
+     'Source File name'.  Headers and rows are both cut to the first 11 table
+     columns; file_fail_count counts rows whose status cell contains 'FAIL'.
+     """
+     max_columns = 11
+     # header row: grey background first, then the first row with a <th>, then row 1
+     header_tr = table.find('tr', bgcolor='#eeeeee')
+     if not header_tr:
+         for tr in table.find_all('tr'):
+             if tr.find('th'):
+                 header_tr = tr
+                 break
+     if not header_tr:
+         header_tr = table.find('tr')
+
+     headers = [th.get_text(strip=True) for th in header_tr.find_all(['th', 'td'])]
+     headers = headers[:max_columns]
+
+     # Resolve the status column while headers still mirror the raw cells: the
+     # index is applied to each row before 'Test Name New' is inserted into it.
+     status_col_idx = self._find_status_index(headers)
+
+     try:
+         test_name_idx = headers.index('Test Name')
+     except ValueError:
+         # no exact 'Test Name' header: fuzzy match, defaulting to column 1
+         test_name_idx = next((i for i, h in enumerate(headers) if re.search(r'test\s*name', h, flags=re.I)), 1)
+     headers.insert(test_name_idx + 1, 'Test Name New')
+     headers.append('Test Time')
+     headers.append('Source File name')
+
+     # time base (epoch seconds) that each row's elapsed time is added to
+     base_timestamp = None
+     global_elapsed_accumulator = 0.0
+     elapsed_append = 0
+     file_fail_count = 0  # FAIL rows seen in this file
+
+     # initial time base: the 'Start Time:' value in the report header
+     start_time_match = re.search(r"Start Time:\s*(.+?)(?:\s*<|$)", html_content, re.IGNORECASE)
+     if start_time_match:
+         start_time_str = start_time_match.group(1).strip()
+         start_time_str = re.sub(r'<[^>]+>', '', start_time_str).strip()
+         try:
+             dt = datetime.strptime(start_time_str, "%A, %B %d, %Y %I:%M:%S %p")
+             base_timestamp = dt.timestamp()
+             global_elapsed_accumulator = base_timestamp
+             print(f"{Fore.GREEN}✔ 使用Start Time作为时间基准: {start_time_str} -> {base_timestamp}")
+         except Exception as e:
+             print(f"{Fore.RED}⚠ 解析Start Time失败: {start_time_str} - {e}")
+             try:
+                 dt = datetime.strptime(start_time_str.split(', ', 1)[1], "%B %d, %Y %I:%M:%S %p")
+                 base_timestamp = dt.timestamp()
+                 global_elapsed_accumulator = base_timestamp
+                 print(f"{Fore.GREEN}✔ 使用简化格式Start Time作为时间基准: {start_time_str} -> {base_timestamp}")
+             except Exception as e2:
+                 print(f"{Fore.RED}⚠ 二次解析Start Time失败: {start_time_str} - {e2}")
+
+     rows = []
+     # data starts after the first two rows (after one row for very short tables)
+     all_trs = table.find_all('tr')
+     start_index = 2 if len(all_trs) >= 3 else 1
+
+     for row in all_trs[start_index:]:
+         cols = [td.get_text(strip=True) for td in row.find_all(['td', 'th'])][:max_columns]
+         # skip rows too short to hold a test name (avoids an IndexError below)
+         if len(cols) < 2 or len(cols) <= test_name_idx:
+             continue
+
+         original_test_name = cols[test_name_idx].strip()
+         if not original_test_name:
+             # was self.currentFilename, an attribute that is never assigned
+             print(f"\rFile{Fore.RED}:{filename} 存在空的 Test name!")
+
+         # count FAIL rows
+         if status_col_idx is not None and len(cols) > status_col_idx:
+             status_val = cols[status_col_idx].strip().upper()
+             if 'FAIL' in status_val:
+                 file_fail_count += 1
+
+         elapsed_time_str = cols[9].strip() if len(cols) > 9 else "0"
+
+         # a Test_Time row re-bases the clock: measured time minus elapsed time
+         if original_test_name == "Test_Time":
+             measurement_str = cols[7] if len(cols) > 7 else ""
+             try:
+                 dt = datetime.strptime(measurement_str, "%m/%d/%Y %I:%M:%S %p")
+                 timestamp = dt.timestamp()
+                 base_timestamp = timestamp - float(elapsed_time_str)
+                 global_elapsed_accumulator = base_timestamp
+                 print(f"{Fore.GREEN}✔ 更新时间基准为Test_Time: {measurement_str} -> {base_timestamp}")
+             except Exception as e:
+                 # keep the previous time base when Test_Time cannot be parsed
+                 print(f"{Fore.RED}⚠ 解析Test_Time失败: {measurement_str} - {e}")
+
+         # absolute time of this row; a non-numeric elapsed value uses the base
+         try:
+             elapsed_append = global_elapsed_accumulator + float(elapsed_time_str)
+         except ValueError:
+             elapsed_append = global_elapsed_accumulator
+
+         # add the cleaned test name, the timestamp and the source file name
+         cols.insert(test_name_idx + 1, self._clean_test_name(cols[test_name_idx]))
+         cols.append(elapsed_append)
+         cols.append(filename)
+         rows.append(cols)
+
+     return headers, rows, file_fail_count
+
+ def _store_data(self, sn, headers, rows):
+     """Append rows to the SN's entry in self.sn_data_map (called from the main thread)."""
+     # the first file seen for an SN fixes that SN's headers
+     entry = self.sn_data_map.setdefault(sn, {'headers': headers, 'data': []})
+     entry['data'].extend(rows)
+
+
+class ExcelReportGenerator:
+ """Excel报告生成器(并行版)"""
+
+ def __init__(self, output_dir, max_workers=None):
+     """Remember output_dir; a falsy max_workers selects the EXCEL_WORKERS/CPU-based default."""
+     self.output_dir, self.progress = output_dir, ProgressTracker()
+     self.max_workers = max_workers if max_workers else self._calc_max_workers(env_var="EXCEL_WORKERS")
+
+ def generate_reports(self, sn_data_map):
+ """Generate one Excel report per SN in worker threads and print a result summary."""
+ total_reports = len(sn_data_map.items())
+ errors = []
+ successes = []
+
+ print(f"\n{Fore.CYAN}▶ 开始并行生成Excel报告(共{total_reports}个),线程并发数: {self.max_workers}")
+ self.progress.begin(total_reports)
+
+ futures = []
+ with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+ for sn, data_info in sn_data_map.items():
+ futures.append(executor.submit(self._generate_one_report, sn, data_info))
+
+ for future in as_completed(futures):
+ try:
+ res = future.result()
+ except Exception as e:
+ res = {'success': False, 'error': f"未知异常: {type(e).__name__}: {e}"}
+
+ # Progress is updated and printed from the submitting thread
+ self.progress.update(res.get('success', False), prefix='Excel生成: ')
+
+ if res.get('success'):
+ successes.append(res)
+ else:
+ errors.append(res.get('error'))
+
+ self.progress.end(prefix='Excel生成')
+
+ # Summarise the outcome: at most 50 successes and 50 failures are listed
+ for s in successes[:50]:
+ print(f"{Fore.GREEN}✓ 生成成功 | 文件: {os.path.basename(s['output_file'])} | SN: {s['sn']} | 记录数: {s['records']} | 来源HTML: {s['source_files_count']} | FAIL总数: {s['fail_count']}")
+ if len(successes) > 50:
+ print(f"{Fore.GREEN}... 成功列表省略 {len(successes)-50} 条")
+
+ if errors:
+ print(f"\n{Fore.RED}✗ 以下报告生成失败(共 {len(errors)} 个):")
+ for err in errors[:50]:
+ print(f" - {err}")
+ if len(errors) > 50:
+ print(f" ... 其余 {len(errors) - 50} 条省略")
+
+ print(f"\n{Fore.CYAN}输出目录: {self.output_dir}")
+
+ def _calc_max_workers(self, env_var="EXCEL_WORKERS"):
+     """Return the thread count: a digit-only env_var value (at least 1), else 2x CPUs clamped to 2..16."""
+     cores = os.cpu_count() or 2
+     # Excel writing is mostly I/O: moderate concurrency, capped to avoid disk thrashing
+     automatic = max(2, min(16, cores * 2))
+     requested = os.getenv(env_var)
+     use_override = bool(requested) and requested.isdigit()
+     return max(1, int(requested)) if use_override else automatic
+
+ def _generate_one_report(self, sn, data_info):
+     """Worker: write one SN's workbook and return a result dict (no printing here).
+
+     The file is <sn>_Report.xlsx, or <sn>_Report_Fail-item-<n>.xlsx when n
+     rows have a status containing 'FAIL'.  On any exception the dict holds
+     'success': False and an 'error' message instead of the report fields.
+     """
+     try:
+         stem = f"{sn}_Report"
+         target = os.path.join(self.output_dir, f"{stem}.xlsx")
+
+         frame = self._prepare_dataframe(data_info)
+
+         # FAIL rows: status text (stripped, upper-cased) that contains 'FAIL'
+         status_col = self._detect_status_column(frame)
+         if not status_col:
+             failed = pd.DataFrame(columns=frame.columns)
+             fail_total = 0
+         else:
+             is_fail = frame[status_col].astype(str).str.strip().str.upper().str.contains('FAIL')
+             failed = frame[is_fail]
+             fail_total = int(is_fail.sum())
+
+         # a report with FAIL rows carries their count in its file name
+         if fail_total > 0:
+             target = os.path.join(self.output_dir, f"{stem}_Fail-item-{fail_total}.xlsx")
+
+         html_count = data_info.get('report_stats', {}).get('source_files_count', 0)
+         self._save_excel(frame, failed, target, sn, html_count, fail_total)
+
+         return {
+             'success': True,
+             'sn': sn,
+             'output_file': target,
+             'records': len(frame),
+             'source_files_count': html_count,
+             'fail_count': fail_total,
+         }
+     except Exception as exc:
+         return {
+             'success': False,
+             'error': f"SN: {sn} - {type(exc).__name__}: {str(exc)}"
+         }
+
+ def _detect_status_column(self, df):
+     """Return the first column label that names a status/result column, else None.
+
+     A label matches when it contains the whole word 'status' or 'result' or,
+     stripped and lower-cased, equals 'status', 'result' or 'test status'.
+     """
+     wanted = ('status', 'result', 'test status')
+     return next((c for c in df.columns if re.search(r'\b(status|result)\b', str(c), flags=re.I) or str(c).strip().lower() in wanted), None)
+
+ def _save_excel(self, df_all, df_fail, output_file, sn, source_files_count, fail_count):
+     """Write one workbook holding the 'Report Stats', 'All Tests' and 'FAIL list' sheets.
+
+     Raises:
+         RuntimeError: wrapping any exception raised while writing the file.
+     """
+     try:
+         with pd.ExcelWriter(output_file, engine='openpyxl') as xlsx:
+             stats_frame = pd.DataFrame({
+                 '统计项': ['SN号', '来源HTML文件数', '总FAIL数量', '生成时间'],
+                 '值': [sn, source_files_count, fail_count, datetime.now().strftime("%Y-%m-%d %H:%M:%S")]
+             })
+             sheets = (('Report Stats', stats_frame), ('All Tests', df_all), ('FAIL list', df_fail))
+             for title, frame in sheets:
+                 frame.to_excel(xlsx, sheet_name=title, index=False)
+
+             # widen the two columns of the statistics sheet
+             book = xlsx.book
+             if 'Report Stats' in book.sheetnames:
+                 stats_sheet = book['Report Stats']
+                 for letter, width in (('A', 20), ('B', 30)):
+                     stats_sheet.column_dimensions[letter].width = width
+     except Exception as exc:
+         raise RuntimeError(f"Excel文件保存失败: {str(exc)}")
+
+ def _prepare_dataframe(self, data_info):
+     """Build a DataFrame from data_info['data'] with data_info['headers'] as columns, order kept."""
+     rows, column_names = data_info['data'], data_info['headers']
+     return pd.DataFrame(rows, columns=column_names)
+
+
+class ReportProcessor:
+    """Console front end of the HTML-report pipeline.
+
+    Reads a folder from the console, parses its HTML reports and writes the Excel reports.
+    """
+
+    def __init__(self):
+        """Nothing to set up; each step receives what it needs as arguments."""
+
+    def process_reports(self):
+        """Run the pipeline once for the directory entered on the console."""
+        source_dir = self._get_directory_from_console()
+        if source_dir:
+            # the output folder is created inside the source folder before parsing starts
+            output_dir = self._create_output_dir(source_dir)
+            parsed = self._process_html_files(source_dir)
+            self._generate_excel_reports(output_dir, parsed)
+            return
+        print(f"{Fore.RED}❌ 未选择目录,程序退出")
+
+    def _get_directory_from_console(self):
+        """Prompt until the user enters the path of an existing directory and return it.
+
+        Surrounding whitespace and quote characters are stripped from the input.
+        """
+        while True:
+            print(f"\n{Fore.CYAN}=== HTML报告处理程序 ===")
+            print(f"{Fore.WHITE}请输入包含HTML文件的目录路径:")
+            entered = input("> ").strip()
+            if not entered:
+                print(f"{Fore.YELLOW}⚠ 路径不能为空,请重新输入")
+                continue
+
+            # drop quote characters wrapping the path
+            candidate = entered.strip('"\'')
+            if os.path.isdir(candidate):
+                return candidate
+            if os.path.exists(candidate):
+                print(f"{Fore.RED}❌ 请输入目录路径,而不是文件路径")
+            else:
+                print(f"{Fore.RED}❌ 路径不存在,请重新输入")
+
+    def _create_output_dir(self, source_dir):
+        """Create a timestamped 'Html文件分析_<YYYYmmddHHMMSS>' folder inside source_dir and return its path."""
+        stamp = datetime.now().strftime('%Y%m%d%H%M%S')
+        output_dir = os.path.join(source_dir, f"Html文件分析_{stamp}")
+        os.makedirs(output_dir, exist_ok=True)
+        print(f"{Fore.GREEN}✔ 输出目录创建成功: {output_dir}")
+        return output_dir
+
+    def _process_html_files(self, source_dir):
+        """Parse every HTML report under source_dir and return the per-SN data map."""
+        return HTMLReportProcessor().process_files(source_dir)
+
+    def _generate_excel_reports(self, output_dir, data):
+        """Write one Excel report per SN in data into output_dir."""
+        ExcelReportGenerator(output_dir).generate_reports(data)
+
+
+if __name__ == "__main__":
+ try:
+ processor = ReportProcessor()
+ processor.process_reports()
+
+ # Report completion; the pause that kept the console open is disabled below
+ print(f"\n{Fore.CYAN}=== 程序执行完成 ===")
+ # Disabled pause: an input() prompt used to wait for Enter here
+
+ except KeyboardInterrupt:
+ print(f"\n{Fore.YELLOW}⚠ 用户中断程序")
+ except Exception as e:
+ print(f"\n{Fore.RED}❌ 程序执行出错: {type(e).__name__}: {str(e)}")
+ import traceback
+
+ traceback.print_exc()
+ input("traceback 按回车键退出...")
\ No newline at end of file
diff --git a/htmlProcess/htmlReportProcess_cmd_p/htmlReportProcess_cmd_pV1.py b/htmlProcess/htmlReportProcess_cmd_p/htmlReportProcess_cmd_pV1.py
new file mode 100644
index 0000000..8c3ef7a
--- /dev/null
+++ b/htmlProcess/htmlReportProcess_cmd_p/htmlReportProcess_cmd_pV1.py
@@ -0,0 +1,1172 @@
+import pandas as pd
+from bs4 import BeautifulSoup
+import os
+import re
+import sys
+from datetime import datetime
+import pytz # 需要安装 pytz 库
+
+from colorama import Fore, Style, init
+import multiprocessing as mp
+from concurrent.futures import ProcessPoolExecutor
+import threading
+from collections import defaultdict
+
+init(autoreset=True)
+
+
class ThreadSafeProgressTracker:
    """Lock-protected counters with a single-line console progress display.

    The counters are guarded by a ``threading.Lock``. The status line is
    redrawn whenever the processed count is a multiple of ten or equals the
    total.
    """

    def __init__(self, total_files):
        """Start tracking ``total_files`` items; the clock starts now."""
        self.total = total_files
        self.processed = 0
        self.success_count = 0
        self.fail_count = 0
        self.start_time = datetime.now()
        self.lock = threading.Lock()

    def update(self, success=True, infor='', count=1):
        """Record ``count`` finished items and refresh the display if due.

        Args:
            success: Whether the items count as successes or failures.
            infor: Free-form text appended to the status line.
            count: Number of items finished by this call.
        """
        with self.lock:
            self.processed += count
            if success:
                self.success_count += count
            else:
                self.fail_count += count

            # Redraw on every tenth item and on the final one.
            at_end = self.processed == self.total
            if self.processed % 10 == 0 or at_end:
                self._display_progress(infor)

    def _display_progress(self, infor=''):
        """Overwrite the current console line with the progress summary."""
        time_used = datetime.now() - self.start_time
        elapsed_seconds = time_used.total_seconds()

        percent = self.processed / self.total * 100 if self.total > 0 else 0
        speed = self.processed / elapsed_seconds if elapsed_seconds > 0 else 0

        # Twenty-slot bar: one block per 5 %.
        progress_bar = f"[{'▉' * int(percent / 5)}{' ' * (20 - int(percent / 5))}]"

        sys_info = (
            f"进度: {self.processed}/{self.total}",
            f"{percent:.1f}% {progress_bar}",
            f"成功: {self.success_count}",
            f"失败: {self.fail_count}",
            f"速度: {speed:.1f} 文件/秒",
            f"耗时: {self._format_timedelta(time_used)}",
            f"Infor:{infor}",
        )

        # ESC[2K clears the line, \r returns the cursor to column 0.
        print('\x1b[2K\r' + ' | '.join(sys_info), end='', flush=True)

    def finish(self, phase_name="处理"):
        """Print the closing summary (elapsed seconds and both counters)."""
        print(f"\n{Fore.GREEN}✔ {phase_name}完成! 总耗时: {(datetime.now() - self.start_time).total_seconds():.1f}秒")
        print(f"{Fore.CYAN}成功: {self.success_count}, 失败: {self.fail_count}")

    def _format_timedelta(self, delta):
        """Render a ``timedelta`` as ``HH:MM:SS``."""
        seconds = delta.total_seconds()
        return f"{int(seconds // 3600):02}:{int((seconds % 3600) // 60):02}:{int(seconds % 60):02}"
+
+
class HTMLFileProcessor:
    """Stateless helpers that parse one HTML test report into table rows.

    All methods are static. ``process_single_file`` is the entry point that
    ``ParallelHTMLReportProcessor.process_files`` submits to a process pool.
    """

    # Only this many leading columns of the report table are kept. Header and
    # data rows share the cap so every row lines up with the header.
    _MAX_SOURCE_COLUMNS = 11

    # Serial number: "F" + 15 alphanumerics, optionally 5+ more.
    _SN_PATTERN = re.compile(r'\b(F[A-Z0-9]{15}(?:[A-Z0-9]{5,})?)\b', flags=re.I)
    # Cell number: digits between the last "-" and the extension. Accepts
    # .html and .htm in any case, matching what the directory scan collects.
    _CELL_PATTERN = re.compile(r'-(\d+)\.html?$', flags=re.I)
    # Cycle time embedded in the file name, e.g. "(2025-11-21 13-23-16)".
    _CYCLE_TIME_PATTERN = re.compile(r'\((\d{4}-\d{2}-\d{2}\s+\d{2}-\d{2}-\d{2})\)')
    _NAME_CLEANUP_RULES = (
        (re.compile(r'^Round\d+_\d+_'), ''),  # leading "Round<n>_<m>_"
        (re.compile(r'_loop\d+$'), ''),       # trailing "_loop<n>"
        (re.compile(r'_Round\d+$'), ''),      # trailing "_Round<n>"
    )

    @staticmethod
    def _clean_test_name(raw_name):
        """Strip the Round/loop decorations from a raw test name."""
        result = raw_name
        for pattern, replacement in HTMLFileProcessor._NAME_CLEANUP_RULES:
            result = pattern.sub(replacement, result)
        return result

    @staticmethod
    def _extract_test_cycle_time(filename):
        """Return the cycle time found in ``filename`` as ``YYYY-MM-DD HH:MM:SS``.

        Returns ``"UNKNOWN_TIME"`` when the name carries no
        ``(YYYY-MM-DD HH-MM-SS)`` group or is not a string.
        """
        if not isinstance(filename, str):
            return "UNKNOWN_TIME"
        time_match = HTMLFileProcessor._CYCLE_TIME_PATTERN.search(filename)
        if not time_match:
            return "UNKNOWN_TIME"

        time_str = time_match.group(1)
        try:
            parsed = datetime.strptime(time_str, "%Y-%m-%d %H-%M-%S")
        except ValueError:
            # Digits matched but are not a real date/time (e.g. month 13):
            # keep the text and only swap the separators of the time part.
            date_part, time_part = time_str.split()
            return f"{date_part} {time_part.replace('-', ':')}"
        return parsed.strftime("%Y-%m-%d %H:%M:%S")

    @staticmethod
    def _extract_sn_and_cell(soup, filename):
        """Return ``(sn, cell)`` for one report.

        The serial number is looked up in the "Serial Number:" ``<h3>``, then
        in the whole page text, then in the file name; the first hit wins.
        The cell number always comes from the file name.

        Args:
            soup: Parsed document, or ``None`` to use the file name only.
            filename: Base name of the report file.

        Returns:
            ``(sn, cell)``; ``"UNKNOWN_SN"`` / ``"UNKNOWN_CELL"`` stand in for
            missing values and ``("ERROR_SN", "ERROR_CELL")`` is returned if
            extraction itself raises.
        """
        try:
            pattern = HTMLFileProcessor._SN_PATTERN
            sn_match = None
            if soup is not None:
                sn_tag = soup.find('h3', string=re.compile(r'Serial Number:', re.I))
                if sn_tag:
                    sn_match = pattern.search(sn_tag.get_text())
                if sn_match is None:
                    sn_match = pattern.search(soup.get_text(" ", strip=True))
            if sn_match is None:
                # Last resort so the report is still grouped under its SN.
                sn_match = pattern.search(filename)
            sn = sn_match.group(1) if sn_match else "UNKNOWN_SN"

            cell_match = HTMLFileProcessor._CELL_PATTERN.search(filename)
            cell = cell_match.group(1) if cell_match else "UNKNOWN_CELL"

            return sn, cell

        except Exception as e:
            print(f"{Fore.RED}⚠ SN/CELL提取失败: {filename} - {str(e)}")
            return "ERROR_SN", "ERROR_CELL"

    @staticmethod
    def _find_status_index(headers):
        """Return the index of the first header containing "status" or "result".

        Matching is case-insensitive; ``None`` is returned when there is no
        such header or ``headers`` is empty.
        """
        if not headers:
            return None
        for idx, h in enumerate(headers):
            h_norm = str(h).strip().lower()
            if 'status' in h_norm or 'result' in h_norm:
                return idx
        return None

    @staticmethod
    def _header_cells(table):
        """Return the header texts of ``table``, capped at the column limit.

        The header row is the first ``<tr bgcolor="#eeeeee">``, else the first
        row containing a ``<th>``, else the first row.
        """
        header_tr = table.find('tr', bgcolor='#eeeeee')
        if not header_tr:
            header_tr = next((tr for tr in table.find_all('tr') if tr.find('th')), None)
        if not header_tr:
            header_tr = table.find('tr')
        texts = [th.get_text(strip=True) for th in header_tr.find_all(['th', 'td'])]
        return texts[:HTMLFileProcessor._MAX_SOURCE_COLUMNS]

    @staticmethod
    def _data_rows(table):
        """Return the ``<tr>`` elements treated as data rows.

        Two leading rows are skipped when the table has at least three rows,
        otherwise one.
        """
        all_trs = table.find_all('tr')
        start_index = 2 if len(all_trs) >= 3 else 1
        return all_trs[start_index:]

    @staticmethod
    def _count_fail_rows(table, status_col_idx):
        """Count data rows whose status cell contains "FAIL" (any case).

        Returns 0 when ``status_col_idx`` is ``None``.
        """
        if status_col_idx is None:
            return 0

        fail_count = 0
        for row in HTMLFileProcessor._data_rows(table):
            cols = [td.get_text(strip=True) for td in row.find_all(['td', 'th'])]
            if len(cols) > status_col_idx and 'FAIL' in cols[status_col_idx].strip().upper():
                fail_count += 1
        return fail_count

    @staticmethod
    def process_single_file(file_path):
        """Parse one HTML report and return its rows plus metadata.

        Args:
            file_path: Path of the report; it is read as UTF-8.

        Returns:
            On success a dict with ``success=True`` and the keys ``sn``,
            ``cell``, ``test_cycle_time``, ``filename``, ``headers``, ``rows``
            and ``fail_count``. On any failure a dict with ``success=False``,
            ``error`` and ``file``; exceptions are never propagated.
        """
        # Resolved before the try block so the error result can always name
        # the file, even when opening it is what failed.
        filename = os.path.basename(file_path)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
            soup = BeautifulSoup(html_content, 'html.parser')

            sn, cell = HTMLFileProcessor._extract_sn_and_cell(soup, filename)
            test_cycle_time = HTMLFileProcessor._extract_test_cycle_time(filename)

            if sn is None:
                sn = "UNKNOWN_SN"
            if cell is None:
                cell = "UNKNOWN_CELL"
            if test_cycle_time is None:
                test_cycle_time = "UNKNOWN_TIME"

            table = soup.find('table', border=1) or soup.find('table')
            if not table:
                return {"success": False, "error": "未找到有效数据表格", "file": filename}

            # The status column is located on the untouched header, and FAIL
            # rows are counted before any column is inserted.
            original_headers = HTMLFileProcessor._header_cells(table)
            status_col_idx = HTMLFileProcessor._find_status_index(original_headers)
            file_fail_count = HTMLFileProcessor._count_fail_rows(table, status_col_idx)

            headers, rows = HTMLFileProcessor._process_table_data(
                table, html_content, filename, sn, cell, test_cycle_time, status_col_idx)

            return {
                "success": True,
                "sn": sn,
                "cell": cell,
                "test_cycle_time": test_cycle_time,
                "filename": filename,
                "headers": headers,
                "rows": rows,
                "fail_count": file_fail_count
            }

        except Exception as e:
            return {
                "success": False,
                "error": f"{type(e).__name__}: {str(e)}",
                "file": filename
            }

    @staticmethod
    def _process_table_data(table, html_content, filename, sn, cell, test_cycle_time, status_col_idx):
        """Build the output header and rows for one report table.

        Added columns: ``SN``, ``Cell``, ``TestCycleTime`` in front,
        ``Test Name New`` right after the test-name column, and ``Test Time``
        and ``Source File name`` at the end.

        Args:
            table: The report's ``<table>`` element.
            html_content: Raw HTML, used to find the "Start Time:" line.
            filename: Report file name, stored in every row.
            sn, cell, test_cycle_time: Values for the three leading columns.
            status_col_idx: Accepted for interface compatibility; not used.

        Returns:
            ``(headers, rows)`` where ``rows`` is a list of lists.
        """
        max_cols = HTMLFileProcessor._MAX_SOURCE_COLUMNS

        headers = HTMLFileProcessor._header_cells(table)
        headers[:0] = ['SN', 'Cell', 'TestCycleTime']

        # Index of the test-name column after the three inserts; 3 is the
        # first original column and serves as the fallback.
        test_name_idx = next(
            (i for i, h in enumerate(headers) if re.search(r'test\s*name', h, flags=re.I)), 3)
        if test_name_idx < len(headers):
            headers.insert(test_name_idx + 1, 'Test Name New')
        headers.append('Test Time')
        headers.append('Source File name')

        base_timestamp = HTMLFileProcessor._extract_base_timestamp(html_content, filename)
        global_elapsed_accumulator = base_timestamp if base_timestamp else 0.0

        rows = []
        for row in HTMLFileProcessor._data_rows(table):
            cols = [td.get_text(strip=True) for td in row.find_all(['td', 'th'])]
            if len(cols) < 2:
                continue
            # Same cap as the header, so wide rows cannot outgrow it.
            cols = cols[:max_cols]

            original_test_name = cols[1].strip()

            elapsed_time_str = cols[9].strip() if len(cols) > 9 else "0"
            elapsed_append, global_elapsed_accumulator = HTMLFileProcessor._calculate_timestamp(
                cols, original_test_name, elapsed_time_str, global_elapsed_accumulator)

            if elapsed_append is not None and base_timestamp is None:
                # No "Start Time:" in the report: re-anchor the running base
                # on this row. A non-numeric elapsed cell leaves it unchanged
                # instead of failing the whole file.
                try:
                    global_elapsed_accumulator = elapsed_append - float(elapsed_time_str)
                except ValueError:
                    pass

            cols[:0] = [sn, cell, test_cycle_time]

            adjusted_test_name_idx = test_name_idx + 1
            if adjusted_test_name_idx + 1 < len(cols):
                cols.insert(adjusted_test_name_idx,
                            HTMLFileProcessor._clean_test_name(cols[test_name_idx]))
            cols.append(elapsed_append if elapsed_append is not None else 0.0)
            cols.append(filename)
            rows.append(cols)

        return headers, rows

    @staticmethod
    def _extract_base_timestamp(html_content, filename):
        """Return the report's "Start Time:" as a POSIX timestamp, or ``None``.

        The text is parsed as ``"%A, %B %d, %Y %I:%M:%S %p"``; if that fails
        the part after the first ``", "`` is tried without the weekday. Both
        forms are interpreted as UTC so they yield comparable values.
        ``filename`` is accepted for interface compatibility and not used.
        """
        start_time_match = re.search(r"Start Time:\s*(.+?)(?:\s*<|$)", html_content, re.IGNORECASE)
        if not start_time_match:
            return None

        start_time_str = re.sub(r'<[^>]+>', '', start_time_match.group(1)).strip()
        try:
            dt = datetime.strptime(start_time_str, "%A, %B %d, %Y %I:%M:%S %p")
        except ValueError:
            _, separator, remainder = start_time_str.partition(', ')
            if not separator:
                return None
            try:
                dt = datetime.strptime(remainder, "%B %d, %Y %I:%M:%S %p")
            except ValueError:
                return None
        return pytz.timezone('UTC').localize(dt).timestamp()

    @staticmethod
    def _calculate_timestamp(cols, test_name, elapsed_time_str, base_accumulator):
        """Compute one row's timestamp and the base for the following rows.

        A ``Test_Time`` row whose measurement cell (index 7) parses as
        ``"%m/%d/%Y %I:%M:%S %p"`` resets the base to that instant (as UTC).
        Any other row gets ``base + elapsed``.

        Returns:
            ``(row_timestamp, base_accumulator)``. If the elapsed text is not
            numeric the row timestamp equals the base.
        """
        if test_name == "Test_Time" and len(cols) > 7:
            try:
                dt = datetime.strptime(cols[7], "%m/%d/%Y %I:%M:%S %p")
            except (ValueError, TypeError):
                pass
            else:
                base_accumulator = pytz.timezone('UTC').localize(dt).timestamp()
                return base_accumulator, base_accumulator

        try:
            return base_accumulator + float(elapsed_time_str), base_accumulator
        except ValueError:
            return base_accumulator, base_accumulator
+
+
class ExcelReportWorker:
    """Static helpers that write the Excel workbook for one serial number.

    ``generate_single_report`` is the entry point submitted to a process pool
    by ``ParallelExcelReportGenerator.generate_reports``.
    """

    _EXCEL_MAX_ROWS = 1_048_576       # worksheet row limit, header included
    _EXCEL_MAX_CELL_CHARS = 32767     # longest string a cell can hold

    @staticmethod
    def _cell_sort_key(cell):
        """Sort key: numeric cells by value first, everything else by text."""
        text = str(cell)
        return (0, int(text), '') if text.isdigit() else (1, 0, text)

    @staticmethod
    def _normalize_cells(cells):
        """Return ``cells`` as a sorted list.

        ``None`` becomes ``["UNKNOWN_CELL"]`` and a bare string becomes a
        one-item list, so joining never splits a string into characters.
        """
        if cells is None:
            return ["UNKNOWN_CELL"]
        if isinstance(cells, str):
            return [cells]
        return sorted(cells, key=ExcelReportWorker._cell_sort_key)

    @staticmethod
    def generate_single_report(report_data, output_dir):
        """Write the workbook for one report task.

        Args:
            report_data: Dict with ``sn``, ``data_info`` and
                ``source_files_count`` (required) plus optional ``cell``,
                ``all_cells`` and ``test_cycle_time``.
            output_dir: Folder that receives the ``.xlsx`` file.

        Returns:
            A result dict. ``success=True`` adds ``output_file``,
            ``record_count``, ``source_files_count`` and ``fail_count``;
            ``success=False`` adds ``error``. Exceptions are never propagated.
        """
        # Defaults are bound before the try block so the error result can be
        # built no matter which lookup fails.
        sn = "UNKNOWN_SN"
        cell = "UNKNOWN_CELL"
        all_cells = ["UNKNOWN_CELL"]
        test_cycle_time = "UNKNOWN_TIME"

        def _failure(message):
            return {
                "success": False,
                "sn": sn,
                "cell": cell,
                "all_cells": all_cells,
                "test_cycle_time": test_cycle_time,
                "error": message
            }

        try:
            raw_sn = report_data["sn"]
            if raw_sn is not None:
                sn = raw_sn
            raw_cell = report_data.get("cell", "UNKNOWN_CELL")
            if raw_cell is not None:
                cell = raw_cell
            all_cells = ExcelReportWorker._normalize_cells(report_data.get("all_cells"))
            raw_time = report_data.get("test_cycle_time", "UNKNOWN_TIME")
            if raw_time is not None:
                test_cycle_time = raw_time
            data_info = report_data["data_info"]
            source_files_count = report_data["source_files_count"]

            # File name carries the SN, every cell number and the file count.
            cell_list_display = ','.join(str(c) for c in all_cells)
            base_name = f"{sn}_C-{cell_list_display}_R-{source_files_count}"
            output_file = os.path.join(output_dir, f"{base_name}.xlsx")

            if not data_info or 'headers' not in data_info or 'data' not in data_info:
                return _failure("数据格式无效或为空")

            df_all = ExcelReportWorker._prepare_dataframe(data_info)
            if df_all.empty:
                return _failure("DataFrame为空,无数据可生成")

            status_col = ExcelReportWorker._detect_status_column(df_all)
            fail_count = data_info.get('report_stats', {}).get('fail_count', 0)
            time_stats = ExcelReportWorker._extract_time_statistics(df_all, data_info)

            if status_col is not None:
                fail_mask = df_all[status_col].astype(str).str.strip().str.upper().str.contains('FAIL')
                df_fail = df_all[fail_mask]
                if fail_count == 0:
                    # The per-file count found nothing: re-check on the
                    # merged data.
                    fail_count = int(fail_mask.sum())
            else:
                df_fail = pd.DataFrame(columns=df_all.columns)

            if fail_count > 0:
                output_file = os.path.join(output_dir, f"{base_name}_Fitem-{fail_count}.xlsx")

            report_stats = data_info.get('report_stats', {})
            source_files_count = report_stats.get('source_files_count', 0)
            cell_info = report_stats.get('cell_info', {})

            ExcelReportWorker._save_excel(df_all, df_fail, output_file, sn, cell, test_cycle_time,
                                          source_files_count, fail_count, cell_info, time_stats)

            return {
                "success": True,
                "sn": sn,
                "cell": cell,
                "all_cells": all_cells,
                "test_cycle_time": test_cycle_time,
                "output_file": output_file,
                "record_count": len(df_all),
                "source_files_count": source_files_count,
                "fail_count": fail_count
            }

        except Exception as e:
            return _failure(f"{type(e).__name__}: {str(e)}")

    @staticmethod
    def _extract_time_statistics(df_all, data_info):
        """Summarise the ``TestCycleTime`` column of ``df_all``.

        Returns:
            Dict with ``all_times`` (sorted), ``time_count``,
            ``time_distribution`` and ``records_by_time`` (rows per value) and
            ``time_range`` (``start``/``end``/``duration_hours``, filled only
            when at least one value parses). Times listed under
            ``report_stats.time_info.all_times`` are merged into ``all_times``.
        """
        time_stats = {
            'all_times': [],
            'time_count': 0,
            'time_distribution': {},
            'records_by_time': {},
            'time_range': {}
        }

        if 'TestCycleTime' in df_all.columns:
            column = df_all['TestCycleTime']
            time_values = column.dropna().unique()
            time_stats['all_times'] = sorted(time_values)
            time_stats['time_count'] = len(time_values)

            time_counts = column.value_counts().to_dict()
            time_stats['time_distribution'] = time_counts
            time_stats['records_by_time'] = {t: time_counts.get(t, 0) for t in time_values}

            parsed_times = []
            for time_str in time_values:
                if time_str == "UNKNOWN_TIME":
                    continue
                try:
                    parsed_times.append(datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S"))
                except (ValueError, TypeError):
                    continue  # unparseable values do not take part in the range

            if parsed_times:
                min_time = min(parsed_times)
                max_time = max(parsed_times)
                time_stats['time_range'] = {
                    'start': min_time.strftime("%Y-%m-%d %H:%M:%S"),
                    'end': max_time.strftime("%Y-%m-%d %H:%M:%S"),
                    'duration_hours': round((max_time - min_time).total_seconds() / 3600, 2)
                }

        time_info = data_info.get('report_stats', {}).get('time_info', {})
        if time_info.get('all_times'):
            combined = set(time_stats['all_times']) | set(time_info.get('all_times', []))
            time_stats['all_times'] = sorted(combined)
            time_stats['time_count'] = len(combined)

        return time_stats

    @staticmethod
    def _detect_status_column(df):
        """Return the first column whose name holds "status" or "result" as a word, else ``None``."""
        for col in df.columns:
            col_str = str(col)
            if re.search(r'\b(status|result)\b', col_str, flags=re.I) or col_str.strip().lower() in (
                    'status', 'result', 'test status'):
                return col
        return None

    @staticmethod
    def _sanitize_df(df):
        """Return a copy whose object columns are Excel-safe strings.

        Missing values become ``''`` before the string conversion, so they do
        not turn into the text "None"/"nan". Strings are cut to the cell
        limit.
        """
        df = df.copy()
        for c in df.select_dtypes(include=['object']).columns:
            column = df[c]
            df[c] = (column.where(column.notna(), '')
                     .astype(str)
                     .str.slice(0, ExcelReportWorker._EXCEL_MAX_CELL_CHARS))
        return df

    @staticmethod
    def _save_excel(df_all, df_fail, output_file, sn, cell, test_cycle_time, source_files_count, fail_count,
                    cell_info=None, time_stats=None):
        """Write the workbook with xlsxwriter.

        Sheets: ``Report Stats``; ``All Tests`` (split into ``All Tests_<n>``
        when the data exceeds one sheet); ``FAIL list`` when there are failed
        rows; ``TestCycleTime Details`` when cycle times are known.

        Raises:
            RuntimeError: Wraps any error raised while writing.
        """
        try:
            df_all = ExcelReportWorker._sanitize_df(df_all)
            if df_fail is not None and not df_fail.empty:
                df_fail = ExcelReportWorker._sanitize_df(df_fail)

            with pd.ExcelWriter(
                    output_file,
                    engine='xlsxwriter',
                    # Store cell text verbatim: no automatic hyperlinks or
                    # formulas for strings starting with "http" or "=".
                    engine_kwargs={'options': {
                        'strings_to_urls': False,
                        'strings_to_formulas': False
                    }}
            ) as writer:
                stats_data = ExcelReportWorker._prepare_stats_data(
                    sn, cell, test_cycle_time, source_files_count, fail_count,
                    cell_info, time_stats
                )
                pd.DataFrame(stats_data).to_excel(writer, sheet_name='Report Stats', index=False)

                # One row of every sheet is taken by the header.
                rows_per_sheet = ExcelReportWorker._EXCEL_MAX_ROWS - 1
                total_rows = len(df_all)

                if total_rows == 0:
                    pd.DataFrame(columns=df_all.columns).to_excel(writer, sheet_name='All Tests', index=False)
                elif total_rows <= rows_per_sheet:
                    df_all.to_excel(writer, sheet_name='All Tests', index=False)
                else:
                    starts = range(0, total_rows, rows_per_sheet)
                    for sheet_no, start in enumerate(starts, start=1):
                        df_all.iloc[start:start + rows_per_sheet].to_excel(
                            writer, sheet_name=f'All Tests_{sheet_no}', index=False)

                if fail_count > 0 and df_fail is not None and not df_fail.empty:
                    df_fail.to_excel(writer, sheet_name='FAIL list', index=False)

                if time_stats and time_stats.get('all_times'):
                    ExcelReportWorker._create_time_details_sheet(writer, time_stats)

                # Column widths are set only on the small stats sheet.
                ws = writer.sheets.get('Report Stats')
                if ws is not None:
                    ws.set_column(0, 0, 25)
                    ws.set_column(1, 1, 40)

        except Exception as e:
            raise RuntimeError(f"Excel文件保存失败: {str(e)}") from e

    @staticmethod
    def _prepare_stats_data(sn, cell, test_cycle_time, source_files_count, fail_count,
                            cell_info, time_stats):
        """Build the two-column content of the ``Report Stats`` sheet.

        Returns:
            ``{'统计项': [...], '值': [...]}`` with parallel lists of labels
            and values.
        """
        entries = [
            ('SN号', sn),
            ('Cell编号', cell),
            ('主要测试周期时间', test_cycle_time),
            ('来源HTML文件数', source_files_count),
            ('总FAIL数量', fail_count),
            ('生成时间', datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
        ]

        if time_stats:
            entries.append(('测试周期时间总数', time_stats.get('time_count', 0)))

            # Only the first ten times are listed.
            all_times = time_stats.get('all_times', [])
            time_list_display = ', '.join(str(t) for t in all_times[:10])
            if len(all_times) > 10:
                time_list_display += f'...等{len(all_times)}个时间段'
            entries.append(('测试周期时间列表', time_list_display))

            time_range = time_stats.get('time_range', {})
            if time_range:
                entries.append(
                    ('测试时间范围', f"{time_range.get('start', '')} 至 {time_range.get('end', '')}"))
                entries.append(('测试持续时长(小时)', time_range.get('duration_hours', 0)))

            # Five cycle times with the most rows.
            time_distribution = time_stats.get('time_distribution', {})
            if time_distribution:
                top_times = sorted(time_distribution.items(), key=lambda x: x[1], reverse=True)[:5]
                dist_display = ', '.join([f"{time}({count})" for time, count in top_times])
                if len(time_distribution) > 5:
                    dist_display += f'...等{len(time_distribution)}个分布'
                entries.append(('记录数时间分布(前5)', dist_display))

        if cell_info:
            all_cells = cell_info.get('all_cells', [])
            cell_list_display = ','.join(str(c) for c in all_cells[:10])
            if len(all_cells) > 10:
                cell_list_display += f'...等{len(all_cells)}个'
            entries.append(('Cell数量', cell_info.get('cell_count', 0)))
            entries.append(('Cell列表', cell_list_display))

        return {
            '统计项': [label for label, _ in entries],
            '值': [value for _, value in entries]
        }

    @staticmethod
    def _create_time_details_sheet(writer, time_stats):
        """Write the ``TestCycleTime Details`` sheet (time, row count, share in %)."""
        all_times = time_stats.get('all_times', [])
        time_distribution = time_stats.get('time_distribution', {})
        records_by_time = time_stats.get('records_by_time', {})
        total_records = sum(time_distribution.values())

        time_details_data = []
        for time_val in all_times:
            record_count = records_by_time.get(time_val, time_distribution.get(time_val, 0))
            time_details_data.append({
                '测试周期时间': time_val,
                '记录数量': record_count,
                '占比(%)': round(record_count / total_records * 100, 2) if total_records else 0
            })

        if time_details_data:
            pd.DataFrame(time_details_data).to_excel(
                writer, sheet_name='TestCycleTime Details', index=False)

            ws = writer.sheets.get('TestCycleTime Details')
            if ws is not None:
                ws.set_column(0, 0, 25)
                ws.set_column(1, 1, 15)
                ws.set_column(2, 2, 15)

    @staticmethod
    def _prepare_dataframe(data_info):
        """Build a DataFrame from ``data_info['data']`` and ``data_info['headers']``.

        Returns an empty DataFrame when there are no rows. Columns keep the
        order of ``headers``.
        """
        if not data_info['data']:
            return pd.DataFrame()
        return pd.DataFrame(data_info['data'], columns=data_info['headers'])
+
+
class ParallelHTMLReportProcessor:
    """Parse every HTML report under a directory in a process pool.

    Results are grouped by serial number in ``sn_data_map``:
    ``{sn: {'headers': [...], 'data': [...], 'report_stats': {...}}}``.
    """

    # Largest worker count ProcessPoolExecutor accepts on Windows.
    _WINDOWS_MAX_WORKERS = 61

    def __init__(self):
        self.sn_data_map = {}
        self.sn_source_files = defaultdict(set)      # sn -> source file names
        self.sn_fail_counts = defaultdict(int)       # sn -> summed FAIL rows
        self.sn_file_counts = defaultdict(int)       # sn -> files seen in the pre-scan
        self.sn_cell_info = defaultdict(set)         # sn -> cell numbers
        self.sn_test_cycle_times = defaultdict(set)  # sn -> cycle times

    def process_files(self, source_dir, max_workers=None):
        """Parse all ``.html``/``.htm`` files below ``source_dir``.

        Args:
            source_dir: Directory that is walked recursively.
            max_workers: Pool size; defaults to the CPU count, limited by the
                number of files.

        Returns:
            ``self.sn_data_map``; empty when no file was found.
        """
        all_files = self._scan_files(source_dir)

        if not all_files:
            print(f"{Fore.YELLOW}⚠ 未找到HTML文件")
            return self.sn_data_map

        print(f"{Fore.YELLOW}⌛ 正在扫描文件分布...")
        self._collect_sn_distribution(all_files)
        self._display_file_distribution()

        if max_workers is None:
            max_workers = min(mp.cpu_count(), len(all_files))
            if sys.platform == "win32":
                max_workers = min(max_workers, self._WINDOWS_MAX_WORKERS)

        print(f"{Fore.CYAN}▶ 开始并行处理 {len(all_files)} 个文件 (使用 {max_workers} 个进程)")

        progress_tracker = ThreadSafeProgressTracker(len(all_files))

        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            future_to_file = {executor.submit(HTMLFileProcessor.process_single_file, file_path): file_path
                              for file_path in all_files}

            # Failure messages accumulate and are shown on the progress line.
            infor = ''
            for future, file_path in future_to_file.items():
                try:
                    result = future.result()
                except Exception as exc:
                    # A worker that raised or died is recorded as a failed
                    # file instead of aborting the whole run.
                    result = {
                        "success": False,
                        "error": f"{type(exc).__name__}: {exc}",
                        "file": os.path.basename(file_path)
                    }
                success = result["success"]

                if success:
                    self._store_result_data(result)
                else:
                    infor = infor + f"{Fore.RED}⚠ 处理失败: {result['file']} - {result['error']}|"
                progress_tracker.update(success, infor)

        progress_tracker.finish(phase_name="HTML文件处理")
        self._add_report_statistics()

        return self.sn_data_map

    def _scan_files(self, source_dir):
        """Return the paths of all ``.html``/``.htm`` files below ``source_dir``."""
        all_files = []
        for root_dir, _, files in os.walk(source_dir):
            all_files.extend(
                os.path.join(root_dir, f) for f in files if f.lower().endswith(('.html', '.htm'))
            )
        return all_files

    def _collect_sn_distribution(self, file_list):
        """Pre-scan: record file count, cells and cycle times per SN from file names only."""
        for file_path in file_list:
            filename = os.path.basename(file_path)
            sn, cell = self._extract_sn_and_cell_from_filename(filename)
            test_cycle_time = HTMLFileProcessor._extract_test_cycle_time(filename)
            self.sn_file_counts[sn] += 1
            self.sn_cell_info[sn].add(cell)
            self.sn_test_cycle_times[sn].add(test_cycle_time)

    def _extract_sn_and_cell_from_filename(self, filename):
        """Return ``(sn, cell)`` parsed from ``filename``.

        The cell number is the digit run between the last ``-`` and a
        ``.html``/``.htm`` extension in any case. Missing parts are reported
        as ``"UNKNOWN_SN"`` / ``"UNKNOWN_CELL"``.
        """
        sn_regex = r'F[A-Z0-9]{15}(?:[A-Z0-9]{5,})?'
        sn_match = re.search(rf'\b({sn_regex})\b', filename, flags=re.I)
        cell_match = re.search(r'-(\d+)\.html?$', filename, flags=re.I)

        sn = sn_match.group(1) if sn_match else "UNKNOWN_SN"
        cell = cell_match.group(1) if cell_match else "UNKNOWN_CELL"

        return sn, cell

    def _display_file_distribution(self):
        """Print files, cells and cycle times per SN for the first ten SNs."""
        dist_info = []
        for sn, count in list(self.sn_file_counts.items())[:10]:
            cells = list(self.sn_cell_info.get(sn, set()))
            cell_display = ', '.join(sorted(cells)[:3]) if cells else "未知"
            if len(cells) > 3:
                cell_display += f"...等{len(cells)}个"

            times = list(self.sn_test_cycle_times.get(sn, set()))
            time_display = ', '.join(sorted(times)[:11]) if times else "未知"
            if len(times) > 11:
                time_display += f"...等{len(times)}个"

            dist_info.append(f"{sn[:31]:<32}: {count}个文件, Cells: {cell_display:>3}, 时间: {time_display}")

        if len(self.sn_file_counts) > 10:
            dist_info.append(f"... 还有 {len(self.sn_file_counts) - 10} 个SN")

        print(f"{Fore.MAGENTA}⚫ SN文件分布:\n{Fore.CYAN}{chr(10).join(dist_info)}")

    def _store_result_data(self, result):
        """Merge one successful file result into the per-SN collections.

        The header of the first file seen for an SN is kept; rows of later
        files are appended to the same list.
        """
        sn = result["sn"]
        cell = result.get("cell", "UNKNOWN_CELL")
        test_cycle_time = result.get("test_cycle_time", "UNKNOWN_TIME")
        filename = result["filename"]

        self.sn_source_files[sn].add(filename)
        self.sn_cell_info[sn].add(cell)
        self.sn_test_cycle_times[sn].add(test_cycle_time)

        if sn not in self.sn_data_map:
            self.sn_data_map[sn] = {'headers': result["headers"], 'data': []}
        self.sn_data_map[sn]['data'].extend(result["rows"])

        self.sn_fail_counts[sn] += result["fail_count"]

    @staticmethod
    def _cell_sort_key(cell):
        """Sort key: numeric cells by value first, everything else by text."""
        text = str(cell)
        return (0, int(text), '') if text.isdigit() else (1, 0, text)

    def _add_report_statistics(self):
        """Attach a ``report_stats`` dict to every entry of ``sn_data_map``.

        Cell and time lists are sorted so the "primary" value (the first
        element) is the same on every run.
        """
        for sn, data_info in self.sn_data_map.items():
            source_count = len(self.sn_source_files.get(sn, []))
            cell_list = sorted(self.sn_cell_info.get(sn, set()), key=self._cell_sort_key)
            time_list = sorted(self.sn_test_cycle_times.get(sn, set()))
            primary_cell = cell_list[0] if cell_list else "UNKNOWN_CELL"
            primary_time = time_list[0] if time_list else "UNKNOWN_TIME"

            data_info['report_stats'] = {
                'source_files_count': source_count,
                'fail_count': self.sn_fail_counts.get(sn, 0),
                'cell_info': {
                    'primary_cell': primary_cell,
                    'all_cells': cell_list,
                    'cell_count': len(cell_list)
                },
                'time_info': {
                    'primary_time': primary_time,
                    'all_times': time_list,
                    'time_count': len(time_list)
                }
            }
+
+
class ParallelExcelReportGenerator:
    """Generate one Excel workbook per serial number in a process pool."""

    # Largest worker count ProcessPoolExecutor accepts on Windows.
    _WINDOWS_MAX_WORKERS = 61

    def __init__(self, output_dir):
        self.output_dir = output_dir

    def generate_reports(self, sn_data_map, max_workers=None):
        """Write a workbook for every entry of ``sn_data_map``.

        Args:
            sn_data_map: ``{sn: data_info}`` as returned by
                ``ParallelHTMLReportProcessor.process_files``.
            max_workers: Pool size; defaults to the CPU count, limited by the
                number of reports.

        Returns:
            ``(success_reports, failed_reports)``, two lists of the result
            dicts returned by ``ExcelReportWorker.generate_single_report``.
        """
        total_reports = len(sn_data_map)

        if total_reports == 0:
            print(f"{Fore.YELLOW}⚠ 没有数据可生成报告")
            return [], []

        if max_workers is None:
            max_workers = min(mp.cpu_count(), total_reports)
            if sys.platform == "win32":
                max_workers = min(max_workers, self._WINDOWS_MAX_WORKERS)

        print(f"{Fore.CYAN}▶ 开始并行生成Excel报告 (共{total_reports}个,使用 {max_workers} 个进程)")

        progress_tracker = ThreadSafeProgressTracker(total_reports)
        report_tasks = self._build_report_tasks(sn_data_map)

        success_reports = []
        failed_reports = []

        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            future_to_report = {
                executor.submit(ExcelReportWorker.generate_single_report, task, self.output_dir): task
                for task in report_tasks
            }

            for future, task in future_to_report.items():
                try:
                    result = future.result()
                except Exception as exc:
                    # A worker that raised or died becomes a failed report
                    # instead of aborting the remaining ones.
                    result = {
                        "success": False,
                        "sn": task["sn"],
                        "cell": task["cell"],
                        "all_cells": task["all_cells"],
                        "test_cycle_time": task["test_cycle_time"],
                        "error": f"{type(exc).__name__}: {exc}"
                    }

                if result["success"]:
                    success_reports.append(result)
                    progress_tracker.update(success=True, count=1)
                    self._show_success_info(result)
                else:
                    failed_reports.append(result)
                    progress_tracker.update(success=False, count=1)
                    self._show_error_info(result)

        progress_tracker.finish(phase_name="Excel报告生成")
        self._show_final_stats(success_reports, failed_reports)

        return success_reports, failed_reports

    @staticmethod
    def _build_report_tasks(sn_data_map):
        """Turn ``sn_data_map`` into the task dicts handed to the worker.

        Missing statistics fall back to ``0`` files, ``["UNKNOWN_CELL"]``,
        ``"UNKNOWN_CELL"`` and ``"UNKNOWN_TIME"``.
        """
        report_tasks = []
        for sn, data_info in sn_data_map.items():
            report_stats = data_info.get('report_stats', {})
            cell_info = report_stats.get('cell_info', {})
            time_info = report_stats.get('time_info', {})

            report_tasks.append({
                "sn": sn,
                "cell": cell_info.get('primary_cell', 'UNKNOWN_CELL'),
                "all_cells": cell_info.get('all_cells', ['UNKNOWN_CELL']),
                "test_cycle_time": time_info.get('primary_time', 'UNKNOWN_TIME'),
                "data_info": data_info,
                "source_files_count": report_stats.get('source_files_count', 0)
            })
        return report_tasks

    @staticmethod
    def _format_cells(cells):
        """Return ``cells`` as a comma-separated string.

        A bare string is returned unchanged (not split into characters) and
        ``None`` becomes ``"UNKNOWN_CELL"``.
        """
        if cells is None:
            return 'UNKNOWN_CELL'
        if isinstance(cells, str):
            return cells
        return ','.join(str(c) for c in cells)

    def _show_success_info(self, result):
        """Print a one-line summary of a generated report; never raises on bad data."""
        try:
            sn_display = str(result.get('sn', 'UNKNOWN_SN'))[:32]
            if result.get('sn') and len(str(result['sn'])) > 32:
                sn_display += "..."

            cell_display = self._format_cells(result.get('all_cells'))
            time_display = str(result.get('test_cycle_time', 'UNKNOWN_TIME'))[:20]

            output_file = result.get('output_file', '')
            file_name = os.path.basename(output_file) if output_file else '未知文件'

            success_info = [
                f"{Fore.GREEN}✓ 生成成功",
                f"SN: {sn_display:<32}",
                f"Cell: {cell_display:<8}",
                f"时间: {time_display}",
                f"文件: {file_name:<60}",
                f"记录数: {result.get('record_count', 0):>5}",
                f"来源文件: {result.get('source_files_count', 0):>2}",
                f"FAIL数量: {result.get('fail_count', 0):>3}"
            ]
            print('\x1b[2K\r' + ' | '.join(success_info).ljust(120))
        except Exception:
            # Formatting failed (unexpected value type): use the short form.
            print(
                f"{Fore.GREEN}✓ 报告生成成功 (SN: {result.get('sn', 'UNKNOWN_SN')}, Cell: {result.get('cell', 'UNKNOWN_CELL')}, Time: {result.get('test_cycle_time', 'UNKNOWN_TIME')})")

    def _show_error_info(self, result):
        """Print a one-line summary of a failed report; never raises on bad data."""
        try:
            sn_display = str(result.get('sn', 'UNKNOWN_SN'))[:32]
            if result.get('sn') and len(str(result['sn'])) > 32:
                sn_display += "..."

            cell_display = str(result.get('cell', 'UNKNOWN_CELL'))
            time_display = str(result.get('test_cycle_time', 'UNKNOWN_TIME'))[:20]

            # Error text is cut to 50 characters.
            full_error = str(result.get('error', '未知错误'))
            error_msg = full_error[:50]
            if len(str(result.get('error', ''))) > 50:
                error_msg += "..."

            error_info = [
                f"{Fore.RED}✗ 生成失败",
                f"SN: {sn_display}",
                f"Cell: {cell_display}",
                f"时间: {time_display}",
                f"错误: {error_msg}"
            ]
            print('\x1b[2K\r' + ' | '.join(error_info).ljust(100))
        except Exception:
            print(
                f"{Fore.RED}✗ 报告生成失败 (SN: {result.get('sn', 'UNKNOWN_SN')}, Cell: {result.get('cell', 'UNKNOWN_CELL')}, Time: {result.get('test_cycle_time', 'UNKNOWN_TIME')})")

    def _show_final_stats(self, success_reports, failed_reports):
        """Print totals over all reports and list every failed one."""
        try:
            total_records = sum(report.get('record_count', 0) for report in success_reports)
            total_sources = sum(report.get('source_files_count', 0) for report in success_reports)
            total_fails = sum(report.get('fail_count', 0) for report in success_reports)

            all_reports = success_reports + failed_reports
            unique_cells = {report.get('cell', 'UNKNOWN_CELL') for report in all_reports}
            unique_times = {report.get('test_cycle_time', 'UNKNOWN_TIME') for report in all_reports}

            print(f"\n{Fore.CYAN}=== 最终统计 ===")
            print(f"{Fore.GREEN}成功生成报告: {len(success_reports)} 个")
            print(f"{Fore.RED}失败报告: {len(failed_reports)} 个")
            print(f"{Fore.BLUE}总记录数: {total_records}")
            print(f"{Fore.BLUE}总来源文件: {total_sources}")
            print(f"{Fore.BLUE}总FAIL数量: {total_fails}")
            print(f"{Fore.BLUE}涉及Cell数量: {len(unique_cells)}")
            print(f"{Fore.BLUE}涉及测试周期时间数量: {len(unique_times)}")
            print(f"{Fore.CYAN}输出目录: {self.output_dir}")

            if failed_reports:
                print(f"\n{Fore.YELLOW}失败报告详情:")
                for report in failed_reports:
                    sn = report.get('sn', 'UNKNOWN_SN')
                    cell = report.get('cell', 'UNKNOWN_CELL')
                    cycle_time = report.get('test_cycle_time', 'UNKNOWN_TIME')
                    error = report.get('error', '未知错误')
                    print(f" {sn} (Cell {cell}, Time {cycle_time}): {error}")
        except Exception as e:
            print(f"{Fore.RED}⚠ 统计信息显示出错: {e}")
+
+
class ReportProcessor:
    """Console front end: ask for a directory, parse the reports, write the workbooks."""

    def __init__(self):
        pass

    def process_reports(self, html_max_workers=None, excel_max_workers=None):
        """Run the two-stage pipeline on a directory entered by the user.

        Args:
            html_max_workers: Pool size for the HTML parsing stage.
            excel_max_workers: Pool size for the Excel writing stage.

        Errors raised by either stage are printed with a traceback and not
        re-raised.
        """
        source_dir = self._get_directory_from_console()
        if not source_dir:
            print(f"{Fore.RED}❌ 未选择目录,程序退出")
            return

        output_dir = self._create_output_dir(source_dir)

        try:
            # Stage 1: HTML parsing.
            print(f"\n{Fore.CYAN}=== 阶段1: HTML文件处理 ===")
            processed_data = ParallelHTMLReportProcessor().process_files(source_dir, html_max_workers)

            if not processed_data:
                print(f"{Fore.YELLOW}⚠ 没有处理任何数据,程序结束")
                return

            # Stage 2: Excel generation.
            print(f"\n{Fore.CYAN}=== 阶段2: Excel报告生成 ===")
            excel_generator = ParallelExcelReportGenerator(output_dir)
            success_reports, failed_reports = excel_generator.generate_reports(
                processed_data, excel_max_workers)

            sn_total = len(processed_data)
            self._show_overall_result(sn_total, success_reports, failed_reports)

        except Exception as e:
            print(f"\n{Fore.RED}❌ 程序执行出错: {type(e).__name__}: {str(e)}")
            import traceback
            traceback.print_exc()

    def _get_directory_from_console(self):
        """Prompt until the user enters an existing directory and return it.

        Surrounding whitespace and quote characters are stripped first.
        """
        while True:
            print(f"\n{Fore.CYAN}=== 并行HTML报告处理程序 ===")
            print(f"{Fore.WHITE}请输入包含HTML文件的目录路径:")
            typed = input("> ").strip()

            if typed:
                candidate = typed.strip('"\'')
                if os.path.isdir(candidate):
                    return candidate
                if os.path.exists(candidate):
                    print(f"{Fore.RED}❌ 请输入目录路径,而不是文件路径")
                else:
                    print(f"{Fore.RED}❌ 路径不存在,请重新输入")
            else:
                print(f"{Fore.YELLOW}⚠ 路径不能为空,请重新输入")

    def _create_output_dir(self, source_dir):
        """Create a timestamped result folder inside ``source_dir`` and return its path."""
        folder_name = f"Html文件分析_带Cell编号_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        output_dir = os.path.join(source_dir, folder_name)
        os.makedirs(output_dir, exist_ok=True)
        print(f"{Fore.GREEN}✔ 输出目录创建成功: {output_dir}")
        return output_dir

    def _show_overall_result(self, total_sn, success_reports, failed_reports):
        """Print the closing summary.

        Args:
            total_sn: Number of serial numbers processed.
            success_reports: Result dicts of generated reports.
            failed_reports: Result dicts of failed reports.
        """
        print(f"\n{Fore.CYAN}=== 程序执行完成 ===")
        print(f"{Fore.GREEN}✓ 处理完成!")
        print(f"{Fore.BLUE}总SN数量: {total_sn}")
        print(f"{Fore.GREEN}成功报告: {len(success_reports)}")
        print(f"{Fore.RED}失败报告: {len(failed_reports)}")

        if len(failed_reports) != 0:
            print(f"{Fore.YELLOW}⚠ 有 {len(failed_reports)} 个报告生成失败,请查看上述错误信息")
        else:
            print(f"{Fore.GREEN}🎉 所有报告生成成功!")
+
+
if __name__ == "__main__":
    import traceback

    try:
        processor = ReportProcessor()

        # Pool sizes of the two stages can be set independently; None lets
        # each stage choose its own default.
        processor.process_reports(
            html_max_workers=None,
            excel_max_workers=None,
        )
    except KeyboardInterrupt:
        print(f"\n{Fore.YELLOW}⚠ 用户中断程序")
    except Exception as e:
        print(f"\n{Fore.RED}❌ 程序执行出错: {type(e).__name__}: {str(e)}")
        traceback.print_exc()
diff --git a/htmlProcess/htmlReportProcess_picHtml/.gitignore b/htmlProcess/htmlReportProcess_picHtml/.gitignore
new file mode 100644
index 0000000..8b7678e
--- /dev/null
+++ b/htmlProcess/htmlReportProcess_picHtml/.gitignore
@@ -0,0 +1,33 @@
+/build/*
+/build
+/dist/*
+/dist
+/source/*
+/source
+
+
+
+
+htmlReportProcess_Merge_picHtml_V3.py
+
+htmlReportProcess_Merge_picHtml_V2.py
+
+htmlReportProcess_Merge_pic_V2.py
+
+#/htmlReportProcess*/
+
+
+htmlReportProcess_cmd_pV2.py
+htmlReportProcess_cmd_pV3.py
+
+
+htmlReportProcess_cmd_V2.py
+
+htmlReportProcess.py
+
+
+htmlReportProcess_Merge_cmd_V2.py
+htmlReportProcess_Merge.py
+
+htmlReportProcess_picHtml_1kV2.py
+htmlReportProcess_picHtml_2kV2.py
\ No newline at end of file
diff --git a/htmlProcess/htmlReportProcess_picHtml/htmlReportProcess_picHtml_1kV1.py b/htmlProcess/htmlReportProcess_picHtml/htmlReportProcess_picHtml_1kV1.py
new file mode 100644
index 0000000..b13281c
--- /dev/null
+++ b/htmlProcess/htmlReportProcess_picHtml/htmlReportProcess_picHtml_1kV1.py
@@ -0,0 +1,1053 @@
+import os
+import re
+import sys
+import time
+import pandas as pd
+import matplotlib.pyplot as plt
+from datetime import datetime
+from matplotlib.lines import Line2D
+from typing import Optional, Tuple, List, Dict, Any, Union
+from pathlib import Path
+import numpy as np
+import base64
+from io import BytesIO
+from jinja2 import Template
+from colorama import Fore, Style, init
+import multiprocessing as mp
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+import psutil
+
+# Initialise colorama with autoreset enabled.
+init(autoreset=True)
+
+# Silence pandas' chained-assignment (SettingWithCopy) warning so it does not
+# clutter the console output.
+pd.options.mode.chained_assignment = None
+
+# Font fallback list so matplotlib can render Chinese text; also keep the
+# minus sign rendering correctly with these fonts.
+plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'Microsoft YaHei']
+plt.rcParams['axes.unicode_minus'] = False
+
+# NOTE: the HTML template (with per-SN plots) is defined below, after the
+# performance configuration.
+# Performance tuning configuration.
+OPTIMIZATION_CONFIG = {
+ 'max_workers': min(mp.cpu_count(), 8), # cap on the number of worker threads
+ 'chunk_size': 50000, # chunk size intended for chunked reads
+ 'use_threading': True, # load files with a thread pool
+ 'memory_limit_gb': psutil.virtual_memory().available // (1024 ** 3) * 0.7, # 70% of whole GiB available at import time; not read by the class in this file
+}
+
+HTML_TEMPLATE = """
+
+
+
+
+
+ 测试报告分析 - {{ keyword }}
+
+
+
+
+
+
+ 📁 处理的文件列表
+ {% for file_info in file_infos %}
+
+ {{ loop.index }}. {{ file_info.filename }}
+ 路径: {{ file_info.path }}
+ 数据行数: {{ file_info.rows }} | 测试项数: {{ file_info.tests }}
+
+ {% endfor %}
+
+
+ {% for test in tests %}
+
+
+
+
+
+ 数据点数
+ {{ test.stats.count }}
+
+
+ 平均值
+ {{ "%.4f"|format(test.stats.mean) }}
+
+
+ 中位数
+ {{ "%.4f"|format(test.stats.median) }}
+
+
+ 标准差
+ {{ "%.4f"|format(test.stats.std) }}
+
+
+ 最小值
+ {{ "%.4f"|format(test.stats.min) }}
+
+
+ 最大值
+ {{ "%.4f"|format(test.stats.max) }}
+
+
+
+ {% if test.limits.lower is not none or test.limits.upper is not none %}
+
+ {% if test.limits.lower is not none %}
+
+ 下限值
+ {{ "%.4f"|format(test.limits.lower) }}
+
+ {% endif %}
+ {% if test.limits.upper is not none %}
+
+ 上限值
+ {{ "%.4f"|format(test.limits.upper) }}
+
+ {% endif %}
+
+ {% endif %}
+
+
+ 📈 汇总视图 (所有SN)
+
+ 
+
+
+
+ {% if test.sn_plot_images %}
+ 🔍 SN独立视图 ({{ test.sn_plot_images|length }}个SN)
+
+ {% for sn_plot in test.sn_plot_images %}
+
+ SN: {{ sn_plot.sn }}
+ 
+
+ {% endfor %}
+
+ {% endif %}
+
+ {% endfor %}
+
+
+ 📈 分析摘要
+
+ 文件夹路径: {{ folder_path }}
+
+
+ 分析时间: {{ analysis_time }}秒
+
+
+ 测试项分布:
+
+ - 正常: {{ status_counts.normal }} 个
+ - 警告: {{ status_counts.warning }} 个
+ - 异常: {{ status_counts.abnormal }} 个
+
+
+
+ 数据摘要:
+
+ - 总文件数: {{ file_count }} 个
+ - 总数据行数: {{ total_rows }} 行
+ - 总测试项数: {{ test_count }} 个
+ - 总数据点数: {{ total_points }} 个
+
+
+
+
+
+ 报告生成于 {{ timestamp }} | 多文件测试报告分析系统
+
+
+
+"""
+
+
+class MultiFileTestReportScatterPlotter:
+ """Load test-report workbooks from a folder and build an HTML scatter-plot report."""
+
+ def __init__(self):
+ """Initialise an empty plotter; paths and data are filled in by later steps."""
+ # Folder scanned for Excel files (set by get_folder_path).
+ self.folder_path: Optional[str] = None
+ # Merged data of all loaded workbooks.
+ self.df: Optional[pd.DataFrame] = None
+ # Directory the HTML report is written to.
+ self.output_dir: Optional[str] = None
+ # Columns every workbook must provide; also used as the read filter.
+ self.required_columns = ["Test Name New", "SN", "Measurement", "Test Time", "Lower Limit", "Upper Limit"]
+ # Actual column labels of the lower/upper limit columns in self.df.
+ self.col_lower: Optional[str] = None
+ self.col_upper: Optional[str] = None
+ # Full path of the generated HTML report.
+ self.html_report_path: Optional[str] = None
+ # One dict per successfully loaded file (path, filename, load stats).
+ self.file_infos: List[Dict[str, Any]] = []
+
+ # Cache for already processed data.
+ self._processed_data_cache: Dict[str, Any] = {}
+
+ # Performance monitoring: one entry per loaded file in each list.
+ self.performance_stats = {
+ 'load_times': [],
+ 'memory_usage': [],
+ 'file_sizes': []
+ }
+
+ def _print_stage(self, msg: str, color=Fore.CYAN) -> None:
+ """统一的阶段信息输出"""
+ print(f"\n{color}{'=' * 50}")
+ print(f"📋 {msg}")
+ print(f"{'=' * 50}{Style.RESET_ALL}")
+
+ def _print_progress(self, current: int, total: int, prefix: str = "进度",
+ color=Fore.YELLOW) -> None:
+ """改进的进度条显示"""
+ if total <= 0:
+ return
+
+ percent = (current / total) * 100
+ bar_len = 40
+ filled = int(bar_len * current / total)
+ bar = "█" * filled + "░" * (bar_len - filled)
+
+ sys.stdout.write(f"\r{color}{prefix}: [{bar}] {current}/{total} ({percent:.1f}%){Style.RESET_ALL}")
+ sys.stdout.flush()
+
+ if current == total:
+ print(f"{Fore.GREEN} ✅ 完成{Style.RESET_ALL}")
+
+ def _print_warning(self, msg: str) -> None:
+ """警告信息输出"""
+ print(f"{Fore.YELLOW}⚠️ {msg}{Style.RESET_ALL}")
+
+ def _print_success(self, msg: str) -> None:
+ """成功信息输出"""
+ print(f"{Fore.GREEN}✅ {msg}{Style.RESET_ALL}")
+
+ def _print_error(self, msg: str) -> None:
+ """错误信息输出"""
+ print(f"{Fore.RED}❌ {msg}{Style.RESET_ALL}")
+
+ def _get_memory_usage(self) -> float:
+ """获取当前内存使用量(GB)"""
+ process = psutil.Process()
+ return process.memory_info().rss / (1024 ** 3)
+
+ def _check_memory_safe(self, file_size_mb: float) -> bool:
+ """检查内存是否安全"""
+ available_memory = psutil.virtual_memory().available / (1024 ** 3)
+ estimated_need = file_size_mb * 5 / 1024 # 估算需要的内存(GB)
+ return available_memory > estimated_need + 1 # 保留1GB安全空间
+
+ def _load_single_file_optimized(self, file_info: Dict[str, Any]) -> Optional[pd.DataFrame]:
+ """优化单文件加载方法"""
+ file_path = file_info['path']
+ filename = file_info['filename']
+
+ try:
+ start_time = time.time()
+ file_size_mb = os.path.getsize(file_path) / (1024 ** 2)
+
+ # 内存安全检查
+ if not self._check_memory_safe(file_size_mb):
+ self._print_warning(f"内存不足,跳过大文件: {filename} ({file_size_mb:.1f}MB)")
+ return None
+
+ # 选择合适的引擎
+ file_ext = file_path.lower()
+ if file_ext.endswith('.xlsx'):
+ engine = 'openpyxl'
+ elif file_ext.endswith('.xls'):
+ engine = 'xlrd'
+ else:
+ self._print_warning(f"不支持的文件格式: {filename}")
+ return None
+
+ # 快速获取工作表信息
+ try:
+ excel_file = pd.ExcelFile(file_path, engine=engine)
+ sheet_names = excel_file.sheet_names
+
+ # 选择工作表
+ target_sheets = ["Merged All Tests", "All Tests", sheet_names[0] if sheet_names else None]
+ selected_sheet = next((s for s in target_sheets if s and s in sheet_names), None)
+
+ if not selected_sheet:
+ self._print_warning(f"未找到目标工作表: {filename}")
+ return None
+
+ except Exception as e:
+ self._print_warning(f"无法读取工作表信息 {filename}: {e}")
+ return None
+
+ # 优化读取参数
+ read_kwargs = {
+ 'io': file_path,
+ 'sheet_name': selected_sheet,
+ 'engine': engine,
+ 'dtype': 'object',
+ 'na_filter': False,
+ 'usecols': self.required_columns, # 只读取需要的列
+ }
+
+ # 对于大文件,使用分块读取
+ if file_size_mb > 50: # 50MB以上使用分块读取
+ chunks = []
+ for chunk in pd.read_excel(**read_kwargs, chunksize=OPTIMIZATION_CONFIG['chunk_size']):
+ chunks.append(chunk)
+
+ if chunks:
+ df = pd.concat(chunks, ignore_index=True)
+ else:
+ df = pd.DataFrame()
+ else:
+ df = pd.read_excel(**read_kwargs)
+
+ if df.empty:
+ self._print_warning(f"文件为空: {filename}")
+ return None
+
+ # 检查必要列
+ missing_columns = [col for col in self.required_columns if col not in df.columns]
+ if missing_columns:
+ self._print_warning(f"缺少必要列 {filename}: {missing_columns}")
+ return None
+
+ # 添加文件标识
+ df['_source_file'] = filename
+
+ load_time = time.time() - start_time
+ file_info.update({
+ 'load_time': round(load_time, 2),
+ 'file_size_mb': round(file_size_mb, 2),
+ 'engine': engine,
+ 'rows': len(df)
+ })
+
+ self.performance_stats['load_times'].append(load_time)
+ self.performance_stats['file_sizes'].append(file_size_mb)
+ self.performance_stats['memory_usage'].append(self._get_memory_usage())
+
+ self._print_success(f"加载完成: {filename} ({len(df)}行, {load_time:.2f}s)")
+ return df
+
+ except Exception as e:
+ self._print_error(f"加载文件失败 {filename}: {e}")
+ return None
+
+
+
+ def _find_column_case_insensitive(self, candidates: List[str]) -> Optional[str]:
+ """优化的大小写不敏感列查找"""
+ if self.df is None:
+ return None
+
+ columns_lower = {col.lower().strip(): col for col in self.df.columns}
+ for candidate in candidates:
+ key = candidate.lower().strip()
+ if key in columns_lower:
+ return columns_lower[key]
+ return None
+
+ def get_folder_path(self) -> None:
+ """获取文件夹路径"""
+ self._print_stage("输入文件夹路径")
+
+ while True:
+ print(f"{Fore.WHITE}请输入包含Excel文件的文件夹路径: ")
+ folder_path = input("> ").strip()
+
+ if not folder_path:
+ continue
+
+ path_obj = Path(folder_path)
+ if path_obj.exists() and path_obj.is_dir():
+ self.folder_path = str(path_obj.resolve())
+ print(f"{Fore.GREEN}已选择文件夹: {self.folder_path}{Style.RESET_ALL}")
+ break
+ else:
+ self._print_error(f"文件夹不存在: {folder_path},请重新输入")
+
+ def find_excel_files(self) -> List[str]:
+ """查找文件夹中的所有Excel文件"""
+ self._print_stage("扫描Excel文件")
+
+ excel_files = []
+ valid_extensions = ('.xlsx', '.xls')
+
+ try:
+ for file_path in Path(self.folder_path).rglob('*'):
+ if file_path.suffix.lower() in valid_extensions and file_path.is_file():
+ excel_files.append(str(file_path.resolve()))
+
+ # 按文件名排序
+ excel_files.sort()
+
+ self._print_success(f"找到 {len(excel_files)} 个Excel文件")
+ for i, file_path in enumerate(excel_files, 1):
+ print(f" {i:2d}. {os.path.basename(file_path)}")
+
+ return excel_files
+
+ except Exception as e:
+ self._print_error(f"扫描文件夹时发生错误: {e}")
+ return []
+
+ def load_multiple_files_optimized(self, excel_files: List[str]) -> None:
+ """Load all given workbooks, merge them into ``self.df`` and locate the limit columns.
+
+ Files are loaded with a thread pool when threading is enabled in
+ OPTIMIZATION_CONFIG and more than one file is given; otherwise they are
+ loaded one by one. Files for which the loader returns None are left out.
+
+ Args:
+ excel_files: Paths of the workbooks to load.
+
+ Raises:
+ ValueError: If no file could be loaded.
+ Exception: Re-raised from ``pd.concat`` when merging fails.
+ """
+ self._print_stage("并行加载Excel文件")
+ start_time = time.time()
+
+ # Prepare one info dict per file; the loader adds load statistics to it.
+ file_infos = [{'path': path, 'filename': os.path.basename(path)} for path in excel_files]
+
+ all_dataframes = []
+ self.file_infos = []
+
+ if OPTIMIZATION_CONFIG['use_threading'] and len(excel_files) > 1:
+ # Load in parallel with a thread pool.
+ with ThreadPoolExecutor(max_workers=OPTIMIZATION_CONFIG['max_workers']) as executor:
+ futures = {executor.submit(self._load_single_file_optimized, file_info): file_info
+ for file_info in file_infos}
+
+ completed = 0
+ # Results are collected in submission order (dict iteration order),
+ # not in completion order.
+ for future in futures:
+ try:
+ df = future.result(timeout=300) # 5-minute timeout
+ if df is not None:
+ all_dataframes.append(df)
+ self.file_infos.append(futures[future])
+ completed += 1
+ self._print_progress(completed, len(excel_files), "并行加载文件")
+ except Exception as e:
+ file_info = futures[future]
+ self._print_error(f"加载失败 {file_info['filename']}: {e}")
+ else:
+ # Sequential loading.
+ for i, file_info in enumerate(file_infos, 1):
+ self._print_progress(i, len(excel_files), "加载文件")
+ df = self._load_single_file_optimized(file_info)
+ if df is not None:
+ all_dataframes.append(df)
+ self.file_infos.append(file_info)
+
+ if not all_dataframes:
+ raise ValueError("没有成功加载任何Excel文件")
+
+ # Merge the data.
+ self._print_stage("合并数据")
+ merge_start = time.time()
+
+ try:
+ self.df = pd.concat(all_dataframes, ignore_index=True, sort=False)
+ merge_time = time.time() - merge_start
+
+ total_time = time.time() - start_time
+ avg_load_time = np.mean(self.performance_stats['load_times']) if self.performance_stats['load_times'] else 0
+
+ self._print_success(f"合并完成: {len(self.df)}行, {len(all_dataframes)}个文件")
+ self._print_success(f"加载耗时: {total_time:.2f}s (平均: {avg_load_time:.2f}s/文件)")
+ self._print_success(f"合并耗时: {merge_time:.2f}s")
+
+ # Show performance statistics.
+ print(f"\n{Fore.CYAN}📊 性能统计:")
+ print(f" 平均加载时间: {avg_load_time:.2f}s")
+ print(f" 峰值内存使用: {max(self.performance_stats['memory_usage']):.2f}GB")
+ print(f" 总文件大小: {sum(self.performance_stats['file_sizes']):.1f}MB{Style.RESET_ALL}")
+
+ except Exception as e:
+ self._print_error(f"合并数据失败: {e}")
+ raise
+
+ # Remember the actual labels of the lower/upper limit columns.
+ self.col_lower = self._find_column_case_insensitive([
+ "Lower Limit", "lower limit", "lower_limit", "ll", "lower"
+ ])
+ self.col_upper = self._find_column_case_insensitive([
+ "Upper Limit", "upper limit", "upper_limit", "ul", "upper"
+ ])
+
+ def get_keyword(self) -> Tuple[pd.DataFrame, str, List[str]]:
+ """获取用户输入的关键词并筛选数据"""
+ self._print_stage("筛选关键词")
+
+ while True:
+ keyword = input("请输入筛选关键词(匹配 'Test Name New'): ").strip()
+
+ if not keyword:
+ print("❌ 关键词不能为空,请重新输入")
+ continue
+
+ # 检查数据框是否为空
+ if self.df.empty:
+ print("⚠️ 数据框为空,无法进行筛选")
+ return pd.DataFrame(), keyword, []
+
+ # 检查列是否存在
+ if "Test Name New" not in self.df.columns:
+ print("❌ 列 'Test Name New' 不存在于数据框中")
+ print(f"可用列: {list(self.df.columns)}")
+ return pd.DataFrame(), keyword, []
+
+ try:
+ mask = self.df["Test Name New"].astype(str).str.contains(keyword, case=False, na=False)
+ filtered_df = self.df.loc[mask].copy()
+
+ if filtered_df.empty:
+ # 提供友好的提示和建议
+ print(f"⚠️ 没有找到包含关键词 '{keyword}' 的测试项")
+
+ # 显示部分可用的测试项作为参考
+ available_tests = self.df["Test Name New"].dropna().unique()
+ if len(available_tests) > 0:
+ print("📋 可用的测试项示例:")
+ for test in available_tests[:5]:
+ print(f" - {test}")
+ if len(available_tests) > 5:
+ print(f" ... 还有 {len(available_tests) - 5} 个测试项")
+
+ # 提供重新输入或退出的选项
+ choice = input("请选择: 1-重新输入关键词 2-使用所有数据 3-退出当前操作: ")
+ if choice == "1":
+ continue
+ elif choice == "2":
+ filtered_df = self.df.copy()
+ unique_tests = filtered_df["Test Name New"].unique().tolist()
+ print(f"✅ 使用所有数据: {len(filtered_df)} 行,{len(unique_tests)} 个测试项")
+ return filtered_df, "", unique_tests
+ else:
+ print("👋 退出筛选操作")
+ return pd.DataFrame(), keyword, []
+ else:
+ unique_tests = filtered_df["Test Name New"].unique().tolist()
+ print(f"✅ 匹配到 {len(filtered_df)} 行数据,涉及 {len(unique_tests)} 个不同测试项")
+ return filtered_df, keyword, unique_tests
+
+ except Exception as e:
+ print(f"❌ 筛选过程中发生错误: {e}")
+ print("请检查数据格式或重新输入关键词")
+ continue
+
+ def create_output_dir(self, keyword) -> None:
+ """创建输出目录"""
+ self._print_stage("创建输出目录")
+
+ if not self.folder_path:
+ raise ValueError("文件夹路径未设置")
+
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ self.output_dir = os.path.join(self.folder_path, f"scatter_report_out")
+ safe_keyword = self._safe_filename(keyword) if keyword else "all_data"
+ self.html_report_path = os.path.join(self.output_dir, f"{safe_keyword}_report_{timestamp}.html")
+
+ os.makedirs(self.output_dir, exist_ok=True)
+ print(f"输出目录: {self.output_dir}")
+
+ @staticmethod
+ def _safe_filename(name: str) -> str:
+ """生成安全的文件名"""
+ safe = "".join(c for c in str(name) if c.isalnum() or c in (" ", "_", "-")).strip()
+ return safe or "Unknown_Test"
+
+ def _extract_limits(self, df_one_test: pd.DataFrame) -> Tuple[
+ Optional[float], Optional[float], List[float], List[float]]:
+ """提取某个测试项的上下限数值"""
+ lower_plot = upper_plot = None
+ lower_set = []
+ upper_set = []
+
+ if self.col_lower and self.col_lower in df_one_test.columns:
+ lower_vals = self._clean_and_convert_series(df_one_test[self.col_lower], 'numeric').dropna().unique()
+ lower_set = sorted(lower_vals.tolist()) if len(lower_vals) > 0 else []
+ if lower_set:
+ lower_plot = min(lower_set)
+
+ if self.col_upper and self.col_upper in df_one_test.columns:
+ upper_vals = self._clean_and_convert_series(df_one_test[self.col_upper], 'numeric').dropna().unique()
+ upper_set = sorted(upper_vals.tolist()) if len(upper_vals) > 0 else []
+ if upper_set:
+ upper_plot = max(upper_set)
+
+ return lower_plot, upper_plot, lower_set, upper_set
+
+ @staticmethod
+ def _clean_and_convert_series(series: pd.Series, target_type: str = 'numeric') -> pd.Series:
+ """统一的系列清洗和转换方法 - 修复了 ast 方法名错误"""
+ if series.empty:
+ return series
+
+ if target_type == 'numeric':
+ # 数值转换优化
+ if pd.api.types.is_numeric_dtype(series):
+ return series.astype(float)
+
+ # 批量字符串处理 - 修复这里的问题
+ cleaned = series.astype(str).str.replace(r'[, ]', '', regex=True).str.strip()
+ return pd.to_numeric(cleaned, errors='coerce')
+
+ elif target_type == 'datetime':
+ return MultiFileTestReportScatterPlotter._convert_to_datetime(series)
+
+ return series
+
+ @staticmethod
+ def _convert_to_datetime(series: pd.Series) -> pd.Series:
+ """Convert a column of mixed timestamps to ``datetime64[ns]``.
+
+ A series that is already datetime-typed is returned unchanged.
+ Otherwise numeric values are interpreted by magnitude (epoch
+ milliseconds, epoch seconds, or Excel serial days) and the remaining
+ values are parsed as date strings. Values that cannot be parsed stay NaT.
+ """
+ if pd.api.types.is_datetime64_any_dtype(series):
+ return series
+
+ # Pre-compute both a numeric and a stripped string view of the data.
+ numeric_series = pd.to_numeric(series, errors='coerce')
+ string_series = series.astype(str).str.strip()
+
+ result = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns]')
+
+ # Numeric timestamps, classified by magnitude:
+ # >= 1e11 -> milliseconds since the epoch
+ # [1e9, 1e11) -> seconds since the epoch
+ # (20000, 60000) -> days counted from 1899-12-30
+ masks = {
+ 'ms': numeric_series >= 1e11,
+ 's': (numeric_series >= 1e9) & (numeric_series < 1e11),
+ 'excel': (numeric_series > 20000) & (numeric_series < 60000)
+ }
+
+ for mask_type, mask in masks.items():
+ if mask.any():
+ if mask_type == 'ms':
+ result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='ms')
+ elif mask_type == 's':
+ result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='s')
+ elif mask_type == 'excel':
+ origin = pd.Timestamp('1899-12-30')
+ result.loc[mask] = origin + pd.to_timedelta(numeric_series.loc[mask], unit='D')
+
+ # String dates: everything still unset after the numeric pass.
+ remaining_mask = result.isna()
+ if remaining_mask.any():
+ remaining_strings = string_series.loc[remaining_mask]
+
+ # Known explicit formats are tried first
+ # ("YYYY-MM-DD HH-MM-SS", time parts separated by hyphens).
+ format_patterns = [
+ (r'^\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}$', '%Y-%m-%d %H-%M-%S'),
+ ]
+
+ for pattern, date_format in format_patterns:
+ format_mask = remaining_strings.str.match(pattern)
+ if format_mask.any():
+ # Target: index labels of the remaining rows that match the pattern.
+ result.loc[remaining_mask[remaining_mask].index[format_mask]] = pd.to_datetime(
+ remaining_strings.loc[format_mask], format=date_format, errors='coerce'
+ )
+
+ # Generic parsing for whatever is still unset.
+ still_na_mask = result.isna() & remaining_mask
+ if still_na_mask.any():
+ result.loc[still_na_mask] = pd.to_datetime(
+ string_series.loc[still_na_mask], errors='coerce'
+ )
+
+ return result
+
+ def _preprocess_test_data(self, test_data: pd.DataFrame) -> pd.DataFrame:
+ """数据预处理"""
+ # 数值转换
+ test_data['Measurement_num'] = self._clean_and_convert_series(
+ test_data['Measurement'], 'numeric'
+ )
+ test_data['TestTime_dt'] = self._clean_and_convert_series(
+ test_data['Test Time'], 'datetime'
+ )
+
+ # 去除无效数据
+ valid_data = test_data.dropna(subset=['Measurement_num', 'TestTime_dt'])
+ return valid_data.sort_values('TestTime_dt')
+
+ def _calculate_statistics(self, y_data: pd.Series) -> Dict[str, float]:
+ """计算统计信息"""
+ stats = {
+ 'count': len(y_data),
+ 'mean': y_data.mean(),
+ 'median': y_data.median(),
+ 'min': y_data.min(),
+ 'max': y_data.max(),
+ 'std': y_data.std(),
+ 'q1': y_data.quantile(0.25),
+ 'q3': y_data.quantile(0.75)
+ }
+ return stats
+
+ def _plot_to_base64(self, fig) -> str:
+ """将图表转换为base64编码"""
+ buf = BytesIO()
+ fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+ buf.seek(0)
+ img_str = base64.b64encode(buf.read()).decode('utf-8')
+ plt.close(fig)
+ return img_str
+
+ def _create_summary_plot(self, test_data: pd.DataFrame, test_name: str,
+ lower_plot: Optional[float], upper_plot: Optional[float]) -> str:
+ """创建汇总图(所有SN在一个图中)"""
+ fig, ax = plt.subplots(figsize=(12, 8))
+
+ # 分组绘制
+ groups = list(test_data.groupby("SN")) if "SN" in test_data.columns else [("Unknown_SN", test_data)]
+ for sn, group in groups:
+ ax.scatter(group['TestTime_dt'], group['Measurement_num'],
+ label=str(sn), alpha=0.7, s=25)
+
+ # 计算统计信息
+ y_data = test_data['Measurement_num']
+ stats = self._calculate_statistics(y_data)
+
+ # 绘制限值线和统计线
+ x_min, x_max = test_data['TestTime_dt'].min(), test_data['TestTime_dt'].max()
+
+ if lower_plot is not None:
+ ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2, label="Lower Limit")
+ if upper_plot is not None:
+ ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2, label="Upper Limit")
+
+ # 添加统计线
+ ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, colors='orange',
+ linestyles='-', linewidth=1.5, alpha=0.7, label='Mean')
+ ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, colors='purple',
+ linestyles='-.', linewidth=1.5, alpha=0.7, label='Median')
+
+ # 设置图形属性
+ ax.set_title(f"汇总图 - {test_name}")
+ ax.set_xlabel("Test Time")
+ ax.set_ylabel("Measurement Value")
+ ax.grid(True, alpha=0.3)
+ ax.tick_params(axis='x', rotation=45)
+ ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+
+ return self._plot_to_base64(fig)
+
+ def _create_sn_plots(self, test_data: pd.DataFrame, test_name: str,
+ lower_plot: Optional[float], upper_plot: Optional[float]) -> List[Dict[str, str]]:
+ """为每个SN创建独立图表"""
+ sn_plots = []
+
+ if "SN" not in test_data.columns:
+ return sn_plots
+
+ sn_groups = test_data.groupby("SN")
+
+ for sn, group in sn_groups:
+ if group.empty:
+ continue
+
+ fig, ax = plt.subplots(figsize=(10, 6))
+
+ # 绘制当前SN的数据点
+ ax.scatter(group['TestTime_dt'], group['Measurement_num'],
+ color='blue', alpha=0.7, s=30, label=f"SN: {sn}")
+
+ # 计算当前SN的统计信息
+ y_data = group['Measurement_num']
+ stats = self._calculate_statistics(y_data)
+
+ # 绘制限值线
+ x_min, x_max = group['TestTime_dt'].min(), group['TestTime_dt'].max()
+
+ if lower_plot is not None:
+ ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2, label="Lower Limit")
+ if upper_plot is not None:
+ ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2, label="Upper Limit")
+
+ # 添加统计线
+ ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, colors='orange',
+ linestyles='-', linewidth=1.5, alpha=0.7, label='Mean')
+ ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, colors='purple',
+ linestyles='-.', linewidth=1.5, alpha=0.7, label='Median')
+
+ # 设置图形属性
+ ax.set_title(f"SN独立图 - {test_name} (SN: {sn})")
+ ax.set_xlabel("Test Time")
+ ax.set_ylabel("Measurement Value")
+ ax.grid(True, alpha=0.3)
+ ax.tick_params(axis='x', rotation=45)
+ ax.legend()
+
+ # 转换为base64
+ plot_image = self._plot_to_base64(fig)
+ sn_plots.append({"sn": str(sn), "image": plot_image})
+
+ return sn_plots
+
+ def _determine_test_status(self, stats: Dict[str, float],
+ lower_limit: Optional[float],
+ upper_limit: Optional[float]) -> Dict[str, Any]:
+ """确定测试状态"""
+ status = "success"
+ status_display = "正常"
+
+ if lower_limit is not None and upper_limit is not None:
+ # 检查是否超出限值
+ if stats['min'] < lower_limit or stats['max'] > upper_limit:
+ status = "danger"
+ status_display = "异常"
+ elif (stats['mean'] < lower_limit * 1.1 or stats['mean'] > upper_limit * 0.9 or
+ stats['std'] > (upper_limit - lower_limit) * 0.2):
+ status = "warning"
+ status_display = "警告"
+
+ return {"status": status, "status_display": status_display}
+
+ def generate_html_report(self, filtered_df: pd.DataFrame, keyword: str,
+ unique_tests: List[str]) -> None:
+ """生成HTML报告"""
+ self._print_stage("生成HTML报告")
+ start_time = time.time()
+
+ test_results = []
+ total_points = 0
+ status_counts = {"success": 0, "warning": 0, "danger": 0}
+
+ for i, test_name in enumerate(unique_tests, 1):
+ self._print_progress(i, len(unique_tests), "生成测试报告")
+
+ # 获取测试数据
+ test_data = filtered_df[filtered_df["Test Name New"] == test_name].copy()
+ test_data = self._preprocess_test_data(test_data)
+
+ if test_data.empty:
+ continue
+
+ # 提取限值信息
+ lower_plot, upper_plot, _, _ = self._extract_limits(test_data)
+
+ # 计算统计信息
+ y_data = test_data['Measurement_num']
+ stats = self._calculate_statistics(y_data)
+ total_points += stats['count']
+
+ # 生成汇总图表
+ summary_plot_image = self._create_summary_plot(test_data, test_name, lower_plot, upper_plot)
+
+ # 生成SN独立图表
+ sn_plot_images = self._create_sn_plots(test_data, test_name, lower_plot, upper_plot)
+
+ # 确定测试状态
+ status_info = self._determine_test_status(stats, lower_plot, upper_plot)
+ status_counts[status_info["status"]] += 1
+
+ # 添加到结果列表
+ test_results.append({
+ "name": test_name,
+ "stats": stats,
+ "limits": {"lower": lower_plot, "upper": upper_plot},
+ "summary_plot_image": summary_plot_image,
+ "sn_plot_images": sn_plot_images,
+ "status": status_info["status"],
+ "status_display": status_info["status_display"]
+ })
+
+ # 渲染HTML模板
+ template = Template(HTML_TEMPLATE)
+ html_content = template.render(
+ keyword=keyword if keyword else "所有数据",
+ timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+ test_count=len(test_results),
+ total_points=total_points,
+ tests=test_results,
+ folder_path=self.folder_path,
+ analysis_time=round(time.time() - start_time, 2),
+ status_counts={"normal": status_counts["success"], "warning": status_counts["warning"],
+ "abnormal": status_counts["danger"]},
+ file_count=len(self.file_infos),
+ file_infos=self.file_infos,
+ total_rows=len(self.df) if self.df is not None else 0
+ )
+
+ # 保存HTML文件
+ with open(self.html_report_path, 'w', encoding='utf-8') as f:
+ f.write(html_content)
+
+ self._print_success(f"HTML报告已生成: {self.html_report_path}")
+ self._print_success(
+ f"共处理 {len(self.file_infos)} 个文件,{len(test_results)} 个测试项,{total_points} 个数据点")
+
+
+ def run(self) -> None:
+ """运行主程序"""
+ try:
+ self.get_folder_path()
+ excel_files = self.find_excel_files()
+
+ if not excel_files:
+ self._print_error("没有找到可用的Excel文件")
+ return
+
+ # 使用优化后的加载方法
+ self.load_multiple_files_optimized(excel_files)
+
+ while True:
+ filtered_df, keyword, unique_tests = self.get_keyword()
+ if filtered_df.empty:
+ self._print_warning("没有数据可处理,退出程序")
+ break
+
+ self.create_output_dir(keyword)
+ self.generate_html_report(filtered_df, keyword, unique_tests)
+
+ self._print_success("分析完成!")
+ print(f"📊 报告文件: {self.html_report_path}")
+ print(f"📁 输出目录: {self.output_dir}")
+
+ # 询问是否继续分析其他关键词
+ continue_choice = input("\n是否继续分析其他关键词?(y/n): ").strip().lower()
+ if continue_choice not in ['y', 'yes', '是']:
+ break
+
+ except KeyboardInterrupt:
+ self._print_warning("用户中断程序")
+ except Exception as e:
+ self._print_error(f"发生错误: {type(e).__name__}: {str(e)}")
+ import traceback
+ traceback.print_exc()
+
+
+if __name__ == "__main__":
+ plotter = MultiFileTestReportScatterPlotter()
+ plotter.run()
diff --git a/htmlProcess/htmlReportProcess_picHtml/htmlReportProcess_picHtml_2kV1.py b/htmlProcess/htmlReportProcess_picHtml/htmlReportProcess_picHtml_2kV1.py
new file mode 100644
index 0000000..f83d3a0
--- /dev/null
+++ b/htmlProcess/htmlReportProcess_picHtml/htmlReportProcess_picHtml_2kV1.py
@@ -0,0 +1,1373 @@
+import os
+import re
+import sys
+import time
+import pandas as pd
+import matplotlib.pyplot as plt
+from datetime import datetime
+from matplotlib.lines import Line2D
+from typing import Optional, Tuple, List, Dict, Any, Union
+from pathlib import Path
+import numpy as np
+import base64
+from io import BytesIO
+from jinja2 import Template
+from colorama import Fore, Style, init
+import multiprocessing as mp
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
+import psutil
+
+# Initialise colorama with autoreset enabled.
+init(autoreset=True)
+
+# Silence pandas' chained-assignment (SettingWithCopy) warning so it does not
+# clutter the console output.
+pd.options.mode.chained_assignment = None
+
+# Font fallback list so matplotlib can render Chinese text; also keep the
+# minus sign rendering correctly with these fonts.
+plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'Microsoft YaHei']
+plt.rcParams['axes.unicode_minus'] = False
+
+# NOTE: the HTML template (updated for multiple keywords) is defined below,
+# after the performance configuration.
+# Performance tuning configuration.
+OPTIMIZATION_CONFIG = {
+ 'max_workers': min(mp.cpu_count(), 8), # cap on the number of worker threads
+ 'chunk_size': 50000, # chunk size intended for chunked reads
+ 'use_threading': True, # load files with a thread pool
+ 'memory_limit_gb': psutil.virtual_memory().available // (1024 ** 3) * 0.7, # 70% of whole GiB available at import time
+}
+
+HTML_TEMPLATE = """
+
+
+
+
+
+ 测试报告分析 - {{ keywords_display }}
+
+
+
+
+
+
+ 📁 处理的文件列表
+ {% for file_info in file_infos %}
+
+ {{ loop.index }}. {{ file_info.filename }}
+ 路径: {{ file_info.path }}
+ 数据行数: {{ file_info.rows }} | 测试项数: {{ file_info.tests }}
+
+ {% endfor %}
+
+
+
+ {% if comparison_plot_images and comparison_plot_images|length > 0 %}
+
+ 🔍 多关键词对比视图
+
+ {% for comparison_plot in comparison_plot_images %}
+
+ {{ comparison_plot.title }}
+ 
+
+ {% endfor %}
+
+
+ {% endif %}
+
+ {% for test in tests %}
+
+
+
+
+
+ 数据点数
+ {{ test.stats.count }}
+
+
+ 平均值
+ {{ "%.4f"|format(test.stats.mean) }}
+
+
+ 中位数
+ {{ "%.4f"|format(test.stats.median) }}
+
+
+ 标准差
+ {{ "%.4f"|format(test.stats.std) }}
+
+
+ 最小值
+ {{ "%.4f"|format(test.stats.min) }}
+
+
+ 最大值
+ {{ "%.4f"|format(test.stats.max) }}
+
+
+
+ {% if test.limits.lower is not none or test.limits.upper is not none %}
+
+ {% if test.limits.lower is not none %}
+
+ 下限值
+ {{ "%.4f"|format(test.limits.lower) }}
+
+ {% endif %}
+ {% if test.limits.upper is not none %}
+
+ 上限值
+ {{ "%.4f"|format(test.limits.upper) }}
+
+ {% endif %}
+
+ {% endif %}
+
+
+ 📈 汇总视图 (所有SN)
+
+ 
+
+
+
+ {% if test.sn_plot_images %}
+ 🔍 SN独立视图 ({{ test.sn_plot_images|length }}个SN)
+
+ {% for sn_plot in test.sn_plot_images %}
+
+ SN: {{ sn_plot.sn }}
+ 
+
+ {% endfor %}
+
+ {% endif %}
+
+ {% endfor %}
+
+
+ 📈 分析摘要
+
+ 文件夹路径: {{ folder_path }}
+
+
+ 分析时间: {{ analysis_time }}秒
+
+
+ 关键词:
+ {% for keyword in keywords %}
+ {{ keyword }}
+ {% endfor %}
+
+
+ 测试项分布:
+
+ - 正常: {{ status_counts.normal }} 个
+ - 警告: {{ status_counts.warning }} 个
+ - 异常: {{ status_counts.abnormal }} 个
+
+
+
+ 数据摘要:
+
+ - 总文件数: {{ file_count }} 个
+ - 总数据行数: {{ total_rows }} 行
+ - 总测试项数: {{ test_count }} 个
+ - 总数据点数: {{ total_points }} 个
+
+
+
+
+
+ 报告生成于 {{ timestamp }} | 多文件测试报告分析系统
+
+
+
+"""
+
+
+class MultiFileTestReportScatterPlotter:
+ """Load test-report workbooks from a folder and build an HTML scatter-plot report."""
+
+ def __init__(self):
+ """Initialise an empty plotter; paths and data are filled in by later steps."""
+ # Folder scanned for Excel files (set by get_folder_path).
+ self.folder_path: Optional[str] = None
+ # Merged data of all loaded workbooks.
+ self.df: Optional[pd.DataFrame] = None
+ # Directory the HTML report is written to.
+ self.output_dir: Optional[str] = None
+ # Columns every workbook must provide; also used as the read filter.
+ self.required_columns = ["Test Name New", "SN", "Measurement", "Test Time", "Lower Limit", "Upper Limit"]
+ # Actual column labels of the lower/upper limit columns in self.df.
+ self.col_lower: Optional[str] = None
+ self.col_upper: Optional[str] = None
+ # Full path of the generated HTML report.
+ self.html_report_path: Optional[str] = None
+ # One dict per loaded file (path, filename, load stats).
+ self.file_infos: List[Dict[str, Any]] = []
+
+ # Cache for already processed data.
+ self._processed_data_cache: Dict[str, Any] = {}
+
+ # Performance monitoring: one entry per loaded file in each list.
+ self.performance_stats = {
+ 'load_times': [],
+ 'memory_usage': [],
+ 'file_sizes': []
+ }
+
+ def _print_stage(self, msg: str, color=Fore.CYAN) -> None:
+ """统一的阶段信息输出"""
+ print(f"\n{color}{'=' * 50}")
+ print(f"📋 {msg}")
+ print(f"{'=' * 50}{Style.RESET_ALL}")
+
+ def _print_progress(self, current: int, total: int, prefix: str = "进度",
+ color=Fore.YELLOW) -> None:
+ """改进的进度条显示"""
+ if total <= 0:
+ return
+
+ percent = (current / total) * 100
+ bar_len = 40
+ filled = int(bar_len * current / total)
+ bar = "█" * filled + "░" * (bar_len - filled)
+
+ sys.stdout.write(f"\r{color}{prefix}: [{bar}] {current}/{total} ({percent:.1f}%){Style.RESET_ALL}")
+ sys.stdout.flush()
+
+ if current == total:
+ print(f"{Fore.GREEN} ✅ 完成{Style.RESET_ALL}")
+
+ def _print_warning(self, msg: str) -> None:
+ """警告信息输出"""
+ print(f"{Fore.YELLOW}⚠️ {msg}{Style.RESET_ALL}")
+
+ def _print_success(self, msg: str) -> None:
+ """成功信息输出"""
+ print(f"{Fore.GREEN}✅ {msg}{Style.RESET_ALL}")
+
+ def _print_error(self, msg: str) -> None:
+ """错误信息输出"""
+ print(f"{Fore.RED}❌ {msg}{Style.RESET_ALL}")
+
+ def _get_memory_usage(self) -> float:
+ """获取当前内存使用量(GB)"""
+ process = psutil.Process()
+ return process.memory_info().rss / (1024 ** 3)
+
+ def _check_memory_safe(self, file_size_mb: float) -> bool:
+ """检查内存是否安全"""
+ available_memory = psutil.virtual_memory().available / (1024 ** 3)
+ estimated_need = file_size_mb * 5 / 1024 # 估算需要的内存(GB)
+ return available_memory > estimated_need + 1 # 保留1GB安全空间
+
+ def _load_single_file_optimized(self, file_info: Dict[str, Any]) -> Optional[pd.DataFrame]:
+ """优化单文件加载方法"""
+ file_path = file_info['path']
+ filename = file_info['filename']
+
+ try:
+ start_time = time.time()
+ file_size_mb = os.path.getsize(file_path) / (1024 ** 2)
+
+ # 内存安全检查
+ if not self._check_memory_safe(file_size_mb):
+ self._print_warning(f"内存不足,跳过大文件: {filename} ({file_size_mb:.1f}MB)")
+ return None
+
+ # 选择合适的引擎
+ file_ext = file_path.lower()
+ if file_ext.endswith('.xlsx'):
+ engine = 'openpyxl'
+ elif file_ext.endswith('.xls'):
+ engine = 'xlrd'
+ else:
+ self._print_warning(f"不支持的文件格式: {filename}")
+ return None
+
+ # 快速获取工作表信息
+ try:
+ excel_file = pd.ExcelFile(file_path, engine=engine)
+ sheet_names = excel_file.sheet_names
+
+ # 选择工作表
+ target_sheets = ["Merged All Tests", "All Tests", sheet_names[0] if sheet_names else None]
+ selected_sheet = next((s for s in target_sheets if s and s in sheet_names), None)
+
+ if not selected_sheet:
+ self._print_warning(f"未找到目标工作表: {filename}")
+ return None
+
+ except Exception as e:
+ self._print_warning(f"无法读取工作表信息 {filename}: {e}")
+ return None
+
+ # 优化读取参数
+ read_kwargs = {
+ 'io': file_path,
+ 'sheet_name': selected_sheet,
+ 'engine': engine,
+ 'dtype': 'object',
+ 'na_filter': False,
+ 'usecols': self.required_columns, # 只读取需要的列
+ }
+
+ # 对于大文件,使用分块读取
+ if file_size_mb > 50: # 50MB以上使用分块读取
+ chunks = []
+ for chunk in pd.read_excel(**read_kwargs, chunksize=OPTIMIZATION_CONFIG['chunk_size']):
+ chunks.append(chunk)
+
+ if chunks:
+ df = pd.concat(chunks, ignore_index=True)
+ else:
+ df = pd.DataFrame()
+ else:
+ df = pd.read_excel(**read_kwargs)
+
+ if df.empty:
+ self._print_warning(f"文件为空: {filename}")
+ return None
+
+ # 检查必要列
+ missing_columns = [col for col in self.required_columns if col not in df.columns]
+ if missing_columns:
+ self._print_warning(f"缺少必要列 {filename}: {missing_columns}")
+ return None
+
+ # 添加文件标识
+ df['_source_file'] = filename
+
+ load_time = time.time() - start_time
+ file_info.update({
+ 'load_time': round(load_time, 2),
+ 'file_size_mb': round(file_size_mb, 2),
+ 'engine': engine,
+ 'rows': len(df)
+ })
+
+ self.performance_stats['load_times'].append(load_time)
+ self.performance_stats['file_sizes'].append(file_size_mb)
+ self.performance_stats['memory_usage'].append(self._get_memory_usage())
+
+ self._print_success(f"加载完成: {filename} ({len(df)}行, {load_time:.2f}s)")
+ return df
+
+ except Exception as e:
+ self._print_error(f"加载文件失败 {filename}: {e}")
+ return None
+
+
+
+ def _find_column_case_insensitive(self, candidates: List[str]) -> Optional[str]:
+ """优化的大小写不敏感列查找"""
+ if self.df is None:
+ return None
+
+ columns_lower = {col.lower().strip(): col for col in self.df.columns}
+ for candidate in candidates:
+ key = candidate.lower().strip()
+ if key in columns_lower:
+ return columns_lower[key]
+ return None
+
+ # 以下方法保持不变(为节省空间省略部分重复代码)
def get_folder_path(self) -> None:
    """Prompt on stdin until an existing directory is entered.

    The resolved absolute path is stored in ``self.folder_path``. Blank
    input is ignored; an invalid path prints an error and asks again.
    """
    self._print_stage("输入文件夹路径")

    chosen = None
    while chosen is None:
        print(f"{Fore.WHITE}请输入包含Excel文件的文件夹路径: ")
        folder_path = input("> ").strip()
        if not folder_path:
            continue

        candidate = Path(folder_path)
        if not (candidate.exists() and candidate.is_dir()):
            self._print_error(f"文件夹不存在: {folder_path},请重新输入")
            continue

        chosen = str(candidate.resolve())

    self.folder_path = chosen
    print(f"{Fore.GREEN}已选择文件夹: {self.folder_path}{Style.RESET_ALL}")
+
def find_excel_files(self) -> List[str]:
    """Recursively collect the Excel workbooks under ``self.folder_path``.

    Files ending in ``.xlsx`` or ``.xls`` (any letter case) are included.
    Lock files that Excel creates next to an open workbook (names starting
    with ``~$``) are skipped because they are not readable workbooks.

    Returns:
        Resolved absolute paths sorted by full path, or an empty list when
        scanning fails (the error is printed, not raised).
    """
    self._print_stage("扫描Excel文件")

    valid_extensions = ('.xlsx', '.xls')

    try:
        excel_files = sorted(
            str(file_path.resolve())
            for file_path in Path(self.folder_path).rglob('*')
            if file_path.suffix.lower() in valid_extensions
            and not file_path.name.startswith('~$')
            and file_path.is_file()
        )

        self._print_success(f"找到 {len(excel_files)} 个Excel文件")
        for i, file_path in enumerate(excel_files, 1):
            print(f" {i:2d}. {os.path.basename(file_path)}")

        return excel_files

    except Exception as e:
        self._print_error(f"扫描文件夹时发生错误: {e}")
        return []
+
def load_multiple_files_optimized(self, excel_files: List[str]) -> None:
    """Load every workbook, concatenate the results and locate limit columns.

    Files are loaded through a thread pool when ``OPTIMIZATION_CONFIG``
    enables threading and more than one file is given; otherwise they are
    loaded one by one. Frames are kept in the order of *excel_files*.

    Side effects: sets ``self.file_infos`` (entries of files that loaded),
    ``self.df`` (all rows), ``self.col_lower`` and ``self.col_upper``.

    Raises:
        ValueError: when no file could be loaded.
        Exception: whatever the concatenation raises, after printing it.
    """
    self._print_stage("并行加载Excel文件")
    started_at = time.time()
    total = len(excel_files)

    pending = [{'path': path, 'filename': os.path.basename(path)} for path in excel_files]

    frames = []
    self.file_infos = []

    def keep(info, frame):
        # Only files that produced a frame are recorded.
        if frame is not None:
            frames.append(frame)
            self.file_infos.append(info)

    if OPTIMIZATION_CONFIG['use_threading'] and total > 1:
        with ThreadPoolExecutor(max_workers=OPTIMIZATION_CONFIG['max_workers']) as executor:
            submitted = [
                (executor.submit(self._load_single_file_optimized, info), info)
                for info in pending
            ]

            completed = 0
            # Results are collected in submission order.
            for future, info in submitted:
                try:
                    keep(info, future.result(timeout=300))  # 5-minute timeout
                    completed += 1
                    self._print_progress(completed, total, "并行加载文件")
                except Exception as e:
                    self._print_error(f"加载失败 {info['filename']}: {e}")
    else:
        for position, info in enumerate(pending, 1):
            self._print_progress(position, total, "加载文件")
            keep(info, self._load_single_file_optimized(info))

    if not frames:
        raise ValueError("没有成功加载任何Excel文件")

    self._print_stage("合并数据")
    merge_started_at = time.time()

    try:
        self.df = pd.concat(frames, ignore_index=True, sort=False)
        merge_time = time.time() - merge_started_at
        total_time = time.time() - started_at

        load_times = self.performance_stats['load_times']
        avg_load_time = np.mean(load_times) if load_times else 0

        self._print_success(f"合并完成: {len(self.df)}行, {len(frames)}个文件")
        self._print_success(f"加载耗时: {total_time:.2f}s (平均: {avg_load_time:.2f}s/文件)")
        self._print_success(f"合并耗时: {merge_time:.2f}s")

        print(f"\n{Fore.CYAN}📊 性能统计:")
        print(f" 平均加载时间: {avg_load_time:.2f}s")
        print(f" 峰值内存使用: {max(self.performance_stats['memory_usage']):.2f}GB")
        print(f" 总文件大小: {sum(self.performance_stats['file_sizes']):.1f}MB{Style.RESET_ALL}")

    except Exception as e:
        self._print_error(f"合并数据失败: {e}")
        raise

    # Remember which columns hold the lower and upper limits.
    lower_names = ["Lower Limit", "lower limit", "lower_limit", "ll", "lower"]
    upper_names = ["Upper Limit", "upper limit", "upper_limit", "ul", "upper"]
    self.col_lower = self._find_column_case_insensitive(lower_names)
    self.col_upper = self._find_column_case_insensitive(upper_names)
def get_keywords(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
    """Ask for keywords and return the rows whose test name contains any.

    Keywords are entered comma-separated and matched against the
    'Test Name New' column as literal, case-insensitive substrings. Literal
    matching keeps this filter consistent with the comparison charts, which
    escape the keywords, and stops characters such as ``+`` or ``(`` from
    being read as regular-expression syntax.

    When nothing matches, the user may retry, use all rows, or give up.

    Returns:
        ``(filtered_df, keywords, unique_tests)``. ``filtered_df`` is empty
        and ``unique_tests`` is ``[]`` when the data frame is empty, the
        column is missing, or the user gives up.
    """
    self._print_stage("筛选关键词")

    while True:
        keyword_input = input("请输入一个或多个关键词(用逗号分隔,匹配 'Test Name New'): ").strip()

        if not keyword_input:
            print("❌ 关键词不能为空,请重新输入")
            continue

        keywords = [k.strip() for k in keyword_input.split(',') if k.strip()]

        if not keywords:
            print("❌ 没有有效的关键词,请重新输入")
            continue

        if self.df is None or self.df.empty:
            print("⚠️ 数据框为空,无法进行筛选")
            return pd.DataFrame(), keywords, []

        if "Test Name New" not in self.df.columns:
            print("❌ 列 'Test Name New' 不存在于数据框中")
            print(f"可用列: {list(self.df.columns)}")
            return pd.DataFrame(), keywords, []

        try:
            # OR together one literal-substring mask per keyword.
            test_names = self.df["Test Name New"].astype(str)
            mask = pd.Series(False, index=self.df.index)
            for keyword in keywords:
                mask = mask | test_names.str.contains(
                    keyword, case=False, na=False, regex=False
                )

            filtered_df = self.df.loc[mask].copy()

            if filtered_df.empty:
                print(f"⚠️ 没有找到包含关键词 '{', '.join(keywords)}' 的测试项")

                # Show a few existing test names as a hint.
                available_tests = self.df["Test Name New"].dropna().unique()
                if len(available_tests) > 0:
                    print("📋 可用的测试项示例:")
                    for test in available_tests[:5]:
                        print(f" - {test}")
                    if len(available_tests) > 5:
                        print(f" ... 还有 {len(available_tests) - 5} 个测试项")

                choice = input("请选择: 1-重新输入关键词 2-使用所有数据 3-退出当前操作: ")
                if choice == "1":
                    continue
                elif choice == "2":
                    filtered_df = self.df.copy()
                    unique_tests = filtered_df["Test Name New"].unique().tolist()
                    print(f"✅ 使用所有数据: {len(filtered_df)} 行,{len(unique_tests)} 个测试项")
                    return filtered_df, keywords, unique_tests
                else:
                    print("👋 退出筛选操作")
                    return pd.DataFrame(), keywords, []
            else:
                unique_tests = filtered_df["Test Name New"].unique().tolist()
                print(f"✅ 匹配到 {len(filtered_df)} 行数据,涉及 {len(unique_tests)} 个不同测试项")
                print(f" 使用的关键词: {', '.join(keywords)}")
                return filtered_df, keywords, unique_tests

        except Exception as e:
            print(f"❌ 筛选过程中发生错误: {e}")
            print("请检查数据格式或重新输入关键词")
            continue
+
def create_output_dir(self, keyword) -> None:
    """Create the report folder and choose the HTML report path.

    Sets ``self.output_dir`` to ``<folder_path>/scatter_report_out`` and
    ``self.html_report_path`` to a time-stamped file inside it, named after
    the sanitised *keyword* ("all_data" when *keyword* is falsy).

    Raises:
        ValueError: when ``self.folder_path`` has not been set.
    """
    self._print_stage("创建输出目录")

    if not self.folder_path:
        raise ValueError("文件夹路径未设置")

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    self.output_dir = os.path.join(self.folder_path, "scatter_report_out")

    if keyword:
        name_part = self._safe_filename(keyword)
    else:
        name_part = "all_data"
    report_name = f"{name_part}_report_{stamp}.html"
    self.html_report_path = os.path.join(self.output_dir, report_name)

    os.makedirs(self.output_dir, exist_ok=True)
    print(f"输出目录: {self.output_dir}")
+
+
+ @staticmethod
+ def _safe_filename(name: str) -> str:
+ """生成安全的文件名"""
+ safe = "".join(c for c in str(name) if c.isalnum() or c in (" ", "_", "-")).strip()
+ return safe or "Unknown_Test"
+
+ def _extract_limits(self, df_one_test: pd.DataFrame) -> Tuple[
+ Optional[float], Optional[float], List[float], List[float]]:
+ """提取某个测试项的上下限数值"""
+ lower_plot = upper_plot = None
+ lower_set = []
+ upper_set = []
+
+ if self.col_lower and self.col_lower in df_one_test.columns:
+ lower_vals = self._clean_and_convert_series(df_one_test[self.col_lower], 'numeric').dropna().unique()
+ lower_set = sorted(lower_vals.tolist()) if len(lower_vals) > 0 else []
+ if lower_set:
+ lower_plot = min(lower_set)
+
+ if self.col_upper and self.col_upper in df_one_test.columns:
+ upper_vals = self._clean_and_convert_series(df_one_test[self.col_upper], 'numeric').dropna().unique()
+ upper_set = sorted(upper_vals.tolist()) if len(upper_vals) > 0 else []
+ if upper_set:
+ upper_plot = max(upper_set)
+
+ return lower_plot, upper_plot, lower_set, upper_set
+
+
+ @staticmethod
+ def _clean_and_convert_series(series: pd.Series, target_type: str = 'numeric') -> pd.Series:
+ """统一的系列清洗和转换方法 - 修复了 ast 方法名错误"""
+ if series.empty:
+ return series
+
+ if target_type == 'numeric':
+ # 数值转换优化
+ if pd.api.types.is_numeric_dtype(series):
+ return series.astype(float)
+
+ # 批量字符串处理 - 修复这里的问题
+ cleaned = series.astype(str).str.replace(r'[, ]', '', regex=True).str.strip()
+ return pd.to_numeric(cleaned, errors='coerce')
+
+ elif target_type == 'datetime':
+ return MultiFileTestReportScatterPlotter._convert_to_datetime(series)
+
+ return series
+
+
+ @staticmethod
+ def _convert_to_datetime(series: pd.Series) -> pd.Series:
+ """优化的日期时间转换"""
+ if pd.api.types.is_datetime64_any_dtype(series):
+ return series
+
+ # 预处理:转换为数值和字符串两种形式
+ numeric_series = pd.to_numeric(series, errors='coerce')
+ string_series = series.astype(str).str.strip()
+
+ result = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns]')
+
+ # 数值时间戳处理 - 优化逻辑
+ masks = {
+ 'ms': numeric_series >= 1e12, # 调整为更合理的阈值
+ 's_ms': (numeric_series >= 1e9) & (numeric_series < 1e12), # 包含秒和毫秒的混合情况
+ 'excel': (numeric_series > 20000) & (numeric_series < 60000)
+ }
+
+ for mask_type, mask in masks.items():
+ if mask.any():
+ if mask_type == 'ms':
+ result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='ms')
+ elif mask_type == 's_ms':
+ # 对有小数部分的时间戳使用浮点数处理
+ timestamp_values = numeric_series.loc[mask]
+
+ # 检查是否有小数部分
+ has_decimal = (timestamp_values % 1 != 0)
+
+ # 对整数部分(秒级时间戳)处理
+ if (~has_decimal).any():
+ integer_mask = mask & (~has_decimal)
+ result.loc[integer_mask] = pd.to_datetime(
+ numeric_series.loc[integer_mask].astype('int64'), unit='s'
+ )
+
+ # 对小数部分(可能是毫秒级)处理
+ if has_decimal.any():
+ decimal_mask = mask & has_decimal
+ # 尝试毫秒单位转换
+ result.loc[decimal_mask] = pd.to_datetime(
+ numeric_series.loc[decimal_mask] * 1000, unit='ms'
+ )
+
+ elif mask_type == 'excel':
+ origin = pd.Timestamp('1899-12-30')
+ result.loc[mask] = origin + pd.to_timedelta(numeric_series.loc[mask], unit='D')
+
+ # 字符串日期处理
+ remaining_mask = result.isna()
+ if remaining_mask.any():
+ remaining_strings = string_series.loc[remaining_mask]
+
+ # 特定格式优先处理
+ format_patterns = [
+ (r'^\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}$', '%Y-%m-%d %H-%M-%S'),
+ ]
+
+ for pattern, date_format in format_patterns:
+ format_mask = remaining_strings.str.match(pattern)
+ if format_mask.any():
+ result.loc[remaining_mask[remaining_mask].index[format_mask]] = pd.to_datetime(
+ remaining_strings.loc[format_mask], format=date_format, errors='coerce'
+ )
+
+ # 通用解析
+ still_na_mask = result.isna() & remaining_mask
+ if still_na_mask.any():
+ result.loc[still_na_mask] = pd.to_datetime(
+ string_series.loc[still_na_mask], errors='coerce'
+ )
+
+ return result
+
+ def _preprocess_test_data(self, test_data: pd.DataFrame) -> pd.DataFrame:
+ """数据预处理"""
+ # 数值转换
+ test_data['Measurement_num'] = self._clean_and_convert_series(
+ test_data['Measurement'], 'numeric'
+ )
+ test_data['TestTime_dt'] = self._clean_and_convert_series(
+ test_data['Test Time'], 'datetime'
+ )
+
+ # 去除无效数据
+ valid_data = test_data.dropna(subset=['Measurement_num', 'TestTime_dt'])
+ return valid_data.sort_values('TestTime_dt')
+
+ def _preprocess_data(self, test_data: pd.DataFrame) -> pd.DataFrame:
+ """数据预处理 - 简化版本用于对比图"""
+ test_data = test_data.copy()
+
+ # 数值转换
+ test_data['Measurement_num'] = self._clean_and_convert_series(
+ test_data['Measurement'], 'numeric'
+ )
+ test_data['TestTime_dt'] = self._clean_and_convert_series(
+ test_data['Test Time'], 'datetime'
+ )
+
+ # 去除无效数据
+ valid_data = test_data.dropna(subset=['Measurement_num', 'TestTime_dt'])
+ return valid_data
+
+
+
+
+ def _calculate_statistics(self, y_data: pd.Series) -> Dict[str, float]:
+ """计算统计信息"""
+ stats = {
+ 'count': len(y_data),
+ 'mean': y_data.mean(),
+ 'median': y_data.median(),
+ 'min': y_data.min(),
+ 'max': y_data.max(),
+ 'std': y_data.std(),
+ 'q1': y_data.quantile(0.25),
+ 'q3': y_data.quantile(0.75)
+ }
+ return stats
+
def _plot_to_base64(self, fig) -> str:
    """Render *fig* as a 150-dpi PNG and return it base64-encoded.

    The figure is always closed, including when rendering fails, so a
    failed chart does not stay registered with pyplot.

    Args:
        fig: The matplotlib figure to render.

    Returns:
        The PNG bytes as a base64 text string.
    """
    buf = BytesIO()
    try:
        fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    finally:
        plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode('utf-8')
+
def _create_summary_plot(self, test_data: pd.DataFrame, test_name: str,
                         lower_plot: Optional[float], upper_plot: Optional[float]) -> str:
    """Draw one scatter chart holding every SN of a test and return it as base64.

    Each SN is one labelled scatter series (a single "Unknown_SN" series
    when there is no "SN" column). Dashed lines mark the limits that are
    not ``None``; mean and median are drawn across the data's time span.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    if "SN" in test_data.columns:
        groups = list(test_data.groupby("SN"))
    else:
        groups = [("Unknown_SN", test_data)]
    for sn, group in groups:
        ax.scatter(group['TestTime_dt'], group['Measurement_num'],
                   label=str(sn), alpha=0.7, s=25)

    stats = self._calculate_statistics(test_data['Measurement_num'])
    times = test_data['TestTime_dt']
    x_min, x_max = times.min(), times.max()

    # Limit lines span the whole axis and are drawn only when a limit exists.
    limit_lines = (
        (lower_plot, 'green', "Lower Limit"),
        (upper_plot, 'red', "Upper Limit"),
    )
    for value, colour, caption in limit_lines:
        if value is not None:
            ax.axhline(y=value, color=colour, linestyle='--', linewidth=1.2, label=caption)

    # Statistic lines cover only the observed time range.
    stat_lines = (
        ('mean', 'orange', '-', 'Mean'),
        ('median', 'purple', '-.', 'Median'),
    )
    for key, colour, dash, caption in stat_lines:
        ax.hlines(y=stats[key], xmin=x_min, xmax=x_max, colors=colour,
                  linestyles=dash, linewidth=1.5, alpha=0.7, label=caption)

    ax.set_title(f"汇总图 - {test_name}")
    ax.set_xlabel("Test Time")
    ax.set_ylabel("Measurement Value")
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    return self._plot_to_base64(fig)
+
+ def _create_sn_plots(self, test_data: pd.DataFrame, test_name: str,
+ lower_plot: Optional[float], upper_plot: Optional[float]) -> List[Dict[str, str]]:
+ """为每个SN创建独立图表"""
+ sn_plots = []
+
+ if "SN" not in test_data.columns:
+ return sn_plots
+
+ sn_groups = test_data.groupby("SN")
+
+ for sn, group in sn_groups:
+ if group.empty:
+ continue
+
+ fig, ax = plt.subplots(figsize=(10, 6))
+
+ # 绘制当前SN的数据点
+ ax.scatter(group['TestTime_dt'], group['Measurement_num'],
+ color='blue', alpha=0.7, s=30, label=f"SN: {sn}")
+
+ # 计算当前SN的统计信息
+ y_data = group['Measurement_num']
+ stats = self._calculate_statistics(y_data)
+
+ # 绘制限值线
+ x_min, x_max = group['TestTime_dt'].min(), group['TestTime_dt'].max()
+
+ if lower_plot is not None:
+ ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2, label="Lower Limit")
+ if upper_plot is not None:
+ ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2, label="Upper Limit")
+
+ # 添加统计线
+ ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, colors='orange',
+ linestyles='-', linewidth=1.5, alpha=0.7, label='Mean')
+ ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, colors='purple',
+ linestyles='-.', linewidth=1.5, alpha=0.7, label='Median')
+
+ # 设置图形属性
+ ax.set_title(f"SN独立图 - {test_name} (SN: {sn})")
+ ax.set_xlabel("Test Time")
+ ax.set_ylabel("Measurement Value")
+ ax.grid(True, alpha=0.3)
+ ax.tick_params(axis='x', rotation=45)
+ ax.legend()
+
+ # 转换为base64
+ plot_image = self._plot_to_base64(fig)
+ sn_plots.append({"sn": str(sn), "image": plot_image})
+
+ return sn_plots
+
+
+ def _determine_test_status(self, stats: Dict[str, float],
+ lower_limit: Optional[float],
+ upper_limit: Optional[float]) -> Dict[str, Any]:
+ """确定测试状态"""
+ status = "success"
+ status_display = "正常"
+
+ if lower_limit is not None and upper_limit is not None:
+ # 检查是否超出限值
+ if stats['min'] < lower_limit or stats['max'] > upper_limit:
+ status = "danger"
+ status_display = "异常"
+ elif (stats['mean'] < lower_limit * 1.1 or stats['mean'] > upper_limit * 0.9 or
+ stats['std'] > (upper_limit - lower_limit) * 0.2):
+ status = "warning"
+ status_display = "警告"
+
+ return {"status": status, "status_display": status_display}
+
+ def _create_comparison_plots(self, filtered_df: pd.DataFrame, keywords: List[str]) -> List[Dict[str, str]]:
+ """创建多关键词对比图 - 优化版本:时间序列改为散点图"""
+ comparison_plots = []
+
+ if len(keywords) <= 1:
+ return comparison_plots # 单一关键词不需要对比图
+
+ try:
+ # 确保有足够的数据
+ if filtered_df.empty:
+ return comparison_plots
+
+ # 1. 散点图对比(原时间序列图改为散点图)
+ fig1, ax1 = plt.subplots(figsize=(14, 8))
+ has_data = False
+
+ # 使用更鲜艳的颜色
+ colors = plt.cm.Set3(np.linspace(0, 1, len(keywords)))
+ markers = ['o', 's', '^', 'D', 'v', '<', '>', 'p', '*', 'h'] # 多种标记形状
+
+ # 获取全局时间范围用于统一x轴
+ global_min_time = None
+ global_max_time = None
+
+ # 先收集所有数据的时间范围
+ for keyword in keywords:
+ keyword_mask = filtered_df["Test Name New"].astype(str).str.contains(
+ re.escape(keyword), case=False, na=False
+ )
+ keyword_data = filtered_df.loc[keyword_mask].copy()
+
+ if not keyword_data.empty:
+ keyword_data = self._preprocess_test_data(keyword_data)
+ if not keyword_data.empty:
+ time_min = keyword_data['TestTime_dt'].min()
+ time_max = keyword_data['TestTime_dt'].max()
+
+ if global_min_time is None or time_min < global_min_time:
+ global_min_time = time_min
+ if global_max_time is None or time_max > global_max_time:
+ global_max_time = time_max
+
+ for i, keyword in enumerate(keywords):
+ keyword_mask = filtered_df["Test Name New"].astype(str).str.contains(
+ re.escape(keyword), case=False, na=False
+ )
+ keyword_data = filtered_df.loc[keyword_mask].copy()
+
+ if not keyword_data.empty:
+ keyword_data = self._preprocess_test_data(keyword_data)
+ if not keyword_data.empty:
+ # 对散点图数据添加少量随机抖动,避免完全重叠
+ if len(keyword_data) > 1:
+ # 为相同时间点的数据添加微小的时间偏移,避免重叠
+ time_jitter = pd.Timedelta(minutes=1) # 1分钟抖动
+ jitter_range = np.random.uniform(-0.5, 0.5, len(keyword_data)) * time_jitter
+ keyword_data['TestTime_dt_jittered'] = keyword_data['TestTime_dt'] + jitter_range
+ x_values = keyword_data['TestTime_dt_jittered']
+ else:
+ x_values = keyword_data['TestTime_dt']
+
+ y_values = keyword_data['Measurement_num']
+
+ if len(x_values) > 0:
+ # 使用散点图,设置不同的标记和透明度
+ scatter = ax1.scatter(x_values, y_values,
+ label=f"{keyword} (n={len(keyword_data)})",
+ color=colors[i],
+ marker=markers[i % len(markers)],
+ s=40, # 点的大小
+ alpha=0.7, # 透明度
+ edgecolors='white', # 边缘颜色
+ linewidth=0.5) # 边缘线宽
+ has_data = True
+
+ # 为每个关键词添加趋势线(可选)
+ if len(keyword_data) >= 3:
+ try:
+ # 按时间排序
+ sorted_data = keyword_data.sort_values('TestTime_dt')
+ # 计算简单线性趋势
+ x_numeric = pd.to_numeric(sorted_data['TestTime_dt'])
+ y_trend = sorted_data['Measurement_num']
+
+ if len(x_numeric) >= 2:
+ # 使用numpy的polyfit计算趋势线
+ z = np.polyfit(x_numeric, y_trend, 1)
+ p = np.poly1d(z)
+ ax1.plot(sorted_data['TestTime_dt'], p(x_numeric),
+ color=colors[i], linestyle='--',
+ alpha=0.5, linewidth=1,
+ label=f"{keyword} 趋势线")
+ except:
+ pass # 趋势线计算失败时忽略
+
+ if has_data:
+ # 设置统一的x轴范围
+ if global_min_time and global_max_time:
+ # 添加一些边距
+ time_range = global_max_time - global_min_time
+ margin = time_range * 0.05
+ ax1.set_xlim(global_min_time - margin, global_max_time + margin)
+
+ ax1.set_title("多关键词散点图对比", fontsize=14, fontweight='bold')
+ ax1.set_xlabel("测试时间", fontsize=12)
+ ax1.set_ylabel("测量值", fontsize=12)
+ ax1.grid(True, alpha=0.3)
+ ax1.tick_params(axis='x', rotation=45)
+
+ # 优化图例显示
+ ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left',
+ fontsize=10, framealpha=0.9)
+ plt.tight_layout()
+
+ comparison_plots.append({
+ "title": "散点图对比",
+ "image": self._plot_to_base64(fig1)
+ })
+
+ # 2. 箱线图对比(简化版)
+ plot_data = []
+ labels = []
+
+ for keyword in keywords:
+ keyword_mask = filtered_df["Test Name New"].astype(str).str.contains(
+ re.escape(keyword), case=False, na=False
+ )
+ keyword_data = filtered_df.loc[keyword_mask].copy()
+
+ if not keyword_data.empty:
+ keyword_data = self._preprocess_test_data(keyword_data)
+ if not keyword_data.empty and len(keyword_data) >= 3: # 至少3个数据点
+ plot_data.append(keyword_data['Measurement_num'].values)
+ labels.append(f"{keyword}\n(n={len(keyword_data)})")
+
+ if len(plot_data) >= 2: # 至少两个关键词有数据
+ fig2, ax2 = plt.subplots(figsize=(10, 6))
+ bp = ax2.boxplot(plot_data, tick_labels=labels, patch_artist=True)
+
+ colors = plt.cm.Set3(np.linspace(0, 1, len(plot_data)))
+ for i, box in enumerate(bp['boxes']):
+ box.set(facecolor=colors[i], alpha=0.7)
+
+ ax2.set_title("多关键词箱线图对比", fontsize=14, fontweight='bold')
+ ax2.set_ylabel("测量值", fontsize=12)
+ ax2.grid(True, alpha=0.3)
+ plt.tight_layout()
+
+ comparison_plots.append({
+ "title": "箱线图对比",
+ "image": self._plot_to_base64(fig2)
+ })
+
+ # 3. 分布直方图对比
+ fig3, ax3 = plt.subplots(figsize=(12, 6))
+ has_hist_data = False
+
+ colors = plt.cm.Set3(np.linspace(0, 1, len(keywords)))
+
+ for i, keyword in enumerate(keywords):
+ keyword_mask = filtered_df["Test Name New"].astype(str).str.contains(
+ re.escape(keyword), case=False, na=False
+ )
+ keyword_data = filtered_df.loc[keyword_mask].copy()
+
+ if not keyword_data.empty:
+ keyword_data = self._preprocess_test_data(keyword_data)
+ if not keyword_data.empty:
+ # 动态调整直方图bins数量
+ n_bins = min(20, max(5, len(keyword_data) // 10))
+ ax3.hist(keyword_data['Measurement_num'].values,
+ bins=n_bins,
+ alpha=0.6,
+ label=f"{keyword} (n={len(keyword_data)})",
+ color=colors[i],
+ density=True) # 使用密度而不是频次
+ has_hist_data = True
+
+ if has_hist_data:
+ ax3.set_title("多关键词分布直方图对比", fontsize=14, fontweight='bold')
+ ax3.set_xlabel("测量值", fontsize=12)
+ ax3.set_ylabel("密度", fontsize=12)
+ ax3.legend()
+ ax3.grid(True, alpha=0.3)
+ plt.tight_layout()
+
+ comparison_plots.append({
+ "title": "分布直方图对比",
+ "image": self._plot_to_base64(fig3)
+ })
+
+ except Exception as e:
+ self._print_warning(f"创建对比图时出错: {e}")
+ import traceback
+ traceback.print_exc()
+
+ return comparison_plots
+
def generate_html_report(self, filtered_df: pd.DataFrame, keywords: List[str],
                         unique_tests: List[str]) -> None:
    """Render all charts and statistics into the HTML report file.

    For every test in *unique_tests* with valid data, a summary chart, one
    chart per SN, statistics and a status are produced. Keyword comparison
    charts are added on top. The result is rendered with ``HTML_TEMPLATE``
    and written to ``self.html_report_path`` as UTF-8.

    Args:
        filtered_df: Rows selected by the keyword filter.
        keywords: Keywords used for the filter (shown in the report).
        unique_tests: Distinct 'Test Name New' values to report on.
    """
    self._print_stage("生成HTML报告")
    start_time = time.time()

    test_results = []
    total_points = 0
    status_counts = {"success": 0, "warning": 0, "danger": 0}

    # Keyword comparison charts
    print(f"🔍 调试: 开始生成对比图,关键词数量: {len(keywords)}")
    comparison_plot_images = self._create_comparison_plots(filtered_df, keywords)
    print(f"🔍 调试: 对比图生成完成,数量: {len(comparison_plot_images)}")

    for i, plot in enumerate(comparison_plot_images):
        print(f" - 对比图{i + 1}: {plot['title']}, 图像大小: {len(plot['image'])} 字符")

    # Per-test charts
    for i, test_name in enumerate(unique_tests, 1):
        self._print_progress(i, len(unique_tests), "生成测试报告")

        test_data = filtered_df[filtered_df["Test Name New"] == test_name].copy()
        test_data = self._preprocess_test_data(test_data)

        # Tests without any valid measurement are left out of the report.
        if test_data.empty:
            continue

        lower_plot, upper_plot, _, _ = self._extract_limits(test_data)

        y_data = test_data['Measurement_num']
        stats = self._calculate_statistics(y_data)
        total_points += stats['count']

        summary_plot_image = self._create_summary_plot(test_data, test_name, lower_plot, upper_plot)
        sn_plot_images = self._create_sn_plots(test_data, test_name, lower_plot, upper_plot)

        status_info = self._determine_test_status(stats, lower_plot, upper_plot)
        status_counts[status_info["status"]] += 1

        test_results.append({
            "name": test_name,
            "stats": stats,
            "limits": {"lower": lower_plot, "upper": upper_plot},
            "summary_plot_image": summary_plot_image,
            "sn_plot_images": sn_plot_images,
            "status": status_info["status"],
            "status_display": status_info["status_display"]
        })

    print(f"🔍 调试: 传递给模板的对比图数量: {len(comparison_plot_images)}")

    template = Template(HTML_TEMPLATE)
    html_content = template.render(
        keywords=keywords if keywords else ["所有数据"],
        keywords_display=", ".join(keywords) if keywords else "所有数据",
        timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        test_count=len(test_results),
        total_points=total_points,
        tests=test_results,
        folder_path=self.folder_path,
        analysis_time=round(time.time() - start_time, 2),
        status_counts={"normal": status_counts["success"], "warning": status_counts["warning"],
                       "abnormal": status_counts["danger"]},
        file_count=len(self.file_infos),
        file_infos=self.file_infos,
        total_rows=len(self.df) if self.df is not None else 0,
        comparison_plot_images=comparison_plot_images
    )

    # Verify embedding by looking for the image payloads themselves. The
    # earlier check searched for the template variable name and a chart
    # title that no chart uses, so it did not reflect the real outcome.
    if comparison_plot_images:
        if all(plot["image"] in html_content for plot in comparison_plot_images):
            print("✅ 对比图已成功嵌入HTML")
        else:
            print("❌ 对比图未正确嵌入HTML")

    with open(self.html_report_path, 'w', encoding='utf-8') as f:
        f.write(html_content)

    self._print_success(f"HTML报告已生成: {self.html_report_path}")
    self._print_success(
        f"共处理 {len(self.file_infos)} 个文件,{len(test_results)} 个测试项,{total_points} 个数据点")
    if len(keywords) > 1:
        self._print_success(f"已生成 {len(comparison_plot_images)} 个对比图表")
+
def run(self) -> None:
    """Run the interactive workflow from folder selection to report.

    Steps: ask for the folder, scan it for workbooks, load them, then
    repeatedly ask for keywords and generate a report until the user
    declines to continue or a filter yields no data. Ctrl+C and any other
    error are reported on stdout instead of propagating.
    """
    try:
        self.get_folder_path()
        excel_files = self.find_excel_files()

        if not excel_files:
            self._print_error("没有找到可用的Excel文件")
            return

        self.load_multiple_files_optimized(excel_files)

        keep_going = True
        while keep_going:
            filtered_df, keywords, unique_tests = self.get_keywords()
            if filtered_df.empty:
                self._print_warning("没有数据可处理,退出程序")
                break

            # The report file is named after the sanitised keywords.
            if keywords:
                safe_keyword_text = "_".join([self._safe_filename(k) for k in keywords])
            else:
                safe_keyword_text = "all_data"
            self.create_output_dir(safe_keyword_text)
            self.generate_html_report(filtered_df, keywords, unique_tests)

            self._print_success("分析完成!")
            print(f"📊 报告文件: {self.html_report_path}")
            print(f"📁 输出目录: {self.output_dir}")
            if len(keywords) > 1:
                print(f"🔍 对比关键词: {', '.join(keywords)}")

            answer = input("\n是否继续分析其他关键词?(y/n): ").strip().lower()
            keep_going = answer in ['y', 'yes', '是']

    except KeyboardInterrupt:
        self._print_warning("用户中断程序")
    except Exception as e:
        self._print_error(f"发生错误: {type(e).__name__}: {str(e)}")
        import traceback
        traceback.print_exc()
+
+
if __name__ == "__main__":
    # Script entry point: build the plotter and start the interactive run.
    MultiFileTestReportScatterPlotter().run()
|