Files
PythonApp/dataProcess/dataProcess_html_V1.py

1061 lines
45 KiB
Python
Raw Normal View History

2026-02-02 15:19:30 +08:00
import pandas as pd
import tkinter as tk
from tkinter import filedialog
import os
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from io import BytesIO
import base64
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor, as_completed
import time
import json
import traceback
# Configure fonts able to render the Chinese chart labels, and use the plain
# hyphen for negative numbers instead of the Unicode minus sign.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})
def _render_feature_chart(group_key, feature_name, feature_data, usl, lsl):
    """Draw the 2x2 analysis panel for one feature and return it as base64 PNG text.

    The panel holds a histogram, a box plot, a run chart and a density plot,
    each annotated with the upper/lower limit lines. The figure is always
    closed, even when drawing fails, so worker processes do not accumulate
    open figures.
    """
    mean_value = feature_data.mean()  # computed once, reused by three panels
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    try:
        fig.suptitle(f'{group_key} - {feature_name} 统计分析', fontsize=14)

        def mark_limits(ax, vertical, with_mean=True):
            # Same drawing order as before: USL, LSL, then the mean line.
            draw = ax.axvline if vertical else ax.axhline
            draw(usl, color='red', linestyle='--', label=f'上限: {usl:.2f}', linewidth=1)
            draw(lsl, color='green', linestyle='--', label=f'下限: {lsl:.2f}', linewidth=1)
            if with_mean:
                draw(mean_value, color='orange', linestyle='-',
                     label=f'均值: {mean_value:.2f}', linewidth=1.5)

        def finish(ax, title, xlabel=None, ylabel=None):
            ax.set_title(title)
            if xlabel is not None:
                ax.set_xlabel(xlabel)
            if ylabel is not None:
                ax.set_ylabel(ylabel)
            ax.legend(fontsize=8)
            ax.grid(True, alpha=0.3)

        # 1. Histogram
        hist_ax = axes[0, 0]
        hist_ax.hist(feature_data, bins=15, alpha=0.7, color='skyblue', edgecolor='black')
        mark_limits(hist_ax, vertical=True)
        finish(hist_ax, '直方图', xlabel=feature_name, ylabel='频数')

        # 2. Box plot (limits only, no mean line)
        box_ax = axes[0, 1]
        sns.boxplot(y=feature_data, ax=box_ax, color='lightblue')
        mark_limits(box_ax, vertical=False, with_mean=False)
        finish(box_ax, '箱线图', ylabel=feature_name)

        # 3. Run chart: values in their original order
        run_ax = axes[1, 0]
        run_ax.plot(range(len(feature_data)), feature_data, 'o-', color='blue',
                    alpha=0.7, markersize=3, linewidth=1)
        mark_limits(run_ax, vertical=False)
        finish(run_ax, '序列图', xlabel='数据点序号', ylabel=feature_name)

        # 4. Kernel density estimate
        kde_ax = axes[1, 1]
        sns.kdeplot(feature_data, ax=kde_ax, color='blue', fill=True, alpha=0.5)
        mark_limits(kde_ax, vertical=True)
        finish(kde_ax, '概率密度图', xlabel=feature_name, ylabel='密度')

        fig.tight_layout()
        # Serialise to an in-memory PNG and encode it for embedding in HTML.
        buffer = BytesIO()
        fig.savefig(buffer, format='png', dpi=80, bbox_inches='tight')
        return base64.b64encode(buffer.getvalue()).decode()
    finally:
        plt.close(fig)


def plot_worker(args):
    """Worker-process entry point: render the charts of a single group.

    Args:
        args: ``(group_key, feature_data_dict, limits_dict)`` where
            ``feature_data_dict`` maps a feature name to its data series and
            ``limits_dict`` maps the same names to ``(usl, lsl)``.

    Returns:
        ``(group_key, results)`` with ``results`` mapping each feature name to
        a base64 PNG string, or ``""`` when the feature has no data or its
        chart could not be drawn. If ``args`` itself is malformed the result
        is ``(group_key_or_None, {})``.
    """
    # Recover the key up front so the error handler can always report it,
    # even when the tuple cannot be unpacked.
    group_key = args[0] if isinstance(args, (tuple, list)) and args else None
    try:
        group_key, feature_data_dict, limits_dict = args
        # Re-apply the font configuration inside the worker process.
        plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
        plt.rcParams['axes.unicode_minus'] = False
        results = {}
        for feature_name, feature_data in feature_data_dict.items():
            if len(feature_data) == 0:
                results[feature_name] = ""
                continue
            try:
                usl, lsl = limits_dict[feature_name]
                results[feature_name] = _render_feature_chart(
                    group_key, feature_name, feature_data, usl, lsl
                )
            except Exception as e:
                # One bad feature must not discard the group's other charts.
                print(f"❌ 图表生成失败 {group_key} / {feature_name}: {e}")
                print(f" 错误详情: {traceback.format_exc()}")
                results[feature_name] = ""
        return group_key, results
    except Exception as e:
        print(f"❌ 图表生成失败 {group_key}: {e}")
        print(f" 错误详情: {traceback.format_exc()}")
        return group_key, {}
class DataProcessor:
def __init__(self):
self.data = None
self.filename = None
self.file_path = None
self.file_dir = None # 新增:存储输入文件所在目录
self.stats = None
self.output_dir = None
self.progress_file = None
def select_file(self):
    """Ask the user for the input file through a native file dialog.

    Stores the chosen path in ``self.file_path`` and derives
    ``self.filename`` and ``self.file_dir`` from it.

    Returns:
        True when a file was chosen, False when the dialog was cancelled.
    """
    print("打开文件选择对话框...")
    root = tk.Tk()
    try:
        root.withdraw()  # only the dialog is wanted, not an empty main window
        self.file_path = filedialog.askopenfilename(
            title="选择数据文件",
            filetypes=[("Excel files", "*.xlsx"), ("CSV files", "*.csv"), ("All files", "*.*")]
        )
    finally:
        # Release the hidden root window instead of leaving it alive for the
        # rest of the run.
        root.destroy()
    if not self.file_path:
        print("❌ 未选择文件")
        return False
    self.filename = os.path.basename(self.file_path)
    self.file_dir = os.path.dirname(self.file_path)
    print(f"✅ 已选择文件: {self.filename}")
    print(f"📁 文件所在目录: {self.file_dir}")
    return True
def _load_data(self):
"""加载数据文件"""
print("开始加载数据文件...")
try:
if self.file_path.endswith('.csv'):
self.data = pd.read_csv(self.file_path)
print("✅ 成功加载CSV文件")
elif self.file_path.endswith('.xlsx'):
self.data = pd.read_excel(self.file_path)
print("✅ 成功加载Excel文件")
else:
raise ValueError("不支持的文件格式")
print(f"📊 数据文件形状: {self.data.shape}")
except Exception as e:
print(f"❌ 加载数据文件时出错: {e}")
print(f" 错误详情: {traceback.format_exc()}")
raise
def _validate_data(self):
"""验证数据完整性 - 增强验证:检查上下限列"""
print("验证数据完整性...")
# 检查必要的测量列
required_measure_columns = ['PAD ID', 'Component ID', 'Height(mil)', 'Volume(%)', 'Area(%)']
missing_measure_columns = [col for col in required_measure_columns if col not in self.data.columns]
if missing_measure_columns:
error_msg = f"数据文件中缺少必要的测量列: {missing_measure_columns}"
print(f"{error_msg}")
raise ValueError(error_msg)
# 检查必要的上下限列
required_limit_columns = ['Height_Low(mil)', 'Height_High(mil)',
'Vol_Min(%)', 'Vol_Max(%)',
'Area_Min(%)', 'Area_Max(%)']
missing_limit_columns = [col for col in required_limit_columns if col not in self.data.columns]
if missing_limit_columns:
error_msg = f"数据文件中缺少必要的上下限列: {missing_limit_columns}"
print(f"{error_msg}")
raise ValueError(error_msg)
print("✅ 数据验证通过")
# 检查数据是否存在空值
all_required_columns = required_measure_columns + required_limit_columns
null_counts = self.data[all_required_columns].isnull().sum()
if null_counts.any():
print(f"⚠️ 数据中存在空值 - {null_counts[null_counts > 0].to_dict()}")
def _setup_output_directory(self):
"""设置输出目录"""
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
base_name = os.path.splitext(self.filename)[0]
# 优化:输出目录放置在输入文件所在文件夹下
self.output_dir = os.path.join(self.file_dir, f"{base_name}_report_{timestamp}")
# 创建主目录
os.makedirs(self.output_dir, exist_ok=True)
# 创建分组报告子目录
os.makedirs(os.path.join(self.output_dir, 'group_reports'), exist_ok=True)
# 创建进度文件
self.progress_file = os.path.join(self.output_dir, 'progress.json')
print(f"📁 输出目录: {self.output_dir}")
def _save_progress(self, completed_groups=None, current_stage=None):
"""保存处理进度"""
try:
progress = {
'filename': self.filename,
'total_groups': len(self.stats.index) if self.stats is not None else 0,
'completed_groups': completed_groups or [],
'current_stage': current_stage,
'last_update': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
'input_file_directory': self.file_dir, # 记录输入文件目录
'output_directory': self.output_dir # 记录输出目录
}
with open(self.progress_file, 'w', encoding='utf-8') as f:
json.dump(progress, f, indent=2, ensure_ascii=False)
except Exception as e:
print(f"⚠️ 保存进度失败: {e}")
def generate_report(self):
"""生成统计报告 - 分阶段输出"""
if self.data is None:
raise ValueError("请先选择数据文件")
try:
# 验证数据
self._validate_data()
# 设置输出目录
self._setup_output_directory()
print("开始数据处理...")
# 创建分组键
self.data['Group_Key'] = self.data['PAD ID'].astype(str) + '_' + self.data['Component ID'].astype(str)
group_count = self.data['Group_Key'].nunique()
print(f"📊 共发现 {group_count} 个分组")
# 阶段1快速生成基本统计信息和汇总报告
print("\n=== 阶段1: 生成基本统计信息 ===")
# 计算测量数据的统计信息
self.stats = self.data.groupby('Group_Key').agg({
'Height(mil)': ['min', 'max', 'mean', 'std'],
'Volume(%)': ['min', 'max', 'mean', 'std'],
'Area(%)': ['min', 'max', 'mean', 'std']
}).round(4)
# 重命名测量统计列
self.stats.columns = [
'Height_Measured_Min(mil)', 'Height_Measured_Max(mil)', 'Height_Mean(mil)', 'Height_Std(mil)',
'Vol_Measured_Min(%)', 'Vol_Measured_Max(%)', 'Vol_Mean(%)', 'Vol_Std(%)',
'Area_Measured_Min(%)', 'Area_Measured_Max(%)', 'Area_Mean(%)', 'Area_Std(%)'
]
print("基本统计信息计算完成")
# 获取预设的上下限信息
print("获取预设上下限信息...")
limits = self.data.groupby('Group_Key').agg({
'Height_Low(mil)': 'first', # 取第一个值作为该分组的预设下限
'Height_High(mil)': 'first', # 取第一个值作为该分组的预设上限
'Vol_Min(%)': 'first',
'Vol_Max(%)': 'first',
'Area_Min(%)': 'first',
'Area_Max(%)': 'first'
}).round(4)
# 合并统计信息和预设上下限信息
self.stats = pd.concat([self.stats, limits], axis=1)
print("预设上下限信息获取完成")
# 计算CPK - 使用预设的上下限值
print("计算CPK值...")
self.stats = self._calculate_cpk(self.stats)
# 立即生成汇总报告
summary_report_path = self._create_summary_report()
print(f"✅ 汇总报告生成完成: {summary_report_path}")
# 保存Excel
excel_path = self._save_to_excel_advanced()
print(f"✅ Excel文件保存完成: {excel_path}")
# 阶段2分批生成详细分组报告
print("\n=== 阶段2: 分批生成详细分组报告 ===")
self._generate_group_reports_incremental()
# 阶段3生成索引文件可选
print("\n=== 阶段3: 生成报告索引 ===")
index_path = self._create_report_index()
print(f"✅ 报告索引生成完成: {index_path}")
return summary_report_path
except Exception as e:
print(f"❌ 程序执行失败: {e}")
print(f" 错误详情: {traceback.format_exc()}")
# 即使失败,也尝试保存当前进度
if hasattr(self, 'output_dir'):
print(f"📁 当前结果已保存到: {self.output_dir}")
raise
def _create_summary_report(self):
"""创建快速汇总报告(区分预设上下限和实测值)"""
print("生成快速汇总报告...")
# 使用明确的空值检查
if self.stats is None or len(self.stats.index) == 0:
print("⚠️ 统计数据为空,生成空报告")
return self._create_empty_report()
# 将索引转换为列表避免DataFrame布尔判断问题
stats_index = list(self.stats.index)
total_groups = len(stats_index)
# 安全地检查CPK列是否存在
valid_height_cpk = 0
valid_volume_cpk = 0
valid_area_cpk = 0
if 'Height_Cpk' in self.stats.columns:
valid_height_cpk = self.stats['Height_Cpk'].notna().sum()
if 'Volume_Cpk' in self.stats.columns:
valid_volume_cpk = self.stats['Volume_Cpk'].notna().sum()
if 'Area_Cpk' in self.stats.columns:
valid_area_cpk = self.stats['Area_Cpk'].notna().sum()
html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>数据统计汇总报告 - {self.filename}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
h1, h2, h3 {{ color: #333; }}
.summary {{ background-color: #f5f5f5; padding: 15px; border-radius: 5px; margin: 20px 0; }}
table {{ border-collapse: collapse; width: 100%; margin: 10px 0; font-size: 12px; }}
th, td {{ border: 1px solid #ddd; padding: 8px; text-align: center; }}
th {{ background-color: #4CAF50; color: white; }}
.limits {{ background-color: #e8f5e8; font-weight: bold; }}
.measured {{ background-color: #fff3cd; }}
.info-box {{ background-color: #e7f3ff; padding: 15px; border-radius: 5px; margin: 15px 0; }}
.nav-links {{ margin: 20px 0; }}
.nav-links a {{ margin-right: 15px; text-decoration: none; color: #0066cc; }}
.progress {{ background-color: #fff3cd; padding: 10px; border: 1px solid #ffeaa7; border-radius: 5px; margin: 10px 0; }}
.warning {{ color: #856404; background-color: #fff3cd; padding: 5px; border-radius: 3px; }}
</style>
</head>
<body>
<h1>数据统计汇总报告 - {self.filename}</h1>
<p>生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<p>输入文件位置: <code>{self.file_dir}</code></p>
<div class="info-box">
<h3>报告说明</h3>
<p>此报告为快速生成的汇总报告包含所有分组的基本统计信息</p>
<p>CPK计算使用<strong>预设的上下限值</strong>而不是实测的最小最大值</p>
<p class="warning">注意分组详细报告可能需要较长时间生成请勿关闭程序</p>
</div>
<div class="progress">
<h3>处理进度</h3>
<p>总分组数量: <strong>{total_groups}</strong></p>
<p>有效Height CPK数量: <strong>{valid_height_cpk}</strong></p>
<p>有效Volume CPK数量: <strong>{valid_volume_cpk}</strong></p>
<p>有效Area CPK数量: <strong>{valid_area_cpk}</strong></p>
<p>输出目录: <code>{self.output_dir}</code></p>
</div>
<div class="nav-links">
<a href="group_reports/index.html">查看分组报告索引</a>
<a href="statistics.xlsx">下载Excel数据</a>
<a href="progress.json">查看处理进度</a>
</div>
<h2>详细统计数据</h2>
<table>
<thead>
<tr>
<th rowspan="2">分组标识<br>(PAD ID + Component ID)</th>
<th colspan="8">Height(mil)</th>
<th colspan="8">Volume(%)</th>
<th colspan="8">Area(%)</th>
{'<th colspan="3">CPK值</th>' if 'Height_Cpk' in self.stats.columns else ''}
</tr>
<tr>
<!-- Height列标题 -->
<th class="limits">预设下限<br>(LSL)</th>
<th class="limits">预设上限<br>(USL)</th>
<th class="measured">实测最小值</th>
<th class="measured">实测最大值</th>
<th>平均值</th>
<th>标准差</th>
<th>数据点数</th>
<th>CPK</th>
<!-- Volume列标题 -->
<th class="limits">预设下限<br>(LSL)</th>
<th class="limits">预设上限<br>(USL)</th>
<th class="measured">实测最小值</th>
<th class="measured">实测最大值</th>
<th>平均值</th>
<th>标准差</th>
<th>数据点数</th>
<th>CPK</th>
<!-- Area列标题 -->
<th class="limits">预设下限<br>(LSL)</th>
<th class="limits">预设上限<br>(USL)</th>
<th class="measured">实测最小值</th>
<th class="measured">实测最大值</th>
<th>平均值</th>
<th>标准差</th>
<th>数据点数</th>
<th>CPK</th>
<!-- 分组标识 -->
<th>分组</th>
</tr>
</thead>
<tbody>
"""
# 生成表格行数据
for group_key in stats_index:
row = self.stats.loc[group_key]
def format_value(value):
"""格式化数值显示"""
if pd.isna(value):
return 'N/A'
elif isinstance(value, (int, float)):
return f"{value:.4f}"
else:
return str(value)
# 获取数据点数
group_data = self.data[self.data['Group_Key'] == group_key]
data_count = len(group_data)
# 安全处理CPK列
cpk_columns = {"height": "", "volume": "", "area": ""}
if 'Height_Cpk' in self.stats.columns:
cpk_columns = {
"height": f"""<td>{format_value(row['Height_Cpk'])}</td>""",
"volume": f"""<td>{format_value(row['Volume_Cpk'])}</td>""",
"area": f"""<td>{format_value(row['Area_Cpk'])}</td>"""
}
# 为CPK值添加颜色标识
def get_cpk_color(cpk_value):
"""根据CPK值返回颜色标识"""
if pd.isna(cpk_value):
return ''
try:
cpk_val = float(cpk_value)
if cpk_val >= 1.33:
return 'style="background-color: #90EE90;"' # 绿色 - 优秀
elif cpk_val >= 1.0:
return 'style="background-color: #FFFFE0;"' # 黄色 - 合格
else:
return 'style="background-color: #FFB6C1;"' # 红色 - 不合格
except:
return ''
# 如果存在CPK列添加颜色
if 'Height_Cpk' in self.stats.columns:
# 这里需要为每个CPK单元格单独设置颜色
height_color = get_cpk_color(row['Height_Cpk'])
volume_color = get_cpk_color(row['Volume_Cpk'])
area_color = get_cpk_color(row['Area_Cpk'])
cpk_columns = {
"height": f"""<td {height_color}>{format_value(row['Height_Cpk'])}</td>""",
"volume": f"""<td {volume_color}>{format_value(row['Volume_Cpk'])}</td>""",
"area": f"""<td {area_color}>{format_value(row['Area_Cpk'])}</td>"""
}
html_content += f"""
<tr>
<td><a href="group_reports/{self._sanitize_filename(group_key)}.html" target="_blank">{group_key}</a></td>
<!-- Height数据 -->
<td class="limits">{format_value(row['Height_Low(mil)'])}</td>
<td class="limits">{format_value(row['Height_High(mil)'])}</td>
<td class="measured">{format_value(row['Height_Measured_Min(mil)'])}</td>
<td class="measured">{format_value(row['Height_Measured_Max(mil)'])}</td>
<td>{format_value(row['Height_Mean(mil)'])}</td>
<td>{format_value(row['Height_Std(mil)'])}</td>
<td>{data_count}</td>
{cpk_columns["height"]}
<!-- Volume数据 -->
<td class="limits">{format_value(row['Vol_Min(%)'])}</td>
<td class="limits">{format_value(row['Vol_Max(%)'])}</td>
<td class="measured">{format_value(row['Vol_Measured_Min(%)'])}</td>
<td class="measured">{format_value(row['Vol_Measured_Max(%)'])}</td>
<td>{format_value(row['Vol_Mean(%)'])}</td>
<td>{format_value(row['Vol_Std(%)'])}</td>
<td>{data_count}</td>
{cpk_columns["volume"]}
<!-- Area数据 -->
<td class="limits">{format_value(row['Area_Min(%)'])}</td>
<td class="limits">{format_value(row['Area_Max(%)'])}</td>
<td class="measured">{format_value(row['Area_Measured_Min(%)'])}</td>
<td class="measured">{format_value(row['Area_Measured_Max(%)'])}</td>
<td>{format_value(row['Area_Mean(%)'])}</td>
<td>{format_value(row['Area_Std(%)'])}</td>
<td>{data_count}</td>
{cpk_columns["area"]}
<!-- 分组标识 -->
<td>{group_key}</td>
</tr>
"""
html_content += """
</tbody>
</table>
<div class="info-box">
<h3>表格说明</h3>
<p><span class="limits" style="padding: 2px 5px;">绿色背景</span>: 预设的上下限值用于CPK计算</p>
<p><span class="measured" style="padding: 2px 5px;">黄色背景</span>: 实测数据的最小最大值</p>
<p>白色背景: 统计计算值</p>
</div>
<div class="info-box">
<h3>CPK计算说明</h3>
<p><strong>CPK计算公式:</strong> CPK = min[(USL - mean) / (3×std), (mean - LSL) / (3×std)]</p>
<p><strong>上下限取值:</strong> 使用数据文件中的预设上下限值而不是实测的最小最大值</p>
<p><span style="background-color: #90EE90; padding: 2px 5px;">绿色</span> CPK 1.33 (过程能力优秀)</p>
<p><span style="background-color: #FFFFE0; padding: 2px 5px;">黄色</span> 1.0 CPK < 1.33 (过程能力合格)</p>
<p><span style="background-color: #FFB6C1; padding: 2px 5px;">红色</span> CPK < 1.0 (过程能力不足)</p>
</div>
</body>
</html>
"""
report_path = os.path.join(self.output_dir, 'summary_report.html')
with open(report_path, 'w', encoding='utf-8') as f:
f.write(html_content)
print(f"✅ 汇总报告已生成: {report_path}")
return report_path
def _create_empty_report(self):
"""创建空数据报告"""
html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>数据统计报告 - {self.filename}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
.warning {{ color: #856404; background-color: #fff3cd; padding: 20px; border-radius: 5px; }}
</style>
</head>
<body>
<h1>数据统计报告 - {self.filename}</h1>
<div class="warning">
<h2> 数据为空</h2>
<p>未找到有效数据或统计数据为空</p>
<p>生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<p>输入文件位置: <code>{self.file_dir}</code></p>
</div>
</body>
</html>
"""
report_path = os.path.join(self.output_dir, 'summary_report.html')
with open(report_path, 'w', encoding='utf-8') as f:
f.write(html_content)
return report_path
def _sanitize_filename(self, filename):
"""清理文件名,移除非法字符"""
import re
return re.sub(r'[<>:"/\\|?*]', '_', filename)
def _generate_group_reports_incremental(self):
"""分批生成分组报告,避免长时间等待"""
# 使用明确的空值检查方法
if self.stats is None or len(self.stats.index) == 0:
print("⚠️ 统计数据为空,跳过分组报告生成")
return
stats_index = list(self.stats.index)
total_groups = len(stats_index)
if total_groups == 0:
print("⚠️ 没有有效的分组数据")
return
print(f"📊 开始分批生成 {total_groups} 个分组报告...")
print(f"📁 分组报告将保存到: {os.path.join(self.output_dir, 'group_reports')}")
# 分批处理
BATCH_SIZE = min(20, total_groups)
completed_groups = []
total_batches = (total_groups + BATCH_SIZE - 1) // BATCH_SIZE
for batch_idx in range(total_batches):
batch_start = batch_idx * BATCH_SIZE
batch_end = min((batch_idx + 1) * BATCH_SIZE, total_groups)
batch_groups = stats_index[batch_start:batch_end]
print(f"\n🔄 处理批次 {batch_idx + 1}/{total_batches}: 分组 {batch_start + 1}-{batch_end}")
try:
batch_results = self._process_batch(batch_groups)
# 生成当前批次的分组报告
successful_reports = 0
for group_key in batch_groups:
try:
self._create_single_group_report(group_key, batch_results.get(group_key, {}))
completed_groups.append(group_key)
successful_reports += 1
print(f" ✅ 分组报告生成: {self._sanitize_filename(group_key)}.html")
except Exception as e:
print(f" ❌ 生成分组 {group_key} 报告失败: {e}")
print(f" 错误详情: {traceback.format_exc()}")
# 保存进度
self._save_progress(completed_groups, f"batch_{batch_idx + 1}")
print(f"✅ 批次 {batch_idx + 1} 完成 (成功生成 {successful_reports}/{len(batch_groups)} 个报告)")
except Exception as batch_error:
print(f"❌ 批次 {batch_idx + 1} 处理失败: {batch_error}")
print(f" 错误详情: {traceback.format_exc()}")
# 继续处理下一批次
continue
# 添加批次间隔,避免资源竞争
if batch_idx < total_batches - 1:
print("⏳ 等待2秒后处理下一批次...")
time.sleep(2)
print(f"✅ 所有分组报告生成完成 (总计: {len(completed_groups)}/{total_groups})")
print(f"📁 分组报告保存位置: {os.path.join(self.output_dir, 'group_reports')}")
def _process_batch(self, group_keys):
"""处理单个批次的分组"""
if not group_keys: # 明确的空列表检查
print("⚠️ 当前批次没有分组数据")
return {}
tasks = []
for group_key in group_keys:
# 问题修正:使用明确的检查方法
stats_index_list = list(self.stats.index) # 转换为列表
if group_key not in stats_index_list:
print(f"⚠️ 警告: 分组 {group_key} 不在统计数据中,跳过")
continue # 跳过不存在的分组
# 问题修正避免DataFrame的布尔判断使用明确的.empty检查
group_data = self.data[self.data['Group_Key'] == group_key]
if group_data.empty: # 明确的空值检查
print(f"⚠️ 警告: 分组 {group_key} 的数据为空,跳过")
continue
row = self.stats.loc[group_key]
# 安全地获取特征数据,添加空值检查
feature_data_dict = {}
for col in ['Height(mil)', 'Volume(%)', 'Area(%)']:
col_data = group_data[col].dropna()
if len(col_data) == 0:
print(f"⚠️ 警告: 分组 {group_key}{col} 数据为空")
col_data = pd.Series([], dtype=float) # 创建空Series
feature_data_dict[col] = col_data
# 获取预设的上下限值
limits_dict = {}
# 安全地获取限制值
try:
limits_dict = {
'Height(mil)': (row['Height_High(mil)'], row['Height_Low(mil)']), # USL, LSL
'Volume(%)': (row['Vol_Max(%)'], row['Vol_Min(%)']), # USL, LSL
'Area(%)': (row['Area_Max(%)'], row['Area_Min(%)']) # USL, LSL
}
except KeyError as e:
print(f"❌ 错误: 分组 {group_key} 缺少预设上下限列 {e}")
continue
tasks.append((group_key, feature_data_dict, limits_dict))
if len(tasks) == 0: # 明确的空列表检查
print("⚠️ 当前批次没有有效任务")
return {}
# 使用多进程处理
max_workers = min(mp.cpu_count(), len(tasks), 4)
results = {}
print(f"🔧 开始处理批次中的 {len(tasks)} 个任务,使用 {max_workers} 个进程...")
with ProcessPoolExecutor(max_workers=max_workers) as executor:
future_to_key = {}
for task in tasks:
future = executor.submit(plot_worker, task)
future_to_key[future] = task[0]
completed_count = 0
for future in as_completed(future_to_key):
group_key = future_to_key[future]
try:
result_key, result_data = future.result()
if result_key: # 明确的结果检查
results[result_key] = result_data
completed_count += 1
print(f" 📈 图表生成完成: {result_key} ({completed_count}/{len(tasks)})")
except Exception as e:
print(f" ❌ 处理分组 {group_key} 时出错: {e}")
print(f"✅ 批次处理完成,成功生成 {len(results)}/{len(tasks)} 个图表")
return results
def _create_single_group_report(self, group_key, feature_charts):
"""创建单个分组的独立报告"""
# 添加明确的分组存在性检查
stats_index_list = list(self.stats.index) # 转换为列表
if group_key not in stats_index_list:
print(f"⚠️ 警告: 分组 {group_key} 不在统计数据中,跳过报告生成")
return
try:
row = self.stats.loc[group_key]
except KeyError:
print(f"❌ 错误: 无法获取分组 {group_key} 的统计数据")
return
# 明确的空值检查
group_data = self.data[self.data['Group_Key'] == group_key]
# 确保group_data不为空
if group_data.empty:
print(f"⚠️ 警告: 分组 {group_key} 的数据为空,跳过报告生成")
return
# 安全格式化数值
def safe_format(value, default="N/A"):
try:
if pd.isna(value):
return default
return f"{float(value):.4f}"
except (ValueError, TypeError):
return default
html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>{group_key} - 详细分析报告</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
h1, h2, h3 {{ color: #333; }}
.summary {{ background-color: #f5f5f5; padding: 15px; border-radius: 5px; margin: 15px 0; }}
table {{ border-collapse: collapse; width: 100%; margin: 10px 0; }}
th, td {{ border: 1px solid #ddd; padding: 8px; text-align: center; }}
th {{ background-color: #4CAF50; color: white; }}
.limits {{ background-color: #e8f5e8; font-weight: bold; }}
.measured {{ background-color: #fff3cd; }}
.chart-container {{ display: grid; grid-template-columns: 1fr 1fr; gap: 15px; margin: 15px 0; }}
.chart {{ text-align: center; background-color: #fafafa; padding: 10px; border-radius: 5px; }}
.chart img {{ max-width: 100%; height: auto; }}
.nav {{ margin: 10px 0; }}
.nav a {{ margin-right: 10px; }}
</style>
</head>
<body>
<div class="nav">
<a href="../summary_report.html">返回汇总报告</a>
<a href="index.html">返回索引</a>
</div>
<h1>{group_key} - 详细分析报告</h1>
<p>生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<p>输入文件位置: <code>{self.file_dir}</code></p>
<div class="summary">
<h2>基本统计信息</h2>
<table>
<tr>
<th>特征</th><th class="limits">预设下限(LSL)</th><th class="limits">预设上限(USL)</th>
<th class="measured">实测最小值</th><th class="measured">实测最大值</th>
<th>平均值</th><th>标准差</th><th>CPK</th>
</tr>
<tr>
<td>Height(mil)</td><td class="limits">{safe_format(row.get('Height_Low(mil)'))}</td><td class="limits">{safe_format(row.get('Height_High(mil)'))}</td>
<td class="measured">{safe_format(row.get('Height_Measured_Min(mil)'))}</td><td class="measured">{safe_format(row.get('Height_Measured_Max(mil)'))}</td>
<td>{safe_format(row.get('Height_Mean(mil)'))}</td><td>{safe_format(row.get('Height_Std(mil)'))}</td><td>{safe_format(row.get('Height_Cpk'))}</td>
</tr>
<tr>
<td>Volume(%)</td><td class="limits">{safe_format(row.get('Vol_Min(%)'))}</td><td class="limits">{safe_format(row.get('Vol_Max(%)'))}</td>
<td class="measured">{safe_format(row.get('Vol_Measured_Min(%)'))}</td><td class="measured">{safe_format(row.get('Vol_Measured_Max(%)'))}</td>
<td>{safe_format(row.get('Vol_Mean(%)'))}</td><td>{safe_format(row.get('Vol_Std(%)'))}</td><td>{safe_format(row.get('Volume_Cpk'))}</td>
</tr>
<tr>
<td>Area(%)</td><td class="limits">{safe_format(row.get('Area_Min(%)'))}</td><td class="limits">{safe_format(row.get('Area_Max(%)'))}</td>
<td class="measured">{safe_format(row.get('Area_Measured_Min(%)'))}</td><td class="measured">{safe_format(row.get('Area_Measured_Max(%)'))}</td>
<td>{safe_format(row.get('Area_Mean(%)'))}</td><td>{safe_format(row.get('Area_Std(%)'))}</td><td>{safe_format(row.get('Area_Cpk'))}</td>
</tr>
</table>
</div>
"""
# 添加图表
for feature_name in ['Height(mil)', 'Volume(%)', 'Area(%)']:
chart_base64 = feature_charts.get(feature_name, "")
if chart_base64 and len(chart_base64) > 0: # 明确的字符串检查
html_content += f"""
<h2>{feature_name} 分析图表</h2>
<div class="chart-container">
<div class="chart">
<img src="data:image/png;base64,{chart_base64}" alt="{feature_name}统计图表">
</div>
</div>
"""
else:
html_content += f"""
<h2>{feature_name} 分析图表</h2>
<p>该特征的图表生成失败或数据不足</p>
"""
html_content += """
</body>
</html>
"""
filename = self._sanitize_filename(group_key) + '.html'
group_reports_dir = os.path.join(self.output_dir, 'group_reports')
report_path = os.path.join(group_reports_dir, filename)
try:
with open(report_path, 'w', encoding='utf-8') as f:
f.write(html_content)
except Exception as e:
print(f"❌ 保存分组报告失败 {filename}: {e}")
def _create_report_index(self):
"""创建分组报告索引"""
# 确保使用正确的索引获取方式
if self.stats is None or len(self.stats.index) == 0:
print("⚠️ 统计数据为空,创建空索引")
return self._create_empty_index()
stats_index = list(self.stats.index) # 转换为列表
html_content = """
<!DOCTYPE html>
<html>
<head>
<title>分组报告索引</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
h1 { color: #333; }
.group-list { margin: 20px 0; }
.group-item { margin: 5px 0; }
.group-item a { text-decoration: none; color: #0066cc; }
.nav { margin: 15px 0; }
</style>
</head>
<body>
<div class="nav">
<a href="../summary_report.html">返回汇总报告</a>
</div>
<h1>分组报告索引</h1>
<p>共生成 """ + str(len(stats_index)) + """ 个分组报告</p>
<p>输入文件位置: <code>""" + self.file_dir + """</code></p>
<div class="group-list">
"""
for group_key in stats_index: # 使用列表而不是DataFrame索引
filename = self._sanitize_filename(group_key) + '.html'
html_content += f'<div class="group-item"><a href="{filename}">{group_key}</a></div>\n'
html_content += """
</div>
</body>
</html>
"""
index_path = os.path.join(self.output_dir, 'group_reports', 'index.html')
try:
with open(index_path, 'w', encoding='utf-8') as f:
f.write(html_content)
except Exception as e:
print(f"❌ 创建索引文件失败: {e}")
return None
return index_path
def _create_empty_index(self):
"""创建空索引文件"""
html_content = """
<!DOCTYPE html>
<html>
<head>
<title>分组报告索引</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
.warning { color: #856404; background-color: #fff3cd; padding: 20px; }
</style>
</head>
<body>
<h1>分组报告索引</h1>
<div class="warning">
<h2> 没有分组报告</h2>
<p>当前没有生成任何分组报告</p>
<p>输入文件位置: <code>""" + self.file_dir + """</code></p>
</div>
</body>
</html>
"""
index_path = os.path.join(self.output_dir, 'group_reports', 'index.html')
with open(index_path, 'w', encoding='utf-8') as f:
f.write(html_content)
return index_path
def _calculate_cpk(self, stats):
"""计算CPK值 - 使用预设的上下限值"""
print("详细计算CPK值...")
def calculate_single_cpk(mean, std, usl, lsl):
"""计算单个特征的CPK"""
if std == 0 or pd.isna(std):
return np.nan
if pd.isna(usl) or pd.isna(lsl):
return np.nan
# CPK = min[(USL - mean) / (3*std), (mean - LSL) / (3*std)]
cpu = (usl - mean) / (3 * std) if usl != float('inf') else float('inf')
cpl = (mean - lsl) / (3 * std) if lsl != float('-inf') else float('inf')
# 如果其中一个限值为无穷大,则返回另一个值
if cpu == float('inf') and cpl == float('inf'):
return np.nan
elif cpu == float('inf'):
return cpl
elif cpl == float('inf'):
return cpu
else:
return min(cpu, cpl)
# 确保CPK列不存在时创建
cpk_results = []
for idx, row in stats.iterrows():
print(f"计算分组 {idx} 的CPK值...")
# Height CPK - 使用预设的Height_High作为USLHeight_Low作为LSL
height_cpk = calculate_single_cpk(
row['Height_Mean(mil)'],
row['Height_Std(mil)'],
row['Height_High(mil)'], # USL - 预设上限
row['Height_Low(mil)'] # LSL - 预设下限
)
# Volume CPK - 使用预设的Vol_Max作为USLVol_Min作为LSL
volume_cpk = calculate_single_cpk(
row['Vol_Mean(%)'],
row['Vol_Std(%)'],
row['Vol_Max(%)'], # USL - 预设上限
row['Vol_Min(%)'] # LSL - 预设下限
)
# Area CPK - 使用预设的Area_Max作为USLArea_Min作为LSL
area_cpk = calculate_single_cpk(
row['Area_Mean(%)'],
row['Area_Std(%)'],
row['Area_Max(%)'], # USL - 预设上限
row['Area_Min(%)'] # LSL - 预设下限
)
cpk_results.append({
'Height_Cpk': round(height_cpk, 4) if not pd.isna(height_cpk) else np.nan,
'Volume_Cpk': round(volume_cpk, 4) if not pd.isna(volume_cpk) else np.nan,
'Area_Cpk': round(area_cpk, 4) if not pd.isna(area_cpk) else np.nan
})
# 将CPK结果添加到统计数据中
cpk_df = pd.DataFrame(cpk_results, index=stats.index)
stats = pd.concat([stats, cpk_df], axis=1)
print("✅ 所有分组CPK计算完成 - 使用预设上下限值")
return stats
def _save_to_excel_advanced(self):
    """Write ``statistics.xlsx`` into the output directory.

    The workbook holds the statistics summary sheet followed by one sheet
    with the raw rows of each of the first 50 groups.

    Returns:
        Path of the workbook, or None when saving failed (the error is
        logged, not raised).
    """
    import re

    print("保存Excel文件...")
    excel_filename = os.path.join(self.output_dir, 'statistics.xlsx')
    MAX_GROUPS_TO_SAVE = 50
    MAX_SHEET_NAME_LENGTH = 31  # Excel's limit for worksheet titles
    summary_sheet = '统计汇总'
    # Excel compares sheet names case-insensitively.
    used_names = {summary_sheet.lower()}

    def unique_sheet_name(group_key):
        # Replace the characters Excel forbids in titles, truncate, and add a
        # numeric suffix when truncation makes two groups collide.
        base = re.sub(r"[\[\]:*?/\\]", '_', f"组_{group_key}")[:MAX_SHEET_NAME_LENGTH]
        candidate = base
        counter = 1
        while candidate.lower() in used_names:
            suffix = f"~{counter}"
            candidate = base[:MAX_SHEET_NAME_LENGTH - len(suffix)] + suffix
            counter += 1
        used_names.add(candidate.lower())
        return candidate

    try:
        with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
            if self.stats is not None:
                self.stats.reset_index().to_excel(writer, sheet_name=summary_sheet, index=False)
            unique_groups = self.data['Group_Key'].unique()[:MAX_GROUPS_TO_SAVE]
            for group_key in unique_groups:
                group_data = self.data[self.data['Group_Key'] == group_key]
                group_data.to_excel(writer, sheet_name=unique_sheet_name(group_key), index=False)
        print(f"✅ Excel文件保存完成: {excel_filename}")
        return excel_filename
    except Exception as e:
        print(f"❌ Excel文件保存失败: {e}")
        print(f" 错误详情: {traceback.format_exc()}")
        return None
def main():
    """Interactive entry point: pick a file, load it and build every report.

    Failures are reported on stdout; nothing is raised to the caller.
    """
    print("=== 数据统计报告生成程序(使用预设上下限值) ===")
    processor = DataProcessor()
    try:
        # Guard clause: nothing to do when the dialog was cancelled.
        if not processor.select_file():
            print("❌ 未选择文件,程序退出")
            return
        processor._load_data()
        report_path = processor.generate_report()
        excel_path = os.path.join(processor.output_dir, 'statistics.xlsx')
        for line in (
            "✅ 报告生成完成",
            f"📁 输入文件目录: {processor.file_dir}",
            f"📁 输出目录: {processor.output_dir}",
            f"📊 汇总报告: {report_path}",
            f"📊 Excel文件: {excel_path}",
        ):
            print(line)
    except Exception as e:
        print(f"❌ 程序执行失败: {e}")
        print(f" 错误详情: {traceback.format_exc()}")
# Script entry point: runs only when the file is executed directly, which also
# keeps spawned worker processes from re-running main() when they import it.
if __name__ == "__main__":
    # Select the 'spawn' start method before any worker pool is created;
    # force=True replaces a start method that may already have been set.
    mp.set_start_method('spawn', force=True)
    main()