Python脚本开发文件初始化
This commit is contained in:
20
dataProcess/.gitignore
vendored
Normal file
20
dataProcess/.gitignore
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
/build/*
|
||||
/build
|
||||
/dist/*
|
||||
/dist
|
||||
/source/*
|
||||
/source
|
||||
/dataProcess_out*
|
||||
*.xls
|
||||
*.xlsx
|
||||
*.csv
|
||||
*.spec
|
||||
|
||||
/temp
|
||||
|
||||
dataProcess_html_V2.py
|
||||
|
||||
dataProcess_sightml_V2.py
|
||||
dataProcess_sightml_V3.py
|
||||
|
||||
dataProcessMerge_V2.py
|
||||
475
dataProcess/dataProcessMerge_V1.py
Normal file
475
dataProcess/dataProcessMerge_V1.py
Normal file
@@ -0,0 +1,475 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
from tkinter import filedialog, Tk
|
||||
import logging
|
||||
import datetime
|
||||
# --- 新增导入 ---
|
||||
from colorama import init, Fore, Style
|
||||
import sys
|
||||
|
||||
# 初始化 colorama,autoreset=True 使得每次打印后自动恢复默认颜色
|
||||
init(autoreset=True)
|
||||
|
||||
# --- 自定义日志格式化器 ---
|
||||
class ColoredFormatter(logging.Formatter):
    """Formatter that wraps the level name and the message in ANSI colours.

    The colour is looked up in ``COLORS`` by ``record.levelname``; a level
    that is not listed is emitted without a colour prefix.
    """

    # Level name -> colorama escape sequence.
    COLORS = {
        'DEBUG': Fore.CYAN,
        'INFO': Fore.GREEN,
        'WARNING': Fore.YELLOW,
        'ERROR': Fore.RED,
        'CRITICAL': Fore.RED + Style.BRIGHT,
    }

    def format(self, record):
        """Return the coloured, formatted text for *record*.

        The same ``LogRecord`` object is handed to every handler, so the
        colour codes are applied only for the duration of this call and the
        original ``levelname`` / ``msg`` are restored afterwards.  Without
        the restore, escape sequences would leak into other handlers (for
        example a log file) and pile up if the record is formatted twice.
        """
        log_color = self.COLORS.get(record.levelname, '')
        original_levelname = record.levelname
        original_msg = record.msg

        record.levelname = f"{log_color}{original_levelname}{Style.RESET_ALL}"
        record.msg = f"{log_color}{original_msg}{Style.RESET_ALL}"
        try:
            # Delegate the actual layout to logging.Formatter.
            return super().format(record)
        finally:
            record.levelname = original_levelname
            record.msg = original_msg
|
||||
|
||||
# --- 配置日志 ---
|
||||
# 创建 logger 对象
|
||||
# Configure the root logger so that every logging.* call in this module is
# printed, coloured, to stdout at INFO level and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Discard handlers installed earlier so messages are not printed twice.
logger.handlers.clear()

# Single console handler using the coloured formatter.
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)

formatter = ColoredFormatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)

logger.addHandler(console_handler)
|
||||
# --- 日志配置结束 ---
|
||||
|
||||
|
||||
class DataProcessor:
|
||||
def __init__(self):
    """Start with nothing selected, nothing loaded and an empty result."""
    # Paths picked through the dialogs and the parsed spec table.
    self.spec_file, self.data_folder, self.spec_data = None, None, None
    # Paths of the data files accepted by the scan step.
    self.data_files = list()
    # Accumulated merge result; stays empty until process_data() succeeds.
    self.merged_data = pd.DataFrame()
|
||||
|
||||
def select_spec_file(self):
    """Ask the user to pick the spec (upper/lower limit) file.

    The chosen path is stored in ``self.spec_file``.

    Returns:
        bool: True when a file was chosen, False when the dialog was
        cancelled.
    """
    root = Tk()
    root.withdraw()  # only the dialog should be visible, not the root window
    try:
        self.spec_file = filedialog.askopenfilename(
            title="选择上限和下限规格要求文件",
            filetypes=[("CSV files", "*.csv"), ("All files", "*.*")],
        )
    finally:
        # Tear the hidden root down; otherwise every call leaves a Tk
        # interpreter alive for the rest of the process.
        root.destroy()

    if not self.spec_file:
        logging.error("未选择规格文件")
        return False
    logging.info(f"已选择规格文件: {self.spec_file}")
    return True
|
||||
|
||||
def select_data_folder(self):
    """Ask the user to pick the folder that holds the measurement files.

    The chosen path is stored in ``self.data_folder``.

    Returns:
        bool: True when a folder was chosen, False when the dialog was
        cancelled.
    """
    root = Tk()
    root.withdraw()  # only the dialog should be visible, not the root window
    try:
        self.data_folder = filedialog.askdirectory(title="选择实际数据文件所在的文件夹")
    finally:
        # Tear the hidden root down; otherwise every call leaves a Tk
        # interpreter alive for the rest of the process.
        root.destroy()

    if not self.data_folder:
        logging.error("未选择数据文件夹")
        return False
    logging.info(f"已选择数据文件夹: {self.data_folder}")
    return True
|
||||
|
||||
def clean_column_names(self, df):
    """Strip leading/trailing whitespace from the column labels of *df*.

    The frame is modified in place and also returned.  Labels that are not
    strings (for example integer labels) are left untouched instead of
    raising ``AttributeError``.

    Args:
        df: DataFrame whose column labels should be cleaned.

    Returns:
        The same DataFrame object, with cleaned labels.
    """
    df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns]
    return df
|
||||
|
||||
def load_spec_data(self):
    """Load the spec CSV (header on the third line) into ``self.spec_data``.

    The file is decoded as UTF-8 first, then GBK, then Latin-1, which is the
    same fallback order the data-file loader uses.  Column labels are
    cleaned and ``PAD ID`` is normalised to a stripped string so it can be
    used as a merge key.  Missing expected columns are only reported; they
    do not make the load fail.

    Returns:
        bool: True when the file was read, False when reading failed (the
        error is logged).
    """
    try:
        spec_df = None
        # Latin-1 accepts any byte sequence, so the loop always ends with a
        # frame unless pandas raises a non-encoding error.
        for encoding in ('utf-8', 'gbk', 'latin-1'):
            try:
                # header=2: the first two lines are skipped, line 3 is the header.
                spec_df = pd.read_csv(self.spec_file, header=2, encoding=encoding)
                break
            except UnicodeDecodeError:
                continue

        self.spec_data = self.clean_column_names(spec_df)

        has_pad_id = 'PAD ID' in self.spec_data.columns
        if has_pad_id:
            self.spec_data['PAD ID'] = self.spec_data['PAD ID'].astype(str).str.strip()

        required_columns = ["PAD ID", "Component ID", "Vol_Min(%)", "Vol_Max(%)",
                            "Height_Low(mil)", "Height_High(mil)", "Area_Min(%)", "Area_Max(%)"]

        missing_columns = [col for col in required_columns if col not in self.spec_data.columns]
        if missing_columns:
            logging.warning(f"规格文件中缺少以下列: {missing_columns}")
            # Hint at columns whose name contains the missing one.
            for missing_col in missing_columns:
                similar_cols = [col for col in self.spec_data.columns
                                if missing_col.lower() in str(col).lower()]
                if similar_cols:
                    logging.info(f"可能匹配的列: {similar_cols}")

        if "Component ID" not in self.spec_data.columns:
            logging.warning("'Component ID' 列在规格文件中缺失,这可能导致输出文件中也缺少该列。")

        logging.info(f"规格数据加载成功,共 {len(self.spec_data)} 行")
        logging.info(f"规格文件列名: {list(self.spec_data.columns)}")
        logging.info(
            f"规格文件PAD ID数据类型: {self.spec_data['PAD ID'].dtype if has_pad_id else 'N/A'}")

    except Exception as e:
        logging.error(f"加载规格数据失败: {e}")
        return False
    return True
|
||||
|
||||
def scan_data_files(self, name_filter="F27140015X3K"):
    """Collect the usable CSV files of ``self.data_folder``.

    A file is kept when its name ends in ``.csv`` (any letter case),
    contains *name_filter*, and ``_is_valid_csv_file`` accepts its header
    with a match threshold of 0.8.  The directory listing is sorted so the
    processing order, and therefore the row order of the merged output, is
    reproducible.

    Args:
        name_filter: Substring a file name must contain.  Defaults to the
            tag the script was originally hard-wired to; pass ``""`` or
            ``None`` to accept every CSV file.

    Returns:
        bool: True when at least one valid file was found, otherwise False
        (including when scanning raised; the error is logged).
    """
    try:
        required_fields = [
            "PAD ID", "Component ID", "Height(mil)", "Volume(%)",
            "Area(%)", "Volume(mil3)", "Area(mil2)"
        ]
        # A file is accepted when at least 80% of the fields are present.
        field_match_threshold = 0.8

        valid_files = []
        for file in sorted(os.listdir(self.data_folder)):
            if not file.lower().endswith(".csv"):
                continue
            if name_filter and name_filter not in file:
                continue
            file_path = os.path.join(self.data_folder, file)
            if self._is_valid_csv_file(file_path, required_fields, field_match_threshold):
                valid_files.append(file_path)

        self.data_files = valid_files
        logging.info(
            f"找到 {len(self.data_files)} 个有效数据文件: {[os.path.basename(f) for f in self.data_files]}")

    except Exception as e:
        logging.error(f"扫描数据文件失败: {e}")
        return False

    return bool(self.data_files)
|
||||
|
||||
def _is_valid_csv_file(self, file_path, required_fields, threshold=1.0):
    """Check whether the header line of a CSV file has the required fields.

    Only the first line is read.  It is decoded with ``utf-8-sig`` (so a
    byte-order mark does not stick to the first header name), then ``gbk``,
    then ``latin-1``, and parsed with the ``csv`` module so quoted header
    names are handled.

    Args:
        file_path: Path of the CSV file to inspect.
        required_fields: Header names that are expected.
        threshold: Minimum fraction of *required_fields* that must be
            present for the file to count as valid.

    Returns:
        bool: True when the matched fraction reaches *threshold*; False
        otherwise, and False when the file cannot be read or any other
        error occurs (the error is logged).
    """
    import csv  # local import: not part of this file's import block

    file_name = os.path.basename(file_path)
    try:
        for encoding in ('utf-8-sig', 'gbk', 'latin-1'):
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    first_line = f.readline().strip()
            except UnicodeDecodeError:
                continue  # try the next encoding

            header_row = next(csv.reader([first_line]), [])
            headers = {header.strip() for header in header_row}

            missing_fields = [field for field in required_fields if field not in headers]
            matched_fields = len(required_fields) - len(missing_fields)
            match_ratio = matched_fields / len(required_fields)

            if match_ratio >= threshold:
                if missing_fields:
                    logging.warning(
                        f"文件 {file_name} 部分字段缺失: {missing_fields},但满足阈值要求")
                else:
                    logging.info(f"文件 {file_name} 所有字段完整")
                return True

            logging.warning(
                f"文件 {file_name} 字段匹配率不足: {match_ratio:.1%},缺失字段: {missing_fields}")
            return False

        logging.error(f"无法读取文件 {file_name},尝试了所有编码")
        return False

    except Exception as e:
        logging.error(f"检查文件 {file_name} 时发生错误: {e}")
        return False
|
||||
|
||||
def load_and_clean_data_file(self, data_file):
    """Read one measurement CSV and normalise its columns.

    The file is decoded as UTF-8, then GBK, then Latin-1.  Column labels
    are cleaned, and a required column that is missing under its exact name
    is taken from the first other column whose name contains it when spaces
    and letter case are ignored.  ``PAD ID`` is converted to a stripped
    string after that renaming so it can be used as a merge key.

    Args:
        data_file: Path of the CSV file; its first line is the header.

    Returns:
        The cleaned DataFrame, or None when the file cannot be read or a
        required column is still missing (the reason is logged).
    """
    try:
        data_df = None
        # Latin-1 accepts any byte sequence, so one attempt always succeeds
        # unless pandas raises a non-encoding error.
        for encoding in ('utf-8', 'gbk', 'latin-1'):
            try:
                data_df = pd.read_csv(data_file, header=0, encoding=encoding)
                break
            except UnicodeDecodeError:
                continue

        data_df = self.clean_column_names(data_df)
        logging.info(f"数据文件列名: {list(data_df.columns)}")

        required_columns = ["PAD ID", "Component ID", "Height(mil)", "Volume(%)", "Area(%)"]

        # Build {existing name: required name}.  DataFrame.rename expects the
        # current label as the key; the reverse direction silently does
        # nothing.
        column_mapping = {}
        for required_col in required_columns:
            if required_col in data_df.columns:
                continue
            wanted = required_col.lower().replace(" ", "")
            for col in data_df.columns:
                # Never steal a column that is already claimed or required.
                if col in column_mapping or col in required_columns:
                    continue
                if wanted in str(col).lower().replace(" ", ""):
                    column_mapping[col] = required_col
                    logging.info(f"映射列: {required_col} -> {col}")
                    break

        if column_mapping:
            data_df = data_df.rename(columns=column_mapping)

        # Normalise the key after renaming so a mapped column is covered too.
        if 'PAD ID' in data_df.columns:
            data_df['PAD ID'] = data_df['PAD ID'].astype(str).str.strip()
            logging.info(f"数据文件PAD ID数据类型: {data_df['PAD ID'].dtype}")

        missing_columns = [col for col in required_columns if col not in data_df.columns]
        if missing_columns:
            logging.error(f"数据文件中缺少以下列: {missing_columns}")
            logging.info(f"数据文件所有列: {list(data_df.columns)}")
            return None

        return data_df

    except Exception as e:
        logging.error(f"加载数据文件失败: {e}")
        return None
|
||||
|
||||
def process_data(self):
    """Merge every scanned data file with the spec limits.

    For each path in ``self.data_files`` the file is loaded through
    ``load_and_clean_data_file``, reduced to the measurement columns,
    tagged with its source file names and inner-joined with the spec table
    on ``PAD ID``.  ``Component ID`` in the output is the one from the data
    file.  The two spec height-limit columns are cleaned to numbers and
    divided by 25.4.  All per-file results are concatenated into
    ``self.merged_data``.

    A file that fails is logged and skipped; it does not abort the run.

    Returns:
        bool: True when at least one file produced rows, otherwise False.
    """
    total_files = len(self.data_files)
    if total_files == 0:
        logging.error("未找到任何数据文件")
        return False

    # Without a spec table carrying the merge key no file can be merged.
    # Fail once, clearly, instead of erroring for every file (and crashing
    # inside the error handler when spec_data is None).
    if self.spec_data is None or 'PAD ID' not in self.spec_data.columns:
        logging.error("规格数据未加载或缺少 'PAD ID' 列,无法与数据文件合并")
        return False

    required_columns = ["PAD ID", "Component ID", "Height(mil)", "Volume(%)", "Area(%)"]
    spec_columns = ["PAD ID", "Component ID", "Vol_Min(%)", "Vol_Max(%)",
                    "Height_Low(mil)", "Height_High(mil)", "Area_Min(%)", "Area_Max(%)"]
    convert_cols = ["Height_Low(mil)", "Height_High(mil)"]
    # Final column order of the output.
    output_columns = [
        "PAD ID", "Component ID", "Vol_Min(%)", "Vol_Max(%)", "Height_Low(mil)",
        "Height_High(mil)", "Area_Min(%)", "Area_Max(%)", "Height(mil)", "Volume(%)", "Area(%)",
        "数据来源", "限制来源"
    ]

    # The spec side is identical for every file, so prepare it once.
    available_spec_columns = [col for col in spec_columns if col in self.spec_data.columns]
    spec_df = self.spec_data[available_spec_columns].copy()
    spec_df['PAD ID'] = spec_df['PAD ID'].astype(str).str.strip()
    spec_source = os.path.basename(self.spec_file)

    all_data = []
    for idx, data_file in enumerate(self.data_files, 1):
        file_name = os.path.basename(data_file)
        logging.info(f"处理数据文件 {idx}/{total_files}: {file_name}")
        data_df = None  # reset so the error handler never reports a previous file
        try:
            data_df = self.load_and_clean_data_file(data_file)
            if data_df is None:
                logging.error(f"无法加载文件: {file_name}")
                continue

            available_columns = [col for col in required_columns if col in data_df.columns]
            if len(available_columns) != len(required_columns):
                missing = set(required_columns) - set(available_columns)
                logging.warning(f"文件 {file_name} 缺少列: {missing}")
                logging.info(f"可用的列: {available_columns}")
            # Copy so the added columns never write into a view.
            data_df = data_df[available_columns].copy()

            data_df["数据来源"] = file_name
            data_df["限制来源"] = spec_source

            logging.info(
                f"合并前 - 数据文件PAD ID示例: {data_df['PAD ID'].head(3).tolist() if 'PAD ID' in data_df.columns else 'N/A'}")
            logging.info(
                f"合并前 - 规格文件PAD ID示例: {self.spec_data['PAD ID'].head(3).tolist()}")

            merged_df = pd.merge(data_df, spec_df, on="PAD ID", how="inner", suffixes=('_data', '_spec'))

            if merged_df.empty:
                logging.warning(f"文件 {file_name} 与规格数据无匹配项")
                data_pad_ids = set(data_df['PAD ID'].unique())
                spec_pad_ids = set(spec_df['PAD ID'].unique())
                common_ids = data_pad_ids.intersection(spec_pad_ids)
                logging.info(
                    f"数据文件PAD ID数量: {len(data_pad_ids)}, 规格文件PAD ID数量: {len(spec_pad_ids)}, 共同ID数量: {len(common_ids)}")
                logging.info(f"数据文件前5个PAD ID: {list(data_pad_ids)[:5] if data_pad_ids else 'N/A'}")
                logging.info(f"规格文件前5个PAD ID: {list(spec_pad_ids)[:5] if spec_pad_ids else 'N/A'}")
                continue

            # When both sides carry Component ID the merge yields *_data and
            # *_spec; the data file's value is the one reported.
            if 'Component ID_data' in merged_df.columns:
                merged_df['Component ID'] = merged_df['Component ID_data']

            for col in convert_cols:
                if col not in merged_df.columns:
                    logging.warning(f"规格高度字段缺失,无法进行单位转换: {col}")
                    continue
                before_non_null = merged_df[col].notna().sum()
                # Keep only digits, dot and minus before parsing as a number.
                cleaned = merged_df[col].astype(str).str.replace(r'[^\d\.\-]+', '', regex=True)
                merged_df[col] = pd.to_numeric(cleaned, errors='coerce') / 25.4
                after_non_null = merged_df[col].notna().sum()
                logging.info(
                    f"字段 {col} 已除以 25.4 完成单位转换,非空值数: 转换前 {before_non_null} -> 转换后 {after_non_null}"
                )

            available_output_columns = [col for col in output_columns if col in merged_df.columns]
            missing_output_columns = [col for col in output_columns if col not in merged_df.columns]
            if missing_output_columns:
                logging.warning(
                    f"文件 {file_name} 的最终输出中缺少以下预期列: {missing_output_columns}")
            if not available_output_columns:
                logging.error(f"文件 {file_name} 没有任何预期的输出列,将跳过此文件。")
                continue

            merged_df = merged_df[available_output_columns].copy()
            all_data.append(merged_df)
            logging.info(f"文件 {file_name} 处理成功,匹配 {len(merged_df)} 行")

        except Exception as e:
            logging.error(f"处理文件 {file_name} 时出错: {e}")
            if data_df is not None and 'PAD ID' in data_df.columns:
                logging.info(f"数据文件PAD ID数据类型: {data_df['PAD ID'].dtype}")
                logging.info(f"数据文件PAD ID示例: {data_df['PAD ID'].head(3).tolist()}")
            logging.info(f"规格文件PAD ID数据类型: {self.spec_data['PAD ID'].dtype}")
            logging.info(f"规格文件PAD ID示例: {self.spec_data['PAD ID'].head(3).tolist()}")
            continue

    if not all_data:
        logging.error("未成功处理任何数据文件")
        return False

    self.merged_data = pd.concat(all_data, ignore_index=True)
    logging.info(f"数据处理完成,共合并 {len(self.merged_data)} 行数据")
    logging.info(f"最终数据列名: {list(self.merged_data.columns)}")
    return True
|
||||
|
||||
def save_to_excel(self):
    """Write ``self.merged_data`` to a timestamped .xlsx in the data folder.

    The file is named ``dataProcess_out_<YYYYmmdd_HHMMSS>.xlsx``.  A summary
    of the run is logged once after a successful write.  A failure is
    logged once and not re-raised.
    """
    try:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_filename = f"dataProcess_out_{timestamp}.xlsx"
        output_file = os.path.join(self.data_folder, output_filename)

        self.merged_data.to_excel(output_file, index=False)
        logging.info(f"数据已保存到: {output_file}")

        stats = "\n".join([
            "处理统计:",
            f"- 规格文件: {os.path.basename(self.spec_file)}",
            f"- 处理的数据文件数: {len(self.data_files)}",
            f"- 合并的总行数: {len(self.merged_data)}",
            f"- 输出文件: {output_file}",
            f"- 包含的列: {list(self.merged_data.columns)}",
        ])
        # One summary entry only; it used to be emitted twice in a row.
        logging.info("处理完成。\n" + stats)

    except Exception as e:
        # One error entry only; it used to be emitted twice in a row.
        logging.error(f"保存数据失败: {e}")
|
||||
|
||||
def run(self):
    """Run the whole workflow: select, load, scan, merge, save.

    The steps run in order and the workflow stops quietly at the first step
    that returns a falsy value (each step logs its own reason).  An
    unexpected exception is logged once, with its traceback, and not
    re-raised.
    """
    logging.info("开始数据处理流程")

    try:
        preparation_steps = (
            self.select_spec_file,
            self.select_data_folder,
            self.load_spec_data,
            self.scan_data_files,
        )
        for step in preparation_steps:
            if not step():
                return

        if not self.process_data():
            logging.error("数据处理失败,请检查日志信息")
            return

        self.save_to_excel()

    except Exception as e:
        # logging.exception records the message at ERROR level together
        # with the traceback; the message used to be logged twice instead.
        logging.exception(f"处理流程出错: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: build a processor and run the workflow once.
    DataProcessor().run()
|
||||
1060
dataProcess/dataProcess_html_V1.py
Normal file
1060
dataProcess/dataProcess_html_V1.py
Normal file
File diff suppressed because it is too large
Load Diff
810
dataProcess/dataProcess_sightml_V1.py
Normal file
810
dataProcess/dataProcess_sightml_V1.py
Normal file
@@ -0,0 +1,810 @@
|
||||
import pandas as pd
|
||||
import tkinter as tk
|
||||
from tkinter import filedialog
|
||||
import os
|
||||
from datetime import datetime
|
||||
import numpy as np
|
||||
|
||||
|
||||
class DataProcessor:
|
||||
def __init__(self):
    """Create a processor with no file selected and no data loaded."""
    # data: loaded table; filename / file_path / file_dir: the selected
    # file; processing_start_time: set when a report run starts.
    for attribute in ("data", "filename", "file_path", "file_dir", "processing_start_time"):
        setattr(self, attribute, None)
|
||||
|
||||
def select_file(self):
    """Ask the user to pick the data file to analyse.

    On success ``self.file_path``, ``self.filename`` and ``self.file_dir``
    are set from the chosen path.

    Returns:
        bool: True when a file was chosen, False when the dialog was
        cancelled.
    """
    print("🔍 打开文件选择对话框...")
    root = tk.Tk()
    root.withdraw()  # only the dialog should be visible, not the root window
    try:
        self.file_path = filedialog.askopenfilename(
            title="选择数据文件",
            filetypes=[("Excel files", "*.xlsx"), ("CSV files", "*.csv"), ("All files", "*.*")],
        )
    finally:
        # Tear the hidden root down; otherwise every call leaves a Tk
        # interpreter alive for the rest of the process.
        root.destroy()

    if not self.file_path:
        print("❌ 未选择文件")
        return False

    self.filename = os.path.basename(self.file_path)
    self.file_dir = os.path.dirname(self.file_path)
    print(f"✅ 已选择文件: {self.filename}")
    print(f"📁 文件所在目录: {self.file_dir}")
    return True
|
||||
|
||||
def _load_data(self):
    """Load ``self.file_path`` into ``self.data`` and print a short preview.

    The format is chosen from the file extension, compared without regard
    to letter case so ``DATA.CSV`` or ``report.XLSX`` are accepted.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.xlsx``.
        Exception: Any error raised while reading is printed and re-raised.
    """
    print("📥 开始加载数据文件...")
    try:
        lowered_path = self.file_path.lower()
        if lowered_path.endswith('.csv'):
            self.data = pd.read_csv(self.file_path)
            print("✅ 成功加载CSV文件")
        elif lowered_path.endswith('.xlsx'):
            self.data = pd.read_excel(self.file_path)
            print("✅ 成功加载Excel文件")
        else:
            raise ValueError("不支持的文件格式")

        print(f"📊 数据文件形状: {self.data.shape}")
        print(f"📋 数据列名: {list(self.data.columns)[:10]}...")

        # Preview of the first rows.
        print("\n📋 数据预览(前3行):")
        print(self.data.head(3))

        # Dtypes of the first ten columns.
        print("\n📊 列数据类型:")
        for col in self.data.columns[:10]:
            print(f" {col}: {self.data[col].dtype}")

    except Exception as e:
        print(f"❌ 加载数据文件时出错: {e}")
        raise
|
||||
|
||||
def _validate_data(self):
    """Check ``self.data`` for the required columns and make limits numeric.

    Steps, each reported on stdout:
      1. the five measurement columns must exist;
      2. the six limit columns must exist;
      3. null counts of those columns are listed;
      4. a short summary of every measurement column is printed;
      5. every value/limit column that is not numeric yet is converted with
         ``pd.to_numeric(errors='coerce')`` in place.

    Raises:
        ValueError: If a measurement column or a limit column is missing.
    """
    print("🔍 验证数据完整性...")
    frame = self.data  # same object; conversions below write into self.data

    measure_cols = ['PAD ID', 'Component ID', 'Height(mil)', 'Volume(%)', 'Area(%)']
    absent_measure = [name for name in measure_cols if name not in frame.columns]
    if absent_measure:
        error_msg = f"❌ 数据文件中缺少必要的测量列: {absent_measure}"
        print(error_msg)
        raise ValueError(error_msg)

    limit_cols = ['Height_Low(mil)', 'Height_High(mil)', 'Vol_Min(%)', 'Vol_Max(%)', 'Area_Min(%)',
                  'Area_Max(%)']
    absent_limits = [name for name in limit_cols if name not in frame.columns]
    if absent_limits:
        error_msg = f"❌ 数据文件中缺少必要的上下限列: {absent_limits}"
        print(error_msg)
        raise ValueError(error_msg)

    print("✅ 数据验证通过")

    # Null report over all required columns.
    null_counts = frame[measure_cols + limit_cols].isnull().sum()
    if not null_counts.any():
        print("✅ 所有必需列都没有空值")
    else:
        print("⚠️ 数据中存在空值:")
        for col, count in null_counts[null_counts > 0].items():
            print(f" {col}: {count} 个空值")

    # Per-column summary; the layout depends on whether the column is numeric.
    print("\n📊 数据统计信息:")
    for col in measure_cols:
        if col not in frame.columns:
            continue
        series = frame[col]
        if not pd.api.types.is_numeric_dtype(series):
            unique_count = series.nunique()
            sample_values = series.dropna().head(3).tolist()
            print(
                f" {col}: {series.count()} 个有效值, {unique_count} 个唯一值, 示例: {sample_values}")
            continue
        valid_count = series.count()
        if valid_count > 0:
            low, high = series.min(), series.max()
            print(f" {col}: {valid_count} 个有效值, 范围 {low:.4f} - {high:.4f}")
        else:
            print(f" {col}: 0 个有效值")

    # Coerce the value and limit columns to numbers where needed.
    print("\n🔄 数据类型检查与转换:")
    numeric_columns = ['Height(mil)', 'Volume(%)', 'Area(%)',
                       'Height_Low(mil)', 'Height_High(mil)',
                       'Vol_Min(%)', 'Vol_Max(%)', 'Area_Min(%)', 'Area_Max(%)']
    for col in numeric_columns:
        if col not in frame.columns:
            continue
        if pd.api.types.is_numeric_dtype(frame[col]):
            print(f" ✅ {col}: 已经是数值类型, {frame[col].count()} 个有效值")
            continue
        try:
            count_before = frame[col].count()
            frame[col] = pd.to_numeric(frame[col], errors='coerce')
            lost_data = count_before - frame[col].count()
            if lost_data > 0:
                print(f" ⚠️ {col}: 转换后丢失 {lost_data} 个非数值数据")
            else:
                print(f" ✅ {col}: 成功转换为数值类型")
        except Exception as e:
            print(f" ❌ {col}: 类型转换失败 - {e}")
|
||||
|
||||
def _print_progress(self, message, level=1):
    """Print *message* prefixed with the current time and a level indent.

    Args:
        message: Text to print.
        level: Nesting depth; the indent string is repeated this many times.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    padding = level * " "
    # print() joins its arguments with one space: "<time> <indent><message>".
    print(stamp, f"{padding}{message}")
|
||||
|
||||
def generate_report(self):
    """Build the per-group statistics, compute Cpk and write the HTML report.

    Rows are grouped by ``"<PAD ID>_<Component ID>"``.  For every group the
    min / max / mean / std of height, volume and area are computed, the
    first limit values of the group are attached, and the result is passed
    through ``_calculate_cpk``, ``_analyze_cpk_results``,
    ``_print_cpk_summary`` and ``_create_html_report``.

    Returns:
        Whatever ``_create_html_report`` returns for the finished report.

    Raises:
        ValueError: If no data has been loaded.
        Exception: Any processing error is printed with its traceback and
            re-raised.
    """
    if self.data is None:
        raise ValueError("请先选择数据文件")

    emit = self._print_progress
    try:
        started = datetime.now()
        self.processing_start_time = started
        print(f"\n🚀 开始生成报告 - {started.strftime('%Y-%m-%d %H:%M:%S')}")

        self._validate_data()

        emit("开始数据处理...", 1)

        # Group key: both id columns as text, joined by an underscore.
        emit("创建分组键...", 2)
        for id_column in ('PAD ID', 'Component ID'):
            self.data[id_column] = self.data[id_column].astype(str)
        self.data['Group_Key'] = self.data['PAD ID'] + '_' + self.data['Component ID']
        emit(f"共发现 {self.data['Group_Key'].nunique()} 个分组", 2)

        # Show the five largest groups.
        group_info = self.data['Group_Key'].value_counts()
        emit("分组数据量统计:", 2)
        for group, count in group_info.head(5).items():
            emit(f" {group}: {count} 个数据点", 3)
        hidden_groups = len(group_info) - 5
        if hidden_groups > 0:
            emit(f" ... 还有 {hidden_groups} 个分组", 3)

        numeric_columns = ['Height(mil)', 'Volume(%)', 'Area(%)']
        present_numeric = [col for col in numeric_columns if col in self.data.columns]

        # Report NaN values.
        for col in present_numeric:
            nan_count = self.data[col].isna().sum()
            if nan_count > 0:
                emit(f"⚠️ {col} 有 {nan_count} 个空值,将在统计计算中排除", 3)

        emit("计算基本统计信息...", 2)

        # Turn +/-inf into NaN.
        for col in present_numeric:
            inf_count = np.isinf(self.data[col]).sum()
            if inf_count > 0:
                emit(f"⚠️ {col} 有 {inf_count} 个无穷大值,将替换为NaN", 3)
                self.data[col] = self.data[col].replace([np.inf, -np.inf], np.nan)

        grouped = self.data.groupby('Group_Key')
        measures = {col: ['min', 'max', 'mean', 'std'] for col in numeric_columns}
        stats = grouped.agg(measures).round(4)

        # Flatten the (column, aggregate) labels into fixed names.
        stats.columns = [
            'Height_Measured_Min(mil)', 'Height_Measured_Max(mil)', 'Height_Mean(mil)', 'Height_Std(mil)',
            'Volume_Measured_Min(%)', 'Volume_Measured_Max(%)', 'Volume_Mean(%)', 'Volume_Std(%)',
            'Area_Measured_Min(%)', 'Area_Measured_Max(%)', 'Area_Mean(%)', 'Area_Std(%)'
        ]
        emit("基本统计信息计算完成", 2)

        # Limits: first value seen in each group.
        emit("获取预设上下限信息...", 2)
        limit_columns = ['Height_Low(mil)', 'Height_High(mil)', 'Vol_Min(%)',
                         'Vol_Max(%)', 'Area_Min(%)', 'Area_Max(%)']
        limits = self.data.groupby('Group_Key').agg(dict.fromkeys(limit_columns, 'first')).round(4)

        stats = pd.concat([stats, limits], axis=1)
        emit("上下限信息获取完成", 2)

        emit("开始计算CPK值...", 2)
        stats = self._calculate_cpk(stats)

        cpk_analysis = self._analyze_cpk_results(stats)
        emit("CPK分析完成", 2)
        self._print_cpk_summary(cpk_analysis)

        emit("生成HTML报告...", 2)
        report_path = self._create_html_report(stats, cpk_analysis)
        emit("HTML报告生成完成", 2)

        elapsed = datetime.now() - self.processing_start_time
        emit(f"总处理时间: {elapsed.total_seconds():.2f} 秒", 1)

        return report_path

    except Exception as e:
        print(f"❌ 生成报告过程中出错: {e}")
        import traceback
        print("详细错误信息:")
        traceback.print_exc()
        raise
|
||||
|
||||
def _analyze_cpk_results(self, stats):
    """Summarise the Cpk columns of *stats*.

    Args:
        stats: DataFrame indexed by group key; may contain ``Height_Cpk``,
            ``Volume_Cpk`` and ``Area_Cpk``.

    Returns:
        dict with
          * ``total_groups``: number of rows in *stats*;
          * ``cpk_status``: per feature, the counts ``total`` (non-NaN),
            ``excellent`` (>= 1.33), ``acceptable`` (>= 1.0 and < 1.33),
            ``poor`` (< 1.0) and ``invalid`` (NaN); an empty dict for a
            feature whose Cpk column is absent;
          * ``problematic_groups``: one entry per group that has any Cpk
            below 1.0, with the group key and the offending values.
    """
    features = ('Height', 'Volume', 'Area')
    group_total = len(stats)

    status_by_feature = {name: {} for name in features}
    present = [(name, f'{name}_Cpk') for name in features if f'{name}_Cpk' in stats.columns]

    for name, column in present:
        values = stats[column].dropna()
        valid_total = len(values)
        status_by_feature[name] = {
            'total': valid_total,
            'excellent': int((values >= 1.33).sum()),
            'acceptable': int(((values >= 1.0) & (values < 1.33)).sum()),
            'poor': int((values < 1.0).sum()),
            'invalid': group_total - valid_total,
        }

    # A group is flagged when any of its available Cpk values is below 1.0.
    flagged = []
    for group_key, row in stats.iterrows():
        problems = [
            f"{name}: {row[column]:.4f}"
            for name, column in present
            if not pd.isna(row[column]) and row[column] < 1.0
        ]
        if problems:
            flagged.append({'group_key': group_key, 'problems': problems})

    return {
        'total_groups': group_total,
        'cpk_status': status_by_feature,
        'problematic_groups': flagged,
    }
|
||||
|
||||
def _print_cpk_summary(self, cpk_analysis):
    """Print a readable summary of a Cpk analysis result.

    Args:
        cpk_analysis: dict with ``cpk_status`` (feature -> counts dict) and
            ``problematic_groups`` (list of dicts with ``group_key`` and
            ``problems``).  A feature's counts dict may be empty, which is
            what the analysis step leaves behind when that feature has no
            Cpk column; such a feature is reported as having no valid data
            instead of raising ``KeyError``.
    """
    print("\n📈 CPK分析结果摘要:")
    print("=" * 60)

    for feature, status in cpk_analysis['cpk_status'].items():
        total = status.get('total', 0)
        if total == 0:
            print(f"\n{feature}: 无有效CPK数据")
            continue

        # total is non-zero from here on, so the divisions are safe.
        print(f"\n{feature}:")
        excellent_pct = status['excellent'] / total * 100
        acceptable_pct = status['acceptable'] / total * 100
        poor_pct = status['poor'] / total * 100

        print(f" ✅ 优秀 (CPK ≥ 1.33): {status['excellent']}/{total} ({excellent_pct:.1f}%)")
        print(f" ⚠️ 合格 (1.0 ≤ CPK < 1.33): {status['acceptable']}/{total} ({acceptable_pct:.1f}%)")
        print(f" ❌ 不合格 (CPK < 1.0): {status['poor']}/{total} ({poor_pct:.1f}%)")
        print(f" ❓ 无法计算: {status['invalid']}")

    problem_groups = cpk_analysis['problematic_groups']
    if problem_groups:
        print(f"\n⚠️ 发现 {len(problem_groups)} 个有问题分组:")
        # Only the first ten groups are listed in full.
        for number, group in enumerate(problem_groups[:10], 1):
            print(f" {number}. {group['group_key']}: {', '.join(group['problems'])}")
        if len(problem_groups) > 10:
            print(f" ... 还有 {len(problem_groups) - 10} 个问题分组")
    else:
        print("\n✅ 所有分组的CPK都在合格范围内")

    print("=" * 60)
|
||||
|
||||
def _calculate_cpk(self, stats):
    """Append ``Height_Cpk``, ``Volume_Cpk`` and ``Area_Cpk`` to *stats*.

    For each row and feature, Cpk is ``min((USL - mean) / (3 * std),
    (mean - LSL) / (3 * std))`` rounded to four decimals.  The result is
    NaN when the mean, std or either limit is missing, when std is zero, or
    when both limits are infinite; a single infinite limit makes the value
    one-sided.

    Args:
        stats: DataFrame with the mean / std / limit columns per group.  A
            column that is absent is treated as NaN.

    Returns:
        A new DataFrame: *stats* with the three Cpk columns appended.
    """
    self._print_progress("详细计算CPK值...", 3)

    infinity = float('inf')

    def calculate_single_cpk(mean, std, usl, lsl):
        """Return the Cpk of one feature, or NaN when it is undefined."""
        if pd.isna(mean) or pd.isna(std) or std == 0:
            return np.nan
        if pd.isna(usl) or pd.isna(lsl):
            return np.nan
        try:
            cpu = infinity if usl == infinity else (usl - mean) / (3 * std)
            cpl = infinity if lsl == -infinity else (mean - lsl) / (3 * std)
            if cpu == infinity:
                # Upper side open: one-sided on the lower limit, or nothing.
                return np.nan if cpl == infinity else cpl
            if cpl == infinity:
                return cpu
            return min(cpu, cpl)
        except (ZeroDivisionError, TypeError):
            return np.nan

    # (output column, mean column, std column, upper limit, lower limit)
    feature_columns = (
        ('Height_Cpk', 'Height_Mean(mil)', 'Height_Std(mil)', 'Height_High(mil)', 'Height_Low(mil)'),
        ('Volume_Cpk', 'Volume_Mean(%)', 'Volume_Std(%)', 'Vol_Max(%)', 'Vol_Min(%)'),
        ('Area_Cpk', 'Area_Mean(%)', 'Area_Std(%)', 'Area_Max(%)', 'Area_Min(%)'),
    )

    total_groups = len(stats)
    cpk_results = []
    for position, (_, row) in enumerate(stats.iterrows()):
        # Progress line every 100 groups, only for large inputs.
        if position % 100 == 0 and total_groups > 100:
            self._print_progress(f"计算第 {position + 1} 个分组的CPK...", 4)

        entry = {}
        for out_col, mean_col, std_col, usl_col, lsl_col in feature_columns:
            value = calculate_single_cpk(
                row.get(mean_col, np.nan),
                row.get(std_col, np.nan),
                row.get(usl_col, np.nan),
                row.get(lsl_col, np.nan),
            )
            entry[out_col] = np.nan if pd.isna(value) else round(value, 4)
        cpk_results.append(entry)

    cpk_df = pd.DataFrame(cpk_results, index=stats.index)
    stats = pd.concat([stats, cpk_df], axis=1)

    self._print_progress(f"所有 {len(stats)} 个分组CPK计算完成", 3)
    return stats
|
||||
|
||||
def _get_cpk_status_class(self, cpk_value):
    """Map a Cpk value to the CSS class used for it in the report.

    Returns:
        ``'cpk-invalid'`` for NaN/None, ``'cpk-excellent'`` for >= 1.33,
        ``'cpk-acceptable'`` for >= 1.0, otherwise ``'cpk-poor'``.
    """
    if pd.isna(cpk_value):
        return 'cpk-invalid'
    # Thresholds in descending order; the first one reached wins.
    for floor, css_class in ((1.33, 'cpk-excellent'), (1.0, 'cpk-acceptable')):
        if cpk_value >= floor:
            return css_class
    return 'cpk-poor'
|
||||
|
||||
def _create_html_report(self, stats, cpk_analysis):
|
||||
"""创建完整的HTML报告"""
|
||||
self._print_progress("构建HTML报告内容...", 3)
|
||||
|
||||
total_groups = len(stats)
|
||||
|
||||
# 完整的HTML模板
|
||||
html_content = f"""<!DOCTYPE html>
|
||||
<html lang="zh-CN">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>数据统计报告 - {self.filename}</title>
|
||||
<style>
|
||||
:root {{
|
||||
--color-excellent: #4CAF50;
|
||||
--color-acceptable: #FFC107;
|
||||
--color-poor: #F44336;
|
||||
--color-invalid: #9E9E9E;
|
||||
}}
|
||||
|
||||
body {{
|
||||
font-family: 'Segoe UI', Arial, sans-serif;
|
||||
margin: 20px;
|
||||
line-height: 1.6;
|
||||
background-color: #f8f9fa;
|
||||
}}
|
||||
|
||||
.container {{
|
||||
max-width: 95%;
|
||||
margin: 0 auto;
|
||||
background: white;
|
||||
padding: 20px;
|
||||
border-radius: 10px;
|
||||
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
||||
}}
|
||||
|
||||
h1 {{
|
||||
color: #2c3e50;
|
||||
border-bottom: 3px solid #3498db;
|
||||
padding-bottom: 10px;
|
||||
text-align: center;
|
||||
}}
|
||||
|
||||
h2 {{
|
||||
color: #34495e;
|
||||
margin-top: 30px;
|
||||
padding: 15px;
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
color: white;
|
||||
border-radius: 5px;
|
||||
}}
|
||||
|
||||
.summary {{
|
||||
background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
|
||||
color: white;
|
||||
padding: 20px;
|
||||
border-radius: 10px;
|
||||
margin-bottom: 30px;
|
||||
}}
|
||||
|
||||
.cpk-dashboard {{
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
||||
gap: 20px;
|
||||
margin: 20px 0;
|
||||
}}
|
||||
|
||||
.cpk-card {{
|
||||
background: white;
|
||||
padding: 20px;
|
||||
border-radius: 10px;
|
||||
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
|
||||
text-align: center;
|
||||
}}
|
||||
|
||||
.cpk-excellent {{ background-color: var(--color-excellent); color: white; }}
|
||||
.cpk-acceptable {{ background-color: var(--color-acceptable); color: black; }}
|
||||
.cpk-poor {{ background-color: var(--color-poor); color: white; }}
|
||||
.cpk-invalid {{ background-color: var(--color-invalid); color: white; }}
|
||||
|
||||
table {{
|
||||
border-collapse: collapse;
|
||||
width: 100%;
|
||||
margin-top: 20px;
|
||||
font-size: 12px;
|
||||
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
|
||||
background: white;
|
||||
}}
|
||||
|
||||
th, td {{
|
||||
border: 1px solid #ddd;
|
||||
padding: 12px;
|
||||
text-align: center;
|
||||
}}
|
||||
|
||||
th {{
|
||||
background: linear-gradient(135deg, #74b9ff 0%, #0984e3 100%);
|
||||
color: white;
|
||||
font-weight: bold;
|
||||
position: sticky;
|
||||
top: 0;
|
||||
}}
|
||||
|
||||
tr:nth-child(even) {{ background-color: #f8f9fa; }}
|
||||
tr:hover {{ background-color: #e3f2fd; }}
|
||||
|
||||
.limits {{
|
||||
background-color: #e8f5e8;
|
||||
font-weight: bold;
|
||||
color: #2e7d32;
|
||||
}}
|
||||
|
||||
.measured {{
|
||||
background-color: #fff3cd;
|
||||
color: #856404;
|
||||
}}
|
||||
|
||||
.problematic-row {{
|
||||
background-color: #ffebee !important;
|
||||
border-left: 4px solid var(--color-poor);
|
||||
}}
|
||||
|
||||
.warning-box {{
|
||||
background: #fff3cd;
|
||||
border-left: 4px solid #ffc107;
|
||||
padding: 15px;
|
||||
margin: 20px 0;
|
||||
border-radius: 5px;
|
||||
}}
|
||||
|
||||
.chart-container {{
|
||||
margin: 20px 0;
|
||||
padding: 20px;
|
||||
background: white;
|
||||
border-radius: 10px;
|
||||
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
|
||||
}}
|
||||
|
||||
.legend {{
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
gap: 20px;
|
||||
margin: 20px 0;
|
||||
flex-wrap: wrap;
|
||||
}}
|
||||
|
||||
.legend-item {{
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 5px;
|
||||
padding: 5px 10px;
|
||||
border-radius: 3px;
|
||||
}}
|
||||
|
||||
.na {{ color: #999; font-style: italic; }}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1>📊 数据统计报告 - {self.filename}</h1>
|
||||
<p><strong>生成时间:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
|
||||
<p><strong>输入文件:</strong> {self.filename}</p>
|
||||
|
||||
<div class="summary">
|
||||
<h2>📈 报告摘要</h2>
|
||||
<p><strong>总分组数量:</strong> {total_groups}</p>
|
||||
<p><strong>处理时间:</strong> {(datetime.now() - self.processing_start_time).total_seconds():.2f} 秒</p>
|
||||
</div>
|
||||
|
||||
<!-- CPK状态仪表板 -->
|
||||
<div class="cpk-dashboard">
|
||||
"""
|
||||
|
||||
# 添加CPK状态卡片
|
||||
for feature, status in cpk_analysis['cpk_status'].items():
|
||||
total = status['total'] + status['invalid']
|
||||
if total == 0:
|
||||
continue
|
||||
|
||||
html_content += f"""
|
||||
<div class="cpk-card">
|
||||
<h3>{feature} CPK状态</h3>
|
||||
<div style="font-size: 2em; font-weight: bold; margin: 10px 0;">
|
||||
{status['excellent'] + status['acceptable']}/{total}
|
||||
</div>
|
||||
<p>合格率: {(status['excellent'] + status['acceptable']) / total * 100:.1f}%</p>
|
||||
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; margin-top: 10px;">
|
||||
<span class="legend-item cpk-excellent">优秀: {status['excellent']}</span>
|
||||
<span class="legend-item cpk-acceptable">合格: {status['acceptable']}</span>
|
||||
<span class="legend-item cpk-poor">不合格: {status['poor']}</span>
|
||||
<span class="legend-item cpk-invalid">无效: {status['invalid']}</span>
|
||||
</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
html_content += f"""
|
||||
</div>
|
||||
|
||||
<!-- 问题分组警告 -->
|
||||
{f'<div class="warning-box"><h3>⚠️ 发现 {len(cpk_analysis["problematic_groups"])} 个问题分组</h3><p>以下分组的CPK值低于1.0,需要重点关注</p></div>' if cpk_analysis['problematic_groups'] else ''}
|
||||
|
||||
<h2>📋 详细统计数据</h2>
|
||||
|
||||
<div class="legend">
|
||||
<span class="legend-item" style="background-color: #e8f5e8;">预设上下限</span>
|
||||
<span class="legend-item" style="background-color: #fff3cd;">实测值</span>
|
||||
<span class="legend-item cpk-excellent">CPK ≥ 1.33</span>
|
||||
<span class="legend-item cpk-acceptable">1.0 ≤ CPK < 1.33</span>
|
||||
<span class="legend-item cpk-poor">CPK < 1.0</span>
|
||||
</div>
|
||||
|
||||
<div style="overflow-x: auto;">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th rowspan="2">分组标识</th>
|
||||
<th colspan="7">Height(mil)</th>
|
||||
<th colspan="7">Volume(%)</th>
|
||||
<th colspan="7">Area(%)</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<!-- Height列标题 -->
|
||||
<th class="limits">预设下限</th>
|
||||
<th class="limits">预设上限</th>
|
||||
<th class="measured">实测最小值</th>
|
||||
<th class="measured">实测最大值</th>
|
||||
<th>平均值</th>
|
||||
<th>标准差</th>
|
||||
<th>CPK</th>
|
||||
<!-- Volume列标题 -->
|
||||
<th class="limits">预设下限</th>
|
||||
<th class="limits">预设上限</th>
|
||||
<th class="measured">实测最小值</th>
|
||||
<th class="measured">实测最大值</th>
|
||||
<th>平均值</th>
|
||||
<th>标准差</th>
|
||||
<th>CPK</th>
|
||||
<!-- Area列标题 -->
|
||||
<th class="limits">预设下限</th>
|
||||
<th class="limits">预设上限</th>
|
||||
<th class="measured">实测最小值</th>
|
||||
<th class="measured">实测最大值</th>
|
||||
<th>平均值</th>
|
||||
<th>标准差</th>
|
||||
<th>CPK</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
"""
|
||||
|
||||
# 生成表格行数据的辅助函数
|
||||
def format_value(value):
|
||||
if pd.isna(value):
|
||||
return '<span class="na">N/A</span>'
|
||||
elif isinstance(value, (int, float)):
|
||||
return f"{value:.4f}"
|
||||
else:
|
||||
return str(value)
|
||||
|
||||
# 用于检查列是否存在的辅助函数
|
||||
def safe_get_value(row, column_name):
|
||||
"""安全获取列值,如果列不存在返回N/A"""
|
||||
if column_name in row.index:
|
||||
return row[column_name]
|
||||
else:
|
||||
return np.nan
|
||||
|
||||
for group_key, row in stats.iterrows():
|
||||
# 检查是否为问题分组
|
||||
is_problematic = any(problem['group_key'] == group_key for problem in cpk_analysis['problematic_groups'])
|
||||
row_class = 'class="problematic-row"' if is_problematic else ''
|
||||
|
||||
html_content += f"""
|
||||
<tr {row_class}>
|
||||
<td><strong>{group_key}</strong>{' ⚠️' if is_problematic else ''}</td>
|
||||
"""
|
||||
|
||||
# 为每个特征生成列
|
||||
for feature in ['Height', 'Volume', 'Area']:
|
||||
cpk_value = safe_get_value(row, f'{feature}_Cpk')
|
||||
cpk_class = self._get_cpk_status_class(cpk_value)
|
||||
|
||||
# 为不同特征设置正确的列名
|
||||
if feature == 'Height':
|
||||
lower_limit_col = 'Height_Low(mil)'
|
||||
upper_limit_col = 'Height_High(mil)'
|
||||
measured_min_col = 'Height_Measured_Min(mil)'
|
||||
measured_max_col = 'Height_Measured_Max(mil)'
|
||||
mean_col = 'Height_Mean(mil)'
|
||||
std_col = 'Height_Std(mil)'
|
||||
else:
|
||||
lower_limit_col = f"{'Vol' if feature == 'Volume' else 'Area'}_Min(%)" # 修正:Volume使用Vol_Min(%),Area使用Area_Min(%)
|
||||
upper_limit_col = f"{'Vol' if feature == 'Volume' else 'Area'}_Max(%)" # 修正:Volume使用Vol_Max(%),Area使用Area_Max(%)
|
||||
measured_min_col = f'{feature}_Measured_Min(%)'
|
||||
measured_max_col = f'{feature}_Measured_Max(%)'
|
||||
mean_col = f'{feature}_Mean(%)'
|
||||
std_col = f'{feature}_Std(%)'
|
||||
|
||||
html_content += f"""
|
||||
<!-- {feature}数据 -->
|
||||
<td class="limits">{format_value(safe_get_value(row, lower_limit_col))}</td>
|
||||
<td class="limits">{format_value(safe_get_value(row, upper_limit_col))}</td>
|
||||
<td class="measured">{format_value(safe_get_value(row, measured_min_col))}</td>
|
||||
<td class="measured">{format_value(safe_get_value(row, measured_max_col))}</td>
|
||||
<td>{format_value(safe_get_value(row, mean_col))}</td>
|
||||
<td>{format_value(safe_get_value(row, std_col))}</td>
|
||||
<td class="{cpk_class}">{format_value(cpk_value)}</td>
|
||||
"""
|
||||
|
||||
html_content += """
|
||||
</tr>"""
|
||||
|
||||
html_content += """
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div class="chart-container">
|
||||
<h2>📊 CPK状态分布</h2>
|
||||
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px;">
|
||||
"""
|
||||
|
||||
# 添加简单的CPK分布图表
|
||||
for feature, status in cpk_analysis['cpk_status'].items():
|
||||
total = status['total'] + status['invalid']
|
||||
if total == 0:
|
||||
continue
|
||||
|
||||
html_content += f"""
|
||||
<div>
|
||||
<h3>{feature} CPK分布</h3>
|
||||
<div style="background: #f8f9fa; padding: 20px; border-radius: 5px;">
|
||||
<div style="display: flex; height: 30px; margin: 10px 0; border-radius: 5px; overflow: hidden;">
|
||||
<div style="background: var(--color-excellent); width: {status['excellent'] / total * 100}%;"></div>
|
||||
<div style="background: var(--color-acceptable); width: {status['acceptable'] / total * 100}%;"></div>
|
||||
<div style="background: var(--color-poor); width: {status['poor'] / total * 100}%;"></div>
|
||||
<div style="background: var(--color-invalid); width: {status['invalid'] / total * 100}%;"></div>
|
||||
</div>
|
||||
<div style="text-align: center;">
|
||||
<small>优秀 {status['excellent']} | 合格 {status['acceptable']} | 不合格 {status['poor']} | 无效 {status['invalid']}</small>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
html_content += """
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>"""
|
||||
|
||||
# 保存报告
|
||||
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||||
report_filename = f"{os.path.splitext(self.filename)[0]}_report_{timestamp}.html"
|
||||
report_path = os.path.join(self.file_dir, report_filename)
|
||||
|
||||
self._print_progress(f"保存报告到: {report_path}", 3)
|
||||
with open(report_path, 'w', encoding='utf-8') as f:
|
||||
f.write(html_content)
|
||||
|
||||
return report_path
|
||||
|
||||
|
||||
def main():
    """Run the interactive report workflow and print the outcome.

    Asks the processor to select a file; when one is chosen, the data is
    loaded, the report is generated and its path is printed. Any exception
    raised along the way is printed together with its traceback instead of
    propagating to the caller.
    """
    banner = "=" * 60
    print(banner)
    print("🚀 数据统计报告生成程序 - Volume上下限修复版")
    print(banner)

    processor = DataProcessor()

    try:
        # Guard clause: nothing more to do when no file was selected.
        if not processor.select_file():
            print("❌ 未选择文件,程序退出")
            return

        processor._load_data()
        report_path = processor.generate_report()

        print("\n" + banner)
        print("✅ 程序执行完成")
        print(f"📄 统计报告生成成功: {report_path}")
        print(banner)

    except Exception as e:
        # Top-level boundary: show the failure and its traceback to the user.
        print(f"\n❌ 程序执行失败: {e}")
        import traceback
        traceback.print_exc()
|
||||
|
||||
|
||||
# Script entry point: run the workflow only when executed directly, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user