From 5c846eae947e0d8f14ca9e2105be26ef02632eb7 Mon Sep 17 00:00:00 2001 From: panxiang <1275280643@qq.com> Date: Mon, 2 Feb 2026 15:19:30 +0800 Subject: [PATCH] =?UTF-8?q?Python=E8=84=9A=E6=9C=AC=E5=BC=80=E5=8F=91?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=88=9D=E5=A7=8B=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 7 + BOMCompare/.gitignore | 16 + BOMCompare/BOMCompare for Merge V1.py | 655 ++++++++++ BOMCompare/BOMCompareForJP1.py | 635 ++++++++++ BOMCompare/BOMConsolidatorV1.py | 618 ++++++++++ BOMCompare/README.md | 14 + FFT_IMU/.gitignore | 19 + FFT_IMU/FFT_IMU_dc_html_v1.py | 739 ++++++++++++ FFT_IMU/FFT_IMU_dc_scan_v1.py | 648 ++++++++++ FFT_IMU/FFT_IMU_dc_v1.py | 648 ++++++++++ ICCIDupdata/.gitignore | 6 + ICCIDupdata/ICCIDtest_V1.py | 90 ++ IMULinkdata/.gitignore | 9 + IMULinkdata/LINLinkData_V1.py | 252 ++++ dataProcess/.gitignore | 20 + dataProcess/dataProcessMerge_V1.py | 475 ++++++++ dataProcess/dataProcess_html_V1.py | 1060 +++++++++++++++++ dataProcess/dataProcess_sightml_V1.py | 810 +++++++++++++ htmlProcess/.gitignore | 17 + htmlProcess/README.md | 11 + .../htmlReportProcess_Merge_picHtml_V1.py | 926 ++++++++++++++ htmlProcess/htmlReportProcess_Merge_pic_V1.py | 563 +++++++++ main.py | 251 ++++ tempReportProcess/.gitignore | 9 + tempReportProcess/tempReportProcess_V1.py | 248 ++++ 25 files changed, 8746 insertions(+) create mode 100644 .gitignore create mode 100644 BOMCompare/.gitignore create mode 100644 BOMCompare/BOMCompare for Merge V1.py create mode 100644 BOMCompare/BOMCompareForJP1.py create mode 100644 BOMCompare/BOMConsolidatorV1.py create mode 100644 BOMCompare/README.md create mode 100644 FFT_IMU/.gitignore create mode 100644 FFT_IMU/FFT_IMU_dc_html_v1.py create mode 100644 FFT_IMU/FFT_IMU_dc_scan_v1.py create mode 100644 FFT_IMU/FFT_IMU_dc_v1.py create mode 100644 ICCIDupdata/.gitignore create mode 100644 ICCIDupdata/ICCIDtest_V1.py create mode 100644 IMULinkdata/.gitignore 
create mode 100644 IMULinkdata/LINLinkData_V1.py create mode 100644 dataProcess/.gitignore create mode 100644 dataProcess/dataProcessMerge_V1.py create mode 100644 dataProcess/dataProcess_html_V1.py create mode 100644 dataProcess/dataProcess_sightml_V1.py create mode 100644 htmlProcess/.gitignore create mode 100644 htmlProcess/README.md create mode 100644 htmlProcess/htmlReportProcess_Merge_picHtml_V1.py create mode 100644 htmlProcess/htmlReportProcess_Merge_pic_V1.py create mode 100644 main.py create mode 100644 tempReportProcess/.gitignore create mode 100644 tempReportProcess/tempReportProcess_V1.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6a20e8c --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +/.idea/* +.idea/* +/.idea +.idea +/.venv +/.venv/* + diff --git a/BOMCompare/.gitignore b/BOMCompare/.gitignore new file mode 100644 index 0000000..2949325 --- /dev/null +++ b/BOMCompare/.gitignore @@ -0,0 +1,16 @@ +/build/* +/build +/dist/* +/dist +/source/* +/source + +BOMCompare for Merge V2.py + +BOMCompareForJP2.py + + +BOMConsolidator.py + +BOMConsolidatorV2.py +# BOMConsolidator.py \ No newline at end of file diff --git a/BOMCompare/BOMCompare for Merge V1.py b/BOMCompare/BOMCompare for Merge V1.py new file mode 100644 index 0000000..566d11e --- /dev/null +++ b/BOMCompare/BOMCompare for Merge V1.py @@ -0,0 +1,655 @@ +import pandas as pd +import tkinter as tk +from tkinter import filedialog +from datetime import datetime +import os +from typing import Dict, List, Tuple, Optional + + +class BOMComparator: + """BOM文件差异对比器""" + + def __init__(self): + self.file1_path = "" + self.file2_path = "" + self.file1_sheets = [] + self.file2_sheets = [] + self.common_sheets = [] + self.differences = {} + self.file1_name = "" + self.file2_name = "" + self.columns_to_exclude = ['检查信息', '检查状态', '校验信息'] # 要排除的列名 + + def select_file(self, title: str) -> str: + """手动选择文件""" + root = tk.Tk() + root.withdraw() + file_path = filedialog.askopenfilename( + 
title=title, + filetypes=[("Excel files", "*.xlsx"), ("All files", "*.*")] + ) + root.destroy() + return file_path + + def find_valid_sheets(self, file_path: str) -> List[str]: + """参考附件3的方式查找有效的sheet""" + valid_sheets = [] + + try: + xl_file = pd.ExcelFile(file_path) + + for sheet_name in xl_file.sheet_names: + try: + # 尝试读取sheet,检查是否包含BOM数据 + df = pd.read_excel(file_path, sheet_name=sheet_name, nrows=10) + + # 检查是否包含BOM相关列(参考附件结构) + required_columns = ['Partnumber', 'Purchase_Code', 'MF_PN', 'Description'] + found_columns = [col for col in df.columns if col in required_columns] + + if len(found_columns) >= 2: # 至少找到2个关键列 + # 检查是否有实际数据(不只是表头) + if len(df) > 1: + valid_sheets.append(sheet_name) + + except Exception as e: + continue + + except Exception as e: + print(f"读取文件 {file_path} 时出错: {e}") + + return valid_sheets + + def get_common_sheets(self) -> List[str]: + """获取两个文件的共同工作表""" + if not self.file1_sheets or not self.file2_sheets: + return [] + + # 标准化工作表名称(去除空格和特殊字符) + file1_clean = [self.standardize_sheet_name(sheet) for sheet in self.file1_sheets] + file2_clean = [self.standardize_sheet_name(sheet) for sheet in self.file2_sheets] + + # 找出共同的工作表 + common_sheets = [] + for sheet1 in self.file1_sheets: + clean_sheet1 = self.standardize_sheet_name(sheet1) + for sheet2 in self.file2_sheets: + clean_sheet2 = self.standardize_sheet_name(sheet2) + if clean_sheet1 == clean_sheet2: + common_sheets.append(sheet1) + break + + return common_sheets + + def standardize_sheet_name(self, sheet_name: str) -> str: + """标准化工作表名称,便于比较""" + return str(sheet_name).strip().lower().replace(' ', '_').replace('-', '_') + + def load_bom_data(self, file_path: str, sheet_name: str) -> pd.DataFrame: + """加载BOM数据""" + try: + df = pd.read_excel(file_path, sheet_name=sheet_name) + # 清理数据:去除空行和空列 + df = df.dropna(how='all').dropna(axis=1, how='all') + + # 清理列名 + df.columns = df.columns.str.strip() + + return df + except Exception as e: + print(f"加载sheet {sheet_name} 时出错: {e}") + return 
pd.DataFrame() + + def should_compare_column(self, column_name: str) -> bool: + """判断是否应该对比该列(排除检查信息类列)""" + exclude_keywords = ['检查', '校验', '状态', '备注', 'comment', 'check'] + column_lower = str(column_name).lower() + + # 检查是否在排除列表中 + if column_name in self.columns_to_exclude: + return False + + # 检查是否包含排除关键词 + for keyword in exclude_keywords: + if keyword in column_lower: + return False + + return True + + def get_columns_to_compare(self, df1: pd.DataFrame, df2: pd.DataFrame) -> List[str]: + """获取需要对比的列名(排除检查信息类列)""" + common_columns = list(set(df1.columns).intersection(set(df2.columns))) + + # 过滤掉不需要对比的列 + columns_to_compare = [col for col in common_columns if self.should_compare_column(col)] + + return columns_to_compare + + def compare_dataframes(self, df1: pd.DataFrame, df2: pd.DataFrame, sheet_name1: str, sheet_name2: str) -> Dict: + """对比两个DataFrame的差异(排除检查信息类列)""" + differences = { + 'sheet_names': f"{sheet_name1} vs {sheet_name2}", + 'added_rows': [], + 'removed_rows': [], + 'modified_rows': [], + 'columns_comparison': {}, + 'summary': { + 'total_rows_df1': len(df1), + 'total_rows_df2': len(df2), + 'added_count': 0, + 'removed_count': 0, + 'modified_count': 0 + }, + 'original_dfs': { + 'df1': df1.copy(), + 'df2': df2.copy() + } + } + + # 确定关键列用于行匹配 + key_columns = self.identify_key_columns(df1, df2) + + if not key_columns: + differences['error'] = "无法确定用于对比的关键列" + return differences + + try: + # 设置索引 + df1_indexed = df1.set_index(key_columns) + df2_indexed = df2.set_index(key_columns) + + # 获取需要对比的列(排除检查信息类列) + columns_to_compare = self.get_columns_to_compare(df1, df2) + + # 找出新增的行 + new_indexes = df2_indexed.index.difference(df1_indexed.index) + if len(new_indexes) > 0: + differences['added_rows'] = df2_indexed.loc[new_indexes].reset_index().to_dict('records') + differences['summary']['added_count'] = len(new_indexes) + + # 找出删除的行 + removed_indexes = df1_indexed.index.difference(df2_indexed.index) + if len(removed_indexes) > 0: + 
differences['removed_rows'] = df1_indexed.loc[removed_indexes].reset_index().to_dict('records') + differences['summary']['removed_count'] = len(removed_indexes) + + # 找出共同的行并进行详细对比(排除检查信息类列) + common_indexes = df1_indexed.index.intersection(df2_indexed.index) + + for idx in common_indexes: + row1 = df1_indexed.loc[idx] + row2 = df2_indexed.loc[idx] + + # 检查每列的值是否相同(只对比需要比较的列) + modified_cols = {} + for col in columns_to_compare: + if col in df1_indexed.columns and col in df2_indexed.columns: + val1 = row1[col] + val2 = row2[col] + + # 处理NaN值的比较 + if pd.isna(val1) and pd.isna(val2): + continue + elif pd.isna(val1) or pd.isna(val2) or str(val1) != str(val2): + modified_cols[col] = { + 'old_value': val1, + 'new_value': val2 + } + + if modified_cols: + # 获取完整的行数据以显示所有需要的列 + full_row_data = self.get_full_row_data_for_display(df1, df2, idx, key_columns) + + differences['modified_rows'].append({ + 'key_values': dict(zip(key_columns, idx)) if isinstance(idx, tuple) else {key_columns[0]: idx}, + 'modified_columns': modified_cols, + 'full_row_data': full_row_data + }) + differences['summary']['modified_count'] += 1 + + # 列级对比(包含所有列,用于统计) + common_columns = set(df1.columns).intersection(set(df2.columns)) + df1_only_columns = set(df1.columns).difference(set(df2.columns)) + df2_only_columns = set(df2.columns).difference(set(df1.columns)) + + # 计算实际参与对比的列 + compared_columns = set(columns_to_compare) + excluded_columns = common_columns - compared_columns + + differences['columns_comparison'] = { + 'common_columns': list(common_columns), + 'compared_columns': list(compared_columns), + 'excluded_columns': list(excluded_columns), + 'file1_only_columns': list(df1_only_columns), + 'file2_only_columns': list(df2_only_columns) + } + + except Exception as e: + differences['error'] = f"对比过程中出错: {str(e)}" + + return differences + + def get_full_row_data_for_display(self, df1: pd.DataFrame, df2: pd.DataFrame, idx, key_columns: List[str]) -> Dict: + """获取完整的行数据用于显示""" + display_data = {} + + 
# 获取两个文件中的对应行数据 + row1_data = self.extract_row_data(df1, idx, key_columns) + row2_data = self.extract_row_data(df2, idx, key_columns) + + # 定义需要显示的列(排除检查信息类列) + display_columns = ['Purchase_Code', 'MF_PN', 'Description', 'Part Type', 'MF_NAME', 'PCB_Footprint', '合计'] + + # 过滤掉检查信息类列 + display_columns = [col for col in display_columns if self.should_compare_column(col)] + + for col in display_columns: + val1 = row1_data.get(col, '') + val2 = row2_data.get(col, '') + + # 格式化显示:有差异显示原值->新值,无差异显示原值 + if pd.isna(val1) or val1 == '': + display_value = val2 + elif pd.isna(val2) or val2 == '': + display_value = val1 + elif str(val1) != str(val2): + display_value = f"{val1} -> {val2}" + else: + display_value = val1 + + display_data[col] = display_value + + # 添加文件来源信息 + display_data['_from_file1'] = row1_data + display_data['_from_file2'] = row2_data + + return display_data + + def extract_row_data(self, df: pd.DataFrame, idx, key_columns: List[str]) -> Dict: + """从DataFrame中提取指定行的数据""" + row_data = {} + + try: + if isinstance(idx, tuple): + # 多列索引的情况 + mask = pd.Series(True, index=df.index) + for i, key in enumerate(key_columns): + mask = mask & (df[key] == idx[i]) + if mask.any(): + original_row = df[mask].iloc[0] + for col in df.columns: + row_data[col] = original_row[col] + else: + # 单列索引的情况 + matching_rows = df[df[key_columns[0]] == idx] + if len(matching_rows) > 0: + original_row = matching_rows.iloc[0] + for col in df.columns: + row_data[col] = original_row[col] + + except Exception as e: + pass + + return row_data + + def format_value_display(self, value1, value2): + """格式化值的显示:有差异显示原值->新值,无差异显示原值""" + if pd.isna(value1) or value1 == '': + return value2 + elif pd.isna(value2) or value2 == '': + return value1 + elif str(value1) != str(value2): + return f"{value1} -> {value2}" + else: + return value1 + + def get_modified_columns_summary(self, modified_columns: Dict) -> str: + """获取修改列的概要汇总""" + if not modified_columns: + return "无修改" + + modified_list = 
list(modified_columns.keys()) + + # 如果修改列数量较少,直接显示 + if len(modified_list) <= 3: + return ", ".join(modified_list) + else: + # 数量较多时显示前3个加省略号 + return ", ".join(modified_list[:3]) + f"...等{len(modified_list)}列" + + def identify_key_columns(self, df1: pd.DataFrame, df2: pd.DataFrame) -> List[str]: + """识别用于行匹配的关键列""" + # 优先使用Partnumber作为关键列 + potential_keys = ['Partnumber', 'Purchase_Code', 'MF_PN'] + + for key in potential_keys: + if key in df1.columns and key in df2.columns: + # 检查该列是否适合作为关键列(不应有过多重复值) + df1_dup_rate = df1[key].duplicated().sum() / len(df1) + df2_dup_rate = df2[key].duplicated().sum() / len(df2) + + if df1_dup_rate < 0.1 and df2_dup_rate < 0.1: # 允许少量重复 + return [key] + + # 如果没有单一关键列,尝试组合 + for key_combo in [['Partnumber', 'MF_PN'], ['Purchase_Code', 'MF_PN']]: + if all(col in df1.columns for col in key_combo) and all(col in df2.columns for col in key_combo): + return key_combo + + # 最后尝试使用所有找到的共同列 + common_cols = list(set(df1.columns).intersection(set(df2.columns))) + if common_cols: + return common_cols[:2] # 最多使用前两列 + + return [] + + def generate_output_filename(self) -> str: + """生成输出文件名,以两个文件的有效sheet名称开头""" + if not self.file1_sheets or not self.file2_sheets: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return f"BOM差异报告_{timestamp}.xlsx" + + # 使用第一个文件第一个sheet和第二个文件第一个sheet + file1_sheet_name = str(self.file1_sheets[0]) if self.file1_sheets else "File1" + file2_sheet_name = str(self.file2_sheets[0]) if self.file2_sheets else "File2" + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # 清理sheet名称中的特殊字符 + clean_sheet1 = self.clean_filename(file1_sheet_name) + clean_sheet2 = self.clean_filename(file2_sheet_name) + + filename = f"{clean_sheet1}_vs_{clean_sheet2}_差异报告_{timestamp}.xlsx" + + return filename + + def clean_filename(self, filename: str) -> str: + """清理文件名中的特殊字符""" + filename = str(filename) + + # 移除Windows文件名中不允许的字符 + invalid_chars = '<>:"/\\|?*' + for char in invalid_chars: + filename = filename.replace(char, '_') 
+ + # 移除多余的空格和特殊字符 + filename = filename.replace(' ', '_') + filename = filename.replace('\t', '_') + filename = filename.replace('\n', '_') + + # 限制文件名长度 + if len(filename) > 50: + filename = filename[:50] + + return filename + + def clean_sheet_name(self, sheet_name: str, max_length: int = 25) -> str: + """清理工作表名称,确保符合Excel工作表名称限制""" + sheet_name = str(sheet_name) + + # 移除Excel工作表名称中不允许的字符 + invalid_chars = '[]:*?/\\' + for char in invalid_chars: + sheet_name = sheet_name.replace(char, '_') + + # 限制工作表名称长度(Excel限制为31个字符) + if len(sheet_name) > max_length: + sheet_name = sheet_name[:max_length] + + return sheet_name + + def get_output_directory(self) -> str: + """获取输出目录(第二个文件所在目录)""" + return os.path.dirname(self.file2_path) + + def generate_difference_report(self) -> str: + """生成差异报告Excel文件""" + if not self.differences: + return "没有发现差异" + + # 生成输出文件名和路径 + output_filename = self.generate_output_filename() + output_directory = self.get_output_directory() + output_path = os.path.join(output_directory, output_filename) + + try: + with pd.ExcelWriter(output_path, engine='openpyxl') as writer: + + # 创建总摘要表 + summary_data = [] + for diff_key, differences in self.differences.items(): + if 'error' not in differences: + columns_comparison = differences.get('columns_comparison', {}) + excluded_count = len(columns_comparison.get('excluded_columns', [])) + + summary_data.append([ + differences.get('sheet_names', diff_key), + differences['summary']['total_rows_df1'], + differences['summary']['total_rows_df2'], + differences['summary']['added_count'], + differences['summary']['removed_count'], + differences['summary']['modified_count'], + excluded_count + ]) + + if summary_data: + summary_df = pd.DataFrame(summary_data, columns=[ + '工作表对比', '文件1行数', '文件2行数', '新增行数', '删除行数', '修改行数', '排除列数' + ]) + summary_df.to_excel(writer, sheet_name='对比摘要', index=False) + + # 为每个对比创建详细报告 + for diff_key, differences in self.differences.items(): + sheet_key = 
self.clean_sheet_name(diff_key.replace('vs', '_vs_')) + + if 'error' in differences: + # 如果有错误,创建错误报告 + error_df = pd.DataFrame([['错误信息', differences['error']]]) + error_df.to_excel(writer, sheet_name=f"{sheet_key}_错误", index=False, header=False) + continue + + # 汇总表 - 包含列对比的详细信息 + summary_data = [] + summary_data.append(["对比项", "数量"]) + summary_data.append(["文件1总行数", differences['summary']['total_rows_df1']]) + summary_data.append(["文件2总行数", differences['summary']['total_rows_df2']]) + summary_data.append(["新增行数", differences['summary']['added_count']]) + summary_data.append(["删除行数", differences['summary']['removed_count']]) + summary_data.append(["修改行数", differences['summary']['modified_count']]) + summary_data.append(["共同列数", len(differences['columns_comparison']['common_columns'])]) + summary_data.append(["实际对比列数", len(differences['columns_comparison']['compared_columns'])]) + summary_data.append(["排除列数", len(differences['columns_comparison']['excluded_columns'])]) + summary_data.append(["文件1特有列", len(differences['columns_comparison']['file1_only_columns'])]) + summary_data.append(["文件2特有列", len(differences['columns_comparison']['file2_only_columns'])]) + + # 添加排除列详情 + excluded_cols = differences['columns_comparison'].get('excluded_columns', []) + if excluded_cols: + summary_data.append(["", ""]) + summary_data.append(["排除的列", "(检查信息类列不参与对比)"]) + for col in excluded_cols: + summary_data.append(["", f"- {col}"]) + + pd.DataFrame(summary_data).to_excel( + writer, + sheet_name=f"{sheet_key}_汇总", + index=False, + header=False + ) + + # 新增行详情 + if differences['added_rows']: + pd.DataFrame(differences['added_rows']).to_excel( + writer, + sheet_name=f"{sheet_key}_新增行", + index=False + ) + + # 删除行详情 + if differences['removed_rows']: + pd.DataFrame(differences['removed_rows']).to_excel( + writer, + sheet_name=f"{sheet_key}_删除行", + index=False + ) + + # 修改行详情 - 优化后的显示格式(排除检查信息列) + if differences['modified_rows']: + modified_data = [] + + for mod_row in 
differences['modified_rows']: + # 创建基础记录 + record = { + **mod_row['key_values'], # 关键列(如Partnumber) + '修改列': self.get_modified_columns_summary(mod_row['modified_columns']) + } + + # 添加所有需要显示的列(排除检查信息类列) + display_data = mod_row.get('full_row_data', {}) + + # 获取需要显示的列 + display_columns = list(display_data.keys()) + display_columns = [col for col in display_columns if + not col.startswith('_') and self.should_compare_column(col)] + + for col in display_columns: + record[col] = display_data.get(col, '') + + # 添加详细的修改信息(只包括参与对比的列) + for col, values in mod_row['modified_columns'].items(): + if self.should_compare_column(col): + record[f'详细_{col}'] = f"{values['old_value']} -> {values['new_value']}" + + modified_data.append(record) + + if modified_data: + modified_df = pd.DataFrame(modified_data) + + # 重新排列列的顺序,让重要信息在前 + column_order = list(mod_row['key_values'].keys()) + ['修改列'] + + # 添加其他显示列 + other_columns = [col for col in modified_df.columns + if col not in column_order and not col.startswith('详细_')] + column_order.extend(other_columns) + + # 添加详细修改信息列 + detailed_cols = [col for col in modified_df.columns if col.startswith('详细_')] + column_order.extend(detailed_cols) + + # 确保所有列都存在 + existing_columns = [col for col in column_order if col in modified_df.columns] + modified_df = modified_df[existing_columns] + + modified_df.to_excel( + writer, + sheet_name=f"{sheet_key}_修改行", + index=False + ) + + return output_path + + except Exception as e: + print(f"生成报告时出错: {e}") + return "" + + def run_comparison(self): + """执行完整的BOM对比流程""" + print("=== BOM文件差异对比工具 ===") + print("注意:检查信息类列(如'检查信息')将不参与修改行对比") + + # 1. 选择第一份文件 + print("\n步骤1: 选择第一份Excel文件") + self.file1_path = self.select_file("选择第一份BOM Excel文件") + if not self.file1_path: + print("未选择文件,程序退出") + return + + self.file1_name = os.path.basename(self.file1_path) + + # 2. 
选择第二份文件 + print("\n步骤2: 选择第二份Excel文件") + self.file2_path = self.select_file("选择第二份BOM Excel文件") + if not self.file2_path: + print("未选择文件,程序退出") + return + + self.file2_name = os.path.basename(self.file2_path) + + print(f"\n文件1: {self.file1_name}") + print(f"文件2: {self.file2_name}") + + # 3. 查找有效sheet + print("\n步骤3: 查找有效的工作表...") + self.file1_sheets = self.find_valid_sheets(self.file1_path) + self.file2_sheets = self.find_valid_sheets(self.file2_path) + + print(f"文件1的有效工作表: {self.file1_sheets}") + print(f"文件2的有效工作表: {self.file2_sheets}") + + if not self.file1_sheets or not self.file2_sheets: + print("至少有一个文件没有有效的工作表,无法进行对比") + return + + # 4. 进行差异对比 + print("\n步骤4: 进行差异对比...") + self.differences = {} + + # 使用第一个文件第一个sheet和第二个文件第一个sheet进行对比 + sheet1 = self.file1_sheets[0] + sheet2 = self.file2_sheets[0] + + print(f"正在对比: {sheet1} (文件1) vs {sheet2} (文件2)") + + df1 = self.load_bom_data(self.file1_path, sheet1) + df2 = self.load_bom_data(self.file2_path, sheet2) + + if df1.empty: + print(f" ⚠ 文件1的工作表 {sheet1} 数据加载失败") + return + + if df2.empty: + print(f" ⚠ 文件2的工作表 {sheet2} 数据加载失败") + return + + differences = self.compare_dataframes(df1, df2, sheet1, sheet2) + comparison_key = f"{sheet1}_vs_{sheet2}" + self.differences[comparison_key] = differences + + if 'error' in differences: + print(f" ⚠ 对比过程中出错: {differences['error']}") + else: + columns_comparison = differences.get('columns_comparison', {}) + excluded_count = len(columns_comparison.get('excluded_columns', [])) + + print(f" √ 完成对比:") + print(f" 文件1行数: {differences['summary']['total_rows_df1']}") + print(f" 文件2行数: {differences['summary']['total_rows_df2']}") + print(f" 新增行数: {differences['summary']['added_count']}") + print(f" 删除行数: {differences['summary']['removed_count']}") + print(f" 修改行数: {differences['summary']['modified_count']}") + print(f" 排除列数: {excluded_count} (检查信息类列不参与对比)") + + # 5. 
生成差异报告 + print("\n步骤5: 生成差异报告...") + output_file = self.generate_difference_report() + + if output_file and os.path.exists(output_file): + print(f"\n=== 对比完成 ===") + print(f"差异报告已生成: {os.path.basename(output_file)}") + # print(f"文件位置: {output_file}") + print(f"输出目录: {self.get_output_directory()}") + else: + print("未成功生成差异报告") + + +def main(): + """主函数""" + comparator = BOMComparator() + comparator.run_comparison() + + input("\n按Enter键退出...") + + +if __name__ == "__main__": + main() diff --git a/BOMCompare/BOMCompareForJP1.py b/BOMCompare/BOMCompareForJP1.py new file mode 100644 index 0000000..fa4eef9 --- /dev/null +++ b/BOMCompare/BOMCompareForJP1.py @@ -0,0 +1,635 @@ +import os +import pandas as pd +import numpy as np +import tkinter as tk +from tkinter import filedialog +from datetime import datetime +import warnings +import re +from openpyxl import Workbook +from openpyxl.utils.dataframe import dataframe_to_rows + +warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl') + + +class BOMComparator: + def __init__(self): + self.column_mapping = { + 'ITEM': 'Partnumber', + 'HT PN': 'Partnumber', + 'MF PN': 'MF_PN', + 'MFG': 'MF_NAME', + 'CRD': 'Reference', + 'Description': 'Description', + 'Qty': 'Quantity', + '焊接方式': '焊接方式', + 'Remark': '备注' + } + self.ignore_columns = ['备注'] + self.required_columns = list(self.column_mapping.values()) + self.change_columns = [ + 'ITEM', 'HT PN', 'MF PN', 'MFG', 'CRD', 'Description', 'Qty', 'Remark' + ] + self.mandatory_keywords = ['item', 'partnumber', 'mfpn'] + + # 异常记录 + self.validation_errors = [] + + self.stats = { + 'old_bom_rows': 0, + 'new_bom_rows': 0, + 'changed_items': 0, + 'added_items': 0, + 'removed_items': 0, + 'total_errors': 0 + } + + def normalize_text(self, text): + if pd.isna(text): + return "" + text = str(text) + text = re.sub(r'[^a-zA-Z0-9\s]', '', text) + return text.strip().lower() + + def find_header_row(self, df): + print(f"扫描前 {min(20, len(df))} 行寻找标题行...") + for i in range(min(20, 
len(df))): + row_values = [self.normalize_text(cell) for cell in df.iloc[i].values] + + contains_all_keywords = True + for keyword in self.mandatory_keywords: + if not any(keyword in cell_value for cell_value in row_values): + contains_all_keywords = False + break + + if contains_all_keywords: + print(f"✅ 找到有效标题行 (索引 {i}),包含所有必需关键词") + return i + + error_msg = ( + "❌ 未找到有效的标题行:所有标题行必须同时包含以下关键词:\n" + f"- Item (或类似表述)\n" + f"- Partnumber (或类似表述)\n" + f"- MF_PN (或类似表述)\n\n" + "在文件的前20行中没有找到同时包含所有关键词的行。" + ) + raise ValueError(error_msg) + + def find_active_sheet(self, file_path): + print(f"扫描文件: {os.path.basename(file_path)}") + xls = pd.ExcelFile(file_path) + + candidate_sheets = [] + for sheet_name in xls.sheet_names: + # 使用 BOM 或 PCBA 作为关键词 + if any(keyword in sheet_name.lower() for keyword in ["bom", "pcba"]): + candidate_sheets.append(sheet_name) + print(f" 发现候选Sheet: {sheet_name} - 关键词匹配") + + # 第一步:优先检查第一个bom候选Sheet + successful_sheet = None + if candidate_sheets: + + for first_candidate in candidate_sheets: + # 先检查第一个候选Sheet + # first_candidate = candidate_sheets[0] + try: + print(f" 优先检查候选Sheet: {first_candidate}") + df_preview = pd.read_excel( + file_path, + sheet_name=first_candidate, + header=None, + nrows=20, + engine='openpyxl' + ) + header_row_idx = self.find_header_row(df_preview) + print(f"✅ 在候选Sheet '{first_candidate}' 中找到标题行") + # return first_candidate + successful_sheet = first_candidate + break + except Exception as e: + print(f" ❌ 优先候选Sheet '{first_candidate}': {str(e)}") + # 移除失败的首选候选 + # candidate_sheets.pop(0) + # remove(值) - 移除指定值的元素 + # candidate_sheets.remove(first_candidate) # 移除值为 'sheet_name' 的元素 + continue + if successful_sheet: + return successful_sheet + + # 第二步:如果没找到bom候选Sheet或首选候选失败,遍历所有候选Sheet + if not successful_sheet: + candidate_sheets = xls.sheet_names + print(" 未找到名称包含'BOM'的Sheet,将检查所有Sheet") + + # 遍历剩余候选Sheet + for sheet_name in candidate_sheets: + try: + print(f" 检查Sheet: {sheet_name}") + df_preview = pd.read_excel( + 
file_path, + sheet_name=sheet_name, + header=None, + nrows=20, + engine='openpyxl' + ) + + try: + header_row_idx = self.find_header_row(df_preview) + print(f"✅ 在Sheet '{sheet_name}' 中找到标题行") + return sheet_name + except ValueError as e: + print(f" ❌ Sheet '{sheet_name}': {str(e)}") + continue + except Exception as e: + print(f" 检查Sheet '{sheet_name}' 时出错: {str(e)}") + continue + + # 第三步:如果所有候选Sheet都失败,尝试第一个Sheet作为备选 + print("⚠️ 所有候选Sheet检查失败,尝试第一个Sheet") + first_sheet = xls.sheet_names[0] + try: + df_preview = pd.read_excel( + file_path, + sheet_name=first_sheet, + header=None, + nrows=20, + engine='openpyxl' + ) + header_row_idx = self.find_header_row(df_preview) + print(f"✅ 在备份Sheet '{first_sheet}' 中找到标题行") + return first_sheet + except Exception as e: + print(f"❌ 备份Sheet '{first_sheet}' 也失败: {str(e)}") + return None + + def validate_bom(self, bom_df, file_name, sheet_name): + """验证BOM数据并收集异常""" + errors = [] + + # 1. 检查Partnumber是否有重复 + dup_partnumbers = bom_df[bom_df.duplicated('Partnumber', keep=False)] + if not dup_partnumbers.empty: + print(f"⚠️ 发现重复的Partnumber: {len(dup_partnumbers)} 行") + for idx, row in dup_partnumbers.iterrows(): + error = { + '文件': file_name, + 'Sheet': sheet_name, + '原始行号': idx + 2, # Excel行号从1开始,标题行下一行 + '异常类型': '重复Partnumber', + '异常描述': f"Partnumber '{row['Partnumber']}' 重复出现" + } + errors.append(error) + + # 2. 检查Partnumber是否为空 + empty_partnumbers = bom_df[bom_df['Partnumber'].isna() | (bom_df['Partnumber'] == '')] + if not empty_partnumbers.empty: + print(f"⚠️ 发现空Partnumber: {len(empty_partnumbers)} 行") + for idx, row in empty_partnumbers.iterrows(): + error = { + '文件': file_name, + 'Sheet': sheet_name, + '原始行号': idx + 2, + '异常类型': '空Partnumber', + '异常描述': "Partnumber为空" + } + errors.append(error) + + # 3. 
验证Reference位号数量与Quantity是否一致 + for idx, row in bom_df.iterrows(): + # # 跳过PCB等特殊项 + # if row.get('Part Type') == 'PCB' or pd.isna(row.get('Reference')): + # continue + + refs = str(row['Reference']) + qty = row['Quantity'] + + try: + # 计算实际位号数量 + ref_count = len([r for r in refs.split(',') if r.strip()]) + + # 检查Quantity是否为数字 + try: + qty_val = int(qty) + except (ValueError, TypeError): + qty_val = -1 + + # 验证数量一致性 + if ref_count != qty_val: + error = { + '文件': file_name, + 'Sheet': sheet_name, + '原始行号': idx + 2, + '异常类型': '数量不一致', + '异常描述': f"位号数量({ref_count}) ≠ Quantity({qty})" + } + errors.append(error) + except Exception as e: + error = { + '文件': file_name, + 'Sheet': sheet_name, + '原始行号': idx + 2, + '异常类型': '验证错误', + '异常描述': f"验证异常: {str(e)}" + } + errors.append(error) + + return errors + + def load_bom(self, file_path): + print(f"识别激活Sheet...") + active_sheet = self.find_active_sheet(file_path) + print(f"📊 使用Sheet: {active_sheet}") + + df_preview = pd.read_excel( + file_path, + sheet_name=active_sheet, + header=None, + nrows=20 + ) + + header_row_idx = self.find_header_row(df_preview) + + print("加载完整BOM数据...") + bom_df = pd.read_excel( + file_path, + sheet_name=active_sheet, + header=header_row_idx, + dtype=str + ) + + if "old_bom_rows" not in self.stats or self.stats['old_bom_rows'] == 0: + self.stats['old_bom_rows'] = len(bom_df) + else: + self.stats['new_bom_rows'] = len(bom_df) + + # 清理列名 + bom_df.columns = [str(col).strip() for col in bom_df.columns] + print(f" 原始列名: {list(bom_df.columns)}") + + # 列名标准化映射 + column_aliases = { + 'Item': 'Item', + 'Partnumber': 'Partnumber', + 'Part Number': 'Partnumber', + 'Purchase_Code': 'Purchase_Code', + 'MF_PN': 'MF_PN', + 'Description': 'Description', + 'Part Type': 'Part Type', + 'MF_NAME': 'MF_NAME', + 'Manufacturer': 'MF_NAME', + 'PCB_Footprint': 'PCB_Footprint', + 'Reference': 'Reference', + 'References': 'Reference', + 'Quantity': 'Quantity', + 'Qty': 'Quantity', + '加工方式': '焊接方式', + '焊接方式': '焊接方式', + 'Value': 
'Value', + '备注': '备注', + 'Remark': '备注', + 'Comments': '备注' + } + + # 应用别名映射 + bom_df = bom_df.rename(columns={col: alias for col, alias in column_aliases.items() + if col in bom_df.columns}) + print(f" 标准化后列名: {list(bom_df.columns)}") + + # 确保所有必需列存在 + missing_cols = [col for col in self.required_columns if col not in bom_df.columns] + if missing_cols: + raise ValueError(f"❌ 缺少必需列: {', '.join(missing_cols)}") + + # 清理数据:去除空行和无效项 + initial_count = len(bom_df) + bom_df = bom_df.replace('', np.nan) + bom_df = bom_df.dropna(subset=['Item'], how='all') + cleaned_count = len(bom_df) + + if initial_count > cleaned_count: + print( + f" 清理空行: 移除 {initial_count - cleaned_count} 行 (原 {initial_count} 行 -> 现 {cleaned_count} 行)") + + # 执行数据验证 + file_name = os.path.basename(file_path) + errors = self.validate_bom(bom_df, file_name, active_sheet) + self.validation_errors.extend(errors) + self.stats['total_errors'] += len(errors) + + if errors: + print(f"⚠️ 在 '{file_name}' 中发现 {len(errors)} 个数据异常") + + return bom_df, active_sheet + + def compare_reference_lists(self, old_refs_str, new_refs_str): + """比较两个Reference列表,返回差异描述""" + if pd.isna(old_refs_str): + old_refs_str = "" + if pd.isna(new_refs_str): + new_refs_str = "" + + old_refs = set([ref.strip() for ref in str(old_refs_str).split(',') if ref.strip()]) + new_refs = set([ref.strip() for ref in str(new_refs_str).split(',') if ref.strip()]) + + # 如果两个集合相同,返回空字符串表示无差异 + if old_refs == new_refs: + return "" + + # 计算差异 + added_refs = new_refs - old_refs + removed_refs = old_refs - new_refs + + diff_msgs = [] + if added_refs: + diff_msgs.append(f"增加位号: {','.join(sorted(added_refs))}") + if removed_refs: + diff_msgs.append(f"删除位号: {','.join(sorted(removed_refs))}") + + return "; ".join(diff_msgs) + + def compare_boms(self, old_bom, new_bom): + print("开始比较两份BOM...") + old_bom['Partnumber'] = old_bom['Partnumber'].astype(str).str.strip() + new_bom['Partnumber'] = new_bom['Partnumber'].astype(str).str.strip() + + changes = [] + + 
old_partnumbers = set(old_bom['Partnumber'].unique()) + if len(old_partnumbers) != len(old_bom): + print(f"⚠️ 旧BOM有重复的Partnumber: 总行数{len(old_bom)},唯一物料数{len(old_partnumbers)}") + new_partnumbers = set(new_bom['Partnumber'].unique()) + if len(new_partnumbers) != len(new_bom): + print(f"⚠️ 新BOM有重复的Partnumber: 总行数{len(new_bom)},唯一物料数{len(new_partnumbers)}") + + all_partnumbers = sorted(old_partnumbers | new_partnumbers) + print(f" 总物料项数量: {len(all_partnumbers)} (旧BOM: {len(old_partnumbers)}, 新BOM: {len(new_partnumbers)})") + + for idx, pn in enumerate(all_partnumbers): + if (idx + 1) % 100 == 0 or (idx + 1) == len(all_partnumbers): + print(f" 处理进度: {idx + 1}/{len(all_partnumbers)} 项物料") + + record = {'ITEM_OLD': '', 'ITEM_NEW': ''} + old_row = None + new_row = None + change_desc = "" + + old_match = old_bom[old_bom['Partnumber'] == pn] + if not old_match.empty: + old_row = old_match.iloc[0] + record['ITEM_OLD'] = old_row['Item'] + + new_match = new_bom[new_bom['Partnumber'] == pn] + if not new_match.empty: + new_row = new_match.iloc[0] + record['ITEM_NEW'] = new_row['Item'] + + change_type = "" + if old_row is None: + change_type = "新增" + self.stats['added_items'] += 1 + change_desc = "新增物料" + elif new_row is None: + change_type = "删除" + self.stats['removed_items'] += 1 + change_desc = "删除物料" + else: + change_type = "变更" + self.stats['changed_items'] += 1 + + # 填充左侧列(旧BOM值) + for change_col, bom_col in self.column_mapping.items(): + if change_col == 'ITEM': + continue + old_val = old_row[bom_col] if old_row is not None and bom_col in old_row else '' + record[change_col] = old_val + + # 填充右侧列(新BOM值) + for change_col, bom_col in self.column_mapping.items(): + if change_col == 'ITEM': + continue + new_val = new_row[bom_col] if new_row is not None and bom_col in new_row else '' + record[f'NEW_{change_col}'] = new_val + + if change_type == "变更": + change_details = [] + qty_changed = False + if 'Quantity' in old_row.index and 'Quantity' in new_row.index: + old_qty = 
def generate_summary(self):
    """Render the counters in ``self.stats`` as a printable text summary.

    Returns:
        A multi-line string: a leading blank line, a ruled header, one
        ``label: value`` line per counter (including the total number of
        change records, i.e. changed + added + removed), and a closing rule.
    """
    counts = self.stats
    heavy_rule = "=" * 50
    total_records = (
        counts['changed_items'] + counts['added_items'] + counts['removed_items']
    )
    labelled_values = (
        ("原始BOM行数", counts['old_bom_rows']),
        ("新BOM行数", counts['new_bom_rows']),
        ("变更物料数量", counts['changed_items']),
        ("新增物料数量", counts['added_items']),
        ("删除物料数量", counts['removed_items']),
        ("变更记录总数", total_records),
        ("数据异常总数", counts['total_errors']),
    )

    lines = ["\n" + heavy_rule, "BOM 比较处理汇总", "-" * 50]
    lines.extend(f"{label}: {value}" for label, value in labelled_values)
    lines.append(heavy_rule)
    return "\n".join(lines)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_file = f"{old_bom_activesheetname} to {new_bom_activesheetname} eBOM_change_record_{timestamp}.xlsx" + output_path = os.path.join(output_dir, output_file) + + # 保存变更记录和异常记录 + print(f"\n💾 保存变更记录文件: {output_path}") + wb = Workbook() + + # 创建变更记录工作表 + ws_change = wb.active + ws_change.title = "PCBA_BOM_change record" + + if change_df.empty: + ws_change.cell(row=1, column=1, value="两份BOM完全相同,无变更记录") + print("✅ 两份BOM完全相同,无变更记录") + else: + # 重命名列 + column_rename = { + 'ITEM_OLD': 'ITEM', + 'ITEM_NEW': 'ITEM', + **{f'NEW_{col}': col for col in self.change_columns if col != 'ITEM'}, + 'NEW_Remark': 'Remark' + } + change_df = change_df.rename(columns=column_rename) + + # 添加文件名信息 + ws_change.cell(row=1, column=1, value=old_file_name) + ws_change.cell(row=1, column=right_start_col, value=new_file_name) + + # 添加列标题 + col_names = change_df.columns.tolist() + for col_idx, col_name in enumerate(col_names, 1): + ws_change.cell(row=2, column=col_idx, value=col_name) + + # 添加数据行 + for r_idx, row in enumerate(dataframe_to_rows(change_df, index=False, header=False), 3): + for c_idx, value in enumerate(row, 1): + ws_change.cell(row=r_idx, column=c_idx, value=value) + + # 创建异常记录工作表 + if self.validation_errors: + print(f"⚠️ 发现 {len(self.validation_errors)} 个数据异常,创建异常记录") + ws_errors = wb.create_sheet(title="BOM异常记录") + + # 异常记录列名 + error_columns = ['文件', 'Sheet', '原始行号', '异常类型', '异常描述'] + for col_idx, col_name in enumerate(error_columns, 1): + ws_errors.cell(row=1, column=col_idx, value=col_name) + + # 添加异常数据 + for row_idx, error in enumerate(self.validation_errors, 2): + ws_errors.cell(row=row_idx, column=1, value=error['文件']) + ws_errors.cell(row=row_idx, column=2, value=error['Sheet']) + ws_errors.cell(row=row_idx, column=3, value=error['原始行号']) + ws_errors.cell(row=row_idx, column=4, value=error['异常类型']) + ws_errors.cell(row=row_idx, column=5, value=error['异常描述']) + + # 保存工作簿 + wb.save(output_path) + + # 打印处理汇总 + 
@dataclass
class BOMRow:
    """One material line read from a BOM sheet.

    All descriptive fields are stored as stripped text; ``quantity`` is an
    integer.  ``filename`` and ``sheet_name`` record where the row came from.
    """
    partnumber: str
    purchase_code: str
    mf_pn: str
    description: str
    part_type: str
    mf_name: str
    pcb_footprint: str
    quantity: int
    reference: str
    filename: str = ""
    sheet_name: str = ""

    @staticmethod
    def _clean_text(value: object) -> str:
        """Convert a spreadsheet cell to stripped text, treating blanks as ''.

        ``str(NaN)`` is ``'nan'`` and ``str(None)`` is ``'None'``; taken
        literally, every row with an empty part number would look valid and
        be merged under the same bogus key.  Missing values therefore map to
        the empty string.  Whole-number floats (pandas upcasts an integer
        column to float as soon as it contains a blank) are rendered without
        the trailing ``.0`` so the same part yields the same key in every file.
        """
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip()

    @classmethod
    def from_dataframe_row(cls, row: pd.Series, filename: str = "", sheet_name: str = "") -> Optional['BOMRow']:
        """Build a ``BOMRow`` from one DataFrame row.

        Args:
            row: Row whose labels are the normalised BOM column names
                (``Partnumber``, ``Purchase_Code``, ``MF_PN``, ...).
                Missing labels are treated as blank cells.
            filename: Name of the source workbook, stored on the result.
            sheet_name: Name of the source sheet, stored on the result.

        Returns:
            The parsed row, or ``None`` when ``Quantity`` cannot be converted
            to an integer (for example a blank or non-numeric cell).
        """
        text = cls._clean_text
        try:
            return cls(
                partnumber=text(row.get('Partnumber', '')),
                purchase_code=text(row.get('Purchase_Code', '')),
                mf_pn=text(row.get('MF_PN', '')),
                description=text(row.get('Description', '')),
                part_type=text(row.get('Part_Type', '')),
                mf_name=text(row.get('MF_NAME', '')),
                pcb_footprint=text(row.get('PCB_Footprint', '')),
                quantity=int(row.get('Quantity', 0)),
                reference=text(row.get('Reference', '')),
                filename=filename,
                sheet_name=sheet_name
            )
        except (ValueError, TypeError):
            return None

    def get_key(self) -> str:
        """Return the merge key: the part number, else the manufacturer PN."""
        return self.partnumber if self.partnumber else self.mf_pn

    def is_valid(self) -> bool:
        """Return True when the row has a non-empty merge key."""
        return bool(self.get_key())
def find_valid_sheet(self, file_path: str) -> Tuple[Optional[str], Optional[int]]:
    """Locate the sheet and row that hold the BOM header.

    Only the first 10 rows of each sheet are inspected.  A row qualifies
    when every name in ``self.required_headers`` occurs somewhere in the
    text of its cells.

    Args:
        file_path: Path of the Excel workbook to inspect.

    Returns:
        ``(sheet_name, header_row_index)`` for the first match, where the
        index is 0-based, or ``(None, None)`` when no sheet qualifies or
        the workbook cannot be read.  Callers unpack the pair, so a tuple
        is returned in both cases.
    """
    try:
        # The context manager closes the workbook handle; left open it keeps
        # the file locked on Windows for the rest of the run.
        with pd.ExcelFile(file_path) as workbook:
            for sheet_name in workbook.sheet_names:
                # Parse from the already-open workbook, and only as many rows
                # as are scanned, instead of re-reading the whole file.
                preview = workbook.parse(sheet_name, header=None, nrows=10)

                for row_index in range(len(preview)):
                    # Join the cell texts explicitly: the repr of a NumPy
                    # array is abbreviated with "..." for very wide rows.
                    row_text = " ".join(str(cell) for cell in preview.iloc[row_index].values)
                    if all(col in row_text for col in self.required_headers):
                        filename = os.path.basename(file_path)
                        print(f"文件{filename}找到有效sheet {sheet_name}|有效数据行从 {row_index} 开始。")
                        return sheet_name, row_index
    except Exception as e:
        # Best effort: an unreadable workbook is reported and skipped.
        print(f"读取文件 {file_path} 时出错: {e}")

    return None, None
def add_bom_row(self, bom_row: "BOMRow") -> None:
    """Merge one BOM row into the consolidated material table.

    The row is filed under ``bom_row.get_key()``.  The first row seen for a
    key defines the material's descriptive fields; every row is checked
    against them and against its own Reference/Quantity count, and any
    findings are appended to the material's ``inconsistencies``.

    Quantities are accumulated per source file: when a file lists the same
    part on several rows, the file's quantity is the sum of those rows
    rather than the value of whichever row came last.

    Args:
        bom_row: Parsed row to merge.
    """
    key = bom_row.get_key()

    if key not in self.materials:
        # First sighting of this part: seed the merged record from this row.
        self.materials[key] = ConsolidatedMaterial(
            partnumber=bom_row.partnumber,
            purchase_code=bom_row.purchase_code,
            mf_pn=bom_row.mf_pn,
            description=bom_row.description,
            part_type=bom_row.part_type,
            mf_name=bom_row.mf_name,
            pcb_footprint=bom_row.pcb_footprint,
            quantity_data={},
            inconsistencies=[]
        )

    material = self.materials[key]

    # Field-level differences against the values already recorded.
    inconsistencies = self.consistency_checker.check_field_consistency(material, bom_row)
    material.inconsistencies.extend(inconsistencies)

    # Number of reference designators versus the stated quantity.
    ref_inconsistency = self.consistency_checker.check_quantity_reference(bom_row)
    if ref_inconsistency:
        material.inconsistencies.append(ref_inconsistency)

    # Add to, rather than replace, what this file already contributed.
    per_file = material.quantity_data
    per_file[bom_row.filename] = per_file.get(bom_row.filename, 0) + bom_row.quantity
    self.file_quantities[bom_row.filename][key] = per_file[bom_row.filename]
class ReportGenerator:
    """Write the consolidated BOM workbook into ``<output_folder>/BOM_Merge_out``.

    The workbook has three sheets: a key/value summary, a table describing
    each source file, and the merged material list with one quantity column
    per source file.
    """

    def __init__(self, output_folder: str):
        """Remember the base folder and create the output sub-folder."""
        self.output_folder = output_folder
        self._ensure_output_directory()

    def _ensure_output_directory(self):
        """Create ``BOM_Merge_out`` under the base folder if it is missing."""
        output_dir = os.path.join(self.output_folder, "BOM_Merge_out")
        os.makedirs(output_dir, exist_ok=True)

    def _create_summary_sheet(self, stats: Dict[str, Any]) -> pd.DataFrame:
        """Build the two-column summary sheet from the statistics dict.

        ``stats`` must contain the processing and material counters;
        ``processed_files_info`` and ``output_filename`` are optional and
        default to an empty list and an empty string.
        """
        summary_data = [
            ["BOM合并检查汇总报告", ""],
            ["生成时间", datetime.now().strftime("%Y-%m-%d %H:%M:%S")],
            ["", ""],
            ["处理统计", ""],
            ["扫描文件总数", stats['total_files']],
            ["成功处理文件数", stats['processed_files']],
            ["处理数据行数", stats['processed_rows']],
            ["", ""],
            ["物料统计", ""],
            ["合并物料种类数", stats['total_materials']],
            ["存在问题的物料数", stats['materials_with_issues']],
            ["不一致问题总数", stats['total_inconsistencies']],
            ["", ""],
            ["数据源文件信息", ""],
            ["有效文件总数", len(stats.get('processed_files_info', []))],
            ["", ""]
        ]

        # One small group of rows per successfully parsed source file.
        files_info = stats.get('processed_files_info', [])
        for i, file_info in enumerate(files_info, 1):
            summary_data.extend([
                [f"数据源文件 {i}", file_info.filename],
                [" Sheet名称", file_info.sheet_name],
                [" 起始行", file_info.start_row + 1],  # shown 1-based
                [" 总行数", file_info.total_rows],
                [" 有效行数", file_info.valid_rows],
                ["", ""]
            ])

        summary_data.extend([
            ["", ""],
            ["文件信息", ""],
            ["输出文件夹", os.path.join(self.output_folder, "BOM_Merge_out")],
            ["报告文件", stats.get('output_filename', '')],
            ["合并Sheet名称", "BOM_Merge"]
        ])

        return pd.DataFrame(summary_data, columns=["项目", "数值"])

    def _create_data_source_sheet(self, stats: Dict[str, Any]) -> pd.DataFrame:
        """Build the per-file table, or a one-row placeholder when empty."""
        files_info = stats.get('processed_files_info', [])

        if not files_info:
            return pd.DataFrame([["无有效数据源文件", ""]], columns=["状态", "说明"])

        data_source_data = [
            {
                '序号': i,
                '文件名': file_info.filename,
                'Sheet名称': file_info.sheet_name,
                '数据起始行': file_info.start_row + 1,  # shown 1-based
                '总行数': file_info.total_rows,
                '有效行数': file_info.valid_rows,
                '处理状态': '成功'
            }
            for i, file_info in enumerate(files_info, 1)
        ]

        return pd.DataFrame(data_source_data)

    def _create_merge_sheet(self, consolidator: "MaterialConsolidator") -> pd.DataFrame:
        """Build the merged material table.

        Each material becomes one row with its descriptive fields, the
        consistency findings, one quantity column per source file (0 when
        the file does not contain the part) and the overall total.
        """
        report_data = []
        file_columns = sorted(consolidator.file_quantities.keys())

        for material in consolidator.materials.values():
            row = {
                'Partnumber': material.partnumber,
                'Purchase_Code': material.purchase_code,
                'MF_PN': material.mf_pn,
                'Description': material.description,
                'Part Type': material.part_type,
                'MF_NAME': material.mf_name,
                'PCB_Footprint': material.pcb_footprint,
                '检查信息': '; '.join(material.inconsistencies) if material.inconsistencies else '一致'
            }

            for file in file_columns:
                row[file] = material.quantity_data.get(file, 0)
            row['合计'] = material.total_quantity

            report_data.append(row)

        return pd.DataFrame(report_data)

    @staticmethod
    def _autosize_columns(sheet, max_width: int) -> None:
        """Fit each column of an openpyxl sheet to its longest cell text.

        The width is the longest ``str(cell.value)`` plus 2, capped at
        ``max_width``.
        """
        for column_cells in sheet.columns:
            longest = max((len(str(cell.value)) for cell in column_cells), default=0)
            letter = column_cells[0].column_letter
            sheet.column_dimensions[letter].width = min(longest + 2, max_width)

    def generate_consolidated_report(self, consolidator: "MaterialConsolidator",
                                     stats: Dict[str, Any]) -> Optional[str]:
        """Write the three-sheet report and return its path.

        Args:
            consolidator: Holder of the merged materials and per-file data.
            stats: Statistics dict; on success it additionally receives
                ``output_filename``.

        Returns:
            Path of the written workbook, or ``None`` when there are no
            materials or the workbook could not be saved.
        """
        if not consolidator.materials:
            return None

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_filename = f"BOM合并报告_{timestamp}.xlsx"
        output_path = os.path.join(self.output_folder, "BOM_Merge_out", output_filename)

        # Record the name before the summary sheet is built; the summary
        # reads it, and would otherwise always show an empty report file.
        stats['output_filename'] = output_filename

        try:
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                summary_df = self._create_summary_sheet(stats)
                summary_df.to_excel(writer, sheet_name='汇总信息', index=False)

                data_source_df = self._create_data_source_sheet(stats)
                data_source_df.to_excel(writer, sheet_name='数据源文件', index=False)

                merge_df = self._create_merge_sheet(consolidator)
                merge_df.to_excel(writer, sheet_name='BOM_Merge', index=False)

                workbook = writer.book

                # The summary sheet uses fixed widths for its two columns.
                summary_sheet = workbook['汇总信息']
                summary_sheet.column_dimensions['A'].width = 25
                summary_sheet.column_dimensions['B'].width = 40

                self._autosize_columns(workbook['数据源文件'], 30)
                self._autosize_columns(workbook['BOM_Merge'], 50)

            return output_path

        except Exception as e:
            # No report was produced, so do not leave its name behind.
            stats.pop('output_filename', None)
            print(f"保存报告失败: {e}")
            return None
bom_files = glob.glob(os.path.join(folder_path, "*.xlsx")) + self.total_files = len(bom_files) + + if not bom_files: + return False + + successful_files = 0 + for file_path in bom_files: + if self._process_single_file(file_path): + successful_files += 1 + + self.processed_files = successful_files + return successful_files > 0 + + def _process_single_file(self, file_path: str) -> bool: + """处理单个文件""" + filename = os.path.basename(file_path) + print(f"处理文件: {filename}...") + + result = self.file_parser.parse_file(file_path) + if not result: + print(f" ! 无法解析文件: {filename}") + return False + + bom_rows, file_info = result + + print(f" √ 文件{filename}找到 {len(bom_rows)} 行有效数据 (Sheet: {file_info.sheet_name})") + + # 添加文件处理信息 + self.material_consolidator.add_file_info(file_info) + + # 处理BOM行数据 + for bom_row in bom_rows: + self.material_consolidator.add_bom_row(bom_row) + self.processed_rows += 1 + + return True + + def generate_report(self) -> Optional[Dict[str, Any]]: + """生成报告并返回统计信息""" + if not self.report_generator: + return None + + # 获取基本统计信息 + base_stats = self.material_consolidator.get_statistics() + base_stats.update({ + 'processed_files': self.processed_files, + 'total_files': self.total_files, + 'processed_rows': self.processed_rows + }) + + # 生成报告 + output_path = self.report_generator.generate_consolidated_report( + self.material_consolidator, base_stats + ) + + if not output_path: + return None + + # 返回完整的统计信息 + base_stats['output_path'] = output_path + return base_stats + + +class UserInterface: + """用户界面处理器""" + + @staticmethod + def select_folder(title: str = "选择文件夹") -> str: + """选择文件夹""" + root = tk.Tk() + root.withdraw() + folder_path = filedialog.askdirectory(title=title) + root.destroy() + return folder_path + + @staticmethod + def print_summary(stats: Dict[str, Any], folder_path: str): + """打印汇总信息""" + print("\n" + "=" * 60) + print("BOM合并检查完成!") + print("=" * 60) + print(f"处理文件夹: {folder_path}") + print(f"扫描文件数: {stats['total_files']}") + 
def main():
    """Run the interactive BOM consolidation workflow.

    Asks for a folder, processes the BOM workbooks in it, writes the merged
    report into that folder and prints a summary.  Each stage that cannot
    continue prints a message and ends the run.
    """
    processor = BOMProcessor()

    chosen_dir = UserInterface.select_folder("选择包含BOM文件的文件夹")
    if not chosen_dir:
        print("未选择文件夹,程序退出")
        return

    # Reports are written next to the input files.
    processor.set_output_folder(chosen_dir)

    print(f"开始处理文件夹: {chosen_dir}")
    if not processor.process_folder(chosen_dir):
        print("没有找到可处理的BOM文件")
        return

    print("\n生成合并报告...")
    report_stats = processor.generate_report()
    if not report_stats:
        print("生成报告失败")
        return

    UserInterface.print_summary(report_stats, chosen_dir)
+ +[Learn more about creating GitLab projects.](https://docs.gitlab.com/ee/gitlab-basics/create-project.html) + +# 基于标准格式的 BOM文件,输出 BOM差异信息文件 +BOMCompereForJP.py + +# 基于标准格式的 BOM文件,输出 BOM的合并后的文件,方便校对和物料备料情况的分析。 +BOMConsolidator.py \ No newline at end of file diff --git a/FFT_IMU/.gitignore b/FFT_IMU/.gitignore new file mode 100644 index 0000000..f74fbf0 --- /dev/null +++ b/FFT_IMU/.gitignore @@ -0,0 +1,19 @@ +/build/* +/build +/dist/* +/dist +/source/* +/source +/dataProcess_out* +*.xls +*.xlsx +*.csv +*.spec + +/src + +/temp + +FFT_IMU_dc_html_v2.py + +FFT_IMU_dc_v2.py \ No newline at end of file diff --git a/FFT_IMU/FFT_IMU_dc_html_v1.py b/FFT_IMU/FFT_IMU_dc_html_v1.py new file mode 100644 index 0000000..bc4b711 --- /dev/null +++ b/FFT_IMU/FFT_IMU_dc_html_v1.py @@ -0,0 +1,739 @@ +import pandas as pd +import numpy as np +import matplotlib + +matplotlib.use('Agg') +import matplotlib.pyplot as plt +from scipy import signal +import os +import glob +from datetime import datetime +import time +from multiprocessing import Pool, cpu_count +from matplotlib.colors import Normalize +from matplotlib.ticker import MaxNLocator +import re +from colorama import Fore, Style, init +from concurrent.futures import ProcessPoolExecutor, as_completed +import warnings +import threading + +# 初始化colorama +init(autoreset=True) + +# 忽略特定的matplotlib警告 +warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib") +warnings.filterwarnings("ignore", category=FutureWarning, module="matplotlib") + +# 创建线程锁,确保文件操作和日志输出的线程安全 +file_lock = threading.Lock() +log_lock = threading.Lock() + + +class IMUDataAnalyzer: + def __init__(self, file_path): + self.file_path = file_path + self.data = None + self.sampling_rate = None + self.fig_size = (15, 10) + self.spectrogram_params = {} # 存储频谱图计算参数 + + # 从文件名推断数据类型和采样率 + file_name = os.path.basename(file_path).lower() + if 'calib' in file_name: + self.data_type = 'calib' + self.default_sampling_rate = 5 + elif 'raw' in file_name: + 
def log_progress(self, message, level="INFO"):
    """Print a timestamped, colour-coded log line.

    Args:
        message: Text to print.
        level: ``"INFO"``, ``"WARNING"``, ``"ERROR"`` or ``"SUCCESS"``;
            any other value prints the message without a level colour
            or prefix.

    The print happens while holding ``log_lock``.
    """
    # What follows the timestamp for each known level: colour, then prefix.
    decorations = (
        ("INFO", f"{Fore.GREEN}"),
        ("WARNING", f"{Fore.YELLOW}警告: "),
        ("ERROR", f"{Fore.RED}错误: "),
        ("SUCCESS", f"{Fore.GREEN}✓ "),
    )
    decoration = next((text for name, text in decorations if level == name), "")

    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with log_lock:
        print(f"{Fore.CYAN}[{stamp}] {decoration}{message}")
def detect_imu_columns(self):
    """Find the accelerometer, gyroscope and temperature columns.

    Column names are matched case-insensitively against ``imu<N>_...``.
    When several IMU prefixes are present, the one that appears first in
    the column order is used, so the choice is the same on every run.
    (Taking an element from a set gave an arbitrary prefix.)

    Sets ``self.acc_columns``, ``self.gyro_columns`` and
    ``self.temp_columns``.  When no ``imu<N>_`` column exists, the
    conventional ``imu1_*`` names are assumed and a warning is logged.
    When the chosen IMU has no temperature column, any column whose name
    contains ``temp`` or ``temperature`` is used instead.
    """
    all_columns = self.data.columns.tolist()
    # Lower-case each name once; str() keeps non-string labels from failing.
    lowered = {col: str(col).lower() for col in all_columns}

    prefix_pattern = re.compile(r'^(imu\d+)_')
    matches = (prefix_pattern.match(lowered[col]) for col in all_columns)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    imu_prefixes = list(dict.fromkeys(m.group(1) for m in matches if m))

    if not imu_prefixes:
        self.log_progress("未检测到IMU数据列,尝试使用默认列名", "WARNING")
        self.acc_columns = ['imu1_acc_x', 'imu1_acc_y', 'imu1_acc_z']
        self.gyro_columns = ['imu1_gyro_x', 'imu1_gyro_y', 'imu1_gyro_z']
        self.temp_columns = ['imu1_temp']
        return

    imu_prefix = imu_prefixes[0]
    self.log_progress(f"检测到IMU前缀: {imu_prefix}", "INFO")

    def axis_columns(sensor):
        """Return columns named '<prefix>_<sensor>...' that carry an axis tag."""
        stem = f"{imu_prefix}_{sensor}"
        return [col for col in all_columns
                if lowered[col].startswith(stem) and
                any(axis in lowered[col] for axis in ('_x', '_y', '_z'))]

    self.acc_columns = axis_columns("acc")
    self.gyro_columns = axis_columns("gyro")

    self.temp_columns = [col for col in all_columns
                         if lowered[col].startswith(f"{imu_prefix}_temp")]

    # Fall back to any temperature-like column.
    if not self.temp_columns:
        self.temp_columns = [col for col in all_columns
                             if any(name in lowered[col] for name in ['temp', 'temperature'])]

    self.log_progress(f"加速度计列: {self.acc_columns}", "INFO")
    self.log_progress(f"陀螺仪列: {self.gyro_columns}", "INFO")
    self.log_progress(f"温度列: {self.temp_columns}", "INFO")
def load_data(self):
    """Read the CSV file and prepare ``self.data`` for analysis.

    Steps: verify the file header mentions IMU data, read the CSV while
    holding ``file_lock``, detect the sensor columns, estimate the sampling
    rate, then replace the ``time`` column with a uniform axis
    ``index / sampling_rate``.  When the file has its own ``time`` column,
    rows whose timestamp is not strictly between 0 and 1e6 are dropped
    before the axis is rebuilt.

    Raises:
        ValueError: When the header check finds no IMU columns.
    """
    self.log_progress("开始加载数据...")
    started = time.time()

    if not self.check_imu_columns_in_file():
        raise ValueError("文件不包含IMU数据列,跳过处理")

    with file_lock:
        self.data = pd.read_csv(self.file_path)

    self.log_progress(f"数据加载完成,共 {len(self.data)} 行,耗时 {time.time() - started:.2f}秒")

    self.detect_imu_columns()
    self.sampling_rate = self.estimate_sampling_rate()

    if 'time' in self.data.columns:
        stamps = self.data['time']
        plausible = (stamps > 0) & (stamps < 1e6)
        self.data = self.data[plausible].copy()

    # With or without an original time column, the axis is regenerated
    # from the row index and the estimated rate.
    self.data['time'] = np.arange(len(self.data)) / self.sampling_rate
def process_signal(self, args):
    """Compute the log-scaled spectrogram of one signal for plotting.

    Args:
        args: ``(signal_data, axis)`` pair, packed in a tuple so the method
            can be used with ``map``-style executors.

    Returns:
        dict with ``f`` (frequency bins), ``t`` (time bins), ``Sxx_log``
        (power in dB, shaped ``(len(f), len(t))``), ``dc_log`` (the dB
        values of the bin closest to 0 Hz, one per time bin) and ``axis``.

    When there are more than 1000 time bins, the time axis is thinned to
    1000 evenly spaced bins, and the frequency axis to 500 bins if it has
    more than that.  An axis that is already short enough is left alone;
    resampling it would only repeat rows.
    """
    signal_data, axis = args
    f, t, Sxx = self.compute_spectrogram(signal_data)

    # Offset by machine epsilon so empty bins do not produce log10(0).
    eps = np.finfo(float).eps
    Sxx_log = 10 * np.log10(Sxx + eps)

    max_time_bins, max_freq_bins = 1000, 500
    if len(t) > max_time_bins:
        time_indices = np.linspace(0, len(t) - 1, max_time_bins, dtype=int)
        t = t[time_indices]
        Sxx_log = Sxx_log[:, time_indices]

        if len(f) > max_freq_bins:
            # linspace starts at index 0, so the lowest bin is always kept.
            freq_indices = np.linspace(0, len(f) - 1, max_freq_bins, dtype=int)
            f = f[freq_indices]
            Sxx_log = Sxx_log[freq_indices, :]

    # Prefer an exact 0 Hz bin; otherwise take the closest one.
    zero_bins = np.flatnonzero(np.isclose(f, 0.0))
    dc_idx = int(zero_bins[0]) if zero_bins.size else int(np.argmin(np.abs(f)))
    dc_log = Sxx_log[dc_idx, :]

    return {
        'f': f,
        't': t,
        'Sxx_log': Sxx_log,
        'dc_log': dc_log,
        'axis': axis
    }
self.data[col].mean(), + '标准差': self.data[col].std(), + '最大值': self.data[col].max(), + '最小值': self.data[col].min() + } for col in self.acc_columns} + if self.gyro_columns: + stats['陀螺仪'] = {col: { + '均值': self.data[col].mean(), + '标准差': self.data[col].std(), + '最大值': self.data[col].max(), + '最小值': self.data[col].min() + } for col in self.gyro_columns} + if self.temp_columns: + stats['温度'] = {col: { + '均值': self.data[col].mean(), + '标准差': self.data[col].std(), + '最大值': self.data[col].max(), + '最小值': self.data[col].min() + } for col in self.temp_columns} + return stats + + def generate_html_report(self, time_domain_stats): + """生成HTML报告""" + html_content = f""" + + + + + + IMU数据分析报告 - {os.path.basename(self.file_path)} + + + +

IMU数据分析报告

+

文件路径: {self.file_path}

+

分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

+

采样率: {self.sampling_rate} Hz

+ +

时域信号统计信息

+ """ + + # 添加时域统计信息 + for sensor_type, sensors in time_domain_stats.items(): + html_content += f"

{sensor_type}

" + html_content += "" + html_content += "" + for col, stats in sensors.items(): + html_content += f"" + html_content += "
传感器均值标准差最大值最小值
{col}{stats['均值']:.4f}{stats['标准差']:.4f}{stats['最大值']:.4f}{stats['最小值']:.4f}
" + + # 添加频域参数信息 + html_content += """ +

频域信号计算参数

+ + + """ + for key, value in self.spectrogram_params.items(): + html_content += f"" + html_content += "
参数
{key}{value}
" + + # 添加图像链接 + time_series_image = f'time_series_{self.timestamp}.png' + acc_spectrogram_image = f'acc_rainfall_spectrogram_{self.timestamp}.png' + gyro_spectrogram_image = f'gyro_rainfall_spectrogram_{self.timestamp}.png' + + html_content += f""" +

时域信号图

+ 时域信号图 + +

加速度计频谱雨点图

+ 加速度计频谱雨点图 + +

陀螺仪频谱雨点图

+ 陀螺仪频谱雨点图 + """ + + html_content += """ + + + """ + + # 保存HTML报告 + report_path = os.path.join(self.output_dir, f'report_{self.timestamp}.html') + with open(report_path, 'w', encoding='utf-8') as f: + f.write(html_content) + + self.log_progress(f"HTML报告已生成: {report_path}") + + def plot_time_series(self): + """绘制时间序列图""" + self.log_progress("开始绘制时间序列图...") + start_time = time.time() + + # 确定子图数量 + n_plots = 1 # 至少有一个加速度图 + if self.gyro_columns: # 如果有陀螺仪数据 + n_plots += 1 + if self.temp_columns: # 如果有温度数据 + n_plots += 1 + + fig, axes = plt.subplots(n_plots, 1, figsize=(12, 3 * n_plots), dpi=120) + if n_plots == 1: + axes = [axes] # 确保axes是列表 + + plot_idx = 0 + + # 加速度计数据 + if self.acc_columns: + ax = axes[plot_idx] + colors = ['#1f77b4', '#ff7f0e', '#2ca02c'] + labels = ['X', 'Y', 'Z'] + for i, col in enumerate(self.acc_columns): + if i < 3: # 只绘制前三个轴 + ax.plot(self.data['time'], self.data[col], + label=labels[i], color=colors[i], linewidth=1.0, alpha=0.8) + ax.set_title('加速度时间序列', fontsize=12) + ax.set_ylabel('加速度 (g)', fontsize=10) + ax.legend(loc='upper right', fontsize=8, framealpha=0.5) + ax.grid(True, linestyle=':', alpha=0.5) + ax.set_xlim(0, self.data['time'].max()) + plot_idx += 1 + + # 陀螺仪数据(如果有) + if self.gyro_columns and plot_idx < n_plots: + ax = axes[plot_idx] + colors = ['#1f77b4', '#ff7f0e', '#2ca02c'] + labels = ['X', 'Y', 'Z'] + for i, col in enumerate(self.gyro_columns): + if i < 3: # 只绘制前三个轴 + ax.plot(self.data['time'], self.data[col], + label=labels[i], color=colors[i], linewidth=1.0, alpha=0.8) + ax.set_title('陀螺仪时间序列', fontsize=12) + ax.set_ylabel('角速度 (deg/s)', fontsize=10) + ax.legend(loc='upper left', fontsize=8, framealpha=0.5) + ax.grid(True, linestyle=':', alpha=0.5) + ax.set_xlim(0, self.data['time'].max()) + plot_idx += 1 + + # 温度数据(如果有) + if self.temp_columns and plot_idx < n_plots: + ax = axes[plot_idx] + ax.plot(self.data['time'], self.data[self.temp_columns[0]], + label='温度', color='#9467bd', linewidth=1.0, alpha=0.8) + 
ax.set_title('温度时间序列', fontsize=12) + ax.set_xlabel('时间 (s)', fontsize=10) + ax.set_ylabel('温度 (°C)', fontsize=10) + ax.legend(loc='upper right', fontsize=8, framealpha=0.5) + ax.grid(True, linestyle=':', alpha=0.5) + ax.set_xlim(0, self.data['time'].max()) + + plt.tight_layout() + output_path = os.path.join(self.output_dir, f'time_series_{self.timestamp}.png') + plt.savefig(output_path, bbox_inches='tight', dpi=150) + plt.close(fig) + self.log_progress(f"时间序列图已保存: {output_path}") + self.log_progress(f"时间序列图已保存为 {output_path},耗时 {time.time() - start_time:.2f}秒") + + def plot_rainfall_spectrograms(self): + """并行绘制所有频谱雨点图(修复colorbar布局问题)""" + self.log_progress("开始并行绘制频谱雨点图...") + start_time = time.time() + + # 准备加速度计数据 + self.log_progress("准备加速度计数据...") + acc_signals = [(self.data[col], f'Acc {["X", "Y", "Z"][i]}') + for i, col in enumerate(self.acc_columns) if i < 3] # 只处理前三个轴 + + # 准备陀螺仪数据(如果有) + gyro_signals = [] + if self.gyro_columns: + self.log_progress("准备陀螺仪数据...") + gyro_signals = [(self.data[col], f'Gyro {["X", "Y", "Z"][i]}') + for i, col in enumerate(self.gyro_columns) if i < 3] # 只处理前三个轴 + + # 如果没有数据可处理,直接返回 + if not acc_signals and not gyro_signals: + self.log_progress("警告: 没有有效的数据列可供处理", "WARNING") + return + + # 使用多进程处理信号(避免线程冲突) + self.log_progress("使用多进程并行处理...") + all_signals = acc_signals + gyro_signals + with Pool(processes=min(len(all_signals), cpu_count())) as pool: + results = pool.map(self.process_signal, all_signals) + + # 分离结果 + self.log_progress("分离结果...") + acc_results = [r for r in results if r['axis'].startswith('Acc')] + gyro_results = [r for r in results if r['axis'].startswith('Gyro')] + + # 统一颜色标尺(5%-95%分位) + if acc_results: + self.log_progress("计算加速度计全局最小和最大值...") + acc_all_Sxx = np.concatenate([r['Sxx_log'].ravel() for r in acc_results]) + acc_vmin, acc_vmax = np.percentile(acc_all_Sxx, [5, 95]) + + # 统一 DC Y 轴范围 + acc_dc_ymin, acc_dc_ymax = self.robust_dc_ylim(acc_results) + self.log_progress(f"加速度 DC (dB) 范围: {acc_dc_ymin:.1f} 到 
{acc_dc_ymax:.1f}") + + if gyro_results: + self.log_progress("计算陀螺仪全局最小和最大值...") + gyro_all_Sxx = np.concatenate([r['Sxx_log'].ravel() for r in gyro_results]) + gyro_vmin, gyro_vmax = np.percentile(gyro_all_Sxx, [5, 95]) + + # 统一 DC Y 轴范围 + gyro_dc_ymin, gyro_dc_ymax = self.robust_dc_ylim(gyro_results) + self.log_progress(f"陀螺仪 DC (dB) 范围: {gyro_dc_ymin:.1f} 到 {gyro_dc_ymax:.1f}") + + # ========= 绘制加速度计频谱雨点图 ========= + if acc_results: + self._plot_single_spectrogram(acc_results, acc_vmin, acc_vmax, acc_dc_ymin, acc_dc_ymax, + '加速度', 'acc_rainfall_spectrogram') + self.log_progress(f"加速度功率谱密度范围: {acc_vmin:.1f} dB 到 {acc_vmax:.1f} dB") + + # ========= 绘制陀螺仪频谱雨点图 ========= + if gyro_results: + self._plot_single_spectrogram(gyro_results, gyro_vmin, gyro_vmax, gyro_dc_ymin, gyro_dc_ymax, + '角速度', 'gyro_rainfall_spectrogram') + self.log_progress(f"陀螺仪功率谱密度范围: {gyro_vmin:.1f} dB 到 {gyro_vmax:.1f} dB") + + total_time = time.time() - start_time + self.log_progress(f"频谱雨点图生成完成,总耗时 {total_time:.2f}秒") + + def _plot_single_spectrogram(self, results, vmin, vmax, dc_ymin, dc_ymax, title_prefix, filename_prefix): + """绘制单个频谱雨点图""" + rows = len(results) + fig = plt.figure(constrained_layout=True, figsize=(14, 4 * rows), dpi=150) + gs = fig.add_gridspec(nrows=rows, ncols=2, width_ratios=[22, 1], wspace=0.05, hspace=0.12) + + axes_main = [] + axes_cbar = [] + for i in range(rows): + axes_main.append(fig.add_subplot(gs[i, 0])) + axes_cbar.append(fig.add_subplot(gs[i, 1])) + + for i, result in enumerate(results): + ax = axes_main[i] + cax = axes_cbar[i] + + sc = ax.scatter( + np.repeat(result['t'], len(result['f'])), + np.tile(result['f'], len(result['t'])), + c=result['Sxx_log'].T.ravel(), + cmap='jet', + s=3, + alpha=0.7, + vmin=vmin, + vmax=vmax, + rasterized=True + ) + + ax.set_title(f'{title_prefix}频谱雨点图 - {result["axis"][-1]}(右侧为DC分量 dB)', fontsize=10) + ax.set_xlabel('时间 (s)', fontsize=9) + ax.set_ylabel('频率 (Hz)', fontsize=9) + ax.set_ylim(0, self.sampling_rate / 2) + 
def run_analysis(self):
    """Run the full pipeline for this file: load, plot, summarise, report.

    Returns:
        bool: True when every stage finished. False when a ``ValueError``
        was raised (logged as a skipped file) or when any other exception
        occurred (logged as an error, with the traceback printed).
    """
    try:
        self.log_progress("开始数据分析流程", "INFO")
        began = time.time()

        self.load_data()
        self.plot_time_series()
        self.plot_rainfall_spectrograms()

        # Time-domain statistics feed straight into the HTML report.
        self.generate_html_report(self.get_time_domain_stats())

        elapsed = time.time() - began
        self.log_progress(f"分析完成,总耗时 {elapsed:.2f}秒", "SUCCESS")
        self.log_progress(f"所有输出文件已保存到: {self.output_dir}", "INFO")
    except ValueError as skip_reason:
        # A ValueError is reported as a skipped file rather than an error.
        self.log_progress(f"跳过文件: {skip_reason}", "WARNING")
        return False
    except Exception as failure:
        self.log_progress(f"分析过程中出现错误: {failure}", "ERROR")
        import traceback
        traceback.print_exc()
        return False
    return True
多文件批量处理") + print("=" * 60) + + # 获取输入路径 + print(f"{Fore.WHITE}请输入包含CSV文件的目录路径: ") + input_path = input("> ").strip() + + if not os.path.exists(input_path): + print(f"{Fore.RED}错误: 路径 '{input_path}' 不存在!") + return + + # 查找所有包含imu的CSV文件(不区分大小写) + if os.path.isdir(input_path): + # 使用单个glob模式匹配所有文件,然后过滤包含imu的文件 + all_csv_files = glob.glob(os.path.join(input_path, "**", "*.csv"), recursive=True) + csv_files = [f for f in all_csv_files if re.search(r'imu', f, re.IGNORECASE)] + csv_files = list(set(csv_files)) # 去重 + csv_files.sort() + else: + # 对于单个文件,检查是否包含imu(不区分大小写) + if re.search(r'imu', input_path, re.IGNORECASE): + csv_files = [input_path] + else: + csv_files = [] + + if not csv_files: + print(f"{Fore.YELLOW}警告: 未找到包含'imu'的CSV文件") + return + + print(f"{Fore.GREEN}找到 {len(csv_files)} 个IMU数据文件:") + for i, file in enumerate(csv_files, 1): + print(f" {i}. {os.path.basename(file)}") + + # 使用多进程处理文件(避免matplotlib线程冲突) + print(f"\n{Fore.CYAN}开始多线程处理文件 (使用 {min(len(csv_files), cpu_count())} 个线程)...") + + success_count = 0 + skipped_count = 0 + failed_files = [] + + # 使用ProcessPoolExecutor而不是ThreadPoolExecutor + with ProcessPoolExecutor(max_workers=min(len(csv_files), cpu_count())) as executor: + # 提交所有任务 + future_to_file = {executor.submit(process_single_file, file): file for file in csv_files} + + # 处理完成的任务 + for future in as_completed(future_to_file): + file_path = future_to_file[future] + try: + result = future.result() + file_path, success, message = result + if success: + print(f"{Fore.GREEN}✓ 完成: {os.path.basename(file_path)}") + success_count += 1 + else: + if "跳过" in message: + print(f"{Fore.YELLOW}↷ 跳过: {os.path.basename(file_path)} - {message}") + skipped_count += 1 + else: + print(f"{Fore.RED}✗ 失败: {os.path.basename(file_path)} - {message}") + failed_files.append((file_path, message)) + except Exception as e: + print(f"{Fore.RED}✗ 异常: {os.path.basename(file_path)} - {str(e)}") + failed_files.append((file_path, str(e))) + + # 输出统计信息 + 
print(f"\n{Fore.CYAN}处理完成统计:") + print(f"{Fore.GREEN}成功: {success_count} 个文件") + print(f"{Fore.YELLOW}跳过: {skipped_count} 个文件(不包含IMU数据)") + print(f"{Fore.RED}失败: {len(failed_files)} 个文件") + + if failed_files: + print(f"\n{Fore.YELLOW}失败文件详情:") + for file, error in failed_files: + print(f" {os.path.basename(file)}: {error}") + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + print(f"\n{Fore.YELLOW}用户中断程序执行") + except Exception as e: + print(f"{Fore.RED}程序运行出错: {str(e)}") + import traceback + + traceback.print_exc() \ No newline at end of file diff --git a/FFT_IMU/FFT_IMU_dc_scan_v1.py b/FFT_IMU/FFT_IMU_dc_scan_v1.py new file mode 100644 index 0000000..26bfa6a --- /dev/null +++ b/FFT_IMU/FFT_IMU_dc_scan_v1.py @@ -0,0 +1,648 @@ +import pandas as pd +import numpy as np +import matplotlib + +matplotlib.use('Agg') +import matplotlib.pyplot as plt +from scipy import signal +import os +import glob +from datetime import datetime +import time +from multiprocessing import Pool, cpu_count +from matplotlib.colors import Normalize +from matplotlib.ticker import MaxNLocator +import re +from colorama import Fore, Style, init +from concurrent.futures import ProcessPoolExecutor, as_completed +import warnings +import threading + +# 初始化colorama +init(autoreset=True) + +# 忽略特定的matplotlib警告 +warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib") +warnings.filterwarnings("ignore", category=FutureWarning, module="matplotlib") + +# 创建线程锁,确保文件操作和日志输出的线程安全 +file_lock = threading.Lock() +log_lock = threading.Lock() + + +class IMUDataAnalyzer: + def __init__(self, file_path): + self.file_path = file_path + self.data = None + self.sampling_rate = None + self.fig_size = (15, 10) + + # 从文件名推断数据类型和采样率 + file_name = os.path.basename(file_path).lower() + if 'calib' in file_name: + self.data_type = 'calib' + self.default_sampling_rate = 5 + elif 'raw' in file_name: + self.data_type = 'raw' + self.default_sampling_rate = 1000 + else: + self.data_type = 
def log_progress(self, message, level="INFO"):
    """Print a timestamped, colour-coded log line while holding ``log_lock``.

    Args:
        message: text to print.
        level: ``"INFO"``, ``"WARNING"``, ``"ERROR"`` or ``"SUCCESS"``; any
            other value prints the message with no level colour or prefix.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Colour (plus optional textual prefix) placed in front of the message.
    decorations = {
        "INFO": f"{Fore.GREEN}",
        "WARNING": f"{Fore.YELLOW}警告: ",
        "ERROR": f"{Fore.RED}错误: ",
        "SUCCESS": f"{Fore.GREEN}✓ ",
    }
    lead = decorations.get(level, "")
    with log_lock:
        print(f"{Fore.CYAN}[{stamp}] {lead}{message}")
def detect_imu_columns(self):
    """Locate accelerometer, gyroscope and temperature columns in ``self.data``.

    Column names are matched case-insensitively against ``imu<N>_acc*``,
    ``imu<N>_gyro*`` (both must also contain ``_x``, ``_y`` or ``_z``) and
    ``imu<N>_temp*``, where ``imu<N>`` is the prefix of the first IMU column
    in column order. If no ``imu<N>_`` column exists, the default ``imu1_*``
    names are assigned without checking that they are present.

    Sets ``self.acc_columns``, ``self.gyro_columns`` and ``self.temp_columns``.
    """
    all_columns = self.data.columns.tolist()
    # str() guards against non-string column labels.
    named = [(col, str(col).lower()) for col in all_columns]

    # Collect "imuN" prefixes in column order. A list (rather than a set)
    # keeps the choice deterministic: set iteration order for strings varies
    # between interpreter runs, so "first detected" was effectively random
    # when several IMUs were present.
    imu_prefixes = []
    for _, lowered in named:
        match = re.match(r'^(imu\d+)_', lowered)
        if match and match.group(1) not in imu_prefixes:
            imu_prefixes.append(match.group(1))

    if not imu_prefixes:
        self.log_progress("未检测到IMU数据列,尝试使用默认列名", "WARNING")
        # Fall back to the conventional column names.
        self.acc_columns = ['imu1_acc_x', 'imu1_acc_y', 'imu1_acc_z']
        self.gyro_columns = ['imu1_gyro_x', 'imu1_gyro_y', 'imu1_gyro_z']
        self.temp_columns = ['imu1_temp']
        return

    # Use the first IMU prefix that appears in the file.
    imu_prefix = imu_prefixes[0]
    self.log_progress(f"检测到IMU前缀: {imu_prefix}", "INFO")

    axis_tags = ('_x', '_y', '_z')

    def axis_columns(kind):
        """Return the columns of *kind* ('acc' / 'gyro') that carry an axis tag."""
        stem = f"{imu_prefix}_{kind}"
        return [col for col, lowered in named
                if lowered.startswith(stem) and any(tag in lowered for tag in axis_tags)]

    self.acc_columns = axis_columns('acc')
    self.gyro_columns = axis_columns('gyro')

    self.temp_columns = [col for col, lowered in named
                         if lowered.startswith(f"{imu_prefix}_temp")]

    # No IMU-specific temperature column: accept any column mentioning
    # "temp" (which also covers "temperature").
    if not self.temp_columns:
        self.temp_columns = [col for col, lowered in named if 'temp' in lowered]

    self.log_progress(f"加速度计列: {self.acc_columns}", "INFO")
    self.log_progress(f"陀螺仪列: {self.gyro_columns}", "INFO")
    self.log_progress(f"温度列: {self.temp_columns}", "INFO")
def load_data(self):
    """Read the CSV into ``self.data`` and rebuild a uniform time axis.

    Steps: verify the header via ``check_imu_columns_in_file``, read the file
    with pandas, detect the IMU columns, estimate the sampling rate, then
    replace ``'time'`` with ``sample_index / sampling_rate``. When the file
    already has a ``'time'`` column, rows whose value is not strictly between
    0 and 1e6 are dropped first.

    Raises:
        ValueError: if the header check reports no IMU columns.
    """
    self.log_progress("开始加载数据...")
    began = time.time()

    # Bail out early when the header does not look like IMU data.
    has_imu_header = self.check_imu_columns_in_file()
    if not has_imu_header:
        raise ValueError("文件不包含IMU数据列,跳过处理")

    # The read is serialised through the module-level file lock.
    with file_lock:
        self.data = pd.read_csv(self.file_path)

    self.log_progress(f"数据加载完成,共 {len(self.data)} 行,耗时 {time.time() - began:.2f}秒")

    self.detect_imu_columns()
    self.sampling_rate = self.estimate_sampling_rate()

    if 'time' in self.data.columns:
        # Drop rows with out-of-range timestamps.
        stamps = self.data['time']
        keep = (stamps > 0) & (stamps < 1e6)
        self.data = self.data[keep].copy()

    # In both cases the time axis is regenerated from the sample index.
    self.data['time'] = np.arange(len(self.data)) / self.sampling_rate


def remove_dc(self, signal_data):
    """Return *signal_data* unchanged; the DC component is deliberately kept
    so that it shows up in the spectrum."""
    return signal_data
def process_signal(self, args):
    """Compute the dB spectrogram and the DC (0 Hz) trace of one signal.

    Worker function for ``Pool.map``.

    Args:
        args: ``(signal_data, axis)`` tuple. ``signal_data`` is forwarded to
            ``self.compute_spectrogram``; ``axis`` is a label returned
            unchanged.

    Returns:
        dict with keys ``'f'``, ``'t'``, ``'Sxx_log'`` (``10 * log10`` of the
        spectrogram, indexed ``[frequency, time]``), ``'dc_log'`` (the row of
        ``Sxx_log`` at, or closest to, 0 Hz) and ``'axis'``.
    """
    signal_data, axis = args
    f, t, Sxx = self.compute_spectrogram(signal_data)

    # Offset by machine epsilon so log10 never sees an exact zero.
    eps = np.finfo(float).eps
    Sxx_log = 10 * np.log10(Sxx + eps)

    # Thin out very long spectrograms to speed up plotting.
    if len(t) > 1000:
        # np.unique drops the repeated indices that integer-truncated
        # linspace produces when an axis has fewer points than requested
        # (e.g. fewer than 500 frequency bins); without it rows would be
        # duplicated instead of downsampled.
        time_indices = np.unique(np.linspace(0, len(t) - 1, 1000, dtype=int))
        freq_indices = np.unique(np.linspace(0, len(f) - 1, 500, dtype=int))
        t = t[time_indices]
        f = f[freq_indices]
        Sxx_log = Sxx_log[freq_indices, :][:, time_indices]

    # Pick the 0 Hz bin; fall back to the bin nearest 0 Hz if none is exact.
    # (The original computed this twice; the first result was always
    # overwritten.)
    zero_idx = np.where(np.isclose(f, 0.0))[0]
    dc_idx = int(zero_idx[0]) if len(zero_idx) > 0 else int(np.argmin(np.abs(f - 0.0)))
    dc_log = Sxx_log[dc_idx, :]  # one value per time window

    return {
        'f': f,
        't': t,
        'Sxx_log': Sxx_log,
        'dc_log': dc_log,
        'axis': axis
    }
np.concatenate([r['dc_log'].ravel() for r in results]) + dc_all = dc_all[np.isfinite(dc_all)] + if dc_all.size == 0: + return fallback + lo, hi = np.percentile(dc_all, [p_low, p_high]) + span = max(1e-9, hi - lo) + lo -= span * pad_ratio + hi += span * pad_ratio + return lo, hi + + def plot_time_series(self): + """绘制时间序列图""" + self.log_progress("开始绘制时间序列图...") + start_time = time.time() + + # 确定子图数量 + n_plots = 1 # 至少有一个加速度图 + if self.gyro_columns: # 如果有陀螺仪数据 + n_plots += 1 + if self.temp_columns: # 如果有温度数据 + n_plots += 1 + + fig, axes = plt.subplots(n_plots, 1, figsize=(12, 3 * n_plots), dpi=120) + if n_plots == 1: + axes = [axes] # 确保axes是列表 + + plot_idx = 0 + + # 加速度计数据 + if self.acc_columns: + ax = axes[plot_idx] + colors = ['#1f77b4', '#ff7f0e', '#2ca02c'] + labels = ['X', 'Y', 'Z'] + for i, col in enumerate(self.acc_columns): + if i < 3: # 只绘制前三个轴 + ax.plot(self.data['time'], self.data[col], + label=labels[i], color=colors[i], linewidth=1.0, alpha=0.8) + ax.set_title('加速度时间序列', fontsize=12) + ax.set_ylabel('加速度 (g)', fontsize=10) + ax.legend(loc='upper right', fontsize=8, framealpha=0.5) + ax.grid(True, linestyle=':', alpha=0.5) + ax.set_xlim(0, self.data['time'].max()) + plot_idx += 1 + + # 陀螺仪数据(如果有) + if self.gyro_columns and plot_idx < n_plots: + ax = axes[plot_idx] + colors = ['#1f77b4', '#ff7f0e', '#2ca02c'] + labels = ['X', 'Y', 'Z'] + for i, col in enumerate(self.gyro_columns): + if i < 3: # 只绘制前三个轴 + ax.plot(self.data['time'], self.data[col], + label=labels[i], color=colors[i], linewidth=1.0, alpha=0.8) + ax.set_title('陀螺仪时间序列', fontsize=12) + ax.set_ylabel('角速度 (deg/s)', fontsize=10) + ax.legend(loc='upper left', fontsize=8, framealpha=0.5) + ax.grid(True, linestyle=':', alpha=0.5) + ax.set_xlim(0, self.data['time'].max()) + plot_idx += 1 + + # 温度数据(如果有) + if self.temp_columns and plot_idx < n_plots: + ax = axes[plot_idx] + ax.plot(self.data['time'], self.data[self.temp_columns[0]], + label='温度', color='#9467bd', linewidth=1.0, alpha=0.8) + 
ax.set_title('温度时间序列', fontsize=12) + ax.set_xlabel('时间 (s)', fontsize=10) + ax.set_ylabel('温度 (°C)', fontsize=10) + ax.legend(loc='upper right', fontsize=8, framealpha=0.5) + ax.grid(True, linestyle=':', alpha=0.5) + ax.set_xlim(0, self.data['time'].max()) + + plt.tight_layout() + output_path = os.path.join(self.output_dir, f'time_series_{self.timestamp}.png') + plt.savefig(output_path, bbox_inches='tight', dpi=150) + plt.close(fig) + self.log_progress(f"时间序列图已保存: {output_path}") + self.log_progress(f"时间序列图已保存为 {output_path},耗时 {time.time() - start_time:.2f}秒") + + def plot_rainfall_spectrograms(self): + """并行绘制所有频谱雨点图(修复colorbar布局问题)""" + self.log_progress("开始并行绘制频谱雨点图...") + start_time = time.time() + + # 准备加速度计数据 + self.log_progress("准备加速度计数据...") + acc_signals = [(self.data[col], f'Acc {["X", "Y", "Z"][i]}') + for i, col in enumerate(self.acc_columns) if i < 3] # 只处理前三个轴 + + # 准备陀螺仪数据(如果有) + gyro_signals = [] + if self.gyro_columns: + self.log_progress("准备陀螺仪数据...") + gyro_signals = [(self.data[col], f'Gyro {["X", "Y", "Z"][i]}') + for i, col in enumerate(self.gyro_columns) if i < 3] # 只处理前三个轴 + + # 如果没有数据可处理,直接返回 + if not acc_signals and not gyro_signals: + self.log_progress("警告: 没有有效的数据列可供处理", "WARNING") + return + + # 使用多进程处理信号(避免线程冲突) + self.log_progress("使用多进程并行处理...") + all_signals = acc_signals + gyro_signals + with Pool(processes=min(len(all_signals), cpu_count())) as pool: + results = pool.map(self.process_signal, all_signals) + + # 分离结果 + self.log_progress("分离结果...") + acc_results = [r for r in results if r['axis'].startswith('Acc')] + gyro_results = [r for r in results if r['axis'].startswith('Gyro')] + + # 统一颜色标尺(5%-95%分位) + if acc_results: + self.log_progress("计算加速度计全局最小和最大值...") + acc_all_Sxx = np.concatenate([r['Sxx_log'].ravel() for r in acc_results]) + acc_vmin, acc_vmax = np.percentile(acc_all_Sxx, [5, 95]) + + # 统一 DC Y 轴范围 + acc_dc_ymin, acc_dc_ymax = self.robust_dc_ylim(acc_results) + self.log_progress(f"加速度 DC (dB) 范围: {acc_dc_ymin:.1f} 到 
{acc_dc_ymax:.1f}") + + if gyro_results: + self.log_progress("计算陀螺仪全局最小和最大值...") + gyro_all_Sxx = np.concatenate([r['Sxx_log'].ravel() for r in gyro_results]) + gyro_vmin, gyro_vmax = np.percentile(gyro_all_Sxx, [5, 95]) + + # 统一 DC Y 轴范围 + gyro_dc_ymin, gyro_dc_ymax = self.robust_dc_ylim(gyro_results) + self.log_progress(f"陀螺仪 DC (dB) 范围: {gyro_dc_ymin:.1f} 到 {gyro_dc_ymax:.1f}") + + # ========= 绘制加速度计频谱雨点图 ========= + if acc_results: + self._plot_single_spectrogram(acc_results, acc_vmin, acc_vmax, acc_dc_ymin, acc_dc_ymax, + '加速度', 'acc_rainfall_spectrogram') + self.log_progress(f"加速度功率谱密度范围: {acc_vmin:.1f} dB 到 {acc_vmax:.1f} dB") + + # ========= 绘制陀螺仪频谱雨点图 ========= + if gyro_results: + self._plot_single_spectrogram(gyro_results, gyro_vmin, gyro_vmax, gyro_dc_ymin, gyro_dc_ymax, + '角速度', 'gyro_rainfall_spectrogram') + self.log_progress(f"陀螺仪功率谱密度范围: {gyro_vmin:.1f} dB 到 {gyro_vmax:.1f} dB") + + total_time = time.time() - start_time + self.log_progress(f"频谱雨点图生成完成,总耗时 {total_time:.2f}秒") + + def _plot_single_spectrogram(self, results, vmin, vmax, dc_ymin, dc_ymax, title_prefix, filename_prefix): + """绘制单个频谱雨点图""" + rows = len(results) + fig = plt.figure(constrained_layout=True, figsize=(14, 4 * rows), dpi=150) + gs = fig.add_gridspec(nrows=rows, ncols=2, width_ratios=[22, 1], wspace=0.05, hspace=0.12) + + axes_main = [] + axes_cbar = [] + for i in range(rows): + axes_main.append(fig.add_subplot(gs[i, 0])) + axes_cbar.append(fig.add_subplot(gs[i, 1])) + + for i, result in enumerate(results): + ax = axes_main[i] + cax = axes_cbar[i] + + sc = ax.scatter( + np.repeat(result['t'], len(result['f'])), + np.tile(result['f'], len(result['t'])), + c=result['Sxx_log'].T.ravel(), + cmap='jet', + s=3, + alpha=0.7, + vmin=vmin, + vmax=vmax, + rasterized=True + ) + + ax.set_title(f'{title_prefix}频谱雨点图 - {result["axis"][-1]}(右侧为DC分量 dB)', fontsize=10) + ax.set_xlabel('时间 (s)', fontsize=9) + ax.set_ylabel('频率 (Hz)', fontsize=9) + ax.set_ylim(0, self.sampling_rate / 2) + 
def run_analysis(self):
    """Run the full pipeline for this file: load the data, then plot it.

    Returns:
        bool: True when every stage finished. False when a ``ValueError``
        was raised (logged as a skipped file) or when any other exception
        occurred (logged as an error, with the traceback printed).
    """
    try:
        self.log_progress("开始数据分析流程", "INFO")
        began = time.time()

        self.load_data()
        self.plot_time_series()
        self.plot_rainfall_spectrograms()

        elapsed = time.time() - began
        self.log_progress(f"分析完成,总耗时 {elapsed:.2f}秒", "SUCCESS")
        self.log_progress(f"所有输出文件已保存到: {self.output_dir}", "INFO")
    except ValueError as skip_reason:
        # A ValueError is reported as a skipped file rather than an error.
        self.log_progress(f"跳过文件: {skip_reason}", "WARNING")
        return False
    except Exception as failure:
        self.log_progress(f"分析过程中出现错误: {failure}", "ERROR")
        import traceback
        traceback.print_exc()
        return False
    return True
os.path.exists(input_path): + print(f"{Fore.RED}错误: 路径 '{input_path}' 不存在!") + return + + # 查找所有包含imu的CSV文件(不区分大小写) + if os.path.isdir(input_path): + # 使用单个glob模式匹配所有文件,然后过滤包含imu的文件 + all_csv_files = glob.glob(os.path.join(input_path, "**", "*.csv"), recursive=True) + csv_files = [f for f in all_csv_files if re.search(r'imu', f, re.IGNORECASE)] + csv_files = list(set(csv_files)) # 去重 + csv_files.sort() + else: + # 对于单个文件,检查是否包含imu(不区分大小写) + if re.search(r'imu', input_path, re.IGNORECASE): + csv_files = [input_path] + else: + csv_files = [] + + if not csv_files: + print(f"{Fore.YELLOW}警告: 未找到包含'imu'的CSV文件") + return + + print(f"{Fore.GREEN}找到 {len(csv_files)} 个IMU数据文件:") + for i, file in enumerate(csv_files, 1): + print(f" {i}. {os.path.basename(file)}") + + # 使用多进程处理文件(避免matplotlib线程冲突) + print(f"\n{Fore.CYAN}开始多线程处理文件 (使用 {min(len(csv_files), cpu_count())} 个线程)...") + + success_count = 0 + skipped_count = 0 + failed_files = [] + + # 使用ProcessPoolExecutor而不是ThreadPoolExecutor + with ProcessPoolExecutor(max_workers=min(len(csv_files), cpu_count())) as executor: + # 提交所有任务 + future_to_file = {executor.submit(process_single_file, file): file for file in csv_files} + + # 处理完成的任务 + for future in as_completed(future_to_file): + file_path = future_to_file[future] + try: + result = future.result() + file_path, success, message = result + if success: + print(f"{Fore.GREEN}✓ 完成: {os.path.basename(file_path)}") + success_count += 1 + else: + if "跳过" in message: + print(f"{Fore.YELLOW}↷ 跳过: {os.path.basename(file_path)} - {message}") + skipped_count += 1 + else: + print(f"{Fore.RED}✗ 失败: {os.path.basename(file_path)} - {message}") + failed_files.append((file_path, message)) + except Exception as e: + print(f"{Fore.RED}✗ 异常: {os.path.basename(file_path)} - {str(e)}") + failed_files.append((file_path, str(e))) + + # 输出统计信息 + print(f"\n{Fore.CYAN}处理完成统计:") + print(f"{Fore.GREEN}成功: {success_count} 个文件") + print(f"{Fore.YELLOW}跳过: {skipped_count} 个文件(不包含IMU数据)") + 
print(f"{Fore.RED}失败: {len(failed_files)} 个文件") + + if failed_files: + print(f"\n{Fore.YELLOW}失败文件详情:") + for file, error in failed_files: + print(f" {os.path.basename(file)}: {error}") + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + print(f"\n{Fore.YELLOW}用户中断程序执行") + except Exception as e: + print(f"{Fore.RED}程序运行出错: {str(e)}") + import traceback + + traceback.print_exc() \ No newline at end of file diff --git a/FFT_IMU/FFT_IMU_dc_v1.py b/FFT_IMU/FFT_IMU_dc_v1.py new file mode 100644 index 0000000..26bfa6a --- /dev/null +++ b/FFT_IMU/FFT_IMU_dc_v1.py @@ -0,0 +1,648 @@ +import pandas as pd +import numpy as np +import matplotlib + +matplotlib.use('Agg') +import matplotlib.pyplot as plt +from scipy import signal +import os +import glob +from datetime import datetime +import time +from multiprocessing import Pool, cpu_count +from matplotlib.colors import Normalize +from matplotlib.ticker import MaxNLocator +import re +from colorama import Fore, Style, init +from concurrent.futures import ProcessPoolExecutor, as_completed +import warnings +import threading + +# 初始化colorama +init(autoreset=True) + +# 忽略特定的matplotlib警告 +warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib") +warnings.filterwarnings("ignore", category=FutureWarning, module="matplotlib") + +# 创建线程锁,确保文件操作和日志输出的线程安全 +file_lock = threading.Lock() +log_lock = threading.Lock() + + +class IMUDataAnalyzer: + def __init__(self, file_path): + self.file_path = file_path + self.data = None + self.sampling_rate = None + self.fig_size = (15, 10) + + # 从文件名推断数据类型和采样率 + file_name = os.path.basename(file_path).lower() + if 'calib' in file_name: + self.data_type = 'calib' + self.default_sampling_rate = 5 + elif 'raw' in file_name: + self.data_type = 'raw' + self.default_sampling_rate = 1000 + else: + self.data_type = 'unknown' + self.default_sampling_rate = 5 + + # 解析文件路径和文件名 + file_dir = os.path.dirname(os.path.abspath(file_path)) + file_base_name = 
def log_progress(self, message, level="INFO"):
    """Print a timestamped, colour-coded log line while holding ``log_lock``.

    Args:
        message: Text to print.
        level: One of "INFO", "WARNING", "ERROR" or "SUCCESS". Any other
            value prints the message after the timestamp with no extra
            colour or prefix.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Colour plus textual prefix placed between the timestamp and the message.
    prefixes = {
        "INFO": f"{Fore.GREEN}",
        "WARNING": f"{Fore.YELLOW}警告: ",
        "ERROR": f"{Fore.RED}错误: ",
        "SUCCESS": f"{Fore.GREEN}✓ ",
    }
    line = f"{Fore.CYAN}[{stamp}] {prefixes.get(level, '')}{message}"
    with log_lock:
        print(line)
def detect_imu_columns(self):
    """Locate the accelerometer, gyroscope and temperature columns of one IMU.

    Column labels are matched case-insensitively against ``imu<N>_...``.
    When several IMU prefixes are present, the one that appears first in
    column order is used. The result is stored in ``self.acc_columns``,
    ``self.gyro_columns`` and ``self.temp_columns``.

    If no ``imu<N>_`` prefix is found, the three attributes fall back to the
    conventional ``imu1_*`` names without checking that they exist.
    """
    all_columns = self.data.columns.tolist()
    # str() keeps non-string labels (e.g. integer headers) from crashing .lower().
    lowered = [str(col).lower() for col in all_columns]

    # Keep prefixes in column order. The previous set-based version picked an
    # arbitrary prefix, because set iteration order of strings varies per run.
    imu_prefixes = []
    for name in lowered:
        match = re.match(r'^(imu\d+)_', name)
        if match and match.group(1) not in imu_prefixes:
            imu_prefixes.append(match.group(1))

    if not imu_prefixes:
        self.log_progress("未检测到IMU数据列,尝试使用默认列名", "WARNING")
        self.acc_columns = ['imu1_acc_x', 'imu1_acc_y', 'imu1_acc_z']
        self.gyro_columns = ['imu1_gyro_x', 'imu1_gyro_y', 'imu1_gyro_z']
        self.temp_columns = ['imu1_temp']
        return

    imu_prefix = imu_prefixes[0]
    self.log_progress(f"检测到IMU前缀: {imu_prefix}", "INFO")

    def _has_axis(name):
        return any(axis in name for axis in ('_x', '_y', '_z'))

    self.acc_columns = [col for col, name in zip(all_columns, lowered)
                        if name.startswith(f"{imu_prefix}_acc") and _has_axis(name)]
    self.gyro_columns = [col for col, name in zip(all_columns, lowered)
                         if name.startswith(f"{imu_prefix}_gyro") and _has_axis(name)]
    self.temp_columns = [col for col, name in zip(all_columns, lowered)
                         if name.startswith(f"{imu_prefix}_temp")]

    # Fall back to any column mentioning a temperature when the chosen IMU
    # has no dedicated temperature column.
    if not self.temp_columns:
        self.temp_columns = [col for col, name in zip(all_columns, lowered)
                             if 'temp' in name]

    self.log_progress(f"加速度计列: {self.acc_columns}", "INFO")
    self.log_progress(f"陀螺仪列: {self.gyro_columns}", "INFO")
    self.log_progress(f"温度列: {self.temp_columns}", "INFO")
def load_data(self):
    """Read the CSV, detect the IMU columns and rebuild a uniform time axis.

    Raises:
        ValueError: if the header check ``check_imu_columns_in_file`` fails.
    """
    self.log_progress("开始加载数据...")
    t0 = time.time()

    # Cheap header check first, so files without IMU data are never fully parsed.
    if not self.check_imu_columns_in_file():
        raise ValueError("文件不包含IMU数据列,跳过处理")

    with file_lock:
        self.data = pd.read_csv(self.file_path)

    elapsed = time.time() - t0
    self.log_progress(f"数据加载完成,共 {len(self.data)} 行,耗时 {elapsed:.2f}秒")

    self.detect_imu_columns()
    self.sampling_rate = self.estimate_sampling_rate()

    # Drop rows whose recorded time lies outside (0, 1e6) when a time column exists.
    if 'time' in self.data.columns:
        recorded = self.data['time']
        keep = (recorded > 0) & (recorded < 1e6)
        self.data = self.data[keep].copy()

    # In both cases the time axis is replaced by sample index / sampling rate.
    self.data['time'] = np.arange(len(self.data)) / self.sampling_rate
def process_signal(self, args):
    """Compute the dB spectrogram of one signal and extract its DC trace.

    Args:
        args: ``(signal_data, axis)`` tuple; ``axis`` is a label that is
            returned unchanged.

    Returns:
        dict with keys ``f`` (frequencies), ``t`` (window times),
        ``Sxx_log`` (10*log10 of the PSD, indexed [frequency, time]),
        ``dc_log`` (the row of ``Sxx_log`` closest to 0 Hz) and ``axis``.
    """
    signal_data, axis = args
    f, t, Sxx = self.compute_spectrogram(signal_data)

    # eps keeps log10 finite for bins whose power is exactly zero.
    Sxx_log = 10 * np.log10(Sxx + np.finfo(float).eps)

    # Thin out very long spectrograms to speed up plotting.
    if len(t) > 1000:
        time_indices = np.linspace(0, len(t) - 1, 1000, dtype=int)
        # Only subsample frequencies when there are more than 500 bins.
        # Asking linspace for 500 indices out of fewer bins repeats rows,
        # which duplicates plotted points and skews percentile colour limits.
        if len(f) > 500:
            freq_indices = np.linspace(0, len(f) - 1, 500, dtype=int)
        else:
            freq_indices = np.arange(len(f))
        t = t[time_indices]
        f = f[freq_indices]
        Sxx_log = Sxx_log[np.ix_(freq_indices, time_indices)]

    # Prefer an exact 0 Hz bin; otherwise take the bin closest to 0 Hz.
    zero_idx = np.flatnonzero(np.isclose(f, 0.0))
    dc_idx = int(zero_idx[0]) if zero_idx.size else int(np.argmin(np.abs(f)))
    dc_log = Sxx_log[dc_idx, :]

    return {
        'f': f,
        't': t,
        'Sxx_log': Sxx_log,
        'dc_log': dc_log,
        'axis': axis
    }
def plot_time_series(self):
    """Plot acceleration, angular-rate and temperature traces against time.

    One stacked panel is drawn for each sensor group that has columns
    (``self.acc_columns``, ``self.gyro_columns``, ``self.temp_columns``).
    At most the first three accelerometer and gyroscope columns are drawn
    and labelled X, Y, Z in column order; only the first temperature column
    is drawn. The figure is saved as
    ``<output_dir>/time_series_<timestamp>.png``. Nothing is written when no
    group has any column.
    """
    self.log_progress("开始绘制时间序列图...")
    start_time = time.time()

    axis_labels = ['X', 'Y', 'Z']
    axis_colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

    # Each panel: (title, y label, legend location, [(column, label, colour)]).
    # zip() truncates to three traces per panel.
    panels = []
    if self.acc_columns:
        panels.append(('加速度时间序列', '加速度 (g)', 'upper right',
                       list(zip(self.acc_columns, axis_labels, axis_colors))))
    if self.gyro_columns:
        panels.append(('陀螺仪时间序列', '角速度 (deg/s)', 'upper left',
                       list(zip(self.gyro_columns, axis_labels, axis_colors))))
    if self.temp_columns:
        panels.append(('温度时间序列', '温度 (°C)', 'upper right',
                       [(self.temp_columns[0], '温度', '#9467bd')]))

    # Previously an accelerometer panel was always reserved, which left an
    # empty axis in the image when accelerometer columns were missing.
    if not panels:
        self.log_progress("没有可绘制的时间序列数据列", "WARNING")
        return

    # squeeze=False always yields a 2-D axes array, even for one panel.
    fig, axes = plt.subplots(len(panels), 1, figsize=(12, 3 * len(panels)),
                             dpi=120, squeeze=False)
    time_values = self.data['time']
    t_max = time_values.max()

    for ax, (title, ylabel, legend_loc, traces) in zip(axes[:, 0], panels):
        for column, label, color in traces:
            ax.plot(time_values, self.data[column],
                    label=label, color=color, linewidth=1.0, alpha=0.8)
        ax.set_title(title, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=10)
        ax.legend(loc=legend_loc, fontsize=8, framealpha=0.5)
        ax.grid(True, linestyle=':', alpha=0.5)
        ax.set_xlim(0, t_max)

    # Label the shared time axis on the bottom panel, whichever group it shows.
    axes[-1, 0].set_xlabel('时间 (s)', fontsize=10)

    fig.tight_layout()
    output_path = os.path.join(self.output_dir, f'time_series_{self.timestamp}.png')
    fig.savefig(output_path, bbox_inches='tight', dpi=150)
    plt.close(fig)
    self.log_progress(f"时间序列图已保存为 {output_path},耗时 {time.time() - start_time:.2f}秒")
gyro_results]) + gyro_vmin, gyro_vmax = np.percentile(gyro_all_Sxx, [5, 95]) + + # 统一 DC Y 轴范围 + gyro_dc_ymin, gyro_dc_ymax = self.robust_dc_ylim(gyro_results) + self.log_progress(f"陀螺仪 DC (dB) 范围: {gyro_dc_ymin:.1f} 到 {gyro_dc_ymax:.1f}") + + # ========= 绘制加速度计频谱雨点图 ========= + if acc_results: + self._plot_single_spectrogram(acc_results, acc_vmin, acc_vmax, acc_dc_ymin, acc_dc_ymax, + '加速度', 'acc_rainfall_spectrogram') + self.log_progress(f"加速度功率谱密度范围: {acc_vmin:.1f} dB 到 {acc_vmax:.1f} dB") + + # ========= 绘制陀螺仪频谱雨点图 ========= + if gyro_results: + self._plot_single_spectrogram(gyro_results, gyro_vmin, gyro_vmax, gyro_dc_ymin, gyro_dc_ymax, + '角速度', 'gyro_rainfall_spectrogram') + self.log_progress(f"陀螺仪功率谱密度范围: {gyro_vmin:.1f} dB 到 {gyro_vmax:.1f} dB") + + total_time = time.time() - start_time + self.log_progress(f"频谱雨点图生成完成,总耗时 {total_time:.2f}秒") + + def _plot_single_spectrogram(self, results, vmin, vmax, dc_ymin, dc_ymax, title_prefix, filename_prefix): + """绘制单个频谱雨点图""" + rows = len(results) + fig = plt.figure(constrained_layout=True, figsize=(14, 4 * rows), dpi=150) + gs = fig.add_gridspec(nrows=rows, ncols=2, width_ratios=[22, 1], wspace=0.05, hspace=0.12) + + axes_main = [] + axes_cbar = [] + for i in range(rows): + axes_main.append(fig.add_subplot(gs[i, 0])) + axes_cbar.append(fig.add_subplot(gs[i, 1])) + + for i, result in enumerate(results): + ax = axes_main[i] + cax = axes_cbar[i] + + sc = ax.scatter( + np.repeat(result['t'], len(result['f'])), + np.tile(result['f'], len(result['t'])), + c=result['Sxx_log'].T.ravel(), + cmap='jet', + s=3, + alpha=0.7, + vmin=vmin, + vmax=vmax, + rasterized=True + ) + + ax.set_title(f'{title_prefix}频谱雨点图 - {result["axis"][-1]}(右侧为DC分量 dB)', fontsize=10) + ax.set_xlabel('时间 (s)', fontsize=9) + ax.set_ylabel('频率 (Hz)', fontsize=9) + ax.set_ylim(0, self.sampling_rate / 2) + ax.grid(True, linestyle=':', alpha=0.4) + + ax2 = ax.twinx() + ax2.plot(result['t'], result['dc_log'], color='black', linewidth=1.2, alpha=0.85, 
def process_single_file(file_path):
    """Analyse one CSV file; intended to run inside a worker process.

    Args:
        file_path: Path of the CSV file to analyse.

    Returns:
        tuple ``(file_path, success, message)``. ``message`` contains "跳过"
        only when the file header shows no IMU data, so the caller can tell
        a skipped file from a failed analysis.
    """
    try:
        print(f"{Fore.BLUE}开始处理文件: {os.path.basename(file_path)}")
        analyzer = IMUDataAnalyzer(file_path)

        # Decide "skipped" here, explicitly. run_analysis() returns False for
        # both skipped and failed files, so every failure used to be
        # reported as a skip and never reached the failure list.
        if not analyzer.check_imu_columns_in_file():
            return (file_path, False, "文件不包含IMU数据,已跳过")

        if analyzer.run_analysis():
            return (file_path, True, "处理成功")
        return (file_path, False, "分析失败,详见上方日志")
    except Exception as e:
        # Worker boundary: report the error instead of letting it propagate.
        return (file_path, False, str(e))
+ # 使用单个glob模式匹配所有文件,然后过滤包含imu的文件 + all_csv_files = glob.glob(os.path.join(input_path, "**", "*.csv"), recursive=True) + csv_files = [f for f in all_csv_files if re.search(r'imu', f, re.IGNORECASE)] + csv_files = list(set(csv_files)) # 去重 + csv_files.sort() + else: + # 对于单个文件,检查是否包含imu(不区分大小写) + if re.search(r'imu', input_path, re.IGNORECASE): + csv_files = [input_path] + else: + csv_files = [] + + if not csv_files: + print(f"{Fore.YELLOW}警告: 未找到包含'imu'的CSV文件") + return + + print(f"{Fore.GREEN}找到 {len(csv_files)} 个IMU数据文件:") + for i, file in enumerate(csv_files, 1): + print(f" {i}. {os.path.basename(file)}") + + # 使用多进程处理文件(避免matplotlib线程冲突) + print(f"\n{Fore.CYAN}开始多线程处理文件 (使用 {min(len(csv_files), cpu_count())} 个线程)...") + + success_count = 0 + skipped_count = 0 + failed_files = [] + + # 使用ProcessPoolExecutor而不是ThreadPoolExecutor + with ProcessPoolExecutor(max_workers=min(len(csv_files), cpu_count())) as executor: + # 提交所有任务 + future_to_file = {executor.submit(process_single_file, file): file for file in csv_files} + + # 处理完成的任务 + for future in as_completed(future_to_file): + file_path = future_to_file[future] + try: + result = future.result() + file_path, success, message = result + if success: + print(f"{Fore.GREEN}✓ 完成: {os.path.basename(file_path)}") + success_count += 1 + else: + if "跳过" in message: + print(f"{Fore.YELLOW}↷ 跳过: {os.path.basename(file_path)} - {message}") + skipped_count += 1 + else: + print(f"{Fore.RED}✗ 失败: {os.path.basename(file_path)} - {message}") + failed_files.append((file_path, message)) + except Exception as e: + print(f"{Fore.RED}✗ 异常: {os.path.basename(file_path)} - {str(e)}") + failed_files.append((file_path, str(e))) + + # 输出统计信息 + print(f"\n{Fore.CYAN}处理完成统计:") + print(f"{Fore.GREEN}成功: {success_count} 个文件") + print(f"{Fore.YELLOW}跳过: {skipped_count} 个文件(不包含IMU数据)") + print(f"{Fore.RED}失败: {len(failed_files)} 个文件") + + if failed_files: + print(f"\n{Fore.YELLOW}失败文件详情:") + for file, error in failed_files: + print(f" 
def generate_sign(system_id, request_info, request_time, secret_key):
    """Return the MD5 hex signature of the request fields.

    The fields ``requestInfo``, ``requestTime`` and ``systemId`` are joined
    as ``key=value`` pairs in ascending key order with ``&``, the secret key
    is appended, and the result is hashed with MD5.
    """
    fields = {
        'systemId': system_id,
        'requestTime': request_time,
        'requestInfo': request_info,
    }
    query = '&'.join(f"{key}={fields[key]}" for key in sorted(fields))
    digest = hashlib.md5()
    digest.update((query + secret_key).encode())
    return digest.hexdigest()
def test_navs_interface():
    """POST the pre-recorded NAVS upload request to the pre-release gateway.

    The request parameters, including the signature, are carried in the URL
    query string. The session cookie is read from the ``NAV_GATEWAY_COOKIE``
    environment variable, which must hold the full ``Cookie`` header value
    (e.g. ``cookies=...``). When the variable is unset, no cookie is sent.

    Returns:
        bool: True when the gateway answers HTTP 200, False otherwise or
        when the request itself fails.
    """
    import os  # local import: this script does not import os at file level

    # NOTE(review): this URL still embeds appSecret and a fixed sign; move
    # them to configuration as well once the replay case is no longer needed.
    url = "https://flow-gateway.pre.aeroht.com/server/oem/nav/infoUpload?requestInfo=%7B%22iccid%22:%22navp345678112300001%22,%22partsNo%22:%22parts111%22,%22hVer%22:%22hVer_7d98d056c96e22222%22,%22sVer%22:%22sVer_b38651e22222%22,%22soc%22:%22111%22,%22network%22:%222222%22%7D&systemId=diufactory&requestTime=2026-01-14%2017:19:58&sign=f480924ff291e0f98a4fb9fdd0167a3e&appSecret=aqwec3be422c22a752c22"

    payload = {}
    headers = {
        'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
        'Content-Type': 'application/json'
    }

    # A session cookie is a credential and must not be committed to source.
    cookie = os.environ.get("NAV_GATEWAY_COOKIE")
    if cookie:
        headers['Cookie'] = cookie

    try:
        # The timeout matches test_navp_interface, so a stalled gateway
        # cannot hang the script indefinitely.
        response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
    except requests.RequestException as e:
        print(f"NAVS接口请求失败: {e}")
        return False

    print(response.text)
    return response.status_code == 200
engine='openpyxl') + except Exception as e: + print("提示: 未找到包含'LINK' sheet,请检查文件内容。") + return False + + # 确保有 PartNumber 列(兼容 LinkObject) + if 'PartNumber' not in self.df.columns and 'LinkObject' in self.df.columns: + self.df['PartNumber'] = self.df['LinkObject'] + + # 检查必要的列是否存在 + required_cols = ["PartNumber", "ChildSN", "linkDate"] + missing = [c for c in required_cols if c not in self.df.columns] + if missing: + raise ValueError(f"数据表中缺少必要列: {', '.join(missing)}") + + # 解析 linkDate 为 datetime(支持 AM/PM) + # 注:pd.to_datetime 能解析大多数常见格式,包括 "5/24/2025 6:00:13 PM" + # 如果有极端异构格式,可在这里加更精细的清洗逻辑 + # errors='coerce' 会把无法解析的值变为 NaT + # self.df['linkDate'] = pd.to_datetime(self.df['linkDate'], errors='coerce') + + self.df['linkDate'] = pd.to_datetime( + self.df['linkDate'], + format='%m/%d/%Y %I:%M:%S %p', # 月/日/年 12小时制+AM/PM + errors='coerce' + ) + + # 提示解析情况 + total = len(self.df) + invalid = int(self.df['linkDate'].isna().sum()) + print(f"文件加载成功,总行数: {total},日期解析失败: {invalid} 行") + + # 添加备注列 + if '备注' not in self.df.columns: + self.df['备注'] = '' + + return True + except Exception as e: + print(f"加载文件失败: {str(e)}") + return False + + def create_output_folder(self): + """准备输出目录和文件名""" + + # 先去除扩展名,再截取前10个字符 + # base_name = os.path.splitext(os.path.basename(self.file_path))[0] + original_name = os.path.splitext(os.path.basename(self.file_path))[0] + + # base_name = original_name[:10] + base_name = original_name[:20] + + output_folder_name = f"{base_name} output_{self.timestamp}" + + # self.output_folder = os.path.join(os.path.dirname(self.file_path), output_folder_name) + self.output_folder = os.path.dirname(self.file_path) + + self.output_file = os.path.join(self.output_folder, f"{original_name}_split_by_PartNumber_{self.timestamp}.xlsx") + + if not os.path.exists(self.output_folder): + os.makedirs(self.output_folder) + print(f"已创建输出文件夹: {self.output_folder}") + + + def _safe_sheet_name(self, name): + """清理为合法的 Excel sheet 名称(<=31字符,无非法字符)""" + # 转为字符串 + s = str(name) + 
# 替换非法字符:: \ / ? * [ ] + s = re.sub(r'[:\\/\?\*\[\]]', '_', s) + # 去除首尾空格 + s = s.strip() + # 截断到 31 个字符 + if len(s) > 31: + s = s[:31] + # 空名兜底 + if not s: + s = 'Sheet' + return s + + def process_data(self): + """处理数据并拆分到不同sheet""" + if self.df is None: + raise ValueError("数据未加载,请先调用 load_data() 方法") + + # 确保有PartNumber列 + if 'PartNumber' not in self.df.columns: + if 'LinkObject' in self.df.columns: + self.df['PartNumber'] = self.df['LinkObject'] + else: + raise ValueError("数据表中既没有PartNumber也没有LinkObject列") + + # 添加备注列 + self.df['备注'] = '' + + # 按 PartNumber 分组 + grouped = self.df.groupby('PartNumber', dropna=False) + total_groups = len(grouped) + print(f"开始处理数据,共 {total_groups} 个分组...") + + # 使用上下文管理器,自动保存关闭 + # print(f"输出文件信息,self.output_folder:{self.output_folder}") + print(f"输出文件信息,self.output_file:{self.output_file}") + # output_path = os.path.join(self.output_folder, self.output_file) + output_path = self.output_file + + writer = pd.ExcelWriter(output_path, engine='openpyxl') + + for i, (name, group) in enumerate(grouped): + print(f"正在处理分组 {i + 1}/{total_groups}: {name}") + + # 处理重复 ChildSN(根据最新 linkDate 保留一条) + group_processed = self.process_duplicates(group) + + # 输出前,把 linkDate 格式化为 yyyy-mm-dd hh:mm:ss 的字符串 + group_out = group_processed.copy() + group_out['linkDate'] = group_out['linkDate'].apply( + lambda x: x.strftime('%Y-%m-%d %H:%M:%S') if pd.notnull(x) else '' + ) + + # 写入sheet + safe_name = self._safe_sheet_name(name) + group_out.to_excel(writer, sheet_name=safe_name, index=False) + + # 保存文件 + writer.close() + print(f"处理完成! 
def process_duplicates(self, group):
    """Collapse rows that share a ChildSN, keeping the newest ``linkDate``.

    For every ChildSN that occurs more than once, the row with the latest
    ``linkDate`` is kept (rows with a missing date sort last). Its ``备注``
    cell records how many rows were merged and, for every other column whose
    values differed, the distinct values that were seen.

    Args:
        group: DataFrame with at least ``ChildSN``, ``linkDate`` and ``备注``
            columns. It is not modified.

    Returns:
        A new DataFrame with the surplus duplicate rows removed.
    """
    # Work on a copy: the frame handed in is a groupby slice, and writing
    # into it would modify the caller's data and trigger chained-assignment
    # warnings.
    group = group.copy()

    duplicates = group[group.duplicated('ChildSN', keep=False)]
    if duplicates.empty:
        return group

    print(f" 发现 {len(duplicates)} 行重复数据,正在处理...")

    def _as_text(col, value):
        """Render one cell value for the remark text."""
        if col == 'linkDate':
            if pd.isna(value):
                return ''
            # unique() may yield numpy.datetime64 instead of Timestamp,
            # depending on the pandas version; normalise before formatting.
            if isinstance(value, (datetime, np.datetime64)):
                return pd.Timestamp(value).strftime('%Y-%m-%d %H:%M:%S')
        return str(value)

    for child_sn, dup_group in duplicates.groupby('ChildSN'):
        # Newest first. A stable sort keeps the original row order among
        # rows that carry the same linkDate.
        dup_group = dup_group.sort_values('linkDate', ascending=False, kind='stable')

        differences = []
        for col in dup_group.columns:
            if col in ('ChildSN', '备注'):
                continue
            unique_values = dup_group[col].unique()
            if len(unique_values) > 1:
                rendered = ', '.join(_as_text(col, v) for v in unique_values)
                differences.append(f"{col}: {rendered}")

        note = f"重复行数: {len(dup_group)}"
        if differences:
            note += "; 差异内容: " + "; ".join(differences)

        # Write the remark on all rows of this ChildSN, then drop all but
        # the newest one.
        group.loc[group['ChildSN'] == child_sn, '备注'] = note
        group = group.drop(dup_group.index[1:])

    return group
class ColoredFormatter(logging.Formatter):
    """Formatter that colours the level name and message by log level."""

    # ANSI colour prefix per level name; unknown levels get no colour.
    COLORS = {
        'DEBUG': Fore.CYAN,
        'INFO': Fore.GREEN,
        'WARNING': Fore.YELLOW,
        'ERROR': Fore.RED,
        'CRITICAL': Fore.RED + Style.BRIGHT,
    }

    def format(self, record):
        """Return the formatted text of *record* wrapped in its level colour.

        The record itself is left untouched. It is shared by every handler
        attached to the logger, so writing colour codes into it would leak
        escape sequences into the output of other handlers.
        """
        log_color = self.COLORS.get(record.levelname, '')
        # Shallow copy of the record; only the copy receives the colour codes.
        colored = logging.makeLogRecord(record.__dict__)
        colored.levelname = f"{log_color}{record.levelname}{Style.RESET_ALL}"
        colored.msg = f"{log_color}{record.msg}{Style.RESET_ALL}"
        return super().format(colored)
def clean_column_names(self, df):
    """Strip leading/trailing whitespace from the column labels of ``df``.

    Only string labels are stripped. Any other label (for example an integer
    position) is kept as it is; the original called ``.strip()`` on every
    label and raised ``AttributeError`` on such frames.

    Args:
        df: DataFrame whose column labels should be cleaned (modified in place).

    Returns:
        The same ``df`` object, for call chaining.
    """
    df.columns = [col.strip() if isinstance(col, str) else col for col in df.columns]
    return df
{list(self.spec_data.columns)}") + logging.info( + f"规格文件PAD ID数据类型: {self.spec_data['PAD ID'].dtype if 'PAD ID' in self.spec_data.columns else 'N/A'}") + + except Exception as e: + logging.error(f"加载规格数据失败: {e}") + return False + return True + + def scan_data_files(self): + """扫描数据文件夹中的CSV文件,并检查标题行是否包含有效字段""" + try: + # 定义有效的字段名称(去除前后空格) + required_fields = [ + "PAD ID", "Component ID", "Height(mil)", "Volume(%)", + "Area(%)", "Volume(mil3)", "Area(mil2)" + ] + + # 可选:定义字段匹配的宽松程度 + field_match_threshold = 0.8 # 80%的字段匹配即认为有效 + + # 扫描CSV文件 + valid_files = [] + for file in os.listdir(self.data_folder): + if file.endswith(".csv") and "F27140015X3K" in file: + file_path = os.path.join(self.data_folder, file) + + # 检查文件是否可读且包含有效字段 + if self._is_valid_csv_file(file_path, required_fields, field_match_threshold): + valid_files.append(file_path) + + self.data_files = valid_files + logging.info( + f"找到 {len(self.data_files)} 个有效数据文件: {[os.path.basename(f) for f in self.data_files]}") + + except Exception as e: + logging.error(f"扫描数据文件失败: {e}") + return False + + return True if self.data_files else False + + def _is_valid_csv_file(self, file_path, required_fields, threshold=1.0): + """检查CSV文件是否包含必需的字段""" + try: + # 尝试不同的编码 + encodings = ['utf-8', 'gbk', 'latin-1'] + + for encoding in encodings: + try: + with open(file_path, 'r', encoding=encoding) as f: + first_line = f.readline().strip() + + # 解析CSV标题行 + headers = [header.strip() for header in first_line.split(',')] + + # 计算匹配的字段数量 + matched_fields = 0 + missing_fields = [] + + for required_field in required_fields: + if required_field in headers: + matched_fields += 1 + else: + missing_fields.append(required_field) + + # 计算匹配比例 + match_ratio = matched_fields / len(required_fields) + + if match_ratio >= threshold: + if missing_fields: + logging.warning( + f"文件 {os.path.basename(file_path)} 部分字段缺失: {missing_fields},但满足阈值要求") + else: + logging.info(f"文件 {os.path.basename(file_path)} 所有字段完整") + return True + else: + 
logging.warning( + f"文件 {os.path.basename(file_path)} 字段匹配率不足: {match_ratio:.1%},缺失字段: {missing_fields}") + return False + + except UnicodeDecodeError: + continue # 尝试下一个编码 + + logging.error(f"无法读取文件 {os.path.basename(file_path)},尝试了所有编码") + return False + + except Exception as e: + logging.error(f"检查文件 {os.path.basename(file_path)} 时发生错误: {e}") + return False + + def load_and_clean_data_file(self, data_file): + """加载并清理数据文件""" + try: + # 读取数据文件,第一行作为标题 + # 处理可能的编码问题 + try: + data_df = pd.read_csv(data_file, header=0, encoding='utf-8') + except UnicodeDecodeError: + try: + data_df = pd.read_csv(data_file, header=0, encoding='gbk') + except UnicodeDecodeError: + data_df = pd.read_csv(data_file, header=0, encoding='latin-1') + + # 清理列名 + data_df = self.clean_column_names(data_df) + + logging.info(f"数据文件列名: {list(data_df.columns)}") + + # --- 关键修改:创建副本以避免 SettingWithCopyWarning --- + data_df = data_df.copy() + + # 确保PAD ID列是字符串类型 + if 'PAD ID' in data_df.columns: + data_df['PAD ID'] = data_df['PAD ID'].astype(str).str.strip() + logging.info(f"数据文件PAD ID数据类型: {data_df['PAD ID'].dtype}") + + # 检查必要的列是否存在 + required_columns = ["PAD ID", "Component ID", "Height(mil)", "Volume(%)", "Area(%)"] + + # 处理可能的列名变体 + column_mapping = {} + for required_col in required_columns: + if required_col not in data_df.columns: + # 查找相似的列名 + # 更宽松的匹配方式:忽略空格和大小写 + similar_cols = [col for col in data_df.columns if + required_col.lower().replace(" ", "") in col.lower().replace(" ", "")] + if similar_cols: + column_mapping[required_col] = similar_cols[0] + logging.info(f"映射列: {required_col} -> {similar_cols[0]}") + + # 重命名列 + if column_mapping: + data_df = data_df.rename(columns=column_mapping) + + missing_columns = [col for col in required_columns if col not in data_df.columns] + if missing_columns: + logging.error(f"数据文件中缺少以下列: {missing_columns}") + logging.info(f"数据文件所有列: {list(data_df.columns)}") + return None + + return data_df # 返回处理好的副本 + + except Exception as e: + 
logging.error(f"加载数据文件失败: {e}") + return None + + def process_data(self): + """处理数据并合并""" + all_data = [] + total_files = len(self.data_files) + + if total_files == 0: + logging.error("未找到任何数据文件") + return False + + for idx, data_file in enumerate(self.data_files, 1): + logging.info(f"处理数据文件 {idx}/{total_files}: {os.path.basename(data_file)}") + try: + # 加载并清理数据文件 + data_df = self.load_and_clean_data_file(data_file) + if data_df is None: + logging.error(f"无法加载文件: {os.path.basename(data_file)}") + continue + + # 选择需要的字段 + required_columns = ["PAD ID", "Component ID", "Height(mil)", "Volume(%)", "Area(%)"] + + # 检查数据文件中是否存在所有必需的列 + available_columns = [col for col in required_columns if col in data_df.columns] + if len(available_columns) != len(required_columns): + missing = set(required_columns) - set(available_columns) + logging.warning(f"文件 {os.path.basename(data_file)} 缺少列: {missing}") + logging.info(f"可用的列: {available_columns}") + # --- 关键修改:使用可用的列继续处理 (再次创建副本) --- + data_df = data_df[available_columns].copy() + else: + # --- 关键修改:选择所需的列 (创建副本) --- + data_df = data_df[required_columns].copy() + + # 添加数据来源字段 + data_df["数据来源"] = os.path.basename(data_file) + data_df["限制来源"] = os.path.basename(self.spec_file) + + # 调试信息:显示合并前的数据类型 + logging.info( + f"合并前 - 数据文件PAD ID示例: {data_df['PAD ID'].head(3).tolist() if 'PAD ID' in data_df.columns else 'N/A'}") + logging.info( + f"合并前 - 规格文件PAD ID示例: {self.spec_data['PAD ID'].head(3).tolist() if 'PAD ID' in self.spec_data.columns else 'N/A'}") + + # 从规格文件中选择需要的字段 + spec_columns = ["PAD ID", "Component ID", "Vol_Min(%)", "Vol_Max(%)", + "Height_Low(mil)", "Height_High(mil)", "Area_Min(%)", "Area_Max(%)"] + + # 只选择存在的列 + available_spec_columns = [col for col in spec_columns if col in self.spec_data.columns] + # --- 关键修改:使用 .copy() 创建一个独立的副本,避免 SettingWithCopyWarning --- + spec_df = self.spec_data[available_spec_columns].copy() + + # 确保规格文件的PAD ID也是字符串类型 + if 'PAD ID' in spec_df.columns: + spec_df['PAD ID'] = spec_df['PAD 
ID'].astype(str).str.strip() + + # 合并规格数据 + merged_df = pd.merge(data_df, spec_df, on="PAD ID", how="inner", suffixes=('_data', '_spec')) + + if merged_df.empty: + logging.warning(f"文件 {os.path.basename(data_file)} 与规格数据无匹配项") + # 显示一些调试信息 + data_pad_ids = set(data_df['PAD ID'].unique()) if 'PAD ID' in data_df.columns else set() + spec_pad_ids = set(spec_df['PAD ID'].unique()) if 'PAD ID' in spec_df.columns else set() + common_ids = data_pad_ids.intersection(spec_pad_ids) + logging.info( + f"数据文件PAD ID数量: {len(data_pad_ids)}, 规格文件PAD ID数量: {len(spec_pad_ids)}, 共同ID数量: {len(common_ids)}") + logging.info(f"数据文件前5个PAD ID: {list(data_pad_ids)[:5] if data_pad_ids else 'N/A'}") + logging.info(f"规格文件前5个PAD ID: {list(spec_pad_ids)[:5] if spec_pad_ids else 'N/A'}") + continue + + # --- 优化开始:确保 Component ID 来自数据文件 --- + # 即使合并产生了两个 Component ID (_data 和 _spec),我们也明确使用来自 data_df 的那个 + if 'Component ID_data' in merged_df.columns: + merged_df['Component ID'] = merged_df['Component ID_data'] + # 可选:删除来自规格文件的 Component ID 列 + # merged_df.drop(columns=['Component ID_spec'], inplace=True, errors='ignore') + # 或者保留它以便对比,这里我们先注释掉删除操作 + + # 如果因为某种原因没有 _data 后缀(例如只有一个 Component ID),则默认就是 data_df 的 + # (这种情况在 merge 时不会发生,因为我们用了 suffixes) + # --- 优化结束 --- + + # --- 新增:对规格高度字段执行单位转换(除以 25.4) --- + # 为避免意外字符导致转换失败,先清洗再转换为数值 + convert_cols = ["Height_Low(mil)", "Height_High(mil)"] + for col in convert_cols: + if col in merged_df.columns: + before_non_null = merged_df[col].notna().sum() + # 清洗非数字字符(保留数字、小数点和负号) + cleaned = merged_df[col].astype(str).str.replace(r'[^\d\.\-]+', '', regex=True) + merged_df[col] = pd.to_numeric(cleaned, errors='coerce') / 25.4 + after_non_null = merged_df[col].notna().sum() + logging.info( + f"字段 {col} 已除以 25.4 完成单位转换,非空值数: 转换前 {before_non_null} -> 转换后 {after_non_null}" + ) + else: + logging.warning(f"规格高度字段缺失,无法进行单位转换: {col}") + + # 选择最终输出的字段(按照要求的顺序) + output_columns = [ + "PAD ID", "Component ID", "Vol_Min(%)", "Vol_Max(%)", "Height_Low(mil)", + 
"Height_High(mil)", "Area_Min(%)", "Area_Max(%)", "Height(mil)", "Volume(%)", "Area(%)", + "数据来源", "限制来源" + ] + + # --- 优化开始 --- + # 只选择存在的列 + available_output_columns = [col for col in output_columns if col in merged_df.columns] + + # 检查是否有列缺失并打印警告 + missing_output_columns = [col for col in output_columns if col not in merged_df.columns] + if missing_output_columns: + logging.warning( + f"文件 {os.path.basename(data_file)} 的最终输出中缺少以下预期列: {missing_output_columns}") + + # 如果没有任何可用列,则跳过此文件 + if not available_output_columns: + logging.error(f"文件 {os.path.basename(data_file)} 没有任何预期的输出列,将跳过此文件。") + continue + + merged_df = merged_df[available_output_columns].copy() # 再次使用.copy()确保安全 + # --- 优化结束 --- + + all_data.append(merged_df) + logging.info(f"文件 {os.path.basename(data_file)} 处理成功,匹配 {len(merged_df)} 行") + + except Exception as e: + logging.error(f"处理文件 {os.path.basename(data_file)} 时出错: {e}") + # 显示更多调试信息 + if 'data_df' in locals() and 'PAD ID' in data_df.columns: + logging.info(f"数据文件PAD ID数据类型: {data_df['PAD ID'].dtype}") + logging.info(f"数据文件PAD ID示例: {data_df['PAD ID'].head(3).tolist()}") + if hasattr(self, 'spec_data') and 'PAD ID' in self.spec_data.columns: + logging.info(f"规格文件PAD ID数据类型: {self.spec_data['PAD ID'].dtype}") + logging.info(f"规格文件PAD ID示例: {self.spec_data['PAD ID'].head(3).tolist()}") + continue + + if all_data: + self.merged_data = pd.concat(all_data, ignore_index=True) + logging.info(f"数据处理完成,共合并 {len(self.merged_data)} 行数据") + logging.info(f"最终数据列名: {list(self.merged_data.columns)}") + else: + logging.error("未成功处理任何数据文件") + return False + return True + + def save_to_excel(self): + """保存合并后的数据到Excel文件""" + try: + # 生成时间戳 + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + output_filename = f"dataProcess_out_{timestamp}.xlsx" + output_file = os.path.join(self.data_folder, output_filename) + + self.merged_data.to_excel(output_file, index=False) + logging.info(f"数据已保存到: {output_file}") + + # 显示统计信息 + stats = f"处理统计:\n" + stats += f"- 
def _draw_spec_lines(ax, usl, lsl, mean=None, horizontal=False):
    """Draw the upper/lower limit lines (and optionally the mean) on ``ax``."""
    draw = ax.axhline if horizontal else ax.axvline
    draw(usl, color='red', linestyle='--', label=f'上限: {usl:.2f}', linewidth=1)
    draw(lsl, color='green', linestyle='--', label=f'下限: {lsl:.2f}', linewidth=1)
    if mean is not None:
        draw(mean, color='orange', linestyle='-', label=f'均值: {mean:.2f}', linewidth=1.5)


def _render_feature_chart(group_key, feature_name, feature_data, usl, lsl):
    """Render the 2x2 chart panel for one feature; return it as a base64 PNG string."""
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    try:
        fig.suptitle(f'{group_key} - {feature_name} 统计分析', fontsize=14)
        mean = feature_data.mean()

        # 1. Histogram
        ax = axes[0, 0]
        ax.hist(feature_data, bins=15, alpha=0.7, color='skyblue', edgecolor='black')
        _draw_spec_lines(ax, usl, lsl, mean)
        ax.set_title('直方图')
        ax.set_xlabel(feature_name)
        ax.set_ylabel('频数')

        # 2. Box plot (limits only, no mean line)
        ax = axes[0, 1]
        sns.boxplot(y=feature_data, ax=ax, color='lightblue')
        _draw_spec_lines(ax, usl, lsl, horizontal=True)
        ax.set_title('箱线图')
        ax.set_ylabel(feature_name)

        # 3. Run chart: values in the order they appear in the data
        ax = axes[1, 0]
        ax.plot(range(len(feature_data)), feature_data, 'o-', color='blue',
                alpha=0.7, markersize=3, linewidth=1)
        _draw_spec_lines(ax, usl, lsl, mean, horizontal=True)
        ax.set_title('序列图')
        ax.set_xlabel('数据点序号')
        ax.set_ylabel(feature_name)

        # 4. Kernel density estimate
        ax = axes[1, 1]
        sns.kdeplot(feature_data, ax=ax, color='blue', fill=True, alpha=0.5)
        _draw_spec_lines(ax, usl, lsl, mean)
        ax.set_title('概率密度图')
        ax.set_xlabel(feature_name)
        ax.set_ylabel('密度')

        for ax in axes.flat:
            ax.legend(fontsize=8)
            ax.grid(True, alpha=0.3)

        fig.tight_layout()

        buffer = BytesIO()
        fig.savefig(buffer, format='png', dpi=80, bbox_inches='tight')
        return base64.b64encode(buffer.getvalue()).decode()
    finally:
        # Always release the figure: worker processes are reused for many
        # groups, so a figure left open after an error would pile up.
        plt.close(fig)


def plot_worker(args):
    """Worker-process entry point: build the charts for one group.

    Args:
        args: Tuple ``(group_key, feature_data_dict, limits_dict)`` where
            ``feature_data_dict`` maps a feature name to its values and
            ``limits_dict`` maps the same name to ``(usl, lsl)``.

    Returns:
        ``(group_key, results)`` where ``results`` maps each feature name to
        a base64 PNG string, or to ``""`` when the feature has no data or its
        chart could not be drawn. When ``args`` itself is unusable the result
        is ``(group_key, {})``, with ``group_key`` None if it could not be
        unpacked.
    """
    # Pre-set so the error path below can report it even if unpacking fails
    # (the original raised from inside its own except handler in that case).
    group_key = None
    try:
        group_key, feature_data_dict, limits_dict = args

        # Each process re-applies the font settings it needs for CJK labels.
        plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
        plt.rcParams['axes.unicode_minus'] = False

        results = {}
        for feature_name, feature_data in feature_data_dict.items():
            if len(feature_data) == 0:
                results[feature_name] = ""
                continue
            try:
                usl, lsl = limits_dict[feature_name]
                results[feature_name] = _render_feature_chart(
                    group_key, feature_name, feature_data, usl, lsl)
            except Exception as e:
                # Keep the charts that did render instead of throwing the
                # whole group away because one feature failed.
                print(f"❌ 图表生成失败 {group_key} [{feature_name}]: {e}")
                print(f" 错误详情: {traceback.format_exc()}")
                results[feature_name] = ""

        return group_key, results

    except Exception as e:
        print(f"❌ 图表生成失败 {group_key}: {e}")
        print(f" 错误详情: {traceback.format_exc()}")
        return group_key, {}
pd.read_excel(self.file_path) + print("✅ 成功加载Excel文件") + else: + raise ValueError("不支持的文件格式") + + print(f"📊 数据文件形状: {self.data.shape}") + + except Exception as e: + print(f"❌ 加载数据文件时出错: {e}") + print(f" 错误详情: {traceback.format_exc()}") + raise + + def _validate_data(self): + """验证数据完整性 - 增强验证:检查上下限列""" + print("验证数据完整性...") + + # 检查必要的测量列 + required_measure_columns = ['PAD ID', 'Component ID', 'Height(mil)', 'Volume(%)', 'Area(%)'] + missing_measure_columns = [col for col in required_measure_columns if col not in self.data.columns] + + if missing_measure_columns: + error_msg = f"数据文件中缺少必要的测量列: {missing_measure_columns}" + print(f"❌ {error_msg}") + raise ValueError(error_msg) + + # 检查必要的上下限列 + required_limit_columns = ['Height_Low(mil)', 'Height_High(mil)', + 'Vol_Min(%)', 'Vol_Max(%)', + 'Area_Min(%)', 'Area_Max(%)'] + missing_limit_columns = [col for col in required_limit_columns if col not in self.data.columns] + + if missing_limit_columns: + error_msg = f"数据文件中缺少必要的上下限列: {missing_limit_columns}" + print(f"❌ {error_msg}") + raise ValueError(error_msg) + + print("✅ 数据验证通过") + + # 检查数据是否存在空值 + all_required_columns = required_measure_columns + required_limit_columns + null_counts = self.data[all_required_columns].isnull().sum() + if null_counts.any(): + print(f"⚠️ 数据中存在空值 - {null_counts[null_counts > 0].to_dict()}") + + def _setup_output_directory(self): + """设置输出目录""" + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + base_name = os.path.splitext(self.filename)[0] + + # 优化:输出目录放置在输入文件所在文件夹下 + self.output_dir = os.path.join(self.file_dir, f"{base_name}_report_{timestamp}") + + # 创建主目录 + os.makedirs(self.output_dir, exist_ok=True) + + # 创建分组报告子目录 + os.makedirs(os.path.join(self.output_dir, 'group_reports'), exist_ok=True) + + # 创建进度文件 + self.progress_file = os.path.join(self.output_dir, 'progress.json') + + print(f"📁 输出目录: {self.output_dir}") + + def _save_progress(self, completed_groups=None, current_stage=None): + """保存处理进度""" + try: + progress = { + 
'filename': self.filename, + 'total_groups': len(self.stats.index) if self.stats is not None else 0, + 'completed_groups': completed_groups or [], + 'current_stage': current_stage, + 'last_update': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + 'input_file_directory': self.file_dir, # 记录输入文件目录 + 'output_directory': self.output_dir # 记录输出目录 + } + + with open(self.progress_file, 'w', encoding='utf-8') as f: + json.dump(progress, f, indent=2, ensure_ascii=False) + except Exception as e: + print(f"⚠️ 保存进度失败: {e}") + + def generate_report(self): + """生成统计报告 - 分阶段输出""" + if self.data is None: + raise ValueError("请先选择数据文件") + + try: + # 验证数据 + self._validate_data() + + # 设置输出目录 + self._setup_output_directory() + + print("开始数据处理...") + + # 创建分组键 + self.data['Group_Key'] = self.data['PAD ID'].astype(str) + '_' + self.data['Component ID'].astype(str) + group_count = self.data['Group_Key'].nunique() + print(f"📊 共发现 {group_count} 个分组") + + # 阶段1:快速生成基本统计信息和汇总报告 + print("\n=== 阶段1: 生成基本统计信息 ===") + + # 计算测量数据的统计信息 + self.stats = self.data.groupby('Group_Key').agg({ + 'Height(mil)': ['min', 'max', 'mean', 'std'], + 'Volume(%)': ['min', 'max', 'mean', 'std'], + 'Area(%)': ['min', 'max', 'mean', 'std'] + }).round(4) + + # 重命名测量统计列 + self.stats.columns = [ + 'Height_Measured_Min(mil)', 'Height_Measured_Max(mil)', 'Height_Mean(mil)', 'Height_Std(mil)', + 'Vol_Measured_Min(%)', 'Vol_Measured_Max(%)', 'Vol_Mean(%)', 'Vol_Std(%)', + 'Area_Measured_Min(%)', 'Area_Measured_Max(%)', 'Area_Mean(%)', 'Area_Std(%)' + ] + + print("基本统计信息计算完成") + + # 获取预设的上下限信息 + print("获取预设上下限信息...") + limits = self.data.groupby('Group_Key').agg({ + 'Height_Low(mil)': 'first', # 取第一个值作为该分组的预设下限 + 'Height_High(mil)': 'first', # 取第一个值作为该分组的预设上限 + 'Vol_Min(%)': 'first', + 'Vol_Max(%)': 'first', + 'Area_Min(%)': 'first', + 'Area_Max(%)': 'first' + }).round(4) + + # 合并统计信息和预设上下限信息 + self.stats = pd.concat([self.stats, limits], axis=1) + print("预设上下限信息获取完成") + + # 计算CPK - 使用预设的上下限值 + print("计算CPK值...") + 
self.stats = self._calculate_cpk(self.stats) + + # 立即生成汇总报告 + summary_report_path = self._create_summary_report() + print(f"✅ 汇总报告生成完成: {summary_report_path}") + + # 保存Excel + excel_path = self._save_to_excel_advanced() + print(f"✅ Excel文件保存完成: {excel_path}") + + # 阶段2:分批生成详细分组报告 + print("\n=== 阶段2: 分批生成详细分组报告 ===") + self._generate_group_reports_incremental() + + # 阶段3:生成索引文件(可选) + print("\n=== 阶段3: 生成报告索引 ===") + index_path = self._create_report_index() + print(f"✅ 报告索引生成完成: {index_path}") + + return summary_report_path + + except Exception as e: + print(f"❌ 程序执行失败: {e}") + print(f" 错误详情: {traceback.format_exc()}") + # 即使失败,也尝试保存当前进度 + if hasattr(self, 'output_dir'): + print(f"📁 当前结果已保存到: {self.output_dir}") + raise + + def _create_summary_report(self): + """创建快速汇总报告(区分预设上下限和实测值)""" + print("生成快速汇总报告...") + + # 使用明确的空值检查 + if self.stats is None or len(self.stats.index) == 0: + print("⚠️ 统计数据为空,生成空报告") + return self._create_empty_report() + + # 将索引转换为列表,避免DataFrame布尔判断问题 + stats_index = list(self.stats.index) + total_groups = len(stats_index) + + # 安全地检查CPK列是否存在 + valid_height_cpk = 0 + valid_volume_cpk = 0 + valid_area_cpk = 0 + + if 'Height_Cpk' in self.stats.columns: + valid_height_cpk = self.stats['Height_Cpk'].notna().sum() + if 'Volume_Cpk' in self.stats.columns: + valid_volume_cpk = self.stats['Volume_Cpk'].notna().sum() + if 'Area_Cpk' in self.stats.columns: + valid_area_cpk = self.stats['Area_Cpk'].notna().sum() + + html_content = f""" + + + + 数据统计汇总报告 - {self.filename} + + + +

数据统计汇总报告 - {self.filename}

+

生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

+

输入文件位置: {self.file_dir}

+ +
+

报告说明

+

此报告为快速生成的汇总报告,包含所有分组的基本统计信息。

+

CPK计算使用预设的上下限值,而不是实测的最小最大值。

+

注意:分组详细报告可能需要较长时间生成,请勿关闭程序。

+
+ +
+

处理进度

+

总分组数量: {total_groups}

+

有效Height CPK数量: {valid_height_cpk}

+

有效Volume CPK数量: {valid_volume_cpk}

+

有效Area CPK数量: {valid_area_cpk}

+

输出目录: {self.output_dir}

+
+ + + +

详细统计数据

+ + + + + + + + {'' if 'Height_Cpk' in self.stats.columns else ''} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + """ + + # 生成表格行数据 + for group_key in stats_index: + row = self.stats.loc[group_key] + + def format_value(value): + """格式化数值显示""" + if pd.isna(value): + return 'N/A' + elif isinstance(value, (int, float)): + return f"{value:.4f}" + else: + return str(value) + + # 获取数据点数 + group_data = self.data[self.data['Group_Key'] == group_key] + data_count = len(group_data) + + # 安全处理CPK列 + cpk_columns = {"height": "", "volume": "", "area": ""} + if 'Height_Cpk' in self.stats.columns: + cpk_columns = { + "height": f"""""", + "volume": f"""""", + "area": f"""""" + } + + # 为CPK值添加颜色标识 + def get_cpk_color(cpk_value): + """根据CPK值返回颜色标识""" + if pd.isna(cpk_value): + return '' + try: + cpk_val = float(cpk_value) + if cpk_val >= 1.33: + return 'style="background-color: #90EE90;"' # 绿色 - 优秀 + elif cpk_val >= 1.0: + return 'style="background-color: #FFFFE0;"' # 黄色 - 合格 + else: + return 'style="background-color: #FFB6C1;"' # 红色 - 不合格 + except: + return '' + + # 如果存在CPK列,添加颜色 + if 'Height_Cpk' in self.stats.columns: + # 这里需要为每个CPK单元格单独设置颜色 + height_color = get_cpk_color(row['Height_Cpk']) + volume_color = get_cpk_color(row['Volume_Cpk']) + area_color = get_cpk_color(row['Area_Cpk']) + + cpk_columns = { + "height": f"""""", + "volume": f"""""", + "area": f"""""" + } + + html_content += f""" + + + + + + + + + + + {cpk_columns["height"]} + + + + + + + + + {cpk_columns["volume"]} + + + + + + + + + {cpk_columns["area"]} + + + + """ + + html_content += """ + +
分组标识
(PAD ID + Component ID)
Height(mil)Volume(%)Area(%)CPK值
预设下限
(LSL)
预设上限
(USL)
实测最小值实测最大值平均值标准差数据点数CPK预设下限
(LSL)
预设上限
(USL)
实测最小值实测最大值平均值标准差数据点数CPK预设下限
(LSL)
预设上限
(USL)
实测最小值实测最大值平均值标准差数据点数CPK分组
{format_value(row['Height_Cpk'])}{format_value(row['Volume_Cpk'])}{format_value(row['Area_Cpk'])}{format_value(row['Height_Cpk'])}{format_value(row['Volume_Cpk'])}{format_value(row['Area_Cpk'])}
{group_key}{format_value(row['Height_Low(mil)'])}{format_value(row['Height_High(mil)'])}{format_value(row['Height_Measured_Min(mil)'])}{format_value(row['Height_Measured_Max(mil)'])}{format_value(row['Height_Mean(mil)'])}{format_value(row['Height_Std(mil)'])}{data_count}{format_value(row['Vol_Min(%)'])}{format_value(row['Vol_Max(%)'])}{format_value(row['Vol_Measured_Min(%)'])}{format_value(row['Vol_Measured_Max(%)'])}{format_value(row['Vol_Mean(%)'])}{format_value(row['Vol_Std(%)'])}{data_count}{format_value(row['Area_Min(%)'])}{format_value(row['Area_Max(%)'])}{format_value(row['Area_Measured_Min(%)'])}{format_value(row['Area_Measured_Max(%)'])}{format_value(row['Area_Mean(%)'])}{format_value(row['Area_Std(%)'])}{data_count}{group_key}
+ +
+

表格说明

+

绿色背景: 预设的上下限值(用于CPK计算)

+

黄色背景: 实测数据的最小最大值

+

白色背景: 统计计算值

+
+ +
+

CPK计算说明

+

CPK计算公式: CPK = min[(USL - mean) / (3×std), (mean - LSL) / (3×std)]

+

上下限取值: 使用数据文件中的预设上下限值,而不是实测的最小最大值

+

绿色 CPK ≥ 1.33 (过程能力优秀)

+

黄色 1.0 ≤ CPK < 1.33 (过程能力合格)

+

红色 CPK < 1.0 (过程能力不足)

+
+ + + """ + + report_path = os.path.join(self.output_dir, 'summary_report.html') + with open(report_path, 'w', encoding='utf-8') as f: + f.write(html_content) + + print(f"✅ 汇总报告已生成: {report_path}") + return report_path + + def _create_empty_report(self): + """创建空数据报告""" + html_content = f""" + + + + 数据统计报告 - {self.filename} + + + +

数据统计报告 - {self.filename}

+
+

⚠️ 数据为空

+

未找到有效数据或统计数据为空。

+

生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

+

输入文件位置: {self.file_dir}

+
+ + + """ + + report_path = os.path.join(self.output_dir, 'summary_report.html') + with open(report_path, 'w', encoding='utf-8') as f: + f.write(html_content) + + return report_path + + def _sanitize_filename(self, filename): + """清理文件名,移除非法字符""" + import re + return re.sub(r'[<>:"/\\|?*]', '_', filename) + + def _generate_group_reports_incremental(self): + """分批生成分组报告,避免长时间等待""" + # 使用明确的空值检查方法 + if self.stats is None or len(self.stats.index) == 0: + print("⚠️ 统计数据为空,跳过分组报告生成") + return + + stats_index = list(self.stats.index) + total_groups = len(stats_index) + + if total_groups == 0: + print("⚠️ 没有有效的分组数据") + return + + print(f"📊 开始分批生成 {total_groups} 个分组报告...") + print(f"📁 分组报告将保存到: {os.path.join(self.output_dir, 'group_reports')}") + + # 分批处理 + BATCH_SIZE = min(20, total_groups) + completed_groups = [] + total_batches = (total_groups + BATCH_SIZE - 1) // BATCH_SIZE + + for batch_idx in range(total_batches): + batch_start = batch_idx * BATCH_SIZE + batch_end = min((batch_idx + 1) * BATCH_SIZE, total_groups) + batch_groups = stats_index[batch_start:batch_end] + + print(f"\n🔄 处理批次 {batch_idx + 1}/{total_batches}: 分组 {batch_start + 1}-{batch_end}") + + try: + batch_results = self._process_batch(batch_groups) + + # 生成当前批次的分组报告 + successful_reports = 0 + for group_key in batch_groups: + try: + self._create_single_group_report(group_key, batch_results.get(group_key, {})) + completed_groups.append(group_key) + successful_reports += 1 + print(f" ✅ 分组报告生成: {self._sanitize_filename(group_key)}.html") + except Exception as e: + print(f" ❌ 生成分组 {group_key} 报告失败: {e}") + print(f" 错误详情: {traceback.format_exc()}") + + # 保存进度 + self._save_progress(completed_groups, f"batch_{batch_idx + 1}") + + print(f"✅ 批次 {batch_idx + 1} 完成 (成功生成 {successful_reports}/{len(batch_groups)} 个报告)") + + except Exception as batch_error: + print(f"❌ 批次 {batch_idx + 1} 处理失败: {batch_error}") + print(f" 错误详情: {traceback.format_exc()}") + # 继续处理下一批次 + continue + + # 添加批次间隔,避免资源竞争 + if batch_idx < 
total_batches - 1: + print("⏳ 等待2秒后处理下一批次...") + time.sleep(2) + + print(f"✅ 所有分组报告生成完成 (总计: {len(completed_groups)}/{total_groups})") + print(f"📁 分组报告保存位置: {os.path.join(self.output_dir, 'group_reports')}") + + def _process_batch(self, group_keys): + """处理单个批次的分组""" + if not group_keys: # 明确的空列表检查 + print("⚠️ 当前批次没有分组数据") + return {} + + tasks = [] + for group_key in group_keys: + # 问题修正:使用明确的检查方法 + stats_index_list = list(self.stats.index) # 转换为列表 + if group_key not in stats_index_list: + print(f"⚠️ 警告: 分组 {group_key} 不在统计数据中,跳过") + continue # 跳过不存在的分组 + + # 问题修正:避免DataFrame的布尔判断,使用明确的.empty检查 + group_data = self.data[self.data['Group_Key'] == group_key] + if group_data.empty: # 明确的空值检查 + print(f"⚠️ 警告: 分组 {group_key} 的数据为空,跳过") + continue + + row = self.stats.loc[group_key] + + # 安全地获取特征数据,添加空值检查 + feature_data_dict = {} + for col in ['Height(mil)', 'Volume(%)', 'Area(%)']: + col_data = group_data[col].dropna() + if len(col_data) == 0: + print(f"⚠️ 警告: 分组 {group_key} 的 {col} 数据为空") + col_data = pd.Series([], dtype=float) # 创建空Series + feature_data_dict[col] = col_data + + # 获取预设的上下限值 + limits_dict = {} + # 安全地获取限制值 + try: + limits_dict = { + 'Height(mil)': (row['Height_High(mil)'], row['Height_Low(mil)']), # USL, LSL + 'Volume(%)': (row['Vol_Max(%)'], row['Vol_Min(%)']), # USL, LSL + 'Area(%)': (row['Area_Max(%)'], row['Area_Min(%)']) # USL, LSL + } + except KeyError as e: + print(f"❌ 错误: 分组 {group_key} 缺少预设上下限列 {e}") + continue + + tasks.append((group_key, feature_data_dict, limits_dict)) + + if len(tasks) == 0: # 明确的空列表检查 + print("⚠️ 当前批次没有有效任务") + return {} + + # 使用多进程处理 + max_workers = min(mp.cpu_count(), len(tasks), 4) + results = {} + + print(f"🔧 开始处理批次中的 {len(tasks)} 个任务,使用 {max_workers} 个进程...") + + with ProcessPoolExecutor(max_workers=max_workers) as executor: + future_to_key = {} + for task in tasks: + future = executor.submit(plot_worker, task) + future_to_key[future] = task[0] + + completed_count = 0 + for future in as_completed(future_to_key): + 
group_key = future_to_key[future] + try: + result_key, result_data = future.result() + if result_key: # 明确的结果检查 + results[result_key] = result_data + completed_count += 1 + print(f" 📈 图表生成完成: {result_key} ({completed_count}/{len(tasks)})") + except Exception as e: + print(f" ❌ 处理分组 {group_key} 时出错: {e}") + + print(f"✅ 批次处理完成,成功生成 {len(results)}/{len(tasks)} 个图表") + return results + + def _create_single_group_report(self, group_key, feature_charts): + """创建单个分组的独立报告""" + # 添加明确的分组存在性检查 + stats_index_list = list(self.stats.index) # 转换为列表 + + if group_key not in stats_index_list: + print(f"⚠️ 警告: 分组 {group_key} 不在统计数据中,跳过报告生成") + return + + try: + row = self.stats.loc[group_key] + except KeyError: + print(f"❌ 错误: 无法获取分组 {group_key} 的统计数据") + return + + # 明确的空值检查 + group_data = self.data[self.data['Group_Key'] == group_key] + + # 确保group_data不为空 + if group_data.empty: + print(f"⚠️ 警告: 分组 {group_key} 的数据为空,跳过报告生成") + return + + # 安全格式化数值 + def safe_format(value, default="N/A"): + try: + if pd.isna(value): + return default + return f"{float(value):.4f}" + except (ValueError, TypeError): + return default + + html_content = f""" + + + + {group_key} - 详细分析报告 + + + + + +

{group_key} - 详细分析报告

+

生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

+

输入文件位置: {self.file_dir}

+ +
+

基本统计信息

+ + + + + + + + + + + + + + + + + + + + + +
特征预设下限(LSL)预设上限(USL)实测最小值实测最大值平均值标准差CPK
Height(mil){safe_format(row.get('Height_Low(mil)'))}{safe_format(row.get('Height_High(mil)'))}{safe_format(row.get('Height_Measured_Min(mil)'))}{safe_format(row.get('Height_Measured_Max(mil)'))}{safe_format(row.get('Height_Mean(mil)'))}{safe_format(row.get('Height_Std(mil)'))}{safe_format(row.get('Height_Cpk'))}
Volume(%){safe_format(row.get('Vol_Min(%)'))}{safe_format(row.get('Vol_Max(%)'))}{safe_format(row.get('Vol_Measured_Min(%)'))}{safe_format(row.get('Vol_Measured_Max(%)'))}{safe_format(row.get('Vol_Mean(%)'))}{safe_format(row.get('Vol_Std(%)'))}{safe_format(row.get('Volume_Cpk'))}
Area(%){safe_format(row.get('Area_Min(%)'))}{safe_format(row.get('Area_Max(%)'))}{safe_format(row.get('Area_Measured_Min(%)'))}{safe_format(row.get('Area_Measured_Max(%)'))}{safe_format(row.get('Area_Mean(%)'))}{safe_format(row.get('Area_Std(%)'))}{safe_format(row.get('Area_Cpk'))}
+
+ """ + + # 添加图表 + for feature_name in ['Height(mil)', 'Volume(%)', 'Area(%)']: + chart_base64 = feature_charts.get(feature_name, "") + if chart_base64 and len(chart_base64) > 0: # 明确的字符串检查 + html_content += f""" +

{feature_name} 分析图表

+
+
+ {feature_name}统计图表 +
+
+ """ + else: + html_content += f""" +

{feature_name} 分析图表

+

该特征的图表生成失败或数据不足。

+ """ + + html_content += """ + + + """ + + filename = self._sanitize_filename(group_key) + '.html' + group_reports_dir = os.path.join(self.output_dir, 'group_reports') + report_path = os.path.join(group_reports_dir, filename) + + try: + with open(report_path, 'w', encoding='utf-8') as f: + f.write(html_content) + except Exception as e: + print(f"❌ 保存分组报告失败 {filename}: {e}") + + def _create_report_index(self): + """创建分组报告索引""" + # 确保使用正确的索引获取方式 + if self.stats is None or len(self.stats.index) == 0: + print("⚠️ 统计数据为空,创建空索引") + return self._create_empty_index() + + stats_index = list(self.stats.index) # 转换为列表 + + html_content = """ + + + + 分组报告索引 + + + + + +

分组报告索引

+

共生成 """ + str(len(stats_index)) + """ 个分组报告

+

输入文件位置: """ + self.file_dir + """

+ +
+ """ + + for group_key in stats_index: # 使用列表而不是DataFrame索引 + filename = self._sanitize_filename(group_key) + '.html' + html_content += f'
{group_key}
\n' + + html_content += """ +
+ + + """ + + index_path = os.path.join(self.output_dir, 'group_reports', 'index.html') + try: + with open(index_path, 'w', encoding='utf-8') as f: + f.write(html_content) + except Exception as e: + print(f"❌ 创建索引文件失败: {e}") + return None + + return index_path + + def _create_empty_index(self): + """创建空索引文件""" + html_content = """ + + + + 分组报告索引 + + + +

分组报告索引

+
+

⚠️ 没有分组报告

+

当前没有生成任何分组报告。

+

输入文件位置: """ + self.file_dir + """

+
+ + + """ + + index_path = os.path.join(self.output_dir, 'group_reports', 'index.html') + with open(index_path, 'w', encoding='utf-8') as f: + f.write(html_content) + + return index_path + + def _calculate_cpk(self, stats): + """计算CPK值 - 使用预设的上下限值""" + print("详细计算CPK值...") + + def calculate_single_cpk(mean, std, usl, lsl): + """计算单个特征的CPK""" + if std == 0 or pd.isna(std): + return np.nan + + if pd.isna(usl) or pd.isna(lsl): + return np.nan + + # CPK = min[(USL - mean) / (3*std), (mean - LSL) / (3*std)] + cpu = (usl - mean) / (3 * std) if usl != float('inf') else float('inf') + cpl = (mean - lsl) / (3 * std) if lsl != float('-inf') else float('inf') + + # 如果其中一个限值为无穷大,则返回另一个值 + if cpu == float('inf') and cpl == float('inf'): + return np.nan + elif cpu == float('inf'): + return cpl + elif cpl == float('inf'): + return cpu + else: + return min(cpu, cpl) + + # 确保CPK列不存在时创建 + cpk_results = [] + + for idx, row in stats.iterrows(): + print(f"计算分组 {idx} 的CPK值...") + + # Height CPK - 使用预设的Height_High作为USL,Height_Low作为LSL + height_cpk = calculate_single_cpk( + row['Height_Mean(mil)'], + row['Height_Std(mil)'], + row['Height_High(mil)'], # USL - 预设上限 + row['Height_Low(mil)'] # LSL - 预设下限 + ) + + # Volume CPK - 使用预设的Vol_Max作为USL,Vol_Min作为LSL + volume_cpk = calculate_single_cpk( + row['Vol_Mean(%)'], + row['Vol_Std(%)'], + row['Vol_Max(%)'], # USL - 预设上限 + row['Vol_Min(%)'] # LSL - 预设下限 + ) + + # Area CPK - 使用预设的Area_Max作为USL,Area_Min作为LSL + area_cpk = calculate_single_cpk( + row['Area_Mean(%)'], + row['Area_Std(%)'], + row['Area_Max(%)'], # USL - 预设上限 + row['Area_Min(%)'] # LSL - 预设下限 + ) + + cpk_results.append({ + 'Height_Cpk': round(height_cpk, 4) if not pd.isna(height_cpk) else np.nan, + 'Volume_Cpk': round(volume_cpk, 4) if not pd.isna(volume_cpk) else np.nan, + 'Area_Cpk': round(area_cpk, 4) if not pd.isna(area_cpk) else np.nan + }) + + # 将CPK结果添加到统计数据中 + cpk_df = pd.DataFrame(cpk_results, index=stats.index) + stats = pd.concat([stats, cpk_df], axis=1) + + print("✅ 
所有分组CPK计算完成 - 使用预设上下限值") + return stats + + def _save_to_excel_advanced(self): + """保存Excel文件""" + print("保存Excel文件...") + + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + excel_filename = os.path.join(self.output_dir, 'statistics.xlsx') + + try: + with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer: + # 保存统计汇总 + if self.stats is not None: + self.stats.reset_index().to_excel(writer, sheet_name='统计汇总', index=False) + + # 保存前50个分组的数据 + MAX_GROUPS_TO_SAVE = 50 + unique_groups = self.data['Group_Key'].unique()[:MAX_GROUPS_TO_SAVE] + + for i, group_key in enumerate(unique_groups): + group_data = self.data[self.data['Group_Key'] == group_key].copy() + sheet_name = f"组_{group_key}"[:31] + group_data.to_excel(writer, sheet_name=sheet_name, index=False) + + print(f"✅ Excel文件保存完成: {excel_filename}") + return excel_filename + + except Exception as e: + print(f"❌ Excel文件保存失败: {e}") + print(f" 错误详情: {traceback.format_exc()}") + return None + + +def main(): + """主函数""" + print("=== 数据统计报告生成程序(使用预设上下限值) ===") + + processor = DataProcessor() + + try: + if processor.select_file(): + processor._load_data() + report_path = processor.generate_report() + print(f"✅ 报告生成完成") + print(f"📁 输入文件目录: {processor.file_dir}") + print(f"📁 输出目录: {processor.output_dir}") + print(f"📊 汇总报告: {report_path}") + + # 显示重要文件路径 + print(f"📊 Excel文件: {os.path.join(processor.output_dir, 'statistics.xlsx')}") + else: + print("❌ 未选择文件,程序退出") + + except Exception as e: + print(f"❌ 程序执行失败: {e}") + print(f" 错误详情: {traceback.format_exc()}") + + +if __name__ == "__main__": + mp.set_start_method('spawn', force=True) + main() diff --git a/dataProcess/dataProcess_sightml_V1.py b/dataProcess/dataProcess_sightml_V1.py new file mode 100644 index 0000000..edc6bbf --- /dev/null +++ b/dataProcess/dataProcess_sightml_V1.py @@ -0,0 +1,810 @@ +import pandas as pd +import tkinter as tk +from tkinter import filedialog +import os +from datetime import datetime +import numpy as np + + +class DataProcessor: + def 
__init__(self): + self.data = None + self.filename = None + self.file_path = None + self.file_dir = None + self.processing_start_time = None + + def select_file(self): + """手动选择数据文件""" + print("🔍 打开文件选择对话框...") + root = tk.Tk() + root.withdraw() + + self.file_path = filedialog.askopenfilename( + title="选择数据文件", + filetypes=[("Excel files", "*.xlsx"), ("CSV files", "*.csv"), ("All files", "*.*")] + ) + + if self.file_path: + self.filename = os.path.basename(self.file_path) + self.file_dir = os.path.dirname(self.file_path) + print(f"✅ 已选择文件: {self.filename}") + print(f"📁 文件所在目录: {self.file_dir}") + return True + else: + print("❌ 未选择文件") + return False + + def _load_data(self): + """加载数据文件""" + print("📥 开始加载数据文件...") + try: + if self.file_path.endswith('.csv'): + self.data = pd.read_csv(self.file_path) + print("✅ 成功加载CSV文件") + elif self.file_path.endswith('.xlsx'): + self.data = pd.read_excel(self.file_path) + print("✅ 成功加载Excel文件") + else: + raise ValueError("不支持的文件格式") + + print(f"📊 数据文件形状: {self.data.shape}") + print(f"📋 数据列名: {list(self.data.columns)[:10]}...") + + # 显示数据预览 + print("\n📋 数据预览(前3行):") + print(self.data.head(3)) + + # 显示列数据类型 + print("\n📊 列数据类型:") + for col in self.data.columns[:10]: + print(f" {col}: {self.data[col].dtype}") + + except Exception as e: + print(f"❌ 加载数据文件时出错: {e}") + raise + + def _validate_data(self): + """验证数据完整性""" + print("🔍 验证数据完整性...") + + # 检查必要的测量列 + required_measure_columns = ['PAD ID', 'Component ID', 'Height(mil)', 'Volume(%)', 'Area(%)'] + missing_measure_columns = [col for col in required_measure_columns if col not in self.data.columns] + + if missing_measure_columns: + error_msg = f"❌ 数据文件中缺少必要的测量列: {missing_measure_columns}" + print(error_msg) + raise ValueError(error_msg) + + # 检查上下限列 + required_limit_columns = ['Height_Low(mil)', 'Height_High(mil)', 'Vol_Min(%)', 'Vol_Max(%)', 'Area_Min(%)', + 'Area_Max(%)'] + missing_limit_columns = [col for col in required_limit_columns if col not in self.data.columns] + + if 
missing_limit_columns: + error_msg = f"❌ 数据文件中缺少必要的上下限列: {missing_limit_columns}" + print(error_msg) + raise ValueError(error_msg) + + print("✅ 数据验证通过") + + # 检查数据是否存在空值 + all_columns = required_measure_columns + required_limit_columns + null_counts = self.data[all_columns].isnull().sum() + if null_counts.any(): + print(f"⚠️ 数据中存在空值:") + for col, count in null_counts[null_counts > 0].items(): + print(f" {col}: {count} 个空值") + else: + print("✅ 所有必需列都没有空值") + + # 显示数据统计信息 + print("\n📊 数据统计信息:") + + for col in required_measure_columns: + if col in self.data.columns: + # 检查列的数据类型,针对不同类型使用不同的格式化方式 + if pd.api.types.is_numeric_dtype(self.data[col]): + valid_count = self.data[col].count() + if valid_count > 0: + min_val = self.data[col].min() + max_val = self.data[col].max() + print(f" {col}: {valid_count} 个有效值, 范围 {min_val:.4f} - {max_val:.4f}") + else: + print(f" {col}: 0 个有效值") + else: + # 非数值型列:显示唯一值和示例 + unique_count = self.data[col].nunique() + sample_values = self.data[col].dropna().head(3).tolist() + print( + f" {col}: {self.data[col].count()} 个有效值, {unique_count} 个唯一值, 示例: {sample_values}") + + # 检查并转换数据类型 + print("\n🔄 数据类型检查与转换:") + numeric_columns = ['Height(mil)', 'Volume(%)', 'Area(%)', + 'Height_Low(mil)', 'Height_High(mil)', + 'Vol_Min(%)', 'Vol_Max(%)', 'Area_Min(%)', 'Area_Max(%)'] + + for col in numeric_columns: + if col in self.data.columns: + if not pd.api.types.is_numeric_dtype(self.data[col]): + try: + # 尝试转换为数值类型 + original_count = self.data[col].count() + self.data[col] = pd.to_numeric(self.data[col], errors='coerce') + converted_count = self.data[col].count() + lost_data = original_count - converted_count + if lost_data > 0: + print(f" ⚠️ {col}: 转换后丢失 {lost_data} 个非数值数据") + else: + print(f" ✅ {col}: 成功转换为数值类型") + except Exception as e: + print(f" ❌ {col}: 类型转换失败 - {e}") + else: + valid_count = self.data[col].count() + print(f" ✅ {col}: 已经是数值类型, {valid_count} 个有效值") + + def _print_progress(self, message, level=1): + """打印进度信息,支持分级显示""" + indent = " 
" * level + timestamp = datetime.now().strftime("%H:%M:%S") + print(f"{timestamp} {indent}{message}") + + def generate_report(self): + """生成统计报告""" + if self.data is None: + raise ValueError("请先选择数据文件") + + try: + self.processing_start_time = datetime.now() + print(f"\n🚀 开始生成报告 - {self.processing_start_time.strftime('%Y-%m-%d %H:%M:%S')}") + + # 验证数据 + self._validate_data() + + self._print_progress("开始数据处理...", 1) + + # 创建分组键 + self._print_progress("创建分组键...", 2) + + # 确保PAD ID和Component ID都是字符串类型 + self.data['PAD ID'] = self.data['PAD ID'].astype(str) + self.data['Component ID'] = self.data['Component ID'].astype(str) + + self.data['Group_Key'] = self.data['PAD ID'] + '_' + self.data['Component ID'] + group_count = self.data['Group_Key'].nunique() + self._print_progress(f"共发现 {group_count} 个分组", 2) + + # 显示分组信息 + group_info = self.data['Group_Key'].value_counts() + self._print_progress(f"分组数据量统计:", 2) + for i, (group, count) in enumerate(group_info.head(5).items()): + self._print_progress(f" {group}: {count} 个数据点", 3) + if len(group_info) > 5: + self._print_progress(f" ... 
还有 {len(group_info) - 5} 个分组", 3) + + # 检查数值列是否存在NaN值 + numeric_columns = ['Height(mil)', 'Volume(%)', 'Area(%)'] + for col in numeric_columns: + if col in self.data.columns: + nan_count = self.data[col].isna().sum() + if nan_count > 0: + self._print_progress(f"⚠️ {col} 有 {nan_count} 个空值,将在统计计算中排除", 3) + + # 计算统计信息 + self._print_progress("计算基本统计信息...", 2) + + # 确保数值列没有无穷大值 + for col in numeric_columns: + if col in self.data.columns: + inf_count = np.isinf(self.data[col]).sum() + if inf_count > 0: + self._print_progress(f"⚠️ {col} 有 {inf_count} 个无穷大值,将替换为NaN", 3) + self.data[col] = self.data[col].replace([np.inf, -np.inf], np.nan) + + stats = self.data.groupby('Group_Key').agg({ + 'Height(mil)': ['min', 'max', 'mean', 'std'], + 'Volume(%)': ['min', 'max', 'mean', 'std'], + 'Area(%)': ['min', 'max', 'mean', 'std'] + }).round(4) + + # 重命名列 + stats.columns = [ + 'Height_Measured_Min(mil)', 'Height_Measured_Max(mil)', 'Height_Mean(mil)', 'Height_Std(mil)', + 'Volume_Measured_Min(%)', 'Volume_Measured_Max(%)', 'Volume_Mean(%)', 'Volume_Std(%)', + 'Area_Measured_Min(%)', 'Area_Measured_Max(%)', 'Area_Mean(%)', 'Area_Std(%)' + ] + + self._print_progress("基本统计信息计算完成", 2) + + # 获取上下限信息 + self._print_progress("获取预设上下限信息...", 2) + limits = self.data.groupby('Group_Key').agg({ + 'Height_Low(mil)': 'first', + 'Height_High(mil)': 'first', + 'Vol_Min(%)': 'first', + 'Vol_Max(%)': 'first', + 'Area_Min(%)': 'first', + 'Area_Max(%)': 'first' + }).round(4) + + # 合并统计信息和上下限信息 + stats = pd.concat([stats, limits], axis=1) + self._print_progress("上下限信息获取完成", 2) + + # 计算CPK + self._print_progress("开始计算CPK值...", 2) + stats = self._calculate_cpk(stats) + + # 分析CPK结果 + cpk_analysis = self._analyze_cpk_results(stats) + self._print_progress("CPK分析完成", 2) + self._print_cpk_summary(cpk_analysis) + + # 生成HTML报告 + self._print_progress("生成HTML报告...", 2) + report_path = self._create_html_report(stats, cpk_analysis) + self._print_progress("HTML报告生成完成", 2) + + # 计算处理时间 + processing_time = 
datetime.now() - self.processing_start_time + self._print_progress(f"总处理时间: {processing_time.total_seconds():.2f} 秒", 1) + + return report_path + + except Exception as e: + print(f"❌ 生成报告过程中出错: {e}") + import traceback + print(f"详细错误信息:") + traceback.print_exc() + raise + + def _analyze_cpk_results(self, stats): + """分析CPK结果""" + cpk_analysis = { + 'total_groups': len(stats), + 'cpk_status': {'Height': {}, 'Volume': {}, 'Area': {}}, + 'problematic_groups': [] + } + + for feature in ['Height', 'Volume', 'Area']: + cpk_col = f'{feature}_Cpk' + if cpk_col not in stats.columns: + continue + + valid_cpk = stats[cpk_col].dropna() + total_valid = len(valid_cpk) + + cpk_analysis['cpk_status'][feature] = { + 'total': total_valid, + 'excellent': len(valid_cpk[valid_cpk >= 1.33]) if total_valid > 0 else 0, + 'acceptable': len(valid_cpk[(valid_cpk >= 1.0) & (valid_cpk < 1.33)]) if total_valid > 0 else 0, + 'poor': len(valid_cpk[valid_cpk < 1.0]) if total_valid > 0 else 0, + 'invalid': len(stats) - total_valid + } + + # 识别有问题的分组(任意特征的CPK < 1.0) + for group_key, row in stats.iterrows(): + problems = [] + for feature in ['Height', 'Volume', 'Area']: + cpk_col = f'{feature}_Cpk' + if cpk_col in stats.columns and not pd.isna(row[cpk_col]): + if row[cpk_col] < 1.0: + problems.append(f"{feature}: {row[cpk_col]:.4f}") + + if problems: + cpk_analysis['problematic_groups'].append({ + 'group_key': group_key, + 'problems': problems + }) + + return cpk_analysis + + def _print_cpk_summary(self, cpk_analysis): + """打印CPK结果摘要""" + print("\n📈 CPK分析结果摘要:") + print("=" * 60) + + for feature, status in cpk_analysis['cpk_status'].items(): + total = status['total'] + if total == 0: + print(f"\n{feature}: 无有效CPK数据") + continue + + print(f"\n{feature}:") + excellent_pct = (status['excellent'] / total * 100) if total > 0 else 0 + acceptable_pct = (status['acceptable'] / total * 100) if total > 0 else 0 + poor_pct = (status['poor'] / total * 100) if total > 0 else 0 + + print(f" ✅ 优秀 (CPK ≥ 1.33): 
{status['excellent']}/{total} ({excellent_pct:.1f}%)") + print(f" ⚠️ 合格 (1.0 ≤ CPK < 1.33): {status['acceptable']}/{total} ({acceptable_pct:.1f}%)") + print(f" ❌ 不合格 (CPK < 1.0): {status['poor']}/{total} ({poor_pct:.1f}%)") + print(f" ❓ 无法计算: {status['invalid']}") + + if cpk_analysis['problematic_groups']: + print(f"\n⚠️ 发现 {len(cpk_analysis['problematic_groups'])} 个有问题分组:") + for i, group in enumerate(cpk_analysis['problematic_groups'][:10]): + print(f" {i + 1}. {group['group_key']}: {', '.join(group['problems'])}") + if len(cpk_analysis['problematic_groups']) > 10: + print(f" ... 还有 {len(cpk_analysis['problematic_groups']) - 10} 个问题分组") + else: + print("\n✅ 所有分组的CPK都在合格范围内") + + print("=" * 60) + + def _calculate_cpk(self, stats): + """计算CPK值""" + self._print_progress("详细计算CPK值...", 3) + + def calculate_single_cpk(mean, std, usl, lsl): + """计算单个特征的CPK""" + if pd.isna(mean) or pd.isna(std) or std == 0: + return np.nan + + if pd.isna(usl) or pd.isna(lsl): + return np.nan + + try: + cpu = (usl - mean) / (3 * std) if usl != float('inf') else float('inf') + cpl = (mean - lsl) / (3 * std) if lsl != float('-inf') else float('inf') + + if cpu == float('inf') and cpl == float('inf'): + return np.nan + elif cpu == float('inf'): + return cpl + elif cpl == float('inf'): + return cpu + else: + return min(cpu, cpl) + except (ZeroDivisionError, TypeError): + return np.nan + + # 计算每个特征的CPK + cpk_results = [] + total_groups = len(stats) + + for idx, row in stats.iterrows(): + if len(cpk_results) % 100 == 0 and total_groups > 100: + self._print_progress(f"计算第 {len(cpk_results) + 1} 个分组的CPK...", 4) + + # Height CPK + height_cpk = calculate_single_cpk( + row.get('Height_Mean(mil)', np.nan), + row.get('Height_Std(mil)', np.nan), + row.get('Height_High(mil)', np.nan), + row.get('Height_Low(mil)', np.nan) + ) + + # Volume CPK + volume_cpk = calculate_single_cpk( + row.get('Volume_Mean(%)', np.nan), + row.get('Volume_Std(%)', np.nan), + row.get('Vol_Max(%)', np.nan), + 
row.get('Vol_Min(%)', np.nan) + ) + + # Area CPK + area_cpk = calculate_single_cpk( + row.get('Area_Mean(%)', np.nan), + row.get('Area_Std(%)', np.nan), + row.get('Area_Max(%)', np.nan), + row.get('Area_Min(%)', np.nan) + ) + + cpk_results.append({ + 'Height_Cpk': round(height_cpk, 4) if not pd.isna(height_cpk) else np.nan, + 'Volume_Cpk': round(volume_cpk, 4) if not pd.isna(volume_cpk) else np.nan, + 'Area_Cpk': round(area_cpk, 4) if not pd.isna(area_cpk) else np.nan + }) + + # 将CPK结果添加到统计数据中 + cpk_df = pd.DataFrame(cpk_results, index=stats.index) + stats = pd.concat([stats, cpk_df], axis=1) + + self._print_progress(f"所有 {len(stats)} 个分组CPK计算完成", 3) + return stats + + def _get_cpk_status_class(self, cpk_value): + """根据CPK值返回状态类别""" + if pd.isna(cpk_value): + return 'cpk-invalid' + elif cpk_value >= 1.33: + return 'cpk-excellent' + elif cpk_value >= 1.0: + return 'cpk-acceptable' + else: + return 'cpk-poor' + + def _create_html_report(self, stats, cpk_analysis): + """创建完整的HTML报告""" + self._print_progress("构建HTML报告内容...", 3) + + total_groups = len(stats) + + # 完整的HTML模板 + html_content = f""" + + + + 数据统计报告 - {self.filename} + + + +
+

📊 数据统计报告 - {self.filename}

+

生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

+

输入文件: {self.filename}

+ +
+

📈 报告摘要

+

总分组数量: {total_groups}

+

处理时间: {(datetime.now() - self.processing_start_time).total_seconds():.2f} 秒

+
+ + +
+""" + + # 添加CPK状态卡片 + for feature, status in cpk_analysis['cpk_status'].items(): + total = status['total'] + status['invalid'] + if total == 0: + continue + + html_content += f""" +
+

{feature} CPK状态

+
+ {status['excellent'] + status['acceptable']}/{total} +
+

合格率: {(status['excellent'] + status['acceptable']) / total * 100:.1f}%

+
+ 优秀: {status['excellent']} + 合格: {status['acceptable']} + 不合格: {status['poor']} + 无效: {status['invalid']} +
+
+""" + + html_content += f""" +
+ + + {f'

⚠️ 发现 {len(cpk_analysis["problematic_groups"])} 个问题分组

以下分组的CPK值低于1.0,需要重点关注

' if cpk_analysis['problematic_groups'] else ''} + +

📋 详细统计数据

+ +
+ 预设上下限 + 实测值 + CPK ≥ 1.33 + 1.0 ≤ CPK < 1.33 + CPK < 1.0 +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +""" + + # 生成表格行数据的辅助函数 + def format_value(value): + if pd.isna(value): + return 'N/A' + elif isinstance(value, (int, float)): + return f"{value:.4f}" + else: + return str(value) + + # 用于检查列是否存在的辅助函数 + def safe_get_value(row, column_name): + """安全获取列值,如果列不存在返回N/A""" + if column_name in row.index: + return row[column_name] + else: + return np.nan + + for group_key, row in stats.iterrows(): + # 检查是否为问题分组 + is_problematic = any(problem['group_key'] == group_key for problem in cpk_analysis['problematic_groups']) + row_class = 'class="problematic-row"' if is_problematic else '' + + html_content += f""" + + +""" + + # 为每个特征生成列 + for feature in ['Height', 'Volume', 'Area']: + cpk_value = safe_get_value(row, f'{feature}_Cpk') + cpk_class = self._get_cpk_status_class(cpk_value) + + # 为不同特征设置正确的列名 + if feature == 'Height': + lower_limit_col = 'Height_Low(mil)' + upper_limit_col = 'Height_High(mil)' + measured_min_col = 'Height_Measured_Min(mil)' + measured_max_col = 'Height_Measured_Max(mil)' + mean_col = 'Height_Mean(mil)' + std_col = 'Height_Std(mil)' + else: + lower_limit_col = f"{'Vol' if feature == 'Volume' else 'Area'}_Min(%)" # 修正:Volume使用Vol_Min(%),Area使用Area_Min(%) + upper_limit_col = f"{'Vol' if feature == 'Volume' else 'Area'}_Max(%)" # 修正:Volume使用Vol_Max(%),Area使用Area_Max(%) + measured_min_col = f'{feature}_Measured_Min(%)' + measured_max_col = f'{feature}_Measured_Max(%)' + mean_col = f'{feature}_Mean(%)' + std_col = f'{feature}_Std(%)' + + html_content += f""" + + + + + + + + +""" + + html_content += """ + """ + + html_content += """ + +
分组标识Height(mil)Volume(%)Area(%)
预设下限预设上限实测最小值实测最大值平均值标准差CPK预设下限预设上限实测最小值实测最大值平均值标准差CPK预设下限预设上限实测最小值实测最大值平均值标准差CPK
{group_key}{' ⚠️' if is_problematic else ''}{format_value(safe_get_value(row, lower_limit_col))}{format_value(safe_get_value(row, upper_limit_col))}{format_value(safe_get_value(row, measured_min_col))}{format_value(safe_get_value(row, measured_max_col))}{format_value(safe_get_value(row, mean_col))}{format_value(safe_get_value(row, std_col))}{format_value(cpk_value)}
+
+ +
+

📊 CPK状态分布

+
+""" + + # 添加简单的CPK分布图表 + for feature, status in cpk_analysis['cpk_status'].items(): + total = status['total'] + status['invalid'] + if total == 0: + continue + + html_content += f""" +
+

{feature} CPK分布

+
+
+
+
+
+
+
+
+ 优秀 {status['excellent']} | 合格 {status['acceptable']} | 不合格 {status['poor']} | 无效 {status['invalid']} +
+
+
+""" + + html_content += """ +
+
+
+ +""" + + # 保存报告 + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + report_filename = f"{os.path.splitext(self.filename)[0]}_report_{timestamp}.html" + report_path = os.path.join(self.file_dir, report_filename) + + self._print_progress(f"保存报告到: {report_path}", 3) + with open(report_path, 'w', encoding='utf-8') as f: + f.write(html_content) + + return report_path + + +def main(): + """主函数""" + print("=" * 60) + print("🚀 数据统计报告生成程序 - Volume上下限修复版") + print("=" * 60) + + processor = DataProcessor() + + try: + if processor.select_file(): + processor._load_data() + report_path = processor.generate_report() + + print("\n" + "=" * 60) + print("✅ 程序执行完成") + print(f"📄 统计报告生成成功: {report_path}") + print("=" * 60) + else: + print("❌ 未选择文件,程序退出") + + except Exception as e: + print(f"\n❌ 程序执行失败: {e}") + import traceback + traceback.print_exc() + + +if __name__ == "__main__": + main() diff --git a/htmlProcess/.gitignore b/htmlProcess/.gitignore new file mode 100644 index 0000000..00c07ca --- /dev/null +++ b/htmlProcess/.gitignore @@ -0,0 +1,17 @@ +/build/* +/build +/dist/* +/dist +/source/* +/source + + + + +htmlReportProcess_Merge_picHtml_V3.py + +htmlReportProcess_Merge_picHtml_V2.py + +htmlReportProcess_Merge_pic_V2.py + +/htmlReportProcess*/ \ No newline at end of file diff --git a/htmlProcess/README.md b/htmlProcess/README.md new file mode 100644 index 0000000..92d2fb2 --- /dev/null +++ b/htmlProcess/README.md @@ -0,0 +1,11 @@ +# Sample GitLab Project + +This sample project shows how a project in GitLab looks for demonstration purposes. It contains issues, merge requests and Markdown files in many branches, +named and filled with lorem ipsum. + +You can look around to get an idea how to structure your project and, when done, you can safely delete this project. 
+ +[Learn more about creating GitLab projects.](https://docs.gitlab.com/ee/gitlab-basics/create-project.html) + +html文件的报告自动分析和处理数据的工具脚本 + diff --git a/htmlProcess/htmlReportProcess_Merge_picHtml_V1.py b/htmlProcess/htmlReportProcess_Merge_picHtml_V1.py new file mode 100644 index 0000000..c571814 --- /dev/null +++ b/htmlProcess/htmlReportProcess_Merge_picHtml_V1.py @@ -0,0 +1,926 @@ +import os +import re +import sys +import time +import pandas as pd +import matplotlib.pyplot as plt +from datetime import datetime +from matplotlib.lines import Line2D +from typing import Optional, Tuple, List, Dict, Any, Union +from pathlib import Path +import numpy as np +import base64 +from io import BytesIO +from jinja2 import Template +from colorama import Fore, Style, init + +# 避免 SettingWithCopy 警告影响输出可读性 +pd.options.mode.chained_assignment = None + +# 设置中文字体支持 +plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'Microsoft YaHei'] +plt.rcParams['axes.unicode_minus'] = False + +# HTML模板 - 添加了SN独立图的显示 +HTML_TEMPLATE = """ + + + + + + 测试报告分析 - {{ keyword }} + + + +
+

📊 测试报告分析

+

关键词: {{ keyword }} | 生成时间: {{ timestamp }}

+

共分析 {{ test_count }} 个测试项,{{ total_points }} 个数据点

+
+ + {% for test in tests %} +
+
+
📋 {{ test.name }}
+
+ {{ test.status_display }} +
+
+ +
+
+
数据点数
+
{{ test.stats.count }}
+
+
+
平均值
+
{{ "%.4f"|format(test.stats.mean) }}
+
+
+
中位数
+
{{ "%.4f"|format(test.stats.median) }}
+
+
+
标准差
+
{{ "%.4f"|format(test.stats.std) }}
+
+
+
最小值
+
{{ "%.4f"|format(test.stats.min) }}
+
+
+
最大值
+
{{ "%.4f"|format(test.stats.max) }}
+
+
+ + {% if test.limits.lower is not none or test.limits.upper is not none %} +
+ {% if test.limits.lower is not none %} +
+
下限值
+
{{ "%.4f"|format(test.limits.lower) }}
+
+ {% endif %} + {% if test.limits.upper is not none %} +
+
上限值
+
{{ "%.4f"|format(test.limits.upper) }}
+
+ {% endif %} +
+ {% endif %} + + +
📈 汇总视图 (所有SN)
+
+ {{ test.name }} 汇总散点图 +
+ + + {% if test.sn_plot_images %} +
🔍 SN独立视图 ({{ test.sn_plot_images|length }}个SN)
+
+ {% for sn_plot in test.sn_plot_images %} +
+
SN: {{ sn_plot.sn }}
+ {{ test.name }} - SN {{ sn_plot.sn }} 散点图 +
+ {% endfor %} +
+ {% endif %} +
+ {% endfor %} + +
+

📈 分析摘要

+
+ 文件路径: {{ file_path }} +
+
+ 分析时间: {{ analysis_time }}秒 +
+
+ 测试项分布: + +
+
+ +
+ 报告生成于 {{ timestamp }} | 测试报告分析系统 +
+ + +""" + + +class TestReportScatterPlotter: + def __init__(self): + self.file_path: Optional[str] = None + self.df: Optional[pd.DataFrame] = None + self.output_dir: Optional[str] = None + self.required_columns = ["Test Name New", "SN", "Measurement", "Test Time", "Lower Limit", "Upper Limit", ] + self.col_lower: Optional[str] = None + self.col_upper: Optional[str] = None + self.html_report_path: Optional[str] = None + + # 缓存处理过的数据 + self._processed_data_cache: Dict[str, Any] = {} + + def _print_stage(self, msg: str) -> None: + """统一的阶段信息输出""" + print(f"\n{'=' * 30}\n{msg}\n{'=' * 30}") + + def _print_progress(self, current: int, total: int, prefix: str = "进度") -> None: + """改进的进度条显示""" + if total <= 0: + return + + percent = (current / total) * 100 + bar_len = 30 + filled = int(bar_len * current / total) + bar = "█" * filled + "-" * (bar_len - filled) + sys.stdout.write(f"\r{prefix}: [{bar}] {current}/{total} ({percent:.1f}%)") + sys.stdout.flush() + if current == total: + print() # 换行 + + def get_file_path(self) -> None: + """改进的文件路径获取,支持路径补全""" + self._print_stage("输入文件路径") + + while True: + print(f"{Fore.WHITE}请输入测试报告文件路径(.xlsx): ") + file_path = input("> ").strip() + + # 尝试路径补全和验证 + if not file_path: + continue + + path_obj = Path(file_path) + if path_obj.exists(): + self.file_path = str(path_obj.resolve()) + print(f"已选择文件: {self.file_path}") + break + else: + print(f"文件不存在: {file_path},请重新输入") + + def _find_column_case_insensitive(self, candidates: List[str]) -> Optional[str]: + """优化的大小写不敏感列查找""" + if self.df is None: + return None + + columns_lower = {col.lower().strip(): col for col in self.df.columns} + for candidate in candidates: + key = candidate.lower().strip() + if key in columns_lower: + return columns_lower[key] + return None + + def load_data(self) -> None: + """优化的数据加载方法""" + self._print_stage("加载数据") + start_time = time.time() + + # 检查文件是否存在 + if not os.path.exists(self.file_path): + raise FileNotFoundError(f"文件不存在: {self.file_path}") + + # 
根据文件扩展名选择最优引擎 + file_ext = self.file_path.lower() + if file_ext.endswith('.xlsx'): + # .xlsx 文件引擎选择优先级 + engine_options = ['openpyxl', 'calamine'] # calamine需要安装并可能更快 + engine = 'openpyxl' # 默认 + elif file_ext.endswith('.xls'): + # .xls 文件引擎选择 + engine_options = ['xlrd', 'calamine'] + engine = 'xlrd' # 默认 + else: + raise ValueError("输入文件不是有效的 Excel 文件(应为 .xls 或 .xlsx 格式)") + + # 快速获取工作表名称(轻量级方式) + try: + if engine == 'openpyxl': + import openpyxl + workbook = openpyxl.load_workbook(self.file_path, read_only=True) + sheet_names = workbook.sheetnames + workbook.close() + elif engine == 'xlrd': + import xlrd + workbook = xlrd.open_workbook(self.file_path, on_demand=True) + sheet_names = workbook.sheet_names() + workbook.release_resources() + else: + # 使用pandas的轻量级方式 + excel_file = pd.ExcelFile(self.file_path, engine=engine) + sheet_names = excel_file.sheet_names + except Exception as e: + raise RuntimeError(f"无法打开 Excel 文件,请确认该文件未被损坏或占用。错误: {type(e).__name__}: {e}") + + # 定义优先查找的工作表名 + target_sheets = ["Merged All Tests", "All Tests"] + selected_sheet = None + + for sheet in target_sheets: + if sheet in sheet_names: + selected_sheet = sheet + break + + if selected_sheet is None: + raise ValueError( + f"未找到指定的工作表: {' 或 '.join(target_sheets)}。" + f"当前文件包含的工作表有: {sheet_names}" + ) + + try: + # 性能优化:使用更高效的参数设置 + read_excel_kwargs = { + # 'filepath_or_buffer': self.file_path, + 'io': self.file_path, # 修正:使用'io'而不是'filepath_or_buffer' + 'sheet_name': selected_sheet, + 'engine': engine, + 'dtype': 'object', # 先统一读取为对象类型,减少类型推断时间 + 'na_filter': False, # 禁用自动NA过滤,提高读取速度 + } + + # 如果知道必需列,且不为空,则只读取需要的列 + if hasattr(self, 'required_columns') and self.required_columns: + # 先检查哪些列实际存在 + try: + # 轻量级检查列名是否存在 + sample_df = pd.read_excel( + self.file_path, + sheet_name=selected_sheet, + engine=engine, + nrows=1 # 只读取第一行来获取列名 + ) + existing_columns = [col for col in self.required_columns if col in sample_df.columns] + + if len(existing_columns) < len(self.required_columns): + missing 
= set(self.required_columns) - set(existing_columns) + raise KeyError(f"缺少必要列: {list(missing)}") + + read_excel_kwargs['usecols'] = existing_columns + + # print(f"使用 read_excel_kwargs 读取excel:\n {read_excel_kwargs}") + # 打印完整的参数信息(调试用) + print("使用 read_excel_kwargs 读取excel:") + for key, value in read_excel_kwargs.items(): + print(f" {key}: {repr(value)}") # 使用repr确保特殊字符正确显示 + + except Exception as e: + print(f"列检查失败,将读取所有列: {e}") + # 如果列检查失败,回退到读取所有列 + + + # 执行数据读取 + self._print_stage("执行数据读取") + self.df = pd.read_excel(**read_excel_kwargs) + + except Exception as e: + # 如果默认引擎失败,尝试备选引擎 + print(f"引擎 {engine} 读取失败,尝试备选引擎...\n{e}") + try: + # 回退到基本的读取方式 + self.df = pd.read_excel( + self.file_path, + sheet_name=selected_sheet, + engine=None # 让pandas自动选择 + ) + except Exception as fallback_e: + raise RuntimeError( + f"读取 Excel 失败,工作表: '{selected_sheet}'。" + f"主引擎错误: {type(e).__name__}: {e}\n" + f"备选引擎错误: {type(fallback_e).__name__}: {fallback_e}" + ) + + if self.df.empty: + raise ValueError("工作表为空,无法处理") + + # 校验必要列(如果前面没有使用usecols过滤,这里需要再次检查) + if hasattr(self, 'required_columns') and self.required_columns: + missing_columns = [col for col in self.required_columns if col not in self.df.columns] + if missing_columns: + raise KeyError(f"缺少必要列: {missing_columns}") + + # 记录上下限列名 + self.col_lower = self._find_column_case_insensitive([ + "Lower Limit", "lower limit", "lower_limit", "ll", "lower" + ]) + self.col_upper = self._find_column_case_insensitive([ + "Upper Limit", "upper limit", "upper_limit", "ul", "upper" + ]) + + loading_time = time.time() - start_time + print(f"数据加载完成: {len(self.df)} 行 × {self.df.shape[1]} 列") + print(f"使用引擎: {engine}") + print(f"耗时: {loading_time:.2f}s") + + # 显示列信息摘要 + print(f"检测到下限列: {self.col_lower or '无'}") + print(f"检测到上限列: {self.col_upper or '无'}") + + # 可选:类型转换(如果知道具体的数据类型) + # self._convert_data_types() + + # 可以添加这个方法进行类型转换优化 + def _convert_data_types(self): + """优化数据类型转换""" + if self.df is None or self.df.empty: + return + + # 
根据列名模式推断数据类型 + numeric_patterns = ['limit', 'value', 'measure', 'result', 'score'] + date_patterns = ['date', 'time', 'period'] + + for col in self.df.columns: + col_lower = str(col).lower() + + # 数值类型转换 + if any(pattern in col_lower for pattern in numeric_patterns): + self.df[col] = pd.to_numeric(self.df[col], errors='coerce') + # 日期类型转换 + elif any(pattern in col_lower for pattern in date_patterns): + self.df[col] = pd.to_datetime(self.df[col], errors='coerce') + + + def get_keyword(self) -> Tuple[pd.DataFrame, str, List[str]]: + """获取用户输入的关键词并筛选数据""" + self._print_stage("筛选关键词") + + while True: + keyword = input("请输入筛选关键词(匹配 'Test Name New'): ").strip() + + if not keyword: + print("❌ 关键词不能为空,请重新输入") + continue + + # 检查数据框是否为空 + if self.df.empty: + print("⚠️ 数据框为空,无法进行筛选") + return pd.DataFrame(), keyword, [] + + # 检查列是否存在 + if "Test Name New" not in self.df.columns: + print("❌ 列 'Test Name New' 不存在于数据框中") + print(f"可用列: {list(self.df.columns)}") + return pd.DataFrame(), keyword, [] + + try: + mask = self.df["Test Name New"].astype(str).str.contains(keyword, case=False, na=False) + filtered_df = self.df.loc[mask].copy() + + if filtered_df.empty: + # 提供友好的提示和建议,而不是直接抛出异常 + print(f"⚠️ 没有找到包含关键词 '{keyword}' 的测试项") + + # 显示部分可用的测试项作为参考 + available_tests = self.df["Test Name New"].dropna().unique() + if len(available_tests) > 0: + print("📋 可用的测试项示例:") + for test in available_tests[:5]: # 只显示前5个作为参考 + print(f" - {test}") + if len(available_tests) > 5: + print(f" ... 
def create_output_dir(self, keyword) -> None:
    """Create the report directory and compute the HTML report path.

    The directory ``scatter_report_out`` is created next to the input file
    and ``self.output_dir`` / ``self.html_report_path`` are set.

    Args:
        keyword: Filter keyword used as the report file-name prefix.  It is
            passed through ``_safe_filename`` first, because the keyword is
            free text (and is also used as a match pattern), so it may hold
            characters such as ``/``, ``|`` or ``*`` that are not valid in a
            file name.  An empty keyword (the "use all data" case) is kept
            empty so existing file names do not change.

    Raises:
        ValueError: If ``self.file_path`` has not been set.
    """
    self._print_stage("创建输出目录")

    if not self.file_path:
        raise ValueError("文件路径未设置")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_dir = os.path.dirname(self.file_path)
    self.output_dir = os.path.join(base_dir, "scatter_report_out")

    # Keywords that were already file-system safe yield the same name as before.
    safe_keyword = "" if keyword == "" else self._safe_filename(keyword)
    self.html_report_path = os.path.join(
        self.output_dir, f"{safe_keyword}_report_{timestamp}.html"
    )

    os.makedirs(self.output_dir, exist_ok=True)
    print(f"输出目录: {self.output_dir}")
self.col_upper and self.col_upper in df_one_test.columns: + upper_vals = self._clean_and_convert_series(df_one_test[self.col_upper], 'numeric').dropna().unique() + upper_set = sorted(upper_vals.tolist()) if len(upper_vals) > 0 else [] + if upper_set: + upper_plot = max(upper_set) + + return lower_plot, upper_plot, lower_set, upper_set + + @staticmethod + def _clean_and_convert_series(series: pd.Series, target_type: str = 'numeric') -> pd.Series: + """统一的系列清洗和转换方法 - 修复了 ast 方法名错误""" + if series.empty: + return series + + if target_type == 'numeric': + # 数值转换优化 + if pd.api.types.is_numeric_dtype(series): + return series.astype(float) + + # 批量字符串处理 - 修复这里的问题 + cleaned = series.astype(str).str.replace(r'[, ]', '', regex=True).str.strip() + return pd.to_numeric(cleaned, errors='coerce') + + elif target_type == 'datetime': + return TestReportScatterPlotter._convert_to_datetime(series) + + return series + + @staticmethod + def _convert_to_datetime(series: pd.Series) -> pd.Series: + """优化的日期时间转换""" + if pd.api.types.is_datetime64_any_dtype(series): + return series + + # 预处理:转换为数值和字符串两种形式 + numeric_series = pd.to_numeric(series, errors='coerce') + string_series = series.astype(str).str.strip() + + result = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns]') + + # 数值时间戳处理 + masks = { + 'ms': numeric_series >= 1e11, + 's': (numeric_series >= 1e9) & (numeric_series < 1e11), + 'excel': (numeric_series > 20000) & (numeric_series < 60000) + } + + for mask_type, mask in masks.items(): + if mask.any(): + if mask_type == 'ms': + result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='ms') + elif mask_type == 's': + result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='s') + elif mask_type == 'excel': + origin = pd.Timestamp('1899-12-30') + result.loc[mask] = origin + pd.to_timedelta(numeric_series.loc[mask], unit='D') + + # 字符串日期处理 + remaining_mask = result.isna() + if remaining_mask.any(): + remaining_strings = string_series.loc[remaining_mask] + + # 
特定格式优先处理 + format_patterns = [ + (r'^\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}$', '%Y-%m-%d %H-%M-%S'), + ] + + for pattern, date_format in format_patterns: + format_mask = remaining_strings.str.match(pattern) + if format_mask.any(): + result.loc[remaining_mask[remaining_mask].index[format_mask]] = pd.to_datetime( + remaining_strings.loc[format_mask], format=date_format, errors='coerce' + ) + + # 通用解析 + still_na_mask = result.isna() & remaining_mask + if still_na_mask.any(): + result.loc[still_na_mask] = pd.to_datetime( + string_series.loc[still_na_mask], errors='coerce' + ) + + return result + + def _preprocess_test_data(self, test_data: pd.DataFrame) -> pd.DataFrame: + """数据预处理""" + # 数值转换 + test_data['Measurement_num'] = self._clean_and_convert_series( + test_data['Measurement'], 'numeric' + ) + test_data['TestTime_dt'] = self._clean_and_convert_series( + test_data['Test Time'], 'datetime' + ) + + # 去除无效数据 + valid_data = test_data.dropna(subset=['Measurement_num', 'TestTime_dt']) + return valid_data.sort_values('TestTime_dt') + + def _calculate_statistics(self, y_data: pd.Series) -> Dict[str, float]: + """计算统计信息""" + stats = { + 'count': len(y_data), + 'mean': y_data.mean(), + 'median': y_data.median(), + 'min': y_data.min(), + 'max': y_data.max(), + 'std': y_data.std(), + 'q1': y_data.quantile(0.25), + 'q3': y_data.quantile(0.75) + } + return stats + + def _plot_to_base64(self, fig) -> str: + """将图表转换为base64编码""" + buf = BytesIO() + fig.savefig(buf, format='png', dpi=150, bbox_inches='tight') + buf.seek(0) + img_str = base64.b64encode(buf.read()).decode('utf-8') + plt.close(fig) + return img_str + + def _create_summary_plot(self, test_data: pd.DataFrame, test_name: str, + lower_plot: Optional[float], upper_plot: Optional[float]) -> str: + """创建汇总图(所有SN在一个图中)""" + fig, ax = plt.subplots(figsize=(12, 8)) + + # 分组绘制 + groups = list(test_data.groupby("SN")) if "SN" in test_data.columns else [("Unknown_SN", test_data)] + for sn, group in groups: + 
ax.scatter(group['TestTime_dt'], group['Measurement_num'], + label=str(sn), alpha=0.7, s=25) + + # 计算统计信息 + y_data = test_data['Measurement_num'] + stats = self._calculate_statistics(y_data) + + # 绘制限值线和统计线 + x_min, x_max = test_data['TestTime_dt'].min(), test_data['TestTime_dt'].max() + + if lower_plot is not None: + ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2, label="Lower Limit") + if upper_plot is not None: + ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2, label="Upper Limit") + + # 添加统计线 + ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, colors='orange', + linestyles='-', linewidth=1.5, alpha=0.7, label='Mean') + ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, colors='purple', + linestyles='-.', linewidth=1.5, alpha=0.7, label='Median') + + # 设置图形属性 + ax.set_title(f"汇总图 - {test_name}") + ax.set_xlabel("Test Time") + ax.set_ylabel("Measurement Value") + ax.grid(True, alpha=0.3) + ax.tick_params(axis='x', rotation=45) + ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + + return self._plot_to_base64(fig) + + def _create_sn_plots(self, test_data: pd.DataFrame, test_name: str, + lower_plot: Optional[float], upper_plot: Optional[float]) -> List[Dict[str, str]]: + """为每个SN创建独立图表""" + sn_plots = [] + + if "SN" not in test_data.columns: + return sn_plots + + sn_groups = test_data.groupby("SN") + + for sn, group in sn_groups: + if group.empty: + continue + + fig, ax = plt.subplots(figsize=(10, 6)) + + # 绘制当前SN的数据点 + ax.scatter(group['TestTime_dt'], group['Measurement_num'], + color='blue', alpha=0.7, s=30, label=f"SN: {sn}") + + # 计算当前SN的统计信息 + y_data = group['Measurement_num'] + stats = self._calculate_statistics(y_data) + + # 绘制限值线 + x_min, x_max = group['TestTime_dt'].min(), group['TestTime_dt'].max() + + if lower_plot is not None: + ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2, label="Lower Limit") + if upper_plot is not None: + ax.axhline(y=upper_plot, color='red', linestyle='--', 
linewidth=1.2, label="Upper Limit") + + # 添加统计线 + ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, colors='orange', + linestyles='-', linewidth=1.5, alpha=0.7, label='Mean') + ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, colors='purple', + linestyles='-.', linewidth=1.5, alpha=0.7, label='Median') + + # 设置图形属性 + ax.set_title(f"SN独立图 - {test_name} (SN: {sn})") + ax.set_xlabel("Test Time") + ax.set_ylabel("Measurement Value") + ax.grid(True, alpha=0.3) + ax.tick_params(axis='x', rotation=45) + ax.legend() + + # 转换为base64 + plot_image = self._plot_to_base64(fig) + sn_plots.append({"sn": str(sn), "image": plot_image}) + + return sn_plots + + def _determine_test_status(self, stats: Dict[str, float], + lower_limit: Optional[float], + upper_limit: Optional[float]) -> Dict[str, Any]: + """确定测试状态""" + status = "success" + status_display = "正常" + + if lower_limit is not None and upper_limit is not None: + # 检查是否超出限值 + if stats['min'] < lower_limit or stats['max'] > upper_limit: + status = "danger" + status_display = "异常" + elif (stats['mean'] < lower_limit * 1.1 or stats['mean'] > upper_limit * 0.9 or + stats['std'] > (upper_limit - lower_limit) * 0.2): + status = "warning" + status_display = "警告" + + return {"status": status, "status_display": status_display} + + def generate_html_report(self, filtered_df: pd.DataFrame, keyword: str, + unique_tests: List[str]) -> None: + """生成HTML报告""" + self._print_stage("生成HTML报告") + start_time = time.time() + + test_results = [] + total_points = 0 + status_counts = {"success": 0, "warning": 0, "danger": 0} + + for i, test_name in enumerate(unique_tests, 1): + self._print_progress(i, len(unique_tests), "生成测试报告") + + # 获取测试数据 + test_data = filtered_df[filtered_df["Test Name New"] == test_name].copy() + test_data = self._preprocess_test_data(test_data) + + if test_data.empty: + continue + + # 提取限值信息 + lower_plot, upper_plot, _, _ = self._extract_limits(test_data) + + # 计算统计信息 + y_data = test_data['Measurement_num'] + stats = 
def run(self) -> None:
    """Run the interactive report workflow.

    Asks for the input file, loads it, then repeatedly asks for a keyword
    and writes one HTML report per keyword.  The loop ends when
    ``get_keyword`` returns no test names (the user chose to quit, or the
    data cannot be filtered); previously an empty report was written and
    the prompt repeated forever.

    Ctrl+C ends the program with a short notice.  Any other error is
    printed with its traceback and the process exits with status 1.
    """
    try:
        self.get_file_path()
        self.load_data()
        while True:
            filtered_df, keyword, unique_tests = self.get_keyword()
            if not unique_tests:
                # Nothing selected: stop instead of rendering an empty report.
                break
            self.create_output_dir(keyword)
            self.generate_html_report(filtered_df, keyword, unique_tests)
            print("\n✅ 分析完成!")
    except KeyboardInterrupt:
        print(f"\n{Fore.YELLOW}⚠ 用户中断程序")
    except Exception as e:
        print(f"\n❌ 发生错误: {type(e).__name__}: {str(e)}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
TestReportScatterPlotter() + plotter.run() diff --git a/htmlProcess/htmlReportProcess_Merge_pic_V1.py b/htmlProcess/htmlReportProcess_Merge_pic_V1.py new file mode 100644 index 0000000..802b798 --- /dev/null +++ b/htmlProcess/htmlReportProcess_Merge_pic_V1.py @@ -0,0 +1,563 @@ +import os +import re +import sys +import time +import pandas as pd +import matplotlib.pyplot as plt +from datetime import datetime +from matplotlib.lines import Line2D +from typing import Optional, Tuple, List, Dict, Any, Union +from pathlib import Path +import numpy as np + +from colorama import Fore, Style, init + +# 避免 SettingWithCopy 警告影响输出可读性 +pd.options.mode.chained_assignment = None + +# 设置中文字体支持 +plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'Microsoft YaHei'] +plt.rcParams['axes.unicode_minus'] = False + + +class TestReportScatterPlotter: + def __init__(self): + self.file_path: Optional[str] = None + self.df: Optional[pd.DataFrame] = None + self.output_dir: Optional[str] = None + self.required_columns = ["Test Name New", "SN", "Measurement", "Test Time"] + self.col_lower: Optional[str] = None + self.col_upper: Optional[str] = None + + # 缓存处理过的数据 + self._processed_data_cache: Dict[str, Any] = {} + + def _print_stage(self, msg: str) -> None: + """统一的阶段信息输出""" + print(f"\n{'=' * 30}\n{msg}\n{'=' * 30}") + + def _print_progress(self, current: int, total: int, prefix: str = "进度") -> None: + """改进的进度条显示""" + if total <= 0: + return + + percent = (current / total) * 100 + bar_len = 30 + filled = int(bar_len * current / total) + bar = "█" * filled + "-" * (bar_len - filled) + sys.stdout.write(f"\r{prefix}: [{bar}] {current}/{total} ({percent:.1f}%)") + sys.stdout.flush() + if current == total: + print() # 换行 + + def get_file_path(self) -> None: + """改进的文件路径获取,支持路径补全""" + self._print_stage("输入文件路径") + + while True: + print(f"{Fore.WHITE}请输入测试报告文件路径(.xlsx): ") + file_path = input("> ").strip() + + # 尝试路径补全和验证 + if not file_path: + continue + + path_obj = 
Path(file_path) + if path_obj.exists(): + self.file_path = str(path_obj.resolve()) + print(f"已选择文件: {self.file_path}") + break + else: + print(f"文件不存在: {file_path},请重新输入") + + def _find_column_case_insensitive(self, candidates: List[str]) -> Optional[str]: + """优化的大小写不敏感列查找""" + if self.df is None: + return None + + columns_lower = {col.lower().strip(): col for col in self.df.columns} + for candidate in candidates: + key = candidate.lower().strip() + if key in columns_lower: + return columns_lower[key] + return None + + def load_data(self) -> None: + """优化的数据加载方法""" + self._print_stage("加载数据") + start_time = time.time() + + # try: + # # 使用更高效的数据读取方式 + # self.df = pd.read_excel( + # self.file_path, + # sheet_name="Merged All Tests", + # engine='openpyxl' # 指定引擎提高性能 + # ) + # except Exception as e: + # raise RuntimeError( + # f"读取 Excel 失败,请确认工作表名为 'Merged All Tests'。错误: {type(e).__name__}: {e}" + # ) + + # 检查文件是否存在 + if not os.path.exists(self.file_path): + raise FileNotFoundError(f"文件不存在: {self.file_path}") + + # 检查文件扩展名是否为Excel支持的格式 + if not self.file_path.lower().endswith(('.xls', '.xlsx')): + raise ValueError("输入文件不是有效的 Excel 文件(应为 .xls 或 .xlsx 格式)") + + try: + # 打开Excel文件并获取所有sheet名称 + excel_file = pd.ExcelFile(self.file_path, engine='openpyxl') + sheet_names = excel_file.sheet_names + except Exception as e: + raise RuntimeError(f"无法打开 Excel 文件,请确认该文件未被损坏或占用。错误: {type(e).__name__}: {e}") + + # 定义优先查找的工作表名 + target_sheets = ["Merged All Tests", "All Tests"] + selected_sheet = None + + for sheet in target_sheets: + if sheet in sheet_names: + selected_sheet = sheet + break + + if selected_sheet is None: + raise ValueError( + f"未找到指定的工作表: {' 或 '.join(target_sheets)}。" + f"当前文件包含的工作表有: {sheet_names}" + ) + + try: + # 使用更高效的方式读取指定sheet + self.df = pd.read_excel( + self.file_path, + sheet_name=selected_sheet, + engine='openpyxl' + ) + except Exception as e: + raise RuntimeError( + f"读取 Excel 失败,工作表: '{selected_sheet}'。错误: {type(e).__name__}: {e}" + ) + + if 
self.df.empty: + raise ValueError("工作表为空,无法处理") + + # 校验必要列 + missing_columns = [col for col in self.required_columns if col not in self.df.columns] + if missing_columns: + raise KeyError(f"缺少必要列: {missing_columns}") + + # 记录上下限列名 + self.col_lower = self._find_column_case_insensitive([ + "Lower Limit", "lower limit", "lower_limit", "ll", "lower" + ]) + self.col_upper = self._find_column_case_insensitive([ + "Upper Limit", "upper limit", "upper_limit", "ul", "upper" + ]) + + loading_time = time.time() - start_time + print(f"数据加载完成: {len(self.df)} 行 × {self.df.shape[1]} 列") + print(f"耗时: {loading_time:.2f}s") + + # 显示列信息摘要 + print(f"检测到下限列: {self.col_lower or '无'}") + print(f"检测到上限列: {self.col_upper or '无'}") + + def get_keyword(self) -> Tuple[pd.DataFrame, str, List[str]]: + """获取用户输入的关键词并筛选数据""" + self._print_stage("筛选关键词") + + while True: + keyword = input("请输入筛选关键词(匹配 'Test Name New'): ").strip() + if not keyword: + print("关键词不能为空,请重新输入") + continue + break + + mask = self.df["Test Name New"].astype(str).str.contains(keyword, case=False, na=False) + filtered_df = self.df.loc[mask].copy() + + if filtered_df.empty: + raise ValueError(f"没有找到包含关键词 '{keyword}' 的测试项") + + unique_tests = filtered_df["Test Name New"].unique().tolist() + print(f"匹配到 {len(filtered_df)} 行数据,涉及 {len(unique_tests)} 个不同测试项") + return filtered_df, keyword, unique_tests + + def create_output_dir(self) -> None: + """创建输出目录""" + self._print_stage("创建输出目录") + + if not self.file_path: + raise ValueError("文件路径未设置") + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + base_dir = os.path.dirname(self.file_path) + self.output_dir = os.path.join(base_dir, f"scatter_plots_{timestamp}") + + os.makedirs(self.output_dir, exist_ok=True) + print(f"输出目录: {self.output_dir}") + + @staticmethod + def _safe_filename(name: str) -> str: + """生成安全的文件名""" + safe = "".join(c for c in str(name) if c.isalnum() or c in (" ", "_", "-")).strip() + return safe or "Unknown_Test" + + def _extract_limits(self, df_one_test: 
@staticmethod
def _convert_to_datetime(series: pd.Series) -> pd.Series:
    """Convert a column of mixed timestamp representations to datetimes.

    Args:
        series: Raw values; may hold numbers, strings, or already be a
            datetime column (returned unchanged in that case).

    Returns:
        A ``datetime64[ns]`` series on the same index.  Entries that match
        none of the rules below, or fail to parse, are ``NaT``.

    Rules, applied in this order:
        1. Numeric value >= 1e11: parsed with ``unit='ms'``.
        2. Numeric value in [1e9, 1e11): parsed with ``unit='s'``.
        3. Numeric value strictly between 20000 and 60000: treated as a day
           count added to 1899-12-30 (the mask is labelled ``'excel'``).
        4. Strings shaped like ``YYYY-MM-DD HH-MM-SS`` (hyphens in the time
           part): parsed with the explicit format ``%Y-%m-%d %H-%M-%S``.
        5. Everything still unset: generic ``pd.to_datetime`` parsing.
    """
    if pd.api.types.is_datetime64_any_dtype(series):
        return series

    # Pre-compute both a numeric view and a stripped string view of the data.
    numeric_series = pd.to_numeric(series, errors='coerce')
    string_series = series.astype(str).str.strip()

    # Start with all-NaT and fill in whatever each rule can resolve.
    result = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns]')

    # Numeric timestamps: the magnitude decides how the number is interpreted.
    masks = {
        'ms': numeric_series >= 1e11,
        's': (numeric_series >= 1e9) & (numeric_series < 1e11),
        'excel': (numeric_series > 20000) & (numeric_series < 60000)
    }

    for mask_type, mask in masks.items():
        if mask.any():
            if mask_type == 'ms':
                result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='ms')
            elif mask_type == 's':
                result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='s')
            elif mask_type == 'excel':
                origin = pd.Timestamp('1899-12-30')
                result.loc[mask] = origin + pd.to_timedelta(numeric_series.loc[mask], unit='D')

    # String dates: only entries the numeric rules left unset are considered.
    remaining_mask = result.isna()
    if remaining_mask.any():
        remaining_strings = string_series.loc[remaining_mask]

        # Explicit formats are tried first, before generic parsing.
        format_patterns = [
            (r'^\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}$', '%Y-%m-%d %H-%M-%S'),
        ]

        for pattern, date_format in format_patterns:
            format_mask = remaining_strings.str.match(pattern)
            if format_mask.any():
                # remaining_mask[remaining_mask].index is the index of the
                # unresolved rows; format_mask then selects the matching ones.
                result.loc[remaining_mask[remaining_mask].index[format_mask]] = pd.to_datetime(
                    remaining_strings.loc[format_mask], format=date_format, errors='coerce'
                )

        # Generic parsing for whatever is still unresolved.
        still_na_mask = result.isna() & remaining_mask
        if still_na_mask.any():
            result.loc[still_na_mask] = pd.to_datetime(
                string_series.loc[still_na_mask], errors='coerce'
            )

    return result
( + f"Count: {stats['count']}\n" + f"Mean: {stats['mean']:.4f}\n" + f"Median: {stats['median']:.4f}\n" + f"Min: {stats['min']:.4f}\n" + f"Max: {stats['max']:.4f}\n" + f"Std: {stats['std']:.4f}\n" + f"Q1: {stats['q1']:.4f}\n" + f"Q3: {stats['q3']:.4f}" + ) + + # 添加文本框到右侧,使用英文字体 + props = dict(boxstyle='round', facecolor='wheat', alpha=0.8) + ax.text(x_pos, y_pos, stats_text, transform=ax.transAxes, fontsize=8, + verticalalignment='top', horizontalalignment='left', # 左对齐 + bbox=props, fontfamily='monospace') + + def _add_statistics_lines(self, ax, stats: Dict[str, float], + x_min: float, x_max: float) -> None: + """添加统计线到图表""" + # 添加平均值线 + ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, + colors='orange', linestyles='-', linewidth=1.5, alpha=0.7, label='Mean') + + # 添加中位数线 + ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, + colors='purple', linestyles='-.', linewidth=1.5, alpha=0.7, label='Median') + + # 添加Q1和Q3线 + ax.hlines(y=stats['q1'], xmin=x_min, xmax=x_max, + colors='gray', linestyles=':', linewidth=1.0, alpha=0.7, label='Q1') + ax.hlines(y=stats['q3'], xmin=x_min, xmax=x_max, + colors='gray', linestyles=':', linewidth=1.0, alpha=0.7, label='Q3') + + def _configure_plot(self, ax, test_data: pd.DataFrame, test_name: str, + lower_plot: Optional[float], upper_plot: Optional[float]) -> None: + """配置图形属性""" + # 计算统计信息 + y_data = test_data['Measurement_num'] + stats = self._calculate_statistics(y_data) + + # 获取时间范围用于统计线 + x_min = test_data['TestTime_dt'].min() + x_max = test_data['TestTime_dt'].max() + + # Y轴范围计算 + y_min, y_max = y_data.min(), y_data.max() + y_candidates = [y_min, y_max] + + # 绘制限值线 + custom_lines = [] + if lower_plot is not None: + y_candidates.append(lower_plot) + ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2) + custom_lines.append(Line2D([0], [0], color='green', linestyle='--', label="Lower Limit")) + + if upper_plot is not None: + y_candidates.append(upper_plot) + ax.axhline(y=upper_plot, color='red', 
def _save_plot(self, fig, test_name: str) -> None:
    """Write ``fig`` as a PNG into the output directory and close it.

    Args:
        fig: Figure to save.
        test_name: Test name; sanitised with ``_safe_filename`` and combined
            with the current timestamp to form the file name.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = os.path.join(
        self.output_dir,
        f"{self._safe_filename(test_name)}_{stamp}.png",
    )

    # bbox_inches='tight' makes the saved image include artists placed
    # outside the axes area.
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"已保存: {output_path}")
def run(self) -> None:
    """Run the interactive scatter-plot workflow once.

    Steps: ask for the input file, load it, ask for a filter keyword,
    create the output directory, and draw one scatter plot per test.

    Ctrl+C (for example at an input prompt) ends the run with a short
    notice instead of a raw traceback.  Any other error is printed with its
    traceback and the process exits with status 1.
    """
    try:
        self.get_file_path()
        self.load_data()
        filtered_df, _keyword, unique_tests = self.get_keyword()
        self.create_output_dir()
        self.plot_scatter(filtered_df, unique_tests)

    except KeyboardInterrupt:
        # KeyboardInterrupt is not an Exception subclass, so it needs its
        # own handler to be reported cleanly.
        print("\n⚠ 用户中断程序")
    except Exception as e:
        print(f"\n❌ 发生错误: {type(e).__name__}: {str(e)}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
def process_row(self, row, filename):
    """Merge one BOM row into the consolidated master data.

    Args:
        row: One row of a cleaned BOM sheet (column names already
            normalised, e.g. ``Part_Type``).
        filename: Name of the file the row came from; used to tag
            quantities and inconsistency messages.

    Side effects:
        * Registers the part in ``self.master_data`` on first sight, keyed
          by ``Partnumber`` (falling back to ``MF_PN`` when that is empty).
          Rows with neither are ignored.
        * Appends a message to the part's ``inconsistencies`` list for every
          descriptive field that differs from the first-seen value, and when
          the number of references differs from ``Quantity``.
        * Records the row's quantity for ``filename`` (0 when it cannot be
          parsed) in ``self.file_quantities`` and in the part's
          ``quantity_data``.
        * Increments ``self.inconsistency_count`` once per part, at the
          moment its first inconsistency is recorded.
    """
    # Pick the merge key.
    partnumber = row['Partnumber']
    key = partnumber if pd.notna(partnumber) and partnumber != '' else row['MF_PN']
    if pd.isna(key) or key == '':
        return

    # First time this part is seen: store its reference attributes.
    if key not in self.master_data:
        self.master_data[key] = {
            'Partnumber': row['Partnumber'],
            'Purchase_Code': row['Purchase_Code'],
            'MF_PN': row['MF_PN'],
            'Description': row.get('Description', ''),
            'Part_Type': row.get('Part_Type', ''),
            'MF_NAME': row.get('MF_NAME', ''),
            'PCB_Footprint': row.get('PCB_Footprint', ''),
            'quantity_data': {},  # quantity per source file
            'inconsistencies': []  # human-readable inconsistency messages
        }

    current_data = self.master_data[key]
    had_inconsistencies = bool(current_data['inconsistencies'])

    # Compare descriptive fields with the first-seen values.  Column names
    # were normalised ("Part Type" -> "Part_Type"), so the normalised name is
    # looked up first; the spaced spelling is only a fallback for rows that
    # were not normalised.
    legacy_names = {'Part_Type': 'Part Type'}
    fields_to_check = ['Purchase_Code', 'MF_PN', 'Part_Type', 'MF_NAME', 'PCB_Footprint']

    for field in fields_to_check:
        fallback = row.get(legacy_names.get(field, field), '')
        current_val = str(current_data[field])
        new_val = str(row.get(field, fallback))

        # Empty values and stringified missing values carry no information.
        if new_val in ['', 'nan', 'NaN', 'NaT']:
            continue

        if current_val != new_val:
            current_data['inconsistencies'].append(
                f"{field}不一致: {current_val} ≠ {new_val} (文件: {filename})"
            )

    # Count the non-empty, comma-separated reference designators.
    ref_count = 0
    if pd.notna(row['Reference']) and row['Reference'] != '':
        ref_count = sum(1 for ref in str(row['Reference']).split(',') if ref.strip() != '')

    # Parse the quantity once; None marks an unparsable value.
    try:
        quantity = int(row['Quantity'])
    except (ValueError, TypeError, OverflowError):
        quantity = None

    if quantity is not None and ref_count != quantity:
        current_data['inconsistencies'].append(
            f"Reference数量不符: {ref_count}个位置 ≠ Quantity={quantity} (文件: {filename})"
        )

    # Record the quantity for this file (0 when it could not be parsed).
    recorded_qty = quantity if quantity is not None else 0
    self.file_quantities[filename][key] = recorded_qty
    current_data['quantity_data'][filename] = recorded_qty

    # Count each part once, when it first becomes inconsistent, rather than
    # on every later row of an already-flagged part.
    if current_data['inconsistencies'] and not had_inconsistencies:
        self.inconsistency_count += 1
qty + row['合计'] = total + + report_data.append(row) + + # 创建DataFrame + self.consolidated_report = pd.DataFrame(report_data) + + # 生成带时间戳的文件名 + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = os.path.join(self.output_folder, f"BOM合并报告_{timestamp}.xlsx") + + # 保存报告 + self.consolidated_report.to_excel(output_path, index=False) + + # 返回统计信息和路径 + stats = { + 'output_path': output_path, + 'file_count': self.processed_files, + 'material_count': len(self.master_data), + 'inconsistency_count': self.inconsistency_count, + 'processed_rows': self.processed_rows + } + + return stats + + +def select_folder(): + """弹出文件夹选择对话框""" + root = tk.Tk() + root.withdraw() + folder_selected = filedialog.askdirectory(title='选择BOM文件所在文件夹') + return folder_selected + + +def main(): + # 初始化合并器 + bom_processor = BOMConsolidator() + + # 选择文件夹 + folder_path = select_folder() + if not folder_path: + print("未选择文件夹,程序退出") + return + + bom_processor.output_folder = folder_path + + # 获取所有Excel文件 + bom_files = glob.glob(os.path.join(folder_path, "*.xlsx")) + if not bom_files: + print("文件夹中没有Excel文件") + return + + print(f"找到 {len(bom_files)} 个Excel文件,开始处理...") + + # 处理文件 + processed_count = 0 + for file_path in bom_files: + success = bom_processor.process_file(file_path) + if success: + processed_count += 1 + + # 生成报告 + if bom_processor.master_data: + stats = bom_processor.generate_report() + + # 打印汇总信息 + print("\n" + "=" * 40) + print("BOM合并完成! 
汇总信息:") + print(f"处理文件夹: {folder_path}") + print(f"扫描文件数: {len(bom_files)}") + print(f"成功处理文件数: {processed_count}") + print(f"处理行数: {stats['processed_rows']}") + print(f"合并物料种类数: {stats['material_count']}") + print(f"检测到不一致条目数: {stats['inconsistency_count']}") + print(f"报告已保存至: {stats['output_path']}") + print("=" * 40) + else: + print("没有有效数据生成报告") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tempReportProcess/.gitignore b/tempReportProcess/.gitignore new file mode 100644 index 0000000..e9afaff --- /dev/null +++ b/tempReportProcess/.gitignore @@ -0,0 +1,9 @@ +/build/* +/build +/dist/* +/dist +/source/* +/source + + +tempReportProcess_V2.py \ No newline at end of file diff --git a/tempReportProcess/tempReportProcess_V1.py b/tempReportProcess/tempReportProcess_V1.py new file mode 100644 index 0000000..dde4e13 --- /dev/null +++ b/tempReportProcess/tempReportProcess_V1.py @@ -0,0 +1,248 @@ +import pandas as pd +import matplotlib.pyplot as plt +from datetime import datetime +import tkinter as tk +from tkinter import filedialog +import os +import matplotlib.dates as mdates +from jinja2 import Template +from matplotlib import font_manager, rcParams + + +class TemperatureDataAnalyzer: + def __init__(self): + self.data = None + self.file_path = None + self.timestamps = [] + self.temperatures = [] + self.statuses = [] + self._configure_chinese_font() # 配置中文字体,修复中文字符缺失警告 + + def _configure_chinese_font(self): + """ + 配置 Matplotlib 中文字体,避免中文字符缺失的警告。 + 会尝试常见的中文字体并设置 axes.unicode_minus 为 False。 + """ + try: + # 常见中文字体候选(跨平台) + candidates = [ + "Microsoft YaHei", "Microsoft YaHei UI", # Windows + "SimHei", "SimSun", # Windows(黑体/宋体) + "PingFang SC", "Heiti SC", # macOS + "Noto Sans CJK SC", "Source Han Sans SC", "WenQuanYi Micro Hei", # Linux + "Arial Unicode MS" # 覆盖广的 Unicode 字体 + ] + available = {f.name for f in font_manager.fontManager.ttflist} + for name in candidates: + if name in available: + rcParams["font.sans-serif"] = [name] + 
def load_and_process_data(self):
    """Load the selected CSV and expose its columns for analysis.

    The file is read without a header row and must have exactly three
    columns: timestamp text (e.g. ``10/29/2025 2:20:41 PM``), temperature
    and status.

    Returns:
        True on success.  On any read/parse error the message is printed,
        False is returned and the instance attributes are left untouched.
    """
    try:
        frame = pd.read_csv(self.file_path, header=None)

        # Name the columns so they can be referenced below.
        frame.columns = ['timestamp', 'temperature', 'status']

        # Parse the timestamp text into datetimes.
        frame['datetime'] = pd.to_datetime(frame['timestamp'], format='%m/%d/%Y %I:%M:%S %p')

        # Reject non-numeric temperatures here, where the error is reported,
        # rather than letting the later statistics step fail on strings.
        frame['temperature'] = pd.to_numeric(frame['temperature'])
    except Exception as e:
        print(f"数据处理错误: {e}")
        return False

    # Publish the results only after every step succeeded, so a failed load
    # never leaves half-initialized state behind.
    self.data = frame
    self.timestamps = frame['datetime']
    self.temperatures = frame['temperature']
    self.statuses = frame['status']

    print(f"成功加载 {len(self.data)} 条记录")
    return True
def generate_statistics_report(self):
    """Summarize the loaded temperature data.

    Returns:
        A dict with the record count, temperature mean/max/min/standard
        deviation/range (rounded to 2 decimals), the first and last
        timestamps as ``YYYY-MM-DD HH:MM:SS`` text, the elapsed hours
        between them and, under ``status_distribution``, the number of
        occurrences of each status value.  With no records, the count is 0,
        the distribution is empty and every other value is None.
    """
    total = len(self.temperatures)

    # Nothing loaded (or an empty data set): positional access below would
    # raise, so return an explicit "no data" summary with the same keys.
    if total == 0:
        return {
            'total_records': 0,
            'avg_temperature': None,
            'max_temperature': None,
            'min_temperature': None,
            'std_deviation': None,
            'temp_range': None,
            'start_time': None,
            'end_time': None,
            'duration_hours': None,
            'status_distribution': {},
        }

    temps = self.temperatures
    first_ts = self.timestamps.iloc[0]
    last_ts = self.timestamps.iloc[-1]
    highest = temps.max()
    lowest = temps.min()

    stats = {
        'total_records': total,
        'avg_temperature': round(temps.mean(), 2),
        'max_temperature': round(highest, 2),
        'min_temperature': round(lowest, 2),
        'std_deviation': round(temps.std(), 2),
        'temp_range': round(highest - lowest, 2),
        'start_time': first_ts.strftime('%Y-%m-%d %H:%M:%S'),
        'end_time': last_ts.strftime('%Y-%m-%d %H:%M:%S'),
        'duration_hours': round((last_ts - first_ts).total_seconds() / 3600, 2)
    }

    # Occurrences of each distinct status value.
    stats['status_distribution'] = self.statuses.value_counts().to_dict()

    return stats
+

温度数据分析报告

+

数据文件: {{ file_name }}

+

生成时间: {{ generation_time }}

+
+ +
+

数据概览

+ + + {% for key, value in statistics.items() %} + {% if key != 'status_distribution' %} + + {% endif %} + {% endfor %} +
项目数值
{{ key.replace('_', ' ').title() }}{{ value }}
+
+ +
+

状态分布

+ + + {% for status, count in statistics.status_distribution.items() %} + + {% endfor %} +
状态值出现次数
{{ status }}{{ count }}
+
+ +
+

温度与状态时序图

+
+ 温度与状态时序图 +
+
def run_analysis(self):
    """Run the whole workflow: pick a file, load it, plot, write the report.

    Returns early (with None) when no file is chosen or loading fails.  The
    HTML report is written next to the selected file as
    ``<file stem>_<YYYYmmdd_HHMMSS>.html`` and a statistics summary is
    printed afterwards.
    """
    # Loading is only attempted once a file has actually been selected.
    if not (self.select_file() and self.load_and_process_data()):
        return

    figure = self.create_scatter_plots()

    # The report goes into the folder of the selected file.
    source_dir, source_name = os.path.split(self.file_path)
    stem = os.path.splitext(source_name)[0]
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = os.path.join(source_dir, f"{stem}_{stamp}.html")

    self.save_fig_to_html(figure, output_path)

    print(f"分析完成!报告已保存至: {output_path}")

    # Console summary; the status distribution is omitted here.
    summary = self.generate_statistics_report()
    print("\n=== 数据统计摘要 ===")
    for name, value in summary.items():
        if name == 'status_distribution':
            continue
        print(f"{name.replace('_', ' ').title()}: {value}")