diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6a20e8c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+/.idea/*
+.idea/*
+/.idea
+.idea
+/.venv
+/.venv/*
+
diff --git a/BOMCompare/.gitignore b/BOMCompare/.gitignore
new file mode 100644
index 0000000..2949325
--- /dev/null
+++ b/BOMCompare/.gitignore
@@ -0,0 +1,16 @@
+/build/*
+/build
+/dist/*
+/dist
+/source/*
+/source
+
+BOMCompare for Merge V2.py
+
+BOMCompareForJP2.py
+
+
+BOMConsolidator.py
+
+BOMConsolidatorV2.py
+# BOMConsolidator.py
\ No newline at end of file
diff --git a/BOMCompare/BOMCompare for Merge V1.py b/BOMCompare/BOMCompare for Merge V1.py
new file mode 100644
index 0000000..566d11e
--- /dev/null
+++ b/BOMCompare/BOMCompare for Merge V1.py
@@ -0,0 +1,655 @@
+import pandas as pd
+import tkinter as tk
+from tkinter import filedialog
+from datetime import datetime
+import os
+from typing import Dict, List, Tuple, Optional
+
+
+class BOMComparator:
+ """BOM文件差异对比器"""
+
+    def __init__(self):
+        """Start with no files selected and no comparison results."""
+        # Full paths and base names of the two workbooks being compared.
+        self.file1_path, self.file2_path = "", ""
+        self.file1_name, self.file2_name = "", ""
+        # Sheet names recognised as BOM data in each workbook, and those shared by both.
+        self.file1_sheets, self.file2_sheets, self.common_sheets = [], [], []
+        # Comparison results, keyed by a label for the compared sheet pair.
+        self.differences = {}
+        # Column names that are never compared cell by cell.
+        self.columns_to_exclude = ['检查信息', '检查状态', '校验信息']
+
+    def select_file(self, title: str) -> str:
+        """Ask the user to pick a file through an "open file" dialog.
+
+        Args:
+            title: Caption shown on the dialog window.
+
+        Returns:
+            Whatever ``filedialog.askopenfilename`` returns; callers treat a
+            falsy result as "nothing selected".
+        """
+        # A hidden root window hosts the dialog so no empty Tk window appears.
+        root = tk.Tk()
+        try:
+            root.withdraw()
+            return filedialog.askopenfilename(
+                title=title,
+                filetypes=[("Excel files", "*.xlsx"), ("All files", "*.*")]
+            )
+        finally:
+            # Tear the root down even if the dialog raises, so no stray Tk
+            # instance outlives this call.
+            root.destroy()
+
+    def find_valid_sheets(self, file_path: str) -> List[str]:
+        """Return the names of the sheets in *file_path* that look like BOM data.
+
+        A sheet qualifies when at least two of the columns ``Partnumber``,
+        ``Purchase_Code``, ``MF_PN`` and ``Description`` appear in its header
+        and more than one row is present among the first ten rows read.
+        Sheets that cannot be parsed are skipped. If the workbook itself cannot
+        be opened, the error is printed and an empty list is returned.
+        """
+        required_columns = {'Partnumber', 'Purchase_Code', 'MF_PN', 'Description'}
+        valid_sheets = []
+
+        try:
+            # The context manager closes the workbook handle; parsing through
+            # the open ExcelFile avoids re-opening the file for every sheet.
+            with pd.ExcelFile(file_path) as xl_file:
+                for sheet_name in xl_file.sheet_names:
+                    try:
+                        df = xl_file.parse(sheet_name, nrows=10)
+                    except Exception:
+                        # Best effort: an unreadable sheet is simply not a candidate.
+                        continue
+
+                    # Strip header cells (as load_bom_data does) so stray
+                    # whitespace does not hide a required column.
+                    headers = {str(col).strip() for col in df.columns}
+
+                    # NOTE(review): "> 1" requires two data rows below the
+                    # header; confirm that a single-row BOM should be rejected.
+                    if len(headers & required_columns) >= 2 and len(df) > 1:
+                        valid_sheets.append(sheet_name)
+
+        except Exception as e:
+            print(f"读取文件 {file_path} 时出错: {e}")
+
+        return valid_sheets
+
+    def get_common_sheets(self) -> List[str]:
+        """Return the sheets of file 1 whose normalised name also occurs in file 2.
+
+        Names are compared after ``standardize_sheet_name``; the result keeps
+        file 1's original spelling and order. An empty list is returned when
+        either file has no valid sheets.
+        """
+        if not self.file1_sheets or not self.file2_sheets:
+            return []
+
+        # Normalise file 2's names once; set membership replaces the nested scan.
+        file2_names = {self.standardize_sheet_name(sheet) for sheet in self.file2_sheets}
+
+        return [
+            sheet for sheet in self.file1_sheets
+            if self.standardize_sheet_name(sheet) in file2_names
+        ]
+
+    def standardize_sheet_name(self, sheet_name: str) -> str:
+        """Return a canonical form of a sheet name for comparison.
+
+        The name is converted to ``str``, trimmed, lower-cased, and every
+        space or hyphen is replaced by an underscore.
+        """
+        normalized = str(sheet_name).strip().lower()
+        for separator in (' ', '-'):
+            normalized = normalized.replace(separator, '_')
+        return normalized
+
+    def load_bom_data(self, file_path: str, sheet_name: str) -> pd.DataFrame:
+        """Load one sheet as a DataFrame with empty rows/columns removed.
+
+        Header labels are converted to stripped strings. On any failure the
+        error is printed and an empty DataFrame is returned, which callers
+        detect through ``.empty``.
+        """
+        try:
+            df = pd.read_excel(file_path, sheet_name=sheet_name)
+            # Drop rows, then columns, that contain no data at all.
+            df = df.dropna(how='all').dropna(axis=1, how='all')
+
+            # Header cells are not guaranteed to be strings (e.g. numbers), and
+            # the ``Index.str`` accessor only handles string labels, so
+            # normalise each label explicitly.
+            df.columns = [str(col).strip() for col in df.columns]
+
+            return df
+        except Exception as e:
+            print(f"加载sheet {sheet_name} 时出错: {e}")
+            return pd.DataFrame()
+
+    def should_compare_column(self, column_name: str) -> bool:
+        """Tell whether a column takes part in the cell-by-cell comparison.
+
+        A column is skipped when its exact name is listed in
+        ``self.columns_to_exclude`` or when its lower-cased name contains one
+        of the check/remark keywords below.
+        """
+        if column_name in self.columns_to_exclude:
+            return False
+
+        lowered = str(column_name).lower()
+        excluded_fragments = ('检查', '校验', '状态', '备注', 'comment', 'check')
+        return not any(fragment in lowered for fragment in excluded_fragments)
+
+    def get_columns_to_compare(self, df1: pd.DataFrame, df2: pd.DataFrame) -> List[str]:
+        """Return the columns present in both frames that should be compared.
+
+        Columns rejected by ``should_compare_column`` are left out. The result
+        follows df1's column order, so reports list columns the same way on
+        every run instead of in set-iteration order.
+        """
+        shared = set(df1.columns).intersection(df2.columns)
+
+        # dict.fromkeys de-duplicates repeated labels while keeping their order.
+        return [
+            col for col in dict.fromkeys(df1.columns)
+            if col in shared and self.should_compare_column(col)
+        ]
+
+    def compare_dataframes(self, df1: pd.DataFrame, df2: pd.DataFrame, sheet_name1: str, sheet_name2: str) -> Dict:
+        """Compare two BOM sheets row by row and collect the differences.
+
+        Rows are matched on the key column(s) chosen by ``identify_key_columns``.
+        When a key occurs more than once in a sheet, only its first row takes
+        part in the comparison. Columns rejected by ``should_compare_column``
+        are not compared.
+
+        Returns:
+            A dict with ``sheet_names``, ``added_rows``, ``removed_rows``,
+            ``modified_rows``, ``columns_comparison``, ``summary`` and
+            ``original_dfs``. An ``error`` entry is added when no key column
+            can be determined or the comparison fails.
+        """
+        differences = {
+            'sheet_names': f"{sheet_name1} vs {sheet_name2}",
+            'added_rows': [],
+            'removed_rows': [],
+            'modified_rows': [],
+            'columns_comparison': {},
+            'summary': {
+                'total_rows_df1': len(df1),
+                'total_rows_df2': len(df2),
+                'added_count': 0,
+                'removed_count': 0,
+                'modified_count': 0
+            },
+            'original_dfs': {
+                'df1': df1.copy(),
+                'df2': df2.copy()
+            }
+        }
+
+        # Decide which column(s) identify a row.
+        key_columns = self.identify_key_columns(df1, df2)
+
+        if not key_columns:
+            differences['error'] = "无法确定用于对比的关键列"
+            return differences
+
+        try:
+            df1_indexed = df1.set_index(key_columns)
+            df2_indexed = df2.set_index(key_columns)
+
+            # identify_key_columns tolerates some duplicated keys. With a
+            # duplicated key ``.loc[key]`` returns a DataFrame instead of a
+            # row, which would break the cell comparison below, so keep the
+            # first row per key (the same row extract_row_data picks).
+            df1_indexed = df1_indexed[~df1_indexed.index.duplicated(keep='first')]
+            df2_indexed = df2_indexed[~df2_indexed.index.duplicated(keep='first')]
+
+            # Non-key columns that are shared and not excluded.
+            columns_to_compare = self.get_columns_to_compare(df1, df2)
+            value_columns = [
+                col for col in columns_to_compare
+                if col in df1_indexed.columns and col in df2_indexed.columns
+            ]
+
+            # Rows whose key exists only in the second sheet.
+            new_indexes = df2_indexed.index.difference(df1_indexed.index)
+            if len(new_indexes) > 0:
+                differences['added_rows'] = df2_indexed.loc[new_indexes].reset_index().to_dict('records')
+                differences['summary']['added_count'] = len(new_indexes)
+
+            # Rows whose key exists only in the first sheet.
+            removed_indexes = df1_indexed.index.difference(df2_indexed.index)
+            if len(removed_indexes) > 0:
+                differences['removed_rows'] = df1_indexed.loc[removed_indexes].reset_index().to_dict('records')
+                differences['summary']['removed_count'] = len(removed_indexes)
+
+            # Rows present in both sheets are compared cell by cell.
+            common_indexes = df1_indexed.index.intersection(df2_indexed.index)
+
+            for idx in common_indexes:
+                row1 = df1_indexed.loc[idx]
+                row2 = df2_indexed.loc[idx]
+
+                modified_cols = {}
+                for col in value_columns:
+                    val1 = row1[col]
+                    val2 = row2[col]
+
+                    # Two missing values count as equal; otherwise compare the
+                    # string forms so 1 and "1" are not reported as a change.
+                    if pd.isna(val1) and pd.isna(val2):
+                        continue
+                    if pd.isna(val1) or pd.isna(val2) or str(val1) != str(val2):
+                        modified_cols[col] = {
+                            'old_value': val1,
+                            'new_value': val2
+                        }
+
+                if modified_cols:
+                    # Full row data used by the report's display columns.
+                    full_row_data = self.get_full_row_data_for_display(df1, df2, idx, key_columns)
+
+                    differences['modified_rows'].append({
+                        'key_values': dict(zip(key_columns, idx)) if isinstance(idx, tuple) else {key_columns[0]: idx},
+                        'modified_columns': modified_cols,
+                        'full_row_data': full_row_data
+                    })
+                    differences['summary']['modified_count'] += 1
+
+            # Column-level statistics (all columns, including excluded ones).
+            common_columns = set(df1.columns).intersection(set(df2.columns))
+            df1_only_columns = set(df1.columns).difference(set(df2.columns))
+            df2_only_columns = set(df2.columns).difference(set(df1.columns))
+
+            compared_columns = set(columns_to_compare)
+            excluded_columns = common_columns - compared_columns
+
+            differences['columns_comparison'] = {
+                'common_columns': list(common_columns),
+                'compared_columns': list(compared_columns),
+                'excluded_columns': list(excluded_columns),
+                'file1_only_columns': list(df1_only_columns),
+                'file2_only_columns': list(df2_only_columns)
+            }
+
+        except Exception as e:
+            differences['error'] = f"对比过程中出错: {str(e)}"
+
+        return differences
+
+    def get_full_row_data_for_display(self, df1: pd.DataFrame, df2: pd.DataFrame, idx, key_columns: List[str]) -> Dict:
+        """Build the display values of one matched row for the report.
+
+        For each display column the value is formatted by
+        ``format_value_display``: "old -> new" when the two files differ,
+        otherwise the single available value. The raw rows from both files are
+        attached under ``_from_file1`` and ``_from_file2``.
+        """
+        # Raw rows from both files for the given key.
+        row1_data = self.extract_row_data(df1, idx, key_columns)
+        row2_data = self.extract_row_data(df2, idx, key_columns)
+
+        # Columns shown in the report, minus anything in the excluded category.
+        display_columns = [
+            col
+            for col in ('Purchase_Code', 'MF_PN', 'Description', 'Part Type', 'MF_NAME', 'PCB_Footprint', '合计')
+            if self.should_compare_column(col)
+        ]
+
+        # Reuse the shared formatter instead of repeating its rules here.
+        display_data = {
+            col: self.format_value_display(row1_data.get(col, ''), row2_data.get(col, ''))
+            for col in display_columns
+        }
+
+        # Keep the source rows so the origin of each value can be traced.
+        display_data['_from_file1'] = row1_data
+        display_data['_from_file2'] = row2_data
+
+        return display_data
+
+    def extract_row_data(self, df: pd.DataFrame, idx, key_columns: List[str]) -> Dict:
+        """Return the first row of *df* matching a key as ``{column: value}``.
+
+        Args:
+            df: Frame to search.
+            idx: Key value. A tuple holds one value per key column; a scalar
+                is matched against the first key column only.
+            key_columns: Names of the key columns.
+
+        Returns:
+            The matching row as a dict, or ``{}`` when no row matches or the
+            lookup cannot be performed (e.g. a key column is missing).
+        """
+        if isinstance(idx, tuple):
+            if len(idx) < len(key_columns):
+                # Not enough values to match every key column.
+                return {}
+            key_values = idx
+        else:
+            key_values = (idx,)
+
+        try:
+            mask = pd.Series(True, index=df.index)
+            # zip stops at the shorter sequence, so a scalar key is matched
+            # against the first key column only.
+            for key, value in zip(key_columns, key_values):
+                mask = mask & (df[key] == value)
+        except (KeyError, TypeError, ValueError):
+            # Best effort: a row that cannot be looked up yields no data.
+            return {}
+
+        if not mask.any():
+            return {}
+
+        return df[mask].iloc[0].to_dict()
+
+    def format_value_display(self, value1, value2):
+        """Format a pair of values for display.
+
+        Returns the other value when one side is missing or an empty string,
+        ``"old -> new"`` when the string forms differ, and the first value
+        when they are the same.
+        """
+        def is_blank(value):
+            return pd.isna(value) or value == ''
+
+        if is_blank(value1):
+            return value2
+        if is_blank(value2):
+            return value1
+        return value1 if str(value1) == str(value2) else f"{value1} -> {value2}"
+
+    def get_modified_columns_summary(self, modified_columns: Dict) -> str:
+        """Summarise the names of the modified columns in one short string.
+
+        Up to three names are listed in full; with more, the first three are
+        followed by an ellipsis and the total count.
+        """
+        if not modified_columns:
+            return "无修改"
+
+        names = list(modified_columns)
+        summary = ", ".join(names[:3])
+        if len(names) > 3:
+            summary += f"...等{len(names)}列"
+        return summary
+
+    def identify_key_columns(self, df1: pd.DataFrame, df2: pd.DataFrame) -> List[str]:
+        """Choose the column(s) used to match rows between the two frames.
+
+        Preference order:
+          1. The first of ``Partnumber``, ``Purchase_Code``, ``MF_PN`` that is
+             present in both frames with a duplicate rate below 10% in each.
+          2. The pair ``Partnumber`` + ``MF_PN``, then ``Purchase_Code`` +
+             ``MF_PN``, when both columns of a pair exist in both frames.
+          3. The first two columns of df1 that also exist in df2.
+
+        Returns an empty list when the frames share no column.
+        """
+        def in_both(column):
+            return column in df1.columns and column in df2.columns
+
+        for key in ('Partnumber', 'Purchase_Code', 'MF_PN'):
+            if not in_both(key):
+                continue
+            # A good key should rarely repeat; a few duplicates are tolerated.
+            df1_dup_rate = df1[key].duplicated().sum() / len(df1)
+            df2_dup_rate = df2[key].duplicated().sum() / len(df2)
+            if df1_dup_rate < 0.1 and df2_dup_rate < 0.1:
+                return [key]
+
+        # No single column is selective enough: try known combinations.
+        for key_combo in (['Partnumber', 'MF_PN'], ['Purchase_Code', 'MF_PN']):
+            if all(in_both(col) for col in key_combo):
+                return key_combo
+
+        # Last resort: the first shared columns in df1's order. Walking df1's
+        # columns (not a set) keeps the choice the same on every run.
+        common_cols = [col for col in dict.fromkeys(df1.columns) if col in df2.columns]
+        return common_cols[:2]
+
+    def generate_output_filename(self) -> str:
+        """Build the report file name from the first valid sheet of each file.
+
+        The name is ``<sheet1>_vs_<sheet2>_差异报告_<timestamp>.xlsx`` with both
+        sheet names passed through ``clean_filename``. When either file has no
+        valid sheet, a generic ``BOM差异报告_<timestamp>.xlsx`` is returned.
+        """
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+        if not (self.file1_sheets and self.file2_sheets):
+            return f"BOM差异报告_{timestamp}.xlsx"
+
+        # Only the first sheet of each workbook contributes to the name.
+        first_name, second_name = (
+            self.clean_filename(str(sheets[0]))
+            for sheets in (self.file1_sheets, self.file2_sheets)
+        )
+        return f"{first_name}_vs_{second_name}_差异报告_{timestamp}.xlsx"
+
+    def clean_filename(self, filename: str) -> str:
+        """Make a string safe to use as part of a file name.
+
+        Characters Windows forbids in file names, plus spaces, tabs and
+        newlines, become underscores; the result is cut to 50 characters.
+        """
+        # One translation pass replaces every unwanted character.
+        replacements = str.maketrans({char: '_' for char in '<>:"/\\|?* \t\n'})
+        return str(filename).translate(replacements)[:50]
+
+    def clean_sheet_name(self, sheet_name: str, max_length: int = 25) -> str:
+        """Make a string usable as an Excel worksheet name.
+
+        The characters ``[ ] : * ? / \\`` become underscores and the result is
+        cut to *max_length* characters. The default of 25 stays below Excel's
+        31-character limit.
+        """
+        replacements = str.maketrans({char: '_' for char in '[]:*?/\\'})
+        return str(sheet_name).translate(replacements)[:max_length]
+
+    def get_output_directory(self) -> str:
+        """Return the directory that contains the second input file."""
+        directory, _file_name = os.path.split(self.file2_path)
+        return directory
+
+ def generate_difference_report(self) -> str:
+ """Write the results held in self.differences to an Excel workbook.
+
+ The workbook is created in the directory of the second input file. It
+ contains an overall summary sheet and, per comparison, a summary sheet
+ plus optional sheets for added, removed and modified rows (or a single
+ error sheet when that comparison failed).
+
+ Returns:
+ The output path on success; the message "没有发现差异" when
+ self.differences is empty; an empty string when writing fails (the
+ error is printed).
+ """
+ if not self.differences:
+ return "没有发现差异"
+
+ # Build the output file name and path
+ output_filename = self.generate_output_filename()
+ output_directory = self.get_output_directory()
+ output_path = os.path.join(output_directory, output_filename)
+
+ try:
+ with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+
+ # Overall summary sheet: one line per successful comparison
+ summary_data = []
+ for diff_key, differences in self.differences.items():
+ if 'error' not in differences:
+ columns_comparison = differences.get('columns_comparison', {})
+ excluded_count = len(columns_comparison.get('excluded_columns', []))
+
+ summary_data.append([
+ differences.get('sheet_names', diff_key),
+ differences['summary']['total_rows_df1'],
+ differences['summary']['total_rows_df2'],
+ differences['summary']['added_count'],
+ differences['summary']['removed_count'],
+ differences['summary']['modified_count'],
+ excluded_count
+ ])
+
+ if summary_data:
+ summary_df = pd.DataFrame(summary_data, columns=[
+ '工作表对比', '文件1行数', '文件2行数', '新增行数', '删除行数', '修改行数', '排除列数'
+ ])
+ summary_df.to_excel(writer, sheet_name='对比摘要', index=False)
+
+ # Detailed sheets for every comparison
+ for diff_key, differences in self.differences.items():
+ # NOTE(review): the keys built in run_comparison already contain "_vs_",
+ # so this replace yields "__vs__" and also rewrites any "vs" inside a
+ # sheet name; confirm whether that is intended.
+ sheet_key = self.clean_sheet_name(diff_key.replace('vs', '_vs_'))
+
+ if 'error' in differences:
+ # A failed comparison only gets an error sheet
+ error_df = pd.DataFrame([['错误信息', differences['error']]])
+ error_df.to_excel(writer, sheet_name=f"{sheet_key}_错误", index=False, header=False)
+ continue
+
+ # Per-comparison summary, including the column-level statistics
+ summary_data = []
+ summary_data.append(["对比项", "数量"])
+ summary_data.append(["文件1总行数", differences['summary']['total_rows_df1']])
+ summary_data.append(["文件2总行数", differences['summary']['total_rows_df2']])
+ summary_data.append(["新增行数", differences['summary']['added_count']])
+ summary_data.append(["删除行数", differences['summary']['removed_count']])
+ summary_data.append(["修改行数", differences['summary']['modified_count']])
+ summary_data.append(["共同列数", len(differences['columns_comparison']['common_columns'])])
+ summary_data.append(["实际对比列数", len(differences['columns_comparison']['compared_columns'])])
+ summary_data.append(["排除列数", len(differences['columns_comparison']['excluded_columns'])])
+ summary_data.append(["文件1特有列", len(differences['columns_comparison']['file1_only_columns'])])
+ summary_data.append(["文件2特有列", len(differences['columns_comparison']['file2_only_columns'])])
+
+ # List the excluded columns by name
+ excluded_cols = differences['columns_comparison'].get('excluded_columns', [])
+ if excluded_cols:
+ summary_data.append(["", ""])
+ summary_data.append(["排除的列", "(检查信息类列不参与对比)"])
+ for col in excluded_cols:
+ summary_data.append(["", f"- {col}"])
+
+ pd.DataFrame(summary_data).to_excel(
+ writer,
+ sheet_name=f"{sheet_key}_汇总",
+ index=False,
+ header=False
+ )
+
+ # Added rows
+ if differences['added_rows']:
+ pd.DataFrame(differences['added_rows']).to_excel(
+ writer,
+ sheet_name=f"{sheet_key}_新增行",
+ index=False
+ )
+
+ # Removed rows
+ if differences['removed_rows']:
+ pd.DataFrame(differences['removed_rows']).to_excel(
+ writer,
+ sheet_name=f"{sheet_key}_删除行",
+ index=False
+ )
+
+ # Modified rows: key columns, a summary of the changed columns, the
+ # display columns, then one "详细_<column>" column per changed column
+ if differences['modified_rows']:
+ modified_data = []
+
+ for mod_row in differences['modified_rows']:
+ # Base record
+ record = {
+ **mod_row['key_values'], # key columns (e.g. Partnumber)
+ '修改列': self.get_modified_columns_summary(mod_row['modified_columns'])
+ }
+
+ # Display values prepared during the comparison
+ display_data = mod_row.get('full_row_data', {})
+
+ # Skip the private "_from_file*" entries and excluded columns
+ display_columns = list(display_data.keys())
+ display_columns = [col for col in display_columns if
+ not col.startswith('_') and self.should_compare_column(col)]
+
+ for col in display_columns:
+ record[col] = display_data.get(col, '')
+
+ # Old -> new detail for every compared column that changed
+ for col, values in mod_row['modified_columns'].items():
+ if self.should_compare_column(col):
+ record[f'详细_{col}'] = f"{values['old_value']} -> {values['new_value']}"
+
+ modified_data.append(record)
+
+ if modified_data:
+ modified_df = pd.DataFrame(modified_data)
+
+ # Put the most important columns first.
+ # NOTE(review): mod_row here is the last element of the loop above.
+ column_order = list(mod_row['key_values'].keys()) + ['修改列']
+
+ # Then the remaining display columns
+ other_columns = [col for col in modified_df.columns
+ if col not in column_order and not col.startswith('详细_')]
+ column_order.extend(other_columns)
+
+ # Then the per-column detail columns
+ detailed_cols = [col for col in modified_df.columns if col.startswith('详细_')]
+ column_order.extend(detailed_cols)
+
+ # Keep only columns that actually exist in the frame
+ existing_columns = [col for col in column_order if col in modified_df.columns]
+ modified_df = modified_df[existing_columns]
+
+ modified_df.to_excel(
+ writer,
+ sheet_name=f"{sheet_key}_修改行",
+ index=False
+ )
+
+ return output_path
+
+ except Exception as e:
+ print(f"生成报告时出错: {e}")
+ return ""
+
+ def run_comparison(self):
+ """Run the interactive BOM comparison workflow from start to finish.
+
+ Steps: pick the two Excel files, detect their valid sheets, compare the
+ first valid sheet of each file, print the statistics and write the
+ difference report. Returns early (None) when a file is not selected, no
+ valid sheet is found, or a sheet fails to load.
+ """
+ print("=== BOM文件差异对比工具 ===")
+ print("注意:检查信息类列(如'检查信息')将不参与修改行对比")
+
+ # 1. Select the first file
+ print("\n步骤1: 选择第一份Excel文件")
+ self.file1_path = self.select_file("选择第一份BOM Excel文件")
+ if not self.file1_path:
+ print("未选择文件,程序退出")
+ return
+
+ self.file1_name = os.path.basename(self.file1_path)
+
+ # 2. Select the second file
+ print("\n步骤2: 选择第二份Excel文件")
+ self.file2_path = self.select_file("选择第二份BOM Excel文件")
+ if not self.file2_path:
+ print("未选择文件,程序退出")
+ return
+
+ self.file2_name = os.path.basename(self.file2_path)
+
+ print(f"\n文件1: {self.file1_name}")
+ print(f"文件2: {self.file2_name}")
+
+ # 3. Find the valid sheets of both files
+ print("\n步骤3: 查找有效的工作表...")
+ self.file1_sheets = self.find_valid_sheets(self.file1_path)
+ self.file2_sheets = self.find_valid_sheets(self.file2_path)
+
+ print(f"文件1的有效工作表: {self.file1_sheets}")
+ print(f"文件2的有效工作表: {self.file2_sheets}")
+
+ if not self.file1_sheets or not self.file2_sheets:
+ print("至少有一个文件没有有效的工作表,无法进行对比")
+ return
+
+ # 4. Compare
+ print("\n步骤4: 进行差异对比...")
+ self.differences = {}
+
+ # Only the first valid sheet of each file is compared
+ sheet1 = self.file1_sheets[0]
+ sheet2 = self.file2_sheets[0]
+
+ print(f"正在对比: {sheet1} (文件1) vs {sheet2} (文件2)")
+
+ df1 = self.load_bom_data(self.file1_path, sheet1)
+ df2 = self.load_bom_data(self.file2_path, sheet2)
+
+ # load_bom_data returns an empty frame when loading fails
+ if df1.empty:
+ print(f" ⚠ 文件1的工作表 {sheet1} 数据加载失败")
+ return
+
+ if df2.empty:
+ print(f" ⚠ 文件2的工作表 {sheet2} 数据加载失败")
+ return
+
+ differences = self.compare_dataframes(df1, df2, sheet1, sheet2)
+ comparison_key = f"{sheet1}_vs_{sheet2}"
+ self.differences[comparison_key] = differences
+
+ if 'error' in differences:
+ print(f" ⚠ 对比过程中出错: {differences['error']}")
+ else:
+ columns_comparison = differences.get('columns_comparison', {})
+ excluded_count = len(columns_comparison.get('excluded_columns', []))
+
+ print(f" √ 完成对比:")
+ print(f" 文件1行数: {differences['summary']['total_rows_df1']}")
+ print(f" 文件2行数: {differences['summary']['total_rows_df2']}")
+ print(f" 新增行数: {differences['summary']['added_count']}")
+ print(f" 删除行数: {differences['summary']['removed_count']}")
+ print(f" 修改行数: {differences['summary']['modified_count']}")
+ print(f" 排除列数: {excluded_count} (检查信息类列不参与对比)")
+
+ # 5. Write the difference report (also written when the comparison reported an error)
+ print("\n步骤5: 生成差异报告...")
+ output_file = self.generate_difference_report()
+
+ # The report call returns a path on success and a non-path string otherwise
+ if output_file and os.path.exists(output_file):
+ print(f"\n=== 对比完成 ===")
+ print(f"差异报告已生成: {os.path.basename(output_file)}")
+ # print(f"文件位置: {output_file}")
+ print(f"输出目录: {self.get_output_directory()}")
+ else:
+ print("未成功生成差异报告")
+
+
+def main():
+    """Run one interactive comparison, then wait for Enter before exiting."""
+    BOMComparator().run_comparison()
+
+    # Wait for a key press before the function returns.
+    input("\n按Enter键退出...")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/BOMCompare/BOMCompareForJP1.py b/BOMCompare/BOMCompareForJP1.py
new file mode 100644
index 0000000..fa4eef9
--- /dev/null
+++ b/BOMCompare/BOMCompareForJP1.py
@@ -0,0 +1,635 @@
+import os
+import pandas as pd
+import numpy as np
+import tkinter as tk
+from tkinter import filedialog
+from datetime import datetime
+import warnings
+import re
+from openpyxl import Workbook
+from openpyxl.utils.dataframe import dataframe_to_rows
+
+# Ignore UserWarning messages issued from the openpyxl module.
+warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
+
+
+class BOMComparator:
+    def __init__(self):
+        """Set up the column mappings, validation state and statistics."""
+        # Validation anomalies collected while loading BOMs.
+        self.validation_errors = []
+
+        # Running counters reported by generate_summary.
+        self.stats = dict.fromkeys(
+            ('old_bom_rows', 'new_bom_rows', 'changed_items',
+             'added_items', 'removed_items', 'total_errors'),
+            0,
+        )
+
+        # Normalised fragments that must all occur in a header row.
+        self.mandatory_keywords = ['item', 'partnumber', 'mfpn']
+
+        # BOM columns whose differences are not reported as changes.
+        self.ignore_columns = ['备注']
+
+        # Column headers of the change record, in output order.
+        self.change_columns = 'ITEM|HT PN|MF PN|MFG|CRD|Description|Qty|Remark'.split('|')
+
+        # Change-record header -> normalised BOM column supplying its value.
+        self.column_mapping = {
+            'ITEM': 'Partnumber',
+            'HT PN': 'Partnumber',
+            'MF PN': 'MF_PN',
+            'MFG': 'MF_NAME',
+            'CRD': 'Reference',
+            'Description': 'Description',
+            'Qty': 'Quantity',
+            '焊接方式': '焊接方式',
+            'Remark': '备注'
+        }
+        # Every mapped BOM column must be present in a loaded BOM.
+        self.required_columns = list(self.column_mapping.values())
+
+    def normalize_text(self, text):
+        """Reduce a cell value to lower-case ASCII letters, digits and spaces.
+
+        Missing values become an empty string. Every character that is not an
+        ASCII letter, a digit or whitespace is removed, then the result is
+        trimmed and lower-cased.
+        """
+        if pd.isna(text):
+            return ""
+        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', str(text))
+        return alnum_only.strip().lower()
+
+    def find_header_row(self, df):
+        """Return the index of the first row that looks like the BOM header.
+
+        Only the first 20 rows are scanned. A row qualifies when every entry
+        of ``self.mandatory_keywords`` occurs inside at least one of its
+        normalised cells.
+
+        Raises:
+            ValueError: If no scanned row contains all keywords.
+        """
+        scan_limit = min(20, len(df))
+        print(f"扫描前 {scan_limit} 行寻找标题行...")
+
+        for row_index in range(scan_limit):
+            cells = [self.normalize_text(cell) for cell in df.iloc[row_index].values]
+
+            if all(any(keyword in cell for cell in cells) for keyword in self.mandatory_keywords):
+                print(f"✅ 找到有效标题行 (索引 {row_index}),包含所有必需关键词")
+                return row_index
+
+        raise ValueError(
+            "❌ 未找到有效的标题行:所有标题行必须同时包含以下关键词:\n"
+            "- Item (或类似表述)\n"
+            "- Partnumber (或类似表述)\n"
+            "- MF_PN (或类似表述)\n\n"
+            "在文件的前20行中没有找到同时包含所有关键词的行。"
+        )
+
+    def find_active_sheet(self, file_path):
+        """Return the name of the first sheet that contains a BOM header row.
+
+        Search order:
+          1. Sheets whose name contains "bom" or "pcba" (case-insensitive).
+          2. Every sheet, in workbook order.
+          3. The first sheet once more, as a last resort.
+
+        A sheet is accepted when ``find_header_row`` succeeds on its first 20
+        rows. Returns ``None`` when no sheet qualifies.
+        """
+        print(f"扫描文件: {os.path.basename(file_path)}")
+
+        # The context manager releases the workbook handle on exit. Previews
+        # are parsed through the open workbook, so pandas uses the engine it
+        # selected for this file instead of a hard-coded one, and the file is
+        # not re-opened for every sheet.
+        with pd.ExcelFile(file_path) as xls:
+
+            def check_header(sheet_name):
+                """Raise unless the sheet's first 20 rows contain a header row."""
+                df_preview = xls.parse(sheet_name, header=None, nrows=20)
+                self.find_header_row(df_preview)
+
+            candidate_sheets = []
+            for sheet_name in xls.sheet_names:
+                # "BOM" or "PCBA" in the sheet name marks a preferred candidate.
+                if any(keyword in sheet_name.lower() for keyword in ("bom", "pcba")):
+                    candidate_sheets.append(sheet_name)
+                    print(f" 发现候选Sheet: {sheet_name} - 关键词匹配")
+
+            # Step 1: preferred candidates, in workbook order.
+            for candidate in candidate_sheets:
+                try:
+                    print(f" 优先检查候选Sheet: {candidate}")
+                    check_header(candidate)
+                except Exception as e:
+                    print(f" ❌ 优先候选Sheet '{candidate}': {str(e)}")
+                    continue
+                print(f"✅ 在候选Sheet '{candidate}' 中找到标题行")
+                return candidate
+
+            # Step 2: no preferred candidate worked, so scan every sheet.
+            print(" 未找到名称包含'BOM'的Sheet,将检查所有Sheet")
+            for sheet_name in xls.sheet_names:
+                try:
+                    print(f" 检查Sheet: {sheet_name}")
+                    check_header(sheet_name)
+                except ValueError as e:
+                    print(f" ❌ Sheet '{sheet_name}': {str(e)}")
+                    continue
+                except Exception as e:
+                    print(f" 检查Sheet '{sheet_name}' 时出错: {str(e)}")
+                    continue
+                print(f"✅ 在Sheet '{sheet_name}' 中找到标题行")
+                return sheet_name
+
+            # Step 3: last resort, retry the first sheet.
+            print("⚠️ 所有候选Sheet检查失败,尝试第一个Sheet")
+            first_sheet = xls.sheet_names[0]
+            try:
+                check_header(first_sheet)
+            except Exception as e:
+                print(f"❌ 备份Sheet '{first_sheet}' 也失败: {str(e)}")
+                return None
+            print(f"✅ 在备份Sheet '{first_sheet}' 中找到标题行")
+            return first_sheet
+
+    def validate_bom(self, bom_df, file_name, sheet_name):
+        """Validate BOM rows and return the anomalies that were found.
+
+        Checks performed:
+          1. ``Partnumber`` values that occur more than once.
+          2. ``Partnumber`` values that are missing or empty.
+          3. Rows where the number of comma-separated designators in
+             ``Reference`` differs from ``Quantity``. A missing Reference
+             counts as zero designators.
+
+        Args:
+            bom_df: BOM frame with ``Partnumber``, ``Reference`` and
+                ``Quantity`` columns.
+            file_name: File name recorded in each anomaly.
+            sheet_name: Sheet name recorded in each anomaly.
+
+        Returns:
+            A list of dicts with the keys ``文件``, ``Sheet``, ``原始行号``,
+            ``异常类型`` and ``异常描述``.
+        """
+        errors = []
+
+        def record(idx, kind, description):
+            # Row number is the frame index + 2.
+            # NOTE(review): this assumes the header is the first sheet row and
+            # the frame keeps its original integer index — confirm for files
+            # whose header is further down.
+            errors.append({
+                '文件': file_name,
+                'Sheet': sheet_name,
+                '原始行号': idx + 2,
+                '异常类型': kind,
+                '异常描述': description
+            })
+
+        # 1. Duplicated part numbers (every occurrence is reported).
+        dup_partnumbers = bom_df[bom_df.duplicated('Partnumber', keep=False)]
+        if not dup_partnumbers.empty:
+            print(f"⚠️ 发现重复的Partnumber: {len(dup_partnumbers)} 行")
+            for idx, row in dup_partnumbers.iterrows():
+                record(idx, '重复Partnumber', f"Partnumber '{row['Partnumber']}' 重复出现")
+
+        # 2. Missing or empty part numbers.
+        empty_partnumbers = bom_df[bom_df['Partnumber'].isna() | (bom_df['Partnumber'] == '')]
+        if not empty_partnumbers.empty:
+            print(f"⚠️ 发现空Partnumber: {len(empty_partnumbers)} 行")
+            for idx, row in empty_partnumbers.iterrows():
+                record(idx, '空Partnumber', "Partnumber为空")
+
+        # 3. Designator count versus Quantity.
+        for idx, row in bom_df.iterrows():
+            reference = row['Reference']
+            qty = row['Quantity']
+
+            try:
+                # str(NaN) is "nan", which would count as one designator, so a
+                # missing Reference is treated as an empty list instead.
+                refs = "" if pd.isna(reference) else str(reference)
+                ref_count = len([r for r in refs.split(',') if r.strip()])
+
+                # A non-numeric Quantity can never match a designator count.
+                try:
+                    qty_val = int(qty)
+                except (ValueError, TypeError):
+                    qty_val = -1
+
+                if ref_count != qty_val:
+                    record(idx, '数量不一致', f"位号数量({ref_count}) ≠ Quantity({qty})")
+            except Exception as e:
+                record(idx, '验证错误', f"验证异常: {str(e)}")
+
+        return errors
+
+    def load_bom(self, file_path):
+        """Load, normalise and validate the BOM sheet of an Excel file.
+
+        The active sheet and its header row are detected automatically, known
+        header aliases are renamed to the normalised column names, rows
+        without an ``Item`` value are dropped, and the result is validated.
+        Validation anomalies are appended to ``self.validation_errors`` and
+        counted in ``self.stats``.
+
+        Returns:
+            ``(bom_df, active_sheet)``: the cleaned frame (all cells read as
+            strings) and the name of the sheet it came from.
+
+        Raises:
+            ValueError: If no sheet with a valid header row exists, or a
+                required column (including ``Item``) is missing.
+        """
+        print(f"识别激活Sheet...")
+        active_sheet = self.find_active_sheet(file_path)
+        if active_sheet is None:
+            # sheet_name=None would make read_excel return a dict of every
+            # sheet and fail later with an unrelated error, so stop here.
+            raise ValueError(
+                f"❌ 在文件 '{os.path.basename(file_path)}' 中未找到包含有效标题行的Sheet"
+            )
+        print(f"📊 使用Sheet: {active_sheet}")
+
+        df_preview = pd.read_excel(
+            file_path,
+            sheet_name=active_sheet,
+            header=None,
+            nrows=20
+        )
+
+        header_row_idx = self.find_header_row(df_preview)
+
+        print("加载完整BOM数据...")
+        bom_df = pd.read_excel(
+            file_path,
+            sheet_name=active_sheet,
+            header=header_row_idx,
+            dtype=str
+        )
+
+        # The first BOM loaded is counted as the old one, the next as the new.
+        # NOTE(review): an old BOM with zero rows leaves old_bom_rows at 0, so
+        # the next file's count would be stored as the old count as well.
+        if "old_bom_rows" not in self.stats or self.stats['old_bom_rows'] == 0:
+            self.stats['old_bom_rows'] = len(bom_df)
+        else:
+            self.stats['new_bom_rows'] = len(bom_df)
+
+        # Trim the header labels.
+        bom_df.columns = [str(col).strip() for col in bom_df.columns]
+        print(f" 原始列名: {list(bom_df.columns)}")
+
+        # Known header spellings -> normalised column names.
+        column_aliases = {
+            'Item': 'Item',
+            'Partnumber': 'Partnumber',
+            'Part Number': 'Partnumber',
+            'Purchase_Code': 'Purchase_Code',
+            'MF_PN': 'MF_PN',
+            'Description': 'Description',
+            'Part Type': 'Part Type',
+            'MF_NAME': 'MF_NAME',
+            'Manufacturer': 'MF_NAME',
+            'PCB_Footprint': 'PCB_Footprint',
+            'Reference': 'Reference',
+            'References': 'Reference',
+            'Quantity': 'Quantity',
+            'Qty': 'Quantity',
+            '加工方式': '焊接方式',
+            '焊接方式': '焊接方式',
+            'Value': 'Value',
+            '备注': '备注',
+            'Remark': '备注',
+            'Comments': '备注'
+        }
+
+        # Rename only the aliases that actually occur in this sheet.
+        bom_df = bom_df.rename(columns={col: alias for col, alias in column_aliases.items()
+                                        if col in bom_df.columns})
+        print(f" 标准化后列名: {list(bom_df.columns)}")
+
+        # 'Item' is needed by the row clean-up below, so it is checked with the
+        # required columns instead of failing later with a bare KeyError.
+        # dict.fromkeys removes repeated names while keeping their order.
+        needed_columns = dict.fromkeys([*self.required_columns, 'Item'])
+        missing_cols = [col for col in needed_columns if col not in bom_df.columns]
+        if missing_cols:
+            raise ValueError(f"❌ 缺少必需列: {', '.join(missing_cols)}")
+
+        # Clean-up: treat empty strings as missing and drop rows without Item.
+        initial_count = len(bom_df)
+        bom_df = bom_df.replace('', np.nan)
+        bom_df = bom_df.dropna(subset=['Item'], how='all')
+        cleaned_count = len(bom_df)
+
+        if initial_count > cleaned_count:
+            print(
+                f" 清理空行: 移除 {initial_count - cleaned_count} 行 (原 {initial_count} 行 -> 现 {cleaned_count} 行)")
+
+        # Validate and record anomalies.
+        file_name = os.path.basename(file_path)
+        errors = self.validate_bom(bom_df, file_name, active_sheet)
+        self.validation_errors.extend(errors)
+        self.stats['total_errors'] += len(errors)
+
+        if errors:
+            print(f"⚠️ 在 '{file_name}' 中发现 {len(errors)} 个数据异常")
+
+        return bom_df, active_sheet
+
+    def compare_reference_lists(self, old_refs_str, new_refs_str):
+        """Describe how two comma-separated designator lists differ.
+
+        Both inputs are split on commas, trimmed and compared as sets; missing
+        values count as empty lists. Returns an empty string when the sets are
+        equal, otherwise the added and/or removed designators (sorted) joined
+        by "; ".
+        """
+        def to_set(value):
+            text = "" if pd.isna(value) else str(value)
+            return {part.strip() for part in text.split(',') if part.strip()}
+
+        before, after = to_set(old_refs_str), to_set(new_refs_str)
+        if before == after:
+            return ""
+
+        labelled = (
+            ("增加位号", after - before),
+            ("删除位号", before - after),
+        )
+        return "; ".join(
+            f"{label}: {','.join(sorted(refs))}" for label, refs in labelled if refs
+        )
+
+    def compare_boms(self, old_bom, new_bom):
+        """Compare two normalised BOM frames and build the change record.
+
+        Parts are matched on ``Partnumber``; when a part number occurs several
+        times in a BOM only its first row is used. A part present in only one
+        BOM is reported as "新增" or "删除"; a part present in both is reported
+        as "变更" only when Quantity, MF_PN, Reference or another mapped,
+        non-ignored column differs. ``self.stats`` is updated accordingly.
+
+        Side effect: the ``Partnumber`` column of both input frames is
+        converted to stripped strings in place.
+
+        Returns:
+            ``(changes_df, right_start_col)``: the change record with old
+            values on the left, the change type in the middle and new values
+            on the right, plus the 1-based column number where the right-hand
+            part starts.
+        """
+        print("开始比较两份BOM...")
+        old_bom['Partnumber'] = old_bom['Partnumber'].astype(str).str.strip()
+        new_bom['Partnumber'] = new_bom['Partnumber'].astype(str).str.strip()
+
+        old_partnumbers = set(old_bom['Partnumber'].unique())
+        if len(old_partnumbers) != len(old_bom):
+            print(f"⚠️ 旧BOM有重复的Partnumber: 总行数{len(old_bom)},唯一物料数{len(old_partnumbers)}")
+        new_partnumbers = set(new_bom['Partnumber'].unique())
+        if len(new_partnumbers) != len(new_bom):
+            print(f"⚠️ 新BOM有重复的Partnumber: 总行数{len(new_bom)},唯一物料数{len(new_partnumbers)}")
+
+        all_partnumbers = sorted(old_partnumbers | new_partnumbers)
+        print(f" 总物料项数量: {len(all_partnumbers)} (旧BOM: {len(old_partnumbers)}, 新BOM: {len(new_partnumbers)})")
+
+        # Index the first row of every part number once, instead of scanning
+        # both frames again for each part. drop=False keeps 'Partnumber'
+        # available as a regular value of each row.
+        old_first = old_bom.drop_duplicates(subset='Partnumber', keep='first').set_index('Partnumber', drop=False)
+        new_first = new_bom.drop_duplicates(subset='Partnumber', keep='first').set_index('Partnumber', drop=False)
+
+        def cell(row, column):
+            """Value of *column* in *row*, or '' when the row or column is absent."""
+            return row[column] if row is not None and column in row else ''
+
+        changes = []
+        total = len(all_partnumbers)
+
+        for idx, pn in enumerate(all_partnumbers):
+            if (idx + 1) % 100 == 0 or (idx + 1) == total:
+                print(f" 处理进度: {idx + 1}/{total} 项物料")
+
+            old_row = old_first.loc[pn] if pn in old_first.index else None
+            new_row = new_first.loc[pn] if pn in new_first.index else None
+
+            record = {
+                'ITEM_OLD': '' if old_row is None else old_row['Item'],
+                'ITEM_NEW': '' if new_row is None else new_row['Item'],
+            }
+
+            # Left side: old BOM values; right side: new BOM values.
+            for change_col, bom_col in self.column_mapping.items():
+                if change_col == 'ITEM':
+                    continue
+                record[change_col] = cell(old_row, bom_col)
+                record[f'NEW_{change_col}'] = cell(new_row, bom_col)
+
+            change_type = ""
+            change_desc = ""
+
+            if old_row is None:
+                change_type = "新增"
+                change_desc = "新增物料"
+                self.stats['added_items'] += 1
+            elif new_row is None:
+                change_type = "删除"
+                change_desc = "删除物料"
+                self.stats['removed_items'] += 1
+            else:
+                change_details = []
+
+                if 'Quantity' in old_row.index and 'Quantity' in new_row.index:
+                    old_qty = str(old_row['Quantity'])
+                    new_qty = str(new_row['Quantity'])
+                    if old_qty != new_qty:
+                        change_details.append(f"Qty: {old_qty}→{new_qty}")
+
+                if 'MF_PN' in old_row.index and 'MF_PN' in new_row.index:
+                    old_mfpn = str(old_row['MF_PN'])
+                    new_mfpn = str(new_row['MF_PN'])
+                    if old_mfpn != new_mfpn:
+                        change_details.append(f"MF PN: {old_mfpn}→{new_mfpn}")
+
+                # Designators are compared as sets, so reordering is not a change.
+                if 'Reference' in old_row.index and 'Reference' in new_row.index:
+                    ref_diff = self.compare_reference_lists(old_row['Reference'], new_row['Reference'])
+                    if ref_diff:
+                        change_details.append(ref_diff)
+
+                # Remaining mapped columns, except the ones handled above and
+                # the ignored ones.
+                for change_col, bom_col in self.column_mapping.items():
+                    if (change_col == 'ITEM' or
+                            bom_col in ['Quantity', 'MF_PN', 'Reference'] or
+                            bom_col in self.ignore_columns):
+                        continue
+
+                    old_val = cell(old_row, bom_col)
+                    new_val = cell(new_row, bom_col)
+
+                    if str(old_val) != str(new_val):
+                        change_details.append(f"{change_col}: {old_val}→{new_val}")
+
+                # Count and report a part as changed only when something
+                # actually differs; unchanged parts leave no record.
+                if change_details:
+                    change_type = "变更"
+                    change_desc = "; ".join(change_details)
+                    self.stats['changed_items'] += 1
+
+            record['Design change Type'] = change_type
+            # The last right-hand column carries the change description.
+            record['NEW_Remark'] = change_desc
+
+            if change_type:
+                changes.append(record)
+
+        left_columns = ['ITEM_OLD'] + [col for col in self.change_columns if col != 'ITEM']
+        middle_columns = ['Design change Type']
+        right_columns = ['ITEM_NEW'] + [f'NEW_{col}' for col in self.change_columns if col != 'ITEM']
+
+        # Keep the description column last on the right-hand side.
+        if 'NEW_Remark' in right_columns:
+            right_columns.remove('NEW_Remark')
+            right_columns.append('NEW_Remark')
+
+        change_columns = left_columns + middle_columns + right_columns
+        right_start_col = len(left_columns) + len(middle_columns) + 1
+
+        return pd.DataFrame(changes, columns=change_columns), right_start_col
+
+    def generate_summary(self):
+        """Return a printable, multi-line summary of the current comparison run.
+
+        Reads the counters held in ``self.stats`` (row counts of both BOMs, the
+        changed/added/removed item counts and the validation error count) and
+        renders them between two 50-character rules.
+
+        Returns:
+            str: The summary text. It begins with a newline so that it prints
+            as its own section.
+        """
+        stats = self.stats
+        heavy_rule = "=" * 50
+
+        report_lines = [
+            "\n" + heavy_rule,
+            "BOM 比较处理汇总",
+            "-" * 50,
+            f"原始BOM行数: {stats['old_bom_rows']}",
+            f"新BOM行数: {stats['new_bom_rows']}",
+            f"变更物料数量: {stats['changed_items']}",
+            f"新增物料数量: {stats['added_items']}",
+            f"删除物料数量: {stats['removed_items']}",
+        ]
+
+        # Total number of change records = changed + added + removed.
+        record_total = stats['changed_items'] + stats['added_items'] + stats['removed_items']
+        report_lines += [
+            f"变更记录总数: {record_total}",
+            f"数据异常总数: {stats['total_errors']}",
+            heavy_rule,
+        ]
+        return "\n".join(report_lines)
+
+    def generate_change_record(self):
+        """Interactively compare two BOM workbooks and save a change-record workbook.
+
+        Workflow (progress is printed to the console):
+          1. ask for the original BOM file,
+          2. ask for the changed BOM file,
+          3. load both files with ``self.load_bom``,
+          4. diff them with ``self.compare_boms`` and write the result.
+
+        The output workbook is written next to the *new* BOM file. It contains a
+        "PCBA_BOM_change record" sheet and, when ``self.validation_errors`` is
+        not empty, a "BOM异常记录" sheet listing those errors.
+
+        Returns:
+            None. Cancelling a dialog ends the run early; any exception is
+            printed with its traceback instead of being propagated.
+        """
+        # Hidden Tk root: gives the file dialogs a parent without showing a window.
+        root = tk.Tk()
+        root.withdraw()
+
+        # Reset the statistics and validation errors left over from a previous run.
+        self.stats = {
+            'old_bom_rows': 0,
+            'new_bom_rows': 0,
+            'changed_items': 0,
+            'added_items': 0,
+            'removed_items': 0,
+            'total_errors': 0
+        }
+        self.validation_errors = []
+
+        try:
+            # Step 1: pick the original BOM file.
+            print("\n" + "=" * 50)
+            print("步骤 1/4: 选择原始BOM文件")
+            print("=" * 50)
+            old_file = filedialog.askopenfilename(
+                title="选择原始BOM文件",
+                filetypes=[("Excel Files", "*.xlsx *.xls")]
+            )
+            if not old_file:
+                print("❌ 未选择文件,操作取消")
+                return
+            print(f"📂 已选择原始BOM: {old_file}")
+            old_file_name = os.path.basename(old_file)
+
+            # Step 2: pick the changed BOM file; the report is saved beside it.
+            print("\n" + "=" * 50)
+            print("步骤 2/4: 选择变更后BOM文件")
+            print("=" * 50)
+            new_file = filedialog.askopenfilename(
+                title="选择变更后BOM文件",
+                filetypes=[("Excel Files", "*.xlsx *.xls")]
+            )
+            if not new_file:
+                print("❌ 未选择文件,操作取消")
+                return
+            print(f"📂 已选择新BOM: {new_file}")
+            new_file_name = os.path.basename(new_file)
+            output_dir = os.path.dirname(new_file)
+
+            # Step 3: load both BOM files.
+            # load_bom is used here as returning (table, sheet name); its
+            # definition is outside this block -- TODO confirm.
+            print("\n" + "=" * 50)
+            print("步骤 3/4: 加载并处理BOM文件")
+            print("=" * 50)
+            print(f"🔍 加载原始BOM文件: {old_file_name}")
+            old_bom, old_bom_activesheetname = self.load_bom(old_file)
+            print(f"✅ 原始BOM加载完成,共 {len(old_bom)} 行")
+
+            print(f"\n🔍 加载变更后BOM文件: {new_file_name}")
+            new_bom, new_bom_activesheetname = self.load_bom(new_file)
+            print(f"✅ 新BOM加载完成,共 {len(new_bom)} 行")
+
+            # Step 4: diff the two BOMs.
+            print("\n" + "=" * 50)
+            print("步骤 4/4: 比较BOM差异并生成变更记录")
+            print("=" * 50)
+            print("🔍 比较BOM差异...")
+            change_df, right_start_col = self.compare_boms(old_bom, new_bom)
+
+            # Output file name: "<old sheet> to <new sheet> eBOM_change_record_<timestamp>.xlsx".
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_file = f"{old_bom_activesheetname} to {new_bom_activesheetname} eBOM_change_record_{timestamp}.xlsx"
+            output_path = os.path.join(output_dir, output_file)
+
+            print(f"\n💾 保存变更记录文件: {output_path}")
+            wb = Workbook()
+
+            # Sheet 1: the change record.
+            ws_change = wb.active
+            ws_change.title = "PCBA_BOM_change record"
+
+            if change_df.empty:
+                ws_change.cell(row=1, column=1, value="两份BOM完全相同,无变更记录")
+                print("✅ 两份BOM完全相同,无变更记录")
+            else:
+                # Both halves of the sheet show the same plain column titles, so
+                # the OLD/NEW markers used while diffing are stripped here.
+                column_rename = {
+                    'ITEM_OLD': 'ITEM',
+                    'ITEM_NEW': 'ITEM',
+                    **{f'NEW_{col}': col for col in self.change_columns if col != 'ITEM'},
+                    'NEW_Remark': 'Remark'
+                }
+                change_df = change_df.rename(columns=column_rename)
+
+                # Row 1: source file names above the left (old) and right (new) halves.
+                ws_change.cell(row=1, column=1, value=old_file_name)
+                ws_change.cell(row=1, column=right_start_col, value=new_file_name)
+
+                # Row 2: column titles.
+                for col_idx, col_name in enumerate(change_df.columns.tolist(), 1):
+                    ws_change.cell(row=2, column=col_idx, value=col_name)
+
+                # Row 3 onwards: one row per change record.
+                for r_idx, row in enumerate(dataframe_to_rows(change_df, index=False, header=False), 3):
+                    for c_idx, value in enumerate(row, 1):
+                        ws_change.cell(row=r_idx, column=c_idx, value=value)
+
+            # Sheet 2 (optional): validation errors collected during the run.
+            if self.validation_errors:
+                print(f"⚠️ 发现 {len(self.validation_errors)} 个数据异常,创建异常记录")
+                ws_errors = wb.create_sheet(title="BOM异常记录")
+
+                # The column titles double as the keys of each error record.
+                error_columns = ['文件', 'Sheet', '原始行号', '异常类型', '异常描述']
+                for col_idx, col_name in enumerate(error_columns, 1):
+                    ws_errors.cell(row=1, column=col_idx, value=col_name)
+
+                for row_idx, error in enumerate(self.validation_errors, 2):
+                    for col_idx, col_name in enumerate(error_columns, 1):
+                        ws_errors.cell(row=row_idx, column=col_idx, value=error[col_name])
+
+            wb.save(output_path)
+
+            print(self.generate_summary())
+            print(f"\n✅ 变更记录已保存至: {output_path}")
+
+        except Exception as e:
+            # Top-level boundary of the interactive tool: report and keep the
+            # console alive instead of crashing.
+            print(f"\n❌ 处理过程中出错: {str(e)}")
+            import traceback
+            traceback.print_exc()
+        finally:
+            # Release the hidden Tk root on every exit path (cancel, success,
+            # error); previously it was never destroyed.
+            root.destroy()
+
+
+if __name__ == "__main__":
+    # Console banner followed by the header requirement for input workbooks.
+    banner_rule = "=" * 60
+    print(banner_rule)
+    print(" PCBA BOM 变更记录生成工具 ")
+    print(banner_rule)
+    print("要求: 标题行必须同时包含 'Item', 'Partnumber', 'MF_PN'")
+
+    comparator = BOMComparator()
+    comparator.generate_change_record()
+
+    print("\n" + "=" * 50)
+    print(" 处理完成,按任意键退出... ")
+    # NOTE: the blocking input() pause is currently disabled, so the script
+    # exits immediately after printing the message above.
+    # input()
diff --git a/BOMCompare/BOMConsolidatorV1.py b/BOMCompare/BOMConsolidatorV1.py
new file mode 100644
index 0000000..9180244
--- /dev/null
+++ b/BOMCompare/BOMConsolidatorV1.py
@@ -0,0 +1,618 @@
+import pandas as pd
+import os
+import glob
+import re
+from datetime import datetime
+import tkinter as tk
+from tkinter import filedialog
+from collections import defaultdict
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Tuple, Any
+from dataclasses import dataclass
+
+
+@dataclass
+class ProcessedFileInfo:
+    """Bookkeeping record for one successfully parsed BOM workbook."""
+    filename: str  # base name of the workbook
+    sheet_name: str  # sheet the BOM was read from
+    start_row: int  # 0-based index of the header row (reports display it 1-based)
+    total_rows: int  # rows read below the header, before cleaning
+    valid_rows: int  # rows that produced a valid BOMRow
+
+
+@dataclass
+class BOMRow:
+    """One parsed BOM line together with the file and sheet it came from."""
+    partnumber: str
+    purchase_code: str
+    mf_pn: str
+    description: str
+    part_type: str
+    mf_name: str
+    pcb_footprint: str
+    quantity: int
+    reference: str
+    filename: str = ""
+    sheet_name: str = ""
+
+    @staticmethod
+    def _clean_text(value: Any) -> str:
+        """Return ``value`` as stripped text; missing cells (None/NaN/NA) become ''."""
+        if value is None:
+            return ''
+        try:
+            if pd.isna(value):
+                return ''
+        except (TypeError, ValueError):
+            # Array-like cell content: not a missing scalar, fall through to str().
+            pass
+        return str(value).strip()
+
+    @classmethod
+    def from_dataframe_row(cls, row: pd.Series, filename: str = "", sheet_name: str = "") -> Optional['BOMRow']:
+        """Build a BOMRow from one DataFrame row.
+
+        Empty cells are stored as ``''`` rather than the literal text ``'nan'``,
+        so ``get_key`` can fall back to MF_PN when Partnumber is blank and a
+        fully blank row is reported as invalid.
+
+        Args:
+            row: Row holding the BOM columns (Partnumber, Purchase_Code, ...).
+            filename: Name of the workbook the row came from.
+            sheet_name: Name of the sheet the row came from.
+
+        Returns:
+            The new BOMRow, or ``None`` when Quantity cannot be converted to an
+            integer (including a missing Quantity cell).
+        """
+        try:
+            text = cls._clean_text
+            return cls(
+                partnumber=text(row.get('Partnumber', '')),
+                purchase_code=text(row.get('Purchase_Code', '')),
+                mf_pn=text(row.get('MF_PN', '')),
+                description=text(row.get('Description', '')),
+                part_type=text(row.get('Part_Type', '')),
+                mf_name=text(row.get('MF_NAME', '')),
+                pcb_footprint=text(row.get('PCB_Footprint', '')),
+                quantity=int(row.get('Quantity', 0)),
+                reference=text(row.get('Reference', '')),
+                filename=filename,
+                sheet_name=sheet_name
+            )
+        except (ValueError, TypeError):
+            return None
+
+    def get_key(self) -> str:
+        """Return the merge key: the part number, or MF_PN when it is empty."""
+        return self.partnumber if self.partnumber else self.mf_pn
+
+    def is_valid(self) -> bool:
+        """Return True when the row has a non-empty merge key."""
+        return bool(self.get_key())
+
+
+@dataclass
+class ConsolidatedMaterial:
+    """A material merged across several BOM files."""
+    partnumber: str
+    purchase_code: str
+    mf_pn: str
+    description: str
+    part_type: str
+    mf_name: str
+    pcb_footprint: str
+    quantity_data: Dict[str, int]  # file name -> quantity
+    inconsistencies: List[str]
+
+    @property
+    def total_quantity(self) -> int:
+        """Sum of the quantities recorded for every file."""
+        per_file_quantities = self.quantity_data.values()
+        return sum(per_file_quantities)
+
+    @property
+    def has_inconsistencies(self) -> bool:
+        """True when at least one inconsistency message was recorded."""
+        return bool(self.inconsistencies)
+
+
+class ConsistencyChecker:
+    """Compares BOM rows against already-merged materials and reports mismatches."""
+
+    def __init__(self):
+        self.fields_to_check = ['Purchase_Code', 'MF_PN', 'Part_Type', 'MF_NAME', 'PCB_Footprint']
+
+    def check_field_consistency(self, existing: ConsolidatedMaterial, new_row: BOMRow) -> List[str]:
+        """Return one message per descriptive field that differs between the two.
+
+        A field is skipped when the incoming value is empty or a placeholder
+        (see ``_should_check_field``).
+        """
+        # (label used in the message, attribute name shared by both objects)
+        compared_fields = (
+            ('Purchase_Code', 'purchase_code'),
+            ('MF_PN', 'mf_pn'),
+            ('Part_Type', 'part_type'),
+            ('MF_NAME', 'mf_name'),
+            ('PCB_Footprint', 'pcb_footprint'),
+        )
+
+        messages = []
+        for field, attr_name in compared_fields:
+            existing_val = getattr(existing, attr_name)
+            new_val = getattr(new_row, attr_name)
+
+            if not self._should_check_field(existing_val, new_val):
+                continue
+            if existing_val == new_val:
+                continue
+            messages.append(
+                f"{field}不一致: {existing_val} ≠ {new_val} (文件: {new_row.filename}, Sheet: {new_row.sheet_name})"
+            )
+        return messages
+
+    def check_quantity_reference(self, row: BOMRow) -> Optional[str]:
+        """Return a message when the number of designators differs from Quantity.
+
+        Designators are the non-blank, comma-separated entries of
+        ``row.reference``; rows without a reference are not checked.
+        """
+        if not row.reference:
+            return None
+
+        ref_count = sum(1 for designator in row.reference.split(',') if designator.strip())
+        if ref_count == row.quantity:
+            return None
+        return f"Reference数量不符: {ref_count}个位置 ≠ Quantity={row.quantity} (文件: {row.filename}, Sheet: {row.sheet_name})"
+
+    def _should_check_field(self, existing_val: str, new_val: str) -> bool:
+        """Return False for empty or placeholder incoming values, True otherwise.
+
+        Only ``new_val`` is inspected; ``existing_val`` is accepted but unused.
+        """
+        placeholders = ('nan', 'none', 'null')
+        return bool(new_val) and new_val.lower() not in placeholders
+
+
+class BOMFileParser:
+    """Finds the BOM sheet inside an Excel workbook and parses its rows."""
+
+    def __init__(self):
+        # Names that must all appear in a row for it to count as the header row.
+        self.required_headers = ['Item', 'Partnumber', 'Purchase_Code', 'MF_PN']
+        # Columns that must exist (after name cleaning) for the sheet to be parsed.
+        self.required_columns = ['Partnumber', 'Purchase_Code', 'MF_PN', 'Description',
+                                 'Part_Type', 'MF_NAME', 'PCB_Footprint', 'Quantity', 'Reference']
+
+    def find_valid_sheet(self, file_path: str) -> Tuple[Optional[str], Optional[int]]:
+        """Locate the sheet and header row that contain the BOM table.
+
+        Only the first 10 rows of each sheet are inspected. A row qualifies as
+        the header when its text contains every name in ``required_headers``.
+
+        Returns:
+            ``(sheet_name, header_row_index)`` with a 0-based row index, or
+            ``(None, None)`` when nothing matches or the file cannot be read.
+        """
+        try:
+            # The context manager closes the workbook handle on every path.
+            with pd.ExcelFile(file_path) as xl:
+                for sheet_name in xl.sheet_names:
+                    # Read only the rows that are actually inspected.
+                    preview = xl.parse(sheet_name, header=None, nrows=10)
+
+                    for i in range(len(preview)):
+                        row_text = str(preview.iloc[i].values)
+                        if all(col in row_text for col in self.required_headers):
+                            filename = os.path.basename(file_path)
+                            print(f"文件{filename}找到有效sheet {sheet_name}|有效数据行从 {i} 开始。")
+                            return sheet_name, i
+        except Exception as e:
+            # Best effort: an unreadable workbook is reported and skipped.
+            print(f"读取文件 {file_path} 时出错: {e}")
+
+        return None, None
+
+    def parse_file(self, file_path: str) -> Optional[Tuple[List[BOMRow], ProcessedFileInfo]]:
+        """Parse one BOM workbook.
+
+        Returns:
+            ``(rows, file_info)`` for a usable workbook, or ``None`` when no BOM
+            sheet is found, required columns are missing, or reading fails.
+        """
+        filename = os.path.basename(file_path)
+        sheet_name, header_row = self.find_valid_sheet(file_path)
+
+        # header_row may legitimately be 0, so the sheet name is the sentinel.
+        if sheet_name is None:
+            return None
+
+        try:
+            df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
+            total_rows = len(df)  # counted before empty rows are dropped
+            df = self._clean_dataframe(df)
+
+            if not self._validate_columns(df):
+                return None
+
+            bom_rows = []
+            for _, row_data in df.iterrows():
+                bom_row = BOMRow.from_dataframe_row(row_data, filename, sheet_name)
+                if bom_row and bom_row.is_valid():
+                    bom_rows.append(bom_row)
+
+            file_info = ProcessedFileInfo(
+                filename=filename,
+                sheet_name=sheet_name,
+                start_row=header_row,
+                total_rows=total_rows,
+                valid_rows=len(bom_rows)
+            )
+
+            return bom_rows, file_info
+
+        except Exception as e:
+            print(f"解析文件 {file_path} 时出错: {e}")
+            return None
+
+    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Return a copy of ``df`` without empty rows and with normalised column names.
+
+        Column labels are converted to text, stripped, have whitespace runs
+        replaced by ``_`` and every character outside ``[a-zA-Z0-9_]`` removed.
+        The caller's frame is left untouched.
+        """
+        cleaned = df.dropna(how='all')
+
+        # astype(str) keeps numeric or blank header cells from breaking the
+        # string accessor.
+        labels = pd.Index(cleaned.columns).astype(str)
+        labels = labels.str.strip().str.replace(r'\s+', '_', regex=True)
+        labels = labels.str.replace(r'[^a-zA-Z0-9_]', '', regex=True)
+        cleaned.columns = labels
+
+        return cleaned
+
+    def _validate_columns(self, df: pd.DataFrame) -> bool:
+        """Return True when every required column is present in ``df``."""
+        return all(col in df.columns for col in self.required_columns)
+
+
+class MaterialConsolidator:
+    """Merges BOM rows from several files into one record per material key."""
+
+    def __init__(self):
+        self.materials: Dict[str, ConsolidatedMaterial] = {}  # merge key -> material
+        self.consistency_checker = ConsistencyChecker()
+        self.file_quantities: Dict[str, Dict[str, int]] = defaultdict(dict)  # file -> key -> qty
+        self.processed_files_info: List[ProcessedFileInfo] = []
+
+    def add_bom_row(self, bom_row: BOMRow) -> None:
+        """Merge one BOM row into the consolidated materials.
+
+        The first row seen for a key defines the material's descriptive fields;
+        later rows are compared against it and every mismatch is recorded on
+        the material. Quantities are accumulated per file, so a part listed on
+        several lines of the same file contributes the sum of those lines
+        rather than only the last one.
+        """
+        key = bom_row.get_key()
+
+        if key not in self.materials:
+            self.materials[key] = ConsolidatedMaterial(
+                partnumber=bom_row.partnumber,
+                purchase_code=bom_row.purchase_code,
+                mf_pn=bom_row.mf_pn,
+                description=bom_row.description,
+                part_type=bom_row.part_type,
+                mf_name=bom_row.mf_name,
+                pcb_footprint=bom_row.pcb_footprint,
+                quantity_data={},
+                inconsistencies=[]
+            )
+
+        material = self.materials[key]
+
+        # Field-level consistency against the first occurrence of this key.
+        material.inconsistencies.extend(
+            self.consistency_checker.check_field_consistency(material, bom_row)
+        )
+
+        # Designator count versus Quantity for this row.
+        ref_inconsistency = self.consistency_checker.check_quantity_reference(bom_row)
+        if ref_inconsistency:
+            material.inconsistencies.append(ref_inconsistency)
+
+        # Accumulate instead of overwrite so repeated lines are not lost.
+        per_file = material.quantity_data
+        per_file[bom_row.filename] = per_file.get(bom_row.filename, 0) + bom_row.quantity
+        file_totals = self.file_quantities[bom_row.filename]
+        file_totals[key] = file_totals.get(key, 0) + bom_row.quantity
+
+    def add_file_info(self, file_info: ProcessedFileInfo) -> None:
+        """Remember the processing details of one parsed file."""
+        self.processed_files_info.append(file_info)
+
+    def get_statistics(self) -> Dict[str, Any]:
+        """Return material, inconsistency and file counts plus the per-file details."""
+        materials = list(self.materials.values())
+        total_inconsistencies = sum(len(mat.inconsistencies) for mat in materials)
+        materials_with_issues = sum(1 for mat in materials if mat.has_inconsistencies)
+
+        return {
+            'total_materials': len(self.materials),
+            'total_inconsistencies': total_inconsistencies,
+            'materials_with_issues': materials_with_issues,
+            'file_count': len(self.file_quantities),
+            'processed_files_info': self.processed_files_info
+        }
+
+
+class ReportGenerator:
+    """Writes the consolidated BOM report (summary, data sources, merged data)."""
+
+    def __init__(self, output_folder: str):
+        """Remember the base folder and create its ``BOM_Merge_out`` sub-folder."""
+        self.output_folder = output_folder
+        self._ensure_output_directory()
+
+    def _ensure_output_directory(self):
+        """Create ``<output_folder>/BOM_Merge_out`` if it does not exist yet."""
+        output_dir = os.path.join(self.output_folder, "BOM_Merge_out")
+        os.makedirs(output_dir, exist_ok=True)
+
+    @staticmethod
+    def _autofit_columns(sheet, max_width: int) -> None:
+        """Size every column of ``sheet`` to its longest cell text plus 2, capped at ``max_width``."""
+        for column_cells in sheet.columns:
+            longest = max((len(str(cell.value)) for cell in column_cells), default=0)
+            letter = column_cells[0].column_letter
+            sheet.column_dimensions[letter].width = min(longest + 2, max_width)
+
+    def _create_summary_sheet(self, stats: Dict[str, Any]) -> pd.DataFrame:
+        """Build the two-column summary table from the statistics dictionary."""
+        files_info = stats.get('processed_files_info', [])
+
+        summary_data = [
+            ["BOM合并检查汇总报告", ""],
+            ["生成时间", datetime.now().strftime("%Y-%m-%d %H:%M:%S")],
+            ["", ""],
+            ["处理统计", ""],
+            ["扫描文件总数", stats['total_files']],
+            ["成功处理文件数", stats['processed_files']],
+            ["处理数据行数", stats['processed_rows']],
+            ["", ""],
+            ["物料统计", ""],
+            ["合并物料种类数", stats['total_materials']],
+            ["存在问题的物料数", stats['materials_with_issues']],
+            ["不一致问题总数", stats['total_inconsistencies']],
+            ["", ""],
+            ["数据源文件信息", ""],
+            ["有效文件总数", len(files_info)],
+            ["", ""]
+        ]
+
+        # One detail group per source file.
+        for i, file_info in enumerate(files_info, 1):
+            summary_data.extend([
+                [f"数据源文件 {i}", file_info.filename],
+                [" Sheet名称", file_info.sheet_name],
+                [" 起始行", file_info.start_row + 1],  # shown as a 1-based row number
+                [" 总行数", file_info.total_rows],
+                [" 有效行数", file_info.valid_rows],
+                ["", ""]
+            ])
+
+        summary_data.extend([
+            ["", ""],
+            ["文件信息", ""],
+            ["输出文件夹", os.path.join(self.output_folder, "BOM_Merge_out")],
+            ["报告文件", stats.get('output_filename', '')],
+            ["合并Sheet名称", "BOM_Merge"]
+        ])
+
+        return pd.DataFrame(summary_data, columns=["项目", "数值"])
+
+    def _create_data_source_sheet(self, stats: Dict[str, Any]) -> pd.DataFrame:
+        """Build the table that lists every successfully parsed source file."""
+        files_info = stats.get('processed_files_info', [])
+
+        if not files_info:
+            return pd.DataFrame([["无有效数据源文件", ""]], columns=["状态", "说明"])
+
+        return pd.DataFrame([
+            {
+                '序号': i,
+                '文件名': file_info.filename,
+                'Sheet名称': file_info.sheet_name,
+                '数据起始行': file_info.start_row + 1,  # shown as a 1-based row number
+                '总行数': file_info.total_rows,
+                '有效行数': file_info.valid_rows,
+                '处理状态': '成功'
+            }
+            for i, file_info in enumerate(files_info, 1)
+        ])
+
+    def _create_merge_sheet(self, consolidator: MaterialConsolidator) -> pd.DataFrame:
+        """Build the merged table: one row per material, one quantity column per file."""
+        report_data = []
+        file_columns = sorted(consolidator.file_quantities.keys())
+
+        for material in consolidator.materials.values():
+            row = {
+                'Partnumber': material.partnumber,
+                'Purchase_Code': material.purchase_code,
+                'MF_PN': material.mf_pn,
+                'Description': material.description,
+                'Part Type': material.part_type,
+                'MF_NAME': material.mf_name,
+                'PCB_Footprint': material.pcb_footprint,
+                '检查信息': '; '.join(material.inconsistencies) if material.inconsistencies else '一致'
+            }
+
+            # Per-file quantities (0 when the file does not use the material), then the total.
+            for file in file_columns:
+                row[file] = material.quantity_data.get(file, 0)
+            row['合计'] = material.total_quantity
+
+            report_data.append(row)
+
+        return pd.DataFrame(report_data)
+
+    def generate_consolidated_report(self, consolidator: MaterialConsolidator, stats: Dict[str, Any]) -> Optional[str]:
+        """Write the three-sheet Excel report.
+
+        Args:
+            consolidator: Source of the merged materials and per-file quantities.
+            stats: Statistics shown on the summary sheet. On success the key
+                ``'output_filename'`` is added to it.
+
+        Returns:
+            The path of the written workbook, or ``None`` when there is nothing
+            to report or saving fails (the error is printed).
+        """
+        if not consolidator.materials:
+            return None
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        output_filename = f"BOM合并报告_{timestamp}.xlsx"
+        output_path = os.path.join(self.output_folder, "BOM_Merge_out", output_filename)
+
+        # Publish the file name BEFORE the summary sheet is built; otherwise its
+        # "报告文件" row is always written empty.
+        stats['output_filename'] = output_filename
+
+        try:
+            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+                self._create_summary_sheet(stats).to_excel(writer, sheet_name='汇总信息', index=False)
+                self._create_data_source_sheet(stats).to_excel(writer, sheet_name='数据源文件', index=False)
+                self._create_merge_sheet(consolidator).to_excel(writer, sheet_name='BOM_Merge', index=False)
+
+                workbook = writer.book
+
+                # Fixed widths for the summary, content-based widths elsewhere.
+                summary_sheet = workbook['汇总信息']
+                summary_sheet.column_dimensions['A'].width = 25
+                summary_sheet.column_dimensions['B'].width = 40
+                self._autofit_columns(workbook['数据源文件'], 30)
+                self._autofit_columns(workbook['BOM_Merge'], 50)
+
+            return output_path
+
+        except Exception as e:
+            # No report was produced, so do not advertise a file name.
+            stats.pop('output_filename', None)
+            print(f"保存报告失败: {e}")
+            return None
+
+
+class BOMProcessor:
+    """Main controller: parses every BOM file in a folder and produces the report."""
+
+    def __init__(self):
+        self.file_parser = BOMFileParser()
+        self.material_consolidator = MaterialConsolidator()
+        self.report_generator: Optional[ReportGenerator] = None
+
+        # Run counters.
+        self.processed_files = 0
+        self.processed_rows = 0
+        self.total_files = 0
+
+    def set_output_folder(self, folder_path: str):
+        """Create the report generator that writes below ``folder_path``."""
+        self.report_generator = ReportGenerator(folder_path)
+
+    def process_folder(self, folder_path: str) -> bool:
+        """Parse every ``*.xlsx`` file directly inside ``folder_path``.
+
+        Returns:
+            True when at least one file was parsed successfully.
+        """
+        pattern = os.path.join(folder_path, "*.xlsx")
+        bom_files = glob.glob(pattern)
+        self.total_files = len(bom_files)
+
+        if self.total_files == 0:
+            return False
+
+        parsed_ok = sum(1 for path in bom_files if self._process_single_file(path))
+        self.processed_files = parsed_ok
+        return parsed_ok > 0
+
+    def _process_single_file(self, file_path: str) -> bool:
+        """Parse one file and feed its rows to the consolidator; False when it cannot be parsed."""
+        filename = os.path.basename(file_path)
+        print(f"处理文件: {filename}...")
+
+        parsed = self.file_parser.parse_file(file_path)
+        if not parsed:
+            print(f" ! 无法解析文件: {filename}")
+            return False
+
+        bom_rows, file_info = parsed
+        print(f" √ 文件{filename}找到 {len(bom_rows)} 行有效数据 (Sheet: {file_info.sheet_name})")
+
+        consolidator = self.material_consolidator
+        consolidator.add_file_info(file_info)
+
+        # Count each row only after it has been merged.
+        for bom_row in bom_rows:
+            consolidator.add_bom_row(bom_row)
+            self.processed_rows += 1
+
+        return True
+
+    def generate_report(self) -> Optional[Dict[str, Any]]:
+        """Write the report and return the statistics, or ``None`` on failure.
+
+        ``None`` is also returned when no output folder has been set.
+        """
+        generator = self.report_generator
+        if not generator:
+            return None
+
+        # Consolidator statistics extended with this processor's counters.
+        stats = self.material_consolidator.get_statistics()
+        stats.update(
+            processed_files=self.processed_files,
+            total_files=self.total_files,
+            processed_rows=self.processed_rows,
+        )
+
+        output_path = generator.generate_consolidated_report(self.material_consolidator, stats)
+        if not output_path:
+            return None
+
+        stats['output_path'] = output_path
+        return stats
+
+
+class UserInterface:
+    """Dialog and console helpers for the consolidation tool."""
+
+    @staticmethod
+    def select_folder(title: str = "选择文件夹") -> str:
+        """Show a folder picker and return the chosen path.
+
+        The return value is whatever ``filedialog.askdirectory`` yields.
+        """
+        hidden_root = tk.Tk()
+        hidden_root.withdraw()
+        chosen = filedialog.askdirectory(title=title)
+        hidden_root.destroy()
+        return chosen
+
+    @staticmethod
+    def print_summary(stats: Dict[str, Any], folder_path: str):
+        """Print the run statistics, the source files and the report location."""
+        rule = "=" * 60
+        print("\n" + rule)
+        print("BOM合并检查完成!")
+        print(rule)
+        print(f"处理文件夹: {folder_path}")
+
+        # Counter lines, printed in this fixed order.
+        counter_lines = (
+            ("扫描文件数", 'total_files'),
+            ("成功处理文件数", 'processed_files'),
+            ("处理数据行数", 'processed_rows'),
+            ("合并物料种类数", 'total_materials'),
+            ("存在问题的物料数", 'materials_with_issues'),
+            ("不一致问题总数", 'total_inconsistencies'),
+        )
+        for label, key in counter_lines:
+            print(f"{label}: {stats[key]}")
+
+        # Source files that were parsed successfully.
+        files_info = stats.get('processed_files_info', [])
+        print(f"有效数据源文件数: {len(files_info)}")
+        for file_info in files_info:
+            print(f" - {file_info.filename} (Sheet: {file_info.sheet_name}, 有效行: {file_info.valid_rows})")
+
+        print(f"报告文件: {stats['output_path']}")
+        print(rule)
+
+        # Folder that holds the generated report.
+        output_dir = os.path.join(folder_path, "BOM_Merge_out")
+        print(f"输出保存在: {output_dir}")
+
+        print("\n报告包含三个Sheet:")
+        for sheet_note in (
+            "1. '汇总信息' - 处理统计和汇总信息",
+            "2. '数据源文件' - 有效数据源文件详细信息",
+            "3. 'BOM_Merge' - 合并后的物料数据",
+        ):
+            print(sheet_note)
+
+
+def main():
+    """Run the interactive consolidation: pick a folder, parse it, write the report."""
+    bom_processor = BOMProcessor()
+
+    # Ask for the folder that holds the BOM workbooks.
+    folder_path = UserInterface.select_folder("选择包含BOM文件的文件夹")
+    if not folder_path:
+        print("未选择文件夹,程序退出")
+        return
+
+    # The report is written below the same folder.
+    bom_processor.set_output_folder(folder_path)
+
+    print(f"开始处理文件夹: {folder_path}")
+    if not bom_processor.process_folder(folder_path):
+        print("没有找到可处理的BOM文件")
+        return
+
+    print("\n生成合并报告...")
+    stats = bom_processor.generate_report()
+    if not stats:
+        print("生成报告失败")
+        return
+
+    UserInterface.print_summary(stats, folder_path)
+
+
+if __name__ == "__main__":
+    main()
+    # Keep the console window open until the user presses Enter.
+    input("\n按任意键退出...")
diff --git a/BOMCompare/README.md b/BOMCompare/README.md
new file mode 100644
index 0000000..8caf796
--- /dev/null
+++ b/BOMCompare/README.md
@@ -0,0 +1,14 @@
+# Sample GitLab Project
+
+This sample project shows how a project in GitLab looks for demonstration purposes. It contains issues, merge requests and Markdown files in many branches,
+named and filled with lorem ipsum.
+
+You can look around to get an idea how to structure your project and, when done, you can safely delete this project.
+
+[Learn more about creating GitLab projects.](https://docs.gitlab.com/ee/gitlab-basics/create-project.html)
+
+# 基于标准格式的 BOM文件,输出 BOM差异信息文件
+BOMCompareForJP.py
+
+# 基于标准格式的 BOM文件,输出 BOM的合并后的文件,方便校对和物料备料情况的分析。
+BOMConsolidator.py
\ No newline at end of file
diff --git a/FFT_IMU/.gitignore b/FFT_IMU/.gitignore
new file mode 100644
index 0000000..f74fbf0
--- /dev/null
+++ b/FFT_IMU/.gitignore
@@ -0,0 +1,19 @@
+/build/*
+/build
+/dist/*
+/dist
+/source/*
+/source
+/dataProcess_out*
+*.xls
+*.xlsx
+*.csv
+*.spec
+
+/src
+
+/temp
+
+FFT_IMU_dc_html_v2.py
+
+FFT_IMU_dc_v2.py
\ No newline at end of file
diff --git a/FFT_IMU/FFT_IMU_dc_html_v1.py b/FFT_IMU/FFT_IMU_dc_html_v1.py
new file mode 100644
index 0000000..bc4b711
--- /dev/null
+++ b/FFT_IMU/FFT_IMU_dc_html_v1.py
@@ -0,0 +1,739 @@
+import pandas as pd
+import numpy as np
+import matplotlib
+
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from scipy import signal
+import os
+import glob
+from datetime import datetime
+import time
+from multiprocessing import Pool, cpu_count
+from matplotlib.colors import Normalize
+from matplotlib.ticker import MaxNLocator
+import re
+from colorama import Fore, Style, init
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import warnings
+import threading
+
+# 初始化colorama
+init(autoreset=True)
+
+# 忽略特定的matplotlib警告
+warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
+warnings.filterwarnings("ignore", category=FutureWarning, module="matplotlib")
+
+# 创建线程锁,确保文件操作和日志输出的线程安全
+file_lock = threading.Lock()
+log_lock = threading.Lock()
+
+
+class IMUDataAnalyzer:
+ def __init__(self, file_path):
+ """Set up the analyzer for one data file.
+
+ Stores the path, infers the data type and a default sampling rate from
+ the file name, creates a timestamped output directory next to the input
+ file, and configures matplotlib fonts and layout options.
+
+ Args:
+ file_path: Path of the data file to analyse.
+ """
+ self.file_path = file_path
+ self.data = None
+ self.sampling_rate = None
+ self.fig_size = (15, 10)
+ self.spectrogram_params = {} # spectrogram computation parameters, filled by compute_spectrogram
+
+ # Infer the data type and default sampling rate from the file name
+ file_name = os.path.basename(file_path).lower()
+ if 'calib' in file_name:
+ self.data_type = 'calib'
+ self.default_sampling_rate = 5
+ elif 'raw' in file_name:
+ self.data_type = 'raw'
+ self.default_sampling_rate = 1000
+ else:
+ self.data_type = 'unknown'
+ self.default_sampling_rate = 5
+
+ # Split the input path into its directory and its base name without extension
+ file_dir = os.path.dirname(os.path.abspath(file_path))
+ file_base_name = os.path.splitext(os.path.basename(file_path))[0]
+ self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+ # Output directory: "<file base name>_output_<timestamp>" beside the input file
+ self.output_dir = os.path.join(file_dir, f"{file_base_name}_output_{self.timestamp}")
+
+ # Create the directory while holding file_lock
+ # NOTE(review): file_lock is a threading.Lock; confirm it also gives the intended protection if analyzers run in separate processes
+ with file_lock:
+ if not os.path.exists(self.output_dir):
+ os.makedirs(self.output_dir)
+ self.log_progress(f"创建输出目录:{self.output_dir}", "INFO")
+
+ # Font settings (candidate sans-serif fonts, ASCII minus sign on axes)
+ plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'Arial']
+ plt.rcParams['axes.unicode_minus'] = False
+
+ # matplotlib compatibility options: constrained layout off, to avoid layout-engine conflicts
+ plt.rcParams['figure.constrained_layout.use'] = False
+ plt.rcParams['figure.constrained_layout.h_pad'] = 0.02
+ plt.rcParams['figure.constrained_layout.w_pad'] = 0.02
+ plt.rcParams['figure.constrained_layout.hspace'] = 0.02
+ plt.rcParams['figure.constrained_layout.wspace'] = 0.02
+
+ self.log_progress(f"处理文件:{self.file_path}", "INFO")
+ self.log_progress(f"数据类型:{self.data_type}", "INFO")
+ self.log_progress(f"输出路径:{self.output_dir}", "INFO")
+
+    def log_progress(self, message, level="INFO"):
+        """Print a timestamped, colour-coded log line while holding ``log_lock``.
+
+        Args:
+            message: Text to print.
+            level: "INFO", "WARNING", "ERROR" or "SUCCESS"; any other value
+                prints the message without a level prefix.
+        """
+        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+
+        # Colour and text placed between the timestamp and the message.
+        level_prefix = {
+            "INFO": f"{Fore.GREEN}",
+            "WARNING": f"{Fore.YELLOW}警告: ",
+            "ERROR": f"{Fore.RED}错误: ",
+            "SUCCESS": f"{Fore.GREEN}✓ ",
+        }
+
+        with log_lock:
+            print(f"{Fore.CYAN}[{timestamp}] {level_prefix.get(level, '')}{message}")
+
+    def check_imu_columns_in_file(self):
+        """Return True when the first line of the file mentions "imu" (any case).
+
+        Only the first line is read. A missing keyword is logged as a warning
+        and any error while reading is logged as an error; both return False.
+        """
+        try:
+            # Reading one line is enough to see the header.
+            with open(self.file_path, 'r', encoding='utf-8', errors='ignore') as handle:
+                first_line = handle.readline().strip()
+
+            if re.search(r'imu', first_line, re.IGNORECASE) is not None:
+                return True
+
+            self.log_progress(f"文件头部不包含'imu'关键词,跳过处理,first_line {first_line}", "WARNING")
+            return False
+
+        except Exception as e:
+            self.log_progress(f"检查文件头部时出错: {str(e)}", "ERROR")
+            return False
+
+    def detect_imu_columns(self):
+        """Detect the accelerometer, gyroscope and temperature columns of ``self.data``.
+
+        Columns are matched case-insensitively against ``imu<N>_...``. The
+        prefix of the first matching column (in column order) is used, so the
+        choice is the same on every run. When no IMU column exists, the
+        default ``imu1_*`` names are assumed.
+
+        Sets ``self.acc_columns``, ``self.gyro_columns`` and ``self.temp_columns``.
+        """
+        all_columns = self.data.columns.tolist()
+        # str() keeps non-text column labels from breaking the matching.
+        lowered = [str(col).lower() for col in all_columns]
+
+        # IMU prefixes (imu1, imu2, ...) in first-seen column order.
+        imu_prefixes = []
+        for name in lowered:
+            match = re.match(r'^(imu\d+)_', name)
+            if match and match.group(1) not in imu_prefixes:
+                imu_prefixes.append(match.group(1))
+
+        if not imu_prefixes:
+            self.log_progress("未检测到IMU数据列,尝试使用默认列名", "WARNING")
+            # Fall back to the common column names.
+            self.acc_columns = ['imu1_acc_x', 'imu1_acc_y', 'imu1_acc_z']
+            self.gyro_columns = ['imu1_gyro_x', 'imu1_gyro_y', 'imu1_gyro_z']
+            self.temp_columns = ['imu1_temp']
+            return
+
+        # Use the first detected prefix.
+        imu_prefix = imu_prefixes[0]
+        self.log_progress(f"检测到IMU前缀: {imu_prefix}", "INFO")
+
+        def axis_columns(sensor):
+            """Columns named '<prefix>_<sensor>...' that also carry an axis suffix."""
+            return [col for col, name in zip(all_columns, lowered)
+                    if name.startswith(f"{imu_prefix}_{sensor}") and
+                    any(axis in name for axis in ['_x', '_y', '_z'])]
+
+        self.acc_columns = axis_columns('acc')
+        self.gyro_columns = axis_columns('gyro')
+
+        # Temperature columns of the chosen IMU.
+        self.temp_columns = [col for col, name in zip(all_columns, lowered)
+                             if name.startswith(f"{imu_prefix}_temp")]
+
+        # Otherwise accept any column whose name mentions a temperature.
+        if not self.temp_columns:
+            self.temp_columns = [col for col, name in zip(all_columns, lowered)
+                                 if any(key in name for key in ['temp', 'temperature'])]
+
+        self.log_progress(f"加速度计列: {self.acc_columns}", "INFO")
+        self.log_progress(f"陀螺仪列: {self.gyro_columns}", "INFO")
+        self.log_progress(f"温度列: {self.temp_columns}", "INFO")
+
+    def estimate_sampling_rate(self):
+        """Estimate the sampling rate in Hz from the ``time`` column.
+
+        The rate is the inverse of the median difference between consecutive
+        time values, using only differences in the open interval (0, 10). When
+        there is no ``time`` column, the data has 10 rows or fewer, or no
+        difference qualifies, the file-name based default rate is returned.
+        """
+        frame = self.data
+        if 'time' in frame.columns and len(frame) > 10:
+            deltas = np.diff(frame['time'].values)
+            # Discard non-positive and implausibly large steps.
+            plausible = deltas[(deltas > 0) & (deltas < 10)]
+            if len(plausible) > 0:
+                estimated_rate = 1.0 / np.median(plausible)
+                self.log_progress(f"根据时间戳估计的采样率: {estimated_rate:.2f} Hz")
+                return estimated_rate
+
+        # No usable time information: fall back to the default rate.
+        self.log_progress(f"使用基于文件名的默认采样率: {self.default_sampling_rate} Hz")
+        return self.default_sampling_rate
+
+    def load_data(self):
+        """Load the CSV file and prepare it for analysis.
+
+        Steps: check that the header mentions IMU data, read the file, detect
+        the IMU columns, estimate the sampling rate, drop rows whose ``time``
+        lies outside (0, 1e6), and replace ``time`` with a uniform axis derived
+        from the row index and the sampling rate.
+
+        Raises:
+            ValueError: If the file header does not mention IMU data.
+        """
+        self.log_progress("开始加载数据...")
+        start_time = time.time()
+
+        # Skip files that contain no IMU data at all.
+        if not self.check_imu_columns_in_file():
+            raise ValueError("文件不包含IMU数据列,跳过处理")
+
+        # The file is read while holding file_lock.
+        with file_lock:
+            self.data = pd.read_csv(self.file_path)
+
+        self.log_progress(f"数据加载完成,共 {len(self.data)} 行,耗时 {time.time() - start_time:.2f}秒")
+
+        self.detect_imu_columns()
+        self.sampling_rate = self.estimate_sampling_rate()
+
+        # Drop rows with abnormal time values when a time column exists.
+        if 'time' in self.data.columns:
+            valid_time_mask = (self.data['time'] > 0) & (self.data['time'] < 1e6)
+            self.data = self.data[valid_time_mask].copy()
+
+        # In both cases the time axis is rebuilt from the sampling rate.
+        self.data['time'] = np.arange(len(self.data)) / self.sampling_rate
+
+ def remove_dc(self, signal_data):
+ """Return ``signal_data`` unchanged; the DC component is kept so it shows in the spectrum."""
+ # No-op despite the name: nothing is subtracted from the signal.
+ return signal_data
+
+    def compute_spectrogram(self, signal_data):
+        """Compute a lightly smoothed PSD spectrogram that keeps the DC component.
+
+        Window length and overlap are chosen from the sampling rate and the
+        signal length; the parameters used are stored in
+        ``self.spectrogram_params``.
+
+        Args:
+            signal_data: One-dimensional sequence of samples.
+
+        Returns:
+            Tuple ``(f, t, Sxx)`` as produced by ``scipy.signal.spectrogram``,
+            with ``Sxx`` passed through a Gaussian filter (sigma 0.7) when it
+            is not empty.
+        """
+        # DC is deliberately kept (remove_dc returns its input unchanged).
+        signal_data = self.remove_dc(signal_data)
+        n_samples = len(signal_data)
+
+        if self.sampling_rate <= 10:
+            # Low sampling rate: long window for frequency resolution, 75 % overlap.
+            nperseg = min(256, max(64, n_samples // 2))
+            noverlap = int(nperseg * 0.75)
+        else:
+            # High sampling rate: balance time and frequency resolution.
+            if n_samples < 10000:
+                nperseg = min(512, max(256, n_samples // 4))
+            else:
+                nperseg = min(1024, max(512, n_samples // 8))
+            noverlap = int(nperseg * 0.66)
+
+        # The window can never be longer than the signal. A fixed floor of 16
+        # used to exceed short signals and left noverlap >= the effective window.
+        nperseg = max(1, min(nperseg, n_samples))
+        noverlap = min(noverlap, nperseg - 1)
+
+        # Record the parameters for the report.
+        self.spectrogram_params = {
+            "nperseg": nperseg,
+            "noverlap": noverlap,
+            "window": "hamming",
+            "detrend": False,
+            "scaling": "density",
+            "mode": "psd"
+        }
+
+        f, t, Sxx = signal.spectrogram(
+            signal_data,
+            fs=self.sampling_rate,
+            window='hamming',
+            nperseg=nperseg,
+            noverlap=noverlap,
+            scaling='density',
+            detrend=False,  # keep DC
+            mode='psd'
+        )
+
+        # Light Gaussian smoothing to reduce graininess in the plot.
+        if Sxx.size > 0:
+            from scipy.ndimage import gaussian_filter
+            Sxx_smoothed = gaussian_filter(Sxx, sigma=0.7)
+            return f, t, Sxx_smoothed
+
+        return f, t, Sxx
+
+    def process_signal(self, args):
+        """Compute the log-scaled spectrogram of one signal (worker for parallel use).
+
+        Args:
+            args: Tuple ``(signal_data, axis)``; ``axis`` is returned unchanged
+                in the result.
+
+        Returns:
+            dict with ``'f'``, ``'t'``, ``'Sxx_log'`` (PSD in dB), ``'dc_log'``
+            (the row of ``Sxx_log`` nearest 0 Hz, one value per time bin) and
+            ``'axis'``.
+        """
+        signal_data, axis = args
+        f, t, Sxx = self.compute_spectrogram(signal_data)
+
+        # Avoid log10(0).
+        eps = np.finfo(float).eps
+        Sxx_log = 10 * np.log10(Sxx + eps)
+
+        # Thin out very long spectrograms to speed up plotting.
+        if len(t) > 1000:
+            time_indices = np.linspace(0, len(t) - 1, 1000, dtype=int)
+            # Reduce the frequency axis only when it has more than 500 bins;
+            # forcing 500 points onto fewer bins would duplicate frequencies.
+            if len(f) > 500:
+                freq_indices = np.linspace(0, len(f) - 1, 500, dtype=int)
+            else:
+                freq_indices = np.arange(len(f))
+            t = t[time_indices]
+            f = f[freq_indices]
+            Sxx_log = Sxx_log[freq_indices, :][:, time_indices]
+
+        # 0 Hz row: exact match when present, otherwise the nearest frequency.
+        zero_idx = np.where(np.isclose(f, 0.0))[0]
+        dc_idx = int(zero_idx[0]) if len(zero_idx) > 0 else int(np.argmin(np.abs(f - 0.0)))
+        dc_log = Sxx_log[dc_idx, :]  # 0 Hz PSD (dB) of every time window
+
+        return {
+            'f': f,
+            't': t,
+            'Sxx_log': Sxx_log,
+            'dc_log': dc_log,
+            'axis': axis
+        }
+
+    @staticmethod
+    def robust_dc_ylim(results, p_low=5, p_high=95, pad_ratio=0.05, fallback=(0.0, 1.0)):
+        """Compute a shared y-axis range for the DC traces of several results.
+
+        The range spans the ``p_low``..``p_high`` percentiles of all finite
+        ``'dc_log'`` values, widened on both sides by ``pad_ratio`` of that
+        span. ``fallback`` is returned when ``results`` is empty or holds no
+        finite value.
+        """
+        if not results:
+            return fallback
+
+        pooled = np.concatenate([entry['dc_log'].ravel() for entry in results])
+        finite_values = pooled[np.isfinite(pooled)]  # drops inf and NaN
+        if finite_values.size == 0:
+            return fallback
+
+        lo, hi = np.percentile(finite_values, [p_low, p_high])
+        # The span is floored so that a flat trace still gets a non-zero margin.
+        margin = max(1e-9, hi - lo) * pad_ratio
+        return lo - margin, hi + margin
+
+    def get_time_domain_stats(self):
+        """Summarise every sensor column as mean, std, max and min.
+
+        Returns:
+            dict keyed by sensor-group label; each value maps a column name
+            to its statistics.  Groups with an empty column list are left out.
+        """
+        groups = (
+            ('加速度计', self.acc_columns),
+            ('陀螺仪', self.gyro_columns),
+            ('温度', self.temp_columns),
+        )
+
+        summary = {}
+        for label, columns in groups:
+            if not columns:
+                continue
+            per_column = {}
+            for name in columns:
+                series = self.data[name]
+                per_column[name] = {
+                    '均值': series.mean(),
+                    '标准差': series.std(),
+                    '最大值': series.max(),
+                    '最小值': series.min()
+                }
+            summary[label] = per_column
+        return summary
+
+ def generate_html_report(self, time_domain_stats):
+ """Assemble the analysis report text and write it to ``report_<timestamp>.html``.
+
+ The text is built from the file path, the current time, the sampling
+ rate, the rows of ``time_domain_stats`` and the entries of
+ ``self.spectrogram_params``; the file is written to ``self.output_dir``.
+
+ Args:
+ time_domain_stats: mapping of sensor group -> column -> dict with
+ the keys '均值', '标准差', '最大值' and '最小值'.
+ """
+ # NOTE(review): as shown here the string literals carry no HTML tags and
+ # several are split across lines; verify them against the original script.
+ # NOTE(review): spectrogram_params is assigned in compute_spectrogram, which
+ # plot_rainfall_spectrograms runs through a process Pool - confirm the
+ # attribute also exists on this instance.
+ html_content = f"""
+
+
+
+
+
+ IMU数据分析报告 - {os.path.basename(self.file_path)}
+
+
+
+ IMU数据分析报告
+ 文件路径: {self.file_path}
+ 分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+ 采样率: {self.sampling_rate} Hz
+
+ 时域信号统计信息
+ """
+
+ # Append the time-domain statistics, one table per sensor group
+ for sensor_type, sensors in time_domain_stats.items():
+ html_content += f"{sensor_type}
"
+ html_content += ""
+ html_content += "| 传感器 | 均值 | 标准差 | 最大值 | 最小值 |
"
+ for col, stats in sensors.items():
+ html_content += f"| {col} | {stats['均值']:.4f} | {stats['标准差']:.4f} | {stats['最大值']:.4f} | {stats['最小值']:.4f} |
"
+ html_content += "
"
+
+ # Append the frequency-domain (spectrogram) parameters
+ html_content += """
+ 频域信号计算参数
+
+ | 参数 | 值 |
+ """
+ for key, value in self.spectrogram_params.items():
+ html_content += f"| {key} | {value} |
"
+ html_content += "
"
+
+ # File names of the images the report refers to
+ time_series_image = f'time_series_{self.timestamp}.png'
+ acc_spectrogram_image = f'acc_rainfall_spectrogram_{self.timestamp}.png'
+ gyro_spectrogram_image = f'gyro_rainfall_spectrogram_{self.timestamp}.png'
+
+ html_content += f"""
+ 时域信号图
+
+
+ 加速度计频谱雨点图
+
+
+ 陀螺仪频谱雨点图
+
+ """
+
+ html_content += """
+
+
+ """
+
+ # Save the HTML report (UTF-8)
+ report_path = os.path.join(self.output_dir, f'report_{self.timestamp}.html')
+ with open(report_path, 'w', encoding='utf-8') as f:
+ f.write(html_content)
+
+ self.log_progress(f"HTML报告已生成: {report_path}")
+
+    def plot_time_series(self):
+        """Plot acceleration, angular rate and temperature against time.
+
+        One subplot is drawn per sensor group that has columns (at most the
+        first three columns of the accelerometer and gyroscope groups, the
+        first temperature column).  The figure is saved as
+        ``time_series_<timestamp>.png`` in ``self.output_dir``.
+        """
+        self.log_progress("开始绘制时间序列图...")
+        start_time = time.time()
+
+        axis_labels = ['X', 'Y', 'Z']
+        axis_colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
+
+        # (title, y label, legend position, [(column, label, colour), ...]).
+        # Only groups that actually have columns get a panel, so no subplot
+        # is left empty when e.g. the accelerometer columns are missing.
+        panels = []
+        if self.acc_columns:
+            panels.append(('加速度时间序列', '加速度 (g)', 'upper right',
+                           list(zip(self.acc_columns, axis_labels, axis_colors))))
+        if self.gyro_columns:
+            panels.append(('陀螺仪时间序列', '角速度 (deg/s)', 'upper left',
+                           list(zip(self.gyro_columns, axis_labels, axis_colors))))
+        if self.temp_columns:
+            panels.append(('温度时间序列', '温度 (°C)', 'upper right',
+                           [(self.temp_columns[0], '温度', '#9467bd')]))
+
+        # Keep one (empty) axes when there is nothing to plot so the image
+        # file is still produced.
+        n_rows = max(1, len(panels))
+        fig, axes = plt.subplots(n_rows, 1, figsize=(12, 3 * n_rows), dpi=120, squeeze=False)
+
+        time_values = self.data['time']
+        x_max = time_values.max()
+        for ax, (title, ylabel, legend_loc, series) in zip(axes[:, 0], panels):
+            for col, label, color in series:
+                ax.plot(time_values, self.data[col],
+                        label=label, color=color, linewidth=1.0, alpha=0.8)
+            ax.set_title(title, fontsize=12)
+            ax.set_ylabel(ylabel, fontsize=10)
+            ax.legend(loc=legend_loc, fontsize=8, framealpha=0.5)
+            ax.grid(True, linestyle=':', alpha=0.5)
+            # max() of an empty column is NaN, which set_xlim rejects.
+            if np.isfinite(x_max):
+                ax.set_xlim(0, x_max)
+
+        # Label the time axis on the bottom panel, whichever group that is.
+        if panels:
+            axes[-1, 0].set_xlabel('时间 (s)', fontsize=10)
+
+        fig.tight_layout()
+        output_path = os.path.join(self.output_dir, f'time_series_{self.timestamp}.png')
+        fig.savefig(output_path, bbox_inches='tight', dpi=150)
+        plt.close(fig)
+        self.log_progress(f"时间序列图已保存为 {output_path},耗时 {time.time() - start_time:.2f}秒")
+
+    def plot_rainfall_spectrograms(self):
+        """Compute and plot the "rainfall" spectrograms of all sensor axes.
+
+        Spectrograms of up to three accelerometer and three gyroscope columns
+        are computed in a process pool.  Each sensor group is then drawn into
+        its own figure with one colour scale (5th-95th percentile of the
+        group's finite PSD values) and one DC y-range shared by its axes.
+        """
+        self.log_progress("开始并行绘制频谱雨点图...")
+        start_time = time.time()
+        axis_names = ["X", "Y", "Z"]
+
+        # zip() with the three axis names limits each group to three columns.
+        self.log_progress("准备加速度计数据...")
+        acc_signals = [(self.data[col], f'Acc {axis}')
+                       for col, axis in zip(self.acc_columns, axis_names)]
+
+        gyro_signals = []
+        if self.gyro_columns:
+            self.log_progress("准备陀螺仪数据...")
+            gyro_signals = [(self.data[col], f'Gyro {axis}')
+                            for col, axis in zip(self.gyro_columns, axis_names)]
+
+        all_signals = acc_signals + gyro_signals
+        if not all_signals:
+            self.log_progress("警告: 没有有效的数据列可供处理", "WARNING")
+            return
+
+        self.log_progress("使用多进程并行处理...")
+        with Pool(processes=min(len(all_signals), cpu_count())) as pool:
+            results = pool.map(self.process_signal, all_signals)
+
+        self.log_progress("分离结果...")
+        # (axis label prefix, name in the "computing" log, name in the range
+        #  logs, plot title prefix, output file prefix)
+        groups = (
+            ('Acc', '加速度计', '加速度', '加速度', 'acc_rainfall_spectrogram'),
+            ('Gyro', '陀螺仪', '陀螺仪', '角速度', 'gyro_rainfall_spectrogram'),
+        )
+
+        # First pass: shared colour and DC limits per group.
+        plans = []
+        for axis_prefix, sensor_name, log_name, title_prefix, filename_prefix in groups:
+            group_results = [r for r in results if r['axis'].startswith(axis_prefix)]
+            if not group_results:
+                continue
+            self.log_progress(f"计算{sensor_name}全局最小和最大值...")
+            psd_values = np.concatenate([r['Sxx_log'].ravel() for r in group_results])
+            # Drop NaN/inf, as robust_dc_ylim does, so a single bad value
+            # cannot turn the shared colour limits into NaN.
+            psd_values = psd_values[np.isfinite(psd_values)]
+            if psd_values.size:
+                vmin, vmax = np.percentile(psd_values, [5, 95])
+            else:
+                vmin, vmax = 0.0, 1.0
+
+            dc_ymin, dc_ymax = self.robust_dc_ylim(group_results)
+            self.log_progress(f"{log_name} DC (dB) 范围: {dc_ymin:.1f} 到 {dc_ymax:.1f}")
+            plans.append((group_results, vmin, vmax, dc_ymin, dc_ymax,
+                          log_name, title_prefix, filename_prefix))
+
+        # Second pass: one figure per group.
+        for (group_results, vmin, vmax, dc_ymin, dc_ymax,
+             log_name, title_prefix, filename_prefix) in plans:
+            self._plot_single_spectrogram(group_results, vmin, vmax, dc_ymin, dc_ymax,
+                                          title_prefix, filename_prefix)
+            self.log_progress(f"{log_name}功率谱密度范围: {vmin:.1f} dB 到 {vmax:.1f} dB")
+
+        total_time = time.time() - start_time
+        self.log_progress(f"频谱雨点图生成完成,总耗时 {total_time:.2f}秒")
+
+    def _plot_single_spectrogram(self, results, vmin, vmax, dc_ymin, dc_ymax, title_prefix, filename_prefix):
+        """Draw one figure with a scatter spectrogram row per result and save it.
+
+        Every row shows the PSD as coloured points over time and frequency,
+        the DC trace on a secondary y-axis and its own colour bar.
+        ``vmin``/``vmax`` fix the colour scale and ``dc_ymin``/``dc_ymax`` the
+        DC axis range for all rows.  The PNG is written to
+        ``<filename_prefix>_<timestamp>.png`` in ``self.output_dir``.
+        """
+        n_rows = len(results)
+        fig = plt.figure(constrained_layout=True, figsize=(14, 4 * n_rows), dpi=150)
+        grid = fig.add_gridspec(nrows=n_rows, ncols=2, width_ratios=[22, 1], wspace=0.05, hspace=0.12)
+
+        # Column 0 holds the spectrogram, column 1 the matching colour bar.
+        row_axes = [(fig.add_subplot(grid[row, 0]), fig.add_subplot(grid[row, 1]))
+                    for row in range(n_rows)]
+
+        for (main_ax, cbar_ax), entry in zip(row_axes, results):
+            times = entry['t']
+            freqs = entry['f']
+
+            # One (time, frequency) point per PSD value; the transposed
+            # ravel walks the values in the same time-major order.
+            points = main_ax.scatter(
+                np.repeat(times, len(freqs)),
+                np.tile(freqs, len(times)),
+                c=entry['Sxx_log'].T.ravel(),
+                cmap='jet',
+                s=3,
+                alpha=0.7,
+                vmin=vmin,
+                vmax=vmax,
+                rasterized=True
+            )
+
+            main_ax.set_title(f'{title_prefix}频谱雨点图 - {entry["axis"][-1]}(右侧为DC分量 dB)', fontsize=10)
+            main_ax.set_xlabel('时间 (s)', fontsize=9)
+            main_ax.set_ylabel('频率 (Hz)', fontsize=9)
+            main_ax.set_ylim(0, self.sampling_rate / 2)
+            main_ax.grid(True, linestyle=':', alpha=0.4)
+
+            # DC trace on a second y-axis sharing the time axis.
+            dc_ax = main_ax.twinx()
+            dc_ax.plot(times, entry['dc_log'], color='black', linewidth=1.2, alpha=0.85, label='DC (dB)')
+            dc_ax.set_ylabel('直流分量 (dB)', fontsize=9, color='black')
+            dc_ax.set_ylim(dc_ymin, dc_ymax)
+            dc_ax.tick_params(axis='y', labelcolor='black')
+            dc_ax.yaxis.set_major_locator(MaxNLocator(nbins=6))
+            dc_ax.grid(False)
+            dc_ax.legend(loc='upper right', fontsize=8, framealpha=0.5)
+
+            colorbar = fig.colorbar(points, cax=cbar_ax)
+            colorbar.set_label('功率谱密度 (dB)', fontsize=9)
+            cbar_ax.tick_params(labelsize=8)
+
+        output_path = os.path.join(self.output_dir, f'{filename_prefix}_{self.timestamp}.png')
+        plt.savefig(output_path, bbox_inches='tight', dpi=150)
+        plt.close(fig)
+        self.log_progress(f"{title_prefix}频谱雨点图已保存为 {output_path}")
+
+    def run_analysis(self):
+        """Run the whole pipeline: load, plot, compute statistics, write the report.
+
+        Returns:
+            bool: True when every stage finished, False when one raised.  A
+            ``ValueError`` is logged as a skipped file; any other exception
+            is logged as an error and its traceback is printed.
+        """
+        try:
+            self.log_progress("开始数据分析流程", "INFO")
+            began = time.time()
+
+            self.load_data()
+            self.plot_time_series()
+            self.plot_rainfall_spectrograms()
+
+            # The time-domain statistics feed straight into the HTML report.
+            summary = self.get_time_domain_stats()
+            self.generate_html_report(summary)
+
+            elapsed = time.time() - began
+            self.log_progress(f"分析完成,总耗时 {elapsed:.2f}秒", "SUCCESS")
+            self.log_progress(f"所有输出文件已保存到: {self.output_dir}", "INFO")
+        except ValueError as skip_reason:
+            # Raised for files without IMU data; treated as a skip.
+            self.log_progress(f"跳过文件: {str(skip_reason)}", "WARNING")
+            return False
+        except Exception as failure:
+            self.log_progress(f"分析过程中出现错误: {str(failure)}", "ERROR")
+            import traceback
+            traceback.print_exc()
+            return False
+        return True
+
+
+def process_single_file(file_path):
+    """Analyse one file and report the outcome as a tuple.
+
+    Returns:
+        tuple: ``(file_path, success, message)``.  An exception raised while
+        creating or running the analyzer is reported through its message.
+    """
+    # NOTE(review): run_analysis() returns False for skipped files and for
+    # logged errors alike, so both end up with the "skipped" message below -
+    # confirm whether failures should be reported separately.
+    try:
+        print(f"{Fore.BLUE}开始处理文件: {os.path.basename(file_path)}")
+        succeeded = IMUDataAnalyzer(file_path).run_analysis()
+    except Exception as e:
+        return (file_path, False, str(e))
+
+    if succeeded:
+        return (file_path, True, "处理成功")
+    return (file_path, False, "文件不包含IMU数据,已跳过")
+
+
+def _find_imu_csv_files(input_path):
+    """Return the sorted, de-duplicated IMU CSV candidates for *input_path*.
+
+    A directory is searched recursively for ``*.csv`` files; any other path
+    is used as given.  A path qualifies when it contains "imu"
+    (case-insensitive).
+    """
+    # NOTE(review): the pattern is matched against the whole path, so a parent
+    # directory such as "FFT_IMU" makes every CSV below it qualify - confirm
+    # this is intended.
+    if os.path.isdir(input_path):
+        candidates = glob.glob(os.path.join(input_path, "**", "*.csv"), recursive=True)
+    else:
+        candidates = [input_path]
+    return sorted({path for path in candidates if re.search(r'imu', path, re.IGNORECASE)})
+
+
+def main():
+    """Ask for a path, find the IMU CSV files and process them in a process pool.
+
+    Prints one line per finished file and a final summary of successful,
+    skipped and failed files.
+    """
+    print("=" * 60)
+    print(f"{Fore.CYAN}IMU数据频谱分析工具 - 多文件批量处理")
+    print("=" * 60)
+
+    print(f"{Fore.WHITE}请输入包含CSV文件的目录路径: ")
+    # Pasted paths are often wrapped in quotes; remove them as well.
+    input_path = input("> ").strip().strip('"\'').strip()
+
+    if not os.path.exists(input_path):
+        print(f"{Fore.RED}错误: 路径 '{input_path}' 不存在!")
+        return
+
+    csv_files = _find_imu_csv_files(input_path)
+    if not csv_files:
+        print(f"{Fore.YELLOW}警告: 未找到包含'imu'的CSV文件")
+        return
+
+    print(f"{Fore.GREEN}找到 {len(csv_files)} 个IMU数据文件:")
+    for i, file in enumerate(csv_files, 1):
+        print(f" {i}. {os.path.basename(file)}")
+
+    # Worker processes (not threads) are used, so the message says so.
+    worker_count = min(len(csv_files), cpu_count())
+    print(f"\n{Fore.CYAN}开始多进程处理文件 (使用 {worker_count} 个进程)...")
+
+    success_count = 0
+    skipped_count = 0
+    failed_files = []
+
+    with ProcessPoolExecutor(max_workers=worker_count) as executor:
+        future_to_file = {executor.submit(process_single_file, file): file for file in csv_files}
+
+        for future in as_completed(future_to_file):
+            file_path = future_to_file[future]
+            try:
+                file_path, success, message = future.result()
+            except Exception as e:
+                print(f"{Fore.RED}✗ 异常: {os.path.basename(file_path)} - {str(e)}")
+                failed_files.append((file_path, str(e)))
+                continue
+
+            if success:
+                print(f"{Fore.GREEN}✓ 完成: {os.path.basename(file_path)}")
+                success_count += 1
+            elif "跳过" in message:
+                # process_single_file marks skipped files through its message.
+                print(f"{Fore.YELLOW}↷ 跳过: {os.path.basename(file_path)} - {message}")
+                skipped_count += 1
+            else:
+                print(f"{Fore.RED}✗ 失败: {os.path.basename(file_path)} - {message}")
+                failed_files.append((file_path, message))
+
+    print(f"\n{Fore.CYAN}处理完成统计:")
+    print(f"{Fore.GREEN}成功: {success_count} 个文件")
+    print(f"{Fore.YELLOW}跳过: {skipped_count} 个文件(不包含IMU数据)")
+    print(f"{Fore.RED}失败: {len(failed_files)} 个文件")
+
+    if failed_files:
+        print(f"\n{Fore.YELLOW}失败文件详情:")
+        for file, error in failed_files:
+            print(f" {os.path.basename(file)}: {error}")
+
+
+# Script entry point: run main(), report Ctrl+C as a user interruption and
+# print the traceback of any other exception.
+if __name__ == "__main__":
+ try:
+ main()
+ except KeyboardInterrupt:
+ print(f"\n{Fore.YELLOW}用户中断程序执行")
+ except Exception as e:
+ print(f"{Fore.RED}程序运行出错: {str(e)}")
+ import traceback
+
+ traceback.print_exc()
\ No newline at end of file
diff --git a/FFT_IMU/FFT_IMU_dc_scan_v1.py b/FFT_IMU/FFT_IMU_dc_scan_v1.py
new file mode 100644
index 0000000..26bfa6a
--- /dev/null
+++ b/FFT_IMU/FFT_IMU_dc_scan_v1.py
@@ -0,0 +1,648 @@
+import pandas as pd
+import numpy as np
+import matplotlib
+
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from scipy import signal
+import os
+import glob
+from datetime import datetime
+import time
+from multiprocessing import Pool, cpu_count
+from matplotlib.colors import Normalize
+from matplotlib.ticker import MaxNLocator
+import re
+from colorama import Fore, Style, init
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import warnings
+import threading
+
+# Initialise colorama (autoreset enabled)
+init(autoreset=True)
+
+# Ignore UserWarning and FutureWarning coming from matplotlib modules
+warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
+warnings.filterwarnings("ignore", category=FutureWarning, module="matplotlib")
+
+# Thread locks held around file operations and log output.
+# NOTE(review): files are processed through ProcessPoolExecutor; confirm
+# whether threading locks give the intended protection there.
+file_lock = threading.Lock()
+log_lock = threading.Lock()
+
+
+class IMUDataAnalyzer:
+    def __init__(self, file_path):
+        """Prepare an analyzer for one CSV file and create its output directory.
+
+        The data type and the default sampling rate are inferred from the
+        file name: "calib" -> 5 Hz, "raw" -> 1000 Hz, anything else ->
+        "unknown" with 5 Hz.  Output goes to
+        ``<file stem>_output_<timestamp>`` next to the input file.
+
+        Args:
+            file_path: path of the CSV file to analyse.
+        """
+        self.file_path = file_path
+        self.data = None
+        self.sampling_rate = None
+        self.fig_size = (15, 10)
+        # Filled in by detect_imu_columns(); start empty so the plotting
+        # methods do not fail with AttributeError before data is loaded.
+        self.acc_columns = []
+        self.gyro_columns = []
+        self.temp_columns = []
+
+        # Infer data type and default sampling rate from the file name.
+        file_name = os.path.basename(file_path).lower()
+        if 'calib' in file_name:
+            self.data_type = 'calib'
+            self.default_sampling_rate = 5
+        elif 'raw' in file_name:
+            self.data_type = 'raw'
+            self.default_sampling_rate = 1000
+        else:
+            self.data_type = 'unknown'
+            self.default_sampling_rate = 5
+
+        file_dir = os.path.dirname(os.path.abspath(file_path))
+        file_base_name = os.path.splitext(os.path.basename(file_path))[0]
+        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        self.output_dir = os.path.join(file_dir, f"{file_base_name}_output_{self.timestamp}")
+
+        # Create the directory directly instead of checking first, so a
+        # directory that appears in between cannot make makedirs fail.
+        with file_lock:
+            try:
+                os.makedirs(self.output_dir)
+            except FileExistsError:
+                pass
+            else:
+                self.log_progress(f"创建输出目录:{self.output_dir}", "INFO")
+
+        # Fonts able to render the Chinese labels, and a plain minus sign.
+        plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'Arial']
+        plt.rcParams['axes.unicode_minus'] = False
+
+        # Constrained layout is off by default; its paddings are preset for
+        # figures that enable it explicitly.
+        plt.rcParams['figure.constrained_layout.use'] = False
+        plt.rcParams['figure.constrained_layout.h_pad'] = 0.02
+        plt.rcParams['figure.constrained_layout.w_pad'] = 0.02
+        plt.rcParams['figure.constrained_layout.hspace'] = 0.02
+        plt.rcParams['figure.constrained_layout.wspace'] = 0.02
+
+        self.log_progress(f"处理文件:{self.file_path}", "INFO")
+        self.log_progress(f"数据类型:{self.data_type}", "INFO")
+        self.log_progress(f"输出路径:{self.output_dir}", "INFO")
+
+    def log_progress(self, message, level="INFO"):
+        """Print *message* with a timestamp, coloured according to *level*.
+
+        Recognised levels are "INFO", "WARNING", "ERROR" and "SUCCESS"; any
+        other level prints the message without a level colour or prefix.
+        Printing happens while ``log_lock`` is held.
+        """
+        stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        # Text inserted between the timestamp and the message, per level.
+        level_prefix = {
+            "INFO": f"{Fore.GREEN}",
+            "WARNING": f"{Fore.YELLOW}警告: ",
+            "ERROR": f"{Fore.RED}错误: ",
+            "SUCCESS": f"{Fore.GREEN}✓ ",
+        }
+        with log_lock:
+            print(f"{Fore.CYAN}[{stamp}] {level_prefix.get(level, '')}{message}")
+
+    def check_imu_columns_in_file(self):
+        """Report whether the first line of the file mentions "imu".
+
+        Only the first line is read (UTF-8, undecodable bytes ignored) and it
+        is searched case-insensitively.
+
+        Returns:
+            bool: True on a match; False when there is no match or when
+            reading the file raised (both cases are logged).
+        """
+        try:
+            with open(self.file_path, 'r', encoding='utf-8', errors='ignore') as handle:
+                header = handle.readline().strip()
+
+            if re.search(r'imu', header, re.IGNORECASE) is None:
+                self.log_progress(f"文件头部不包含'imu'关键词,跳过处理,first_line {header}", "WARNING")
+                return False
+            return True
+
+        except Exception as e:
+            self.log_progress(f"检查文件头部时出错: {str(e)}", "ERROR")
+            return False
+
+    def detect_imu_columns(self):
+        """Locate accelerometer, gyroscope and temperature columns in ``self.data``.
+
+        Column names are matched case-insensitively against the pattern
+        ``imu<N>_...``.  When several IMU prefixes are present the
+        lowest-numbered one is used, so the choice is the same on every run.
+        When no prefix is found the ``imu1_*`` default names are assumed
+        without checking that they exist.  Results are stored in
+        ``self.acc_columns``, ``self.gyro_columns`` and ``self.temp_columns``.
+        """
+        all_columns = self.data.columns.tolist()
+
+        # Collect the IMU prefixes (imu1, imu2, ...).
+        imu_prefixes = set()
+        for col in all_columns:
+            match = re.match(r'^(imu\d+)_', col.lower())
+            if match:
+                imu_prefixes.add(match.group(1))
+
+        if not imu_prefixes:
+            self.log_progress("未检测到IMU数据列,尝试使用默认列名", "WARNING")
+            self.acc_columns = ['imu1_acc_x', 'imu1_acc_y', 'imu1_acc_z']
+            self.gyro_columns = ['imu1_gyro_x', 'imu1_gyro_y', 'imu1_gyro_z']
+            self.temp_columns = ['imu1_temp']
+            return
+
+        # A set has no stable order, so pick by IMU number instead of
+        # taking whichever element happens to come first.
+        imu_prefix = min(imu_prefixes, key=lambda prefix: (int(prefix[3:]), prefix))
+        self.log_progress(f"检测到IMU前缀: {imu_prefix}", "INFO")
+
+        def axis_columns(kind):
+            """Return the columns of *kind* ('acc'/'gyro') that name an axis."""
+            return [col for col in all_columns
+                    if col.lower().startswith(f"{imu_prefix}_{kind}") and
+                    any(axis in col.lower() for axis in ['_x', '_y', '_z'])]
+
+        self.acc_columns = axis_columns('acc')
+        self.gyro_columns = axis_columns('gyro')
+
+        self.temp_columns = [col for col in all_columns
+                             if col.lower().startswith(f"{imu_prefix}_temp")]
+
+        # Without a prefixed temperature column, accept any column whose
+        # name mentions temp/temperature.
+        if not self.temp_columns:
+            self.temp_columns = [col for col in all_columns
+                                 if any(name in col.lower() for name in ['temp', 'temperature'])]
+
+        self.log_progress(f"加速度计列: {self.acc_columns}", "INFO")
+        self.log_progress(f"陀螺仪列: {self.gyro_columns}", "INFO")
+        self.log_progress(f"温度列: {self.temp_columns}", "INFO")
+
+    def estimate_sampling_rate(self):
+        """Estimate the sampling rate from the ``time`` column.
+
+        The estimate is the reciprocal of the median time step, counting only
+        steps that are positive and below 10.  It is used when there is a
+        ``time`` column, more than 10 rows and at least one such step;
+        otherwise the file-name based default rate is returned.
+        """
+        can_estimate = 'time' in self.data.columns and len(self.data) > 10
+        if can_estimate:
+            steps = np.diff(self.data['time'].values)
+            # Discard non-positive and implausibly large steps.
+            usable_steps = steps[(steps > 0) & (steps < 10)]
+            if len(usable_steps) > 0:
+                rate = 1.0 / np.median(usable_steps)
+                self.log_progress(f"根据时间戳估计的采样率: {rate:.2f} Hz")
+                return rate
+
+        self.log_progress(f"使用基于文件名的默认采样率: {self.default_sampling_rate} Hz")
+        return self.default_sampling_rate
+
+    def load_data(self):
+        """Read the CSV, detect the IMU columns and rebuild the time axis.
+
+        Raises:
+            ValueError: when the header check finds no "imu" keyword.
+        """
+        self.log_progress("开始加载数据...")
+        began = time.time()
+
+        # Refuse files whose header does not mention IMU data.
+        if not self.check_imu_columns_in_file():
+            raise ValueError("文件不包含IMU数据列,跳过处理")
+
+        with file_lock:
+            self.data = pd.read_csv(self.file_path)
+
+        self.log_progress(f"数据加载完成,共 {len(self.data)} 行,耗时 {time.time() - began:.2f}秒")
+
+        self.detect_imu_columns()
+        self.sampling_rate = self.estimate_sampling_rate()
+
+        # Keep only rows whose recorded time lies strictly between 0 and 1e6 ...
+        if 'time' in self.data.columns:
+            recorded = self.data['time']
+            self.data = self.data[(recorded > 0) & (recorded < 1e6)].copy()
+
+        # ... then replace the time axis by a uniform one: row index / rate.
+        self.data['time'] = np.arange(len(self.data)) / self.sampling_rate
+
+ def remove_dc(self, signal_data):
+ """Return ``signal_data`` unchanged.
+
+ Despite the name, no DC component is removed: the signal is passed
+ through so that the DC component stays visible in the spectrum.
+ """
+ return signal_data
+
+ # def compute_spectrogram(self, signal_data):
+ # """计算频谱图(保留直流分量)"""
+ # # 保留直流分量
+ # signal_data = self.remove_dc(signal_data)
+ #
+ # # 自适应窗口大小 - 根据采样率调整
+ # if self.sampling_rate <= 10: # 低采样率
+ # nperseg = min(64, max(16, len(signal_data) // 4))
+ # else: # 高采样率
+ # nperseg = min(1024, max(64, len(signal_data) // 8))
+ #
+ # noverlap = nperseg // 2
+ #
+ # f, t, Sxx = signal.spectrogram(
+ # signal_data,
+ # fs=self.sampling_rate,
+ # window='hann',
+ # nperseg=nperseg,
+ # noverlap=noverlap,
+ # scaling='density',
+ # detrend=False, # 保留直流
+ # mode='psd' # 更高效的模式
+ # )
+ # return f, t, Sxx
+
+    def compute_spectrogram(self, signal_data):
+        """Compute a smoothed PSD spectrogram of *signal_data*, DC included.
+
+        Segment length and overlap are chosen from the sampling rate and the
+        number of samples.  The segment is limited to the signal length and
+        the overlap is derived from the final segment length, so signals
+        shorter than the nominal segment can be processed as well.
+
+        Returns:
+            tuple: ``(f, t, Sxx)`` from ``scipy.signal.spectrogram``; ``Sxx``
+            is smoothed with a Gaussian filter (sigma 0.7) when not empty.
+        """
+        # Pass-through: the DC component is kept on purpose.
+        signal_data = self.remove_dc(signal_data)
+        n_samples = len(signal_data)
+
+        if self.sampling_rate <= 10:
+            # Low rate: longer windows for better frequency resolution.
+            nperseg = min(256, max(64, n_samples // 2))
+            overlap_ratio = 0.75
+        else:
+            # High rate: balance time and frequency resolution.
+            if n_samples < 10000:
+                nperseg = min(512, max(256, n_samples // 4))
+            else:
+                nperseg = min(1024, max(512, n_samples // 8))
+            overlap_ratio = 0.66
+
+        # A segment may not be longer than the signal, and the overlap must
+        # stay below the segment length actually used; computing it after
+        # the clipping avoids "noverlap must be less than nperseg".
+        nperseg = min(nperseg, n_samples)
+        noverlap = min(int(nperseg * overlap_ratio), nperseg - 1)
+
+        f, t, Sxx = signal.spectrogram(
+            signal_data,
+            fs=self.sampling_rate,
+            window='hamming',
+            nperseg=nperseg,
+            noverlap=noverlap,
+            scaling='density',
+            detrend=False,  # keep the DC component
+            mode='psd'
+        )
+
+        # Light smoothing to reduce graininess in the plot.
+        if Sxx.size > 0:
+            from scipy.ndimage import gaussian_filter
+            return f, t, gaussian_filter(Sxx, sigma=0.7)
+
+        return f, t, Sxx
+
+    def process_signal(self, args):
+        """Compute the dB spectrogram and the DC trace of one signal.
+
+        Args:
+            args: ``(signal_data, axis)`` tuple; ``axis`` is a label that is
+                copied unchanged into the result.
+
+        Returns:
+            dict with the keys ``'f'`` (frequencies), ``'t'`` (segment
+            times), ``'Sxx_log'`` (PSD in dB, indexed ``[freq, time]``),
+            ``'dc_log'`` (the 0 Hz row of ``Sxx_log``) and ``'axis'``.
+        """
+        max_time_points = 1000
+        max_freq_points = 500
+
+        signal_data, axis = args
+        f, t, Sxx = self.compute_spectrogram(signal_data)
+
+        # Offset by machine epsilon so empty bins never produce log10(0).
+        Sxx_log = 10 * np.log10(Sxx + np.finfo(float).eps)
+
+        # Thin out very long recordings to keep plotting fast.
+        if len(t) > max_time_points:
+            time_indices = np.linspace(0, len(t) - 1, max_time_points, dtype=int)
+            t = t[time_indices]
+            Sxx_log = Sxx_log[:, time_indices]
+            # Decimate the frequency axis only when it is larger than the
+            # target; otherwise linspace would repeat indices and duplicate
+            # rows instead of reducing them.
+            if len(f) > max_freq_points:
+                freq_indices = np.linspace(0, len(f) - 1, max_freq_points, dtype=int)
+                f = f[freq_indices]
+                Sxx_log = Sxx_log[freq_indices, :]
+
+        # Prefer an exact 0 Hz bin; fall back to the bin closest to 0 Hz.
+        zero_idx = np.where(np.isclose(f, 0.0))[0]
+        dc_idx = int(zero_idx[0]) if len(zero_idx) > 0 else int(np.argmin(np.abs(f - 0.0)))
+        dc_log = Sxx_log[dc_idx, :]
+
+        return {
+            'f': f,
+            't': t,
+            'Sxx_log': Sxx_log,
+            'dc_log': dc_log,
+            'axis': axis
+        }
+
+    @staticmethod
+    def robust_dc_ylim(results, p_low=5, p_high=95, pad_ratio=0.05, fallback=(0.0, 1.0)):
+        """Derive one shared y-range for the DC traces of several results.
+
+        The range spans the ``p_low``..``p_high`` percentiles of all finite
+        ``'dc_log'`` values and is widened on both sides by ``pad_ratio`` of
+        its width.  ``fallback`` is returned when ``results`` is empty or
+        contains no finite value.
+        """
+        if not results:
+            return fallback
+
+        merged = np.concatenate([item['dc_log'].ravel() for item in results])
+        finite_values = merged[np.isfinite(merged)]
+        if finite_values.size == 0:
+            return fallback
+
+        lower, upper = np.percentile(finite_values, [p_low, p_high])
+        # Keep a tiny non-zero width so a constant trace still gets a range.
+        margin = max(1e-9, upper - lower) * pad_ratio
+        return lower - margin, upper + margin
+
+    def plot_time_series(self):
+        """Plot acceleration, angular rate and temperature against time.
+
+        One subplot is drawn per sensor group that has columns (at most the
+        first three columns of the accelerometer and gyroscope groups, the
+        first temperature column).  The figure is saved as
+        ``time_series_<timestamp>.png`` in ``self.output_dir``.
+        """
+        self.log_progress("开始绘制时间序列图...")
+        start_time = time.time()
+
+        axis_labels = ['X', 'Y', 'Z']
+        axis_colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
+
+        # (title, y label, legend position, [(column, label, colour), ...]).
+        # Only groups that actually have columns get a panel, so no subplot
+        # is left empty when e.g. the accelerometer columns are missing.
+        panels = []
+        if self.acc_columns:
+            panels.append(('加速度时间序列', '加速度 (g)', 'upper right',
+                           list(zip(self.acc_columns, axis_labels, axis_colors))))
+        if self.gyro_columns:
+            panels.append(('陀螺仪时间序列', '角速度 (deg/s)', 'upper left',
+                           list(zip(self.gyro_columns, axis_labels, axis_colors))))
+        if self.temp_columns:
+            panels.append(('温度时间序列', '温度 (°C)', 'upper right',
+                           [(self.temp_columns[0], '温度', '#9467bd')]))
+
+        # Keep one (empty) axes when there is nothing to plot so the image
+        # file is still produced.
+        n_rows = max(1, len(panels))
+        fig, axes = plt.subplots(n_rows, 1, figsize=(12, 3 * n_rows), dpi=120, squeeze=False)
+
+        time_values = self.data['time']
+        x_max = time_values.max()
+        for ax, (title, ylabel, legend_loc, series) in zip(axes[:, 0], panels):
+            for col, label, color in series:
+                ax.plot(time_values, self.data[col],
+                        label=label, color=color, linewidth=1.0, alpha=0.8)
+            ax.set_title(title, fontsize=12)
+            ax.set_ylabel(ylabel, fontsize=10)
+            ax.legend(loc=legend_loc, fontsize=8, framealpha=0.5)
+            ax.grid(True, linestyle=':', alpha=0.5)
+            # max() of an empty column is NaN, which set_xlim rejects.
+            if np.isfinite(x_max):
+                ax.set_xlim(0, x_max)
+
+        # Label the time axis on the bottom panel, whichever group that is.
+        if panels:
+            axes[-1, 0].set_xlabel('时间 (s)', fontsize=10)
+
+        fig.tight_layout()
+        output_path = os.path.join(self.output_dir, f'time_series_{self.timestamp}.png')
+        fig.savefig(output_path, bbox_inches='tight', dpi=150)
+        plt.close(fig)
+        self.log_progress(f"时间序列图已保存为 {output_path},耗时 {time.time() - start_time:.2f}秒")
+
+    def plot_rainfall_spectrograms(self):
+        """Compute and plot the "rainfall" spectrograms of all sensor axes.
+
+        Spectrograms of up to three accelerometer and three gyroscope columns
+        are computed in a process pool.  Each sensor group is then drawn into
+        its own figure with one colour scale (5th-95th percentile of the
+        group's finite PSD values) and one DC y-range shared by its axes.
+        """
+        self.log_progress("开始并行绘制频谱雨点图...")
+        start_time = time.time()
+        axis_names = ["X", "Y", "Z"]
+
+        # zip() with the three axis names limits each group to three columns.
+        self.log_progress("准备加速度计数据...")
+        acc_signals = [(self.data[col], f'Acc {axis}')
+                       for col, axis in zip(self.acc_columns, axis_names)]
+
+        gyro_signals = []
+        if self.gyro_columns:
+            self.log_progress("准备陀螺仪数据...")
+            gyro_signals = [(self.data[col], f'Gyro {axis}')
+                            for col, axis in zip(self.gyro_columns, axis_names)]
+
+        all_signals = acc_signals + gyro_signals
+        if not all_signals:
+            # log_progress adds the "警告: " prefix itself for WARNING.
+            self.log_progress("没有有效的数据列可供处理", "WARNING")
+            return
+
+        self.log_progress("使用多进程并行处理...")
+        with Pool(processes=min(len(all_signals), cpu_count())) as pool:
+            results = pool.map(self.process_signal, all_signals)
+
+        self.log_progress("分离结果...")
+        # (axis label prefix, name in the "computing" log, name in the range
+        #  logs, plot title prefix, output file prefix)
+        groups = (
+            ('Acc', '加速度计', '加速度', '加速度', 'acc_rainfall_spectrogram'),
+            ('Gyro', '陀螺仪', '陀螺仪', '角速度', 'gyro_rainfall_spectrogram'),
+        )
+
+        # First pass: shared colour and DC limits per group.
+        plans = []
+        for axis_prefix, sensor_name, log_name, title_prefix, filename_prefix in groups:
+            group_results = [r for r in results if r['axis'].startswith(axis_prefix)]
+            if not group_results:
+                continue
+            self.log_progress(f"计算{sensor_name}全局最小和最大值...")
+            psd_values = np.concatenate([r['Sxx_log'].ravel() for r in group_results])
+            # Drop NaN/inf, as robust_dc_ylim does, so a single bad value
+            # cannot turn the shared colour limits into NaN.
+            psd_values = psd_values[np.isfinite(psd_values)]
+            if psd_values.size:
+                vmin, vmax = np.percentile(psd_values, [5, 95])
+            else:
+                vmin, vmax = 0.0, 1.0
+
+            dc_ymin, dc_ymax = self.robust_dc_ylim(group_results)
+            self.log_progress(f"{log_name} DC (dB) 范围: {dc_ymin:.1f} 到 {dc_ymax:.1f}")
+            plans.append((group_results, vmin, vmax, dc_ymin, dc_ymax,
+                          log_name, title_prefix, filename_prefix))
+
+        # Second pass: one figure per group.
+        for (group_results, vmin, vmax, dc_ymin, dc_ymax,
+             log_name, title_prefix, filename_prefix) in plans:
+            self._plot_single_spectrogram(group_results, vmin, vmax, dc_ymin, dc_ymax,
+                                          title_prefix, filename_prefix)
+            self.log_progress(f"{log_name}功率谱密度范围: {vmin:.1f} dB 到 {vmax:.1f} dB")
+
+        total_time = time.time() - start_time
+        self.log_progress(f"频谱雨点图生成完成,总耗时 {total_time:.2f}秒")
+
+    def _plot_single_spectrogram(self, results, vmin, vmax, dc_ymin, dc_ymax, title_prefix, filename_prefix):
+        """Draw one figure with a scatter spectrogram row per result and save it.
+
+        Every row shows the PSD as coloured points over time and frequency,
+        the DC trace on a secondary y-axis and its own colour bar.
+        ``vmin``/``vmax`` fix the colour scale and ``dc_ymin``/``dc_ymax`` the
+        DC axis range for all rows.  The PNG is written to
+        ``<filename_prefix>_<timestamp>.png`` in ``self.output_dir``.
+        """
+        n_rows = len(results)
+        fig = plt.figure(constrained_layout=True, figsize=(14, 4 * n_rows), dpi=150)
+        grid = fig.add_gridspec(nrows=n_rows, ncols=2, width_ratios=[22, 1], wspace=0.05, hspace=0.12)
+
+        # Column 0 holds the spectrogram, column 1 the matching colour bar.
+        row_axes = [(fig.add_subplot(grid[row, 0]), fig.add_subplot(grid[row, 1]))
+                    for row in range(n_rows)]
+
+        for (main_ax, cbar_ax), entry in zip(row_axes, results):
+            times = entry['t']
+            freqs = entry['f']
+
+            # One (time, frequency) point per PSD value; the transposed
+            # ravel walks the values in the same time-major order.
+            points = main_ax.scatter(
+                np.repeat(times, len(freqs)),
+                np.tile(freqs, len(times)),
+                c=entry['Sxx_log'].T.ravel(),
+                cmap='jet',
+                s=3,
+                alpha=0.7,
+                vmin=vmin,
+                vmax=vmax,
+                rasterized=True
+            )
+
+            main_ax.set_title(f'{title_prefix}频谱雨点图 - {entry["axis"][-1]}(右侧为DC分量 dB)', fontsize=10)
+            main_ax.set_xlabel('时间 (s)', fontsize=9)
+            main_ax.set_ylabel('频率 (Hz)', fontsize=9)
+            main_ax.set_ylim(0, self.sampling_rate / 2)
+            main_ax.grid(True, linestyle=':', alpha=0.4)
+
+            # DC trace on a second y-axis sharing the time axis.
+            dc_ax = main_ax.twinx()
+            dc_ax.plot(times, entry['dc_log'], color='black', linewidth=1.2, alpha=0.85, label='DC (dB)')
+            dc_ax.set_ylabel('直流分量 (dB)', fontsize=9, color='black')
+            dc_ax.set_ylim(dc_ymin, dc_ymax)
+            dc_ax.tick_params(axis='y', labelcolor='black')
+            dc_ax.yaxis.set_major_locator(MaxNLocator(nbins=6))
+            dc_ax.grid(False)
+            dc_ax.legend(loc='upper right', fontsize=8, framealpha=0.5)
+
+            colorbar = fig.colorbar(points, cax=cbar_ax)
+            colorbar.set_label('功率谱密度 (dB)', fontsize=9)
+            cbar_ax.tick_params(labelsize=8)
+
+        output_path = os.path.join(self.output_dir, f'{filename_prefix}_{self.timestamp}.png')
+        plt.savefig(output_path, bbox_inches='tight', dpi=150)
+        plt.close(fig)
+        self.log_progress(f"{title_prefix}频谱雨点图已保存为 {output_path}")
+
+    def run_analysis(self):
+        """Run the whole pipeline: load the data and draw both kinds of plots.
+
+        Returns:
+            bool: True when every stage finished, False when one raised.  A
+            ``ValueError`` is logged as a skipped file; any other exception
+            is logged as an error and its traceback is printed.
+        """
+        try:
+            self.log_progress("开始数据分析流程", "INFO")
+            began = time.time()
+
+            self.load_data()
+            self.plot_time_series()
+            self.plot_rainfall_spectrograms()
+
+            elapsed = time.time() - began
+            self.log_progress(f"分析完成,总耗时 {elapsed:.2f}秒", "SUCCESS")
+            self.log_progress(f"所有输出文件已保存到: {self.output_dir}", "INFO")
+        except ValueError as skip_reason:
+            # Raised for files without IMU data; treated as a skip.
+            self.log_progress(f"跳过文件: {str(skip_reason)}", "WARNING")
+            return False
+        except Exception as failure:
+            self.log_progress(f"分析过程中出现错误: {str(failure)}", "ERROR")
+            import traceback
+            traceback.print_exc()
+            return False
+        return True
+
+
+def process_single_file(file_path):
+    """Analyse one file and report the outcome as a tuple.
+
+    Returns:
+        tuple: ``(file_path, success, message)``.  The message contains
+        "跳过" only when the file header shows no IMU data, so callers that
+        look for that word can tell skipped files from failed ones.
+    """
+    try:
+        print(f"{Fore.BLUE}开始处理文件: {os.path.basename(file_path)}")
+        analyzer = IMUDataAnalyzer(file_path)
+
+        # run_analysis() returns False both for files without IMU data and
+        # for real errors; check the header first so that only the former
+        # is reported as skipped.
+        if not analyzer.check_imu_columns_in_file():
+            return (file_path, False, "文件不包含IMU数据,已跳过")
+
+        if analyzer.run_analysis():
+            return (file_path, True, "处理成功")
+        return (file_path, False, "分析失败,详见日志输出")
+    except Exception as e:
+        return (file_path, False, str(e))
+
+
+def _find_imu_csv_files(input_path):
+    """Return the sorted, de-duplicated IMU CSV candidates for *input_path*.
+
+    A directory is searched recursively for ``*.csv`` files; any other path
+    is used as given.  A path qualifies when it contains "imu"
+    (case-insensitive).
+    """
+    # NOTE(review): the pattern is matched against the whole path, so a parent
+    # directory such as "FFT_IMU" makes every CSV below it qualify - confirm
+    # this is intended.
+    if os.path.isdir(input_path):
+        candidates = glob.glob(os.path.join(input_path, "**", "*.csv"), recursive=True)
+    else:
+        candidates = [input_path]
+    return sorted({path for path in candidates if re.search(r'imu', path, re.IGNORECASE)})
+
+
+def main():
+    """Ask for a path, find the IMU CSV files and process them in a process pool.
+
+    Prints one line per finished file and a final summary of successful,
+    skipped and failed files.
+    """
+    print("=" * 60)
+    print(f"{Fore.CYAN}IMU数据频谱分析工具 - 多文件批量处理")
+    print("=" * 60)
+
+    print(f"{Fore.WHITE}请输入包含CSV文件的目录路径: ")
+    # Pasted paths are often wrapped in quotes; remove them as well.
+    input_path = input("> ").strip().strip('"\'').strip()
+
+    if not os.path.exists(input_path):
+        print(f"{Fore.RED}错误: 路径 '{input_path}' 不存在!")
+        return
+
+    csv_files = _find_imu_csv_files(input_path)
+    if not csv_files:
+        print(f"{Fore.YELLOW}警告: 未找到包含'imu'的CSV文件")
+        return
+
+    print(f"{Fore.GREEN}找到 {len(csv_files)} 个IMU数据文件:")
+    for i, file in enumerate(csv_files, 1):
+        print(f" {i}. {os.path.basename(file)}")
+
+    # Worker processes (not threads) are used, so the message says so.
+    worker_count = min(len(csv_files), cpu_count())
+    print(f"\n{Fore.CYAN}开始多进程处理文件 (使用 {worker_count} 个进程)...")
+
+    success_count = 0
+    skipped_count = 0
+    failed_files = []
+
+    with ProcessPoolExecutor(max_workers=worker_count) as executor:
+        future_to_file = {executor.submit(process_single_file, file): file for file in csv_files}
+
+        for future in as_completed(future_to_file):
+            file_path = future_to_file[future]
+            try:
+                file_path, success, message = future.result()
+            except Exception as e:
+                print(f"{Fore.RED}✗ 异常: {os.path.basename(file_path)} - {str(e)}")
+                failed_files.append((file_path, str(e)))
+                continue
+
+            if success:
+                print(f"{Fore.GREEN}✓ 完成: {os.path.basename(file_path)}")
+                success_count += 1
+            elif "跳过" in message:
+                # process_single_file marks skipped files through its message.
+                print(f"{Fore.YELLOW}↷ 跳过: {os.path.basename(file_path)} - {message}")
+                skipped_count += 1
+            else:
+                print(f"{Fore.RED}✗ 失败: {os.path.basename(file_path)} - {message}")
+                failed_files.append((file_path, message))
+
+    print(f"\n{Fore.CYAN}处理完成统计:")
+    print(f"{Fore.GREEN}成功: {success_count} 个文件")
+    print(f"{Fore.YELLOW}跳过: {skipped_count} 个文件(不包含IMU数据)")
+    print(f"{Fore.RED}失败: {len(failed_files)} 个文件")
+
+    if failed_files:
+        print(f"\n{Fore.YELLOW}失败文件详情:")
+        for file, error in failed_files:
+            print(f" {os.path.basename(file)}: {error}")
+
+
+# Script entry point: run main(), report Ctrl+C as a user interruption and
+# print the traceback of any other exception.
+if __name__ == "__main__":
+ try:
+ main()
+ except KeyboardInterrupt:
+ print(f"\n{Fore.YELLOW}用户中断程序执行")
+ except Exception as e:
+ print(f"{Fore.RED}程序运行出错: {str(e)}")
+ import traceback
+
+ traceback.print_exc()
\ No newline at end of file
diff --git a/FFT_IMU/FFT_IMU_dc_v1.py b/FFT_IMU/FFT_IMU_dc_v1.py
new file mode 100644
index 0000000..26bfa6a
--- /dev/null
+++ b/FFT_IMU/FFT_IMU_dc_v1.py
@@ -0,0 +1,648 @@
+import pandas as pd
+import numpy as np
+import matplotlib
+
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from scipy import signal
+import os
+import glob
+from datetime import datetime
+import time
+from multiprocessing import Pool, cpu_count
+from matplotlib.colors import Normalize
+from matplotlib.ticker import MaxNLocator
+import re
+from colorama import Fore, Style, init
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import warnings
+import threading
+
+# Initialise colorama
+init(autoreset=True)
+
+# Ignore UserWarning / FutureWarning raised from matplotlib modules
+warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
+warnings.filterwarnings("ignore", category=FutureWarning, module="matplotlib")
+
+# Locks for file operations and log output. NOTE(review): these are threading locks; confirm they suffice under ProcessPoolExecutor.
+file_lock = threading.Lock()
+log_lock = threading.Lock()
+
+
+class IMUDataAnalyzer:
+    def __init__(self, file_path):
+        """Store the CSV path, derive defaults and the output directory, and configure matplotlib."""
+        self.file_path = file_path
+        self.data = None
+        self.sampling_rate = None
+        self.fig_size = (15, 10)
+        # Infer the data type and the default sampling rate from the file name
+        file_name = os.path.basename(file_path).lower()
+        if 'calib' in file_name:
+            self.data_type = 'calib'
+            self.default_sampling_rate = 5
+        elif 'raw' in file_name:
+            self.data_type = 'raw'
+            self.default_sampling_rate = 1000
+        else:
+            self.data_type = 'unknown'
+            self.default_sampling_rate = 5
+
+        # Resolve the file's directory and its base name without extension
+        file_dir = os.path.dirname(os.path.abspath(file_path))
+        file_base_name = os.path.splitext(os.path.basename(file_path))[0]
+        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+        # The output directory is named "<file base name>_output_<timestamp>"
+        self.output_dir = os.path.join(file_dir, f"{file_base_name}_output_{self.timestamp}")
+
+        # Create the directory while holding file_lock
+        with file_lock:
+            if not os.path.exists(self.output_dir):
+                os.makedirs(self.output_dir)
+                self.log_progress(f"创建输出目录:{self.output_dir}", "INFO")
+
+        # Font settings
+        plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'Arial']
+        plt.rcParams['axes.unicode_minus'] = False
+
+        # matplotlib compatibility options, intended to avoid layout-engine conflicts
+        plt.rcParams['figure.constrained_layout.use'] = False
+        plt.rcParams['figure.constrained_layout.h_pad'] = 0.02
+        plt.rcParams['figure.constrained_layout.w_pad'] = 0.02
+        plt.rcParams['figure.constrained_layout.hspace'] = 0.02
+        plt.rcParams['figure.constrained_layout.wspace'] = 0.02
+
+        self.log_progress(f"处理文件:{self.file_path}", "INFO")
+        self.log_progress(f"数据类型:{self.data_type}", "INFO")
+        self.log_progress(f"输出路径:{self.output_dir}", "INFO")
+
+    def log_progress(self, message, level="INFO"):
+        """Print a timestamped, colour-coded log line for the given level while holding log_lock."""
+        stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        # Level-specific tail of the line; an unknown level prints the bare message
+        tails = {
+            "INFO": f"{Fore.GREEN}{message}",
+            "WARNING": f"{Fore.YELLOW}警告: {message}",
+            "ERROR": f"{Fore.RED}错误: {message}",
+            "SUCCESS": f"{Fore.GREEN}✓ {message}",
+        }
+        tail = tails.get(level, f"{message}")
+        # log_lock is held only for the print itself
+        with log_lock:
+            print(f"{Fore.CYAN}[{stamp}] {tail}")
+
+    def check_imu_columns_in_file(self):
+        """Return True when the file's first line contains 'imu' (any case); log and return False otherwise."""
+        try:
+            # The header line alone decides, so nothing beyond it is read
+            with open(self.file_path, 'r', encoding='utf-8', errors='ignore') as handle:
+                header = handle.readline().strip()
+
+            # Case-insensitive search for the 'imu' keyword in the header
+            has_imu = re.search(r'imu', header, re.IGNORECASE) is not None
+            if not has_imu:
+                self.log_progress(f"文件头部不包含'imu'关键词,跳过处理,first_line {header}", "WARNING")
+            return has_imu
+
+        except Exception as e:
+            # Any failure while reading or checking is logged and reported as False
+            self.log_progress(f"检查文件头部时出错: {str(e)}", "ERROR")
+            return False
+
+    def detect_imu_columns(self):
+        """Fill acc_columns / gyro_columns / temp_columns from the 'imuN_' columns of the loaded data."""
+        all_columns = self.data.columns.tolist()
+
+        # Collect the IMU prefixes (imu1, imu2, ...) that start a column name
+        imu_prefixes = set()
+        for col in all_columns:
+            match = re.match(r'^(imu\d+)_', col.lower())
+            if match:
+                imu_prefixes.add(match.group(1))
+
+        if not imu_prefixes:
+            self.log_progress("未检测到IMU数据列,尝试使用默认列名", "WARNING")
+            # Fall back to the usual imu1 column names
+            self.acc_columns = ['imu1_acc_x', 'imu1_acc_y', 'imu1_acc_z']
+            self.gyro_columns = ['imu1_gyro_x', 'imu1_gyro_y', 'imu1_gyro_z']
+            self.temp_columns = ['imu1_temp']
+            return
+
+        # Use the lowest-numbered prefix. list(set)[0] depended on set iteration order,
+        # so with several IMUs in one file the analysed sensor could change between runs.
+        imu_prefix = min(imu_prefixes, key=lambda p: (int(p[3:]), p))
+        self.log_progress(f"检测到IMU前缀: {imu_prefix}", "INFO")
+
+        # Accelerometer columns
+        self.acc_columns = [col for col in all_columns
+                            if col.lower().startswith(f"{imu_prefix}_acc") and
+                            any(axis in col.lower() for axis in ['_x', '_y', '_z'])]
+
+        # Gyroscope columns
+        self.gyro_columns = [col for col in all_columns
+                             if col.lower().startswith(f"{imu_prefix}_gyro") and
+                             any(axis in col.lower() for axis in ['_x', '_y', '_z'])]
+
+        # Temperature columns
+        self.temp_columns = [col for col in all_columns
+                             if col.lower().startswith(f"{imu_prefix}_temp")]
+        # No prefixed temperature column: fall back to any temp/temperature column
+        if not self.temp_columns:
+            self.temp_columns = [col for col in all_columns
+                                 if any(name in col.lower() for name in ['temp', 'temperature'])]
+
+        self.log_progress(f"加速度计列: {self.acc_columns}", "INFO")
+        self.log_progress(f"陀螺仪列: {self.gyro_columns}", "INFO")
+        self.log_progress(f"温度列: {self.temp_columns}", "INFO")
+
+    def estimate_sampling_rate(self):
+        """Estimate the sampling rate from the 'time' column, falling back to the file-name default."""
+        has_time = 'time' in self.data.columns and len(self.data) > 10
+        if has_time:
+            deltas = np.diff(self.data['time'].values)
+            usable = deltas[(deltas > 0) & (deltas < 10)]  # drop outliers
+            if len(usable) > 0:
+                rate = 1.0 / np.median(usable)
+                self.log_progress(f"根据时间戳估计的采样率: {rate:.2f} Hz")
+                return rate
+        # No time column, or no usable intervals: use the file-name based default
+        self.log_progress(f"使用基于文件名的默认采样率: {self.default_sampling_rate} Hz")
+        return self.default_sampling_rate
+
+    def load_data(self):
+        """Load the CSV, detect IMU columns, estimate the rate and rebuild the time axis; raises ValueError if the header lacks 'imu'."""
+        self.log_progress("开始加载数据...")
+        start_time = time.time()
+
+        # First make sure the file contains IMU data at all
+        if not self.check_imu_columns_in_file():
+            raise ValueError("文件不包含IMU数据列,跳过处理")
+
+        # Read the CSV while holding file_lock
+        with file_lock:
+            self.data = pd.read_csv(self.file_path)
+
+        self.log_progress(f"数据加载完成,共 {len(self.data)} 行,耗时 {time.time() - start_time:.2f}秒")
+
+        # Detect the IMU data columns
+        self.detect_imu_columns()
+
+        # Estimate the sampling rate
+        self.sampling_rate = self.estimate_sampling_rate()
+
+        # Drop rows with out-of-range time values, then rebuild the time axis from the sampling rate
+        if 'time' in self.data.columns:
+            valid_time_mask = (self.data['time'] > 0) & (self.data['time'] < 1e6)
+            self.data = self.data[valid_time_mask].copy()
+            self.data['time'] = np.arange(len(self.data)) / self.sampling_rate
+        else:
+            # Without a time column, create one from the sampling rate
+            self.data['time'] = np.arange(len(self.data)) / self.sampling_rate
+
+    def remove_dc(self, signal_data):
+        """Return signal_data unchanged: the DC component is deliberately kept so it shows up in the spectrum."""
+        return signal_data
+
+ # def compute_spectrogram(self, signal_data):
+ # """计算频谱图(保留直流分量)"""
+ # # 保留直流分量
+ # signal_data = self.remove_dc(signal_data)
+ #
+ # # 自适应窗口大小 - 根据采样率调整
+ # if self.sampling_rate <= 10: # 低采样率
+ # nperseg = min(64, max(16, len(signal_data) // 4))
+ # else: # 高采样率
+ # nperseg = min(1024, max(64, len(signal_data) // 8))
+ #
+ # noverlap = nperseg // 2
+ #
+ # f, t, Sxx = signal.spectrogram(
+ # signal_data,
+ # fs=self.sampling_rate,
+ # window='hann',
+ # nperseg=nperseg,
+ # noverlap=noverlap,
+ # scaling='density',
+ # detrend=False, # 保留直流
+ # mode='psd' # 更高效的模式
+ # )
+ # return f, t, Sxx
+
+    def compute_spectrogram(self, signal_data):
+        """Return (f, t, Sxx): a DC-preserving PSD spectrogram, lightly Gaussian-smoothed; raises ValueError on empty input."""
+        # remove_dc() returns its input unchanged, so the DC component is kept
+        signal_data = self.remove_dc(signal_data)
+
+        n_samples = len(signal_data)
+        if n_samples == 0:
+            raise ValueError("signal_data is empty")
+
+        # Choose the window length and overlap from the sampling rate and the data length
+        if self.sampling_rate <= 10:  # low sampling rate (5 Hz)
+            # Longer windows give a better frequency resolution at low rates
+            nperseg = min(256, max(64, n_samples // 2))
+            noverlap = int(nperseg * 0.75)  # larger overlap ratio
+
+        else:  # high sampling rate (1000 Hz)
+            # Balance time resolution against frequency resolution
+            if n_samples < 10000:  # shorter recordings
+                nperseg = min(512, max(256, n_samples // 4))
+            else:  # longer recordings
+                nperseg = min(1024, max(512, n_samples // 8))
+
+            noverlap = int(nperseg * 0.66)  # moderate overlap ratio
+
+        # Never let the window exceed the signal. The previous max(16, ...) floor did so for
+        # signals shorter than 16 samples; SciPy then shrank nperseg to the signal length but
+        # kept noverlap and failed with "noverlap must be less than nperseg".
+        nperseg = min(nperseg, n_samples)
+        noverlap = min(noverlap, nperseg - 1)
+
+        f, t, Sxx = signal.spectrogram(
+            signal_data,
+            fs=self.sampling_rate,
+            window='hamming',
+            nperseg=nperseg,
+            noverlap=noverlap,
+            scaling='density',
+            detrend=False,  # keep the DC component
+            mode='psd'
+        )
+
+        # Light Gaussian smoothing to reduce the grainy look of the plot
+        if Sxx.size > 0:
+            from scipy.ndimage import gaussian_filter
+            Sxx_smoothed = gaussian_filter(Sxx, sigma=0.7)
+            return f, t, Sxx_smoothed
+
+        return f, t, Sxx
+
+    def process_signal(self, args):
+        """Compute one signal's dB spectrogram (thinned for plotting) and its 0 Hz trace; args is (signal_data, axis)."""
+        signal_data, axis = args
+        f, t, Sxx = self.compute_spectrogram(signal_data)
+        # Add eps so that log10 never sees 0
+        eps = np.finfo(float).eps
+        Sxx_log = 10 * np.log10(Sxx + eps)
+
+        # Thin very long spectrograms to speed up plotting
+        if len(t) > 1000:
+            time_indices = np.linspace(0, len(t) - 1, 1000, dtype=int)
+            t = t[time_indices]
+            Sxx_log = Sxx_log[:, time_indices]
+            # Thin the frequency axis only when it has more than 500 bins; fewer bins would just repeat rows
+            if len(f) > 500:
+                freq_indices = np.linspace(0, len(f) - 1, 500, dtype=int)
+                f = f[freq_indices]
+                Sxx_log = Sxx_log[freq_indices, :]
+
+        # Locate the 0 Hz bin: an exact match when present, otherwise the closest frequency
+        zero_idx = np.where(np.isclose(f, 0.0))[0]
+        dc_idx = int(zero_idx[0]) if len(zero_idx) > 0 else int(np.argmin(np.abs(f - 0.0)))
+        dc_log = Sxx_log[dc_idx, :]  # 0 Hz PSD (dB) of every time window
+
+        return {
+            'f': f,
+            't': t,
+            'Sxx_log': Sxx_log,
+            'dc_log': dc_log,
+            'axis': axis
+        }
+
+    @staticmethod
+    def robust_dc_ylim(results, p_low=5, p_high=95, pad_ratio=0.05, fallback=(0.0, 1.0)):
+        """
+        Shared DC y-axis limits: percentile range of all finite 'dc_log' values plus a small margin.
+        """
+        if not results:
+            return fallback
+        stacked = np.concatenate([entry['dc_log'].ravel() for entry in results])
+        # inf / NaN values are excluded before the percentiles are taken
+        finite = stacked[np.isfinite(stacked)]
+        if finite.size == 0:
+            return fallback
+        lower, upper = np.percentile(finite, [p_low, p_high])
+        # A zero-width range still gets a tiny span before the relative padding is applied
+        margin = max(1e-9, upper - lower) * pad_ratio
+        return lower - margin, upper + margin
+
+    def plot_time_series(self):
+        """Plot the accelerometer, gyroscope and temperature traces over time and save them as one PNG.
+
+        One panel is drawn per sensor group that has columns (at most three axes per
+        group, first temperature column only). Panels are stacked in the order
+        accelerometer, gyroscope, temperature and share the 0..max(time) x-range.
+        The image is written to ``time_series_<timestamp>.png`` in ``self.output_dir``.
+        """
+        self.log_progress("开始绘制时间序列图...")
+        start_time = time.time()
+
+        # (columns, title, y-label, legend position) of the three-axis sensor groups
+        candidates = [
+            (self.acc_columns, '加速度时间序列', '加速度 (g)', 'upper right'),
+            (self.gyro_columns, '陀螺仪时间序列', '角速度 (deg/s)', 'upper left'),
+        ]
+        triaxial_groups = [group for group in candidates if group[0]]
+
+        # One panel per group that really has data. Previously an accelerometer panel was
+        # always reserved, which left a blank axes when only gyro/temperature columns
+        # existed. At least one panel is kept so that plt.subplots() gets a valid grid.
+        n_plots = max(1, len(triaxial_groups) + (1 if self.temp_columns else 0))
+
+        fig, axes = plt.subplots(n_plots, 1, figsize=(12, 3 * n_plots), dpi=120)
+        if n_plots == 1:
+            axes = [axes]  # make sure axes is a list
+
+        # Shared by all panels
+        t_max = self.data['time'].max()
+        colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
+        labels = ['X', 'Y', 'Z']
+        plot_idx = 0
+
+        # Accelerometer and gyroscope panels
+        for columns, title, ylabel, legend_loc in triaxial_groups:
+            ax = axes[plot_idx]
+            # zip() stops after three columns, so only the first three axes are drawn
+            for col, label, color in zip(columns, labels, colors):
+                ax.plot(self.data['time'], self.data[col],
+                        label=label, color=color, linewidth=1.0, alpha=0.8)
+            ax.set_title(title, fontsize=12)
+            ax.set_ylabel(ylabel, fontsize=10)
+            ax.legend(loc=legend_loc, fontsize=8, framealpha=0.5)
+            ax.grid(True, linestyle=':', alpha=0.5)
+            ax.set_xlim(0, t_max)
+            plot_idx += 1
+
+        # Temperature panel (if any)
+        if self.temp_columns:
+            ax = axes[plot_idx]
+            ax.plot(self.data['time'], self.data[self.temp_columns[0]],
+                    label='温度', color='#9467bd', linewidth=1.0, alpha=0.8)
+            ax.set_title('温度时间序列', fontsize=12)
+            ax.set_xlabel('时间 (s)', fontsize=10)
+            ax.set_ylabel('温度 (°C)', fontsize=10)
+            ax.legend(loc='upper right', fontsize=8, framealpha=0.5)
+            ax.grid(True, linestyle=':', alpha=0.5)
+            ax.set_xlim(0, t_max)
+
+        plt.tight_layout()
+        output_path = os.path.join(self.output_dir, f'time_series_{self.timestamp}.png')
+
+        # Save through the figure object instead of pyplot's implicit current figure
+        fig.savefig(output_path, bbox_inches='tight', dpi=150)
+        plt.close(fig)
+
+        # One "saved" message is enough; two near-identical ones used to be logged
+        self.log_progress(f"时间序列图已保存为 {output_path},耗时 {time.time() - start_time:.2f}秒")
+
+    def plot_rainfall_spectrograms(self):
+        """Compute all spectrograms in a process pool and draw the accelerometer / gyroscope "rainfall" plots."""
+        self.log_progress("开始并行绘制频谱雨点图...")
+        start_time = time.time()
+
+        # Prepare the accelerometer signals
+        self.log_progress("准备加速度计数据...")
+        acc_signals = [(self.data[col], f'Acc {["X", "Y", "Z"][i]}')
+                       for i, col in enumerate(self.acc_columns) if i < 3]  # first three axes only
+
+        # Prepare the gyroscope signals (if any)
+        gyro_signals = []
+        if self.gyro_columns:
+            self.log_progress("准备陀螺仪数据...")
+            gyro_signals = [(self.data[col], f'Gyro {["X", "Y", "Z"][i]}')
+                            for i, col in enumerate(self.gyro_columns) if i < 3]  # first three axes only
+
+        # Nothing to process: return right away
+        if not acc_signals and not gyro_signals:
+            self.log_progress("警告: 没有有效的数据列可供处理", "WARNING")
+            return
+
+        # Process the signals in a multiprocessing pool
+        self.log_progress("使用多进程并行处理...")
+        all_signals = acc_signals + gyro_signals
+        with Pool(processes=min(len(all_signals), cpu_count())) as pool:
+            results = pool.map(self.process_signal, all_signals)
+
+        # Split the results by sensor
+        self.log_progress("分离结果...")
+        acc_results = [r for r in results if r['axis'].startswith('Acc')]
+        gyro_results = [r for r in results if r['axis'].startswith('Gyro')]
+
+        # Shared colour scale (5th to 95th percentile)
+        if acc_results:
+            self.log_progress("计算加速度计全局最小和最大值...")
+            acc_all_Sxx = np.concatenate([r['Sxx_log'].ravel() for r in acc_results])
+            acc_vmin, acc_vmax = np.percentile(acc_all_Sxx, [5, 95])
+
+            # Shared DC y-axis range
+            acc_dc_ymin, acc_dc_ymax = self.robust_dc_ylim(acc_results)
+            self.log_progress(f"加速度 DC (dB) 范围: {acc_dc_ymin:.1f} 到 {acc_dc_ymax:.1f}")
+
+        if gyro_results:
+            self.log_progress("计算陀螺仪全局最小和最大值...")
+            gyro_all_Sxx = np.concatenate([r['Sxx_log'].ravel() for r in gyro_results])
+            gyro_vmin, gyro_vmax = np.percentile(gyro_all_Sxx, [5, 95])
+
+            # Shared DC y-axis range
+            gyro_dc_ymin, gyro_dc_ymax = self.robust_dc_ylim(gyro_results)
+            self.log_progress(f"陀螺仪 DC (dB) 范围: {gyro_dc_ymin:.1f} 到 {gyro_dc_ymax:.1f}")
+
+        # ========= Accelerometer rainfall spectrogram =========
+        if acc_results:
+            self._plot_single_spectrogram(acc_results, acc_vmin, acc_vmax, acc_dc_ymin, acc_dc_ymax,
+                                          '加速度', 'acc_rainfall_spectrogram')
+            self.log_progress(f"加速度功率谱密度范围: {acc_vmin:.1f} dB 到 {acc_vmax:.1f} dB")
+
+        # ========= Gyroscope rainfall spectrogram =========
+        if gyro_results:
+            self._plot_single_spectrogram(gyro_results, gyro_vmin, gyro_vmax, gyro_dc_ymin, gyro_dc_ymax,
+                                          '角速度', 'gyro_rainfall_spectrogram')
+            self.log_progress(f"陀螺仪功率谱密度范围: {gyro_vmin:.1f} dB 到 {gyro_vmax:.1f} dB")
+
+        total_time = time.time() - start_time
+        self.log_progress(f"频谱雨点图生成完成,总耗时 {total_time:.2f}秒")
+
+    def _plot_single_spectrogram(self, results, vmin, vmax, dc_ymin, dc_ymax, title_prefix, filename_prefix):
+        """Draw one figure with a scatter spectrogram row per result and save it as '<filename_prefix>_<timestamp>.png'."""
+        rows = len(results)
+        fig = plt.figure(constrained_layout=True, figsize=(14, 4 * rows), dpi=150)
+        gs = fig.add_gridspec(nrows=rows, ncols=2, width_ratios=[22, 1], wspace=0.05, hspace=0.12)
+        # One main axes plus a narrow colour-bar axes per result row
+        axes_main = []
+        axes_cbar = []
+        for i in range(rows):
+            axes_main.append(fig.add_subplot(gs[i, 0]))
+            axes_cbar.append(fig.add_subplot(gs[i, 1]))
+
+        for i, result in enumerate(results):
+            ax = axes_main[i]
+            cax = axes_cbar[i]
+            # Scatter every (time, frequency) bin, coloured by its value in Sxx_log (dB)
+            sc = ax.scatter(
+                np.repeat(result['t'], len(result['f'])),
+                np.tile(result['f'], len(result['t'])),
+                c=result['Sxx_log'].T.ravel(),
+                cmap='jet',
+                s=3,
+                alpha=0.7,
+                vmin=vmin,
+                vmax=vmax,
+                rasterized=True
+            )
+
+            ax.set_title(f'{title_prefix}频谱雨点图 - {result["axis"][-1]}(右侧为DC分量 dB)', fontsize=10)
+            ax.set_xlabel('时间 (s)', fontsize=9)
+            ax.set_ylabel('频率 (Hz)', fontsize=9)
+            ax.set_ylim(0, self.sampling_rate / 2)
+            ax.grid(True, linestyle=':', alpha=0.4)
+            # Secondary y-axis: the DC trace (dc_log) with the shared dc_ymin..dc_ymax range
+            ax2 = ax.twinx()
+            ax2.plot(result['t'], result['dc_log'], color='black', linewidth=1.2, alpha=0.85, label='DC (dB)')
+            ax2.set_ylabel('直流分量 (dB)', fontsize=9, color='black')
+            ax2.set_ylim(dc_ymin, dc_ymax)
+            ax2.tick_params(axis='y', labelcolor='black')
+            ax2.yaxis.set_major_locator(MaxNLocator(nbins=6))
+            ax2.grid(False)
+            ax2.legend(loc='upper right', fontsize=8, framealpha=0.5)
+            # Colour bar drawn into the dedicated axes of this row
+            cbar = fig.colorbar(sc, cax=cax)
+            cbar.set_label('功率谱密度 (dB)', fontsize=9)
+            cax.tick_params(labelsize=8)
+
+        output_path = os.path.join(self.output_dir, f'{filename_prefix}_{self.timestamp}.png')
+        plt.savefig(output_path, bbox_inches='tight', dpi=150)
+        plt.close(fig)
+        self.log_progress(f"{title_prefix}频谱雨点图已保存为 {output_path}")
+
+    def run_analysis(self):
+        """Run load -> time-series plot -> spectrogram plots; return True on success, False on skip or error."""
+        try:
+            self.log_progress("开始数据分析流程", "INFO")
+            start_time = time.time()
+
+            self.load_data()
+            self.plot_time_series()
+            self.plot_rainfall_spectrograms()
+
+            total_time = time.time() - start_time
+            self.log_progress(f"分析完成,总耗时 {total_time:.2f}秒", "SUCCESS")
+            self.log_progress(f"所有输出文件已保存到: {self.output_dir}", "INFO")
+            return True
+
+        except ValueError as e:
+            # Meant for files without IMU data; NOTE(review): any other ValueError is also logged as a skip
+            self.log_progress(f"跳过文件: {str(e)}", "WARNING")
+            return False
+        except Exception as e:
+            self.log_progress(f"分析过程中出现错误: {str(e)}", "ERROR")
+            import traceback
+            traceback.print_exc()
+            return False
+
+
+def process_single_file(file_path):
+    """Analyse one file and return (file_path, success, message); submitted to the process pool by main()."""
+    try:
+        print(f"{Fore.BLUE}开始处理文件: {os.path.basename(file_path)}")
+        succeeded = IMUDataAnalyzer(file_path).run_analysis()
+    except Exception as e:
+        # Anything raised while constructing or running the analyzer becomes the message
+        return (file_path, False, str(e))
+    # NOTE(review): run_analysis() returns False for skips and for errors alike; both get the "skipped" message
+    if not succeeded:
+        return (file_path, False, "文件不包含IMU数据,已跳过")
+    return (file_path, True, "处理成功")
+
+
+def main():
+    """Ask for a path, collect the CSV files whose path contains 'imu', analyse them in a process pool and print a summary."""
+    print("=" * 60)
+    print(f"{Fore.CYAN}IMU数据频谱分析工具 - 多文件批量处理")
+    print("=" * 60)
+
+    # Read the input path
+    print(f"{Fore.WHITE}请输入包含CSV文件的目录路径: ")
+    input_path = input("> ").strip()
+
+    if not os.path.exists(input_path):
+        print(f"{Fore.RED}错误: 路径 '{input_path}' 不存在!")
+        return
+
+    # Find all CSV files whose path contains 'imu' (case-insensitive)
+    if os.path.isdir(input_path):
+        # Glob every CSV recursively, then keep the paths that contain 'imu'
+        all_csv_files = glob.glob(os.path.join(input_path, "**", "*.csv"), recursive=True)
+        csv_files = [f for f in all_csv_files if re.search(r'imu', f, re.IGNORECASE)]
+        csv_files = list(set(csv_files))  # de-duplicate
+        csv_files.sort()
+    else:
+        # A single file: check whether its path contains 'imu' (case-insensitive)
+        if re.search(r'imu', input_path, re.IGNORECASE):
+            csv_files = [input_path]
+        else:
+            csv_files = []
+
+    if not csv_files:
+        print(f"{Fore.YELLOW}警告: 未找到包含'imu'的CSV文件")
+        return
+
+    print(f"{Fore.GREEN}找到 {len(csv_files)} 个IMU数据文件:")
+    for i, file in enumerate(csv_files, 1):
+        print(f" {i}. {os.path.basename(file)}")
+
+    # Process the files in separate processes (to avoid matplotlib threading conflicts)
+    print(f"\n{Fore.CYAN}开始多线程处理文件 (使用 {min(len(csv_files), cpu_count())} 个线程)...")
+
+    success_count = 0
+    skipped_count = 0
+    failed_files = []
+
+    # Use ProcessPoolExecutor rather than ThreadPoolExecutor
+    with ProcessPoolExecutor(max_workers=min(len(csv_files), cpu_count())) as executor:
+        # Submit all tasks
+        future_to_file = {executor.submit(process_single_file, file): file for file in csv_files}
+
+        # Handle the tasks as they complete
+        for future in as_completed(future_to_file):
+            file_path = future_to_file[future]
+            try:
+                result = future.result()
+                file_path, success, message = result
+                if success:
+                    print(f"{Fore.GREEN}✓ 完成: {os.path.basename(file_path)}")
+                    success_count += 1
+                else:
+                    if "跳过" in message:
+                        print(f"{Fore.YELLOW}↷ 跳过: {os.path.basename(file_path)} - {message}")
+                        skipped_count += 1
+                    else:
+                        print(f"{Fore.RED}✗ 失败: {os.path.basename(file_path)} - {message}")
+                        failed_files.append((file_path, message))
+            except Exception as e:
+                print(f"{Fore.RED}✗ 异常: {os.path.basename(file_path)} - {str(e)}")
+                failed_files.append((file_path, str(e)))
+
+    # Print the summary
+    print(f"\n{Fore.CYAN}处理完成统计:")
+    print(f"{Fore.GREEN}成功: {success_count} 个文件")
+    print(f"{Fore.YELLOW}跳过: {skipped_count} 个文件(不包含IMU数据)")
+    print(f"{Fore.RED}失败: {len(failed_files)} 个文件")
+
+    if failed_files:
+        print(f"\n{Fore.YELLOW}失败文件详情:")
+        for file, error in failed_files:
+            print(f" {os.path.basename(file)}: {error}")
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        print(f"\n{Fore.YELLOW}用户中断程序执行")
+    except Exception as e:
+        print(f"{Fore.RED}程序运行出错: {str(e)}")
+        import traceback
+        # Print the full stack trace of the unexpected error
+        traceback.print_exc()
\ No newline at end of file
diff --git a/ICCIDupdata/.gitignore b/ICCIDupdata/.gitignore
new file mode 100644
index 0000000..0cb4b73
--- /dev/null
+++ b/ICCIDupdata/.gitignore
@@ -0,0 +1,6 @@
+/build/*
+/build
+/dist/*
+/dist
+/source/*
+/source
diff --git a/ICCIDupdata/ICCIDtest_V1.py b/ICCIDupdata/ICCIDtest_V1.py
new file mode 100644
index 0000000..a9a307f
--- /dev/null
+++ b/ICCIDupdata/ICCIDtest_V1.py
@@ -0,0 +1,90 @@
+import requests
+import hashlib
+import time
+
+import json
+
+def generate_sign(system_id, request_info, request_time, secret_key):
+    """Return the MD5 hex digest of the key-sorted 'k=v&...' parameter string followed by the secret key."""
+    fields = {
+        'requestInfo': request_info,
+        'requestTime': request_time,
+        'systemId': system_id
+    }
+    # Join the pairs in lexicographic key order, then append the secret before hashing
+    query = '&'.join(f"{name}={fields[name]}" for name in sorted(fields))
+    digest = hashlib.md5((query + secret_key).encode())
+    return digest.hexdigest()
+
+
+def test_navp_interface():
+    """POST a signed sample record to the pre-release NAV upload endpoint; return True on HTTP 200.
+
+    Prints the request time, the request payload, an OK/NG verdict and the raw
+    response. A failed request is reported and turned into a False result.
+    """
+    import os
+
+    # The system secret has to be obtained from HT. NOTE(review): a secret committed to source
+    # control should be rotated; NAV_SECRET_KEY lets callers supply it without a code change.
+    secret_key = os.environ.get("NAV_SECRET_KEY", "aqwec3be422c22a752c22")
+
+    # Alternative kept for reference: path .../server/oem/navp/infoUpload with systemId "navpFactory"
+    url = "https://flow-gateway.pre.aeroht.com/server/oem/nav/infoUpload"
+
+    # Test data
+    request_info = '{"iccid":"navp345678112300001","partsNo":"F34410001X3K-00-02","hVer":"F34410001X3K-00-02","sVer":"F34410001X3K0P001","network":"AG35CEVFMR12A02T4G&864169079532089","soc":"NA","sn":"F34410001X3K00024013683HJ00170"}'
+    system_id = "diufactory"
+    request_time = time.strftime("%Y-%m-%d %H:%M:%S")
+    print(f"request_time:{request_time}")
+
+    # Sign the request
+    sign = generate_sign(system_id, request_info, request_time, secret_key)
+
+    data = {
+        "systemId": system_id,
+        "requestInfo": request_info,
+        "requestTime": request_time,
+        "sign": sign
+    }
+    headers = {
+        "Content-Type": "application/x-www-form-urlencoded"
+    }
+
+    try:
+        print(f"data|requestInfo:{data['requestInfo']}")
+        response = requests.request("POST", url, data=data, headers=headers, timeout=30)
+    except requests.RequestException as e:
+        print(f"NAVP接口请求失败: {e}")
+        return False
+
+    succeeded = response.status_code == 200
+    print("NAVP 接口测试 OK" if succeeded else "NAVP 接口测试 NG")
+    print(f"NAVP接口响应状态码: {response.status_code}")
+    print(f"NAVP接口响应内容: {response.text}")
+    return succeeded
+
+
+def test_navs_interface():
+    """POST the pre-signed NAVS sample request (same flow as test_navp_interface) and print the response body."""
+    import os
+    url = "https://flow-gateway.pre.aeroht.com/server/oem/nav/infoUpload?requestInfo=%7B%22iccid%22:%22navp345678112300001%22,%22partsNo%22:%22parts111%22,%22hVer%22:%22hVer_7d98d056c96e22222%22,%22sVer%22:%22sVer_b38651e22222%22,%22soc%22:%22111%22,%22network%22:%222222%22%7D&systemId=diufactory&requestTime=2026-01-14%2017:19:58&sign=f480924ff291e0f98a4fb9fdd0167a3e&appSecret=aqwec3be422c22a752c22"
+
+    headers = {
+        'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
+        'Content-Type': 'application/json'
+    }
+    # The captured session cookie used to be hard-coded here; credentials do not belong in
+    # source control, so it is now supplied through the NAVS_COOKIE environment variable.
+    if os.environ.get("NAVS_COOKIE"):
+        headers['Cookie'] = os.environ["NAVS_COOKIE"]
+    # Same 30 s timeout as test_navp_interface, so a stalled server cannot hang the script
+    response = requests.request("POST", url, headers=headers, data={}, timeout=30)
+    print(response.text)
+
+
+# Script entry point: runs the NAVP check only (the NAVS call below is disabled)
+if __name__ == "__main__":
+    print("开始测试预发布环境...")
+    navp_result = test_navp_interface()
+    # navs_result = test_navs_interface()
diff --git a/IMULinkdata/.gitignore b/IMULinkdata/.gitignore
new file mode 100644
index 0000000..fc40eca
--- /dev/null
+++ b/IMULinkdata/.gitignore
@@ -0,0 +1,9 @@
+/build/*
+/build
+/dist/*
+/dist
+/source/*
+/source
+
+
+LINLinkData_V2.py
\ No newline at end of file
diff --git a/IMULinkdata/LINLinkData_V1.py b/IMULinkdata/LINLinkData_V1.py
new file mode 100644
index 0000000..e3066ca
--- /dev/null
+++ b/IMULinkdata/LINLinkData_V1.py
@@ -0,0 +1,252 @@
+import os
+import pandas as pd
+from datetime import datetime
+import argparse
+
+import re
+import time
+import argparse
+from datetime import datetime
+from collections import defaultdict
+
+import numpy as np
+import pandas as pd
+import openpyxl
+
+
+class ExcelProcessor:
+    """Loads a 'link' worksheet from an Excel file and splits it into one sheet per PartNumber."""
+    def __init__(self, file_path):
+        self.file_path = file_path
+        self.df = None
+        self.output_folder = None
+        self.output_file = None
+        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        self.processed_data = {}  # holds the processed data
+
+    def load_data(self):
+        """Load the first worksheet whose name contains 'link' and normalise it for processing.
+
+        Ensures a 'PartNumber' column (copied from 'LinkObject' when missing), checks the
+        required columns, parses 'linkDate' and adds an empty '备注' column if absent.
+
+        Returns:
+            True when the data was loaded; False (after printing the reason) otherwise.
+        """
+        print(f"正在加载文件: {self.file_path}")
+        try:
+            # Open the workbook once and close it reliably; the old code never closed the
+            # ExcelFile handle and then reopened the file for the actual read.
+            with pd.ExcelFile(self.file_path, engine='openpyxl') as workbook:
+                # First sheet whose name contains 'link' (case-insensitive)
+                target_sheet = next(
+                    (sheet for sheet in workbook.sheet_names if 'link' in sheet.lower()), None
+                )
+                if target_sheet is None:
+                    print("提示: 未找到包含'LINK' sheet,请检查文件内容。")
+                    return False
+                self.df = workbook.parse(sheet_name=target_sheet)
+
+            # Make sure a PartNumber column exists (fall back to LinkObject)
+            if 'PartNumber' not in self.df.columns and 'LinkObject' in self.df.columns:
+                self.df['PartNumber'] = self.df['LinkObject']
+
+            # Verify that the required columns are present
+            required_cols = ["PartNumber", "ChildSN", "linkDate"]
+            missing = [c for c in required_cols if c not in self.df.columns]
+            if missing:
+                raise ValueError(f"数据表中缺少必要列: {', '.join(missing)}")
+
+            # Parse linkDate as month/day/year with a 12-hour clock and AM/PM, e.g.
+            # "5/24/2025 6:00:13 PM"; errors='coerce' turns unparseable values into NaT.
+            self.df['linkDate'] = pd.to_datetime(
+                self.df['linkDate'],
+                format='%m/%d/%Y %I:%M:%S %p',
+                errors='coerce'
+            )
+
+            # Report how the parsing went
+            total = len(self.df)
+            invalid = int(self.df['linkDate'].isna().sum())
+            print(f"文件加载成功,总行数: {total},日期解析失败: {invalid} 行")
+
+            # Add the remarks column
+            if '备注' not in self.df.columns:
+                self.df['备注'] = ''
+
+            return True
+        except Exception as e:
+            # Any other failure (unreadable file, missing columns, ...) is reported with its
+            # real cause instead of being mislabelled as a missing 'LINK' sheet.
+            print(f"加载文件失败: {str(e)}")
+            return False
+
+    def create_output_folder(self):
+        """Prepare the output location.
+
+        Sets self.output_folder to the directory of the input file and
+        self.output_file to '<input name>_split_by_PartNumber_<timestamp>.xlsx' inside it.
+        """
+        # Input file name without its extension
+        original_name = os.path.splitext(os.path.basename(self.file_path))[0]
+
+        # Results are written next to the input file. abspath() matters for a bare file
+        # name: dirname() would be '' there, and os.makedirs('') raises FileNotFoundError.
+        self.output_folder = os.path.dirname(os.path.abspath(self.file_path))
+
+        self.output_file = os.path.join(
+            self.output_folder, f"{original_name}_split_by_PartNumber_{self.timestamp}.xlsx")
+
+        # exist_ok tolerates the folder appearing between the check and the creation
+        if not os.path.exists(self.output_folder):
+            os.makedirs(self.output_folder, exist_ok=True)
+            print(f"已创建输出文件夹: {self.output_folder}")
+
+
+    def _safe_sheet_name(self, name):
+        """Return a valid Excel sheet name derived from name.
+
+        Illegal characters are replaced by underscores, surrounding whitespace is
+        removed, the result is cut to 31 characters, and an empty result becomes
+        'Sheet'.
+        """
+        # Stringify, then replace each illegal character with '_'
+        replaced = re.sub(r'[:\\/\?\*\[\]]', '_', str(name))
+
+        # Trim surrounding whitespace first, then cut to the 31-character limit
+        trimmed = replaced.strip()[:31]
+
+        # An empty result falls back to the generic name
+        return trimmed if trimmed else 'Sheet'
+
+    def process_data(self):
+        """Write one worksheet per PartNumber to self.output_file; raises ValueError if data or key columns are missing."""
+        if self.df is None:
+            raise ValueError("数据未加载,请先调用 load_data() 方法")
+
+        # Make sure a PartNumber column exists
+        if 'PartNumber' not in self.df.columns:
+            if 'LinkObject' in self.df.columns:
+                self.df['PartNumber'] = self.df['LinkObject']
+            else:
+                raise ValueError("数据表中既没有PartNumber也没有LinkObject列")
+
+        # (Re)initialise the remarks column
+        self.df['备注'] = ''
+
+        # Group by PartNumber, keeping rows whose PartNumber is missing
+        grouped = self.df.groupby('PartNumber', dropna=False)
+        total_groups = len(grouped)
+        print(f"开始处理数据,共 {total_groups} 个分组...")
+        print(f"输出文件信息,self.output_file:{self.output_file}")
+        output_path = self.output_file
+
+        used_names = set()
+        # The context manager closes the workbook even if a group fails
+        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
+            for i, (name, group) in enumerate(grouped):
+                print(f"正在处理分组 {i + 1}/{total_groups}: {name}")
+                # Collapse duplicated ChildSN rows, keeping the latest linkDate
+                group_out = self.process_duplicates(group).copy()
+
+                # Render linkDate as "yyyy-mm-dd hh:mm:ss" text (empty for missing dates)
+                group_out['linkDate'] = group_out['linkDate'].apply(
+                    lambda x: x.strftime('%Y-%m-%d %H:%M:%S') if pd.notnull(x) else ''
+                )
+
+                # Sanitised names can collide (e.g. after truncation to 31 characters);
+                # number the later ones so that no group overwrites another sheet.
+                safe_name = base_name = self._safe_sheet_name(name)
+                suffix = 1
+                while safe_name.lower() in used_names:
+                    suffix += 1
+                    tag = f"_{suffix}"
+                    safe_name = f"{base_name[:31 - len(tag)]}{tag}"
+                used_names.add(safe_name.lower())
+                group_out.to_excel(writer, sheet_name=safe_name, index=False)
+
+        print(f"处理完成! 结果已保存到: {output_path}")
+
+    def process_duplicates(self, group):
+        """Keep one row per duplicated ChildSN (the one with the latest linkDate) and describe the duplicates in '备注'."""
+        # Find the rows whose ChildSN occurs more than once
+        duplicates = group[group.duplicated('ChildSN', keep=False)]
+
+        if not duplicates.empty:
+            print(f" 发现 {len(duplicates)} 行重复数据,正在处理...")
+
+            # Walk through each duplicated ChildSN
+            for child_sn, dup_group in duplicates.groupby('ChildSN'):
+                # Sort by linkDate in descending order so the newest row comes first;
+                # rows whose linkDate is NaT end up last
+                dup_group = dup_group.sort_values('linkDate', ascending=False)
+
+                # The newest row
+                latest_row = dup_group.iloc[0]
+
+                # Collect the columns that differ (ChildSN and 备注 excluded)
+                diff_info = {}
+                for col in dup_group.columns:
+                    if col in ['ChildSN', '备注']:
+                        continue
+                    unique_values = dup_group[col].unique()
+                    if len(unique_values) > 1:
+                        # linkDate gets dedicated formatting; other columns are simply stringified
+                        if col == 'linkDate':
+                            vals = []
+                            for v in unique_values:
+                                if pd.isna(v):
+                                    vals.append('')
+                                elif isinstance(v, pd.Timestamp):
+                                    vals.append(v.strftime('%Y-%m-%d %H:%M:%S'))
+                                else:
+                                    vals.append(str(v))
+                            diff_info[col] = f"{col}: {', '.join(vals)}"
+                        else:
+                            diff_info[col] = f"{col}: {', '.join(map(str, unique_values))}"
+
+                # Build the remark text
+                note = f"重复行数: {len(dup_group)}"
+                if diff_info:
+                    note += "; 差异内容: " + "; ".join(diff_info.values())
+
+                # Write the remark to every row of this ChildSN first; the older rows are dropped next
+                group.loc[group['ChildSN'] == child_sn, '备注'] = note
+
+                # Drop every row except the newest one
+                drop_indices = dup_group.index[1:]
+                group = group.drop(drop_indices)
+
+        return group
+
+
+def main():
+    """Ask for an Excel file path, run load -> prepare output -> split, and print the elapsed time."""
+    print("=== Excel拆分工具 ===")
+    file_path = input("请输入Excel文件路径: ").strip('"')
+    if not os.path.exists(file_path):
+        print("文件不存在,请检查路径")
+        return
+    start_time = time.time()
+
+    try:
+        # Create the processor instance
+        processor = ExcelProcessor(file_path)
+
+        # Run the processing steps
+        if not processor.load_data():
+            return
+
+        processor.create_output_folder()
+        processor.process_data()
+
+        print("所有处理已完成!")
+    except Exception as e:
+        print(f"处理过程中发生错误: {e}")
+
+    end_time = time.time()
+    print(f"总耗时: {end_time - start_time:.2f}秒")
+
+# Run the interactive tool when executed as a script
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/dataProcess/.gitignore b/dataProcess/.gitignore
new file mode 100644
index 0000000..b9a731e
--- /dev/null
+++ b/dataProcess/.gitignore
@@ -0,0 +1,20 @@
+/build/*
+/build
+/dist/*
+/dist
+/source/*
+/source
+/dataProcess_out*
+*.xls
+*.xlsx
+*.csv
+*.spec
+
+/temp
+
+dataProcess_html_V2.py
+
+dataProcess_sightml_V2.py
+dataProcess_sightml_V3.py
+
+dataProcessMerge_V2.py
\ No newline at end of file
diff --git a/dataProcess/dataProcessMerge_V1.py b/dataProcess/dataProcessMerge_V1.py
new file mode 100644
index 0000000..469a691
--- /dev/null
+++ b/dataProcess/dataProcessMerge_V1.py
@@ -0,0 +1,475 @@
+import os
+import pandas as pd
+from tkinter import filedialog, Tk
+import logging
+import datetime
+# --- 新增导入 ---
+from colorama import init, Fore, Style
+import sys
+
+# 初始化 colorama,autoreset=True 使得每次打印后自动恢复默认颜色
+init(autoreset=True)
+
+# --- 自定义日志格式化器 ---
class ColoredFormatter(logging.Formatter):
    """Formatter that wraps the level name and the message in a per-level colour."""

    # Colour prefix for each standard level name; other names get no colour.
    COLORS = {
        'DEBUG': Fore.CYAN,
        'INFO': Fore.GREEN,
        'WARNING': Fore.YELLOW,
        'ERROR': Fore.RED,
        'CRITICAL': Fore.RED + Style.BRIGHT,
    }

    def format(self, record):
        """Return the formatted text of ``record`` with colour codes applied.

        The colouring is done on a copy of the record. A LogRecord is shared
        by every handler of a logger, so editing it in place would push the
        escape codes into other handlers' output and would make a second
        ``format`` call miss the (already decorated) level name in ``COLORS``.
        """
        log_color = self.COLORS.get(record.levelname, '')
        # makeLogRecord builds a fresh record carrying the same attributes.
        colored = logging.makeLogRecord(record.__dict__)
        colored.levelname = f"{log_color}{record.levelname}{Style.RESET_ALL}"
        colored.msg = f"{log_color}{record.msg}{Style.RESET_ALL}"
        return super().format(colored)
+
# --- Logging setup ---
# Everything is attached to the root logger, so the module-level
# ``logging.info(...)`` calls used in this file end up on this handler.
formatter = ColoredFormatter('%(asctime)s - %(levelname)s - %(message)s')

# Console output goes to stdout (not the default stderr).
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Drop handlers installed earlier so each message is printed only once.
if logger.handlers:
    logger.handlers.clear()

logger.addHandler(console_handler)
# --- End of logging setup ---
+
+
class DataProcessor:
    """Join measurement CSV files with a limit-specification CSV.

    ``run`` drives the workflow: choose the spec file and the data folder,
    load the spec, scan the folder for usable CSV files, inner-join every
    file with the spec on ``PAD ID`` and write the combined rows to Excel.
    """

    # Only files whose name contains this text are picked up by
    # scan_data_files().
    FILE_NAME_KEYWORD = "F27140015X3K"

    # Columns taken from every measurement file.
    DATA_COLUMNS = ["PAD ID", "Component ID", "Height(mil)", "Volume(%)", "Area(%)"]

    # Columns taken from the spec file.
    SPEC_COLUMNS = ["PAD ID", "Component ID", "Vol_Min(%)", "Vol_Max(%)",
                    "Height_Low(mil)", "Height_High(mil)", "Area_Min(%)", "Area_Max(%)"]

    def __init__(self):
        self.spec_file = None             # path of the chosen spec CSV
        self.data_folder = None           # folder holding the measurement CSVs
        self.spec_data = None             # DataFrame read from spec_file
        self.data_files = []              # paths accepted by scan_data_files()
        self.merged_data = pd.DataFrame() # result of process_data()

    def select_spec_file(self):
        """Ask for the spec file in a dialog; return False when cancelled."""
        root = Tk()
        root.withdraw()
        try:
            self.spec_file = filedialog.askopenfilename(
                title="选择上限和下限规格要求文件",
                filetypes=[("CSV files", "*.csv"), ("All files", "*.*")]
            )
        finally:
            # The hidden root window is only needed while the dialog is open.
            root.destroy()
        if not self.spec_file:
            logging.error("未选择规格文件")
            return False
        logging.info(f"已选择规格文件: {self.spec_file}")
        return True

    def select_data_folder(self):
        """Ask for the data folder in a dialog; return False when cancelled."""
        root = Tk()
        root.withdraw()
        try:
            self.data_folder = filedialog.askdirectory(title="选择实际数据文件所在的文件夹")
        finally:
            root.destroy()
        if not self.data_folder:
            logging.error("未选择数据文件夹")
            return False
        logging.info(f"已选择数据文件夹: {self.data_folder}")
        return True

    def clean_column_names(self, df):
        """Strip surrounding whitespace from the column labels of ``df``.

        Labels are converted to ``str`` first so that non-string labels
        (for example integers) do not raise. ``df`` is modified and returned.
        """
        df.columns = [str(col).strip() for col in df.columns]
        return df

    def load_spec_data(self):
        """Read ``self.spec_file`` into ``self.spec_data``.

        Returns:
            True on success, False when reading or preparing the data raised.
        """
        try:
            # header=2: the third line of the file holds the column names.
            self.spec_data = self.clean_column_names(pd.read_csv(self.spec_file, header=2))

            has_pad_id = 'PAD ID' in self.spec_data.columns
            if has_pad_id:
                # PAD ID is the join key; keep it as trimmed text.
                self.spec_data['PAD ID'] = self.spec_data['PAD ID'].astype(str).str.strip()

            missing_columns = [col for col in self.SPEC_COLUMNS if col not in self.spec_data.columns]
            if missing_columns:
                logging.warning(f"规格文件中缺少以下列: {missing_columns}")
                # Hint at columns whose name contains the missing one.
                for missing_col in missing_columns:
                    similar_cols = [col for col in self.spec_data.columns if missing_col.lower() in col.lower()]
                    if similar_cols:
                        logging.info(f"可能匹配的列: {similar_cols}")

            if "Component ID" not in self.spec_data.columns:
                logging.warning("'Component ID' 列在规格文件中缺失,这可能导致输出文件中也缺少该列。")

            logging.info(f"规格数据加载成功,共 {len(self.spec_data)} 行")
            logging.info(f"规格文件列名: {list(self.spec_data.columns)}")
            logging.info(
                f"规格文件PAD ID数据类型: {self.spec_data['PAD ID'].dtype if has_pad_id else 'N/A'}")

        except Exception as e:
            logging.error(f"加载规格数据失败: {e}")
            return False
        return True

    def scan_data_files(self):
        """Collect the usable CSV files of ``self.data_folder``.

        A file is accepted when its name ends in ``.csv`` (any letter case),
        contains ``FILE_NAME_KEYWORD`` and at least 80% of the expected header
        fields are present. Accepted paths are stored in ``self.data_files``.

        Returns:
            True when at least one file was accepted, otherwise False.
        """
        try:
            required_fields = [
                "PAD ID", "Component ID", "Height(mil)", "Volume(%)",
                "Area(%)", "Volume(mil3)", "Area(mil2)"
            ]
            field_match_threshold = 0.8

            valid_files = []
            # Sorted so the row order of the merged output is reproducible.
            for file in sorted(os.listdir(self.data_folder)):
                if not file.lower().endswith(".csv") or self.FILE_NAME_KEYWORD not in file:
                    continue
                file_path = os.path.join(self.data_folder, file)
                if self._is_valid_csv_file(file_path, required_fields, field_match_threshold):
                    valid_files.append(file_path)

            self.data_files = valid_files
            logging.info(
                f"找到 {len(self.data_files)} 个有效数据文件: {[os.path.basename(f) for f in self.data_files]}")

        except Exception as e:
            logging.error(f"扫描数据文件失败: {e}")
            return False

        return bool(self.data_files)

    def _is_valid_csv_file(self, file_path, required_fields, threshold=1.0):
        """Check whether the header line of a CSV file has the required fields.

        Args:
            file_path: CSV file to inspect; only its first line is read.
            required_fields: header names that should be present.
            threshold: minimum fraction of ``required_fields`` that must match.

        Returns:
            True when the matched fraction reaches ``threshold``, else False
            (also when the file cannot be read).
        """
        file_name = os.path.basename(file_path)
        try:
            first_line = None
            # 'utf-8-sig' also reads plain UTF-8 but drops a leading BOM, which
            # would otherwise stick to the first header name.
            for encoding in ('utf-8-sig', 'gbk', 'latin-1'):
                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        first_line = f.readline().strip()
                    break
                except UnicodeDecodeError:
                    continue

            if first_line is None:
                logging.error(f"无法读取文件 {file_name},尝试了所有编码")
                return False

            # Header cells may be quoted; compare the bare names.
            headers = {cell.strip().strip('"').strip() for cell in first_line.split(',')}
            missing_fields = [field for field in required_fields if field not in headers]
            if required_fields:
                match_ratio = (len(required_fields) - len(missing_fields)) / len(required_fields)
            else:
                match_ratio = 1.0  # nothing required, nothing can be missing

            if match_ratio >= threshold:
                if missing_fields:
                    logging.warning(
                        f"文件 {file_name} 部分字段缺失: {missing_fields},但满足阈值要求")
                else:
                    logging.info(f"文件 {file_name} 所有字段完整")
                return True

            logging.warning(
                f"文件 {file_name} 字段匹配率不足: {match_ratio:.1%},缺失字段: {missing_fields}")
            return False

        except Exception as e:
            logging.error(f"检查文件 {file_name} 时发生错误: {e}")
            return False

    def load_and_clean_data_file(self, data_file):
        """Read one measurement CSV and normalise its columns.

        Column labels are trimmed; a required column that is missing is filled
        from the first other column whose name contains it when spaces and
        letter case are ignored. ``PAD ID`` is converted to trimmed text.

        Returns:
            The prepared DataFrame, or None when the file cannot be read or a
            required column is still missing.
        """
        try:
            data_df = None
            for encoding in ('utf-8', 'gbk', 'latin-1'):
                try:
                    data_df = pd.read_csv(data_file, header=0, encoding=encoding)
                    break
                except UnicodeDecodeError:
                    continue

            data_df = self.clean_column_names(data_df)
            logging.info(f"数据文件列名: {list(data_df.columns)}")

            def normalise(name):
                return name.lower().replace(" ", "")

            # DataFrame.rename expects {current name: new name}.
            rename_map = {}
            for required_col in self.DATA_COLUMNS:
                if required_col in data_df.columns:
                    continue
                wanted = normalise(required_col)
                candidates = [col for col in data_df.columns
                              if wanted in normalise(col)
                              and col not in self.DATA_COLUMNS
                              and col not in rename_map]
                if candidates:
                    rename_map[candidates[0]] = required_col
                    logging.info(f"映射列: {required_col} -> {candidates[0]}")

            if rename_map:
                data_df = data_df.rename(columns=rename_map)

            # Done after the rename so a remapped PAD ID column is covered too.
            if 'PAD ID' in data_df.columns:
                data_df['PAD ID'] = data_df['PAD ID'].astype(str).str.strip()
                logging.info(f"数据文件PAD ID数据类型: {data_df['PAD ID'].dtype}")

            missing_columns = [col for col in self.DATA_COLUMNS if col not in data_df.columns]
            if missing_columns:
                logging.error(f"数据文件中缺少以下列: {missing_columns}")
                logging.info(f"数据文件所有列: {list(data_df.columns)}")
                return None

            return data_df

        except Exception as e:
            logging.error(f"加载数据文件失败: {e}")
            return None

    def _prepare_spec_frame(self):
        """Return the spec columns used for merging, or None when unusable."""
        if self.spec_data is None:
            logging.error("规格数据尚未加载")
            return None
        if 'PAD ID' not in self.spec_data.columns:
            logging.error("规格数据中缺少 'PAD ID' 列,无法合并")
            return None
        available_spec_columns = [col for col in self.SPEC_COLUMNS if col in self.spec_data.columns]
        spec_df = self.spec_data[available_spec_columns].copy()
        spec_df['PAD ID'] = spec_df['PAD ID'].astype(str).str.strip()
        return spec_df

    def _convert_spec_heights(self, merged_df):
        """Divide the spec height limits of ``merged_df`` by 25.4, in place."""
        for col in ("Height_Low(mil)", "Height_High(mil)"):
            if col not in merged_df.columns:
                logging.warning(f"规格高度字段缺失,无法进行单位转换: {col}")
                continue
            before_non_null = merged_df[col].notna().sum()
            # Keep only digits, decimal point and minus sign before parsing.
            cleaned = merged_df[col].astype(str).str.replace(r'[^\d\.\-]+', '', regex=True)
            merged_df[col] = pd.to_numeric(cleaned, errors='coerce') / 25.4
            after_non_null = merged_df[col].notna().sum()
            logging.info(
                f"字段 {col} 已除以 25.4 完成单位转换,非空值数: 转换前 {before_non_null} -> 转换后 {after_non_null}"
            )

    def process_data(self):
        """Merge every file of ``self.data_files`` with the spec data.

        Files that cannot be loaded or have no matching ``PAD ID`` are
        skipped. The combined rows are stored in ``self.merged_data``.

        Returns:
            True when at least one file contributed rows, otherwise False.
        """
        total_files = len(self.data_files)
        if total_files == 0:
            logging.error("未找到任何数据文件")
            return False

        # The spec side is the same for every file, so prepare it once.
        spec_df = self._prepare_spec_frame()
        if spec_df is None:
            return False

        output_columns = [
            "PAD ID", "Component ID", "Vol_Min(%)", "Vol_Max(%)", "Height_Low(mil)",
            "Height_High(mil)", "Area_Min(%)", "Area_Max(%)", "Height(mil)", "Volume(%)", "Area(%)",
            "数据来源", "限制来源"
        ]

        all_data = []
        for idx, data_file in enumerate(self.data_files, 1):
            file_name = os.path.basename(data_file)
            logging.info(f"处理数据文件 {idx}/{total_files}: {file_name}")
            data_df = None
            try:
                data_df = self.load_and_clean_data_file(data_file)
                if data_df is None:
                    logging.error(f"无法加载文件: {file_name}")
                    continue

                data_df = data_df[self.DATA_COLUMNS].copy()
                data_df["数据来源"] = file_name
                data_df["限制来源"] = os.path.basename(self.spec_file)

                logging.info(f"合并前 - 数据文件PAD ID示例: {data_df['PAD ID'].head(3).tolist()}")
                logging.info(f"合并前 - 规格文件PAD ID示例: {spec_df['PAD ID'].head(3).tolist()}")

                merged_df = pd.merge(data_df, spec_df, on="PAD ID", how="inner", suffixes=('_data', '_spec'))

                if merged_df.empty:
                    logging.warning(f"文件 {file_name} 与规格数据无匹配项")
                    data_pad_ids = set(data_df['PAD ID'].unique())
                    spec_pad_ids = set(spec_df['PAD ID'].unique())
                    common_ids = data_pad_ids.intersection(spec_pad_ids)
                    logging.info(
                        f"数据文件PAD ID数量: {len(data_pad_ids)}, 规格文件PAD ID数量: {len(spec_pad_ids)}, 共同ID数量: {len(common_ids)}")
                    logging.info(f"数据文件前5个PAD ID: {list(data_pad_ids)[:5] if data_pad_ids else 'N/A'}")
                    logging.info(f"规格文件前5个PAD ID: {list(spec_pad_ids)[:5] if spec_pad_ids else 'N/A'}")
                    continue

                # When both sides carry Component ID the merge suffixes them;
                # the value from the measurement file is the one reported.
                if 'Component ID_data' in merged_df.columns:
                    merged_df['Component ID'] = merged_df['Component ID_data']

                self._convert_spec_heights(merged_df)

                available_output_columns = [col for col in output_columns if col in merged_df.columns]
                missing_output_columns = [col for col in output_columns if col not in merged_df.columns]
                if missing_output_columns:
                    logging.warning(
                        f"文件 {file_name} 的最终输出中缺少以下预期列: {missing_output_columns}")
                if not available_output_columns:
                    logging.error(f"文件 {file_name} 没有任何预期的输出列,将跳过此文件。")
                    continue

                merged_df = merged_df[available_output_columns].copy()
                all_data.append(merged_df)
                logging.info(f"文件 {file_name} 处理成功,匹配 {len(merged_df)} 行")

            except Exception as e:
                logging.error(f"处理文件 {file_name} 时出错: {e}")
                # Extra context for key-type mismatches between the two sides.
                if data_df is not None and 'PAD ID' in data_df.columns:
                    logging.info(f"数据文件PAD ID数据类型: {data_df['PAD ID'].dtype}")
                    logging.info(f"数据文件PAD ID示例: {data_df['PAD ID'].head(3).tolist()}")
                logging.info(f"规格文件PAD ID数据类型: {spec_df['PAD ID'].dtype}")
                logging.info(f"规格文件PAD ID示例: {spec_df['PAD ID'].head(3).tolist()}")
                continue

        if not all_data:
            logging.error("未成功处理任何数据文件")
            return False

        self.merged_data = pd.concat(all_data, ignore_index=True)
        logging.info(f"数据处理完成,共合并 {len(self.merged_data)} 行数据")
        logging.info(f"最终数据列名: {list(self.merged_data.columns)}")
        return True

    def save_to_excel(self):
        """Write ``self.merged_data`` to a timestamped workbook in the data folder.

        Failures are logged, not raised.
        """
        try:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            output_filename = f"dataProcess_out_{timestamp}.xlsx"
            output_file = os.path.join(self.data_folder, output_filename)

            self.merged_data.to_excel(output_file, index=False)
            logging.info(f"数据已保存到: {output_file}")

            stats = "\n".join([
                "处理统计:",
                f"- 规格文件: {os.path.basename(self.spec_file)}",
                f"- 处理的数据文件数: {len(self.data_files)}",
                f"- 合并的总行数: {len(self.merged_data)}",
                f"- 输出文件: {output_file}",
                f"- 包含的列: {list(self.merged_data.columns)}",
            ])
            # Logged once; the summary used to be emitted twice.
            logging.info("处理完成。\n" + stats)

        except Exception as e:
            logging.error(f"保存数据失败: {e}")

    def run(self):
        """Run the workflow end to end; stop at the first step that fails."""
        logging.info("开始数据处理流程")

        try:
            if not self.select_spec_file():
                return
            if not self.select_data_folder():
                return
            if not self.load_spec_data():
                return
            if not self.scan_data_files():
                return
            if not self.process_data():
                logging.error("数据处理失败,请检查日志信息")
                return
            self.save_to_excel()

        except Exception as e:
            logging.error(f"处理过程中出现错误:\n{e}")


if __name__ == "__main__":
    processor = DataProcessor()
    processor.run()
diff --git a/dataProcess/dataProcess_html_V1.py b/dataProcess/dataProcess_html_V1.py
new file mode 100644
index 0000000..290430c
--- /dev/null
+++ b/dataProcess/dataProcess_html_V1.py
@@ -0,0 +1,1060 @@
+import pandas as pd
+import tkinter as tk
+from tkinter import filedialog
+import os
+from datetime import datetime
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from io import BytesIO
+import base64
+import multiprocessing as mp
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import time
+import json
+import traceback
+
+# 设置中文字体
+plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
+plt.rcParams['axes.unicode_minus'] = False
+
+
def _render_feature_chart(group_key, feature_name, feature_data, usl, lsl):
    """Draw the 2x2 chart panel of one feature and return it as base64 PNG text."""
    mean_value = feature_data.mean()
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    try:
        fig.suptitle(f'{group_key} - {feature_name} 统计分析', fontsize=14)

        def mark_limits(ax, vertical, with_mean=True):
            # Upper limit, lower limit and (optionally) the mean as guide lines.
            draw = ax.axvline if vertical else ax.axhline
            draw(usl, color='red', linestyle='--', label=f'上限: {usl:.2f}', linewidth=1)
            draw(lsl, color='green', linestyle='--', label=f'下限: {lsl:.2f}', linewidth=1)
            if with_mean:
                draw(mean_value, color='orange', linestyle='-',
                     label=f'均值: {mean_value:.2f}', linewidth=1.5)

        def finish(ax, title, xlabel=None, ylabel=None):
            ax.set_title(title)
            if xlabel is not None:
                ax.set_xlabel(xlabel)
            if ylabel is not None:
                ax.set_ylabel(ylabel)
            ax.legend(fontsize=8)
            ax.grid(True, alpha=0.3)

        # 1. Histogram
        hist_ax = axes[0, 0]
        hist_ax.hist(feature_data, bins=15, alpha=0.7, color='skyblue', edgecolor='black')
        mark_limits(hist_ax, vertical=True)
        finish(hist_ax, '直方图', xlabel=feature_name, ylabel='频数')

        # 2. Box plot (no mean line)
        box_ax = axes[0, 1]
        sns.boxplot(y=feature_data, ax=box_ax, color='lightblue')
        mark_limits(box_ax, vertical=False, with_mean=False)
        finish(box_ax, '箱线图', ylabel=feature_name)

        # 3. Run chart: values in their stored order
        run_ax = axes[1, 0]
        run_ax.plot(range(len(feature_data)), feature_data, 'o-', color='blue',
                    alpha=0.7, markersize=3, linewidth=1)
        mark_limits(run_ax, vertical=False)
        finish(run_ax, '序列图', xlabel='数据点序号', ylabel=feature_name)

        # 4. Density estimate
        kde_ax = axes[1, 1]
        sns.kdeplot(feature_data, ax=kde_ax, color='blue', fill=True, alpha=0.5)
        mark_limits(kde_ax, vertical=True)
        finish(kde_ax, '概率密度图', xlabel=feature_name, ylabel='密度')

        fig.tight_layout()

        buffer = BytesIO()
        fig.savefig(buffer, format='png', dpi=80, bbox_inches='tight')
        return base64.b64encode(buffer.getvalue()).decode()
    finally:
        # Always release the figure; a worker process handles many groups and
        # figures left open after an error would pile up in memory.
        plt.close(fig)


def plot_worker(args):
    """Worker-process entry point: render the charts of one group.

    Args:
        args: tuple ``(group_key, feature_data_dict, limits_dict)``.
            ``feature_data_dict`` maps a feature name to its values (must
            support ``len()`` and ``.mean()``); ``limits_dict`` maps the same
            names to ``(usl, lsl)``.

    Returns:
        ``(group_key, results)`` where ``results`` maps each feature name to a
        base64-encoded PNG, or to ``""`` when the feature has no values. On
        any error the error is printed and ``(group_key, {})`` is returned;
        ``group_key`` is None if ``args`` itself could not be unpacked.
    """
    group_key = None  # keeps the error path usable if unpacking fails
    try:
        group_key, feature_data_dict, limits_dict = args

        # Font settings are applied again inside the worker process.
        plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
        plt.rcParams['axes.unicode_minus'] = False

        results = {}
        for feature_name, feature_data in feature_data_dict.items():
            if len(feature_data) == 0:
                results[feature_name] = ""
                continue

            usl, lsl = limits_dict[feature_name]
            results[feature_name] = _render_feature_chart(
                group_key, feature_name, feature_data, usl, lsl)

        return group_key, results

    except Exception as e:
        print(f"❌ 图表生成失败 {group_key}: {e}")
        print(f" 错误详情: {traceback.format_exc()}")
        return group_key, {}
+
+
+class DataProcessor:
+ def __init__(self):
+ self.data = None
+ self.filename = None
+ self.file_path = None
+ self.file_dir = None # 新增:存储输入文件所在目录
+ self.stats = None
+ self.output_dir = None
+ self.progress_file = None
+
+ def select_file(self):
+ """手动选择数据文件"""
+ print("打开文件选择对话框...")
+ root = tk.Tk()
+ root.withdraw()
+
+ self.file_path = filedialog.askopenfilename(
+ title="选择数据文件",
+ filetypes=[("Excel files", "*.xlsx"), ("CSV files", "*.csv"), ("All files", "*.*")]
+ )
+
+ if self.file_path:
+ self.filename = os.path.basename(self.file_path)
+ self.file_dir = os.path.dirname(self.file_path) # 获取文件所在目录
+ print(f"✅ 已选择文件: {self.filename}")
+ print(f"📁 文件所在目录: {self.file_dir}")
+ return True
+ else:
+ print("❌ 未选择文件")
+ return False
+
+ def _load_data(self):
+ """加载数据文件"""
+ print("开始加载数据文件...")
+ try:
+ if self.file_path.endswith('.csv'):
+ self.data = pd.read_csv(self.file_path)
+ print("✅ 成功加载CSV文件")
+ elif self.file_path.endswith('.xlsx'):
+ self.data = pd.read_excel(self.file_path)
+ print("✅ 成功加载Excel文件")
+ else:
+ raise ValueError("不支持的文件格式")
+
+ print(f"📊 数据文件形状: {self.data.shape}")
+
+ except Exception as e:
+ print(f"❌ 加载数据文件时出错: {e}")
+ print(f" 错误详情: {traceback.format_exc()}")
+ raise
+
+ def _validate_data(self):
+ """验证数据完整性 - 增强验证:检查上下限列"""
+ print("验证数据完整性...")
+
+ # 检查必要的测量列
+ required_measure_columns = ['PAD ID', 'Component ID', 'Height(mil)', 'Volume(%)', 'Area(%)']
+ missing_measure_columns = [col for col in required_measure_columns if col not in self.data.columns]
+
+ if missing_measure_columns:
+ error_msg = f"数据文件中缺少必要的测量列: {missing_measure_columns}"
+ print(f"❌ {error_msg}")
+ raise ValueError(error_msg)
+
+ # 检查必要的上下限列
+ required_limit_columns = ['Height_Low(mil)', 'Height_High(mil)',
+ 'Vol_Min(%)', 'Vol_Max(%)',
+ 'Area_Min(%)', 'Area_Max(%)']
+ missing_limit_columns = [col for col in required_limit_columns if col not in self.data.columns]
+
+ if missing_limit_columns:
+ error_msg = f"数据文件中缺少必要的上下限列: {missing_limit_columns}"
+ print(f"❌ {error_msg}")
+ raise ValueError(error_msg)
+
+ print("✅ 数据验证通过")
+
+ # 检查数据是否存在空值
+ all_required_columns = required_measure_columns + required_limit_columns
+ null_counts = self.data[all_required_columns].isnull().sum()
+ if null_counts.any():
+ print(f"⚠️ 数据中存在空值 - {null_counts[null_counts > 0].to_dict()}")
+
+ def _setup_output_directory(self):
+ """设置输出目录"""
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+ base_name = os.path.splitext(self.filename)[0]
+
+ # 优化:输出目录放置在输入文件所在文件夹下
+ self.output_dir = os.path.join(self.file_dir, f"{base_name}_report_{timestamp}")
+
+ # 创建主目录
+ os.makedirs(self.output_dir, exist_ok=True)
+
+ # 创建分组报告子目录
+ os.makedirs(os.path.join(self.output_dir, 'group_reports'), exist_ok=True)
+
+ # 创建进度文件
+ self.progress_file = os.path.join(self.output_dir, 'progress.json')
+
+ print(f"📁 输出目录: {self.output_dir}")
+
+ def _save_progress(self, completed_groups=None, current_stage=None):
+ """保存处理进度"""
+ try:
+ progress = {
+ 'filename': self.filename,
+ 'total_groups': len(self.stats.index) if self.stats is not None else 0,
+ 'completed_groups': completed_groups or [],
+ 'current_stage': current_stage,
+ 'last_update': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+ 'input_file_directory': self.file_dir, # 记录输入文件目录
+ 'output_directory': self.output_dir # 记录输出目录
+ }
+
+ with open(self.progress_file, 'w', encoding='utf-8') as f:
+ json.dump(progress, f, indent=2, ensure_ascii=False)
+ except Exception as e:
+ print(f"⚠️ 保存进度失败: {e}")
+
+ def generate_report(self):
+ """生成统计报告 - 分阶段输出"""
+ if self.data is None:
+ raise ValueError("请先选择数据文件")
+
+ try:
+ # 验证数据
+ self._validate_data()
+
+ # 设置输出目录
+ self._setup_output_directory()
+
+ print("开始数据处理...")
+
+ # 创建分组键
+ self.data['Group_Key'] = self.data['PAD ID'].astype(str) + '_' + self.data['Component ID'].astype(str)
+ group_count = self.data['Group_Key'].nunique()
+ print(f"📊 共发现 {group_count} 个分组")
+
+ # 阶段1:快速生成基本统计信息和汇总报告
+ print("\n=== 阶段1: 生成基本统计信息 ===")
+
+ # 计算测量数据的统计信息
+ self.stats = self.data.groupby('Group_Key').agg({
+ 'Height(mil)': ['min', 'max', 'mean', 'std'],
+ 'Volume(%)': ['min', 'max', 'mean', 'std'],
+ 'Area(%)': ['min', 'max', 'mean', 'std']
+ }).round(4)
+
+ # 重命名测量统计列
+ self.stats.columns = [
+ 'Height_Measured_Min(mil)', 'Height_Measured_Max(mil)', 'Height_Mean(mil)', 'Height_Std(mil)',
+ 'Vol_Measured_Min(%)', 'Vol_Measured_Max(%)', 'Vol_Mean(%)', 'Vol_Std(%)',
+ 'Area_Measured_Min(%)', 'Area_Measured_Max(%)', 'Area_Mean(%)', 'Area_Std(%)'
+ ]
+
+ print("基本统计信息计算完成")
+
+ # 获取预设的上下限信息
+ print("获取预设上下限信息...")
+ limits = self.data.groupby('Group_Key').agg({
+ 'Height_Low(mil)': 'first', # 取第一个值作为该分组的预设下限
+ 'Height_High(mil)': 'first', # 取第一个值作为该分组的预设上限
+ 'Vol_Min(%)': 'first',
+ 'Vol_Max(%)': 'first',
+ 'Area_Min(%)': 'first',
+ 'Area_Max(%)': 'first'
+ }).round(4)
+
+ # 合并统计信息和预设上下限信息
+ self.stats = pd.concat([self.stats, limits], axis=1)
+ print("预设上下限信息获取完成")
+
+ # 计算CPK - 使用预设的上下限值
+ print("计算CPK值...")
+ self.stats = self._calculate_cpk(self.stats)
+
+ # 立即生成汇总报告
+ summary_report_path = self._create_summary_report()
+ print(f"✅ 汇总报告生成完成: {summary_report_path}")
+
+ # 保存Excel
+ excel_path = self._save_to_excel_advanced()
+ print(f"✅ Excel文件保存完成: {excel_path}")
+
+ # 阶段2:分批生成详细分组报告
+ print("\n=== 阶段2: 分批生成详细分组报告 ===")
+ self._generate_group_reports_incremental()
+
+ # 阶段3:生成索引文件(可选)
+ print("\n=== 阶段3: 生成报告索引 ===")
+ index_path = self._create_report_index()
+ print(f"✅ 报告索引生成完成: {index_path}")
+
+ return summary_report_path
+
+ except Exception as e:
+ print(f"❌ 程序执行失败: {e}")
+ print(f" 错误详情: {traceback.format_exc()}")
+ # 即使失败,也尝试保存当前进度
+ if hasattr(self, 'output_dir'):
+ print(f"📁 当前结果已保存到: {self.output_dir}")
+ raise
+
+ def _create_summary_report(self):
+ """创建快速汇总报告(区分预设上下限和实测值)"""
+ print("生成快速汇总报告...")
+
+ # 使用明确的空值检查
+ if self.stats is None or len(self.stats.index) == 0:
+ print("⚠️ 统计数据为空,生成空报告")
+ return self._create_empty_report()
+
+ # 将索引转换为列表,避免DataFrame布尔判断问题
+ stats_index = list(self.stats.index)
+ total_groups = len(stats_index)
+
+ # 安全地检查CPK列是否存在
+ valid_height_cpk = 0
+ valid_volume_cpk = 0
+ valid_area_cpk = 0
+
+ if 'Height_Cpk' in self.stats.columns:
+ valid_height_cpk = self.stats['Height_Cpk'].notna().sum()
+ if 'Volume_Cpk' in self.stats.columns:
+ valid_volume_cpk = self.stats['Volume_Cpk'].notna().sum()
+ if 'Area_Cpk' in self.stats.columns:
+ valid_area_cpk = self.stats['Area_Cpk'].notna().sum()
+
+ html_content = f"""
+
+
+
+ 数据统计汇总报告 - {self.filename}
+
+
+
+ 数据统计汇总报告 - {self.filename}
+ 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+ 输入文件位置: {self.file_dir}
+
+
+
报告说明
+
此报告为快速生成的汇总报告,包含所有分组的基本统计信息。
+
CPK计算使用预设的上下限值,而不是实测的最小最大值。
+
注意:分组详细报告可能需要较长时间生成,请勿关闭程序。
+
+
+
+
处理进度
+
总分组数量: {total_groups}
+
有效Height CPK数量: {valid_height_cpk}
+
有效Volume CPK数量: {valid_volume_cpk}
+
有效Area CPK数量: {valid_area_cpk}
+
输出目录: {self.output_dir}
+
+
+
+
+ 详细统计数据
+
+
+
+ 分组标识 (PAD ID + Component ID) |
+ Height(mil) |
+ Volume(%) |
+ Area(%) |
+ {'CPK值 | ' if 'Height_Cpk' in self.stats.columns else ''}
+
+
+
+ 预设下限 (LSL) |
+ 预设上限 (USL) |
+ 实测最小值 |
+ 实测最大值 |
+ 平均值 |
+ 标准差 |
+ 数据点数 |
+ CPK |
+
+ 预设下限 (LSL) |
+ 预设上限 (USL) |
+ 实测最小值 |
+ 实测最大值 |
+ 平均值 |
+ 标准差 |
+ 数据点数 |
+ CPK |
+
+ 预设下限 (LSL) |
+ 预设上限 (USL) |
+ 实测最小值 |
+ 实测最大值 |
+ 平均值 |
+ 标准差 |
+ 数据点数 |
+ CPK |
+
+ 分组 |
+
+
+
+ """
+
+ # 生成表格行数据
+ for group_key in stats_index:
+ row = self.stats.loc[group_key]
+
+ def format_value(value):
+ """格式化数值显示"""
+ if pd.isna(value):
+ return 'N/A'
+ elif isinstance(value, (int, float)):
+ return f"{value:.4f}"
+ else:
+ return str(value)
+
+ # 获取数据点数
+ group_data = self.data[self.data['Group_Key'] == group_key]
+ data_count = len(group_data)
+
+ # 安全处理CPK列
+ cpk_columns = {"height": "", "volume": "", "area": ""}
+ if 'Height_Cpk' in self.stats.columns:
+ cpk_columns = {
+ "height": f"""{format_value(row['Height_Cpk'])} | """,
+ "volume": f"""{format_value(row['Volume_Cpk'])} | """,
+ "area": f"""{format_value(row['Area_Cpk'])} | """
+ }
+
+ # 为CPK值添加颜色标识
+ def get_cpk_color(cpk_value):
+ """根据CPK值返回颜色标识"""
+ if pd.isna(cpk_value):
+ return ''
+ try:
+ cpk_val = float(cpk_value)
+ if cpk_val >= 1.33:
+ return 'style="background-color: #90EE90;"' # 绿色 - 优秀
+ elif cpk_val >= 1.0:
+ return 'style="background-color: #FFFFE0;"' # 黄色 - 合格
+ else:
+ return 'style="background-color: #FFB6C1;"' # 红色 - 不合格
+ except:
+ return ''
+
+ # 如果存在CPK列,添加颜色
+ if 'Height_Cpk' in self.stats.columns:
+ # 这里需要为每个CPK单元格单独设置颜色
+ height_color = get_cpk_color(row['Height_Cpk'])
+ volume_color = get_cpk_color(row['Volume_Cpk'])
+ area_color = get_cpk_color(row['Area_Cpk'])
+
+ cpk_columns = {
+ "height": f"""{format_value(row['Height_Cpk'])} | """,
+ "volume": f"""{format_value(row['Volume_Cpk'])} | """,
+ "area": f"""{format_value(row['Area_Cpk'])} | """
+ }
+
+ html_content += f"""
+
+ | {group_key} |
+
+ {format_value(row['Height_Low(mil)'])} |
+ {format_value(row['Height_High(mil)'])} |
+ {format_value(row['Height_Measured_Min(mil)'])} |
+ {format_value(row['Height_Measured_Max(mil)'])} |
+ {format_value(row['Height_Mean(mil)'])} |
+ {format_value(row['Height_Std(mil)'])} |
+ {data_count} |
+ {cpk_columns["height"]}
+
+ {format_value(row['Vol_Min(%)'])} |
+ {format_value(row['Vol_Max(%)'])} |
+ {format_value(row['Vol_Measured_Min(%)'])} |
+ {format_value(row['Vol_Measured_Max(%)'])} |
+ {format_value(row['Vol_Mean(%)'])} |
+ {format_value(row['Vol_Std(%)'])} |
+ {data_count} |
+ {cpk_columns["volume"]}
+
+ {format_value(row['Area_Min(%)'])} |
+ {format_value(row['Area_Max(%)'])} |
+ {format_value(row['Area_Measured_Min(%)'])} |
+ {format_value(row['Area_Measured_Max(%)'])} |
+ {format_value(row['Area_Mean(%)'])} |
+ {format_value(row['Area_Std(%)'])} |
+ {data_count} |
+ {cpk_columns["area"]}
+
+ {group_key} |
+
+ """
+
+ html_content += """
+
+
+
+
+
表格说明
+
绿色背景: 预设的上下限值(用于CPK计算)
+
黄色背景: 实测数据的最小最大值
+
白色背景: 统计计算值
+
+
+
+
CPK计算说明
+
CPK计算公式: CPK = min[(USL - mean) / (3×std), (mean - LSL) / (3×std)]
+
上下限取值: 使用数据文件中的预设上下限值,而不是实测的最小最大值
+
绿色 CPK ≥ 1.33 (过程能力优秀)
+
黄色 1.0 ≤ CPK < 1.33 (过程能力合格)
+
红色 CPK < 1.0 (过程能力不足)
+
+
+
+ """
+
+ report_path = os.path.join(self.output_dir, 'summary_report.html')
+ with open(report_path, 'w', encoding='utf-8') as f:
+ f.write(html_content)
+
+ print(f"✅ 汇总报告已生成: {report_path}")
+ return report_path
+
+ def _create_empty_report(self):
+ """创建空数据报告"""
+ html_content = f"""
+
+
+
+ 数据统计报告 - {self.filename}
+
+
+
+ 数据统计报告 - {self.filename}
+
+
⚠️ 数据为空
+
未找到有效数据或统计数据为空。
+
生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
输入文件位置: {self.file_dir}
+
+
+
+ """
+
+ report_path = os.path.join(self.output_dir, 'summary_report.html')
+ with open(report_path, 'w', encoding='utf-8') as f:
+ f.write(html_content)
+
+ return report_path
+
+ def _sanitize_filename(self, filename):
+ """清理文件名,移除非法字符"""
+ import re
+ return re.sub(r'[<>:"/\\|?*]', '_', filename)
+
+ def _generate_group_reports_incremental(self):
+ """分批生成分组报告,避免长时间等待"""
+ # 使用明确的空值检查方法
+ if self.stats is None or len(self.stats.index) == 0:
+ print("⚠️ 统计数据为空,跳过分组报告生成")
+ return
+
+ stats_index = list(self.stats.index)
+ total_groups = len(stats_index)
+
+ if total_groups == 0:
+ print("⚠️ 没有有效的分组数据")
+ return
+
+ print(f"📊 开始分批生成 {total_groups} 个分组报告...")
+ print(f"📁 分组报告将保存到: {os.path.join(self.output_dir, 'group_reports')}")
+
+ # 分批处理
+ BATCH_SIZE = min(20, total_groups)
+ completed_groups = []
+ total_batches = (total_groups + BATCH_SIZE - 1) // BATCH_SIZE
+
+ for batch_idx in range(total_batches):
+ batch_start = batch_idx * BATCH_SIZE
+ batch_end = min((batch_idx + 1) * BATCH_SIZE, total_groups)
+ batch_groups = stats_index[batch_start:batch_end]
+
+ print(f"\n🔄 处理批次 {batch_idx + 1}/{total_batches}: 分组 {batch_start + 1}-{batch_end}")
+
+ try:
+ batch_results = self._process_batch(batch_groups)
+
+ # 生成当前批次的分组报告
+ successful_reports = 0
+ for group_key in batch_groups:
+ try:
+ self._create_single_group_report(group_key, batch_results.get(group_key, {}))
+ completed_groups.append(group_key)
+ successful_reports += 1
+ print(f" ✅ 分组报告生成: {self._sanitize_filename(group_key)}.html")
+ except Exception as e:
+ print(f" ❌ 生成分组 {group_key} 报告失败: {e}")
+ print(f" 错误详情: {traceback.format_exc()}")
+
+ # 保存进度
+ self._save_progress(completed_groups, f"batch_{batch_idx + 1}")
+
+ print(f"✅ 批次 {batch_idx + 1} 完成 (成功生成 {successful_reports}/{len(batch_groups)} 个报告)")
+
+ except Exception as batch_error:
+ print(f"❌ 批次 {batch_idx + 1} 处理失败: {batch_error}")
+ print(f" 错误详情: {traceback.format_exc()}")
+ # 继续处理下一批次
+ continue
+
+ # 添加批次间隔,避免资源竞争
+ if batch_idx < total_batches - 1:
+ print("⏳ 等待2秒后处理下一批次...")
+ time.sleep(2)
+
+ print(f"✅ 所有分组报告生成完成 (总计: {len(completed_groups)}/{total_groups})")
+ print(f"📁 分组报告保存位置: {os.path.join(self.output_dir, 'group_reports')}")
+
+    def _process_batch(self, group_keys):
+        """Render the charts for one batch of groups in worker processes.
+
+        Each key that is present in ``self.stats`` and has rows in
+        ``self.data`` becomes a ``(group_key, feature_data, limits)`` task
+        that is handed to ``plot_worker`` through a ``ProcessPoolExecutor``.
+
+        Args:
+            group_keys: Group keys (``Group_Key`` values) in this batch.
+
+        Returns:
+            dict: ``{result_key: result_data}`` as reported by the workers.
+            Groups that were skipped, that returned a falsy key, or whose
+            worker raised are absent. Empty when nothing could be processed.
+        """
+        # len() instead of truthiness so array-like batches are accepted too.
+        if group_keys is None or len(group_keys) == 0:
+            print("⚠️ 当前批次没有分组数据")
+            return {}
+
+        # Build the lookup once per batch; previously a list of the whole
+        # index was rebuilt and scanned linearly for every single group.
+        known_groups = set(self.stats.index)
+        feature_columns = ('Height(mil)', 'Volume(%)', 'Area(%)')
+
+        tasks = []
+        for group_key in group_keys:
+            if group_key not in known_groups:
+                print(f"⚠️ 警告: 分组 {group_key} 不在统计数据中,跳过")
+                continue
+
+            # Use .empty explicitly: a DataFrame has no unambiguous truth value.
+            group_data = self.data[self.data['Group_Key'] == group_key]
+            if group_data.empty:
+                print(f"⚠️ 警告: 分组 {group_key} 的数据为空,跳过")
+                continue
+
+            row = self.stats.loc[group_key]
+
+            # Per-feature measurements with missing values removed.
+            feature_data_dict = {}
+            for col in feature_columns:
+                col_data = group_data[col].dropna()
+                if len(col_data) == 0:
+                    print(f"⚠️ 警告: 分组 {group_key} 的 {col} 数据为空")
+                    col_data = pd.Series([], dtype=float)
+                feature_data_dict[col] = col_data
+
+            # Preset limits as (USL, LSL) pairs; a group lacking any of the
+            # limit columns is skipped.
+            try:
+                limits_dict = {
+                    'Height(mil)': (row['Height_High(mil)'], row['Height_Low(mil)']),
+                    'Volume(%)': (row['Vol_Max(%)'], row['Vol_Min(%)']),
+                    'Area(%)': (row['Area_Max(%)'], row['Area_Min(%)']),
+                }
+            except KeyError as e:
+                print(f"❌ 错误: 分组 {group_key} 缺少预设上下限列 {e}")
+                continue
+
+            tasks.append((group_key, feature_data_dict, limits_dict))
+
+        if not tasks:
+            print("⚠️ 当前批次没有有效任务")
+            return {}
+
+        # Never start more than 4 workers, more than there are tasks, or more
+        # than the CPU count.
+        max_workers = min(mp.cpu_count(), len(tasks), 4)
+        results = {}
+
+        print(f"🔧 开始处理批次中的 {len(tasks)} 个任务,使用 {max_workers} 个进程...")
+
+        with ProcessPoolExecutor(max_workers=max_workers) as executor:
+            future_to_key = {executor.submit(plot_worker, task): task[0] for task in tasks}
+
+            completed_count = 0
+            for future in as_completed(future_to_key):
+                group_key = future_to_key[future]
+                try:
+                    result_key, result_data = future.result()
+                    if result_key:
+                        results[result_key] = result_data
+                        completed_count += 1
+                        print(f" 📈 图表生成完成: {result_key} ({completed_count}/{len(tasks)})")
+                except Exception as e:
+                    # One failing group must not abort the rest of the batch.
+                    print(f" ❌ 处理分组 {group_key} 时出错: {e}")
+
+        print(f"✅ 批次处理完成,成功生成 {len(results)}/{len(tasks)} 个图表")
+        return results
+
+ def _create_single_group_report(self, group_key, feature_charts):
+ """创建单个分组的独立报告"""
+ # 添加明确的分组存在性检查
+ stats_index_list = list(self.stats.index) # 转换为列表
+
+ if group_key not in stats_index_list:
+ print(f"⚠️ 警告: 分组 {group_key} 不在统计数据中,跳过报告生成")
+ return
+
+ try:
+ row = self.stats.loc[group_key]
+ except KeyError:
+ print(f"❌ 错误: 无法获取分组 {group_key} 的统计数据")
+ return
+
+ # 明确的空值检查
+ group_data = self.data[self.data['Group_Key'] == group_key]
+
+ # 确保group_data不为空
+ if group_data.empty:
+ print(f"⚠️ 警告: 分组 {group_key} 的数据为空,跳过报告生成")
+ return
+
+ # 安全格式化数值
+ def safe_format(value, default="N/A"):
+ try:
+ if pd.isna(value):
+ return default
+ return f"{float(value):.4f}"
+ except (ValueError, TypeError):
+ return default
+
+ html_content = f"""
+
+
+
+ {group_key} - 详细分析报告
+
+
+
+
+
+ {group_key} - 详细分析报告
+ 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+ 输入文件位置: {self.file_dir}
+
+
+
基本统计信息
+
+
+ | 特征 | 预设下限(LSL) | 预设上限(USL) |
+ 实测最小值 | 实测最大值 |
+ 平均值 | 标准差 | CPK |
+
+
+ | Height(mil) | {safe_format(row.get('Height_Low(mil)'))} | {safe_format(row.get('Height_High(mil)'))} |
+ {safe_format(row.get('Height_Measured_Min(mil)'))} | {safe_format(row.get('Height_Measured_Max(mil)'))} |
+ {safe_format(row.get('Height_Mean(mil)'))} | {safe_format(row.get('Height_Std(mil)'))} | {safe_format(row.get('Height_Cpk'))} |
+
+
+ | Volume(%) | {safe_format(row.get('Vol_Min(%)'))} | {safe_format(row.get('Vol_Max(%)'))} |
+ {safe_format(row.get('Vol_Measured_Min(%)'))} | {safe_format(row.get('Vol_Measured_Max(%)'))} |
+ {safe_format(row.get('Vol_Mean(%)'))} | {safe_format(row.get('Vol_Std(%)'))} | {safe_format(row.get('Volume_Cpk'))} |
+
+
+ | Area(%) | {safe_format(row.get('Area_Min(%)'))} | {safe_format(row.get('Area_Max(%)'))} |
+ {safe_format(row.get('Area_Measured_Min(%)'))} | {safe_format(row.get('Area_Measured_Max(%)'))} |
+ {safe_format(row.get('Area_Mean(%)'))} | {safe_format(row.get('Area_Std(%)'))} | {safe_format(row.get('Area_Cpk'))} |
+
+
+
+ """
+
+ # 添加图表
+ for feature_name in ['Height(mil)', 'Volume(%)', 'Area(%)']:
+ chart_base64 = feature_charts.get(feature_name, "")
+ if chart_base64 and len(chart_base64) > 0: # 明确的字符串检查
+ html_content += f"""
+ {feature_name} 分析图表
+
+
+

+
+
+ """
+ else:
+ html_content += f"""
+ {feature_name} 分析图表
+ 该特征的图表生成失败或数据不足。
+ """
+
+ html_content += """
+
+
+ """
+
+ filename = self._sanitize_filename(group_key) + '.html'
+ group_reports_dir = os.path.join(self.output_dir, 'group_reports')
+ report_path = os.path.join(group_reports_dir, filename)
+
+ try:
+ with open(report_path, 'w', encoding='utf-8') as f:
+ f.write(html_content)
+ except Exception as e:
+ print(f"❌ 保存分组报告失败 {filename}: {e}")
+
+ def _create_report_index(self):
+ """创建分组报告索引"""
+ # 确保使用正确的索引获取方式
+ if self.stats is None or len(self.stats.index) == 0:
+ print("⚠️ 统计数据为空,创建空索引")
+ return self._create_empty_index()
+
+ stats_index = list(self.stats.index) # 转换为列表
+
+ html_content = """
+
+
+
+ 分组报告索引
+
+
+
+
+
+ 分组报告索引
+ 共生成 """ + str(len(stats_index)) + """ 个分组报告
+ 输入文件位置: """ + self.file_dir + """
+
+
+ """
+
+ for group_key in stats_index: # 使用列表而不是DataFrame索引
+ filename = self._sanitize_filename(group_key) + '.html'
+ html_content += f'
\n'
+
+ html_content += """
+
+
+
+ """
+
+ index_path = os.path.join(self.output_dir, 'group_reports', 'index.html')
+ try:
+ with open(index_path, 'w', encoding='utf-8') as f:
+ f.write(html_content)
+ except Exception as e:
+ print(f"❌ 创建索引文件失败: {e}")
+ return None
+
+ return index_path
+
+ def _create_empty_index(self):
+ """创建空索引文件"""
+ html_content = """
+
+
+
+ 分组报告索引
+
+
+
+ 分组报告索引
+
+
⚠️ 没有分组报告
+
当前没有生成任何分组报告。
+
输入文件位置: """ + self.file_dir + """
+
+
+
+ """
+
+ index_path = os.path.join(self.output_dir, 'group_reports', 'index.html')
+ with open(index_path, 'w', encoding='utf-8') as f:
+ f.write(html_content)
+
+ return index_path
+
+ def _calculate_cpk(self, stats):
+ """Compute the CPK of Height, Volume and Area for every group using the preset limits.
+
+ Args:
+ stats: DataFrame with one row per group; the mean/std columns and the
+ preset limit columns read below must be present.
+
+ Returns:
+ ``stats`` with the columns ``Height_Cpk``, ``Volume_Cpk`` and ``Area_Cpk``
+ appended (rounded to 4 decimals, NaN where no CPK could be computed).
+ """
+ print("详细计算CPK值...")
+
+ def calculate_single_cpk(mean, std, usl, lsl):
+ """Return the CPK of one feature, or NaN when it cannot be computed."""
+ if std == 0 or pd.isna(std):
+ return np.nan
+
+ if pd.isna(usl) or pd.isna(lsl):
+ return np.nan
+
+ # CPK = min[(USL - mean) / (3*std), (mean - LSL) / (3*std)]
+ cpu = (usl - mean) / (3 * std) if usl != float('inf') else float('inf')
+ cpl = (mean - lsl) / (3 * std) if lsl != float('-inf') else float('inf')
+
+ # When one limit is infinite, return the value derived from the other limit
+ if cpu == float('inf') and cpl == float('inf'):
+ return np.nan
+ elif cpu == float('inf'):
+ return cpl
+ elif cpl == float('inf'):
+ return cpu
+ else:
+ return min(cpu, cpl)
+
+ # Collect one dict of CPK values per group, in row order
+ cpk_results = []
+
+ for idx, row in stats.iterrows():
+ print(f"计算分组 {idx} 的CPK值...")
+
+ # Height CPK - preset Height_High is the USL, Height_Low the LSL
+ height_cpk = calculate_single_cpk(
+ row['Height_Mean(mil)'],
+ row['Height_Std(mil)'],
+ row['Height_High(mil)'], # USL - preset upper limit
+ row['Height_Low(mil)'] # LSL - preset lower limit
+ )
+
+ # Volume CPK - preset Vol_Max is the USL, Vol_Min the LSL
+ volume_cpk = calculate_single_cpk(
+ row['Vol_Mean(%)'],
+ row['Vol_Std(%)'],
+ row['Vol_Max(%)'], # USL - preset upper limit
+ row['Vol_Min(%)'] # LSL - preset lower limit
+ )
+
+ # Area CPK - preset Area_Max is the USL, Area_Min the LSL
+ area_cpk = calculate_single_cpk(
+ row['Area_Mean(%)'],
+ row['Area_Std(%)'],
+ row['Area_Max(%)'], # USL - preset upper limit
+ row['Area_Min(%)'] # LSL - preset lower limit
+ )
+
+ cpk_results.append({
+ 'Height_Cpk': round(height_cpk, 4) if not pd.isna(height_cpk) else np.nan,
+ 'Volume_Cpk': round(volume_cpk, 4) if not pd.isna(volume_cpk) else np.nan,
+ 'Area_Cpk': round(area_cpk, 4) if not pd.isna(area_cpk) else np.nan
+ })
+
+ # Append the CPK results to the statistics, aligned on the same index
+ cpk_df = pd.DataFrame(cpk_results, index=stats.index)
+ stats = pd.concat([stats, cpk_df], axis=1)
+
+ print("✅ 所有分组CPK计算完成 - 使用预设上下限值")
+ return stats
+
+    def _save_to_excel_advanced(self):
+        """Write the summary statistics and per-group raw data to ``statistics.xlsx``.
+
+        The workbook is created in ``self.output_dir``. ``self.stats`` (when
+        set) goes to the sheet ``统计汇总``; the rows of the first 50 distinct
+        ``Group_Key`` values of ``self.data`` each get their own sheet.
+
+        Sheet titles are sanitised and de-duplicated: Excel limits titles to
+        31 characters and forbids ``[ ] : * ? / \\``, and two long keys that
+        share their first characters would otherwise end up on the same sheet.
+
+        Returns:
+            str | None: Path of the written workbook, or None when saving failed.
+        """
+        import re  # local import: only needed for sheet-title sanitising
+
+        print("保存Excel文件...")
+
+        excel_filename = os.path.join(self.output_dir, 'statistics.xlsx')
+        max_groups_to_save = 50
+        max_title_len = 31  # Excel's limit on worksheet title length
+        summary_sheet = '统计汇总'
+
+        try:
+            with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
+                # Titles already taken, case-folded because Excel compares
+                # sheet titles case-insensitively.
+                used_titles = set()
+
+                if self.stats is not None:
+                    self.stats.reset_index().to_excel(writer, sheet_name=summary_sheet, index=False)
+                    used_titles.add(summary_sheet.casefold())
+
+                unique_groups = self.data['Group_Key'].unique()[:max_groups_to_save]
+
+                for group_key in unique_groups:
+                    group_data = self.data[self.data['Group_Key'] == group_key]
+
+                    # Replace the characters Excel rejects, then truncate.
+                    base_title = re.sub(r'[\[\]:*?/\\]', '_', f"组_{group_key}")[:max_title_len]
+
+                    # Append a numeric suffix until the title is unique.
+                    sheet_name = base_title
+                    counter = 1
+                    while sheet_name.casefold() in used_titles:
+                        suffix = f"~{counter}"
+                        sheet_name = base_title[:max_title_len - len(suffix)] + suffix
+                        counter += 1
+                    used_titles.add(sheet_name.casefold())
+
+                    group_data.to_excel(writer, sheet_name=sheet_name, index=False)
+
+            print(f"✅ Excel文件保存完成: {excel_filename}")
+            return excel_filename
+
+        except Exception as e:
+            # Best effort: the caller receives None and decides how to proceed.
+            print(f"❌ Excel文件保存失败: {e}")
+            print(f" 错误详情: {traceback.format_exc()}")
+            return None
+
+
+def main():
+ """Entry point: pick a file, load it, generate the report and print the output locations."""
+ print("=== 数据统计报告生成程序(使用预设上下限值) ===")
+
+ processor = DataProcessor()
+
+ try:
+ if processor.select_file():
+ processor._load_data()
+ report_path = processor.generate_report()
+ print(f"✅ 报告生成完成")
+ print(f"📁 输入文件目录: {processor.file_dir}")
+ print(f"📁 输出目录: {processor.output_dir}")
+ print(f"📊 汇总报告: {report_path}")
+
+ # Show the paths of the important output files
+ print(f"📊 Excel文件: {os.path.join(processor.output_dir, 'statistics.xlsx')}")
+ else:
+ print("❌ 未选择文件,程序退出")
+
+ # Top-level boundary: report any failure with its traceback instead of crashing
+ except Exception as e:
+ print(f"❌ 程序执行失败: {e}")
+ print(f" 错误详情: {traceback.format_exc()}")
+
+
+if __name__ == "__main__":
+ # Select the 'spawn' start method for worker processes before anything runs;
+ # force=True overrides a start method that was already set.
+ mp.set_start_method('spawn', force=True)
+ main()
diff --git a/dataProcess/dataProcess_sightml_V1.py b/dataProcess/dataProcess_sightml_V1.py
new file mode 100644
index 0000000..edc6bbf
--- /dev/null
+++ b/dataProcess/dataProcess_sightml_V1.py
@@ -0,0 +1,810 @@
+import pandas as pd
+import tkinter as tk
+from tkinter import filedialog
+import os
+from datetime import datetime
+import numpy as np
+
+
+class DataProcessor:
+ def __init__(self):
+ """Create an empty processor; all attributes are filled in by later steps."""
+ # DataFrame loaded by _load_data()
+ self.data = None
+ # Base name, full path and directory of the chosen file; set by select_file()
+ self.filename = None
+ self.file_path = None
+ self.file_dir = None
+ # datetime at which generate_report() started
+ self.processing_start_time = None
+
+    def select_file(self):
+        """Ask the user to pick the data file through a file-open dialog.
+
+        On success ``self.file_path``, ``self.filename`` and ``self.file_dir``
+        are populated from the chosen path.
+
+        Returns:
+            bool: True when a file was chosen, False when the dialog was cancelled.
+        """
+        print("🔍 打开文件选择对话框...")
+        root = tk.Tk()
+        root.withdraw()  # hide the empty root window; only the dialog is wanted
+
+        try:
+            self.file_path = filedialog.askopenfilename(
+                title="选择数据文件",
+                filetypes=[("Excel files", "*.xlsx"), ("CSV files", "*.csv"), ("All files", "*.*")]
+            )
+        finally:
+            # Always tear down the hidden root; otherwise the Tk instance
+            # stays alive for the rest of the run.
+            root.destroy()
+
+        if not self.file_path:
+            print("❌ 未选择文件")
+            return False
+
+        self.filename = os.path.basename(self.file_path)
+        self.file_dir = os.path.dirname(self.file_path)
+        print(f"✅ 已选择文件: {self.filename}")
+        print(f"📁 文件所在目录: {self.file_dir}")
+        return True
+
+    def _load_data(self):
+        """Load ``self.file_path`` into ``self.data`` and print a short preview.
+
+        ``.csv`` files are read with ``pd.read_csv`` and ``.xlsx`` files with
+        ``pd.read_excel``. The extension is matched case-insensitively, so
+        names such as ``DATA.CSV`` are accepted as well.
+
+        Raises:
+            ValueError: if the file has neither a ``.csv`` nor an ``.xlsx`` extension.
+            Exception: any error raised while reading is printed and re-raised.
+        """
+        print("📥 开始加载数据文件...")
+        try:
+            # Lower-case once so the extension test ignores case.
+            lowered_path = self.file_path.lower()
+            if lowered_path.endswith('.csv'):
+                self.data = pd.read_csv(self.file_path)
+                print("✅ 成功加载CSV文件")
+            elif lowered_path.endswith('.xlsx'):
+                self.data = pd.read_excel(self.file_path)
+                print("✅ 成功加载Excel文件")
+            else:
+                raise ValueError("不支持的文件格式")
+
+            print(f"📊 数据文件形状: {self.data.shape}")
+            print(f"📋 数据列名: {list(self.data.columns)[:10]}...")
+
+            # Preview of the first rows
+            print("\n📋 数据预览(前3行):")
+            print(self.data.head(3))
+
+            # Dtypes of the first ten columns
+            print("\n📊 列数据类型:")
+            for col in self.data.columns[:10]:
+                print(f" {col}: {self.data[col].dtype}")
+
+        except Exception as e:
+            print(f"❌ 加载数据文件时出错: {e}")
+            raise
+
+ def _validate_data(self):
+ """Check that ``self.data`` has every required column, report nulls and coerce numeric columns.
+
+ Raises:
+ ValueError: if a required measurement column or limit column is missing.
+ """
+ print("🔍 验证数据完整性...")
+
+ # Required measurement columns
+ required_measure_columns = ['PAD ID', 'Component ID', 'Height(mil)', 'Volume(%)', 'Area(%)']
+ missing_measure_columns = [col for col in required_measure_columns if col not in self.data.columns]
+
+ if missing_measure_columns:
+ error_msg = f"❌ 数据文件中缺少必要的测量列: {missing_measure_columns}"
+ print(error_msg)
+ raise ValueError(error_msg)
+
+ # Required preset upper/lower limit columns
+ required_limit_columns = ['Height_Low(mil)', 'Height_High(mil)', 'Vol_Min(%)', 'Vol_Max(%)', 'Area_Min(%)',
+ 'Area_Max(%)']
+ missing_limit_columns = [col for col in required_limit_columns if col not in self.data.columns]
+
+ if missing_limit_columns:
+ error_msg = f"❌ 数据文件中缺少必要的上下限列: {missing_limit_columns}"
+ print(error_msg)
+ raise ValueError(error_msg)
+
+ print("✅ 数据验证通过")
+
+ # Report null counts per required column (informational only, nothing is raised)
+ all_columns = required_measure_columns + required_limit_columns
+ null_counts = self.data[all_columns].isnull().sum()
+ if null_counts.any():
+ print(f"⚠️ 数据中存在空值:")
+ for col, count in null_counts[null_counts > 0].items():
+ print(f" {col}: {count} 个空值")
+ else:
+ print("✅ 所有必需列都没有空值")
+
+ # Print summary statistics of the measurement columns
+ print("\n📊 数据统计信息:")
+
+ for col in required_measure_columns:
+ if col in self.data.columns:
+ # Format numeric and non-numeric columns differently
+ if pd.api.types.is_numeric_dtype(self.data[col]):
+ valid_count = self.data[col].count()
+ if valid_count > 0:
+ min_val = self.data[col].min()
+ max_val = self.data[col].max()
+ print(f" {col}: {valid_count} 个有效值, 范围 {min_val:.4f} - {max_val:.4f}")
+ else:
+ print(f" {col}: 0 个有效值")
+ else:
+ # Non-numeric column: show the unique count and a few sample values
+ unique_count = self.data[col].nunique()
+ sample_values = self.data[col].dropna().head(3).tolist()
+ print(
+ f" {col}: {self.data[col].count()} 个有效值, {unique_count} 个唯一值, 示例: {sample_values}")
+
+ # Check dtypes and convert non-numeric columns in place
+ print("\n🔄 数据类型检查与转换:")
+ numeric_columns = ['Height(mil)', 'Volume(%)', 'Area(%)',
+ 'Height_Low(mil)', 'Height_High(mil)',
+ 'Vol_Min(%)', 'Vol_Max(%)', 'Area_Min(%)', 'Area_Max(%)']
+
+ for col in numeric_columns:
+ if col in self.data.columns:
+ if not pd.api.types.is_numeric_dtype(self.data[col]):
+ try:
+ # Coerce to numeric; unparseable values become NaN
+ original_count = self.data[col].count()
+ self.data[col] = pd.to_numeric(self.data[col], errors='coerce')
+ converted_count = self.data[col].count()
+ # Number of values that turned into NaN during conversion
+ lost_data = original_count - converted_count
+ if lost_data > 0:
+ print(f" ⚠️ {col}: 转换后丢失 {lost_data} 个非数值数据")
+ else:
+ print(f" ✅ {col}: 成功转换为数值类型")
+ except Exception as e:
+ print(f" ❌ {col}: 类型转换失败 - {e}")
+ else:
+ valid_count = self.data[col].count()
+ print(f" ✅ {col}: 已经是数值类型, {valid_count} 个有效值")
+
+ def _print_progress(self, message, level=1):
+ """Print ``message`` prefixed with the current time and an indent repeated ``level`` times."""
+ indent = " " * level
+ timestamp = datetime.now().strftime("%H:%M:%S")
+ print(f"{timestamp} {indent}{message}")
+
+ def generate_report(self):
+ """Validate the loaded data, compute per-group statistics and CPK, and write the HTML report.
+
+ Returns:
+ The report path returned by ``_create_html_report``.
+
+ Raises:
+ ValueError: if no data has been loaded yet.
+ Exception: any error during processing is printed with its traceback and re-raised.
+ """
+ if self.data is None:
+ raise ValueError("请先选择数据文件")
+
+ try:
+ self.processing_start_time = datetime.now()
+ print(f"\n🚀 开始生成报告 - {self.processing_start_time.strftime('%Y-%m-%d %H:%M:%S')}")
+
+ # Validate the data
+ self._validate_data()
+
+ self._print_progress("开始数据处理...", 1)
+
+ # Build the group key
+ self._print_progress("创建分组键...", 2)
+
+ # Make sure PAD ID and Component ID are strings so they can be concatenated
+ self.data['PAD ID'] = self.data['PAD ID'].astype(str)
+ self.data['Component ID'] = self.data['Component ID'].astype(str)
+
+ self.data['Group_Key'] = self.data['PAD ID'] + '_' + self.data['Component ID']
+ group_count = self.data['Group_Key'].nunique()
+ self._print_progress(f"共发现 {group_count} 个分组", 2)
+
+ # Show the sizes of the five largest groups
+ group_info = self.data['Group_Key'].value_counts()
+ self._print_progress(f"分组数据量统计:", 2)
+ for i, (group, count) in enumerate(group_info.head(5).items()):
+ self._print_progress(f" {group}: {count} 个数据点", 3)
+ if len(group_info) > 5:
+ self._print_progress(f" ... 还有 {len(group_info) - 5} 个分组", 3)
+
+ # Report NaN values in the measurement columns
+ numeric_columns = ['Height(mil)', 'Volume(%)', 'Area(%)']
+ for col in numeric_columns:
+ if col in self.data.columns:
+ nan_count = self.data[col].isna().sum()
+ if nan_count > 0:
+ self._print_progress(f"⚠️ {col} 有 {nan_count} 个空值,将在统计计算中排除", 3)
+
+ # Compute the statistics
+ self._print_progress("计算基本统计信息...", 2)
+
+ # Replace infinite values in the measurement columns with NaN
+ for col in numeric_columns:
+ if col in self.data.columns:
+ inf_count = np.isinf(self.data[col]).sum()
+ if inf_count > 0:
+ self._print_progress(f"⚠️ {col} 有 {inf_count} 个无穷大值,将替换为NaN", 3)
+ self.data[col] = self.data[col].replace([np.inf, -np.inf], np.nan)
+
+ stats = self.data.groupby('Group_Key').agg({
+ 'Height(mil)': ['min', 'max', 'mean', 'std'],
+ 'Volume(%)': ['min', 'max', 'mean', 'std'],
+ 'Area(%)': ['min', 'max', 'mean', 'std']
+ }).round(4)
+
+ # Flatten the column index; the order must match the agg() spec above
+ stats.columns = [
+ 'Height_Measured_Min(mil)', 'Height_Measured_Max(mil)', 'Height_Mean(mil)', 'Height_Std(mil)',
+ 'Volume_Measured_Min(%)', 'Volume_Measured_Max(%)', 'Volume_Mean(%)', 'Volume_Std(%)',
+ 'Area_Measured_Min(%)', 'Area_Measured_Max(%)', 'Area_Mean(%)', 'Area_Std(%)'
+ ]
+
+ self._print_progress("基本统计信息计算完成", 2)
+
+ # Take the preset limits from the first row of each group
+ self._print_progress("获取预设上下限信息...", 2)
+ limits = self.data.groupby('Group_Key').agg({
+ 'Height_Low(mil)': 'first',
+ 'Height_High(mil)': 'first',
+ 'Vol_Min(%)': 'first',
+ 'Vol_Max(%)': 'first',
+ 'Area_Min(%)': 'first',
+ 'Area_Max(%)': 'first'
+ }).round(4)
+
+ # Merge the statistics with the limits
+ stats = pd.concat([stats, limits], axis=1)
+ self._print_progress("上下限信息获取完成", 2)
+
+ # Compute CPK
+ self._print_progress("开始计算CPK值...", 2)
+ stats = self._calculate_cpk(stats)
+
+ # Analyse the CPK results
+ cpk_analysis = self._analyze_cpk_results(stats)
+ self._print_progress("CPK分析完成", 2)
+ self._print_cpk_summary(cpk_analysis)
+
+ # Generate the HTML report
+ self._print_progress("生成HTML报告...", 2)
+ report_path = self._create_html_report(stats, cpk_analysis)
+ self._print_progress("HTML报告生成完成", 2)
+
+ # Report the total processing time
+ processing_time = datetime.now() - self.processing_start_time
+ self._print_progress(f"总处理时间: {processing_time.total_seconds():.2f} 秒", 1)
+
+ return report_path
+
+ except Exception as e:
+ print(f"❌ 生成报告过程中出错: {e}")
+ import traceback
+ print(f"详细错误信息:")
+ traceback.print_exc()
+ raise
+
+    def _analyze_cpk_results(self, stats):
+        """Summarise the CPK columns of ``stats`` into per-feature counts and a problem list.
+
+        Args:
+            stats: DataFrame indexed by group key. The columns ``Height_Cpk``,
+                ``Volume_Cpk`` and ``Area_Cpk`` are used when present.
+
+        Returns:
+            dict with the keys
+
+            * ``total_groups``: number of rows in ``stats``;
+            * ``cpk_status``: ``{feature: {'total', 'excellent', 'acceptable',
+              'poor', 'invalid'}}`` where excellent is CPK >= 1.33, acceptable
+              is 1.0 <= CPK < 1.33, poor is CPK < 1.0 and invalid counts the
+              groups without a CPK value;
+            * ``problematic_groups``: list of ``{'group_key', 'problems'}`` for
+              every group with at least one CPK below 1.0.
+        """
+        features = ('Height', 'Volume', 'Area')
+        total_groups = len(stats)
+        available = [feature for feature in features if f'{feature}_Cpk' in stats.columns]
+
+        cpk_status = {}
+        for feature in features:
+            if feature in available:
+                valid_cpk = stats[f'{feature}_Cpk'].dropna()
+            else:
+                # A missing CPK column still yields a fully populated entry
+                # (everything "invalid"); an empty dict would make consumers
+                # that index status['total'] raise KeyError.
+                valid_cpk = pd.Series([], dtype=float)
+
+            cpk_status[feature] = {
+                'total': len(valid_cpk),
+                'excellent': int((valid_cpk >= 1.33).sum()),
+                'acceptable': int(((valid_cpk >= 1.0) & (valid_cpk < 1.33)).sum()),
+                'poor': int((valid_cpk < 1.0).sum()),
+                'invalid': total_groups - len(valid_cpk),
+            }
+
+        # A group is problematic when any of its available CPK values is below 1.0.
+        problematic_groups = []
+        for group_key, row in stats.iterrows():
+            problems = []
+            for feature in available:
+                cpk_value = row[f'{feature}_Cpk']
+                if not pd.isna(cpk_value) and cpk_value < 1.0:
+                    problems.append(f"{feature}: {cpk_value:.4f}")
+
+            if problems:
+                problematic_groups.append({
+                    'group_key': group_key,
+                    'problems': problems
+                })
+
+        return {
+            'total_groups': total_groups,
+            'cpk_status': cpk_status,
+            'problematic_groups': problematic_groups
+        }
+
+ def _print_cpk_summary(self, cpk_analysis):
+ """Print the per-feature CPK counts and up to ten problematic groups.
+
+ Args:
+ cpk_analysis: dict with the keys ``cpk_status`` and ``problematic_groups``,
+ as produced by ``_analyze_cpk_results``.
+ """
+ print("\n📈 CPK分析结果摘要:")
+ print("=" * 60)
+
+ for feature, status in cpk_analysis['cpk_status'].items():
+ total = status['total']
+ # Nothing to summarise when the feature has no valid CPK value
+ if total == 0:
+ print(f"\n{feature}: 无有效CPK数据")
+ continue
+
+ print(f"\n{feature}:")
+ # Percentages are relative to the number of valid CPK values
+ excellent_pct = (status['excellent'] / total * 100) if total > 0 else 0
+ acceptable_pct = (status['acceptable'] / total * 100) if total > 0 else 0
+ poor_pct = (status['poor'] / total * 100) if total > 0 else 0
+
+ print(f" ✅ 优秀 (CPK ≥ 1.33): {status['excellent']}/{total} ({excellent_pct:.1f}%)")
+ print(f" ⚠️ 合格 (1.0 ≤ CPK < 1.33): {status['acceptable']}/{total} ({acceptable_pct:.1f}%)")
+ print(f" ❌ 不合格 (CPK < 1.0): {status['poor']}/{total} ({poor_pct:.1f}%)")
+ print(f" ❓ 无法计算: {status['invalid']}")
+
+ if cpk_analysis['problematic_groups']:
+ print(f"\n⚠️ 发现 {len(cpk_analysis['problematic_groups'])} 个有问题分组:")
+ # List only the first ten; the remainder is reported as a count
+ for i, group in enumerate(cpk_analysis['problematic_groups'][:10]):
+ print(f" {i + 1}. {group['group_key']}: {', '.join(group['problems'])}")
+ if len(cpk_analysis['problematic_groups']) > 10:
+ print(f" ... 还有 {len(cpk_analysis['problematic_groups']) - 10} 个问题分组")
+ else:
+ print("\n✅ 所有分组的CPK都在合格范围内")
+
+ print("=" * 60)
+
+ def _calculate_cpk(self, stats):
+ """Compute the CPK of Height, Volume and Area for every group in ``stats``.
+
+ Args:
+ stats: DataFrame with one row per group holding the mean/std columns and
+ the preset limit columns; a missing column is read as NaN.
+
+ Returns:
+ ``stats`` with the columns ``Height_Cpk``, ``Volume_Cpk`` and ``Area_Cpk``
+ appended (rounded to 4 decimals, NaN where no CPK could be computed).
+ """
+ self._print_progress("详细计算CPK值...", 3)
+
+ def calculate_single_cpk(mean, std, usl, lsl):
+ """Return the CPK of one feature, or NaN when it cannot be computed."""
+ if pd.isna(mean) or pd.isna(std) or std == 0:
+ return np.nan
+
+ if pd.isna(usl) or pd.isna(lsl):
+ return np.nan
+
+ try:
+ # An infinite limit leaves that side unconstrained
+ cpu = (usl - mean) / (3 * std) if usl != float('inf') else float('inf')
+ cpl = (mean - lsl) / (3 * std) if lsl != float('-inf') else float('inf')
+
+ if cpu == float('inf') and cpl == float('inf'):
+ return np.nan
+ elif cpu == float('inf'):
+ return cpl
+ elif cpl == float('inf'):
+ return cpu
+ else:
+ return min(cpu, cpl)
+ except (ZeroDivisionError, TypeError):
+ return np.nan
+
+ # Compute the CPK of every feature, one result dict per group
+ cpk_results = []
+ total_groups = len(stats)
+
+ for idx, row in stats.iterrows():
+ # Progress output every 100 groups, only for large inputs
+ if len(cpk_results) % 100 == 0 and total_groups > 100:
+ self._print_progress(f"计算第 {len(cpk_results) + 1} 个分组的CPK...", 4)
+
+ # Height CPK (arguments: mean, std, USL, LSL)
+ height_cpk = calculate_single_cpk(
+ row.get('Height_Mean(mil)', np.nan),
+ row.get('Height_Std(mil)', np.nan),
+ row.get('Height_High(mil)', np.nan),
+ row.get('Height_Low(mil)', np.nan)
+ )
+
+ # Volume CPK
+ volume_cpk = calculate_single_cpk(
+ row.get('Volume_Mean(%)', np.nan),
+ row.get('Volume_Std(%)', np.nan),
+ row.get('Vol_Max(%)', np.nan),
+ row.get('Vol_Min(%)', np.nan)
+ )
+
+ # Area CPK
+ area_cpk = calculate_single_cpk(
+ row.get('Area_Mean(%)', np.nan),
+ row.get('Area_Std(%)', np.nan),
+ row.get('Area_Max(%)', np.nan),
+ row.get('Area_Min(%)', np.nan)
+ )
+
+ cpk_results.append({
+ 'Height_Cpk': round(height_cpk, 4) if not pd.isna(height_cpk) else np.nan,
+ 'Volume_Cpk': round(volume_cpk, 4) if not pd.isna(volume_cpk) else np.nan,
+ 'Area_Cpk': round(area_cpk, 4) if not pd.isna(area_cpk) else np.nan
+ })
+
+ # Append the CPK results to the statistics, aligned on the same index
+ cpk_df = pd.DataFrame(cpk_results, index=stats.index)
+ stats = pd.concat([stats, cpk_df], axis=1)
+
+ self._print_progress(f"所有 {len(stats)} 个分组CPK计算完成", 3)
+ return stats
+
+ def _get_cpk_status_class(self, cpk_value):
+ """Map a CPK value to its status class name.
+
+ Returns ``'cpk-invalid'`` for NaN, ``'cpk-excellent'`` for values >= 1.33,
+ ``'cpk-acceptable'`` for values >= 1.0 and ``'cpk-poor'`` otherwise.
+ """
+ if pd.isna(cpk_value):
+ return 'cpk-invalid'
+ elif cpk_value >= 1.33:
+ return 'cpk-excellent'
+ elif cpk_value >= 1.0:
+ return 'cpk-acceptable'
+ else:
+ return 'cpk-poor'
+
+ def _create_html_report(self, stats, cpk_analysis):
+ """创建完整的HTML报告"""
+ self._print_progress("构建HTML报告内容...", 3)
+
+ total_groups = len(stats)
+
+ # 完整的HTML模板
+ html_content = f"""
+
+
+
+ 数据统计报告 - {self.filename}
+
+
+
+
+
📊 数据统计报告 - {self.filename}
+
生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
输入文件: {self.filename}
+
+
+
📈 报告摘要
+
总分组数量: {total_groups}
+
处理时间: {(datetime.now() - self.processing_start_time).total_seconds():.2f} 秒
+
+
+
+
+"""
+
+ # 添加CPK状态卡片
+ for feature, status in cpk_analysis['cpk_status'].items():
+ total = status['total'] + status['invalid']
+ if total == 0:
+ continue
+
+ html_content += f"""
+
+
{feature} CPK状态
+
+ {status['excellent'] + status['acceptable']}/{total}
+
+
合格率: {(status['excellent'] + status['acceptable']) / total * 100:.1f}%
+
+ 优秀: {status['excellent']}
+ 合格: {status['acceptable']}
+ 不合格: {status['poor']}
+ 无效: {status['invalid']}
+
+
+"""
+
+ html_content += f"""
+
+
+
+ {f'
⚠️ 发现 {len(cpk_analysis["problematic_groups"])} 个问题分组
以下分组的CPK值低于1.0,需要重点关注
' if cpk_analysis['problematic_groups'] else ''}
+
+
📋 详细统计数据
+
+
+ 预设上下限
+ 实测值
+ CPK ≥ 1.33
+ 1.0 ≤ CPK < 1.33
+ CPK < 1.0
+
+
+
+
+
+
+ | 分组标识 |
+ Height(mil) |
+ Volume(%) |
+ Area(%) |
+
+
+
+ | 预设下限 |
+ 预设上限 |
+ 实测最小值 |
+ 实测最大值 |
+ 平均值 |
+ 标准差 |
+ CPK |
+
+ 预设下限 |
+ 预设上限 |
+ 实测最小值 |
+ 实测最大值 |
+ 平均值 |
+ 标准差 |
+ CPK |
+
+ 预设下限 |
+ 预设上限 |
+ 实测最小值 |
+ 实测最大值 |
+ 平均值 |
+ 标准差 |
+ CPK |
+
+
+
+"""
+
+ # 生成表格行数据的辅助函数
+ def format_value(value):
+ if pd.isna(value):
+ return 'N/A'
+ elif isinstance(value, (int, float)):
+ return f"{value:.4f}"
+ else:
+ return str(value)
+
+ # 用于检查列是否存在的辅助函数
+ def safe_get_value(row, column_name):
+ """安全获取列值,如果列不存在返回N/A"""
+ if column_name in row.index:
+ return row[column_name]
+ else:
+ return np.nan
+
+ for group_key, row in stats.iterrows():
+ # 检查是否为问题分组
+ is_problematic = any(problem['group_key'] == group_key for problem in cpk_analysis['problematic_groups'])
+ row_class = 'class="problematic-row"' if is_problematic else ''
+
+ html_content += f"""
+
+ | {group_key}{' ⚠️' if is_problematic else ''} |
+"""
+
+ # 为每个特征生成列
+ for feature in ['Height', 'Volume', 'Area']:
+ cpk_value = safe_get_value(row, f'{feature}_Cpk')
+ cpk_class = self._get_cpk_status_class(cpk_value)
+
+ # 为不同特征设置正确的列名
+ if feature == 'Height':
+ lower_limit_col = 'Height_Low(mil)'
+ upper_limit_col = 'Height_High(mil)'
+ measured_min_col = 'Height_Measured_Min(mil)'
+ measured_max_col = 'Height_Measured_Max(mil)'
+ mean_col = 'Height_Mean(mil)'
+ std_col = 'Height_Std(mil)'
+ else:
+ lower_limit_col = f"{'Vol' if feature == 'Volume' else 'Area'}_Min(%)" # 修正:Volume使用Vol_Min(%),Area使用Area_Min(%)
+ upper_limit_col = f"{'Vol' if feature == 'Volume' else 'Area'}_Max(%)" # 修正:Volume使用Vol_Max(%),Area使用Area_Max(%)
+ measured_min_col = f'{feature}_Measured_Min(%)'
+ measured_max_col = f'{feature}_Measured_Max(%)'
+ mean_col = f'{feature}_Mean(%)'
+ std_col = f'{feature}_Std(%)'
+
+ html_content += f"""
+
+ {format_value(safe_get_value(row, lower_limit_col))} |
+ {format_value(safe_get_value(row, upper_limit_col))} |
+ {format_value(safe_get_value(row, measured_min_col))} |
+ {format_value(safe_get_value(row, measured_max_col))} |
+ {format_value(safe_get_value(row, mean_col))} |
+ {format_value(safe_get_value(row, std_col))} |
+ {format_value(cpk_value)} |
+"""
+
+ html_content += """
+
"""
+
+ html_content += """
+
+
+
+
+
+
📊 CPK状态分布
+
+"""
+
+ # 添加简单的CPK分布图表
+ for feature, status in cpk_analysis['cpk_status'].items():
+ total = status['total'] + status['invalid']
+ if total == 0:
+ continue
+
+ html_content += f"""
+
+
{feature} CPK分布
+
+
+
+ 优秀 {status['excellent']} | 合格 {status['acceptable']} | 不合格 {status['poor']} | 无效 {status['invalid']}
+
+
+
+"""
+
+ html_content += """
+
+
+
+
+"""
+
+ # 保存报告
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+ report_filename = f"{os.path.splitext(self.filename)[0]}_report_{timestamp}.html"
+ report_path = os.path.join(self.file_dir, report_filename)
+
+ self._print_progress(f"保存报告到: {report_path}", 3)
+ with open(report_path, 'w', encoding='utf-8') as f:
+ f.write(html_content)
+
+ return report_path
+
+
+def main():
+ """Entry point: pick a file, load it and generate the statistics report."""
+ print("=" * 60)
+ print("🚀 数据统计报告生成程序 - Volume上下限修复版")
+ print("=" * 60)
+
+ processor = DataProcessor()
+
+ try:
+ if processor.select_file():
+ processor._load_data()
+ report_path = processor.generate_report()
+
+ print("\n" + "=" * 60)
+ print("✅ 程序执行完成")
+ print(f"📄 统计报告生成成功: {report_path}")
+ print("=" * 60)
+ else:
+ print("❌ 未选择文件,程序退出")
+
+ # Top-level boundary: report any failure with its traceback instead of crashing
+ except Exception as e:
+ print(f"\n❌ 程序执行失败: {e}")
+ import traceback
+ traceback.print_exc()
+
+
+# Run the interactive workflow only when executed as a script, not on import.
+if __name__ == "__main__":
+ main()
diff --git a/htmlProcess/.gitignore b/htmlProcess/.gitignore
new file mode 100644
index 0000000..00c07ca
--- /dev/null
+++ b/htmlProcess/.gitignore
@@ -0,0 +1,17 @@
+/build/*
+/build
+/dist/*
+/dist
+/source/*
+/source
+
+
+
+
+htmlReportProcess_Merge_picHtml_V3.py
+
+htmlReportProcess_Merge_picHtml_V2.py
+
+htmlReportProcess_Merge_pic_V2.py
+
+/htmlReportProcess*/
\ No newline at end of file
diff --git a/htmlProcess/README.md b/htmlProcess/README.md
new file mode 100644
index 0000000..92d2fb2
--- /dev/null
+++ b/htmlProcess/README.md
@@ -0,0 +1,11 @@
+# Sample GitLab Project
+
+This sample project shows how a project in GitLab looks for demonstration purposes. It contains issues, merge requests and Markdown files in many branches,
+named and filled with lorem ipsum.
+
+You can look around to get an idea how to structure your project and, when done, you can safely delete this project.
+
+[Learn more about creating GitLab projects.](https://docs.gitlab.com/ee/gitlab-basics/create-project.html)
+
+html文件的报告自动分析和处理数据的工具脚本
+
diff --git a/htmlProcess/htmlReportProcess_Merge_picHtml_V1.py b/htmlProcess/htmlReportProcess_Merge_picHtml_V1.py
new file mode 100644
index 0000000..c571814
--- /dev/null
+++ b/htmlProcess/htmlReportProcess_Merge_picHtml_V1.py
@@ -0,0 +1,926 @@
+import os
+import re
+import sys
+import time
+import pandas as pd
+import matplotlib.pyplot as plt
+from datetime import datetime
+from matplotlib.lines import Line2D
+from typing import Optional, Tuple, List, Dict, Any, Union
+from pathlib import Path
+import numpy as np
+import base64
+from io import BytesIO
+from jinja2 import Template
+from colorama import Fore, Style, init
+
+# Silence pandas' SettingWithCopy warning so it does not clutter the console output
+pd.options.mode.chained_assignment = None
+
+# Configure fonts able to render Chinese text in matplotlib figures
+plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'Microsoft YaHei']
+plt.rcParams['axes.unicode_minus'] = False
+
+# HTML模板 - 添加了SN独立图的显示
+HTML_TEMPLATE = """
+
+
+
+
+
+ 测试报告分析 - {{ keyword }}
+
+
+
+
+
+ {% for test in tests %}
+
+
+
+
+
+
数据点数
+
{{ test.stats.count }}
+
+
+
平均值
+
{{ "%.4f"|format(test.stats.mean) }}
+
+
+
中位数
+
{{ "%.4f"|format(test.stats.median) }}
+
+
+
标准差
+
{{ "%.4f"|format(test.stats.std) }}
+
+
+
最小值
+
{{ "%.4f"|format(test.stats.min) }}
+
+
+
最大值
+
{{ "%.4f"|format(test.stats.max) }}
+
+
+
+ {% if test.limits.lower is not none or test.limits.upper is not none %}
+
+ {% if test.limits.lower is not none %}
+
+
下限值
+
{{ "%.4f"|format(test.limits.lower) }}
+
+ {% endif %}
+ {% if test.limits.upper is not none %}
+
+
上限值
+
{{ "%.4f"|format(test.limits.upper) }}
+
+ {% endif %}
+
+ {% endif %}
+
+
+
📈 汇总视图 (所有SN)
+
+

+
+
+
+ {% if test.sn_plot_images %}
+
🔍 SN独立视图 ({{ test.sn_plot_images|length }}个SN)
+
+ {% for sn_plot in test.sn_plot_images %}
+
+
SN: {{ sn_plot.sn }}
+

+
+ {% endfor %}
+
+ {% endif %}
+
+ {% endfor %}
+
+
+
📈 分析摘要
+
+ 文件路径: {{ file_path }}
+
+
+ 分析时间: {{ analysis_time }}秒
+
+
+
测试项分布:
+
+ - 正常: {{ status_counts.normal }} 个
+ - 警告: {{ status_counts.warning }} 个
+ - 异常: {{ status_counts.abnormal }} 个
+
+
+
+
+
+ 报告生成于 {{ timestamp }} | 测试报告分析系统
+
+
+
+"""
+
+
+class TestReportScatterPlotter:
+ def __init__(self):
+ """Create a plotter with no file selected and no data loaded."""
+ # Absolute path of the report workbook; set by get_file_path()
+ self.file_path: Optional[str] = None
+ # Data read from the workbook by load_data()
+ self.df: Optional[pd.DataFrame] = None
+ self.output_dir: Optional[str] = None
+ # Columns that load_data() requires to be present in the sheet
+ self.required_columns = ["Test Name New", "SN", "Measurement", "Test Time", "Lower Limit", "Upper Limit", ]
+ self.col_lower: Optional[str] = None
+ self.col_upper: Optional[str] = None
+ self.html_report_path: Optional[str] = None
+
+ # Cache of already processed data
+ self._processed_data_cache: Dict[str, Any] = {}
+
+ def _print_stage(self, msg: str) -> None:
+ """Print ``msg`` as a stage banner framed by two lines of 30 ``=`` characters."""
+ print(f"\n{'=' * 30}\n{msg}\n{'=' * 30}")
+
+ def _print_progress(self, current: int, total: int, prefix: str = "进度") -> None:
+ """Redraw a 30-character progress bar on the current console line.
+
+ Nothing is printed when ``total`` is not positive.
+ """
+ if total <= 0:
+ return
+
+ percent = (current / total) * 100
+ bar_len = 30
+ filled = int(bar_len * current / total)
+ bar = "█" * filled + "-" * (bar_len - filled)
+ # "\r" returns to the start of the line so the bar is overwritten in place
+ sys.stdout.write(f"\r{prefix}: [{bar}] {current}/{total} ({percent:.1f}%)")
+ sys.stdout.flush()
+ if current == total:
+ print() # finish the line once the bar is complete
+
+    def get_file_path(self) -> None:
+        """Prompt on stdin until the user enters the path of an existing file.
+
+        Surrounding whitespace and quotes (as added when a path is pasted from
+        a shell or file manager) are ignored. The accepted path is resolved to
+        an absolute path and stored in ``self.file_path``.
+        """
+        self._print_stage("输入文件路径")
+
+        while True:
+            print(f"{Fore.WHITE}请输入测试报告文件路径(.xlsx): ")
+            # Strip whitespace first, then any wrapping quotes, then the
+            # whitespace that may have been inside those quotes.
+            file_path = input("> ").strip().strip('"\'').strip()
+
+            if not file_path:
+                continue
+
+            path_obj = Path(file_path)
+            # is_file() rather than exists(): a directory exists but can never
+            # be loaded as a report, so keep asking.
+            if path_obj.is_file():
+                self.file_path = str(path_obj.resolve())
+                print(f"已选择文件: {self.file_path}")
+                break
+
+            print(f"文件不存在: {file_path},请重新输入")
+
+    def _find_column_case_insensitive(self, candidates: List[str]) -> Optional[str]:
+        """Return the actual column label matching the first candidate that is found.
+
+        Labels and candidates are compared case-insensitively and with
+        surrounding whitespace ignored.
+
+        Args:
+            candidates: Column names to look for, in order of preference.
+
+        Returns:
+            The label exactly as it appears in ``self.df.columns``, or None
+            when no data is loaded or no candidate matches.
+        """
+        if self.df is None:
+            return None
+
+        # str() keeps non-string labels (e.g. numeric headers) from raising
+        # AttributeError on .lower(). When several labels normalise to the
+        # same key, the last one wins.
+        columns_lower = {str(col).lower().strip(): col for col in self.df.columns}
+        for candidate in candidates:
+            key = str(candidate).lower().strip()
+            if key in columns_lower:
+                return columns_lower[key]
+        return None
+
+    def load_data(self) -> None:
+        """Load the report sheet of ``self.file_path`` into ``self.df`` and detect the limit columns."""
+        self._print_stage("加载数据")
+        start_time = time.time()
+
+        # Make sure the file exists
+        if not os.path.exists(self.file_path):
+            raise FileNotFoundError(f"文件不存在: {self.file_path}")
+
+        # Pick the reader engine from the file extension
+        file_ext = self.file_path.lower()
+        if file_ext.endswith('.xlsx'):
+            # Engine preference order for .xlsx files (the list itself is not read again in this method)
+            engine_options = ['openpyxl', 'calamine']  # calamine must be installed separately and may be faster
+            engine = 'openpyxl'  # default
+        elif file_ext.endswith('.xls'):
+            # Engine choice for .xls files
+            engine_options = ['xlrd', 'calamine']
+            engine = 'xlrd'  # default
+        else:
+            raise ValueError("输入文件不是有效的 Excel 文件(应为 .xls 或 .xlsx 格式)")
+
+        # Fetch the sheet names cheaply, without parsing any sheet
+        try:
+            if engine == 'openpyxl':
+                import openpyxl
+                workbook = openpyxl.load_workbook(self.file_path, read_only=True)
+                sheet_names = workbook.sheetnames
+                workbook.close()
+            elif engine == 'xlrd':
+                import xlrd
+                workbook = xlrd.open_workbook(self.file_path, on_demand=True)
+                sheet_names = workbook.sheet_names()
+                workbook.release_resources()
+            else:
+                # pandas fallback; not reachable while engine can only be 'openpyxl' or 'xlrd'
+                excel_file = pd.ExcelFile(self.file_path, engine=engine)
+                sheet_names = excel_file.sheet_names
+        except Exception as e:
+            raise RuntimeError(f"无法打开 Excel 文件,请确认该文件未被损坏或占用。错误: {type(e).__name__}: {e}")
+
+        # Sheet names to look for, in priority order
+        target_sheets = ["Merged All Tests", "All Tests"]
+        selected_sheet = None
+
+        for sheet in target_sheets:
+            if sheet in sheet_names:
+                selected_sheet = sheet
+                break
+
+        if selected_sheet is None:
+            raise ValueError(
+                f"未找到指定的工作表: {' 或 '.join(target_sheets)}。"
+                f"当前文件包含的工作表有: {sheet_names}"
+            )
+
+        try:
+            # Performance: reader options chosen for speed
+            read_excel_kwargs = {
+                # 'filepath_or_buffer': self.file_path,
+                'io': self.file_path,  # fix: the keyword is 'io', not 'filepath_or_buffer'
+                'sheet_name': selected_sheet,
+                'engine': engine,
+                'dtype': 'object',  # read everything as object to skip type inference
+                'na_filter': False,  # disable NA detection for faster reads
+            }
+
+            # When required columns are known and non-empty, read only those
+            if hasattr(self, 'required_columns') and self.required_columns:
+                # First check which of them actually exist
+                try:
+                    # Lightweight header probe
+                    sample_df = pd.read_excel(
+                        self.file_path,
+                        sheet_name=selected_sheet,
+                        engine=engine,
+                        nrows=1  # one data row is enough to get the column names
+                    )
+                    existing_columns = [col for col in self.required_columns if col in sample_df.columns]
+
+                    if len(existing_columns) < len(self.required_columns):
+                        missing = set(self.required_columns) - set(existing_columns)
+                        raise KeyError(f"缺少必要列: {list(missing)}")
+
+                    read_excel_kwargs['usecols'] = existing_columns
+
+                    # print(f"使用 read_excel_kwargs 读取excel:\n {read_excel_kwargs}")
+                    # Print every reader argument (debug aid)
+                    print("使用 read_excel_kwargs 读取excel:")
+                    for key, value in read_excel_kwargs.items():
+                        print(f" {key}: {repr(value)}")  # repr keeps special characters visible
+
+                except Exception as e:
+                    print(f"列检查失败,将读取所有列: {e}")
+                    # Probe failed (this also catches the KeyError raised just above): fall back to all columns
+
+
+            # Perform the actual read
+            self._print_stage("执行数据读取")
+            self.df = pd.read_excel(**read_excel_kwargs)
+
+        except Exception as e:
+            # The chosen engine failed: retry with pandas' own engine selection
+            print(f"引擎 {engine} 读取失败,尝试备选引擎...\n{e}")
+            try:
+                # Plain read without the speed options used above
+                self.df = pd.read_excel(
+                    self.file_path,
+                    sheet_name=selected_sheet,
+                    engine=None  # let pandas choose
+                )
+            except Exception as fallback_e:
+                raise RuntimeError(
+                    f"读取 Excel 失败,工作表: '{selected_sheet}'。"
+                    f"主引擎错误: {type(e).__name__}: {e}\n"
+                    f"备选引擎错误: {type(fallback_e).__name__}: {fallback_e}"
+                )
+
+        if self.df.empty:
+            raise ValueError("工作表为空,无法处理")
+
+        # Validate required columns (needed again when the usecols filter was not applied above)
+        if hasattr(self, 'required_columns') and self.required_columns:
+            missing_columns = [col for col in self.required_columns if col not in self.df.columns]
+            if missing_columns:
+                raise KeyError(f"缺少必要列: {missing_columns}")
+
+        # Remember the names of the lower/upper limit columns
+        self.col_lower = self._find_column_case_insensitive([
+            "Lower Limit", "lower limit", "lower_limit", "ll", "lower"
+        ])
+        self.col_upper = self._find_column_case_insensitive([
+            "Upper Limit", "upper limit", "upper_limit", "ul", "upper"
+        ])
+
+        loading_time = time.time() - start_time
+        print(f"数据加载完成: {len(self.df)} 行 × {self.df.shape[1]} 列")
+        print(f"使用引擎: {engine}")
+        print(f"耗时: {loading_time:.2f}s")
+
+        # Summary of the detected limit columns
+        print(f"检测到下限列: {self.col_lower or '无'}")
+        print(f"检测到上限列: {self.col_upper or '无'}")
+
+ # 可选:类型转换(如果知道具体的数据类型)
+ # self._convert_data_types()
+
+ # 可以添加这个方法进行类型转换优化
+    def _convert_data_types(self):
+        """Coerce columns of ``self.df`` in place, choosing the target type from the column name.
+
+        A name containing a numeric hint goes through ``pd.to_numeric``; otherwise a name containing
+        a date hint goes through ``pd.to_datetime``. Unparseable cells are coerced (``errors='coerce'``).
+        Nothing happens when no frame is loaded or the frame is empty.
+        """
+        frame = self.df
+        if frame is None or frame.empty:
+            return
+        numeric_hints = ('limit', 'value', 'measure', 'result', 'score')
+        date_hints = ('date', 'time', 'period')
+        for column in frame.columns:
+            label = str(column).lower()
+            if any(hint in label for hint in numeric_hints):  # numeric wins over date
+                self.df[column] = pd.to_numeric(self.df[column], errors='coerce')
+            elif any(hint in label for hint in date_hints):
+                self.df[column] = pd.to_datetime(self.df[column], errors='coerce')
+
+
+    def get_keyword(self) -> Tuple[pd.DataFrame, str, List[str]]:
+        """Prompt for a keyword and return ``(matching rows, keyword, distinct test names)``; empty frame on abort."""
+        self._print_stage("筛选关键词")
+
+        while True:
+            keyword = input("请输入筛选关键词(匹配 'Test Name New'): ").strip()
+
+            if not keyword:
+                print("❌ 关键词不能为空,请重新输入")
+                continue
+
+            # Nothing to filter when the loaded frame is empty
+            if self.df.empty:
+                print("⚠️ 数据框为空,无法进行筛选")
+                return pd.DataFrame(), keyword, []
+
+            # The filter column must exist
+            if "Test Name New" not in self.df.columns:
+                print("❌ 列 'Test Name New' 不存在于数据框中")
+                print(f"可用列: {list(self.df.columns)}")
+                return pd.DataFrame(), keyword, []
+
+            try:
+                mask = self.df["Test Name New"].astype(str).str.contains(keyword, case=False, na=False)
+                filtered_df = self.df.loc[mask].copy()
+
+                if filtered_df.empty:
+                    # Give a friendly hint and options instead of raising
+                    print(f"⚠️ 没有找到包含关键词 '{keyword}' 的测试项")
+
+                    # Show a few available test names for reference
+                    available_tests = self.df["Test Name New"].dropna().unique()
+                    if len(available_tests) > 0:
+                        print("📋 可用的测试项示例:")
+                        for test in available_tests[:5]:  # only the first five, as a sample
+                            print(f" - {test}")
+                        if len(available_tests) > 5:
+                            print(f" ... 还有 {len(available_tests) - 5} 个测试项")
+
+                    # Let the user retry, take everything, or leave
+                    choice = input("请选择: 1-重新输入关键词 2-使用所有数据 3-退出当前操作: ")
+                    if choice == "1":
+                        continue
+                    elif choice == "2":
+                        filtered_df = self.df.copy()
+                        unique_tests = filtered_df["Test Name New"].unique().tolist()
+                        print(f"✅ 使用所有数据: {len(filtered_df)} 行,{len(unique_tests)} 个测试项")
+                        return filtered_df, "", unique_tests
+                    else:
+                        print("👋 退出筛选操作")
+                        return pd.DataFrame(), keyword, []
+                else:
+                    unique_tests = filtered_df["Test Name New"].unique().tolist()
+                    print(f"✅ 匹配到 {len(filtered_df)} 行数据,涉及 {len(unique_tests)} 个不同测试项")
+                    return filtered_df, keyword, unique_tests
+
+            except Exception as e:
+                print(f"❌ 筛选过程中发生错误: {e}")
+                print("请检查数据格式或重新输入关键词")
+                continue
+
+    def create_output_dir(self, keyword) -> None:
+        """Create ``scatter_report_out`` beside the input file and set ``self.html_report_path``.
+
+        ``keyword`` becomes part of the report file name, so it is first reduced to filename-safe
+        characters; an empty keyword stays empty. Raises ValueError when no input file is set.
+        """
+        self._print_stage("创建输出目录")
+        if not self.file_path:
+            raise ValueError("文件路径未设置")
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        name_part = self._safe_filename(keyword) if keyword else ""  # e.g. "a/b" must not add a path level
+        self.output_dir = os.path.join(os.path.dirname(self.file_path), "scatter_report_out")
+        self.html_report_path = os.path.join(self.output_dir, f"{name_part}_report_{timestamp}.html")
+        os.makedirs(self.output_dir, exist_ok=True)
+        print(f"输出目录: {self.output_dir}")
+
+    @staticmethod
+    def _safe_filename(name: str) -> str:
+        """Keep only alphanumerics, spaces, ``_`` and ``-`` of ``name``; fall back to "Unknown_Test" if nothing is left."""
+        kept = [ch for ch in str(name) if ch.isalnum() or ch in " _-"]
+        return "".join(kept).strip() or "Unknown_Test"
+
+    def _extract_limits(self, df_one_test: pd.DataFrame) -> Tuple[
+            Optional[float], Optional[float], List[float], List[float]]:
+        """Collect the numeric limit values found in one test's rows.
+
+        Returns ``(lower_plot, upper_plot, lower_set, upper_set)``: the two sets are the sorted
+        distinct numeric values of the detected limit columns (empty when a column is absent),
+        ``lower_plot`` is the smallest lower limit and ``upper_plot`` the largest upper limit,
+        or ``None`` when the corresponding set is empty.
+        """
+
+        def distinct_sorted(column: Optional[str]) -> List[float]:
+            # No detected column, or the column is not in this frame -> nothing to report.
+            if not column or column not in df_one_test.columns:
+                return []
+            numeric = self._clean_and_convert_series(df_one_test[column], 'numeric')
+            return sorted(numeric.dropna().unique().tolist())
+
+        lower_set, upper_set = distinct_sorted(self.col_lower), distinct_sorted(self.col_upper)
+        return (min(lower_set) if lower_set else None, max(upper_set) if upper_set else None,
+                lower_set, upper_set)
+
+    @staticmethod
+    def _clean_and_convert_series(series: pd.Series, target_type: str = 'numeric') -> pd.Series:
+        """Convert ``series`` to the requested kind, returning it untouched when empty or the kind is unknown.
+
+        ``'numeric'``: numeric dtypes are cast to float; anything else is stringified, stripped of
+        commas and spaces, and parsed with ``pd.to_numeric`` (failures become NaN).
+        ``'datetime'``: delegated to ``_convert_to_datetime``.
+        """
+        if series.empty or target_type not in ('numeric', 'datetime'):
+            return series
+        if target_type == 'datetime':
+            return TestReportScatterPlotter._convert_to_datetime(series)
+
+        # Numeric path from here on.
+        if pd.api.types.is_numeric_dtype(series):
+            return series.astype(float)
+        as_text = series.astype(str)
+        without_separators = as_text.str.replace(r'[, ]', '', regex=True).str.strip()
+        return pd.to_numeric(without_separators, errors='coerce')
+
+    @staticmethod
+    def _convert_to_datetime(series: pd.Series) -> pd.Series:
+        """Convert a mixed column (epoch numbers, Excel serial days, date strings) to datetimes; failures stay NaT."""
+        if pd.api.types.is_datetime64_any_dtype(series):
+            return series
+
+        # Prepare both a numeric view and a stripped string view of the values
+        numeric_series = pd.to_numeric(series, errors='coerce')
+        string_series = series.astype(str).str.strip()
+
+        result = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns]')
+
+        # Numeric values are classified by magnitude
+        masks = {
+            'ms': numeric_series >= 1e11,  # converted below with unit='ms'
+            's': (numeric_series >= 1e9) & (numeric_series < 1e11),  # converted below with unit='s'
+            'excel': (numeric_series > 20000) & (numeric_series < 60000)  # converted below as days since 1899-12-30
+        }
+
+        for mask_type, mask in masks.items():
+            if mask.any():
+                if mask_type == 'ms':
+                    result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='ms')
+                elif mask_type == 's':
+                    result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='s')
+                elif mask_type == 'excel':
+                    origin = pd.Timestamp('1899-12-30')
+                    result.loc[mask] = origin + pd.to_timedelta(numeric_series.loc[mask], unit='D')
+
+        # String dates: only cells that are still unset
+        remaining_mask = result.isna()
+        if remaining_mask.any():
+            remaining_strings = string_series.loc[remaining_mask]
+
+            # Explicit formats are tried first
+            format_patterns = [
+                (r'^\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}$', '%Y-%m-%d %H-%M-%S'),
+            ]
+
+            for pattern, date_format in format_patterns:
+                format_mask = remaining_strings.str.match(pattern)
+                if format_mask.any():
+                    result.loc[remaining_mask[remaining_mask].index[format_mask]] = pd.to_datetime(
+                        remaining_strings.loc[format_mask], format=date_format, errors='coerce'
+                    )
+
+            # Generic parsing for whatever is still unset
+            still_na_mask = result.isna() & remaining_mask
+            if still_na_mask.any():
+                result.loc[still_na_mask] = pd.to_datetime(
+                    string_series.loc[still_na_mask], errors='coerce'
+                )
+
+        return result
+
+    def _preprocess_test_data(self, test_data: pd.DataFrame) -> pd.DataFrame:
+        """Add parsed ``Measurement_num`` / ``TestTime_dt`` columns and return the usable rows by time.
+
+        The two helper columns are written into ``test_data`` itself; rows where either parsed
+        value is missing are dropped from the returned frame, which is sorted by ``TestTime_dt``.
+        """
+        convert = self._clean_and_convert_series
+        measurement = convert(test_data['Measurement'], 'numeric')
+        test_data['Measurement_num'] = measurement
+        test_time = convert(test_data['Test Time'], 'datetime')
+        test_data['TestTime_dt'] = test_time
+        parsed_columns = ['Measurement_num', 'TestTime_dt']
+        return test_data.dropna(subset=parsed_columns).sort_values('TestTime_dt')
+
+    def _calculate_statistics(self, y_data: pd.Series) -> Dict[str, float]:
+        """Summarise ``y_data``: count, mean, median, min, max, std and the 25% / 75% quantiles (``q1`` / ``q3``)."""
+        q1, q3 = (y_data.quantile(q) for q in (0.25, 0.75))
+        return dict(
+            count=len(y_data),
+            mean=y_data.mean(),
+            median=y_data.median(),
+            min=y_data.min(),
+            max=y_data.max(),
+            std=y_data.std(),
+            q1=q1,
+            q3=q3,
+        )
+
+    def _plot_to_base64(self, fig) -> str:
+        """Render ``fig`` as a 150-dpi PNG and return it base64-encoded; the figure is closed even if rendering fails."""
+        try:
+            with BytesIO() as buf:
+                fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+                return base64.b64encode(buf.getvalue()).decode('utf-8')
+        finally:
+            plt.close(fig)  # always release the figure so a failed render cannot pile up open figures
+
+    def _create_summary_plot(self, test_data: pd.DataFrame, test_name: str,
+                             lower_plot: Optional[float], upper_plot: Optional[float]) -> str:
+        """Draw one scatter plot with every SN of a test and return it as a base64 PNG string."""
+        fig, ax = plt.subplots(figsize=(12, 8))
+
+        # One scatter series per SN (a single "Unknown_SN" series when there is no SN column)
+        groups = list(test_data.groupby("SN")) if "SN" in test_data.columns else [("Unknown_SN", test_data)]
+        for sn, group in groups:
+            ax.scatter(group['TestTime_dt'], group['Measurement_num'],
+                       label=str(sn), alpha=0.7, s=25)
+
+        # Statistics over all points of the test
+        y_data = test_data['Measurement_num']
+        stats = self._calculate_statistics(y_data)
+
+        # Time span used as the extent of the mean/median lines
+        x_min, x_max = test_data['TestTime_dt'].min(), test_data['TestTime_dt'].max()
+
+        if lower_plot is not None:
+            ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2, label="Lower Limit")
+        if upper_plot is not None:
+            ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2, label="Upper Limit")
+
+        # Mean and median lines
+        ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, colors='orange',
+                  linestyles='-', linewidth=1.5, alpha=0.7, label='Mean')
+        ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, colors='purple',
+                  linestyles='-.', linewidth=1.5, alpha=0.7, label='Median')
+
+        # Titles, labels, grid and a legend placed outside the axes on the right
+        ax.set_title(f"汇总图 - {test_name}")
+        ax.set_xlabel("Test Time")
+        ax.set_ylabel("Measurement Value")
+        ax.grid(True, alpha=0.3)
+        ax.tick_params(axis='x', rotation=45)
+        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+
+        return self._plot_to_base64(fig)
+
+    def _create_sn_plots(self, test_data: pd.DataFrame, test_name: str,
+                         lower_plot: Optional[float], upper_plot: Optional[float]) -> List[Dict[str, str]]:
+        """Draw one plot per SN; returns ``[{"sn": ..., "image": base64 PNG}, ...]`` (empty without an SN column)."""
+        sn_plots = []
+
+        if "SN" not in test_data.columns:
+            return sn_plots
+
+        sn_groups = test_data.groupby("SN")
+
+        for sn, group in sn_groups:
+            if group.empty:
+                continue
+
+            fig, ax = plt.subplots(figsize=(10, 6))
+
+            # Data points of the current SN
+            ax.scatter(group['TestTime_dt'], group['Measurement_num'],
+                       color='blue', alpha=0.7, s=30, label=f"SN: {sn}")
+
+            # Statistics of the current SN only
+            y_data = group['Measurement_num']
+            stats = self._calculate_statistics(y_data)
+
+            # Time span of this SN, used as the extent of the mean/median lines
+            x_min, x_max = group['TestTime_dt'].min(), group['TestTime_dt'].max()
+
+            if lower_plot is not None:
+                ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2, label="Lower Limit")
+            if upper_plot is not None:
+                ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2, label="Upper Limit")
+
+            # Mean and median lines
+            ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, colors='orange',
+                      linestyles='-', linewidth=1.5, alpha=0.7, label='Mean')
+            ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, colors='purple',
+                      linestyles='-.', linewidth=1.5, alpha=0.7, label='Median')
+
+            # Titles, labels, grid and legend
+            ax.set_title(f"SN独立图 - {test_name} (SN: {sn})")
+            ax.set_xlabel("Test Time")
+            ax.set_ylabel("Measurement Value")
+            ax.grid(True, alpha=0.3)
+            ax.tick_params(axis='x', rotation=45)
+            ax.legend()
+
+            # Encode the figure (this also closes it)
+            plot_image = self._plot_to_base64(fig)
+            sn_plots.append({"sn": str(sn), "image": plot_image})
+
+        return sn_plots
+
+    def _determine_test_status(self, stats: Dict[str, float],
+                               lower_limit: Optional[float],
+                               upper_limit: Optional[float]) -> Dict[str, Any]:
+        """Classify a test as success / warning / danger from its statistics and limits.
+
+        Each limit is checked on its own, so a test with only one limit is still judged.
+        danger: min below the lower limit or max above the upper limit. warning: mean within
+        10% of a limit's magnitude (measured inward, so negative limits work) or std above
+        20% of the limit window. Returns ``{"status", "status_display"}``.
+        """
+        has_lo, has_hi = lower_limit is not None, upper_limit is not None
+        if (has_lo and stats['min'] < lower_limit) or (has_hi and stats['max'] > upper_limit):
+            return {"status": "danger", "status_display": "异常"}
+        near_lo = has_lo and stats['mean'] < lower_limit + abs(lower_limit) * 0.1
+        near_hi = has_hi and stats['mean'] > upper_limit - abs(upper_limit) * 0.1
+        if near_lo or near_hi or (has_lo and has_hi and stats['std'] > (upper_limit - lower_limit) * 0.2):
+            return {"status": "warning", "status_display": "警告"}
+        return {"status": "success", "status_display": "正常"}
+
+    def generate_html_report(self, filtered_df: pd.DataFrame, keyword: str,
+                             unique_tests: List[str]) -> None:
+        """Build plots and statistics for every test name and write them to ``self.html_report_path``."""
+        self._print_stage("生成HTML报告")
+        start_time = time.time()
+
+        test_results = []
+        total_points = 0
+        status_counts = {"success": 0, "warning": 0, "danger": 0}
+
+        for i, test_name in enumerate(unique_tests, 1):
+            self._print_progress(i, len(unique_tests), "生成测试报告")
+
+            # Rows of this test, with parsed measurement/time columns
+            test_data = filtered_df[filtered_df["Test Name New"] == test_name].copy()
+            test_data = self._preprocess_test_data(test_data)
+
+            if test_data.empty:
+                continue
+
+            # Limits used for plotting and status
+            lower_plot, upper_plot, _, _ = self._extract_limits(test_data)
+
+            # Statistics of the measurements
+            y_data = test_data['Measurement_num']
+            stats = self._calculate_statistics(y_data)
+            total_points += stats['count']
+
+            # Summary plot (all SNs together)
+            summary_plot_image = self._create_summary_plot(test_data, test_name, lower_plot, upper_plot)
+
+            # One plot per SN
+            sn_plot_images = self._create_sn_plots(test_data, test_name, lower_plot, upper_plot)
+
+            # Status of the test
+            status_info = self._determine_test_status(stats, lower_plot, upper_plot)
+            status_counts[status_info["status"]] += 1
+
+            # Collect everything the template needs for this test
+            test_results.append({
+                "name": test_name,
+                "stats": stats,
+                "limits": {"lower": lower_plot, "upper": upper_plot},
+                "summary_plot_image": summary_plot_image,
+                "sn_plot_images": sn_plot_images,
+                "status": status_info["status"],
+                "status_display": status_info["status_display"]
+            })
+
+        # Render the HTML template
+        template = Template(HTML_TEMPLATE)
+        html_content = template.render(
+            keyword=keyword,
+            timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            test_count=len(test_results),
+            total_points=total_points,
+            tests=test_results,
+            file_path=self.file_path,
+            analysis_time=round(time.time() - start_time, 2),
+            status_counts={"normal": status_counts["success"], "warning": status_counts["warning"],
+                           "abnormal": status_counts["danger"]}
+        )
+
+        # Write the HTML file
+        with open(self.html_report_path, 'w', encoding='utf-8') as f:
+            f.write(html_content)
+
+        print(f"\nHTML报告已生成: {self.html_report_path}")
+        print(f"共处理 {len(test_results)} 个测试项,{total_points} 个数据点")
+
+    def run(self) -> None:
+        """Run the interactive workflow (file -> load -> keyword -> report) until the user quits or presses Ctrl+C."""
+        try:
+            self.get_file_path()
+            self.load_data()
+            while True:
+                filtered_df, keyword, unique_tests = self.get_keyword()
+                if filtered_df.empty:  # user quit / nothing to filter: stop, don't write an empty report
+                    break
+                self.create_output_dir(keyword)
+                self.generate_html_report(filtered_df, keyword, unique_tests)
+                print(f"\n✅ 分析完成!")
+        except KeyboardInterrupt:
+            print(f"\n{Fore.YELLOW}⚠ 用户中断程序")
+        except Exception as e:
+            print(f"\n❌ 发生错误: {type(e).__name__}: {str(e)}")
+            import traceback
+            traceback.print_exc()
+            sys.exit(1)
+
+
+if __name__ == "__main__":  # only when executed as a script, not on import
+    plotter = TestReportScatterPlotter()
+    plotter.run()  # interactive workflow defined on the class
diff --git a/htmlProcess/htmlReportProcess_Merge_pic_V1.py b/htmlProcess/htmlReportProcess_Merge_pic_V1.py
new file mode 100644
index 0000000..802b798
--- /dev/null
+++ b/htmlProcess/htmlReportProcess_Merge_pic_V1.py
@@ -0,0 +1,563 @@
+import os
+import re
+import sys
+import time
+import pandas as pd
+import matplotlib.pyplot as plt
+from datetime import datetime
+from matplotlib.lines import Line2D
+from typing import Optional, Tuple, List, Dict, Any, Union
+from pathlib import Path
+import numpy as np
+
+from colorama import Fore, Style, init
+
+# Silence SettingWithCopy warnings so they do not clutter the console output
+pd.options.mode.chained_assignment = None
+
+# Font fallbacks that can render Chinese text; keep the minus sign drawable with them
+plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'Microsoft YaHei']
+plt.rcParams['axes.unicode_minus'] = False
+
+
+class TestReportScatterPlotter:
+    def __init__(self):
+        self.file_path: Optional[str] = None  # resolved path of the input workbook, set by get_file_path
+        self.df: Optional[pd.DataFrame] = None  # sheet contents, set by load_data
+        self.output_dir: Optional[str] = None  # folder for the PNG files, set by create_output_dir
+        self.required_columns = ["Test Name New", "SN", "Measurement", "Test Time"]  # load_data fails without these
+        self.col_lower: Optional[str] = None  # name of the detected lower-limit column, if any
+        self.col_upper: Optional[str] = None  # name of the detected upper-limit column, if any
+
+        # Preprocessed per-test frames, filled and read by plot_scatter
+        self._processed_data_cache: Dict[str, Any] = {}
+
+    def _print_stage(self, msg: str) -> None:
+        """Print ``msg`` as a stage banner: a blank line, then the message between two 30-character ``=`` rules."""
+        print("\n{0}\n{1}\n{0}".format("=" * 30, msg))
+
+    def _print_progress(self, current: int, total: int, prefix: str = "进度") -> None:
+        """Redraw a 30-cell progress bar on the current console line; does nothing when ``total`` <= 0."""
+        if total <= 0:
+            return
+
+        width = 30
+        done_cells = int(width * current / total)
+        cells = "█" * done_cells + "-" * (width - done_cells)
+        sys.stdout.write(f"\r{prefix}: [{cells}] {current}/{total} ({(current / total) * 100:.1f}%)")
+        sys.stdout.flush()
+        # Finish the line once the last step is reported so later output starts on a fresh line.
+        if current == total:
+            print()
+
+    def get_file_path(self) -> None:
+        """Prompt until an existing file is entered, then store its resolved path in ``self.file_path``.
+
+        Quotes wrapped around the path (as added by drag-and-drop or "copy as path") are removed,
+        and directories are rejected because only a regular file can be loaded afterwards.
+        """
+        self._print_stage("输入文件路径")
+        while True:
+            print(f"{Fore.WHITE}请输入测试报告文件路径(.xlsx): ")
+            file_path = input("> ").strip().strip('"\'').strip()
+            if not file_path:
+                continue
+            path_obj = Path(file_path)
+            # is_file() instead of exists(): a directory exists too, but cannot be read as a workbook.
+            if path_obj.is_file():
+                self.file_path = str(path_obj.resolve())
+                print(f"已选择文件: {self.file_path}")
+                return
+            print(f"文件不存在: {file_path},请重新输入")
+
+    def _find_column_case_insensitive(self, candidates: List[str]) -> Optional[str]:
+        """Return the first column of ``self.df`` matching a candidate, ignoring case and outer whitespace.
+
+        Column labels are passed through ``str()`` first, so non-string headers (e.g. numeric) cannot raise.
+        Returns ``None`` when no frame is loaded or nothing matches.
+        """
+        if self.df is None:
+            return None
+        columns_lower = {str(col).lower().strip(): col for col in self.df.columns}
+        hits = (columns_lower[c.lower().strip()] for c in candidates if c.lower().strip() in columns_lower)
+        return next(hits, None)
+
+    def load_data(self) -> None:
+        """Read the report sheet of ``self.file_path`` into ``self.df`` and detect the limit columns.
+
+        The first of "Merged All Tests" / "All Tests" found in the workbook is loaded. The workbook is
+        opened once and closed again on every path, and ``.xls`` files are read with ``xlrd`` because
+        ``openpyxl`` only reads ``.xlsx``. Sets ``self.df``, ``self.col_lower`` and ``self.col_upper``.
+
+        Prints the row/column count, the elapsed time and the detected limit columns.
+
+        Raises:
+            FileNotFoundError: the file does not exist.
+            ValueError: wrong extension, no matching sheet, or the sheet is empty.
+            RuntimeError: the workbook or the sheet could not be read.
+            KeyError: a column listed in ``self.required_columns`` is missing.
+        """
+        self._print_stage("加载数据")
+        start_time = time.time()
+
+        # Validate the path before opening anything.
+        if not os.path.exists(self.file_path):
+            raise FileNotFoundError(f"文件不存在: {self.file_path}")
+
+        # Only Excel workbooks are accepted; the check is done on the lower-cased path.
+        lowered_path = self.file_path.lower()
+        if not lowered_path.endswith(('.xls', '.xlsx')):
+            raise ValueError("输入文件不是有效的 Excel 文件(应为 .xls 或 .xlsx 格式)")
+
+        # openpyxl only understands the XML-based .xlsx format; legacy .xls needs xlrd.
+        engine = 'openpyxl' if lowered_path.endswith('.xlsx') else 'xlrd'
+
+        try:
+            excel_file = pd.ExcelFile(self.file_path, engine=engine)
+        except Exception as e:
+            raise RuntimeError(
+                f"无法打开 Excel 文件,请确认该文件未被损坏或占用。错误: {type(e).__name__}: {e}"
+            ) from e
+
+        # The context manager closes the workbook on every path, including the raises below.
+        with excel_file:
+            sheet_names = excel_file.sheet_names
+
+            # Sheet names to look for, in priority order.
+            target_sheets = ["Merged All Tests", "All Tests"]
+            selected_sheet = next((s for s in target_sheets if s in sheet_names), None)
+            if selected_sheet is None:
+                raise ValueError(
+                    f"未找到指定的工作表: {' 或 '.join(target_sheets)}。"
+                    f"当前文件包含的工作表有: {sheet_names}"
+                )
+
+            try:
+                # Parse from the already-open workbook instead of opening the file a second time.
+                self.df = excel_file.parse(sheet_name=selected_sheet)
+            except Exception as e:
+                raise RuntimeError(
+                    f"读取 Excel 失败,工作表: '{selected_sheet}'。错误: {type(e).__name__}: {e}"
+                ) from e
+
+        # A sheet without data rows cannot be plotted.
+        if self.df.empty:
+            raise ValueError("工作表为空,无法处理")
+
+        # Every required column must be present.
+        missing_columns = [col for col in self.required_columns if col not in self.df.columns]
+        if missing_columns:
+            raise KeyError(f"缺少必要列: {missing_columns}")
+
+        # Remember which columns (if any) hold the limits; matching ignores case.
+        self.col_lower = self._find_column_case_insensitive([
+            "Lower Limit", "lower limit", "lower_limit", "ll", "lower"
+        ])
+        self.col_upper = self._find_column_case_insensitive([
+            "Upper Limit", "upper limit", "upper_limit", "ul", "upper"
+        ])
+
+        loading_time = time.time() - start_time
+        print(f"数据加载完成: {len(self.df)} 行 × {self.df.shape[1]} 列")
+        print(f"耗时: {loading_time:.2f}s")
+
+        # Summary of the detected limit columns.
+        print(f"检测到下限列: {self.col_lower or '无'}")
+        print(f"检测到上限列: {self.col_upper or '无'}")
+
+    def get_keyword(self) -> Tuple[pd.DataFrame, str, List[str]]:
+        """Ask for a keyword and return (matching rows, keyword, distinct test names); raise ValueError if none match."""
+        self._print_stage("筛选关键词")
+        keyword = ""
+        while not keyword:
+            keyword = input("请输入筛选关键词(匹配 'Test Name New'): ").strip()
+            if not keyword:
+                print("关键词不能为空,请重新输入")
+        use_regex = True
+        try:
+            re.compile(keyword)
+        except re.error:  # e.g. "Gain(": not a valid pattern, so match it as plain text instead of crashing
+            use_regex = False
+        mask = self.df["Test Name New"].astype(str).str.contains(keyword, case=False, na=False, regex=use_regex)
+        filtered_df = self.df.loc[mask].copy()
+        if filtered_df.empty:
+            raise ValueError(f"没有找到包含关键词 '{keyword}' 的测试项")
+        unique_tests = filtered_df["Test Name New"].unique().tolist()
+        print(f"匹配到 {len(filtered_df)} 行数据,涉及 {len(unique_tests)} 个不同测试项")
+        return filtered_df, keyword, unique_tests
+
+    def create_output_dir(self) -> None:
+        """Create a timestamped ``scatter_plots_<YYYYmmdd_HHMMSS>`` folder beside the input file.
+
+        The folder path is stored in ``self.output_dir``. Raises ValueError when no input file is set.
+        """
+        self._print_stage("创建输出目录")
+        if not self.file_path:
+            raise ValueError("文件路径未设置")
+        folder_name = "scatter_plots_" + datetime.now().strftime("%Y%m%d_%H%M%S")
+        parent = os.path.dirname(self.file_path)
+        self.output_dir = os.path.join(parent, folder_name)
+        os.makedirs(self.output_dir, exist_ok=True)
+        print(f"输出目录: {self.output_dir}")
+
+    @staticmethod
+    def _safe_filename(name: str) -> str:
+        """Keep only alphanumerics, spaces, ``_`` and ``-`` of ``name``; fall back to "Unknown_Test" if nothing is left."""
+        kept = [ch for ch in str(name) if ch.isalnum() or ch in " _-"]
+        return "".join(kept).strip() or "Unknown_Test"
+
+    def _extract_limits(self, df_one_test: pd.DataFrame) -> Tuple[
+            Optional[float], Optional[float], List[float], List[float]]:
+        """Collect the numeric limit values found in one test's rows.
+
+        Returns ``(lower_plot, upper_plot, lower_set, upper_set)``: the two sets are the sorted
+        distinct numeric values of the detected limit columns (empty when a column is absent),
+        ``lower_plot`` is the smallest lower limit and ``upper_plot`` the largest upper limit,
+        or ``None`` when the corresponding set is empty.
+        """
+
+        def distinct_sorted(column: Optional[str]) -> List[float]:
+            # No detected column, or the column is not in this frame -> nothing to report.
+            if not column or column not in df_one_test.columns:
+                return []
+            numeric = self._clean_and_convert_series(df_one_test[column], 'numeric')
+            return sorted(numeric.dropna().unique().tolist())
+
+        lower_set, upper_set = distinct_sorted(self.col_lower), distinct_sorted(self.col_upper)
+        return (min(lower_set) if lower_set else None, max(upper_set) if upper_set else None,
+                lower_set, upper_set)
+
+    @staticmethod
+    def _clean_and_convert_series(series: pd.Series, target_type: str = 'numeric') -> pd.Series:
+        """Convert ``series`` to the requested kind, returning it untouched when empty or the kind is unknown.
+
+        ``'numeric'``: numeric dtypes are cast to float; anything else is stringified, stripped of
+        commas and spaces, and parsed with ``pd.to_numeric`` (failures become NaN).
+        ``'datetime'``: delegated to ``_convert_to_datetime``.
+        """
+        if series.empty or target_type not in ('numeric', 'datetime'):
+            return series
+        if target_type == 'datetime':
+            return TestReportScatterPlotter._convert_to_datetime(series)
+
+        # Numeric path from here on.
+        if pd.api.types.is_numeric_dtype(series):
+            return series.astype(float)
+        as_text = series.astype(str)
+        without_separators = as_text.str.replace(r'[, ]', '', regex=True).str.strip()
+        return pd.to_numeric(without_separators, errors='coerce')
+
+    @staticmethod
+    def _convert_to_datetime(series: pd.Series) -> pd.Series:
+        """Convert a mixed column (epoch numbers, Excel serial days, date strings) to datetimes; failures stay NaT."""
+        if pd.api.types.is_datetime64_any_dtype(series):
+            return series
+
+        # Prepare both a numeric view and a stripped string view of the values
+        numeric_series = pd.to_numeric(series, errors='coerce')
+        string_series = series.astype(str).str.strip()
+
+        result = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns]')
+
+        # Numeric values are classified by magnitude
+        masks = {
+            'ms': numeric_series >= 1e11,  # converted below with unit='ms'
+            's': (numeric_series >= 1e9) & (numeric_series < 1e11),  # converted below with unit='s'
+            'excel': (numeric_series > 20000) & (numeric_series < 60000)  # converted below as days since 1899-12-30
+        }
+
+        for mask_type, mask in masks.items():
+            if mask.any():
+                if mask_type == 'ms':
+                    result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='ms')
+                elif mask_type == 's':
+                    result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='s')
+                elif mask_type == 'excel':
+                    origin = pd.Timestamp('1899-12-30')
+                    result.loc[mask] = origin + pd.to_timedelta(numeric_series.loc[mask], unit='D')
+
+        # String dates: only cells that are still unset
+        remaining_mask = result.isna()
+        if remaining_mask.any():
+            remaining_strings = string_series.loc[remaining_mask]
+
+            # Explicit formats are tried first
+            format_patterns = [
+                (r'^\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}$', '%Y-%m-%d %H-%M-%S'),
+            ]
+
+            for pattern, date_format in format_patterns:
+                format_mask = remaining_strings.str.match(pattern)
+                if format_mask.any():
+                    result.loc[remaining_mask[remaining_mask].index[format_mask]] = pd.to_datetime(
+                        remaining_strings.loc[format_mask], format=date_format, errors='coerce'
+                    )
+
+            # Generic parsing for whatever is still unset
+            still_na_mask = result.isna() & remaining_mask
+            if still_na_mask.any():
+                result.loc[still_na_mask] = pd.to_datetime(
+                    string_series.loc[still_na_mask], errors='coerce'
+                )
+
+        return result
+
+    def _preprocess_test_data(self, test_data: pd.DataFrame) -> pd.DataFrame:
+        """Add parsed ``Measurement_num`` / ``TestTime_dt`` columns and return the usable rows by time.
+
+        The two helper columns are written into ``test_data`` itself; rows where either parsed
+        value is missing are dropped from the returned frame, which is sorted by ``TestTime_dt``.
+        """
+        convert = self._clean_and_convert_series
+        measurement = convert(test_data['Measurement'], 'numeric')
+        test_data['Measurement_num'] = measurement
+        test_time = convert(test_data['Test Time'], 'datetime')
+        test_data['TestTime_dt'] = test_time
+        parsed_columns = ['Measurement_num', 'TestTime_dt']
+        return test_data.dropna(subset=parsed_columns).sort_values('TestTime_dt')
+
+    def _calculate_statistics(self, y_data: pd.Series) -> Dict[str, float]:
+        """Summarise ``y_data``: count, mean, median, min, max, std and the 25% / 75% quantiles (``q1`` / ``q3``)."""
+        q1, q3 = (y_data.quantile(q) for q in (0.25, 0.75))
+        return dict(
+            count=len(y_data),
+            mean=y_data.mean(),
+            median=y_data.median(),
+            min=y_data.min(),
+            max=y_data.max(),
+            std=y_data.std(),
+            q1=q1,
+            q3=q3,
+        )
+
+    def _add_statistics_textbox(self, ax, stats: Dict[str, float],
+                                x_pos: float = 1.02, y_pos: float = 0.98) -> None:
+        """Write the statistics as a text box on ``ax`` at axes coordinates ``(x_pos, y_pos)``."""
+        # Count is printed as-is; every other value is shown with four decimals. Labels stay in English.
+        float_rows = (
+            ("Mean", 'mean'), ("Median", 'median'), ("Min", 'min'), ("Max", 'max'),
+            ("Std", 'std'), ("Q1", 'q1'), ("Q3", 'q3'),
+        )
+        lines = [f"Count: {stats['count']}"]
+        lines.extend(f"{label}: {stats[key]:.4f}" for label, key in float_rows)
+        stats_text = "\n".join(lines)
+
+        # Left-aligned monospace text inside a rounded, semi-transparent box.
+        box_style = {'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.8}
+        ax.text(
+            x_pos, y_pos, stats_text,
+            transform=ax.transAxes, fontsize=8,
+            verticalalignment='top', horizontalalignment='left',
+            bbox=box_style, fontfamily='monospace',
+        )
+
+    def _add_statistics_lines(self, ax, stats: Dict[str, float],
+                              x_min: float, x_max: float) -> None:
+        """Draw labelled horizontal lines for mean, median, Q1 and Q3 between ``x_min`` and ``x_max``."""
+        # (stats key, colour, line style, width, legend label) in drawing order
+        line_specs = (
+            ('mean', 'orange', '-', 1.5, 'Mean'),
+            ('median', 'purple', '-.', 1.5, 'Median'),
+            ('q1', 'gray', ':', 1.0, 'Q1'),
+            ('q3', 'gray', ':', 1.0, 'Q3'),
+        )
+
+        for key, colour, style, width, label in line_specs:
+            ax.hlines(
+                y=stats[key], xmin=x_min, xmax=x_max,
+                colors=colour, linestyles=style, linewidth=width, alpha=0.7, label=label,
+            )
+
+    def _configure_plot(self, ax, test_data: pd.DataFrame, test_name: str,
+                        lower_plot: Optional[float], upper_plot: Optional[float]) -> None:
+        """Add limit and statistics lines, y-range, text box, labels and legend to an already drawn scatter plot."""
+        # Statistics of the measurements
+        y_data = test_data['Measurement_num']
+        stats = self._calculate_statistics(y_data)
+
+        # Time span used as the extent of the statistics lines
+        x_min = test_data['TestTime_dt'].min()
+        x_max = test_data['TestTime_dt'].max()
+
+        # Candidates for the y-axis range: data extremes plus any limits
+        y_min, y_max = y_data.min(), y_data.max()
+        y_candidates = [y_min, y_max]
+
+        # Limit lines; their legend entries are built by hand because axhline is called without a label
+        custom_lines = []
+        if lower_plot is not None:
+            y_candidates.append(lower_plot)
+            ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2)
+            custom_lines.append(Line2D([0], [0], color='green', linestyle='--', label="Lower Limit"))
+
+        if upper_plot is not None:
+            y_candidates.append(upper_plot)
+            ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2)
+            custom_lines.append(Line2D([0], [0], color='red', linestyle='--', label="Upper Limit"))
+
+        # Mean / median / quartile lines
+        self._add_statistics_lines(ax, stats, x_min, x_max)
+
+        # Y range with 10% padding. NOTE(review): nesting reconstructed - the two re-centering lines are assumed to belong to the zero-range branch
+        valid_candidates = [y for y in y_candidates if pd.notna(y)]
+        if valid_candidates:
+            y_min_plot = min(valid_candidates)
+            y_max_plot = max(valid_candidates)
+            y_range = y_max_plot - y_min_plot
+            if y_range == 0:
+                y_range = abs(y_max_plot) * 0.1 if y_max_plot != 0 else 1.0
+                y_min_plot = y_min_plot - y_range / 2
+                y_max_plot = y_max_plot + y_range / 2
+            ax.set_ylim(y_min_plot - 0.1 * y_range, y_max_plot + 0.1 * y_range)
+
+        # Statistics text box on the right-hand side
+        self._add_statistics_textbox(ax, stats)
+
+        # Title and axis labels are kept in English to avoid font problems with Chinese text
+        ax.set_title(f"Scatter Plot - {test_name}\n"
+                     f"Mean: {stats['mean']:.4f}, Median: {stats['median']:.4f}, "
+                     f"Range: [{stats['min']:.4f}, {stats['max']:.4f}]",
+                     fontsize=10)
+        ax.set_xlabel("Test Time")
+        ax.set_ylabel("Measurement Value")
+        ax.grid(True, alpha=0.3)
+        ax.tick_params(axis='x', rotation=45)
+
+        # Legend: labelled artists plus the hand-made limit entries, placed to the right of the axes
+        handles, labels = ax.get_legend_handles_labels()
+        if custom_lines:
+            handles.extend(custom_lines)
+            labels.extend([line.get_label() for line in custom_lines])
+
+        if handles:
+            # Layout depends on the number of legend entries
+            if len(handles) > 10:  # many entries: two columns
+                ncol = 2
+                # Anchored outside the axes so the legend does not cover data
+                ax.legend(handles=handles, labels=labels, title="Legend",
+                          fontsize=8, loc='center left', bbox_to_anchor=(1.05, 0.5),
+                          ncol=ncol, frameon=True, fancybox=True, shadow=True)
+            else:
+                # Few entries: single column
+                ax.legend(handles=handles, labels=labels, title="Legend",
+                          fontsize=8, loc='center left', bbox_to_anchor=(1.05, 0.5),
+                          frameon=True, fancybox=True, shadow=True)
+
+    def _save_plot(self, fig, test_name: str) -> None:
+        """Save ``fig`` as ``<safe test name>_<timestamp>.png`` (300 dpi) in ``self.output_dir``; always closes it."""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = f"{self._safe_filename(test_name)}_{timestamp}.png"
+        output_path = os.path.join(self.output_dir, filename)
+        try:
+            # bbox_inches='tight' keeps the legend outside the axes inside the saved image.
+            fig.savefig(output_path, dpi=300, bbox_inches='tight')
+        finally:
+            plt.close(fig)  # release the figure even when saving fails
+        print(f"已保存: {output_path}")
+
+    def plot_scatter(self, filtered_df: pd.DataFrame, unique_tests: List[str]) -> None:
+        """Draw and save one scatter plot (measurement over test time, one series per SN) for each test name."""
+        self._print_stage("生成散点图")
+        total_tests = len(unique_tests)
+        start_time = time.time()
+
+        for i, test_name in enumerate(unique_tests, 1):
+            self._print_progress(i, total_tests, "测试项绘图")
+
+            # Reuse an already preprocessed frame; the key depends on the test name only, not on filtered_df
+            cache_key = f"test_{hash(test_name)}"
+            if cache_key in self._processed_data_cache:
+                test_data = self._processed_data_cache[cache_key]
+            else:
+                test_data = filtered_df[filtered_df["Test Name New"] == test_name].copy()
+                # Parse measurement and time columns
+                test_data = self._preprocess_test_data(test_data)
+                self._processed_data_cache[cache_key] = test_data
+
+            if test_data.empty:
+                print(f"\n跳过 '{test_name}' - 无有效的 Measurement/Test Time 数据")
+                continue
+
+            # Limits of this test
+            lower_plot, upper_plot, lower_set, upper_set = self._extract_limits(test_data)
+
+            # Console summary of the limits
+            limit_info = []
+            if lower_set:
+                limit_info.append(f"Lower unique={len(lower_set)}, used={lower_plot}")
+            else:
+                limit_info.append("Lower N/A")
+            if upper_set:
+                limit_info.append(f"Upper unique={len(upper_set)}, used={upper_plot}")
+            else:
+                limit_info.append("Upper N/A")
+
+            # Console summary of the statistics
+            y_data = test_data['Measurement_num']
+            stats = self._calculate_statistics(y_data)
+            stat_info = (
+                f"数据点: {stats['count']}, "
+                f"均值: {stats['mean']:.4f}, "
+                f"中位数: {stats['median']:.4f}, "
+                f"范围: [{stats['min']:.4f}, {stats['max']:.4f}]"
+            )
+
+            print(f"\n→ 绘制: '{test_name}' | {stat_info} | 限值: {', '.join(limit_info)}")
+
+            # Figure sized generously so the legend and statistics box fit
+            sn_count = len(test_data["SN"].unique()) if "SN" in test_data.columns else 1
+
+            # Base size; widened below when there are many SNs
+            base_width = 14  # extra width for the statistics box
+            base_height = 9  # extra height for the additional information
+
+            # More than five SNs: widen the figure for the legend
+            if sn_count > 5:
+                fig_width = base_width + min(sn_count / 5, 6)  # at most 6 extra units of width
+            else:
+                fig_width = base_width
+
+            fig, ax = plt.subplots(figsize=(fig_width, base_height))
+
+            # One scatter series per SN (a single "Unknown_SN" series when there is no SN column)
+            groups = list(test_data.groupby("SN")) if "SN" in test_data.columns else [("Unknown_SN", test_data)]
+
+            for j, (sn, group) in enumerate(groups, 1):
+                ax.scatter(group['TestTime_dt'], group['Measurement_num'],
+                           label=str(sn), alpha=0.7, s=25)
+                if j % 10 == 0 or j == len(groups):
+                    self._print_progress(j, len(groups), "SN分组绘制")
+
+            # Lines, ranges, labels and legend
+            self._configure_plot(ax, test_data, test_name, lower_plot, upper_plot)
+
+            # Leave room on the right for the statistics box and the legend
+            plt.tight_layout()
+            plt.subplots_adjust(right=0.8 if sn_count <= 10 else 0.7)  # more room when there are over ten SNs
+
+            # Write the image (this also closes the figure)
+            self._save_plot(fig, test_name)
+
+        total_time = time.time() - start_time
+        print(f"\n全部绘图完成,总耗时: {total_time:.2f}s")
+        print(f"所有图表已保存到: {self.output_dir}")
+
+    def run(self) -> None:
+        """Run the workflow once: choose file, load, filter by keyword, create the output folder, plot."""
+        try:
+            self.get_file_path()
+            self.load_data()
+            selection, _keyword, test_names = self.get_keyword()
+            self.create_output_dir()
+            self.plot_scatter(selection, test_names)
+        except Exception as error:
+            # Top-level boundary: report the failure with its traceback, then exit with status 1.
+            print(f"\n❌ 发生错误: {type(error).__name__}: {str(error)}")
+            import traceback
+            traceback.print_exc()
+            sys.exit(1)
+
+
+if __name__ == "__main__":  # only when executed as a script, not on import
+    plotter = TestReportScatterPlotter()
+    plotter.run()  # interactive workflow defined on the class
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..9c7b199
--- /dev/null
+++ b/main.py
@@ -0,0 +1,251 @@
+import pandas as pd
+import os
+import glob
+import re
+from datetime import datetime
+import tkinter as tk
+from tkinter import filedialog
+from collections import defaultdict
+
+
+class BOMConsolidator:
+ def __init__(self):
+ self.master_data = defaultdict(dict)
+ self.required_columns = ['Partnumber', 'Purchase_Code', 'MF_PN', 'Description',
+ 'Part_Type', 'MF_NAME', 'PCB_Footprint', 'Quantity', 'Reference']
+ self.file_quantities = {}
+ self.consolidated_report = None
+ self.inconsistency_count = 0
+ self.processed_files = 0
+ self.processed_rows = 0
+ self.output_folder = ""
+
+ def find_valid_sheet(self, file_path):
+ """定位包含有效BOM的Sheet"""
+ xl = pd.ExcelFile(file_path)
+ for sheet_name in xl.sheet_names:
+ df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
+ for i in range(len(df)):
+ headers = df.iloc[i].values
+ if all(col in headers for col in ['Item', 'Partnumber', 'Purchase_Code', 'MF_PN']):
+ return sheet_name, i
+ return None, None
+
+ def clean_column_names(self, df):
+ """清洗列名并标准化"""
+ df.columns = df.columns.str.strip().str.replace(r'\s+', '_', regex=True)
+ df.columns = df.columns.str.replace(r'[^a-zA-Z0-9_]', '', regex=True)
+ return df
+
+ def process_file(self, file_path):
+ """处理单个BOM文件"""
+ filename = os.path.basename(file_path)
+ print(f"处理文件: {filename}...")
+
+ sheet_name, header_row = self.find_valid_sheet(file_path)
+ if not sheet_name:
+ print(f" ! 未找到有效BOM表: {filename}")
+ return False
+
+ df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
+ df = self.clean_column_names(df)
+
+ # 验证必要字段
+ missing_cols = [col for col in self.required_columns if col not in df.columns]
+ if missing_cols:
+ print(f" ! 缺少必要列: {', '.join(missing_cols)}")
+ return False
+
+ print(f" √ 找到有效Sheet: {sheet_name} (共{len(df)}行)")
+ self.file_quantities[filename] = {}
+ self.processed_files += 1
+
+ # 处理每行数据
+ for _, row in df.iterrows():
+ self.process_row(row, filename)
+ self.processed_rows += 1
+
+ return True
+
+ def process_row(self, row, filename):
+ """处理单行数据"""
+ # 确定合并主键
+ key = row['Partnumber'] if pd.notna(row['Partnumber']) and row['Partnumber'] != '' else row['MF_PN']
+ if pd.isna(key) or key == '':
+ return
+
+ # 首次记录该物料
+ if key not in self.master_data:
+ self.master_data[key] = {
+ 'Partnumber': row['Partnumber'],
+ 'Purchase_Code': row['Purchase_Code'],
+ 'MF_PN': row['MF_PN'],
+ 'Description': row.get('Description', ''),
+ 'Part_Type': row.get('Part_Type', ''),
+ 'MF_NAME': row.get('MF_NAME', ''),
+ 'PCB_Footprint': row.get('PCB_Footprint', ''),
+ 'quantity_data': {}, # 存储每个文件的数量
+ 'inconsistencies': [] # 存储不一致信息
+ }
+
+ # 检查字段一致性
+ current_data = self.master_data[key]
+ fields_to_check = ['Purchase_Code', 'MF_PN', 'Part_Type', 'MF_NAME', 'PCB_Footprint']
+
+ for field in fields_to_check:
+ # 处理字段名称差异
+ db_field = 'Part Type' if field == 'Part_Type' else field
+
+ current_val = str(current_data[field])
+ new_val = str(row.get(db_field, ''))
+
+ # 忽略空值和'nan'字符串
+ if new_val in ['', 'nan', 'NaN', 'NaT']:
+ continue
+
+ # 比较当前值和新值
+ if current_val != new_val:
+ current_data['inconsistencies'].append(
+ f"{field}不一致: {current_val} ≠ {new_val} (文件: {filename})"
+ )
+
+ # 检查Reference数量和Quantity是否匹配
+ ref_count = 0
+ if pd.notna(row['Reference']) and row['Reference'] != '':
+ ref_list = str(row['Reference']).split(',')
+ ref_count = len([ref for ref in ref_list if ref.strip() != ''])
+
+ try:
+ quantity = int(row['Quantity'])
+ if ref_count != quantity:
+ current_data['inconsistencies'].append(
+ f"Reference数量不符: {ref_count}个位置 ≠ Quantity={quantity} (文件: {filename})"
+ )
+ except (ValueError, TypeError):
+ pass
+
+ # 记录当前文件的数量
+ try:
+ qty_val = int(row['Quantity'])
+ self.file_quantities[filename][key] = qty_val
+ current_data['quantity_data'][filename] = qty_val
+ except (ValueError, TypeError):
+ self.file_quantities[filename][key] = 0
+ current_data['quantity_data'][filename] = 0
+
+ # 更新不一致计数
+ if current_data['inconsistencies']:
+ self.inconsistency_count += 1
+
+ def generate_report(self):
+ """生成合并报告"""
+ if not self.master_data:
+ print("无有效数据可生成报告")
+ return None
+
+ print(f"\n生成合并报告,共{len(self.master_data)}种物料...")
+
+ # 准备报告数据结构
+ report_data = []
+ file_columns = sorted(self.file_quantities.keys())
+
+ for key, data in self.master_data.items():
+ row = {
+ 'Partnumber': data['Partnumber'],
+ 'Purchase_Code': data['Purchase_Code'],
+ 'MF_PN': data['MF_PN'],
+ 'Description': data['Description'],
+ 'Part Type': data['Part_Type'],
+ 'MF_NAME': data['MF_NAME'],
+ 'PCB_Footprint': data['PCB_Footprint'],
+ '检查信息': '; '.join(data['inconsistencies'])
+ }
+
+ # 添加各文件数量
+ total = 0
+ for file in file_columns:
+ qty = data['quantity_data'].get(file, 0)
+ row[file] = qty
+ total += qty
+ row['合计'] = total
+
+ report_data.append(row)
+
+ # 创建DataFrame
+ self.consolidated_report = pd.DataFrame(report_data)
+
+ # 生成带时间戳的文件名
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ output_path = os.path.join(self.output_folder, f"BOM合并报告_{timestamp}.xlsx")
+
+ # 保存报告
+ self.consolidated_report.to_excel(output_path, index=False)
+
+ # 返回统计信息和路径
+ stats = {
+ 'output_path': output_path,
+ 'file_count': self.processed_files,
+ 'material_count': len(self.master_data),
+ 'inconsistency_count': self.inconsistency_count,
+ 'processed_rows': self.processed_rows
+ }
+
+ return stats
+
+
+def select_folder():
+ """弹出文件夹选择对话框"""
+ root = tk.Tk()
+ root.withdraw()
+ folder_selected = filedialog.askdirectory(title='选择BOM文件所在文件夹')
+ return folder_selected
+
+
+def main():
+ # 初始化合并器
+ bom_processor = BOMConsolidator()
+
+ # 选择文件夹
+ folder_path = select_folder()
+ if not folder_path:
+ print("未选择文件夹,程序退出")
+ return
+
+ bom_processor.output_folder = folder_path
+
+ # 获取所有Excel文件
+ bom_files = glob.glob(os.path.join(folder_path, "*.xlsx"))
+ if not bom_files:
+ print("文件夹中没有Excel文件")
+ return
+
+ print(f"找到 {len(bom_files)} 个Excel文件,开始处理...")
+
+ # 处理文件
+ processed_count = 0
+ for file_path in bom_files:
+ success = bom_processor.process_file(file_path)
+ if success:
+ processed_count += 1
+
+ # 生成报告
+ if bom_processor.master_data:
+ stats = bom_processor.generate_report()
+
+ # 打印汇总信息
+ print("\n" + "=" * 40)
+ print("BOM合并完成! 汇总信息:")
+ print(f"处理文件夹: {folder_path}")
+ print(f"扫描文件数: {len(bom_files)}")
+ print(f"成功处理文件数: {processed_count}")
+ print(f"处理行数: {stats['processed_rows']}")
+ print(f"合并物料种类数: {stats['material_count']}")
+ print(f"检测到不一致条目数: {stats['inconsistency_count']}")
+ print(f"报告已保存至: {stats['output_path']}")
+ print("=" * 40)
+ else:
+ print("没有有效数据生成报告")
+
+
+# Run the consolidation workflow only when executed as a script, not on import.
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/tempReportProcess/.gitignore b/tempReportProcess/.gitignore
new file mode 100644
index 0000000..e9afaff
--- /dev/null
+++ b/tempReportProcess/.gitignore
@@ -0,0 +1,9 @@
+/build/*
+/build
+/dist/*
+/dist
+/source/*
+/source
+
+
+tempReportProcess_V2.py
\ No newline at end of file
diff --git a/tempReportProcess/tempReportProcess_V1.py b/tempReportProcess/tempReportProcess_V1.py
new file mode 100644
index 0000000..dde4e13
--- /dev/null
+++ b/tempReportProcess/tempReportProcess_V1.py
@@ -0,0 +1,248 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+from datetime import datetime
+import tkinter as tk
+from tkinter import filedialog
+import os
+import matplotlib.dates as mdates
+from jinja2 import Template
+from matplotlib import font_manager, rcParams
+
+
+class TemperatureDataAnalyzer:
+ def __init__(self):
+ self.data = None
+ self.file_path = None
+ self.timestamps = []
+ self.temperatures = []
+ self.statuses = []
+ self._configure_chinese_font() # 配置中文字体,修复中文字符缺失警告
+
+ def _configure_chinese_font(self):
+ """
+ 配置 Matplotlib 中文字体,避免中文字符缺失的警告。
+ 会尝试常见的中文字体并设置 axes.unicode_minus 为 False。
+ """
+ try:
+ # 常见中文字体候选(跨平台)
+ candidates = [
+ "Microsoft YaHei", "Microsoft YaHei UI", # Windows
+ "SimHei", "SimSun", # Windows(黑体/宋体)
+ "PingFang SC", "Heiti SC", # macOS
+ "Noto Sans CJK SC", "Source Han Sans SC", "WenQuanYi Micro Hei", # Linux
+ "Arial Unicode MS" # 覆盖广的 Unicode 字体
+ ]
+ available = {f.name for f in font_manager.fontManager.ttflist}
+ for name in candidates:
+ if name in available:
+ rcParams["font.sans-serif"] = [name]
+ rcParams["axes.unicode_minus"] = False
+ # 可选:打印使用的字体名称
+ # print(f"使用中文字体: {name}")
+ return
+ # 如果没有找到常见中文字体,给出提示
+ rcParams["axes.unicode_minus"] = False
+ print("未检测到常见中文字体,图中中文可能无法正常显示。建议安装 'Noto Sans CJK SC' 或 'Microsoft YaHei'。")
+ except Exception as e:
+ print(f"中文字体配置失败: {e}")
+
+ def select_file(self):
+ """手动选择CSV文件"""
+ root = tk.Tk()
+ root.withdraw() # 隐藏主窗口
+
+ file_types = [("CSV files", "*.csv"), ("All files", "*.*")]
+ self.file_path = filedialog.askopenfilename(title="选择温度数据CSV文件", filetypes=file_types)
+
+ if not self.file_path:
+ print("未选择文件,程序退出")
+ return False
+ return True
+
+ def load_and_process_data(self):
+ """加载和处理数据"""
+ try:
+ # 读取CSV文件,无表头
+ self.data = pd.read_csv(self.file_path, header=None)
+
+ # 重命名列以便于引用
+ self.data.columns = ['timestamp', 'temperature', 'status']
+
+ # 转换时间戳格式(文本例如:10/29/2025 2:20:41 PM)
+ self.data['datetime'] = pd.to_datetime(self.data['timestamp'], format='%m/%d/%Y %I:%M:%S %p')
+
+ # 提取处理后的数据
+ self.timestamps = self.data['datetime']
+ self.temperatures = self.data['temperature']
+ self.statuses = self.data['status']
+
+ print(f"成功加载 {len(self.data)} 条记录")
+ return True
+
+ except Exception as e:
+ print(f"数据处理错误: {e}")
+ return False
+
+ def create_scatter_plots(self):
+ """创建散点图"""
+ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
+
+ # 温度散点图
+ sc1 = ax1.scatter(self.timestamps, self.temperatures, c=self.temperatures,
+ cmap='coolwarm', alpha=0.7, s=20)
+ ax1.set_title('温度随时间变化趋势')
+ ax1.set_ylabel('温度 (°C)')
+ ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
+ ax1.grid(True, linestyle='--', alpha=0.7)
+ ax1.tick_params(axis='x', rotation=45)
+ plt.colorbar(sc1, ax=ax1, label="温度(°C)")
+
+ # 状态散点图
+ sc2 = ax2.scatter(self.timestamps, self.statuses, c=self.statuses,
+ cmap='viridis', alpha=0.7, s=20)
+ ax2.set_title('状态随时间变化')
+ ax2.set_xlabel('时间')
+ ax2.set_ylabel('状态值')
+ ax2.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
+ ax2.grid(True, linestyle='--', alpha=0.7)
+ ax2.tick_params(axis='x', rotation=45)
+ plt.colorbar(sc2, ax=ax2, label="状态值")
+
+ plt.tight_layout()
+ return fig
+
+ def generate_statistics_report(self):
+ """生成统计报告"""
+ stats = {
+ 'total_records': len(self.temperatures),
+ 'avg_temperature': round(self.temperatures.mean(), 2),
+ 'max_temperature': round(self.temperatures.max(), 2),
+ 'min_temperature': round(self.temperatures.min(), 2),
+ 'std_deviation': round(self.temperatures.std(), 2),
+ 'temp_range': round(self.temperatures.max() - self.temperatures.min(), 2),
+ 'start_time': self.timestamps.iloc[0].strftime('%Y-%m-%d %H:%M:%S'),
+ 'end_time': self.timestamps.iloc[-1].strftime('%Y-%m-%d %H:%M:%S'),
+ 'duration_hours': round((self.timestamps.iloc[-1] - self.timestamps.iloc[0]).total_seconds() / 3600, 2)
+ }
+
+ # 状态分布统计
+ status_counts = self.statuses.value_counts().to_dict()
+ stats['status_distribution'] = status_counts
+
+ return stats
+
+ def save_fig_to_html(self, fig, output_path):
+ """将图形保存为HTML"""
+ import io
+ import base64
+
+ # 将图形转换为base64编码
+ buf = io.BytesIO()
+ fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+ buf.seek(0)
+ img_str = base64.b64encode(buf.read()).decode('utf-8')
+ buf.close()
+
+ # HTML模板(修复了多余的 '}')
+ html_template = """
+
+
+
+
+ 温度数据分析报告
+
+
+
+
+
+
+
数据概览
+
+ | 项目 | 数值 |
+ {% for key, value in statistics.items() %}
+ {% if key != 'status_distribution' %}
+ | {{ key.replace('_', ' ').title() }} | {{ value }} |
+ {% endif %}
+ {% endfor %}
+
+
+
+
+
状态分布
+
+ | 状态值 | 出现次数 |
+ {% for status, count in statistics.status_distribution.items() %}
+ | {{ status }} | {{ count }} |
+ {% endfor %}
+
+
+
+
+
温度与状态时序图
+
+

+
+
+
+
+
+ """
+
+ template = Template(html_template)
+ rendered_html = template.render(
+ file_name=self.file_path,
+ generation_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+ statistics=self.generate_statistics_report(),
+ image_data=img_str
+ )
+
+ with open(output_path, 'w', encoding='utf-8') as f:
+ f.write(rendered_html)
+
+ def run_analysis(self):
+ """运行完整分析流程"""
+ if not self.select_file():
+ return
+
+ if not self.load_and_process_data():
+ return
+
+ # 创建图形
+ fig = self.create_scatter_plots()
+
+ # 生成输出文件名(保存到选择的文件所在文件夹)
+ base_filename = os.path.splitext(os.path.basename(self.file_path))[0]
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ output_filename = f"{base_filename}_{timestamp}.html"
+ output_dir = os.path.dirname(self.file_path)
+ output_path = os.path.join(output_dir, output_filename)
+
+ # 保存HTML报告到同一文件夹
+ self.save_fig_to_html(fig, output_path)
+
+ print(f"分析完成!报告已保存至: {output_path}")
+
+ # 显示统计摘要
+ stats = self.generate_statistics_report()
+ print("\n=== 数据统计摘要 ===")
+ for key, value in stats.items():
+ if key != 'status_distribution':
+ print(f"{key.replace('_', ' ').title()}: {value}")
+
+
+if __name__ == "__main__":
+ analyzer = TemperatureDataAnalyzer()
+ analyzer.run_analysis()
\ No newline at end of file