Initialize Python script development files

2026-02-02 15:19:30 +08:00
parent 86c4718368
commit 5c846eae94
25 changed files with 8746 additions and 0 deletions

7
.gitignore vendored Normal file

@@ -0,0 +1,7 @@
/.idea/*
.idea/*
/.idea
.idea
/.venv
/.venv/*

16
BOMCompare/.gitignore vendored Normal file

@@ -0,0 +1,16 @@
/build/*
/build
/dist/*
/dist
/source/*
/source
BOMCompare for Merge V2.py
BOMCompareForJP2.py
BOMConsolidator.py
BOMConsolidatorV2.py
# BOMConsolidator.py


@@ -0,0 +1,655 @@
import pandas as pd
import tkinter as tk
from tkinter import filedialog
from datetime import datetime
import os
from typing import Dict, List


class BOMComparator:
    """BOM file difference comparator."""

    def __init__(self):
        self.file1_path = ""
        self.file2_path = ""
        self.file1_sheets = []
        self.file2_sheets = []
        self.common_sheets = []
        self.differences = {}
        self.file1_name = ""
        self.file2_name = ""
        # Columns excluded from the comparison; kept in Chinese because they
        # must match the check-info column headers in the source Excel files
        self.columns_to_exclude = ['检查信息', '检查状态', '校验信息']

    def select_file(self, title: str) -> str:
        """Pick a file via a dialog."""
        root = tk.Tk()
        root.withdraw()
        file_path = filedialog.askopenfilename(
            title=title,
            filetypes=[("Excel files", "*.xlsx"), ("All files", "*.*")]
        )
        root.destroy()
        return file_path

    def find_valid_sheets(self, file_path: str) -> List[str]:
        """Find sheets containing valid BOM data (following the layout of attachment 3)."""
        valid_sheets = []
        try:
            xl_file = pd.ExcelFile(file_path)
            for sheet_name in xl_file.sheet_names:
                try:
                    # Read a preview of the sheet to check whether it holds BOM data
                    df = pd.read_excel(file_path, sheet_name=sheet_name, nrows=10)
                    # Check for BOM-related columns (per the reference layout)
                    required_columns = ['Partnumber', 'Purchase_Code', 'MF_PN', 'Description']
                    found_columns = [col for col in df.columns if col in required_columns]
                    if len(found_columns) >= 2:  # at least two key columns present
                        # Make sure there is actual data, not just a header row
                        if len(df) > 1:
                            valid_sheets.append(sheet_name)
                except Exception:
                    continue
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
        return valid_sheets

    def get_common_sheets(self) -> List[str]:
        """Get the sheets the two files have in common."""
        if not self.file1_sheets or not self.file2_sheets:
            return []
        # Match sheets on their normalized names (case/space/dash-insensitive)
        common_sheets = []
        for sheet1 in self.file1_sheets:
            clean_sheet1 = self.standardize_sheet_name(sheet1)
            for sheet2 in self.file2_sheets:
                if clean_sheet1 == self.standardize_sheet_name(sheet2):
                    common_sheets.append(sheet1)
                    break
        return common_sheets

    def standardize_sheet_name(self, sheet_name: str) -> str:
        """Normalize a sheet name for comparison."""
        return str(sheet_name).strip().lower().replace(' ', '_').replace('-', '_')

    def load_bom_data(self, file_path: str, sheet_name: str) -> pd.DataFrame:
        """Load BOM data from one sheet."""
        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
            # Drop fully empty rows and columns
            df = df.dropna(how='all').dropna(axis=1, how='all')
            # Clean up the column names
            df.columns = df.columns.str.strip()
            return df
        except Exception as e:
            print(f"Error loading sheet {sheet_name}: {e}")
            return pd.DataFrame()

    def should_compare_column(self, column_name: str) -> bool:
        """Decide whether a column takes part in the comparison (check-info columns are skipped)."""
        # Keywords flagging check/validation/status/remark columns; the Chinese
        # terms match the headers used in the source files
        exclude_keywords = ['检查', '校验', '状态', '备注', 'comment', 'check']
        column_lower = str(column_name).lower()
        # Explicit exclusion list
        if column_name in self.columns_to_exclude:
            return False
        # Keyword-based exclusion
        for keyword in exclude_keywords:
            if keyword in column_lower:
                return False
        return True

    def get_columns_to_compare(self, df1: pd.DataFrame, df2: pd.DataFrame) -> List[str]:
        """Get the column names to compare (check-info columns excluded)."""
        common_columns = list(set(df1.columns).intersection(set(df2.columns)))
        # Filter out the columns that should not be compared
        return [col for col in common_columns if self.should_compare_column(col)]

    def compare_dataframes(self, df1: pd.DataFrame, df2: pd.DataFrame, sheet_name1: str, sheet_name2: str) -> Dict:
        """Compare two DataFrames, excluding check-info columns."""
        differences = {
            'sheet_names': f"{sheet_name1} vs {sheet_name2}",
            'added_rows': [],
            'removed_rows': [],
            'modified_rows': [],
            'columns_comparison': {},
            'summary': {
                'total_rows_df1': len(df1),
                'total_rows_df2': len(df2),
                'added_count': 0,
                'removed_count': 0,
                'modified_count': 0
            },
            'original_dfs': {
                'df1': df1.copy(),
                'df2': df2.copy()
            }
        }
        # Determine the key columns used to match rows
        key_columns = self.identify_key_columns(df1, df2)
        if not key_columns:
            differences['error'] = "Could not determine key columns for the comparison"
            return differences
        try:
            # Index both frames on the key columns
            df1_indexed = df1.set_index(key_columns)
            df2_indexed = df2.set_index(key_columns)
            # Columns to compare (check-info columns excluded)
            columns_to_compare = self.get_columns_to_compare(df1, df2)
            # Rows added in file 2
            new_indexes = df2_indexed.index.difference(df1_indexed.index)
            if len(new_indexes) > 0:
                differences['added_rows'] = df2_indexed.loc[new_indexes].reset_index().to_dict('records')
                differences['summary']['added_count'] = len(new_indexes)
            # Rows removed from file 1
            removed_indexes = df1_indexed.index.difference(df2_indexed.index)
            if len(removed_indexes) > 0:
                differences['removed_rows'] = df1_indexed.loc[removed_indexes].reset_index().to_dict('records')
                differences['summary']['removed_count'] = len(removed_indexes)
            # Compare the shared rows column by column (check-info columns excluded)
            common_indexes = df1_indexed.index.intersection(df2_indexed.index)
            for idx in common_indexes:
                row1 = df1_indexed.loc[idx]
                row2 = df2_indexed.loc[idx]
                # Check each comparable column for a changed value
                modified_cols = {}
                for col in columns_to_compare:
                    if col in df1_indexed.columns and col in df2_indexed.columns:
                        val1 = row1[col]
                        val2 = row2[col]
                        # Treat NaN == NaN as equal
                        if pd.isna(val1) and pd.isna(val2):
                            continue
                        elif pd.isna(val1) or pd.isna(val2) or str(val1) != str(val2):
                            modified_cols[col] = {
                                'old_value': val1,
                                'new_value': val2
                            }
                if modified_cols:
                    # Fetch the full row so the report can show every display column
                    full_row_data = self.get_full_row_data_for_display(df1, df2, idx, key_columns)
                    differences['modified_rows'].append({
                        'key_values': dict(zip(key_columns, idx)) if isinstance(idx, tuple) else {key_columns[0]: idx},
                        'modified_columns': modified_cols,
                        'full_row_data': full_row_data
                    })
                    differences['summary']['modified_count'] += 1
            # Column-level comparison (all columns, for the statistics)
            common_columns = set(df1.columns).intersection(set(df2.columns))
            df1_only_columns = set(df1.columns).difference(set(df2.columns))
            df2_only_columns = set(df2.columns).difference(set(df1.columns))
            # Columns that actually took part in the comparison
            compared_columns = set(columns_to_compare)
            excluded_columns = common_columns - compared_columns
            differences['columns_comparison'] = {
                'common_columns': list(common_columns),
                'compared_columns': list(compared_columns),
                'excluded_columns': list(excluded_columns),
                'file1_only_columns': list(df1_only_columns),
                'file2_only_columns': list(df2_only_columns)
            }
        except Exception as e:
            differences['error'] = f"Error during comparison: {str(e)}"
        return differences

    def get_full_row_data_for_display(self, df1: pd.DataFrame, df2: pd.DataFrame, idx, key_columns: List[str]) -> Dict:
        """Collect the full row data used for display."""
        display_data = {}
        # Fetch the matching row from each file
        row1_data = self.extract_row_data(df1, idx, key_columns)
        row2_data = self.extract_row_data(df2, idx, key_columns)
        # Columns to display ('合计' is the totals column in the source files);
        # check-info columns are filtered out below
        display_columns = ['Purchase_Code', 'MF_PN', 'Description', 'Part Type', 'MF_NAME', 'PCB_Footprint', '合计']
        display_columns = [col for col in display_columns if self.should_compare_column(col)]
        for col in display_columns:
            val1 = row1_data.get(col, '')
            val2 = row2_data.get(col, '')
            # Show "old -> new" when the values differ, otherwise the value itself
            if pd.isna(val1) or val1 == '':
                display_value = val2
            elif pd.isna(val2) or val2 == '':
                display_value = val1
            elif str(val1) != str(val2):
                display_value = f"{val1} -> {val2}"
            else:
                display_value = val1
            display_data[col] = display_value
        # Keep the source rows for reference
        display_data['_from_file1'] = row1_data
        display_data['_from_file2'] = row2_data
        return display_data

    def extract_row_data(self, df: pd.DataFrame, idx, key_columns: List[str]) -> Dict:
        """Extract one row from a DataFrame by its key value(s)."""
        row_data = {}
        try:
            if isinstance(idx, tuple):
                # Multi-column key
                mask = pd.Series(True, index=df.index)
                for i, key in enumerate(key_columns):
                    mask = mask & (df[key] == idx[i])
                if mask.any():
                    original_row = df[mask].iloc[0]
                    for col in df.columns:
                        row_data[col] = original_row[col]
            else:
                # Single-column key
                matching_rows = df[df[key_columns[0]] == idx]
                if len(matching_rows) > 0:
                    original_row = matching_rows.iloc[0]
                    for col in df.columns:
                        row_data[col] = original_row[col]
        except Exception:
            pass
        return row_data

    def format_value_display(self, value1, value2):
        """Format a value pair: "old -> new" when they differ, else the value itself."""
        if pd.isna(value1) or value1 == '':
            return value2
        elif pd.isna(value2) or value2 == '':
            return value1
        elif str(value1) != str(value2):
            return f"{value1} -> {value2}"
        else:
            return value1

    def get_modified_columns_summary(self, modified_columns: Dict) -> str:
        """Summarize which columns changed."""
        if not modified_columns:
            return "no changes"
        modified_list = list(modified_columns.keys())
        # Show the full list when it is short
        if len(modified_list) <= 3:
            return ", ".join(modified_list)
        else:
            # Otherwise show the first three plus a total count
            return ", ".join(modified_list[:3]) + f", ... ({len(modified_list)} columns)"

    def identify_key_columns(self, df1: pd.DataFrame, df2: pd.DataFrame) -> List[str]:
        """Pick the key column(s) used to match rows."""
        # Prefer Partnumber as the key
        potential_keys = ['Partnumber', 'Purchase_Code', 'MF_PN']
        for key in potential_keys:
            if key in df1.columns and key in df2.columns:
                # A key column should not carry too many duplicate values
                df1_dup_rate = df1[key].duplicated().sum() / len(df1)
                df2_dup_rate = df2[key].duplicated().sum() / len(df2)
                if df1_dup_rate < 0.1 and df2_dup_rate < 0.1:  # tolerate a few duplicates
                    return [key]
        # No single key worked; try column combinations
        for key_combo in [['Partnumber', 'MF_PN'], ['Purchase_Code', 'MF_PN']]:
            if all(col in df1.columns for col in key_combo) and all(col in df2.columns for col in key_combo):
                return key_combo
        # Finally fall back to the shared columns
        common_cols = list(set(df1.columns).intersection(set(df2.columns)))
        if common_cols:
            return common_cols[:2]  # use at most the first two
        return []

    def generate_output_filename(self) -> str:
        """Build the output file name, prefixed with the two files' first valid sheet names."""
        if not self.file1_sheets or not self.file2_sheets:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            return f"BOM_diff_report_{timestamp}.xlsx"
        # Use the first valid sheet of each file
        file1_sheet_name = str(self.file1_sheets[0]) if self.file1_sheets else "File1"
        file2_sheet_name = str(self.file2_sheets[0]) if self.file2_sheets else "File2"
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Strip characters that are invalid in file names
        clean_sheet1 = self.clean_filename(file1_sheet_name)
        clean_sheet2 = self.clean_filename(file2_sheet_name)
        return f"{clean_sheet1}_vs_{clean_sheet2}_diff_report_{timestamp}.xlsx"

    def clean_filename(self, filename: str) -> str:
        """Strip characters that are invalid in file names."""
        filename = str(filename)
        # Characters Windows does not allow in file names
        invalid_chars = '<>:"/\\|?*'
        for char in invalid_chars:
            filename = filename.replace(char, '_')
        # Replace stray whitespace
        filename = filename.replace(' ', '_')
        filename = filename.replace('\t', '_')
        filename = filename.replace('\n', '_')
        # Cap the length
        if len(filename) > 50:
            filename = filename[:50]
        return filename

    def clean_sheet_name(self, sheet_name: str, max_length: int = 25) -> str:
        """Sanitize a sheet name so it satisfies Excel's naming limits."""
        sheet_name = str(sheet_name)
        # Characters Excel does not allow in sheet names
        invalid_chars = '[]:*?/\\'
        for char in invalid_chars:
            sheet_name = sheet_name.replace(char, '_')
        # Excel caps sheet names at 31 characters
        if len(sheet_name) > max_length:
            sheet_name = sheet_name[:max_length]
        return sheet_name

    def get_output_directory(self) -> str:
        """Output directory (the second file's directory)."""
        return os.path.dirname(self.file2_path)

    def generate_difference_report(self) -> str:
        """Write the difference report to an Excel file."""
        if not self.differences:
            return "No differences found"
        # Build the output file name and path
        output_filename = self.generate_output_filename()
        output_directory = self.get_output_directory()
        output_path = os.path.join(output_directory, output_filename)
        try:
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                # Overall summary sheet
                summary_data = []
                for diff_key, differences in self.differences.items():
                    if 'error' not in differences:
                        columns_comparison = differences.get('columns_comparison', {})
                        excluded_count = len(columns_comparison.get('excluded_columns', []))
                        summary_data.append([
                            differences.get('sheet_names', diff_key),
                            differences['summary']['total_rows_df1'],
                            differences['summary']['total_rows_df2'],
                            differences['summary']['added_count'],
                            differences['summary']['removed_count'],
                            differences['summary']['modified_count'],
                            excluded_count
                        ])
                if summary_data:
                    summary_df = pd.DataFrame(summary_data, columns=[
                        'Sheets compared', 'File 1 rows', 'File 2 rows',
                        'Added rows', 'Removed rows', 'Modified rows', 'Excluded columns'
                    ])
                    summary_df.to_excel(writer, sheet_name='Summary', index=False)
                # Detailed report per comparison
                for diff_key, differences in self.differences.items():
                    sheet_key = self.clean_sheet_name(diff_key.replace('vs', '_vs_'))
                    if 'error' in differences:
                        # Write an error report instead
                        error_df = pd.DataFrame([['Error', differences['error']]])
                        error_df.to_excel(writer, sheet_name=f"{sheet_key}_error", index=False, header=False)
                        continue
                    # Summary sheet, including the column-comparison details
                    summary_data = []
                    summary_data.append(["Item", "Count"])
                    summary_data.append(["File 1 total rows", differences['summary']['total_rows_df1']])
                    summary_data.append(["File 2 total rows", differences['summary']['total_rows_df2']])
                    summary_data.append(["Added rows", differences['summary']['added_count']])
                    summary_data.append(["Removed rows", differences['summary']['removed_count']])
                    summary_data.append(["Modified rows", differences['summary']['modified_count']])
                    summary_data.append(["Common columns", len(differences['columns_comparison']['common_columns'])])
                    summary_data.append(["Compared columns", len(differences['columns_comparison']['compared_columns'])])
                    summary_data.append(["Excluded columns", len(differences['columns_comparison']['excluded_columns'])])
                    summary_data.append(["File 1 only columns", len(differences['columns_comparison']['file1_only_columns'])])
                    summary_data.append(["File 2 only columns", len(differences['columns_comparison']['file2_only_columns'])])
                    # Details of the excluded columns
                    excluded_cols = differences['columns_comparison'].get('excluded_columns', [])
                    if excluded_cols:
                        summary_data.append(["", ""])
                        summary_data.append(["Excluded columns", "(check-info columns; not compared)"])
                        for col in excluded_cols:
                            summary_data.append(["", f"- {col}"])
                    pd.DataFrame(summary_data).to_excel(
                        writer,
                        sheet_name=f"{sheet_key}_summary",
                        index=False,
                        header=False
                    )
                    # Added-row details
                    if differences['added_rows']:
                        pd.DataFrame(differences['added_rows']).to_excel(
                            writer,
                            sheet_name=f"{sheet_key}_added",
                            index=False
                        )
                    # Removed-row details
                    if differences['removed_rows']:
                        pd.DataFrame(differences['removed_rows']).to_excel(
                            writer,
                            sheet_name=f"{sheet_key}_removed",
                            index=False
                        )
                    # Modified-row details in the display format (check-info columns excluded)
                    if differences['modified_rows']:
                        modified_data = []
                        for mod_row in differences['modified_rows']:
                            # Base record: key columns plus the change summary
                            record = {
                                **mod_row['key_values'],  # key columns, e.g. Partnumber
                                'Modified columns': self.get_modified_columns_summary(mod_row['modified_columns'])
                            }
                            # Add the display columns (check-info columns excluded)
                            display_data = mod_row.get('full_row_data', {})
                            display_columns = [col for col in display_data.keys()
                                               if not col.startswith('_') and self.should_compare_column(col)]
                            for col in display_columns:
                                record[col] = display_data.get(col, '')
                            # Add per-column change details (compared columns only)
                            for col, values in mod_row['modified_columns'].items():
                                if self.should_compare_column(col):
                                    record[f'detail_{col}'] = f"{values['old_value']} -> {values['new_value']}"
                            modified_data.append(record)
                        if modified_data:
                            modified_df = pd.DataFrame(modified_data)
                            # Put the important columns first
                            column_order = list(mod_row['key_values'].keys()) + ['Modified columns']
                            # Then the remaining display columns
                            other_columns = [col for col in modified_df.columns
                                             if col not in column_order and not col.startswith('detail_')]
                            column_order.extend(other_columns)
                            # Then the per-column detail columns
                            detailed_cols = [col for col in modified_df.columns if col.startswith('detail_')]
                            column_order.extend(detailed_cols)
                            # Keep only columns that actually exist
                            existing_columns = [col for col in column_order if col in modified_df.columns]
                            modified_df = modified_df[existing_columns]
                            modified_df.to_excel(
                                writer,
                                sheet_name=f"{sheet_key}_modified",
                                index=False
                            )
            return output_path
        except Exception as e:
            print(f"Error generating the report: {e}")
            return ""

    def run_comparison(self):
        """Run the full BOM comparison workflow."""
        print("=== BOM file difference comparison tool ===")
        print("Note: check-info columns (e.g. '检查信息') are excluded from the modified-row comparison")
        # 1. Pick the first file
        print("\nStep 1: select the first Excel file")
        self.file1_path = self.select_file("Select the first BOM Excel file")
        if not self.file1_path:
            print("No file selected; exiting")
            return
        self.file1_name = os.path.basename(self.file1_path)
        # 2. Pick the second file
        print("\nStep 2: select the second Excel file")
        self.file2_path = self.select_file("Select the second BOM Excel file")
        if not self.file2_path:
            print("No file selected; exiting")
            return
        self.file2_name = os.path.basename(self.file2_path)
        print(f"\nFile 1: {self.file1_name}")
        print(f"File 2: {self.file2_name}")
        # 3. Find the valid sheets
        print("\nStep 3: locating valid sheets...")
        self.file1_sheets = self.find_valid_sheets(self.file1_path)
        self.file2_sheets = self.find_valid_sheets(self.file2_path)
        print(f"Valid sheets in file 1: {self.file1_sheets}")
        print(f"Valid sheets in file 2: {self.file2_sheets}")
        if not self.file1_sheets or not self.file2_sheets:
            print("At least one file has no valid sheet; cannot compare")
            return
        # 4. Run the comparison
        print("\nStep 4: comparing...")
        self.differences = {}
        # Compare the first valid sheet of each file
        sheet1 = self.file1_sheets[0]
        sheet2 = self.file2_sheets[0]
        print(f"Comparing: {sheet1} (file 1) vs {sheet2} (file 2)")
        df1 = self.load_bom_data(self.file1_path, sheet1)
        df2 = self.load_bom_data(self.file2_path, sheet2)
        if df1.empty:
            print(f"  ⚠ Failed to load sheet {sheet1} from file 1")
            return
        if df2.empty:
            print(f"  ⚠ Failed to load sheet {sheet2} from file 2")
            return
        differences = self.compare_dataframes(df1, df2, sheet1, sheet2)
        comparison_key = f"{sheet1}_vs_{sheet2}"
        self.differences[comparison_key] = differences
        if 'error' in differences:
            print(f"  ⚠ Error during comparison: {differences['error']}")
        else:
            columns_comparison = differences.get('columns_comparison', {})
            excluded_count = len(columns_comparison.get('excluded_columns', []))
            print(f"  √ Comparison finished:")
            print(f"    File 1 rows: {differences['summary']['total_rows_df1']}")
            print(f"    File 2 rows: {differences['summary']['total_rows_df2']}")
            print(f"    Added rows: {differences['summary']['added_count']}")
            print(f"    Removed rows: {differences['summary']['removed_count']}")
            print(f"    Modified rows: {differences['summary']['modified_count']}")
            print(f"    Excluded columns: {excluded_count} (check-info columns are not compared)")
        # 5. Generate the report
        print("\nStep 5: generating the difference report...")
        output_file = self.generate_difference_report()
        if output_file and os.path.exists(output_file):
            print(f"\n=== Comparison complete ===")
            print(f"Difference report written: {os.path.basename(output_file)}")
            # print(f"Full path: {output_file}")
            print(f"Output directory: {self.get_output_directory()}")
        else:
            print("Failed to generate the difference report")


def main():
    """Entry point."""
    comparator = BOMComparator()
    comparator.run_comparison()
    input("\nPress Enter to exit...")


if __name__ == "__main__":
    main()
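
For reference, a minimal sketch of driving the comparator above without its file dialogs (the paths are hypothetical; select_file() is simply bypassed):

comparator = BOMComparator()
comparator.file1_path = "old_bom.xlsx"   # hypothetical input paths
comparator.file2_path = "new_bom.xlsx"
comparator.file1_sheets = comparator.find_valid_sheets(comparator.file1_path)
comparator.file2_sheets = comparator.find_valid_sheets(comparator.file2_path)
sheet1, sheet2 = comparator.file1_sheets[0], comparator.file2_sheets[0]
diff = comparator.compare_dataframes(
    comparator.load_bom_data(comparator.file1_path, sheet1),
    comparator.load_bom_data(comparator.file2_path, sheet2),
    sheet1, sheet2)
comparator.differences = {f"{sheet1}_vs_{sheet2}": diff}
print(comparator.generate_difference_report())  # the report is written next to file 2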


@@ -0,0 +1,635 @@
import os
import pandas as pd
import numpy as np
import tkinter as tk
from tkinter import filedialog
from datetime import datetime
import warnings
import re
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows

warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')


class BOMComparator:
    def __init__(self):
        # Maps change-record columns to BOM columns; the Chinese values must
        # match the column headers used in the source Excel files
        self.column_mapping = {
            'ITEM': 'Partnumber',
            'HT PN': 'Partnumber',
            'MF PN': 'MF_PN',
            'MFG': 'MF_NAME',
            'CRD': 'Reference',
            'Description': 'Description',
            'Qty': 'Quantity',
            '焊接方式': '焊接方式',
            'Remark': '备注'
        }
        self.ignore_columns = ['备注']  # the remark column is not compared
        self.required_columns = list(self.column_mapping.values())
        self.change_columns = [
            'ITEM', 'HT PN', 'MF PN', 'MFG', 'CRD', 'Description', 'Qty', 'Remark'
        ]
        self.mandatory_keywords = ['item', 'partnumber', 'mfpn']
        # Collected validation issues
        self.validation_errors = []
        self.stats = {
            'old_bom_rows': 0,
            'new_bom_rows': 0,
            'changed_items': 0,
            'added_items': 0,
            'removed_items': 0,
            'total_errors': 0
        }

    def normalize_text(self, text):
        """Lowercase and strip punctuation so header cells can be keyword-matched."""
        if pd.isna(text):
            return ""
        text = str(text)
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        return text.strip().lower()

    def find_header_row(self, df):
        print(f"Scanning the first {min(20, len(df))} rows for the header row...")
        for i in range(min(20, len(df))):
            row_values = [self.normalize_text(cell) for cell in df.iloc[i].values]
            contains_all_keywords = True
            for keyword in self.mandatory_keywords:
                if not any(keyword in cell_value for cell_value in row_values):
                    contains_all_keywords = False
                    break
            if contains_all_keywords:
                print(f"✅ Found a valid header row (index {i}) containing all mandatory keywords")
                return i
        error_msg = (
            "❌ No valid header row found. The header row must contain all of:\n"
            "- Item (or similar)\n"
            "- Partnumber (or similar)\n"
            "- MF_PN (or similar)\n\n"
            "No row within the first 20 rows contained all of these keywords."
        )
        raise ValueError(error_msg)

    def find_active_sheet(self, file_path):
        print(f"Scanning file: {os.path.basename(file_path)}")
        xls = pd.ExcelFile(file_path)
        candidate_sheets = []
        for sheet_name in xls.sheet_names:
            # Treat sheets whose names contain "bom" or "pcba" as candidates
            if any(keyword in sheet_name.lower() for keyword in ["bom", "pcba"]):
                candidate_sheets.append(sheet_name)
                print(f"  Candidate sheet found: {sheet_name} - keyword match")
        # Step 1: check the BOM/PCBA-named candidate sheets first
        successful_sheet = None
        for candidate in candidate_sheets:
            try:
                print(f"  Checking candidate sheet: {candidate}")
                df_preview = pd.read_excel(
                    file_path,
                    sheet_name=candidate,
                    header=None,
                    nrows=20,
                    engine='openpyxl'
                )
                self.find_header_row(df_preview)
                print(f"✅ Header row found in candidate sheet '{candidate}'")
                successful_sheet = candidate
                break
            except Exception as e:
                print(f"  ❌ Candidate sheet '{candidate}': {str(e)}")
                continue
        if successful_sheet:
            return successful_sheet
        # Step 2: no candidate matched (or none existed); scan every sheet
        print("  No usable BOM/PCBA-named sheet; checking all sheets")
        for sheet_name in xls.sheet_names:
            try:
                print(f"  Checking sheet: {sheet_name}")
                df_preview = pd.read_excel(
                    file_path,
                    sheet_name=sheet_name,
                    header=None,
                    nrows=20,
                    engine='openpyxl'
                )
                try:
                    self.find_header_row(df_preview)
                    print(f"✅ Header row found in sheet '{sheet_name}'")
                    return sheet_name
                except ValueError as e:
                    print(f"  ❌ Sheet '{sheet_name}': {str(e)}")
                    continue
            except Exception as e:
                print(f"  Error while checking sheet '{sheet_name}': {str(e)}")
                continue
        # Step 3: everything failed; fall back to the first sheet
        print("⚠️ All candidate sheets failed; trying the first sheet")
        first_sheet = xls.sheet_names[0]
        try:
            df_preview = pd.read_excel(
                file_path,
                sheet_name=first_sheet,
                header=None,
                nrows=20,
                engine='openpyxl'
            )
            self.find_header_row(df_preview)
            print(f"✅ Header row found in fallback sheet '{first_sheet}'")
            return first_sheet
        except Exception as e:
            print(f"❌ Fallback sheet '{first_sheet}' failed as well: {str(e)}")
            return None

    def validate_bom(self, bom_df, file_name, sheet_name):
        """Validate the BOM data and collect any issues."""
        errors = []
        # 1. Duplicate Partnumbers
        dup_partnumbers = bom_df[bom_df.duplicated('Partnumber', keep=False)]
        if not dup_partnumbers.empty:
            print(f"⚠️ Duplicate Partnumbers found: {len(dup_partnumbers)} rows")
            for idx, row in dup_partnumbers.iterrows():
                errors.append({
                    'File': file_name,
                    'Sheet': sheet_name,
                    'Row': idx + 2,  # Excel rows are 1-based; data starts one row below the header
                    'Issue Type': 'Duplicate Partnumber',
                    'Description': f"Partnumber '{row['Partnumber']}' appears more than once"
                })
        # 2. Empty Partnumbers
        empty_partnumbers = bom_df[bom_df['Partnumber'].isna() | (bom_df['Partnumber'] == '')]
        if not empty_partnumbers.empty:
            print(f"⚠️ Empty Partnumbers found: {len(empty_partnumbers)} rows")
            for idx, row in empty_partnumbers.iterrows():
                errors.append({
                    'File': file_name,
                    'Sheet': sheet_name,
                    'Row': idx + 2,
                    'Issue Type': 'Empty Partnumber',
                    'Description': "Partnumber is empty"
                })
        # 3. The Reference designator count must match Quantity
        for idx, row in bom_df.iterrows():
            # # Skip special items such as PCB
            # if row.get('Part Type') == 'PCB' or pd.isna(row.get('Reference')):
            #     continue
            refs = str(row['Reference'])
            qty = row['Quantity']
            try:
                # Count the actual reference designators
                ref_count = len([r for r in refs.split(',') if r.strip()])
                # Quantity must be numeric
                try:
                    qty_val = int(qty)
                except (ValueError, TypeError):
                    qty_val = -1
                # Check consistency
                if ref_count != qty_val:
                    errors.append({
                        'File': file_name,
                        'Sheet': sheet_name,
                        'Row': idx + 2,
                        'Issue Type': 'Quantity mismatch',
                        'Description': f"Reference count ({ref_count}) ≠ Quantity ({qty})"
                    })
            except Exception as e:
                errors.append({
                    'File': file_name,
                    'Sheet': sheet_name,
                    'Row': idx + 2,
                    'Issue Type': 'Validation error',
                    'Description': f"Validation failed: {str(e)}"
                })
        return errors

    def load_bom(self, file_path):
        print("Locating the active sheet...")
        active_sheet = self.find_active_sheet(file_path)
        print(f"📊 Using sheet: {active_sheet}")
        df_preview = pd.read_excel(
            file_path,
            sheet_name=active_sheet,
            header=None,
            nrows=20
        )
        header_row_idx = self.find_header_row(df_preview)
        print("Loading the full BOM data...")
        bom_df = pd.read_excel(
            file_path,
            sheet_name=active_sheet,
            header=header_row_idx,
            dtype=str
        )
        # The first file loaded fills old_bom_rows; the second fills new_bom_rows
        if "old_bom_rows" not in self.stats or self.stats['old_bom_rows'] == 0:
            self.stats['old_bom_rows'] = len(bom_df)
        else:
            self.stats['new_bom_rows'] = len(bom_df)
        # Clean up the column names
        bom_df.columns = [str(col).strip() for col in bom_df.columns]
        print(f"  Original columns: {list(bom_df.columns)}")
        # Column-name aliases; the Chinese entries match headers in the source files
        column_aliases = {
            'Item': 'Item',
            'Partnumber': 'Partnumber',
            'Part Number': 'Partnumber',
            'Purchase_Code': 'Purchase_Code',
            'MF_PN': 'MF_PN',
            'Description': 'Description',
            'Part Type': 'Part Type',
            'MF_NAME': 'MF_NAME',
            'Manufacturer': 'MF_NAME',
            'PCB_Footprint': 'PCB_Footprint',
            'Reference': 'Reference',
            'References': 'Reference',
            'Quantity': 'Quantity',
            'Qty': 'Quantity',
            '加工方式': '焊接方式',
            '焊接方式': '焊接方式',
            'Value': 'Value',
            '备注': '备注',
            'Remark': '备注',
            'Comments': '备注'
        }
        # Apply the alias mapping
        bom_df = bom_df.rename(columns={col: alias for col, alias in column_aliases.items()
                                        if col in bom_df.columns})
        print(f"  Normalized columns: {list(bom_df.columns)}")
        # Make sure every required column exists
        missing_cols = [col for col in self.required_columns if col not in bom_df.columns]
        if missing_cols:
            raise ValueError(f"❌ Missing required columns: {', '.join(missing_cols)}")
        # Drop empty rows and invalid entries
        initial_count = len(bom_df)
        bom_df = bom_df.replace('', np.nan)
        bom_df = bom_df.dropna(subset=['Item'], how='all')
        cleaned_count = len(bom_df)
        if initial_count > cleaned_count:
            print(
                f"  Removed {initial_count - cleaned_count} empty rows ({initial_count} -> {cleaned_count})")
        # Validate the data
        file_name = os.path.basename(file_path)
        errors = self.validate_bom(bom_df, file_name, active_sheet)
        self.validation_errors.extend(errors)
        self.stats['total_errors'] += len(errors)
        if errors:
            print(f"⚠️ {len(errors)} data issue(s) found in '{file_name}'")
        return bom_df, active_sheet

    def compare_reference_lists(self, old_refs_str, new_refs_str):
        """Compare two reference-designator lists and describe the difference."""
        if pd.isna(old_refs_str):
            old_refs_str = ""
        if pd.isna(new_refs_str):
            new_refs_str = ""
        old_refs = set(ref.strip() for ref in str(old_refs_str).split(',') if ref.strip())
        new_refs = set(ref.strip() for ref in str(new_refs_str).split(',') if ref.strip())
        # Identical sets: an empty string means no difference
        if old_refs == new_refs:
            return ""
        # Compute the set differences
        added_refs = new_refs - old_refs
        removed_refs = old_refs - new_refs
        diff_msgs = []
        if added_refs:
            diff_msgs.append(f"added refs: {','.join(sorted(added_refs))}")
        if removed_refs:
            diff_msgs.append(f"removed refs: {','.join(sorted(removed_refs))}")
        return "; ".join(diff_msgs)

    def compare_boms(self, old_bom, new_bom):
        print("Comparing the two BOMs...")
        old_bom['Partnumber'] = old_bom['Partnumber'].astype(str).str.strip()
        new_bom['Partnumber'] = new_bom['Partnumber'].astype(str).str.strip()
        changes = []
        old_partnumbers = set(old_bom['Partnumber'].unique())
        if len(old_partnumbers) != len(old_bom):
            print(f"⚠️ Old BOM has duplicate Partnumbers: {len(old_bom)} rows, {len(old_partnumbers)} unique parts")
        new_partnumbers = set(new_bom['Partnumber'].unique())
        if len(new_partnumbers) != len(new_bom):
            print(f"⚠️ New BOM has duplicate Partnumbers: {len(new_bom)} rows, {len(new_partnumbers)} unique parts")
        all_partnumbers = sorted(old_partnumbers | new_partnumbers)
        print(f"  Total parts: {len(all_partnumbers)} (old BOM: {len(old_partnumbers)}, new BOM: {len(new_partnumbers)})")
        for idx, pn in enumerate(all_partnumbers):
            if (idx + 1) % 100 == 0 or (idx + 1) == len(all_partnumbers):
                print(f"  Progress: {idx + 1}/{len(all_partnumbers)} parts")
            record = {'ITEM_OLD': '', 'ITEM_NEW': ''}
            old_row = None
            new_row = None
            change_desc = ""
            old_match = old_bom[old_bom['Partnumber'] == pn]
            if not old_match.empty:
                old_row = old_match.iloc[0]
                record['ITEM_OLD'] = old_row['Item']
            new_match = new_bom[new_bom['Partnumber'] == pn]
            if not new_match.empty:
                new_row = new_match.iloc[0]
                record['ITEM_NEW'] = new_row['Item']
            change_type = ""
            if old_row is None:
                change_type = "Added"
                self.stats['added_items'] += 1
                change_desc = "New part"
            elif new_row is None:
                change_type = "Removed"
                self.stats['removed_items'] += 1
                change_desc = "Part removed"
            else:
                change_type = "Changed"
                self.stats['changed_items'] += 1
            # Fill the left-hand columns (old BOM values)
            for change_col, bom_col in self.column_mapping.items():
                if change_col == 'ITEM':
                    continue
                old_val = old_row[bom_col] if old_row is not None and bom_col in old_row else ''
                record[change_col] = old_val
            # Fill the right-hand columns (new BOM values)
            for change_col, bom_col in self.column_mapping.items():
                if change_col == 'ITEM':
                    continue
                new_val = new_row[bom_col] if new_row is not None and bom_col in new_row else ''
                record[f'NEW_{change_col}'] = new_val
            if change_type == "Changed":
                change_details = []
                if 'Quantity' in old_row.index and 'Quantity' in new_row.index:
                    old_qty = str(old_row['Quantity'])
                    new_qty = str(new_row['Quantity'])
                    if old_qty != new_qty:
                        change_details.append(f"Qty: {old_qty}{new_qty}")
                if 'MF_PN' in old_row.index and 'MF_PN' in new_row.index:
                    old_mfpn = str(old_row['MF_PN'])
                    new_mfpn = str(new_row['MF_PN'])
                    if old_mfpn != new_mfpn:
                        change_details.append(f"MF PN: {old_mfpn}{new_mfpn}")
                # Compare reference lists as sets rather than raw strings
                if 'Reference' in old_row.index and 'Reference' in new_row.index:
                    ref_diff = self.compare_reference_lists(old_row['Reference'], new_row['Reference'])
                    if ref_diff:
                        change_details.append(ref_diff)
                for change_col, bom_col in self.column_mapping.items():
                    if (change_col == 'ITEM' or
                            bom_col in ['Quantity', 'MF_PN', 'Reference'] or
                            bom_col in self.ignore_columns):
                        continue
                    old_val = old_row[bom_col] if old_row is not None and bom_col in old_row else ''
                    new_val = new_row[bom_col] if new_row is not None and bom_col in new_row else ''
                    if str(old_val) != str(new_val):
                        change_details.append(f"{change_col}: {old_val}{new_val}")
                if change_details:
                    change_desc = "; ".join(change_details)
                else:
                    change_type = ""
            record['Design change Type'] = change_type
            record['NEW_Remark'] = change_desc
            if change_type:
                changes.append(record)
        left_columns = ['ITEM_OLD'] + [col for col in self.change_columns if col != 'ITEM']
        middle_columns = ['Design change Type']
        right_columns = ['ITEM_NEW'] + [f'NEW_{col}' for col in self.change_columns if col != 'ITEM']
        # Keep NEW_Remark as the last column
        if 'NEW_Remark' in right_columns:
            right_columns.remove('NEW_Remark')
            right_columns.append('NEW_Remark')
        change_columns = left_columns + middle_columns + right_columns
        right_start_col = len(left_columns) + len(middle_columns) + 1
        return pd.DataFrame(changes, columns=change_columns), right_start_col

    def generate_summary(self):
        summary = [
            "\n" + "=" * 50,
            "BOM comparison summary",
            "-" * 50,
            f"Old BOM rows: {self.stats['old_bom_rows']}",
            f"New BOM rows: {self.stats['new_bom_rows']}",
            f"Changed parts: {self.stats['changed_items']}",
            f"Added parts: {self.stats['added_items']}",
            f"Removed parts: {self.stats['removed_items']}",
            f"Total change records: {self.stats['changed_items'] + self.stats['added_items'] + self.stats['removed_items']}",
            f"Total data issues: {self.stats['total_errors']}",
            "=" * 50
        ]
        return "\n".join(summary)

    def generate_change_record(self):
        root = tk.Tk()
        root.withdraw()
        # Reset the statistics and the issue log
        self.stats = {
            'old_bom_rows': 0,
            'new_bom_rows': 0,
            'changed_items': 0,
            'added_items': 0,
            'removed_items': 0,
            'total_errors': 0
        }
        self.validation_errors = []
        try:
            # Pick the original BOM file
            print("\n" + "=" * 50)
            print("Step 1/4: select the original BOM file")
            print("=" * 50)
            old_file = filedialog.askopenfilename(
                title="Select the original BOM file",
                filetypes=[("Excel Files", "*.xlsx *.xls")]
            )
            if not old_file:
                print("❌ No file selected; aborting")
                return
            print(f"📂 Original BOM selected: {old_file}")
            old_file_name = os.path.basename(old_file)
            # output_dir = os.path.dirname(old_file)
            # Pick the changed BOM file
            print("\n" + "=" * 50)
            print("Step 2/4: select the changed BOM file")
            print("=" * 50)
            new_file = filedialog.askopenfilename(
                title="Select the changed BOM file",
                filetypes=[("Excel Files", "*.xlsx *.xls")]
            )
            if not new_file:
                print("❌ No file selected; aborting")
                return
            print(f"📂 New BOM selected: {new_file}")
            new_file_name = os.path.basename(new_file)
            output_dir = os.path.dirname(new_file)
            # Load both BOM files
            print("\n" + "=" * 50)
            print("Step 3/4: load and process the BOM files")
            print("=" * 50)
            print(f"🔍 Loading original BOM: {old_file_name}")
            old_bom, old_bom_activesheetname = self.load_bom(old_file)
            print(f"✅ Original BOM loaded, {len(old_bom)} rows")
            print(f"\n🔍 Loading changed BOM: {new_file_name}")
            new_bom, new_bom_activesheetname = self.load_bom(new_file)
            print(f"✅ New BOM loaded, {len(new_bom)} rows")
            # Compare the BOMs and build the change record
            print("\n" + "=" * 50)
            print("Step 4/4: compare the BOMs and build the change record")
            print("=" * 50)
            print("🔍 Comparing...")
            change_df, right_start_col = self.compare_boms(old_bom, new_bom)
            # Output file name
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f"{old_bom_activesheetname} to {new_bom_activesheetname} eBOM_change_record_{timestamp}.xlsx"
            output_path = os.path.join(output_dir, output_file)
            # Save the change record and the issue log
            print(f"\n💾 Saving the change record: {output_path}")
            wb = Workbook()
            # Change-record worksheet
            ws_change = wb.active
            ws_change.title = "PCBA_BOM_change record"
            if change_df.empty:
                ws_change.cell(row=1, column=1, value="The two BOMs are identical; nothing to record")
                print("✅ The two BOMs are identical; nothing to record")
            else:
                # Rename the columns for output
                column_rename = {
                    'ITEM_OLD': 'ITEM',
                    'ITEM_NEW': 'ITEM',
                    **{f'NEW_{col}': col for col in self.change_columns if col != 'ITEM'},
                    'NEW_Remark': 'Remark'
                }
                change_df = change_df.rename(columns=column_rename)
                # File names above the left and right column blocks
                ws_change.cell(row=1, column=1, value=old_file_name)
                ws_change.cell(row=1, column=right_start_col, value=new_file_name)
                # Column headers
                col_names = change_df.columns.tolist()
                for col_idx, col_name in enumerate(col_names, 1):
                    ws_change.cell(row=2, column=col_idx, value=col_name)
                # Data rows
                for r_idx, row in enumerate(dataframe_to_rows(change_df, index=False, header=False), 3):
                    for c_idx, value in enumerate(row, 1):
                        ws_change.cell(row=r_idx, column=c_idx, value=value)
            # Issue-log worksheet
            if self.validation_errors:
                print(f"⚠️ {len(self.validation_errors)} data issue(s) found; writing the issue log")
                ws_errors = wb.create_sheet(title="BOM_Issues")
                # Issue-log columns
                error_columns = ['File', 'Sheet', 'Row', 'Issue Type', 'Description']
                for col_idx, col_name in enumerate(error_columns, 1):
                    ws_errors.cell(row=1, column=col_idx, value=col_name)
                # Issue rows
                for row_idx, error in enumerate(self.validation_errors, 2):
                    ws_errors.cell(row=row_idx, column=1, value=error['File'])
                    ws_errors.cell(row=row_idx, column=2, value=error['Sheet'])
                    ws_errors.cell(row=row_idx, column=3, value=error['Row'])
                    ws_errors.cell(row=row_idx, column=4, value=error['Issue Type'])
                    ws_errors.cell(row=row_idx, column=5, value=error['Description'])
            # Save the workbook
            wb.save(output_path)
            # Print the processing summary
            print(self.generate_summary())
            print(f"\n✅ Change record saved to: {output_path}")
        except Exception as e:
            print(f"\n❌ Error during processing: {str(e)}")
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    print("=" * 60)
    print("   PCBA BOM change-record generator   ")
    print("=" * 60)
    print("Requirement: the header row must contain 'Item', 'Partnumber' and 'MF_PN'")
    comparator = BOMComparator()
    comparator.generate_change_record()
    print("\n" + "=" * 50)
    print("   Done. Press any key to exit...   ")
    # input()
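
To make the header-row rule above concrete, a small illustrative sketch (made-up data): normalize_text() lowercases and strips punctuation, so a row holding "Item", "Partnumber" and "MF_PN" satisfies all three mandatory keywords:

import pandas as pd

comparator = BOMComparator()  # the change-record comparator defined above
preview = pd.DataFrame([
    ["Project X", None, None, None],          # title row: no keywords
    ["Item", "Partnumber", "MF_PN", "Qty"],   # the real header row
    [1, "P-001", "ABC123", 2],                # first data row
])
print(comparator.find_header_row(preview))    # -> 1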


@@ -0,0 +1,618 @@
import pandas as pd
import os
import glob
from datetime import datetime
import tkinter as tk
from tkinter import filedialog
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass


@dataclass
class ProcessedFileInfo:
    """Metadata about one processed file."""
    filename: str
    sheet_name: str
    start_row: int
    total_rows: int
    valid_rows: int


@dataclass
class BOMRow:
    """One row of BOM data."""
    partnumber: str
    purchase_code: str
    mf_pn: str
    description: str
    part_type: str
    mf_name: str
    pcb_footprint: str
    quantity: int
    reference: str
    filename: str = ""
    sheet_name: str = ""

    @classmethod
    def from_dataframe_row(cls, row: pd.Series, filename: str = "", sheet_name: str = "") -> Optional['BOMRow']:
        """Build a BOMRow from a DataFrame row."""
        try:
            return cls(
                partnumber=str(row.get('Partnumber', '')).strip(),
                purchase_code=str(row.get('Purchase_Code', '')).strip(),
                mf_pn=str(row.get('MF_PN', '')).strip(),
                description=str(row.get('Description', '')).strip(),
                part_type=str(row.get('Part_Type', '')).strip(),
                mf_name=str(row.get('MF_NAME', '')).strip(),
                pcb_footprint=str(row.get('PCB_Footprint', '')).strip(),
                quantity=int(row.get('Quantity', 0)),
                reference=str(row.get('Reference', '')).strip(),
                filename=filename,
                sheet_name=sheet_name
            )
        except (ValueError, TypeError):
            return None

    def get_key(self) -> str:
        """Unique key for this row."""
        return self.partnumber if self.partnumber else self.mf_pn

    def is_valid(self) -> bool:
        """Whether this row carries usable data."""
        return bool(self.get_key())


@dataclass
class ConsolidatedMaterial:
    """A material after consolidation across files."""
    partnumber: str
    purchase_code: str
    mf_pn: str
    description: str
    part_type: str
    mf_name: str
    pcb_footprint: str
    quantity_data: Dict[str, int]  # file name -> quantity
    inconsistencies: List[str]

    @property
    def total_quantity(self) -> int:
        """Total quantity across all files."""
        return sum(self.quantity_data.values())

    @property
    def has_inconsistencies(self) -> bool:
        """Whether any inconsistency was recorded."""
        return len(self.inconsistencies) > 0


class ConsistencyChecker:
    """Checks field consistency across files."""

    def __init__(self):
        self.fields_to_check = ['Purchase_Code', 'MF_PN', 'Part_Type', 'MF_NAME', 'PCB_Footprint']

    def check_field_consistency(self, existing: ConsolidatedMaterial, new_row: BOMRow) -> List[str]:
        """Compare the new row's fields against the consolidated record."""
        inconsistencies = []
        field_mapping = {
            'Purchase_Code': ('purchase_code', 'Purchase_Code'),
            'MF_PN': ('mf_pn', 'MF_PN'),
            'Part_Type': ('part_type', 'Part Type'),
            'MF_NAME': ('mf_name', 'MF_NAME'),
            'PCB_Footprint': ('pcb_footprint', 'PCB_Footprint')
        }
        for field, (attr_name, row_field) in field_mapping.items():
            existing_val = getattr(existing, attr_name)
            new_val = getattr(new_row, attr_name)
            if self._should_check_field(existing_val, new_val) and existing_val != new_val:
                inconsistencies.append(
                    f"{field} mismatch: {existing_val}{new_val} (file: {new_row.filename}, sheet: {new_row.sheet_name})"
                )
        return inconsistencies

    def check_quantity_reference(self, row: BOMRow) -> Optional[str]:
        """Check that the Reference designator count matches Quantity."""
        if not row.reference:
            return None
        ref_count = len([ref for ref in row.reference.split(',') if ref.strip()])
        if ref_count != row.quantity:
            return f"Reference count mismatch: {ref_count} designators ≠ Quantity={row.quantity} (file: {row.filename}, sheet: {row.sheet_name})"
        return None

    def _should_check_field(self, existing_val: str, new_val: str) -> bool:
        """Whether the field should be checked at all."""
        # Skip empty and placeholder values
        if not new_val or new_val.lower() in ['', 'nan', 'none', 'null']:
            return False
        return True


class BOMFileParser:
    """Parses BOM files."""

    def __init__(self):
        self.required_headers = ['Item', 'Partnumber', 'Purchase_Code', 'MF_PN']
        self.required_columns = ['Partnumber', 'Purchase_Code', 'MF_PN', 'Description',
                                 'Part_Type', 'MF_NAME', 'PCB_Footprint', 'Quantity', 'Reference']

    def find_valid_sheet(self, file_path: str) -> Tuple[Optional[str], Optional[int]]:
        """Locate the sheet that contains a valid BOM."""
        try:
            xl = pd.ExcelFile(file_path)
            for sheet_name in xl.sheet_names:
                df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
                for i in range(min(len(df), 10)):  # only check the first 10 rows
                    headers = df.iloc[i].values
                    if all(col in str(headers) for col in self.required_headers):
                        filename = os.path.basename(file_path)
                        print(f"File {filename}: valid sheet '{sheet_name}' found; data rows start at {i}.")
                        return sheet_name, i
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
        return None, None

    def parse_file(self, file_path: str) -> Optional[Tuple[List[BOMRow], ProcessedFileInfo]]:
        """Parse a BOM file."""
        filename = os.path.basename(file_path)
        sheet_name, header_row = self.find_valid_sheet(file_path)
        if not sheet_name:
            return None
        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
            total_rows = len(df)
            df = self._clean_dataframe(df)
            if not self._validate_columns(df):
                return None
            bom_rows = []
            valid_rows = 0
            for _, row_data in df.iterrows():
                bom_row = BOMRow.from_dataframe_row(row_data, filename, sheet_name)
                if bom_row and bom_row.is_valid():
                    bom_rows.append(bom_row)
                    valid_rows += 1
            # Record the file metadata
            file_info = ProcessedFileInfo(
                filename=filename,
                sheet_name=sheet_name,
                start_row=header_row,
                total_rows=total_rows,
                valid_rows=valid_rows
            )
            return bom_rows, file_info
        except Exception as e:
            print(f"Error parsing file {file_path}: {e}")
            return None

    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean the DataFrame."""
        # Normalize column names: whitespace -> underscore, strip other symbols
        df.columns = df.columns.str.strip().str.replace(r'\s+', '_', regex=True)
        df.columns = df.columns.str.replace(r'[^a-zA-Z0-9_]', '', regex=True)
        # Drop fully empty rows
        df = df.dropna(how='all')
        return df

    def _validate_columns(self, df: pd.DataFrame) -> bool:
        """Check that every required column exists."""
        missing_cols = [col for col in self.required_columns if col not in df.columns]
        return len(missing_cols) == 0


class MaterialConsolidator:
    """Consolidates materials across files."""

    def __init__(self):
        self.materials: Dict[str, ConsolidatedMaterial] = {}
        self.consistency_checker = ConsistencyChecker()
        self.file_quantities: Dict[str, Dict[str, int]] = defaultdict(dict)
        self.processed_files_info: List[ProcessedFileInfo] = []

    def add_bom_row(self, bom_row: BOMRow) -> None:
        """Add one BOM row."""
        key = bom_row.get_key()
        if key not in self.materials:
            # First time this material appears
            self.materials[key] = ConsolidatedMaterial(
                partnumber=bom_row.partnumber,
                purchase_code=bom_row.purchase_code,
                mf_pn=bom_row.mf_pn,
                description=bom_row.description,
                part_type=bom_row.part_type,
                mf_name=bom_row.mf_name,
                pcb_footprint=bom_row.pcb_footprint,
                quantity_data={},
                inconsistencies=[]
            )
        material = self.materials[key]
        # Field-level consistency
        inconsistencies = self.consistency_checker.check_field_consistency(material, bom_row)
        material.inconsistencies.extend(inconsistencies)
        # Reference count vs Quantity
        ref_inconsistency = self.consistency_checker.check_quantity_reference(bom_row)
        if ref_inconsistency:
            material.inconsistencies.append(ref_inconsistency)
        # Record the quantities
        material.quantity_data[bom_row.filename] = bom_row.quantity
        self.file_quantities[bom_row.filename][key] = bom_row.quantity

    def add_file_info(self, file_info: ProcessedFileInfo) -> None:
        """Record metadata for a processed file."""
        self.processed_files_info.append(file_info)

    def get_statistics(self) -> Dict[str, Any]:
        """Collect the statistics."""
        total_inconsistencies = sum(len(mat.inconsistencies) for mat in self.materials.values())
        materials_with_issues = sum(1 for mat in self.materials.values() if mat.has_inconsistencies)
        return {
            'total_materials': len(self.materials),
            'total_inconsistencies': total_inconsistencies,
            'materials_with_issues': materials_with_issues,
            'file_count': len(self.file_quantities),
            'processed_files_info': self.processed_files_info
        }


class ReportGenerator:
    """Writes the consolidated report."""

    def __init__(self, output_folder: str):
        self.output_folder = output_folder
        self._ensure_output_directory()

    def _ensure_output_directory(self):
        """Make sure the output directory exists."""
        output_dir = os.path.join(self.output_folder, "BOM_Merge_out")
        os.makedirs(output_dir, exist_ok=True)

    def _create_summary_sheet(self, stats: Dict[str, Any]) -> pd.DataFrame:
        """Build the summary sheet."""
        summary_data = [
            ["BOM merge & check summary", ""],
            ["Generated at", datetime.now().strftime("%Y-%m-%d %H:%M:%S")],
            ["", ""],
            ["Processing statistics", ""],
            ["Files scanned", stats['total_files']],
            ["Files processed", stats['processed_files']],
            ["Data rows processed", stats['processed_rows']],
            ["", ""],
            ["Material statistics", ""],
            ["Consolidated materials", stats['total_materials']],
            ["Materials with issues", stats['materials_with_issues']],
            ["Total inconsistencies", stats['total_inconsistencies']],
            ["", ""],
            ["Data source files", ""],
            ["Valid source files", len(stats.get('processed_files_info', []))],
            ["", ""]
        ]
        # Append details for each data source file
        files_info = stats.get('processed_files_info', [])
        for i, file_info in enumerate(files_info, 1):
            summary_data.extend([
                [f"Source file {i}", file_info.filename],
                ["  Sheet", file_info.sheet_name],
                ["  Start row", file_info.start_row + 1],  # convert to a 1-based index
                ["  Total rows", file_info.total_rows],
                ["  Valid rows", file_info.valid_rows],
                ["", ""]
            ])
        summary_data.extend([
            ["", ""],
            ["Output", ""],
            ["Output folder", os.path.join(self.output_folder, "BOM_Merge_out")],
            ["Report file", stats.get('output_filename', '')],
            ["Merged sheet name", "BOM_Merge"]
        ])
        return pd.DataFrame(summary_data, columns=["Item", "Value"])

    def _create_data_source_sheet(self, stats: Dict[str, Any]) -> pd.DataFrame:
        """Build the data-source details sheet."""
        files_info = stats.get('processed_files_info', [])
        if not files_info:
            return pd.DataFrame([["No valid data source files", ""]], columns=["Status", "Note"])
        data_source_data = []
        for i, file_info in enumerate(files_info, 1):
            data_source_data.append({
                'No.': i,
                'File': file_info.filename,
                'Sheet': file_info.sheet_name,
                'Data start row': file_info.start_row + 1,  # convert to a 1-based index
                'Total rows': file_info.total_rows,
                'Valid rows': file_info.valid_rows,
                'Status': 'OK'
            })
        return pd.DataFrame(data_source_data)

    def _create_merge_sheet(self, consolidator: MaterialConsolidator) -> pd.DataFrame:
        """Build the merged-data sheet."""
        report_data = []
        file_columns = sorted(consolidator.file_quantities.keys())
        for material in consolidator.materials.values():
            row = {
                'Partnumber': material.partnumber,
                'Purchase_Code': material.purchase_code,
                'MF_PN': material.mf_pn,
                'Description': material.description,
                'Part Type': material.part_type,
                'MF_NAME': material.mf_name,
                'PCB_Footprint': material.pcb_footprint,
                # '检查信息' (check info) is kept in Chinese: the diff tool
                # recognizes and excludes this column by that header
                '检查信息': '; '.join(material.inconsistencies) if material.inconsistencies else 'consistent'
            }
            # Per-file quantities
            for file in file_columns:
                row[file] = material.quantity_data.get(file, 0)
            row['合计'] = material.total_quantity  # '合计' = grand total
            report_data.append(row)
        return pd.DataFrame(report_data)

    def generate_consolidated_report(self, consolidator: MaterialConsolidator, stats: Dict[str, Any]) -> Optional[str]:
        """Write the multi-sheet consolidated report."""
        if not consolidator.materials:
            return None
        # Timestamped file name
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_filename = f"BOM_merge_report_{timestamp}.xlsx"
        output_path = os.path.join(self.output_folder, "BOM_Merge_out", output_filename)
        try:
            # Use ExcelWriter to build a multi-sheet Excel file
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                # Sheet 1: summary
                summary_df = self._create_summary_sheet(stats)
                summary_df.to_excel(writer, sheet_name='Summary', index=False)
                # Sheet 2: data source files
                data_source_df = self._create_data_source_sheet(stats)
                data_source_df.to_excel(writer, sheet_name='Data_Sources', index=False)
                # Sheet 3: merged data
                merge_df = self._create_merge_sheet(consolidator)
                merge_df.to_excel(writer, sheet_name='BOM_Merge', index=False)
                # Adjust the column widths
                workbook = writer.book
                # Summary sheet
                summary_sheet = workbook['Summary']
                summary_sheet.column_dimensions['A'].width = 25
                summary_sheet.column_dimensions['B'].width = 40
                # Data-source sheet: auto-fit up to 30 characters
                data_source_sheet = workbook['Data_Sources']
                for col in data_source_sheet.columns:
                    max_length = 0
                    column = col[0].column_letter
                    for cell in col:
                        try:
                            if len(str(cell.value)) > max_length:
                                max_length = len(str(cell.value))
                        except Exception:
                            pass
                    data_source_sheet.column_dimensions[column].width = min(max_length + 2, 30)
                # Merge sheet: auto-fit up to 50 characters
                merge_sheet = workbook['BOM_Merge']
                for col in merge_sheet.columns:
                    max_length = 0
                    column = col[0].column_letter
                    for cell in col:
                        try:
                            if len(str(cell.value)) > max_length:
                                max_length = len(str(cell.value))
                        except Exception:
                            pass
                    merge_sheet.column_dimensions[column].width = min(max_length + 2, 50)
            # Record the file name in the stats
            stats['output_filename'] = output_filename
            return output_path
        except Exception as e:
            print(f"Failed to save the report: {e}")
            return None


class BOMProcessor:
    """Main controller that drives parsing, consolidation, and reporting."""

    def __init__(self):
        self.file_parser = BOMFileParser()
        self.material_consolidator = MaterialConsolidator()
        self.report_generator: Optional[ReportGenerator] = None
        # Statistics
        self.processed_files = 0
        self.processed_rows = 0
        self.total_files = 0

    def set_output_folder(self, folder_path: str):
        """Set the output folder."""
        self.report_generator = ReportGenerator(folder_path)

    def process_folder(self, folder_path: str) -> bool:
        """Process every BOM file in a folder."""
        bom_files = glob.glob(os.path.join(folder_path, "*.xlsx"))
        self.total_files = len(bom_files)
        if not bom_files:
            return False
        successful_files = 0
        for file_path in bom_files:
            if self._process_single_file(file_path):
                successful_files += 1
        self.processed_files = successful_files
        return successful_files > 0

    def _process_single_file(self, file_path: str) -> bool:
        """Process a single file."""
        filename = os.path.basename(file_path)
        print(f"Processing file: {filename}...")
        result = self.file_parser.parse_file(file_path)
        if not result:
            print(f"  ! Could not parse file: {filename}")
            return False
        bom_rows, file_info = result
        print(f"  √ {filename}: {len(bom_rows)} valid rows (sheet: {file_info.sheet_name})")
        # Record the file metadata
        self.material_consolidator.add_file_info(file_info)
        # Feed every BOM row into the consolidator
        for bom_row in bom_rows:
            self.material_consolidator.add_bom_row(bom_row)
            self.processed_rows += 1
        return True

    def generate_report(self) -> Optional[Dict[str, Any]]:
        """Generate the report and return the statistics."""
        if not self.report_generator:
            return None
        # Base statistics
        base_stats = self.material_consolidator.get_statistics()
        base_stats.update({
            'processed_files': self.processed_files,
            'total_files': self.total_files,
            'processed_rows': self.processed_rows
        })
        # Write the report
        output_path = self.report_generator.generate_consolidated_report(
            self.material_consolidator, base_stats
        )
        if not output_path:
            return None
        # Return the full statistics
        base_stats['output_path'] = output_path
        return base_stats


class UserInterface:
    """Console/dialog user interface."""

    @staticmethod
    def select_folder(title: str = "Select a folder") -> str:
        """Open a folder-picker dialog."""
        root = tk.Tk()
        root.withdraw()
        folder_path = filedialog.askdirectory(title=title)
        root.destroy()
        return folder_path

    @staticmethod
    def print_summary(stats: Dict[str, Any], folder_path: str):
        """Print the run summary."""
        print("\n" + "=" * 60)
        print("BOM merge & check finished!")
        print("=" * 60)
        print(f"Folder processed: {folder_path}")
        print(f"Files scanned: {stats['total_files']}")
        print(f"Files processed: {stats['processed_files']}")
        print(f"Data rows processed: {stats['processed_rows']}")
        print(f"Consolidated materials: {stats['total_materials']}")
        print(f"Materials with issues: {stats['materials_with_issues']}")
        print(f"Total inconsistencies: {stats['total_inconsistencies']}")
        # Data source files
        files_info = stats.get('processed_files_info', [])
        print(f"Valid source files: {len(files_info)}")
        for file_info in files_info:
            print(f"  - {file_info.filename} (sheet: {file_info.sheet_name}, valid rows: {file_info.valid_rows})")
        print(f"Report file: {stats['output_path']}")
        print("=" * 60)
        # Where the output went
        output_dir = os.path.join(folder_path, "BOM_Merge_out")
        print(f"Output saved to: {output_dir}")
        print("\nThe report contains three sheets:")
        print("1. 'Summary' - processing statistics and totals")
        print("2. 'Data_Sources' - details of the valid source files")
        print("3. 'BOM_Merge' - the consolidated material data")


def main():
    """Entry point."""
    # Initialize the processor
    bom_processor = BOMProcessor()
    # Pick the folder
    folder_path = UserInterface.select_folder("Select the folder containing the BOM files")
    if not folder_path:
        print("No folder selected; exiting")
        return
    bom_processor.set_output_folder(folder_path)
    # Process the files
    print(f"Processing folder: {folder_path}")
    success = bom_processor.process_folder(folder_path)
    if not success:
        print("No processable BOM files found")
        return
    # Generate the report
    print("\nGenerating the consolidated report...")
    stats = bom_processor.generate_report()
    if stats:
        UserInterface.print_summary(stats, folder_path)
    else:
        print("Report generation failed")


if __name__ == "__main__":
    main()
    input("\nPress any key to exit...")

14
BOMCompare/README.md Normal file

@@ -0,0 +1,14 @@
# Sample GitLab Project
This sample project shows how a project in GitLab looks for demonstration purposes. It contains issues, merge requests and Markdown files in many branches, named and filled with lorem ipsum.
You can look around to get an idea how to structure your project and, when done, you can safely delete this project.
[Learn more about creating GitLab projects.](https://docs.gitlab.com/ee/gitlab-basics/create-project.html)
# Outputs a BOM difference report from standard-format BOM files
BOMCompereForJP.py
# Merges standard-format BOM files into one consolidated file, to simplify cross-checking and analysis of material stocking
BOMConsolidator.py
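
For reference, a minimal sketch of driving the consolidator without its folder-picker dialog (assuming `BOMConsolidator.py` is importable and the hypothetical folder below holds standard-format `.xlsx` BOMs):

```python
from BOMConsolidator import BOMProcessor

processor = BOMProcessor()
processor.set_output_folder("C:/BOMs")     # report goes to C:/BOMs/BOM_Merge_out
if processor.process_folder("C:/BOMs"):    # parses every *.xlsx it can read
    stats = processor.generate_report()
    if stats:
        print(stats["output_path"])        # full path of the generated report
```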

19
FFT_IMU/.gitignore vendored Normal file

@@ -0,0 +1,19 @@
/build/*
/build
/dist/*
/dist
/source/*
/source
/dataProcess_out*
*.xls
*.xlsx
*.csv
*.spec
/src
/temp
FFT_IMU_dc_html_v2.py
FFT_IMU_dc_v2.py


@@ -0,0 +1,739 @@
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from scipy import signal
import os
import glob
from datetime import datetime
import time
from multiprocessing import Pool, cpu_count
from matplotlib.colors import Normalize
from matplotlib.ticker import MaxNLocator
import re
from colorama import Fore, Style, init
from concurrent.futures import ProcessPoolExecutor, as_completed
import warnings
import threading
# 初始化colorama
init(autoreset=True)
# 忽略特定的matplotlib警告
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
warnings.filterwarnings("ignore", category=FutureWarning, module="matplotlib")
# 创建线程锁,确保文件操作和日志输出的线程安全
file_lock = threading.Lock()
log_lock = threading.Lock()
class IMUDataAnalyzer:
def __init__(self, file_path):
self.file_path = file_path
self.data = None
self.sampling_rate = None
self.fig_size = (15, 10)
self.spectrogram_params = {} # 存储频谱图计算参数
# 从文件名推断数据类型和采样率
file_name = os.path.basename(file_path).lower()
if 'calib' in file_name:
self.data_type = 'calib'
self.default_sampling_rate = 5
elif 'raw' in file_name:
self.data_type = 'raw'
self.default_sampling_rate = 1000
else:
self.data_type = 'unknown'
self.default_sampling_rate = 5
# 解析文件路径和文件名
file_dir = os.path.dirname(os.path.abspath(file_path))
file_base_name = os.path.splitext(os.path.basename(file_path))[0]
self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# 创建文件名称+时间戳尾缀的输出目录
self.output_dir = os.path.join(file_dir, f"{file_base_name}_output_{self.timestamp}")
# 使用锁确保目录创建的线程安全
with file_lock:
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
self.log_progress(f"创建输出目录:{self.output_dir}", "INFO")
# 字体设置
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'Arial']
plt.rcParams['axes.unicode_minus'] = False
# 设置matplotlib兼容性选项避免布局引擎冲突
plt.rcParams['figure.constrained_layout.use'] = False
plt.rcParams['figure.constrained_layout.h_pad'] = 0.02
plt.rcParams['figure.constrained_layout.w_pad'] = 0.02
plt.rcParams['figure.constrained_layout.hspace'] = 0.02
plt.rcParams['figure.constrained_layout.wspace'] = 0.02
self.log_progress(f"处理文件:{self.file_path}", "INFO")
self.log_progress(f"数据类型:{self.data_type}", "INFO")
self.log_progress(f"输出路径:{self.output_dir}", "INFO")
def log_progress(self, message, level="INFO"):
"""带颜色和级别的日志输出(线程安全)"""
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
with log_lock:
if level == "INFO":
print(f"{Fore.CYAN}[{timestamp}] {Fore.GREEN}{message}")
elif level == "WARNING":
print(f"{Fore.CYAN}[{timestamp}] {Fore.YELLOW}警告: {message}")
elif level == "ERROR":
print(f"{Fore.CYAN}[{timestamp}] {Fore.RED}错误: {message}")
elif level == "SUCCESS":
print(f"{Fore.CYAN}[{timestamp}] {Fore.GREEN}{message}")
else:
print(f"{Fore.CYAN}[{timestamp}] {message}")
def check_imu_columns_in_file(self):
"""检查文件是否包含IMU数据列通过读取文件头"""
try:
# 只读取第一行来检查列名
with open(self.file_path, 'r', encoding='utf-8', errors='ignore') as f:
first_line = f.readline().strip()
# 检查第一行是否包含imu关键词不区分大小写
if re.search(r'imu', first_line, re.IGNORECASE):
return True
else:
self.log_progress(f"文件头部不包含'imu'关键词跳过处理first_line {first_line}", "WARNING")
return False
except Exception as e:
self.log_progress(f"检查文件头部时出错: {str(e)}", "ERROR")
return False
def detect_imu_columns(self):
"""自动检测IMU数据列"""
all_columns = self.data.columns.tolist()
# 查找imu前缀如imu1, imu2等
imu_prefixes = set()
for col in all_columns:
match = re.match(r'^(imu\d+)_', col.lower())
if match:
imu_prefixes.add(match.group(1))
if not imu_prefixes:
self.log_progress("未检测到IMU数据列尝试使用默认列名", "WARNING")
# 尝试使用常见列名
self.acc_columns = ['imu1_acc_x', 'imu1_acc_y', 'imu1_acc_z']
self.gyro_columns = ['imu1_gyro_x', 'imu1_gyro_y', 'imu1_gyro_z']
self.temp_columns = ['imu1_temp']
return
# 使用第一个检测到的IMU前缀
imu_prefix = list(imu_prefixes)[0]
self.log_progress(f"检测到IMU前缀: {imu_prefix}", "INFO")
# 查找加速度计列
self.acc_columns = [col for col in all_columns
if col.lower().startswith(f"{imu_prefix}_acc") and
any(axis in col.lower() for axis in ['_x', '_y', '_z'])]
# 查找陀螺仪列
self.gyro_columns = [col for col in all_columns
if col.lower().startswith(f"{imu_prefix}_gyro") and
any(axis in col.lower() for axis in ['_x', '_y', '_z'])]
# 查找温度列
self.temp_columns = [col for col in all_columns
if col.lower().startswith(f"{imu_prefix}_temp")]
# 如果没有找到温度列,尝试其他常见名称
if not self.temp_columns:
self.temp_columns = [col for col in all_columns
if any(name in col.lower() for name in ['temp', 'temperature'])]
self.log_progress(f"加速度计列: {self.acc_columns}", "INFO")
self.log_progress(f"陀螺仪列: {self.gyro_columns}", "INFO")
self.log_progress(f"温度列: {self.temp_columns}", "INFO")
def estimate_sampling_rate(self):
"""估计实际采样率"""
if 'time' in self.data.columns and len(self.data) > 10:
time_diff = np.diff(self.data['time'].values)
valid_diffs = time_diff[(time_diff > 0) & (time_diff < 10)] # 排除异常值
if len(valid_diffs) > 0:
estimated_rate = 1.0 / np.median(valid_diffs)
self.log_progress(f"根据时间戳估计的采样率: {estimated_rate:.2f} Hz")
return estimated_rate
# 如果没有时间列或无法估计,使用基于文件名的默认值
self.log_progress(f"使用基于文件名的默认采样率: {self.default_sampling_rate} Hz")
return self.default_sampling_rate
def load_data(self):
"""加载并预处理数据"""
self.log_progress("开始加载数据...")
start_time = time.time()
# 首先检查文件是否包含IMU数据
if not self.check_imu_columns_in_file():
raise ValueError("文件不包含IMU数据列跳过处理")
# 使用锁确保文件读取的线程安全
with file_lock:
self.data = pd.read_csv(self.file_path)
self.log_progress(f"数据加载完成,共 {len(self.data)} 行,耗时 {time.time() - start_time:.2f}")
# 检测IMU数据列
self.detect_imu_columns()
# 估计采样率
self.sampling_rate = self.estimate_sampling_rate()
# 创建时间序列并处理异常时间值
if 'time' in self.data.columns:
valid_time_mask = (self.data['time'] > 0) & (self.data['time'] < 1e6)
self.data = self.data[valid_time_mask].copy()
self.data['time'] = np.arange(len(self.data)) / self.sampling_rate
else:
# 如果没有时间列,创建基于采样率的时间序列
self.data['time'] = np.arange(len(self.data)) / self.sampling_rate
def remove_dc(self, signal_data):
"""不移除直流分量(保留以在频谱中显示 DC"""
return signal_data
def compute_spectrogram(self, signal_data):
"""计算频谱图(保留直流分量),优化频谱分辨率和减少颗粒感"""
# 保留直流分量
signal_data = self.remove_dc(signal_data)
# 数据长度
n_samples = len(signal_data)
# 根据采样率和数据长度自适应选择参数
if self.sampling_rate <= 10: # 低采样率5Hz
# 对于低采样率,使用较长的窗口以获得更好的频率分辨率
nperseg = min(256, max(64, n_samples // 2))
noverlap = int(nperseg * 0.75) # 增加重叠比例
else: # 高采样率1000Hz
# 对于高采样率,平衡时间分辨率和频率分辨率
if n_samples < 10000: # 较短的数据
nperseg = min(512, max(256, n_samples // 4))
else: # 较长的数据
nperseg = min(1024, max(512, n_samples // 8))
noverlap = int(nperseg * 0.66) # 适中的重叠比例
# 确保窗口大小合理
nperseg = max(16, min(nperseg, n_samples))
noverlap = min(noverlap, nperseg - 1)
# 记录频谱图计算参数
self.spectrogram_params = {
"nperseg": nperseg,
"noverlap": noverlap,
"window": "hamming",
"detrend": False,
"scaling": "density",
"mode": "psd"
}
# 使用更平滑的窗口函数
f, t, Sxx = signal.spectrogram(
signal_data,
fs=self.sampling_rate,
window='hamming', # 使用汉明窗,比汉宁窗更平滑
nperseg=nperseg,
noverlap=noverlap,
scaling='density',
detrend=False, # 保留直流
mode='psd'
)
# 应用平滑处理以减少颗粒感
if Sxx.size > 0:
# 使用小范围的高斯滤波平滑(可选)
from scipy.ndimage import gaussian_filter
Sxx_smoothed = gaussian_filter(Sxx, sigma=0.7)
return f, t, Sxx_smoothed
return f, t, Sxx
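# 参数选择的量化依据(示意):频率分辨率约为 Δf = fs / nperseg。
# fs=1000 Hz、nperseg=1024 时 Δf ≈ 0.98 Hzfs=5 Hz、nperseg=256 时 Δf ≈ 0.02 Hz。
# 相邻窗的时间步长为 (nperseg - noverlap) / fs重叠比例越大时间轴越平滑但计算量越高。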
def process_signal(self, args):
"""并行处理单个信号"""
signal_data, axis = args
f, t, Sxx = self.compute_spectrogram(signal_data)
# 防止 log10(0)
eps = np.finfo(float).eps
Sxx_log = 10 * np.log10(Sxx + eps)
# 降采样以加速绘图
if len(t) > 1000: # 如果时间点太多,进行降采样
time_indices = np.linspace(0, len(t) - 1, 1000, dtype=int)
freq_indices = np.linspace(0, len(f) - 1, 500, dtype=int)
t = t[time_indices]
f = f[freq_indices]
Sxx_log = Sxx_log[freq_indices, :][:, time_indices]
# 更健壮的 0 Hz 索引选择(优先精确匹配,否则取最接近 0 的频点)
zero_idx = np.where(np.isclose(f, 0.0))[0]
dc_idx = int(zero_idx[0]) if len(zero_idx) > 0 else int(np.argmin(np.abs(f - 0.0)))
dc_log = Sxx_log[dc_idx, :]  # 每个时间窗的 0 Hz PSDdB形状为 (len(t),)
return {
'f': f,
't': t,
'Sxx_log': Sxx_log,
'dc_log': dc_log,
'axis': axis
}
@staticmethod
def robust_dc_ylim(results, p_low=5, p_high=95, pad_ratio=0.05, fallback=(0.0, 1.0)):
"""
计算统一 DC 纵轴范围(分位数 + 少许边距),并过滤 inf/NaN
"""
if not results:
return fallback
dc_all = np.concatenate([r['dc_log'].ravel() for r in results])
dc_all = dc_all[np.isfinite(dc_all)]
if dc_all.size == 0:
return fallback
lo, hi = np.percentile(dc_all, [p_low, p_high])
span = max(1e-9, hi - lo)
lo -= span * pad_ratio
hi += span * pad_ratio
return lo, hi
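# 用法示意results 为 process_signal 返回的字典列表):
#   ymin, ymax = IMUDataAnalyzer.robust_dc_ylim(results)
#   ax2.set_ylim(ymin, ymax)
# 取 5%~95% 分位并外扩 5% 边距,避免少数尖峰或 -inf 把 DC 曲线压扁。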
def get_time_domain_stats(self):
"""计算时域信号的统计信息"""
stats = {}
if self.acc_columns:
stats['加速度计'] = {col: {
'均值': self.data[col].mean(),
'标准差': self.data[col].std(),
'最大值': self.data[col].max(),
'最小值': self.data[col].min()
} for col in self.acc_columns}
if self.gyro_columns:
stats['陀螺仪'] = {col: {
'均值': self.data[col].mean(),
'标准差': self.data[col].std(),
'最大值': self.data[col].max(),
'最小值': self.data[col].min()
} for col in self.gyro_columns}
if self.temp_columns:
stats['温度'] = {col: {
'均值': self.data[col].mean(),
'标准差': self.data[col].std(),
'最大值': self.data[col].max(),
'最小值': self.data[col].min()
} for col in self.temp_columns}
return stats
def generate_html_report(self, time_domain_stats):
"""生成HTML报告"""
html_content = f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>IMU数据分析报告 - {os.path.basename(self.file_path)}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
h1, h2, h3 {{ color: #333; }}
table {{ border-collapse: collapse; width: 100%; margin-bottom: 20px; }}
th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
th {{ background-color: #f2f2f2; }}
img {{ max-width: 100%; height: auto; display: block; margin: 10px 0; }}
</style>
</head>
<body>
<h1>IMU数据分析报告</h1>
<p><strong>文件路径:</strong> {self.file_path}</p>
<p><strong>分析时间:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<p><strong>采样率:</strong> {self.sampling_rate} Hz</p>
<h2>时域信号统计信息</h2>
"""
# 添加时域统计信息
for sensor_type, sensors in time_domain_stats.items():
html_content += f"<h3>{sensor_type}</h3>"
html_content += "<table>"
html_content += "<tr><th>传感器</th><th>均值</th><th>标准差</th><th>最大值</th><th>最小值</th></tr>"
for col, stats in sensors.items():
html_content += f"<tr><td>{col}</td><td>{stats['均值']:.4f}</td><td>{stats['标准差']:.4f}</td><td>{stats['最大值']:.4f}</td><td>{stats['最小值']:.4f}</td></tr>"
html_content += "</table>"
# 添加频域参数信息
html_content += """
<h2>频域信号计算参数</h2>
<table>
<tr><th>参数</th><th>值</th></tr>
"""
for key, value in self.spectrogram_params.items():
html_content += f"<tr><td>{key}</td><td>{value}</td></tr>"
html_content += "</table>"
# 添加图像链接
time_series_image = f'time_series_{self.timestamp}.png'
acc_spectrogram_image = f'acc_rainfall_spectrogram_{self.timestamp}.png'
gyro_spectrogram_image = f'gyro_rainfall_spectrogram_{self.timestamp}.png'
html_content += f"""
<h2>时域信号图</h2>
<img src="{time_series_image}" alt="时域信号图">
<h2>加速度计频谱雨点图</h2>
<img src="{acc_spectrogram_image}" alt="加速度计频谱雨点图">
<h2>陀螺仪频谱雨点图</h2>
<img src="{gyro_spectrogram_image}" alt="陀螺仪频谱雨点图">
"""
html_content += """
</body>
</html>
"""
# 保存HTML报告
report_path = os.path.join(self.output_dir, f'report_{self.timestamp}.html')
with open(report_path, 'w', encoding='utf-8') as f:
f.write(html_content)
self.log_progress(f"HTML报告已生成: {report_path}")
def plot_time_series(self):
"""绘制时间序列图"""
self.log_progress("开始绘制时间序列图...")
start_time = time.time()
# 确定子图数量
n_plots = 1 # 至少有一个加速度图
if self.gyro_columns: # 如果有陀螺仪数据
n_plots += 1
if self.temp_columns: # 如果有温度数据
n_plots += 1
fig, axes = plt.subplots(n_plots, 1, figsize=(12, 3 * n_plots), dpi=120)
if n_plots == 1:
axes = [axes] # 确保axes是列表
plot_idx = 0
# 加速度计数据
if self.acc_columns:
ax = axes[plot_idx]
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
labels = ['X', 'Y', 'Z']
for i, col in enumerate(self.acc_columns):
if i < 3: # 只绘制前三个轴
ax.plot(self.data['time'], self.data[col],
label=labels[i], color=colors[i], linewidth=1.0, alpha=0.8)
ax.set_title('加速度时间序列', fontsize=12)
ax.set_ylabel('加速度 (g)', fontsize=10)
ax.legend(loc='upper right', fontsize=8, framealpha=0.5)
ax.grid(True, linestyle=':', alpha=0.5)
ax.set_xlim(0, self.data['time'].max())
plot_idx += 1
# 陀螺仪数据(如果有)
if self.gyro_columns and plot_idx < n_plots:
ax = axes[plot_idx]
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
labels = ['X', 'Y', 'Z']
for i, col in enumerate(self.gyro_columns):
if i < 3: # 只绘制前三个轴
ax.plot(self.data['time'], self.data[col],
label=labels[i], color=colors[i], linewidth=1.0, alpha=0.8)
ax.set_title('陀螺仪时间序列', fontsize=12)
ax.set_ylabel('角速度 (deg/s)', fontsize=10)
ax.legend(loc='upper left', fontsize=8, framealpha=0.5)
ax.grid(True, linestyle=':', alpha=0.5)
ax.set_xlim(0, self.data['time'].max())
plot_idx += 1
# 温度数据(如果有)
if self.temp_columns and plot_idx < n_plots:
ax = axes[plot_idx]
ax.plot(self.data['time'], self.data[self.temp_columns[0]],
label='温度', color='#9467bd', linewidth=1.0, alpha=0.8)
ax.set_title('温度时间序列', fontsize=12)
ax.set_xlabel('时间 (s)', fontsize=10)
ax.set_ylabel('温度 (°C)', fontsize=10)
ax.legend(loc='upper right', fontsize=8, framealpha=0.5)
ax.grid(True, linestyle=':', alpha=0.5)
ax.set_xlim(0, self.data['time'].max())
plt.tight_layout()
output_path = os.path.join(self.output_dir, f'time_series_{self.timestamp}.png')
plt.savefig(output_path, bbox_inches='tight', dpi=150)
plt.close(fig)
self.log_progress(f"时间序列图已保存: {output_path}")
self.log_progress(f"时间序列图已保存为 {output_path},耗时 {time.time() - start_time:.2f}")
def plot_rainfall_spectrograms(self):
"""并行绘制所有频谱雨点图修复colorbar布局问题"""
self.log_progress("开始并行绘制频谱雨点图...")
start_time = time.time()
# 准备加速度计数据
self.log_progress("准备加速度计数据...")
acc_signals = [(self.data[col], f'Acc {["X", "Y", "Z"][i]}')
for i, col in enumerate(self.acc_columns) if i < 3] # 只处理前三个轴
# 准备陀螺仪数据(如果有)
gyro_signals = []
if self.gyro_columns:
self.log_progress("准备陀螺仪数据...")
gyro_signals = [(self.data[col], f'Gyro {["X", "Y", "Z"][i]}')
for i, col in enumerate(self.gyro_columns) if i < 3] # 只处理前三个轴
# 如果没有数据可处理,直接返回
if not acc_signals and not gyro_signals:
self.log_progress("警告: 没有有效的数据列可供处理", "WARNING")
return
# 使用多进程处理信号(避免线程冲突)
self.log_progress("使用多进程并行处理...")
all_signals = acc_signals + gyro_signals
with Pool(processes=min(len(all_signals), cpu_count())) as pool:
results = pool.map(self.process_signal, all_signals)
# 分离结果
self.log_progress("分离结果...")
acc_results = [r for r in results if r['axis'].startswith('Acc')]
gyro_results = [r for r in results if r['axis'].startswith('Gyro')]
# 统一颜色标尺5%-95%分位)
if acc_results:
self.log_progress("计算加速度计全局最小和最大值...")
acc_all_Sxx = np.concatenate([r['Sxx_log'].ravel() for r in acc_results])
acc_vmin, acc_vmax = np.percentile(acc_all_Sxx, [5, 95])
# 统一 DC Y 轴范围
acc_dc_ymin, acc_dc_ymax = self.robust_dc_ylim(acc_results)
self.log_progress(f"加速度 DC (dB) 范围: {acc_dc_ymin:.1f}{acc_dc_ymax:.1f}")
if gyro_results:
self.log_progress("计算陀螺仪全局最小和最大值...")
gyro_all_Sxx = np.concatenate([r['Sxx_log'].ravel() for r in gyro_results])
gyro_vmin, gyro_vmax = np.percentile(gyro_all_Sxx, [5, 95])
# 统一 DC Y 轴范围
gyro_dc_ymin, gyro_dc_ymax = self.robust_dc_ylim(gyro_results)
self.log_progress(f"陀螺仪 DC (dB) 范围: {gyro_dc_ymin:.1f}{gyro_dc_ymax:.1f}")
# ========= 绘制加速度计频谱雨点图 =========
if acc_results:
self._plot_single_spectrogram(acc_results, acc_vmin, acc_vmax, acc_dc_ymin, acc_dc_ymax,
'加速度', 'acc_rainfall_spectrogram')
self.log_progress(f"加速度功率谱密度范围: {acc_vmin:.1f} dB 到 {acc_vmax:.1f} dB")
# ========= 绘制陀螺仪频谱雨点图 =========
if gyro_results:
self._plot_single_spectrogram(gyro_results, gyro_vmin, gyro_vmax, gyro_dc_ymin, gyro_dc_ymax,
'角速度', 'gyro_rainfall_spectrogram')
self.log_progress(f"陀螺仪功率谱密度范围: {gyro_vmin:.1f} dB 到 {gyro_vmax:.1f} dB")
total_time = time.time() - start_time
self.log_progress(f"频谱雨点图生成完成,总耗时 {total_time:.2f}")
def _plot_single_spectrogram(self, results, vmin, vmax, dc_ymin, dc_ymax, title_prefix, filename_prefix):
"""绘制单个频谱雨点图"""
rows = len(results)
fig = plt.figure(constrained_layout=True, figsize=(14, 4 * rows), dpi=150)
gs = fig.add_gridspec(nrows=rows, ncols=2, width_ratios=[22, 1], wspace=0.05, hspace=0.12)
axes_main = []
axes_cbar = []
for i in range(rows):
axes_main.append(fig.add_subplot(gs[i, 0]))
axes_cbar.append(fig.add_subplot(gs[i, 1]))
for i, result in enumerate(results):
ax = axes_main[i]
cax = axes_cbar[i]
sc = ax.scatter(
np.repeat(result['t'], len(result['f'])),
np.tile(result['f'], len(result['t'])),
c=result['Sxx_log'].T.ravel(),
cmap='jet',
s=3,
alpha=0.7,
vmin=vmin,
vmax=vmax,
rasterized=True
)
ax.set_title(f'{title_prefix}频谱雨点图 - {result["axis"][-1]}(右侧为DC分量dB', fontsize=10)
ax.set_xlabel('时间 (s)', fontsize=9)
ax.set_ylabel('频率 (Hz)', fontsize=9)
ax.set_ylim(0, self.sampling_rate / 2)
ax.grid(True, linestyle=':', alpha=0.4)
ax2 = ax.twinx()
ax2.plot(result['t'], result['dc_log'], color='black', linewidth=1.2, alpha=0.85, label='DC (dB)')
ax2.set_ylabel('直流分量 (dB)', fontsize=9, color='black')
ax2.set_ylim(dc_ymin, dc_ymax)
ax2.tick_params(axis='y', labelcolor='black')
ax2.yaxis.set_major_locator(MaxNLocator(nbins=6))
ax2.grid(False)
ax2.legend(loc='upper right', fontsize=8, framealpha=0.5)
cbar = fig.colorbar(sc, cax=cax)
cbar.set_label('功率谱密度 (dB)', fontsize=9)
cax.tick_params(labelsize=8)
output_path = os.path.join(self.output_dir, f'{filename_prefix}_{self.timestamp}.png')
plt.savefig(output_path, bbox_inches='tight', dpi=150)
plt.close(fig)
self.log_progress(f"{title_prefix}频谱雨点图已保存为 {output_path}")
def run_analysis(self):
"""运行完整分析流程"""
try:
self.log_progress("开始数据分析流程", "INFO")
start_time = time.time()
self.load_data()
self.plot_time_series()
self.plot_rainfall_spectrograms()
# 计算时域统计信息
time_domain_stats = self.get_time_domain_stats()
# 生成HTML报告
self.generate_html_report(time_domain_stats)
total_time = time.time() - start_time
self.log_progress(f"分析完成,总耗时 {total_time:.2f}", "SUCCESS")
self.log_progress(f"所有输出文件已保存到: {self.output_dir}", "INFO")
return True
except ValueError as e:
# 跳过不包含IMU数据的文件
self.log_progress(f"跳过文件: {str(e)}", "WARNING")
return False
except Exception as e:
self.log_progress(f"分析过程中出现错误: {str(e)}", "ERROR")
import traceback
traceback.print_exc()
return False
def process_single_file(file_path):
"""处理单个文件的函数(使用进程隔离)"""
try:
print(f"{Fore.BLUE}开始处理文件: {os.path.basename(file_path)}")
analyzer = IMUDataAnalyzer(file_path)
success = analyzer.run_analysis()
if success:
return (file_path, True, "处理成功")
else:
return (file_path, False, "文件不包含IMU数据已跳过")
except Exception as e:
return (file_path, False, str(e))
def main():
"""主函数,支持多文件处理和进度显示"""
print("=" * 60)
print(f"{Fore.CYAN}IMU数据频谱分析工具 - 多文件批量处理")
print("=" * 60)
# 获取输入路径
print(f"{Fore.WHITE}请输入包含CSV文件的目录路径: ")
input_path = input("> ").strip()
if not os.path.exists(input_path):
print(f"{Fore.RED}错误: 路径 '{input_path}' 不存在!")
return
# 查找所有包含imu的CSV文件不区分大小写
if os.path.isdir(input_path):
# 使用单个glob模式匹配所有文件然后过滤包含imu的文件
all_csv_files = glob.glob(os.path.join(input_path, "**", "*.csv"), recursive=True)
csv_files = [f for f in all_csv_files if re.search(r'imu', f, re.IGNORECASE)]
csv_files = list(set(csv_files)) # 去重
csv_files.sort()
else:
# 对于单个文件检查是否包含imu不区分大小写
if re.search(r'imu', input_path, re.IGNORECASE):
csv_files = [input_path]
else:
csv_files = []
if not csv_files:
print(f"{Fore.YELLOW}警告: 未找到包含'imu'的CSV文件")
return
print(f"{Fore.GREEN}找到 {len(csv_files)} 个IMU数据文件:")
for i, file in enumerate(csv_files, 1):
print(f" {i}. {os.path.basename(file)}")
# 使用多进程处理文件避免matplotlib线程冲突
print(f"\n{Fore.CYAN}开始多线程处理文件 (使用 {min(len(csv_files), cpu_count())} 个线程)...")
success_count = 0
skipped_count = 0
failed_files = []
# 使用ProcessPoolExecutor而不是ThreadPoolExecutor
with ProcessPoolExecutor(max_workers=min(len(csv_files), cpu_count())) as executor:
# 提交所有任务
future_to_file = {executor.submit(process_single_file, file): file for file in csv_files}
# 处理完成的任务
for future in as_completed(future_to_file):
file_path = future_to_file[future]
try:
result = future.result()
file_path, success, message = result
if success:
print(f"{Fore.GREEN}✓ 完成: {os.path.basename(file_path)}")
success_count += 1
else:
if "跳过" in message:
print(f"{Fore.YELLOW}↷ 跳过: {os.path.basename(file_path)} - {message}")
skipped_count += 1
else:
print(f"{Fore.RED}✗ 失败: {os.path.basename(file_path)} - {message}")
failed_files.append((file_path, message))
except Exception as e:
print(f"{Fore.RED}✗ 异常: {os.path.basename(file_path)} - {str(e)}")
failed_files.append((file_path, str(e)))
# 输出统计信息
print(f"\n{Fore.CYAN}处理完成统计:")
print(f"{Fore.GREEN}成功: {success_count} 个文件")
print(f"{Fore.YELLOW}跳过: {skipped_count} 个文件不包含IMU数据")
print(f"{Fore.RED}失败: {len(failed_files)} 个文件")
if failed_files:
print(f"\n{Fore.YELLOW}失败文件详情:")
for file, error in failed_files:
print(f" {os.path.basename(file)}: {error}")
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print(f"\n{Fore.YELLOW}用户中断程序执行")
except Exception as e:
print(f"{Fore.RED}程序运行出错: {str(e)}")
import traceback
traceback.print_exc()

648
FFT_IMU/FFT_IMU_dc_v1.py Normal file
View File

@@ -0,0 +1,648 @@
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from scipy import signal
import os
import glob
from datetime import datetime
import time
from multiprocessing import Pool, cpu_count
from matplotlib.colors import Normalize
from matplotlib.ticker import MaxNLocator
import re
from colorama import Fore, Style, init
from concurrent.futures import ProcessPoolExecutor, as_completed
import warnings
import threading
# 初始化colorama
init(autoreset=True)
# 忽略特定的matplotlib警告
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
warnings.filterwarnings("ignore", category=FutureWarning, module="matplotlib")
# 创建线程锁,确保文件操作和日志输出的线程安全
file_lock = threading.Lock()
log_lock = threading.Lock()
class IMUDataAnalyzer:
def __init__(self, file_path):
self.file_path = file_path
self.data = None
self.sampling_rate = None
self.fig_size = (15, 10)
# 从文件名推断数据类型和采样率
file_name = os.path.basename(file_path).lower()
if 'calib' in file_name:
self.data_type = 'calib'
self.default_sampling_rate = 5
elif 'raw' in file_name:
self.data_type = 'raw'
self.default_sampling_rate = 1000
else:
self.data_type = 'unknown'
self.default_sampling_rate = 5
# 解析文件路径和文件名
file_dir = os.path.dirname(os.path.abspath(file_path))
file_base_name = os.path.splitext(os.path.basename(file_path))[0]
self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# 创建文件名称+时间戳尾缀的输出目录
self.output_dir = os.path.join(file_dir, f"{file_base_name}_output_{self.timestamp}")
# 使用锁确保目录创建的线程安全
with file_lock:
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
self.log_progress(f"创建输出目录:{self.output_dir}", "INFO")
# 字体设置
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'Arial']
plt.rcParams['axes.unicode_minus'] = False
# 设置matplotlib兼容性选项避免布局引擎冲突
plt.rcParams['figure.constrained_layout.use'] = False
plt.rcParams['figure.constrained_layout.h_pad'] = 0.02
plt.rcParams['figure.constrained_layout.w_pad'] = 0.02
plt.rcParams['figure.constrained_layout.hspace'] = 0.02
plt.rcParams['figure.constrained_layout.wspace'] = 0.02
self.log_progress(f"处理文件:{self.file_path}", "INFO")
self.log_progress(f"数据类型:{self.data_type}", "INFO")
self.log_progress(f"输出路径:{self.output_dir}", "INFO")
def log_progress(self, message, level="INFO"):
"""带颜色和级别的日志输出(线程安全)"""
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
with log_lock:
if level == "INFO":
print(f"{Fore.CYAN}[{timestamp}] {Fore.GREEN}{message}")
elif level == "WARNING":
print(f"{Fore.CYAN}[{timestamp}] {Fore.YELLOW}警告: {message}")
elif level == "ERROR":
print(f"{Fore.CYAN}[{timestamp}] {Fore.RED}错误: {message}")
elif level == "SUCCESS":
print(f"{Fore.CYAN}[{timestamp}] {Fore.GREEN}{message}")
else:
print(f"{Fore.CYAN}[{timestamp}] {message}")
def check_imu_columns_in_file(self):
"""检查文件是否包含IMU数据列通过读取文件头"""
try:
# 只读取第一行来检查列名
with open(self.file_path, 'r', encoding='utf-8', errors='ignore') as f:
first_line = f.readline().strip()
# 检查第一行是否包含imu关键词不区分大小写
if re.search(r'imu', first_line, re.IGNORECASE):
return True
else:
self.log_progress(f"文件头部不包含'imu'关键词跳过处理first_line {first_line}", "WARNING")
return False
except Exception as e:
self.log_progress(f"检查文件头部时出错: {str(e)}", "ERROR")
return False
def detect_imu_columns(self):
"""自动检测IMU数据列"""
all_columns = self.data.columns.tolist()
# 查找imu前缀如imu1, imu2等
imu_prefixes = set()
for col in all_columns:
match = re.match(r'^(imu\d+)_', col.lower())
if match:
imu_prefixes.add(match.group(1))
if not imu_prefixes:
self.log_progress("未检测到IMU数据列尝试使用默认列名", "WARNING")
# 尝试使用常见列名
self.acc_columns = ['imu1_acc_x', 'imu1_acc_y', 'imu1_acc_z']
self.gyro_columns = ['imu1_gyro_x', 'imu1_gyro_y', 'imu1_gyro_z']
self.temp_columns = ['imu1_temp']
return
# 使用第一个检测到的IMU前缀
imu_prefix = list(imu_prefixes)[0]
self.log_progress(f"检测到IMU前缀: {imu_prefix}", "INFO")
# 查找加速度计列
self.acc_columns = [col for col in all_columns
if col.lower().startswith(f"{imu_prefix}_acc") and
any(axis in col.lower() for axis in ['_x', '_y', '_z'])]
# 查找陀螺仪列
self.gyro_columns = [col for col in all_columns
if col.lower().startswith(f"{imu_prefix}_gyro") and
any(axis in col.lower() for axis in ['_x', '_y', '_z'])]
# 查找温度列
self.temp_columns = [col for col in all_columns
if col.lower().startswith(f"{imu_prefix}_temp")]
# 如果没有找到温度列,尝试其他常见名称
if not self.temp_columns:
self.temp_columns = [col for col in all_columns
if any(name in col.lower() for name in ['temp', 'temperature'])]
self.log_progress(f"加速度计列: {self.acc_columns}", "INFO")
self.log_progress(f"陀螺仪列: {self.gyro_columns}", "INFO")
self.log_progress(f"温度列: {self.temp_columns}", "INFO")
def estimate_sampling_rate(self):
"""估计实际采样率"""
if 'time' in self.data.columns and len(self.data) > 10:
time_diff = np.diff(self.data['time'].values)
valid_diffs = time_diff[(time_diff > 0) & (time_diff < 10)] # 排除异常值
if len(valid_diffs) > 0:
estimated_rate = 1.0 / np.median(valid_diffs)
self.log_progress(f"根据时间戳估计的采样率: {estimated_rate:.2f} Hz")
return estimated_rate
# 如果没有时间列或无法估计,使用基于文件名的默认值
self.log_progress(f"使用基于文件名的默认采样率: {self.default_sampling_rate} Hz")
return self.default_sampling_rate
def load_data(self):
"""加载并预处理数据"""
self.log_progress("开始加载数据...")
start_time = time.time()
# 首先检查文件是否包含IMU数据
if not self.check_imu_columns_in_file():
raise ValueError("文件不包含IMU数据列跳过处理")
# 使用锁确保文件读取的线程安全
with file_lock:
self.data = pd.read_csv(self.file_path)
self.log_progress(f"数据加载完成,共 {len(self.data)} 行,耗时 {time.time() - start_time:.2f}")
# 检测IMU数据列
self.detect_imu_columns()
# 估计采样率
self.sampling_rate = self.estimate_sampling_rate()
# 创建时间序列并处理异常时间值
if 'time' in self.data.columns:
valid_time_mask = (self.data['time'] > 0) & (self.data['time'] < 1e6)
self.data = self.data[valid_time_mask].copy()
self.data['time'] = np.arange(len(self.data)) / self.sampling_rate
else:
# 如果没有时间列,创建基于采样率的时间序列
self.data['time'] = np.arange(len(self.data)) / self.sampling_rate
def remove_dc(self, signal_data):
"""不移除直流分量(保留以在频谱中显示 DC"""
return signal_data
# def compute_spectrogram(self, signal_data):
# """计算频谱图(保留直流分量)"""
# # 保留直流分量
# signal_data = self.remove_dc(signal_data)
#
# # 自适应窗口大小 - 根据采样率调整
# if self.sampling_rate <= 10: # 低采样率
# nperseg = min(64, max(16, len(signal_data) // 4))
# else: # 高采样率
# nperseg = min(1024, max(64, len(signal_data) // 8))
#
# noverlap = nperseg // 2
#
# f, t, Sxx = signal.spectrogram(
# signal_data,
# fs=self.sampling_rate,
# window='hann',
# nperseg=nperseg,
# noverlap=noverlap,
# scaling='density',
# detrend=False, # 保留直流
# mode='psd' # 更高效的模式
# )
# return f, t, Sxx
def compute_spectrogram(self, signal_data):
"""计算频谱图(保留直流分量),优化频谱分辨率和减少颗粒感"""
# 保留直流分量
signal_data = self.remove_dc(signal_data)
# 数据长度
n_samples = len(signal_data)
# 根据采样率和数据长度自适应选择参数
if self.sampling_rate <= 10: # 低采样率5Hz
# 对于低采样率,使用较长的窗口以获得更好的频率分辨率
nperseg = min(256, max(64, n_samples // 2))
noverlap = int(nperseg * 0.75) # 增加重叠比例
else: # 高采样率1000Hz
# 对于高采样率,平衡时间分辨率和频率分辨率
if n_samples < 10000: # 较短的数据
nperseg = min(512, max(256, n_samples // 4))
else: # 较长的数据
nperseg = min(1024, max(512, n_samples // 8))
noverlap = int(nperseg * 0.66) # 适中的重叠比例
# 确保窗口大小合理
nperseg = max(16, min(nperseg, n_samples))
noverlap = min(noverlap, nperseg - 1)
# 使用更平滑的窗口函数
f, t, Sxx = signal.spectrogram(
signal_data,
fs=self.sampling_rate,
window='hamming', # 使用汉明窗,比汉宁窗更平滑
nperseg=nperseg,
noverlap=noverlap,
scaling='density',
# detrend='linear', # 使用线性去趋势,减少低频干扰
detrend=False, # 保留直流
mode='psd'
)
# 应用平滑处理以减少颗粒感
if Sxx.size > 0:
# 使用小范围的高斯滤波平滑(可选)
from scipy.ndimage import gaussian_filter
Sxx_smoothed = gaussian_filter(Sxx, sigma=0.7)
return f, t, Sxx_smoothed
return f, t, Sxx
def process_signal(self, args):
"""并行处理单个信号"""
signal_data, axis = args
f, t, Sxx = self.compute_spectrogram(signal_data)
# 防止 log10(0)
eps = np.finfo(float).eps
Sxx_log = 10 * np.log10(Sxx + eps)
# 降采样以加速绘图
if len(t) > 1000: # 如果时间点太多,进行降采样
time_indices = np.linspace(0, len(t) - 1, 1000, dtype=int)
freq_indices = np.linspace(0, len(f) - 1, 500, dtype=int)
t = t[time_indices]
f = f[freq_indices]
Sxx_log = Sxx_log[freq_indices, :][:, time_indices]
# 更健壮的 0 Hz 索引选择(优先精确匹配,否则取最接近 0 的频点)
zero_idx = np.where(np.isclose(f, 0.0))[0]
dc_idx = int(zero_idx[0]) if len(zero_idx) > 0 else int(np.argmin(np.abs(f - 0.0)))
dc_log = Sxx_log[dc_idx, :]  # 每个时间窗的 0 Hz PSDdB形状为 (len(t),)
return {
'f': f,
't': t,
'Sxx_log': Sxx_log,
'dc_log': dc_log,
'axis': axis
}
@staticmethod
def robust_dc_ylim(results, p_low=5, p_high=95, pad_ratio=0.05, fallback=(0.0, 1.0)):
"""
计算统一 DC 纵轴范围(分位数 + 少许边距),并过滤 inf/NaN
"""
if not results:
return fallback
dc_all = np.concatenate([r['dc_log'].ravel() for r in results])
dc_all = dc_all[np.isfinite(dc_all)]
if dc_all.size == 0:
return fallback
lo, hi = np.percentile(dc_all, [p_low, p_high])
span = max(1e-9, hi - lo)
lo -= span * pad_ratio
hi += span * pad_ratio
return lo, hi
def plot_time_series(self):
"""绘制时间序列图"""
self.log_progress("开始绘制时间序列图...")
start_time = time.time()
# 确定子图数量
n_plots = 1 # 至少有一个加速度图
if self.gyro_columns: # 如果有陀螺仪数据
n_plots += 1
if self.temp_columns: # 如果有温度数据
n_plots += 1
fig, axes = plt.subplots(n_plots, 1, figsize=(12, 3 * n_plots), dpi=120)
if n_plots == 1:
axes = [axes] # 确保axes是列表
plot_idx = 0
# 加速度计数据
if self.acc_columns:
ax = axes[plot_idx]
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
labels = ['X', 'Y', 'Z']
for i, col in enumerate(self.acc_columns):
if i < 3: # 只绘制前三个轴
ax.plot(self.data['time'], self.data[col],
label=labels[i], color=colors[i], linewidth=1.0, alpha=0.8)
ax.set_title('加速度时间序列', fontsize=12)
ax.set_ylabel('加速度 (g)', fontsize=10)
ax.legend(loc='upper right', fontsize=8, framealpha=0.5)
ax.grid(True, linestyle=':', alpha=0.5)
ax.set_xlim(0, self.data['time'].max())
plot_idx += 1
# 陀螺仪数据(如果有)
if self.gyro_columns and plot_idx < n_plots:
ax = axes[plot_idx]
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
labels = ['X', 'Y', 'Z']
for i, col in enumerate(self.gyro_columns):
if i < 3: # 只绘制前三个轴
ax.plot(self.data['time'], self.data[col],
label=labels[i], color=colors[i], linewidth=1.0, alpha=0.8)
ax.set_title('陀螺仪时间序列', fontsize=12)
ax.set_ylabel('角速度 (deg/s)', fontsize=10)
ax.legend(loc='upper left', fontsize=8, framealpha=0.5)
ax.grid(True, linestyle=':', alpha=0.5)
ax.set_xlim(0, self.data['time'].max())
plot_idx += 1
# 温度数据(如果有)
if self.temp_columns and plot_idx < n_plots:
ax = axes[plot_idx]
ax.plot(self.data['time'], self.data[self.temp_columns[0]],
label='温度', color='#9467bd', linewidth=1.0, alpha=0.8)
ax.set_title('温度时间序列', fontsize=12)
ax.set_xlabel('时间 (s)', fontsize=10)
ax.set_ylabel('温度 (°C)', fontsize=10)
ax.legend(loc='upper right', fontsize=8, framealpha=0.5)
ax.grid(True, linestyle=':', alpha=0.5)
ax.set_xlim(0, self.data['time'].max())
plt.tight_layout()
output_path = os.path.join(self.output_dir, f'time_series_{self.timestamp}.png')
plt.savefig(output_path, bbox_inches='tight', dpi=150)
plt.close(fig)
self.log_progress(f"时间序列图已保存: {output_path}")
self.log_progress(f"时间序列图已保存为 {output_path},耗时 {time.time() - start_time:.2f}")
def plot_rainfall_spectrograms(self):
"""并行绘制所有频谱雨点图修复colorbar布局问题"""
self.log_progress("开始并行绘制频谱雨点图...")
start_time = time.time()
# 准备加速度计数据
self.log_progress("准备加速度计数据...")
acc_signals = [(self.data[col], f'Acc {["X", "Y", "Z"][i]}')
for i, col in enumerate(self.acc_columns) if i < 3] # 只处理前三个轴
# 准备陀螺仪数据(如果有)
gyro_signals = []
if self.gyro_columns:
self.log_progress("准备陀螺仪数据...")
gyro_signals = [(self.data[col], f'Gyro {["X", "Y", "Z"][i]}')
for i, col in enumerate(self.gyro_columns) if i < 3] # 只处理前三个轴
# 如果没有数据可处理,直接返回
if not acc_signals and not gyro_signals:
self.log_progress("警告: 没有有效的数据列可供处理", "WARNING")
return
# 使用多进程处理信号(避免线程冲突)
self.log_progress("使用多进程并行处理...")
all_signals = acc_signals + gyro_signals
with Pool(processes=min(len(all_signals), cpu_count())) as pool:
results = pool.map(self.process_signal, all_signals)
# 分离结果
self.log_progress("分离结果...")
acc_results = [r for r in results if r['axis'].startswith('Acc')]
gyro_results = [r for r in results if r['axis'].startswith('Gyro')]
# 统一颜色标尺5%-95%分位)
if acc_results:
self.log_progress("计算加速度计全局最小和最大值...")
acc_all_Sxx = np.concatenate([r['Sxx_log'].ravel() for r in acc_results])
acc_vmin, acc_vmax = np.percentile(acc_all_Sxx, [5, 95])
# 统一 DC Y 轴范围
acc_dc_ymin, acc_dc_ymax = self.robust_dc_ylim(acc_results)
self.log_progress(f"加速度 DC (dB) 范围: {acc_dc_ymin:.1f}{acc_dc_ymax:.1f}")
if gyro_results:
self.log_progress("计算陀螺仪全局最小和最大值...")
gyro_all_Sxx = np.concatenate([r['Sxx_log'].ravel() for r in gyro_results])
gyro_vmin, gyro_vmax = np.percentile(gyro_all_Sxx, [5, 95])
# 统一 DC Y 轴范围
gyro_dc_ymin, gyro_dc_ymax = self.robust_dc_ylim(gyro_results)
self.log_progress(f"陀螺仪 DC (dB) 范围: {gyro_dc_ymin:.1f}{gyro_dc_ymax:.1f}")
# ========= 绘制加速度计频谱雨点图 =========
if acc_results:
self._plot_single_spectrogram(acc_results, acc_vmin, acc_vmax, acc_dc_ymin, acc_dc_ymax,
'加速度', 'acc_rainfall_spectrogram')
self.log_progress(f"加速度功率谱密度范围: {acc_vmin:.1f} dB 到 {acc_vmax:.1f} dB")
# ========= 绘制陀螺仪频谱雨点图 =========
if gyro_results:
self._plot_single_spectrogram(gyro_results, gyro_vmin, gyro_vmax, gyro_dc_ymin, gyro_dc_ymax,
'角速度', 'gyro_rainfall_spectrogram')
self.log_progress(f"陀螺仪功率谱密度范围: {gyro_vmin:.1f} dB 到 {gyro_vmax:.1f} dB")
total_time = time.time() - start_time
self.log_progress(f"频谱雨点图生成完成,总耗时 {total_time:.2f}")
def _plot_single_spectrogram(self, results, vmin, vmax, dc_ymin, dc_ymax, title_prefix, filename_prefix):
"""绘制单个频谱雨点图"""
rows = len(results)
fig = plt.figure(constrained_layout=True, figsize=(14, 4 * rows), dpi=150)
gs = fig.add_gridspec(nrows=rows, ncols=2, width_ratios=[22, 1], wspace=0.05, hspace=0.12)
axes_main = []
axes_cbar = []
for i in range(rows):
axes_main.append(fig.add_subplot(gs[i, 0]))
axes_cbar.append(fig.add_subplot(gs[i, 1]))
for i, result in enumerate(results):
ax = axes_main[i]
cax = axes_cbar[i]
sc = ax.scatter(
np.repeat(result['t'], len(result['f'])),
np.tile(result['f'], len(result['t'])),
c=result['Sxx_log'].T.ravel(),
cmap='jet',
s=3,
alpha=0.7,
vmin=vmin,
vmax=vmax,
rasterized=True
)
ax.set_title(f'{title_prefix}频谱雨点图 - {result["axis"][-1]}(右侧为DC分量dB', fontsize=10)
ax.set_xlabel('时间 (s)', fontsize=9)
ax.set_ylabel('频率 (Hz)', fontsize=9)
ax.set_ylim(0, self.sampling_rate / 2)
ax.grid(True, linestyle=':', alpha=0.4)
ax2 = ax.twinx()
ax2.plot(result['t'], result['dc_log'], color='black', linewidth=1.2, alpha=0.85, label='DC (dB)')
ax2.set_ylabel('直流分量 (dB)', fontsize=9, color='black')
ax2.set_ylim(dc_ymin, dc_ymax)
ax2.tick_params(axis='y', labelcolor='black')
ax2.yaxis.set_major_locator(MaxNLocator(nbins=6))
ax2.grid(False)
ax2.legend(loc='upper right', fontsize=8, framealpha=0.5)
cbar = fig.colorbar(sc, cax=cax)
cbar.set_label('功率谱密度 (dB)', fontsize=9)
cax.tick_params(labelsize=8)
output_path = os.path.join(self.output_dir, f'{filename_prefix}_{self.timestamp}.png')
plt.savefig(output_path, bbox_inches='tight', dpi=150)
plt.close(fig)
self.log_progress(f"{title_prefix}频谱雨点图已保存为 {output_path}")
def run_analysis(self):
"""运行完整分析流程"""
try:
self.log_progress("开始数据分析流程", "INFO")
start_time = time.time()
self.load_data()
self.plot_time_series()
self.plot_rainfall_spectrograms()
total_time = time.time() - start_time
self.log_progress(f"分析完成,总耗时 {total_time:.2f}", "SUCCESS")
self.log_progress(f"所有输出文件已保存到: {self.output_dir}", "INFO")
return True
except ValueError as e:
# 跳过不包含IMU数据的文件
self.log_progress(f"跳过文件: {str(e)}", "WARNING")
return False
except Exception as e:
self.log_progress(f"分析过程中出现错误: {str(e)}", "ERROR")
import traceback
traceback.print_exc()
return False
def process_single_file(file_path):
"""处理单个文件的函数(使用进程隔离)"""
try:
print(f"{Fore.BLUE}开始处理文件: {os.path.basename(file_path)}")
analyzer = IMUDataAnalyzer(file_path)
success = analyzer.run_analysis()
if success:
return (file_path, True, "处理成功")
else:
return (file_path, False, "文件不包含IMU数据已跳过")
except Exception as e:
return (file_path, False, str(e))
def main():
"""主函数,支持多文件处理和进度显示"""
print("=" * 60)
print(f"{Fore.CYAN}IMU数据频谱分析工具 - 多文件批量处理")
print("=" * 60)
# 获取输入路径
print(f"{Fore.WHITE}请输入包含CSV文件的目录路径: ")
input_path = input("> ").strip()
if not os.path.exists(input_path):
print(f"{Fore.RED}错误: 路径 '{input_path}' 不存在!")
return
# 查找所有包含imu的CSV文件不区分大小写
if os.path.isdir(input_path):
# 使用单个glob模式匹配所有文件然后过滤包含imu的文件
all_csv_files = glob.glob(os.path.join(input_path, "**", "*.csv"), recursive=True)
csv_files = [f for f in all_csv_files if re.search(r'imu', f, re.IGNORECASE)]
csv_files = list(set(csv_files)) # 去重
csv_files.sort()
else:
# 对于单个文件检查是否包含imu不区分大小写
if re.search(r'imu', input_path, re.IGNORECASE):
csv_files = [input_path]
else:
csv_files = []
if not csv_files:
print(f"{Fore.YELLOW}警告: 未找到包含'imu'的CSV文件")
return
print(f"{Fore.GREEN}找到 {len(csv_files)} 个IMU数据文件:")
for i, file in enumerate(csv_files, 1):
print(f" {i}. {os.path.basename(file)}")
# 使用多进程处理文件避免matplotlib线程冲突
print(f"\n{Fore.CYAN}开始多线程处理文件 (使用 {min(len(csv_files), cpu_count())} 个线程)...")
success_count = 0
skipped_count = 0
failed_files = []
# 使用ProcessPoolExecutor而不是ThreadPoolExecutor
with ProcessPoolExecutor(max_workers=min(len(csv_files), cpu_count())) as executor:
# 提交所有任务
future_to_file = {executor.submit(process_single_file, file): file for file in csv_files}
# 处理完成的任务
for future in as_completed(future_to_file):
file_path = future_to_file[future]
try:
result = future.result()
file_path, success, message = result
if success:
print(f"{Fore.GREEN}✓ 完成: {os.path.basename(file_path)}")
success_count += 1
else:
if "跳过" in message:
print(f"{Fore.YELLOW}↷ 跳过: {os.path.basename(file_path)} - {message}")
skipped_count += 1
else:
print(f"{Fore.RED}✗ 失败: {os.path.basename(file_path)} - {message}")
failed_files.append((file_path, message))
except Exception as e:
print(f"{Fore.RED}✗ 异常: {os.path.basename(file_path)} - {str(e)}")
failed_files.append((file_path, str(e)))
# 输出统计信息
print(f"\n{Fore.CYAN}处理完成统计:")
print(f"{Fore.GREEN}成功: {success_count} 个文件")
print(f"{Fore.YELLOW}跳过: {skipped_count} 个文件不包含IMU数据")
print(f"{Fore.RED}失败: {len(failed_files)} 个文件")
if failed_files:
print(f"\n{Fore.YELLOW}失败文件详情:")
for file, error in failed_files:
print(f" {os.path.basename(file)}: {error}")
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print(f"\n{Fore.YELLOW}用户中断程序执行")
except Exception as e:
print(f"{Fore.RED}程序运行出错: {str(e)}")
import traceback
traceback.print_exc()

6
ICCIDupdata/.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
/build/*
/build
/dist/*
/dist
/source/*
/source

View File

@@ -0,0 +1,90 @@
import requests
import hashlib
import time
import json
def generate_sign(system_id, request_info, request_time, secret_key):
"""生成签名"""
params = {
'requestInfo': request_info,
'requestTime': request_time,
'systemId': system_id
}
# 按字典序排序
sorted_params = '&'.join([f"{k}={v}" for k, v in sorted(params.items())])
sign_str = sorted_params + secret_key
return hashlib.md5(sign_str.encode()).hexdigest()
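# 签名拼接示意(假设值,仅演示顺序,非真实密钥):
#   systemId=demoSys、requestTime=2026-01-01 00:00:00、requestInfo={"sn":"X"}
#   按 key 字典序拼成 requestInfo={"sn":"X"}&requestTime=2026-01-01 00:00:00&systemId=demoSys
#   末尾追加 secret_key 后整体做 MD5取十六进制小写即为 sign。
# 注意:参与签名的三个字段必须与实际 POST 的取值完全一致,否则服务端验签失败。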
def test_navp_interface():
# 需要向HT获取系统密钥
secret_key = "aqwec3be422c22a752c22"
# url = "https://flow-gateway.pre.aeroht.com/server/oem/navp/infoUpload"
url = "https://flow-gateway.pre.aeroht.com/server/oem/nav/infoUpload"
# 测试数据
request_info = '{"iccid":"navp345678112300001","partsNo":"F34410001X3K-00-02","hVer":"F34410001X3K-00-02","sVer":"F34410001X3K0P001","network":"AG35CEVFMR12A02T4G&864169079532089","soc":"NA","sn":"F34410001X3K00024013683HJ00170"}'
# system_id = "navpFactory"
system_id = "diufactory"
request_time = time.strftime("%Y-%m-%d %H:%M:%S")
print(f"request_time:{request_time}")
# request_time = time.strftime("%Y/%m/%d %H:%M")
# 生成签名
sign = generate_sign(system_id, request_info, request_time, secret_key)
data = {
"systemId": system_id,
"requestInfo": request_info,
"requestTime": request_time,
"sign": sign
}
headers = {
"Content-Type": "application/x-www-form-urlencoded"
}
try:
print(f"data|requestInfo: {data['requestInfo']}")
response = requests.post(url, data=data, headers=headers, timeout=30)
if response.status_code == 200:
print("NAVP 接口测试 OK")
else:
print("NAVP 接口测试 NG")
print(f"NAVP接口响应状态码: {response.status_code}")
print(f"NAVP接口响应内容: {response.text}")
return response.status_code == 200
except Exception as e:
print(f"NAVP接口请求失败: {e}")
return False
def test_navs_interface():
"""NAVS 接口测试:流程与 test_navp_interface 类似,仅 systemId 和 url 不同;
此处直接复测一条预先拼好签名参数的 URL。"""
url = "https://flow-gateway.pre.aeroht.com/server/oem/nav/infoUpload?requestInfo=%7B%22iccid%22:%22navp345678112300001%22,%22partsNo%22:%22parts111%22,%22hVer%22:%22hVer_7d98d056c96e22222%22,%22sVer%22:%22sVer_b38651e22222%22,%22soc%22:%22111%22,%22network%22:%222222%22%7D&systemId=diufactory&requestTime=2026-01-14%2017:19:58&sign=f480924ff291e0f98a4fb9fdd0167a3e&appSecret=aqwec3be422c22a752c22"
payload = {}
headers = {
'Cookie': 'cookies=mg3Tr49e6qr2eIHbvmiHp9NXJa56ei5vh4CeDbcRaEH450bqgdWLrHYHIgaZX3A7CXB9l0X3c1i+9D96HFAFjSCIA58vVLNpM2EtDixW67CQVOpinLaIMEcnr4wSqtaHjOvpw+XVvm+nB3LE2C5AH/qpSULCgySiX3ET7BQV0PSZkGUfWs2z6PqLSPa7ta9jr18otqVkK7y2zKdsdc4YkYq2jbZldPXm8cXufRCUqdvXoR2QzMoN+/gu6vBKtSXHSlyaTCC/aay+i64ChV4iNXrKlfHHj9MswdrzAazFvZXoDNMTMW00TEbev9DDbcTXVUdbjxidZM4Qk8xIMcpaR07l1ruHLLd2gmYZKRarBAxhrGXGWvJtm5EV1N0AgO3t9sSWhsyWNKKPijgMmUhYyOzKoxD3cvSZ2vGnI5iojb9W6U+cT3A98W81ENYs3yyrEZamJOAbbwAi+zpcCmxI/wcWq32HXgiYLxJ4pmaNlIlW+h8a4tGfTpxAR/WrG/SPN/HoMPohV1INDkllXkurrijH8ZeAQmF+lVepFfBcC9dPrkB7RBAUF/P0FIqjoAXVf6ULLoUvyHcD92vIPDVT4UPW7XGT7FRxtNoBMXhKJ9fOosn+ofuskmOWS1pQsAe5zY7fM/uE7VRrS/AaUt94hKSTJO0p94dPeRRxMt5zDe+Fe4M+wvE5SKaE++C6ZrSNqsuYq1RhcQS26PR90xvq9+OT3HX1r7vGakFIGNXzW/Gd3+QF7+5oGDQHzc6WjFAsQBs1HkntPcJpsVbE24r6kiGzMxgCNTzreqJXDYmyePETDKj75bb/K5E18Zeo83vF3zk2vVrxxefwbvaGWyeRJJW6sQv8kyaIpyNVPQOhetlpDV4RHVzja869fEIl1zOdNQWkU+7F/gCBfcUS79RIaC5psIDofx28E7TIhfanh41OU+TtBXNXEVYqf/7NDc3q+1pLnenogmFvSSG4qE0iSGUapL9iTaDXjlOyYkS39keVog/AHrVfDYMTzBWeko2YJmpLExUwLeXfwL3xRI41yuuBz2eEuQhyIMwxxQWHkptPFR9Cn6TfjDwYBVIxIzrEBFc6E14VmZQ/zNloS2n66Z45ivuaRpagMaWo7+cqSX0CQvQ8SJQ+5k4i7pnXzhSq8fxiLSa0wIvfrnDlwd7WS6oe0nKqyMInt/iGMqUiAVmrlduHhlrIweHkM/E7pVaURVI38R5WNOOYBgHV8CTUwi2FLwsZbEkD0ElJDhEkeHfWHxmn9XkIVU+XD3/OZp+IRCYBWr3t6+iPJqosp75eWNfST5kCzP/bye+h5vRjuvRdnnMhekyd9MY3yPPbz5JJ42CTrkjyAGIyiycQNI4mFIWB9nlM3hXoywoO+FDd2CFIMvwkdH+GXEvbVNR/il3O94jRS+kF3v/i8WBPDzUSP0aHAQPEAvzyIsxY/6WrOcAuuA7Cy0qeIzjI0Wzjv/QbOEgtHk7kR6+pgptQtVFgU4EldoQFnuZEPPQdbw1OAhxUKfyIuljTKq8FbZ95aHX5fFQ6POzgIgUFYCwVqRHkVn6dwHLkDXcOMhfXvw+5q23k0GGCFUPwFM+6ypZBoKKCRTZ60e0IUOq3afVls/UUgGnDQp4pT/BXhLYhICGH8cZw+sNxHLddehuepi4PI8fq60e+H6RfE7xxk+LRlVNyI0TTi+NuESQr+UzX7GIvVkiiwgQKrUPafqBbDS6L2890tVXt1un1UH5hW9GuE+uftclBWqvGnYZUUrHQ42eAr1c8xvunaTINVU24nBlVFUPeh3x34RsjldTkrYeIkk9v0tz8T7ndWi6qxv/03u9YBlMRcJozgDnovVx/tNH7J0f6j6Sq1RNkhxVvRe6SPAgS3mvz5MLcMLw9pWCTSOf8NVDbSuV5NpOm+f5mhU9u/5tLfXgznJSmu9UW6WWx4PgiPTB0jHELrYnDQiiDRqVDFixaHzPZ6t9CKJp088NXrLamFfOYfd3e2S6xEu7aUHBKR2vnscQfl5awuzWD8uVh3sHcK/N4f2wregqra3YaSgme',
'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
if __name__ == "__main__":
print("开始测试预发布环境...")
navp_result = test_navp_interface()
# navs_result = test_navs_interface()

9
IMULinkdata/.gitignore vendored Normal file
View File

@@ -0,0 +1,9 @@
/build/*
/build
/dist/*
/dist
/source/*
/source
LINLinkData_V2.py

View File

@@ -0,0 +1,252 @@
import os
import re
import time
import argparse
from datetime import datetime
from collections import defaultdict
import numpy as np
import pandas as pd
import openpyxl
class ExcelProcessor:
def __init__(self, file_path):
self.file_path = file_path
self.df = None
self.output_folder = None
self.output_file = None
self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
self.processed_data = {} # 存储处理后的数据
def load_data(self):
"""加载Excel文件数据"""
print(f"正在加载文件: {self.file_path}")
try:
# 尝试读取指定sheet如果不存在则尝试读取第一个sheet
try:
# 建议使用 engine='openpyxl'pandas 会尽可能把 Excel 的日期单元格读成 datetime
# self.df = pd.read_excel(self.file_path, sheet_name='LinkdataAll', engine='openpyxl')
# 获取所有工作表名称
sheet_names = pd.ExcelFile(self.file_path, engine='openpyxl').sheet_names
# 查找包含'LINK'的工作表(不区分大小写)
target_sheet = next((sheet for sheet in sheet_names if 'link' in sheet.lower()), None)
if target_sheet is None:
raise ValueError(f"未找到包含'LINK'的工作表")
self.df = pd.read_excel(self.file_path, sheet_name=target_sheet, engine='openpyxl')
except Exception as e:
print(f"提示: 未找到包含'LINK'的工作表,请检查文件内容。错误: {e}")
return False
# 确保有 PartNumber 列(兼容 LinkObject
if 'PartNumber' not in self.df.columns and 'LinkObject' in self.df.columns:
self.df['PartNumber'] = self.df['LinkObject']
# 检查必要的列是否存在
required_cols = ["PartNumber", "ChildSN", "linkDate"]
missing = [c for c in required_cols if c not in self.df.columns]
if missing:
raise ValueError(f"数据表中缺少必要列: {', '.join(missing)}")
# 解析 linkDate 为 datetime支持 AM/PM
# 注pd.to_datetime 能解析大多数常见格式,包括 "5/24/2025 6:00:13 PM"
# 如果有极端异构格式,可在这里加更精细的清洗逻辑
# errors='coerce' 会把无法解析的值变为 NaT
# self.df['linkDate'] = pd.to_datetime(self.df['linkDate'], errors='coerce')
self.df['linkDate'] = pd.to_datetime(
self.df['linkDate'],
format='%m/%d/%Y %I:%M:%S %p', # 月/日/年 12小时制+AM/PM
errors='coerce'
)
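# Example (illustrative): "5/24/2025 6:00:13 PM" parses under this format to
# Timestamp('2025-05-24 18:00:13'), while "2025-05-24 18:00:13" does not match
# '%m/%d/%Y %I:%M:%S %p' and becomes NaT because of errors='coerce'.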
# 提示解析情况
total = len(self.df)
invalid = int(self.df['linkDate'].isna().sum())
print(f"文件加载成功,总行数: {total},日期解析失败: {invalid}")
# 添加备注列
if '备注' not in self.df.columns:
self.df['备注'] = ''
return True
except Exception as e:
print(f"加载文件失败: {str(e)}")
return False
def create_output_folder(self):
"""准备输出目录和文件名"""
# 先去除扩展名再截取前10个字符
# base_name = os.path.splitext(os.path.basename(self.file_path))[0]
original_name = os.path.splitext(os.path.basename(self.file_path))[0]
# base_name = original_name[:10]
base_name = original_name[:20]
output_folder_name = f"{base_name} output_{self.timestamp}"
# self.output_folder = os.path.join(os.path.dirname(self.file_path), output_folder_name)
self.output_folder = os.path.dirname(self.file_path)
self.output_file = os.path.join(self.output_folder, f"{original_name}_split_by_PartNumber_{self.timestamp}.xlsx")
if not os.path.exists(self.output_folder):
os.makedirs(self.output_folder)
print(f"已创建输出文件夹: {self.output_folder}")
def _safe_sheet_name(self, name):
"""清理为合法的 Excel sheet 名称(<=31字符无非法字符"""
# 转为字符串
s = str(name)
# 替换非法字符:: \ / ? * [ ]
s = re.sub(r'[:\\/\?\*\[\]]', '_', s)
# 去除首尾空格
s = s.strip()
# 截断到 31 个字符
if len(s) > 31:
s = s[:31]
# 空名兜底
if not s:
s = 'Sheet'
return s
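# Example (illustrative): _safe_sheet_name("P/N: 123*[A]") -> "P_N_ 123__A_"
# (each of : \ / ? * [ ] replaced by '_', then stripped and capped at 31 chars).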
def process_data(self):
"""处理数据并拆分到不同sheet"""
if self.df is None:
raise ValueError("数据未加载,请先调用 load_data() 方法")
# 确保有PartNumber列
if 'PartNumber' not in self.df.columns:
if 'LinkObject' in self.df.columns:
self.df['PartNumber'] = self.df['LinkObject']
else:
raise ValueError("数据表中既没有PartNumber也没有LinkObject列")
# 添加备注列
self.df['备注'] = ''
# 按 PartNumber 分组
grouped = self.df.groupby('PartNumber', dropna=False)
total_groups = len(grouped)
print(f"开始处理数据,共 {total_groups} 个分组...")
# 创建 ExcelWriter逐个分组写入后统一调用 close() 保存并关闭
# print(f"输出文件信息self.output_folder{self.output_folder}")
print(f"输出文件信息self.output_file{self.output_file}")
# output_path = os.path.join(self.output_folder, self.output_file)
output_path = self.output_file
writer = pd.ExcelWriter(output_path, engine='openpyxl')
for i, (name, group) in enumerate(grouped):
print(f"正在处理分组 {i + 1}/{total_groups}: {name}")
# 处理重复 ChildSN根据最新 linkDate 保留一条)
group_processed = self.process_duplicates(group)
# 输出前,把 linkDate 格式化为 yyyy-mm-dd hh:mm:ss 的字符串
group_out = group_processed.copy()
group_out['linkDate'] = group_out['linkDate'].apply(
lambda x: x.strftime('%Y-%m-%d %H:%M:%S') if pd.notnull(x) else ''
)
# 写入sheet
safe_name = self._safe_sheet_name(name)
group_out.to_excel(writer, sheet_name=safe_name, index=False)
# 保存文件
writer.close()
print(f"处理完成! 结果已保存到: {output_path}")
def process_duplicates(self, group):
"""处理重复的 ChildSN优化备注信息保留最新 linkDate 的一行"""
# 找出重复 ChildSN
duplicates = group[group.duplicated('ChildSN', keep=False)]
if not duplicates.empty:
print(f" 发现 {len(duplicates)} 行重复数据,正在处理...")
# 遍历每个重复 ChildSN 的分组
for child_sn, dup_group in duplicates.groupby('ChildSN'):
# 按 linkDate 排序,保留最新(降序)
# 若 linkDate 有 NaT会排在末尾
dup_group = dup_group.sort_values('linkDate', ascending=False)
# 获取最新行
latest_row = dup_group.iloc[0]
# 差异字段收集(除 ChildSN、备注
diff_info = {}
for col in dup_group.columns:
if col in ['ChildSN', '备注']:
continue
unique_values = dup_group[col].unique()
if len(unique_values) > 1:
# 对 linkDate 做专门格式化,其他列保持原样转字符串
if col == 'linkDate':
vals = []
for v in unique_values:
if pd.isna(v):
vals.append('')
elif isinstance(v, pd.Timestamp):
vals.append(v.strftime('%Y-%m-%d %H:%M:%S'))
else:
vals.append(str(v))
diff_info[col] = f"{col}: {', '.join(vals)}"
else:
diff_info[col] = f"{col}: {', '.join(map(str, unique_values))}"
# 生成备注信息
note = f"重复行数: {len(dup_group)}"
if diff_info:
note += "; 差异内容: " + "; ".join(diff_info.values())
# 更新最新行的备注(给该 ChildSN 的所有行先写备注,再删除)
group.loc[group['ChildSN'] == child_sn, '备注'] = note
# 删除除最新以外的行
drop_indices = dup_group.index[1:]
group = group.drop(drop_indices)
return group
def main():
print("=== Excel拆分工具 ===")
file_path = input("请输入Excel文件路径: ").strip('"')
if not os.path.exists(file_path):
print("文件不存在,请检查路径")
return
start_time = time.time()
try:
# 创建处理器实例
processor = ExcelProcessor(file_path)
# 执行处理流程
if not processor.load_data():
return
processor.create_output_folder()
processor.process_data()
print("所有处理已完成!")
except Exception as e:
print(f"处理过程中发生错误: {e}")
end_time = time.time()
print(f"总耗时: {end_time - start_time:.2f}")
if __name__ == "__main__":
main()
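A self-contained sketch (synthetic rows, hypothetical serial numbers) of the rule process_duplicates implements: per ChildSN only the newest linkDate row survives, and the differing values are summarized into the 备注 column.

import pandas as pd

demo = pd.DataFrame({
    "PartNumber": ["P1", "P1", "P1"],
    "ChildSN": ["SN001", "SN001", "SN002"],
    "linkDate": pd.to_datetime(["2025-05-24 18:00:13",
                                "2025-05-25 09:30:00",
                                "2025-05-24 10:00:00"]),
    "备注": ["", "", ""],
})
proc = ExcelProcessor("dummy.xlsx")  # __init__ only stores the path; no file I/O
result = proc.process_duplicates(demo)
# SN001 keeps only the 2025-05-25 row, with 备注 recording 重复行数: 2 and the
# two differing linkDate values; the unique SN002 row is left untouched.
print(result[["ChildSN", "linkDate", "备注"]])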

20
dataProcess/.gitignore vendored Normal file
View File

@@ -0,0 +1,20 @@
/build/*
/build
/dist/*
/dist
/source/*
/source
/dataProcess_out*
*.xls
*.xlsx
*.csv
*.spec
/temp
dataProcess_html_V2.py
dataProcess_sightml_V2.py
dataProcess_sightml_V3.py
dataProcessMerge_V2.py

View File

@@ -0,0 +1,475 @@
import os
import pandas as pd
from tkinter import filedialog, Tk
import logging
import datetime
# --- 新增导入 ---
from colorama import init, Fore, Style
import sys
# 初始化 coloramaautoreset=True 使得每次打印后自动恢复默认颜色
init(autoreset=True)
# --- 自定义日志格式化器 ---
class ColoredFormatter(logging.Formatter):
"""根据日志级别为控制台输出添加颜色"""
# 定义颜色
COLORS = {
'DEBUG': Fore.CYAN,
'INFO': Fore.GREEN,
'WARNING': Fore.YELLOW,
'ERROR': Fore.RED,
'CRITICAL': Fore.RED + Style.BRIGHT,
}
def format(self, record):
# 获取对应级别的颜色
log_color = self.COLORS.get(record.levelname, '')
# 应用颜色到整个记录
record.levelname = f"{log_color}{record.levelname}{Style.RESET_ALL}"
record.msg = f"{log_color}{record.msg}{Style.RESET_ALL}"
# 使用父类的格式化方法
return super().format(record)
# --- 配置日志 ---
# 创建 logger 对象
logger = logging.getLogger() # 获取根 logger
logger.setLevel(logging.INFO)
# 移除默认的 handlers如果有的话避免重复输出
if logger.handlers:
logger.handlers.clear()
# 创建控制台 handler
console_handler = logging.StreamHandler(sys.stdout) # 使用 sys.stdout 通常更好
console_handler.setLevel(logging.INFO)
# 创建并设置 formatter
formatter = ColoredFormatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
# 将 handler 添加到 logger
logger.addHandler(console_handler)
# --- 日志配置结束 ---
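# Example (illustrative): after this setup, logging.warning("字段缺失") prints
# "<timestamp> - WARNING - 字段缺失" with the level name and message wrapped in
# Fore.YELLOW ... Style.RESET_ALL escape codes on the console.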
class DataProcessor:
def __init__(self):
self.spec_file = None
self.data_folder = None
self.spec_data = None
self.data_files = []
self.merged_data = pd.DataFrame()
def select_spec_file(self):
"""选择上限和下限规格要求文件"""
root = Tk()
root.withdraw()
self.spec_file = filedialog.askopenfilename(
title="选择上限和下限规格要求文件",
filetypes=[("CSV files", "*.csv"), ("All files", "*.*")]
)
if not self.spec_file:
logging.error("未选择规格文件")
return False
logging.info(f"已选择规格文件: {self.spec_file}")
return True
def select_data_folder(self):
"""选择实际数据文件所在的文件夹"""
root = Tk()
root.withdraw()
self.data_folder = filedialog.askdirectory(title="选择实际数据文件所在的文件夹")
if not self.data_folder:
logging.error("未选择数据文件夹")
return False
logging.info(f"已选择数据文件夹: {self.data_folder}")
return True
def clean_column_names(self, df):
"""清理列名,去除前后空格和特殊字符"""
df.columns = [col.strip() for col in df.columns]
return df
def load_spec_data(self):
"""加载规格数据标题行为第3行"""
try:
# 读取CSV文件跳过前2行第3行作为标题
self.spec_data = pd.read_csv(self.spec_file, header=2)
# 清理列名
self.spec_data = self.clean_column_names(self.spec_data)
# 确保PAD ID列是字符串类型
if 'PAD ID' in self.spec_data.columns:
self.spec_data['PAD ID'] = self.spec_data['PAD ID'].astype(str).str.strip()
# 检查必要的列是否存在
required_columns = ["PAD ID", "Component ID", "Vol_Min(%)", "Vol_Max(%)",
"Height_Low(mil)", "Height_High(mil)", "Area_Min(%)", "Area_Max(%)"]
missing_columns = [col for col in required_columns if col not in self.spec_data.columns]
if missing_columns:
logging.warning(f"规格文件中缺少以下列: {missing_columns}")
# 尝试查找相似的列名
for missing_col in missing_columns:
similar_cols = [col for col in self.spec_data.columns if missing_col.lower() in col.lower()]
if similar_cols:
logging.info(f"可能匹配的列: {similar_cols}")
# 特别检查 Component ID 是否存在
if "Component ID" not in self.spec_data.columns:
logging.warning("'Component ID' 列在规格文件中缺失,这可能导致输出文件中也缺少该列。")
logging.info(f"规格数据加载成功,共 {len(self.spec_data)}")
logging.info(f"规格文件列名: {list(self.spec_data.columns)}")
logging.info(
f"规格文件PAD ID数据类型: {self.spec_data['PAD ID'].dtype if 'PAD ID' in self.spec_data.columns else 'N/A'}")
except Exception as e:
logging.error(f"加载规格数据失败: {e}")
return False
return True
def scan_data_files(self):
"""扫描数据文件夹中的CSV文件并检查标题行是否包含有效字段"""
try:
# 定义有效的字段名称(去除前后空格)
required_fields = [
"PAD ID", "Component ID", "Height(mil)", "Volume(%)",
"Area(%)", "Volume(mil3)", "Area(mil2)"
]
# 可选:定义字段匹配的宽松程度
field_match_threshold = 0.8 # 80%的字段匹配即认为有效
# 扫描CSV文件
valid_files = []
for file in os.listdir(self.data_folder):
if file.endswith(".csv") and "F27140015X3K" in file:
file_path = os.path.join(self.data_folder, file)
# 检查文件是否可读且包含有效字段
if self._is_valid_csv_file(file_path, required_fields, field_match_threshold):
valid_files.append(file_path)
self.data_files = valid_files
logging.info(
f"找到 {len(self.data_files)} 个有效数据文件: {[os.path.basename(f) for f in self.data_files]}")
except Exception as e:
logging.error(f"扫描数据文件失败: {e}")
return False
return bool(self.data_files)
def _is_valid_csv_file(self, file_path, required_fields, threshold=1.0):
"""检查CSV文件是否包含必需的字段"""
try:
# 尝试不同的编码
encodings = ['utf-8', 'gbk', 'latin-1']
for encoding in encodings:
try:
with open(file_path, 'r', encoding=encoding) as f:
first_line = f.readline().strip()
# 解析CSV标题行
headers = [header.strip() for header in first_line.split(',')]
# 计算匹配的字段数量
matched_fields = 0
missing_fields = []
for required_field in required_fields:
if required_field in headers:
matched_fields += 1
else:
missing_fields.append(required_field)
# 计算匹配比例
match_ratio = matched_fields / len(required_fields)
if match_ratio >= threshold:
if missing_fields:
logging.warning(
f"文件 {os.path.basename(file_path)} 部分字段缺失: {missing_fields},但满足阈值要求")
else:
logging.info(f"文件 {os.path.basename(file_path)} 所有字段完整")
return True
else:
logging.warning(
f"文件 {os.path.basename(file_path)} 字段匹配率不足: {match_ratio:.1%},缺失字段: {missing_fields}")
return False
except UnicodeDecodeError:
continue # 尝试下一个编码
logging.error(f"无法读取文件 {os.path.basename(file_path)},尝试了所有编码")
return False
except Exception as e:
logging.error(f"检查文件 {os.path.basename(file_path)} 时发生错误: {e}")
return False
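# Example (illustrative): with field_match_threshold = 0.8, a header containing
# 6 of the 7 required fields gives match_ratio = 6/7 ≈ 0.857 >= 0.8, so the
# file is accepted, with a warning listing the one missing field.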
def load_and_clean_data_file(self, data_file):
"""加载并清理数据文件"""
try:
# 读取数据文件,第一行作为标题
# 处理可能的编码问题
try:
data_df = pd.read_csv(data_file, header=0, encoding='utf-8')
except UnicodeDecodeError:
try:
data_df = pd.read_csv(data_file, header=0, encoding='gbk')
except UnicodeDecodeError:
data_df = pd.read_csv(data_file, header=0, encoding='latin-1')
# 清理列名
data_df = self.clean_column_names(data_df)
logging.info(f"数据文件列名: {list(data_df.columns)}")
# --- 关键修改:创建副本以避免 SettingWithCopyWarning ---
data_df = data_df.copy()
# 确保PAD ID列是字符串类型
if 'PAD ID' in data_df.columns:
data_df['PAD ID'] = data_df['PAD ID'].astype(str).str.strip()
logging.info(f"数据文件PAD ID数据类型: {data_df['PAD ID'].dtype}")
# 检查必要的列是否存在
required_columns = ["PAD ID", "Component ID", "Height(mil)", "Volume(%)", "Area(%)"]
# 处理可能的列名变体
column_mapping = {}
for required_col in required_columns:
if required_col not in data_df.columns:
# 查找相似的列名
# 更宽松的匹配方式:忽略空格和大小写
similar_cols = [col for col in data_df.columns if
required_col.lower().replace(" ", "") in col.lower().replace(" ", "")]
if similar_cols:
column_mapping[required_col] = similar_cols[0]
logging.info(f"映射列: {required_col} -> {similar_cols[0]}")
# 重命名列
if column_mapping:
data_df = data_df.rename(columns=column_mapping)
missing_columns = [col for col in required_columns if col not in data_df.columns]
if missing_columns:
logging.error(f"数据文件中缺少以下列: {missing_columns}")
logging.info(f"数据文件所有列: {list(data_df.columns)}")
return None
return data_df # 返回处理好的副本
except Exception as e:
logging.error(f"加载数据文件失败: {e}")
return None
def process_data(self):
"""处理数据并合并"""
all_data = []
total_files = len(self.data_files)
if total_files == 0:
logging.error("未找到任何数据文件")
return False
for idx, data_file in enumerate(self.data_files, 1):
logging.info(f"处理数据文件 {idx}/{total_files}: {os.path.basename(data_file)}")
try:
# 加载并清理数据文件
data_df = self.load_and_clean_data_file(data_file)
if data_df is None:
logging.error(f"无法加载文件: {os.path.basename(data_file)}")
continue
# 选择需要的字段
required_columns = ["PAD ID", "Component ID", "Height(mil)", "Volume(%)", "Area(%)"]
# 检查数据文件中是否存在所有必需的列
available_columns = [col for col in required_columns if col in data_df.columns]
if len(available_columns) != len(required_columns):
missing = set(required_columns) - set(available_columns)
logging.warning(f"文件 {os.path.basename(data_file)} 缺少列: {missing}")
logging.info(f"可用的列: {available_columns}")
# --- 关键修改:使用可用的列继续处理 (再次创建副本) ---
data_df = data_df[available_columns].copy()
else:
# --- 关键修改:选择所需的列 (创建副本) ---
data_df = data_df[required_columns].copy()
# 添加数据来源字段
data_df["数据来源"] = os.path.basename(data_file)
data_df["限制来源"] = os.path.basename(self.spec_file)
# 调试信息:显示合并前的数据类型
logging.info(
f"合并前 - 数据文件PAD ID示例: {data_df['PAD ID'].head(3).tolist() if 'PAD ID' in data_df.columns else 'N/A'}")
logging.info(
f"合并前 - 规格文件PAD ID示例: {self.spec_data['PAD ID'].head(3).tolist() if 'PAD ID' in self.spec_data.columns else 'N/A'}")
# 从规格文件中选择需要的字段
spec_columns = ["PAD ID", "Component ID", "Vol_Min(%)", "Vol_Max(%)",
"Height_Low(mil)", "Height_High(mil)", "Area_Min(%)", "Area_Max(%)"]
# 只选择存在的列
available_spec_columns = [col for col in spec_columns if col in self.spec_data.columns]
# --- 关键修改:使用 .copy() 创建一个独立的副本,避免 SettingWithCopyWarning ---
spec_df = self.spec_data[available_spec_columns].copy()
# 确保规格文件的PAD ID也是字符串类型
if 'PAD ID' in spec_df.columns:
spec_df['PAD ID'] = spec_df['PAD ID'].astype(str).str.strip()
# 合并规格数据
merged_df = pd.merge(data_df, spec_df, on="PAD ID", how="inner", suffixes=('_data', '_spec'))
if merged_df.empty:
logging.warning(f"文件 {os.path.basename(data_file)} 与规格数据无匹配项")
# 显示一些调试信息
data_pad_ids = set(data_df['PAD ID'].unique()) if 'PAD ID' in data_df.columns else set()
spec_pad_ids = set(spec_df['PAD ID'].unique()) if 'PAD ID' in spec_df.columns else set()
common_ids = data_pad_ids.intersection(spec_pad_ids)
logging.info(
f"数据文件PAD ID数量: {len(data_pad_ids)}, 规格文件PAD ID数量: {len(spec_pad_ids)}, 共同ID数量: {len(common_ids)}")
logging.info(f"数据文件前5个PAD ID: {list(data_pad_ids)[:5] if data_pad_ids else 'N/A'}")
logging.info(f"规格文件前5个PAD ID: {list(spec_pad_ids)[:5] if spec_pad_ids else 'N/A'}")
continue
# --- 优化开始:确保 Component ID 来自数据文件 ---
# 即使合并产生了两个 Component ID (_data 和 _spec),我们也明确使用来自 data_df 的那个
if 'Component ID_data' in merged_df.columns:
merged_df['Component ID'] = merged_df['Component ID_data']
# 可选:删除来自规格文件的 Component ID 列
# merged_df.drop(columns=['Component ID_spec'], inplace=True, errors='ignore')
# 或者保留它以便对比,这里我们先注释掉删除操作
# 如果因为某种原因没有 _data 后缀(例如只有一个 Component ID则默认就是 data_df 的
# (这种情况在 merge 时不会发生,因为我们用了 suffixes
# --- 优化结束 ---
# --- 新增:对规格高度字段执行单位转换(除以 25.4 ---
# 为避免意外字符导致转换失败,先清洗再转换为数值
convert_cols = ["Height_Low(mil)", "Height_High(mil)"]
for col in convert_cols:
if col in merged_df.columns:
before_non_null = merged_df[col].notna().sum()
# 清洗非数字字符(保留数字、小数点和负号)
cleaned = merged_df[col].astype(str).str.replace(r'[^\d\.\-]+', '', regex=True)
merged_df[col] = pd.to_numeric(cleaned, errors='coerce') / 25.4
after_non_null = merged_df[col].notna().sum()
logging.info(
f"字段 {col} 已除以 25.4 完成单位转换,非空值数: 转换前 {before_non_null} -> 转换后 {after_non_null}"
)
else:
logging.warning(f"规格高度字段缺失,无法进行单位转换: {col}")
# 选择最终输出的字段(按照要求的顺序)
output_columns = [
"PAD ID", "Component ID", "Vol_Min(%)", "Vol_Max(%)", "Height_Low(mil)",
"Height_High(mil)", "Area_Min(%)", "Area_Max(%)", "Height(mil)", "Volume(%)", "Area(%)",
"数据来源", "限制来源"
]
# --- 优化开始 ---
# 只选择存在的列
available_output_columns = [col for col in output_columns if col in merged_df.columns]
# 检查是否有列缺失并打印警告
missing_output_columns = [col for col in output_columns if col not in merged_df.columns]
if missing_output_columns:
logging.warning(
f"文件 {os.path.basename(data_file)} 的最终输出中缺少以下预期列: {missing_output_columns}")
# 如果没有任何可用列,则跳过此文件
if not available_output_columns:
logging.error(f"文件 {os.path.basename(data_file)} 没有任何预期的输出列,将跳过此文件。")
continue
merged_df = merged_df[available_output_columns].copy() # 再次使用.copy()确保安全
# --- 优化结束 ---
all_data.append(merged_df)
logging.info(f"文件 {os.path.basename(data_file)} 处理成功,匹配 {len(merged_df)}")
except Exception as e:
logging.error(f"处理文件 {os.path.basename(data_file)} 时出错: {e}")
# 显示更多调试信息
if 'data_df' in locals() and 'PAD ID' in data_df.columns:
logging.info(f"数据文件PAD ID数据类型: {data_df['PAD ID'].dtype}")
logging.info(f"数据文件PAD ID示例: {data_df['PAD ID'].head(3).tolist()}")
if hasattr(self, 'spec_data') and 'PAD ID' in self.spec_data.columns:
logging.info(f"规格文件PAD ID数据类型: {self.spec_data['PAD ID'].dtype}")
logging.info(f"规格文件PAD ID示例: {self.spec_data['PAD ID'].head(3).tolist()}")
continue
if all_data:
self.merged_data = pd.concat(all_data, ignore_index=True)
logging.info(f"数据处理完成,共合并 {len(self.merged_data)} 行数据")
logging.info(f"最终数据列名: {list(self.merged_data.columns)}")
else:
logging.error("未成功处理任何数据文件")
return False
return True
def save_to_excel(self):
"""保存合并后的数据到Excel文件"""
try:
# 生成时间戳
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"dataProcess_out_{timestamp}.xlsx"
output_file = os.path.join(self.data_folder, output_filename)
self.merged_data.to_excel(output_file, index=False)
logging.info(f"数据已保存到: {output_file}")
# 显示统计信息
stats = f"处理统计:\n"
stats += f"- 规格文件: {os.path.basename(self.spec_file)}\n"
stats += f"- 处理的数据文件数: {len(self.data_files)}\n"
stats += f"- 合并的总行数: {len(self.merged_data)}\n"
stats += f"- 输出文件: {output_file}\n"
stats += f"- 包含的列: {list(self.merged_data.columns)}"
# 原来的 message box 提示已移除,改为日志输出
logging.info("处理完成。\n" + stats)
except Exception as e:
logging.error(f"保存数据失败: {e}")
# 原来的 error message box 已移除,改为日志输出
logging.error(f"保存数据失败: {e}")
def run(self):
"""运行整个数据处理流程"""
logging.info("开始数据处理流程")
try:
if not self.select_spec_file():
return
if not self.select_data_folder():
return
if not self.load_spec_data():
return
if not self.scan_data_files():
return
if not self.process_data():
# 原来的 error message box 已移除,改为日志输出
logging.error("数据处理失败,请检查日志信息")
return
self.save_to_excel()
except Exception as e:
logging.error(f"处理流程出错: {e}")
# 原来的 error message box 已移除,改为日志输出
logging.error(f"处理过程中出现错误:\n{e}")
if __name__ == "__main__":
processor = DataProcessor()
processor.run()
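A minimal sketch of the clean-then-convert pattern applied above to Height_Low(mil)/Height_High(mil) (synthetic inputs; the 25.4 divisor comes from the script):

import pandas as pd

raw = pd.Series(["12.7 mil", " 25.4 ", "n/a"])
cleaned = raw.astype(str).str.replace(r'[^\d\.\-]+', '', regex=True)
converted = pd.to_numeric(cleaned, errors='coerce') / 25.4
print(converted.tolist())  # [0.5, 1.0, nan]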

File diff suppressed because it is too large

View File

@@ -0,0 +1,810 @@
import pandas as pd
import tkinter as tk
from tkinter import filedialog
import os
from datetime import datetime
import numpy as np
class DataProcessor:
def __init__(self):
self.data = None
self.filename = None
self.file_path = None
self.file_dir = None
self.processing_start_time = None
def select_file(self):
"""手动选择数据文件"""
print("🔍 打开文件选择对话框...")
root = tk.Tk()
root.withdraw()
self.file_path = filedialog.askopenfilename(
title="选择数据文件",
filetypes=[("Excel files", "*.xlsx"), ("CSV files", "*.csv"), ("All files", "*.*")]
)
if self.file_path:
self.filename = os.path.basename(self.file_path)
self.file_dir = os.path.dirname(self.file_path)
print(f"✅ 已选择文件: {self.filename}")
print(f"📁 文件所在目录: {self.file_dir}")
return True
else:
print("❌ 未选择文件")
return False
def _load_data(self):
"""加载数据文件"""
print("📥 开始加载数据文件...")
try:
if self.file_path.endswith('.csv'):
self.data = pd.read_csv(self.file_path)
print("✅ 成功加载CSV文件")
elif self.file_path.endswith('.xlsx'):
self.data = pd.read_excel(self.file_path)
print("✅ 成功加载Excel文件")
else:
raise ValueError("不支持的文件格式")
print(f"📊 数据文件形状: {self.data.shape}")
print(f"📋 数据列名: {list(self.data.columns)[:10]}...")
# 显示数据预览
print("\n📋 数据预览前3行:")
print(self.data.head(3))
# 显示列数据类型
print("\n📊 列数据类型:")
for col in self.data.columns[:10]:
print(f" {col}: {self.data[col].dtype}")
except Exception as e:
print(f"❌ 加载数据文件时出错: {e}")
raise
def _validate_data(self):
"""验证数据完整性"""
print("🔍 验证数据完整性...")
# 检查必要的测量列
required_measure_columns = ['PAD ID', 'Component ID', 'Height(mil)', 'Volume(%)', 'Area(%)']
missing_measure_columns = [col for col in required_measure_columns if col not in self.data.columns]
if missing_measure_columns:
error_msg = f"❌ 数据文件中缺少必要的测量列: {missing_measure_columns}"
print(error_msg)
raise ValueError(error_msg)
# 检查上下限列
required_limit_columns = ['Height_Low(mil)', 'Height_High(mil)', 'Vol_Min(%)', 'Vol_Max(%)', 'Area_Min(%)',
'Area_Max(%)']
missing_limit_columns = [col for col in required_limit_columns if col not in self.data.columns]
if missing_limit_columns:
error_msg = f"❌ 数据文件中缺少必要的上下限列: {missing_limit_columns}"
print(error_msg)
raise ValueError(error_msg)
print("✅ 数据验证通过")
# 检查数据是否存在空值
all_columns = required_measure_columns + required_limit_columns
null_counts = self.data[all_columns].isnull().sum()
if null_counts.any():
print(f"⚠️ 数据中存在空值:")
for col, count in null_counts[null_counts > 0].items():
print(f" {col}: {count} 个空值")
else:
print("✅ 所有必需列都没有空值")
# 显示数据统计信息
print("\n📊 数据统计信息:")
for col in required_measure_columns:
if col in self.data.columns:
# 检查列的数据类型,针对不同类型使用不同的格式化方式
if pd.api.types.is_numeric_dtype(self.data[col]):
valid_count = self.data[col].count()
if valid_count > 0:
min_val = self.data[col].min()
max_val = self.data[col].max()
print(f" {col}: {valid_count} 个有效值, 范围 {min_val:.4f} - {max_val:.4f}")
else:
print(f" {col}: 0 个有效值")
else:
# 非数值型列:显示唯一值和示例
unique_count = self.data[col].nunique()
sample_values = self.data[col].dropna().head(3).tolist()
print(
f" {col}: {self.data[col].count()} 个有效值, {unique_count} 个唯一值, 示例: {sample_values}")
# 检查并转换数据类型
print("\n🔄 数据类型检查与转换:")
numeric_columns = ['Height(mil)', 'Volume(%)', 'Area(%)',
'Height_Low(mil)', 'Height_High(mil)',
'Vol_Min(%)', 'Vol_Max(%)', 'Area_Min(%)', 'Area_Max(%)']
for col in numeric_columns:
if col in self.data.columns:
if not pd.api.types.is_numeric_dtype(self.data[col]):
try:
# 尝试转换为数值类型
original_count = self.data[col].count()
self.data[col] = pd.to_numeric(self.data[col], errors='coerce')
converted_count = self.data[col].count()
lost_data = original_count - converted_count
if lost_data > 0:
print(f" ⚠️ {col}: 转换后丢失 {lost_data} 个非数值数据")
else:
print(f"{col}: 成功转换为数值类型")
except Exception as e:
print(f"{col}: 类型转换失败 - {e}")
else:
valid_count = self.data[col].count()
print(f"{col}: 已经是数值类型, {valid_count} 个有效值")
def _print_progress(self, message, level=1):
"""打印进度信息,支持分级显示"""
indent = " " * level
timestamp = datetime.now().strftime("%H:%M:%S")
print(f"{timestamp} {indent}{message}")
def generate_report(self):
"""生成统计报告"""
if self.data is None:
raise ValueError("请先选择数据文件")
try:
self.processing_start_time = datetime.now()
print(f"\n🚀 开始生成报告 - {self.processing_start_time.strftime('%Y-%m-%d %H:%M:%S')}")
# 验证数据
self._validate_data()
self._print_progress("开始数据处理...", 1)
# 创建分组键
self._print_progress("创建分组键...", 2)
# 确保PAD ID和Component ID都是字符串类型
self.data['PAD ID'] = self.data['PAD ID'].astype(str)
self.data['Component ID'] = self.data['Component ID'].astype(str)
self.data['Group_Key'] = self.data['PAD ID'] + '_' + self.data['Component ID']
group_count = self.data['Group_Key'].nunique()
self._print_progress(f"共发现 {group_count} 个分组", 2)
# 显示分组信息
group_info = self.data['Group_Key'].value_counts()
self._print_progress(f"分组数据量统计:", 2)
for i, (group, count) in enumerate(group_info.head(5).items()):
self._print_progress(f" {group}: {count} 个数据点", 3)
if len(group_info) > 5:
self._print_progress(f" ... 还有 {len(group_info) - 5} 个分组", 3)
# 检查数值列是否存在NaN值
numeric_columns = ['Height(mil)', 'Volume(%)', 'Area(%)']
for col in numeric_columns:
if col in self.data.columns:
nan_count = self.data[col].isna().sum()
if nan_count > 0:
self._print_progress(f"⚠️ {col}{nan_count} 个空值,将在统计计算中排除", 3)
# 计算统计信息
self._print_progress("计算基本统计信息...", 2)
# 确保数值列没有无穷大值
for col in numeric_columns:
if col in self.data.columns:
inf_count = np.isinf(self.data[col]).sum()
if inf_count > 0:
self._print_progress(f"⚠️ {col}{inf_count} 个无穷大值将替换为NaN", 3)
self.data[col] = self.data[col].replace([np.inf, -np.inf], np.nan)
stats = self.data.groupby('Group_Key').agg({
'Height(mil)': ['min', 'max', 'mean', 'std'],
'Volume(%)': ['min', 'max', 'mean', 'std'],
'Area(%)': ['min', 'max', 'mean', 'std']
}).round(4)
# 重命名列
stats.columns = [
'Height_Measured_Min(mil)', 'Height_Measured_Max(mil)', 'Height_Mean(mil)', 'Height_Std(mil)',
'Volume_Measured_Min(%)', 'Volume_Measured_Max(%)', 'Volume_Mean(%)', 'Volume_Std(%)',
'Area_Measured_Min(%)', 'Area_Measured_Max(%)', 'Area_Mean(%)', 'Area_Std(%)'
]
self._print_progress("基本统计信息计算完成", 2)
# 获取上下限信息
self._print_progress("获取预设上下限信息...", 2)
limits = self.data.groupby('Group_Key').agg({
'Height_Low(mil)': 'first',
'Height_High(mil)': 'first',
'Vol_Min(%)': 'first',
'Vol_Max(%)': 'first',
'Area_Min(%)': 'first',
'Area_Max(%)': 'first'
}).round(4)
# 合并统计信息和上下限信息
stats = pd.concat([stats, limits], axis=1)
self._print_progress("上下限信息获取完成", 2)
# 计算CPK
self._print_progress("开始计算CPK值...", 2)
stats = self._calculate_cpk(stats)
# 分析CPK结果
cpk_analysis = self._analyze_cpk_results(stats)
self._print_progress("CPK分析完成", 2)
self._print_cpk_summary(cpk_analysis)
# 生成HTML报告
self._print_progress("生成HTML报告...", 2)
report_path = self._create_html_report(stats, cpk_analysis)
self._print_progress("HTML报告生成完成", 2)
# 计算处理时间
processing_time = datetime.now() - self.processing_start_time
self._print_progress(f"总处理时间: {processing_time.total_seconds():.2f}", 1)
return report_path
except Exception as e:
print(f"❌ 生成报告过程中出错: {e}")
import traceback
print(f"详细错误信息:")
traceback.print_exc()
raise
def _analyze_cpk_results(self, stats):
"""分析CPK结果"""
cpk_analysis = {
'total_groups': len(stats),
'cpk_status': {'Height': {}, 'Volume': {}, 'Area': {}},
'problematic_groups': []
}
for feature in ['Height', 'Volume', 'Area']:
cpk_col = f'{feature}_Cpk'
if cpk_col not in stats.columns:
continue
valid_cpk = stats[cpk_col].dropna()
total_valid = len(valid_cpk)
cpk_analysis['cpk_status'][feature] = {
'total': total_valid,
'excellent': len(valid_cpk[valid_cpk >= 1.33]) if total_valid > 0 else 0,
'acceptable': len(valid_cpk[(valid_cpk >= 1.0) & (valid_cpk < 1.33)]) if total_valid > 0 else 0,
'poor': len(valid_cpk[valid_cpk < 1.0]) if total_valid > 0 else 0,
'invalid': len(stats) - total_valid
}
# 识别有问题的分组任意特征的CPK < 1.0
for group_key, row in stats.iterrows():
problems = []
for feature in ['Height', 'Volume', 'Area']:
cpk_col = f'{feature}_Cpk'
if cpk_col in stats.columns and not pd.isna(row[cpk_col]):
if row[cpk_col] < 1.0:
problems.append(f"{feature}: {row[cpk_col]:.4f}")
if problems:
cpk_analysis['problematic_groups'].append({
'group_key': group_key,
'problems': problems
})
return cpk_analysis
def _print_cpk_summary(self, cpk_analysis):
"""打印CPK结果摘要"""
print("\n📈 CPK分析结果摘要:")
print("=" * 60)
for feature, status in cpk_analysis['cpk_status'].items():
total = status['total']
if total == 0:
print(f"\n{feature}: 无有效CPK数据")
continue
print(f"\n{feature}:")
excellent_pct = (status['excellent'] / total * 100) if total > 0 else 0
acceptable_pct = (status['acceptable'] / total * 100) if total > 0 else 0
poor_pct = (status['poor'] / total * 100) if total > 0 else 0
print(f" ✅ 优秀 (CPK ≥ 1.33): {status['excellent']}/{total} ({excellent_pct:.1f}%)")
print(f" ⚠️ 合格 (1.0 ≤ CPK < 1.33): {status['acceptable']}/{total} ({acceptable_pct:.1f}%)")
print(f" ❌ 不合格 (CPK < 1.0): {status['poor']}/{total} ({poor_pct:.1f}%)")
print(f" ❓ 无法计算: {status['invalid']}")
if cpk_analysis['problematic_groups']:
print(f"\n⚠️ 发现 {len(cpk_analysis['problematic_groups'])} 个有问题分组:")
for i, group in enumerate(cpk_analysis['problematic_groups'][:10]):
print(f" {i + 1}. {group['group_key']}: {', '.join(group['problems'])}")
if len(cpk_analysis['problematic_groups']) > 10:
print(f" ... 还有 {len(cpk_analysis['problematic_groups']) - 10} 个问题分组")
else:
print("\n✅ 所有分组的CPK都在合格范围内")
print("=" * 60)
def _calculate_cpk(self, stats):
"""计算CPK值"""
self._print_progress("详细计算CPK值...", 3)
def calculate_single_cpk(mean, std, usl, lsl):
"""计算单个特征的CPK"""
if pd.isna(mean) or pd.isna(std) or std == 0:
return np.nan
if pd.isna(usl) or pd.isna(lsl):
return np.nan
try:
cpu = (usl - mean) / (3 * std) if usl != float('inf') else float('inf')
cpl = (mean - lsl) / (3 * std) if lsl != float('-inf') else float('inf')
if cpu == float('inf') and cpl == float('inf'):
return np.nan
elif cpu == float('inf'):
return cpl
elif cpl == float('inf'):
return cpu
else:
return min(cpu, cpl)
except (ZeroDivisionError, TypeError):
return np.nan
# 计算每个特征的CPK
cpk_results = []
total_groups = len(stats)
for idx, row in stats.iterrows():
if len(cpk_results) % 100 == 0 and total_groups > 100:
self._print_progress(f"计算第 {len(cpk_results) + 1} 个分组的CPK...", 4)
# Height CPK
height_cpk = calculate_single_cpk(
row.get('Height_Mean(mil)', np.nan),
row.get('Height_Std(mil)', np.nan),
row.get('Height_High(mil)', np.nan),
row.get('Height_Low(mil)', np.nan)
)
# Volume CPK
volume_cpk = calculate_single_cpk(
row.get('Volume_Mean(%)', np.nan),
row.get('Volume_Std(%)', np.nan),
row.get('Vol_Max(%)', np.nan),
row.get('Vol_Min(%)', np.nan)
)
# Area CPK
area_cpk = calculate_single_cpk(
row.get('Area_Mean(%)', np.nan),
row.get('Area_Std(%)', np.nan),
row.get('Area_Max(%)', np.nan),
row.get('Area_Min(%)', np.nan)
)
cpk_results.append({
'Height_Cpk': round(height_cpk, 4) if not pd.isna(height_cpk) else np.nan,
'Volume_Cpk': round(volume_cpk, 4) if not pd.isna(volume_cpk) else np.nan,
'Area_Cpk': round(area_cpk, 4) if not pd.isna(area_cpk) else np.nan
})
# 将CPK结果添加到统计数据中
cpk_df = pd.DataFrame(cpk_results, index=stats.index)
stats = pd.concat([stats, cpk_df], axis=1)
self._print_progress(f"所有 {len(stats)} 个分组CPK计算完成", 3)
return stats
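# Worked example (illustrative): mean=10, std=0.5, USL=12, LSL=9 gives
# CPU = (12 - 10) / (3 * 0.5) ≈ 1.333 and CPL = (10 - 9) / (3 * 0.5) ≈ 0.667,
# so Cpk = min(CPU, CPL) ≈ 0.667, which lands in the "poor" (< 1.0) band.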
def _get_cpk_status_class(self, cpk_value):
"""根据CPK值返回状态类别"""
if pd.isna(cpk_value):
return 'cpk-invalid'
elif cpk_value >= 1.33:
return 'cpk-excellent'
elif cpk_value >= 1.0:
return 'cpk-acceptable'
else:
return 'cpk-poor'
def _create_html_report(self, stats, cpk_analysis):
"""创建完整的HTML报告"""
self._print_progress("构建HTML报告内容...", 3)
total_groups = len(stats)
# 完整的HTML模板
html_content = f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>数据统计报告 - {self.filename}</title>
<style>
:root {{
--color-excellent: #4CAF50;
--color-acceptable: #FFC107;
--color-poor: #F44336;
--color-invalid: #9E9E9E;
}}
body {{
font-family: 'Segoe UI', Arial, sans-serif;
margin: 20px;
line-height: 1.6;
background-color: #f8f9fa;
}}
.container {{
max-width: 95%;
margin: 0 auto;
background: white;
padding: 20px;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}}
h1 {{
color: #2c3e50;
border-bottom: 3px solid #3498db;
padding-bottom: 10px;
text-align: center;
}}
h2 {{
color: #34495e;
margin-top: 30px;
padding: 15px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
border-radius: 5px;
}}
.summary {{
background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
color: white;
padding: 20px;
border-radius: 10px;
margin-bottom: 30px;
}}
.cpk-dashboard {{
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 20px;
margin: 20px 0;
}}
.cpk-card {{
background: white;
padding: 20px;
border-radius: 10px;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
text-align: center;
}}
.cpk-excellent {{ background-color: var(--color-excellent); color: white; }}
.cpk-acceptable {{ background-color: var(--color-acceptable); color: black; }}
.cpk-poor {{ background-color: var(--color-poor); color: white; }}
.cpk-invalid {{ background-color: var(--color-invalid); color: white; }}
table {{
border-collapse: collapse;
width: 100%;
margin-top: 20px;
font-size: 12px;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
background: white;
}}
th, td {{
border: 1px solid #ddd;
padding: 12px;
text-align: center;
}}
th {{
background: linear-gradient(135deg, #74b9ff 0%, #0984e3 100%);
color: white;
font-weight: bold;
position: sticky;
top: 0;
}}
tr:nth-child(even) {{ background-color: #f8f9fa; }}
tr:hover {{ background-color: #e3f2fd; }}
.limits {{
background-color: #e8f5e8;
font-weight: bold;
color: #2e7d32;
}}
.measured {{
background-color: #fff3cd;
color: #856404;
}}
.problematic-row {{
background-color: #ffebee !important;
border-left: 4px solid var(--color-poor);
}}
.warning-box {{
background: #fff3cd;
border-left: 4px solid #ffc107;
padding: 15px;
margin: 20px 0;
border-radius: 5px;
}}
.chart-container {{
margin: 20px 0;
padding: 20px;
background: white;
border-radius: 10px;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}}
.legend {{
display: flex;
justify-content: center;
gap: 20px;
margin: 20px 0;
flex-wrap: wrap;
}}
.legend-item {{
display: flex;
align-items: center;
gap: 5px;
padding: 5px 10px;
border-radius: 3px;
}}
.na {{ color: #999; font-style: italic; }}
</style>
</head>
<body>
<div class="container">
<h1>📊 数据统计报告 - {self.filename}</h1>
<p><strong>生成时间:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<p><strong>输入文件:</strong> {self.filename}</p>
<div class="summary">
<h2>📈 报告摘要</h2>
<p><strong>总分组数量:</strong> {total_groups}</p>
<p><strong>处理时间:</strong> {(datetime.now() - self.processing_start_time).total_seconds():.2f} 秒</p>
</div>
<!-- CPK状态仪表板 -->
<div class="cpk-dashboard">
"""
# 添加CPK状态卡片
for feature, status in cpk_analysis['cpk_status'].items():
total = status['total'] + status['invalid']
if total == 0:
continue
html_content += f"""
<div class="cpk-card">
<h3>{feature} CPK状态</h3>
<div style="font-size: 2em; font-weight: bold; margin: 10px 0;">
{status['excellent'] + status['acceptable']}/{total}
</div>
<p>合格率: {(status['excellent'] + status['acceptable']) / total * 100:.1f}%</p>
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; margin-top: 10px;">
<span class="legend-item cpk-excellent">优秀: {status['excellent']}</span>
<span class="legend-item cpk-acceptable">合格: {status['acceptable']}</span>
<span class="legend-item cpk-poor">不合格: {status['poor']}</span>
<span class="legend-item cpk-invalid">无效: {status['invalid']}</span>
</div>
</div>
"""
html_content += f"""
</div>
<!-- 问题分组警告 -->
{f'<div class="warning-box"><h3>⚠️ 发现 {len(cpk_analysis["problematic_groups"])} 个问题分组</h3><p>以下分组的CPK值低于1.0,需要重点关注</p></div>' if cpk_analysis['problematic_groups'] else ''}
<h2>📋 详细统计数据</h2>
<div class="legend">
<span class="legend-item" style="background-color: #e8f5e8;">预设上下限</span>
<span class="legend-item" style="background-color: #fff3cd;">实测值</span>
<span class="legend-item cpk-excellent">CPK ≥ 1.33</span>
<span class="legend-item cpk-acceptable">1.0 ≤ CPK < 1.33</span>
<span class="legend-item cpk-poor">CPK < 1.0</span>
</div>
<div style="overflow-x: auto;">
<table>
<thead>
<tr>
<th rowspan="2">分组标识</th>
<th colspan="7">Height(mil)</th>
<th colspan="7">Volume(%)</th>
<th colspan="7">Area(%)</th>
</tr>
<tr>
<!-- Height列标题 -->
<th class="limits">预设下限</th>
<th class="limits">预设上限</th>
<th class="measured">实测最小值</th>
<th class="measured">实测最大值</th>
<th>平均值</th>
<th>标准差</th>
<th>CPK</th>
<!-- Volume列标题 -->
<th class="limits">预设下限</th>
<th class="limits">预设上限</th>
<th class="measured">实测最小值</th>
<th class="measured">实测最大值</th>
<th>平均值</th>
<th>标准差</th>
<th>CPK</th>
<!-- Area列标题 -->
<th class="limits">预设下限</th>
<th class="limits">预设上限</th>
<th class="measured">实测最小值</th>
<th class="measured">实测最大值</th>
<th>平均值</th>
<th>标准差</th>
<th>CPK</th>
</tr>
</thead>
<tbody>
"""
# 生成表格行数据的辅助函数
def format_value(value):
if pd.isna(value):
return '<span class="na">N/A</span>'
elif isinstance(value, (int, float)):
return f"{value:.4f}"
else:
return str(value)
# 用于检查列是否存在的辅助函数
def safe_get_value(row, column_name):
"""安全获取列值如果列不存在返回N/A"""
if column_name in row.index:
return row[column_name]
else:
return np.nan
for group_key, row in stats.iterrows():
# 检查是否为问题分组
is_problematic = any(problem['group_key'] == group_key for problem in cpk_analysis['problematic_groups'])
row_class = 'class="problematic-row"' if is_problematic else ''
html_content += f"""
<tr {row_class}>
<td><strong>{group_key}</strong>{' ⚠️' if is_problematic else ''}</td>
"""
# 为每个特征生成列
for feature in ['Height', 'Volume', 'Area']:
cpk_value = safe_get_value(row, f'{feature}_Cpk')
cpk_class = self._get_cpk_status_class(cpk_value)
# 为不同特征设置正确的列名
if feature == 'Height':
lower_limit_col = 'Height_Low(mil)'
upper_limit_col = 'Height_High(mil)'
measured_min_col = 'Height_Measured_Min(mil)'
measured_max_col = 'Height_Measured_Max(mil)'
mean_col = 'Height_Mean(mil)'
std_col = 'Height_Std(mil)'
else:
lower_limit_col = f"{'Vol' if feature == 'Volume' else 'Area'}_Min(%)" # 修正Volume使用Vol_Min(%)Area使用Area_Min(%)
upper_limit_col = f"{'Vol' if feature == 'Volume' else 'Area'}_Max(%)" # 修正Volume使用Vol_Max(%)Area使用Area_Max(%)
measured_min_col = f'{feature}_Measured_Min(%)'
measured_max_col = f'{feature}_Measured_Max(%)'
mean_col = f'{feature}_Mean(%)'
std_col = f'{feature}_Std(%)'
html_content += f"""
<!-- {feature}数据 -->
<td class="limits">{format_value(safe_get_value(row, lower_limit_col))}</td>
<td class="limits">{format_value(safe_get_value(row, upper_limit_col))}</td>
<td class="measured">{format_value(safe_get_value(row, measured_min_col))}</td>
<td class="measured">{format_value(safe_get_value(row, measured_max_col))}</td>
<td>{format_value(safe_get_value(row, mean_col))}</td>
<td>{format_value(safe_get_value(row, std_col))}</td>
<td class="{cpk_class}">{format_value(cpk_value)}</td>
"""
html_content += """
</tr>"""
html_content += """
</tbody>
</table>
</div>
<div class="chart-container">
<h2>📊 CPK状态分布</h2>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px;">
"""
# 添加简单的CPK分布图表
for feature, status in cpk_analysis['cpk_status'].items():
total = status['total'] + status['invalid']
if total == 0:
continue
html_content += f"""
<div>
<h3>{feature} CPK分布</h3>
<div style="background: #f8f9fa; padding: 20px; border-radius: 5px;">
<div style="display: flex; height: 30px; margin: 10px 0; border-radius: 5px; overflow: hidden;">
<div style="background: var(--color-excellent); width: {status['excellent'] / total * 100}%;"></div>
<div style="background: var(--color-acceptable); width: {status['acceptable'] / total * 100}%;"></div>
<div style="background: var(--color-poor); width: {status['poor'] / total * 100}%;"></div>
<div style="background: var(--color-invalid); width: {status['invalid'] / total * 100}%;"></div>
</div>
<div style="text-align: center;">
<small>优秀 {status['excellent']} | 合格 {status['acceptable']} | 不合格 {status['poor']} | 无效 {status['invalid']}</small>
</div>
</div>
</div>
"""
html_content += """
</div>
</div>
</div>
</body>
</html>"""
# 保存报告
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
report_filename = f"{os.path.splitext(self.filename)[0]}_report_{timestamp}.html"
report_path = os.path.join(self.file_dir, report_filename)
self._print_progress(f"保存报告到: {report_path}", 3)
with open(report_path, 'w', encoding='utf-8') as f:
f.write(html_content)
return report_path
def main():
"""主函数"""
print("=" * 60)
print("🚀 数据统计报告生成程序 - Volume上下限修复版")
print("=" * 60)
processor = DataProcessor()
try:
if processor.select_file():
processor._load_data()
report_path = processor.generate_report()
print("\n" + "=" * 60)
print("✅ 程序执行完成")
print(f"📄 统计报告生成成功: {report_path}")
print("=" * 60)
else:
print("❌ 未选择文件,程序退出")
except Exception as e:
print(f"\n❌ 程序执行失败: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
main()
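A compact sketch (synthetic data) of the groupby/agg step generate_report builds on: agg yields two-level columns, which the script flattens into the Height_*/Volume_*/Area_* names before CPK is computed.

import pandas as pd

df = pd.DataFrame({
    "Group_Key": ["P1_C1", "P1_C1", "P2_C2"],
    "Height(mil)": [1.0, 1.2, 0.9],
})
stats = df.groupby("Group_Key").agg({"Height(mil)": ["min", "max", "mean", "std"]}).round(4)
stats.columns = ["Height_Measured_Min(mil)", "Height_Measured_Max(mil)",
                 "Height_Mean(mil)", "Height_Std(mil)"]
print(stats)
# P1_C1 -> min 1.0, max 1.2, mean 1.1, std 0.1414; P2_C2 -> std NaN (one point)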

17
htmlProcess/.gitignore vendored Normal file
View File

@@ -0,0 +1,17 @@
/build/*
/build
/dist/*
/dist
/source/*
/source
htmlReportProcess_Merge_picHtml_V3.py
htmlReportProcess_Merge_picHtml_V2.py
htmlReportProcess_Merge_pic_V2.py
/htmlReportProcess*/

11
htmlProcess/README.md Normal file
View File

@@ -0,0 +1,11 @@
# Sample GitLab Project
This sample project shows how a project in GitLab looks for demonstration purposes. It contains issues, merge requests and Markdown files in many branches,
named and filled with lorem ipsum.
You can look around to get an idea how to structure your project and, when done, you can safely delete this project.
[Learn more about creating GitLab projects.](https://docs.gitlab.com/ee/gitlab-basics/create-project.html)
Tool scripts for automatically analyzing HTML report files and processing their data.

View File

@@ -0,0 +1,926 @@
import os
import re
import sys
import time
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib.lines import Line2D
from typing import Optional, Tuple, List, Dict, Any, Union
from pathlib import Path
import numpy as np
import base64
from io import BytesIO
from jinja2 import Template
from colorama import Fore, Style, init
# 避免 SettingWithCopy 警告影响输出可读性
pd.options.mode.chained_assignment = None
# 设置中文字体支持
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
# HTML模板 - 添加了SN独立图的显示
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>测试报告分析 - {{ keyword }}</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
}
.header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 10px;
margin-bottom: 20px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.test-card {
background: white;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
transition: transform 0.2s ease;
}
.test-card:hover {
transform: translateY(-2px);
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15);
}
.test-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 15px;
padding-bottom: 10px;
border-bottom: 2px solid #eaeaea;
}
.test-title {
font-size: 18px;
font-weight: bold;
color: #333;
}
.test-stats {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 15px;
margin-bottom: 15px;
}
.stat-item {
background: #f8f9fa;
padding: 12px;
border-radius: 8px;
text-align: center;
}
.stat-label {
font-size: 12px;
color: #666;
margin-bottom: 5px;
}
.stat-value {
font-size: 16px;
font-weight: bold;
color: #333;
}
.plot-container {
text-align: center;
margin: 20px 0;
}
.plot-image {
max-width: 100%;
height: auto;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
}
.sn-plots-container {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
gap: 20px;
margin: 20px 0;
}
.sn-plot-item {
background: #f8f9fa;
padding: 15px;
border-radius: 8px;
text-align: center;
}
.sn-plot-title {
font-size: 14px;
font-weight: bold;
margin-bottom: 10px;
color: #555;
}
.summary {
background: white;
border-radius: 10px;
padding: 20px;
margin-top: 20px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.summary-item {
margin: 10px 0;
padding: 10px;
background: #f8f9fa;
border-radius: 6px;
}
.timestamp {
text-align: center;
color: #666;
margin-top: 30px;
font-size: 12px;
}
.badge {
padding: 4px 8px;
border-radius: 12px;
font-size: 12px;
font-weight: bold;
}
.badge-success {
background: #d4edda;
color: #155724;
}
.badge-warning {
background: #fff3cd;
color: #856404;
}
.badge-danger {
background: #f8d7da;
color: #721c24;
}
.section-title {
font-size: 16px;
font-weight: bold;
margin: 20px 0 10px 0;
color: #333;
border-left: 4px solid #667eea;
padding-left: 10px;
}
</style>
</head>
<body>
<div class="header">
<h1>📊 测试报告分析</h1>
<p>关键词: <strong>{{ keyword }}</strong> | 生成时间: {{ timestamp }}</p>
<p>共分析 {{ test_count }} 个测试项,{{ total_points }} 个数据点</p>
</div>
{% for test in tests %}
<div class="test-card">
<div class="test-header">
<div class="test-title">📋 {{ test.name }}</div>
<div class="badge badge-{{ test.status }}">
{{ test.status_display }}
</div>
</div>
<div class="test-stats">
<div class="stat-item">
<div class="stat-label">数据点数</div>
<div class="stat-value">{{ test.stats.count }}</div>
</div>
<div class="stat-item">
<div class="stat-label">平均值</div>
<div class="stat-value">{{ "%.4f"|format(test.stats.mean) }}</div>
</div>
<div class="stat-item">
<div class="stat-label">中位数</div>
<div class="stat-value">{{ "%.4f"|format(test.stats.median) }}</div>
</div>
<div class="stat-item">
<div class="stat-label">标准差</div>
<div class="stat-value">{{ "%.4f"|format(test.stats.std) }}</div>
</div>
<div class="stat-item">
<div class="stat-label">最小值</div>
<div class="stat-value">{{ "%.4f"|format(test.stats.min) }}</div>
</div>
<div class="stat-item">
<div class="stat-label">最大值</div>
<div class="stat-value">{{ "%.4f"|format(test.stats.max) }}</div>
</div>
</div>
{% if test.limits.lower is not none or test.limits.upper is not none %}
<div class="test-stats">
{% if test.limits.lower is not none %}
<div class="stat-item">
<div class="stat-label">下限值</div>
<div class="stat-value">{{ "%.4f"|format(test.limits.lower) }}</div>
</div>
{% endif %}
{% if test.limits.upper is not none %}
<div class="stat-item">
<div class="stat-label">上限值</div>
<div class="stat-value">{{ "%.4f"|format(test.limits.upper) }}</div>
</div>
{% endif %}
</div>
{% endif %}
<!-- 汇总图 -->
<div class="section-title">📈 汇总视图 (所有SN)</div>
<div class="plot-container">
<img src="data:image/png;base64,{{ test.summary_plot_image }}" alt="{{ test.name }} 汇总散点图" class="plot-image">
</div>
<!-- SN独立图 -->
{% if test.sn_plot_images %}
<div class="section-title">🔍 SN独立视图 ({{ test.sn_plot_images|length }}个SN)</div>
<div class="sn-plots-container">
{% for sn_plot in test.sn_plot_images %}
<div class="sn-plot-item">
<div class="sn-plot-title">SN: {{ sn_plot.sn }}</div>
<img src="data:image/png;base64,{{ sn_plot.image }}" alt="{{ test.name }} - SN {{ sn_plot.sn }} 散点图" class="plot-image">
</div>
{% endfor %}
</div>
{% endif %}
</div>
{% endfor %}
<div class="summary">
<h3>📈 分析摘要</h3>
<div class="summary-item">
<strong>文件路径:</strong> {{ file_path }}
</div>
<div class="summary-item">
<strong>分析时间:</strong> {{ analysis_time }}秒
</div>
<div class="summary-item">
<strong>测试项分布:</strong>
<ul>
<li>正常: {{ status_counts.normal }} 个</li>
<li>警告: {{ status_counts.warning }} 个</li>
<li>异常: {{ status_counts.abnormal }} 个</li>
</ul>
</div>
</div>
<div class="timestamp">
报告生成于 {{ timestamp }} | 测试报告分析系统
</div>
</body>
</html>
"""
class TestReportScatterPlotter:
def __init__(self):
self.file_path: Optional[str] = None
self.df: Optional[pd.DataFrame] = None
self.output_dir: Optional[str] = None
self.required_columns = ["Test Name New", "SN", "Measurement", "Test Time", "Lower Limit", "Upper Limit", ]
self.col_lower: Optional[str] = None
self.col_upper: Optional[str] = None
self.html_report_path: Optional[str] = None
# 缓存处理过的数据
self._processed_data_cache: Dict[str, Any] = {}
def _print_stage(self, msg: str) -> None:
"""统一的阶段信息输出"""
print(f"\n{'=' * 30}\n{msg}\n{'=' * 30}")
def _print_progress(self, current: int, total: int, prefix: str = "进度") -> None:
"""改进的进度条显示"""
if total <= 0:
return
percent = (current / total) * 100
bar_len = 30
filled = int(bar_len * current / total)
bar = "" * filled + "-" * (bar_len - filled)
sys.stdout.write(f"\r{prefix}: [{bar}] {current}/{total} ({percent:.1f}%)")
sys.stdout.flush()
if current == total:
print() # 换行
def get_file_path(self) -> None:
"""改进的文件路径获取,支持路径补全"""
self._print_stage("输入文件路径")
while True:
print(f"{Fore.WHITE}请输入测试报告文件路径(.xlsx): ")
file_path = input("> ").strip()
# 尝试路径补全和验证
if not file_path:
continue
path_obj = Path(file_path)
if path_obj.exists():
self.file_path = str(path_obj.resolve())
print(f"已选择文件: {self.file_path}")
break
else:
print(f"文件不存在: {file_path},请重新输入")
def _find_column_case_insensitive(self, candidates: List[str]) -> Optional[str]:
"""优化的大小写不敏感列查找"""
if self.df is None:
return None
columns_lower = {col.lower().strip(): col for col in self.df.columns}
for candidate in candidates:
key = candidate.lower().strip()
if key in columns_lower:
return columns_lower[key]
return None
def load_data(self) -> None:
"""优化的数据加载方法"""
self._print_stage("加载数据")
start_time = time.time()
# 检查文件是否存在
if not os.path.exists(self.file_path):
raise FileNotFoundError(f"文件不存在: {self.file_path}")
# 根据文件扩展名选择最优引擎
file_ext = self.file_path.lower()
if file_ext.endswith('.xlsx'):
# .xlsx 文件引擎选择优先级
engine_options = ['openpyxl', 'calamine'] # calamine需要安装并可能更快
engine = 'openpyxl' # 默认
elif file_ext.endswith('.xls'):
# .xls 文件引擎选择
engine_options = ['xlrd', 'calamine']
engine = 'xlrd' # 默认
else:
raise ValueError("输入文件不是有效的 Excel 文件(应为 .xls 或 .xlsx 格式)")
# 快速获取工作表名称(轻量级方式)
try:
if engine == 'openpyxl':
import openpyxl
workbook = openpyxl.load_workbook(self.file_path, read_only=True)
sheet_names = workbook.sheetnames
workbook.close()
elif engine == 'xlrd':
import xlrd
workbook = xlrd.open_workbook(self.file_path, on_demand=True)
sheet_names = workbook.sheet_names()
workbook.release_resources()
else:
# 使用pandas的轻量级方式
excel_file = pd.ExcelFile(self.file_path, engine=engine)
sheet_names = excel_file.sheet_names
except Exception as e:
raise RuntimeError(f"无法打开 Excel 文件,请确认该文件未被损坏或占用。错误: {type(e).__name__}: {e}")
# 定义优先查找的工作表名
target_sheets = ["Merged All Tests", "All Tests"]
selected_sheet = None
for sheet in target_sheets:
if sheet in sheet_names:
selected_sheet = sheet
break
if selected_sheet is None:
raise ValueError(
f"未找到指定的工作表: {', '.join(target_sheets)}\n"
f"当前文件包含的工作表有: {sheet_names}"
)
try:
# 性能优化:使用更高效的参数设置
read_excel_kwargs = {
# 'filepath_or_buffer': self.file_path,
'io': self.file_path, # 修正:使用'io'而不是'filepath_or_buffer'
'sheet_name': selected_sheet,
'engine': engine,
'dtype': 'object', # 先统一读取为对象类型,减少类型推断时间
'na_filter': False, # 禁用自动NA过滤提高读取速度
}
# 如果知道必需列,且不为空,则只读取需要的列
if hasattr(self, 'required_columns') and self.required_columns:
# 先检查哪些列实际存在
try:
# 轻量级检查列名是否存在
sample_df = pd.read_excel(
self.file_path,
sheet_name=selected_sheet,
engine=engine,
nrows=1 # 只读取第一行来获取列名
)
existing_columns = [col for col in self.required_columns if col in sample_df.columns]
if len(existing_columns) < len(self.required_columns):
missing = set(self.required_columns) - set(existing_columns)
raise KeyError(f"缺少必要列: {list(missing)}")
read_excel_kwargs['usecols'] = existing_columns
# print(f"使用 read_excel_kwargs 读取excel:\n {read_excel_kwargs}")
# 打印完整的参数信息(调试用)
print("使用 read_excel_kwargs 读取excel:")
for key, value in read_excel_kwargs.items():
print(f" {key}: {repr(value)}") # 使用repr确保特殊字符正确显示
except Exception as e:
print(f"列检查失败,将读取所有列: {e}")
# 如果列检查失败,回退到读取所有列
# 执行数据读取
self._print_stage("执行数据读取")
self.df = pd.read_excel(**read_excel_kwargs)
except Exception as e:
# 如果默认引擎失败,尝试备选引擎
print(f"引擎 {engine} 读取失败,尝试备选引擎...\n{e}")
try:
# 回退到基本的读取方式
self.df = pd.read_excel(
self.file_path,
sheet_name=selected_sheet,
engine=None # 让pandas自动选择
)
except Exception as fallback_e:
raise RuntimeError(
f"读取 Excel 失败,工作表: '{selected_sheet}'"
f"主引擎错误: {type(e).__name__}: {e}\n"
f"备选引擎错误: {type(fallback_e).__name__}: {fallback_e}"
)
if self.df.empty:
raise ValueError("工作表为空,无法处理")
# 校验必要列如果前面没有使用usecols过滤这里需要再次检查
if hasattr(self, 'required_columns') and self.required_columns:
missing_columns = [col for col in self.required_columns if col not in self.df.columns]
if missing_columns:
raise KeyError(f"缺少必要列: {missing_columns}")
# 记录上下限列名
self.col_lower = self._find_column_case_insensitive([
"Lower Limit", "lower limit", "lower_limit", "ll", "lower"
])
self.col_upper = self._find_column_case_insensitive([
"Upper Limit", "upper limit", "upper_limit", "ul", "upper"
])
loading_time = time.time() - start_time
print(f"数据加载完成: {len(self.df)}× {self.df.shape[1]}")
print(f"使用引擎: {engine}")
print(f"耗时: {loading_time:.2f}s")
# 显示列信息摘要
print(f"检测到下限列: {self.col_lower or ''}")
print(f"检测到上限列: {self.col_upper or ''}")
# 可选:类型转换(如果知道具体的数据类型)
# self._convert_data_types()
# 可以添加这个方法进行类型转换优化
def _convert_data_types(self):
"""优化数据类型转换"""
if self.df is None or self.df.empty:
return
# 根据列名模式推断数据类型
numeric_patterns = ['limit', 'value', 'measure', 'result', 'score']
date_patterns = ['date', 'time', 'period']
for col in self.df.columns:
col_lower = str(col).lower()
# 数值类型转换
if any(pattern in col_lower for pattern in numeric_patterns):
self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
# 日期类型转换
elif any(pattern in col_lower for pattern in date_patterns):
self.df[col] = pd.to_datetime(self.df[col], errors='coerce')
def get_keyword(self) -> Tuple[pd.DataFrame, str, List[str]]:
"""获取用户输入的关键词并筛选数据"""
self._print_stage("筛选关键词")
while True:
keyword = input("请输入筛选关键词(匹配 'Test Name New'): ").strip()
if not keyword:
print("❌ 关键词不能为空,请重新输入")
continue
# 检查数据框是否为空
if self.df.empty:
print("⚠️ 数据框为空,无法进行筛选")
return pd.DataFrame(), keyword, []
# 检查列是否存在
if "Test Name New" not in self.df.columns:
print("❌ 列 'Test Name New' 不存在于数据框中")
print(f"可用列: {list(self.df.columns)}")
return pd.DataFrame(), keyword, []
try:
mask = self.df["Test Name New"].astype(str).str.contains(keyword, case=False, na=False)
filtered_df = self.df.loc[mask].copy()
if filtered_df.empty:
# 提供友好的提示和建议,而不是直接抛出异常
print(f"⚠️ 没有找到包含关键词 '{keyword}' 的测试项")
# 显示部分可用的测试项作为参考
available_tests = self.df["Test Name New"].dropna().unique()
if len(available_tests) > 0:
print("📋 可用的测试项示例:")
for test in available_tests[:5]: # 只显示前5个作为参考
print(f" - {test}")
if len(available_tests) > 5:
print(f" ... 还有 {len(available_tests) - 5} 个测试项")
# 提供重新输入或退出的选项
choice = input("请选择: 1-重新输入关键词 2-使用所有数据 3-退出当前操作: ")
if choice == "1":
continue
elif choice == "2":
filtered_df = self.df.copy()
unique_tests = filtered_df["Test Name New"].unique().tolist()
print(f"✅ 使用所有数据: {len(filtered_df)} 行,{len(unique_tests)} 个测试项")
return filtered_df, "", unique_tests
else:
print("👋 退出筛选操作")
return pd.DataFrame(), keyword, []
else:
unique_tests = filtered_df["Test Name New"].unique().tolist()
print(f"✅ 匹配到 {len(filtered_df)} 行数据,涉及 {len(unique_tests)} 个不同测试项")
return filtered_df, keyword, unique_tests
except Exception as e:
print(f"❌ 筛选过程中发生错误: {e}")
print("请检查数据格式或重新输入关键词")
continue
def create_output_dir(self, keyword) -> None:
"""创建输出目录"""
self._print_stage("创建输出目录")
if not self.file_path:
raise ValueError("文件路径未设置")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base_dir = os.path.dirname(self.file_path)
# self.output_dir = os.path.join(base_dir, f"scatter_report_{timestamp}")
self.output_dir = os.path.join(base_dir, f"scatter_report_out")
self.html_report_path = os.path.join(self.output_dir, f"{keyword}_report_{timestamp}.html")
os.makedirs(self.output_dir, exist_ok=True)
print(f"输出目录: {self.output_dir}")
@staticmethod
def _safe_filename(name: str) -> str:
"""生成安全的文件名"""
safe = "".join(c for c in str(name) if c.isalnum() or c in (" ", "_", "-")).strip()
return safe or "Unknown_Test"
def _extract_limits(self, df_one_test: pd.DataFrame) -> Tuple[
Optional[float], Optional[float], List[float], List[float]]:
"""提取某个测试项的上下限数值"""
lower_plot = upper_plot = None
lower_set = []
upper_set = []
if self.col_lower and self.col_lower in df_one_test.columns:
lower_vals = self._clean_and_convert_series(df_one_test[self.col_lower], 'numeric').dropna().unique()
lower_set = sorted(lower_vals.tolist()) if len(lower_vals) > 0 else []
if lower_set:
lower_plot = min(lower_set)
if self.col_upper and self.col_upper in df_one_test.columns:
upper_vals = self._clean_and_convert_series(df_one_test[self.col_upper], 'numeric').dropna().unique()
upper_set = sorted(upper_vals.tolist()) if len(upper_vals) > 0 else []
if upper_set:
upper_plot = max(upper_set)
return lower_plot, upper_plot, lower_set, upper_set
@staticmethod
def _clean_and_convert_series(series: pd.Series, target_type: str = 'numeric') -> pd.Series:
"""统一的系列清洗和转换方法 - 修复了 ast 方法名错误"""
if series.empty:
return series
if target_type == 'numeric':
# 数值转换优化
if pd.api.types.is_numeric_dtype(series):
return series.astype(float)
# 批量字符串处理 - 修复这里的问题
cleaned = series.astype(str).str.replace(r'[, ]', '', regex=True).str.strip()
return pd.to_numeric(cleaned, errors='coerce')
elif target_type == 'datetime':
return TestReportScatterPlotter._convert_to_datetime(series)
return series
@staticmethod
def _convert_to_datetime(series: pd.Series) -> pd.Series:
"""优化的日期时间转换"""
if pd.api.types.is_datetime64_any_dtype(series):
return series
# 预处理:转换为数值和字符串两种形式
numeric_series = pd.to_numeric(series, errors='coerce')
string_series = series.astype(str).str.strip()
result = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns]')
# 数值时间戳处理
masks = {
'ms': numeric_series >= 1e11,
's': (numeric_series >= 1e9) & (numeric_series < 1e11),
'excel': (numeric_series > 20000) & (numeric_series < 60000)
}
for mask_type, mask in masks.items():
if mask.any():
if mask_type == 'ms':
result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='ms')
elif mask_type == 's':
result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='s')
elif mask_type == 'excel':
origin = pd.Timestamp('1899-12-30')
result.loc[mask] = origin + pd.to_timedelta(numeric_series.loc[mask], unit='D')
# 字符串日期处理
remaining_mask = result.isna()
if remaining_mask.any():
remaining_strings = string_series.loc[remaining_mask]
# 特定格式优先处理
format_patterns = [
(r'^\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}$', '%Y-%m-%d %H-%M-%S'),
]
for pattern, date_format in format_patterns:
format_mask = remaining_strings.str.match(pattern)
if format_mask.any():
result.loc[remaining_mask[remaining_mask].index[format_mask]] = pd.to_datetime(
remaining_strings.loc[format_mask], format=date_format, errors='coerce'
)
# 通用解析
still_na_mask = result.isna() & remaining_mask
if still_na_mask.any():
result.loc[still_na_mask] = pd.to_datetime(
string_series.loc[still_na_mask], errors='coerce'
)
return result
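# 数值时间戳判定示例(假设的典型取值):
# 1.7e12 -> 毫秒级 Unix 时间戳(约 2023 年);1.7e9 -> 秒级(约 2023 年);45000 -> Excel 序列日期(约 2023-03)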
def _preprocess_test_data(self, test_data: pd.DataFrame) -> pd.DataFrame:
"""数据预处理"""
# 数值转换
test_data['Measurement_num'] = self._clean_and_convert_series(
test_data['Measurement'], 'numeric'
)
test_data['TestTime_dt'] = self._clean_and_convert_series(
test_data['Test Time'], 'datetime'
)
# 去除无效数据
valid_data = test_data.dropna(subset=['Measurement_num', 'TestTime_dt'])
return valid_data.sort_values('TestTime_dt')
def _calculate_statistics(self, y_data: pd.Series) -> Dict[str, float]:
"""计算统计信息"""
stats = {
'count': len(y_data),
'mean': y_data.mean(),
'median': y_data.median(),
'min': y_data.min(),
'max': y_data.max(),
'std': y_data.std(),
'q1': y_data.quantile(0.25),
'q3': y_data.quantile(0.75)
}
return stats
def _plot_to_base64(self, fig) -> str:
"""将图表转换为base64编码"""
buf = BytesIO()
fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
buf.seek(0)
img_str = base64.b64encode(buf.read()).decode('utf-8')
plt.close(fig)
return img_str
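# 用法示例(假设):返回的 base64 字符串可直接嵌入 HTML
# <img src="data:image/png;base64,{img_str}">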
def _create_summary_plot(self, test_data: pd.DataFrame, test_name: str,
lower_plot: Optional[float], upper_plot: Optional[float]) -> str:
"""创建汇总图所有SN在一个图中"""
fig, ax = plt.subplots(figsize=(12, 8))
# 分组绘制
groups = list(test_data.groupby("SN")) if "SN" in test_data.columns else [("Unknown_SN", test_data)]
for sn, group in groups:
ax.scatter(group['TestTime_dt'], group['Measurement_num'],
label=str(sn), alpha=0.7, s=25)
# 计算统计信息
y_data = test_data['Measurement_num']
stats = self._calculate_statistics(y_data)
# 绘制限值线和统计线
x_min, x_max = test_data['TestTime_dt'].min(), test_data['TestTime_dt'].max()
if lower_plot is not None:
ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2, label="Lower Limit")
if upper_plot is not None:
ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2, label="Upper Limit")
# 添加统计线
ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, colors='orange',
linestyles='-', linewidth=1.5, alpha=0.7, label='Mean')
ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, colors='purple',
linestyles='-.', linewidth=1.5, alpha=0.7, label='Median')
# 设置图形属性
ax.set_title(f"汇总图 - {test_name}")
ax.set_xlabel("Test Time")
ax.set_ylabel("Measurement Value")
ax.grid(True, alpha=0.3)
ax.tick_params(axis='x', rotation=45)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
return self._plot_to_base64(fig)
def _create_sn_plots(self, test_data: pd.DataFrame, test_name: str,
lower_plot: Optional[float], upper_plot: Optional[float]) -> List[Dict[str, str]]:
"""为每个SN创建独立图表"""
sn_plots = []
if "SN" not in test_data.columns:
return sn_plots
sn_groups = test_data.groupby("SN")
for sn, group in sn_groups:
if group.empty:
continue
fig, ax = plt.subplots(figsize=(10, 6))
# 绘制当前SN的数据点
ax.scatter(group['TestTime_dt'], group['Measurement_num'],
color='blue', alpha=0.7, s=30, label=f"SN: {sn}")
# 计算当前SN的统计信息
y_data = group['Measurement_num']
stats = self._calculate_statistics(y_data)
# 绘制限值线
x_min, x_max = group['TestTime_dt'].min(), group['TestTime_dt'].max()
if lower_plot is not None:
ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2, label="Lower Limit")
if upper_plot is not None:
ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2, label="Upper Limit")
# 添加统计线
ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max, colors='orange',
linestyles='-', linewidth=1.5, alpha=0.7, label='Mean')
ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max, colors='purple',
linestyles='-.', linewidth=1.5, alpha=0.7, label='Median')
# 设置图形属性
ax.set_title(f"SN独立图 - {test_name} (SN: {sn})")
ax.set_xlabel("Test Time")
ax.set_ylabel("Measurement Value")
ax.grid(True, alpha=0.3)
ax.tick_params(axis='x', rotation=45)
ax.legend()
# 转换为base64
plot_image = self._plot_to_base64(fig)
sn_plots.append({"sn": str(sn), "image": plot_image})
return sn_plots
def _determine_test_status(self, stats: Dict[str, float],
lower_limit: Optional[float],
upper_limit: Optional[float]) -> Dict[str, Any]:
"""确定测试状态"""
status = "success"
status_display = "正常"
if lower_limit is not None and upper_limit is not None:
# 检查是否超出限值
if stats['min'] < lower_limit or stats['max'] > upper_limit:
status = "danger"
status_display = "异常"
elif (stats['mean'] < lower_limit * 1.1 or stats['mean'] > upper_limit * 0.9 or
stats['std'] > (upper_limit - lower_limit) * 0.2):
status = "warning"
status_display = "警告"
return {"status": status, "status_display": status_display}
def generate_html_report(self, filtered_df: pd.DataFrame, keyword: str,
unique_tests: List[str]) -> None:
"""生成HTML报告"""
self._print_stage("生成HTML报告")
start_time = time.time()
test_results = []
total_points = 0
status_counts = {"success": 0, "warning": 0, "danger": 0}
for i, test_name in enumerate(unique_tests, 1):
self._print_progress(i, len(unique_tests), "生成测试报告")
# 获取测试数据
test_data = filtered_df[filtered_df["Test Name New"] == test_name].copy()
test_data = self._preprocess_test_data(test_data)
if test_data.empty:
continue
# 提取限值信息
lower_plot, upper_plot, _, _ = self._extract_limits(test_data)
# 计算统计信息
y_data = test_data['Measurement_num']
stats = self._calculate_statistics(y_data)
total_points += stats['count']
# 生成汇总图表
summary_plot_image = self._create_summary_plot(test_data, test_name, lower_plot, upper_plot)
# 生成SN独立图表
sn_plot_images = self._create_sn_plots(test_data, test_name, lower_plot, upper_plot)
# 确定测试状态
status_info = self._determine_test_status(stats, lower_plot, upper_plot)
status_counts[status_info["status"]] += 1
# 添加到结果列表
test_results.append({
"name": test_name,
"stats": stats,
"limits": {"lower": lower_plot, "upper": upper_plot},
"summary_plot_image": summary_plot_image,
"sn_plot_images": sn_plot_images,
"status": status_info["status"],
"status_display": status_info["status_display"]
})
# 渲染HTML模板
template = Template(HTML_TEMPLATE)
html_content = template.render(
keyword=keyword,
timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
test_count=len(test_results),
total_points=total_points,
tests=test_results,
file_path=self.file_path,
analysis_time=round(time.time() - start_time, 2),
status_counts={"normal": status_counts["success"], "warning": status_counts["warning"],
"abnormal": status_counts["danger"]}
)
# 保存HTML文件
with open(self.html_report_path, 'w', encoding='utf-8') as f:
f.write(html_content)
print(f"\nHTML报告已生成: {self.html_report_path}")
print(f"共处理 {len(test_results)} 个测试项,{total_points} 个数据点")
def run(self) -> None:
"""运行主程序"""
try:
self.get_file_path()
self.load_data()
while True:
filtered_df, keyword, unique_tests = self.get_keyword()
if filtered_df.empty:  # 用户在筛选中选择"退出当前操作"时结束循环
    break
self.create_output_dir(keyword)
self.generate_html_report(filtered_df, keyword, unique_tests)
print(f"\n✅ 分析完成!")
# print(f"📊 报告文件: {self.html_report_path}")
# print(f"📁 输出目录: {self.output_dir}")
except KeyboardInterrupt:
print(f"\n{Fore.YELLOW}⚠ 用户中断程序")
except Exception as e:
print(f"\n❌ 发生错误: {type(e).__name__}: {str(e)}")
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
plotter = TestReportScatterPlotter()
plotter.run()

View File

@@ -0,0 +1,563 @@
import os
import re
import sys
import time
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib.lines import Line2D
from typing import Optional, Tuple, List, Dict, Any, Union
from pathlib import Path
import numpy as np
from colorama import Fore, Style, init
# 避免 SettingWithCopy 警告影响输出可读性
pd.options.mode.chained_assignment = None
# 设置中文字体支持
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS', 'Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
class TestReportScatterPlotter:
def __init__(self):
self.file_path: Optional[str] = None
self.df: Optional[pd.DataFrame] = None
self.output_dir: Optional[str] = None
self.required_columns = ["Test Name New", "SN", "Measurement", "Test Time"]
self.col_lower: Optional[str] = None
self.col_upper: Optional[str] = None
# 缓存处理过的数据
self._processed_data_cache: Dict[str, Any] = {}
def _print_stage(self, msg: str) -> None:
"""统一的阶段信息输出"""
print(f"\n{'=' * 30}\n{msg}\n{'=' * 30}")
def _print_progress(self, current: int, total: int, prefix: str = "进度") -> None:
"""改进的进度条显示"""
if total <= 0:
return
percent = (current / total) * 100
bar_len = 30
filled = int(bar_len * current / total)
bar = "" * filled + "-" * (bar_len - filled)
sys.stdout.write(f"\r{prefix}: [{bar}] {current}/{total} ({percent:.1f}%)")
sys.stdout.flush()
if current == total:
print() # 换行
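# 输出示例(假设 current=15, total=30):
# 测试项绘图: [███████████████---------------] 15/30 (50.0%)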
def get_file_path(self) -> None:
"""改进的文件路径获取,支持路径补全"""
self._print_stage("输入文件路径")
while True:
print(f"{Fore.WHITE}请输入测试报告文件路径(.xlsx): ")
file_path = input("> ").strip()
# 尝试路径补全和验证
if not file_path:
continue
path_obj = Path(file_path)
if path_obj.exists():
self.file_path = str(path_obj.resolve())
print(f"已选择文件: {self.file_path}")
break
else:
print(f"文件不存在: {file_path},请重新输入")
def _find_column_case_insensitive(self, candidates: List[str]) -> Optional[str]:
"""优化的大小写不敏感列查找"""
if self.df is None:
return None
columns_lower = {col.lower().strip(): col for col in self.df.columns}
for candidate in candidates:
key = candidate.lower().strip()
if key in columns_lower:
return columns_lower[key]
return None
def load_data(self) -> None:
"""优化的数据加载方法"""
self._print_stage("加载数据")
start_time = time.time()
# try:
# # 使用更高效的数据读取方式
# self.df = pd.read_excel(
# self.file_path,
# sheet_name="Merged All Tests",
# engine='openpyxl' # 指定引擎提高性能
# )
# except Exception as e:
# raise RuntimeError(
# f"读取 Excel 失败,请确认工作表名为 'Merged All Tests'。错误: {type(e).__name__}: {e}"
# )
# 检查文件是否存在
if not os.path.exists(self.file_path):
raise FileNotFoundError(f"文件不存在: {self.file_path}")
# 检查文件扩展名是否为Excel支持的格式
if not self.file_path.lower().endswith(('.xls', '.xlsx')):
raise ValueError("输入文件不是有效的 Excel 文件(应为 .xls 或 .xlsx 格式)")
try:
# 打开Excel文件并获取所有sheet名称
excel_file = pd.ExcelFile(self.file_path, engine='openpyxl')
sheet_names = excel_file.sheet_names
except Exception as e:
raise RuntimeError(f"无法打开 Excel 文件,请确认该文件未被损坏或占用。错误: {type(e).__name__}: {e}")
# 定义优先查找的工作表名
target_sheets = ["Merged All Tests", "All Tests"]
selected_sheet = None
for sheet in target_sheets:
if sheet in sheet_names:
selected_sheet = sheet
break
if selected_sheet is None:
raise ValueError(
f"未找到指定的工作表: {''.join(target_sheets)}"
f"当前文件包含的工作表有: {sheet_names}"
)
try:
# 使用更高效的方式读取指定sheet
self.df = pd.read_excel(
self.file_path,
sheet_name=selected_sheet,
engine='openpyxl'
)
except Exception as e:
raise RuntimeError(
f"读取 Excel 失败,工作表: '{selected_sheet}'。错误: {type(e).__name__}: {e}"
)
if self.df.empty:
raise ValueError("工作表为空,无法处理")
# 校验必要列
missing_columns = [col for col in self.required_columns if col not in self.df.columns]
if missing_columns:
raise KeyError(f"缺少必要列: {missing_columns}")
# 记录上下限列名
self.col_lower = self._find_column_case_insensitive([
"Lower Limit", "lower limit", "lower_limit", "ll", "lower"
])
self.col_upper = self._find_column_case_insensitive([
"Upper Limit", "upper limit", "upper_limit", "ul", "upper"
])
loading_time = time.time() - start_time
print(f"数据加载完成: {len(self.df)}× {self.df.shape[1]}")
print(f"耗时: {loading_time:.2f}s")
# 显示列信息摘要
print(f"检测到下限列: {self.col_lower or ''}")
print(f"检测到上限列: {self.col_upper or ''}")
def get_keyword(self) -> Tuple[pd.DataFrame, str, List[str]]:
"""获取用户输入的关键词并筛选数据"""
self._print_stage("筛选关键词")
while True:
keyword = input("请输入筛选关键词(匹配 'Test Name New'): ").strip()
if not keyword:
print("关键词不能为空,请重新输入")
continue
break
mask = self.df["Test Name New"].astype(str).str.contains(keyword, case=False, na=False)
filtered_df = self.df.loc[mask].copy()
if filtered_df.empty:
raise ValueError(f"没有找到包含关键词 '{keyword}' 的测试项")
unique_tests = filtered_df["Test Name New"].unique().tolist()
print(f"匹配到 {len(filtered_df)} 行数据,涉及 {len(unique_tests)} 个不同测试项")
return filtered_df, keyword, unique_tests
def create_output_dir(self) -> None:
"""创建输出目录"""
self._print_stage("创建输出目录")
if not self.file_path:
raise ValueError("文件路径未设置")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base_dir = os.path.dirname(self.file_path)
self.output_dir = os.path.join(base_dir, f"scatter_plots_{timestamp}")
os.makedirs(self.output_dir, exist_ok=True)
print(f"输出目录: {self.output_dir}")
@staticmethod
def _safe_filename(name: str) -> str:
"""生成安全的文件名"""
safe = "".join(c for c in str(name) if c.isalnum() or c in (" ", "_", "-")).strip()
return safe or "Unknown_Test"
def _extract_limits(self, df_one_test: pd.DataFrame) -> Tuple[
Optional[float], Optional[float], List[float], List[float]]:
"""提取某个测试项的上下限数值"""
lower_plot = upper_plot = None
lower_set = []
upper_set = []
if self.col_lower and self.col_lower in df_one_test.columns:
lower_vals = self._clean_and_convert_series(df_one_test[self.col_lower], 'numeric').dropna().unique()
lower_set = sorted(lower_vals.tolist()) if len(lower_vals) > 0 else []
if lower_set:
lower_plot = min(lower_set)
if self.col_upper and self.col_upper in df_one_test.columns:
upper_vals = self._clean_and_convert_series(df_one_test[self.col_upper], 'numeric').dropna().unique()
upper_set = sorted(upper_vals.tolist()) if len(upper_vals) > 0 else []
if upper_set:
upper_plot = max(upper_set)
return lower_plot, upper_plot, lower_set, upper_set
@staticmethod
def _clean_and_convert_series(series: pd.Series, target_type: str = 'numeric') -> pd.Series:
"""统一的系列清洗和转换方法"""
if series.empty:
return series
if target_type == 'numeric':
# 数值转换优化
if pd.api.types.is_numeric_dtype(series):
return series.astype(float)
# 批量字符串处理
cleaned = series.astype(str).str.replace(r'[, ]', '', regex=True).str.strip()
return pd.to_numeric(cleaned, errors='coerce')
elif target_type == 'datetime':
return TestReportScatterPlotter._convert_to_datetime(series)
return series
@staticmethod
def _convert_to_datetime(series: pd.Series) -> pd.Series:
"""优化的日期时间转换"""
if pd.api.types.is_datetime64_any_dtype(series):
return series
# 预处理:转换为数值和字符串两种形式
numeric_series = pd.to_numeric(series, errors='coerce')
string_series = series.astype(str).str.strip()
result = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns]')
# 数值时间戳处理
masks = {
'ms': numeric_series >= 1e11,
's': (numeric_series >= 1e9) & (numeric_series < 1e11),
'excel': (numeric_series > 20000) & (numeric_series < 60000)
}
for mask_type, mask in masks.items():
if mask.any():
if mask_type == 'ms':
result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='ms')
elif mask_type == 's':
result.loc[mask] = pd.to_datetime(numeric_series.loc[mask], unit='s')
elif mask_type == 'excel':
origin = pd.Timestamp('1899-12-30')
result.loc[mask] = origin + pd.to_timedelta(numeric_series.loc[mask], unit='D')
# 字符串日期处理
remaining_mask = result.isna()
if remaining_mask.any():
remaining_strings = string_series.loc[remaining_mask]
# 特定格式优先处理
format_patterns = [
(r'^\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}$', '%Y-%m-%d %H-%M-%S'),
]
for pattern, date_format in format_patterns:
format_mask = remaining_strings.str.match(pattern)
if format_mask.any():
result.loc[remaining_mask[remaining_mask].index[format_mask]] = pd.to_datetime(
remaining_strings.loc[format_mask], format=date_format, errors='coerce'
)
# 通用解析
still_na_mask = result.isna() & remaining_mask
if still_na_mask.any():
result.loc[still_na_mask] = pd.to_datetime(
string_series.loc[still_na_mask], errors='coerce'
)
return result
def _preprocess_test_data(self, test_data: pd.DataFrame) -> pd.DataFrame:
"""数据预处理"""
# 数值转换
test_data['Measurement_num'] = self._clean_and_convert_series(
test_data['Measurement'], 'numeric'
)
test_data['TestTime_dt'] = self._clean_and_convert_series(
test_data['Test Time'], 'datetime'
)
# 去除无效数据
valid_data = test_data.dropna(subset=['Measurement_num', 'TestTime_dt'])
return valid_data.sort_values('TestTime_dt')
def _calculate_statistics(self, y_data: pd.Series) -> Dict[str, float]:
"""计算统计信息"""
stats = {
'count': len(y_data),
'mean': y_data.mean(),
'median': y_data.median(),
'min': y_data.min(),
'max': y_data.max(),
'std': y_data.std(),
'q1': y_data.quantile(0.25),
'q3': y_data.quantile(0.75)
}
return stats
def _add_statistics_textbox(self, ax, stats: Dict[str, float],
x_pos: float = 1.02, y_pos: float = 0.98) -> None:
"""在图表右侧添加统计信息文本框"""
# 使用英文标签避免中文显示问题
stats_text = (
f"Count: {stats['count']}\n"
f"Mean: {stats['mean']:.4f}\n"
f"Median: {stats['median']:.4f}\n"
f"Min: {stats['min']:.4f}\n"
f"Max: {stats['max']:.4f}\n"
f"Std: {stats['std']:.4f}\n"
f"Q1: {stats['q1']:.4f}\n"
f"Q3: {stats['q3']:.4f}"
)
# 添加文本框到右侧,使用英文字体
props = dict(boxstyle='round', facecolor='wheat', alpha=0.8)
ax.text(x_pos, y_pos, stats_text, transform=ax.transAxes, fontsize=8,
verticalalignment='top', horizontalalignment='left', # 左对齐
bbox=props, fontfamily='monospace')
def _add_statistics_lines(self, ax, stats: Dict[str, float],
x_min: float, x_max: float) -> None:
"""添加统计线到图表"""
# 添加平均值线
ax.hlines(y=stats['mean'], xmin=x_min, xmax=x_max,
colors='orange', linestyles='-', linewidth=1.5, alpha=0.7, label='Mean')
# 添加中位数线
ax.hlines(y=stats['median'], xmin=x_min, xmax=x_max,
colors='purple', linestyles='-.', linewidth=1.5, alpha=0.7, label='Median')
# 添加Q1和Q3线
ax.hlines(y=stats['q1'], xmin=x_min, xmax=x_max,
colors='gray', linestyles=':', linewidth=1.0, alpha=0.7, label='Q1')
ax.hlines(y=stats['q3'], xmin=x_min, xmax=x_max,
colors='gray', linestyles=':', linewidth=1.0, alpha=0.7, label='Q3')
def _configure_plot(self, ax, test_data: pd.DataFrame, test_name: str,
lower_plot: Optional[float], upper_plot: Optional[float]) -> None:
"""配置图形属性"""
# 计算统计信息
y_data = test_data['Measurement_num']
stats = self._calculate_statistics(y_data)
# 获取时间范围用于统计线
x_min = test_data['TestTime_dt'].min()
x_max = test_data['TestTime_dt'].max()
# Y轴范围计算
y_min, y_max = y_data.min(), y_data.max()
y_candidates = [y_min, y_max]
# 绘制限值线
custom_lines = []
if lower_plot is not None:
y_candidates.append(lower_plot)
ax.axhline(y=lower_plot, color='green', linestyle='--', linewidth=1.2)
custom_lines.append(Line2D([0], [0], color='green', linestyle='--', label="Lower Limit"))
if upper_plot is not None:
y_candidates.append(upper_plot)
ax.axhline(y=upper_plot, color='red', linestyle='--', linewidth=1.2)
custom_lines.append(Line2D([0], [0], color='red', linestyle='--', label="Upper Limit"))
# 添加统计线
self._add_statistics_lines(ax, stats, x_min, x_max)
# 设置范围
valid_candidates = [y for y in y_candidates if pd.notna(y)]
if valid_candidates:
y_min_plot = min(valid_candidates)
y_max_plot = max(valid_candidates)
y_range = y_max_plot - y_min_plot
if y_range == 0:
y_range = abs(y_max_plot) * 0.1 if y_max_plot != 0 else 1.0
y_min_plot = y_min_plot - y_range / 2
y_max_plot = y_max_plot + y_range / 2
ax.set_ylim(y_min_plot - 0.1 * y_range, y_max_plot + 0.1 * y_range)
# 添加统计信息文本框到右侧
self._add_statistics_textbox(ax, stats)
# 设置标题和标签,使用英文避免中文问题
ax.set_title(f"Scatter Plot - {test_name}\n"
f"Mean: {stats['mean']:.4f}, Median: {stats['median']:.4f}, "
f"Range: [{stats['min']:.4f}, {stats['max']:.4f}]",
fontsize=10)
ax.set_xlabel("Test Time")
ax.set_ylabel("Measurement Value")
ax.grid(True, alpha=0.3)
ax.tick_params(axis='x', rotation=45)
# 图例处理 - 优化位置在右侧
handles, labels = ax.get_legend_handles_labels()
if custom_lines:
handles.extend(custom_lines)
labels.extend([line.get_label() for line in custom_lines])
if handles:
# 根据图例项数量决定图例位置和布局
if len(handles) > 10: # 如果图例项很多,使用两列布局
ncol = 2
# 调整图例位置,确保不遮挡数据
ax.legend(handles=handles, labels=labels, title="Legend",
fontsize=8, loc='center left', bbox_to_anchor=(1.05, 0.5),
ncol=ncol, frameon=True, fancybox=True, shadow=True)
else:
# 图例项较少时使用单列布局
ax.legend(handles=handles, labels=labels, title="Legend",
fontsize=8, loc='center left', bbox_to_anchor=(1.05, 0.5),
frameon=True, fancybox=True, shadow=True)
def _save_plot(self, fig, test_name: str) -> None:
"""保存图形"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
safe_name = self._safe_filename(test_name)
filename = f"{safe_name}_{timestamp}.png"
output_path = os.path.join(self.output_dir, filename)
# 调整布局以确保图例完整显示
fig.savefig(output_path, dpi=300, bbox_inches='tight')
plt.close(fig)
print(f"已保存: {output_path}")
def plot_scatter(self, filtered_df: pd.DataFrame, unique_tests: List[str]) -> None:
"""优化的散点图绘制方法"""
self._print_stage("生成散点图")
total_tests = len(unique_tests)
start_time = time.time()
for i, test_name in enumerate(unique_tests, 1):
self._print_progress(i, total_tests, "测试项绘图")
# 使用缓存避免重复计算
cache_key = f"test_{hash(test_name)}"
if cache_key in self._processed_data_cache:
test_data = self._processed_data_cache[cache_key]
else:
test_data = filtered_df[filtered_df["Test Name New"] == test_name].copy()
# 预处理数据
test_data = self._preprocess_test_data(test_data)
self._processed_data_cache[cache_key] = test_data
if test_data.empty:
print(f"\n跳过 '{test_name}' - 无有效的 Measurement/Test Time 数据")
continue
# 提取限值信息
lower_plot, upper_plot, lower_set, upper_set = self._extract_limits(test_data)
# 输出限值信息
limit_info = []
if lower_set:
limit_info.append(f"Lower unique={len(lower_set)}, used={lower_plot}")
else:
limit_info.append("Lower N/A")
if upper_set:
limit_info.append(f"Upper unique={len(upper_set)}, used={upper_plot}")
else:
limit_info.append("Upper N/A")
# 计算并输出统计信息
y_data = test_data['Measurement_num']
stats = self._calculate_statistics(y_data)
stat_info = (
f"数据点: {stats['count']}, "
f"均值: {stats['mean']:.4f}, "
f"中位数: {stats['median']:.4f}, "
f"范围: [{stats['min']:.4f}, {stats['max']:.4f}]"
)
print(f"\n→ 绘制: '{test_name}' | {stat_info} | 限值: {', '.join(limit_info)}")
# 创建图形 - 增大图像尺寸以容纳图例和统计信息
sn_count = len(test_data["SN"].unique()) if "SN" in test_data.columns else 1
# 根据SN数量和预期图例项数量调整图形大小
base_width = 14 # 增加宽度以容纳统计信息
base_height = 9 # 增加高度以容纳更多信息
# 如果SN数量多增加图形宽度以容纳图例
if sn_count > 5:
fig_width = base_width + min(sn_count / 5, 6) # 最大增加6个单位宽度
else:
fig_width = base_width
fig, ax = plt.subplots(figsize=(fig_width, base_height))
# 分组绘制
groups = list(test_data.groupby("SN")) if "SN" in test_data.columns else [("Unknown_SN", test_data)]
for j, (sn, group) in enumerate(groups, 1):
ax.scatter(group['TestTime_dt'], group['Measurement_num'],
label=str(sn), alpha=0.7, s=25)
if j % 10 == 0 or j == len(groups):
self._print_progress(j, len(groups), "SN分组绘制")
# 配置图形
self._configure_plot(ax, test_data, test_name, lower_plot, upper_plot)
# 调整布局,为右侧统计信息和图例留出空间
plt.tight_layout()
plt.subplots_adjust(right=0.8 if sn_count <= 10 else 0.7) # 为右侧统计信息留出更多空间
# 保存图像
self._save_plot(fig, test_name)
total_time = time.time() - start_time
print(f"\n全部绘图完成,总耗时: {total_time:.2f}s")
print(f"所有图表已保存到: {self.output_dir}")
def run(self) -> None:
"""运行主程序"""
try:
self.get_file_path()
self.load_data()
filtered_df, keyword, unique_tests = self.get_keyword()
self.create_output_dir()
self.plot_scatter(filtered_df, unique_tests)
except Exception as e:
print(f"\n❌ 发生错误: {type(e).__name__}: {str(e)}")
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
plotter = TestReportScatterPlotter()
plotter.run()

251
main.py Normal file
View File

@@ -0,0 +1,251 @@
import pandas as pd
import os
import glob
import re
from datetime import datetime
import tkinter as tk
from tkinter import filedialog
from collections import defaultdict
class BOMConsolidator:
def __init__(self):
self.master_data = defaultdict(dict)
self.required_columns = ['Partnumber', 'Purchase_Code', 'MF_PN', 'Description',
'Part_Type', 'MF_NAME', 'PCB_Footprint', 'Quantity', 'Reference']
self.file_quantities = {}
self.consolidated_report = None
self.inconsistency_count = 0
self.processed_files = 0
self.processed_rows = 0
self.output_folder = ""
def find_valid_sheet(self, file_path):
"""定位包含有效BOM的Sheet"""
xl = pd.ExcelFile(file_path)
for sheet_name in xl.sheet_names:
df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
for i in range(len(df)):
headers = df.iloc[i].values
if all(col in headers for col in ['Item', 'Partnumber', 'Purchase_Code', 'MF_PN']):
return sheet_name, i
return None, None
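# 查找示例(假设):某 Sheet 的第 3 行(索引 2)同时含有
# 'Item'/'Partnumber'/'Purchase_Code'/'MF_PN' 表头时,返回 (sheet_name, 2)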
def clean_column_names(self, df):
"""清洗列名并标准化"""
df.columns = df.columns.str.strip().str.replace(r'\s+', '_', regex=True)
df.columns = df.columns.str.replace(r'[^a-zA-Z0-9_]', '', regex=True)
return df
def process_file(self, file_path):
"""处理单个BOM文件"""
filename = os.path.basename(file_path)
print(f"处理文件: {filename}...")
sheet_name, header_row = self.find_valid_sheet(file_path)
if not sheet_name:
print(f" ! 未找到有效BOM表: {filename}")
return False
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
df = self.clean_column_names(df)
# 验证必要字段
missing_cols = [col for col in self.required_columns if col not in df.columns]
if missing_cols:
print(f" ! 缺少必要列: {', '.join(missing_cols)}")
return False
print(f" √ 找到有效Sheet: {sheet_name} (共{len(df)}行)")
self.file_quantities[filename] = {}
self.processed_files += 1
# 处理每行数据
for _, row in df.iterrows():
self.process_row(row, filename)
self.processed_rows += 1
return True
def process_row(self, row, filename):
"""处理单行数据"""
# 确定合并主键
key = row['Partnumber'] if pd.notna(row['Partnumber']) and row['Partnumber'] != '' else row['MF_PN']
if pd.isna(key) or key == '':
return
# 首次记录该物料
if key not in self.master_data:
self.master_data[key] = {
'Partnumber': row['Partnumber'],
'Purchase_Code': row['Purchase_Code'],
'MF_PN': row['MF_PN'],
'Description': row.get('Description', ''),
'Part_Type': row.get('Part_Type', ''),
'MF_NAME': row.get('MF_NAME', ''),
'PCB_Footprint': row.get('PCB_Footprint', ''),
'quantity_data': {}, # 存储每个文件的数量
'inconsistencies': [] # 存储不一致信息
}
# 检查字段一致性
current_data = self.master_data[key]
fields_to_check = ['Purchase_Code', 'MF_PN', 'Part_Type', 'MF_NAME', 'PCB_Footprint']
for field in fields_to_check:
# 列名已在 clean_column_names 中统一为下划线形式,直接按 field 读取即可
current_val = str(current_data[field])
new_val = str(row.get(field, ''))
# 忽略空值和'nan'字符串
if new_val in ['', 'nan', 'NaN', 'NaT']:
continue
# 比较当前值和新值
if current_val != new_val:
current_data['inconsistencies'].append(
f"{field}不一致: {current_val}{new_val} (文件: {filename})"
)
# 检查Reference数量和Quantity是否匹配
ref_count = 0
if pd.notna(row['Reference']) and row['Reference'] != '':
ref_list = str(row['Reference']).split(',')
ref_count = len([ref for ref in ref_list if ref.strip() != ''])
try:
quantity = int(row['Quantity'])
if ref_count != quantity:
current_data['inconsistencies'].append(
f"Reference数量不符: {ref_count}个位置 ≠ Quantity={quantity} (文件: {filename})"
)
except (ValueError, TypeError):
pass
# 记录当前文件的数量
try:
qty_val = int(row['Quantity'])
self.file_quantities[filename][key] = qty_val
current_data['quantity_data'][filename] = qty_val
except (ValueError, TypeError):
self.file_quantities[filename][key] = 0
current_data['quantity_data'][filename] = 0
# 更新不一致计数
if current_data['inconsistencies']:
self.inconsistency_count += 1
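# 检查示例(假设某行 Reference="C1, C2, C3" 而 Quantity=2):
# 将记录 "Reference数量不符: 3个位置 ≠ Quantity=2 (文件: xxx.xlsx)"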
def generate_report(self):
"""生成合并报告"""
if not self.master_data:
print("无有效数据可生成报告")
return None
print(f"\n生成合并报告,共{len(self.master_data)}种物料...")
# 准备报告数据结构
report_data = []
file_columns = sorted(self.file_quantities.keys())
for key, data in self.master_data.items():
row = {
'Partnumber': data['Partnumber'],
'Purchase_Code': data['Purchase_Code'],
'MF_PN': data['MF_PN'],
'Description': data['Description'],
'Part Type': data['Part_Type'],
'MF_NAME': data['MF_NAME'],
'PCB_Footprint': data['PCB_Footprint'],
'检查信息': '; '.join(data['inconsistencies'])
}
# 添加各文件数量
total = 0
for file in file_columns:
qty = data['quantity_data'].get(file, 0)
row[file] = qty
total += qty
row['合计'] = total
report_data.append(row)
# 创建DataFrame
self.consolidated_report = pd.DataFrame(report_data)
# 生成带时间戳的文件名
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = os.path.join(self.output_folder, f"BOM合并报告_{timestamp}.xlsx")
# 保存报告
self.consolidated_report.to_excel(output_path, index=False)
# 返回统计信息和路径
stats = {
'output_path': output_path,
'file_count': self.processed_files,
'material_count': len(self.master_data),
'inconsistency_count': self.inconsistency_count,
'processed_rows': self.processed_rows
}
return stats
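# 报告结构示例(假设输入文件为 A.xlsx 与 B.xlsx):
# Partnumber | Purchase_Code | MF_PN | Description | Part Type | MF_NAME | PCB_Footprint | 检查信息 | A.xlsx | B.xlsx | 合计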
def select_folder():
"""弹出文件夹选择对话框"""
root = tk.Tk()
root.withdraw()
folder_selected = filedialog.askdirectory(title='选择BOM文件所在文件夹')
return folder_selected
def main():
# 初始化合并器
bom_processor = BOMConsolidator()
# 选择文件夹
folder_path = select_folder()
if not folder_path:
print("未选择文件夹,程序退出")
return
bom_processor.output_folder = folder_path
# 获取所有Excel文件
bom_files = glob.glob(os.path.join(folder_path, "*.xlsx"))
if not bom_files:
print("文件夹中没有Excel文件")
return
print(f"找到 {len(bom_files)} 个Excel文件开始处理...")
# 处理文件
processed_count = 0
for file_path in bom_files:
success = bom_processor.process_file(file_path)
if success:
processed_count += 1
# 生成报告
if bom_processor.master_data:
stats = bom_processor.generate_report()
# 打印汇总信息
print("\n" + "=" * 40)
print("BOM合并完成! 汇总信息:")
print(f"处理文件夹: {folder_path}")
print(f"扫描文件数: {len(bom_files)}")
print(f"成功处理文件数: {processed_count}")
print(f"处理行数: {stats['processed_rows']}")
print(f"合并物料种类数: {stats['material_count']}")
print(f"检测到不一致条目数: {stats['inconsistency_count']}")
print(f"报告已保存至: {stats['output_path']}")
print("=" * 40)
else:
print("没有有效数据生成报告")
if __name__ == "__main__":
main()

9
tempReportProcess/.gitignore vendored Normal file
View File

@@ -0,0 +1,9 @@
/build/*
/build
/dist/*
/dist
/source/*
/source
tempReportProcess_V2.py

View File

@@ -0,0 +1,248 @@
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import tkinter as tk
from tkinter import filedialog
import os
import matplotlib.dates as mdates
from jinja2 import Template
from matplotlib import font_manager, rcParams
class TemperatureDataAnalyzer:
def __init__(self):
self.data = None
self.file_path = None
self.timestamps = []
self.temperatures = []
self.statuses = []
self._configure_chinese_font() # 配置中文字体,修复中文字符缺失警告
def _configure_chinese_font(self):
"""
配置 Matplotlib 中文字体,避免中文字符缺失的警告。
会尝试常见的中文字体并设置 axes.unicode_minus 为 False。
"""
try:
# 常见中文字体候选(跨平台)
candidates = [
"Microsoft YaHei", "Microsoft YaHei UI", # Windows
"SimHei", "SimSun", # Windows黑体/宋体)
"PingFang SC", "Heiti SC", # macOS
"Noto Sans CJK SC", "Source Han Sans SC", "WenQuanYi Micro Hei", # Linux
"Arial Unicode MS" # 覆盖广的 Unicode 字体
]
available = {f.name for f in font_manager.fontManager.ttflist}
for name in candidates:
if name in available:
rcParams["font.sans-serif"] = [name]
rcParams["axes.unicode_minus"] = False
# 可选:打印使用的字体名称
# print(f"使用中文字体: {name}")
return
# 如果没有找到常见中文字体,给出提示
rcParams["axes.unicode_minus"] = False
print("未检测到常见中文字体,图中中文可能无法正常显示。建议安装 'Noto Sans CJK SC''Microsoft YaHei'")
except Exception as e:
print(f"中文字体配置失败: {e}")
def select_file(self):
"""手动选择CSV文件"""
root = tk.Tk()
root.withdraw() # 隐藏主窗口
file_types = [("CSV files", "*.csv"), ("All files", "*.*")]
self.file_path = filedialog.askopenfilename(title="选择温度数据CSV文件", filetypes=file_types)
if not self.file_path:
print("未选择文件,程序退出")
return False
return True
def load_and_process_data(self):
"""加载和处理数据"""
try:
# 读取CSV文件(无表头)
self.data = pd.read_csv(self.file_path, header=None)
# 重命名列以便于引用
self.data.columns = ['timestamp', 'temperature', 'status']
# 转换时间戳格式(文本,例如:10/29/2025 2:20:41 PM)
self.data['datetime'] = pd.to_datetime(self.data['timestamp'], format='%m/%d/%Y %I:%M:%S %p')
# 提取处理后的数据
self.timestamps = self.data['datetime']
self.temperatures = self.data['temperature']
self.statuses = self.data['status']
print(f"成功加载 {len(self.data)} 条记录")
return True
except Exception as e:
print(f"数据处理错误: {e}")
return False
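# 输入 CSV 示例(假设的无表头三列格式:时间戳, 温度, 状态):
# 10/29/2025 2:20:41 PM,25.3,1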
def create_scatter_plots(self):
"""创建散点图"""
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
# 温度散点图
sc1 = ax1.scatter(self.timestamps, self.temperatures, c=self.temperatures,
cmap='coolwarm', alpha=0.7, s=20)
ax1.set_title('温度随时间变化趋势')
ax1.set_ylabel('温度 (°C)')
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
ax1.grid(True, linestyle='--', alpha=0.7)
ax1.tick_params(axis='x', rotation=45)
plt.colorbar(sc1, ax=ax1, label="温度(°C)")
# 状态散点图
sc2 = ax2.scatter(self.timestamps, self.statuses, c=self.statuses,
cmap='viridis', alpha=0.7, s=20)
ax2.set_title('状态随时间变化')
ax2.set_xlabel('时间')
ax2.set_ylabel('状态值')
ax2.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
ax2.grid(True, linestyle='--', alpha=0.7)
ax2.tick_params(axis='x', rotation=45)
plt.colorbar(sc2, ax=ax2, label="状态值")
plt.tight_layout()
return fig
def generate_statistics_report(self):
"""生成统计报告"""
stats = {
'total_records': len(self.temperatures),
'avg_temperature': round(self.temperatures.mean(), 2),
'max_temperature': round(self.temperatures.max(), 2),
'min_temperature': round(self.temperatures.min(), 2),
'std_deviation': round(self.temperatures.std(), 2),
'temp_range': round(self.temperatures.max() - self.temperatures.min(), 2),
'start_time': self.timestamps.iloc[0].strftime('%Y-%m-%d %H:%M:%S'),
'end_time': self.timestamps.iloc[-1].strftime('%Y-%m-%d %H:%M:%S'),
'duration_hours': round((self.timestamps.iloc[-1] - self.timestamps.iloc[0]).total_seconds() / 3600, 2)
}
# 状态分布统计
status_counts = self.statuses.value_counts().to_dict()
stats['status_distribution'] = status_counts
return stats
def save_fig_to_html(self, fig, output_path):
"""将图形保存为HTML"""
import io
import base64
# 将图形转换为base64编码
buf = io.BytesIO()
fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
buf.seek(0)
img_str = base64.b64encode(buf.read()).decode('utf-8')
buf.close()
# HTML 模板(已修复多余的 '}')
html_template = """
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>温度数据分析报告</title>
<style>
body { font-family: Arial, sans-serif; margin: 20px; }
.header { background-color: #f0f0f0; padding: 15px; border-radius: 5px; }
.section { margin-bottom: 30px; }
.stats-table { width: 100%; border-collapse: collapse; }
.stats-table th, .stats-table td { border: 1px solid #ddd; padding: 8px; text-align: left; }
.stats-table th { background-color: #f2f2f2; }
.image-container { text-align: center; margin: 20px 0; }
h1, h2 { color: #333; }
</style>
</head>
<body>
<div class="header">
<h1>温度数据分析报告</h1>
<p><strong>数据文件:</strong> {{ file_name }}</p>
<p><strong>生成时间:</strong> {{ generation_time }}</p>
</div>
<div class="section">
<h2>数据概览</h2>
<table class="stats-table">
<tr><th>项目</th><th>数值</th></tr>
{% for key, value in statistics.items() %}
{% if key != 'status_distribution' %}
<tr><td>{{ key.replace('_', ' ').title() }}</td><td>{{ value }}</td></tr>
{% endif %}
{% endfor %}
</table>
</div>
<div class="section">
<h2>状态分布</h2>
<table class="stats-table">
<tr><th>状态值</th><th>出现次数</th></tr>
{% for status, count in statistics.status_distribution.items() %}
<tr><td>{{ status }}</td><td>{{ count }}</td></tr>
{% endfor %}
</table>
</div>
<div class="section">
<h2>温度与状态时序图</h2>
<div class="image-container">
<img src="data:image/png;base64,{{ image_data }}" alt="温度与状态时序图">
</div>
</div>
</body>
</html>
"""
template = Template(html_template)
rendered_html = template.render(
file_name=self.file_path,
generation_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
statistics=self.generate_statistics_report(),
image_data=img_str
)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(rendered_html)
def run_analysis(self):
"""运行完整分析流程"""
if not self.select_file():
return
if not self.load_and_process_data():
return
# 创建图形
fig = self.create_scatter_plots()
# 生成输出文件名(保存到选择的文件所在文件夹)
base_filename = os.path.splitext(os.path.basename(self.file_path))[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"{base_filename}_{timestamp}.html"
output_dir = os.path.dirname(self.file_path)
output_path = os.path.join(output_dir, output_filename)
# 保存HTML报告到同一文件夹
self.save_fig_to_html(fig, output_path)
print(f"分析完成!报告已保存至: {output_path}")
# 显示统计摘要
stats = self.generate_statistics_report()
print("\n=== 数据统计摘要 ===")
for key, value in stats.items():
if key != 'status_distribution':
print(f"{key.replace('_', ' ').title()}: {value}")
if __name__ == "__main__":
analyzer = TemperatureDataAnalyzer()
analyzer.run_analysis()