# Source: PythonApp/IMULinkdata/LINLinkData_V1.py
# (252 lines, 9.5 KiB — snapshot exported 2026-02-02 15:19:30 +08:00)
import argparse
import os
import re
import time
from collections import defaultdict
from datetime import datetime

import numpy as np
import openpyxl
import pandas as pd
class ExcelProcessor:
    """Split an Excel link-data workbook into one sheet per PartNumber."""

    def __init__(self, file_path):
        """Remember the source workbook path and initialize processing state."""
        self.file_path = file_path   # path of the workbook to process
        self.df = None               # DataFrame populated by load_data()
        self.output_folder = None    # set by create_output_folder()
        self.output_file = None      # set by create_output_folder()
        # Run timestamp, embedded in output file names.
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.processed_data = {}     # reserved for processed results
def load_data(self):
    """Load the workbook's LINK sheet into ``self.df``.

    Picks the first sheet whose name contains 'link' (case-insensitive),
    normalizes a PartNumber column (falling back to LinkObject), parses
    linkDate as month/day/year 12-hour AM/PM, and ensures a 备注 column.

    Returns:
        bool: True when loading and validation succeed, False otherwise.
    """
    print(f"正在加载文件: {self.file_path}")
    try:
        try:
            # openpyxl reads Excel date cells as datetime where possible.
            sheet_names = pd.ExcelFile(self.file_path, engine='openpyxl').sheet_names
            # First sheet whose name contains 'link', case-insensitive.
            target_sheet = next((sheet for sheet in sheet_names if 'link' in sheet.lower()), None)
            if target_sheet is None:
                raise ValueError("未找到包含'LINK'的工作表")
            self.df = pd.read_excel(self.file_path, sheet_name=target_sheet, engine='openpyxl')
        except Exception as e:
            # Fix: surface the actual cause — the original discarded `e`
            # and always printed a generic "sheet not found" hint even
            # when the failure was a read/engine error.
            print(f"提示: 未找到包含'LINK' sheet请检查文件内容。原因: {e}")
            return False
        # Ensure a PartNumber column exists (compatible with LinkObject).
        if 'PartNumber' not in self.df.columns and 'LinkObject' in self.df.columns:
            self.df['PartNumber'] = self.df['LinkObject']
        # Validate that all required columns are present.
        required_cols = ["PartNumber", "ChildSN", "linkDate"]
        missing = [c for c in required_cols if c not in self.df.columns]
        if missing:
            raise ValueError(f"数据表中缺少必要列: {', '.join(missing)}")
        # Parse linkDate (e.g. "5/24/2025 6:00:13 PM"); errors='coerce'
        # turns unparseable values into NaT instead of raising.
        self.df['linkDate'] = pd.to_datetime(
            self.df['linkDate'],
            format='%m/%d/%Y %I:%M:%S %p',  # month/day/year, 12-hour AM/PM
            errors='coerce'
        )
        # Report how many dates failed to parse.
        total = len(self.df)
        invalid = int(self.df['linkDate'].isna().sum())
        print(f"文件加载成功,总行数: {total},日期解析失败: {invalid}")
        # Remark column used later by process_duplicates().
        if '备注' not in self.df.columns:
            self.df['备注'] = ''
        return True
    except Exception as e:
        print(f"加载文件失败: {str(e)}")
        return False
def create_output_folder(self):
    """Derive the output directory and the timestamped output file path.

    The output file is placed next to the input workbook. Fixes: removed
    dead locals (`base_name`, `output_folder_name` were computed but never
    used) and guarded against a bare filename, where os.path.dirname()
    returns '' and os.makedirs('') would raise FileNotFoundError.
    """
    original_name = os.path.splitext(os.path.basename(self.file_path))[0]
    # Fall back to the current directory when file_path has no directory part.
    self.output_folder = os.path.dirname(self.file_path) or '.'
    self.output_file = os.path.join(
        self.output_folder,
        f"{original_name}_split_by_PartNumber_{self.timestamp}.xlsx",
    )
    if not os.path.exists(self.output_folder):
        os.makedirs(self.output_folder)
        print(f"已创建输出文件夹: {self.output_folder}")
def _safe_sheet_name(self, name):
"""清理为合法的 Excel sheet 名称(<=31字符无非法字符"""
# 转为字符串
s = str(name)
# 替换非法字符:: \ / ? * [ ]
s = re.sub(r'[:\\/\?\*\[\]]', '_', s)
# 去除首尾空格
s = s.strip()
# 截断到 31 个字符
if len(s) > 31:
s = s[:31]
# 空名兜底
if not s:
s = 'Sheet'
return s
def process_data(self):
    """Split ``self.df`` by PartNumber and write each group to its own sheet.

    Requires load_data() (and create_output_folder()) to have run first.
    Fix: the ExcelWriter is now a context manager, so the workbook handle
    is closed even when a sheet write raises (the original leaked the
    writer on any exception between open and writer.close()).

    Raises:
        ValueError: if data was not loaded, or neither PartNumber nor
            LinkObject columns exist.
    """
    if self.df is None:
        raise ValueError("数据未加载,请先调用 load_data() 方法")
    # Ensure a PartNumber column exists (fall back to LinkObject).
    if 'PartNumber' not in self.df.columns:
        if 'LinkObject' in self.df.columns:
            self.df['PartNumber'] = self.df['LinkObject']
        else:
            raise ValueError("数据表中既没有PartNumber也没有LinkObject列")
    # (Re)initialize the remark column filled by process_duplicates().
    self.df['备注'] = ''
    # dropna=False keeps rows whose PartNumber is NaN as their own group.
    grouped = self.df.groupby('PartNumber', dropna=False)
    total_groups = len(grouped)
    print(f"开始处理数据,共 {total_groups} 个分组...")
    print(f"输出文件信息self.output_file{self.output_file}")
    output_path = self.output_file
    # Context manager saves and closes the workbook, even on error.
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        for i, (name, group) in enumerate(grouped):
            print(f"正在处理分组 {i + 1}/{total_groups}: {name}")
            # Collapse duplicate ChildSN rows (keep the latest linkDate).
            group_processed = self.process_duplicates(group)
            # Render linkDate as 'yyyy-mm-dd hh:mm:ss' strings for output.
            group_out = group_processed.copy()
            group_out['linkDate'] = group_out['linkDate'].apply(
                lambda x: x.strftime('%Y-%m-%d %H:%M:%S') if pd.notnull(x) else ''
            )
            # Sheet names must be <=31 chars with no illegal characters.
            safe_name = self._safe_sheet_name(name)
            group_out.to_excel(writer, sheet_name=safe_name, index=False)
    print(f"处理完成! 结果已保存到: {output_path}")
def process_duplicates(self, group):
    """Collapse duplicate ChildSN rows, keeping the most recent linkDate.

    For each ChildSN that appears more than once, a 备注 note is written
    recording the duplicate count and any columns whose values differ,
    then every row except the one with the newest linkDate is dropped.

    Returns the de-duplicated DataFrame.
    """
    dup_rows = group[group.duplicated('ChildSN', keep=False)]
    if dup_rows.empty:
        return group
    print(f" 发现 {len(dup_rows)} 行重复数据,正在处理...")
    for serial, rows in dup_rows.groupby('ChildSN'):
        # Newest first; NaT linkDate values sort to the end.
        rows = rows.sort_values('linkDate', ascending=False)
        # Collect a "col: v1, v2" entry for every differing column
        # (ChildSN and the remark column itself are excluded).
        differences = []
        for column in rows.columns:
            if column in ('ChildSN', '备注'):
                continue
            distinct = rows[column].unique()
            if len(distinct) <= 1:
                continue
            if column == 'linkDate':
                # Timestamps get a fixed format; NaT renders as empty.
                rendered = []
                for value in distinct:
                    if pd.isna(value):
                        rendered.append('')
                    elif isinstance(value, pd.Timestamp):
                        rendered.append(value.strftime('%Y-%m-%d %H:%M:%S'))
                    else:
                        rendered.append(str(value))
            else:
                rendered = [str(value) for value in distinct]
            differences.append(f"{column}: {', '.join(rendered)}")
        note = f"重复行数: {len(rows)}"
        if differences:
            note += "; 差异内容: " + "; ".join(differences)
        # Stamp the note on all rows of this ChildSN, then drop all
        # but the newest one.
        group.loc[group['ChildSN'] == serial, '备注'] = note
        group = group.drop(rows.index[1:])
    return group
def main():
    """Interactive entry point: prompt for a workbook path and process it."""
    print("=== Excel拆分工具 ===")
    # Strip surrounding quotes pasted from "Copy as path".
    file_path = input("请输入Excel文件路径: ").strip('"')
    if not os.path.exists(file_path):
        print("文件不存在,请检查路径")
        return
    start_time = time.time()
    try:
        app = ExcelProcessor(file_path)
        # Bail out early if the workbook could not be loaded.
        if not app.load_data():
            return
        app.create_output_folder()
        app.process_data()
        print("所有处理已完成!")
    except Exception as e:
        print(f"处理过程中发生错误: {e}")
    end_time = time.time()
    print(f"总耗时: {end_time - start_time:.2f}")


if __name__ == "__main__":
    main()