import os
import re
import time
from datetime import datetime

import pandas as pd


class ExcelProcessor:
    """Split an Excel workbook into one sheet per PartNumber.

    Workflow: load the first worksheet whose name contains 'link',
    de-duplicate rows by ChildSN (keeping the newest linkDate and
    recording the differences in a note column), then write each
    PartNumber group to its own sheet of a new workbook.
    """

    def __init__(self, file_path):
        self.file_path = file_path
        self.df = None             # loaded worksheet as a DataFrame
        self.output_folder = None  # directory the result file is written to
        self.output_file = None    # full path of the result workbook
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.processed_data = {}   # kept for interface compatibility (not used below)

    def load_data(self):
        """Load the 'LINK' worksheet into self.df.

        Returns True on success, False on any failure (message printed).
        Side effects on self.df: ensures a PartNumber column (falling back
        to LinkObject), parses linkDate, and adds a '备注' (note) column.
        """
        print(f"正在加载文件: {self.file_path}")
        try:
            # Find the target sheet by name, case-insensitively.
            sheet_names = pd.ExcelFile(self.file_path, engine='openpyxl').sheet_names
            target_sheet = next((s for s in sheet_names if 'link' in s.lower()), None)
            if target_sheet is None:
                # Only the genuine "no such sheet" case prints this hint;
                # real read errors fall through to the outer handler below.
                print("提示: 未找到包含'LINK' sheet,请检查文件内容。")
                return False
            self.df = pd.read_excel(self.file_path, sheet_name=target_sheet, engine='openpyxl')

            # Ensure a PartNumber column (LinkObject is an accepted alias).
            if 'PartNumber' not in self.df.columns and 'LinkObject' in self.df.columns:
                self.df['PartNumber'] = self.df['LinkObject']

            # Validate required columns before doing any work.
            required_cols = ["PartNumber", "ChildSN", "linkDate"]
            missing = [c for c in required_cols if c not in self.df.columns]
            if missing:
                raise ValueError(f"数据表中缺少必要列: {', '.join(missing)}")

            # Parse 12-hour AM/PM dates like "5/24/2025 6:00:13 PM";
            # values that do not match become NaT (errors='coerce').
            self.df['linkDate'] = pd.to_datetime(
                self.df['linkDate'],
                format='%m/%d/%Y %I:%M:%S %p',
                errors='coerce'
            )
            total = len(self.df)
            invalid = int(self.df['linkDate'].isna().sum())
            print(f"文件加载成功,总行数: {total},日期解析失败: {invalid} 行")

            # Note column used by process_duplicates().
            if '备注' not in self.df.columns:
                self.df['备注'] = ''
            return True
        except Exception as e:
            # Boundary of the load step: report the real error and signal failure.
            print(f"加载文件失败: {str(e)}")
            return False

    def create_output_folder(self):
        """Prepare the output directory and the timestamped output file path.

        Output goes next to the input file; the folder is created if missing.
        """
        original_name = os.path.splitext(os.path.basename(self.file_path))[0]
        self.output_folder = os.path.dirname(self.file_path)
        self.output_file = os.path.join(
            self.output_folder,
            f"{original_name}_split_by_PartNumber_{self.timestamp}.xlsx"
        )
        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)
            print(f"已创建输出文件夹: {self.output_folder}")

    def _safe_sheet_name(self, name):
        """Sanitize *name* into a legal Excel sheet name (<=31 chars, no : \\ / ? * [ ])."""
        s = str(name)
        # Replace characters Excel forbids in sheet titles.
        s = re.sub(r'[:\\/\?\*\[\]]', '_', s)
        s = s.strip()
        # Excel hard limit: 31 characters.
        if len(s) > 31:
            s = s[:31]
        # Fall back to a non-empty placeholder.
        if not s:
            s = 'Sheet'
        return s

    def process_data(self):
        """Split self.df by PartNumber and write one sheet per group.

        Raises ValueError if data was not loaded or no usable key column exists.
        """
        if self.df is None:
            raise ValueError("数据未加载,请先调用 load_data() 方法")
        if 'PartNumber' not in self.df.columns:
            if 'LinkObject' in self.df.columns:
                self.df['PartNumber'] = self.df['LinkObject']
            else:
                raise ValueError("数据表中既没有PartNumber也没有LinkObject列")

        # Reset notes; process_duplicates() fills them per duplicated ChildSN.
        self.df['备注'] = ''

        grouped = self.df.groupby('PartNumber', dropna=False)
        total_groups = len(grouped)
        print(f"开始处理数据,共 {total_groups} 个分组...")
        print(f"输出文件信息,self.output_file:{self.output_file}")
        output_path = self.output_file

        # An openpyxl workbook with zero sheets cannot be saved — skip cleanly.
        if total_groups == 0:
            print("没有可处理的数据分组,跳过输出。")
            return

        # Context manager guarantees the writer is closed (and the file
        # finalized) even if processing one of the groups raises.
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            used_names = set()  # sanitized/truncated names may collide
            for i, (name, group) in enumerate(grouped):
                print(f"正在处理分组 {i + 1}/{total_groups}: {name}")
                group_processed = self.process_duplicates(group)

                # Format linkDate as 'yyyy-mm-dd hh:mm:ss' strings for output.
                group_out = group_processed.copy()
                group_out['linkDate'] = group_out['linkDate'].apply(
                    lambda x: x.strftime('%Y-%m-%d %H:%M:%S') if pd.notnull(x) else ''
                )

                # Disambiguate colliding sheet names: name_2, name_3, ...
                safe_name = self._safe_sheet_name(name)
                candidate, suffix = safe_name, 2
                while candidate in used_names:
                    tail = f"_{suffix}"
                    candidate = safe_name[:31 - len(tail)] + tail
                    suffix += 1
                used_names.add(candidate)

                group_out.to_excel(writer, sheet_name=candidate, index=False)

        print(f"处理完成! 结果已保存到: {output_path}")

    def process_duplicates(self, group):
        """De-duplicate *group* by ChildSN, keeping the newest linkDate row.

        For each duplicated ChildSN the surviving row's '备注' column records
        the duplicate count and the per-column differing values. Returns the
        reduced DataFrame (the input may be mutated in place).
        """
        duplicates = group[group.duplicated('ChildSN', keep=False)]
        if not duplicates.empty:
            print(f" 发现 {len(duplicates)} 行重复数据,正在处理...")
            for child_sn, dup_group in duplicates.groupby('ChildSN'):
                # Newest first; NaT sorts to the end (na_position default).
                dup_group = dup_group.sort_values('linkDate', ascending=False)

                # Collect columns whose values differ among the duplicates
                # (ChildSN and the note column itself are excluded).
                diff_info = {}
                for col in dup_group.columns:
                    if col in ['ChildSN', '备注']:
                        continue
                    unique_values = dup_group[col].unique()
                    if len(unique_values) > 1:
                        if col == 'linkDate':
                            # Dates get a readable, uniform representation.
                            vals = []
                            for v in unique_values:
                                if pd.isna(v):
                                    vals.append('')
                                elif isinstance(v, pd.Timestamp):
                                    vals.append(v.strftime('%Y-%m-%d %H:%M:%S'))
                                else:
                                    vals.append(str(v))
                            diff_info[col] = f"{col}: {', '.join(vals)}"
                        else:
                            diff_info[col] = f"{col}: {', '.join(map(str, unique_values))}"

                note = f"重复行数: {len(dup_group)}"
                if diff_info:
                    note += "; 差异内容: " + "; ".join(diff_info.values())

                # Write the note on every row of this ChildSN first, then drop
                # all but the newest — the survivor keeps the note.
                group.loc[group['ChildSN'] == child_sn, '备注'] = note
                group = group.drop(dup_group.index[1:])
        return group


def main():
    """Interactive entry point: prompt for a file, run the full pipeline."""
    print("=== Excel拆分工具 ===")
    file_path = input("请输入Excel文件路径: ").strip('"')
    if not os.path.exists(file_path):
        print("文件不存在,请检查路径")
        return
    start_time = time.time()
    try:
        processor = ExcelProcessor(file_path)
        if not processor.load_data():
            return
        processor.create_output_folder()
        processor.process_data()
        print("所有处理已完成!")
    except Exception as e:
        # Top-level boundary: report and fall through to the timing summary.
        print(f"处理过程中发生错误: {e}")
    end_time = time.time()
    print(f"总耗时: {end_time - start_time:.2f}秒")


if __name__ == "__main__":
    main()