Python脚本开发文件初始化
This commit is contained in:
9
IMULinkdata/.gitignore
vendored
Normal file
9
IMULinkdata/.gitignore
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
/build/*
|
||||
/build
|
||||
/dist/*
|
||||
/dist
|
||||
/source/*
|
||||
/source
|
||||
|
||||
|
||||
LINLinkData_V2.py
|
||||
252
IMULinkdata/LINLinkData_V1.py
Normal file
252
IMULinkdata/LINLinkData_V1.py
Normal file
@@ -0,0 +1,252 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
import argparse
|
||||
|
||||
import re
|
||||
import time
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import openpyxl
|
||||
|
||||
|
||||
class ExcelProcessor:
    """Split the "link" worksheet of an Excel workbook into one sheet per PartNumber.

    Intended call order: ``load_data()`` -> ``create_output_folder()`` ->
    ``process_data()``. Within every PartNumber group, rows sharing a ChildSN
    are collapsed to the row with the newest ``linkDate`` and the differences
    between the collapsed rows are summarised in the ``备注`` (remark) column.
    """

    # Format used for every linkDate value written to the output or to a remark.
    _DATE_OUTPUT_FORMAT = '%Y-%m-%d %H:%M:%S'
    # Excel rejects worksheet names longer than this.
    _MAX_SHEET_NAME_LEN = 31

    def __init__(self, file_path):
        """Remember the workbook path and take the timestamp used in output names.

        Args:
            file_path: Path of the Excel workbook to process.
        """
        self.file_path = file_path
        self.df = None
        self.output_folder = None
        self.output_file = None
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Reserved for processed data; not populated by the methods of this class.
        self.processed_data = {}

    def load_data(self):
        """Load and validate the first worksheet whose name contains "link".

        The match on the sheet name is case-insensitive. ``LinkObject`` is
        accepted as an alias for a missing ``PartNumber`` column, ``linkDate``
        is parsed as ``month/day/year hh:mm:ss AM/PM`` (unparseable values
        become NaT) and an empty ``备注`` column is added when absent.

        Returns:
            bool: True when ``self.df`` was populated, False otherwise. All
            failures are reported on stdout instead of being raised.
        """
        print(f"正在加载文件: {self.file_path}")
        try:
            # The context manager releases the workbook handle even on failure.
            with pd.ExcelFile(self.file_path, engine='openpyxl') as workbook:
                target_sheet = next(
                    (sheet for sheet in workbook.sheet_names if 'link' in sheet.lower()),
                    None,
                )
                if target_sheet is None:
                    # Only a genuinely missing sheet gets this hint; any other
                    # error falls through to the generic handler below so the
                    # real cause is shown to the user.
                    print("提示: 未找到包含'LINK' sheet,请检查文件内容。")
                    return False
                df = workbook.parse(sheet_name=target_sheet)

            # Accept LinkObject as a stand-in for PartNumber.
            if 'PartNumber' not in df.columns and 'LinkObject' in df.columns:
                df['PartNumber'] = df['LinkObject']

            required_cols = ["PartNumber", "ChildSN", "linkDate"]
            missing = [c for c in required_cols if c not in df.columns]
            if missing:
                raise ValueError(f"数据表中缺少必要列: {', '.join(missing)}")

            # Strict 12-hour format, e.g. "5/24/2025 6:00:13 PM";
            # errors='coerce' turns anything else into NaT.
            df['linkDate'] = pd.to_datetime(
                df['linkDate'],
                format='%m/%d/%Y %I:%M:%S %p',
                errors='coerce',
            )

            total = len(df)
            invalid = int(df['linkDate'].isna().sum())
            print(f"文件加载成功,总行数: {total},日期解析失败: {invalid} 行")

            if '备注' not in df.columns:
                df['备注'] = ''

            # Publish the frame only once it is fully validated.
            self.df = df
            return True
        except Exception as e:
            print(f"加载文件失败: {str(e)}")
            return False

    def create_output_folder(self):
        """Derive ``output_folder`` / ``output_file`` from the input path.

        The result workbook is placed next to the input file and is named
        ``<input stem>_split_by_PartNumber_<timestamp>.xlsx``.
        """
        original_name = os.path.splitext(os.path.basename(self.file_path))[0]

        # abspath() keeps the folder non-empty when only a bare file name was
        # given; dirname('data.xlsx') would be '' and makedirs('') raises.
        self.output_folder = os.path.dirname(os.path.abspath(self.file_path))
        self.output_file = os.path.join(
            self.output_folder,
            f"{original_name}_split_by_PartNumber_{self.timestamp}.xlsx",
        )

        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder, exist_ok=True)
            print(f"已创建输出文件夹: {self.output_folder}")

    def _safe_sheet_name(self, name):
        """Return ``name`` as a legal Excel sheet name.

        The characters ``: \\ / ? * [ ]`` are replaced by ``_``, surrounding
        whitespace is removed, the result is cut to 31 characters and an empty
        result falls back to ``'Sheet'``.
        """
        s = re.sub(r'[:\\/\?\*\[\]]', '_', str(name)).strip()
        if len(s) > self._MAX_SHEET_NAME_LEN:
            s = s[:self._MAX_SHEET_NAME_LEN]
        return s or 'Sheet'

    def _unique_sheet_name(self, name, used_names):
        """Return a legal sheet name that does not collide with ``used_names``.

        Args:
            name: Raw group key to derive the sheet name from.
            used_names: Set of lower-cased names already handed out; the
                returned name is added to it.

        Sanitising/truncating can map different keys to the same name; a
        ``_2``, ``_3``, ... suffix keeps them apart. The comparison is
        case-insensitive.
        """
        base = self._safe_sheet_name(name)
        candidate = base
        counter = 1
        while candidate.lower() in used_names:
            counter += 1
            suffix = f"_{counter}"
            candidate = base[:self._MAX_SHEET_NAME_LEN - len(suffix)] + suffix
        used_names.add(candidate.lower())
        return candidate

    @classmethod
    def _format_link_date(cls, value):
        """Format one linkDate value for output; missing values become ''.

        ``numpy.datetime64`` and ``datetime`` values are normalised through
        ``pd.Timestamp``; values that cannot be converted are returned via
        ``str()``.
        """
        if pd.isna(value):
            return ''
        try:
            return pd.Timestamp(value).strftime(cls._DATE_OUTPUT_FORMAT)
        except (ValueError, TypeError):
            return str(value)

    def process_data(self):
        """De-duplicate every PartNumber group and write each to its own sheet.

        Raises:
            ValueError: If no data is loaded, or neither ``PartNumber`` nor
                ``LinkObject`` is present.
        """
        if self.df is None:
            raise ValueError("数据未加载,请先调用 load_data() 方法")

        if 'PartNumber' not in self.df.columns:
            if 'LinkObject' in self.df.columns:
                self.df['PartNumber'] = self.df['LinkObject']
            else:
                raise ValueError("数据表中既没有PartNumber也没有LinkObject列")

        # Reset the remark column; process_duplicates() fills it in for the
        # rows that survive de-duplication.
        self.df['备注'] = ''

        # dropna=False keeps rows without a PartNumber as their own group.
        grouped = self.df.groupby('PartNumber', dropna=False)
        total_groups = len(grouped)
        print(f"开始处理数据,共 {total_groups} 个分组...")

        if total_groups == 0:
            # A workbook without any sheet cannot be saved, so stop here.
            print("没有可拆分的数据,未生成输出文件。")
            return

        if self.output_file is None:
            self.create_output_folder()

        print(f"输出文件信息,self.output_file:{self.output_file}")
        output_path = self.output_file

        used_sheet_names = set()
        # The context manager saves and closes the workbook even when a group
        # fails half-way through.
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            for i, (name, group) in enumerate(grouped):
                print(f"正在处理分组 {i + 1}/{total_groups}: {name}")

                # Keep only the newest row per ChildSN.
                group_out = self.process_duplicates(group)

                # Write linkDate as "yyyy-mm-dd hh:mm:ss" text.
                group_out['linkDate'] = group_out['linkDate'].apply(self._format_link_date)

                # Two groups must never share a sheet name, otherwise the
                # second one is written over the first one's cells.
                sheet_name = self._unique_sheet_name(name, used_sheet_names)
                group_out.to_excel(writer, sheet_name=sheet_name, index=False)

        print(f"处理完成! 结果已保存到: {output_path}")

    def process_duplicates(self, group):
        """Collapse rows sharing a ChildSN to the one with the newest linkDate.

        For every duplicated ChildSN the kept row's ``备注`` records how many
        rows were merged and, per column (except ChildSN and 备注), the
        distinct values that differed between them.

        Args:
            group: DataFrame holding the rows of one PartNumber group.

        Returns:
            A new DataFrame; ``group`` itself is not modified.
        """
        # Work on a copy: ``group`` is usually a slice handed out by groupby.
        group = group.copy()

        duplicates = group[group.duplicated('ChildSN', keep=False)]
        if duplicates.empty:
            return group

        print(f"  发现 {len(duplicates)} 行重复数据,正在处理...")

        rows_to_drop = []
        for child_sn, dup_group in duplicates.groupby('ChildSN'):
            # Newest first, NaT last. mergesort is stable, so rows with equal
            # dates keep their original relative order.
            dup_group = dup_group.sort_values('linkDate', ascending=False, kind='mergesort')

            diff_info = {}
            for col in dup_group.columns:
                if col in ['ChildSN', '备注']:
                    continue
                unique_values = dup_group[col].unique()
                if len(unique_values) <= 1:
                    continue
                if col == 'linkDate':
                    # unique() may hand back raw numpy datetime64 values, so
                    # go through the shared formatter.
                    vals = [self._format_link_date(v) for v in unique_values]
                else:
                    vals = map(str, unique_values)
                diff_info[col] = f"{col}: {', '.join(vals)}"

            note = f"重复行数: {len(dup_group)}"
            if diff_info:
                note += "; 差异内容: " + "; ".join(diff_info.values())

            # Tag every row of this ChildSN; all but the newest are dropped below.
            group.loc[group['ChildSN'] == child_sn, '备注'] = note
            rows_to_drop.extend(dup_group.index[1:])

        # A single drop instead of one per duplicated ChildSN.
        return group.drop(rows_to_drop)
|
||||
|
||||
|
||||
def main():
    """Interactive entry point: ask for a workbook path and split it.

    The entered path is cleaned of surrounding whitespace and quote characters
    (as left behind by copy/paste or drag-and-drop) before it is checked.
    Errors raised while processing are reported on stdout rather than
    propagated.
    """
    print("=== Excel拆分工具 ===")
    raw_path = input("请输入Excel文件路径: ")
    # Strip whitespace first so quotes hidden behind it are reachable, then
    # strip again for blanks that sat inside the quotes.
    file_path = raw_path.strip().strip('"\'').strip()
    # isfile() also rejects directories, which cannot be opened as a workbook.
    if not os.path.isfile(file_path):
        print("文件不存在,请检查路径")
        return

    start_time = time.time()

    try:
        processor = ExcelProcessor(file_path)

        # load_data() reports its own failure; nothing more to do here.
        if not processor.load_data():
            return

        processor.create_output_folder()
        processor.process_data()

        print("所有处理已完成!")
    except Exception as e:
        # Top-level boundary of the tool: report and fall through to the timing line.
        print(f"处理过程中发生错误: {e}")

    end_time = time.time()
    print(f"总耗时: {end_time - start_time:.2f}秒")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user