Files
datie-bom/browser_login/analyze_duplicates.py

43 lines
1.3 KiB
Python

import json
from collections import defaultdict
from config import OUTPUT_DIR
# Fields that together identify one issue-receipt detail record.
KEY_FIELDS = ("发料单号", "行号", "物料代码")

# Upper bound on how many duplicate groups are printed as examples.
MAX_EXAMPLES = 5


def _is_missing(value):
    """Return True when a key field is absent (``None``) or an empty string.

    A plain truthiness test is deliberately avoided: falsy-but-real values
    such as a line number of ``0`` must still count as present.
    """
    return value is None or value == ""


def find_duplicates(records):
    """Group record indices by their composite key and keep the repeated ones.

    Args:
        records: Sequence of dict-like records; each is read via ``.get`` for
            the fields listed in ``KEY_FIELDS``.

    Returns:
        A ``(duplicates, null_keys)`` tuple where ``duplicates`` maps each
        composite key (a tuple of the stringified key fields) to the list of
        0-based indices at which it occurs, restricted to keys occurring more
        than once, and ``null_keys`` is the number of records skipped because
        at least one key field was missing.
    """
    seen = defaultdict(list)
    null_keys = 0
    for idx, item in enumerate(records):
        values = [item.get(field) for field in KEY_FIELDS]
        if any(_is_missing(value) for value in values):
            null_keys += 1
            continue
        # A tuple key (instead of an underscore-joined string) cannot collide
        # when a field itself contains "_". Each part is stringified so that
        # records which compared equal under the old string key still do.
        seen[tuple(str(value) for value in values)].append(idx)
    duplicates = {key: idxs for key, idxs in seen.items() if len(idxs) > 1}
    return duplicates, null_keys


def main():
    """Load the issue-receipt detail dump and print a duplicate-key report."""
    filepath = OUTPUT_DIR / "issue_receipt_details_full.json"
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    duplicates, null_keys = find_duplicates(data)

    print(f"总数据条数: {len(data)}")
    print(f"缺失关键字段的数据条数: {null_keys}")
    print(f"发现重复的组合数: {len(duplicates)}")
    # Every occurrence beyond the first one of a key is redundant.
    redundant_count = sum(len(idxs) - 1 for idxs in duplicates.values())
    print(f"因重复而多出的冗余条数: {redundant_count}")

    # Print the first few duplicated keys as examples; zip with a range stops
    # after MAX_EXAMPLES groups without a manual counter.
    for _, (key, indices) in zip(range(MAX_EXAMPLES), duplicates.items()):
        # Only the first two occurrences of each group are shown.
        first, second = indices[0], indices[1]
        print(f"\n重复键 (发料单号_行号_物料代码): {'_'.join(key)}")
        print(f" 第一次出现在第 {first + 1} 条,最新状态: {data[first].get('状态')}")
        print(f" 第二次出现在第 {second + 1} 条,最新状态: {data[second].get('状态')}")


if __name__ == "__main__":
    main()