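"""
BOM cost - full tree scraping script (site-wide: 1400+ parent parts with up to
5 levels of nested child parts).

Goals:
1. Collect every parent part from the cost-accounting list page.
2. Issue in-page background requests for each parent's BOM cost data
   (returned as a flat list spanning up to 5 levels).
3. Clean the data on the fly and rebuild it into a properly nested JSON tree.
"""
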
import sys
import json
import time
import random
from pathlib import Path

# Put this file's directory first on sys.path so the `login` and `config`
# imports below resolve when the script is run directly (presumably both
# modules live next to this file - confirm against the repository layout).
sys.path.insert(0, str(Path(__file__).parent))
from login import get_page, log
from config import OUTPUT_DIR

# Page that is opened to list the parent parts.
PAGE_URL = "https://yunmes.tftykj.cn/PartBomCostAccounting"
# Target handed to the network listener to pick out the parent-list requests.
API_PARENT = "PartBomCostAccounting_SearchList_Proxy"

# File the final (and intermediate checkpoint) tree is written to.
TREE_FILE_PATH = OUTPUT_DIR / "bom_cost_full_tree_final.json"


def build_nested_tree(flat_items):
    """Convert a flat list of rows linked by ``_parentId`` into a nested tree.

    Every row is identified by its ``sonNO`` and names its parent row through
    ``_parentId``. Rows with a falsy ``sonNO`` are skipped because they cannot
    be linked. If two rows share a ``sonNO`` the later one wins.

    A row becomes a root when its ``_parentId`` is ``None``, when it refers to
    a ``sonNO`` that is not present in ``flat_items``, or when it refers to the
    row itself. Keeping such rows as roots avoids silently losing them (and
    everything below them) when the parent reference cannot be resolved.
    Rows that only reference each other in a loop are reachable from no root
    and are therefore not part of the result.

    Args:
        flat_items: Iterable of dicts read via ``sonNO``, ``_parentId``,
            ``bomLevel``, ``childMaterialCode``, ``childMaterialName`` and
            ``childrenMaterialConsumption``. ``None`` or an empty iterable is
            accepted.

    Returns:
        A list of root nodes in input order. Each node holds ``bomLevel``,
        ``childMaterialCode``, ``childMaterialName`` and ``usageQty``
        (1 when ``childrenMaterialConsumption`` is absent), plus a
        ``sub_items`` list only when the node has children.
    """
    if not flat_items:
        return []

    # Pass 1: index a trimmed copy of every row by its identifier.
    nodes = {}
    for item in flat_items:
        son_no = item.get("sonNO")
        if not son_no:
            continue
        nodes[son_no] = {
            "sonNO": son_no,
            "_parentId": item.get("_parentId"),
            "bomLevel": item.get("bomLevel"),
            "childMaterialCode": item.get("childMaterialCode"),
            "childMaterialName": item.get("childMaterialName"),
            "usageQty": item.get("childrenMaterialConsumption", 1),
            "sub_items": [],
        }

    # Pass 2: hang each node under its parent, or keep it as a root when the
    # parent cannot be resolved.
    roots = []
    for node in nodes.values():
        parent = nodes.get(node["_parentId"])
        if parent is None or parent is node:
            roots.append(node)
        else:
            parent["sub_items"].append(node)

    # Pass 3: drop the linking fields that were only needed to build the tree,
    # and remove empty ``sub_items`` lists from leaves.
    pending = list(roots)
    while pending:
        node = pending.pop()
        node.pop("sonNO", None)
        node.pop("_parentId", None)
        if node["sub_items"]:
            pending.extend(node["sub_items"])
        else:
            node.pop("sub_items", None)

    return roots


# Absolute XPath of the pager's "next page" control. It is tied to the current
# page layout and will need updating if the markup changes.
_NEXT_PAGE_XPATH = "xpath:/html/body/div[1]/div/div[3]/table/tbody/tr/td[10]/a/span/span[2]"

# Script run inside the page: it posts the tree request through the page's own
# jQuery and resolves with {status, data}. It always resolves (never rejects),
# so failures come back as status 'error'.
_BOM_TREE_JS_TEMPLATE = """
return new Promise((resolve, reject) => {
    if (typeof $ !== 'undefined' && $.ajax) {
        $.ajax({
            url: '/api/services/TfTechApi/PartBom/PartBom_SearchByTreeCost',
            type: 'POST',
            data: {
                materialId: MATERIAL_ID_PLACEHOLDER,
                partBomCostAccountingId: ACCOUNTING_ID_PLACEHOLDER,
                childMaterialCode: '',
                childMaterialName: '',
                childMaterialSpecification: '',
                childMaterialModel: ''
            },
            headers: {
                'referer': REFERER_PLACEHOLDER
            },
            success: function(response) {
                resolve({status: 'success', data: response});
            },
            error: function(xhr, status, error) {
                resolve({status: 'error', data: xhr.responseText || error});
            }
        });
    } else {
        resolve({status: 'error', data: 'No jQuery'});
    }
});
"""


def _js_literal(value):
    """Render *value* as a JavaScript literal that is safe to splice into a script.

    All-digit values are emitted bare, exactly as plain ``str()`` substitution
    would produce. Anything else is emitted as a quoted, escaped string so a
    non-numeric id cannot yield invalid script.
    """
    text = str(value)
    if text and all(ch in "0123456789" for ch in text):
        return text
    return json.dumps(text)


def _collect_parents(page):
    """Page through the parent list and return one record per parent part.

    Each record carries ``_id`` and ``_materialId`` (needed for the tree
    request), the parent's code and name, and an empty ``bom_cost_tree``.
    Collection stops on a listener timeout, an unreadable response, when the
    reported total has been reached, or when no further page is available.
    """
    parents = []

    log("INFO", f"正在访问安全的父件页面: {PAGE_URL}")
    page.get(PAGE_URL)
    page.wait.load_start()

    log("INFO", f"开启父件 API 网络监听: {API_PARENT}")
    page.listen.start(API_PARENT)
    try:
        # Reload so the first list request is issued while the listener is on.
        page.refresh()

        current_page = 1
        while True:
            log("INFO", f"等待第 {current_page} 页父件 API 响应...")
            packet = page.listen.wait(timeout=20)
            if not packet:
                log("ERR", f"超时未收到第 {current_page} 页数据,父件扫荡结束。")
                break

            body = packet.response.body
            try:
                data = body if isinstance(body, (dict, list)) else json.loads(body)
            except (TypeError, ValueError):
                log("ERR", f"第 {current_page} 页响应不是合法 JSON,父件扫荡结束。")
                break

            result = data.get("result") if isinstance(data, dict) else None
            if not isinstance(result, dict):
                log("ERR", f"第 {current_page} 页响应格式异常,父件扫荡结束。")
                break

            items = result.get("items") or []
            total_records = result.get("totalCount") or 0

            for item in items:
                parents.append({
                    # Accounting record id (sent as partBomCostAccountingId).
                    "_id": item.get("id"),
                    # Parent material id (sent as materialId).
                    "_materialId": item.get("parentMaterialId"),
                    "parentMaterialCode": item.get("parentMaterialCode"),
                    "parentMaterialName": item.get("parentMaterialName"),
                    # Filled in later with the nested tree.
                    "bom_cost_tree": [],
                })

            log("OK", f"提取了 {len(items)} 个父件。总进度: {len(parents)}/{total_records}")
            if len(parents) >= total_records or not items:
                break

            next_btn = page.ele(_NEXT_PAGE_XPATH, timeout=5)
            if not next_btn:
                log("WARN", "未找到下一页按钮,停止翻页。")
                break

            parent_a = next_btn.parent(2)
            # attr() may yield nothing when the element has no class attribute.
            if parent_a and "disabled" in (parent_a.attr("class") or ""):
                log("INFO", "已到达最后一页。")
                break

            page.run_js("arguments[0].click();", next_btn)
            time.sleep(1.5)
            current_page += 1
    finally:
        # Always release the listener, even when a page fails mid-way.
        page.listen.stop()

    return parents


def _attach_cost_tree(page, parent):
    """Request one parent's flat BOM list and store the nested tree on *parent*.

    On a failed request or an unexpected response shape the problem is logged
    and ``parent["bom_cost_tree"]`` is left untouched.
    """
    accounting_id = parent["_id"]
    referer = f"{PAGE_URL}/Detail?id={accounting_id}"
    js_code = (
        _BOM_TREE_JS_TEMPLATE
        .replace("MATERIAL_ID_PLACEHOLDER", _js_literal(parent["_materialId"]))
        .replace("ACCOUNTING_ID_PLACEHOLDER", _js_literal(accounting_id))
        .replace("REFERER_PLACEHOLDER", json.dumps(referer))
    )
    result = page.run_js(js_code)

    if not (isinstance(result, dict) and result.get("status") == "success"):
        detail = result.get("data") if isinstance(result, dict) and result else "未知错误"
        log("ERR", f" └── 请求失败: {detail}")
        return

    data = result.get("data")
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except ValueError:
            # Not JSON; reported below as an unexpected response shape.
            pass

    flat_items = data.get("result") if isinstance(data, dict) else None
    if not isinstance(flat_items, list):
        log("WARN", " └── 响应格式异常,未找到 BOM 列表,已跳过。")
        return

    parent["bom_cost_tree"] = build_nested_tree(flat_items)
    log("OK", f" └── 成功重组了一棵包含 {len(flat_items)} 个节点的多层树。")


def _save_progress(parents):
    """Write *parents* to ``TREE_FILE_PATH`` without the request-only id fields."""
    clean_save_list = [
        {key: value for key, value in parent.items() if key not in ("_id", "_materialId")}
        for parent in parents
    ]
    with open(TREE_FILE_PATH, "w", encoding="utf-8") as f:
        json.dump(clean_save_list, f, ensure_ascii=False, indent=2)


def fetch_bom_cost_tree():
    """Scrape every parent part and its nested BOM cost tree into a JSON file.

    Phase 1 collects the parent parts from the paginated list page. Phase 2
    requests each parent's flat BOM list from inside the page, rebuilds it
    into a tree with ``build_nested_tree`` and attaches it to the parent.
    Progress is written to ``TREE_FILE_PATH`` after every 10th parent and after
    the last one; parents without ids are skipped for the request but still
    count towards those checkpoints, so the final batch is always written.

    Any exception raised after the browser page has been obtained is logged
    and swallowed. The function returns ``None``.
    """
    log("INFO", "=== 🌳 启动 BOM 成本终极抓取 (多层嵌套自动重组) ===")
    page = get_page(port=9222)

    try:
        # Phase 1: basic information for every parent part.
        clean_parents_list = _collect_parents(page)

        # Phase 2: fetch and rebuild the tree for each parent.
        total = len(clean_parents_list)
        log("INFO", f"=== 🚀 开始为 {total} 个父件抓取 BOM 成本树 ===")

        for index, parent in enumerate(clean_parents_list):
            done = index + 1

            if parent.get("_id") and parent.get("_materialId"):
                parent_code = parent.get("parentMaterialCode", "未知")
                log("INFO", f"[{done}/{total}] 正在请求 BOM 成本树 (Code: {parent_code})...")
                _attach_cost_tree(page, parent)
                # Small random pause between requests.
                time.sleep(random.uniform(0.3, 0.7))

            # Checkpoint outside the id check so a skipped parent at a
            # boundary (in particular the very last one) cannot suppress it.
            if done % 10 == 0 or done == total:
                _save_progress(clean_parents_list[:done])
                log("INFO", f"💾 进度已实时保存至 JSON ({done}/{total})")

        log("OK", f"=== 🏆 终极 BOM 成本多层树状抓取完成!文件路径: {TREE_FILE_PATH} ===")

    except Exception as e:
        log("ERR", f"发生异常: {e}")


if __name__ == "__main__":
    # Run the scrape only when executed as a script, not when imported.
    fetch_bom_cost_tree()