优化前端

This commit is contained in:
Jimmy
2026-05-07 15:18:30 +08:00
parent 5c7e489e1c
commit 031ec4d289
5 changed files with 377 additions and 92 deletions

View File

@@ -19,7 +19,17 @@ SAVE_PATH = OUTPUT_DIR / "receipt_details_full_clean.json"
def fetch_receipt_details_full():
log("INFO", "=== 🚚 启动收货明细报表全量抓取 (精简字段模式) ===")
page = get_page(port=9222)
# 尝试加载已有的存档,实现真正的断点累加
all_clean_items = []
if SAVE_PATH.exists():
try:
with open(SAVE_PATH, "r", encoding="utf-8") as f:
all_clean_items = json.load(f)
log("INFO", f"📦 已加载本地历史存档,包含 {len(all_clean_items)} 条数据。")
except Exception as e:
log("WARN", f"加载本地存档失败: {e},将从空列表开始。")
all_clean_items = []
try:
log("INFO", f"正在回到主页起点: {HOME_URL}")
@@ -75,43 +85,107 @@ def fetch_receipt_details_full():
return
# =========================================================
# 第一页数据处理
# 第一页数据处理 (如果触发断点,则忽略第一页数据)
# =========================================================
log("OK", f"🎉 成功拦截到第一页数据HTTP: {packet.response.status}")
body = packet.response.body
data = body if isinstance(body, (dict, list)) else json.loads(body)
# Page number to start fetching from (1 means start from the beginning and fetch the full data set).
target_resume_page = 690
total_count = 0
if isinstance(data, dict) and "result" in data:
total_count = data["result"].get("totalCount", 0)
items = data["result"].get("items", [])
for item in items:
all_clean_items.append({
"采购订单号": item.get("purchaseOrderCode"),
"行号": item.get("rowsNum"),
"物料代码": item.get("materialCode"),
"物料名称": item.get("materialName"),
"物料规格": item.get("materialSpecification"),
"仓库代码": item.get("warehouseCode"),
"仓库名称": item.get("warehouseName"),
"供应商代码": item.get("supplierCode"),
"供应商名称": item.get("supplierName"),
"单位名称": item.get("unitName"),
"转换单位": item.get("convertUnitName"),
"收货单价": item.get("receivePrice"),
"收货时间": item.get("receiptTime"),
"进货数量": item.get("convertPlannedPurchaseQuantity") if item.get("convertPlannedPurchaseQuantity") is not None else item.get("plannedPurchaseQuantity"),
"收货数量": item.get("convertGoodsQuantity") if item.get("convertGoodsQuantity") is not None else item.get("goodsQuantity"),
"收货总金额": item.get("receiveAmount")
})
log("OK", f"第一页清洗完成,提取了 {len(items)} 条数据。后端报告总条数: {total_count}")
# Only when this is not a resume run (i.e. starting from page 1) is the first page's data added to the list.
if target_resume_page <= 1:
for item in items:
all_clean_items.append({
"采购订单号": item.get("purchaseOrderCode"),
"行号": item.get("rowsNum"),
"物料代码": item.get("materialCode"),
"物料名称": item.get("materialName"),
"物料规格": item.get("materialSpecification"),
"仓库代码": item.get("warehouseCode"),
"仓库名称": item.get("warehouseName"),
"供应商代码": item.get("supplierCode"),
"供应商名称": item.get("supplierName"),
"单位名称": item.get("unitName"),
"转换单位": item.get("convertUnitName"),
"收货单价": item.get("receivePrice"),
"收货时间": item.get("receiptTime"),
"进货数量": item.get("plannedPurchaseQuantity"),
"收货数量": item.get("convertGoodsQuantity") if item.get("convertGoodsQuantity") is not None else item.get("goodsQuantity"),
"收货总金额": item.get("receiveAmount")
})
log("OK", f"第一页清洗完成,提取了 {len(items)} 条数据。后端报告总条数: {total_count}")
else:
log("INFO", f"触发断点续传,跳过第一页的数据保存。后端报告总条数: {total_count}")
page_num = 1
# =========================================================
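# Resume logic: jump straight to target_resume_page and continue from there (the earlier note about stopping at page 711 no longer matches the configured value).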
# =========================================================
if target_resume_page > 1:
log("INFO", f"🚀 触发断点续传机制!准备直接跳转到第 {target_resume_page} 页...")
# 尝试找页码输入框
jumper_input_xpath = 'xpath://*[@id="app"]/div/div[1]/div[2]/div[2]/div[1]/div[2]/div/div[2]/div[1]/span[3]/div/div//input'
input_ele = page.ele(jumper_input_xpath, timeout=5)
if not input_ele:
jumper_input_xpath = 'xpath://input[@type="number" and @aria-label=""]'
input_ele = page.ele(jumper_input_xpath, timeout=5)
if input_ele:
input_ele.clear()
input_ele.input(str(target_resume_page))
time.sleep(0.5)
input_ele.input('\n')
packet = page.listen.wait(timeout=15)
if not packet:
log("ERR", "断点跳转失败,未拦截到目标页的数据请求。")
return
log("OK", f"✅ 成功跳转至第 {target_resume_page} 页并截获数据!")
page_num = target_resume_page
# Read and parse the data of the page that was just jumped to (target_resume_page).
body = packet.response.body
data = body if isinstance(body, (dict, list)) else json.loads(body)
if isinstance(data, dict) and "result" in data:
items = data["result"].get("items", [])
for item in items:
all_clean_items.append({
"采购订单号": item.get("purchaseOrderCode"),
"行号": item.get("rowsNum"),
"物料代码": item.get("materialCode"),
"物料名称": item.get("materialName"),
"物料规格": item.get("materialSpecification"),
"仓库代码": item.get("warehouseCode"),
"仓库名称": item.get("warehouseName"),
"供应商代码": item.get("supplierCode"),
"供应商名称": item.get("supplierName"),
"单位名称": item.get("unitName"),
"转换单位": item.get("convertUnitName"),
"收货单价": item.get("receivePrice"),
"收货时间": item.get("receiptTime"),
"进货数量": item.get("plannedPurchaseQuantity"),
"收货数量": item.get("convertGoodsQuantity") if item.get("convertGoodsQuantity") is not None else item.get("goodsQuantity"),
"收货总金额": item.get("receiveAmount")
})
log("OK", f"{page_num} 页清洗完成,累计提取 {len(all_clean_items)} 条数据。")
else:
log("ERR", "找不到页码输入框,断点跳转失败,将从第 1 页继续!")
# =========================================================
# 循环翻页抓取
# =========================================================
next_btn_xpath = 'xpath://*[@id="app"]/div/div[1]/div[2]/div[2]/div[1]/div[2]/div/div[2]/div[1]/button[2]'
while True:
# Add a human-like random delay (random between 2.5 and 5.5 seconds).
@@ -125,10 +199,31 @@ def fetch_receipt_details_full():
log("INFO", f"☕️ 已经连续高强度翻了 {page_num} 页,触发风控规避机制,假装喝水休息 {long_delay:.2f} 秒...")
time.sleep(long_delay)
next_btn = page.ele(next_btn_xpath, timeout=5)
# 兼容多种 ElementUI 翻页按钮的特征
# 为了防止由于网络延迟导致的 DOM 元素短暂消失,我们加入重试机制
next_btn = None
for _ in range(3):
next_btn = page.ele('xpath://button[contains(@class, "btn-next")]', timeout=3)
if next_btn:
break
time.sleep(1)
# 【修复】当跳页页数大于 400 页时,某些页面的 ElementUI 分页组件会为了节省 DOM 而卸载 next_btn
# 或者被包裹在隐藏容器里。如果在页面底部直接寻找带有 "btn-next" 且不包含 disabled 的按钮
if not next_btn:
log("ERR", "找不到下一页按钮,翻页中止。")
break
# 尝试备用定位方式:直接找右箭头图标所在的按钮
next_btn = page.ele('xpath://i[contains(@class, "el-icon-arrow-right")]/parent::button', timeout=3)
if not next_btn:
log("ERR", "重试 3 次后仍然找不到下一页按钮,可能是页面崩溃或会话超时,尝试强制刷新页面...")
page.refresh()
page.wait.load_start()
time.sleep(5)
# 刷新后尝试重新找一次
next_btn = page.ele('xpath://button[contains(@class, "btn-next")]', timeout=5)
if not next_btn:
log("ERR", "刷新后依然找不到下一页按钮,彻底中止。")
break
# 检查按钮是否被禁用
class_str = str(next_btn.attr("class"))
@@ -212,6 +307,13 @@ def fetch_receipt_details_full():
with open(rescue_path, "w", encoding="utf-8") as f:
json.dump(all_clean_items, f, ensure_ascii=False, indent=2)
log("INFO", f"🆘 触发异常保存,抢救了 {len(all_clean_items)} 条数据。")
finally:
# 无论脚本正常结束还是异常退出,都强制停止监听,防止成为僵尸爬虫
try:
page.listen.stop()
log("INFO", "🛑 已释放浏览器监听资源。")
except:
pass
# Script entry point: run the receipt-detail fetch only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    fetch_receipt_details_full()