guoguobaba:
WebRPA: 分布式网络爬虫
前言
WebRPA 是一个分布式的网络爬虫系统,基于 fastapi+fastadmin 开发,通过 web api 接口发起网络爬虫服务,实现流程自动化或数据自动抓取。它包含两部分:
- manager: 用于提供 web 服务,实现了 web server 和 websocket server 。用户通过 web api 发送请求,manager 会将查询请求通过 websocket 转发给对应的 worker ,由 worker 执行查询操作。
- worker: 通过 websocket 和 manager 连接,用于执行网络爬虫任务,接收 manager 的 websocket 请求,并执行查询操作。worker 可以跨网络分布式部署。
graph LR
client-->manager-->worker1
manager-->worker2
manager-->workers[worker...]
主要实现的功能包括:
- 爬虫流程自定义:通过 json 指定访问网页的各种操作,实现流程自动化,并支持通过 markdown flowchart 语法定义流程,实现在不同条件下的跳转
- 数据自动抓取:支持通过 js 对网页数据进行处理,获取结构化的数据以及对页面某个区域进行自动截图
- 跨网络分布式部署:worker 可以跨网络分布式部署,实现负载均衡,并可以根据查询类型不同指定专用的 worker 。
- 代理池支持:支持自定义代理池
- undetected chromedriver:支持无头浏览器以及绕开网站的反爬检测。支持通过 selenium 和 requests 组合查询
- 自定义 captcha 引擎:支持对图形验证码或者其他验证码的自动识别和点击等操作,并增加随机扰动,避免 antibot 检测。
- 缓存和自动重试:支持缓存和自动重试,查询失败的请求会自动在闲时重试。
- 自动扩容:支持 k8s 部署,一键扩容。
- 权限和审计:支持数据源权限模型,不同的用户对不同的数据源具备不同的权限,并提供审计数据。
TODO
引入 browser use ,通过 LLM 自动创建数据爬虫服务。
一个示例,爬取某网站
{
"name": "szreorc",
"desc": "深圳不动产查询",
"driver": "firefox",
"url": "",
"debug": true,
"window_size": "1920x1080",
"action_timeout": 5,
"wait_redirect": true,
"wait_redirect_interval": 2,
"identifier": "{username}-{BuildingName}-{UNIT_NO}",
"credential": "{username}",
"actions": {
"1": {
"desc": "确认登录",
"action": "check_variable",
"options": {"script": "return window.location.href;",
"target": "^https://pnr.sz.gov.cn/d-ghrer/reroosp/ytcf"
}
},
"10": {
"desc" : "用户名密码登录",
"action": "click",
"timeout": 2,
"target": ["xpath", "//a[contains(@class, 'login-tab') and normalize-space(text())='账号密码']"]
},
"11": {
"desc" : "输入用户名",
"action": "input_text",
"target": ["xpath", "//input[@type='text' and @placeholder='请输入账号']"],
"param": "username"
},
"12": {
"desc": "增加计数",
"action": "variable",
"options": {"variable":"counter1","operator": "+"}
},
"13": {
"desc": "检测计数",
"action": "variable",
"stop_on_fail": true,
"options": {"variable":"counter1","operator": "<", "target": 2, "sleep": 2000}
},
"14": {
"desc" : "输入密码",
"action": "input_text",
"target": ["xpath", "//input[@type='password' and @placeholder='请输入密码']"],
"param": "password"
},
"15": {
"desc": "识别 captcha",
"action": "decode_captcha_code",
"target": ["xpath","//div[contains(@class, 'captcha-body') and @title='点击刷新']"],
"options": {"code_type": 11}
},
"16": {
"desc": "输入 captcha",
"action": "input_text",
"target": ["xpath","//div[contains(@class, 'account_verifying')] //input[@type='text']"]
},
"17": {
"desc": "点击登录",
"action": "click",
"target": ["xpath", "//button[contains(@class, 'gd-btn-primary') and contains(@class, 'gd-btn') and @type='button']//span[starts-with(text(), '登录 ')]"]
},
"18": {
"desc": "继续登录",
"action": "click",
"target": ["xpath", "//button[.//span[contains(text(), '继续登录')]]"]
},
"20": {
"desc": "确认选择",
"action": "click",
"timeout": 10,
"stop_on_fail": true,
"fail_message": "login failed",
"options": {"set_credential": true},
"target": ["class name", "jinruxuzhi-checkbox"]
},
"21": {
"desc": "确认选择下一步",
"action": "click",
"target": ["class name", "jinruxuzhi-buttonOk"]
},
"30": {
"desc": "展开查询类型",
"action": "click",
"options": {"sleep": 2},
"target": ["xpath", "//input[@type='text' and @placeholder='请选择']"]
},
"31": {
"desc": "等待下拉菜单",
"action": "wait_element",
"options": {"visible": true},
"target": ["css selector", "div.el-select-dropdown.el-popper"]
},
"32": {
"desc": "选择查询类型",
"action": "click",
"target": ["xpath", "//li[contains(@class, 'el-select-dropdown__item') and span[text()='楼名及栋名']]"]
},
"33": {
"desc" : "输入查询内容",
"action": "input_text",
"target": ["xpath", "//input[@type='text' and @placeholder='请输入内容']"],
"param": "BuildingName"
},
"34": {
"desc": "点击查询",
"action": "click",
"target": ["class name", "el-icon-search"]
},
"35": {
"desc": "点击截图对象",
"action": "click",
"timeout": 20,
"stop_on_fail": true,
"fail_message": "search failed",
"target": ["xpath", "//div[contains(@class, 'el-dialog__wrapper')]//div[contains(@class, 'el-tabs__item') and normalize-space(text())='楼宇']"]
},
"40": {
"desc": "获取数据",
"action": "get_data",
"options": {"script": "var table = document.querySelector(\"#pane-1 table.is-bordered.el-descriptions--mini\");\nvar fields = [\"土地坐落\", \"楼名及栋名\", \"房屋类型\", \"房屋性质\", \"房屋用途\"];\nvar result = {};\nif (table) {\n var rows = table.querySelectorAll(\"tr.el-descriptions-row\");\n rows.forEach(function(row) {\n var label = row.querySelector(\"th.el-descriptions-item__label\").innerText.trim();\n var content = row.querySelector(\"td.el-descriptions-item__content\").innerText.trim();\n if (fields.includes(label)) {\n result[label] = content;\n }\n });\n console.log(JSON.stringify(result));\n} else {\n console.log(\"Table not found.\");\n};\nreturn result;\n"}
},
"41": {
"desc": "点击截图对象",
"action": "click",
"target": ["xpath", "//div[contains(@class, 'el-dialog__wrapper')]//div[contains(@class, 'el-tabs__item') and normalize-space(text())='房屋']"]
},
"42": {
"desc": "下拉房屋查询",
"action": "click",
"target": ["css selector", "#pane-2 input.el-input__inner"]
},
"43": {
"desc": "点击房屋查询",
"action": "click",
"target": ["xpath", "//li[contains(@class, 'el-select-dropdown__item')]//span[text()='{UNIT_NO}']"],
"param": "UNIT_NO"
},
"44": {
"desc": "截图",
"action": "screenshot",
"target": ["class name", "el-dialog__wrapper"],
"options": {"visible": true}
},
"45": {
"desc": "获取数据",
"action": "get_data",
"options": {"script": "var table = document.querySelector(\"#pane-2 table.is-bordered.el-descriptions--mini\");\nvar fields = [\"房号\", \"所在楼层\", \"建筑面积\", \"使用年限\", \"存在抵押\", \"存在查封\", \"存在异议\", \"存在居住权\"];\nvar result = {};\nif (table) {\n var rows = table.querySelectorAll(\"tr.el-descriptions-row\");\n rows.forEach(function(row) {\n var label = row.querySelector(\"th.el-descriptions-item__label\").innerText.trim();\n var content = row.querySelector(\"td.el-descriptions-item__content\").innerText.trim();\n if (fields.includes(label)) {\n result[label] = content;\n }\n });\n console.log(JSON.stringify(result));\n} else {\n console.log(\"Table not found.\");\n};\nreturn result;\n"}
}
},
"processes": "start->1\n1(no)->10->11\n11(no)->12->13\n13(yes)->10\n11(yes)->14->15->16->17->18->20->21->30->31->32->33->34->35->40->41->42->43->44->45->end\n1(yes)->20",
"result":["screenshot", "data"]
}