feat: harden douyin sync diagnostics and manual fallback
This commit is contained in:
@@ -436,6 +436,31 @@ def _extract_videos(payloads: Iterable[Any]) -> list[dict[str, Any]]:
|
||||
return videos
|
||||
|
||||
|
||||
def _normalize_profile_url_input(value: str) -> str:
|
||||
text = str(value or "").strip()
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
match = re.search(r"https?://[^\s]+", text)
|
||||
if match:
|
||||
text = match.group(0)
|
||||
|
||||
text = text.strip().strip(",。;;、,)")
|
||||
if text.startswith("www.douyin.com/") or text.startswith("douyin.com/"):
|
||||
text = f"https://{text}"
|
||||
return text
|
||||
|
||||
|
||||
def _looks_like_douyin_anti_bot_page(html: str) -> bool:
|
||||
markers = (
|
||||
"window.byted_acrawler.init",
|
||||
"__ac_signature",
|
||||
"__ac_nonce",
|
||||
"window.location.reload()"
|
||||
)
|
||||
return any(marker in html for marker in markers)
|
||||
|
||||
|
||||
async def _fetch_html(url: str, cookie: str = "") -> tuple[str, str]:
|
||||
headers = {
|
||||
"User-Agent": DEFAULT_USER_AGENT,
|
||||
@@ -786,7 +811,7 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
|
||||
return [profile_map[profile_id] for profile_id in requested_ids]
|
||||
|
||||
async def _collect_public_profile(profile_url: str, manual_payload: dict[str, Any] | None) -> dict[str, Any]:
|
||||
source_url = profile_url.strip()
|
||||
source_url = _normalize_profile_url_input(profile_url)
|
||||
blobs: list[dict[str, Any]] = []
|
||||
errors: list[str] = []
|
||||
|
||||
@@ -797,7 +822,18 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
|
||||
try:
|
||||
final_url, html = await _fetch_html(source_url)
|
||||
source_url = final_url
|
||||
blobs.extend(_extract_json_blobs_from_html(html))
|
||||
if not html.strip():
|
||||
errors.append("public_profile_empty_html")
|
||||
elif _looks_like_douyin_anti_bot_page(html):
|
||||
errors.append("public_profile_anti_bot_challenge")
|
||||
elif not blobs:
|
||||
blobs.extend(_extract_json_blobs_from_html(html))
|
||||
if not blobs:
|
||||
errors.append("public_profile_no_json_blobs")
|
||||
else:
|
||||
blobs.extend(_extract_json_blobs_from_html(html))
|
||||
if not blobs:
|
||||
errors.append("public_profile_no_json_blobs")
|
||||
except Exception as exc:
|
||||
errors.append(f"public_profile_fetch_failed: {exc}")
|
||||
|
||||
@@ -807,6 +843,10 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
|
||||
fallback_url=source_url
|
||||
)
|
||||
videos = _extract_videos(payloads)
|
||||
if source_url and not profile.get("nickname") and not videos and not errors:
|
||||
if not blobs:
|
||||
errors.append("public_profile_no_json_blobs")
|
||||
errors.append("public_profile_no_candidates")
|
||||
return {
|
||||
"profile": profile,
|
||||
"videos": videos,
|
||||
@@ -1784,7 +1824,19 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
|
||||
request.manual_creator_pages
|
||||
)
|
||||
if not public_data["profile"].get("nickname") and not public_data["videos"] and not creator_data["pages"]:
|
||||
raise HTTPException(status_code=400, detail="No Douyin profile or creator-center data could be extracted")
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail={
|
||||
"message": "No Douyin profile or creator-center data could be extracted",
|
||||
"profile_url": request.profile_url,
|
||||
"resolved_profile_url": public_data["source_url"],
|
||||
"public_blob_count": len(public_data["raw_pages"]),
|
||||
"public_video_count": len(public_data["videos"]),
|
||||
"public_errors": public_data["errors"],
|
||||
"creator_page_count": len(creator_data["pages"]),
|
||||
"creator_errors": creator_data["errors"]
|
||||
}
|
||||
)
|
||||
account_row = _upsert_account(account, public_data["profile"], request, public_data, creator_data)
|
||||
workspace = _build_workspace_payload(account_row)
|
||||
workspace["sync_errors"] = public_data["errors"] + creator_data["errors"]
|
||||
|
||||
@@ -106,7 +106,18 @@
|
||||
|
||||
- StoryForge 已支持把 `upload_video` 或已完成的 `video_link` 源素材自动上传到 `cutvideo`
|
||||
- `real-cut` 任务可直接传 `source_job_id`,由后端完成 staging 后再提交到剪辑服务
|
||||
- Windows 机器还需要部署带 `POST /api/uploads` 的 `cutvideo` 分支版本
|
||||
- Windows 机器已部署带 `POST /api/uploads` 的 `cutvideo` 版本,并完成局域网联调
|
||||
|
||||
### 3.1 `douyin` 工作台
|
||||
|
||||
- `collector-service` 已具备 `/v2/douyin/*` 工作台接口
|
||||
- 已补充两类关键联调增强:
|
||||
- 分享文案中的 URL 自动提取与归一化
|
||||
- public 页面命中抖音反爬挑战时的显式诊断返回
|
||||
- 真实 smoke 结果表明,纯 public 主页抓取会落到 `byted_acrawler` 挑战页,而不是正常 profile 数据页
|
||||
- 同时,`manual_profile_payload + manual_work_payloads` 已验证可完成账号入库和分析报告生成
|
||||
|
||||
结论:`douyin` 方向不再是“接口存在但不可用”,当前状态是“public 直抓受反爬限制,但人工采集兜底链已跑通”。
|
||||
|
||||
### 4. `huobao-drama`
|
||||
|
||||
@@ -151,6 +162,7 @@
|
||||
|
||||
## 当前主要风险
|
||||
|
||||
1. 抖音 / 小红书账号级内容源还未做真实平台验证
|
||||
2. `huobao-drama` 已在本机旧改版实例上跑通,但兼容补丁尚未迁到 upstream 仓库并形成正式提交
|
||||
3. `douyin` 新接口已上线 live,但还需要补一轮真实账号级回归,确认页面抓取、手工 payload 和相似账号分析都稳定
|
||||
1. 小红书账号级内容源还未做真实平台验证
|
||||
2. `douyin` public 直抓仍受反爬限制,生产落地还需要补 cookie 或人工页面采集协作链
|
||||
3. `huobao-drama` 已在本机旧改版实例上跑通,但兼容补丁尚未迁到 upstream 仓库并形成正式提交
|
||||
4. `douyin` 新接口已上线 live,但还需要补一轮真实账号级回归,确认手工 payload 和相似账号分析都稳定
|
||||
|
||||
@@ -137,6 +137,25 @@ docker compose up -d --build
|
||||
- 子任务:`job_7f169db61af441f8a7f186d03db2d91c`
|
||||
- 子任务:`job_28c47774028441378a3974860c375ab7`
|
||||
|
||||
## 6.1 `douyin` 账号工作台验证
|
||||
|
||||
基础接口:
|
||||
|
||||
- `POST /v2/douyin/accounts/sync`
|
||||
- `POST /v2/douyin/accounts/{account_id}/analysis`
|
||||
|
||||
说明:
|
||||
|
||||
- `profile_url` 现在支持直接传分享文案,后端会自动提取里面的 URL
|
||||
- 如果 public 页面命中抖音反爬挑战,接口会返回 `public_profile_anti_bot_challenge`
|
||||
- 遇到挑战页时,仍可使用的兜底路径是 `manual_profile_payload`、`manual_work_payloads` 和 `manual_creator_pages`
|
||||
|
||||
已验证样例:
|
||||
|
||||
- public 页面 smoke:返回 `public_profile_anti_bot_challenge`
|
||||
- 手工导入账号:`dyacct_c2b62842b228406cb48f05fac16fdfdf`
|
||||
- 手工账号分析报告:`dyreport_10d6b8d2d52a404192f54a3a05d44546`
|
||||
|
||||
## 7. `cutvideo` 实拍剪辑链路验证
|
||||
|
||||
调用 `POST /v2/pipelines/real-cut`
|
||||
@@ -187,6 +206,6 @@ docker compose up -d --build
|
||||
|
||||
## 9. 当前已知卡点
|
||||
|
||||
- Windows 机器上的 `cutvideo` 还需要部署带 `POST /api/uploads` 的新分支版本
|
||||
- 抖音 / 小红书账号级内容源还未做真实平台验证
|
||||
- 抖音 public 页面直抓会命中反爬挑战;生产接入仍需要 cookie 或人工页面采集协助
|
||||
- 小红书账号级内容源还未做真实平台验证
|
||||
- `huobao-drama` 目前跑通依赖本地旧改版中的 qnaigc 兼容补丁,下一步要迁到 upstream 仓库
|
||||
|
||||
@@ -19,6 +19,8 @@
|
||||
- `upload_video -> source_job_id -> cutvideo` 自动 staging 闭环
|
||||
- `collector` live 运行态已从临时源码挂载切回 `StoryForge-gitea` 正式镜像
|
||||
- live `collector` 已挂出 `/v2/douyin/*` 能力并通过认证接口验证
|
||||
- `douyin` 支持从分享文案中提取 `profile_url`,并在 public 页面命中抖音反爬挑战时返回明确诊断
|
||||
- `douyin` 手工 payload 导入与账号分析链路已跑通
|
||||
- 本机 `huobao-drama` API 调度、首尾帧生成、视频生成与结果回写接口
|
||||
- FastGPT 运行时依赖删除
|
||||
|
||||
@@ -33,14 +35,16 @@
|
||||
- 实拍剪辑自动 staging 联调:`job_01a6f283cbda42e4ae692b268b811a50`
|
||||
- AI 视频链路:`job_01828c40377747cf914b51be360cc333`
|
||||
- Windows `cutvideo` 部署后联调:`job_5838515ed5c34679acd55a52cfcd424b`
|
||||
- `douyin` 手工导入账号:`dyacct_c2b62842b228406cb48f05fac16fdfdf`
|
||||
- `douyin` 账号分析报告:`dyreport_10d6b8d2d52a404192f54a3a05d44546`
|
||||
|
||||
## 尚未完全跑通
|
||||
|
||||
- 抖音 / 小红书账号级内容源还未做真实平台验证;`bilibili` 账号级 URL 已跑通
|
||||
- `douyin` 账号分析接口已上线到 live `collector`,但还没有跑过真实生产账号样例
|
||||
- 小红书账号级内容源还未做真实平台验证
|
||||
- `douyin` public 主页直抓会命中 `public_profile_anti_bot_challenge`;当前已验证手工 payload 导入和分析可作为可用兜底路径
|
||||
|
||||
## 下一步优先级
|
||||
|
||||
1. 补抖音 / 小红书账号级真实验证与必要的 URL 归一化
|
||||
2. 把 `collector` live 切换结果和部署回滚说明固化到仓库
|
||||
3. 把改动整理成提交并推送
|
||||
1. 补抖音真实账号的 cookie / 手工页面采集联调,以及小红书账号级验证
|
||||
2. 跑一轮 `douyin` 相似账号搜索和对标关系链路
|
||||
3. 把 `collector` live 切换结果和部署回滚说明继续固化到仓库
|
||||
|
||||
Reference in New Issue
Block a user