feat: harden douyin sync diagnostics and manual fallback
This commit is contained in:
@@ -436,6 +436,31 @@ def _extract_videos(payloads: Iterable[Any]) -> list[dict[str, Any]]:
|
|||||||
return videos
|
return videos
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_profile_url_input(value: str) -> str:
|
||||||
|
text = str(value or "").strip()
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
match = re.search(r"https?://[^\s]+", text)
|
||||||
|
if match:
|
||||||
|
text = match.group(0)
|
||||||
|
|
||||||
|
text = text.strip().strip(",。;;、,)")
|
||||||
|
if text.startswith("www.douyin.com/") or text.startswith("douyin.com/"):
|
||||||
|
text = f"https://{text}"
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_douyin_anti_bot_page(html: str) -> bool:
|
||||||
|
markers = (
|
||||||
|
"window.byted_acrawler.init",
|
||||||
|
"__ac_signature",
|
||||||
|
"__ac_nonce",
|
||||||
|
"window.location.reload()"
|
||||||
|
)
|
||||||
|
return any(marker in html for marker in markers)
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_html(url: str, cookie: str = "") -> tuple[str, str]:
|
async def _fetch_html(url: str, cookie: str = "") -> tuple[str, str]:
|
||||||
headers = {
|
headers = {
|
||||||
"User-Agent": DEFAULT_USER_AGENT,
|
"User-Agent": DEFAULT_USER_AGENT,
|
||||||
@@ -786,7 +811,7 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
|
|||||||
return [profile_map[profile_id] for profile_id in requested_ids]
|
return [profile_map[profile_id] for profile_id in requested_ids]
|
||||||
|
|
||||||
async def _collect_public_profile(profile_url: str, manual_payload: dict[str, Any] | None) -> dict[str, Any]:
|
async def _collect_public_profile(profile_url: str, manual_payload: dict[str, Any] | None) -> dict[str, Any]:
|
||||||
source_url = profile_url.strip()
|
source_url = _normalize_profile_url_input(profile_url)
|
||||||
blobs: list[dict[str, Any]] = []
|
blobs: list[dict[str, Any]] = []
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
|
|
||||||
@@ -797,7 +822,18 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
|
|||||||
try:
|
try:
|
||||||
final_url, html = await _fetch_html(source_url)
|
final_url, html = await _fetch_html(source_url)
|
||||||
source_url = final_url
|
source_url = final_url
|
||||||
blobs.extend(_extract_json_blobs_from_html(html))
|
if not html.strip():
|
||||||
|
errors.append("public_profile_empty_html")
|
||||||
|
elif _looks_like_douyin_anti_bot_page(html):
|
||||||
|
errors.append("public_profile_anti_bot_challenge")
|
||||||
|
elif not blobs:
|
||||||
|
blobs.extend(_extract_json_blobs_from_html(html))
|
||||||
|
if not blobs:
|
||||||
|
errors.append("public_profile_no_json_blobs")
|
||||||
|
else:
|
||||||
|
blobs.extend(_extract_json_blobs_from_html(html))
|
||||||
|
if not blobs:
|
||||||
|
errors.append("public_profile_no_json_blobs")
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
errors.append(f"public_profile_fetch_failed: {exc}")
|
errors.append(f"public_profile_fetch_failed: {exc}")
|
||||||
|
|
||||||
@@ -807,6 +843,10 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
|
|||||||
fallback_url=source_url
|
fallback_url=source_url
|
||||||
)
|
)
|
||||||
videos = _extract_videos(payloads)
|
videos = _extract_videos(payloads)
|
||||||
|
if source_url and not profile.get("nickname") and not videos and not errors:
|
||||||
|
if not blobs:
|
||||||
|
errors.append("public_profile_no_json_blobs")
|
||||||
|
errors.append("public_profile_no_candidates")
|
||||||
return {
|
return {
|
||||||
"profile": profile,
|
"profile": profile,
|
||||||
"videos": videos,
|
"videos": videos,
|
||||||
@@ -1784,7 +1824,19 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
|
|||||||
request.manual_creator_pages
|
request.manual_creator_pages
|
||||||
)
|
)
|
||||||
if not public_data["profile"].get("nickname") and not public_data["videos"] and not creator_data["pages"]:
|
if not public_data["profile"].get("nickname") and not public_data["videos"] and not creator_data["pages"]:
|
||||||
raise HTTPException(status_code=400, detail="No Douyin profile or creator-center data could be extracted")
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail={
|
||||||
|
"message": "No Douyin profile or creator-center data could be extracted",
|
||||||
|
"profile_url": request.profile_url,
|
||||||
|
"resolved_profile_url": public_data["source_url"],
|
||||||
|
"public_blob_count": len(public_data["raw_pages"]),
|
||||||
|
"public_video_count": len(public_data["videos"]),
|
||||||
|
"public_errors": public_data["errors"],
|
||||||
|
"creator_page_count": len(creator_data["pages"]),
|
||||||
|
"creator_errors": creator_data["errors"]
|
||||||
|
}
|
||||||
|
)
|
||||||
account_row = _upsert_account(account, public_data["profile"], request, public_data, creator_data)
|
account_row = _upsert_account(account, public_data["profile"], request, public_data, creator_data)
|
||||||
workspace = _build_workspace_payload(account_row)
|
workspace = _build_workspace_payload(account_row)
|
||||||
workspace["sync_errors"] = public_data["errors"] + creator_data["errors"]
|
workspace["sync_errors"] = public_data["errors"] + creator_data["errors"]
|
||||||
|
|||||||
@@ -106,7 +106,18 @@
|
|||||||
|
|
||||||
- StoryForge 已支持把 `upload_video` 或已完成的 `video_link` 源素材自动上传到 `cutvideo`
|
- StoryForge 已支持把 `upload_video` 或已完成的 `video_link` 源素材自动上传到 `cutvideo`
|
||||||
- `real-cut` 任务可直接传 `source_job_id`,由后端完成 staging 后再提交到剪辑服务
|
- `real-cut` 任务可直接传 `source_job_id`,由后端完成 staging 后再提交到剪辑服务
|
||||||
- Windows 机器还需要部署带 `POST /api/uploads` 的 `cutvideo` 分支版本
|
- Windows 机器已部署带 `POST /api/uploads` 的 `cutvideo` 版本,并完成局域网联调
|
||||||
|
|
||||||
|
### 3.1 `douyin` 工作台
|
||||||
|
|
||||||
|
- `collector-service` 已具备 `/v2/douyin/*` 工作台接口
|
||||||
|
- 已补充两类关键联调增强:
|
||||||
|
- 分享文案中的 URL 自动提取与归一化
|
||||||
|
- public 页面命中抖音反爬挑战时的显式诊断返回
|
||||||
|
- 真实 smoke 结果表明,纯 public 主页抓取会落到 `byted_acrawler` 挑战页,而不是正常 profile 数据页
|
||||||
|
- 同时,`manual_profile_payload + manual_work_payloads` 已验证可完成账号入库和分析报告生成
|
||||||
|
|
||||||
|
结论:`douyin` 方向不再是“接口存在但不可用”,当前状态是“public 直抓受反爬限制,但人工采集兜底链已跑通”。
|
||||||
|
|
||||||
### 4. `huobao-drama`
|
### 4. `huobao-drama`
|
||||||
|
|
||||||
@@ -151,6 +162,7 @@
|
|||||||
|
|
||||||
## 当前主要风险
|
## 当前主要风险
|
||||||
|
|
||||||
1. 抖音 / 小红书账号级内容源还未做真实平台验证
|
1. 小红书账号级内容源还未做真实平台验证
|
||||||
2. `huobao-drama` 已在本机旧改版实例上跑通,但兼容补丁尚未迁到 upstream 仓库并形成正式提交
|
2. `douyin` public 直抓仍受反爬限制,生产落地还需要补 cookie 或人工页面采集协作链
|
||||||
3. `douyin` 新接口已上线 live,但还需要补一轮真实账号级回归,确认页面抓取、手工 payload 和相似账号分析都稳定
|
3. `huobao-drama` 已在本机旧改版实例上跑通,但兼容补丁尚未迁到 upstream 仓库并形成正式提交
|
||||||
|
4. `douyin` 新接口已上线 live,但还需要补一轮真实账号级回归,确认手工 payload 和相似账号分析都稳定
|
||||||
|
|||||||
@@ -137,6 +137,25 @@ docker compose up -d --build
|
|||||||
- 子任务:`job_7f169db61af441f8a7f186d03db2d91c`
|
- 子任务:`job_7f169db61af441f8a7f186d03db2d91c`
|
||||||
- 子任务:`job_28c47774028441378a3974860c375ab7`
|
- 子任务:`job_28c47774028441378a3974860c375ab7`
|
||||||
|
|
||||||
|
## 6.1 `douyin` 账号工作台验证
|
||||||
|
|
||||||
|
基础接口:
|
||||||
|
|
||||||
|
- `POST /v2/douyin/accounts/sync`
|
||||||
|
- `POST /v2/douyin/accounts/{account_id}/analysis`
|
||||||
|
|
||||||
|
说明:
|
||||||
|
|
||||||
|
- `profile_url` 现在支持直接传分享文案,后端会自动提取里面的 URL
|
||||||
|
- 如果 public 页面命中抖音反爬挑战,接口会返回 `public_profile_anti_bot_challenge`
|
||||||
|
- 遇到挑战页时,继续可用的路径是 `manual_profile_payload`、`manual_work_payloads` 和 `manual_creator_pages`
|
||||||
|
|
||||||
|
已验证样例:
|
||||||
|
|
||||||
|
- public 页面 smoke:返回 `public_profile_anti_bot_challenge`
|
||||||
|
- 手工导入账号:`dyacct_c2b62842b228406cb48f05fac16fdfdf`
|
||||||
|
- 手工账号分析报告:`dyreport_10d6b8d2d52a404192f54a3a05d44546`
|
||||||
|
|
||||||
## 7. `cutvideo` 实拍剪辑链路验证
|
## 7. `cutvideo` 实拍剪辑链路验证
|
||||||
|
|
||||||
调用 `POST /v2/pipelines/real-cut`
|
调用 `POST /v2/pipelines/real-cut`
|
||||||
@@ -187,6 +206,6 @@ docker compose up -d --build
|
|||||||
|
|
||||||
## 9. 当前已知卡点
|
## 9. 当前已知卡点
|
||||||
|
|
||||||
- Windows 机器上的 `cutvideo` 还需要部署带 `POST /api/uploads` 的新分支版本
|
- 抖音 public 页面直抓会命中反爬挑战;生产接入仍需要 cookie 或人工页面采集协助
|
||||||
- 抖音 / 小红书账号级内容源还未做真实平台验证
|
- 小红书账号级内容源还未做真实平台验证
|
||||||
- `huobao-drama` 目前跑通依赖本地旧改版中的 qnaigc 兼容补丁,下一步要迁到 upstream 仓库
|
- `huobao-drama` 目前跑通依赖本地旧改版中的 qnaigc 兼容补丁,下一步要迁到 upstream 仓库
|
||||||
|
|||||||
@@ -19,6 +19,8 @@
|
|||||||
- `upload_video -> source_job_id -> cutvideo` 自动 staging 闭环
|
- `upload_video -> source_job_id -> cutvideo` 自动 staging 闭环
|
||||||
- `collector` live 运行态已从临时源码挂载切回 `StoryForge-gitea` 正式镜像
|
- `collector` live 运行态已从临时源码挂载切回 `StoryForge-gitea` 正式镜像
|
||||||
- live `collector` 已挂出 `/v2/douyin/*` 能力并通过认证接口验证
|
- live `collector` 已挂出 `/v2/douyin/*` 能力并通过认证接口验证
|
||||||
|
- `douyin` 支持从分享文案中提取 `profile_url`,并在 public 页面命中抖音反爬挑战时返回明确诊断
|
||||||
|
- `douyin` 手工 payload 导入与账号分析链路已跑通
|
||||||
- 本机 `huobao-drama` API 调度、首尾帧生成、视频生成与结果回写接口
|
- 本机 `huobao-drama` API 调度、首尾帧生成、视频生成与结果回写接口
|
||||||
- FastGPT 运行时依赖删除
|
- FastGPT 运行时依赖删除
|
||||||
|
|
||||||
@@ -33,14 +35,16 @@
|
|||||||
- 实拍剪辑自动 staging 联调:`job_01a6f283cbda42e4ae692b268b811a50`
|
- 实拍剪辑自动 staging 联调:`job_01a6f283cbda42e4ae692b268b811a50`
|
||||||
- AI 视频链路:`job_01828c40377747cf914b51be360cc333`
|
- AI 视频链路:`job_01828c40377747cf914b51be360cc333`
|
||||||
- Windows `cutvideo` 部署后联调:`job_5838515ed5c34679acd55a52cfcd424b`
|
- Windows `cutvideo` 部署后联调:`job_5838515ed5c34679acd55a52cfcd424b`
|
||||||
|
- `douyin` 手工导入账号:`dyacct_c2b62842b228406cb48f05fac16fdfdf`
|
||||||
|
- `douyin` 账号分析报告:`dyreport_10d6b8d2d52a404192f54a3a05d44546`
|
||||||
|
|
||||||
## 尚未完全跑通
|
## 尚未完全跑通
|
||||||
|
|
||||||
- 抖音 / 小红书账号级内容源还未做真实平台验证;`bilibili` 账号级 URL 已跑通
|
- 小红书账号级内容源还未做真实平台验证
|
||||||
- `douyin` 账号分析接口已上线到 live `collector`,但还没有跑过真实生产账号样例
|
- `douyin` public 主页直抓会命中 `public_profile_anti_bot_challenge`;当前已验证手工 payload 导入和分析可作为可用兜底路径
|
||||||
|
|
||||||
## 下一步优先级
|
## 下一步优先级
|
||||||
|
|
||||||
1. 补抖音 / 小红书账号级真实验证与必要的 URL 归一化
|
1. 补抖音真实账号的 cookie / 手工页面采集联调,以及小红书账号级验证
|
||||||
2. 把 `collector` live 切换结果和部署回滚说明固化到仓库
|
2. 跑一轮 `douyin` 相似账号搜索和对标关系链路
|
||||||
3. 把改动整理成提交并推送
|
3. 把 `collector` live 切换结果和部署回滚说明继续固化到仓库
|
||||||
|
|||||||
Reference in New Issue
Block a user