diff --git a/collector-service/app/douyin_features.py b/collector-service/app/douyin_features.py index 99c94ff..86f16ac 100644 --- a/collector-service/app/douyin_features.py +++ b/collector-service/app/douyin_features.py @@ -12,6 +12,7 @@ from urllib.parse import quote, unquote import httpx from fastapi import Depends, HTTPException from pydantic import BaseModel, Field +from starlette.concurrency import run_in_threadpool DEFAULT_CREATOR_CENTER_URLS = [ "https://creator.douyin.com/creator-micro/home", @@ -37,6 +38,7 @@ class DouyinAccountSyncRequest(BaseModel): session_cookie: str = "" creator_center_urls: list[str] = Field(default_factory=lambda: list(DEFAULT_CREATOR_CENTER_URLS)) allow_creator_center_profile_fallback: bool = False + compact_response: bool = False manual_profile_payload: dict[str, Any] | None = None manual_creator_pages: list[ManualPageCapture] = Field(default_factory=list) manual_work_payloads: list[dict[str, Any]] = Field(default_factory=list) @@ -1051,11 +1053,12 @@ def register_douyin_routes(app: Any, legacy: Any) -> None: snapshot_type: str, source_url: str, payload: Any, - summary: dict[str, Any] + summary: dict[str, Any], + index_fields: bool = True ) -> str: snapshot_id = make_id("dysnap") collected_at = now() - fields = _flatten_json(payload) + fields = _flatten_json(payload) if index_fields else [] legacy.db.execute( """ INSERT INTO douyin_account_snapshots ( @@ -1107,7 +1110,8 @@ def register_douyin_routes(app: Any, legacy: Any) -> None: "video_count": len(public_data["videos"]), "nickname": public_data["profile"].get("nickname", ""), "tags": public_data["profile"].get("tags", []) - } + }, + index_fields=not sync_request.compact_response ) for page in creator_data["pages"]: @@ -1122,8 +1126,9 @@ def register_douyin_routes(app: Any, legacy: Any) -> None: payload, { "blob_count": len(page["blobs"]), - "field_count": len(_flatten_json(payload)) - } + "field_count": 0 if sync_request.compact_response else len(_flatten_json(payload)) + }, + index_fields=not sync_request.compact_response ) for manual_video in sync_request.manual_work_payloads: @@ -1221,6 +1226,69 @@ def register_douyin_routes(app: Any, legacy: Any) -> None: }) return payloads + def _finalize_sync_workspace( + owner: dict[str, Any], + request: DouyinAccountSyncRequest, + public_data: dict[str, Any], + creator_data: dict[str, Any] + ) -> dict[str, Any]: + creator_payloads = _extract_creator_payloads(creator_data) + if creator_payloads: + creator_profile = _pick_best_profile( + [candidate for payload in creator_payloads for candidate in _extract_profile_candidates(payload)] + ) + creator_videos = _extract_videos(creator_payloads) + creator_identity_match = _profiles_appear_same(public_data["profile"], creator_profile) + should_merge_creator = creator_identity_match or request.allow_creator_center_profile_fallback + if should_merge_creator: + if creator_profile.get("nickname"): + public_data["profile"] = _merge_profile_payload(public_data["profile"], creator_profile) + if not public_data["source_url"]: + public_data["source_url"] = creator_profile.get("canonical_profile_url") or request.profile_url + if request.allow_creator_center_profile_fallback and not creator_identity_match: + public_data["errors"].append("creator_center_profile_fallback_used") + elif public_data["profile"].get("nickname") != creator_profile.get("nickname"): + public_data["errors"].append("creator_center_profile_merge_partial") + public_data["videos"].extend(creator_videos) + elif creator_profile.get("nickname") or creator_videos: + public_data["errors"].append("creator_center_identity_mismatch_skipped") + if not public_data["profile"].get("nickname") and not public_data["videos"]: + message = "No Douyin profile or creator-center data could be extracted" + if "creator_center_identity_mismatch_skipped" in public_data["errors"]: + message = "Creator-center capture belongs to a different logged-in Douyin account; automatic merge was skipped" + raise HTTPException( + status_code=400, + detail={ + "message": message, + "profile_url": request.profile_url, + "resolved_profile_url": public_data["source_url"], + "public_blob_count": len(public_data["raw_pages"]), + "public_video_count": len(public_data["videos"]), + "public_errors": public_data["errors"], + "creator_page_count": len(creator_data["pages"]), + "creator_errors": creator_data["errors"] + } + ) + account_row = _upsert_account(owner, public_data["profile"], request, public_data, creator_data) + sync_errors = public_data["errors"] + creator_data["errors"] + if request.compact_response: + return { + "account": { + "id": account_row["id"], + "nickname": account_row["nickname"], + "profile_url": account_row["profile_url"], + "douyin_id": account_row["douyin_id"], + "sec_uid": account_row["sec_uid"], + "sync_status": account_row["sync_status"] + }, + "sync_errors": sync_errors, + "public_video_count": len(public_data["videos"]), + "creator_page_count": len(creator_data["pages"]) + } + workspace = _build_workspace_payload(account_row) + workspace["sync_errors"] = sync_errors + return workspace + def _build_account_payload(account_row: dict[str, Any], include_recent_videos: int = 8) -> dict[str, Any]: videos = _list_videos(account_row["id"], limit=max(include_recent_videos, 12)) tags = _safe_json_loads(account_row["tags_json"], []) @@ -1881,47 +1949,13 @@ def register_douyin_routes(app: Any, legacy: Any) -> None: request.session_cookie, request.manual_creator_pages ) - creator_payloads = _extract_creator_payloads(creator_data) - if creator_payloads: - creator_profile = _pick_best_profile( - [candidate for payload in creator_payloads for candidate in _extract_profile_candidates(payload)] - ) - creator_videos = _extract_videos(creator_payloads) - creator_identity_match = _profiles_appear_same(public_data["profile"], creator_profile) - should_merge_creator = creator_identity_match or request.allow_creator_center_profile_fallback - if should_merge_creator: - if creator_profile.get("nickname"): - public_data["profile"] = _merge_profile_payload(public_data["profile"], creator_profile) - if not public_data["source_url"]: - public_data["source_url"] = creator_profile.get("canonical_profile_url") or request.profile_url - if request.allow_creator_center_profile_fallback and not creator_identity_match: - public_data["errors"].append("creator_center_profile_fallback_used") - elif public_data["profile"].get("nickname") != creator_profile.get("nickname"): - public_data["errors"].append("creator_center_profile_merge_partial") - public_data["videos"].extend(creator_videos) - elif creator_profile.get("nickname") or creator_videos: - public_data["errors"].append("creator_center_identity_mismatch_skipped") - if not public_data["profile"].get("nickname") and not public_data["videos"]: - message = "No Douyin profile or creator-center data could be extracted" - if "creator_center_identity_mismatch_skipped" in public_data["errors"]: - message = "Creator-center capture belongs to a different logged-in Douyin account; automatic merge was skipped" - raise HTTPException( - status_code=400, - detail={ - "message": message, - "profile_url": request.profile_url, - "resolved_profile_url": public_data["source_url"], - "public_blob_count": len(public_data["raw_pages"]), - "public_video_count": len(public_data["videos"]), - "public_errors": public_data["errors"], - "creator_page_count": len(creator_data["pages"]), - "creator_errors": creator_data["errors"] - } - ) - account_row = _upsert_account(account, public_data["profile"], request, public_data, creator_data) - workspace = _build_workspace_payload(account_row) - workspace["sync_errors"] = public_data["errors"] + creator_data["errors"] - return workspace + return await run_in_threadpool( + _finalize_sync_workspace, + account, + request, + public_data, + creator_data + ) @app.get("/v2/douyin/accounts/{account_id}") def get_douyin_account( diff --git a/scripts/douyin-browser-capture/capture_and_sync.mjs b/scripts/douyin-browser-capture/capture_and_sync.mjs index cd9b9e6..8937a28 100644 --- a/scripts/douyin-browser-capture/capture_and_sync.mjs +++ b/scripts/douyin-browser-capture/capture_and_sync.mjs @@ -743,6 +743,20 @@ async function main() { }; await saveJsonSafe(path.join(runDir, "summary.json"), summary); + let storyforgeToken = options.storyforgeToken; + if (options.syncEnabled && !storyforgeToken) { + const auth = await loginStoryForge( + options.backendUrl, + options.storyforgeUsername, + options.storyforgePassword + ); + storyforgeToken = auth.token; + await saveJson(path.join(runDir, "storyforge-login.json"), { + account: auth.account, + default_external_base_url: auth.default_external_base_url + }); + } + const context = await chromium.launchPersistentContext(options.stateDir, { headless: options.headless, viewport: { width: 1440, height: 1024 }, @@ -772,6 +786,7 @@ async function main() { const syncBody = { profile_url: options.profileUrl, allow_creator_center_profile_fallback: options.allowCreatorCenterFallback, + compact_response: true, manual_profile_payload: profileBundle, manual_creator_pages: creatorPages, manual_work_payloads: videoPages, @@ -785,20 +800,7 @@ async function main() { summary.captured_creator_pages = creatorPages.length; if (options.syncEnabled) { - let token = options.storyforgeToken; - if (!token) { - const auth = await loginStoryForge( - options.backendUrl, - options.storyforgeUsername, - options.storyforgePassword - ); - token = auth.token; - await saveJson(path.join(runDir, "storyforge-login.json"), { - account: auth.account, - default_external_base_url: auth.default_external_base_url - }); - } - const workspace = await syncCapture(options.backendUrl, token, syncRequestPath); + const workspace = await syncCapture(options.backendUrl, storyforgeToken, syncRequestPath); summary.sync_result = { account_id: workspace.account?.id || "", nickname: workspace.account?.nickname || "",