diff --git a/collector-service/app/douyin_features.py b/collector-service/app/douyin_features.py index b50cd31..99c94ff 100644 --- a/collector-service/app/douyin_features.py +++ b/collector-service/app/douyin_features.py @@ -36,6 +36,7 @@ class DouyinAccountSyncRequest(BaseModel): profile_url: str = "" session_cookie: str = "" creator_center_urls: list[str] = Field(default_factory=lambda: list(DEFAULT_CREATOR_CENTER_URLS)) + allow_creator_center_profile_fallback: bool = False manual_profile_payload: dict[str, Any] | None = None manual_creator_pages: list[ManualPageCapture] = Field(default_factory=list) manual_work_payloads: list[dict[str, Any]] = Field(default_factory=list) @@ -436,6 +437,63 @@ def _extract_videos(payloads: Iterable[Any]) -> list[dict[str, Any]]: return videos +def _merge_profile_payload(base: dict[str, Any], overlay: dict[str, Any]) -> dict[str, Any]: + if not overlay: + return base + if not base or not base.get("nickname"): + return overlay + + merged = dict(base) + merged["nickname"] = base.get("nickname") or overlay.get("nickname", "") + merged["signature"] = base.get("signature") or overlay.get("signature", "") + merged["profile_url"] = base.get("profile_url") or overlay.get("profile_url", "") + merged["canonical_profile_url"] = base.get("canonical_profile_url") or overlay.get("canonical_profile_url", "") + merged["sec_uid"] = base.get("sec_uid") or overlay.get("sec_uid", "") + merged["douyin_uid"] = base.get("douyin_uid") or overlay.get("douyin_uid", "") + merged["douyin_id"] = base.get("douyin_id") or overlay.get("douyin_id", "") + merged["avatar_url"] = base.get("avatar_url") or overlay.get("avatar_url", "") + merged["tags"] = _dedupe_strings(base.get("tags", []) + overlay.get("tags", [])) + merged["stats"] = { + "followers": float(base.get("stats", {}).get("followers") or overlay.get("stats", {}).get("followers") or 0), + "following": float(base.get("stats", {}).get("following") or overlay.get("stats", {}).get("following") or 0), + "likes": float(base.get("stats", {}).get("likes") or overlay.get("stats", {}).get("likes") or 0), + "videos": float(base.get("stats", {}).get("videos") or overlay.get("stats", {}).get("videos") or 0), + } + if not merged.get("raw"): + merged["raw"] = overlay.get("raw", {}) + return merged + + +def _extract_creator_payloads(creator_data: dict[str, Any]) -> list[Any]: + payloads: list[Any] = [] + for page in creator_data.get("pages", []): + for blob in page.get("blobs", []): + payload = blob.get("payload") + if payload not in (None, "", [], {}): + payloads.append(payload) + return payloads + + +def _profile_identity_value(profile: dict[str, Any], field_name: str) -> str: + value = str(profile.get(field_name, "") or "").strip() + if not value: + return "" + if field_name in {"profile_url", "canonical_profile_url"}: + return _normalize_profile_url_input(value) + return value + + +def _profiles_appear_same(left: dict[str, Any], right: dict[str, Any]) -> bool: + if not left or not right: + return False + for field_name in ("sec_uid", "douyin_uid", "douyin_id", "canonical_profile_url", "profile_url"): + left_value = _profile_identity_value(left, field_name) + right_value = _profile_identity_value(right, field_name) + if left_value and right_value and left_value == right_value: + return True + return False + + def _normalize_profile_url_input(value: str) -> str: text = str(value or "").strip() if not text: @@ -1823,11 +1881,34 @@ def register_douyin_routes(app: Any, legacy: Any) -> None: request.session_cookie, request.manual_creator_pages ) - if not public_data["profile"].get("nickname") and not public_data["videos"] and not creator_data["pages"]: + creator_payloads = _extract_creator_payloads(creator_data) + if creator_payloads: + creator_profile = _pick_best_profile( + [candidate for payload in creator_payloads for candidate in _extract_profile_candidates(payload)] + ) + creator_videos = _extract_videos(creator_payloads) + creator_identity_match = _profiles_appear_same(public_data["profile"], creator_profile) + should_merge_creator = creator_identity_match or request.allow_creator_center_profile_fallback + if should_merge_creator: + if creator_profile.get("nickname"): + public_data["profile"] = _merge_profile_payload(public_data["profile"], creator_profile) + if not public_data["source_url"]: + public_data["source_url"] = creator_profile.get("canonical_profile_url") or request.profile_url + if request.allow_creator_center_profile_fallback and not creator_identity_match: + public_data["errors"].append("creator_center_profile_fallback_used") + elif public_data["profile"].get("nickname") != creator_profile.get("nickname"): + public_data["errors"].append("creator_center_profile_merge_partial") + public_data["videos"].extend(creator_videos) + elif creator_profile.get("nickname") or creator_videos: + public_data["errors"].append("creator_center_identity_mismatch_skipped") + if not public_data["profile"].get("nickname") and not public_data["videos"]: + message = "No Douyin profile or creator-center data could be extracted" + if "creator_center_identity_mismatch_skipped" in public_data["errors"]: + message = "Creator-center capture belongs to a different logged-in Douyin account; automatic merge was skipped" raise HTTPException( status_code=400, detail={ - "message": "No Douyin profile or creator-center data could be extracted", + "message": message, "profile_url": request.profile_url, "resolved_profile_url": public_data["source_url"], "public_blob_count": len(public_data["raw_pages"]), diff --git a/scripts/douyin-browser-capture/README.md b/scripts/douyin-browser-capture/README.md index aeed2ef..8186fdb 100644 --- a/scripts/douyin-browser-capture/README.md +++ b/scripts/douyin-browser-capture/README.md @@ -50,3 +50,4 @@ Each run writes: - This is designed as a browser-assisted capture flow, not a fully headless anti-bot bypass. - If Douyin shows a slider or challenge page, solve it manually in the opened browser window and then continue. - Use `--no-sync` if you only want to save a local bundle for inspection. +- Creator-center pages belong to the currently logged-in Douyin account. StoryForge now treats them as supplemental evidence by default and will not let them overwrite the target profile unless you explicitly pass `--allow-creator-center-fallback`. diff --git a/scripts/douyin-browser-capture/capture_and_sync.mjs b/scripts/douyin-browser-capture/capture_and_sync.mjs index 0c45c15..cd9b9e6 100644 --- a/scripts/douyin-browser-capture/capture_and_sync.mjs +++ b/scripts/douyin-browser-capture/capture_and_sync.mjs @@ -1,6 +1,7 @@ #!/usr/bin/env node import fs from "node:fs/promises"; +import { execFileSync } from "node:child_process"; import os from "node:os"; import path from "node:path"; import process from "node:process"; @@ -20,6 +21,39 @@ const JSON_CAPTURE_LIMIT = 1_500_000; const SCRIPT_SCAN_LIMIT = 2_000_000; const WAIT_AFTER_NAV_MS = 4_000; const RESPONSE_READ_TIMEOUT_MS = 2_000; +const PYTHON_HTTP_BRIDGE = ` +import json +import sys +import urllib.error +import urllib.request + +url, method, headers_json, body_mode, body_value = sys.argv[1:6] +headers = json.loads(headers_json) +body = None +if body_mode == "text": + body = body_value.encode("utf-8") +elif body_mode == "path": + with open(body_value, "rb") as handle: + body = handle.read() +request = urllib.request.Request(url, data=body, headers=headers, method=method) +try: + with urllib.request.urlopen(request, timeout=120) as response: + raw = response.read().decode("utf-8", "replace") + try: + payload = json.loads(raw) if raw else None + except Exception: + payload = {"raw": raw} + print(json.dumps({"status": response.status, "data": payload}, ensure_ascii=False)) +except urllib.error.HTTPError as error: + raw = error.read().decode("utf-8", "replace") + try: + payload = json.loads(raw) if raw else None + except Exception: + payload = {"raw": raw} + print(json.dumps({"status": error.code, "data": payload}, ensure_ascii=False)) +except Exception as error: + print(json.dumps({"status": 599, "data": {"raw": str(error)}}, ensure_ascii=False)) +`; function printHelp() { console.log(`StoryForge Douyin Browser Capture @@ -46,6 +80,8 @@ Mode flags: --skip-login-prompt Do not pause for manual login / captcha completion --no-sync Capture only, do not import into StoryForge --no-creator-center Skip creator-center page capture + --allow-creator-center-fallback + Allow creator-center identity to replace a missing public profile --note Discovery note saved into StoryForge Examples: @@ -71,6 +107,7 @@ function parseArgs(argv) { manualPrompt: true, syncEnabled: true, creatorCenterEnabled: true, + allowCreatorCenterFallback: false, creatorCenterUrls: [...DEFAULT_CREATOR_CENTER_URLS], note: "", profileUrl: "", @@ -150,6 +187,9 @@ function parseArgs(argv) { case "--no-creator-center": options.creatorCenterEnabled = false; break; + case "--allow-creator-center-fallback": + options.allowCreatorCenterFallback = true; + break; default: throw new Error(`Unknown argument: ${arg}`); } @@ -442,6 +482,78 @@ async function clickFirstVisible(page, selectors) { return false; } +function escapeRegExp(value) { + return String(value || "").replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} + +function decodeEscapedUrl(value) { + return String(value || "") + .replace(/\\u002F/g, "/") + .replace(/\\\//g, "/") + .replace(/&/g, "&"); +} + +async function resolveCreatorPrefetchUrl(page) { + const current = new URL(page.url()); + const html = await page.content(); + const escapedPath = escapeRegExp(current.pathname); + const mapped = html.match( + new RegExp(`"${escapedPath}"\\s*:\\s*"(https://creator\\.douyin\\.com[^"]+prefetch\\.json)"`) + ); + if (mapped?.[1]) { + return decodeEscapedUrl(mapped[1]); + } + const discovered = Array.from( + new Set( + [...html.matchAll(/https:\/\/creator\.douyin\.com\/goofy\/douyin_creator_pc\/mono\/prefetch\/[^"]+prefetch\.json/g)].map( + (match) => decodeEscapedUrl(match[0]) + ) + ) + ); + return ( + discovered.find((candidate) => candidate.includes(current.pathname.replace(/^\/creator-micro\//, ""))) || + discovered[0] || + `https://creator.douyin.com/goofy/douyin_creator_pc/mono/prefetch${current.pathname}/prefetch.json` + ); +} + +async function collectCreatorPrefetchResults(page) { + const prefetchUrl = await resolveCreatorPrefetchUrl(page); + return page.evaluate(async ({ prefetchUrl }) => { + try { + const prefetchResp = await fetch(prefetchUrl, { credentials: "same-origin" }); + const prefetchText = await prefetchResp.text(); + const prefetch = JSON.parse(prefetchText); + const results = []; + for (const api of prefetch?.apis || []) { + const target = new URL(api.url, window.location.origin); + for (const [key, value] of Object.entries(api.params || {})) { + target.searchParams.set(key, String(value)); + } + const resp = await fetch(target.toString(), { + credentials: api.credentials || "same-origin", + }); + const payload = await resp.json().catch(() => null); + results.push({ + url: target.toString(), + payload, + }); + } + return { + prefetch_url: prefetchUrl, + prefetch, + results, + }; + } catch (error) { + return { + prefetch_url: prefetchUrl, + error: String(error), + results: [], + }; + } + }, { prefetchUrl }); +} + async function prepareProfilePage(page, options) { await clickFirstVisible(page, [ "text=作品", @@ -512,32 +624,38 @@ async function saveJsonSafe(filePath, value) { } } -async function loginStoryForge(baseUrl, username, password) { - const response = await fetch(`${baseUrl.replace(/\/$/, "")}/v2/auth/login`, { - method: "POST", - headers: { "content-type": "application/json" }, - body: JSON.stringify({ username, password }) - }); - if (!response.ok) { - throw new Error(`StoryForge login failed: ${response.status} ${await response.text()}`); +async function requestJson(urlString, { method = "GET", headers = {}, body = null, bodyPath = "" } = {}) { + const bodyMode = bodyPath ? "path" : body === null ? "none" : "text"; + const bodyValue = bodyPath || (typeof body === "string" ? body : JSON.stringify(body)); + const stdout = execFileSync( + "python3", + ["-c", PYTHON_HTTP_BRIDGE, urlString, method, JSON.stringify(headers), bodyMode, bodyValue], + { maxBuffer: 20 * 1024 * 1024, encoding: "utf8" } + ); + const payload = JSON.parse(String(stdout || "").trim() || "{}"); + if ((payload.status || 500) >= 400) { + throw new Error(`Request failed: ${payload.status} ${JSON.stringify(payload.data)}`); } - return response.json(); + return payload.data; } -async function syncCapture(baseUrl, token, body) { - const response = await fetch(`${baseUrl.replace(/\/$/, "")}/v2/douyin/accounts/sync`, { +async function loginStoryForge(baseUrl, username, password) { + return requestJson(`${baseUrl.replace(/\/$/, "")}/v2/auth/login`, { + method: "POST", + headers: { "content-type": "application/json" }, + body: { username, password } + }); +} + +async function syncCapture(baseUrl, token, bodyPath) { + return requestJson(`${baseUrl.replace(/\/$/, "")}/v2/douyin/accounts/sync`, { method: "POST", headers: { "content-type": "application/json", Authorization: `Bearer ${token}` }, - body: JSON.stringify(body) + bodyPath }); - const payload = await response.json().catch(async () => ({ raw: await response.text() })); - if (!response.ok) { - throw new Error(`StoryForge sync failed: ${response.status} ${JSON.stringify(payload)}`); - } - return payload; } async function captureCreatorPages(context, options, runDir) { @@ -552,7 +670,10 @@ async function captureCreatorPages(context, options, runDir) { try { console.error(`Capturing creator-center page: ${url}`); await navigateAndSettle(page, url, options.waitMs); - const bundle = await capturePageBundle(page, "creator_center", responseCapture); + const prefetchResults = await collectCreatorPrefetchResults(page); + const bundle = await capturePageBundle(page, "creator_center", responseCapture, { + creator_prefetch: prefetchResults + }); pages.push({ url: bundle.page_url, title: bundle.page_title, @@ -650,12 +771,14 @@ async function main() { const syncBody = { profile_url: options.profileUrl, + allow_creator_center_profile_fallback: options.allowCreatorCenterFallback, manual_profile_payload: profileBundle, manual_creator_pages: creatorPages, manual_work_payloads: videoPages, discovery_note: options.note || "browser-assisted capture" }; - await saveJson(path.join(runDir, "storyforge-sync-request.json"), syncBody); + const syncRequestPath = path.join(runDir, "storyforge-sync-request.json"); + await saveJson(syncRequestPath, syncBody); summary.video_link_count = videoLinks.length; summary.captured_video_pages = videoPages.length; @@ -675,7 +798,7 @@ async function main() { default_external_base_url: auth.default_external_base_url }); } - const workspace = await syncCapture(options.backendUrl, token, syncBody); + const workspace = await syncCapture(options.backendUrl, token, syncRequestPath); summary.sync_result = { account_id: workspace.account?.id || "", nickname: workspace.account?.nickname || "",