fix: guard douyin creator-center identity merges

This commit is contained in:
kris
2026-03-20 19:31:29 +08:00
parent 10820595cf
commit 4356c46b9e
3 changed files with 227 additions and 22 deletions

View File

@@ -36,6 +36,7 @@ class DouyinAccountSyncRequest(BaseModel):
profile_url: str = ""
session_cookie: str = ""
creator_center_urls: list[str] = Field(default_factory=lambda: list(DEFAULT_CREATOR_CENTER_URLS))
allow_creator_center_profile_fallback: bool = False
manual_profile_payload: dict[str, Any] | None = None
manual_creator_pages: list[ManualPageCapture] = Field(default_factory=list)
manual_work_payloads: list[dict[str, Any]] = Field(default_factory=list)
@@ -436,6 +437,63 @@ def _extract_videos(payloads: Iterable[Any]) -> list[dict[str, Any]]:
return videos
def _merge_profile_payload(base: dict[str, Any], overlay: dict[str, Any]) -> dict[str, Any]:
    """Fill the gaps of ``base`` with values taken from ``overlay``.

    ``base`` always wins: an overlay value is used only where the base value
    is missing or falsy.

    Args:
        base: The profile payload that takes precedence.
        overlay: The supplemental profile payload.

    Returns:
        ``base`` itself when ``overlay`` is empty; ``overlay`` itself when
        ``base`` is empty or has no nickname (in that case no base field is
        carried over); otherwise a new merged dict.
    """
    if not overlay:
        return base
    if not base or not base.get("nickname"):
        return overlay

    merged = dict(base)
    for field_name in (
        "nickname",
        "signature",
        "profile_url",
        "canonical_profile_url",
        "sec_uid",
        "douyin_uid",
        "douyin_id",
        "avatar_url",
    ):
        merged[field_name] = base.get(field_name) or overlay.get(field_name, "")

    # ``dict.get(key, default)`` only falls back when the key is absent, so a
    # payload carrying ``"tags": None`` or ``"stats": None`` must be
    # normalised explicitly before it is concatenated or indexed.
    base_tags = base.get("tags") or []
    overlay_tags = overlay.get("tags") or []
    merged["tags"] = _dedupe_strings([*base_tags, *overlay_tags])

    base_stats = base.get("stats") or {}
    overlay_stats = overlay.get("stats") or {}
    merged["stats"] = {
        stat_name: float(base_stats.get(stat_name) or overlay_stats.get(stat_name) or 0)
        for stat_name in ("followers", "following", "likes", "videos")
    }

    if not merged.get("raw"):
        merged["raw"] = overlay.get("raw", {})
    return merged
def _extract_creator_payloads(creator_data: dict[str, Any]) -> list[Any]:
    """Collect every non-empty blob payload from the captured creator pages.

    Walks ``creator_data["pages"]`` and each page's ``"blobs"`` in order and
    returns the ``"payload"`` values, dropping ``None``, ``""``, ``[]`` and
    ``{}``.
    """
    empty_markers = (None, "", [], {})
    return [
        blob_payload
        for page in creator_data.get("pages", [])
        for blob in page.get("blobs", [])
        if (blob_payload := blob.get("payload")) not in empty_markers
    ]
def _profile_identity_value(profile: dict[str, Any], field_name: str) -> str:
    """Return ``profile[field_name]`` as a comparable, stripped string.

    Missing or falsy values yield ``""``. Non-empty values of the two URL
    fields are additionally passed through ``_normalize_profile_url_input``.
    """
    raw_value = profile.get(field_name, "")
    cleaned = str(raw_value if raw_value else "").strip()
    is_url_field = field_name in ("profile_url", "canonical_profile_url")
    if cleaned and is_url_field:
        return _normalize_profile_url_input(cleaned)
    return cleaned
def _profiles_appear_same(left: dict[str, Any], right: dict[str, Any]) -> bool:
    """Report whether two profile payloads share at least one identity value.

    Returns ``False`` when either profile is empty. Otherwise the identity
    fields are checked in order and the first field that is non-empty on both
    sides and equal decides the result.
    """
    if not (left and right):
        return False

    identity_fields = ("sec_uid", "douyin_uid", "douyin_id", "canonical_profile_url", "profile_url")
    # Lazy generator: values are computed field by field, stopping at the first match.
    value_pairs = (
        (_profile_identity_value(left, name), _profile_identity_value(right, name))
        for name in identity_fields
    )
    return any(mine and theirs and mine == theirs for mine, theirs in value_pairs)
def _normalize_profile_url_input(value: str) -> str:
text = str(value or "").strip()
if not text:
@@ -1823,11 +1881,34 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
request.session_cookie,
request.manual_creator_pages
)
if not public_data["profile"].get("nickname") and not public_data["videos"] and not creator_data["pages"]:
creator_payloads = _extract_creator_payloads(creator_data)
if creator_payloads:
creator_profile = _pick_best_profile(
[candidate for payload in creator_payloads for candidate in _extract_profile_candidates(payload)]
)
creator_videos = _extract_videos(creator_payloads)
creator_identity_match = _profiles_appear_same(public_data["profile"], creator_profile)
should_merge_creator = creator_identity_match or request.allow_creator_center_profile_fallback
if should_merge_creator:
if creator_profile.get("nickname"):
public_data["profile"] = _merge_profile_payload(public_data["profile"], creator_profile)
if not public_data["source_url"]:
public_data["source_url"] = creator_profile.get("canonical_profile_url") or request.profile_url
if request.allow_creator_center_profile_fallback and not creator_identity_match:
public_data["errors"].append("creator_center_profile_fallback_used")
elif public_data["profile"].get("nickname") != creator_profile.get("nickname"):
public_data["errors"].append("creator_center_profile_merge_partial")
public_data["videos"].extend(creator_videos)
elif creator_profile.get("nickname") or creator_videos:
public_data["errors"].append("creator_center_identity_mismatch_skipped")
if not public_data["profile"].get("nickname") and not public_data["videos"]:
message = "No Douyin profile or creator-center data could be extracted"
if "creator_center_identity_mismatch_skipped" in public_data["errors"]:
message = "Creator-center capture belongs to a different logged-in Douyin account; automatic merge was skipped"
raise HTTPException(
status_code=400,
detail={
"message": "No Douyin profile or creator-center data could be extracted",
"message": message,
"profile_url": request.profile_url,
"resolved_profile_url": public_data["source_url"],
"public_blob_count": len(public_data["raw_pages"]),

View File

@@ -50,3 +50,4 @@ Each run writes:
- This is designed as a browser-assisted capture flow, not a fully headless anti-bot bypass.
- If Douyin shows a slider or challenge page, solve it manually in the opened browser window and then continue.
- Use `--no-sync` if you only want to save a local bundle for inspection.
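- Creator-center pages belong to the currently logged-in Douyin account, which may not be the target profile. StoryForge now treats them as supplemental evidence by default: they are merged only when an identity field (for example `sec_uid`, the Douyin ID, or the profile URL) matches the target profile, and they will not overwrite the target profile otherwise unless you explicitly pass `--allow-creator-center-fallback`.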

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env node
import fs from "node:fs/promises";
import { execFileSync } from "node:child_process";
import os from "node:os";
import path from "node:path";
import process from "node:process";
@@ -20,6 +21,39 @@ const JSON_CAPTURE_LIMIT = 1_500_000;
const SCRIPT_SCAN_LIMIT = 2_000_000;
const WAIT_AFTER_NAV_MS = 4_000;
const RESPONSE_READ_TIMEOUT_MS = 2_000;
// Inline Python program, run via `python3 -c` by requestJson, that performs one HTTP request.
// argv: url, method, headers as a JSON object, body mode, body value.
//   body mode "text" sends the body value itself (UTF-8 encoded); "path" sends the bytes of the
//   file at that path; any other mode sends no body.
// It prints one JSON object {"status", "data"} to stdout. "data" is the parsed JSON response,
// null for an empty response, or {"raw": <text>} when the response is not valid JSON.
// HTTP error responses are reported with their own status code; any other failure is reported
// with status 599 and the exception text in "raw".
// NOTE(review): headers and "text" bodies travel as command-line arguments, so they may be
// visible to other local processes — confirm this is acceptable for credentials.
const PYTHON_HTTP_BRIDGE = `
import json
import sys
import urllib.error
import urllib.request
url, method, headers_json, body_mode, body_value = sys.argv[1:6]
headers = json.loads(headers_json)
body = None
if body_mode == "text":
body = body_value.encode("utf-8")
elif body_mode == "path":
with open(body_value, "rb") as handle:
body = handle.read()
request = urllib.request.Request(url, data=body, headers=headers, method=method)
try:
with urllib.request.urlopen(request, timeout=120) as response:
raw = response.read().decode("utf-8", "replace")
try:
payload = json.loads(raw) if raw else None
except Exception:
payload = {"raw": raw}
print(json.dumps({"status": response.status, "data": payload}, ensure_ascii=False))
except urllib.error.HTTPError as error:
raw = error.read().decode("utf-8", "replace")
try:
payload = json.loads(raw) if raw else None
except Exception:
payload = {"raw": raw}
print(json.dumps({"status": error.code, "data": payload}, ensure_ascii=False))
except Exception as error:
print(json.dumps({"status": 599, "data": {"raw": str(error)}}, ensure_ascii=False))
`;
function printHelp() {
console.log(`StoryForge Douyin Browser Capture
@@ -46,6 +80,8 @@ Mode flags:
--skip-login-prompt Do not pause for manual login / captcha completion
--no-sync Capture only, do not import into StoryForge
--no-creator-center Skip creator-center page capture
--allow-creator-center-fallback
Allow creator-center identity to replace a missing public profile
--note <text> Discovery note saved into StoryForge
Examples:
@@ -71,6 +107,7 @@ function parseArgs(argv) {
manualPrompt: true,
syncEnabled: true,
creatorCenterEnabled: true,
allowCreatorCenterFallback: false,
creatorCenterUrls: [...DEFAULT_CREATOR_CENTER_URLS],
note: "",
profileUrl: "",
@@ -150,6 +187,9 @@ function parseArgs(argv) {
case "--no-creator-center":
options.creatorCenterEnabled = false;
break;
case "--allow-creator-center-fallback":
options.allowCreatorCenterFallback = true;
break;
default:
throw new Error(`Unknown argument: ${arg}`);
}
@@ -442,6 +482,78 @@ async function clickFirstVisible(page, selectors) {
return false;
}
/**
 * Escape every regular-expression metacharacter in `value` so the result can
 * be embedded literally in a `RegExp` source. Falsy input yields "".
 */
function escapeRegExp(value) {
  const text = String(value || "");
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  // "$&" re-inserts the matched character after the added backslash.
  return text.replace(metacharacters, "\\$&");
}
/**
 * Undo the escaping commonly applied to URLs embedded in HTML or inline JSON.
 *
 * Decodes `\u002F` and `\/` to "/", and `\u0026` and `&amp;` to "&".
 * The hex digits of a `\uXXXX` escape are case-insensitive, so `\u002f` is
 * accepted as well. Falsy input yields "".
 */
function decodeEscapedUrl(value) {
  return String(value || "")
    .replace(/\\u002f/gi, "/")
    .replace(/\\u0026/g, "&")
    .replace(/\\\//g, "/")
    .replace(/&amp;/g, "&");
}
/**
 * Work out which `prefetch.json` manifest belongs to the creator-center page
 * currently loaded in `page`.
 *
 * Resolution order:
 *   1. a `"<pathname>": "<url>"` mapping found in the page HTML;
 *   2. among all prefetch manifest URLs found in the HTML, the first one that
 *      contains the page path (without its `/creator-micro/` prefix);
 *   3. the first manifest URL found;
 *   4. a URL built from the page path.
 */
async function resolveCreatorPrefetchUrl(page) {
  const { pathname } = new URL(page.url());
  const markup = await page.content();

  const routePattern = new RegExp(
    `"${escapeRegExp(pathname)}"\\s*:\\s*"(https://creator\\.douyin\\.com[^"]+prefetch\\.json)"`
  );
  const routeHit = markup.match(routePattern);
  if (routeHit?.[1]) {
    return decodeEscapedUrl(routeHit[1]);
  }

  // A Set keeps first-seen order while dropping duplicate manifest URLs.
  const uniqueManifests = new Set();
  for (const hit of markup.matchAll(
    /https:\/\/creator\.douyin\.com\/goofy\/douyin_creator_pc\/mono\/prefetch\/[^"]+prefetch\.json/g
  )) {
    uniqueManifests.add(decodeEscapedUrl(hit[0]));
  }
  const manifests = [...uniqueManifests];

  const routeSuffix = pathname.replace(/^\/creator-micro\//, "");
  const suffixMatch = manifests.find((candidate) => candidate.includes(routeSuffix));
  if (suffixMatch) {
    return suffixMatch;
  }
  if (manifests[0]) {
    return manifests[0];
  }
  return `https://creator.douyin.com/goofy/douyin_creator_pc/mono/prefetch${pathname}/prefetch.json`;
}
/**
 * Fetch the creator-center prefetch manifest for the current page and replay
 * every API call it lists, from inside the page via `page.evaluate`.
 *
 * Returns `{ prefetch_url, prefetch, results }` on success, where each result
 * is `{ url, payload }` and `payload` is null when the response is not JSON.
 * If the manifest cannot be fetched or parsed, returns
 * `{ prefetch_url, error, results }`.
 *
 * A failure of one API call is recorded on that entry as
 * `{ url, payload: null, error }` and does not discard the payloads that were
 * already collected or stop the remaining calls.
 */
async function collectCreatorPrefetchResults(page) {
  const prefetchUrl = await resolveCreatorPrefetchUrl(page);
  // The callback is handed to page.evaluate with prefetchUrl as its only argument,
  // so it must not reference anything else from this module.
  return page.evaluate(async ({ prefetchUrl }) => {
    // Declared outside the try so partial results survive a late failure.
    const results = [];
    try {
      const prefetchResp = await fetch(prefetchUrl, { credentials: "same-origin" });
      const prefetchText = await prefetchResp.text();
      const prefetch = JSON.parse(prefetchText);
      for (const api of prefetch?.apis || []) {
        let targetUrl = String(api?.url || "");
        try {
          const target = new URL(api.url, window.location.origin);
          for (const [key, value] of Object.entries(api.params || {})) {
            target.searchParams.set(key, String(value));
          }
          targetUrl = target.toString();
          const resp = await fetch(targetUrl, {
            credentials: api.credentials || "same-origin",
          });
          const payload = await resp.json().catch(() => null);
          results.push({
            url: targetUrl,
            payload,
          });
        } catch (error) {
          results.push({
            url: targetUrl,
            payload: null,
            error: String(error),
          });
        }
      }
      return {
        prefetch_url: prefetchUrl,
        prefetch,
        results,
      };
    } catch (error) {
      return {
        prefetch_url: prefetchUrl,
        error: String(error),
        results,
      };
    }
  }, { prefetchUrl });
}
async function prepareProfilePage(page, options) {
await clickFirstVisible(page, [
"text=作品",
@@ -512,32 +624,38 @@ async function saveJsonSafe(filePath, value) {
}
}
async function loginStoryForge(baseUrl, username, password) {
const response = await fetch(`${baseUrl.replace(/\/$/, "")}/v2/auth/login`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({ username, password })
});
if (!response.ok) {
throw new Error(`StoryForge login failed: ${response.status} ${await response.text()}`);
/**
 * Perform one HTTP request through the Python bridge and return the decoded
 * response body.
 *
 * @param {string} urlString Absolute request URL.
 * @param {object} [options]
 * @param {string} [options.method="GET"] HTTP method.
 * @param {object} [options.headers={}] Request headers.
 * @param {*} [options.body=null] Inline body; strings are sent as-is, anything
 *   else is JSON-encoded. Ignored when `bodyPath` is set.
 * @param {string} [options.bodyPath=""] Path of a file whose bytes are sent as the body.
 * @returns {Promise<*>} The `data` field reported by the bridge.
 * @throws {Error} When the bridge reports a status >= 400 or prints non-JSON output.
 *
 * Note: the bridge is run with execFileSync, so the event loop is blocked
 * while the request is in flight.
 */
async function requestJson(urlString, { method = "GET", headers = {}, body = null, bodyPath = "" } = {}) {
  let bodyMode = "none";
  let bodyValue = "";
  let scratchDir = "";

  if (bodyPath) {
    bodyMode = "path";
    bodyValue = bodyPath;
  } else if (body !== null) {
    // Inline bodies may hold credentials (e.g. the login password). Command-line
    // arguments can be read by other local processes and are size-limited, so the
    // body is spilled to a private temp file and sent through the bridge's "path" mode.
    scratchDir = await fs.mkdtemp(path.join(os.tmpdir(), "storyforge-request-"));
    bodyValue = path.join(scratchDir, "body");
    await fs.writeFile(bodyValue, typeof body === "string" ? body : JSON.stringify(body), { mode: 0o600 });
    bodyMode = "path";
  }

  let stdout;
  try {
    stdout = execFileSync(
      "python3",
      ["-c", PYTHON_HTTP_BRIDGE, urlString, method, JSON.stringify(headers), bodyMode, bodyValue],
      { maxBuffer: 20 * 1024 * 1024, encoding: "utf8" }
    );
  } finally {
    if (scratchDir) {
      await fs.rm(scratchDir, { recursive: true, force: true });
    }
  }

  let payload;
  try {
    payload = JSON.parse(String(stdout || "").trim() || "{}");
  } catch (error) {
    throw new Error(`Request failed: bridge returned non-JSON output (${error.message})`);
  }
  // A missing status (e.g. empty bridge output) is treated as a server error.
  if ((payload.status || 500) >= 400) {
    throw new Error(`Request failed: ${payload.status} ${JSON.stringify(payload.data)}`);
  }
  return payload.data;
}
async function syncCapture(baseUrl, token, body) {
const response = await fetch(`${baseUrl.replace(/\/$/, "")}/v2/douyin/accounts/sync`, {
/**
 * Log in to the StoryForge backend.
 *
 * POSTs `{ username, password }` as JSON to `<baseUrl>/v2/auth/login` (one
 * trailing slash of `baseUrl` is dropped) and resolves with whatever
 * `requestJson` returns for that call.
 */
async function loginStoryForge(baseUrl, username, password) {
  const loginUrl = `${baseUrl.replace(/\/$/, "")}/v2/auth/login`;
  const credentials = { username, password };
  return requestJson(loginUrl, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: credentials
  });
}
/**
 * Upload a saved sync request to the StoryForge backend.
 *
 * POSTs the contents of the file at `bodyPath` as JSON to
 * `<baseUrl>/v2/douyin/accounts/sync` (one trailing slash of `baseUrl` is
 * dropped) with a bearer `token`, and resolves with whatever `requestJson`
 * returns for that call. Status and error handling are left to `requestJson`.
 */
async function syncCapture(baseUrl, token, bodyPath) {
  const syncUrl = `${baseUrl.replace(/\/$/, "")}/v2/douyin/accounts/sync`;
  return requestJson(syncUrl, {
    method: "POST",
    headers: {
      "content-type": "application/json",
      Authorization: `Bearer ${token}`
    },
    bodyPath
  });
}
async function captureCreatorPages(context, options, runDir) {
@@ -552,7 +670,10 @@ async function captureCreatorPages(context, options, runDir) {
try {
console.error(`Capturing creator-center page: ${url}`);
await navigateAndSettle(page, url, options.waitMs);
const bundle = await capturePageBundle(page, "creator_center", responseCapture);
const prefetchResults = await collectCreatorPrefetchResults(page);
const bundle = await capturePageBundle(page, "creator_center", responseCapture, {
creator_prefetch: prefetchResults
});
pages.push({
url: bundle.page_url,
title: bundle.page_title,
@@ -650,12 +771,14 @@ async function main() {
const syncBody = {
profile_url: options.profileUrl,
allow_creator_center_profile_fallback: options.allowCreatorCenterFallback,
manual_profile_payload: profileBundle,
manual_creator_pages: creatorPages,
manual_work_payloads: videoPages,
discovery_note: options.note || "browser-assisted capture"
};
await saveJson(path.join(runDir, "storyforge-sync-request.json"), syncBody);
const syncRequestPath = path.join(runDir, "storyforge-sync-request.json");
await saveJson(syncRequestPath, syncBody);
summary.video_link_count = videoLinks.length;
summary.captured_video_pages = videoPages.length;
@@ -675,7 +798,7 @@ async function main() {
default_external_base_url: auth.default_external_base_url
});
}
const workspace = await syncCapture(options.backendUrl, token, syncBody);
const workspace = await syncCapture(options.backendUrl, token, syncRequestPath);
summary.sync_result = {
account_id: workspace.account?.id || "",
nickname: workspace.account?.nickname || "",