fix: guard douyin creator-center identity merges
This commit is contained in:
@@ -36,6 +36,7 @@ class DouyinAccountSyncRequest(BaseModel):
|
||||
profile_url: str = ""
|
||||
session_cookie: str = ""
|
||||
creator_center_urls: list[str] = Field(default_factory=lambda: list(DEFAULT_CREATOR_CENTER_URLS))
|
||||
allow_creator_center_profile_fallback: bool = False
|
||||
manual_profile_payload: dict[str, Any] | None = None
|
||||
manual_creator_pages: list[ManualPageCapture] = Field(default_factory=list)
|
||||
manual_work_payloads: list[dict[str, Any]] = Field(default_factory=list)
|
||||
@@ -436,6 +437,63 @@ def _extract_videos(payloads: Iterable[Any]) -> list[dict[str, Any]]:
|
||||
return videos
|
||||
|
||||
|
||||
def _merge_profile_payload(base: dict[str, Any], overlay: dict[str, Any]) -> dict[str, Any]:
|
||||
if not overlay:
|
||||
return base
|
||||
if not base or not base.get("nickname"):
|
||||
return overlay
|
||||
|
||||
merged = dict(base)
|
||||
merged["nickname"] = base.get("nickname") or overlay.get("nickname", "")
|
||||
merged["signature"] = base.get("signature") or overlay.get("signature", "")
|
||||
merged["profile_url"] = base.get("profile_url") or overlay.get("profile_url", "")
|
||||
merged["canonical_profile_url"] = base.get("canonical_profile_url") or overlay.get("canonical_profile_url", "")
|
||||
merged["sec_uid"] = base.get("sec_uid") or overlay.get("sec_uid", "")
|
||||
merged["douyin_uid"] = base.get("douyin_uid") or overlay.get("douyin_uid", "")
|
||||
merged["douyin_id"] = base.get("douyin_id") or overlay.get("douyin_id", "")
|
||||
merged["avatar_url"] = base.get("avatar_url") or overlay.get("avatar_url", "")
|
||||
merged["tags"] = _dedupe_strings(base.get("tags", []) + overlay.get("tags", []))
|
||||
merged["stats"] = {
|
||||
"followers": float(base.get("stats", {}).get("followers") or overlay.get("stats", {}).get("followers") or 0),
|
||||
"following": float(base.get("stats", {}).get("following") or overlay.get("stats", {}).get("following") or 0),
|
||||
"likes": float(base.get("stats", {}).get("likes") or overlay.get("stats", {}).get("likes") or 0),
|
||||
"videos": float(base.get("stats", {}).get("videos") or overlay.get("stats", {}).get("videos") or 0),
|
||||
}
|
||||
if not merged.get("raw"):
|
||||
merged["raw"] = overlay.get("raw", {})
|
||||
return merged
|
||||
|
||||
|
||||
def _extract_creator_payloads(creator_data: dict[str, Any]) -> list[Any]:
|
||||
payloads: list[Any] = []
|
||||
for page in creator_data.get("pages", []):
|
||||
for blob in page.get("blobs", []):
|
||||
payload = blob.get("payload")
|
||||
if payload not in (None, "", [], {}):
|
||||
payloads.append(payload)
|
||||
return payloads
|
||||
|
||||
|
||||
def _profile_identity_value(profile: dict[str, Any], field_name: str) -> str:
|
||||
value = str(profile.get(field_name, "") or "").strip()
|
||||
if not value:
|
||||
return ""
|
||||
if field_name in {"profile_url", "canonical_profile_url"}:
|
||||
return _normalize_profile_url_input(value)
|
||||
return value
|
||||
|
||||
|
||||
def _profiles_appear_same(left: dict[str, Any], right: dict[str, Any]) -> bool:
|
||||
if not left or not right:
|
||||
return False
|
||||
for field_name in ("sec_uid", "douyin_uid", "douyin_id", "canonical_profile_url", "profile_url"):
|
||||
left_value = _profile_identity_value(left, field_name)
|
||||
right_value = _profile_identity_value(right, field_name)
|
||||
if left_value and right_value and left_value == right_value:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _normalize_profile_url_input(value: str) -> str:
|
||||
text = str(value or "").strip()
|
||||
if not text:
|
||||
@@ -1823,11 +1881,34 @@ def register_douyin_routes(app: Any, legacy: Any) -> None:
|
||||
request.session_cookie,
|
||||
request.manual_creator_pages
|
||||
)
|
||||
if not public_data["profile"].get("nickname") and not public_data["videos"] and not creator_data["pages"]:
|
||||
creator_payloads = _extract_creator_payloads(creator_data)
|
||||
if creator_payloads:
|
||||
creator_profile = _pick_best_profile(
|
||||
[candidate for payload in creator_payloads for candidate in _extract_profile_candidates(payload)]
|
||||
)
|
||||
creator_videos = _extract_videos(creator_payloads)
|
||||
creator_identity_match = _profiles_appear_same(public_data["profile"], creator_profile)
|
||||
should_merge_creator = creator_identity_match or request.allow_creator_center_profile_fallback
|
||||
if should_merge_creator:
|
||||
if creator_profile.get("nickname"):
|
||||
public_data["profile"] = _merge_profile_payload(public_data["profile"], creator_profile)
|
||||
if not public_data["source_url"]:
|
||||
public_data["source_url"] = creator_profile.get("canonical_profile_url") or request.profile_url
|
||||
if request.allow_creator_center_profile_fallback and not creator_identity_match:
|
||||
public_data["errors"].append("creator_center_profile_fallback_used")
|
||||
elif public_data["profile"].get("nickname") != creator_profile.get("nickname"):
|
||||
public_data["errors"].append("creator_center_profile_merge_partial")
|
||||
public_data["videos"].extend(creator_videos)
|
||||
elif creator_profile.get("nickname") or creator_videos:
|
||||
public_data["errors"].append("creator_center_identity_mismatch_skipped")
|
||||
if not public_data["profile"].get("nickname") and not public_data["videos"]:
|
||||
message = "No Douyin profile or creator-center data could be extracted"
|
||||
if "creator_center_identity_mismatch_skipped" in public_data["errors"]:
|
||||
message = "Creator-center capture belongs to a different logged-in Douyin account; automatic merge was skipped"
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail={
|
||||
"message": "No Douyin profile or creator-center data could be extracted",
|
||||
"message": message,
|
||||
"profile_url": request.profile_url,
|
||||
"resolved_profile_url": public_data["source_url"],
|
||||
"public_blob_count": len(public_data["raw_pages"]),
|
||||
|
||||
@@ -50,3 +50,4 @@ Each run writes:
|
||||
- This is designed as a browser-assisted capture flow, not a fully headless anti-bot bypass.
|
||||
- If Douyin shows a slider or challenge page, solve it manually in the opened browser window and then continue.
|
||||
- Use `--no-sync` if you only want to save a local bundle for inspection.
|
||||
- Creator-center pages belong to the currently logged-in Douyin account. StoryForge now treats them as supplemental evidence by default and will not let them overwrite the target profile unless you explicitly pass `--allow-creator-center-fallback`.
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import fs from "node:fs/promises";
|
||||
import { execFileSync } from "node:child_process";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import process from "node:process";
|
||||
@@ -20,6 +21,39 @@ const JSON_CAPTURE_LIMIT = 1_500_000;
|
||||
const SCRIPT_SCAN_LIMIT = 2_000_000;
|
||||
const WAIT_AFTER_NAV_MS = 4_000;
|
||||
const RESPONSE_READ_TIMEOUT_MS = 2_000;
|
||||
// Python script run via `python3 -c` to perform a single HTTP request.
// argv: url, method, headers (JSON object), body mode ("text" | "path" | anything
// else = no body), body value (the text itself, or a file path to read as bytes).
// It always prints exactly one JSON object `{"status": <int>, "data": <any>}`:
//   - the response status and JSON-decoded body (or {"raw": text} if not JSON),
//   - the same shape for HTTP error responses,
//   - status 599 with {"raw": message} for any other failure.
// Indentation inside the template literal is significant to Python.
const PYTHON_HTTP_BRIDGE = `
import json
import sys
import urllib.error
import urllib.request

url, method, headers_json, body_mode, body_value = sys.argv[1:6]
headers = json.loads(headers_json)
body = None
if body_mode == "text":
    body = body_value.encode("utf-8")
elif body_mode == "path":
    with open(body_value, "rb") as handle:
        body = handle.read()
request = urllib.request.Request(url, data=body, headers=headers, method=method)
try:
    with urllib.request.urlopen(request, timeout=120) as response:
        raw = response.read().decode("utf-8", "replace")
        try:
            payload = json.loads(raw) if raw else None
        except Exception:
            payload = {"raw": raw}
        print(json.dumps({"status": response.status, "data": payload}, ensure_ascii=False))
except urllib.error.HTTPError as error:
    raw = error.read().decode("utf-8", "replace")
    try:
        payload = json.loads(raw) if raw else None
    except Exception:
        payload = {"raw": raw}
    print(json.dumps({"status": error.code, "data": payload}, ensure_ascii=False))
except Exception as error:
    print(json.dumps({"status": 599, "data": {"raw": str(error)}}, ensure_ascii=False))
`;
|
||||
|
||||
function printHelp() {
|
||||
console.log(`StoryForge Douyin Browser Capture
|
||||
@@ -46,6 +80,8 @@ Mode flags:
|
||||
--skip-login-prompt Do not pause for manual login / captcha completion
|
||||
--no-sync Capture only, do not import into StoryForge
|
||||
--no-creator-center Skip creator-center page capture
|
||||
--allow-creator-center-fallback
|
||||
Allow creator-center identity to replace a missing public profile
|
||||
--note <text> Discovery note saved into StoryForge
|
||||
|
||||
Examples:
|
||||
@@ -71,6 +107,7 @@ function parseArgs(argv) {
|
||||
manualPrompt: true,
|
||||
syncEnabled: true,
|
||||
creatorCenterEnabled: true,
|
||||
allowCreatorCenterFallback: false,
|
||||
creatorCenterUrls: [...DEFAULT_CREATOR_CENTER_URLS],
|
||||
note: "",
|
||||
profileUrl: "",
|
||||
@@ -150,6 +187,9 @@ function parseArgs(argv) {
|
||||
case "--no-creator-center":
|
||||
options.creatorCenterEnabled = false;
|
||||
break;
|
||||
case "--allow-creator-center-fallback":
|
||||
options.allowCreatorCenterFallback = true;
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Unknown argument: ${arg}`);
|
||||
}
|
||||
@@ -442,6 +482,78 @@ async function clickFirstVisible(page, selectors) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Escape every RegExp metacharacter in `value` so it can be embedded in a
 * pattern and match literally. Falsy input yields an empty string.
 *
 * @param {unknown} value Text to escape.
 * @returns {string} The escaped text.
 */
function escapeRegExp(value) {
  // "$&" re-inserts the matched metacharacter after the added backslash.
  return String(value || "").replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
|
||||
|
||||
/**
 * Undo the escaping applied to URLs embedded in HTML / inline JSON:
 * `\u002F` and `\/` become `/`, and `\u0026` and the `&amp;` entity become `&`.
 * Falsy input yields an empty string.
 *
 * @param {unknown} value Escaped URL text.
 * @returns {string} The decoded URL.
 */
function decodeEscapedUrl(value) {
  return String(value || "")
    .replace(/\\u002F/gi, "/")
    .replace(/\\\//g, "/")
    .replace(/\\u0026/g, "&")
    // Must target the entity: replacing "&" with "&" would be a no-op.
    .replace(/&amp;/g, "&");
}
|
||||
|
||||
/**
 * Work out which `prefetch.json` manifest belongs to the creator-center page
 * currently loaded in `page`.
 *
 * Resolution order:
 *   1. an entry in the page HTML that maps the current pathname to a
 *      creator.douyin.com prefetch URL;
 *   2. any prefetch URL found in the HTML that contains the current route
 *      (pathname without the leading `/creator-micro/`);
 *   3. the first prefetch URL found in the HTML;
 *   4. a URL constructed from the current pathname.
 *
 * @param {{ url(): string, content(): Promise<string> }} page Browser page.
 * @returns {Promise<string>} The prefetch manifest URL.
 */
async function resolveCreatorPrefetchUrl(page) {
  const current = new URL(page.url());
  const html = await page.content();

  const escapedPath = escapeRegExp(current.pathname);
  const mapped = html.match(
    new RegExp(`"${escapedPath}"\\s*:\\s*"(https://creator\\.douyin\\.com[^"]+prefetch\\.json)"`)
  );
  if (mapped?.[1]) {
    return decodeEscapedUrl(mapped[1]);
  }

  // De-duplicate while keeping first-seen order.
  const discovered = Array.from(
    new Set(
      [...html.matchAll(/https:\/\/creator\.douyin\.com\/goofy\/douyin_creator_pc\/mono\/prefetch\/[^"]+prefetch\.json/g)].map(
        (match) => decodeEscapedUrl(match[0])
      )
    )
  );

  const routeHint = current.pathname.replace(/^\/creator-micro\//, "");
  // Drop trailing slashes so the constructed fallback never contains "//prefetch.json".
  const fallbackPath = current.pathname.replace(/\/+$/, "");
  return (
    discovered.find((candidate) => candidate.includes(routeHint)) ||
    discovered[0] ||
    `https://creator.douyin.com/goofy/douyin_creator_pc/mono/prefetch${fallbackPath}/prefetch.json`
  );
}
|
||||
|
||||
/**
 * Fetch the creator-center prefetch manifest inside the page and replay every
 * API call it lists, using the page's own session.
 *
 * The callback passed to `page.evaluate` must stay self-contained: it may
 * only use its argument and globals available inside the page.
 *
 * @param {object} page Browser page positioned on a creator-center URL.
 * @returns {Promise<{prefetch_url: string, prefetch?: any, error?: string,
 *   results: Array<{url: string, payload: any, error?: string}>}>}
 *   On success the manifest plus one entry per listed API (payload is `null`
 *   when the response is not JSON or the call failed, in which case `error`
 *   is set). If the manifest itself cannot be fetched or parsed, `error` is
 *   set at the top level and `results` is empty.
 */
async function collectCreatorPrefetchResults(page) {
  const prefetchUrl = await resolveCreatorPrefetchUrl(page);
  return page.evaluate(async ({ prefetchUrl }) => {
    try {
      const prefetchResp = await fetch(prefetchUrl, { credentials: "same-origin" });
      const prefetchText = await prefetchResp.text();
      const prefetch = JSON.parse(prefetchText);
      const apis = Array.isArray(prefetch?.apis) ? prefetch.apis : [];
      const results = [];
      for (const api of apis) {
        let targetUrl = String(api?.url || "");
        try {
          const target = new URL(api.url, window.location.origin);
          for (const [key, value] of Object.entries(api.params || {})) {
            target.searchParams.set(key, String(value));
          }
          targetUrl = target.toString();
          const resp = await fetch(targetUrl, {
            credentials: api.credentials || "same-origin",
          });
          const payload = await resp.json().catch(() => null);
          results.push({
            url: targetUrl,
            payload,
          });
        } catch (error) {
          // One failing API must not discard the manifest or earlier results.
          results.push({
            url: targetUrl,
            payload: null,
            error: String(error),
          });
        }
      }
      return {
        prefetch_url: prefetchUrl,
        prefetch,
        results,
      };
    } catch (error) {
      return {
        prefetch_url: prefetchUrl,
        error: String(error),
        results: [],
      };
    }
  }, { prefetchUrl });
}
|
||||
|
||||
async function prepareProfilePage(page, options) {
|
||||
await clickFirstVisible(page, [
|
||||
"text=作品",
|
||||
@@ -512,32 +624,38 @@ async function saveJsonSafe(filePath, value) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Perform one HTTP request through the `PYTHON_HTTP_BRIDGE` script and return
 * the decoded response body.
 *
 * The request runs synchronously in a `python3` child process. The body is
 * either passed inline (`body`, serialized to JSON unless already a string)
 * or read by the bridge from a file (`bodyPath`), which takes precedence.
 *
 * NOTE(review): an inline body is passed as a process argument, so it is
 * visible in the process list and bounded by the OS argument-size limit;
 * prefer `bodyPath` for large or sensitive payloads.
 *
 * @param {string} urlString Absolute request URL.
 * @param {{method?: string, headers?: object, body?: any, bodyPath?: string}} [options]
 * @returns {Promise<any>} The `data` field reported by the bridge.
 * @throws {Error} When the bridge reports a status >= 400 (or no status), or
 *   when the child process fails or prints invalid JSON.
 */
async function requestJson(urlString, { method = "GET", headers = {}, body = null, bodyPath = "" } = {}) {
  const bodyMode = bodyPath ? "path" : body === null ? "none" : "text";
  const bodyValue = bodyPath || (typeof body === "string" ? body : JSON.stringify(body));
  const stdout = execFileSync(
    "python3",
    ["-c", PYTHON_HTTP_BRIDGE, urlString, method, JSON.stringify(headers), bodyMode, bodyValue],
    { maxBuffer: 20 * 1024 * 1024, encoding: "utf8" }
  );
  const payload = JSON.parse(String(stdout || "").trim() || "{}");
  // A missing status (empty bridge output) is treated as a failure.
  if ((payload.status || 500) >= 400) {
    throw new Error(`Request failed: ${payload.status} ${JSON.stringify(payload.data)}`);
  }
  return payload.data;
}

/**
 * Log in to the StoryForge backend.
 *
 * @param {string} baseUrl Backend base URL; one trailing slash is ignored.
 * @param {string} username Account name.
 * @param {string} password Account password.
 * @returns {Promise<any>} The decoded login response.
 */
async function loginStoryForge(baseUrl, username, password) {
  return requestJson(`${baseUrl.replace(/\/$/, "")}/v2/auth/login`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: { username, password }
  });
}

/**
 * Upload a saved sync request to StoryForge.
 *
 * @param {string} baseUrl Backend base URL; one trailing slash is ignored.
 * @param {string} token Bearer token sent in the Authorization header.
 * @param {string} bodyPath Path of the JSON file to send as the request body.
 * @returns {Promise<any>} The decoded sync response.
 */
async function syncCapture(baseUrl, token, bodyPath) {
  return requestJson(`${baseUrl.replace(/\/$/, "")}/v2/douyin/accounts/sync`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
      Authorization: `Bearer ${token}`
    },
    bodyPath
  });
}
|
||||
|
||||
async function captureCreatorPages(context, options, runDir) {
|
||||
@@ -552,7 +670,10 @@ async function captureCreatorPages(context, options, runDir) {
|
||||
try {
|
||||
console.error(`Capturing creator-center page: ${url}`);
|
||||
await navigateAndSettle(page, url, options.waitMs);
|
||||
const bundle = await capturePageBundle(page, "creator_center", responseCapture);
|
||||
const prefetchResults = await collectCreatorPrefetchResults(page);
|
||||
const bundle = await capturePageBundle(page, "creator_center", responseCapture, {
|
||||
creator_prefetch: prefetchResults
|
||||
});
|
||||
pages.push({
|
||||
url: bundle.page_url,
|
||||
title: bundle.page_title,
|
||||
@@ -650,12 +771,14 @@ async function main() {
|
||||
|
||||
const syncBody = {
|
||||
profile_url: options.profileUrl,
|
||||
allow_creator_center_profile_fallback: options.allowCreatorCenterFallback,
|
||||
manual_profile_payload: profileBundle,
|
||||
manual_creator_pages: creatorPages,
|
||||
manual_work_payloads: videoPages,
|
||||
discovery_note: options.note || "browser-assisted capture"
|
||||
};
|
||||
await saveJson(path.join(runDir, "storyforge-sync-request.json"), syncBody);
|
||||
const syncRequestPath = path.join(runDir, "storyforge-sync-request.json");
|
||||
await saveJson(syncRequestPath, syncBody);
|
||||
|
||||
summary.video_link_count = videoLinks.length;
|
||||
summary.captured_video_pages = videoPages.length;
|
||||
@@ -675,7 +798,7 @@ async function main() {
|
||||
default_external_base_url: auth.default_external_base_url
|
||||
});
|
||||
}
|
||||
const workspace = await syncCapture(options.backendUrl, token, syncBody);
|
||||
const workspace = await syncCapture(options.backendUrl, token, syncRequestPath);
|
||||
summary.sync_result = {
|
||||
account_id: workspace.account?.id || "",
|
||||
nickname: workspace.account?.nickname || "",
|
||||
|
||||
Reference in New Issue
Block a user