Files
storyforge/scripts/douyin-browser-capture/capture_and_sync.mjs

830 lines
25 KiB
JavaScript

#!/usr/bin/env node
import fs from "node:fs/promises";
import { execFileSync } from "node:child_process";
import os from "node:os";
import path from "node:path";
import process from "node:process";
import readline from "node:readline/promises";
import { stdin as input, stdout as output } from "node:process";
import { chromium } from "playwright";
// Creator-center pages visited by default; parseArgs copies this list into
// options.creatorCenterUrls and captureCreatorPages opens each entry in turn.
const DEFAULT_CREATOR_CENTER_URLS = [
"https://creator.douyin.com/creator-micro/home",
"https://creator.douyin.com/creator-micro/data",
"https://creator.douyin.com/creator-micro/content/manage"
];
// Default for --output-dir.
// NOTE(review): this is an absolute path under one developer's home directory;
// on any other machine --output-dir presumably has to be passed explicitly.
const DEFAULT_OUTPUT_DIR = "/Users/kris/code/StoryForge-gitea/output/playwright/douyin";
// Default for --state-dir: the persistent browser profile handed to
// chromium.launchPersistentContext.
const DEFAULT_STATE_DIR = path.join(os.homedir(), ".storyforge", "douyin-playwright");
// Default for --backend-url (StoryForge collector base URL).
const DEFAULT_BACKEND_URL = "http://127.0.0.1:8081";
// Response bodies whose text length (string length, not bytes) exceeds this
// are skipped by the network capture.
const JSON_CAPTURE_LIMIT = 1_500_000;
// Only the first SCRIPT_SCAN_LIMIT characters of a script's text are scanned
// for embedded JSON.
const SCRIPT_SCAN_LIMIT = 2_000_000;
// Default for --wait-ms: pause after each navigation, in milliseconds.
const WAIT_AFTER_NAV_MS = 4_000;
// Upper bound, in milliseconds, on reading a single response body.
const RESPONSE_READ_TIMEOUT_MS = 2_000;
// Python program executed with `python3 -c` by requestJson.
//
// argv contract (sys.argv[1:6]): url, method, headers as a JSON object,
// body_mode ("text" -> body_value is the body itself, "path" -> body_value is
// a file to read the body from, anything else -> no body), body_value.
//
// It always prints exactly one JSON line {"status": <int>, "data": <payload>}:
// the HTTP status for completed requests (including HTTP errors) and 599 for
// any other failure. Non-JSON response bodies are wrapped as {"raw": <text>}.
//
// The block indentation below is significant: Python rejects the script with
// an IndentationError if the nested statements are flush-left.
const PYTHON_HTTP_BRIDGE = `
import json
import sys
import urllib.error
import urllib.request

url, method, headers_json, body_mode, body_value = sys.argv[1:6]
headers = json.loads(headers_json)
body = None
if body_mode == "text":
    body = body_value.encode("utf-8")
elif body_mode == "path":
    with open(body_value, "rb") as handle:
        body = handle.read()

request = urllib.request.Request(url, data=body, headers=headers, method=method)
try:
    with urllib.request.urlopen(request, timeout=120) as response:
        raw = response.read().decode("utf-8", "replace")
        try:
            payload = json.loads(raw) if raw else None
        except Exception:
            payload = {"raw": raw}
        print(json.dumps({"status": response.status, "data": payload}, ensure_ascii=False))
except urllib.error.HTTPError as error:
    raw = error.read().decode("utf-8", "replace")
    try:
        payload = json.loads(raw) if raw else None
    except Exception:
        payload = {"raw": raw}
    print(json.dumps({"status": error.code, "data": payload}, ensure_ascii=False))
except Exception as error:
    print(json.dumps({"status": 599, "data": {"raw": str(error)}}, ensure_ascii=False))
`;
/**
 * Print CLI usage to stdout.
 *
 * Defaults shown in the text are interpolated from the module-level DEFAULT_*
 * and WAIT_AFTER_NAV_MS constants. The examples deliberately use placeholders
 * for credentials so that no real username or password is echoed to terminals
 * or logs.
 */
function printHelp() {
  console.log(`StoryForge Douyin Browser Capture

Usage:
  node capture_and_sync.mjs --profile-url <douyin-profile-url> [options]

Core options:
  --profile-url <url>            Douyin profile URL to capture
  --backend-url <url>            StoryForge collector base URL (default: ${DEFAULT_BACKEND_URL})
  --output-dir <dir>             Capture output directory (default: ${DEFAULT_OUTPUT_DIR})
  --state-dir <dir>              Persistent browser state dir (default: ${DEFAULT_STATE_DIR})
  --max-videos <n>               Max video detail pages to capture (default: 4)
  --scroll-count <n>             Scroll times on profile page (default: 5)
  --wait-ms <n>                  Wait after each navigation in ms (default: ${WAIT_AFTER_NAV_MS})

StoryForge auth:
  --storyforge-token <token>     Existing StoryForge bearer token
  --storyforge-username <name>   Login username for StoryForge
  --storyforge-password <pass>   Login password for StoryForge

Mode flags:
  --headless                     Run browser headless
  --skip-login-prompt            Do not pause for manual login / captcha completion
  --no-sync                      Capture only, do not import into StoryForge
  --no-creator-center            Skip creator-center page capture
  --allow-creator-center-fallback
                                 Allow creator-center identity to replace a missing public profile
  --note <text>                  Discovery note saved into StoryForge

Examples:
  npm run capture -- \\
    --profile-url https://www.douyin.com/user/your_account \\
    --storyforge-username <username> --storyforge-password <password>

  npm run capture -- \\
    --profile-url https://www.douyin.com/user/your_account \\
    --storyforge-token <token> --headless --skip-login-prompt --no-creator-center
`);
}
/**
 * Parse command-line arguments into an options object.
 *
 * @param {string[]} argv - Arguments after the script name (process.argv.slice(2)).
 * @returns {object} Options pre-filled with defaults and overridden by flags.
 * @throws {Error} On an unknown argument, on a value flag that is missing its
 *   value (or is followed by another "--" flag), and on an integer flag whose
 *   value is not a non-negative base-10 integer.
 */
function parseArgs(argv) {
  const options = {
    backendUrl: DEFAULT_BACKEND_URL,
    outputDir: DEFAULT_OUTPUT_DIR,
    stateDir: DEFAULT_STATE_DIR,
    maxVideos: 4,
    scrollCount: 5,
    waitMs: WAIT_AFTER_NAV_MS,
    headless: false,
    manualPrompt: true,
    syncEnabled: true,
    creatorCenterEnabled: true,
    allowCreatorCenterFallback: false,
    creatorCenterUrls: [...DEFAULT_CREATOR_CENTER_URLS],
    note: "",
    profileUrl: "",
    storyforgeToken: "",
    storyforgeUsername: "",
    storyforgePassword: "",
    help: false
  };

  // Maps (not plain objects) so that stray arguments such as "constructor"
  // can never resolve to an inherited property.
  const valueFlags = new Map([
    ["--profile-url", { key: "profileUrl", integer: false }],
    ["--backend-url", { key: "backendUrl", integer: false }],
    ["--output-dir", { key: "outputDir", integer: false }],
    ["--state-dir", { key: "stateDir", integer: false }],
    ["--max-videos", { key: "maxVideos", integer: true }],
    ["--scroll-count", { key: "scrollCount", integer: true }],
    ["--wait-ms", { key: "waitMs", integer: true }],
    ["--storyforge-token", { key: "storyforgeToken", integer: false }],
    ["--storyforge-username", { key: "storyforgeUsername", integer: false }],
    ["--storyforge-password", { key: "storyforgePassword", integer: false }],
    ["--note", { key: "note", integer: false }]
  ]);
  const switchFlags = new Map([
    ["--help", { key: "help", value: true }],
    ["-h", { key: "help", value: true }],
    ["--headless", { key: "headless", value: true }],
    ["--skip-login-prompt", { key: "manualPrompt", value: false }],
    ["--no-sync", { key: "syncEnabled", value: false }],
    ["--no-creator-center", { key: "creatorCenterEnabled", value: false }],
    ["--allow-creator-center-fallback", { key: "allowCreatorCenterFallback", value: true }]
  ]);

  const requireValue = (index, flag) => {
    const value = argv[index + 1];
    if (!value || value.startsWith("--")) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  // Number.parseInt alone would turn "abc" into NaN and "12ms" into 12; both
  // would silently change how many pages are captured or how long we wait.
  const parseCount = (raw, flag) => {
    const parsed = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;
    if (!Number.isSafeInteger(parsed)) {
      throw new Error(`Invalid value for ${flag}: expected a non-negative integer, got "${raw}"`);
    }
    return parsed;
  };

  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];

    const toggle = switchFlags.get(arg);
    if (toggle) {
      options[toggle.key] = toggle.value;
      continue;
    }

    const spec = valueFlags.get(arg);
    if (!spec) {
      throw new Error(`Unknown argument: ${arg}`);
    }
    const raw = requireValue(index, arg);
    options[spec.key] = spec.integer ? parseCount(raw, arg) : raw;
    index += 1; // Skip the value that was just consumed.
  }
  return options;
}
/**
 * Turn an arbitrary value into a short token usable inside a file name.
 *
 * Runs of characters outside [a-zA-Z0-9._-] become a single "-", repeated
 * dashes collapse, one leading and one trailing dash are removed, and the
 * result is cut to 80 characters. Falsy input or an empty result yields
 * "capture".
 *
 * @param {*} value - Value to sanitize; stringified first.
 * @returns {string} Non-empty sanitized name.
 */
function sanitizeName(value) {
  const fallback = "capture";
  const dashed = String(value || fallback).replace(/[^a-zA-Z0-9._-]+/g, "-");
  const collapsed = dashed.replace(/-+/g, "-");
  const trimmed = collapsed.replace(/^-|-$/g, "");
  const clipped = trimmed.slice(0, 80);
  return clipped === "" ? fallback : clipped;
}
/**
 * Create a directory, including any missing parents.
 *
 * @param {string} dir - Directory path to create.
 * @returns {Promise<void>} Resolves once fs.mkdir has completed.
 */
async function ensureDir(dir) {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(dir, mkdirOptions);
}
/**
 * Current time as an ISO-8601 string with every ":" replaced by "-".
 *
 * @returns {string} For example "2024-01-02T03-04-05.678Z".
 */
function nowStamp() {
  const isoNow = new Date().toISOString();
  return isoNow.replace(/[:]/g, "-");
}
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay handed to setTimeout, in milliseconds.
 * @returns {Promise<void>} Resolves with no value when the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Navigate a page and give it time to settle.
 *
 * Both the navigation and the wait for "domcontentloaded" are best-effort:
 * their failures are ignored, and the fixed delay always runs afterwards.
 *
 * @param {object} page - Page exposing goto() and waitForLoadState().
 * @param {string} url - Address to open.
 * @param {number} waitMs - Delay after navigation, in milliseconds.
 * @returns {Promise<void>}
 */
async function navigateAndSettle(page, url, waitMs) {
  try {
    await page.goto(url, { waitUntil: "commit", timeout: 30_000 });
  } catch {
    // Navigation failure is tolerated; capture proceeds with whatever loaded.
  }
  try {
    await page.waitForLoadState("domcontentloaded", { timeout: 15_000 });
  } catch {
    // Same for a load-state timeout.
  }
  await sleep(waitMs);
}
/**
 * Optionally block until the operator presses Enter.
 *
 * @param {string} message - Text shown above the "Press Enter" prompt.
 * @param {boolean} enabled - When falsy, return immediately without prompting.
 * @returns {Promise<void>}
 */
async function maybePrompt(message, enabled) {
  if (enabled) {
    const prompt = readline.createInterface({ input, output });
    try {
      await prompt.question(`${message}\nPress Enter to continue... `);
    } finally {
      // Always release the interface so stdin is not left held open.
      prompt.close();
    }
  }
}
/**
 * Stringify, trim and de-duplicate values, keeping first-seen order.
 *
 * Falsy values (including 0) and strings that are empty after trimming are
 * dropped.
 *
 * @param {Iterable<*>} values - Values to normalize.
 * @returns {string[]} Distinct non-empty trimmed strings.
 */
function uniqueStrings(values) {
  const cleaned = Array.from(values, (entry) => String(entry || "").trim());
  const nonEmpty = cleaned.filter((text) => text !== "");
  // A Set iterates in insertion order, so the first occurrence wins.
  return [...new Set(nonEmpty)];
}
/**
 * Decide whether a URL points at a Douyin endpoint worth capturing.
 *
 * @param {string} url - URL to test; compared case-insensitively.
 * @returns {boolean} True when the URL contains any known Douyin marker.
 */
function looksLikeRelevantJsonUrl(url) {
  const markers = [
    "douyin.com/aweme",
    "douyin.com/web/api",
    "douyin.com/creator",
    "douyin.com/user",
    "creator.douyin.com",
    "iesdouyin.com"
  ];
  const normalized = url.toLowerCase();
  return markers.some((marker) => normalized.includes(marker));
}
function findJsonEnd(text, start) {
const opening = text[start];
const closing = opening === "{" ? "}" : "]";
let depth = 0;
let inString = false;
let escaped = false;
for (let index = start; index < text.length; index += 1) {
const char = text[index];
if (inString) {
if (escaped) {
escaped = false;
} else if (char === "\\") {
escaped = true;
} else if (char === "\"") {
inString = false;
}
continue;
}
if (char === "\"") {
inString = true;
continue;
}
if (char === opening) {
depth += 1;
continue;
}
if (char === closing) {
depth -= 1;
if (depth === 0) {
return index + 1;
}
}
}
return -1;
}
/**
 * Start recording JSON responses seen by a page.
 *
 * A response is considered when its content-type mentions "json" or its URL
 * passes looksLikeRelevantJsonUrl. Its body must be readable within
 * RESPONSE_READ_TIMEOUT_MS, be non-empty, be no longer than JSON_CAPTURE_LIMIT
 * characters and parse as JSON. Each "<METHOD> <url>" pair is recorded at most
 * once. Individual read failures are ignored on purpose: page-level capture
 * is still useful without them.
 *
 * @param {object} page - Page exposing on()/off() for the "response" event.
 * @returns {Promise<{records: object[], stop: function(): Promise<object[]>}>}
 *   `records` is the live list of {url, method, status, payload}; `stop()`
 *   detaches the listener, waits a bounded time for in-flight reads and
 *   resolves with that same list.
 */
async function createResponseCapture(page) {
  const records = [];
  const seen = new Set();
  const pending = [];

  // Race `promise` against a deadline. The timer is always cleared afterwards
  // so that completed reads do not leave live timeouts behind.
  const withTimeout = (promise, ms, message) => {
    let timer;
    const deadline = new Promise((_, reject) => {
      timer = setTimeout(() => reject(new Error(message)), ms);
    });
    return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
  };

  const recordResponse = async (response) => {
    const url = response.url();
    const contentType = String(response.headers()["content-type"] || "").toLowerCase();
    if (!contentType.includes("json") && !looksLikeRelevantJsonUrl(url)) {
      return;
    }
    const method = response.request().method();
    const key = `${method} ${url}`;
    if (seen.has(key)) {
      return;
    }
    const text = await withTimeout(response.text(), RESPONSE_READ_TIMEOUT_MS, "response read timeout");
    if (!text || text.length > JSON_CAPTURE_LIMIT) {
      return;
    }
    let payload;
    try {
      payload = JSON.parse(text);
    } catch {
      return;
    }
    // Re-check after the await: an identical request may have been recorded
    // while this body was still being read.
    if (seen.has(key)) {
      return;
    }
    seen.add(key);
    records.push({ url, method, status: response.status(), payload });
  };

  const listener = (response) => {
    // Ignore network capture failures; page-level capture is still useful.
    pending.push(recordResponse(response).catch(() => {}));
  };
  page.on("response", listener);

  return {
    records,
    async stop() {
      page.off("response", listener);
      // Bounded wait: a body that never finishes must not stall the capture.
      await withTimeout(
        Promise.allSettled(pending),
        RESPONSE_READ_TIMEOUT_MS + 500,
        "pending response reads timed out"
      ).catch(() => {});
      return records;
    }
  };
}
/**
 * Pull every parseable JSON object or array out of free-form text.
 *
 * Only the first SCRIPT_SCAN_LIMIT characters are examined. Every "{" or "["
 * is tried as a starting point, so values nested inside an already extracted
 * value are returned as separate results too. Values with an identical
 * serialization are returned once, and scanning stops after 50 results.
 *
 * @param {*} text - Text to scan; falsy input is treated as empty.
 * @returns {Array<object|Array>} Parsed values in order of appearance.
 */
function extractJsonObjectsFromText(text) {
  const maxResults = 50;
  const source = String(text || "").slice(0, SCRIPT_SCAN_LIMIT);
  const fingerprints = new Set();
  const found = [];

  // JSON.parse never yields undefined, so undefined safely signals "no value here".
  const parseAt = (offset) => {
    const stop = findJsonEnd(source, offset);
    if (stop <= offset) {
      return undefined;
    }
    try {
      return JSON.parse(source.slice(offset, stop));
    } catch {
      return undefined;
    }
  };

  for (let offset = 0; offset < source.length && found.length < maxResults; offset += 1) {
    const ch = source[offset];
    if (ch === "{" || ch === "[") {
      const value = parseAt(offset);
      if (value !== undefined) {
        const fingerprint = JSON.stringify(value);
        if (!fingerprints.has(fingerprint)) {
          fingerprints.add(fingerprint);
          found.push(value);
        }
      }
    }
  }
  return found;
}
/**
 * Collect JSON payloads embedded in the <script> elements of an HTML document.
 *
 * Each payload is tagged with the id attribute of the script it came from
 * (empty string when there is none), and payloads with an identical
 * serialization are reported once across the whole document.
 *
 * @param {string} html - Page markup.
 * @returns {Array<{script_id: string, payload: *}>} Payloads in document order.
 */
function extractScriptPayloads(html) {
  const scriptPattern = /<script([^>]*)>([\s\S]*?)<\/script>/gi;
  const known = new Set();
  const collected = [];
  for (const found of String(html).matchAll(scriptPattern)) {
    const attributes = found[1] || "";
    const body = found[2] || "";
    const scriptId = attributes.match(/id=["']([^"']+)["']/i)?.[1] ?? "";
    for (const payload of extractJsonObjectsFromText(body.trim())) {
      const fingerprint = JSON.stringify(payload);
      if (!known.has(fingerprint)) {
        known.add(fingerprint);
        collected.push({ script_id: scriptId, payload });
      }
    }
  }
  return collected;
}
/**
 * Snapshot well-known state globals from the page.
 *
 * The callback is executed through page.evaluate, so it must stay
 * self-contained. Globals that are undefined or cannot be round-tripped
 * through JSON are left out.
 *
 * @param {object} page - Page exposing evaluate().
 * @returns {Promise<object>} Map of global name to a JSON-safe copy.
 */
async function collectWindowGlobals(page) {
  return page.evaluate(() => {
    const snapshot = {};
    [
      "__INITIAL_STATE__",
      "__NEXT_DATA__",
      "__ROUTER_DATA__",
      "SIGI_STATE",
      "__APOLLO_STATE__"
    ].forEach((key) => {
      const raw = globalThis[key];
      if (raw !== undefined) {
        try {
          snapshot[key] = JSON.parse(JSON.stringify(raw));
        } catch {
          // Skip non-serializable globals.
        }
      }
    });
    return snapshot;
  });
}
/**
 * Gather the distinct video links present on the page.
 *
 * Protocol-relative hrefs get an "https:" scheme, root-relative hrefs are
 * resolved against https://www.douyin.com, and only links containing
 * "/video/" are kept.
 *
 * @param {object} page - Page exposing evaluate().
 * @returns {Promise<string[]>} De-duplicated video URLs in document order.
 */
async function collectVideoLinks(page) {
  const rawHrefs = await page.evaluate(() => {
    const gathered = [];
    for (const anchor of document.querySelectorAll("a[href]")) {
      const href = anchor.getAttribute("href") || "";
      if (href) {
        gathered.push(href);
      }
    }
    return gathered;
  });

  const toAbsolute = (href) => {
    if (href.startsWith("//")) {
      return `https:${href}`;
    }
    return href.startsWith("/") ? `https://www.douyin.com${href}` : href;
  };

  const videoLinks = [];
  for (const href of rawHrefs) {
    const absolute = toAbsolute(href);
    if (absolute.includes("/video/")) {
      videoLinks.push(absolute);
    }
  }
  return uniqueStrings(videoLinks);
}
/**
 * Click the first selector whose first match is visible.
 *
 * Selectors are tried in order; visibility or click errors simply move on to
 * the next selector.
 *
 * @param {object} page - Page exposing locator().
 * @param {string[]} selectors - Candidate selectors, most preferred first.
 * @returns {Promise<boolean>} True if something was clicked, false otherwise.
 */
async function clickFirstVisible(page, selectors) {
  const tryClick = async (selector) => {
    const target = page.locator(selector).first();
    try {
      if (!(await target.isVisible({ timeout: 1000 }))) {
        return false;
      }
      await target.click({ timeout: 1000 });
      return true;
    } catch {
      // Treat any failure as "not clickable" and let the caller try the next one.
      return false;
    }
  };

  for (const selector of selectors) {
    if (await tryClick(selector)) {
      return true;
    }
  }
  return false;
}
/**
 * Escape regular-expression metacharacters so a value matches literally.
 *
 * @param {*} value - Value to escape; falsy input becomes "".
 * @returns {string} Text safe to embed in a RegExp source.
 */
function escapeRegExp(value) {
  const text = String(value || "");
  return text.replace(/[.*+?^${}()|[\]\\]/g, (metachar) => `\\${metachar}`);
}
/**
 * Undo the escaping applied to URLs embedded in HTML or JSON source.
 *
 * Literal "\u002F" and "\/" sequences become "/", and "&amp;" becomes "&".
 *
 * @param {*} value - Escaped URL text; falsy input becomes "".
 * @returns {string} Decoded URL.
 */
function decodeEscapedUrl(value) {
  const substitutions = [
    [/\\u002F/g, "/"],
    [/\\\//g, "/"],
    [/&amp;/g, "&"]
  ];
  return substitutions.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    String(value || "")
  );
}
/**
 * Work out which prefetch.json manifest belongs to the current page.
 *
 * Resolution order:
 *   1. a `"<pathname>": "<url>"` mapping found in the page HTML;
 *   2. among prefetch.json URLs found anywhere in the HTML, the first one
 *      containing the pathname without its "/creator-micro/" prefix, else the
 *      first one found;
 *   3. a URL built from the pathname.
 *
 * @param {object} page - Page exposing url() and content().
 * @returns {Promise<string>} Prefetch manifest URL.
 */
async function resolveCreatorPrefetchUrl(page) {
  const { pathname } = new URL(page.url());
  const markup = await page.content();

  const mappingPattern = new RegExp(
    `"${escapeRegExp(pathname)}"\\s*:\\s*"(https://creator\\.douyin\\.com[^"]+prefetch\\.json)"`
  );
  const mappedUrl = markup.match(mappingPattern)?.[1];
  if (mappedUrl) {
    return decodeEscapedUrl(mappedUrl);
  }

  const candidates = new Set();
  for (const hit of markup.matchAll(/https:\/\/creator\.douyin\.com\/goofy\/douyin_creator_pc\/mono\/prefetch\/[^"]+prefetch\.json/g)) {
    candidates.add(decodeEscapedUrl(hit[0]));
  }
  const ordered = [...candidates];

  const byRoute = ordered.find((candidate) => candidate.includes(pathname.replace(/^\/creator-micro\//, "")));
  if (byRoute) {
    return byRoute;
  }
  if (ordered[0]) {
    return ordered[0];
  }
  return `https://creator.douyin.com/goofy/douyin_creator_pc/mono/prefetch${pathname}/prefetch.json`;
}
/**
 * Replay the API calls listed in the page's prefetch manifest.
 *
 * The manifest URL comes from resolveCreatorPrefetchUrl. The manifest and
 * every API in its `apis` list are fetched through page.evaluate, so the
 * callback must stay self-contained.
 *
 * Failures are isolated per API: an API that cannot be fetched (or has a
 * malformed entry) yields `{url, payload: null, error}` and the remaining
 * APIs are still attempted. Only a failure to load or parse the manifest
 * itself produces the top-level `{prefetch_url, error, results: []}` shape.
 *
 * @param {object} page - Page exposing evaluate() plus what
 *   resolveCreatorPrefetchUrl needs.
 * @returns {Promise<object>} `{prefetch_url, prefetch, results}` on success,
 *   where each result is `{url, payload}` (payload is null for non-JSON
 *   bodies) or `{url, payload: null, error}`.
 */
async function collectCreatorPrefetchResults(page) {
  const prefetchUrl = await resolveCreatorPrefetchUrl(page);
  return page.evaluate(async ({ prefetchUrl }) => {
    try {
      const prefetchResp = await fetch(prefetchUrl, { credentials: "same-origin" });
      const prefetch = JSON.parse(await prefetchResp.text());
      const results = [];
      for (const api of prefetch?.apis || []) {
        // Best-effort label so a failure can still be attributed to its API.
        let requestUrl = String(api?.url ?? "");
        try {
          const target = new URL(api.url, window.location.origin);
          for (const [key, value] of Object.entries(api.params || {})) {
            target.searchParams.set(key, String(value));
          }
          requestUrl = target.toString();
          const resp = await fetch(requestUrl, {
            credentials: api.credentials || "same-origin"
          });
          const payload = await resp.json().catch(() => null);
          results.push({ url: requestUrl, payload });
        } catch (error) {
          // Keep what was already collected; one bad endpoint must not
          // discard the others.
          results.push({ url: requestUrl, payload: null, error: String(error) });
        }
      }
      return {
        prefetch_url: prefetchUrl,
        prefetch,
        results
      };
    } catch (error) {
      return {
        prefetch_url: prefetchUrl,
        error: String(error),
        results: []
      };
    }
  }, { prefetchUrl });
}
/**
 * Nudge the profile page into showing as many works as possible.
 *
 * Clicks a works/videos tab if one is visible, makes three attempts at
 * clicking an expand / more / view-all control, then scrolls down
 * options.scrollCount times with a 1.2 s pause after each scroll.
 *
 * @param {object} page - Page exposing evaluate() and what clickFirstVisible needs.
 * @param {{scrollCount: number}} options - Parsed CLI options.
 * @returns {Promise<void>}
 */
async function prepareProfilePage(page, options) {
  const tabSelectors = [
    "text=作品",
    "text=视频",
    "text=全部作品",
    "[role='tab']:has-text('作品')"
  ];
  const expanderSelectors = [
    "text=展开",
    "text=更多",
    "text=查看全部"
  ];

  await clickFirstVisible(page, tabSelectors);

  let expandAttempts = 0;
  while (expandAttempts < 3) {
    await clickFirstVisible(page, expanderSelectors);
    expandAttempts += 1;
  }

  let scrollsDone = 0;
  while (scrollsDone < options.scrollCount) {
    await page.evaluate(() => window.scrollBy(0, window.innerHeight * 0.85));
    await sleep(1200);
    scrollsDone += 1;
  }
}
/**
 * Assemble everything captured for one page into a single bundle.
 *
 * Reads the page HTML, flags login-gate and anti-bot markers found in it,
 * extracts embedded script payloads and window globals, stops the response
 * capture to collect network records, and adds page metadata.
 *
 * @param {object} page - Page exposing content(), url(), title(), evaluate().
 * @param {string} label - Bundle kind stored in the result.
 * @param {{stop: function(): Promise<object[]>}} responseCapture - Capture to stop.
 * @param {object} [extra={}] - Caller-specific data stored verbatim.
 * @returns {Promise<object>} The bundle.
 */
async function capturePageBundle(page, label, responseCapture, extra = {}) {
  const markup = await page.content();
  const mentionsAny = (needles) => needles.some((needle) => markup.includes(needle));

  const loginGateDetected = mentionsAny(["扫码登录", "验证码登录", "登录后免费畅享高清视频"]);
  const antiBotDetected = mentionsAny(["window.byted_acrawler.init", "__ac_signature", "__ac_nonce"]);

  const scripts = extractScriptPayloads(markup);
  const globals = await collectWindowGlobals(page);
  const network = await responseCapture.stop();

  const capturedAt = new Date().toISOString();
  const pageUrl = page.url();
  const pageTitle = await page.title().catch(() => "");
  const pageMeta = await page.evaluate(() => {
    const bodyText = document.body?.innerText || "";
    return {
      href: window.location.href,
      title: document.title,
      text_excerpt: bodyText.trim().slice(0, 8000)
    };
  });

  return {
    label,
    captured_at: capturedAt,
    page_url: pageUrl,
    page_title: pageTitle,
    page_meta: pageMeta,
    capture_hints: {
      login_gate_detected: loginGateDetected,
      anti_bot_detected: antiBotDetected
    },
    scripts,
    globals,
    network,
    extra
  };
}
/**
 * Write a value to disk as pretty-printed (2-space) JSON.
 *
 * The parent directory is created first if needed.
 *
 * @param {string} filePath - Destination file.
 * @param {*} value - Value to serialize.
 * @returns {Promise<void>}
 */
async function saveJson(filePath, value) {
  const parentDir = path.dirname(filePath);
  await ensureDir(parentDir);
  const serialized = JSON.stringify(value, null, 2);
  await fs.writeFile(filePath, serialized, "utf8");
}
/**
 * Like saveJson, but never rejects: a write failure is reported on stderr.
 *
 * @param {string} filePath - Destination file.
 * @param {*} value - Value to serialize.
 * @returns {Promise<void>}
 */
async function saveJsonSafe(filePath, value) {
  await saveJson(filePath, value).catch((error) => {
    console.error(`Failed to write ${filePath}: ${error?.message || error}`);
  });
}
/**
 * Perform an HTTP request through the python3 bridge script and return the
 * decoded response body.
 *
 * The body is passed to the bridge either as a file path (`bodyPath`) or as
 * text (`body`; non-strings are JSON-serialized). The call runs python3
 * synchronously.
 *
 * @param {string} urlString - Request URL.
 * @param {object} [request]
 * @param {string} [request.method="GET"] - HTTP method.
 * @param {object} [request.headers={}] - Request headers.
 * @param {*} [request.body=null] - Inline body; ignored when bodyPath is set.
 * @param {string} [request.bodyPath=""] - File whose contents form the body.
 * @returns {Promise<*>} The `data` field reported by the bridge.
 * @throws {Error} When the reported status is missing or >= 400, when python3
 *   cannot be run, or when its output is not JSON.
 */
async function requestJson(urlString, { method = "GET", headers = {}, body = null, bodyPath = "" } = {}) {
  let bodyMode = "text";
  if (bodyPath) {
    bodyMode = "path";
  } else if (body === null) {
    bodyMode = "none";
  }

  let bodyValue = bodyPath;
  if (!bodyValue) {
    bodyValue = typeof body === "string" ? body : JSON.stringify(body);
  }

  const bridgeArgs = ["-c", PYTHON_HTTP_BRIDGE, urlString, method, JSON.stringify(headers), bodyMode, bodyValue];
  const stdout = execFileSync("python3", bridgeArgs, { maxBuffer: 20 * 1024 * 1024, encoding: "utf8" });

  const envelope = JSON.parse(String(stdout || "").trim() || "{}");
  // A missing status counts as a server error.
  const status = envelope.status || 500;
  if (status >= 400) {
    throw new Error(`Request failed: ${envelope.status} ${JSON.stringify(envelope.data)}`);
  }
  return envelope.data;
}
/**
 * Log into StoryForge with a username and password.
 *
 * @param {string} baseUrl - Backend base URL; one trailing "/" is tolerated.
 * @param {string} username - Login name.
 * @param {string} password - Login password.
 * @returns {Promise<*>} Decoded response of POST /v2/auth/login.
 */
async function loginStoryForge(baseUrl, username, password) {
  const endpoint = `${baseUrl.replace(/\/$/, "")}/v2/auth/login`;
  const credentials = { username, password };
  return requestJson(endpoint, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: credentials
  });
}
/**
 * Upload a saved sync request to StoryForge.
 *
 * @param {string} baseUrl - Backend base URL; one trailing "/" is tolerated.
 * @param {string} token - Bearer token for the Authorization header.
 * @param {string} bodyPath - Path of the JSON file to send as the body.
 * @returns {Promise<*>} Decoded response of POST /v2/douyin/accounts/sync.
 */
async function syncCapture(baseUrl, token, bodyPath) {
  const endpoint = `${baseUrl.replace(/\/$/, "")}/v2/douyin/accounts/sync`;
  const headers = {
    "content-type": "application/json",
    Authorization: `Bearer ${token}`
  };
  return requestJson(endpoint, { method: "POST", headers, bodyPath });
}
/**
 * Capture each configured creator-center page in its own tab.
 *
 * Every page is opened, settled, bundled (with its prefetch results attached
 * as `creator_prefetch`) and written to
 * `<runDir>/creator-NN-<name>.json`. The tab is always closed, even when the
 * capture throws. Nothing is opened when options.creatorCenterEnabled is off.
 *
 * @param {object} context - Browser context exposing newPage().
 * @param {object} options - Parsed CLI options (creatorCenterEnabled,
 *   creatorCenterUrls, waitMs).
 * @param {string} runDir - Output directory for this run.
 * @returns {Promise<Array<{url: string, title: string, payload: object}>>}
 */
async function captureCreatorPages(context, options, runDir) {
  const captured = [];
  if (options.creatorCenterEnabled) {
    let ordinal = 0;
    for (const url of options.creatorCenterUrls) {
      ordinal += 1;
      const page = await context.newPage();
      const responseCapture = await createResponseCapture(page);
      try {
        console.error(`Capturing creator-center page: ${url}`);
        await navigateAndSettle(page, url, options.waitMs);
        const creatorPrefetch = await collectCreatorPrefetchResults(page);
        const bundle = await capturePageBundle(page, "creator_center", responseCapture, {
          creator_prefetch: creatorPrefetch
        });
        captured.push({ url: bundle.page_url, title: bundle.page_title, payload: bundle });
        const sequence = String(ordinal).padStart(2, "0");
        const fileName = `creator-${sequence}-${sanitizeName(bundle.page_title || bundle.page_url)}.json`;
        await saveJson(path.join(runDir, fileName), bundle);
      } finally {
        await page.close().catch(() => {});
      }
    }
  }
  return captured;
}
/**
 * Capture video detail pages, each in its own tab.
 *
 * At most options.maxVideos links are visited (a negative limit means none).
 * Each bundle is written to `<runDir>/video-<sanitized link>.json`, and the
 * tab is always closed, even when the capture throws.
 *
 * @param {object} context - Browser context exposing newPage().
 * @param {string[]} videoLinks - Candidate video URLs.
 * @param {object} options - Parsed CLI options (maxVideos, waitMs).
 * @param {string} runDir - Output directory for this run.
 * @returns {Promise<object[]>} One bundle per captured page.
 */
async function captureVideoPages(context, videoLinks, options, runDir) {
  const limit = Math.max(options.maxVideos, 0);
  const selected = videoLinks.slice(0, limit);
  const bundles = [];
  for (const link of selected) {
    const page = await context.newPage();
    const responseCapture = await createResponseCapture(page);
    try {
      console.error(`Capturing video page: ${link}`);
      await navigateAndSettle(page, link, options.waitMs);
      const bundle = await capturePageBundle(page, "video_detail", responseCapture, { source_link: link });
      bundles.push(bundle);
      const fileName = `video-${sanitizeName(link)}.json`;
      await saveJson(path.join(runDir, fileName), bundle);
    } finally {
      await page.close().catch(() => {});
    }
  }
  return bundles;
}
/**
 * CLI entry point: capture a Douyin profile (plus creator-center and video
 * pages) with a persistent Chromium context and, unless --no-sync is given,
 * import the capture into StoryForge.
 *
 * Everything is written under a per-run directory inside options.outputDir;
 * summary.json there is updated with the final status ("completed" or
 * "failed"). Errors are rethrown after being recorded so the process exits
 * with a non-zero code.
 */
async function main() {
const options = parseArgs(process.argv.slice(2));
if (options.help) {
printHelp();
return;
}
// Validate required inputs before any browser or filesystem work starts.
if (!options.profileUrl) {
throw new Error("--profile-url is required");
}
if (
options.syncEnabled &&
!options.storyforgeToken &&
!(options.storyforgeUsername && options.storyforgePassword)
) {
throw new Error("Sync mode requires --storyforge-token or both --storyforge-username and --storyforge-password");
}
// Run directory name: timestamp plus the sanitized last path segment of the
// profile URL ("douyin" when that segment is empty, e.g. a trailing slash).
const runDir = path.join(
options.outputDir,
`${nowStamp()}-${sanitizeName(options.profileUrl.split("/").pop() || "douyin")}`
);
await ensureDir(runDir);
await ensureDir(options.stateDir);
const summary = {
profile_url: options.profileUrl,
output_dir: runDir,
video_link_count: 0,
captured_video_pages: 0,
captured_creator_pages: 0,
sync_enabled: options.syncEnabled,
status: "running"
};
// Written up front so an aborted run still leaves a "running" marker behind.
await saveJsonSafe(path.join(runDir, "summary.json"), summary);
// The persistent context keeps browser state in options.stateDir between runs.
const context = await chromium.launchPersistentContext(options.stateDir, {
headless: options.headless,
viewport: { width: 1440, height: 1024 },
args: ["--disable-blink-features=AutomationControlled"]
});
try {
const page = await context.newPage();
// Attach the response listener before navigating so early responses are seen.
const responseCapture = await createResponseCapture(page);
console.error(`Opening profile page: ${options.profileUrl}`);
await navigateAndSettle(page, options.profileUrl, options.waitMs);
// Pause for manual login / captcha unless --skip-login-prompt was given.
await maybePrompt(
`Browser opened ${options.profileUrl}.\nLog into Douyin if needed, solve any slider/captcha, and optionally click into the creator homepage before capture.`,
options.manualPrompt
);
await prepareProfilePage(page, options);
await sleep(options.waitMs);
const videoLinks = await collectVideoLinks(page);
console.error(`Collected ${videoLinks.length} candidate video links`);
const profileBundle = await capturePageBundle(page, "profile", responseCapture, { video_links: videoLinks });
await saveJson(path.join(runDir, "profile-bundle.json"), profileBundle);
await page.close().catch(() => {});
const creatorPages = await captureCreatorPages(context, options, runDir);
const videoPages = await captureVideoPages(context, videoLinks, options, runDir);
// Request body for the StoryForge sync endpoint; it is saved to disk and
// later sent by file path rather than inline.
const syncBody = {
profile_url: options.profileUrl,
allow_creator_center_profile_fallback: options.allowCreatorCenterFallback,
manual_profile_payload: profileBundle,
manual_creator_pages: creatorPages,
manual_work_payloads: videoPages,
discovery_note: options.note || "browser-assisted capture"
};
const syncRequestPath = path.join(runDir, "storyforge-sync-request.json");
await saveJson(syncRequestPath, syncBody);
summary.video_link_count = videoLinks.length;
summary.captured_video_pages = videoPages.length;
summary.captured_creator_pages = creatorPages.length;
if (options.syncEnabled) {
let token = options.storyforgeToken;
if (!token) {
// No token supplied: log in with username/password to obtain one.
const auth = await loginStoryForge(
options.backendUrl,
options.storyforgeUsername,
options.storyforgePassword
);
token = auth.token;
// Only account and default_external_base_url are persisted; the token is not.
await saveJson(path.join(runDir, "storyforge-login.json"), {
account: auth.account,
default_external_base_url: auth.default_external_base_url
});
}
const workspace = await syncCapture(options.backendUrl, token, syncRequestPath);
summary.sync_result = {
account_id: workspace.account?.id || "",
nickname: workspace.account?.nickname || "",
sync_errors: workspace.sync_errors || []
};
await saveJson(path.join(runDir, "storyforge-sync-response.json"), workspace);
}
summary.status = "completed";
await saveJson(path.join(runDir, "summary.json"), summary);
console.log(JSON.stringify(summary, null, 2));
} catch (error) {
// Record the failure with the non-throwing writer so the original error is
// the one that propagates. storyforge-sync-error.json is written for any
// failure in this block, not only for sync failures.
summary.status = "failed";
summary.error = error?.stack || String(error);
await saveJsonSafe(path.join(runDir, "summary.json"), summary);
await saveJsonSafe(path.join(runDir, "storyforge-sync-error.json"), {
error: error?.stack || String(error)
});
throw error;
} finally {
// Always close the browser context, whether the run succeeded or failed.
await context.close().catch(() => {});
}
}
// Run the CLI; on failure print the error and set a non-zero exit code
// without calling process.exit().
main().catch((error) => {
console.error(error?.stack || String(error));
process.exitCode = 1;
});