#!/usr/bin/env node
/**
 * StoryForge Douyin browser capture: opens a Douyin profile in a persistent
 * Playwright Chromium context, captures page/script/network JSON, and
 * optionally posts the capture to a StoryForge backend.
 */
import fs from "node:fs/promises";
import { execFileSync } from "node:child_process";
import os from "node:os";
import path from "node:path";
import process from "node:process";
import readline from "node:readline/promises";
import { stdin as input, stdout as output } from "node:process";
import { chromium } from "playwright";

const DEFAULT_CREATOR_CENTER_URLS = [
  "https://creator.douyin.com/creator-micro/home",
  "https://creator.douyin.com/creator-micro/data",
  "https://creator.douyin.com/creator-micro/content/manage"
];
// NOTE(review): machine-specific absolute path; override with --output-dir on other hosts.
const DEFAULT_OUTPUT_DIR = "/Users/kris/code/StoryForge-gitea/output/playwright/douyin";
const DEFAULT_STATE_DIR = path.join(os.homedir(), ".storyforge", "douyin-playwright");
const DEFAULT_BACKEND_URL = "http://127.0.0.1:8081";
// Response bodies longer than this (in characters) are not recorded.
const JSON_CAPTURE_LIMIT = 1_500_000;
// Only this many leading characters of a script body are scanned for JSON.
const SCRIPT_SCAN_LIMIT = 2_000_000;
const WAIT_AFTER_NAV_MS = 4_000;
const RESPONSE_READ_TIMEOUT_MS = 2_000;

// Python program run via `python3 -c`. argv: url, method, headers (JSON),
// body mode ("none" | "text" | "path"), body value. It prints a single JSON
// object `{status, data}`; transport failures are reported as status 599.
const PYTHON_HTTP_BRIDGE = `
import json
import sys
import urllib.error
import urllib.request

url, method, headers_json, body_mode, body_value = sys.argv[1:6]
headers = json.loads(headers_json)
body = None
if body_mode == "text":
    body = body_value.encode("utf-8")
elif body_mode == "path":
    with open(body_value, "rb") as handle:
        body = handle.read()

request = urllib.request.Request(url, data=body, headers=headers, method=method)
try:
    with urllib.request.urlopen(request, timeout=120) as response:
        raw = response.read().decode("utf-8", "replace")
        try:
            payload = json.loads(raw) if raw else None
        except Exception:
            payload = {"raw": raw}
        print(json.dumps({"status": response.status, "data": payload}, ensure_ascii=False))
except urllib.error.HTTPError as error:
    raw = error.read().decode("utf-8", "replace")
    try:
        payload = json.loads(raw) if raw else None
    except Exception:
        payload = {"raw": raw}
    print(json.dumps({"status": error.code, "data": payload}, ensure_ascii=False))
except Exception as error:
    print(json.dumps({"status": 599, "data": {"raw": str(error)}}, ensure_ascii=False))
`;

/**
 * Print command-line usage to stdout.
 * The examples use placeholder credentials; real secrets do not belong in help text.
 */
function printHelp() {
  console.log(`StoryForge Douyin Browser Capture

Usage:
  node capture_and_sync.mjs --profile-url <url> [options]

Core options:
  --profile-url <url>              Douyin profile URL to capture
  --backend-url <url>              StoryForge collector base URL (default: ${DEFAULT_BACKEND_URL})
  --output-dir <dir>               Capture output directory (default: ${DEFAULT_OUTPUT_DIR})
  --state-dir <dir>                Persistent browser state dir (default: ${DEFAULT_STATE_DIR})
  --max-videos <n>                 Max video detail pages to capture (default: 4)
  --scroll-count <n>               Scroll times on profile page (default: 5)
  --wait-ms <ms>                   Wait after each navigation in ms (default: ${WAIT_AFTER_NAV_MS})
  --ready-file <path>              Wait for this file to appear instead of terminal prompt

StoryForge auth:
  --storyforge-token <token>       Existing StoryForge bearer token
  --storyforge-username <name>     Login username for StoryForge
  --storyforge-password <password> Login password for StoryForge

Mode flags:
  --headless                       Run browser headless
  --skip-login-prompt              Do not pause for manual login / captcha completion
  --no-sync                        Capture only, do not import into StoryForge
  --no-creator-center              Skip creator-center page capture
  --allow-creator-center-fallback  Allow creator-center identity to replace a missing public profile
  --note <text>                    Discovery note saved into StoryForge

Examples:
  npm run capture -- \\
    --profile-url https://www.douyin.com/user/your_account \\
    --storyforge-username kris --storyforge-password '<password>'

  npm run capture -- \\
    --profile-url https://www.douyin.com/user/your_account \\
    --storyforge-token <token> --headless --skip-login-prompt --no-creator-center
`);
}

/**
 * Parse command-line arguments into an options object.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {object} Options with defaults applied; `help` is set when -h/--help is given.
 * @throws {Error} On an unknown flag, a missing value, or a numeric flag whose
 *   value is not a non-negative integer.
 */
function parseArgs(argv) {
  const options = {
    backendUrl: DEFAULT_BACKEND_URL,
    outputDir: DEFAULT_OUTPUT_DIR,
    stateDir: DEFAULT_STATE_DIR,
    maxVideos: 4,
    scrollCount: 5,
    waitMs: WAIT_AFTER_NAV_MS,
    headless: false,
    manualPrompt: true,
    syncEnabled: true,
    creatorCenterEnabled: true,
    allowCreatorCenterFallback: false,
    creatorCenterUrls: [...DEFAULT_CREATOR_CENTER_URLS],
    note: "",
    profileUrl: "",
    readyFile: "",
    storyforgeToken: "",
    storyforgeUsername: "",
    storyforgePassword: ""
  };

  // Flags that take a string value -> option key.
  const valueFlags = new Map([
    ["--profile-url", "profileUrl"],
    ["--backend-url", "backendUrl"],
    ["--output-dir", "outputDir"],
    ["--state-dir", "stateDir"],
    ["--ready-file", "readyFile"],
    ["--storyforge-token", "storyforgeToken"],
    ["--storyforge-username", "storyforgeUsername"],
    ["--storyforge-password", "storyforgePassword"],
    ["--note", "note"]
  ]);
  // Flags that take a non-negative integer value -> option key.
  const countFlags = new Map([
    ["--max-videos", "maxVideos"],
    ["--scroll-count", "scrollCount"],
    ["--wait-ms", "waitMs"]
  ]);
  // Value-less flags -> [option key, value to assign].
  const switchFlags = new Map([
    ["--headless", ["headless", true]],
    ["--skip-login-prompt", ["manualPrompt", false]],
    ["--no-sync", ["syncEnabled", false]],
    ["--no-creator-center", ["creatorCenterEnabled", false]],
    ["--allow-creator-center-fallback", ["allowCreatorCenterFallback", true]]
  ]);

  const requireValue = (index, flag) => {
    const value = argv[index + 1];
    if (!value || value.startsWith("--")) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  // Reject NaN/negative input up front instead of letting it silently
  // disable scrolling, video capture, or the post-navigation wait.
  const requireCount = (index, flag) => {
    const raw = requireValue(index, flag);
    if (!/^\d+$/.test(raw)) {
      throw new Error(`Invalid value for ${flag}: expected a non-negative integer, got "${raw}"`);
    }
    return Number.parseInt(raw, 10);
  };

  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];
    if (arg === "--help" || arg === "-h") {
      options.help = true;
      continue;
    }
    if (valueFlags.has(arg)) {
      options[valueFlags.get(arg)] = requireValue(index, arg);
      index += 1;
      continue;
    }
    if (countFlags.has(arg)) {
      options[countFlags.get(arg)] = requireCount(index, arg);
      index += 1;
      continue;
    }
    if (switchFlags.has(arg)) {
      const [key, value] = switchFlags.get(arg);
      options[key] = value;
      continue;
    }
    throw new Error(`Unknown argument: ${arg}`);
  }

  return options;
}

/**
 * Turn an arbitrary string into a file-name-safe slug of at most 80 characters.
 * Falls back to "capture" when nothing usable remains.
 */
function sanitizeName(value) {
  return (
    String(value || "capture")
      .replace(/[^a-zA-Z0-9._-]+/g, "-")
      .replace(/-+/g, "-")
      .replace(/^-|-$/g, "")
      .slice(0, 80) || "capture"
  );
}

/** Create `dir` (and any missing parents). */
async function ensureDir(dir) {
  await fs.mkdir(dir, { recursive: true });
}

/** ISO timestamp with ":" replaced by "-" so it can be used in a path segment. */
function nowStamp() {
  return new Date().toISOString().replace(/[:]/g, "-");
}

/** Resolve after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Navigate to `url`, wait (best effort) for DOMContentLoaded, then pause `waitMs`.
 * Navigation and load-state errors are deliberately ignored so a slow or
 * blocked page still gets captured in whatever state it reached.
 */
async function navigateAndSettle(page, url, waitMs) {
  await page.goto(url, { waitUntil: "commit", timeout: 30_000 }).catch(() => null);
  await page.waitForLoadState("domcontentloaded", { timeout: 15_000 }).catch(() => {});
  await sleep(waitMs);
}

/**
 * Pause until the operator signals readiness.
 *
 * @param {string} message - Text shown to the operator.
 * @param {boolean} enabled - When false, return immediately.
 * @param {string} [readyFile] - If set, wait for this file to exist instead of prompting on the terminal.
 */
async function maybePrompt(message, enabled, readyFile = "") {
  if (!enabled) {
    return;
  }
  if (readyFile) {
    console.error(`${message}\nWaiting for ready file: ${readyFile}`);
    await waitForReadyFile(readyFile);
    return;
  }
  const rl = readline.createInterface({ input, output });
  try {
    await rl.question(`${message}\nPress Enter to continue... `);
  } finally {
    rl.close();
  }
}

/**
 * Poll every 600 ms until `filePath` is accessible. The parent directory is
 * created first. There is no timeout: this waits until the file appears.
 */
async function waitForReadyFile(filePath) {
  await ensureDir(path.dirname(filePath));
  while (true) {
    try {
      await fs.access(filePath);
      return;
    } catch {
      await sleep(600);
    }
  }
}

/** Stringify and trim each value, drop empties, and de-duplicate preserving first-seen order. */
function uniqueStrings(values) {
  const seen = new Set();
  const unique = [];
  for (const value of values) {
    const item = String(value || "").trim();
    if (!item || seen.has(item)) {
      continue;
    }
    seen.add(item);
    unique.push(item);
  }
  return unique;
}

/** True when `url` contains one of the Douyin URL fragments this capture records. */
function looksLikeRelevantJsonUrl(url) {
  const lower = url.toLowerCase();
  const fragments = [
    "douyin.com/aweme",
    "douyin.com/web/api",
    "douyin.com/creator",
    "douyin.com/user",
    "creator.douyin.com",
    "iesdouyin.com"
  ];
  return fragments.some((fragment) => lower.includes(fragment));
}

/**
 * Find the index just past the bracket that closes the `{` or `[` at `start`.
 * Only brackets of the same kind as the opener are counted, and bracket
 * characters inside double-quoted strings are ignored.
 *
 * @returns {number} End index (exclusive), or -1 when no balanced close is found.
 */
function findJsonEnd(text, start) {
  const opening = text[start];
  const closing = opening === "{" ? "}" : "]";
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let index = start; index < text.length; index += 1) {
    const char = text[index];
    if (inString) {
      if (escaped) {
        escaped = false;
      } else if (char === "\\") {
        escaped = true;
      } else if (char === "\"") {
        inString = false;
      }
      continue;
    }
    if (char === "\"") {
      inString = true;
      continue;
    }
    if (char === opening) {
      depth += 1;
      continue;
    }
    if (char === closing) {
      depth -= 1;
      if (depth === 0) {
        return index + 1;
      }
    }
  }
  return -1;
}

/**
 * Start recording JSON responses seen by `page`.
 *
 * A response is considered when its content-type mentions "json" or its URL
 * looks Douyin-relevant; it is recorded once per "METHOD url" if its body
 * parses as JSON and is within JSON_CAPTURE_LIMIT.
 *
 * @returns {Promise<{records: object[], stop: () => Promise<object[]>}>}
 *   `stop()` detaches the listener, waits a bounded time for in-flight body
 *   reads, and returns the records.
 */
async function createResponseCapture(page) {
  const records = [];
  const seen = new Set();
  const pending = [];

  const captureResponse = async (response) => {
    try {
      const url = response.url();
      const headers = response.headers();
      const contentType = String(headers["content-type"] || "").toLowerCase();
      if (!contentType.includes("json") && !looksLikeRelevantJsonUrl(url)) {
        return;
      }
      const method = response.request().method();
      const key = `${method} ${url}`;
      if (seen.has(key)) {
        return;
      }
      const text = await Promise.race([
        response.text(),
        sleep(RESPONSE_READ_TIMEOUT_MS).then(() => {
          throw new Error("response read timeout");
        })
      ]);
      if (!text || text.length > JSON_CAPTURE_LIMIT) {
        return;
      }
      let payload = null;
      try {
        payload = JSON.parse(text);
      } catch {
        return;
      }
      // Re-check after the await: another response for the same request may
      // have been recorded while this body was being read.
      if (seen.has(key)) {
        return;
      }
      seen.add(key);
      records.push({ url, method, status: response.status(), payload });
    } catch {
      // Ignore network capture failures; page-level capture is still useful.
    }
  };

  const listener = (response) => {
    pending.push(captureResponse(response));
  };

  page.on("response", listener);

  return {
    records,
    async stop() {
      page.off("response", listener);
      await Promise.race([
        Promise.allSettled(pending),
        sleep(RESPONSE_READ_TIMEOUT_MS + 500)
      ]);
      return records;
    }
  };
}

/**
 * Scan the first SCRIPT_SCAN_LIMIT characters of `text` for substrings that
 * parse as JSON objects or arrays. Scanning continues inside a parsed value,
 * so nested values are returned as well. Results are de-duplicated by their
 * serialized form and capped at 50.
 */
function extractJsonObjectsFromText(text) {
  const snippet = String(text || "").slice(0, SCRIPT_SCAN_LIMIT);
  const seen = new Set();
  const results = [];

  for (let index = 0; index < snippet.length; index += 1) {
    const char = snippet[index];
    if (char !== "{" && char !== "[") {
      continue;
    }
    const end = findJsonEnd(snippet, index);
    if (end <= index) {
      continue;
    }
    try {
      const parsed = JSON.parse(snippet.slice(index, end));
      const marker = JSON.stringify(parsed);
      if (seen.has(marker)) {
        continue;
      }
      seen.add(marker);
      results.push(parsed);
      if (results.length >= 50) {
        return results;
      }
    } catch {
      // Keep scanning.
    }
  }
  return results;
}

/**
 * Extract JSON payloads embedded in the `<script>` elements of `html`.
 *
 * @returns {{script_id: string, payload: any}[]} One entry per distinct
 *   payload; `script_id` is the script tag's id attribute or "".
 */
function extractScriptPayloads(html) {
  const results = [];
  const seen = new Set();
  // Group 1: the tag's attribute text; group 2: the script body.
  const regex = /<script([^>]*)>([\s\S]*?)<\/script>/gi;
  let match = null;

  while ((match = regex.exec(html)) !== null) {
    const attrs = match[1] || "";
    const content = match[2] || "";
    const idMatch = attrs.match(/id=["']([^"']+)["']/i);
    const scriptId = idMatch ? idMatch[1] : "";
    for (const payload of extractJsonObjectsFromText(content.trim())) {
      const marker = JSON.stringify(payload);
      if (seen.has(marker)) {
        continue;
      }
      seen.add(marker);
      results.push({ script_id: scriptId, payload });
    }
  }
  return results;
}

/**
 * Read a fixed list of state globals from the page and return the ones that
 * are defined and survive a JSON round trip.
 */
async function collectWindowGlobals(page) {
  return page.evaluate(() => {
    const globalNames = [
      "__INITIAL_STATE__",
      "__NEXT_DATA__",
      "__ROUTER_DATA__",
      "SIGI_STATE",
      "__APOLLO_STATE__"
    ];
    const result = {};
    for (const name of globalNames) {
      const value = globalThis[name];
      if (value === undefined) {
        continue;
      }
      try {
        result[name] = JSON.parse(JSON.stringify(value));
      } catch {
        // Skip non-serializable globals.
      }
    }
    return result;
  });
}

/**
 * Collect the distinct anchor hrefs on the page that contain "/video/".
 * Protocol-relative and root-relative hrefs are made absolute first.
 */
async function collectVideoLinks(page) {
  const hrefs = await page.evaluate(() => {
    return Array.from(document.querySelectorAll("a[href]"))
      .map((node) => node.getAttribute("href") || "")
      .filter(Boolean);
  });

  const toAbsolute = (href) => {
    if (href.startsWith("//")) {
      return `https:${href}`;
    }
    if (href.startsWith("/")) {
      return `https://www.douyin.com${href}`;
    }
    return href;
  };

  return uniqueStrings(hrefs.map(toAbsolute).filter((href) => href.includes("/video/")));
}

/**
 * Click the first selector whose first match is visible.
 *
 * @returns {Promise<boolean>} True if something was clicked, false otherwise.
 */
async function clickFirstVisible(page, selectors) {
  for (const selector of selectors) {
    const locator = page.locator(selector).first();
    try {
      if (await locator.isVisible({ timeout: 1000 })) {
        await locator.click({ timeout: 1000 });
        return true;
      }
    } catch {
      // Try next selector.
    }
  }
  return false;
}

/** Escape regular-expression metacharacters in `value`. */
function escapeRegExp(value) {
  return String(value || "").replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Undo the escaping applied to URLs embedded in HTML/JS source:
 * `\u002F` and `\/` become "/", and the HTML entity `&amp;` becomes "&".
 */
function decodeEscapedUrl(value) {
  return String(value || "")
    .replace(/\\u002F/g, "/")
    .replace(/\\\//g, "/")
    .replace(/&amp;/g, "&");
}

/**
 * Work out the prefetch.json URL for the current creator-center page.
 *
 * Preference order: a URL mapped to the current pathname in the page HTML,
 * then a discovered prefetch URL containing the pathname (minus the
 * "/creator-micro/" prefix), then any discovered prefetch URL, then a URL
 * built from the pathname.
 */
async function resolveCreatorPrefetchUrl(page) {
  const current = new URL(page.url());
  const html = await page.content();
  const escapedPath = escapeRegExp(current.pathname);

  const mapped = html.match(
    new RegExp(`"${escapedPath}"\\s*:\\s*"(https://creator\\.douyin\\.com[^"]+prefetch\\.json)"`)
  );
  if (mapped?.[1]) {
    return decodeEscapedUrl(mapped[1]);
  }

  const discovered = Array.from(
    new Set(
      [...html.matchAll(/https:\/\/creator\.douyin\.com\/goofy\/douyin_creator_pc\/mono\/prefetch\/[^"]+prefetch\.json/g)].map(
        (match) => decodeEscapedUrl(match[0])
      )
    )
  );
  const pathSuffix = current.pathname.replace(/^\/creator-micro\//, "");

  return (
    discovered.find((candidate) => candidate.includes(pathSuffix)) ||
    discovered[0] ||
    `https://creator.douyin.com/goofy/douyin_creator_pc/mono/prefetch${current.pathname}/prefetch.json`
  );
}

/**
 * Fetch the page's prefetch manifest from inside the page and replay each API
 * call it lists, collecting the JSON responses.
 *
 * @returns {Promise<object>} `{prefetch_url, prefetch, results}` on success, or
 *   `{prefetch_url, error, results: []}` when the manifest fetch or a replay throws.
 */
async function collectCreatorPrefetchResults(page) {
  const prefetchUrl = await resolveCreatorPrefetchUrl(page);
  return page.evaluate(async ({ prefetchUrl }) => {
    try {
      const prefetchResp = await fetch(prefetchUrl, { credentials: "same-origin" });
      const prefetchText = await prefetchResp.text();
      const prefetch = JSON.parse(prefetchText);
      const results = [];
      for (const api of prefetch?.apis || []) {
        const target = new URL(api.url, window.location.origin);
        for (const [key, value] of Object.entries(api.params || {})) {
          target.searchParams.set(key, String(value));
        }
        const resp = await fetch(target.toString(), {
          credentials: api.credentials || "same-origin",
        });
        // A non-JSON body is recorded as a null payload rather than failing the batch.
        const payload = await resp.json().catch(() => null);
        results.push({
          url: target.toString(),
          payload,
        });
      }
      return {
        prefetch_url: prefetchUrl,
        prefetch,
        results,
      };
    } catch (error) {
      return {
        prefetch_url: prefetchUrl,
        error: String(error),
        results: [],
      };
    }
  }, { prefetchUrl });
}
/**
 * Put the profile page into a state worth capturing: select the works tab,
 * expand collapsed sections (up to three attempts), then scroll
 * `options.scrollCount` times with a pause after each scroll.
 */
async function prepareProfilePage(page, options) {
  await clickFirstVisible(page, [
    "text=作品",
    "text=视频",
    "text=全部作品",
    "[role='tab']:has-text('作品')"
  ]);

  for (let attempt = 0; attempt < 3; attempt += 1) {
    await clickFirstVisible(page, [
      "text=展开",
      "text=更多",
      "text=查看全部"
    ]);
  }

  for (let scroll = 0; scroll < options.scrollCount; scroll += 1) {
    await page.evaluate(() => window.scrollBy(0, window.innerHeight * 0.85));
    await sleep(1200);
  }
}

/**
 * Snapshot the current page into a single JSON-serializable bundle.
 *
 * @param {object} page - Playwright page to capture.
 * @param {string} label - Bundle label (e.g. "profile", "video_detail").
 * @param {{stop: Function}} responseCapture - Capture started for this page; it is stopped here.
 * @param {object} [extra] - Caller-supplied data stored under `extra`.
 * @returns {Promise<object>} Bundle with page metadata, login/anti-bot hints,
 *   script payloads, window globals and recorded network responses.
 */
async function capturePageBundle(page, label, responseCapture, extra = {}) {
  const html = await page.content();

  // Hints are plain substring checks against the rendered HTML.
  const loginGateDetected =
    html.includes("扫码登录") ||
    html.includes("验证码登录") ||
    html.includes("登录后免费畅享高清视频");
  const antiBotDetected =
    html.includes("window.byted_acrawler.init") ||
    html.includes("__ac_signature") ||
    html.includes("__ac_nonce");

  const scripts = extractScriptPayloads(html);
  const globals = await collectWindowGlobals(page);
  const network = await responseCapture.stop();

  return {
    label,
    captured_at: new Date().toISOString(),
    page_url: page.url(),
    page_title: await page.title().catch(() => ""),
    page_meta: await page.evaluate(() => ({
      href: window.location.href,
      title: document.title,
      text_excerpt: (document.body?.innerText || "").trim().slice(0, 8000)
    })),
    capture_hints: {
      login_gate_detected: loginGateDetected,
      anti_bot_detected: antiBotDetected
    },
    scripts,
    globals,
    network,
    extra
  };
}

/** Write `value` as pretty-printed JSON to `filePath`, creating parent directories. */
async function saveJson(filePath, value) {
  await ensureDir(path.dirname(filePath));
  await fs.writeFile(filePath, JSON.stringify(value, null, 2), "utf8");
}

/** Like saveJson, but logs a write failure to stderr instead of throwing. */
async function saveJsonSafe(filePath, value) {
  try {
    await saveJson(filePath, value);
  } catch (error) {
    console.error(`Failed to write ${filePath}: ${error?.message || error}`);
  }
}

/**
 * Perform an HTTP request through the python3 bridge and return the decoded body.
 *
 * The call is synchronous underneath (execFileSync), so it blocks the event
 * loop until the request completes.
 *
 * NOTE(review): headers (including any Authorization value) and inline bodies
 * are passed to python3 as command-line arguments, which other local users may
 * be able to see in process listings. Consider passing them via stdin or a file.
 *
 * @param {string} urlString - Request URL.
 * @param {object} [init]
 * @param {string} [init.method="GET"]
 * @param {object} [init.headers={}]
 * @param {string|object|null} [init.body=null] - Inline body; non-strings are JSON-encoded.
 * @param {string} [init.bodyPath=""] - File whose bytes are sent as the body; takes precedence over `body`.
 * @returns {Promise<any>} The `data` field reported by the bridge.
 * @throws {Error} When the bridge cannot be run, prints non-JSON output, or reports status >= 400.
 */
async function requestJson(urlString, { method = "GET", headers = {}, body = null, bodyPath = "" } = {}) {
  let bodyMode = "none";
  let bodyValue = "";
  if (bodyPath) {
    bodyMode = "path";
    bodyValue = bodyPath;
  } else if (body !== null) {
    bodyMode = "text";
    bodyValue = typeof body === "string" ? body : JSON.stringify(body);
  }

  const stdout = execFileSync(
    "python3",
    ["-c", PYTHON_HTTP_BRIDGE, urlString, method, JSON.stringify(headers), bodyMode, bodyValue],
    {
      maxBuffer: 20 * 1024 * 1024,
      encoding: "utf8"
    }
  );

  let payload;
  try {
    payload = JSON.parse(String(stdout || "").trim() || "{}");
  } catch (error) {
    throw new Error(`HTTP bridge returned non-JSON output for ${method} ${urlString}`, { cause: error });
  }

  // A missing status is treated as a failure (500).
  if ((payload.status || 500) >= 400) {
    throw new Error(`Request failed: ${payload.status} ${JSON.stringify(payload.data)}`);
  }
  return payload.data;
}

/** POST credentials to `<baseUrl>/v2/auth/login` and return the response body. */
async function loginStoryForge(baseUrl, username, password) {
  return requestJson(`${baseUrl.replace(/\/$/, "")}/v2/auth/login`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: { username, password }
  });
}

/**
 * POST the JSON file at `bodyPath` to `<baseUrl>/v2/douyin/accounts/sync`
 * using `token` as a bearer token, and return the response body.
 */
async function syncCapture(baseUrl, token, bodyPath) {
  return requestJson(`${baseUrl.replace(/\/$/, "")}/v2/douyin/accounts/sync`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
      Authorization: `Bearer ${token}`
    },
    bodyPath
  });
}

/**
 * Capture each configured creator-center URL in its own page and save one
 * bundle file per page under `runDir`.
 *
 * @returns {Promise<{url: string, title: string, payload: object}[]>}
 *   Empty when creator-center capture is disabled.
 */
async function captureCreatorPages(context, options, runDir) {
  const pages = [];
  if (!options.creatorCenterEnabled) {
    return pages;
  }

  for (const [index, url] of options.creatorCenterUrls.entries()) {
    const page = await context.newPage();
    const responseCapture = await createResponseCapture(page);
    try {
      console.error(`Capturing creator-center page: ${url}`);
      await navigateAndSettle(page, url, options.waitMs);
      const prefetchResults = await collectCreatorPrefetchResults(page);
      const bundle = await capturePageBundle(page, "creator_center", responseCapture, {
        creator_prefetch: prefetchResults
      });
      pages.push({
        url: bundle.page_url,
        title: bundle.page_title,
        payload: bundle
      });
      const ordinal = String(index + 1).padStart(2, "0");
      await saveJson(
        path.join(runDir, `creator-${ordinal}-${sanitizeName(bundle.page_title || bundle.page_url)}.json`),
        bundle
      );
    } finally {
      await page.close().catch(() => {});
    }
  }
  return pages;
}

/**
 * Capture up to `options.maxVideos` video detail pages, one page per link,
 * saving each bundle under `runDir`.
 *
 * @returns {Promise<object[]>} The captured bundles, in link order.
 */
async function captureVideoPages(context, videoLinks, options, runDir) {
  const pages = [];
  const selected = videoLinks.slice(0, Math.max(options.maxVideos, 0));

  for (const link of selected) {
    const page = await context.newPage();
    const responseCapture = await createResponseCapture(page);
    try {
      console.error(`Capturing video page: ${link}`);
      await navigateAndSettle(page, link, options.waitMs);
      const bundle = await capturePageBundle(page, "video_detail", responseCapture, {
        source_link: link
      });
      pages.push(bundle);
      await saveJson(path.join(runDir, `video-${sanitizeName(link)}.json`), bundle);
    } finally {
      await page.close().catch(() => {});
    }
  }
  return pages;
}

/**
 * Entry point: parse arguments, capture the profile, creator-center and video
 * pages, write all artifacts under a per-run directory, and optionally sync
 * the capture to StoryForge.
 *
 * Once the run directory exists, any failure (including StoryForge login or
 * browser launch) is recorded in summary.json and storyforge-sync-error.json
 * before the error is rethrown.
 */
async function main() {
  const options = parseArgs(process.argv.slice(2));
  if (options.help) {
    printHelp();
    return;
  }
  if (!options.profileUrl) {
    throw new Error("--profile-url is required");
  }
  if (
    options.syncEnabled &&
    !options.storyforgeToken &&
    !(options.storyforgeUsername && options.storyforgePassword)
  ) {
    throw new Error("Sync mode requires --storyforge-token or both --storyforge-username and --storyforge-password");
  }

  const runDir = path.join(
    options.outputDir,
    `${nowStamp()}-${sanitizeName(options.profileUrl.split("/").pop() || "douyin")}`
  );
  await ensureDir(runDir);
  await ensureDir(options.stateDir);

  const summaryPath = path.join(runDir, "summary.json");
  const summary = {
    profile_url: options.profileUrl,
    output_dir: runDir,
    video_link_count: 0,
    captured_video_pages: 0,
    captured_creator_pages: 0,
    sync_enabled: options.syncEnabled,
    status: "running"
  };
  await saveJsonSafe(summaryPath, summary);

  let context = null;
  try {
    let storyforgeToken = options.storyforgeToken;
    if (options.syncEnabled && !storyforgeToken) {
      const auth = await loginStoryForge(
        options.backendUrl,
        options.storyforgeUsername,
        options.storyforgePassword
      );
      storyforgeToken = auth?.token;
      // Fail now rather than capturing everything and then syncing with "Bearer undefined".
      if (!storyforgeToken) {
        throw new Error("StoryForge login response did not include a token");
      }
      await saveJson(path.join(runDir, "storyforge-login.json"), {
        account: auth.account,
        default_external_base_url: auth.default_external_base_url
      });
    }

    context = await chromium.launchPersistentContext(options.stateDir, {
      headless: options.headless,
      viewport: { width: 1440, height: 1024 },
      args: ["--disable-blink-features=AutomationControlled"]
    });

    const page = await context.newPage();
    const responseCapture = await createResponseCapture(page);
    console.error(`Opening profile page: ${options.profileUrl}`);
    await navigateAndSettle(page, options.profileUrl, options.waitMs);
    await maybePrompt(
      `Browser opened ${options.profileUrl}.\nLog into Douyin if needed, solve any slider/captcha, and optionally click into the creator homepage before capture.`,
      options.manualPrompt,
      options.readyFile
    );
    await prepareProfilePage(page, options);
    await sleep(options.waitMs);

    const videoLinks = await collectVideoLinks(page);
    console.error(`Collected ${videoLinks.length} candidate video links`);

    const profileBundle = await capturePageBundle(page, "profile", responseCapture, {
      video_links: videoLinks
    });
    await saveJson(path.join(runDir, "profile-bundle.json"), profileBundle);
    await page.close().catch(() => {});

    const creatorPages = await captureCreatorPages(context, options, runDir);
    const videoPages = await captureVideoPages(context, videoLinks, options, runDir);

    const syncBody = {
      profile_url: options.profileUrl,
      allow_creator_center_profile_fallback: options.allowCreatorCenterFallback,
      compact_response: true,
      manual_profile_payload: profileBundle,
      manual_creator_pages: creatorPages,
      manual_work_payloads: videoPages,
      discovery_note: options.note || "browser-assisted capture"
    };
    // The request body is written to disk and sent by path, not inline.
    const syncRequestPath = path.join(runDir, "storyforge-sync-request.json");
    await saveJson(syncRequestPath, syncBody);

    summary.video_link_count = videoLinks.length;
    summary.captured_video_pages = videoPages.length;
    summary.captured_creator_pages = creatorPages.length;

    if (options.syncEnabled) {
      const workspace = await syncCapture(options.backendUrl, storyforgeToken, syncRequestPath);
      summary.sync_result = {
        account_id: workspace.account?.id || "",
        nickname: workspace.account?.nickname || "",
        sync_errors: workspace.sync_errors || []
      };
      await saveJson(path.join(runDir, "storyforge-sync-response.json"), workspace);
    }

    summary.status = "completed";
    await saveJson(summaryPath, summary);
    console.log(JSON.stringify(summary, null, 2));
  } catch (error) {
    summary.status = "failed";
    summary.error = error?.stack || String(error);
    await saveJsonSafe(summaryPath, summary);
    await saveJsonSafe(path.join(runDir, "storyforge-sync-error.json"), {
      error: error?.stack || String(error)
    });
    throw error;
  } finally {
    // context is still null when the failure happened before the browser launched.
    await context?.close().catch(() => {});
  }
}

main().catch((error) => {
  console.error(error?.stack || String(error));
  process.exitCode = 1;
});